
""" Bottleneck Self Attention (Bottleneck Transformers)

Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605

@misc{2101.11605,
Author = {Aravind Srinivas and Tsung-Yi Lin and Niki Parmar and Jonathon Shlens and Pieter Abbeel and Ashish Vaswani},
Title = {Bottleneck Transformers for Visual Recognition},
Year = {2021},
}

Based on ref gist at: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2

This impl is a WIP but given that it is based on the ref gist likely not too far off.

Hacked together by / Copyright 2021 Ross Wightman
"""
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F

from .helpers import to_2tuple, make_divisible
from .weight_init import trunc_normal_
from .trace_utils import _assert


def rel_logits_1d(q, rel_k, permute_mask: List[int]):
    """ Compute relative logits along one dimension

    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

    Args:
        q: (batch * heads, height, width, dim)
        rel_k: (2 * width - 1, dim)
        permute_mask: permute output dim according to this
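
    Shape example (as called from PosEmbedRel.forward below; batch and heads are folded
    into the leading dim): q (B * heads, H, W, dim) x rel_k (2 * W - 1, dim) gives relative
    logits (B * heads, H, W, 2 * W - 1), which are shifted to absolute coordinates, tiled
    over the other spatial axis, and permuted to (B * heads, H, W, H, W).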
    """
    B, H, W, dim = q.shape
    x = (q @ rel_k.transpose(-1, -2))
    x = x.reshape(-1, W, 2 * W - 1)

    # pad to shift from relative to absolute indexing
    x_pad = F.pad(x, [0, 1]).flatten(1)
    x_pad = F.pad(x_pad, [0, W - 1])

    # reshape and slice out the padded elements
    x_pad = x_pad.reshape(-1, W + 1, 2 * W - 1)
    x = x_pad[:, :W, W - 1:]

    # reshape and tile
    x = x.reshape(B, H, 1, W, W).expand(-1, -1, H, -1, -1)
    return x.permute(permute_mask)


class PosEmbedRel(nn.Module):
    """ Relative Position Embedding
    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925
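
    In this module q is expected flattened to (B * num_heads, H * W, dim_head); forward()
    returns relative position logits of shape (B * num_heads, H * W, H * W) for adding to
    the content-based attention logits in BottleneckAttn.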
    """
    def __init__(self, feat_size, dim_head, scale):
        super().__init__()
        self.height, self.width = to_2tuple(feat_size)
        self.dim_head = dim_head
        self.height_rel = nn.Parameter(torch.randn(self.height * 2 - 1, dim_head) * scale)
        self.width_rel = nn.Parameter(torch.randn(self.width * 2 - 1, dim_head) * scale)

    def forward(self, q):
        B, HW, _ = q.shape

        # relative logits in width dimension
        q = q.reshape(B, self.height, self.width, -1)
        rel_logits_w = rel_logits_1d(q, self.width_rel, permute_mask=(0, 1, 3, 2, 4))

        # relative logits in height dimension
        q = q.transpose(1, 2)
        rel_logits_h = rel_logits_1d(q, self.height_rel, permute_mask=(0, 3, 1, 4, 2))

        rel_logits = rel_logits_h + rel_logits_w
        rel_logits = rel_logits.reshape(B, HW, HW)
        return rel_logits


class BottleneckAttn(nn.Module):
    """ Bottleneck Attention
    Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605

    The internal dimensions of the attention module are controlled by the interaction of several arguments.
      * the output dimension of the module is specified by dim_out, which falls back to input dim if not set
      * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim
      * the query and key (qk) dimensions are determined by
        * num_heads * dim_head if dim_head is not None
        * num_heads * (dim_out * qk_ratio // num_heads) if dim_head is None
      * as seen above, qk_ratio determines the ratio of q and k relative to the output if dim_head not used

    Args:
        dim (int): input dimension to the module
        dim_out (int): output dimension of the module, same as dim if not set
        stride (int): output stride of the module, avg pool used if stride == 2 (default: 1).
        num_heads (int): parallel attention heads (default: 4)
        dim_head (int): dimension of query and key heads, calculated from dim_out * qk_ratio // num_heads if not set
        qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)
        qkv_bias (bool): add bias to q, k, and v projections
        scale_pos_embed (bool): scale the position embedding as well as Q @ K
    c
                    t         
|           |J d       |xs |}||z  dk(  sJ || _        |xs t        ||z  d      |z  | _        || j                  z  | _        || j                  z  | _        || j
                  z  | _        | j                  dz  | _        |	| _	        t        j                  || j                  dz  | j                  z   d|      | _        t        || j                  | j                  	      | _        |dk(  rt        j                  dd      nt        j                          | _        | j%                          y )
NzBA concrete feature size matching expected input (H, W) is requiredr      )divisor      r   r   )bias)r(   r1   )r$   r%   	num_headsr   dim_head_qk
dim_head_v
dim_out_qk	dim_out_vr1   scale_pos_embedr)   Conv2dqkvr"   	pos_embed	AvgPool2dIdentitypoolreset_parameters)r/   r   dim_outr0   striderI   r(   qk_ratioqkv_biasrN   r2   s             r   r%   zBottleneckAttn.__init__j   s%    	$j&jj$.S"a'''"#a~g6HRS'TXa'a!T^^3#d&6&66"T__4%%-
.99S$//A"5"FPXY %Y9I9IQUQ[Q[\*0A+BLLA&2;;=	r    c                 :   t        | j                  j                  | j                  j                  j                  d   dz         t        | j                  j
                  | j                         t        | j                  j                  | j                         y )Nr   rG   )std)r   rP   weightr   rQ   r-   r1   r.   )r/   s    r   rU   zBottleneckAttn.reset_parameters   s\    dhhoo488??+@+@+Ct+KLdnn//TZZ@dnn..DJJ?r    c                    |j                   \  }}}}t        || j                  j                  k(  d       t        || j                  j                  k(  d       | j                  |      }t        j                  || j                  | j                  | j                  gd      \  }}}|j                  || j                  z  | j                  d      j                  dd      }|j                  || j                  z  | j                  d      }|j                  || j                  z  | j                  d      j                  dd      }| j                  r%||z  | j                  |      z   | j                   z  }	n$||z  | j                   z  | j                  |      z   }	|	j#                  d      }	|	|z  j                  dd      j                  || j                  ||      }
| j%                  |
      }
|
S )N r   )r   r   r   )r   r   rQ   r&   r'   rP   r+   splitrL   rM   r   rI   rJ   r   rK   rN   r1   softmaxrT   )r/   r   r   Cr   r   r   kvattnouts              r   r;   zBottleneckAttn.forward   s   WW
1aT^^***B/T^^)))2.HHQK ++a$//4??DNN!SYZ[1aIIa$..($*:*:B?II"bQIIa$..($*:*:B?IIa$..($//2>HHRPEDNN1--;DETZZ'$..*;;D|||#ax""2r*221dnnaKiin
r    )NNr   r5   Ng      ?FF)r<   r=   r>   r?   r%   rU   r;   r@   rA   s   @r   rC   rC   T   s"    , VZ:? 0@
r    rC   )r?   typingr   r+   torch.nnr)   torch.nn.functional
functionalr   helpersr   r   weight_initr   trace_utilsr   intr   Moduler"   rC    r    r   <module>rp      sV          . &  #$s) #8")) 8IRYY Ir    
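

# Minimal usage sketch, not part of the original module: shows expected input/output shapes
# for a hypothetical configuration (dim=256, 16x16 feature map, 4 heads). Because this file
# uses relative imports, run it as a module, e.g. `python -m timm.layers.bottleneck_attn`.
if __name__ == '__main__':
    attn = BottleneckAttn(dim=256, dim_out=256, feat_size=(16, 16), stride=1, num_heads=4)
    x = torch.randn(2, 256, 16, 16)  # (batch, channels, height, width)
    y = attn(x)
    print(y.shape)  # torch.Size([2, 256, 16, 16]); with stride=2 the AvgPool2d halves H and W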