""" Halo Self Attention

Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones`
    - https://arxiv.org/abs/2103.12731

@misc{2103.12731,
Author = {Ashish Vaswani and Prajit Ramachandran and Aravind Srinivas and Niki Parmar and Blake Hechtman and
    Jonathon Shlens},
Title = {Scaling Local Self-Attention for Parameter Efficient Visual Backbones},
Year = {2021},
}

Status:
This impl is a WIP, there is no official ref impl and some details in the paper weren't clear to me.
The attention mechanism works but it's slow as implemented.

Hacked together by / Copyright 2021 Ross Wightman
    )ListN)nn   )make_divisible)trunc_normal_)_assertpermute_maskc                    | j                   \  }}}}|j                   d   }|dz   dz  }| |j                  dd      z  }	|	j                  d||      }	t        j                  |	ddg      j                  d      }
t        j                  |
d||z
  g      }
|
j                  d|dz   |      }
|
ddd||dz
  df   }	|	j                  ||d||      j                  dd|dd      }	|	j                  |      S )a~   Compute relative logits along one dimension

    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

    Args:
        q: (batch, height, width, dim)
        rel_k: (2 * window - 1, dim)
        permute_mask: permute output dim according to this
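
    Example (hypothetical shapes): q of shape (1, 4, 4, 16) with rel_k of shape
    (15, 16) gives rel_size = 15 and win_size = 8, so with
    permute_mask=(0, 1, 3, 2, 4) the output has shape (1, 4, 4, 8, 8), an
    (8, 8) grid of relative logits per (h, w) query position.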
    """
    B, H, W, dim = q.shape
    rel_size = rel_k.shape[0]
    win_size = (rel_size + 1) // 2

    x = (q @ rel_k.transpose(-1, -2))
    x = x.reshape(-1, W, rel_size)

    # pad to shift from relative to absolute indexing
    x_pad = F.pad(x, [0, 1]).flatten(1)
    x_pad = F.pad(x_pad, [0, rel_size - W])

    # reshape and slice out the padded elements
    x_pad = x_pad.reshape(-1, W + 1, rel_size)
    x = x_pad[:, :W, win_size - 1:]

    # reshape and tile
    x = x.reshape(B, H, 1, W, win_size).expand(-1, -1, win_size, -1, -1)
    return x.permute(permute_mask)


class PosEmbedRel(nn.Module):
    """ Relative Position Embedding
    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

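    Taking q as (B, num_blocks, block_size ** 2, dim_head), forward() returns
    relative position logits of shape (B, num_blocks, block_size ** 2, win_size ** 2),
    matching the block-vs-window attention logits they are added to.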
    """
    def __init__(self, block_size, win_size, dim_head, scale):
        """
        Args:
            block_size (int): block size
            win_size (int): neighbourhood window size
            dim_head (int): attention head dim
            scale (float): scale factor (for init)
        """
        super().__init__()
        self.block_size = block_size
        self.dim_head = dim_head
        self.height_rel = nn.Parameter(torch.randn(win_size * 2 - 1, dim_head) * scale)
        self.width_rel = nn.Parameter(torch.randn(win_size * 2 - 1, dim_head) * scale)

    def forward(self, q):
        B, BB, HW, _ = q.shape

        # relative logits in width dimension.
        q = q.reshape(-1, self.block_size, self.block_size, self.dim_head)
        rel_logits_w = rel_logits_1d(q, self.width_rel, permute_mask=(0, 1, 3, 2, 4))

        # relative logits in height dimension.
        q = q.transpose(1, 2)
        rel_logits_h = rel_logits_1d(q, self.height_rel, permute_mask=(0, 3, 1, 4, 2))

        rel_logits = rel_logits_h + rel_logits_w
        rel_logits = rel_logits.reshape(B, BB, HW, -1)
        return rel_logits


class HaloAttn(nn.Module):
    """ Halo Attention

    Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones`
        - https://arxiv.org/abs/2103.12731

    The internal dimensions of the attention module are controlled by the interaction of several arguments.
      * the output dimension of the module is specified by dim_out, which falls back to input dim if not set
      * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim
      * the query and key (qk) dimensions are determined by
        * num_heads * dim_head if dim_head is not None
        * num_heads * (dim_out * qk_ratio // num_heads) if dim_head is None
      * as seen above, qk_ratio determines the ratio of q and k relative to the output if dim_head is not used (see the worked example below)

    Args:
        dim (int): input dimension to the module
        dim_out (int): output dimension of the module, same as dim if not set
        feat_size (Tuple[int, int]): size of input feature_map (not used, for arg compat with bottle/lambda)
        stride: output stride of the module, query downscaled if > 1 (default: 1).
        num_heads: parallel attention heads (default: 8).
        dim_head: dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set
        block_size (int): size of blocks. (default: 8)
        halo_size (int): size of halo overlap. (default: 3)
        qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)
        qkv_bias (bool): add bias to q, k, and v projections
        avg_down (bool): use average pool downsample instead of strided query blocks
        scale_pos_embed (bool): scale the position embedding as well as Q @ K
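
    Example (worked dims, hypothetical values): with dim = dim_out = 256, num_heads = 8,
    qk_ratio = 1.0 and dim_head unset, dim_head_qk = make_divisible(256 * 1.0, divisor=8) // 8 = 32
    and dim_head_v = 256 // 8 = 32, so the q, k and v projections all produce 256 channels.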
    """
    def __init__(
            self, dim, dim_out=None, feat_size=None, stride=1, num_heads=8, dim_head=None,
            block_size=8, halo_size=3, qk_ratio=1.0, qkv_bias=False, avg_down=False, scale_pos_embed=False):
        super().__init__()
        dim_out = dim_out or dim
        assert dim_out % num_heads == 0
        assert stride in (1, 2)
        self.num_heads = num_heads
        self.dim_head_qk = dim_head or make_divisible(dim_out * qk_ratio, divisor=8) // num_heads
        self.dim_head_v = dim_out // self.num_heads
        self.dim_out_qk = num_heads * self.dim_head_qk
        self.dim_out_v = dim_out
        self.scale = self.dim_head_qk ** -0.5
        self.scale_pos_embed = scale_pos_embed
        self.block_size = self.block_size_ds = block_size
        self.halo_size = halo_size
        self.win_size = block_size + halo_size * 2  # neighbourhood window size
        self.block_stride = 1
        use_avg_pool = False
        if stride > 1:
            # downsample the query by striding the q conv, or avg pool the output when
            # avg_down is set or the block size does not divide evenly by the stride
            use_avg_pool = avg_down or block_size % stride != 0
            self.block_stride = 1 if use_avg_pool else stride
            self.block_size_ds = self.block_size // self.block_stride

        self.q = nn.Conv2d(dim, self.dim_out_qk, 1, stride=self.block_stride, bias=qkv_bias)
        self.kv = nn.Conv2d(dim, self.dim_out_qk + self.dim_out_v, 1, bias=qkv_bias)

        self.pos_embed = PosEmbedRel(
            block_size=self.block_size_ds, win_size=self.win_size, dim_head=self.dim_head_qk, scale=self.scale)

        self.pool = nn.AvgPool2d(2, 2) if use_avg_pool else nn.Identity()

        self.reset_parameters()

    def reset_parameters(self):
        std = self.q.weight.shape[1] ** -0.5  # fan-in
        trunc_normal_(self.q.weight, std=std)
        trunc_normal_(self.kv.weight, std=std)
        trunc_normal_(self.pos_embed.height_rel, std=self.scale)
        trunc_normal_(self.pos_embed.width_rel, std=self.scale)

    def forward(self, x):
        B, C, H, W = x.shape
        _assert(H % self.block_size == 0, '')
        _assert(W % self.block_size == 0, '')
        num_h_blocks = H // self.block_size
        num_w_blocks = W // self.block_size
        num_blocks = num_h_blocks * num_w_blocks

        q = self.q(x)
        # unfold the (possibly strided) query into non-overlapping blocks
        q = q.reshape(
            -1, self.dim_head_qk,
            num_h_blocks, self.block_size_ds, num_w_blocks, self.block_size_ds).permute(0, 1, 3, 5, 2, 4)
        q = q.reshape(B * self.num_heads, self.dim_head_qk, -1, num_blocks).transpose(1, 3)
        # B * num_heads, num_blocks, block_size ** 2, dim_head_qk

        kv = self.kv(x)
        # generate overlapping windows (block + halo) for the keys and values via a padded unfold
        kv = F.pad(kv, [self.halo_size, self.halo_size, self.halo_size, self.halo_size])
        kv = kv.unfold(2, self.win_size, self.block_size).unfold(3, self.win_size, self.block_size).reshape(
            B * self.num_heads, self.dim_head_qk + self.dim_head_v, num_blocks, -1).permute(0, 2, 3, 1)
        k, v = torch.split(kv, [self.dim_head_qk, self.dim_head_v], dim=-1)
        # B * num_heads, num_blocks, win_size ** 2, dim_head_qk or dim_head_v

        if self.scale_pos_embed:
            attn = (q @ k.transpose(-1, -2) + self.pos_embed(q)) * self.scale
        else:
            attn = (q @ k.transpose(-1, -2)) * self.scale + self.pos_embed(q)
        # B * num_heads, num_blocks, block_size ** 2, win_size ** 2
        attn = attn.softmax(dim=-1)

        out = (attn @ v).transpose(1, 3)  # B * num_heads, dim_head_v, block_size ** 2, num_blocks
        # fold the blocks back into a 2D feature map
        out = out.reshape(-1, self.block_size_ds, self.block_size_ds, num_h_blocks, num_w_blocks)
        out = out.permute(0, 3, 1, 4, 2).contiguous().view(
            B, self.dim_out_v, H // self.block_stride, W // self.block_stride)
        out = self.pool(out)
        return out
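

if __name__ == '__main__':
    # Minimal usage sketch with hypothetical shapes (run via `python -m`, since the
    # module uses relative imports). H and W must be multiples of block_size, as
    # asserted in forward(); with stride=1 the output keeps the input spatial size.
    attn = HaloAttn(dim=128, dim_out=128, num_heads=8, block_size=8, halo_size=3)
    x = torch.randn(2, 128, 32, 32)
    print(attn(x).shape)  # torch.Size([2, 128, 32, 32])

    # stride=2 downsamples the query, and therefore the output, by 2
    attn_s2 = HaloAttn(dim=128, dim_out=128, num_heads=8, block_size=8, halo_size=3, stride=2)
    print(attn_s2(x).shape)  # torch.Size([2, 128, 16, 16])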