
from typing import Final, Optional, Type

import torch
from torch import nn
from torch.nn import functional as F

from .config import use_fused_attn
from .pos_embed_sincos import apply_rot_embed_cat


def maybe_add_mask(scores: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
    return scores if attn_mask is None else scores + attn_mask


class Attention(nn.Module):
    """Standard Multi-head Self Attention module with QKV projection.

    This module implements the standard multi-head attention mechanism used in transformers.
    It supports both the fused attention implementation (scaled_dot_product_attention) for
    efficiency when available, and a manual implementation otherwise. The module includes
    options for QK normalization, attention dropout, and projection dropout.
    """
    fused_attn: Final[bool]

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            qkv_bias: bool = False,
            qk_norm: bool = False,
            scale_norm: bool = False,
            proj_bias: bool = True,
            attn_drop: float = 0.,
            proj_drop: float = 0.,
            norm_layer: Optional[Type[nn.Module]] = None,
    ) -> None:
        """Initialize the Attention module.

        Args:
            dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to use bias in the query, key, value projections
            qk_norm: Whether to apply normalization to query and key vectors
            scale_norm: Whether to apply normalization to the attention output
            proj_bias: Whether to use bias in the output projection
            attn_drop: Dropout rate applied to the attention weights
            proj_drop: Dropout rate applied after the output projection
            norm_layer: Normalization layer constructor for QK normalization if enabled
        """
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        if qk_norm or scale_norm:
            assert norm_layer is not None, 'norm_layer must be provided if qk_norm or scale_norm is True'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.fused_attn = use_fused_attn()

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.norm = norm_layer(dim) if scale_norm else nn.Identity()
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(
            self,
            x: torch.Tensor,
            attn_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=attn_mask,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
        else:
            # Manual attention path: scale queries, add optional mask, softmax, dropout.
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = maybe_add_mask(attn, attn_mask)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.norm(x)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
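

# Illustrative usage sketch (hypothetical helper, not part of the timm API): construct Attention
# with ViT-Base style values (assumed for the example) and run it on a random (batch, tokens, dim)
# sequence. Kept in a function so importing this module stays side-effect free.
def _example_attention_usage() -> torch.Tensor:
    """Minimal Attention usage sketch; shapes and hyper-parameters are illustrative."""
    attn = Attention(dim=768, num_heads=12, qkv_bias=True)
    x = torch.randn(2, 197, 768)  # (batch, tokens, dim), e.g. a ViT-B/16 style token sequence
    return attn(x)  # output keeps the (2, 197, 768) shape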
    e Zd ZU dZej
                  j                  e   ed<   	 	 	 	 	 	 	 	 	 	 dde	de	dedede	de
d	e
d
ee	   deej                     dedef fdZ	 	 ddeej                      deej                      fdZ xZS )AttentionRopez A Self Attention module with ROPE support.

    Includes options for:
     * QK normalization option
     * Attention output (scale) normalization
     * Fused or unfused QKV projection support
    r   r   r   r   	qkv_fusednum_prefix_tokensr   r   attn_head_dimr   r   r   c                 V   t         |           |s|
r	|	J d       || _        ||z  }||}|| j                  z  }|dz  | _        || _        t               | _        |r6t        j                  ||dz  |      | _	        dx| _
        x| _        | _        n^d| _	        t        j                  |||      | _
        t        j                  |||      | _        t        j                  |||      | _        |
r |	|      nt        j                         | _        |
r |	|      nt        j                         | _        t        j                   |      | _        |r |	|      nt        j                         | _        t        j                  ||      | _        t        j                   |      | _        y)a  Initialize the Attention module.

        Args:
            dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to add a bias term to the query, key, and value projections
            num_prefix_tokens: Number of reg/cls tokens at the beginning of the sequence that
                should not have position embeddings applied
            attn_drop: Dropout rate for attention weights
            proj_drop: Dropout rate for the output projection
            attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
            norm_layer: Normalization layer constructor to use for QK and scale normalization
            qk_norm: Enable normalization of query (Q) and key (K) vectors with norm_layer
            scale_norm: Enable normalization (scaling) of attention output with norm_layer
        Nr!   r"   r#   r$   )r&   r'   r   r)   ra   r	   r   r   r*   r+   q_projk_projv_projr,   r-   r.   r/   r   r0   r1   r   )r2   r   r   r   r`   ra   r   r   rb   r   r   r   r(   attn_dimr3   s                 r   r'   zAttentionRope.__init__k   sV   : 	)i+ii)")#$$Hdnn,%
!2(*yyhlBDH6::DK:$+DH))CADK))CADK))CADK.5j*2;;=.5j*2;;=I.,6Jx(BKKM	IIh,	I.r   roper   c                    |j                   \  }}}| j                  Y| j                  |      }|j                  ||d| j                  d      j	                  ddddd      }|j                  d      \  }}	}
n| j                  |      j                  ||| j                  d      j                  dd      }| j                  |      j                  ||| j                  d      j                  dd      }	| j                  |      j                  ||| j                  d      j                  dd      }
| j                  |      | j                  |	      }	}|| j                  }t        j                  |ddddd|ddf   t        |dddd|dddf   |      gd      j!                  |
      }t        j                  |	ddddd|ddf   t        |	dddd|dddf   |      gd      j!                  |
      }	| j"                  r=t%        j&                  ||	|
|| j(                  r| j*                  j,                  nd	
      }nX|| j.                  z  }||	j                  dd      z  }t1        ||      }|j3                  d      }| j+                  |      }||
z  }|j                  dd      j                  |||      }| j5                  |      }| j7                  |      }| j9                  |      }|S )a  Forward pass for the attention module.

        Args:
            x: Input tensor of shape (batch_size, sequence_length, embedding_dim)
            rope: Rotary position embeddings tensor for position-aware attention
            attn_mask: Optional attention mask to apply during attention computation

        Returns:
            Tensor of shape (batch_size, sequence_length, embedding_dim)
        Nr#   r<   r6   r   r   r7   r=   r8   r9   r;   )r>   r+   r?   r   r@   rA   rd   rF   re   rf   r-   r.   ra   rZ   catr
   type_asr   rB   rC   rD   r   rE   r)   r   rG   r0   r1   r   )r2   r4   rh   r   rH   rI   rJ   r+   rK   rL   rM   nptrN   s                r   rO   zAttentionRope.forward   s     ''1a88((1+C++aAt~~r:BB1aAqQCjjmGAq!A&&q!T^^R@JJ1aPAA&&q!T^^R@JJ1aPAA&&q!T^^R@JJ1aPA{{1~t{{1~1((C		1Q4C4]+-@1aq=AQSW-XY_`aiijklA		1Q4C4]+-@1aq=AQSW-XY_`aiijklA??..1a#.2mm$..**A DJJAB++D!$	2D<<B<'D>>$'DqAKK1%%aA.IIaLIIaLNN1r   )
rP   TTr   r8   r8   NNFF)NN)rQ   rR   rS   rT   rZ   jitr   rU   rV   rW   rX   r   r   r   rY   r'   r[   rO   r\   r]   s   @r   r_   r_   a   s     		%%
 !"%&!!+/*.!$7/7/ 7/ 	7/
 7/  #7/ 7/ 7/ $C=7/ RYY7/ 7/ 7/x ,004	5 5<<(5  -	5r   r_   r   )typingr   r   r   rZ   r   torch.nnr   rB   configr	   pos_embed_sincosr
   r[   r   rY   r   r_   r   r   r   <module>rr      s_    ( (   $ " 1?5<< ?HU\\4J ?O		 OdxBII xr   
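

# Illustrative usage sketch (hypothetical helper, not part of the timm API): AttentionRope takes
# rotary embeddings in whatever layout apply_rot_embed_cat consumes (in timm these are typically
# produced by a rotary embedding module from pos_embed_sincos); passing rope=None, as below, skips
# the rotation entirely, and the first num_prefix_tokens tokens are never rotated in any case.
def _example_attention_rope_usage() -> torch.Tensor:
    """Minimal AttentionRope usage sketch without rotary embeddings; values are illustrative."""
    attn = AttentionRope(dim=768, num_heads=12, num_prefix_tokens=1)
    x = torch.randn(2, 197, 768)  # (batch, tokens, dim); token 0 is a cls/prefix token
    return attn(x, rope=None)  # with rope=None the q/k vectors are left unrotated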