
    i 5                        d Z ddlmZmZ ddlmZ ddlZddlmc m	Z
 ddlmZmZmZmZ ddlmZ ddlmZmZmZmZ g dZej0                  j3                  e       ej0                  j3                  e       ej0                  j3                  e       ej0                  j3                  e        G d	 d
e      Z G d dej6                        ZdefdZdefdZy)zCDefines bias subclasses that work with scaled_dot_product_attention    )autoIntEnum)warnN)can_use_efficient_attentioncan_use_flash_attentionis_flash_attention_available
SDPAParams)_raise_kernel_warnings)_calculate_scale_input_requires_grad_postprocess_flash_output_validate_sdpa_input)causal_upper_leftcausal_lower_rightCausalVariant
CausalBiasc                   ,    e Zd ZdZ e       Z e       Zy)r   a+  
    Enum for causal variants used in attention mechanisms.

    Defines two types of causal biases:

    ``UPPER_LEFT``: Represents upper-left triangular bias for standard causal attention.
    The equivalent pytorch code for constructing this bias is:

    .. code-block:: python

        torch.tril(torch.ones(size, dtype=torch.bool))

    For instance, with ``shape=(3,4)``, the materialized bias tensor will be:

    .. code-block:: text

        [[1, 0, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 0]]


    ``LOWER_RIGHT``: Represents lower-right triangular bias, the include values are aligned to the lower
    right corner of the matrix.

    The equivalent pytorch code for constructing this bias is:

    .. code-block:: python

        diagonal_offset = size[1] - size[0]
        torch.tril(
            torch.ones(size, dtype=torch.bool),
            diagonal=diagonal_offset,
        )

    For instance, with ``shape=(3,4)``, the materialized bias tensor will be:

    .. code-block:: text

        [[1, 1, 0, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 1]]

    Note that these variants are equivalent to each other when the sequence lengths of the query and key/value
    tensors are equal since the triangular matrix is square.

    .. warning:: This enum is a prototype and subject to change.
    N)__name__
__module____qualname____doc__r   
UPPER_LEFTLOWER_RIGHT     Q/var/www/html/engine/venv/lib/python3.12/site-packages/torch/nn/attention/bias.pyr   r   !   s    .` J&Kr   r   c                       e Zd ZdZdedededdf fdZdej                  dej                  fd	Z
dej                  dej                  fd
Zddej                  dz  dej                  fdZe	 	 	 	 ddej                  dej                  dej                  dd dedededz  dedej                  fd       Zed fd	       ZdefdZ xZS )r   aN  
    A bias representing causal attention patterns. For an overview of the bias structure, see the :class:`CausalVariant` enum.

    This class is used for defining causal (triangular) attention biases. For construing the bias, there exist
    two factory functions: :func:`causal_upper_left` and :func:`causal_lower_right`.

    Example:

    .. code-block:: python

        from torch.nn.attention.bias import causal_lower_right

        bsz, num_heads, seqlen_q, seqlen_kv, head_dim = 32, 8, 4, 12, 8

        # Create a lower-right causal bias
        attn_bias = causal_lower_right(seqlen_q, seqlen_kv)

        q = torch.randn(
            bsz, num_heads, seqlen_q, head_dim, device="cuda", dtype=torch.float16
        )
        k = torch.randn(
            bsz, num_heads, seqlen_kv, head_dim, device="cuda", dtype=torch.float16
        )
        v = torch.randn(
            bsz, num_heads, seqlen_kv, head_dim, device="cuda", dtype=torch.float16
        )

        out = F.scaled_dot_product_attention(q, k, v, attn_bias)

    .. warning:: This class is a prototype and subject to change.
    variant	seq_len_q
seq_len_kvreturnNc                     t        |t              sJ t        |           || _        || _        || _        ||kD  r"|t        j                  k(  rt        dd       yyy)a  
        Initializes the CausalBias instance with a specified variant and sequence lengths.

        Args:
            variant (CausalVariant): The type of causal bias to use (either UPPER_LEFT or LOWER_RIGHT).
            seq_len_q (int): The sequence length of the query tensor.
            seq_len_kv (int): The sequence length of the key/value tensor.

        Raises a warning if the LOWER_RIGHT variant is used with seq_len_q > seq_len_kv, as it may produce NaNs.
        zTLower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!   )
stacklevelN)	
isinstancer   super__init__r   r   r    r   r   )selfr   r   r    	__class__s       r   r'   zCausalBias.__init__w   sa     '=111"$z!g1J1J&Jf 'K!r   devicec                     t        j                  t        j                  | j                  | j                  |t         j
                              S )zUpper left causal biasr*   dtype)torchtrilonesr   r    boolr(   r*   s     r   _upper_leftzCausalBias._upper_left   s1    zzJJt~~tvUZZX
 	
r   c                     | j                   | j                  z
  }t        j                  t        j                  | j                  | j                   |t        j
                        |      S )zLower right causal biasr,   )diagonal)r    r   r.   r/   r0   r1   )r(   r*   diagonal_offsets      r   _lower_rightzCausalBias._lower_right   sK    //DNN:zzJJejj %	
 	
r   c                     |t        j                  d      }| j                  t        j                  k(  r| j                  |      S | j                  t        j                  k(  r| j                  |      S y)a  
        Materializes the causal bias into a tensor form.

        Depending on the variant, this method generates either an upper-left or lower-right
        triangular matrix to represent the causal bias.

        Args:
            device (Optional[torch.device]): The device on which to create the tensor. Defaults to CPU.

        Returns:
            torch.Tensor: The materialized bias tensor.
        Ncpu)r.   r*   r   r   r   r3   r   r7   r2   s     r   _materializezCausalBias._materialize   sb     >\\%(F<<=333##F++\\]666$$V,, 7r   querykeyvalue	attn_mask	dropout_p	is_causalscale
enable_gqac                 z   |rt        d      |j                  |j                  k(  s|j                  t        j
                  k(  rt        j                  | ||d|d||      S |j                  t        j                  k(  r&t        | ||d|||       t        | ||d|||      }t        |      r| j                  j                  dk(  rdnd}	| j                  d      }
t        |
|      }|
|	z  d	k7  }|r|	|
|	z  z
  }t         j"                  j$                  j'                  | d	|f      } t         j"                  j$                  j'                  |d	|f      }t         j"                  j$                  j'                  |d	|f      }t         j(                  j*                  j-                  | |||dd
|      d	   }t/        ||
      S t1        |      rd
}t3        | ||      rd}t         j(                  j*                  j5                  | j7                  dd      |j7                  dd      |j7                  dd      ddddd|t9        |j                        ||d      d	   j7                  dd      S t;        |       t        j                  | |||j=                  | j                        |d
||      S t        d|j                         )a8  
        Handles the logic for computing attention with the specified causal bias.

        Args:
            query (Tensor): Query tensor; shape :math:`(N, ..., L, E)`.
            key (Tensor): Key tensor; shape :math:`(N, ..., S, E)`.
            value (Tensor): Value tensor; shape :math:`(N, ..., S, Ev)`.
            attn_mask (CausalBias): The type of causal attention to apply.
                A boolean mask where a value of True indicates that the element *should* take part in attention.
                A float mask of the same type as query, key, value that is added to the attention score.
            dropout_p (float): Dropout probability; if greater than 0.0, dropout is applied
            is_causal (bool): If true, assumes upper left causal attention masking and errors if both attn_mask and is_causal
                are set.
            scale (optional float): Scaling factor applied prior to softmax. If None, the default value is set
                to :math:`\frac{1}{\sqrt{E}}`.
            enable_gqa (optional bool): If set to True, Grouped Query Attention (GQA) is enabled, by default it is set to False.

        Returns:
            output (Tensor): Attention output; shape :math:`(N, ..., L, Ev)`.

        Raises:
            ValueError: If the causal bias variant is not a CausalVariant type.

        z.CausalBias should not be used with causal=TrueNT)r>   r?   r@   rA   rB   xpu@      r   F)r@   return_debug_maskrA      r#   )
biascu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr?   custom_mask_typecompute_log_sumexprA   seqlen_kz<CausalBias.variant must be a CausalVariant type, but found: )
ValueErrorr   r    r   r   r   Fscaled_dot_product_attentionr   r   r	   r   r*   typesizer   r.   nn
functionalpadopsaten#_scaled_dot_product_flash_attentionr   r   r   _efficient_attention_forward	transposeintr
   r:   )r;   r<   r=   r>   r?   r@   rA   rB   sdpa_params	alignmentog_head_sizeog_scaleneeds_paddingpad_lenoutrP   s                   r   	_dispatchzCausalBias._dispatch   s   F MNN 9#7#77  M$<$<<11#%	 	 -";";; UD)YPUV$sE4IzK '{3"',,"3"3u"<B!	$zz"~+L%@ ,y 8A = '<)+CDG!HH//33EAw<HE((--11#7|DC!HH//33EAw<HEiinnHH"&+" I   1lCC*;7%*"'sE:)-&yy~~BBOOAq)MM!Q'OOAq)!%!%!%!%'%():):%;'9! C   Yq!_%  '{355'44U\\B'#)	 	 NyO`O`Nab r   c                     |i }|t         j                  j                  j                  u r | j                  |i |S t
        |   ||||      S )zjDefines the behavior of torch.nn.functional.scaled_dot_product_attention when the attn_bias is an AttnBias)r.   rW   rX   rT   rg   r&   __torch_function__)clsfunctypesargskwargsr)   s        r   ri   zCausalBias.__torch_function__$  sS     >F588&&CCC 3==$1&11w)$tVDDr   c                 >    | j                         j                         S N)r:   __repr__)r(   s    r   rq   zCausalBias.__repr__-  s      "++--r   rp   )g        FNF)r   N)r   r   r   r   r   r_   r'   r.   r*   Tensorr3   r7   r:   staticmethodfloatr1   rg   classmethodri   strrq   __classcell__)r)   s   @r   r   r   V   s;   @ # 3 SW ,
%,, 
5<< 

5<< 
ELL 
-5<<$#6 -%,, -(  " o||o\\o ||o  	o
 o o t|o o 
o ob E E.# .r   r   r!   c                  l    t        |       dk(  sJ d       | \  }}t        t        j                  ||      S )a&  
    Creates an upper-left triangular causal bias.

    This function generates a upper-left triangular matrix to represent causal attention bias with a
    diagonal offset set so that the inclusive values are aligned to the upper left corner of the matrix.
    This equivalent to the `is_causal=True` argument in `scaled_dot_product_attention`.

    The equivalent pytorch code for constructing this bias is:

    .. code-block:: python

        torch.tril(torch.ones(size, dtype=torch.bool))

    For instance, with `shape=(3,4)`, the materialized bias tensor will be:

    .. code-block:: text

        [[1, 0, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 0]]

    Args:
        size: The size of the bias matrix.

    Returns:
        CausalBias: The UPPER_LEFT triangular causal bias variant.
    r#   z*causal_upper_left only supports 2D tensors)lenr   r   r   rV   r   r    s      r   r   r   1  s9    8 t9>GGG> Izm..	:FFr   c                  l    t        |       dk(  sJ d       | \  }}t        t        j                  ||      S )a:  
    Creates a lower-right triangular causal bias.

    This function generates a lower-right triangular matrix to represent causal attention bias with a
    diagonal offset set so that the inclusive values are aligned to the lower right corner of the matrix.

    The equivalent pytorch code for constructing this bias is:

    .. code-block:: python

        diagonal_offset = size[1] - size[0]
        torch.tril(
            torch.ones(size, dtype=torch.bool),
            diagonal=diagonal_offset,
        )

    For instance, with `shape=(3,4)`, the materialized bias tensor will be:

    .. code-block:: text

        [[1, 1, 0, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 1]]

    Args:
        size: The size of the bias matrix.

    Returns:
        CausalBias: The LOWER_RIGHT triangular causal bias variant.
    r#   z+causal_lower_right only supports 2D tensors)ry   r   r   r   rz   s      r   r   r   R  s9    > t9>HHH> Izm//JGGr   )r   enumr   r   warningsr   r.   torch.nn.functionalrW   rX   rS   torch.backends.cudar   r   r   r	   torch.nn.attentionr
   torch.nn.attention._utilsr   r   r   r   __all___dynamoallow_in_graphr   rr   r   r   r   r   r   r   <module>r      s    I       6  U   9 :   4 5   8 9   Z (2G 2jX. X.vG
 GB!H !Hr   