
    iu'                     2   d Z ddlZddlmZ ddlmZmZ ddlZ ej                  e	      Z
ddgZ ed      d	ed
efd       Z G d de      Zej                   j#                  di       	 d+dej$                  dej$                  dej$                  dej$                  dej$                  dededed
eej$                  ej$                  ej$                  f   fd       Zej*                  	 d+dej$                  dej$                  dej$                  dej$                  dej$                  dededed
eej$                  ej$                  ej$                  f   fd       Z	 	 d,dej$                  dej$                  dej$                  dej$                  dej$                  dededededz  d
ej$                  eej$                  ej$                  f   z  fdZdedeedf   ded
dfdZej                   j#                  d i       d!ej$                  dej$                  dej$                  dej$                  d"ej$                  d#ej$                  dej$                  dej$                  dededed$ej$                  d
eej$                  ej$                  ej$                  f   fd%       Zej*                  d!ej$                  dej$                  dej$                  dej$                  d"ej$                  d#ej$                  dej$                  dej$                  dededed$ej$                  d
eej$                  ej$                  ej$                  f   fd&       Zded!ej$                  d'ej$                  d(ej$                  d
eej$                  dz  df   f
d)Zej9                  ee*       y)-z
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuplevarlen_attn
AuxRequest   )maxsizedevice_indexreturnc                      y)z;Cache device capability check to avoid repeated CUDA calls.F )r
   s    S/var/www/html/engine/venv/lib/python3.12/site-packages/torch/nn/attention/varlen.py_should_use_cudnnr      s         c                        e Zd ZU dZdZeed<   y)r   z
    Request which auxiliary outputs to compute from varlen_attn.

    Each field is a boolean indicating whether that auxiliary output should be computed.
    FlseN)__name__
__module____qualname____doc__r   bool__annotations__r   r   r   r   r      s     Cr   ztorch_attn::_varlen_attn)mutates_argsquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalc                    | j                   xr t        | j                  j                        }|rYt        j                  d       t        j                  j                  j                  | ||d||||dd|d      }	|	d   |	d   |	d   }}}
nNt        j                  d	       t        j                  j                  j                  | ||||||d|d

      \  }
}}}}t        j                  dt        j                  | j                        }|
||fS )z
    Private custom op for variable-length attention.

    This is the internal implementation. Users should use the public varlen_attn function instead.
    #Using cuDNN backend for varlen_attnNT        Fr         -Using Flash Attention backend for varlen_attn)return_debug_mask   dtypedevice)is_cudar   r-   indexloginfotorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r   r   r   r   r   r   r    r!   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_s                  r   _varlen_attnr@   $   s   " G"3ELL4F4F"GI6788
 *0F1IvayY@A/4yy~~/V/V# 0W 0
,Y1 ELLJ ;
**r   c                 >   t        j                  |       }| j                  d      }	| j                  d      }
t        j                  |
|	ft         j                  | j
                        }t        j                  dt         j                  | j
                        }|||fS )z
    Fake implementation for meta tensor computation and tracing.

    Based on the 3D varlen path from meta__flash_attention_forward:
    - query shape: (total, num_heads, head_dim)
    - logsumexp shape: (num_heads, total_q)
    r   r%   r+   r)   )r2   
empty_likesizeemptyfloatr-   r8   )r   r   r   r   r   r   r    r!   r;   total_q	num_heads	logsumexpr=   s                r   _varlen_attn_fakerI   ^   s{    & e$F jjmG

1I	GEKKI DU\\JI9i''r   
return_auxc	           
          t         j                  j                  j                  | |||||||      \  }	}
}||j                  r|	|
fS |	S )a9  
    Compute variable-length attention using Flash Attention.
    This function is similar to scaled_dot_product_attention but optimized for
    variable-length sequences using cumulative sequence position tensors.
    Args:
    - query (Tensor): Query tensor; shape :math:`(T_q, H, D)`
    - key (Tensor): Key tensor; shape :math:`(T_k, H, D)`
    - value (Tensor): Value tensor; shape :math:`(T_k, H, D)`
    - cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
    - cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
    - max_q (int): Maximum query sequence length in the batch.
    - max_k (int): Maximum key/value sequence length in the batch.
    - is_causal (bool, optional): If set to True, applies causal masking (default: False).
    - return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.

    Shape legend:
    - :math:`N`: Batch size
    - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
    - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
    - :math:`H`: Number of attention heads
    - :math:`D`: Head dimension

    Returns:
    - Tensor: Output tensor from attention computation
    - If ``return_aux`` is not None and ``return_aux.lse`` is True, returns a tuple of Tensors:
    (output, lse), where lse is the logsumexp

    Example::

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
        >>> head_dim = embed_dim // num_heads
        >>> seq_lengths = []
        >>> for _ in range(batch_size):
        ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
        ...     seq_lengths.append(min(length, max_seq_len))
        >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
        >>> total_tokens = seq_lengths.sum().item()
        >>>
        >>> # Create packed query, key, value tensors
        >>> query = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> key = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> value = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>>
        >>> # Build cumulative sequence tensor
        >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
        >>> cu_seq[1:] = seq_lengths.cumsum(0)
        >>> max_len = seq_lengths.max().item()
        >>>
        >>> # Call varlen_attn
        >>> output = varlen_attn(
        ...     query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False
        ... )
    )r2   r3   
torch_attnr@   r   )r   r   r   r   r   r   r    r!   rJ   outr   r>   s               r   r   r      sR    N ))&&33sE8XueYKCa *..CxJr   ctxinputs.r;   c           
          |\  }}}}}}}	}
|\  }}}| j                  ||||||||       || _        |	| _        |
| _        y N)save_for_backwardr   r    r!   )rN   rO   r;   r   r   r   r   r   r   r    r!   rM   r   r=   s                 r   _setup_contextrS      sU    EKBE3x5% Ci%eXxc9UCICICMr   z!torch_attn::_varlen_attn_backwardgrad_outrM   r   r=   c                    t        j                  d|j                        }|j                  xr t	        |j                  j
                        }|rPt        j                  d       t         j                  j                  j                  | |||||||||	d|
||      \  }}}nOt        j                  d       t         j                  j                  j                  | |||||||||	d|
||      \  }}}|||fS )Nr   )r-   r#   r$   r'   )r2   rD   r-   r.   r   r/   r0   r1   r3   r4   _cudnn_attention_backward_flash_attention_backward)rT   r   r   r   rM   r   r   r   r   r    r!   r=   unusedr9   dqdkdvs                    r   _varlen_attn_backwardr\      s     [[5<<0FG"3ELL4F4F"GI67YY^^==

B" 	@AYY^^==

B  r2:r   c                     t        j                  |      }t        j                  |      }t        j                  |      }|||fS )zF
    Fake implementation for meta tensor computation and tracing.
    )r2   rB   )rT   r   r   r   rM   r   r   r   r   r    r!   r=   
grad_querygrad_key
grad_values                  r   _varlen_attn_backward_fakera     sA    & !!%(J$H!!%(Jx++r   grad_lsegrad_rngc                     | j                   \  }}}}}}	}
}| j                  }| j                  }| j                  }t        j
                  j                  j                  |||||	|
||||||      \  }}}|||d d d d d d f	S rQ   )saved_tensorsr   r    r!   r2   r3   rL   r\   )rN   rT   rb   rc   r   r   r   r   r   rM   r   r=   r   r    r!   rY   rZ   r[   s                     r   	_backwardrf   ,  s     BEARAR>E3x3YIIEIIEI%%;;JBB r2tT4tT99r   )setup_context)F)FN)r   logging	functoolsr   typingr   r   r2   	getLoggerr   r0   __all__intr   r   r   library	custom_opTensortupler@   register_fakerI   r   rS   r\   ra   rf   register_autogradr   r   r   <module>rt      s     "  g!,
' 1C D  
  3"E 6+<<6+	6+ <<6+ ll	6+
 ll6+ 6+ 6+ 6+ 5<<u||346+ F6+r  (<<(	( <<( ll	(
 ll( ( ( ( 5<<u||34( (P $(L<<L	L <<L ll	L
 llL L L L T!L \\E%,,455L^ U38_ c d  <2N5ll5<<5 
5 <<	5
 
5 
5 ll5 ll5 5 5 5 ||5 5<<u||345 O5p $$,ll,<<, 
, <<	,
 
, 
, ll, ll, , , , ||, 5<<u||34, %,2:	::05:HM:
5<<$#$:4   y  Gr   