
    i.                    D   U d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ dd	lmZ e
rdd
lmZ ddlZddlmZ dgZdaded<   e G d d             Zed%d       Z	 d&	 	 	 d'dZed(d       Zd)dZ	 d*	 	 	 	 	 	 	 	 	 d+dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d,dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d-dZ ed      Z d.dZ!	 d/	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d0dZ"	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d1dZ#ddddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2dZ$dddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d3dZ%	 	 	 d4dd 	 	 	 	 	 	 	 	 	 	 	 	 	 d5d!Z&dd 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d6d"Z' ejP                  d#e$       y)7zUBER PROTOTYPE!!!    )annotationsN)	dataclass)cache)AnyTYPE_CHECKING)TypeVarTupleUnpack   )	_registry)
ModuleType)Libraryregister_flash_attention_fa4
str | None_FA4_MODULE_PATHc                       e Zd ZU ded<   ddZy)
_FA4HandlezLibrary | Nonelibraryc                    d | _         y N)r   )selfs    Q/var/www/html/engine/venv/lib/python3.12/site-packages/torch/nn/attention/_fa4.pyremovez_FA4Handle.remove"   s	        N)returnNone)__name__
__module____qualname____annotations__r    r   r   r   r      s    r   r   c                J    t         j                  j                  |       \  }}|S r   )torchcudaget_device_capability)devicemajor_s      r   _get_device_majorr(   &   s    zz//7HE1Lr   c                B    t        |       }| at        t                     S )z
    Register FA4 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA4 implementation.
    )_fa4_import_moduler   r   _fa4_register_kernels)module_pathr'   s     r   r   r   ,   s#     	;'A"+-..r   c                ~    t        j                  |       }t        |d      rt        |d      st        d|  d      |S )N_flash_attn_fwd_flash_attn_bwdzModule 'z' does not expose FA4 kernels)	importlibimport_modulehasattrRuntimeError)r,   modules     r   r*   r*   ;   sA    $$[1F6,-WVEV5WXk]2OPQQMr   c                     t        ddd      } | j                  dt        d       | j                  dt        d       | j                  dt        d       | j                  dt
        d       | S )NatenIMPLCUDA_flash_attention_forward_flash_attention_backward#_scaled_dot_product_flash_attention,_scaled_dot_product_flash_attention_backward)r   impl!_fa4_flash_attention_forward_impl"_fa4_flash_attention_backward_impl4_fa4_scaled_dot_product_flash_attention_forward_impl5_fa4_scaled_dot_product_flash_attention_backward_impl)libs    r   r+   r+   C   sg    
&&&
)CHH')JFSHH(*LfUHH-<
 HH6=
 Jr   c                   t        d |D              syt        |D ch c]  }|j                   c}      dk7  ry| j                  t        j
                  t        j                  fvry|D ])  \  }}|j                  t        j                  k7  s$| dc S  || j                         dk7  ry|| j                         d	k7  ry
t        j                  j                         syt        | j                        dvryy c c}w )Nc              3  4   K   | ]  }|j                     y wr   )is_cuda.0ts     r   	<genexpr>z,_fa4_common_support_error.<locals>.<genexpr>Z   s     *Qqyy*s   zinputs must be CUDA tensorsr
   inputs must share devicez'query dtype must be float16 or bfloat16z dtype must be float32   zdense query must be 4D   zragged query must be 3DzCUDA not available)	   
   z+FA4 requires compute capability 9.0 or 10.0)alllenr%   dtyper"   float16bfloat16float32dimr#   is_availabler(   )querytensors	cum_seq_qrequire_fp32rH   nametensors          r   _fa4_common_support_errorr]   T   s     *'**,
g&AHH&'1,){{5==%..998$ 3f<<5==(V1223 UYY[A-'!1(::""$#&g5< 's   C:c                    |dk7  ry|ry|y|+|j                   t        j                  k7  ry|j                  syt	        | | ||f|      }||dk(  ry|S y )	N        dropout_p must be 0zreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDArJ   z(query, key, value must be on same device)rQ   r"   int32rE   r]   )	rW   keyvalue	dropout_preturn_debug_maskalibi_slopes	seqused_krY   errors	            r   _fa4_forward_support_errorri   n   sy     C$0+??ekk),  +%	UE
 ..=r   c
           	     R    |dk7  ry||	yt        || |||||f|d|ff      }
|
|
S y )Nr_   r`   z windowed attention not supported	logsumexp)rZ   )r]   )grad_outrW   rb   rc   outrk   rd   rY   window_size_leftwindow_size_rightrh   s              r   _fa4_backward_support_errorrp      sY     C$#'8'D1%	5#uc95"I.0	E r   Tsc                 &    t        d | D              S )Nc              3  @   K   | ]  }|j                  d d        yw)r
      N)	transposerF   s     r   rI   z#_transpose_dense.<locals>.<genexpr>   s     4qQ"4s   )tuple)rX   s    r   _transpose_denserw      s    4G444r   c           	         t         t        d      t        t               }||||d|||	|	j                         nd d}|
|
|d<    |j                  | ||fi |\  }
}|
|j                         fS )NFA4 not registeredT)softmax_scalecausalrn   ro   
return_lsecu_seqlens_qcu_seqlens_krg   rm   )r   r3   r*   
contiguousr.   )rW   rb   rc   cu_seq_qcu_seq_kscale	is_causalrn   ro   rg   rm   r4   kwargslses                 r   _fa4_run_forwardr      s     /00 01F ,.  /8/DY))+$	F u%v%%eS%B6BHC   r   c
                    t         t        d      t        t               }
|
j                  ||||| |j	                         ||	||
      \  }}}|||fS )Nry   )rz   r{   r}   r~   )r   r3   r*   r/   r   )rl   rW   rb   rc   rm   rk   r   r   r   r   r4   dqdkdvs                 r   _fa4_run_backwardr      sq     /00 01F'' ( JBB r2:r   )r   rn   ro   rg   rf   rm   c
                  t        | ||||	|||      }|t        d|       t        | |||||
|||||      \  }}t        j                  dt        j
                  | j                        }t        j                  dt        j
                  | j                        }t        j                  d| j                  | j                        }|||||fS )Nz)FA4 flash_attention forward unsupported: )rt   )rQ   r%   r    r   )	ri   r3   r   r"   zerosuint64r%   emptyrQ   )rW   rb   rc   rY   	cum_seq_kmax_qmax_krd   r   re   r   rn   ro   rg   rf   rm   rh   r   	rng_statephilox_offset
debug_masks                        r   r>   r>      s    & '	E FugNOOHC DU\\JIKK%,,u||LMQekk%,,GJYz99r   )r   rn   ro   c                   t        | ||||||
|||
      }|t        d|       t        | |||||||||
      \  }}}|||fS )Nz*FA4 flash_attention backward unsupported: )rp   r3   r   )rl   rW   rb   rc   rm   rk   rY   r   r   r   rd   r   r   unusedr   rn   ro   rh   r   r   r   s                        r   r?   r?   "  s    ( (E GwOPP"JBB r2:r   r   c                  t        | ||||d d d       }|t        d|       t        | ||      \  }}	}
t        j                  |       }|j                  dd      }|j                  d      }|	j                  d      }t        ||	|
d d |||||||      \  }}}}}| j                  d      }|j                  d      }||d d |||||f	S )NzFA4 SDPA forward unsupported: r
   rt   )r   rm   )ri   r3   rw   r"   
empty_likeru   sizer>   )rW   rb   rc   rd   r   re   r   rh   qkvout_bhsdout_bshdmax_q_flashmax_k_flashr'   r   r   r   r   r   r   s                         r   r@   r@   S  s    '	E ;E7CDDuc51GAq!
 &H!!!Q'H&&)K&&)K3T			40AsI}j JJqMEHHQKE
 
r   c                   t        | ||||||
d d d 
      }|t        d|       t        |||||       \  }}}}}|j                  d      }|j                  d      }	t	        ||||||d d ||	|
||||      \  }}}t        |||      \  }}}|||fS )NzFA4 SDPA backward unsupported: rt   r   )rp   r3   rw   r   r?   )rl   rW   rb   rc   rm   rk   rY   r   r   r   rd   r   philox_seedr   r   rh   r   r   r   ogor   r   r   s                           r   rA   rA     s    $ (E <UGDEE%eS%hGNAq!QJJqMEHHQKE3
				JBB" ""b"-JBBr2:r   FA4)register_fn)r%   ztorch.devicer   int)zflash_attn.cute.interface)r,   strr   r   )r,   r   r   r   )r   r   )r    )
rW   torch.TensorrX   ztuple[torch.Tensor, ...]rY   torch.Tensor | NonerZ   z$tuple[tuple[str, torch.Tensor], ...]r   r   )rW   r   rb   r   rc   r   rd   floatre   boolrf   r   rg   r   rY   r   r   r   )rl   r   rW   r   rb   r   rc   r   rm   r   rk   r   rd   r   rY   r   rn   
int | Nonero   r   r   r   )rX   z
Unpack[Ts]r   ztuple[Unpack[Ts]]r   )rW   r   rb   r   rc   r   r   r   r   r   r   float | Noner   r   rn   r   ro   r   rg   r   rm   r   r   z!tuple[torch.Tensor, torch.Tensor])rl   r   rW   r   rb   r   rc   r   rm   r   rk   r   r   r   r   r   r   r   r   r   r   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor]) rW   r   rb   r   rc   r   rY   r   r   r   r   r   r   r   rd   r   r   r   re   r   r   r   rn   r   ro   r   rg   r   rf   r   rm   r   )"rl   r   rW   r   rb   r   rc   r   rm   r   rk   r   rY   r   r   r   r   r   r   r   rd   r   r   r   r   r   r   r   r   r   rn   r   ro   r   )r_   FF)rW   r   rb   r   rc   r   rd   r   r   r   re   r   r   r   )rl   r   rW   r   rb   r   rc   r   rm   r   rk   r   rY   r   r   r   r   r   r   r   rd   r   r   r   r   r   r   r   r   r   ))__doc__
__future__r   r0   dataclassesr   	functoolsr   typingr   r   typing_extensionsr   r	    r   typesr   r"   torch.libraryr   __all__r   r   r   r(   r   r*   r+   r]   ri   rp   rq   rw   r   r   r>   r?   r@   rA   register_flash_attention_implr    r   r   <module>r      s%    #  !  % 2     ! #
  $ * #      3///  * :<	% # 7	
 4	  	
  & # # B 
 	
 
   # ! " 6 $5  $!!	! ! "	!
 "! ! ! !! "! #! 
! '!B 
 	
 
  " "   5T #'$(%)(,##/:/:	/: /: #	/:
 #/: /: /: /: /: /: /: !/: "/: #/:  &!/:" 
#/:D #'$(%... 
. 	.
 
. . #. #. . . . . . .  !." !#.$ "%.j #: ::	: : 	:
 : : :Z !555 
5 	5
 
5 5 #5 #5 5 5 5 5 5  5  !5p (	 ' ';W Xr   