
【dcu】dcu cutlass fa #71337

Merged
merged 1 commit into from Mar 2, 2025
57 changes: 18 additions & 39 deletions cmake/external/flashattn.cmake
@@ -32,51 +32,30 @@ if(WITH_ROCM)
set(FLASHATTN_LIBRARIES
"${FLASHATTN_INSTALL_DIR}/lib/libflashattn${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "flash-attn Library" FORCE)

set(FLASHATTN_C_FLAGS ${CMAKE_C_FLAGS})
set(FLASHATTN_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
set(FLASHATTN_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
set(FLASHATTN_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -w -Wno-deprecated-builtins -Wno-deprecated -DNDEBUG -U__HIP_NO_HALF_OPERATORS__ -U__HIP_NO_HALF_CONVERSIONS__ -fPIC -O3 -std=c++17 -D__HIP_PLATFORM_HCC__=1 --offload-arch=gfx928 -D__gfx940__ -mllvm -enable-num-vgprs-512=true"
)
set(FLASHATTN_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
set(FLASHATTN_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
set(FA_BUILD_DIR "${FLASHATTN_PREFIX_DIR}/src/extern_flashattn-build/")

ExternalProject_Add(
extern_flashattn
GIT_REPOSITORY ${FA_REPOSITORY}
GIT_TAG ${FA_TAG}
SOURCE_DIR ${SOURCE_DIR}
PREFIX ${FLASHATTN_PREFIX_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ""
#BUILD_ALWAYS 1
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${ROCM_PATH}/bin/hipcc
-DAMDGPU_TARGETS=gfx928
-DCMAKE_CXX_COMPILER_LAUNCHER=${CCACHE_PATH}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${FLASHATTN_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${FLASHATTN_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${FLASHATTN_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${FLASHATTN_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${FLASHATTN_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${FLASHATTN_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${FLASHATTN_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU}
-DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}
-DWITH_ROCM=${WITH_ROCM}
-DWITH_OMP=${USE_OMP}
-DBUILD_SHARED=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_JOB_POOL_COMPILE:STRING=compile
-DCMAKE_JOB_POOLS:STRING=compile=4
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${FLASHATTN_INSTALL_DIR}
BUILD_BYPRODUCTS ${FLASHATTN_LIBRARIES})
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
LOG_DOWNLOAD ON)

add_custom_command(
TARGET extern_flashattn
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${FLASHATTN_INCLUDE_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_if_different "${SOURCE_DIR}/flash_attn.h"
${FLASHATTN_INCLUDE_DIR}/
COMMAND ${CMAKE_COMMAND} -E make_directory ${FA_BUILD_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${SOURCE_DIR}/libflashattn.so" ${FA_BUILD_DIR}/
COMMAND ${CMAKE_COMMAND} -E make_directory ${FLASHATTN_LIB_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${SOURCE_DIR}/libflashattn.so" ${FLASHATTN_LIB_DIR}/)
else()

add_definitions(-DPADDLE_WITH_FLASHATTN)
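For context, this hunk switches the ROCm flash-attn dependency from compiling the library with hipcc to consuming a prebuilt libflashattn.so: ExternalProject_Add is kept only to fetch the repository (its configure, build, and install steps are emptied out), and a POST_BUILD custom command copies the shipped shared library and header into the install layout. Below is a minimal, self-contained sketch of that pattern; the target names, paths, and repository URL are placeholders, not the values used by Paddle.

```cmake
# Sketch only: fetch a repo that already ships a prebuilt shared library,
# skip configure/build/install, and copy the binary into place afterwards.
include(ExternalProject)

set(DEMO_PREFIX_DIR  "${CMAKE_BINARY_DIR}/third_party/demo_fa")        # hypothetical paths
set(DEMO_SOURCE_DIR  "${DEMO_PREFIX_DIR}/src/extern_demo_fa")
set(DEMO_INSTALL_DIR "${CMAKE_BINARY_DIR}/third_party/install/demo_fa")

ExternalProject_Add(
  extern_demo_fa
  GIT_REPOSITORY    "https://example.com/vendor/prebuilt-flashattn.git"  # placeholder URL
  GIT_TAG           "main"
  SOURCE_DIR        ${DEMO_SOURCE_DIR}
  PREFIX            ${DEMO_PREFIX_DIR}
  CONFIGURE_COMMAND ""   # nothing to configure: the .so is already built
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  LOG_DOWNLOAD      ON)

# After the (empty) build step, stage the prebuilt artifacts where the
# rest of the build expects to find them.
add_custom_command(
  TARGET extern_demo_fa POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E make_directory ${DEMO_INSTALL_DIR}/lib
  COMMAND ${CMAKE_COMMAND} -E copy_if_different
          "${DEMO_SOURCE_DIR}/libflashattn.so" ${DEMO_INSTALL_DIR}/lib/)
```

The trade-off is that the main build no longer needs the flash-attn compile flags (the gfx928 offload options dropped above), but the prebuilt binary presumably has to match the targeted DCU architecture.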
161 changes: 31 additions & 130 deletions paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
@@ -257,12 +257,8 @@ void FlashAttnUnpaddedGradBaseKernel(
kdq = &dq_tmp;
}

#ifdef PADDLE_WITH_HIP
std::initializer_list<int64_t> dk_dv_shape = {total_k, num_heads, head_size};
#else
std::initializer_list<int64_t> dk_dv_shape = {
total_k, num_heads_k, num_heads / num_heads_k, head_size};
#endif

DenseTensor *kdk = dk, *kdv = dv;
DenseTensor dk_tmp;
@@ -313,43 +309,6 @@ void FlashAttnUnpaddedGradBaseKernel(

VLOG(10) << "FlashAttn bwd seed: " << params.seed
<< ", offset: " << params.offset;
#ifdef PADDLE_WITH_HIP
bool succ = phi::dynload::flash_attn_varlen_bwd(
dout.data(),
q.data(),
k.data(),
v.data(),
out.data(),
params.softmax_d.data(),
softmax_lse.data(),
cu_seqlens_q.data<int32_t>(),
cu_seqlens_k.data<int32_t>(),
params.rng_state.data(),
kdq->data(),
kdk->data(),
kdv->data(),
params.dq_accum.data(),
params.batch_size,
params.max_seqlen_q,
params.max_seqlen_k,
params.seqlen_q_rounded,
params.seqlen_k_rounded,
params.num_heads,
params.num_heads_k,
params.head_size,
params.head_size_rounded,
params.dropout,
params.softmax_scale,
1.0f / params.softmax_scale,
params.causal,
params.is_bf16,
num_splits,
stream,
params.seed,
params.offset,
params.attn_mask_tensor ? params.attn_mask_tensor->data() : nullptr,
params.attn_mask_tensor ? params.mask_dims.data() : nullptr);
#else
bool succ = phi::dynload::flash_attn_varlen_bwd(
dout.data(),
q.data(),
@@ -410,56 +369,19 @@ void FlashAttnUnpaddedGradBaseKernel(
max_seqlen_k * kdv->strides()[0],
max_seqlen_q * dout.strides()[0],
varlen_padded);
#endif
CheckFlashAttnStatus(succ);
if (!is_mha) {
if (dk) {
#ifdef PADDLE_WITH_HIP
if (dk->meta().is_contiguous())
phi::SumKernel<T, Context>(
ctx,
dk_tmp.Resize(
{total_k, num_heads_k, num_heads / num_heads_k, head_size}),
{2},
dk->type(),
false,
dk);
else
kvReduceForGQA<T, Context>(
ctx,
dk_tmp.Resize(
{total_k, num_heads_k, num_heads / num_heads_k, head_size}),
dk);
#else
if (dk->meta().is_contiguous())
phi::SumKernel<T, Context>(ctx, dk_tmp, {2}, dk->type(), false, dk);
else
kvReduceForGQA<T, Context>(ctx, dk_tmp, dk);
#endif
}
if (dv) {
#ifdef PADDLE_WITH_HIP
if (dv->meta().is_contiguous())
phi::SumKernel<T, Context>(
ctx,
dv_tmp.Resize(
{total_k, num_heads_k, num_heads / num_heads_k, head_size}),
{2},
dv->type(),
false,
dv);
else
kvReduceForGQA<T, Context>(
ctx,
dv_tmp.Resize(
{total_k, num_heads_k, num_heads / num_heads_k, head_size}),
dv);
#else
if (dv->meta().is_contiguous())
phi::SumKernel<T, Context>(ctx, dv_tmp, {2}, dv->type(), false, dv);
else
kvReduceForGQA<T, Context>(ctx, dv_tmp, dv);
#endif
}
}
#else
@@ -655,13 +577,8 @@ void FlashAttnGradBaseKernel(

bool is_mha = (num_heads == num_heads_k);

#ifdef PADDLE_WITH_HIP
std::initializer_list<int64_t> dk_dv_shape = {
batch_size, seqlen_k, num_heads, head_size};
#else
std::initializer_list<int64_t> dk_dv_shape = {
batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size};
#endif

DenseTensor* kdq = dq;
DenseTensor dq_tmp;
@@ -822,7 +739,37 @@ void FlashAttnGradBaseKernel(
params.seed,
params.offset,
params.attn_mask_tensor ? params.attn_mask_tensor->data() : nullptr,
params.attn_mask_tensor ? params.mask_dims.data() : nullptr);
params.attn_mask_tensor ? params.mask_dims.data() : nullptr,
is_flashmask ? downstart_row_indices_data : nullptr,
is_flashmask ? params.startend_row_indices_dims.data() : nullptr,
is_flashmask ? upend_row_indices_data : nullptr,
is_flashmask ? downend_row_indices_data : nullptr,
is_flashmask ? upstart_row_indices_data : nullptr,
is_flashmask ? flashmask_maxmin.data() : nullptr,
q.strides()[1],
k.strides()[1],
v.strides()[1],
q.strides()[2],
k.strides()[2],
v.strides()[2],
out.strides()[1],
out.strides()[2],
q.strides()[0],
k.strides()[0],
v.strides()[0],
out.strides()[0],
kdq->strides()[1],
kdk->strides()[1],
kdv->strides()[1],
kdq->strides()[2],
kdk->strides()[kdk->strides().size() - 2],
kdv->strides()[kdv->strides().size() - 2],
dout.strides()[1],
dout.strides()[2],
kdq->strides()[0],
kdk->strides()[0],
kdv->strides()[0],
dout.strides()[0]);
#else
bool succ;
int arch =
@@ -978,63 +925,17 @@ void FlashAttnGradBaseKernel(
CheckFlashAttnStatus(succ);
if (!is_mha) {
if (dk) {
#ifdef PADDLE_WITH_HIP
if (dk->meta().is_contiguous())
phi::SumKernel<T, Context>(ctx,
dk_tmp.Resize({batch_size,
seqlen_k,
num_heads_k,
num_heads / num_heads_k,
head_size}),
{3},
dk->type(),
false,
dk);
else
kvReduceBatchedForGQA<T, Context>(
ctx,
dk_tmp.Resize({batch_size,
seqlen_k,
num_heads_k,
num_heads / num_heads_k,
head_size}),
dk);
#else
if (dk->meta().is_contiguous())
phi::SumKernel<T, Context>(ctx, dk_tmp, {3}, dk->type(), false, dk);
else
kvReduceBatchedForGQA<T, Context>(ctx, dk_tmp, dk);
#endif
}

if (dv) {
#ifdef PADDLE_WITH_HIP
if (dv->meta().is_contiguous())
phi::SumKernel<T, Context>(ctx,
dv_tmp.Resize({batch_size,
seqlen_k,
num_heads_k,
num_heads / num_heads_k,
head_size}),
{3},
dv->type(),
false,
dv);
else
kvReduceBatchedForGQA<T, Context>(
ctx,
dv_tmp.Resize({batch_size,
seqlen_k,
num_heads_k,
num_heads / num_heads_k,
head_size}),
dv);
#else
if (dv->meta().is_contiguous())
phi::SumKernel<T, Context>(ctx, dv_tmp, {3}, dv->type(), false, dv);
else
kvReduceBatchedForGQA<T, Context>(ctx, dv_tmp, dv);
#endif
}
}
#else
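On the GQA branches above (`if (!is_mha)`): the ROCm backward call appears to return dk/dv with one slice per query head, so the kernel reshapes the temporary to `[..., num_heads_k, num_heads / num_heads_k, head_size]` and sums over the group axis — `{2}` in the varlen kernel, `{3}` in the padded one — to recover per key/value head gradients (falling back to kvReduceForGQA / kvReduceBatchedForGQA when the output tensor is not contiguous). Below is a standalone sketch of that reduction with plain loops; the sizes and buffer names are made up for illustration and nothing here is Paddle API.

```cpp
// Illustration of the GQA gradient reduction performed by the SumKernel calls:
// gradients produced per *query* head are summed over the group axis to obtain
// gradients per *key/value* head. All sizes below are hypothetical.
#include <cstdio>
#include <vector>

int main() {
  const int total_k = 4, num_heads = 8, num_heads_k = 2, head_size = 3;
  const int group = num_heads / num_heads_k;  // query heads sharing one KV head

  // dk_tmp: [total_k, num_heads, head_size], filled with dummy values.
  std::vector<float> dk_tmp(total_k * num_heads * head_size);
  for (size_t i = 0; i < dk_tmp.size(); ++i) dk_tmp[i] = 0.01f * static_cast<float>(i);

  // dk: [total_k, num_heads_k, head_size]; view dk_tmp as
  // [total_k, num_heads_k, group, head_size] and reduce over the group axis.
  std::vector<float> dk(total_k * num_heads_k * head_size, 0.0f);
  for (int t = 0; t < total_k; ++t)
    for (int hk = 0; hk < num_heads_k; ++hk)
      for (int g = 0; g < group; ++g)
        for (int d = 0; d < head_size; ++d)
          dk[(t * num_heads_k + hk) * head_size + d] +=
              dk_tmp[((t * num_heads_k + hk) * group + g) * head_size + d];

  std::printf("dk[0][0][0] = %f\n", dk[0]);  // sanity check
  return 0;
}
```

Summation is the right reduction here because each key/value head receives gradient contributions from every query head in its group.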