Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
129 commits
Select commit Hold shift + click to select a range
c91052c
[Inference All2All] Supports internode_ll_two_stage all2all communica…
carryyu Jul 8, 2025
6bd8a9e
[Inference All2All] Modify kMaxNumQPs in internodel_ll_two_stage
carryyu Jul 8, 2025
7ebb4f5
[Inference All2All] Modify code-style
carryyu Jul 8, 2025
85ad4db
[Inference All2All] Modify code-style
carryyu Jul 8, 2025
f487c0f
[Inference All2All] fix unit test
carryyu Jul 8, 2025
630b95c
[Inference All2All] modify codestyle and enhance unit test
carryyu Jul 10, 2025
444bb8e
[Inference All2All] modify codestyle and enhance unit test
carryyu Jul 10, 2025
7bf5f18
[Inference All2All] supports batch_send and enhance unit test
carryyu Jul 10, 2025
9f555fc
lzy test
szluyu99 Jul 15, 2025
61a49aa
add return_recv_hook
szluyu99 Jul 15, 2025
5eeb2f8
fixed num_sums
szluyu99 Jul 15, 2025
0e44ef6
Merge pull request #1 from l1351868270/m2n_return_recv_hook
l1351868270 Jul 15, 2025
a534b07
add m2n support
l1351868270 Jul 16, 2025
12486b4
add m2n test
l1351868270 Jul 16, 2025
7588e44
update test
szluyu99 Jul 16, 2025
25b4890
add support m2n buffer
l1351868270 Jul 17, 2025
e8475fe
add support m2n buffer
l1351868270 Jul 17, 2025
fb8c463
Merge branch 'm2n' of https://github.com/l1351868270/Paddle into m2n
l1351868270 Jul 17, 2025
9dec8d9
update build environment
l1351868270 Jul 17, 2025
93187ac
update build environment
szluyu99 Jul 17, 2025
20ab024
Merge branch 'm2n' of https://github.com/l1351868270/Paddle into m2n
l1351868270 Jul 17, 2025
1a461cf
m2n test add data check
szluyu99 Jul 18, 2025
bb91e5a
add m2n demo
l1351868270 Jul 21, 2025
dad1bec
Merge pull request #2 from l1351868270/m2n_lsl
l1351868270 Jul 21, 2025
3735ebd
pull the latest code
szluyu99 Jul 21, 2025
cba4253
test update
Jul 21, 2025
89ea91a
update test
szluyu99 Jul 21, 2025
b5d5743
update test
szluyu99 Jul 21, 2025
f00ca31
update m2n return_recv_hook
szluyu99 Jul 21, 2025
7035ac3
update e2a_irecv
l1351868270 Jul 22, 2025
8129b58
Merge pull request #4 from l1351868270/m2n_lsl
l1351868270 Jul 22, 2025
cac91e4
support start_port
l1351868270 Jul 22, 2025
63c24d7
Merge pull request #5 from l1351868270/m2n_lsl
l1351868270 Jul 22, 2025
07e1bed
fix 64 experts bug
szluyu99 Jul 22, 2025
9d1bb80
update m2n demo
l1351868270 Jul 23, 2025
4910170
set sm_count 24
szluyu99 Jul 23, 2025
36947ea
Merge pull request #6 from l1351868270/m2n_lsl_0722
l1351868270 Jul 23, 2025
7515f88
add async_finish support
l1351868270 Jul 25, 2025
7a64f4c
add send recv support
l1351868270 Jul 25, 2025
8bf9752
update test m2n demo
l1351868270 Jul 25, 2025
efc89c7
update test m2n demo
l1351868270 Jul 25, 2025
d3f3e70
two dispatch hang
l1351868270 Jul 29, 2025
511f309
solve dd dc cd bugs
l1351868270 Jul 30, 2025
cb309ac
m2n code independence from all2all code
l1351868270 Jul 31, 2025
465c618
support hook mode on communication stream
l1351868270 Aug 1, 2025
4028a55
Merge pull request #11 from l1351868270/m2n_dev_hook_comm_lsl
l1351868270 Aug 1, 2025
267ce49
add two dispatch test
l1351868270 Aug 1, 2025
9afd321
Merge pull request #12 from l1351868270/m2n_dev_hook_comm_lsl
l1351868270 Aug 1, 2025
958c358
add two batch size and two layer test
l1351868270 Aug 1, 2025
5daf9fc
Merge pull request #13 from l1351868270/m2n_dev_hook_comm_lsl
l1351868270 Aug 1, 2025
3eabb9c
add dispatch combine test
l1351868270 Aug 1, 2025
d18d3e5
Merge pull request #15 from l1351868270/m2n_dev_hook_comm_lsl
l1351868270 Aug 1, 2025
8086a3e
fix illegale memory
Aug 4, 2025
ec377cc
add v3 api
l1351868270 Aug 4, 2025
f79b7d2
Merge pull request #17 from l1351868270/m2n_dev_lsl_0804
l1351868270 Aug 4, 2025
c3cf0e8
add dispatch and combine wait
l1351868270 Aug 5, 2025
e588b10
Merge pull request #18 from l1351868270/m2n_dev_wait_lsl_0805
l1351868270 Aug 5, 2025
1f60f1f
fix continuous dispatch wrong https://github.com/PaddlePaddle/Paddle/…
l1351868270 Aug 5, 2025
b2dd8c2
Merge pull request #19 from l1351868270/m2n_dev_wait_lsl_0805
l1351868270 Aug 5, 2025
ea774de
Revert "fix continuous dispatch wrong"
l1351868270 Aug 5, 2025
566ce13
Merge pull request #20 from l1351868270/revert-19-m2n_dev_wait_lsl_0805
l1351868270 Aug 5, 2025
8eaca08
fix continuous dispatch wrong
l1351868270 Aug 5, 2025
3bec1c2
Merge pull request #21 from l1351868270/m2n_dev_wait_lsl_0805
l1351868270 Aug 5, 2025
cf7ee95
support wait all rank complete
l1351868270 Aug 6, 2025
7a570b2
Merge pull request #22 from l1351868270/m2n_dev_wait_lsl_0805
l1351868270 Aug 6, 2025
bc2f1c0
change test file
Aug 8, 2025
259e65c
change test file
Aug 8, 2025
0302f6d
fix receive
Aug 8, 2025
c1c6ad7
all layer simulate
Aug 8, 2025
021c649
update all layers
l1351868270 Aug 8, 2025
7590c6a
update all layers
l1351868270 Aug 8, 2025
703c8c5
update all layers
l1351868270 Aug 9, 2025
707fa1e
update all layers
l1351868270 Aug 9, 2025
3f12339
update all layers
l1351868270 Aug 9, 2025
179fb2d
update all layers
l1351868270 Aug 11, 2025
e26c98d
update all layers
l1351868270 Aug 11, 2025
b883549
update log
l1351868270 Aug 11, 2025
ad01f3e
update log
l1351868270 Aug 11, 2025
6c0a2a5
fix 51 layer hang
l1351868270 Aug 11, 2025
2e8937a
fix 51 layer hang
l1351868270 Aug 11, 2025
c49001b
fix dispatch 51 layer hang
l1351868270 Aug 11, 2025
d9be6a7
add hang log
l1351868270 Aug 12, 2025
6085562
add nvl sync log
l1351868270 Aug 12, 2025
9e78917
fix nvl all2all hang
l1351868270 Aug 12, 2025
79c2f68
fix workspace conflict
l1351868270 Aug 12, 2025
5f231b7
fix workspace conflict
l1351868270 Aug 13, 2025
9a1cb23
fix can not overlap
l1351868270 Aug 14, 2025
9e8fc76
fix can not overlap complete
l1351868270 Aug 15, 2025
5c8ef64
when time out, break
l1351868270 Aug 15, 2025
318556e
fix complete hang
l1351868270 Aug 16, 2025
860631d
convert zeros to empty
l1351868270 Aug 18, 2025
258ca43
convert zeros to empty
l1351868270 Aug 18, 2025
2857d73
convert zeros to empty
l1351868270 Aug 18, 2025
b342858
a simple overlap method
l1351868270 Aug 19, 2025
e757905
fix accuracy is not correct
l1351868270 Aug 20, 2025
5a70b97
moe first disptch wait event
l1351868270 Aug 20, 2025
5727eb5
add nvl log
l1351868270 Aug 21, 2025
98d2c4a
fix single test accuracy is not correct
l1351868270 Aug 22, 2025
a230211
update test_m2n_all_layers_v3
l1351868270 Aug 22, 2025
4dff1ab
increase workpace memory
l1351868270 Aug 22, 2025
b226288
delete m2n test
zhoutianzi666 Sep 28, 2025
8257e42
delete m2n test
zhoutianzi666 Sep 28, 2025
6d7f7fb
initial commit /root/paddlejob/workspace/env_run/output/zkk/erniebot-…
zhoutianzi666 Sep 28, 2025
ae547f2
简化 m2n_ll_two_stage.cu
zhoutianzi666 Sep 28, 2025
4c8a61b
make code 漂亮
zhoutianzi666 Sep 29, 2025
e69570b
make code 漂亮
zhoutianzi666 Sep 29, 2025
02c0517
make code 漂亮
zhoutianzi666 Sep 29, 2025
1689620
make code 漂亮
zhoutianzi666 Sep 29, 2025
bff1fef
make code 漂亮
zhoutianzi666 Sep 30, 2025
ed8152c
make code 漂亮
zhoutianzi666 Sep 30, 2025
7dc6498
make code 漂亮
zhoutianzi666 Oct 1, 2025
38e7db7
merge develop
zhoutianzi666 Oct 1, 2025
f650dd4
not modify /root/paddlejob/workspace/env_run/output/zkk/erniebot-dev/…
zhoutianzi666 Oct 2, 2025
d1819ab
restore /root/paddlejob/workspace/env_run/output/zkk/erniebot-dev/202…
zhoutianzi666 Oct 9, 2025
3abf2fe
restore cmake
zhoutianzi666 Oct 9, 2025
b123378
restore
zhoutianzi666 Oct 9, 2025
e953b36
add comment
zhoutianzi666 Oct 9, 2025
aa9fc7b
update /root/paddlejob/workspace/env_run/output/zkk/erniebot-dev/2024…
zhoutianzi666 Oct 9, 2025
baeec36
not modify
zhoutianzi666 Oct 9, 2025
6614880
add comment
zhoutianzi666 Oct 9, 2025
5780c65
add comment
zhoutianzi666 Oct 9, 2025
fe8afde
format code
zhoutianzi666 Oct 9, 2025
7c6fa63
add comment
zhoutianzi666 Oct 9, 2025
8f36b6f
add comment
zhoutianzi666 Oct 9, 2025
2a489cf
format code
zhoutianzi666 Oct 9, 2025
51bc1ac
format code
zhoutianzi666 Oct 9, 2025
8c54bd3
format code
zhoutianzi666 Oct 9, 2025
2bbbb48
add test
zhoutianzi666 Oct 9, 2025
49dace1
update code /root/paddlejob/workspace/env_run/output/zkk/erniebot-dev…
zhoutianzi666 Oct 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions paddle/fluid/distributed/collective/deep_ep/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@ if(WITH_NVSHMEM)
CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")

set(DEEPEP_KERNEL_SRCS
kernels/intranode.cu kernels/runtime.cu kernels/internode.cu
kernels/internode_ll.cu kernels/internode_ll_two_stage.cu)
kernels/intranode.cu
kernels/runtime.cu
kernels/internode.cu
kernels/internode_ll.cu
kernels/internode_ll_two_stage.cu
kernels/m2n_ll_two_stage.cu)
cc_library(
deepep_kernels
SRCS ${DEEPEP_KERNEL_SRCS}
Expand Down
28 changes: 28 additions & 0 deletions paddle/fluid/distributed/collective/deep_ep/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,14 @@ struct LowLatencyBuffer {
void* dispatch_rdma_send_buffer = nullptr;
void* dispatch_rdma_recv_data_buffer = nullptr;
int* dispatch_rdma_recv_count_buffer = nullptr;
// Note(ZKK) this is only used in M2N !
Comment thread
zhoutianzi666 marked this conversation as resolved.
int* dispatch_rdma_recv_complete_buffer = nullptr;

void* combine_rdma_send_buffer = nullptr;
void* combine_rdma_recv_data_buffer = nullptr;
int* combine_rdma_recv_flag_buffer = nullptr;
// Note(ZKK) this is only used in M2N !
int* combine_rdma_recv_complete_buffer = nullptr;

void* combine_rdma_send_buffer_data_start = nullptr;
size_t num_bytes_per_combine_msg = 0;
Expand Down Expand Up @@ -244,11 +248,19 @@ struct LowLatencyLayout {
advance<int*>(rdma_buffer,
send_buffer_bytes * 2 + recv_buffer_bytes * 2 +
signaling_buffer_bytes * i),
// Note(ZKK): dispatch_rdma_recv_complete_buffer is only used in M2N!
// so here we symbolically add a 0 to it
advance<int*>(rdma_buffer, 0),

advance(rdma_buffer, send_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * i),
advance<int*>(rdma_buffer,
send_buffer_bytes * 2 + recv_buffer_bytes * 2 +
signaling_buffer_bytes * i),
// Note(ZKK): combine_rdma_recv_complete_buffer is only used in M2N!
// so here we symbolically add a 0 to it
advance<int*>(rdma_buffer, 0),

advance(rdma_buffer, send_buffer_bytes * i),
num_bytes_per_combine_msg};
}
Expand Down Expand Up @@ -318,6 +330,12 @@ struct LowLatencyTwoStageLayout {
combine_recv_flag_buffer_bytes);
total_bytes += signaling_buffer_bytes * 2;

// Symmetric complete signaling buffers
// Note(ZKK): this is only used in M2N!
size_t recv_complete_buffer_bytes =
2 * M2N_NUM_MAX_MICRO_BATCHES * num_ranks * sizeof(int);
total_bytes += recv_complete_buffer_bytes * 2;

// Assign pointers
for (int i = 0; i < 2; ++i) {
buffers[i] = {
Expand All @@ -327,11 +345,21 @@ struct LowLatencyTwoStageLayout {
advance<int*>(rdma_buffer,
send_buffer_bytes * 2 + recv_buffer_bytes * 2 +
signaling_buffer_bytes * i),
// dispatch_rdma_recv_complete_buffer!
advance<int*>(rdma_buffer,
send_buffer_bytes * 2 + recv_buffer_bytes * 2 +
signaling_buffer_bytes * 2 +
recv_complete_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * i),
advance<int*>(rdma_buffer,
send_buffer_bytes * 2 + recv_buffer_bytes * 2 +
signaling_buffer_bytes * i),
// combine_rdma_recv_complete_buffer!
advance<int*>(rdma_buffer,
send_buffer_bytes * 2 + recv_buffer_bytes * 2 +
signaling_buffer_bytes * 2 +
recv_complete_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * i),
num_bytes_per_combine_msg};
}
Expand Down
Loading