Coordinate placement #82

Open · wants to merge 3 commits into base: main
11 changes: 6 additions & 5 deletions benchmark/scripts/launch_server.sh
@@ -2,18 +2,19 @@ MIN_INPUT_LEN=100
 MAX_INPUT_LEN=300
 MIN_OUTPUT_LEN=100
 MAX_OUTPUT_LEN=500
-N_NODE=1
+N_NODE=2
 N_GPU_PER_NODE=4
 NUM_LAYERS=32
-NUM_EXPERTS=4
+NUM_EXPERTS=8
 MAX_BATCH_SIZE_ATTN=160
 MAX_BATCH_SIZE_EXP=512
 GRAPH_STRIDE=8
 step_attn=1
-dp_size=2
+dp_size=4
 step_exp=1
-ep_size=2
+ep_size=4
 top_k=1
+placement=pipeline
 
 REPORT_DIR=./reports
 
@@ -41,7 +42,7 @@ python benchmark/server.py \
     --step-exp $step_exp \
     --dp-size $dp_size \
     --ep-size $ep_size \
-    # -ca \
     --file $REPORT_TABLE \
     --analyze-throughput \
+    --placement $placement \
     --trace
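
The new values double the cluster to 2 nodes of 4 GPUs each and scale dp_size and ep_size to match. As a quick sanity check (note the script itself still launches with placement=pipeline), here is a minimal Python sketch of how the CoordinatePlacement formula introduced below would divide these resources, assuming gpu_attn = n_gpu - ep_size // n_node as in the new class:

    # Sanity check of the GPU split implied by the new launch settings,
    # assuming the gpu_attn formula from CoordinatePlacement.__init__ below.
    n_node = 2          # N_NODE
    n_gpu_per_node = 4  # N_GPU_PER_NODE
    ep_size = 4         # ep_size
    dp_size = 4         # dp_size

    # // binds tighter than -, so this is n_gpu - (ep_size // n_node)
    gpu_attn = n_gpu_per_node - ep_size // n_node
    assert gpu_attn == 2                                    # GPUs [0, 2) on each node run attention
    assert n_node * gpu_attn == dp_size                     # 2 nodes x 2 attn GPUs = 4 attn DP ranks
    assert n_node * (n_gpu_per_node - gpu_attn) == ep_size  # the remaining 4 GPUs host experts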
57 changes: 56 additions & 1 deletion disagmoe/utils/placement.py
@@ -218,6 +218,7 @@ def solve(self) -> ModelPlacement:
             self.model_config.num_layers, self.model_config.num_experts,
             self.cluster_config.n_node, self.cluster_config.n_gpu
         )
+        print(place)
         place = self._add_edges(place)
         place = self._update_expert_rank(place)
         place = self._update_attn_dp_rank(place)
@@ -507,11 +508,63 @@ def _update_attn_dp_rank(self, place: ModelPlacement) -> ModelPlacement:
         place.attn_dp_ranks = {dev_id: dev_id for dev_id in place.attn}
         return place
 
+class CoordinatePlacement(ColocatePlacement):
+    """
+    Each node is separated into two groups: [0, gpu_attn) for attn DP, and [gpu_attn, n_gpu) for experts.
+    """
+
+    def __init__(self, model_config: ModelConfig, cluster_config: ClusterConfig, gpu_attn: int = -1):
+        super().__init__(model_config, cluster_config)
+        # self.gpu_attn = gpu_attn if gpu_attn != -1 else cluster_config.n_gpu // 2
+        self.gpu_attn = cluster_config.n_gpu - model_config.ep_size // cluster_config.n_node
+
+    @override
+    def _solve(self, n_layer: int, n_expert: int, n_node: int, n_gpu_per_node: int) -> ModelPlacement:
+        num_devices = n_node * n_gpu_per_node
+
+        all_layers = list(range(n_layer))
+        attns = {}
+        experts = {}
+
+        cnt_expert = 0
+        attn_dp_cnt = 0
+        attn_dp_ranks = {}
+        for i in range(n_node):
+            for j in range(n_gpu_per_node):
+                dev_id = i * n_gpu_per_node + j
+                if j < self.gpu_attn:
+                    # attention device: replicates all layers, takes the next DP rank
+                    attns[dev_id] = all_layers
+                    attn_dp_ranks[dev_id] = attn_dp_cnt
+                    attn_dp_cnt += 1
+                else:
+                    # expert device: hosts num_experts_per_rank experts for every layer
+                    experts[dev_id] = []
+                    for _ in range(self.model_config.num_experts_per_rank):
+                        assert cnt_expert < n_expert
+                        for layer_id in all_layers:
+                            experts[dev_id].append((layer_id, cnt_expert))
+                        cnt_expert += 1
+
+        device_groups = {
+            i: [i] for i in range(num_devices)
+        }
+
+        return ModelPlacement(
+            attns, experts, self.cluster_config.id_tokenizer, self.cluster_config.id_sampler,
+            {}, {},
+            attn_dp_ranks=attn_dp_ranks,
+            device_groups=device_groups
+        )
+
+    @override
+    def _update_attn_dp_rank(self, place: ModelPlacement) -> ModelPlacement:
+        return place
+
 _placement_cls: Dict[str, PlacementBase] = {
     "colocate": ColocatePlacement,
     "single": SinglePlacement,
     "interleave": InterleavePlacement,
     "pipeline": PipelinePlacement,
+    "coordinate": CoordinatePlacement,
 }
 
@@ -527,8 +580,10 @@ def get_model_placement(
 
     if strategy == "pipeline":
         solver = cls(model_config, cluster_config, *args, **kwargs)
-    elif strategy == "colocate":
+    elif strategy in ["colocate", "coordinate"]:
         solver = cls(model_config, cluster_config)
+    else:
+        raise NotImplementedError()
 
     place: ModelPlacement = solver.solve()
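
To make the resulting layout concrete, here is a standalone re-run of the _solve loop above on a toy setup (plain dicts stand in for ModelPlacement; the sizes are illustrative, not taken from the PR):

    # 1 node x 4 GPUs with gpu_attn=2, 2 layers, 4 experts, 2 experts per rank.
    n_node, n_gpu_per_node, gpu_attn = 1, 4, 2
    n_layer, n_expert, experts_per_rank = 2, 4, 2

    all_layers = list(range(n_layer))
    attns, experts, attn_dp_ranks = {}, {}, {}
    cnt_expert = attn_dp_cnt = 0
    for i in range(n_node):
        for j in range(n_gpu_per_node):
            dev_id = i * n_gpu_per_node + j
            if j < gpu_attn:
                # attention half of the node: every layer, one DP rank per device
                attns[dev_id] = all_layers
                attn_dp_ranks[dev_id] = attn_dp_cnt
                attn_dp_cnt += 1
            else:
                # expert half: each device takes the next experts_per_rank experts
                experts[dev_id] = []
                for _ in range(experts_per_rank):
                    assert cnt_expert < n_expert
                    for layer_id in all_layers:
                        experts[dev_id].append((layer_id, cnt_expert))
                    cnt_expert += 1

    print(attns)          # {0: [0, 1], 1: [0, 1]}
    print(attn_dp_ranks)  # {0: 0, 1: 1}
    print(experts)        # {2: [(0, 0), (1, 0), (0, 1), (1, 1)],
                          #  3: [(0, 2), (1, 2), (0, 3), (1, 3)]}

Every attention device replicates all layers, while each expert device owns a fixed slice of the experts across all layers.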
21 changes: 21 additions & 0 deletions tests/place/test_coordinate.py
@@ -0,0 +1,21 @@
+from disagmoe.utils.placement import *
+from disagmoe.config import mixtral_config
+from disagmoe.utils.constants import *
+
+config = mixtral_config
+config.num_layers = 32
+config.ep_size = 2
+config.tp_size = 1
+config.dp_size = 2
+config.num_experts = 4
+
+mp = get_model_placement(
+    config,
+    ClusterConfig(1, 4, 40 * GiB, TOKENIZER_DEV_ID, SAMPLER_DEV_ID),
+    strategy="coordinate",
+)
+
+print(mp)
+
+for i in range(0, 4):
+    print(mp.rank_at(i, config.num_experts_per_rank))
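
For this test configuration (1 node, 4 GPUs, ep_size=2), the formula in CoordinatePlacement gives gpu_attn = 4 - 2 // 1 = 2, so devices 0 and 1 should take attn DP ranks 0 and 1 while devices 2 and 3 host the experts. A small hypothetical check, assuming num_experts_per_rank is derived as num_experts // ep_size (an assumption; the derivation is not shown in this PR):

    # Hypothetical expected split for the test above; num_experts_per_rank
    # derived as num_experts // ep_size is an assumption, not shown in the PR.
    n_gpu, n_node, ep_size, num_experts = 4, 1, 2, 4

    gpu_attn = n_gpu - ep_size // n_node       # 4 - 2 = 2 attn GPUs
    experts_per_rank = num_experts // ep_size  # 2 experts per expert GPU

    assert gpu_attn == 2          # devices 0-1: attention, DP ranks 0 and 1
    assert experts_per_rank == 2  # devices 2-3: 2 experts each, for all 32 layers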