
Commit fd06778

[BUG FIX] Fix contact information retrieval again. (#1320)
* Fix contact information retrieval.
* Prefer running CI on idle CoreWeave nodes.
* Enable running benchmarks on multiple GPUs.
1 parent 18a4761 commit fd06778

File tree

8 files changed: 146 additions & 43 deletions


.github/workflows/linux-gpu.yml

Lines changed: 10 additions & 4 deletions

@@ -35,6 +35,12 @@ jobs:
 
           mkdir -p "${HOME}/.cache"
 
+          # Prefer idle nodes if any
+          IDLE_NODES=$(sinfo -h -o "%N %t" | awk '$2 == "idle" {print $1}')
+          if [[ -n "$IDLE_NODES" ]]; then
+            NODELIST="--nodelist=$IDLE_NODES"
+          fi
+
           srun \
             --container-image="/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh" \
             --container-mounts=\
@@ -44,7 +50,7 @@ jobs:
             --export=\
           HF_TOKEN="${HF_TOKEN}",\
           NVIDIA_DRIVER_CAPABILITIES=all \
-            --partition=hpc-mid --nodes=1 --gpus=1 --time="${TIMEOUT_MINUTES}" \
+            --partition=hpc-mid ${NODELIST} --nodes=1 --time="${TIMEOUT_MINUTES}" \
             --job-name=${SLURM_JOB_NAME} \
             bash -c "
               pip install -e '.[dev,render]' && \
@@ -69,16 +75,16 @@ jobs:
             "${{ github.workspace }}":/root/workspace \
             --no-container-mount-home --container-workdir=/root/workspace \
             --export=${SLURM_ENV_VARS} \
-            --partition=hpc-mid --exclusive --nodes=1 --gpus=1 --time="${TIMEOUT_MINUTES}" \
+            --partition=hpc-mid --exclusive --nodes=1 --time="${TIMEOUT_MINUTES}" \
             --job-name=${SLURM_JOB_NAME} \
             bash -c "
               : # sudo apt install -y tmate && \
               tmate -S /tmp/tmate.sock new-session -d && \
               tmate -S /tmp/tmate.sock wait tmate-ready && \
               tmate -S /tmp/tmate.sock display -p '#{tmate_ssh}'
               pip install -e '.[dev,render]' && \
-              pytest --print -x -m 'benchmarks' --backend gpu ./tests && \
-              cp 'speed_test.txt' '/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt'
+              pytest --print -x -m 'benchmarks' ./tests && \
+              cat speed_test*.txt > '/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt'
               : # tmate -S /tmp/tmate.sock wait tmate-exit
             "
 
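
The idle-node preference above is a plain `sinfo`/`awk` one-liner inside the workflow. For poking at the same selection logic outside CI, here is a rough Python sketch; the helper name `find_idle_nodes` is made up, and it assumes a standard Slurm `sinfo` is on the PATH:

```python
import subprocess

def find_idle_nodes():
    """Sketch: list Slurm nodes currently reported in the 'idle' state."""
    out = subprocess.run(
        ["sinfo", "-h", "-o", "%N %t"], capture_output=True, text=True, check=True
    ).stdout
    # Each line is "<nodelist> <state>"; keep only the idle entries.
    return [line.split()[0] for line in out.splitlines() if line.split()[-1] == "idle"]

if __name__ == "__main__":
    idle = find_idle_nodes()
    print(f"--nodelist={','.join(idle)}" if idle else "no idle nodes")
```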

examples/drone/hover_env.py

Lines changed: 1 addition & 1 deletion

@@ -134,7 +134,7 @@ def step(self, actions):
         self.drone.set_propellels_rpm((1 + exec_actions * 0.8) * 14468.429183500699)
         # update target pos
         if self.target is not None:
-            self.target.set_pos(self.commands, zero_velocity=True, envs_idx=list(range(self.num_envs)))
+            self.target.set_pos(self.commands, zero_velocity=True)
         self.scene.step()
 
         # update buffers

genesis/__init__.py

Lines changed: 3 additions & 1 deletion

@@ -291,6 +291,7 @@ def _display_greeting(INFO_length):
     wave_width = max(0, min(38, wave_width))
     bar_width = wave_width * 2 + 9
     wave = ("┈┉" * wave_width)[:wave_width]
+    global logger
     logger.info(f"~<╭{'─'*(bar_width)}╮>~")
     logger.info(f"~<│{wave}>~ ~~~~<Genesis>~~~~ ~<{wave}│>~")
     logger.info(f"~<╰{'─'*(bar_width)}╯>~")
@@ -314,9 +315,10 @@ def _custom_excepthook(exctype, value, tb):
     print("".join(traceback.format_exception(exctype, value, tb)))
 
     # Logger the exception right before exit if possible
+    global logger
     try:
         logger.error(f"{exctype.__name__}: {value}")
-    except AttributeError:
+    except (AttributeError, NameError):
         # Logger may not be configured at this point
         pass
 

genesis/engine/entities/rigid_entity/rigid_entity.py

Lines changed: 1 addition & 1 deletion

@@ -2748,7 +2748,7 @@ def get_contacts(self, with_entity=None, exclude_self_contact=False):
         if self._solver.n_envs == 0:
             contacts_info = {key: value[valid_mask] for key, value in contacts_info.items()}
         else:
-            contacts_info = {key: value[:, valid_mask] for key, value in contacts_info.items()}
+            contacts_info["valid_mask"] = valid_mask
 
         contacts_info["force_a"] = -contacts_info["force"]
         contacts_info["force_b"] = +contacts_info["force"]

genesis/engine/solvers/rigid/collider_decomp.py

Lines changed: 8 additions & 10 deletions

@@ -2088,19 +2088,20 @@ def get_contacts(self, as_tensor: bool = True, to_torch: bool = True):
         # Allocate output buffer
         if to_torch:
             iout = torch.full((out_size, 4), -1, dtype=gs.tc_int, device=gs.device)
-            fout = torch.empty((out_size, 10), dtype=gs.tc_float, device=gs.device)
+            fout = torch.zeros((out_size, 10), dtype=gs.tc_float, device=gs.device)
         else:
             iout = np.full((out_size, 4), -1, dtype=gs.np_int)
-            fout = np.empty((out_size, 10), dtype=gs.np_float)
+            fout = np.zeros((out_size, 10), dtype=gs.np_float)
 
         # Copy contact data
-        self._kernel_get_contacts(as_tensor, iout, fout)
+        if n_contacts_max > 0:
+            self._kernel_get_contacts(as_tensor, iout, fout)
 
-        # Return structured view (no copy)
+        # Build structured view (no copy)
         if as_tensor:
             if self._solver.n_envs > 0:
-                iout = iout.reshape((n_contacts_max, n_envs, -1))
-                fout = fout.reshape((n_contacts_max, n_envs, -1))
+                iout = iout.reshape((n_envs, n_contacts_max, 4))
+                fout = fout.reshape((n_envs, n_contacts_max, 10))
             iout_chunks = (iout[..., 0], iout[..., 1], iout[..., 2], iout[..., 3])
             fout_chunks = (fout[..., 0], fout[..., 1:4], fout[..., 4:7], fout[..., 7:])
             values = (*iout_chunks, *fout_chunks)
@@ -2135,10 +2136,7 @@ def get_contacts(self, as_tensor: bool = True, to_torch: bool = True):
             values = (*iout_chunks, *fout_chunks)
 
         contacts_info = dict(
-            zip(
-                ("link_a", "link_b", "geom_a", "geom_b", "penetration", "position", "normal", "force"),
-                (value.swapaxes(0, 1) for value in values) if as_tensor and self._solver.n_envs > 0 else values,
-            )
+            zip(("link_a", "link_b", "geom_a", "geom_b", "penetration", "position", "normal", "force"), values)
         )
 
         # Cache contact information before returning
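
After this change the flat output buffers are reshaped with the environment axis first, `(n_envs, n_contacts_max, ...)`, so the final `swapaxes(0, 1)` is no longer needed, and padding is well defined (`-1` in the integer buffer, `0.0` in the now zero-initialised float buffer). A small self-contained numpy illustration of the same field slicing; the shapes and dtypes below are made up, only the field order and column layout mirror the diff:

```python
import numpy as np

n_envs, n_contacts_max = 2, 5
# Integer fields: link_a, link_b, geom_a, geom_b (unused slots padded with -1).
iout = np.full((n_envs, n_contacts_max, 4), -1, dtype=np.int32)
# Float fields: penetration (1), position (3), normal (3), force (3) -> 10 columns, zero-padded.
fout = np.zeros((n_envs, n_contacts_max, 10), dtype=np.float64)

contacts_info = dict(
    zip(
        ("link_a", "link_b", "geom_a", "geom_b", "penetration", "position", "normal", "force"),
        (iout[..., 0], iout[..., 1], iout[..., 2], iout[..., 3],
         fout[..., 0], fout[..., 1:4], fout[..., 4:7], fout[..., 7:]),
    )
)
print(contacts_info["position"].shape)  # (2, 5, 3): environment axis first, contact slot second
```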

tests/conftest.py

Lines changed: 73 additions & 16 deletions

@@ -1,5 +1,6 @@
 import gc
 import os
+import re
 import sys
 from enum import Enum
 
@@ -30,11 +31,23 @@ def pytest_make_parametrize_id(config, val, argname):
 
 @pytest.hookimpl(tryfirst=True)
 def pytest_cmdline_main(config: pytest.Config) -> None:
-    # Force disabling distributed framework if benchmarks are selected
+    # Make sure that benchmarks are running on GPU and the number of workers is valid
     expr = Expression.compile(config.option.markexpr)
     is_benchmarks = expr.evaluate(MarkMatcher.from_markers((pytest.mark.benchmarks,)))
     if is_benchmarks:
-        config.option.numprocesses = 0
+        # Make sure that GPU backend is enforced
+        backend = config.getoption("--backend")
+        if backend == "cpu":
+            raise ValueError("Running benchmarks on CPU is not supported.")
+        config.option.backend = "gpu"
+
+        # Make sure that the number of workers is not too large
+        if isinstance(config.option.numprocesses, int):
+            max_workers = max(pytest_xdist_auto_num_workers(config), 1)
+            if config.option.numprocesses > max_workers:
+                raise ValueError(
+                    f"The number of workers for running benchmarks cannot exceed '{max_workers}' on this machine."
+                )
 
     # Force disabling forked for non-linux systems
     if not sys.platform.startswith("linux"):
@@ -45,21 +58,40 @@ def pytest_cmdline_main(config: pytest.Config) -> None:
     if show_viewer:
         config.option.numprocesses = 0
 
-    # Disable low-level parallelization if distributed framework is enabled
-    # if config.option.numprocesses > 0:
-    os.environ["OMP_NUM_THREADS"] = "1"
-    os.environ["OPENBLAS_NUM_THREADS"] = "1"
-    os.environ["MKL_NUM_THREADS"] = "1"
-    os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
-    os.environ["NUMEXPR_NUM_THREADS"] = "1"
-    os.environ["NUMBA_NUM_THREADS"] = "1"
+    # Disable low-level parallelization if distributed framework is enabled.
+    # FIXME: It should be set to `max(int(physical_core_count / num_workers), 1)`, but 'num_workers' may be unknown.
+    if not is_benchmarks and config.option.numprocesses != 0:
+        os.environ["OMP_NUM_THREADS"] = "1"
+        os.environ["OPENBLAS_NUM_THREADS"] = "1"
+        os.environ["MKL_NUM_THREADS"] = "1"
+        os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
+        os.environ["NUMEXPR_NUM_THREADS"] = "1"
+        os.environ["NUMBA_NUM_THREADS"] = "1"
+
+
+def _get_gpu_indices():
+    nvidia_gpu_indices = os.environ.get("CUDA_VISIBLE_DEVICES")
+    if nvidia_gpu_indices is not None:
+        return tuple(sorted(map(int, nvidia_gpu_indices.split(","))))
+
+    if sys.platform == "linux":
+        nvidia_gpu_indices = []
+        nvidia_gpu_interface_path = "/proc/driver/nvidia/gpus/"
+        if os.path.exists(nvidia_gpu_interface_path):
+            for device_path in os.listdir(nvidia_gpu_interface_path):
+                with open(os.path.join(nvidia_gpu_interface_path, device_path, "information"), "r") as f:
+                    gpu_id = int(re.search(r"Device Minor:\s+(\d+)", f.read()).group(1))
+                    nvidia_gpu_indices.append(gpu_id)
+        return tuple(sorted(nvidia_gpu_indices))
+
+    return (0,)
 
 
 def pytest_xdist_auto_num_workers(config):
-    # Get available memory (RAM & VRAM) and number of physical cores.
     import psutil
     import genesis as gs
 
+    # Get available memory (RAM & VRAM) and number of physical cores.
     physical_core_count = psutil.cpu_count(logical=False)
     _, _, ram_memory, _ = gs.utils.get_device(gs.cpu)
     _, _, vram_memory, _ = gs.utils.get_device(gs.gpu)
@@ -76,15 +108,39 @@ def pytest_xdist_auto_num_workers(config):
     else:
         ram_memory_per_worker = 7.5
         vram_memory_per_worker = 1.6
-    return min(
-        int(ram_memory / ram_memory_per_worker),
-        int(vram_memory / vram_memory_per_worker),
+    num_workers = min(
         physical_core_count,
+        max(int(ram_memory / ram_memory_per_worker), 1),
+        max(int(vram_memory / vram_memory_per_worker), 1),
     )
 
+    # Special treatment for benchmarks
+    expr = Expression.compile(config.option.markexpr)
+    is_benchmarks = expr.evaluate(MarkMatcher.from_markers((pytest.mark.benchmarks,)))
+    if is_benchmarks:
+        num_cpu_per_gpu = 4
+        num_workers = min(
+            num_workers,
+            len(_get_gpu_indices()),
+            max(int(physical_core_count / num_cpu_per_gpu), 1),
+        )
+
+    return num_workers
+
+
+def pytest_runtest_setup(item):
+    # Enforce GPU affinity when the distributed framework is enabled
+    worker_id = os.environ.get("PYTEST_XDIST_WORKER")
+    if worker_id and worker_id.startswith("gw"):
+        worker_num = int(worker_id[2:])
+        gpu_indices = _get_gpu_indices()
+        gpu_num = worker_num % len(gpu_indices)
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_indices[gpu_num])
+        os.environ["TI_VISIBLE_DEVICE"] = str(gpu_indices[gpu_num])
+
 
 def pytest_addoption(parser):
-    parser.addoption("--backend", action="store", default="cpu", help="Default simulation backend.")
+    parser.addoption("--backend", action="store", default=None, help="Default simulation backend.")
    parser.addoption("--vis", action="store_true", default=False, help="Enable interactive viewer.")
 
 
@@ -97,7 +153,7 @@ def show_viewer(pytestconfig):
 def backend(pytestconfig):
     import genesis as gs
 
-    backend = pytestconfig.getoption("--backend", "cpu")
+    backend = pytestconfig.getoption("--backend") or gs.cpu
     if isinstance(backend, str):
         return getattr(gs.constants.backend, backend)
     return backend
@@ -220,6 +276,7 @@ def initialize_genesis(request, backend, taichi_offline_cache):
     else:
         precision = "32"
         debug = False
+
     try:
         if not taichi_offline_cache:
             os.environ["TI_OFFLINE_CACHE"] = "0"

tests/test_rigid_benchmarks.py

Lines changed: 21 additions & 5 deletions

@@ -1,13 +1,14 @@
 import hashlib
 import numbers
 import os
-import pytest
+import tempfile
 import time
 from enum import Enum
 from pathlib import Path
 from typing import Any
 
 import numpy as np
+import pytest
 import torch
 import wandb
 
@@ -216,10 +217,25 @@ def get_file_morph_options(**kwargs):
 
 @pytest.fixture(scope="session")
 def stream_writers(backend, printer_session):
-    log_path = Path(REPORT_FILE)
-    if os.path.exists(log_path):
-        os.remove(log_path)
-    fd = open(log_path, "w")
+    report_path = Path(REPORT_FILE)
+
+    # Delete old unrelated worker-specific reports
+    worker_id = os.environ.get("PYTEST_XDIST_WORKER")
+    if worker_id == "gw0":
+        worker_count = int(os.environ["PYTEST_XDIST_WORKER_COUNT"])
+
+        for path in report_path.parent.glob("-".join((report_path.stem, "*.txt"))):
+            _, worker_id = path.stem.rsplit("-", 1)
+            worker_num = int(worker_id[2:])
+            if worker_num >= worker_count:
+                path.unlink()
+
+    # Create new empty worker-specific report
+    report_name = "-".join((report_path.stem, worker_id))
+    report_path = report_path.with_name(f"{report_name}.txt")
+    if report_path.exists():
+        report_path.unlink()
+    fd = open(report_path, "w")
 
     yield (lambda msg: print(msg, file=fd), printer_session)
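
Since every xdist worker now writes its own report file, the CI step above concatenates them into one artifact (`cat speed_test*.txt > ...`). A rough Python equivalent of that merge step, with the glob pattern and output name chosen purely for illustration:

```python
from pathlib import Path

def merge_worker_reports(pattern: str = "speed_test*.txt", out_name: str = "speed_test_merged.txt") -> Path:
    """Sketch: concatenate per-worker benchmark reports into a single artifact."""
    out_path = Path(out_name)
    with out_path.open("w") as out:
        for report in sorted(Path(".").glob(pattern)):
            out.write(report.read_text())
    return out_path
```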

tests/test_rigid_physics.py

Lines changed: 29 additions & 5 deletions

@@ -2361,18 +2361,42 @@ def test_data_accessor(n_envs, batched, tol):
     # Simulate for a while, until they collide with something
     for _ in range(400):
         gs_sim.step()
-        gs_n_contacts = gs_sim.rigid_solver.collider.n_contacts.to_torch(device="cpu")
+
+        gs_n_contacts = gs_sim.rigid_solver.collider.n_contacts.to_numpy()
+        assert len(gs_n_contacts) == max(n_envs, 1)
+        for as_tensor in (False, True):
+            for to_torch in (False, True):
+                contacts_info = gs_sim.rigid_solver.collider.get_contacts(as_tensor, to_torch)
+                for value in contacts_info.values():
+                    if n_envs > 0:
+                        assert n_envs == len(value)
+                    else:
+                        assert gs_n_contacts[0] == len(value)
+                        value = value[None] if as_tensor else (value,)
+
+                    for i_b in range(n_envs):
+                        n_contacts = gs_n_contacts[i_b]
+                        if as_tensor:
+                            assert isinstance(value, torch.Tensor if to_torch else np.ndarray)
+                            if value.dtype in (gs.tc_int, gs.np_int):
+                                assert (value[i_b, :n_contacts] != -1).all()
+                                assert (value[i_b, n_contacts:] == -1).all()
+                            else:
+                                assert_allclose(value[i_b, n_contacts:], 0.0, tol=0)
+                        else:
+                            assert isinstance(value, (list, tuple))
+                            assert value[i_b].shape[0] == n_contacts
+                            if value[i_b].dtype in (gs.tc_int, gs.np_int):
+                                assert (value[i_b] != -1).all()
+
         if (gs_n_contacts > 0).all():
             break
     else:
         assert False
     gs_sim.rigid_solver._kernel_forward_dynamics()
     gs_sim.rigid_solver._func_constraint_force()
 
-    # Make sure that contact info accessor is working
-    for as_tensor in (False, True):
-        for to_torch in (False, True):
-            contacts_info = gs_sim.rigid_solver.collider.get_contacts(as_tensor, to_torch)
+    gs_robot.get_contacts()
 
     # Make sure that all the robots ends up in the different state
     qposs = gs_robot.get_qpos().cpu()
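
The updated test leans on the padding convention of `get_contacts`: slots beyond the per-environment contact count stay at `-1` for integer fields and `0` for float fields. A self-contained numpy illustration of that check, with made-up contact counts:

```python
import numpy as np

n_envs, n_contacts_max = 2, 4
n_contacts = np.array([3, 1])  # pretend per-env contact counts

# Integer field (e.g. link indices): valid slots filled, the rest padded with -1.
link_a = np.full((n_envs, n_contacts_max), -1, dtype=np.int32)
for i_b in range(n_envs):
    link_a[i_b, : n_contacts[i_b]] = np.arange(n_contacts[i_b])

for i_b in range(n_envs):
    assert (link_a[i_b, : n_contacts[i_b]] != -1).all()  # valid entries
    assert (link_a[i_b, n_contacts[i_b]:] == -1).all()   # padding
```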
