Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Benchmark the AirFRANS datapipe throughput via the physicsnemo DataLoader.

Instantiates the full pipeline from the Hydra config, wraps it in
physicsnemo.datapipes.DataLoader with the same collate as training, and
measures wall-clock time per batch over N iterations.

Usage
-----
# Arrow reader (default; dataset_path from conf/config.yaml)
python benchmark_datapipe.py

# Override config from CLI
python benchmark_datapipe.py dataset_path=/path/to/arrow +n_samples=50

# VTK reader
python benchmark_datapipe.py reader=vtk data_dir=/path/to/vtk
"""

from __future__ import annotations

import logging
import statistics
import time
from typing import Any, Sequence

import hydra
import torch
from omegaconf import DictConfig, OmegaConf
from torch.utils.data import SequentialSampler

from physicsnemo.datapipes import DataLoader as PhysicsnemoDataLoader
from physicsnemo.datapipes import Dataset as PhysicsnemoDataset
from tensordict import TensorDict

from physicsnemo_dataset import _structured_tensordict_to_airfrans_sample

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Queried once at import time so per-batch timing code avoids repeated
# torch.cuda.is_available() calls.
_CUDA_AVAILABLE = torch.cuda.is_available()
# Bytes per mebibyte; used to convert torch.cuda memory counters to MB.
_BYTES_PER_MB = 2**20


def collate_single(
    samples: Sequence[tuple[TensorDict, dict[str, Any]]],
):
    """Collate function for batch_size=1 loaders.

    Unpacks the single (TensorDict, metadata) pair from the batch and
    converts the structured TensorDict into an AirFRANS sample, matching
    the collate used during training.
    """
    first_td, _meta = samples[0]
    return _structured_tensordict_to_airfrans_sample(first_td)


def _gpu_memory_mb() -> dict[str, float] | None:
    """Report current and peak GPU memory in MB, or None without CUDA."""
    if not _CUDA_AVAILABLE:
        return None
    # Drain queued kernels so the allocator counters reflect finished work.
    torch.cuda.synchronize()
    raw_bytes = {
        "allocated": torch.cuda.memory_allocated(),
        "reserved": torch.cuda.memory_reserved(),
        "max_allocated": torch.cuda.max_memory_allocated(),
        "max_reserved": torch.cuda.max_memory_reserved(),
    }
    return {f"{name}_mb": value / _BYTES_PER_MB for name, value in raw_bytes.items()}


def benchmark(
    dataloader: PhysicsnemoDataLoader,
    n_samples: int,
) -> tuple[list[float], dict[str, float] | None]:
    """Measure per-batch wall-clock fetch latency over the dataloader.

    Runs one untimed warm-up batch, resets the CUDA peak-memory counters,
    then times each ``next()`` call on a fresh iterator.

    Parameters
    ----------
    dataloader : PhysicsnemoDataLoader
        Loader to benchmark; must expose ``.dataset`` supporting ``len()``.
    n_samples : int
        Upper bound on the number of batches to time (capped at dataset size).

    Returns
    -------
    tuple[list[float], dict[str, float] | None]
        Per-batch times in seconds, and GPU memory stats in MB
        (``None`` when CUDA is unavailable).
    """
    dataset = dataloader.dataset
    actual_n = min(n_samples, len(dataset))
    if actual_n == 0:
        logger.warning("Dataset is empty — nothing to benchmark.")
        # _gpu_memory_mb() already returns None when CUDA is unavailable,
        # so the previous `if _CUDA_AVAILABLE else None` guard was redundant.
        return [], _gpu_memory_mb()

    logger.info("Warming up (1 batch)...")
    _ = next(iter(dataloader))

    if _CUDA_AVAILABLE:
        # Exclude warm-up allocations from the reported peak.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    logger.info("Timing %d batches...", actual_n)
    times: list[float] = []
    it = iter(dataloader)
    for _ in range(actual_n):
        t0 = time.perf_counter()
        _ = next(it)
        times.append(time.perf_counter() - t0)

    gpu_stats = _gpu_memory_mb()
    return times, gpu_stats


def print_results(
    times: list[float],
    gpu_stats: dict[str, float] | None = None,
) -> None:
    """Pretty-print a timing summary table, plus GPU memory when provided."""
    if not times:
        # Nothing was timed; print nothing.
        return

    n = len(times)
    total = sum(times)
    spread = statistics.stdev(times) if n > 1 else 0.0
    rate = n / total if total > 0 else 0.0

    header = (
        f"{'Samples':>8s} {'Total (s)':>10s} {'Mean (s)':>10s} "
        f"{'Median (s)':>11s} {'Std (s)':>10s} {'Throughput':>12s}"
    )
    row = (
        f"{n:>8d} {total:>10.3f} {statistics.mean(times):>10.4f} "
        f"{statistics.median(times):>11.4f} {spread:>10.4f} {rate:>10.2f}/s"
    )
    rule = "-" * len(header)
    for line in ("", rule, header, rule, row, rule):
        print(line)

    if gpu_stats is None:
        return

    print()
    print("GPU memory (peak during benchmark):")
    print(
        f" max allocated: {gpu_stats['max_allocated_mb']:.2f} MB "
        f"max reserved: {gpu_stats['max_reserved_mb']:.2f} MB"
    )
    print(
        f" current allocated: {gpu_stats['allocated_mb']:.2f} MB "
        f"current reserved: {gpu_stats['reserved_mb']:.2f} MB"
    )
    print()


@hydra.main(
    version_base=None,
    config_path="./conf",
    config_name="config",
)
def main(cfg: DictConfig) -> None:
    """Instantiate the configured datapipe, benchmark it, and print a report."""
    print("=== AirFRANS Datapipe Benchmark ===")
    print()
    print(OmegaConf.to_yaml(cfg, resolve=True))

    # Overridable on the CLI via `+n_samples=...`; defaults to 100 batches.
    n_samples: int = cfg.get("n_samples", 100)

    logger.info("Building physicsnemo dataloader...")
    dataset: PhysicsnemoDataset = hydra.utils.instantiate(cfg.dataset)
    dataloader = PhysicsnemoDataLoader(
        dataset,
        batch_size=1,
        sampler=SequentialSampler(dataset),
        collate_fn=collate_single,
    )
    logger.info("Dataset size: %d samples", len(dataset))

    timings, memory = benchmark(dataloader, n_samples)
    print_results(timings, memory)

    dataset.close()


# Script entry point; Hydra parses CLI overrides inside main().
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

defaults:
- reader@reader: arrow
- transforms: transforms
- _self_

# Data paths (set via CLI overrides or environment)
# Path to the AirFRANS dataset root. No usable default is shipped — provide
# it via CLI override, e.g. `dataset_path=/your/path`.
#   reader=arrow expects the HuggingFace/Arrow export directory
#   reader=vtk expects the raw AirFRANS `Dataset/` directory
dataset_path: /path/to/airfrans/dataset
Comment on lines +23 to +24
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hardcoded internal NVIDIA cluster path in public config

The dataset_path default is set to a NVIDIA-internal Lustre path (/lustre/fsw/portfolios/coreai/...). This path will not resolve for anyone outside the internal cluster and should be replaced with a placeholder, requiring users to provide their own path via CLI override.

Suggested change
dataset_path: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/datasets/airfrans/huggingface/ # used by reader=arrow
# dataset_path: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/datasets/airfrans/Dataset/ # used by reader=vtk
dataset_path: /path/to/airfrans/dataset # set via CLI: dataset_path=/your/path


# Dataset parameters (interpolated into the reader and dataset below)
task: full
split: train
device: auto

# Reader (populated from the `reader@reader` entry in defaults)
reader: {}

# Dataset wrapping the reader + transforms
dataset:
  _target_: physicsnemo.datapipes.Dataset
  device: ${device}
  reader: ${reader}
  # Transforms referenced from transforms.yaml; presumably applied in the
  # listed order — confirm against physicsnemo.datapipes.Dataset.
  transforms:
    - ${transforms.gradients}
    - ${transforms.normals}
    - ${transforms.freestream}
    - ${transforms.nondimensionalize}
    - ${transforms.forces}
    - ${transforms.patch}
    - ${transforms.to_airfrans_sample}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Arrow-backed AirFRANS reader (Hydra instantiation target).
_target_: pipeline.arrow_reader.AirFRANSArrowReader
dataset_path: ${dataset_path}  # interpolated from the top-level config
task: ${task}
split: ${split}
pin_memory: true  # presumably pins host tensors for faster H2D copies — confirm in reader
include_index_in_metadata: true  # keep the sample index available in metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# VTK-backed AirFRANS reader (Hydra instantiation target); same options as
# the arrow reader so the two are interchangeable via `reader=...`.
_target_: pipeline.vtk_reader.AirFRANSVTKReader
dataset_path: ${dataset_path}  # interpolated from the top-level config
task: ${task}
split: ${split}
pin_memory: true  # presumably pins host tensors for faster H2D copies — confirm in reader
include_index_in_metadata: true  # keep the sample index available in metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Hydra-instantiable transform definitions for the AirFRANS pipeline.
# config.yaml references these via ${transforms.<name>} when composing the
# dataset's transform list.

gradients:
  _target_: pipeline.transforms.ComputeGradients

normals:
  _target_: pipeline.transforms.ComputeAirfoilNormals

freestream:
  _target_: pipeline.transforms.ComputeFreestreamQuantities

nondimensionalize:
  _target_: pipeline.transforms.NondimensionalizeFields

forces:
  _target_: pipeline.transforms.ComputeForceCoefficients

patch:
  _target_: pipeline.transforms.PatchNonPhysicalValues
  # NOTE(review): threshold presumably bounds acceptable field values and
  # warn_fraction the tolerated fraction of patched points — confirm against
  # PatchNonPhysicalValues' signature.
  threshold: 1.02
  warn_fraction: 0.0001

to_airfrans_sample:
  _target_: pipeline.transforms.ToAirFRANSSampleStructure
Loading