Skip to content

Commit 132aa5f

Browse files
Ben Carver and facebook-github-bot
authored and committed
Remove nccl_cvars.(h|cc) in favor of genrule-based generation
Summary: This diff removes the `nccl_cvars.cc` and `nccl_cvars.h` files now that they'll instead be built/generated/provided by `genrule`. This diff also updates the MCCL build script, `comms/mccl/build/build.sh`, to be compatible with the new `genrule`-based build for the `ncclx-cvars` library. **Note**: this diff was originally published, landed, and reverted as D87668052. Differential Revision: D88748045
1 parent ca1c031 commit 132aa5f

File tree

6 files changed

+194
-7856
lines changed

6 files changed

+194
-7856
lines changed

build_ncclx.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,41 @@ if [[ -z "${NCCL_BUILD_SKIP_DEPS}" ]]; then
248248
build_comms_tracing_service
249249
fi
250250

251+
# Generate nccl_cvars files (these are no longer checked into the repo)
252+
# The files are generated by extractcvars.py which reads nccl_cvars.yaml and nccl_cvars.cc.in
253+
echo "Generating nccl_cvars files..."
254+
CVARS_DIR="$BASE_DIR/comms/utils/cvars"
255+
256+
# Validate that the required source files exist
257+
if [ ! -f "$CVARS_DIR/extractcvars.py" ]; then
258+
echo "ERROR: extractcvars.py not found at $CVARS_DIR/extractcvars.py"
259+
exit 1
260+
fi
261+
if [ ! -f "$CVARS_DIR/nccl_cvars.yaml" ]; then
262+
echo "ERROR: nccl_cvars.yaml not found at $CVARS_DIR/nccl_cvars.yaml"
263+
exit 1
264+
fi
265+
if [ ! -f "$CVARS_DIR/nccl_cvars.cc.in" ]; then
266+
echo "ERROR: nccl_cvars.cc.in not found at $CVARS_DIR/nccl_cvars.cc.in"
267+
exit 1
268+
fi
269+
270+
# Install ruamel.yaml (required by extractcvars.py) unless NCCL_SKIP_CONDA_INSTALL is set
271+
if [[ -z "${NCCL_SKIP_CONDA_INSTALL}" ]]; then
272+
conda install ruamel.yaml --yes
273+
fi
274+
275+
# Run the extractcvars.py script directly to generate the files
276+
export NCCL_CVARS_OUTPUT_DIR="$CVARS_DIR"
277+
python3 "$CVARS_DIR/extractcvars.py"
278+
279+
# Verify the files were generated
280+
if [ ! -f "$CVARS_DIR/nccl_cvars.h" ] || [ ! -f "$CVARS_DIR/nccl_cvars.cc" ]; then
281+
echo "ERROR: Failed to generate nccl_cvars files"
282+
exit 1
283+
fi
284+
echo "Successfully generated nccl_cvars files in $CVARS_DIR"
285+
251286
# set up the third-party ldflags
252287
export PKG_CONFIG_PATH="${CONDA_LIB_DIR}"/pkgconfig
253288
THRIFT_SERVICE_LDFLAGS=(

build_rcclx.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,41 @@ if [[ -z "${NCCL_BUILD_SKIP_DEPS}" ]]; then
216216
build_third_party
217217
fi
218218

219+
# Generate nccl_cvars files (these are no longer checked into the repo)
220+
# The files are generated by extractcvars.py which reads nccl_cvars.yaml and nccl_cvars.cc.in
221+
echo "Generating nccl_cvars files..."
222+
CVARS_DIR="$BASE_DIR/comms/utils/cvars"
223+
224+
# Validate that the required source files exist
225+
if [ ! -f "$CVARS_DIR/extractcvars.py" ]; then
226+
echo "ERROR: extractcvars.py not found at $CVARS_DIR/extractcvars.py"
227+
exit 1
228+
fi
229+
if [ ! -f "$CVARS_DIR/nccl_cvars.yaml" ]; then
230+
echo "ERROR: nccl_cvars.yaml not found at $CVARS_DIR/nccl_cvars.yaml"
231+
exit 1
232+
fi
233+
if [ ! -f "$CVARS_DIR/nccl_cvars.cc.in" ]; then
234+
echo "ERROR: nccl_cvars.cc.in not found at $CVARS_DIR/nccl_cvars.cc.in"
235+
exit 1
236+
fi
237+
238+
# Install ruamel.yaml (required by extractcvars.py) unless NCCL_SKIP_CONDA_INSTALL is set
239+
if [[ -z "${NCCL_SKIP_CONDA_INSTALL}" ]]; then
240+
conda install ruamel.yaml --yes
241+
fi
242+
243+
# Run the extractcvars.py script directly to generate the files
244+
export NCCL_CVARS_OUTPUT_DIR="$CVARS_DIR"
245+
python3 "$CVARS_DIR/extractcvars.py"
246+
247+
# Verify the files were generated
248+
if [ ! -f "$CVARS_DIR/nccl_cvars.h" ] || [ ! -f "$CVARS_DIR/nccl_cvars.cc" ]; then
249+
echo "ERROR: Failed to generate nccl_cvars files"
250+
exit 1
251+
fi
252+
echo "Successfully generated nccl_cvars files in $CVARS_DIR"
253+
219254
if [ "$CLEAN_BUILD" == 1 ]; then
220255
rm -rf "$BUILDDIR"
221256
fi

comms/ncclx/v2_27/maint/oss_build.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,41 @@ fi
288288

289289
mkdir -p $BUILDDIR
290290

291+
# Generate nccl_cvars files (these are no longer checked into the repo)
292+
# The files are generated by extractcvars.py which reads nccl_cvars.yaml and nccl_cvars.cc.in
293+
echo "Generating nccl_cvars files..."
294+
CVARS_DIR="$FBCODE_DIR/comms/utils/cvars"
295+
296+
# Validate that the required source files exist
297+
if [ ! -f "$CVARS_DIR/extractcvars.py" ]; then
298+
echo "ERROR: extractcvars.py not found at $CVARS_DIR/extractcvars.py"
299+
exit 1
300+
fi
301+
if [ ! -f "$CVARS_DIR/nccl_cvars.yaml" ]; then
302+
echo "ERROR: nccl_cvars.yaml not found at $CVARS_DIR/nccl_cvars.yaml"
303+
exit 1
304+
fi
305+
if [ ! -f "$CVARS_DIR/nccl_cvars.cc.in" ]; then
306+
echo "ERROR: nccl_cvars.cc.in not found at $CVARS_DIR/nccl_cvars.cc.in"
307+
exit 1
308+
fi
309+
310+
# Install ruamel.yaml (required by extractcvars.py) unless SKIP_CONDA_INSTALL is set
311+
if [ -z "$SKIP_CONDA_INSTALL" ]; then
312+
conda install -p "$CONDA_DIR" ruamel.yaml --yes
313+
fi
314+
315+
# Run the extractcvars.py script directly to generate the files
316+
export NCCL_CVARS_OUTPUT_DIR="$CVARS_DIR"
317+
python3 "$CVARS_DIR/extractcvars.py"
318+
319+
# Verify the files were generated
320+
if [ ! -f "$CVARS_DIR/nccl_cvars.h" ] || [ ! -f "$CVARS_DIR/nccl_cvars.cc" ]; then
321+
echo "ERROR: Failed to generate nccl_cvars files"
322+
exit 1
323+
fi
324+
echo "Successfully generated nccl_cvars files in $CVARS_DIR"
325+
291326
# Use nccl relative to fbcode dir (configurable for Docker builds)
292327
export NCCL_HOME=${NCCL_HOME:-$FBCODE_DIR/comms/ncclx/v2_27}
293328
pushd "${NCCL_HOME}"

comms/utils/cvars/README.md

Lines changed: 89 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,115 @@
11
# Custom VARS
22

3-
NCCLX CVARS - Strongly typed configurable knobs for NCCLX. All
4-
configuration knobs are defined here, and can be used in source
5-
file by including `nccl_cvars.h` and use typed CVAR by its name.
3+
NCCLX CVARS - Strongly typed configurable knobs for NCCLX. All configuration
4+
knobs are defined here and can be used in a source file by including
5+
`nccl_cvars.h` and referring to the typed CVAR by its name.
66

77
## User Guide
88

9-
Refer to `nccl_cvars.yaml` for CVAR documentation and their default
10-
values.
9+
Refer to `nccl_cvars.yaml` for CVAR documentation and their default values.
1110

1211
A CVAR can be provided to a program in two ways:
13-
1) Environment variable - e.g. `NCCL_DEBUG=warn nccl_allreduce_perf ...`
14-
2) Config Variable - define in `/etc/nccl.conf` and it'll be picked up
12+
13+
1. Environment variable - e.g. `NCCL_DEBUG=warn nccl_allreduce_perf ...`
14+
2. Config Variable - define in `/etc/nccl.conf` and it'll be picked up
1515
automatically by the program.
1616

17-
Environment variable will take precedence over config variable. If not
18-
specified in either, the default value will be used.
17+
An environment variable takes precedence over a config variable. If a CVAR is
18+
not specified in either, its default value is used.
1919

2020
## Developer Guide
2121

2222
All CVARs are defined in `nccl_cvars.yaml`. To add a new CVAR:
23-
1) Add the CVAR definition in `nccl_cvars.yaml`
24-
2) Build any target that depends on `//comms/utils/cvars:ncclx-cvars` - the files will be auto-generated via genrule
25-
3) Include `#include "comms/utils/cvars/nccl_cvars.h"` and use your CVAR in program
2623

27-
**Note:** `nccl_cvars.h` and `nccl_cvars.cc` are now generated at build time using a genrule.
28-
They should **NOT** be manually edited or committed to the repository. The genrule automatically
29-
generates these files from `nccl_cvars.yaml` using `extractcvars.py` whenever you build a target
30-
that depends on the `ncclx-cvars` library.
24+
1. Add the CVAR definition in `nccl_cvars.yaml`
25+
2. Build any target that depends on `//comms/utils/cvars:ncclx-cvars` - the
26+
files will be auto-generated via genrule
27+
3. Add `#include "comms/utils/cvars/nccl_cvars.h"` and use your CVAR in
28+
your program
29+
30+
**Note:** `nccl_cvars.h` and `nccl_cvars.cc` are now generated at build time
31+
using a genrule. They should **NOT** be manually edited or committed to the
32+
repository. The genrule automatically generates these files from
33+
`nccl_cvars.yaml` using `extractcvars.py` whenever you build a target that
34+
depends on the `ncclx-cvars` library.
3135

3236
To regenerate the files manually (for development/testing), you can run:
37+
3338
```bash
3439
cd ~/fbsource/fbcode && buck2 run comms/utils/cvars:extractcvars
3540
```
3641

37-
The CVAR is initialized as part of ncclInit and it is done by `initEnv` from `init.cc`. CVAR
38-
must not be used before initialization.
42+
CVARs are initialized as part of ncclInit, by `initEnv` in `init.cc`.
43+
A CVAR must not be used before initialization.
44+
45+
## Including CVARs in Build Scripts (OSS/Non-Buck Builds)
46+
47+
For OSS builds or other build scripts that don't use Buck2, you need to generate
48+
the `nccl_cvars.h` and `nccl_cvars.cc` files before building. There are two
49+
approaches:
50+
51+
### Option 1: Using Buck2 Genrule (when Buck2 is available)
52+
53+
If Buck2 is available in your build environment, you can use the genrule to
54+
generate the files:
55+
56+
```bash
57+
GENRULE_OUTPUT=$(buck2 build fbcode//comms/utils/cvars:generate_nccl_cvars --show-full-output 2>&1 | grep "generate_nccl_cvars" | awk '{print $2}')
58+
if [ -n "$GENRULE_OUTPUT" ]; then
59+
cp "$GENRULE_OUTPUT/nccl_cvars.h" "$CVARS_DIR/nccl_cvars.h"
60+
cp "$GENRULE_OUTPUT/nccl_cvars.cc" "$CVARS_DIR/nccl_cvars.cc"
61+
fi
62+
```
63+
64+
### Option 2: Running extractcvars.py Directly (recommended for OSS builds)
65+
66+
For builds outside of Buck2 (e.g., conda/Docker builds), run the
67+
`extractcvars.py` script directly:
68+
69+
```bash
70+
# Set the output directory for the generated files
71+
CVARS_DIR="$FBCODE_DIR/comms/utils/cvars"
72+
73+
# Validate that the required source files exist
74+
if [ ! -f "$CVARS_DIR/extractcvars.py" ]; then
75+
echo "ERROR: extractcvars.py not found"
76+
exit 1
77+
fi
78+
if [ ! -f "$CVARS_DIR/nccl_cvars.yaml" ]; then
79+
echo "ERROR: nccl_cvars.yaml not found"
80+
exit 1
81+
fi
82+
if [ ! -f "$CVARS_DIR/nccl_cvars.cc.in" ]; then
83+
echo "ERROR: nccl_cvars.cc.in not found"
84+
exit 1
85+
fi
86+
87+
# Install ruamel.yaml (required by extractcvars.py)
88+
conda install ruamel.yaml --yes # or: pip install ruamel.yaml
89+
90+
# Run the script to generate the files
91+
export NCCL_CVARS_OUTPUT_DIR="$CVARS_DIR"
92+
python3 "$CVARS_DIR/extractcvars.py"
93+
94+
# Verify the files were generated
95+
if [ ! -f "$CVARS_DIR/nccl_cvars.h" ] || [ ! -f "$CVARS_DIR/nccl_cvars.cc" ]; then
96+
echo "ERROR: Failed to generate nccl_cvars files"
97+
exit 1
98+
fi
99+
```
39100

40101
## Changed NCCL CVAR Default values
41102

42-
NCCL_RAS_ENABLE - default value changed from 1 to 0
43-
NCCL_CTRAN_IB_MAX_QPS - default value changed from 1 to 16
44-
NCCL_CTRAN_IB_QP_MAX_MSGS - default value changed from 4 to 128
45-
NCCL_CTRAN_IB_QP_SCALING_THRESHOLD - default value changed from 131072 to 524288
46-
NCCL_CTRAN_IB_QP_CONFIG_XDC - default value changed from "" to "1048576,16,spray,128"
47-
NCCL_CTRAN_IB_QP_CONFIG_XRACK - default value changed from "" to "1048576,16,spray,128"
48-
NCCL_CTRAN_IB_QP_CONFIG_XZONE - default value changed from "" to "1048576,16,spray,128"
49-
NCCL_CTRAN_IB_VC_MODE - default value changed from "spray" to "dqplb"
103+
- NCCL_RAS_ENABLE - default value changed from 1 to 0
104+
- NCCL_CTRAN_IB_MAX_QPS - default value changed from 1 to 16
105+
- NCCL_CTRAN_IB_QP_MAX_MSGS - default value changed from 4 to 128
106+
- NCCL_CTRAN_IB_QP_SCALING_THRESHOLD - default value changed from 131072 to 524288
107+
- NCCL_CTRAN_IB_QP_CONFIG_XDC - default value changed from "" to "1048576,16,spray,128"
108+
- NCCL_CTRAN_IB_QP_CONFIG_XRACK - default value changed from "" to "1048576,16,spray,128"
109+
- NCCL_CTRAN_IB_QP_CONFIG_XZONE - default value changed from "" to "1048576,16,spray,128"
110+
- NCCL_CTRAN_IB_VC_MODE - default value changed from "spray" to "dqplb"
50111

51112
## NCCL Baseline Adapter
52113

53-
The NCCL Baseline Adapter API is designed to provide a similar interface to the baseline/third-party NCCL library's `ncclGetEnv` and `ncclLoadParam` functions.
114+
The NCCL Baseline Adapter API is designed to provide a similar interface to the
115+
baseline/third-party NCCL library's `ncclGetEnv` and `ncclLoadParam` functions.

0 commit comments

Comments
 (0)