@@ -13,7 +13,7 @@ usage() {
     echo "  Usage: $0 [OPTIONS]"
     echo ""
     echo "    OPTIONS                       DESCRIPTION"
-    echo "    -a, --additional-args         Additional args to pass to MaxText/train.py"
+    echo "    -a, --additional-args         Additional args to pass to MaxText/train.py. Can be passed multiple times."
     echo "    --mem-fraction                Specify the fraction of memory to preallocate for XLA. Example: 0.90, 0.85, 0.65. Defaults to 0.90, overriding the JAX default of 0.75."
     echo "    --model-name                  Specify the model name to run [Preferred]. If you specify a model name you do not need to specify decoder-block. Currently supported ootb models:
                                             gemma-2b, gemma-7b, gpt3-175b, gpt3-22b, gpt3-52k, gpt3-6b, llama2-13b, llama2-70b, llama2-7b, llama3-70b, llama3-8b, mistral-7b, mixtral-8x7b"
@@ -34,7 +34,7 @@ usage() {
     1. test-maxtext.sh -b 2 --model-name=gpt3-52k
     2. test-maxtext.sh -b 2 --model-name=gemma-2b --dtype=fp8
     3. test-maxtext.sh -n 1 -b 2 --model-name=llama2-7b --attn-type=cudnn_flash_te --remat-policy=minimal_flash --steps=10 --fsdp=8 --output train_output --multiprocess
-    4. test-maxtext.sh -n 1 -b 2 --model-name=llama2-7b --attn-type=cudnn_flash_te --remat-policy=minimal_flash --steps=10 --fsdp=8 --output train_output --multiprocess -a scan_layers=false max_target_length=4096 use_iota_embed=true logits_dot_in_fp32=false
+    4. test-maxtext.sh -n 1 -b 2 --model-name=llama2-7b --attn-type=cudnn_flash_te --remat-policy=minimal_flash --steps=10 --fsdp=8 --output train_output --multiprocess -a "scan_layers=false max_target_length=4096 use_iota_embed=true logits_dot_in_fp32=false"
     5. test-maxtext.sh -n 1 -b 2 --model-name=llama2-7b --attn-type=cudnn_flash_te --remat-policy=minimal_flash --dtype=fp8 --steps=10 --fsdp=8 --output train_output --multiprocess
     6. test-maxtext.sh -n 8 -b 2 --model-name=llama2-7b --attn-type=cudnn_flash_te --remat-policy=minimal_flash --steps=10 --output train_output --fsdp=8 --data-parallel=8 --multiprocess
     7. test-maxtext.sh -n 8 -b 2 --model-name=llama2-7b --attn-type=cudnn_flash_te --remat-policy=minimal_flash --steps=10 --output train_output --fsdp=4 --tensor-parallel=2 --data-parallel=8 --multiprocess
@@ -76,7 +76,7 @@ eval set -- "$args"
 while [ : ]; do
     case "$1" in
         -a | --additional-args)
-            ADDITIONAL_ARGS="$2"
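+            # append rather than overwrite, so repeated -a flags accumulate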
+            ADDITIONAL_ARGS="$ADDITIONAL_ARGS $2"
             shift 2
             ;;
         --mem-fraction)
@@ -245,22 +245,58 @@ RUN_NAME="logdir" ## the RUN_NAME cannot be changed
 if [ -z "$DECODER_BLOCK" ]; then

     # this branch is used to test the different models supported out of the box (ootb)
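     # with --model-name, MaxText is expected to pull in that model's preset config; the flags below only set run-level options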
-    RUN_SETTINGS="MaxText/train.py MaxText/configs/base.yml run_name=${RUN_NAME} model_name=${MODEL} \
-        steps=$STEPS per_device_batch_size=${BATCH_PER_GPU} remat_policy=${REMAT_POLICY} enable_checkpointing=false \
-        base_output_directory=$OUTPUT dataset_path=local dataset_type=synthetic hardware=$HARDWARE \
-        dcn_fsdp_parallelism=$dcn_FSDP ici_fsdp_parallelism=$ici_FSDP \
-        ici_data_parallelism=$ici_DP dcn_data_parallelism=$dcn_DP \
-        ici_tensor_parallelism=$ici_TP dcn_tensor_parallelism=1 ${ADDITIONAL_ARGS}"
-
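+    # ici_* settings control parallelism within a node (inter-chip interconnect) and dcn_* settings
+    # control parallelism across nodes (data-center network); goodput recording/monitoring is
+    # switched off since it is not needed for these synthetic benchmark runs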
+    RUN_SETTINGS="MaxText/train.py \
+        MaxText/configs/base.yml \
+        run_name=${RUN_NAME} \
+        model_name=${MODEL} \
+        steps=${STEPS} \
+        per_device_batch_size=${BATCH_PER_GPU} \
+        remat_policy=${REMAT_POLICY} \
+        enable_checkpointing=false \
+        base_output_directory=${OUTPUT} \
+        dataset_path=local \
+        dataset_type=synthetic \
+        hardware=${HARDWARE} \
+        enable_goodput_recording=false \
+        monitor_goodput=false \
+        dcn_fsdp_parallelism=${dcn_FSDP} \
+        ici_fsdp_parallelism=${ici_FSDP} \
+        ici_data_parallelism=${ici_DP} \
+        dcn_data_parallelism=${dcn_DP} \
+        ici_tensor_parallelism=${ici_TP} \
+        dcn_tensor_parallelism=1 \
+        ${ADDITIONAL_ARGS}"
 else
     # this branch is essentially used for CI runs
-    RUN_SETTINGS="MaxText/train.py MaxText/configs/base.yml run_name=${RUN_NAME} logits_via_embedding=true decoder_block=${DECODER_BLOCK} \
-        steps=$STEPS per_device_batch_size=${BATCH_PER_GPU} base_emb_dim=2560 base_mlp_dim=8192 remat_policy=${REMAT_POLICY} attention=${ATTN_TYPE} \
-        base_num_query_heads=8 base_num_kv_heads=8 base_num_decoder_layers=8 head_dim=128 enable_checkpointing=false \
-        base_output_directory=$OUTPUT dataset_path=local dataset_type=synthetic hardware=$HARDWARE \
-        dcn_fsdp_parallelism=$dcn_FSDP ici_fsdp_parallelism=$ici_FSDP \
-        ici_data_parallelism=$ici_DP dcn_data_parallelism=$dcn_DP \
-        ici_tensor_parallelism=$ici_TP dcn_tensor_parallelism=1 ${ADDITIONAL_ARGS}"
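+    # decoder_block plus the base_* / head_dim values below define a small synthetic
+    # decoder-only model so the CI run stays lightweight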
+    RUN_SETTINGS="MaxText/train.py \
+        MaxText/configs/base.yml \
+        run_name=${RUN_NAME} \
+        decoder_block=${DECODER_BLOCK} \
+        steps=${STEPS} \
+        per_device_batch_size=${BATCH_PER_GPU} \
+        base_emb_dim=2560 \
+        base_mlp_dim=8192 \
+        remat_policy=${REMAT_POLICY} \
+        attention=${ATTN_TYPE} \
+        base_num_query_heads=8 \
+        base_num_kv_heads=8 \
+        base_num_decoder_layers=8 \
+        head_dim=128 \
+        logits_via_embedding=true \
+        enable_checkpointing=false \
+        base_output_directory=${OUTPUT} \
+        dataset_path=local \
+        dataset_type=synthetic \
+        hardware=${HARDWARE} \
+        enable_goodput_recording=false \
+        monitor_goodput=false \
+        dcn_fsdp_parallelism=${dcn_FSDP} \
+        ici_fsdp_parallelism=${ici_FSDP} \
+        ici_data_parallelism=${ici_DP} \
+        dcn_data_parallelism=${dcn_DP} \
+        ici_tensor_parallelism=${ici_TP} \
+        dcn_tensor_parallelism=1 \
+        ${ADDITIONAL_ARGS}"
 fi

 echo "Command: python3 $RUN_SETTINGS"
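 # for example, "test-maxtext.sh -b 2 --model-name=gpt3-52k" prints something along the lines of:
 #   Command: python3 MaxText/train.py MaxText/configs/base.yml run_name=logdir model_name=gpt3-52k steps=... per_device_batch_size=2 ... dataset_type=synthetic ... dcn_tensor_parallelism=1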