Commit b75a718

bvanessen and benson31 authored

Ci enable distconv (#2235)
* Enable CI testing for DistConv:
  * Added DistConv CI tests.
  * Added Corona DistConv test and disabled FFT on ROCm.
  * Ensure that DistConv tests keep error signals.
  * Enable NVSHMEM on Lassen.
  * Added a multi-stage pipeline for Lassen.
  * Fixed a typo and disabled other tests.
  * Added Spack environment.
  * Added check stage for the Catch2 tests.
  * Added the definition of the RESULTS_DIR environment variable.
  * Added release notes.
  * Fixed the launcher for Catch2 tests.
  * Changed the batch launch commands to be interactive to block completion.
  * Added a wrapper shell script for launching the unit tests.
  * Added the number of nodes for the unit test.
  * Cleaned up launching paths.
  * Added execute permissions for the unit test script.
  * Ingest the Spack dependent environment information.
  * Fixed the launch command and the exclusion of externallayer.
  * Bugfix for Python.
  * Added number of tasks per node.
  * Added integration tests.
  * Set some NVSHMEM runtime variables.
  * Uniquify the CI JOB_NAME fields for DistConv tests.
* Re-introduced the WITH_CLEAN_BUILD flag.
* Adapting new tests to the new build script framework with modules.
* Increased the time limit for the build on Lassen. Code cleanup.
* Removed the duplicate get_distconv_environment function in the ci_test common Python tools. Switched all tests to using the standard contrib args version.
* Changed the default behavior on MIOpen systems to use a local cache for JIT state.
* Added back note about existing issue in DiHydrogen.
* Enable CI runs to specify a subset of unit tests to run.
* Tweaking the allowed runtimes for tests.
* Debugging the test selection. Increasing some test time limits.
* Added test filter flags to all systems.
* Increasing time limits.
* Added flags to skip integration tests on DistConv CI runs.
* Bumped up the pooling time limit.
* Testing out setting a set of MIOpen DB cache directories for CI testing, both for normal users and lbannusr.
* Adding caching options for Corona and changed how the username is queried.
* Updated CI tests to use common MIOpen caches. Split user and custom cache paths.
* Fix the Lassen multi-stage pipeline to record the Spack architecture.
* Increase the build time limit on Lassen.
* Fixed the new Lassen build to avoid installing pytest through Spack.
* Added the clean build flags into the multi-stage pipeline.
* Skip failing tests in DistConv.
* Change the test utils to set the cluster value to None, rather than "unset", if it is not set.
* Added support for passing in the system cluster name by default if it is known.
* Cleanup of the paths for the MIOpen caches.
* Added a guard to skip the in-place test if DistConv is disabled.
* Removing unnecessary variable definitions.
* ResNet tests should run on Corona.
* Added support in the data coordinator for explicitly recording the dimensions of each field. This is then compared to the dimensions reported from the data reader, or, if there are no valid data readers, it can be substituted. Note that for the time being this is redundant information, but it allows the Catch2 test to properly construct a model without data readers. This fixes a bug seen on Corona where the MPI Catch2 tests were failing because they allocated a set of buffers with a size of -1. Cleaned up the way in which the data coordinator checks for linearized size to reduce code duplication. Switched the data type of the data field dimensions to use El::Int rather than int values. Added a utility function to type cast between two vectors.
* Force Lassen to clean build.
* Fixed the legacy HDF5 data reader.
* Increased the timeout for the Lassen build and test.
* Bumped up the time limit on the Catch2 tests for ROCm systems.
* Increase the Catch2 test sizes.
* Trying to avoid forcing static linking when using NVSHMEM.
* Changed the run-catch-tests script for Flux to use a flux proxy.
* Export the LBANN setup for Lassen unit and integration tests.
* Minimize what is saved from the Catch2 unit tests.
* Cleaning up the environment variables.
* Added a flag to extend the Spack env name.
* Tweaking the flux proxy.
* Change how the NVSHMEM variables are set up so that the .before_script sections do not collide.
* Removed the -o cpu-affinity=per-task flag from the flux run commands on the Catch2 tests because it was causing a hang on Corona. Removed the nested flux proxy commands in the Flux Catch2 tests, since they should be unnecessary due to the flux proxy command that invokes the script.
* Tweak the flux commands to resolve a hang on Corona Catch2 tests.
* Cleaning up the flux launch commands on Tioga and Corona to help avoid a hang.
* Added a job name suffix variable.
* Ensure that the Spack environment names are unique.
* Tightened up the inclusion of the LBANN Python packages to avoid conflicts when using the test_compiler script to build LBANN.
* Added support to pip install into the LBANN build directory. Removed setting the --test=root command from the extra root packages to avoid triggering Spack build failures on Power systems.
* Updated the baseline modules used on Corona and package versions on Lassen.
* Fixing the allocation flux command for Tioga.
* Changing it so that only Corona adds the -o pmi=pmix flags to flux.
* Enable module generation for multiple core compilers.
* Making the flux commands consistent.
* Applied clang-format.
* Fixed the compiler path on Pascal.
* Re-enable the Lassen multi-stage DistConv test pipeline.
* Fixed how the new Lassen DistConv tests are invoked and avoid erroneously re-setting up the Spack environment. Changed the saved Spack environment name to SPACK_ENV_NAME. Cleaned up some dead code.
* Added a second if clause to the integration tests so that there is always at least one true clause, so the stage will schedule. Fixed the regex so that the distconv substring doesn't have to come at the start of the string.
* Consolidated the rules clause into a common one.
* Fix the rules regex.
* Added Corona numbers for ResNet.
* Tweaking the CI rules to avoid integration tests on DistConv builds.
* Tweaking how the Lassen unit tests are called.
* Disable NVSHMEM build on Lassen. Code cleanup and adding suggestions.
* Changed the guard in the ResNet-50 test.
* Disable NVSHMEM environment variables.
* Disabled Lassen DistConv unit tests.
* Apply suggestions from code review.

Co-authored-by: Tom Benson <[email protected]>
1 parent 73ef72b commit b75a718

File tree

69 files changed (+678, -231 lines)


.gitlab-ci.yml

Lines changed: 53 additions & 0 deletions

@@ -40,6 +40,19 @@ corona testing:
     strategy: depend
     include: .gitlab/corona/pipeline.yml
 
+corona distconv testing:
+  stage: run-all-clusters
+  variables:
+    JOB_NAME_SUFFIX: _distconv
+    SPACK_ENV_BASE_NAME_MODIFIER: "-distconv"
+    SPACK_SPECS: "+rocm +distconv"
+    WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}"
+    WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}"
+    TEST_FLAG: "test_*_distconv.py"
+  trigger:
+    strategy: depend
+    include: .gitlab/corona/pipeline.yml
+
 lassen testing:
   stage: run-all-clusters
   variables:
@@ -49,6 +62,20 @@ lassen testing:
     strategy: depend
     include: .gitlab/lassen/pipeline.yml
 
+lassen distconv testing:
+  stage: run-all-clusters
+  variables:
+    JOB_NAME_SUFFIX: _distconv
+    SPACK_ENV_BASE_NAME_MODIFIER: "-multi-stage-distconv"
+    SPACK_SPECS: "+cuda +distconv +fft"
+    # SPACK_SPECS: "+cuda +distconv +nvshmem +fft"
+    WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}"
+    WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}"
+    TEST_FLAG: "test_*_distconv.py"
+  trigger:
+    strategy: depend
+    include: .gitlab/lassen/multi_stage_pipeline.yml
+
 pascal testing:
   stage: run-all-clusters
   variables:
@@ -68,6 +95,19 @@ pascal compiler testing:
     strategy: depend
     include: .gitlab/pascal/pipeline_compiler_tests.yml
 
+pascal distconv testing:
+  stage: run-all-clusters
+  variables:
+    JOB_NAME_SUFFIX: _distconv
+    SPACK_SPECS: "%[email protected] +cuda +distconv +fft"
+    BUILD_SCRIPT_OPTIONS: "--no-default-mirrors"
+    WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}"
+    WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}"
+    TEST_FLAG: "test_*_distconv.py"
+  trigger:
+    strategy: depend
+    include: .gitlab/pascal/pipeline.yml
+
 tioga testing:
   stage: run-all-clusters
   variables:
@@ -76,3 +116,16 @@ tioga testing:
   trigger:
     strategy: depend
     include: .gitlab/tioga/pipeline.yml
+
+tioga distconv testing:
+  stage: run-all-clusters
+  variables:
+    JOB_NAME_SUFFIX: _distconv
+    SPACK_ENV_BASE_NAME_MODIFIER: "-distconv"
+    SPACK_SPECS: "+rocm +distconv"
+    WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}"
+    WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}"
+    TEST_FLAG: "test_*_distconv.py"
+  trigger:
+    strategy: depend
+    include: .gitlab/tioga/pipeline.yml
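Each new `* distconv testing` job sets `TEST_FLAG: "test_*_distconv.py"`, which the downstream pipelines pass straight to pytest so that only DistConv test files are collected. The glob's effect can be sketched in plain shell (the file names below are made up):

```shell
# Scratch directory with a mix of hypothetical CI test files.
tmp=$(mktemp -d)
touch "${tmp}/test_unit_layer_pooling.py" \
      "${tmp}/test_unit_layer_pooling_distconv.py" \
      "${tmp}/test_integration_resnet_distconv.py"
cd "${tmp}"

TEST_FLAG="test_*_distconv.py"
# Unquoted expansion is what pytest would receive as its file arguments.
selected=$(echo ${TEST_FLAG})
echo "${selected}"
```

Jobs that do not set `TEST_FLAG` expand it to an empty string, so pytest falls back to collecting every test.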

.gitlab/common/common.yml

Lines changed: 10 additions & 4 deletions

@@ -29,11 +29,11 @@
   variables:
     # This is based on the assumption that each runner will only ever
     # be able to run one pipeline on a given cluster at one time.
-    SPACK_ENV_BASE_NAME: gitlab-${CI_COMMIT_BRANCH}-${GITLAB_USER_LOGIN}-${SYSTEM_NAME}${SPACK_ENV_BASE_NAME_EXTENSION}-${CI_RUNNER_SHORT_TOKEN}
+    SPACK_ENV_BASE_NAME: gitlab${SPACK_ENV_BASE_NAME_MODIFIER}-${CI_COMMIT_BRANCH}-${GITLAB_USER_LOGIN}-${SYSTEM_NAME}${SPACK_ENV_BASE_NAME_EXTENSION}-${CI_RUNNER_SHORT_TOKEN}
 
     # This variable is the name used to identify the job in the Slurm
     # queue. We need this to be able to access the correct jobid.
-    JOB_NAME: ${CI_PROJECT_NAME}_${CI_PIPELINE_ID}
+    JOB_NAME: ${CI_PROJECT_NAME}_${CI_PIPELINE_ID}${JOB_NAME_SUFFIX}
 
     # This is needed to ensure that we run as lbannusr.
     LLNL_SERVICE_USER: lbannusr
@@ -105,7 +105,7 @@
     - ml use ${LBANN_MODFILES_DIR}
     - ml load lbann
     - echo "Using LBANN binary $(which lbann)"
-    - echo "export SPACK_DEP_ENV_NAME=${SPACK_ENV_NAME}" > spack-ci-env-name.sh
+    - echo "export SPACK_ENV_NAME=${SPACK_ENV_NAME}" > spack-ci-env-name.sh
     - echo "export SPACK_ARCH=${SPACK_ARCH}" >> spack-ci-env-name.sh
     - echo "export SPACK_ARCH_TARGET=${SPACK_ARCH_TARGET}" >> spack-ci-env-name.sh
     - echo "export LBANN_BUILD_PARENT_DIR=${LBANN_BUILD_PARENT_DIR}" >> spack-ci-env-name.sh
@@ -137,7 +137,13 @@
       - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/*.cmake
       - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/CMakeCache.txt
       - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/build.ninja
-      - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/unit_test/*
       - ${RESULTS_DIR}/*
     exclude:
       - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/**/*.o
+      - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/unit_test/*
+
+.lbann-test-rules:
+  rules:
+    - if: $JOB_NAME_SUFFIX == "_distconv"
+      when: never
+    - if: $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME == $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
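The new `.lbann-test-rules` anchor gates a stage off on DistConv pipelines; its second clause compares a variable to itself, so it is always true and guarantees the stage still schedules on every other pipeline. A job opts in with a `!reference`, as the Corona integration tests do. A hypothetical consumer would look like this (the job name and script are illustrative):

```yaml
# Hypothetical job reusing the shared rules anchor.
example integration tests:
  stage: test
  rules:
    - !reference [.lbann-test-rules, rules]  # skipped when JOB_NAME_SUFFIX is "_distconv"
  script:
    - python3 -m pytest -s -vv --junitxml=results.xml ${TEST_FLAG}
```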

.gitlab/common/run-catch-tests-flux.sh

Lines changed: 3 additions & 8 deletions

@@ -52,14 +52,9 @@ export LD_LIBRARY_PATH=${ROCM_PATH}/lib:${LD_LIBRARY_PATH}
 
 cd ${LBANN_BUILD_DIR}
 
-flux run --label-io -n4 -N2 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task sh -c 'taskset -cp $$; printenv | grep VISIBLE' | sort
-
-flux run --label-io -n4 -N2 -g 1 -o cpu-affinity=off -o gpu-affinity=per-task sh -c 'taskset -cp $$; printenv | grep VISIBLE' | sort
-
 echo "Running sequential catch tests"
 
-flux run -N 1 -n 1 -g 1 -t 5m \
+flux run -N 1 -n 1 --exclusive -o nosetpgrp ${EXTRA_FLUX_ARGS} -t 5m \
     ./unit_test/seq-catch-tests \
     -r JUnit \
     -o ${OUTPUT_DIR}/seq-catch-results.xml
@@ -71,7 +66,7 @@ echo "Running MPI catch tests with ${LBANN_NNODES} nodes and ${TEST_TASKS_PER_NODE} tasks per node"
 
 flux run \
     -N ${LBANN_NNODES} -n $((${TEST_TASKS_PER_NODE} * ${LBANN_NNODES})) \
-    -g 1 -t 5m -o gpu-affinity=per-task -o cpu-affinity=per-task -o mpibind=off \
+    -t 5m --exclusive -o nosetpgrp ${EXTRA_FLUX_ARGS} \
     ./unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \
     -r JUnit \
     -o "${OUTPUT_DIR}/mpi-catch-results-rank=%r-size=%s.xml"
@@ -83,7 +78,7 @@ echo "Running MPI filesystem catch tests"
 
 flux run \
     -N ${LBANN_NNODES} -n $((${TEST_TASKS_PER_NODE} * ${LBANN_NNODES})) \
-    -g 1 -t 5m -o gpu-affinity=per-task -o cpu-affinity=per-task -o mpibind=off \
+    -t 5m --exclusive -o nosetpgrp ${EXTRA_FLUX_ARGS} \
     ./unit_test/mpi-catch-tests -s "[filesystem]" \
     -r JUnit \
     -o "${OUTPUT_DIR}/mpi-catch-filesystem-results-rank=%r-size=%s.xml"
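The `-n` task count in these `flux run` invocations is plain shell arithmetic over the node count; for example, with the non-weekly defaults used elsewhere in this commit:

```shell
# Reproduce the task-count arithmetic from the flux run commands.
LBANN_NNODES=2          # non-weekly default node count
TEST_TASKS_PER_NODE=4   # example value; set per system in CI
ntasks=$((${TEST_TASKS_PER_NODE} * ${LBANN_NNODES}))
echo "${ntasks}"   # 8
```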
(new file; name not shown)

Lines changed: 78 additions & 0 deletions

+################################################################################
+## Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file. <[email protected]>
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+#!/bin/bash
+cd ${LBANN_BUILD_DIR}
+
+# Configure the output directory
+OUTPUT_DIR=${CI_PROJECT_DIR}/${RESULTS_DIR}
+if [[ -d ${OUTPUT_DIR} ]];
+then
+    rm -rf ${OUTPUT_DIR}
+fi
+mkdir -p ${OUTPUT_DIR}
+
+FAILED_JOBS=""
+
+lrun -N 1 -n 1 -W 5 \
+    ./unit_test/seq-catch-tests \
+    -r JUnit \
+    -o ${OUTPUT_DIR}/seq-catch-results.xml
+if [[ $? -ne 0 ]]; then
+    FAILED_JOBS+=" seq"
+fi
+
+lrun -N ${LBANN_NNODES} -n $(($TEST_TASKS_PER_NODE * ${LBANN_NNODES})) \
+    -T $TEST_TASKS_PER_NODE \
+    -W 5 ${TEST_MPIBIND_FLAG} \
+    ./unit_test/mpi-catch-tests "exclude:[externallayer]" "exclude:[filesystem]" \
+    -r JUnit \
+    -o "${OUTPUT_DIR}/mpi-catch-results-rank=%r-size=%s.xml"
+if [[ $? -ne 0 ]]; then
+    FAILED_JOBS+=" mpi"
+fi
+
+lrun -N ${LBANN_NNODES} -n $(($TEST_TASKS_PER_NODE * ${LBANN_NNODES})) \
+    -T $TEST_TASKS_PER_NODE \
+    -W 5 ${TEST_MPIBIND_FLAG} \
+    ./unit_test/mpi-catch-tests "[filesystem]" \
+    -r JUnit \
+    -o "${OUTPUT_DIR}/mpi-catch-filesystem-results-rank=%r-size=%s.xml"
+if [[ $? -ne 0 ]];
+then
+    FAILED_JOBS+=" mpi-filesystem"
+fi
+
+# Try to write a semi-useful message to this file since it's being
+# saved as an artifact. It's not completely outside the realm that
+# someone would look at it.
+if [[ -n "${FAILED_JOBS}" ]];
+then
+    echo "Some Catch2 tests failed:${FAILED_JOBS}" > ${OUTPUT_DIR}/catch-tests-failed.txt
+fi
+
+# Return "success" so that the pytest-based testing can run.
+exit 0
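The script above deliberately accumulates failures instead of exiting at the first non-zero status, so every Catch2 group runs and the trailing pytest stages still get scheduled. The same pattern in isolation (group names and test commands are placeholders):

```shell
# Accumulate the names of failing test groups rather than exiting early.
FAILED_JOBS=""

run_group() {
  local name="$1"; shift
  "$@"                      # run the group's test command
  if [[ $? -ne 0 ]]; then   # on failure, record the name and keep going
    FAILED_JOBS+=" ${name}"
  fi
}

run_group seq true              # placeholder command that succeeds
run_group mpi false             # placeholder command that fails
run_group mpi-filesystem true   # placeholder command that succeeds

if [[ -n "${FAILED_JOBS}" ]]; then
  echo "Some Catch2 tests failed:${FAILED_JOBS}"
fi
# The real script then exits 0 so the pytest-based testing can still run.
```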

.gitlab/corona/pipeline.yml

Lines changed: 9 additions & 4 deletions

@@ -55,7 +55,7 @@ allocate lc resources:
     - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "120m")
     - export LBANN_NNODES=$([[ -n "${WITH_WEEKLY}" ]] && echo "4" || echo "2")
     - export FLUX_F58_FORCE_ASCII=t
-    - jobid=$(flux --parent alloc -N ${LBANN_NNODES} -g 1 -t ${TEST_TIME} --job-name=${JOB_NAME} --bg)
+    - jobid=$(flux --parent alloc -N ${LBANN_NNODES} --exclusive -t ${TEST_TIME} --job-name=${JOB_NAME} --bg)
     - export JOB_ID=$jobid
   timeout: 6h
 
@@ -79,6 +79,7 @@ build and install:
     - export TEST_MPIBIND_FLAG="--mpibind=off"
     - export SPACK_ARCH=$(flux proxy ${JOB_ID} flux mini run -N 1 spack arch)
     - export SPACK_ARCH_TARGET=$(flux proxy ${JOB_ID} flux mini run -N 1 spack arch -t)
+    - export EXTRA_FLUX_ARGS="-o pmi=pmix"
     - !reference [.setup_lbann, script]
     - flux proxy ${JOB_ID} .gitlab/common/run-catch-tests-flux.sh
 
@@ -97,7 +98,8 @@ unit tests:
     - export OMP_NUM_THREADS=10
     - "export FLUX_JOB_ID=$(flux jobs -no {id}:{name} | grep ${JOB_NAME} | awk -F: '{print $1}')"
     - cd ci_test/unit_tests
-    - flux proxy ${FLUX_JOB_ID} lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml
+    # - echo "Running unit tests with file pattern: ${TEST_FLAG}"
+    - flux proxy ${FLUX_JOB_ID} python3 -m pytest -s -vv --durations=0 --junitxml=results.xml ${TEST_FLAG}
   artifacts:
     when: always
     paths:
@@ -114,15 +116,18 @@ integration tests:
   stage: test
   dependencies:
     - build and install
+  rules:
+    - !reference [.lbann-test-rules, rules]
   script:
     - echo "== RUNNING PYTHON-BASED INTEGRATION TESTS =="
     - echo "Testing $(which lbann)"
     - export OMP_NUM_THREADS=10
     - "export FLUX_JOB_ID=$(flux jobs -no {id}:{name} | grep ${JOB_NAME} | awk -F: '{print $1}')"
     - cd ci_test/integration_tests
     - export WEEKLY_FLAG=${WITH_WEEKLY:+--weekly}
-    - echo "python3 -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml"
-    - flux proxy ${FLUX_JOB_ID} lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml
+    # - echo "Running integration tests with file pattern: ${TEST_FLAG}"
+    # - echo "python3 -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG}"
+    - flux proxy ${FLUX_JOB_ID} python3 -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG}
   artifacts:
     when: always
     paths:
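Both test stages recover the background allocation's ID by listing Flux jobs as `id:name` pairs and grepping for `JOB_NAME` — which is why the new `JOB_NAME_SUFFIX` matters: without it, a plain and a `_distconv` pipeline on the same cluster would match the same name. The parsing step can be sketched with a fabricated job listing:

```shell
# Fabricated stand-in for `flux jobs -no {id}:{name}` output.
job_list='f1234:lbann_9902
f5678:lbann_9902_distconv'

JOB_NAME="lbann_9902_distconv"
# Same grep/awk pipeline the CI scripts use to extract the job ID.
FLUX_JOB_ID=$(printf '%s\n' "${job_list}" | grep "${JOB_NAME}" | awk -F: '{print $1}')
echo "${FLUX_JOB_ID}"   # f5678
```

Note the converse case: grepping for the unsuffixed name `lbann_9902` would match both lines and yield two IDs, which is exactly the ambiguity the suffix avoids.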
