
[Build] Segfault when running unit tests (ctest)  #15224

Open
@sl1pkn07

Description

Describe the issue

I built onnxruntime 1.14.1 with the unit tests enabled; when I run them, I get a segfault.

Using the following system libraries:

flatbuffers 23.3.3
cxxopts 3.1.1
abseil-cpp 20230125.1
nlohmann-json 3.11.2
chrono-date 3.0.1
protobuf 21.12
python 3.10.10
cmake 3.26.1
gcc 12.2.1
cuda 12.1.0
onednn 3.0.1
nccl 2.17.1
re2 20230301
pybind11 2.10.4
eigen 3.4.0
boost 1.81.0
gtest 1.13.0
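
For reference, a minimal sketch of how the installed versions can be double-checked on an Arch-based system (the build script below reads like a PKGBUILD; the pacman query and the grep pattern are assumptions, adjust to the actual package names):

# list the installed system packages this build links against (assumed Arch package names)
pacman -Q | grep -Ei 'flatbuffers|cxxopts|abseil|nlohmann|protobuf|onednn|nccl|re2|pybind11|eigen|boost|gtest'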

Urgency

No response

Target platform

Linux

Build script

_ENABLE_CUDA=1
_ENABLE_TENSORRT=0

# Check PKGBUILDs of python-pytorch and tensorflow for CUDA architectures built by official packages
_CUDA_ARCHITECTURES="52-real;53-real;60-real;61-real;62-real;70-real;72-real;75-real;80-real;86-real;87-real;89-real;90-real;90-virtual"

  # DNNL +3.x.x
  patch -d onnxruntime -Np1 -i 14267.diff  # https://github.com/microsoft/onnxruntime/pull/14267

  # find system nlohmann-json
  sed 's|3.10 ||g' \
    -i onnxruntime/cmake/external/onnxruntime_external_deps.cmake

  # find system chrono-date
  sed -e 's|${DEP_SHA1_date}|&\n \ \ \ \ \ \FIND_PACKAGE_ARGS NAMES date|g' \
      -e 's|date_interface|date::date-tz|g' \
      -i onnxruntime/cmake/external/onnxruntime_external_deps.cmake \
      -i onnxruntime/cmake/onnxruntime_common.cmake \
      -i onnxruntime/cmake/onnxruntime_unittests.cmake

  # find system abseil-cpp
  sed 's|ABSL_PATCH_COMMAND}|&\n\ \ \ \ \FIND_PACKAGE_ARGS NAMES absl|g' \
    -i onnxruntime/cmake/external/abseil-cpp.cmake

  # find system cxxopts
  sed 's|${DEP_SHA1_cxxopts}|&\n\ \ \ \ \FIND_PACKAGE_ARGS NAMES cxxopts|g' \
    -i onnxruntime/cmake/external/onnxruntime_external_deps.cmake

  if [[ $_ENABLE_TENSORRT = 1 ]]; then
    # Tensorrt 8.6 EA
    patch -Np1 -i 15089.diff # https://github.com/microsoft/onnxruntime/pull/15089#issuecomment-1477934966

    # Update Tensorboard 00d59e65d866a6d4b9fe855dce81ee6ba8b40c4f
    sed -e 's|373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81|00d59e65d866a6d4b9fe855dce81ee6ba8b40c4f|g' \
        -e 's|67b833913605a4f3f499894ab11528a702c2b381|ff427b6a135344d86b65fa2928fbd29886eefaec|g' \
        -i onnxruntime/cmake/deps.txt

    # Update onnx_tensorrt 6872a9473391a73b96741711d52b98c2c3e25146
    sed -e 's|369d6676423c2a6dbf4a5665c4b5010240d99d3c|6872a9473391a73b96741711d52b98c2c3e25146|g' \
        -e 's|62119892edfb78689061790140c439b111491275|75462057c95f7fdbc256179f0a0e9e4b7be28ae3|g' \
        -i onnxruntime/cmake/deps.txt
  fi

  patch -d onnxruntime -Np1 -i install-orttraining-files.diff
  patch -d onnxruntime -Np1 -i system-dnnl.diff
  patch -d onnxruntime -Np1 -i system-flatbuffers.patch

  # fix build with GCC 12 (?); idea taken from https://github.com/microsoft/onnxruntime/pull/11667 and https://github.com/microsoft/onnxruntime/pull/10014
  sed 's|dims)|TensorShape(dims))|g' \
    -i onnxruntime/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq.cc

  # fix missing #include <iostream>
  sed '11a#include <iostream>' \
    -i onnxruntime/orttraining/orttraining/test/training_api/trainer/trainer.cc

  (
  cd onnxruntime/onnxruntime/core/flatbuffers/schema
  python compile_schema.py --flatc /usr/bin/flatc
  )

  # More than 18 parallel jobs fills up my 128 GB RAM rig :/
  _nproc="$(nproc)"
  if [[ ${_nproc} -gt 18 ]]; then
    _nproc=18
  fi

  if [[ ${_ENABLE_CUDA} = 1 ]]; then
    export CC="/opt/cuda/bin/gcc"
    export CXX="/opt/cuda/bin/g++"
    export CUDAHOSTCXX="${CXX}"
  fi

  # Gcc 12+
  export CXXFLAGS="${CXXFLAGS} -Wno-maybe-uninitialized"

  # Use -Donnxruntime_ENABLE_LAZY_TENSOR=OFF as it requires patched python-pytorch
  # See: https://github.com/microsoft/onnxruntime/pull/10460 https://github.com/pytorch/pytorch/pulls/wschin
  local _cmake_args=(
    -DCMAKE_BUILD_TYPE=Debug
    -DCMAKE_INSTALL_PREFIX=/usr
    -DCMAKE_SKIP_INSTALL_RPATH=OFF
    -DCMAKE_SKIP_RPATH=OFF
    -Donnxruntime_ENABLE_PYTHON=ON
    -Donnxruntime_BUILD_SHARED_LIB=ON
    -Donnxruntime_BUILD_UNIT_TESTS=ON
    -Donnxruntime_ENABLE_TRAINING=ON
    -Donnxruntime_ENABLE_LAZY_TENSOR=OFF
    -Donnxruntime_USE_MPI=ON
    -Donnxruntime_USE_DNNL=ON
    -Donnxruntime_USE_PREINSTALLED_EIGEN=ON
    -Deigen_SOURCE_PATH=$(pkg-config --cflags eigen3 | sed 's|-I||g')
  )

  # Use protobuf-lite instead of full protobuf to workaround symbol conflicts
  # with onnx; see https://github.com/onnx/onnx/issues/1277 for details.
  _cmake_args+=(
#     -DONNX_CUSTOM_PROTOC_EXECUTABLE=/usr/bin/protoc
    -Donnxruntime_USE_FULL_PROTOBUF=OFF
  )

  if [[ ${_ENABLE_CUDA} = 1 ]]; then
    _cmake_args+=(
      -DCMAKE_CUDA_ARCHITECTURES="${_CUDA_ARCHITECTURES}"
      -DCMAKE_CUDA_STANDARD_REQUIRED=ON
      -DCMAKE_CXX_STANDARD_REQUIRED=ON
#       -DCMAKE_CUDA_COMPILER=/opt/cuda/bin/nvcc
      -Donnxruntime_USE_CUDA=ON
      -Donnxruntime_CUDA_HOME=/opt/cuda
      -Donnxruntime_CUDNN_HOME=/usr
      -Donnxruntime_USE_NCCL=ON
      -Donnxruntime_NVCC_THREADS="${_nproc}"
    )
  fi

  if [[ ${_ENABLE_TENSORRT} = 1 ]]; then
    _cmake_args+=(
      -Donnxruntime_USE_TENSORRT=ON
      -Donnxruntime_USE_TENSORRT_BUILTIN_PARSER=ON
    )
  fi

  cmake -S onnxruntime/cmake -B build \
  "${_cmake_args[@]}" \
  "$@"

  cmake --build build -j${_nproc} -v

  (cd build;
  python ../onnxruntime/setup.py bdist_wheel -d ../dist
  )

Attached patches:

install-orttraining-files.diff.txt
system-flatbuffers.patch.txt
system-dnnl.diff.txt
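
The script above only configures and builds; a minimal sketch of how the tests are then launched from the build tree (assumed invocation, matching the ctest mention in the title; the test target layout may differ):

# run the whole suite through ctest (requires CMake >= 3.20 for --test-dir)
ctest --test-dir build --output-on-failure

# or run the aggregated gtest binary directly from the build directory
cd build && ./onnxruntime_test_all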

Error / output

segfault.log

./onnxruntime_test_all
gdb ./onnxruntime_test_all
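
In case it helps triage, a minimal sketch of capturing a full backtrace non-interactively (assumes the binary is run from the build directory; onnxruntime_test_all accepts the usual gtest flags, so a --gtest_filter such as the hypothetical one below can narrow down the crashing test):

# run under gdb and dump a full backtrace when the segfault hits
gdb -batch -ex run -ex "bt full" --args ./onnxruntime_test_all

# once the failing test is known, rerun just that suite (filter name is hypothetical)
gdb -batch -ex run -ex "bt full" --args ./onnxruntime_test_all --gtest_filter='SomeSuite.*'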

Visual Studio Version

No response

GCC / Compiler Version

12.2.1


Labels

    build (build issues; typically submitted using template)
    ep:CUDA (issues related to the CUDA execution provider)
    ep:TensorRT (issues related to TensorRT execution provider)
    ep:oneDNN (questions/issues related to DNNL EP)
    quantization (issues related to quantization)
