Description
batch 为 4 时可以正常工作,batch 为 8 时程序崩溃(core dump)。
报错如下:
W20240723 13:38:27.120533 3523 cudnn_conv_util.cpp:105] Currently available alogrithm (algo=0, require memory=0, idx=1) meeting requirments (max_workspace_size=2147483648, determinism=0) is not fastest. Fastest algorithm (1) requires memory 4304404496
W20240723 13:38:27.142623 3523 cudnn_conv_util.cpp:105] Currently available alogrithm (algo=0, require memory=0, idx=1) meeting requirments (max_workspace_size=2147483648, determinism=0) is not fastest. Fastest algorithm (1) requires memory 4299685904
W20240723 13:38:27.148941 3523 cudnn_conv_util.cpp:105] Currently available alogrithm (algo=0, require memory=0, idx=1) meeting requirments (max_workspace_size=2147483648, determinism=0) is not fastest. Fastest algorithm (1) requires memory 2149842960
F20240723 13:38:28.295286 3523 cudnn_util.cpp:74] Check failed: cudnnSetTensor4dDescriptor(val_, cudnn_data_format, GetCudnnDataType(data_type), data_num, channels, kernel_h, kernel_w) : CUDNN_STATUS_NOT_SUPPORTED (9)
*** Check failure stack trace: ***
@ 0x7f3ae947996a google::LogMessage::Fail()
@ 0x7f3ae947c8a1 google::LogMessage::SendToLog()
@ 0x7f3ae9479499 google::LogMessage::Flush()
@ 0x7f3ae947d189 google::LogMessageFatal::~LogMessageFatal()
@ 0x7f3ada9f446a oneflow::CudnnTensorDesc::CudnnTensorDesc()
@ 0x7f3ada9e7059 oneflow::CudnnConvArgs::CudnnConvArgs()
@ 0x7f3ae1d3f080 oneflow::CudnnConv2dEngine::Init()
@ 0x7f3ae1d3ae3c oneflow::Conv2dEngineMgr::GetConv2dEngine()
@ 0x7f3ae088e18b _ZZNK7oneflow12_GLOBAL__N_122Conv2dTuningWarmupPass5ApplyEPNS_3JobEPNS_10JobPassCtxEENKUlPKNS_6OpNodeEE1_clES8_
@ 0x7f3ae088fcae oneflow::(anonymous namespace)::Conv2dTuningWarmupPass::Apply()
@ 0x7f3ae06bde54 _ZZN7oneflow23LazyJobBuildAndInferCtx8CompleteEvENKUlRKSsiE2_clES2_i
@ 0x7f3ae06c357d oneflow::LazyJobBuildAndInferCtx::Complete()
@ 0x7f3d221ec646 oneflow::CurJobBuildAndInferCtx_Complete()
@ 0x7f3d221ed49b (unknown)
@ 0x7f3d21f418f0 (unknown)
@ 0x5f6489 PyCFunction_Call
@ 0x5f7056 _PyObject_MakeTpCall
@ 0x57107e _PyEval_EvalFrameDefault
@ 0x569cea _PyEval_EvalCodeWithName
@ 0x5f6a13 _PyFunction_Vectorcall
@ 0x50b23c (unknown)
@ 0x5f5c02 PyObject_Call
@ 0x56d1fc _PyEval_EvalFrameDefault
@ 0x569cea _PyEval_EvalCodeWithName
@ 0x5f6a13 _PyFunction_Vectorcall
@ 0x50b23c (unknown)
@ 0x5f5c02 PyObject_Call
@ 0x56d1fc _PyEval_EvalFrameDefault
@ 0x569cea _PyEval_EvalCodeWithName
@ 0x5f6a13 _PyFunction_Vectorcall
@ 0x50b23c (unknown)
@ 0x5f5c02 PyObject_Call
Stack trace (most recent call last):
Object "/usr/local/lib/python3.8/dist-packages/oneflow/_oneflow_internal.cpython-38-x86_64-linux-gnu.so", at 0x7f3d21f418ef, in
Object "/usr/local/lib/python3.8/dist-packages/oneflow/_oneflow_internal.cpython-38-x86_64-linux-gnu.so", at 0x7f3d221ed49a, in
Object "/usr/local/lib/python3.8/dist-packages/oneflow/_oneflow_internal.cpython-38-x86_64-linux-gnu.so", at 0x7f3d221ec645, in CurJobBuildAndInferCtx_Complete()
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae06c357c, in LazyJobBuildAndInferCtx::Complete()
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae06bde53, in
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae088fcad, in
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae088e18a, in
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae1d3ae3b, in Conv2dEngineMgr::GetConv2dEngine(ep::CudaStream*, Conv2dConfig const&, Conv2dArguement const&, std::string const&)
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae1d3f07f, in CudnnConv2dEngine::Init(ep::CudaStream*, Conv2dConfig const&, Conv2dArguement const&)
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ada9e7058, in CudnnConvArgs::CudnnConvArgs(DataType, ShapeView const&, DataType, ShapeView const&, DataType, ShapeView const&, std::string const&, std::vector const&, std::vector const&, std::vector const&, int, unsigned long, bool, bool, bool)
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ada9f4469, in CudnnTensorDesc::CudnnTensorDesc(DataType, ShapeView const&, std::string const&)
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae947d188, in
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae9479498, in
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae947c8a0, in
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ae9479969, in
Object "/usr/local/lib/python3.8/dist-packages/oneflow/../oneflow.libs/liboneflow-d4147d4b.so", at 0x7f3ad969244e, in
Aborted (Signal sent by tkill() 3523 0)
Aborted (core dumped)