Skip to content

Signal handlers should only use async-signal-safe functions #9676

@julianbrost

Description

@julianbrost

I just observed the following deadlock while testing #9653. In the following, you can see a nice stack trace were a signal handler is executed on a thread that is currently inside operator new (presumably holding a lock at that point) and the signal handler also does operator new and is waiting for a lock:

Thread 10 (Thread 0x7f0cfa752700 (LWP 94) "icinga2"):
#0  __lll_lock_wait_private (futex=futex@entry=0x7f0c7c000020) at ./lowlevellock.c:35
#1  0x00007f0cfcb9332b in __GI___libc_malloc (bytes=50) at malloc.c:3064
#2  0x00007f0cfcee70b5 in operator new(unsigned long) () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#3  0x00007f0cfcf747b2 in std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_assign(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#4  0x00005557b175b4aa in icinga::Log::~Log() [clone .part.0] ()
#5  0x00005557b168bfcc in icinga::Application::SigUsr1Handler(int) ()
#6  <signal handler called>
#7  _int_malloc (av=av@entry=0x7f0c7c000020, bytes=bytes@entry=128) at malloc.c:4024
#8  0x00007f0cfcb93299 in __GI___libc_malloc (bytes=128) at malloc.c:3066
#9  0x00007f0cfcee70b5 in operator new(unsigned long) () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#10 0x00005557b1bc1852 in std::vector<std::vector<icinga::String, std::allocator<icinga::String> >, std::allocator<std::vector<icinga::String, std::allocator<icinga::String> > > >::vector(std::initializer_list<std::vector<icinga::String, std::allocator<icinga::String> > >, std::allocator<std::vector<icinga::String, std::allocator<icinga::String> > > const&) [clone .constprop.0] ()
#11 0x00005557b1b5eca8 in icinga::IcingaDB::UpdateState(boost::intrusive_ptr<icinga::Checkable> const&, icinga::IcingaDB::StateUpdate) ()
#12 0x00005557b1ba69a0 in boost::detail::function::void_function_obj_invoker3<icinga::IcingaDB::ConfigStaticInitialize()::{lambda(boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::MessageOrigin> const&)#8}, void, boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::MessageOrigin> const&>::invoke(boost::detail::function::function_buffer&, boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::MessageOrigin> const&) [clone .lto_priv.0] ()
#13 0x00005557b19ad4d5 in boost::signals2::detail::signal_impl<void (boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::MessageOrigin> const&), boost::signals2::optional_last_value<void>, int, std::less<int>, boost::function<void (boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::MessageOrigin> const&)>, boost::function<void (boost::signals2::connection const&, boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::MessageOrigin> const&)>, boost::signals2::mutex>::operator()(boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::MessageOrigin> const&) ()
#14 0x00005557b18ef58a in icinga::Checkable::ProcessCheckResult(boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::MessageOrigin> const&) ()
#15 0x00005557b1a07cbf in icinga::DummyCheckTask::ScriptFunc(boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::Dictionary> const&, bool) ()
#16 0x00005557b1a10d58 in std::_Function_handler<icinga::Value (std::vector<icinga::Value, std::allocator<icinga::Value> > const&), icinga::WrapFunction<void (*)(boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::Dictionary> const&, bool)>(void (*)(boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::Dictionary> const&, bool))::{lambda(std::vector<icinga::Value, std::allocator<icinga::Value> > const&)#1}>::_M_invoke(std::_Any_data const&, std::vector<icinga::Value, std::allocator<icinga::Value> > const&) ()
#17 0x00005557b16c4b6b in icinga::Function::Invoke(std::vector<icinga::Value, std::allocator<icinga::Value> > const&) ()
#18 0x00005557b18f2ff3 in icinga::CheckCommand::Execute(boost::intrusive_ptr<icinga::Checkable> const&, boost::intrusive_ptr<icinga::CheckResult> const&, boost::intrusive_ptr<icinga::Dictionary> const&, bool) ()
#19 0x00005557b18f3b1f in icinga::Checkable::ExecuteCheck() ()
#20 0x00005557b1a187bf in icinga::CheckerComponent::ExecuteCheckHelper(boost::intrusive_ptr<icinga::Checkable> const&) ()
#21 0x00005557b171deca in icinga::ThreadPool::Post<std::function<void ()> >(std::function<void ()>, icinga::SchedulerPolicy)::{lambda()#1}::operator()() const ()
#22 0x00005557b17475d7 in boost::asio::detail::executor_op<icinga::ThreadPool::Post<std::function<void ()> >(std::function<void ()>, icinga::SchedulerPolicy)::{lambda()#1}, std::allocator<void>, boost::asio::detail::scheduler_operation>::do_complete(void*, boost::asio::detail::scheduler_operation*, boost::system::error_code const&, unsigned long) ()
#23 0x00005557b1c2b1ef in boost::asio::detail::scheduler::run(boost::system::error_code&) [clone .constprop.0] ()
#24 0x00005557b1756a78 in boost::asio::detail::posix_thread::func<boost::asio::thread_pool::thread_function>::run() ()
#25 0x00005557b16815d0 in boost_asio_detail_posix_thread_function ()
#26 0x00007f0cfd015ea7 in start_thread (arg=<optimized out>) at pthread_create.c:477
#27 0x00007f0cfcc08a2f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Here, the offending code is this:

void Application::SigUsr1Handler(int)
{
Log(LogInformation, "Application")
<< "Received USR1 signal, reopening application logs.";

But other places are also affected:

void Application::RequestShutdown()
{
Log(LogInformation, "Application", "Received request to shut down.");

which is reachable from
static void WorkerSignalHandler(int num, siginfo_t *info, void*)
{
switch (num) {
case SIGUSR1:
// Catches SIGUSR1 as long as the actual handler (logrotate)
// has not been installed not to let SIGUSR1 terminate the process
break;
case SIGUSR2:
if (info->si_pid == 0 || info->si_pid == l_UmbrellaPid) {
// The umbrella process allowed us to continue working beyond config validation
l_AllowedToWork.store(true);
}
break;
case SIGINT:
case SIGTERM:
if (info->si_pid == 0 || info->si_pid == l_UmbrellaPid) {
// The umbrella process requested our termination
Application::RequestShutdown();
}
break;

Similarly (here you may argue that we're crashing already, but this could potentially change a crash with an immediate restart by systemd for example to a hanging process):

void Application::SigAbrtHandler(int)
{
#ifndef _WIN32
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_DFL;
sigaction(SIGABRT, &sa, nullptr);
#endif /* _WIN32 */
std::cerr << "Caught SIGABRT." << std::endl
<< "Current time: " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", Utility::GetTime()) << std::endl
<< std::endl;
String fname = GetCrashReportFilename();
String dirName = Utility::DirName(fname);
if (!Utility::PathExists(dirName)) {
#ifndef _WIN32
if (mkdir(dirName.CStr(), 0700) < 0 && errno != EEXIST) {
#else /*_ WIN32 */
if (mkdir(dirName.CStr()) < 0 && errno != EEXIST) {
#endif /* _WIN32 */
std::cerr << "Could not create directory '" << dirName << "': Error " << errno << ", " << strerror(errno) << "\n";
}
}
bool interactive_debugger = Configuration::AttachDebugger;
if (!interactive_debugger) {
std::ofstream ofs;
ofs.open(fname.CStr());
Log(LogCritical, "Application")
<< "Icinga 2 has terminated unexpectedly. Additional information can be found in '" << fname << "'" << "\n";
ofs << "Caught SIGABRT.\n"
<< "Current time: " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", Utility::GetTime()) << "\n\n";
DisplayInfoMessage(ofs);
ofs << "\nStacktrace:\n" << StackTraceFormatter(boost::stacktrace::stacktrace()) << "\n";
DisplayBugMessage(ofs);
ofs << "\n";
ofs.close();
} else {
Log(LogCritical, "Application", "Icinga 2 has terminated unexpectedly. Attaching debugger...");
}
AttachDebugger(fname, interactive_debugger);
}

In general, the signal handlers should be kept minimal and may only use async-signal-safe functions.

Metadata

Metadata

Labels

bugSomething isn't workingref/NC

Type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions