Skip to content

Commit 52964dc

Browse files
kavehahmadi60 authored and meta-codesync[bot] committed
Emit daemon EdenFS mount health events
Summary: Run the mount-health checker for configured checkouts that the daemon still has mounted during accidental unmount recovery. Log eden_mount_health_issue events from daemon_periodic for unhealthy running mounts, including a timeout event when the worker check does not complete promptly.
Reviewed By: muirdm
Differential Revision: D108201465
fbshipit-source-id: a29ac9db0e1cb60b56dead34d5f90663edb92782
1 parent 74c4b9d commit 52964dc

3 files changed

Lines changed: 181 additions & 2 deletions

File tree

eden/fs/service/BUCK

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ cpp_library(
393393
"//eden/fs/utils:eden_task_queue",
394394
"//eden/fs/utils:fs_channel_types",
395395
"//eden/fs/utils:matcher",
396+
"//eden/fs/utils:mount_health_check",
396397
"//eden/fs/utils:mount_info_table",
397398
"//eden/fs/utils:nfs_socket",
398399
"//eden/fs/utils:proc_util",
@@ -401,6 +402,7 @@ cpp_library(
401402
"//folly:conv",
402403
"//folly:exception",
403404
"//folly:file_util",
405+
"//folly:indestructible",
404406
"//folly:random",
405407
"//folly:stop_watch",
406408
"//folly:string",

eden/fs/service/EdenServer.cpp

Lines changed: 169 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99

1010
#include <cpptoml.h>
1111
#include <algorithm>
12+
#include <atomic>
1213
#include <chrono>
1314

1415
#include <sys/stat.h>
1516
#include <fstream>
1617
#include <functional>
1718
#include <iterator>
1819
#include <memory>
20+
#include <optional>
1921
#include <sstream>
2022
#include <string>
2123
#include <string_view>
@@ -26,6 +28,7 @@
2628
#include <fmt/core.h>
2729
#include <folly/Exception.h>
2830
#include <folly/FileUtil.h>
31+
#include <folly/Indestructible.h>
2932
#include <folly/SocketAddress.h>
3033

3134
#include <folly/json/json.h>
@@ -37,6 +40,7 @@
3740
#include <folly/coro/Task.h>
3841
#include <folly/executors/CPUThreadPoolExecutor.h>
3942
#include <folly/executors/thread_factory/NamedThreadFactory.h>
43+
#include <folly/futures/Future.h>
4044
#include <folly/io/async/AsyncSignalHandler.h>
4145
#include <folly/io/async/HHWheelTimer.h>
4246
#include <folly/logging/xlog.h>
@@ -107,6 +111,7 @@
107111
#include "eden/fs/utils/EdenError.h"
108112
#include "eden/fs/utils/EdenTaskQueue.h"
109113
#include "eden/fs/utils/FsChannelTypes.h"
114+
#include "eden/fs/utils/MountHealthCheck.h"
110115
#include "eden/fs/utils/NfsSocket.h"
111116
#include "eden/fs/utils/NotImplemented.h"
112117
#include "eden/fs/utils/ProcUtil.h"
@@ -175,6 +180,100 @@ namespace {
175180

176181
using namespace facebook::eden;
177182

183+
constexpr auto kMountHealthCheckTimeout = std::chrono::seconds{5};
184+
185+
struct MountHealthIssueLogContext {
186+
MountHealthIssueLogContext(
187+
std::weak_ptr<EdenFsEventsLogger> edenFsEventsLogger,
188+
std::string mountPath,
189+
std::string repoSource)
190+
: edenFsEventsLogger_{std::move(edenFsEventsLogger)},
191+
mountPath{std::move(mountPath)},
192+
repoSource{std::move(repoSource)} {}
193+
194+
void log(std::string reason, std::string error) const {
195+
auto edenFsEventsLogger = edenFsEventsLogger_.lock();
196+
if (!edenFsEventsLogger) {
197+
return;
198+
}
199+
edenFsEventsLogger->logEvent(
200+
EdenMountHealthIssue{
201+
"daemon_periodic",
202+
std::move(reason),
203+
std::string{mountPath},
204+
std::string{mountPath},
205+
"configured_checkout",
206+
std::string{repoSource},
207+
std::move(error),
208+
false,
209+
false});
210+
}
211+
212+
std::weak_ptr<EdenFsEventsLogger> edenFsEventsLogger_;
213+
std::string mountPath;
214+
std::string repoSource;
215+
};
216+
217+
/**
 * Returns the two-thread executor on which mount health probes are run.
 *
 * The pool lives in a folly::Indestructible function-local static, so it is
 * created on first use and is never torn down. That is intentional: a probe
 * stuck in a filesystem syscall must not be able to make EdenServer teardown
 * wait for the health-check worker thread.
 */
folly::CPUThreadPoolExecutor* getMountHealthCheckThreadPool() {
  static folly::Indestructible<folly::CPUThreadPoolExecutor> pool(
      2, std::make_shared<folly::NamedThreadFactory>("MountHealthCheck"));
  return pool.get();
}
224+
225+
void logMountHealthCheckTimeout(
226+
const MountHealthIssueLogContext& logContext,
227+
std::string error) {
228+
try {
229+
logContext.log(
230+
std::string{edenMountHealthIssueReasonString(
231+
EdenMountHealthIssueReason::DaemonRunningMountTimedOut)},
232+
std::move(error));
233+
} catch (const std::exception& ex) {
234+
XLOGF(ERR, "EdenFS mount health timeout logging failed: {}", ex.what());
235+
} catch (...) {
236+
XLOG(ERR, "EdenFS mount health timeout logging failed");
237+
}
238+
}
239+
240+
void scheduleMountHealthCheckStartTimeout(
241+
std::shared_ptr<std::atomic_bool> started,
242+
std::shared_ptr<std::atomic_bool> completed,
243+
std::shared_ptr<const MountHealthIssueLogContext> logContext) {
244+
folly::futures::detachOnGlobalCPUExecutor(
245+
folly::futures::sleep(kMountHealthCheckTimeout)
246+
.deferValue([started = std::move(started),
247+
completed = std::move(completed),
248+
logContext = std::move(logContext)](auto&&) {
249+
if (started->load(std::memory_order_acquire)) {
250+
return;
251+
}
252+
if (completed->exchange(true)) {
253+
return;
254+
}
255+
logMountHealthCheckTimeout(
256+
*logContext,
257+
"EdenFS mount health check did not start within timeout; "
258+
"worker pool may be saturated");
259+
}));
260+
}
261+
262+
void scheduleMountHealthCheckRuntimeTimeout(
263+
std::shared_ptr<std::atomic_bool> completed,
264+
std::shared_ptr<const MountHealthIssueLogContext> logContext) {
265+
folly::futures::detachOnGlobalCPUExecutor(
266+
folly::futures::sleep(kMountHealthCheckTimeout)
267+
.deferValue([completed = std::move(completed),
268+
logContext = std::move(logContext)](auto&&) {
269+
if (completed->exchange(true)) {
270+
return;
271+
}
272+
logMountHealthCheckTimeout(
273+
*logContext, "EdenFS mount health check timed out");
274+
}));
275+
}
276+
178277
std::shared_ptr<Notifier> getPlatformNotifier(
179278
std::shared_ptr<ReloadableConfig> config,
180279
std::shared_ptr<EdenFsEventsLogger> edenFsEventsLogger,
@@ -655,6 +754,8 @@ EdenServer::EdenServer(
655754
edenConfig->prefetchOptimizations.getValue()},
656755
prefetchFilesV2Executor_{
657756
makePrefetchFilesV2Threads(thriftUsePrefetchExecutor_, edenConfig)},
757+
runningMountHealthChecks_{std::make_shared<
758+
folly::Synchronized<std::unordered_set<std::string>>>()},
658759
progressManager_{
659760
std::make_unique<folly::Synchronized<EdenServer::ProgressManager>>()},
660761
startupStatusChannel_{std::move(startupStatusChannel)},
@@ -3324,6 +3425,71 @@ bool EdenServer::isWorkingCopyGCRunningForAnyMount() const {
33243425
return false;
33253426
}
33263427

3428+
void EdenServer::scheduleRunningMountHealthCheck(
3429+
const AbsolutePath& mountPath,
3430+
std::string repoSource) {
3431+
auto mountPathString = std::string{mountPath.view()};
3432+
auto runningMountHealthChecks = runningMountHealthChecks_;
3433+
{
3434+
auto runningChecks = runningMountHealthChecks->wlock();
3435+
if (!runningChecks->insert(mountPathString).second) {
3436+
return;
3437+
}
3438+
}
3439+
// Keep the mount marked in-flight until the worker returns. The timeout logs
3440+
// a stuck probe but cannot interrupt the blocked syscall; clearing this on
3441+
// timeout would let later ticks enqueue duplicate stuck probes.
3442+
3443+
auto started = make_shared<std::atomic_bool>(false);
3444+
auto completed = make_shared<std::atomic_bool>(false);
3445+
auto logContext = make_shared<const MountHealthIssueLogContext>(
3446+
serverState_->getEdenFsEventsLogger(),
3447+
std::move(mountPathString),
3448+
std::move(repoSource));
3449+
3450+
scheduleMountHealthCheckStartTimeout(started, completed, logContext);
3451+
3452+
// Run the potentially blocking filesystem probe on a dedicated pool. The
3453+
// runtime timeout is armed after the worker starts so executor queue delay
3454+
// and a hung EdenFS mount can be reported separately.
3455+
folly::via(
3456+
getMountHealthCheckThreadPool(),
3457+
[started, completed, logContext]() {
3458+
started->store(true, std::memory_order_release);
3459+
if (completed->load(std::memory_order_acquire)) {
3460+
return std::optional<EdenMountHealthCheckIssue>{};
3461+
}
3462+
scheduleMountHealthCheckRuntimeTimeout(completed, logContext);
3463+
return checkRunningEdenMountHealth(logContext->mountPath);
3464+
})
3465+
.thenTry(
3466+
[completed, logContext, runningMountHealthChecks](
3467+
folly::Try<std::optional<EdenMountHealthCheckIssue>>&& result) {
3468+
const auto alreadyCompleted = completed->exchange(true);
3469+
{
3470+
auto runningChecks = runningMountHealthChecks->wlock();
3471+
runningChecks->erase(logContext->mountPath);
3472+
}
3473+
if (alreadyCompleted) {
3474+
return;
3475+
}
3476+
if (result.hasException()) {
3477+
XLOGF(
3478+
WARN,
3479+
"EdenFS mount health check failed: {}",
3480+
result.exception().what());
3481+
return;
3482+
}
3483+
const auto& issue = result.value();
3484+
if (!issue.has_value()) {
3485+
return;
3486+
}
3487+
logContext->log(
3488+
std::string{edenMountHealthIssueReasonString(issue->reason)},
3489+
issue->error);
3490+
});
3491+
}
3492+
33273493
void EdenServer::accidentalUnmountRecovery() {
33283494
XLOGF(DBG5, "Performing accidental unmount recovery.");
33293495
folly::dynamic dirs = folly::dynamic::object();
@@ -3349,7 +3515,9 @@ void EdenServer::accidentalUnmountRecovery() {
33493515
auto mountPath = canonicalPath(client.first.stringPiece());
33503516
const auto it = mountPoints->find(mountPath);
33513517

3352-
if (it == mountPoints->end()) {
3518+
if (it != mountPoints->end()) {
3519+
scheduleRunningMountHealthCheck(mountPath, client.second.asString());
3520+
} else {
33533521
// This mount point is not currently mounted, but it was configured
33543522
// in config.json. This means that the client was unmounted.
33553523
// We should attempt to remount it, if it is unmounted accidentally.

eden/fs/service/EdenServer.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <string>
1616
#include <string_view>
1717
#include <unordered_map>
18+
#include <unordered_set>
1819
#include <vector>
1920

2021
#include <folly/CancellationToken.h>
@@ -60,7 +61,7 @@ class ServerStream;
6061

6162
namespace folly {
6263
class EventBase;
63-
}
64+
} // namespace folly
6465

6566
namespace facebook::eden {
6667

@@ -714,6 +715,11 @@ class EdenServer : private TakeoverHandler {
714715
// attempts to recover it.
715716
void accidentalUnmountRecovery();
716717

718+
// Checks a running mount point without blocking the main EventBase.
719+
void scheduleRunningMountHealthCheck(
720+
const AbsolutePath& mountPath,
721+
std::string repoSource);
722+
717723
// Detects when NFS backed repos are being crawled.
718724
void detectNfsCrawl();
719725

@@ -919,6 +925,9 @@ class EdenServer : private TakeoverHandler {
919925
*/
920926
std::shared_ptr<folly::Executor> prefetchFilesV2Executor_;
921927

928+
std::shared_ptr<folly::Synchronized<std::unordered_set<std::string>>>
929+
runningMountHealthChecks_;
930+
922931
/**
923932
* Remounting progress state.
924933
*/

0 commit comments

Comments
 (0)