99
1010#include < cpptoml.h>
1111#include < algorithm>
12+ #include < atomic>
1213#include < chrono>
1314
1415#include < sys/stat.h>
1516#include < fstream>
1617#include < functional>
1718#include < iterator>
1819#include < memory>
20+ #include < optional>
1921#include < sstream>
2022#include < string>
2123#include < string_view>
2628#include < fmt/core.h>
2729#include < folly/Exception.h>
2830#include < folly/FileUtil.h>
31+ #include < folly/Indestructible.h>
2932#include < folly/SocketAddress.h>
3033
3134#include < folly/json/json.h>
3740#include < folly/coro/Task.h>
3841#include < folly/executors/CPUThreadPoolExecutor.h>
3942#include < folly/executors/thread_factory/NamedThreadFactory.h>
43+ #include < folly/futures/Future.h>
4044#include < folly/io/async/AsyncSignalHandler.h>
4145#include < folly/io/async/HHWheelTimer.h>
4246#include < folly/logging/xlog.h>
107111#include " eden/fs/utils/EdenError.h"
108112#include " eden/fs/utils/EdenTaskQueue.h"
109113#include " eden/fs/utils/FsChannelTypes.h"
114+ #include " eden/fs/utils/MountHealthCheck.h"
110115#include " eden/fs/utils/NfsSocket.h"
111116#include " eden/fs/utils/NotImplemented.h"
112117#include " eden/fs/utils/ProcUtil.h"
@@ -175,6 +180,100 @@ namespace {
175180
176181using namespace facebook ::eden;
177182
// Deadline applied to each phase of a periodic mount health check: the wait
// for a worker to pick the check up, and (armed separately once the worker
// starts) the run time of the check itself.
constexpr std::chrono::seconds kMountHealthCheckTimeout{5};
184+
185+ struct MountHealthIssueLogContext {
186+ MountHealthIssueLogContext (
187+ std::weak_ptr<EdenFsEventsLogger> edenFsEventsLogger,
188+ std::string mountPath,
189+ std::string repoSource)
190+ : edenFsEventsLogger_{std::move (edenFsEventsLogger)},
191+ mountPath{std::move (mountPath)},
192+ repoSource{std::move (repoSource)} {}
193+
194+ void log (std::string reason, std::string error) const {
195+ auto edenFsEventsLogger = edenFsEventsLogger_.lock ();
196+ if (!edenFsEventsLogger) {
197+ return ;
198+ }
199+ edenFsEventsLogger->logEvent (
200+ EdenMountHealthIssue{
201+ " daemon_periodic" ,
202+ std::move (reason),
203+ std::string{mountPath},
204+ std::string{mountPath},
205+ " configured_checkout" ,
206+ std::string{repoSource},
207+ std::move (error),
208+ false ,
209+ false });
210+ }
211+
212+ std::weak_ptr<EdenFsEventsLogger> edenFsEventsLogger_;
213+ std::string mountPath;
214+ std::string repoSource;
215+ };
216+
217+ folly::CPUThreadPoolExecutor* getMountHealthCheckThreadPool () {
218+ // Process-lifetime so a stuck filesystem syscall cannot make EdenServer
219+ // teardown wait for the health-check worker thread.
220+ static folly::Indestructible<folly::CPUThreadPoolExecutor> executor (
221+ 2 , std::make_shared<folly::NamedThreadFactory>(" MountHealthCheck" ));
222+ return executor.get ();
223+ }
224+
225+ void logMountHealthCheckTimeout (
226+ const MountHealthIssueLogContext& logContext,
227+ std::string error) {
228+ try {
229+ logContext.log (
230+ std::string{edenMountHealthIssueReasonString (
231+ EdenMountHealthIssueReason::DaemonRunningMountTimedOut)},
232+ std::move (error));
233+ } catch (const std::exception& ex) {
234+ XLOGF (ERR , " EdenFS mount health timeout logging failed: {}" , ex.what ());
235+ } catch (...) {
236+ XLOG (ERR , " EdenFS mount health timeout logging failed" );
237+ }
238+ }
239+
240+ void scheduleMountHealthCheckStartTimeout (
241+ std::shared_ptr<std::atomic_bool> started,
242+ std::shared_ptr<std::atomic_bool> completed,
243+ std::shared_ptr<const MountHealthIssueLogContext> logContext) {
244+ folly::futures::detachOnGlobalCPUExecutor (
245+ folly::futures::sleep (kMountHealthCheckTimeout )
246+ .deferValue ([started = std::move (started),
247+ completed = std::move (completed),
248+ logContext = std::move (logContext)](auto &&) {
249+ if (started->load (std::memory_order_acquire)) {
250+ return ;
251+ }
252+ if (completed->exchange (true )) {
253+ return ;
254+ }
255+ logMountHealthCheckTimeout (
256+ *logContext,
257+ " EdenFS mount health check did not start within timeout; "
258+ " worker pool may be saturated" );
259+ }));
260+ }
261+
262+ void scheduleMountHealthCheckRuntimeTimeout (
263+ std::shared_ptr<std::atomic_bool> completed,
264+ std::shared_ptr<const MountHealthIssueLogContext> logContext) {
265+ folly::futures::detachOnGlobalCPUExecutor (
266+ folly::futures::sleep (kMountHealthCheckTimeout )
267+ .deferValue ([completed = std::move (completed),
268+ logContext = std::move (logContext)](auto &&) {
269+ if (completed->exchange (true )) {
270+ return ;
271+ }
272+ logMountHealthCheckTimeout (
273+ *logContext, " EdenFS mount health check timed out" );
274+ }));
275+ }
276+
178277std::shared_ptr<Notifier> getPlatformNotifier (
179278 std::shared_ptr<ReloadableConfig> config,
180279 std::shared_ptr<EdenFsEventsLogger> edenFsEventsLogger,
@@ -655,6 +754,8 @@ EdenServer::EdenServer(
655754 edenConfig->prefetchOptimizations .getValue ()},
656755 prefetchFilesV2Executor_{
657756 makePrefetchFilesV2Threads (thriftUsePrefetchExecutor_, edenConfig)},
757+ runningMountHealthChecks_{std::make_shared<
758+ folly::Synchronized<std::unordered_set<std::string>>>()},
658759 progressManager_{
659760 std::make_unique<folly::Synchronized<EdenServer::ProgressManager>>()},
660761 startupStatusChannel_{std::move (startupStatusChannel)},
@@ -3324,6 +3425,71 @@ bool EdenServer::isWorkingCopyGCRunningForAnyMount() const {
33243425 return false ;
33253426}
33263427
3428+ void EdenServer::scheduleRunningMountHealthCheck (
3429+ const AbsolutePath& mountPath,
3430+ std::string repoSource) {
3431+ auto mountPathString = std::string{mountPath.view ()};
3432+ auto runningMountHealthChecks = runningMountHealthChecks_;
3433+ {
3434+ auto runningChecks = runningMountHealthChecks->wlock ();
3435+ if (!runningChecks->insert (mountPathString).second ) {
3436+ return ;
3437+ }
3438+ }
3439+ // Keep the mount marked in-flight until the worker returns. The timeout logs
3440+ // a stuck probe but cannot interrupt the blocked syscall; clearing this on
3441+ // timeout would let later ticks enqueue duplicate stuck probes.
3442+
3443+ auto started = make_shared<std::atomic_bool>(false );
3444+ auto completed = make_shared<std::atomic_bool>(false );
3445+ auto logContext = make_shared<const MountHealthIssueLogContext>(
3446+ serverState_->getEdenFsEventsLogger (),
3447+ std::move (mountPathString),
3448+ std::move (repoSource));
3449+
3450+ scheduleMountHealthCheckStartTimeout (started, completed, logContext);
3451+
3452+ // Run the potentially blocking filesystem probe on a dedicated pool. The
3453+ // runtime timeout is armed after the worker starts so executor queue delay
3454+ // and a hung EdenFS mount can be reported separately.
3455+ folly::via (
3456+ getMountHealthCheckThreadPool (),
3457+ [started, completed, logContext]() {
3458+ started->store (true , std::memory_order_release);
3459+ if (completed->load (std::memory_order_acquire)) {
3460+ return std::optional<EdenMountHealthCheckIssue>{};
3461+ }
3462+ scheduleMountHealthCheckRuntimeTimeout (completed, logContext);
3463+ return checkRunningEdenMountHealth (logContext->mountPath );
3464+ })
3465+ .thenTry (
3466+ [completed, logContext, runningMountHealthChecks](
3467+ folly::Try<std::optional<EdenMountHealthCheckIssue>>&& result) {
3468+ const auto alreadyCompleted = completed->exchange (true );
3469+ {
3470+ auto runningChecks = runningMountHealthChecks->wlock ();
3471+ runningChecks->erase (logContext->mountPath );
3472+ }
3473+ if (alreadyCompleted) {
3474+ return ;
3475+ }
3476+ if (result.hasException ()) {
3477+ XLOGF (
3478+ WARN ,
3479+ " EdenFS mount health check failed: {}" ,
3480+ result.exception ().what ());
3481+ return ;
3482+ }
3483+ const auto & issue = result.value ();
3484+ if (!issue.has_value ()) {
3485+ return ;
3486+ }
3487+ logContext->log (
3488+ std::string{edenMountHealthIssueReasonString (issue->reason )},
3489+ issue->error );
3490+ });
3491+ }
3492+
33273493void EdenServer::accidentalUnmountRecovery () {
33283494 XLOGF (DBG5 , " Performing accidental unmount recovery." );
33293495 folly::dynamic dirs = folly::dynamic::object ();
@@ -3349,7 +3515,9 @@ void EdenServer::accidentalUnmountRecovery() {
33493515 auto mountPath = canonicalPath (client.first .stringPiece ());
33503516 const auto it = mountPoints->find (mountPath);
33513517
3352- if (it == mountPoints->end ()) {
3518+ if (it != mountPoints->end ()) {
3519+ scheduleRunningMountHealthCheck (mountPath, client.second .asString ());
3520+ } else {
33533521 // This mount point is not currently mounted, but it was configured
33543522 // in config.json. This means that the client was unmounted.
33553523 // We should attempt to remount it, if it is unmounted accidentally.
0 commit comments