Skip to content

Commit 38ec2e7

Browse files
mehrdadnedoakes
authored andcommitted
Revert "Use Boost.Process instead of pid_t (#6510)" (#6909)
This reverts commit fb8e361.
1 parent 2fca550 commit 38ec2e7

11 files changed

+205
-335
lines changed

BUILD.bazel

-1
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,6 @@ cc_library(
399399
":stats_lib",
400400
":worker_rpc",
401401
"@boost//:asio",
402-
"@boost//:process",
403402
"@com_github_jupp0r_prometheus_cpp//pull",
404403
"@com_google_absl//absl/base:core_headers",
405404
"@com_google_absl//absl/container:flat_hash_set",

bazel/ray_deps_setup.bzl

-2
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,6 @@ def ray_deps_setup():
123123
# Backport Clang-Cl patch on Boost 1.69 to Boost <= 1.68:
124124
# https://lists.boost.org/Archives/boost/2018/09/243420.php
125125
"//thirdparty/patches:boost-type_traits-trivial_move.patch",
126-
# Partially backport waitpid() patch on Boost 1.72 to Boost <= 1.68
127-
"//thirdparty/patches:boost-process-teminate-waitpid-nohang.patch",
128126
],
129127
)
130128

python/ray/test_utils.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def wait_for_pid_to_exit(pid, timeout=20):
3939
return
4040
time.sleep(0.1)
4141
raise RayTestTimeoutException(
42-
"Timed out while waiting for process {} to exit.".format(pid))
42+
"Timed out while waiting for process to exit.")
4343

4444

4545
def wait_for_children_of_pid(pid, num_children=1, timeout=20):
@@ -51,8 +51,8 @@ def wait_for_children_of_pid(pid, num_children=1, timeout=20):
5151
return
5252
time.sleep(0.1)
5353
raise RayTestTimeoutException(
54-
"Timed out while waiting for process {} children to start "
55-
"({}/{} started).".format(pid, num_alive, num_children))
54+
"Timed out while waiting for process children to start "
55+
"({}/{} started).".format(num_alive, num_children))
5656

5757

5858
def wait_for_children_of_pid_to_exit(pid, timeout=20):

src/ray/raylet/node_manager.cc

+27-34
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,8 @@ NodeManager::NodeManager(boost::asio::io_service &io_service,
8989
object_manager_profile_timer_(io_service),
9090
initial_config_(config),
9191
local_available_resources_(config.resource_config),
92-
worker_pool_(io_service, config.num_initial_workers,
93-
config.maximum_startup_concurrency, gcs_client_,
94-
config.worker_commands),
92+
worker_pool_(config.num_initial_workers, config.maximum_startup_concurrency,
93+
gcs_client_, config.worker_commands),
9594
scheduling_policy_(local_queues_),
9695
reconstruction_policy_(
9796
io_service_,
@@ -229,23 +228,22 @@ void NodeManager::HandleUnexpectedWorkerFailure(
229228
}
230229

231230
void NodeManager::KillWorker(std::shared_ptr<Worker> worker) {
232-
#ifdef _WIN32
233-
// TODO(mehrdadn): Implement implement graceful process termination mechanism
234-
#else
235231
// If we're just cleaning up a single worker, allow it some time to clean
236232
// up its state before force killing. The client socket will be closed
237233
// and the worker struct will be freed after the timeout.
238-
kill(worker->Process().get()->id(), SIGTERM);
239-
#endif
234+
kill(worker->Pid(), SIGTERM);
240235

241236
auto retry_timer = std::make_shared<boost::asio::deadline_timer>(io_service_);
242237
auto retry_duration = boost::posix_time::milliseconds(
243238
RayConfig::instance().kill_worker_timeout_milliseconds());
244239
retry_timer->expires_from_now(retry_duration);
245240
retry_timer->async_wait([retry_timer, worker](const boost::system::error_code &error) {
246-
RAY_LOG(DEBUG) << "Send SIGKILL to worker, pid=" << worker->Process().get()->id();
247-
// Force kill worker
248-
worker->Process().get()->terminate();
241+
RAY_LOG(DEBUG) << "Send SIGKILL to worker, pid=" << worker->Pid();
242+
// Force kill worker. TODO(mehrdadn, rkn): The worker may have already died
243+
// and had its PID reassigned to a different process, at least on Windows.
244+
// On Linux, this may or may not be the case, depending on e.g. whether
245+
// the process has been already waited on. Regardless, this must be fixed.
246+
kill(worker->Pid(), SIGKILL);
249247
});
250248
}
251249

@@ -857,9 +855,8 @@ void NodeManager::ProcessClientMessage(
857855
RAY_LOG(DEBUG) << "[Worker] Message "
858856
<< protocol::EnumNameMessageType(message_type_value) << "("
859857
<< message_type << ") from worker with PID "
860-
<< (registered_worker
861-
? std::to_string(registered_worker->Process().get()->id())
862-
: "nil");
858+
<< (registered_worker ? std::to_string(registered_worker->Pid())
859+
: "nil");
863860
if (registered_worker && registered_worker->IsDead()) {
864861
// For a worker that is marked as dead (because the job has died already),
865862
// all the messages are ignored except DisconnectClient.
@@ -966,6 +963,12 @@ void NodeManager::ProcessClientMessage(
966963
void NodeManager::ProcessRegisterClientRequestMessage(
967964
const std::shared_ptr<LocalClientConnection> &client, const uint8_t *message_data) {
968965
client->Register();
966+
auto message = flatbuffers::GetRoot<protocol::RegisterClientRequest>(message_data);
967+
Language language = static_cast<Language>(message->language());
968+
WorkerID worker_id = from_flatbuf<WorkerID>(*message->worker_id());
969+
auto worker = std::make_shared<Worker>(worker_id, message->worker_pid(), language,
970+
message->port(), client, client_call_manager_);
971+
Status status;
969972
flatbuffers::FlatBufferBuilder fbb;
970973
auto reply =
971974
ray::protocol::CreateRegisterClientReply(fbb, to_flatbuf(fbb, self_node_id_));
@@ -980,31 +983,24 @@ void NodeManager::ProcessRegisterClientRequestMessage(
980983
}
981984
});
982985

983-
auto message = flatbuffers::GetRoot<protocol::RegisterClientRequest>(message_data);
984-
Language language = static_cast<Language>(message->language());
985-
WorkerID worker_id = from_flatbuf<WorkerID>(*message->worker_id());
986-
pid_t pid = message->worker_pid();
987-
auto worker = std::make_shared<Worker>(worker_id, language, message->port(), client,
988-
client_call_manager_);
989986
if (message->is_worker()) {
990987
// Register the new worker.
991-
if (worker_pool_.RegisterWorker(worker, pid).ok()) {
988+
if (worker_pool_.RegisterWorker(worker).ok()) {
992989
HandleWorkerAvailable(worker->Connection());
993990
}
994991
} else {
995992
// Register the new driver.
996-
worker->SetProcess(ProcessHandle::FromPid(pid));
997993
const JobID job_id = from_flatbuf<JobID>(*message->job_id());
998994
// Compute a dummy driver task id from a given driver.
999995
const TaskID driver_task_id = TaskID::ComputeDriverTaskId(worker_id);
1000996
worker->AssignTaskId(driver_task_id);
1001997
worker->AssignJobId(job_id);
1002-
Status status = worker_pool_.RegisterDriver(worker);
998+
status = worker_pool_.RegisterDriver(worker);
1003999
if (status.ok()) {
10041000
local_queues_.AddDriverTaskId(driver_task_id);
1005-
auto job_data_ptr =
1006-
gcs::CreateJobTableData(job_id, /*is_dead*/ false, std::time(nullptr),
1007-
initial_config_.node_manager_address, pid);
1001+
auto job_data_ptr = gcs::CreateJobTableData(
1002+
job_id, /*is_dead*/ false, std::time(nullptr),
1003+
initial_config_.node_manager_address, message->worker_pid());
10081004
RAY_CHECK_OK(gcs_client_->Jobs().AsyncAdd(job_data_ptr, nullptr));
10091005
}
10101006
}
@@ -1200,8 +1196,7 @@ void NodeManager::ProcessDisconnectClientMessage(
12001196
cluster_resource_map_[self_node_id_].Release(lifetime_resources.ToResourceSet());
12011197
worker->ResetLifetimeResourceIds();
12021198

1203-
RAY_LOG(DEBUG) << "Worker (pid=" << worker->Process().get()->id()
1204-
<< ") is disconnected. "
1199+
RAY_LOG(DEBUG) << "Worker (pid=" << worker->Pid() << ") is disconnected. "
12051200
<< "job_id: " << worker->GetAssignedJobId();
12061201

12071202
// Since some resources may have been released, we can try to dispatch more tasks.
@@ -1215,8 +1210,7 @@ void NodeManager::ProcessDisconnectClientMessage(
12151210
local_queues_.RemoveDriverTaskId(TaskID::ComputeDriverTaskId(driver_id));
12161211
worker_pool_.DisconnectDriver(worker);
12171212

1218-
RAY_LOG(DEBUG) << "Driver (pid=" << worker->Process().get()->id()
1219-
<< ") is disconnected. "
1213+
RAY_LOG(DEBUG) << "Driver (pid=" << worker->Pid() << ") is disconnected. "
12201214
<< "job_id: " << job_id;
12211215
}
12221216

@@ -2296,8 +2290,7 @@ void NodeManager::AssignTask(const std::shared_ptr<Worker> &worker, const Task &
22962290
}
22972291

22982292
RAY_LOG(DEBUG) << "Assigning task " << spec.TaskId() << " to worker with pid "
2299-
<< worker->Process().get()->id()
2300-
<< ", worker id: " << worker->WorkerId();
2293+
<< worker->Pid() << ", worker id: " << worker->WorkerId();
23012294
flatbuffers::FlatBufferBuilder fbb;
23022295

23032296
// Resource accounting: acquire resources for the assigned task.
@@ -3128,7 +3121,7 @@ void NodeManager::HandleGetNodeStats(const rpc::GetNodeStatsRequest &request,
31283121
rpc::SendReplyCallback send_reply_callback) {
31293122
for (const auto &driver : worker_pool_.GetAllDrivers()) {
31303123
auto worker_stats = reply->add_workers_stats();
3131-
worker_stats->set_pid(driver->Process().get()->id());
3124+
worker_stats->set_pid(driver->Pid());
31323125
worker_stats->set_is_driver(true);
31333126
}
31343127
for (const auto task : local_queues_.GetTasks(TaskState::INFEASIBLE)) {
@@ -3191,7 +3184,7 @@ void NodeManager::HandleGetNodeStats(const rpc::GetNodeStatsRequest &request,
31913184
<< status.ToString();
31923185
} else {
31933186
auto worker_stats = reply->add_workers_stats();
3194-
worker_stats->set_pid(worker->Process().get()->id());
3187+
worker_stats->set_pid(worker->Pid());
31953188
worker_stats->set_is_driver(false);
31963189
reply->set_num_workers(reply->num_workers() + 1);
31973190
worker_stats->mutable_core_worker_stats()->MergeFrom(r.core_worker_stats());

src/ray/raylet/worker.cc

+3-7
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,11 @@ namespace ray {
1212
namespace raylet {
1313

1414
/// A constructor responsible for initializing the state of a worker.
15-
Worker::Worker(const WorkerID &worker_id, const Language &language, int port,
15+
Worker::Worker(const WorkerID &worker_id, pid_t pid, const Language &language, int port,
1616
std::shared_ptr<LocalClientConnection> connection,
1717
rpc::ClientCallManager &client_call_manager)
1818
: worker_id_(worker_id),
19+
pid_(pid),
1920
language_(language),
2021
port_(port),
2122
connection_(connection),
@@ -41,12 +42,7 @@ bool Worker::IsBlocked() const { return blocked_; }
4142

4243
WorkerID Worker::WorkerId() const { return worker_id_; }
4344

44-
ProcessHandle Worker::Process() const { return proc_; }
45-
46-
void Worker::SetProcess(const ProcessHandle &proc) {
47-
RAY_CHECK(!proc_); // this procedure should not be called multiple times
48-
proc_ = proc;
49-
}
45+
pid_t Worker::Pid() const { return pid_; }
5046

5147
Language Worker::GetLanguage() const { return language_; }
5248

src/ray/raylet/worker.h

+7-8
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
#include "ray/common/task/task.h"
1010
#include "ray/common/task/task_common.h"
1111
#include "ray/rpc/worker/core_worker_client.h"
12-
#include "ray/util/process.h"
12+
13+
#include <unistd.h> // pid_t
1314

1415
namespace ray {
1516

@@ -21,8 +22,7 @@ namespace raylet {
2122
class Worker {
2223
public:
2324
/// A constructor that initializes a worker object.
24-
/// NOTE: You MUST manually set the worker process.
25-
Worker(const WorkerID &worker_id, const Language &language, int port,
25+
Worker(const WorkerID &worker_id, pid_t pid, const Language &language, int port,
2626
std::shared_ptr<LocalClientConnection> connection,
2727
rpc::ClientCallManager &client_call_manager);
2828
/// A destructor responsible for freeing all worker state.
@@ -34,9 +34,8 @@ class Worker {
3434
bool IsBlocked() const;
3535
/// Return the worker's ID.
3636
WorkerID WorkerId() const;
37-
/// Return the worker process.
38-
ProcessHandle Process() const;
39-
void SetProcess(const ProcessHandle &proc);
37+
/// Return the worker's PID.
38+
pid_t Pid() const;
4039
Language GetLanguage() const;
4140
int Port() const;
4241
void AssignTaskId(const TaskID &task_id);
@@ -80,8 +79,8 @@ class Worker {
8079
private:
8180
/// The worker's ID.
8281
WorkerID worker_id_;
83-
/// The worker's process.
84-
ProcessHandle proc_;
82+
/// The worker's PID.
83+
pid_t pid_;
8584
/// The language type of this worker.
8685
Language language_;
8786
/// Port that this worker listens on.

0 commit comments

Comments
 (0)