diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 84f423fa583..963d4201afe 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -355,7 +355,8 @@ if (ENABLE_CGROUPS_v2) linux/ebpf.cpp slave/containerizer/mesos/isolators/cgroups2/controller.cpp slave/containerizer/mesos/isolators/cgroups2/controllers/core.cpp - slave/containerizer/mesos/isolators/cgroups2/controllers/cpu.cpp) + slave/containerizer/mesos/isolators/cgroups2/controllers/cpu.cpp + slave/containerizer/mesos/isolators/cgroups2/controllers/memory.cpp) endif () diff --git a/src/Makefile.am b/src/Makefile.am index 3677df5076e..779b893fca3 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1503,7 +1503,9 @@ MESOS_LINUX_FILES += \ slave/containerizer/mesos/isolators/cgroups2/controllers/core.cpp \ slave/containerizer/mesos/isolators/cgroups2/controllers/core.hpp \ slave/containerizer/mesos/isolators/cgroups2/controllers/cpu.cpp \ - slave/containerizer/mesos/isolators/cgroups2/controllers/cpu.hpp + slave/containerizer/mesos/isolators/cgroups2/controllers/cpu.hpp \ + slave/containerizer/mesos/isolators/cgroups2/controllers/memory.cpp \ + slave/containerizer/mesos/isolators/cgroups2/controllers/memory.hpp endif if ENABLE_SECCOMP_ISOLATOR diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp index a53e65cfbf3..ae6e5aa7854 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp +++ b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp @@ -20,6 +20,7 @@ #include "slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp" #include "slave/containerizer/mesos/isolators/cgroups2/controllers/core.hpp" #include "slave/containerizer/mesos/isolators/cgroups2/controllers/cpu.hpp" +#include "slave/containerizer/mesos/isolators/cgroups2/controllers/memory.hpp" #include #include @@ -74,7 +75,8 @@ Try Cgroups2IsolatorProcess::create(const Flags& flags) { hashmap>(*)(const Flags&)> creators = { {"core", &CoreControllerProcess::create}, - {"cpu", &CpuControllerProcess::create} + {"cpu", &CpuControllerProcess::create}, + {"mem", &MemoryControllerProcess::create} }; hashmap> controllers; diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/constants.hpp b/src/slave/containerizer/mesos/isolators/cgroups2/constants.hpp index dafc7f92fff..9498a4779fd 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups2/constants.hpp +++ b/src/slave/containerizer/mesos/isolators/cgroups2/constants.hpp @@ -32,8 +32,12 @@ const uint64_t MIN_CPU_SHARES = 2; // Linux constant. const Duration CPU_CFS_PERIOD = Milliseconds(100); // Linux default. const Duration MIN_CPU_CFS_QUOTA = Milliseconds(1); +// Memory controller constants. +const Bytes MIN_MEMORY = Megabytes(32); + const std::string CGROUPS_V2_CONTROLLER_CORE_NAME = "core"; const std::string CGROUPS_V2_CONTROLLER_CPU_NAME = "cpu"; +const std::string CGROUPS_V2_CONTROLLER_MEMORY_NAME = "memory"; } // namespace slave { } // namespace internal { diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/controllers/memory.cpp b/src/slave/containerizer/mesos/isolators/cgroups2/controllers/memory.cpp new file mode 100644 index 00000000000..5a69cb95acf --- /dev/null +++ b/src/slave/containerizer/mesos/isolators/cgroups2/controllers/memory.cpp @@ -0,0 +1,394 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include + +#include "common/protobuf_utils.hpp" + +#include "linux/cgroups2.hpp" + +#include "slave/containerizer/mesos/isolators/cgroups2/constants.hpp" +#include "slave/containerizer/mesos/isolators/cgroups2/controllers/memory.hpp" + +using process::Failure; +using process::Future; +using process::PID; +using process::Owned; + +using cgroups2::memory::Stats; + +using mesos::slave::ContainerConfig; +using mesos::slave::ContainerLimitation; + +using std::ostringstream; +using std::string; + +namespace mesos { +namespace internal { +namespace slave { + +Try> MemoryControllerProcess::create(const Flags& flags) +{ + return Owned(new MemoryControllerProcess(flags)); +} + + +MemoryControllerProcess::MemoryControllerProcess(const Flags& _flags) + : ProcessBase(process::ID::generate("cgroups-v2-memory-controller")), + ControllerProcess(_flags) {} + + +string MemoryControllerProcess::name() const +{ + return CGROUPS_V2_CONTROLLER_MEMORY_NAME; +} + + +Future MemoryControllerProcess::prepare( + const ContainerID& containerId, + const string& cgroup, + const ContainerConfig& containerConfig) +{ + if (infos.contains(containerId)) { + return Failure("Already prepared"); + } + + infos.put(containerId, Info()); + + oomListen(containerId, cgroup); + + return Nothing(); +} + + +Future MemoryControllerProcess::isolate( + const ContainerID& containerId, + const string& cgroup, + pid_t pid) +{ + if (!infos.contains(containerId)) { + return Failure("Unknown container"); + } + + // TODO(dleamy): Implement manual OOM score adjustment, similar to as it done + // in the cgroups v1 isolator. + + return Nothing(); +} + + +Future MemoryControllerProcess::recover( + const ContainerID& containerId, + const string& cgroup) +{ + if (infos.contains(containerId)) { + return Failure("Already recovered"); + } + + infos.put(containerId, Info()); + infos[containerId].hardLimitUpdated = true; + + oomListen(containerId, cgroup); + + return Nothing(); +} + + +Future MemoryControllerProcess::watch( + const ContainerID& containerId, + const string& cgroup) +{ + if (!infos.contains(containerId)) { + return Failure("Unknown container"); + } + + return infos[containerId].limitation.future(); +} + + +Future MemoryControllerProcess::update( + const ContainerID& containerId, + const string& cgroup, + const Resources& resourceRequests, + const google::protobuf::Map& resourceLimits) +{ + if (!infos.contains(containerId)) { + return Failure("Unknown container"); + } + + if (resourceRequests.mem().isNone()) { + return Failure("No memory resources requested"); + } + + Bytes memory = *resourceRequests.mem(); + Bytes softLimit = std::max(memory, MIN_MEMORY); + + // Set the soft memory limit. + Try high = cgroups2::memory::set_high(cgroup, softLimit); + if (high.isError()) { + return Failure("Failed to set soft memory limit: " + high.error()); + } + + LOG(INFO) << "Updated soft memory limit to " << softLimit << " for container " + << containerId; + + // Determine the new hard memory limit. + Option newHardLimit = [&resourceLimits, &softLimit]() -> Option + { + if (resourceLimits.count("mem") > 0) { + double requestedLimit = resourceLimits.at("mem").value(); + if (std::isinf(requestedLimit)) { + return None(); + } + + return std::max( + Megabytes(static_cast(requestedLimit)), MIN_MEMORY); + } + + return softLimit; + }(); + + Result currentHardLimit = cgroups2::memory::max(cgroup); + if (currentHardLimit.isError()) { + return Failure("Failed to get current hard memory limit: " + + currentHardLimit.error()); + } + + // We only update the hard limit if: + // 1) The hard limit has not yet been set for the container, or + // 2) The new hard limit is greater than the existing hard limit. + // + // This is done to avoid the chance of triggering an OOM by reducing the + // hard limit to below the current memory usage. + + bool updateHardLimit = !infos[containerId].hardLimitUpdated + || newHardLimit.isNone() // infinite memory limit + || *newHardLimit > *currentHardLimit; + + if (updateHardLimit) { + Try max = cgroups2::memory::set_max(cgroup, newHardLimit); + if (max.isError()) { + return Failure("Failed to set hard memory limit: " + max.error()); + } + + infos[containerId].hardLimitUpdated = true; + } + + return Nothing(); +} + + +Future MemoryControllerProcess::usage( + const ContainerID& containerId, + const string& cgroup) +{ + if (!infos.contains(containerId)) { + return Failure("Cannot report resource usage for unknown container " + "'" + stringify(containerId) + "'"); + } + + LOG(INFO) << "Collecting 'memory' usage statistics for container " + << containerId; + + ResourceStatistics stats; + + // User memory usage. + Try usage = cgroups2::memory::usage(cgroup); + if (usage.isError()) { + return Failure("Failed to get user memory usage: " + usage.error()); + } + + stats.set_mem_total_bytes(usage->bytes()); + stats.set_mem_rss_bytes(usage->bytes()); + + // Kernel memory usage. + Try memoryStats = cgroups2::memory::stats(cgroup); + if (memoryStats.isError()) { + return Failure("Failed to get cgroup memory stats: " + memoryStats.error()); + } + + stats.set_mem_kmem_usage_bytes(memoryStats->kernel.bytes()); + + // Kernel TCP buffers usage. + stats.set_mem_kmem_tcp_usage_bytes(memoryStats->sock.bytes()); + + // Page cache usage. + stats.set_mem_file_bytes(memoryStats->file.bytes()); + stats.set_mem_cache_bytes(memoryStats->file.bytes()); + + // Anonymous memory usage. + stats.set_mem_anon_bytes(memoryStats->anon.bytes()); + + // File mapped memory usage. + stats.set_mem_mapped_file_bytes(memoryStats->file_mapped.bytes()); + + // Total unevictable memory. + // Equivalent to 'memory.min', the minimum amount of memory that will never + // be reclaimed by the kernel. + Try min = cgroups2::memory::min(cgroup); + if (min.isError()) { + return Failure("Failed to get unrevocable memory size: " + min.error()); + } + + stats.set_mem_unevictable_bytes(min->bytes()); + + return stats; +} + + +Future MemoryControllerProcess::cleanup( + const ContainerID& containerId, + const string& cgroup) +{ + if (!infos.contains(containerId)) { + LOG(INFO) << "Ignoring memory cleanup for unknown container " + << containerId; + + return Nothing(); + } + + if (infos[containerId].oom.isPending()) { + infos[containerId].oom.discard(); + } + + infos.erase(containerId); + + return Nothing(); +} + + +void MemoryControllerProcess::oomListen( + const ContainerID& containerId, + const string& cgroup) +{ + if (!infos.contains(containerId)) { + LOG(ERROR) << "Cannot listen for OOM events for unknown container " + << containerId; + return; + } + + infos[containerId].oom = cgroups2::memory::oom(cgroup); + + LOG(INFO) << "Listening for OOM events for container " + << containerId; + + infos[containerId].oom.onAny( + defer(PID(this), + &MemoryControllerProcess::oomed, + containerId, + cgroup, + lambda::_1)); +} + + +void MemoryControllerProcess::oomed( + const ContainerID& containerId, + const string& cgroup, + const Future& oom) +{ + if (oom.isDiscarded()) { + LOG(INFO) << "OOM event listener discarded"; + return; + } + + if (oom.isFailed()) { + LOG(ERROR) << "OOM event listener failed: " << oom.failure(); + return; + } + + if (!infos.contains(containerId)) { + // It is likely that process exited is executed before this + // function (e.g. The kill and OOM events happen at the same time, + // and the process exit event arrives first). Therefore, we should + // not report a fatal error here. + LOG(INFO) << "OOM event received for terminated container"; + return; + } + + LOG(INFO) << "OOM detected for container" << containerId; + + // Construct a message for the limitation to help with debugging + // the OOM. + + ostringstream limitMessage; + limitMessage << "Memory limit exceeded: "; + + // TODO(dleamy): Report the peak memory usage of the container. The + // 'memory.peak' control is only available on newer Linux + // kernels. + + // Report memory statistics. + limitMessage << "\nMEMORY STATISTICS: \n"; + Try memoryStats = cgroups2::memory::stats(cgroup); + if (memoryStats.isError()) { + LOG(ERROR) << "Failed to get cgroup memory stats: " << memoryStats.error(); + return; + } + + limitMessage << "Anon: " << memoryStats->anon << "\n"; + limitMessage << "File: " << memoryStats->file << "\n"; + limitMessage << "Kernel: " << memoryStats->kernel << "\n"; + limitMessage << "Kernel TCP: " << memoryStats->sock << "\n"; + limitMessage << "Kernel Stack: " << memoryStats->kernel_stack << "\n"; + limitMessage << "Page Tables: " << memoryStats->pagetables << "\n"; + limitMessage << "VMalloc: " << memoryStats->vmalloc << "\n"; + + // TODO(dleamy): Report 'total_mapped_file' cgroups v2 equivalent. + // "The amount of file backed memory, in bytes, mapped into + // a process' address space." + + LOG(INFO) << limitMessage.str(); + + Result hardLimit = cgroups2::memory::max(cgroup); + if (hardLimit.isError()) { + LOG(ERROR) << "Failed to get hard memory limit: " << hardLimit.error(); + return; + } + + if (hardLimit.isNone()) { + LOG(ERROR) << "No hard limit was set for the container - unexpected OOM"; + return; + } + + // Complete the container limitation promise with a memory resource + // limitation. + // + // TODO(jieyu): This is not accurate if the memory resource is from + // a non-star role or spans roles (e.g., "*" and "role"). Ideally, + // we should save the resources passed in and report it here. + // + // TODO(dleamy): We report the hard limit because not all machines have + // access to 'memory.peak', the peak memory usage of the + // cgroup. + double megabytes = (double) hardLimit->bytes() / Bytes::MEGABYTES; + Resources memory = *Resources::parse("mem", stringify(megabytes), "*"); + + infos[containerId].limitation.set( + protobuf::slave::createContainerLimitation( + memory, + limitMessage.str(), + TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY)); +} + +} // namespace slave { +} // namespace internal { +} // namespace mesos { \ No newline at end of file diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/controllers/memory.hpp b/src/slave/containerizer/mesos/isolators/cgroups2/controllers/memory.hpp new file mode 100644 index 00000000000..4a80717e670 --- /dev/null +++ b/src/slave/containerizer/mesos/isolators/cgroups2/controllers/memory.hpp @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef __MEMORY_HPP__ +#define __MEMORY_HPP__ + +#include + +#include + +#include + +#include "slave/flags.hpp" +#include "slave/containerizer/mesos/isolators/cgroups2/controller.hpp" + +namespace mesos { +namespace internal { +namespace slave { + +class MemoryControllerProcess : public ControllerProcess +{ +public: + static Try> create( + const Flags& flags); + + ~MemoryControllerProcess() override = default; + + std::string name() const override; + + process::Future prepare( + const ContainerID& containerId, + const std::string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) override; + + process::Future isolate( + const ContainerID& containerId, + const std::string& cgroup, + pid_t pid) override; + + process::Future recover( + const ContainerID& containerId, + const std::string& cgroup) override; + + process::Future watch( + const ContainerID& containerId, + const std::string& cgroup) override; + + process::Future update( + const ContainerID& containerId, + const std::string& cgroup, + const Resources& resourceRequests, + const google::protobuf::Map< + std::string, Value::Scalar>& resourceLimits = {}) override; + + process::Future usage( + const ContainerID& containerId, + const std::string& cgroup) override; + + process::Future cleanup( + const ContainerID& containerId, + const std::string& cgroup) override; + +private: + struct Info + { + process::Future oom; + + process::Promise limitation; + + // Check if the hard memory limit has been updated for the container. + // Also true if the container was recovered. + bool hardLimitUpdated = false; + }; + + MemoryControllerProcess(const Flags& flags); + + void oomListen( + const ContainerID& containerId, + const std::string& cgroup); + + void oomed( + const ContainerID& containerId, + const std::string& cgroup, + const process::Future& oomFuture); + + hashmap infos; +}; + +} // namespace slave { +} // namespace internal { +} // namespace mesos { + +#endif // __MEMORY_HPP__ \ No newline at end of file diff --git a/src/tests/containerizer/memory_isolator_tests.cpp b/src/tests/containerizer/memory_isolator_tests.cpp index ec0f359a253..be8475f515b 100644 --- a/src/tests/containerizer/memory_isolator_tests.cpp +++ b/src/tests/containerizer/memory_isolator_tests.cpp @@ -45,7 +45,7 @@ namespace internal { namespace tests { class MemoryIsolatorTest - : public MesosTest, + : public ContainerizerTest, public WithParamInterface {}; @@ -152,6 +152,11 @@ TEST_P(MemoryIsolatorTest, ROOT_MemUsage) // Metrics for kmem are only enabled with memory isolation. if (GetParam() == "cgroups/mem") { + // Check that at least one page of kernel memory is used. Each page is + // 4096 bytes so we expect at least that much memory to be used. +#ifdef ENABLE_CGROUPS_V2 + ASSERT_LT(4096u, usage->mem_kmem_usage_bytes()); +#else Result hierarchy = cgroups::hierarchy("memory"); ASSERT_SOME(hierarchy); @@ -160,11 +165,10 @@ TEST_P(MemoryIsolatorTest, ROOT_MemUsage) ASSERT_SOME(kmemExists); if (kmemExists.get()) { - // We can assume more than 4096 bytes here, since a kernel page size is - // 4kB and we are allocating at least one. ASSERT_LT(4096u, usage->mem_kmem_usage_bytes()); } - } +#endif // ENABLE_CGROUPS_V2 + } driver.stop(); driver.join(); diff --git a/src/tests/mesos.cpp b/src/tests/mesos.cpp index 762200cda73..ffbb30b9551 100644 --- a/src/tests/mesos.cpp +++ b/src/tests/mesos.cpp @@ -586,9 +586,7 @@ slave::Flags ContainerizerTest::CreateSlaveFlags() // Use cgroup isolators if they're available and we're root. // TODO(idownes): Refactor the cgroups/non-cgroups code. if (cgroupsV2() && *user == "root") { - // TODO(dleamy): Add the memory isolator once it's supported by the cgroups - // v2 isolator. - flags.isolation = "cgroups/cpu"; + flags.isolation = "cgroups/cpu,cgroups/mem"; flags.cgroups_hierarchy = "/sys/fs/cgroup"; flags.cgroups_root = TEST_CGROUPS_ROOT; } else if (cgroups::enabled() && *user == "root") {