Skip to content

Commit b31c6f1

Browse files
yfeldblum authored and facebook-github-bot committed
optimize CacheLocality load-time init from sysfs on aarch64
Summary: There are a few tricks we can do to optimize reading from the sysfs tree.
* Use `openat` instead of `open` to reduce the cost of path-walking.
* `read` each file fully into a big-enough stack buffer instead of going through `ifstream` and `getline`. The files are single-line and tiny.
* When we need two files together, only read the second one if reading the first one succeeded. We use failed reads as sentinels.
Reduces cold init cost from >=5ms to >=4ms on one dev server.
Reviewed By: dmm-fb
Differential Revision: D78570201
fbshipit-source-id: c946e9fdffe654b5f47b8c41200810314b0ec809
1 parent f3c4879 commit b31c6f1

File tree

5 files changed

+70
-19
lines changed

5 files changed

+70
-19
lines changed

folly/concurrency/BUCK

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,10 @@ non_fbcode_target(
119119
"//third-party/glog:glog",
120120
"//xplat/folly:hash_hash",
121121
"//xplat/folly:optional",
122+
"//xplat/folly:portability_fcntl",
122123
"//xplat/folly:portability_unistd",
123124
"//xplat/folly:system_thread_id",
125+
"//xplat/folly/container:reserve",
124126
"//xplat/folly/lang:exception",
125127
],
126128
exported_deps = [
@@ -227,9 +229,11 @@ fbcode_target(
227229
"//folly:indestructible",
228230
"//folly:memory",
229231
"//folly:scope_guard",
232+
"//folly/container:reserve",
230233
"//folly/detail:static_singleton_manager",
231234
"//folly/hash:hash",
232235
"//folly/lang:exception",
236+
"//folly/portability:fcntl",
233237
"//folly/portability:unistd",
234238
"//folly/system:thread_id",
235239
],

folly/concurrency/CacheLocality.cpp

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#define _GNU_SOURCE 1 // for RTLD_NOLOAD
2121
#include <dlfcn.h>
2222
#endif
23+
#include <filesystem>
2324
#include <fstream>
2425
#include <mutex>
2526
#include <numeric>
@@ -31,9 +32,11 @@
3132
#include <folly/Indestructible.h>
3233
#include <folly/Memory.h>
3334
#include <folly/ScopeGuard.h>
35+
#include <folly/container/Reserve.h>
3436
#include <folly/detail/StaticSingletonManager.h>
3537
#include <folly/hash/Hash.h>
3638
#include <folly/lang/Exception.h>
39+
#include <folly/portability/Fcntl.h>
3740
#include <folly/portability/Unistd.h>
3841
#include <folly/system/ThreadId.h>
3942

@@ -179,30 +182,67 @@ static size_t parseLeadingNumber(const std::string& line) {
179182
return val;
180183
}
181184

182-
CacheLocality CacheLocality::readFromSysfsTree(
183-
const std::function<std::string(std::string const&)>& mapping) {
185+
CacheLocality CacheLocality::readFromSysfsTree(std::string_view root) {
186+
#if defined(_WIN32)
187+
// windows does not have openat and open flag constants
188+
return CacheLocality::uniform(0);
189+
#else
190+
184191
// the list of cache equivalence classes, where equivalence classes
185192
// are named by the smallest cpu in the class
186193
std::vector<std::vector<size_t>> equivClassesByCpu;
187194

195+
auto rdfile = [&](auto dirfd, auto name) {
196+
auto fd = ::openat(dirfd, name.c_str(), O_CLOEXEC, O_RDONLY);
197+
if (fd < 0) {
198+
return std::string(); // stop condition for the inner loop below
199+
}
200+
auto fdg = makeGuard([=] { ::close(fd); });
201+
alignas(64) char buf[64];
202+
int ret = 0;
203+
do {
204+
ret = ::pread(fd, buf, sizeof(buf), 0);
205+
} while (ret < 0 && errno == EINVAL);
206+
if (ret < 0) {
207+
return std::string();
208+
}
209+
return std::string(buf, to_unsigned(ret));
210+
};
211+
212+
auto subroot = std::filesystem::path(root) / "sys/devices/system/cpu";
213+
int allfd = ::open(subroot.c_str(), O_DIRECTORY | O_CLOEXEC, O_RDONLY);
214+
assert(allfd >= 0);
215+
size_t maxindex = 0;
188216
for (size_t cpu = 0;; ++cpu) {
217+
auto cpuroot = fmt::format("cpu{}/cache", cpu);
218+
int cpufd =
219+
::openat(allfd, cpuroot.c_str(), O_DIRECTORY | O_CLOEXEC, O_RDONLY);
220+
if (cpufd < 0) {
221+
break;
222+
}
189223
std::vector<size_t> levels;
224+
grow_capacity_by(levels, maxindex);
190225
for (size_t index = 0;; ++index) {
191-
auto dir = fmt::format(
192-
"/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
193-
auto cacheType = mapping(dir + "type");
194-
auto equivStr = mapping(dir + "shared_cpu_list");
195-
if (cacheType.empty() || equivStr.empty()) {
226+
auto dir = fmt::format("index{}/", index);
227+
auto cacheType = rdfile(cpufd, dir + "type");
228+
if (cacheType.empty()) {
196229
// no more caches
197230
break;
198231
}
199232
if (cacheType[0] == 'I') {
200233
// cacheType in { "Data", "Instruction", "Unified" }. skip icache
201234
continue;
202235
}
236+
// only try to read the second file once we know we will need it
237+
auto equivStr = rdfile(cpufd, dir + "shared_cpu_list");
238+
if (equivStr.empty()) {
239+
// no more caches
240+
break;
241+
}
203242
auto equiv = parseLeadingNumber(equivStr);
204243
levels.push_back(equiv);
205244
}
245+
maxindex = std::max(maxindex, levels.size());
206246

207247
if (levels.empty()) {
208248
// no levels at all for this cpu, we must be done
@@ -216,15 +256,12 @@ CacheLocality CacheLocality::readFromSysfsTree(
216256
}
217257

218258
return CacheLocality{std::move(equivClassesByCpu)};
259+
260+
#endif
219261
}
220262

221263
CacheLocality CacheLocality::readFromSysfs() {
222-
return readFromSysfsTree([](std::string const& name) {
223-
std::ifstream xi(name.c_str());
224-
std::string rv;
225-
std::getline(xi, rv);
226-
return rv;
227-
});
264+
return readFromSysfsTree();
228265
}
229266

230267
namespace {

folly/concurrency/CacheLocality.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <functional>
2424
#include <limits>
2525
#include <string>
26+
#include <string_view>
2627
#include <type_traits>
2728
#include <vector>
2829

@@ -106,8 +107,7 @@ struct CacheLocality {
106107
/// not exist. The function will be called with paths of the form
107108
/// /sys/devices/system/cpu/cpu*/cache/index*/{type,shared_cpu_list} .
108109
/// Throws an exception if no caches can be parsed at all.
109-
static CacheLocality readFromSysfsTree(
110-
const std::function<std::string(std::string const&)>& mapping);
110+
static CacheLocality readFromSysfsTree(std::string_view root = "/");
111111

112112
/// Reads CacheLocality information from the real sysfs filesystem.
113113
/// Throws an exception if no cache information can be loaded.

folly/concurrency/test/BUCK

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ fbcode_target(
6363
"//folly/portability:sys_resource",
6464
"//folly/portability:unistd",
6565
"//folly/test:test_utils",
66+
"//folly/testing:test_util",
6667
],
6768
external_deps = [
6869
"glog",

folly/concurrency/test/CacheLocalityTest.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,14 @@
2626
#include <folly/portability/SysResource.h>
2727
#include <folly/portability/Unistd.h>
2828
#include <folly/test/TestUtils.h>
29+
#include <folly/testing/TestUtil.h>
2930

3031
#include <glog/logging.h>
3132

3233
using namespace folly;
3334

35+
#if !defined(_WIN32)
36+
3437
/// This is the relevant nodes from a production box's sysfs tree. If you
3538
/// think this map is ugly you should see the version of this test that
3639
/// used a real directory tree. To reduce the chance of testing error
@@ -309,10 +312,14 @@ static std::unordered_map<std::string, std::string> fakeSysfsTree = {
309312
{"/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified"}};
310313

311314
TEST(CacheLocality, FakeSysfs) {
312-
auto parsed = CacheLocality::readFromSysfsTree([](std::string name) {
313-
auto iter = fakeSysfsTree.find(name);
314-
return iter == fakeSysfsTree.end() ? std::string() : iter->second;
315-
});
315+
folly::test::TemporaryDirectory tmpdir;
316+
for (const auto& [k, v] : fakeSysfsTree) {
317+
auto path = tmpdir.path() / k;
318+
fs::create_directories(path.parent_path());
319+
std::ofstream(path.string()) << v << std::endl;
320+
}
321+
auto root = tmpdir.path();
322+
auto parsed = CacheLocality::readFromSysfsTree(tmpdir.path().string());
316323

317324
size_t expectedNumCpus = 32;
318325
std::vector<size_t> expectedNumCachesByLevel = {16, 16, 2};
@@ -325,6 +332,8 @@ TEST(CacheLocality, FakeSysfs) {
325332
EXPECT_EQ(expectedLocalityIndexByCpu, parsed.localityIndexByCpu);
326333
}
327334

335+
#endif
336+
328337
static const std::vector<std::string> fakeProcCpuinfo = {
329338
"processor : 0",
330339
"vendor_id : GenuineIntel",

0 commit comments

Comments
 (0)