-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[InstrProf] Created Thread local counter instrumentation, compiler-rt runtimes #95494
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
LLVM can now generate increments to counters in thread local storage. Use a new compiler-rt runtime to atomically add thread local counters to global counters on thread exit. The clang driver will link the new runtime libraries in when the new option -fprofile-thread-local is specified. Signed-off-by: Andrew Wock <[email protected]>
@llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-driver Author: Andrew Wock (ajwock) Changes: LLVM can now generate increments to counters in thread local storage. Use a new compiler-rt runtime to atomically add thread local counters to global counters on thread exit. The clang driver will link the new runtime libraries in when the new option -fprofile-thread-local is specified. More details available in the RFC on Discourse. Patch is 67.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95494.diff 36 Files Affected:
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index f954857b0235a..f7db513b92909 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -2932,6 +2932,14 @@ indexed format, regardeless whether it is produced by frontend or the IR pass.
overhead. ``prefer-atomic`` will be transformed to ``atomic`` when supported
by the target, or ``single`` otherwise.
+.. option:: -fprofile-thread-local
+
+ Increment profile counters in thread local storage and atomically add their
+ values to global counters on thread exit. This has the potential to deliver
+ both accuracy and high performance whenever there is high thread contention
+ on profile counters. This is an experimental option and is currently only
+ supported on 64-bit Linux.
+
Fine Tuning Profile Collection
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 7ffc40a00504f..7cd0bfb6d71b5 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -218,6 +218,7 @@ ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone)
/// instrumented. Selected group numbers can be 0 to N-1 inclusive.
VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1)
VALUE_CODEGENOPT(ProfileSelectedFunctionGroup, 32, 0)
+CODEGENOPT(InstrProfileThreadLocal, 1, 0) ///< Counters are updated on a per-thread basis
CODEGENOPT(CoverageMapping , 1, 0) ///< Generate coverage mapping regions to
///< enable code coverage analysis.
CODEGENOPT(DumpCoverageMapping , 1, 0) ///< Dump the generated coverage mapping
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d44faa55c456f..aab5b63c991f1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1768,6 +1768,9 @@ def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
Group<f_Group>, Visibility<[ClangOption, CLOption]>, MetaVarName<"<file>">,
HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
+// Opt-in driver flag: keep profile counters in thread local storage
+// (documented as -fprofile-thread-local in the UsersManual).
+def fprofile_thread_local : Flag<["-"], "fprofile-thread-local">,
+    Group<f_Group>, Visibility<[ClangOption, CLOption]>,
+    HelpText<"Generate profile counters in thread local storage">;
def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">, Group<f_Group>,
Visibility<[ClangOption, CLOption]>;
def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 9789cfacafd78..162c730782afb 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -752,6 +752,12 @@ class ToolChain {
virtual void addProfileRTLibs(const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const;
+ /// addThreadLocalProfileRTLibs - With -fprofile-thread-local, add the
+ /// thread-local profile runtime static + shared library pair.
+ virtual void
+ addThreadLocalProfileRTLibs(const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs) const;
+
/// Add arguments to use system-specific CUDA includes.
virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args) const;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 40ab2e91125d1..4708cb7df5044 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1078,6 +1078,16 @@ void ToolChain::addProfileRTLibs(const llvm::opt::ArgList &Args,
CmdArgs.push_back(getCompilerRTArgString(Args, "profile"));
}
+/// Append the thread-local profiling runtimes to the link line.
+///
+/// Nothing is added unless the profile runtime is needed for \p Args and
+/// -fprofile-thread-local was passed.
+void ToolChain::addThreadLocalProfileRTLibs(
+    const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const {
+  const bool WantsThreadLocalRT =
+      needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local);
+  if (!WantsThreadLocalRT)
+    return;
+
+  // The static archive goes first, so we can specify '-u' where needed.
+  CmdArgs.push_back(getCompilerRTArgString(Args, "profile_threadlocal"));
+
+  // The shared half of the pair follows it.
+  CmdArgs.push_back(getCompilerRTArgString(Args, "profile_threadlocal",
+                                           ToolChain::FT_Shared));
+}
+
ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType(
const ArgList &Args) const {
if (runtimeLibType)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b8d8ff3db5d1f..cd63ac56fecf6 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -720,6 +720,18 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
CmdArgs.push_back("-fcoverage-mcdc");
}
+ if (Args.hasArg(options::OPT_fprofile_thread_local)) {
+ if (!ProfileGenerateArg)
+ D.Diag(clang::diag::err_drv_argument_only_allowed_with)
+ << "-fprofile-thread-local"
+ << "-fprofile-instr-generate";
+
+ // Clang cc1 is not in the know about thread local coverage, but llvm
+ // should be
+ CmdArgs.push_back("-mllvm");
+ CmdArgs.push_back("-instr-prof-thread-local");
+ }
+
if (Arg *A = Args.getLastArg(options::OPT_ffile_compilation_dir_EQ,
options::OPT_fcoverage_compilation_dir_EQ)) {
if (A->getOption().matches(options::OPT_ffile_compilation_dir_EQ))
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 2222dea431c3c..0a889f957786a 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -843,6 +843,13 @@ void Linux::addProfileRTLibs(const llvm::opt::ArgList &Args,
CmdArgs.push_back(Args.MakeArgString(
Twine("-u", llvm::getInstrProfRuntimeHookVarName())));
ToolChain::addProfileRTLibs(Args, CmdArgs);
+
+ if (needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local)) {
+ CmdArgs.push_back(Args.MakeArgString(Twine(
+ "-u",
+ llvm::StringRef("__llvm_profile_tls_register_thread_exit_handler"))));
+ }
+ ToolChain::addThreadLocalProfileRTLibs(Args, CmdArgs);
}
void Linux::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index e9866d94b762c..8655bcf498437 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -312,6 +312,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_tls_cnts, \
+ INSTR_PROF_QUOTE(INSTR_PROF_TLS_CNTS_COMMON), \
+ INSTR_PROF_CNTS_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
INSTR_PROF_BITS_COFF, "__DATA,")
@@ -750,6 +753,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_NAME_COMMON __llvm_prf_names
#define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
#define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_TLS_CNTS_COMMON __llvm_tls_prf_cnts
#define INSTR_PROF_BITS_COMMON __llvm_prf_bits
#define INSTR_PROF_VALS_COMMON __llvm_prf_vals
#define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 45e5164891751..b9f3a20bb328d 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -70,14 +70,25 @@ set(PROFILE_SOURCES
InstrProfilingUtil.c
)
+set(PROFILE_STATIC_TLS_SOURCES
+ InstrProfilingTLS.c
+ InstrProfilingStaticTLSLinux.cpp)
+
+set(PROFILE_SHARED_TLS_SOURCES
+ InstrProfilingTLSDyLib.c
+ InstrProfilingDyLibLinux.cpp)
+
set(PROFILE_HEADERS
InstrProfiling.h
InstrProfilingInternal.h
InstrProfilingPort.h
InstrProfilingUtil.h
+ InstrProfilingTLS.h
WindowsMMap.h
)
+set(PROFILE_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS})
+
if(WIN32)
list(APPEND PROFILE_SOURCES
WindowsMMap.c
@@ -134,6 +145,30 @@ if(APPLE)
ADDITIONAL_HEADERS ${PROFILE_HEADERS}
PARENT_TARGET profile)
else()
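+ # Thread-local profile runtimes (static + shared) are only built on Linux. Alternative guard kept for reference: if(UNIX AND NOT APPLE AND NOT ANDROID)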
+ if(OS_NAME MATCHES "Linux")
+ add_compiler_rt_runtime(clang_rt.profile_threadlocal
+ STATIC
+ OS ${PROFILE_SUPPORTED_OS}
+ ARCHS ${PROFILE_SUPPORTED_ARCH}
+ CFLAGS ${EXTRA_FLAGS}
+ SOURCES ${PROFILE_STATIC_TLS_SOURCES}
+ ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+ PARENT_TARGET profile)
+
+ add_compiler_rt_runtime(clang_rt.profile_threadlocal
+ SHARED
+ OS ${PROFILE_SUPPORTED_OS}
+ ARCHS ${PROFILE_SUPPORTED_ARCH}
+ CFLAGS ${EXTRA_FLAGS}
+ SOURCES ${PROFILE_SHARED_TLS_SOURCES}
+ ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+ OBJECT_LIBS RTInterception
+ RTSanitizerCommon
+ RTSanitizerCommonLibc
+ PARENT_TARGET profile)
+ endif()
+
add_compiler_rt_runtime(clang_rt.profile
STATIC
ARCHS ${PROFILE_SUPPORTED_ARCH}
diff --git a/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
new file mode 100644
index 0000000000000..47f2baa6a5815
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
@@ -0,0 +1,63 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
+ (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \
+ defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+#include "InstrProfilingTLSDyLib.h"
+}
+
+#include "interception/interception.h"
+
+extern "C" {
+
+struct pthread_wrapper_arg {
+ void *(*fn)(void *);
+ void *arg;
+ uint32_t arg_keepalive;
+};
+
+void *pthread_fn_wrapper(void *arg_ptr) {
+ struct pthread_wrapper_arg *wrapper_arg =
+ (struct pthread_wrapper_arg *)arg_ptr;
+ void *(*fn)(void *) = __atomic_load_n(&wrapper_arg->fn, __ATOMIC_RELAXED);
+ void *arg = __atomic_load_n(&wrapper_arg->arg, __ATOMIC_RELAXED);
+ __atomic_store_n(&wrapper_arg->arg_keepalive, 0, __ATOMIC_RELEASE);
+
+ // Startup: fn and arg were copied out above, before arg_keepalive was cleared.
+ // Nothing else to do (TLS is automatically loaded and zeroed).
+ void *retval = fn(arg);
+ // Cleanup on normal return: run the registered thread-exit handlers.
+ run_thread_exit_handlers();
+ // NOTE(review): presumably a thread that ends via pthread_exit() or cancellation never reaches the call above - confirm its counters are flushed elsewhere.
+ return retval;
+}
+
+void __llvm_register_profile_intercepts() { register_profile_intercepts(); }
+
+} // end extern "C"
+
+// Interceptor for pthread_create: starts the new thread in pthread_fn_wrapper
+// so that run_thread_exit_handlers() runs once start_routine returns.
+//
+// The wrapper argument lives on this stack frame, so after a successful
+// creation the caller waits until the child has copied it out (the child
+// clears arg_keepalive) before returning.
+//
+// Returns whatever the real pthread_create returned.
+INTERCEPTOR(int, pthread_create, void *thread, void *attr,
+            void *(*start_routine)(void *), void *arg) {
+  struct pthread_wrapper_arg wrapper_arg = {start_routine, arg, 1};
+
+  int res =
+      REAL(pthread_create)(thread, attr, pthread_fn_wrapper, &wrapper_arg);
+  // On failure no child thread exists to clear arg_keepalive; spinning on it
+  // would hang the caller forever, so just report the error.
+  if (res != 0)
+    return res;
+
+  // Spin wait for child thread to copy arguments
+  while (__atomic_load_n(&wrapper_arg.arg_keepalive, __ATOMIC_ACQUIRE) == 1)
+    ;
+  return res;
+}
+
+void register_profile_intercepts() { INTERCEPT_FUNCTION(pthread_create); }
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index e4d99ef4872bd..64775f24fd83c 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -34,6 +34,7 @@
#include "InstrProfiling.h"
#include "InstrProfilingInternal.h"
#include "InstrProfilingPort.h"
+#include "InstrProfilingTLS.h"
#include "InstrProfilingUtil.h"
/* From where is profile name specified.
@@ -1084,6 +1085,8 @@ void __llvm_profile_set_filename(const char *FilenamePat) {
parseAndSetFilename(FilenamePat, PNS_runtime_api, 1);
}
+void (*on_main_thread_exit)(void) = NULL;
+
/* The public API for writing profile data into the file with name
* set by previous calls to __llvm_profile_set_filename or
* __llvm_profile_override_default_filename or
@@ -1097,6 +1100,9 @@ int __llvm_profile_write_file(void) {
// Temporarily suspend getting SIGKILL when the parent exits.
int PDeathSig = lprofSuspendSigKill();
+ if (on_main_thread_exit)
+ on_main_thread_exit();
+
if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
PROF_NOTE("Profile data not written to file: %s.\n", "already written");
if (PDeathSig == 1)
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
index b766436497b74..4f96523a56a37 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
@@ -45,6 +45,7 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY
COMPILER_RT_WEAK;
extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
diff --git a/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
new file mode 100644
index 0000000000000..fc5f785e1ab40
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
@@ -0,0 +1,123 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
+ (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \
+ defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+}
+
+extern "C" {
+
+#define PROF_TLS_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_TLS_CNTS_COMMON)
+#define PROF_TLS_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_TLS_CNTS_COMMON)
+
+extern char PROF_TLS_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_TLS_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
+COMPILER_RT_VISIBILITY char *__llvm_profile_begin_tls_counters(void) {
+ return &PROF_TLS_CNTS_START;
+}
+COMPILER_RT_VISIBILITY char *__llvm_profile_end_tls_counters(void) {
+ return &PROF_TLS_CNTS_STOP;
+}
+
+struct finalization_data {
+ char *mod_begin;
+ char *tls_img_begin;
+ char *tls_img_end;
+ char *cnts_begin;
+ char *cnts_end;
+};
+
+// dl_iterate_phdr callback: find the module whose load address equals
+// fdata->mod_begin and add the calling thread's TLS counters for that module
+// to the module's global counters.
+//
+// \param info  Description of the module currently being visited.
+// \param size  Size of *info (unused).
+// \param data  Pointer to the finalization_data prepared by the caller.
+// \returns 0 to continue with the next module, 1 to stop the iteration once
+//          the matching module has been seen (whether or not counters were
+//          added).
+//
+// This is O(num_modules + num_counters) unfortunately. If there were a
+// mechanism to calculate the thread-local start of a thread-local section like
+// there is a mechanism to calculate the static start of a static section (i.e.
+// __start_$sectionname), that would simplify implementation a lot and make this
+// just O(num_counters).
+static int FindAndAddCounters_cb(struct dl_phdr_info *info, size_t size,
+                                 void *data) {
+  (void)size;
+  finalization_data *fdata = (finalization_data *)data;
+  // We're looking for a match to the dladdr calculated based on PROF_CNTS_START
+  if (fdata->mod_begin != (char *)info->dlpi_addr) {
+    return 0;
+  }
+
+  // No TLS data reported for this module: nothing to add.
+  if (info->dlpi_tls_data == NULL) {
+    return 1;
+  }
+
+  // Locate the PT_TLS program header. ElfW(Phdr) is the element type of
+  // dlpi_phdr, so this matches the ELF class of the target instead of
+  // assuming ELF64.
+  const ElfW(Phdr) *tls_hdr = NULL;
+  const ElfW(Phdr) *hdr = info->dlpi_phdr;
+  const ElfW(Phdr) *last_hdr = hdr + info->dlpi_phnum;
+  for (; hdr != last_hdr; ++hdr) {
+    if (hdr->p_type == PT_TLS) {
+      tls_hdr = hdr;
+      break;
+    }
+  }
+  if (tls_hdr == NULL) {
+    return 1;
+  }
+
+  uint64_t num_counters =
+      __llvm_profile_get_num_counters(fdata->tls_img_begin, fdata->tls_img_end);
+  uint64_t counter_size = __llvm_profile_counter_entry_size();
+
+  // Calculate the offset of __llvm_tls_prf_cnts into the tls block for this
+  // module. The addresses in use below correspond to the tls initialization
+  // image, which is statically allocated for the module, rather than the TLS
+  // block itself.
+  uint64_t ph_true_vaddr =
+      (uint64_t)info->dlpi_addr + (uint64_t)tls_hdr->p_vaddr;
+  uint64_t tls_cnts_tlsblk_offset =
+      (uint64_t)fdata->tls_img_begin - ph_true_vaddr;
+
+  // Calculate the thread local copy of __llvm_tls_prf_cnts for this module.
+  uint64_t tls_prf_cnts_modlocal_begin =
+      (uint64_t)info->dlpi_tls_data + tls_cnts_tlsblk_offset;
+
+  // We don't support single byte counters because they are also resilient to
+  // thread synchronization issues and they are designed to avoid memory
+  // overhead, which is the opposite of what TL counters do.
+  // TODO: warn?
+  if (counter_size == sizeof(uint64_t)) {
+    uint64_t *tls_cnt = (uint64_t *)tls_prf_cnts_modlocal_begin;
+    uint64_t *tls_end = tls_cnt + num_counters;
+    uint64_t *cnt = (uint64_t *)fdata->cnts_begin;
+    // NOTE(review): assumes the global counter section holds at least
+    // num_counters entries; fdata->cnts_end is not consulted - confirm.
+    for (; tls_cnt != tls_end; tls_cnt++, cnt++) {
+      __atomic_fetch_add(cnt, *tls_cnt, __ATOMIC_RELAXED);
+    }
+  }
+  return 1;
+}
+
+// Add the calling thread's TLS counters to the global counters of the module
+// this runtime copy is linked into. Does nothing when any of the section
+// bounds is null or when dladdr cannot resolve the global counter section.
+COMPILER_RT_VISIBILITY
+void __llvm_profile_tls_counters_finalize(void) {
+  char *tls_first = __llvm_profile_begin_tls_counters();
+  char *tls_last = __llvm_profile_end_tls_counters();
+  char *cnts_first = __llvm_profile_begin_counters();
+  char *cnts_last = __llvm_profile_end_counters();
+
+  const bool have_all_bounds =
+      tls_first && tls_last && cnts_first && cnts_last;
+  if (!have_all_bounds)
+    return;
+
+  // The module's load address is what the dl_iterate_phdr callback matches
+  // against dlpi_addr.
+  Dl_info owner;
+  if (dladdr(cnts_first, &owner) == 0)
+    return;
+
+  struct finalization_data fdata = {(char *)owner.dli_fbase, tls_first,
+                                    tls_last, cnts_first, cnts_last};
+  dl_iterate_phdr(FindAndAddCounters_cb, &fdata);
+}
+}
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.c b/compiler-rt/lib/profile/InstrProfilingTLS.c
new file mode 100644
index 0000000000000..029ed9e542e5a
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.c
@@ -0,0 +1,29 @@
+#include "InstrProfilingTLS.h"
+#include "InstrProfiling.h"
+
+struct texit_fn_node module_node COMPILER_RT_VISIBILITY;
+
+// We act as a shim between the profile_threadlocal shared lib
+// and the profile static lib. We need to tell the static lib
+// to add all of the counters up on main thread exit, but the
+// shared lib is the one that knows how to do that and whether it's
+// already been done.
+//
+// In the constructor we pass flush_main_thread_counters from the
+// shared lib to the non-TLS static lib's on_main_thread_exit fnptr.
+extern void flush_main_thread_counters(void);
+extern void (*on_main_thread_exit)(void);
+
+__attribute__((constructor)) COMPILER_RT_VISIBILITY void
+__llvm_profile_tls_register_thread_exit_handler(void) {
+ module_node.prev = NULL;
+ module_node.next = NULL;
+ module_node.fn = __llvm_profile_tls_counters_finalize;
+ register_tls_prfcnts_module_thread_exit_handler(&module_node);
+ if (!on_main_thread_exit) {
+ on_main_thread_exit = flush_main_thread_counters;
+ }
+}
+
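+// TODO: Add a destructor, presumably one that removes module_node again via
+// unregister_tls_prfcnts_module_thread_exit_handler (deliberately not done yet).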
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.h b/compiler-rt/lib/profile/InstrProfilingTLS.h
new file mode 100644
index 0000000000000..1b6001d27d375
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.h
@@ -0,0 +1,39 @@
+#ifndef INSTR_PROFILING_TLS_H
+#define INSTR_PROFILING_TLS_H
+
+char *__llvm_profile_begin_tls_counters(void);
+char *__llvm_profile_end_tls_counters(void);
+
+/*!
+ * \brief Add counter values from TLS to the global counters for the program
+ *
+ * On thread exit, atomically add the values in TLS counters to the static
+ * counters for the whole process.
+ */
+void __llvm_profile_tls_counters_finalize(void);
+
+/*
+ * Dylib stuff
+ */
+typedef void (*texit_fnc)(void);
+
+typedef struct texit_fn_node {
+ struct texit_fn_node *prev;
+ texit_fnc fn;
+ struct texit_fn_node *next;
+} texit_fn_node;
+
+// TODO: really this should be write-preferring rwlocked
+struct texit_fn_registry {
+ int texit_mtx;
+ texit_fn_node head;
+ texit_fn_node tail;
+};
+
+void register_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+void unregister_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+void run_thread_exit_handlers(void);
+
+void register_profile...
[truncated]
|
LLVM can now generate increments to counters in thread local storage.
Use a new compiler-rt runtime to atomically add thread local counters to global counters on thread exit.
The clang driver will link the new runtime libraries in when the new option -fprofile-thread-local is specified.
More details available in the RFC on discourse.