diff --git a/docs/installation.md b/docs/installation.md
index 9f98f07146..48a66b9f57 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -58,7 +58,7 @@ Just as in Linux, SPAdes is ready to use and no further installation steps are r
 If you wish to compile SPAdes by yourself you will need the following libraries to be pre-installed:
 
 -   g++ (version 9 or higher)
--   cmake (version 3.16 or higher)
+-   cmake (version 3.18 or higher)
 -   zlib
 -   libbz2
 
diff --git a/ext/src/CMakeLists.txt b/ext/src/CMakeLists.txt
index 8b2315a425..39ed0b2623 100644
--- a/ext/src/CMakeLists.txt
+++ b/ext/src/CMakeLists.txt
@@ -31,6 +31,7 @@ add_subdirectory(llvm)
 add_subdirectory(bwa)
 add_subdirectory(gqf)
 add_subdirectory(edlib)
+set(LEXY_ENABLE_INSTALL OFF CACHE BOOL "Do not install the bundled lexy" FORCE)  # cache entry: honored by lexy's option() regardless of CMP0077 -- NOTE(review): confirm against lexy's CMakeLists
 add_subdirectory(lexy)
 add_subdirectory(easel)
 add_subdirectory(hmmer)
diff --git a/ext/src/mimalloc/.gitignore b/ext/src/mimalloc/.gitignore
index 3639d32417..bcedbc3f7d 100644
--- a/ext/src/mimalloc/.gitignore
+++ b/ext/src/mimalloc/.gitignore
@@ -1,8 +1,13 @@
+build
 ide/vs20??/*.db
 ide/vs20??/*.opendb
 ide/vs20??/*.user
-ide/vs20??/*.vcxproj.filters
 ide/vs20??/.vs
+ide/vs20??/VTune*
 out/
 docs/
 *.zip
+*.tar
+*.gz
+.vscode
+.DS_Store
diff --git a/ext/src/mimalloc/CMakeLists.txt b/ext/src/mimalloc/CMakeLists.txt
index 88bb2a521e..ee043aeacf 100644
--- a/ext/src/mimalloc/CMakeLists.txt
+++ b/ext/src/mimalloc/CMakeLists.txt
@@ -1,90 +1,222 @@
-cmake_minimum_required(VERSION 3.10)
-project(libmimalloc C CXX)
+cmake_minimum_required(VERSION 3.18)
+project(libmimalloc C)
 
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_CXX_STANDARD 17)
 
 option(MI_SECURE            "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF)
-option(MI_DEBUG_FULL        "Use full internal heap invariant checking in DEBUG mode (expensive)" OFF)
-option(MI_PADDING           "Enable padding to detect heap block overflow (used only in DEBUG mode)" ON)
-option(MI_OVERRIDE          "Override the standard malloc interface (e.g. define entry points for malloc() etc)" ON)
+option(MI_OVERRIDE          "Override the standard malloc interface (i.e. define entry points for 'malloc', 'free', etc)" ON)
 option(MI_XMALLOC           "Enable abort() call on memory allocation failure by default" OFF)
 option(MI_SHOW_ERRORS       "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF)
+option(MI_GUARDED           "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF)
 option(MI_USE_CXX           "Use the C++ compiler to compile the library (instead of the C compiler)" OFF)
-option(MI_SEE_ASM           "Generate assembly files" OFF)
-option(MI_OSX_INTERPOSE     "Use interpose to override standard malloc on macOS" ON)
-option(MI_OSX_ZONE          "Use malloc zone to override standard malloc on macOS" ON) 
-option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF)
+option(MI_OPT_ARCH          "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" OFF)
+option(MI_OPT_SIMD          "Use SIMD instructions (requires MI_OPT_ARCH to be enabled)" OFF)
+
+option(MI_DEBUG             "Enable assertion checks (enabled by default in a debug build)" OFF)
+option(MI_DEBUG_INTERNAL    "Enable assertion and internal invariant checks (enabled by default in a debug build)" OFF)
+option(MI_DEBUG_FULL        "Enable assertion checks and expensive internal heap invariant checking" OFF)
+
+option(MI_DEBUG_TSAN        "Build with thread sanitizer (needs clang)" OFF)
+option(MI_DEBUG_UBSAN       "Build with undefined-behavior sanitizer (needs clang++)" OFF)
+option(MI_TRACK_VALGRIND    "Compile with Valgrind support (adds a small overhead)" OFF)
+option(MI_TRACK_ASAN        "Compile with address sanitizer support (adds a small overhead)" OFF)
+option(MI_TRACK_ETW         "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF)
+
 option(MI_BUILD_SHARED      "Build shared library" ON)
 option(MI_BUILD_STATIC      "Build static library" ON)
 option(MI_BUILD_OBJECT      "Build object library" ON)
 option(MI_BUILD_TESTS       "Build test executables" ON)
-option(MI_DEBUG_TSAN        "Build with thread sanitizer (needs clang)" OFF)
-option(MI_DEBUG_UBSAN       "Build with undefined-behavior sanitizer (needs clang++)" OFF)
-option(MI_SKIP_COLLECT_ON_EXIT, "Skip collecting memory on program exit" OFF)
+
+option(MI_OSX_INTERPOSE     "Use interpose to override standard malloc on macOS" ON)
+option(MI_OSX_ZONE          "Use malloc zone to override standard malloc on macOS" ON)
+option(MI_WIN_REDIRECT      "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON)
+option(MI_WIN_DIRECT_TLS    "Use only direct TLS slots on Windows to avoid extra tests in the malloc fast path (only works if the program uses less than 64 TlsAlloc'd slots in total)" OFF)
+option(MI_LOCAL_DYNAMIC_TLS "Use local-dynamic-tls, a slightly slower but dlopen-compatible thread local storage mechanism (Unix)" OFF)
+option(MI_LIBC_MUSL         "Enable this when linking with musl libc" OFF)
+
+option(MI_PADDING           "Enable padding to detect heap block overflow (always on in DEBUG or SECURE mode, or with Valgrind/ASAN)" OFF)
+option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF)
+option(MI_NO_PADDING        "Force no use of padding even in DEBUG mode etc." OFF)
+option(MI_INSTALL_TOPLEVEL  "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF)
+option(MI_NO_THP            "Disable transparent huge pages support on Linux/Android for the mimalloc process only" OFF)
+set(MI_EXTRA_CPPDEFS        "" CACHE STRING "Extra pre-processor definitions (use as `-DMI_EXTRA_CPPDEFS=\"opt1=val1;opt2=val2\"`)")  # a list-valued setting, so a STRING cache entry rather than a boolean option()
+option(MI_SEE_ASM           "Generate assembly files" OFF)
+
+# negated options for vcpkg features
+option(MI_NO_USE_CXX        "Use plain C compilation (has priority over MI_USE_CXX)" OFF)
+option(MI_NO_OPT_ARCH       "Do not use architecture specific optimizations (like '-march=armv8.1-a' for example) (has priority over MI_OPT_ARCH)" OFF)
 
 # deprecated options
+option(MI_WIN_USE_FLS       "Use Fiber local storage on Windows to detect thread termination (deprecated)" OFF)
 option(MI_CHECK_FULL        "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
-option(MI_INSTALL_TOPLEVEL  "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version (deprecated)" OFF)
 option(MI_USE_LIBATOMIC     "Explicitly link with -latomic (on older systems) (deprecated and detected automatically)" OFF)
 
+include(CheckLinkerFlag)    # requires cmake 3.18
+include(CheckIncludeFiles)
 include(GNUInstallDirs)
 include("cmake/mimalloc-config-version.cmake")
 
 set(mi_sources
-    src/stats.c
-    src/random.c
-    src/os.c
-    src/bitmap.c
-    src/arena.c
-    src/segment-cache.c
-    src/segment.c
-    src/page.c
     src/alloc.c
     src/alloc-aligned.c
     src/alloc-posix.c
+    src/arena.c
+    src/arena-meta.c
+    src/bitmap.c
     src/heap.c
+    src/init.c
+    src/libc.c
     src/options.c
-    src/init.c)
+    src/os.c
+    src/page.c
+    src/page-map.c
+    src/random.c
+    src/stats.c
+    src/theap.c
+    src/threadlocal.c
+    src/prim/prim.c)
+
+set(mi_cflags "")
+set(mi_cflags_static "")            # extra flags for a static library build
+set(mi_cflags_dynamic "")           # extra flags for a shared-object library build
+set(mi_libraries "")
 
+if(MI_EXTRA_CPPDEFS)
+ set(mi_defines ${MI_EXTRA_CPPDEFS})
+else()
+ set(mi_defines "")
+endif()
+
+# pass git revision as a define (only when this directory itself is the git checkout, not an enclosing project)
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git/index")
+  find_package(Git)
+  if(GIT_FOUND)
+    execute_process(COMMAND ${GIT_EXECUTABLE} "describe" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE mi_git_describe RESULT_VARIABLE mi_git_res ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(mi_git_res EQUAL "0")
+      list(APPEND mi_defines "MI_GIT_DESCRIBE=${mi_git_describe}")
+      # re-run the configure step when the git index changes (CMAKE_CONFIGURE_DEPENDS is a directory property)
+      set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/.git/index")
+    endif()
+  endif()
+endif()
 
 # -----------------------------------------------------------------------------
-# Convenience: set default build type depending on the build directory
+# Convenience: set default build type and compiler depending on the build directory
 # -----------------------------------------------------------------------------
 
-message(STATUS "")    
+message(STATUS "")
 if (NOT CMAKE_BUILD_TYPE)
-  if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$" OR  MI_DEBUG_FULL)
-    message(STATUS "No build type selected, default to: Debug")
+  if ("${CMAKE_BINARY_DIR}" MATCHES ".*((D|d)ebug|asan|tsan|ubsan|valgrind)$")
+    message(STATUS "No build type selected, default to 'Debug'")
     set(CMAKE_BUILD_TYPE "Debug")
   else()
-    message(STATUS "No build type selected, default to: Release")
+    message(STATUS "No build type selected, default to 'Release'")
     set(CMAKE_BUILD_TYPE "Release")
   endif()
 endif()
 
+if (CMAKE_GENERATOR MATCHES "^Visual Studio.*$")  # multi-config generator: CMAKE_BUILD_TYPE is not what selects the configuration
+  message(STATUS "Note: when building with Visual Studio the build type is specified when building.")
+  message(STATUS "For example: 'cmake --build . --config Release'")
+endif()
+
 if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$")
   message(STATUS "Default to secure build")
   set(MI_SECURE "ON")
 endif()
 
 
+# Determine architecture
+set(MI_OPT_ARCH_FLAGS "")
+set(MI_ARCH "unknown")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|i[3456]86)$" OR CMAKE_GENERATOR_PLATFORM MATCHES "^(x86|Win32)$")
+  set(MI_ARCH "x86")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64" OR "x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) # must be before arm64
+  set(MI_ARCH "x64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv[89].?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
+  set(MI_ARCH "arm64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567].?|ARM)$")
+  set(MI_ARCH "arm32")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$")
+  if(CMAKE_SIZEOF_VOID_P EQUAL 4)   # pointer size picks the 32/64-bit variant; 'X==4' is a single token to if() and is always false
+    set(MI_ARCH "riscv32")
+  else()
+    set(MI_ARCH "riscv64")
+  endif()
+else()
+  set(MI_ARCH "${CMAKE_SYSTEM_PROCESSOR}")   # unrecognized: pass the raw processor name through (quoted so an empty value does not unset MI_ARCH)
+endif()
+message(STATUS "Architecture: ${MI_ARCH}") # (${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_GENERATOR_PLATFORM}, ${CMAKE_GENERATOR})")
+
+# negative overrides (mainly to support vcpkg features)
+if(MI_NO_USE_CXX)
+  set(MI_USE_CXX "OFF")
+endif()
+
+if(MI_NO_OPT_ARCH)
+  set(MI_OPT_ARCH "OFF")
+elseif(MI_ARCH STREQUAL "arm64")
+  set(MI_OPT_ARCH "ON")  # enable armv8.1-a by default on arm64 unless MI_NO_OPT_ARCH is set
+endif()
+
 # -----------------------------------------------------------------------------
-# Process options
+# Enable the C++ compiler early on if needed
 # -----------------------------------------------------------------------------
 
-if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel")
+# clang-cl detection on windows
+if(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC")
+  set(MI_CLANG_CL "ON")
+endif()
+
+# force C++ compilation with msvc or clang-cl to use modern C++ atomics
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel" OR MI_CLANG_CL)
+  set(MI_USE_CXX "ON")
+elseif(MI_DEBUG_UBSAN AND CMAKE_BUILD_TYPE MATCHES "Debug")  # ubsan needs C++
   set(MI_USE_CXX "ON")
 endif()
 
+# enable C++ ?
+if(MI_USE_CXX)
+  enable_language(CXX)
+  message(STATUS "Use the C++ compiler to compile (MI_USE_CXX=ON)")
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Intel" AND NOT CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
+    list(APPEND mi_cflags -Kc++)
+  endif()
+endif()
+
+
+# -----------------------------------------------------------------------------
+# Process options
+# -----------------------------------------------------------------------------
+
+# put -Wall early so other warnings can be disabled selectively
+if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang")
+  if (MI_CLANG_CL)
+    list(APPEND mi_cflags -W)
+  else()
+    list(APPEND mi_cflags -Wall -Wextra -Wpedantic)
+  endif()
+endif()
+if(CMAKE_C_COMPILER_ID MATCHES "GNU")
+    list(APPEND mi_cflags -Wall -Wextra)
+endif()
+if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+    list(APPEND mi_cflags -Wall)
+endif()
+
+if(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang")
+  list(APPEND mi_cflags -Wno-deprecated)
+endif()
+
 if(MI_OVERRIDE)
   message(STATUS "Override standard malloc (MI_OVERRIDE=ON)")
   if(APPLE)
     if(MI_OSX_ZONE)
       # use zone's on macOS
       message(STATUS "  Use malloc zone to override malloc (MI_OSX_ZONE=ON)")
-      list(APPEND mi_sources src/alloc-override-osx.c)
-      list(APPEND mi_defines MI_OSX_ZONE=1)      
+      list(APPEND mi_sources src/prim/osx/alloc-override-zone.c)
+      list(APPEND mi_defines MI_OSX_ZONE=1)
       if (NOT MI_OSX_INTERPOSE)
         message(STATUS "  WARNING: zone overriding usually also needs interpose (use -DMI_OSX_INTERPOSE=ON)")
       endif()
@@ -94,7 +226,7 @@ if(MI_OVERRIDE)
       message(STATUS "  Use interpose to override malloc (MI_OSX_INTERPOSE=ON)")
       list(APPEND mi_defines MI_OSX_INTERPOSE=1)
       if (NOT MI_OSX_ZONE)
-        message(STATUS "  WARNING: interpose usually also needs zone overriding (use -DMI_OSX_INTERPOSE=ON)")
+        message(STATUS "  WARNING: interpose usually also needs zone overriding (use -DMI_OSX_ZONE=ON)")
       endif()
     endif()
     if(MI_USE_CXX AND MI_OSX_INTERPOSE)
@@ -103,14 +235,89 @@ if(MI_OVERRIDE)
   endif()
 endif()
 
+if(WIN32)
+  if (NOT MI_WIN_REDIRECT)
+    # use a negative define for backward compatibility
+    list(APPEND mi_defines MI_WIN_NOREDIRECT=1)
+  endif()
+endif()
+
 if(MI_SECURE)
   message(STATUS "Set full secure build (MI_SECURE=ON)")
   list(APPEND mi_defines MI_SECURE=4)
 endif()
 
+if(MI_TRACK_VALGRIND)
+  CHECK_INCLUDE_FILES("valgrind/valgrind.h;valgrind/memcheck.h" MI_HAS_VALGRINDH)
+  if (NOT MI_HAS_VALGRINDH)
+    set(MI_TRACK_VALGRIND OFF)
+    message(WARNING "Cannot find the 'valgrind/valgrind.h' and 'valgrind/memcheck.h' -- install valgrind first?")
+    message(STATUS  "Disabling Valgrind support (MI_TRACK_VALGRIND=OFF)")
+  else()
+    message(STATUS "Compile with Valgrind support (MI_TRACK_VALGRIND=ON)")
+    list(APPEND mi_defines MI_TRACK_VALGRIND=1)
+  endif()
+endif()
+
+if(MI_TRACK_ASAN)
+  if (APPLE AND MI_OVERRIDE)
+    set(MI_TRACK_ASAN OFF)
+    message(WARNING "Cannot enable address sanitizer support on macOS if MI_OVERRIDE is ON (MI_TRACK_ASAN=OFF)")
+  endif()
+  if (MI_TRACK_VALGRIND)
+    set(MI_TRACK_ASAN OFF)
+    message(WARNING "Cannot enable address sanitizer support with also Valgrind support enabled (MI_TRACK_ASAN=OFF)")
+  endif()
+  if(MI_TRACK_ASAN)
+    CHECK_INCLUDE_FILES("sanitizer/asan_interface.h" MI_HAS_ASANH)
+    if (NOT MI_HAS_ASANH)
+      set(MI_TRACK_ASAN OFF)
+      message(WARNING "Cannot find the 'sanitizer/asan_interface.h' -- install address sanitizer support first")
+      message(STATUS  "Compile **without** address sanitizer support (MI_TRACK_ASAN=OFF)")
+    else()
+      message(STATUS "Compile with address sanitizer support (MI_TRACK_ASAN=ON)")
+      list(APPEND mi_defines MI_TRACK_ASAN=1)
+      list(APPEND mi_cflags -fsanitize=address)
+      list(APPEND mi_libraries -fsanitize=address)
+    endif()
+  endif()
+endif()
+
+if(MI_TRACK_ETW)
+  if(NOT WIN32)
+    set(MI_TRACK_ETW OFF)
+    message(WARNING "Can only enable ETW support on Windows (MI_TRACK_ETW=OFF)")
+  endif()
+  if (MI_TRACK_VALGRIND OR MI_TRACK_ASAN)
+    set(MI_TRACK_ETW OFF)
+    message(WARNING "Cannot enable ETW support with also Valgrind or ASAN support enabled (MI_TRACK_ETW=OFF)")
+  endif()
+  if(MI_TRACK_ETW)
+    message(STATUS "Compile with Windows event tracing support (MI_TRACK_ETW=ON)")
+    list(APPEND mi_defines MI_TRACK_ETW=1)
+  endif()
+endif()
+
+if(MI_GUARDED)
+  message(STATUS "Compile guard pages behind certain object allocations (MI_GUARDED=ON)")
+  list(APPEND mi_defines MI_GUARDED=1)
+  if(NOT MI_NO_PADDING)
+    message(STATUS "  Disabling padding due to guard pages (MI_NO_PADDING=ON)")
+    set(MI_NO_PADDING ON)
+  endif()
+endif()
+
 if(MI_SEE_ASM)
   message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)")
-  list(APPEND mi_cflags -save-temps)
+  if(CMAKE_C_COMPILER_ID MATCHES "MSVC" OR MI_CLANG_CL)
+    list(APPEND mi_cflags -FA)
+  else()
+    list(APPEND mi_cflags -save-temps)
+    if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 14)
+      message(STATUS "No GNU Line marker")
+      list(APPEND mi_cflags -Wno-gnu-line-marker)
+    endif()
+  endif()
 endif()
 
 if(MI_CHECK_FULL)
@@ -124,13 +331,28 @@ if (MI_SKIP_COLLECT_ON_EXIT)
 endif()
 
 if(MI_DEBUG_FULL)
-  message(STATUS "Set debug level to full internal invariant checking (MI_DEBUG_FULL=ON)")
-  list(APPEND mi_defines MI_DEBUG=3)   # full invariant checking
-endif()
-
-if(NOT MI_PADDING)
-  message(STATUS "Disable padding of heap blocks in debug mode (MI_PADDING=OFF)")
+  message(STATUS "Set debug level to full assertion and internal invariant checking (MI_DEBUG_FULL=ON, expensive)")
+  list(APPEND mi_defines MI_DEBUG=3)   # full invariant checking (mi_assert, mi_assert_internal, and mi_assert_expensive)
+elseif(MI_DEBUG_INTERNAL)
+  message(STATUS "Set debug level to internal assertion and invariant checking (MI_DEBUG_INTERNAL=ON)")
+  list(APPEND mi_defines MI_DEBUG=2)   # invariant checking (mi_assert and mi_assert_internal)
+elseif(MI_DEBUG)
+  message(STATUS "Set debug level to assertion checking (MI_DEBUG=ON)")
+  list(APPEND mi_defines MI_DEBUG=1)   # assertion checking (mi_assert)
+elseif(CMAKE_BUILD_TYPE MATCHES "Debug")
+  message(STATUS "Set debug level to internal assertion and invariant checking (CMAKE_BUILD_TYPE=Debug)")
+  set(MI_DEBUG_INTERNAL ON)
+  list(APPEND mi_defines MI_DEBUG=2)   # invariant checking (mi_assert and mi_assert_internal)
+endif()
+
+if(MI_NO_PADDING)
+  message(STATUS "Suppress any padding of heap blocks (MI_NO_PADDING=ON)")
   list(APPEND mi_defines MI_PADDING=0)
+else()
+  if(MI_PADDING)
+    message(STATUS "Enable explicit padding of heap blocks (MI_PADDING=ON)")
+    list(APPEND mi_defines MI_PADDING=1)
+  endif()
 endif()
 
 if(MI_XMALLOC)
@@ -148,102 +370,206 @@ if(MI_DEBUG_TSAN)
     message(STATUS "Build with thread sanitizer (MI_DEBUG_TSAN=ON)")
     list(APPEND mi_defines MI_TSAN=1)
     list(APPEND mi_cflags -fsanitize=thread -g -O1)
-    list(APPEND CMAKE_EXE_LINKER_FLAGS -fsanitize=thread)
+    list(APPEND mi_libraries -fsanitize=thread)
   else()
-    message(WARNING "Can only use thread sanitizer with clang (MI_DEBUG_TSAN=ON but ignored)")    
-  endif()  
+    message(WARNING "Can only use thread sanitizer with clang (MI_DEBUG_TSAN=ON but ignored)")
+  endif()
 endif()
 
 if(MI_DEBUG_UBSAN)
-  if(CMAKE_BUILD_TYPE MATCHES "Debug")    
-    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-      message(STATUS "Build with undefined-behavior sanitizer (MI_DEBUG_UBSAN=ON)")
-      list(APPEND mi_cflags -fsanitize=undefined -g -fno-sanitize-recover=undefined)
-      list(APPEND CMAKE_EXE_LINKER_FLAGS -fsanitize=undefined)
-      if (NOT MI_USE_CXX)
-        message(STATUS "(switch to use C++ due to MI_DEBUG_UBSAN)")
-        set(MI_USE_CXX "ON")
+  if(CMAKE_BUILD_TYPE MATCHES "Debug")
+    if(MI_USE_CXX)
+      if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        message(STATUS "Build with undefined-behavior sanitizer (MI_DEBUG_UBSAN=ON)")
+        list(APPEND mi_defines MI_UBSAN=1)
+        list(APPEND mi_cflags -fsanitize=undefined -g -fno-sanitize-recover=undefined)
+        list(APPEND mi_libraries -fsanitize=undefined)
+      else()
+        message(WARNING "Can only use undefined-behavior sanitizer with clang++ (MI_DEBUG_UBSAN=ON but ignored)")
       endif()
     else()
-      message(WARNING "Can only use undefined-behavior sanitizer with clang++ (MI_DEBUG_UBSAN=ON but ignored)")    
-    endif()  
+      message(WARNING "Can only use undefined-behavior sanitizer with a C++ build (MI_USE_CXX=ON)")
+    endif()
   else()
-    message(WARNING "Can only use thread sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})")    
+    message(WARNING "Can only use undefined-behavior sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})")
   endif()
 endif()
 
-if(MI_USE_CXX)
-  message(STATUS "Use the C++ compiler to compile (MI_USE_CXX=ON)")
-  set_source_files_properties(${mi_sources} PROPERTIES LANGUAGE CXX )
-  set_source_files_properties(src/static.c test/test-api.c test/test-api-fill test/test-stress PROPERTIES LANGUAGE CXX )
-  if(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang")
-    list(APPEND mi_cflags -Wno-deprecated)
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android")
+  if(MI_NO_THP)
+    message(STATUS "Disable transparent huge pages support (MI_NO_THP=ON)")
+    list(APPEND mi_defines MI_NO_THP=1)
   endif()
-  if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
-    list(APPEND mi_cflags -Kc++)
+endif()
+
+if(MI_LIBC_MUSL)
+  message(STATUS "Assume using musl libc (MI_LIBC_MUSL=ON)")
+  list(APPEND mi_defines MI_LIBC_MUSL=1)
+endif()
+
+if(MI_WIN_USE_FLS)
+  message(STATUS "Use the Fiber API to detect thread termination (deprecated) (MI_WIN_USE_FLS=ON)")
+  list(APPEND mi_defines MI_WIN_USE_FLS=1)
+endif()
+
+if(MI_WIN_DIRECT_TLS)  # forwards the option to the sources as the MI_WIN_DIRECT_TLS=1 define
+  message(STATUS "Use only direct TLS slots on Windows to avoid extra tests in the malloc fast path -- this only works if the program uses less than 64 TlsAlloc'd slots in total! (MI_WIN_DIRECT_TLS=ON)")
+  list(APPEND mi_defines MI_WIN_DIRECT_TLS=1)
+endif()
+
+# Check /proc/cpuinfo for an SV39 MMU and limit the virtual address bits.
+# (this will skip the aligned hinting in that case. Issue #939, #949)
+if (EXISTS /proc/cpuinfo)
+  file(STRINGS /proc/cpuinfo mi_sv39_mmu REGEX "^mmu[ \t]+:[ \t]+sv39$")
+  if (mi_sv39_mmu)
+    MESSAGE( STATUS "Set virtual address bits to 39 (SV39 MMU detected)" )
+    list(APPEND mi_defines MI_DEFAULT_VIRTUAL_ADDRESS_BITS=39)
   endif()
 endif()
 
+# On Haiku use `-DCMAKE_INSTALL_PREFIX` instead, issue #788
+# if(CMAKE_SYSTEM_NAME MATCHES "Haiku")
+#   SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib)
+#   SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers)
+# endif()
+
 # Compiler flags
-if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
-  list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas -fvisibility=hidden)
+if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU" AND NOT MI_CLANG_CL)
+  list(APPEND mi_cflags -Wno-unknown-pragmas -fvisibility=hidden)
   if(NOT MI_USE_CXX)
     list(APPEND mi_cflags -Wstrict-prototypes)
-  endif()  
+  endif()
   if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang")
-    list(APPEND mi_cflags -Wpedantic -Wno-static-in-inline)
+    list(APPEND mi_cflags -Wno-static-in-inline)
   endif()
 endif()
 
 if(CMAKE_C_COMPILER_ID MATCHES "Intel")
-  list(APPEND mi_cflags -Wall -fvisibility=hidden)
+  list(APPEND mi_cflags -fvisibility=hidden)
 endif()
 
-if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku")
+if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku" AND NOT MI_CLANG_CL)
   if(MI_LOCAL_DYNAMIC_TLS)
     list(APPEND mi_cflags -ftls-model=local-dynamic)
   else()
-    list(APPEND mi_cflags -ftls-model=initial-exec)
+    if(MI_LIBC_MUSL)
+      # with musl we use local-dynamic for the static build, see issue #644
+      list(APPEND mi_cflags_static  -ftls-model=local-dynamic)
+      list(APPEND mi_cflags_dynamic -ftls-model=initial-exec)
+      message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)")
+    else()
+      list(APPEND mi_cflags -ftls-model=initial-exec)
+    endif()
   endif()
+endif()
+
+if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel")
   if(MI_OVERRIDE)
     list(APPEND mi_cflags -fno-builtin-malloc)
   endif()
 endif()
 
-if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914)
+# Compiler and architecture specific flags
+if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku")
+  if(MI_OPT_ARCH)
+    if(APPLE AND CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang" AND CMAKE_OSX_ARCHITECTURES)   # to support multi-arch binaries (#999)
+      if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
+        list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a")
+      endif()
+      if("x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES)
+        list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_x86_64;-march=haswell;-Xarch_x86_64;-mavx2")
+      endif()
+    elseif(MI_ARCH STREQUAL "x64")
+      set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2")    # fast bit scan (since 2013)
+    elseif(MI_ARCH STREQUAL "arm64")
+      set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a")         # fast atomics (since 2016)
+    endif()
+  endif()
+endif()
+
+if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) # vs2017+
   list(APPEND mi_cflags /Zc:__cplusplus)
+  if(MI_OPT_ARCH AND NOT MI_CLANG_CL)
+    if(MI_ARCH STREQUAL "x64")
+      set(MI_OPT_ARCH_FLAGS "/arch:AVX2")
+    elseif(MI_ARCH STREQUAL "arm64")
+      set(MI_OPT_ARCH_FLAGS "/arch:armv8.1")
+    endif()
+  endif()
+endif()
+
+if(MINGW)
+  add_definitions(-D_WIN32_WINNT=0x600)                # issue #976
+endif()
+
+if(MI_OPT_ARCH_FLAGS)
+  list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS})
+  message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)")
+  if (MI_OPT_SIMD)
+    list(APPEND mi_defines "MI_OPT_SIMD=1")
+    message(STATUS "SIMD instructions are enabled (MI_OPT_SIMD=ON)")
+  endif()
+elseif(MI_OPT_SIMD)
+  message(STATUS "SIMD instructions are not enabled (either MI_OPT_ARCH=OFF or this architecture has no SIMD support)")
 endif()
 
 # extra needed libraries
+
+# find_link_library(<libname> <outlibname>): set <outlibname> in the caller's scope to `<libname>` if `-l<libname>` links, else to the `find_library` path, else to "".
+# we prefer the -l<lib> test over `find_library` as sometimes core libraries like `libatomic` are not on the system path (see issue #898)
+function(find_link_library libname outlibname)
+  check_linker_flag(C "-l${libname}" mi_has_lib${libname})
+  if (mi_has_lib${libname})
+    message(VERBOSE "link library: -l${libname}")
+    set(${outlibname} ${libname} PARENT_SCOPE)
+  else()
+    find_library(MI_LIBPATH_${libname} ${libname})
+    if (MI_LIBPATH_${libname})
+      message(VERBOSE "link library ${libname} at ${MI_LIBPATH_${libname}}")
+      set(${outlibname} ${MI_LIBPATH_${libname}} PARENT_SCOPE)
+    else()
+      message(VERBOSE "link library not found: ${libname}")
+      set(${outlibname} "" PARENT_SCOPE)
+    endif()
+  endif()
+endfunction()
+
 if(WIN32)
   list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt)
 else()
-  find_library(MI_LIBPTHREAD pthread)
-  if (MI_LIBPTHREAD)                      
-    list(APPEND mi_libraries ${MI_LIBPTHREAD})
+  find_link_library("pthread" MI_LIB_PTHREAD)
+  if(MI_LIB_PTHREAD)
+    list(APPEND mi_libraries "${MI_LIB_PTHREAD}")
   endif()
-  find_library(MI_LIBRT rt)
-  if(MI_LIBRT)
-    list(APPEND mi_libraries ${MI_LIBRT})
-  endif()  
-  find_library(MI_LIBATOMIC atomic)
-  if (MI_LIBATOMIC OR MI_USE_LIBATOMIC) 
-    list(APPEND mi_libraries atomic)
+  find_link_library("rt" MI_LIB_RT)
+  if(MI_LIB_RT)
+    list(APPEND mi_libraries "${MI_LIB_RT}")
   endif()
+  find_link_library("atomic" MI_LIB_ATOMIC)
+  if(MI_LIB_ATOMIC)
+    list(APPEND mi_libraries "${MI_LIB_ATOMIC}")
+  endif()
+endif()
+
+# set language for source files now
+if(MI_USE_CXX)
+  set_source_files_properties(${mi_sources} PROPERTIES LANGUAGE CXX )
+  set_source_files_properties(src/static.c test/test-api.c test/test-api-fill test/test-stress PROPERTIES LANGUAGE CXX )
 endif()
 
+
 # -----------------------------------------------------------------------------
 # Install and output names
 # -----------------------------------------------------------------------------
 
 # dynamic/shared library and symlinks always go to /usr/local/lib equivalent
-set(mi_install_libdir   "${CMAKE_INSTALL_LIBDIR}")
+# we use ${CMAKE_INSTALL_BINDIR} and ${CMAKE_INSTALL_LIBDIR}.
 
 # static libraries and object files, includes, and cmake config files
 # are either installed at top level, or use versioned directories for side-by-side installation (default)
 if (MI_INSTALL_TOPLEVEL)
   set(mi_install_objdir     "${CMAKE_INSTALL_LIBDIR}")
-  set(mi_install_incdir     "${CMAKE_INSTALL_INCLUDEDIR}")        
+  set(mi_install_incdir     "${CMAKE_INSTALL_INCLUDEDIR}")
   set(mi_install_cmakedir   "${CMAKE_INSTALL_LIBDIR}/cmake/mimalloc")
 else()
   set(mi_install_objdir     "${CMAKE_INSTALL_LIBDIR}/mimalloc-${mi_version}")       # for static library and object files
@@ -251,16 +577,24 @@ else()
   set(mi_install_cmakedir   "${CMAKE_INSTALL_LIBDIR}/cmake/mimalloc-${mi_version}") # for cmake package info
 endif()
 
+set(mi_libname "mimalloc")
 if(MI_SECURE)
-  set(mi_basename "mimalloc-secure")
-else()
-  set(mi_basename "mimalloc")
+  set(mi_libname "${mi_libname}-secure")
+endif()
+if(MI_TRACK_VALGRIND)
+  set(mi_libname "${mi_libname}-valgrind")
+endif()
+if(MI_TRACK_ASAN)
+  set(mi_libname "${mi_libname}-asan")
 endif()
-
 string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LC)
-if(NOT(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel|none)$"))
-  set(mi_basename "${mi_basename}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. -debug) if not a release version
+list(APPEND mi_defines "MI_CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE_LC}")  #todo: multi-config project needs $<CONFIG> ?
+if(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel|none)$")
+  list(APPEND mi_defines MI_BUILD_RELEASE)
+else()
+  set(mi_libname "${mi_libname}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. -debug) if not a release version
 endif()
+
 if(MI_BUILD_SHARED)
   list(APPEND mi_build_targets "shared")
 endif()
@@ -275,8 +609,8 @@ if(MI_BUILD_TESTS)
 endif()
 
 message(STATUS "")
-message(STATUS "Library base name: ${mi_basename}")
-message(STATUS "Version          : ${mi_version}")
+message(STATUS "Library name     : ${mi_libname}")
+message(STATUS "Version          : ${mi_version}.${mi_version_patch}")
 message(STATUS "Build type       : ${CMAKE_BUILD_TYPE_LC}")
 if(MI_USE_CXX)
   message(STATUS "C++ Compiler     : ${CMAKE_CXX_COMPILER}")
@@ -296,53 +630,66 @@ message(STATUS "")
 # shared library
 if(MI_BUILD_SHARED)
   add_library(mimalloc SHARED ${mi_sources})
-  set_target_properties(mimalloc PROPERTIES VERSION ${mi_version} SOVERSION ${mi_version_major} OUTPUT_NAME ${mi_basename} )
+  set_target_properties(mimalloc PROPERTIES VERSION ${mi_version} SOVERSION ${mi_version_major} OUTPUT_NAME ${mi_libname} )
   target_compile_definitions(mimalloc PRIVATE ${mi_defines} MI_SHARED_LIB MI_SHARED_LIB_EXPORT)
-  target_compile_options(mimalloc PRIVATE ${mi_cflags})
-  target_link_libraries(mimalloc PUBLIC ${mi_libraries})
+  target_compile_options(mimalloc PRIVATE ${mi_cflags} ${mi_cflags_dynamic})
+  target_link_libraries(mimalloc PRIVATE ${mi_libraries})
   target_include_directories(mimalloc PUBLIC
       $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
       $<INSTALL_INTERFACE:${mi_install_incdir}>
   )
-  if(WIN32)
-    # On windows copy the mimalloc redirection dll too.
-    if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+  install(TARGETS mimalloc EXPORT mimalloc ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir})
+
+  if(WIN32 AND NOT MINGW)
+    # On windows, the import library name for the dll would clash with the static mimalloc.lib library
+    # so we postfix the dll import library with `.dll.lib` (and also the .pdb debug file)
+    set_property(TARGET mimalloc PROPERTY ARCHIVE_OUTPUT_NAME "${mi_libname}.dll" )
+    install(FILES "$<TARGET_FILE_DIR:mimalloc>/${mi_libname}.dll.lib" DESTINATION ${CMAKE_INSTALL_LIBDIR})
+    set_property(TARGET mimalloc PROPERTY PDB_NAME "${mi_libname}.dll")
+    # don't try to install the pdb since it may not be generated depending on the configuration
+    # install(FILES "$<TARGET_FILE_DIR:mimalloc>/${mi_libname}.dll.pdb" DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  endif()
+  if(WIN32 AND MI_WIN_REDIRECT)
+    if(MINGW)
+      set_property(TARGET mimalloc PROPERTY PREFIX "")
+    endif()
+    # On windows, link and copy the mimalloc redirection dll too.
+    if(CMAKE_GENERATOR_PLATFORM STREQUAL "arm64ec")
+      set(MIMALLOC_REDIRECT_SUFFIX "-arm64ec")
+    elseif(MI_ARCH STREQUAL "x64")
+      set(MIMALLOC_REDIRECT_SUFFIX "")
+      if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64")
+        message(STATUS "Note: x64 code emulated on Windows for arm64 should use an arm64ec build of 'mimalloc.dll'")
+        message(STATUS "      together with 'mimalloc-redirect-arm64ec.dll'. See the 'bin\\readme.md' for more information.")
+      endif()
+    elseif(MI_ARCH STREQUAL "x86")
       set(MIMALLOC_REDIRECT_SUFFIX "32")
     else()
-      set(MIMALLOC_REDIRECT_SUFFIX "")
+      set(MIMALLOC_REDIRECT_SUFFIX "-${MI_ARCH}")  # -arm64 etc.
     endif()
 
-    target_link_libraries(mimalloc PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.lib)
+    target_link_libraries(mimalloc PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.lib)  # the DLL import library
     add_custom_command(TARGET mimalloc POST_BUILD
       COMMAND "${CMAKE_COMMAND}" -E copy "${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" $<TARGET_FILE_DIR:mimalloc>
       COMMENT "Copy mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll to output directory")
-    install(FILES "$<TARGET_FILE_DIR:mimalloc>/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" DESTINATION ${mi_install_libdir})
+    install(FILES "$<TARGET_FILE_DIR:mimalloc>/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" DESTINATION ${CMAKE_INSTALL_BINDIR})
   endif()
-
-  install(TARGETS mimalloc EXPORT mimalloc DESTINATION ${mi_install_libdir} LIBRARY)  
-  install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir})
 endif()
 
+
 # static library
 if (MI_BUILD_STATIC)
   add_library(mimalloc-static STATIC ${mi_sources})
+  set_property(TARGET mimalloc-static PROPERTY OUTPUT_NAME ${mi_libname})
   set_property(TARGET mimalloc-static PROPERTY POSITION_INDEPENDENT_CODE ON)
   target_compile_definitions(mimalloc-static PRIVATE ${mi_defines} MI_STATIC_LIB)
-  target_compile_options(mimalloc-static PRIVATE ${mi_cflags})
-  target_link_libraries(mimalloc-static PUBLIC ${mi_libraries})
+  target_compile_options(mimalloc-static PRIVATE ${mi_cflags} ${mi_cflags_static})
+  target_link_libraries(mimalloc-static PRIVATE ${mi_libraries})
   target_include_directories(mimalloc-static PUBLIC
       $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
       $<INSTALL_INTERFACE:${mi_install_incdir}>
   )
-  if(WIN32)
-    # When building both static and shared libraries on Windows, a static library should use a
-    # different output name to avoid the conflict with the import library of a shared one.
-    string(REPLACE "mimalloc" "mimalloc-static" mi_output_name ${mi_basename})
-    set_target_properties(mimalloc-static PROPERTIES OUTPUT_NAME ${mi_output_name})
-  else()
-    set_target_properties(mimalloc-static PROPERTIES OUTPUT_NAME ${mi_basename})
-  endif()
-
   install(TARGETS mimalloc-static EXPORT mimalloc DESTINATION ${mi_install_objdir} LIBRARY)
   install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir})
 endif()
@@ -351,6 +698,7 @@ endif()
 install(FILES include/mimalloc.h DESTINATION ${mi_install_incdir})
 install(FILES include/mimalloc-override.h DESTINATION ${mi_install_incdir})
 install(FILES include/mimalloc-new-delete.h DESTINATION ${mi_install_incdir})
+install(FILES include/mimalloc-stats.h DESTINATION ${mi_install_incdir})
 install(FILES cmake/mimalloc-config.cmake DESTINATION ${mi_install_cmakedir})
 install(FILES cmake/mimalloc-config-version.cmake DESTINATION ${mi_install_cmakedir})
 
@@ -360,22 +708,54 @@ if (MI_BUILD_OBJECT)
   add_library(mimalloc-obj OBJECT src/static.c)
   set_property(TARGET mimalloc-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
   target_compile_definitions(mimalloc-obj PRIVATE ${mi_defines})
-  target_compile_options(mimalloc-obj PRIVATE ${mi_cflags})
+  target_compile_options(mimalloc-obj PRIVATE ${mi_cflags} ${mi_cflags_static})
   target_include_directories(mimalloc-obj PUBLIC
       $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
       $<INSTALL_INTERFACE:${mi_install_incdir}>
   )
 
+  # Copy the generated object file (`static.o`) to the output directory (as `mimalloc.o`)
+  if(CMAKE_GENERATOR MATCHES "^Visual Studio.*$")
+    set(mimalloc-obj-static "${CMAKE_CURRENT_BINARY_DIR}/mimalloc-obj.dir/$<CONFIG>/static${CMAKE_C_OUTPUT_EXTENSION}")
+  else()
+    set(mimalloc-obj-static "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/mimalloc-obj.dir/src/static.c${CMAKE_C_OUTPUT_EXTENSION}")
+  endif()
+  set(mimalloc-obj-out "${CMAKE_CURRENT_BINARY_DIR}/${mi_libname}${CMAKE_C_OUTPUT_EXTENSION}")
+  # VERBATIM keeps the escaping of the copy arguments identical on every platform and generator
+  add_custom_command(OUTPUT ${mimalloc-obj-out} DEPENDS mimalloc-obj
+    COMMAND "${CMAKE_COMMAND}" -E copy "${mimalloc-obj-static}" "${mimalloc-obj-out}" VERBATIM)
+  add_custom_target(mimalloc-obj-target ALL DEPENDS ${mimalloc-obj-out})
   # the following seems to lead to cmake warnings/errors on some systems, disable for now :-(
   # install(TARGETS mimalloc-obj EXPORT mimalloc DESTINATION ${mi_install_objdir})
 
   # the FILES expression can also be: $<TARGET_OBJECTS:mimalloc-obj>
   # but that fails cmake versions less than 3.10 so we leave it as is for now
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/mimalloc-obj.dir/src/static.c${CMAKE_C_OUTPUT_EXTENSION}
+  install(FILES ${mimalloc-obj-static}
           DESTINATION ${mi_install_objdir}
-          RENAME ${mi_basename}${CMAKE_C_OUTPUT_EXTENSION} )
+          RENAME ${mi_libname}${CMAKE_C_OUTPUT_EXTENSION} )
 endif()
 
+
+# pkg-config file support
+set(mi_pc_libraries "")
+foreach(item IN LISTS mi_libraries)
+  if(item MATCHES " *[-].*")
+    set(mi_pc_libraries "${mi_pc_libraries} ${item}")
+  else()
+    set(mi_pc_libraries "${mi_pc_libraries} -l${item}")
+  endif()
+endforeach()
+
+include("cmake/JoinPaths.cmake")
+join_paths(mi_pc_includedir "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}")
+join_paths(mi_pc_libdir "\${prefix}" "${CMAKE_INSTALL_LIBDIR}")
+
+configure_file(mimalloc.pc.in mimalloc.pc @ONLY)
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mimalloc.pc"
+        DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/")
+
+
+
 # -----------------------------------------------------------------------------
 # API surface testing
 # -----------------------------------------------------------------------------
@@ -383,15 +763,42 @@ endif()
 if (MI_BUILD_TESTS)
   enable_testing()
 
+  # static link tests
   foreach(TEST_NAME api api-fill stress)
     add_executable(mimalloc-test-${TEST_NAME} test/test-${TEST_NAME}.c)
     target_compile_definitions(mimalloc-test-${TEST_NAME} PRIVATE ${mi_defines})
     target_compile_options(mimalloc-test-${TEST_NAME} PRIVATE ${mi_cflags})
     target_include_directories(mimalloc-test-${TEST_NAME} PRIVATE include)
-    target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc ${mi_libraries})
-
+    if(MI_BUILD_STATIC AND NOT MI_DEBUG_TSAN)
+      target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc-static ${mi_libraries})
+    elseif(MI_BUILD_SHARED)
+      target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc ${mi_libraries})
+    else()
+      message(STATUS "cannot build TSAN tests without MI_BUILD_SHARED being enabled")
+    endif()
     add_test(NAME test-${TEST_NAME} COMMAND mimalloc-test-${TEST_NAME})
   endforeach()
+
+  # dynamic override test
+  if(MI_BUILD_SHARED AND NOT (MI_TRACK_ASAN OR MI_DEBUG_TSAN OR MI_DEBUG_UBSAN))
+    add_executable(mimalloc-test-stress-dynamic test/test-stress.c)
+    target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE ${mi_defines} "USE_STD_MALLOC=1")
+    target_compile_options(mimalloc-test-stress-dynamic PRIVATE ${mi_cflags})
+    target_include_directories(mimalloc-test-stress-dynamic PRIVATE include)
+    if(WIN32)
+      target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE "MI_LINK_VERSION=1")  # link mi_version
+      target_link_libraries(mimalloc-test-stress-dynamic PRIVATE mimalloc ${mi_libraries})  # link mi_version
+      add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_VERBOSE=1 $<TARGET_FILE:mimalloc-test-stress-dynamic>)
+    else()
+      target_link_libraries(mimalloc-test-stress-dynamic PRIVATE ${mi_libraries}) # pthreads, issue 1158
+      if(APPLE)
+        set(LD_PRELOAD "DYLD_INSERT_LIBRARIES")
+      else()
+        set(LD_PRELOAD "LD_PRELOAD")
+      endif()
+      add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_VERBOSE=1 ${LD_PRELOAD}=$<TARGET_FILE:mimalloc> $<TARGET_FILE:mimalloc-test-stress-dynamic>)
+    endif()
+  endif()
 endif()
 
 # -----------------------------------------------------------------------------
diff --git a/ext/src/mimalloc/LICENSE b/ext/src/mimalloc/LICENSE
index 670b668a0c..53315ebee5 100644
--- a/ext/src/mimalloc/LICENSE
+++ b/ext/src/mimalloc/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2018-2021 Microsoft Corporation, Daan Leijen
+Copyright (c) 2018-2025 Microsoft Corporation, Daan Leijen
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/ext/src/mimalloc/SECURITY.md b/ext/src/mimalloc/SECURITY.md
new file mode 100644
index 0000000000..b3c89efc85
--- /dev/null
+++ b/ext/src/mimalloc/SECURITY.md
@@ -0,0 +1,41 @@
+<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
+
+## Security
+
+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
+
+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
+
+## Reporting Security Issues
+
+**Please do not report security vulnerabilities through public GitHub issues.**
+
+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
+
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
+
+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
+
+Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+
+  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+  * Full paths of source file(s) related to the manifestation of the issue
+  * The location of the affected source code (tag/branch/commit or direct URL)
+  * Any special configuration required to reproduce the issue
+  * Step-by-step instructions to reproduce the issue
+  * Proof-of-concept or exploit code (if possible)
+  * Impact of the issue, including how an attacker might exploit the issue
+
+This information will help us triage your report more quickly.
+
+If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
+
+## Preferred Languages
+
+We prefer all communications to be in English.
+
+## Policy
+
+Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
+
+<!-- END MICROSOFT SECURITY.MD BLOCK -->
diff --git a/ext/src/mimalloc/azure-pipelines.yml b/ext/src/mimalloc/azure-pipelines.yml
index eb2a3d4315..3e3d700e96 100644
--- a/ext/src/mimalloc/azure-pipelines.yml
+++ b/ext/src/mimalloc/azure-pipelines.yml
@@ -6,16 +6,15 @@
 trigger:
   branches:
     include:
-    - master
-    - dev
-    - dev-slice
+    - main
+    - dev*
   tags:
     include:
     - v*
 
-jobs:  
+jobs:
 - job:
-  displayName: Windows
+  displayName: Windows 2022
   pool:
     vmImage:
       windows-2022
@@ -29,10 +28,30 @@ jobs:
         BuildType: release
         cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
         MSBuildConfiguration: Release
+      Release SIMD:
+        BuildType: release-simd
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_ARCH=ON -DMI_OPT_SIMD=ON -DMI_WIN_USE_FIXED_TLS=ON
+        MSBuildConfiguration: Release
       Secure:
         BuildType: secure
         cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
         MSBuildConfiguration: Release
+      Debug x86:
+        BuildType: debug
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -A Win32
+        MSBuildConfiguration: Debug
+      Release x86:
+        BuildType: release
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -A Win32
+        MSBuildConfiguration: Release
+      Debug Fixed TLS:
+        BuildType: debug
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_WIN_USE_FIXED_TLS=ON
+        MSBuildConfiguration: Debug
+      Release Fixed TLS:
+        BuildType: release
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_WIN_USE_FIXED_TLS=ON
+        MSBuildConfiguration: Release
   steps:
   - task: CMake@1
     inputs:
@@ -43,7 +62,7 @@ jobs:
       solution: $(BuildType)/libmimalloc.sln
       configuration: '$(MSBuildConfiguration)'
       msbuildArguments: -m
-  - script: ctest --verbose --timeout 120 -C $(MSBuildConfiguration)
+  - script: ctest --verbose --timeout 240 -C $(MSBuildConfiguration)
     workingDirectory: $(BuildType)
     displayName: CTest
   #- script: $(BuildType)\$(BuildType)\mimalloc-test-stress
@@ -52,10 +71,10 @@ jobs:
   #  artifact: mimalloc-windows-$(BuildType)
 
 - job:
-  displayName: Linux
+  displayName: Ubuntu 22.04
   pool:
     vmImage:
-     ubuntu-18.04
+     ubuntu-22.04
   strategy:
     matrix:
       Debug:
@@ -88,6 +107,11 @@ jobs:
         CXX: clang++
         BuildType: release-clang
         cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
+      Release SIMD Clang:
+        CC: clang
+        CXX: clang++
+        BuildType: release-simd-clang
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_ARCH=ON -DMI_OPT_SIMD=ON
       Secure Clang:
         CC: clang
         CXX: clang++
@@ -98,6 +122,27 @@ jobs:
         CXX: clang++
         BuildType: debug-clang-cxx
         cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON
+      Debug ASAN Clang:
+        CC: clang
+        CXX: clang++
+        BuildType: debug-asan-clang
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_TRACK_ASAN=ON
+      Debug UBSAN Clang:
+        CC: clang
+        CXX: clang++
+        BuildType: debug-ubsan-clang
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_DEBUG_UBSAN=ON
+      Debug TSAN Clang++:
+        CC: clang
+        CXX: clang++
+        BuildType: debug-tsan-clang-cxx
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=RelWithDebInfo -DMI_USE_CXX=ON -DMI_DEBUG_TSAN=ON
+      Debug Guarded Clang:
+        CC: clang
+        CXX: clang++  # C++ driver, like the other Clang entries (was the C driver `clang`)
+        BuildType: debug-guarded-clang
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=RelWithDebInfo -DMI_DEBUG_FULL=ON -DMI_GUARDED=ON
+
   steps:
   - task: CMake@1
     inputs:
@@ -105,17 +150,19 @@ jobs:
       cmakeArgs: .. $(cmakeExtraArgs)
   - script: make -j$(nproc) -C $(BuildType)
     displayName: Make
-  - script: ctest --verbose --timeout 120
+  - script: ctest --verbose --timeout 240
     workingDirectory: $(BuildType)
     displayName: CTest
+    env:
+      MIMALLOC_GUARDED_SAMPLE_RATE: 1000
 #  - upload: $(Build.SourcesDirectory)/$(BuildType)
 #    artifact: mimalloc-ubuntu-$(BuildType)
 
 - job:
-  displayName: macOS
+  displayName: macOS 14 (Sonoma)
   pool:
     vmImage:
-      macOS-latest
+      macOS-14
   strategy:
     matrix:
       Debug:
@@ -124,6 +171,9 @@ jobs:
       Release:
         BuildType: release
         cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
+      Release SIMD:
+        BuildType: release-simd
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_ARCH=ON -DMI_OPT_SIMD=ON
       Secure:
         BuildType: secure
         cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
@@ -134,48 +184,79 @@ jobs:
       cmakeArgs: .. $(cmakeExtraArgs)
   - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType)
     displayName: Make
-  # - script: MIMALLOC_VERBOSE=1 ./mimalloc-test-api
-  #   workingDirectory: $(BuildType)
-  #   displayName: TestAPI
-  # - script: MIMALLOC_VERBOSE=1 ./mimalloc-test-stress
-  #   workingDirectory: $(BuildType)
-  #   displayName: TestStress    
-  - script: ctest --verbose --timeout 120
+  - script: ctest --verbose --timeout 240
     workingDirectory: $(BuildType)
     displayName: CTest
-    
 #  - upload: $(Build.SourcesDirectory)/$(BuildType)
 #    artifact: mimalloc-macos-$(BuildType)
 
-# - job:
-#   displayName: Windows-2017
-#   pool:
-#     vmImage:
-#       vs2017-win2016
-#   strategy:
-#     matrix:
-#       Debug:
-#         BuildType: debug
-#         cmakeExtraArgs: -A x64 -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
-#         MSBuildConfiguration: Debug
-#       Release:
-#         BuildType: release
-#         cmakeExtraArgs: -A x64 -DCMAKE_BUILD_TYPE=Release
-#         MSBuildConfiguration: Release
-#       Secure:
-#         BuildType: secure
-#         cmakeExtraArgs: -A x64 -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
-#         MSBuildConfiguration: Release
-#   steps:
-#   - task: CMake@1
-#     inputs:
-#       workingDirectory: $(BuildType)
-#       cmakeArgs: .. $(cmakeExtraArgs)
-#   - task: MSBuild@1
-#     inputs:
-#       solution: $(BuildType)/libmimalloc.sln
-#       configuration: '$(MSBuildConfiguration)'
-#   - script: |
-#       cd $(BuildType)
-#       ctest --verbose --timeout 120
-#     displayName: CTest
+# ----------------------------------------------------------
+# Other OS versions (just debug mode)
+# ----------------------------------------------------------
+
+- job:
+  displayName: Ubuntu 24.04
+  pool:
+    vmImage:
+      ubuntu-24.04
+  strategy:
+    matrix:
+      Debug:
+        CC: gcc
+        CXX: g++
+        BuildType: debug
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
+      Debug++:
+        CC: gcc
+        CXX: g++
+        BuildType: debug-cxx
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON
+      Debug Clang:
+        CC: clang
+        CXX: clang++
+        BuildType: debug-clang
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
+      Debug++ Clang:
+        CC: clang
+        CXX: clang++
+        BuildType: debug-clang-cxx
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON
+      Release Clang:
+        CC: clang
+        CXX: clang++
+        BuildType: release-clang
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
+  steps:
+  - task: CMake@1
+    inputs:
+      workingDirectory: $(BuildType)
+      cmakeArgs: .. $(cmakeExtraArgs)
+  - script: make -j$(nproc) -C $(BuildType)
+    displayName: Make
+  - script: ctest --verbose --timeout 240
+    workingDirectory: $(BuildType)
+    displayName: CTest
+
+- job:
+  displayName: macOS 15 (Sequoia)
+  pool:
+    vmImage:
+      macOS-15
+  strategy:
+    matrix:
+      Debug:
+        BuildType: debug
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
+      Release:
+        BuildType: release
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
+  steps:
+  - task: CMake@1
+    inputs:
+      workingDirectory: $(BuildType)
+      cmakeArgs: .. $(cmakeExtraArgs)
+  - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType)
+    displayName: Make
+  - script: ctest --verbose --timeout 240
+    workingDirectory: $(BuildType)
+    displayName: CTest
diff --git a/ext/src/mimalloc/cmake/JoinPaths.cmake b/ext/src/mimalloc/cmake/JoinPaths.cmake
new file mode 100644
index 0000000000..c68d91b84d
--- /dev/null
+++ b/ext/src/mimalloc/cmake/JoinPaths.cmake
@@ -0,0 +1,23 @@
+# This module provides a function for joining paths,
+# as known from most languages
+#
+# SPDX-License-Identifier: (MIT OR CC0-1.0)
+# Copyright 2020 Jan Tojnar
+# https://github.com/jtojnar/cmake-snips
+#
+# Modelled after Python’s os.path.join
+# https://docs.python.org/3.7/library/os.path.html#os.path.join
+# Windows not supported
+function(join_paths joined_path first_path_segment)
+    set(temp_path "${first_path_segment}")
+    foreach(current_segment IN LISTS ARGN)
+        if(NOT ("${current_segment}" STREQUAL ""))
+            if(IS_ABSOLUTE "${current_segment}")
+                set(temp_path "${current_segment}")
+            else()
+                set(temp_path "${temp_path}/${current_segment}")
+            endif()
+        endif()
+    endforeach()
+    set(${joined_path} "${temp_path}" PARENT_SCOPE)
+endfunction()
diff --git a/ext/src/mimalloc/cmake/mimalloc-config-version.cmake b/ext/src/mimalloc/cmake/mimalloc-config-version.cmake
index 8063afe6b9..d95a4a3bb8 100644
--- a/ext/src/mimalloc/cmake/mimalloc-config-version.cmake
+++ b/ext/src/mimalloc/cmake/mimalloc-config-version.cmake
@@ -1,6 +1,6 @@
-set(mi_version_major 2)
-set(mi_version_minor 0)
-set(mi_version_patch 6)
+set(mi_version_major 3)
+set(mi_version_minor 2)
+set(mi_version_patch 8)
 set(mi_version ${mi_version_major}.${mi_version_minor})
 
 set(PACKAGE_VERSION ${mi_version})
diff --git a/ext/src/mimalloc/cmake/mimalloc-config.cmake b/ext/src/mimalloc/cmake/mimalloc-config.cmake
index 8a28e37e7e..a49b02a25a 100644
--- a/ext/src/mimalloc/cmake/mimalloc-config.cmake
+++ b/ext/src/mimalloc/cmake/mimalloc-config.cmake
@@ -2,13 +2,13 @@ include(${CMAKE_CURRENT_LIST_DIR}/mimalloc.cmake)
 get_filename_component(MIMALLOC_CMAKE_DIR "${CMAKE_CURRENT_LIST_DIR}" PATH)  # one up from the cmake dir, e.g. /usr/local/lib/cmake/mimalloc-2.0
 get_filename_component(MIMALLOC_VERSION_DIR "${CMAKE_CURRENT_LIST_DIR}" NAME)
 string(REPLACE "/lib/cmake" "/lib" MIMALLOC_LIBRARY_DIR "${MIMALLOC_CMAKE_DIR}")
-if("${MIMALLOC_VERSION_DIR}" EQUAL "mimalloc")  
+if("${MIMALLOC_VERSION_DIR}" STREQUAL "mimalloc")  # string compare: EQUAL is numeric and is never true for a directory name
   # top level install
   string(REPLACE "/lib/cmake" "/include" MIMALLOC_INCLUDE_DIR "${MIMALLOC_CMAKE_DIR}")
   set(MIMALLOC_OBJECT_DIR "${MIMALLOC_LIBRARY_DIR}")
-else()  
+else()
   # versioned
   string(REPLACE "/lib/cmake/" "/include/" MIMALLOC_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}")
-  string(REPLACE "/lib/cmake/" "/lib/" MIMALLOC_OBJECT_DIR "${CMAKE_CURRENT_LIST_DIR}")  
-endif()  
+  string(REPLACE "/lib/cmake/" "/lib/" MIMALLOC_OBJECT_DIR "${CMAKE_CURRENT_LIST_DIR}")
+endif()
 set(MIMALLOC_TARGET_DIR "${MIMALLOC_LIBRARY_DIR}") # legacy
diff --git a/ext/src/mimalloc/include/mimalloc-internal.h b/ext/src/mimalloc/include/mimalloc-internal.h
deleted file mode 100644
index d691eca586..0000000000
--- a/ext/src/mimalloc/include/mimalloc-internal.h
+++ /dev/null
@@ -1,1049 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-#pragma once
-#ifndef MIMALLOC_INTERNAL_H
-#define MIMALLOC_INTERNAL_H
-
-#include "mimalloc-types.h"
-
-#if (MI_DEBUG>0)
-#define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
-#else
-#define mi_trace_message(...)
-#endif
-
-#define MI_CACHE_LINE          64
-#if defined(_MSC_VER)
-#pragma warning(disable:4127)   // suppress constant conditional warning (due to MI_SECURE paths)
-#pragma warning(disable:26812)  // unscoped enum warning
-#define mi_decl_noinline        __declspec(noinline)
-#define mi_decl_thread          __declspec(thread)
-#define mi_decl_cache_align     __declspec(align(MI_CACHE_LINE))
-#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
-#define mi_decl_noinline        __attribute__((noinline))
-#define mi_decl_thread          __thread
-#define mi_decl_cache_align     __attribute__((aligned(MI_CACHE_LINE)))
-#else
-#define mi_decl_noinline
-#define mi_decl_thread          __thread        // hope for the best :-)
-#define mi_decl_cache_align
-#endif
-
-#if defined(__EMSCRIPTEN__) && !defined(__wasi__)
-#define __wasi__
-#endif
-
-#if defined(__cplusplus)
-#define mi_decl_externc       extern "C"
-#else
-#define mi_decl_externc  
-#endif
-
-#if !defined(_WIN32) && !defined(__wasi__) 
-#define  MI_USE_PTHREADS
-#include <pthread.h>
-#endif
-
-// "options.c"
-void       _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
-void       _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...);
-void       _mi_warning_message(const char* fmt, ...);
-void       _mi_verbose_message(const char* fmt, ...);
-void       _mi_trace_message(const char* fmt, ...);
-void       _mi_options_init(void);
-void       _mi_error_message(int err, const char* fmt, ...);
-
-// random.c
-void       _mi_random_init(mi_random_ctx_t* ctx);
-void       _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
-uintptr_t  _mi_random_next(mi_random_ctx_t* ctx);
-uintptr_t  _mi_heap_random_next(mi_heap_t* heap);
-uintptr_t  _mi_os_random_weak(uintptr_t extra_seed);
-static inline uintptr_t _mi_random_shuffle(uintptr_t x);
-
-// init.c
-extern mi_decl_cache_align mi_stats_t       _mi_stats_main;
-extern mi_decl_cache_align const mi_page_t  _mi_page_empty;
-bool       _mi_is_main_thread(void);
-size_t     _mi_current_thread_count(void);
-bool       _mi_preloading(void);  // true while the C runtime is not ready
-
-// os.c
-size_t     _mi_os_page_size(void);
-void       _mi_os_init(void);                                      // called from process init
-void*      _mi_os_alloc(size_t size, mi_stats_t* stats);           // to allocate thread local data
-void       _mi_os_free(void* p, size_t size, mi_stats_t* stats);   // to free thread local data
-
-bool       _mi_os_protect(void* addr, size_t size);
-bool       _mi_os_unprotect(void* addr, size_t size);
-bool       _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats);
-bool       _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
-bool       _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
-// bool       _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-size_t     _mi_os_good_alloc_size(size_t size);
-bool       _mi_os_has_overcommit(void);
-
-// arena.c
-void*      _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
-void*      _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
-void       _mi_arena_free(void* p, size_t size, size_t memid, bool is_committed, mi_os_tld_t* tld);
-
-// "segment-cache.c"
-void*      _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
-bool       _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld);
-void       _mi_segment_cache_collect(bool force, mi_os_tld_t* tld);
-void       _mi_segment_map_allocated_at(const mi_segment_t* segment);
-void       _mi_segment_map_freed_at(const mi_segment_t* segment);
-
-// "segment.c"
-mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_wsize, mi_segments_tld_t* tld, mi_os_tld_t* os_tld);
-void       _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld);
-void       _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
-bool       _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld);
-void       _mi_segment_thread_collect(mi_segments_tld_t* tld);
-void       _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
-
-uint8_t*   _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page
-void       _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
-void       _mi_abandoned_await_readers(void);
-void       _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld);
-
-
-
-// "page.c"
-void*      _mi_malloc_generic(mi_heap_t* heap, size_t size)  mi_attr_noexcept mi_attr_malloc;
-
-void       _mi_page_retire(mi_page_t* page) mi_attr_noexcept;                  // free the page if there are no other pages with many free blocks
-void       _mi_page_unfull(mi_page_t* page);
-void       _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force);   // free the page
-void       _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq);            // abandon the page, to be picked up by another thread...
-void       _mi_heap_delayed_free(mi_heap_t* heap);
-void       _mi_heap_collect_retired(mi_heap_t* heap, bool force);
-
-void       _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
-size_t     _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append);
-void       _mi_deferred_free(mi_heap_t* heap, bool force);
-
-void       _mi_page_free_collect(mi_page_t* page,bool force);
-void       _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page);   // callback from segments
-
-size_t     _mi_bin_size(uint8_t bin);           // for stats
-uint8_t    _mi_bin(size_t size);                // for stats
-
-// "heap.c"
-void       _mi_heap_destroy_pages(mi_heap_t* heap);
-void       _mi_heap_collect_abandon(mi_heap_t* heap);
-void       _mi_heap_set_default_direct(mi_heap_t* heap);
-
-// "stats.c"
-void       _mi_stats_done(mi_stats_t* stats);
-
-mi_msecs_t  _mi_clock_now(void);
-mi_msecs_t  _mi_clock_end(mi_msecs_t start);
-mi_msecs_t  _mi_clock_start(void);
-
-// "alloc.c"
-void*       _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept;  // called from `_mi_malloc_generic`
-void*       _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept;
-void*       _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept;
-mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p);
-bool        _mi_free_delayed_block(mi_block_t* block);
-void        _mi_block_zero_init(const mi_page_t* page, void* p, size_t size);
-
-#if MI_DEBUG>1
-bool        _mi_page_is_valid(mi_page_t* page);
-#endif
-
-
-// ------------------------------------------------------
-// Branches
-// ------------------------------------------------------
-
-#if defined(__GNUC__) || defined(__clang__)
-#define mi_unlikely(x)     __builtin_expect(!!(x),false)
-#define mi_likely(x)       __builtin_expect(!!(x),true)
-#else
-#define mi_unlikely(x)     (x)
-#define mi_likely(x)       (x)
-#endif
-
-#ifndef __has_builtin
-#define __has_builtin(x)  0
-#endif
-
-
-/* -----------------------------------------------------------
-  Error codes passed to `_mi_fatal_error`
-  All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
-  For portability define undefined error codes using common Unix codes:
-  <https://www-numi.fnal.gov/offline_software/srt_public_context/WebDocs/Errors/unix_system_errors.html>
------------------------------------------------------------ */
-#include <errno.h>
-#ifndef EAGAIN         // double free
-#define EAGAIN (11)
-#endif
-#ifndef ENOMEM         // out of memory
-#define ENOMEM (12)
-#endif
-#ifndef EFAULT         // corrupted free-list or meta-data
-#define EFAULT (14)
-#endif
-#ifndef EINVAL         // trying to free an invalid pointer
-#define EINVAL (22)
-#endif
-#ifndef EOVERFLOW      // count*size overflow
-#define EOVERFLOW (75)
-#endif
-
-
-/* -----------------------------------------------------------
-  Inlined definitions
------------------------------------------------------------ */
-#define MI_UNUSED(x)     (void)(x)
-#if (MI_DEBUG>0)
-#define MI_UNUSED_RELEASE(x)
-#else
-#define MI_UNUSED_RELEASE(x)  MI_UNUSED(x)
-#endif
-
-#define MI_INIT4(x)   x(),x(),x(),x()
-#define MI_INIT8(x)   MI_INIT4(x),MI_INIT4(x)
-#define MI_INIT16(x)  MI_INIT8(x),MI_INIT8(x)
-#define MI_INIT32(x)  MI_INIT16(x),MI_INIT16(x)
-#define MI_INIT64(x)  MI_INIT32(x),MI_INIT32(x)
-#define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x)
-#define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x)
-
-
-// Is `x` a power of two? (0 is considered a power of two)
-static inline bool _mi_is_power_of_two(uintptr_t x) {
-  return ((x & (x - 1)) == 0);
-}
-
-// Align upwards
-static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
-  mi_assert_internal(alignment != 0);
-  uintptr_t mask = alignment - 1;
-  if ((alignment & mask) == 0) {  // power of two?
-    return ((sz + mask) & ~mask);
-  }
-  else {
-    return (((sz + mask)/alignment)*alignment);
-  }
-}
-
-// Align downwards
-static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
-  mi_assert_internal(alignment != 0);
-  uintptr_t mask = alignment - 1;
-  if ((alignment & mask) == 0) { // power of two?
-    return (sz & ~mask);
-  }
-  else {
-    return ((sz / alignment) * alignment);
-  }
-}
-
-// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
-static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
-  mi_assert_internal(divider != 0);
-  return (divider == 0 ? size : ((size + divider - 1) / divider));
-}
-
-// Is memory zero initialized?
-static inline bool mi_mem_is_zero(void* p, size_t size) {
-  for (size_t i = 0; i < size; i++) {
-    if (((uint8_t*)p)[i] != 0) return false;
-  }
-  return true;
-}
-
-
-// Align a byte size to a size in _machine words_,
-// i.e. byte size == `wsize*sizeof(void*)`.
-static inline size_t _mi_wsize_from_size(size_t size) {
-  mi_assert_internal(size <= SIZE_MAX - sizeof(uintptr_t));
-  return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
-}
-
-// Overflow detecting multiply
-#if __has_builtin(__builtin_umul_overflow) || (defined(__GNUC__) && (__GNUC__ >= 5))
-#include <limits.h>      // UINT_MAX, ULONG_MAX
-#if defined(_CLOCK_T)    // for Illumos
-#undef _CLOCK_T
-#endif
-static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
-  #if (SIZE_MAX == ULONG_MAX)
-    return __builtin_umull_overflow(count, size, (unsigned long *)total);
-  #elif (SIZE_MAX == UINT_MAX)
-    return __builtin_umul_overflow(count, size, (unsigned int *)total);
-  #else
-    return __builtin_umulll_overflow(count, size, (unsigned long long *)total);
-  #endif
-}
-#else /* __builtin_umul_overflow is unavailable */
-static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
-  #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
-  *total = count * size;
-  return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW)
-    && size > 0 && (SIZE_MAX / size) < count);
-}
-#endif
-
-// Safe multiply `count*size` into `total`; return `true` on overflow.
-static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* total) {
-  if (count==1) {  // quick check for the case where count is one (common for C++ allocators)
-    *total = size;
-    return false;
-  }
-  else if (mi_unlikely(mi_mul_overflow(count, size, total))) {
-    _mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size);
-    *total = SIZE_MAX;
-    return true;
-  }
-  else return false;
-}
-
-
-/* ----------------------------------------------------------------------------------------
-The thread local default heap: `_mi_get_default_heap` returns the thread local heap.
-On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
-__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
-that the storage will always be available (allocated on the thread stacks).
-On some platforms though we cannot use that when overriding `malloc` since the underlying
-TLS implementation (or the loader) will call itself `malloc` on a first access and recurse.
-We try to circumvent this in an efficient way:
-- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
-           loader itself calls `malloc` even before the modules are initialized.
-- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
-- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323)
-------------------------------------------------------------------------------------------- */
-
-extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value of the thread local default heap
-extern bool _mi_process_is_initialized;
-mi_heap_t*  _mi_heap_main_get(void);    // statically allocated main backing heap
-
-#if defined(MI_MALLOC_OVERRIDE)
-#if defined(__APPLE__) // macOS
-#define MI_TLS_SLOT               89  // seems unused? 
-// #define MI_TLS_RECURSE_GUARD 1     
-// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
-// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
-#elif defined(__OpenBSD__)
-// use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) 
-// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
-#define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)  
-// #elif defined(__DragonFly__)
-// #warning "mimalloc is not working correctly on DragonFly yet."
-// #define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
-#elif defined(__ANDROID__)
-// See issue #381
-#define MI_TLS_PTHREAD
-#endif
-#endif
-
-#if defined(MI_TLS_SLOT)
-static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept;   // forward declaration
-#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
-static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
-  pthread_t self = pthread_self();
-  #if defined(__DragonFly__)
-  if (self==NULL) {
-    mi_heap_t* pheap_main = _mi_heap_main_get();
-    return &pheap_main;
-  }
-  #endif
-  return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
-}
-#elif defined(MI_TLS_PTHREAD)
-extern pthread_key_t _mi_heap_default_key;
-#endif
-
-// Default heap to allocate from (if not using TLS- or pthread slots).
-// Do not use this directly but use through `mi_heap_get_default()` (or the unchecked `mi_get_default_heap`).
-// This thread local variable is only used when neither MI_TLS_SLOT, MI_TLS_PTHREAD, or MI_TLS_PTHREAD_SLOT_OFS are defined.
-// However, on the Apple M1 we do use the address of this variable as the unique thread-id (issue #356).
-extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
-
-static inline mi_heap_t* mi_get_default_heap(void) {
-#if defined(MI_TLS_SLOT)
-  mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
-  if (mi_unlikely(heap == NULL)) {
-    #ifdef __GNUC__
-    __asm(""); // prevent conditional load of the address of _mi_heap_empty
-    #endif
-    heap = (mi_heap_t*)&_mi_heap_empty;    
-  }
-  return heap;
-#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
-  mi_heap_t* heap = *mi_tls_pthread_heap_slot();
-  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
-#elif defined(MI_TLS_PTHREAD)
-  mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
-  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
-#else
-  #if defined(MI_TLS_RECURSE_GUARD)  
-  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
-  #endif
-  return _mi_heap_default;
-#endif
-}
-
-static inline bool mi_heap_is_default(const mi_heap_t* heap) {
-  return (heap == mi_get_default_heap());
-}
-
-static inline bool mi_heap_is_backing(const mi_heap_t* heap) {
-  return (heap->tld->heap_backing == heap);
-}
-
-static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
-  mi_assert_internal(heap != NULL);
-  return (heap != &_mi_heap_empty);
-}
-
-static inline uintptr_t _mi_ptr_cookie(const void* p) {
-  extern mi_heap_t _mi_heap_main;
-  mi_assert_internal(_mi_heap_main.cookie != 0);
-  return ((uintptr_t)p ^ _mi_heap_main.cookie);
-}
-
-/* -----------------------------------------------------------
-  Pages
------------------------------------------------------------ */
-
-static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) {
-  mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE));
-  const size_t idx = _mi_wsize_from_size(size);
-  mi_assert_internal(idx < MI_PAGES_DIRECT);
-  return heap->pages_free_direct[idx];
-}
-
-// Get the page belonging to a certain size class
-static inline mi_page_t* _mi_get_free_small_page(size_t size) {
-  return _mi_heap_get_free_small_page(mi_get_default_heap(), size);
-}
-
-// Segment that contains the pointer
-static inline mi_segment_t* _mi_ptr_segment(const void* p) {
-  // mi_assert_internal(p != NULL);
-  return (mi_segment_t*)((uintptr_t)p & ~MI_SEGMENT_MASK);
-}
-
-static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) {
-  mi_assert_internal(s->slice_offset== 0 && s->slice_count > 0);
-  return (mi_page_t*)(s);
-}
-
-static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) {
-  mi_assert_internal(p->slice_offset== 0 && p->slice_count > 0);
-  return (mi_slice_t*)(p);
-}
-
-// Segment belonging to a page
-static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
-  mi_segment_t* segment = _mi_ptr_segment(page); 
-  mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries));
-  return segment;
-}
-
-static inline mi_slice_t* mi_slice_first(const mi_slice_t* slice) {
-  mi_slice_t* start = (mi_slice_t*)((uint8_t*)slice - slice->slice_offset);
-  mi_assert_internal(start >= _mi_ptr_segment(slice)->slices);
-  mi_assert_internal(start->slice_offset == 0);
-  mi_assert_internal(start + start->slice_count > slice);
-  return start;
-}
-
-// Get the page containing the pointer
-static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
-  ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
-  mi_assert_internal(diff >= 0 && diff < (ptrdiff_t)MI_SEGMENT_SIZE);
-  size_t idx = (size_t)diff >> MI_SEGMENT_SLICE_SHIFT;
-  mi_assert_internal(idx < segment->slice_entries);
-  mi_slice_t* slice0 = (mi_slice_t*)&segment->slices[idx];
-  mi_slice_t* slice = mi_slice_first(slice0);  // adjust to the block that holds the page data
-  mi_assert_internal(slice->slice_offset == 0);
-  mi_assert_internal(slice >= segment->slices && slice < segment->slices + segment->slice_entries);
-  return mi_slice_to_page(slice);
-}
-
-// Quick page start for initialized pages
-static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
-  return _mi_segment_page_start(segment, page, page_size);
-}
-
-// Get the page containing the pointer
-static inline mi_page_t* _mi_ptr_page(void* p) {
-  return _mi_segment_page_of(_mi_ptr_segment(p), p);
-}
-
-// Get the block size of a page (special case for huge objects)
-static inline size_t mi_page_block_size(const mi_page_t* page) {
-  const size_t bsize = page->xblock_size;
-  mi_assert_internal(bsize > 0);
-  if (mi_likely(bsize < MI_HUGE_BLOCK_SIZE)) {
-    return bsize;
-  }
-  else {
-    size_t psize;
-    _mi_segment_page_start(_mi_page_segment(page), page, &psize);
-    return psize;
-  }
-}
-
-// Get the usable block size of a page without fixed padding.
-// This may still include internal padding due to alignment and rounding up size classes.
-static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
-  return mi_page_block_size(page) - MI_PADDING_SIZE;
-}
-
-// size of a segment
-static inline size_t mi_segment_size(mi_segment_t* segment) {
-  return segment->segment_slices * MI_SEGMENT_SLICE_SIZE;
-}
-
-static inline uint8_t* mi_segment_end(mi_segment_t* segment) {
-  return (uint8_t*)segment + mi_segment_size(segment);
-}
-
-// Thread free access
-static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
-  return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3);
-}
-
-static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) {
-  return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3);
-}
-
-// Heap access
-static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
-  return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap));
-}
-
-static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
-  mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
-  mi_atomic_store_release(&page->xheap,(uintptr_t)heap);
-}
-
-// Thread free flag helpers
-static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {
-  return (mi_block_t*)(tf & ~0x03);
-}
-static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) {
-  return (mi_delayed_t)(tf & 0x03);
-}
-static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) {
-  return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed);
-}
-static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) {
-  return mi_tf_make(mi_tf_block(tf),delayed);
-}
-static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) {
-  return mi_tf_make(block, mi_tf_delayed(tf));
-}
-
-// are all blocks in a page freed?
-// note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.
-static inline bool mi_page_all_free(const mi_page_t* page) {
-  mi_assert_internal(page != NULL);
-  return (page->used == 0);
-}
-
-// are there any available blocks?
-static inline bool mi_page_has_any_available(const mi_page_t* page) {
-  mi_assert_internal(page != NULL && page->reserved > 0);
-  return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
-}
-
-// are there immediately available blocks, i.e. blocks available on the free list.
-static inline bool mi_page_immediate_available(const mi_page_t* page) {
-  mi_assert_internal(page != NULL);
-  return (page->free != NULL);
-}
-
-// is more than 7/8th of a page in use?
-static inline bool mi_page_mostly_used(const mi_page_t* page) {
-  if (page==NULL) return true;
-  uint16_t frac = page->reserved / 8U;
-  return (page->reserved - page->used <= frac);
-}
-
-static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) {
-  return &((mi_heap_t*)heap)->pages[_mi_bin(size)];
-}
-
-
-
-//-----------------------------------------------------------
-// Page flags
-//-----------------------------------------------------------
-static inline bool mi_page_is_in_full(const mi_page_t* page) {
-  return page->flags.x.in_full;
-}
-
-static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
-  page->flags.x.in_full = in_full;
-}
-
-static inline bool mi_page_has_aligned(const mi_page_t* page) {
-  return page->flags.x.has_aligned;
-}
-
-static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
-  page->flags.x.has_aligned = has_aligned;
-}
-
-
-/* -------------------------------------------------------------------
-Encoding/Decoding the free list next pointers
-
-This is to protect against buffer overflow exploits where the
-free list is mutated. Many hardened allocators xor the next pointer `p`
-with a secret key `k1`, as `p^k1`. This prevents overwriting with known
-values but might be still too weak: if the attacker can guess
-the pointer `p` this  can reveal `k1` (since `p^k1^p == k1`).
-Moreover, if multiple blocks can be read as well, the attacker can
-xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
-about the pointers (and subsequently `k1`).
-
-Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
-Since these operations are not associative, the above approaches do not
-work so well any more even if the `p` can be guesstimated. For example,
-for the read case we can subtract two entries to discard the `+k1` term,
-but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
-We include the left-rotation since xor and addition are otherwise linear
-in the lowest bit. Finally, both keys are unique per page which reduces
-the re-use of keys by a large factor.
-
-We also pass a separate `null` value to be used as `NULL` or otherwise
-`(k2<<<k1)+k1` would appear (too) often as a sentinel value.
-------------------------------------------------------------------- */
-
-static inline bool mi_is_in_same_segment(const void* p, const void* q) {
-  return (_mi_ptr_segment(p) == _mi_ptr_segment(q));
-}
-
-static inline bool mi_is_in_same_page(const void* p, const void* q) {
-  mi_segment_t* segment = _mi_ptr_segment(p);
-  if (_mi_ptr_segment(q) != segment) return false;
-  // assume q may be invalid // return (_mi_segment_page_of(segment, p) == _mi_segment_page_of(segment, q));
-  mi_page_t* page = _mi_segment_page_of(segment, p);
-  size_t psize;
-  uint8_t* start = _mi_segment_page_start(segment, page, &psize);
-  return (start <= (uint8_t*)q && (uint8_t*)q < start + psize);
-}
-
-static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) {
-  shift %= MI_INTPTR_BITS;
-  return (shift==0 ? x : ((x << shift) | (x >> (MI_INTPTR_BITS - shift))));
-}
-static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) {
-  shift %= MI_INTPTR_BITS;
-  return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift))));
-}
-
-static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) {
-  void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]);
-  return (mi_unlikely(p==null) ? NULL : p);
-}
-
-static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) {
-  uintptr_t x = (uintptr_t)(mi_unlikely(p==NULL) ? null : p);
-  return mi_rotl(x ^ keys[1], keys[0]) + keys[0];
-}
-
-static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) {
-  #ifdef MI_ENCODE_FREELIST
-  return (mi_block_t*)mi_ptr_decode(null, block->next, keys);
-  #else
-  MI_UNUSED(keys); MI_UNUSED(null);
-  return (mi_block_t*)block->next;
-  #endif
-}
-
-static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) {
-  #ifdef MI_ENCODE_FREELIST
-  block->next = mi_ptr_encode(null, next, keys);
-  #else
-  MI_UNUSED(keys); MI_UNUSED(null);
-  block->next = (mi_encoded_t)next;
-  #endif
-}
-
-static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
-  #ifdef MI_ENCODE_FREELIST
-  mi_block_t* next = mi_block_nextx(page,block,page->keys);
-  // check for free list corruption: is `next` at least in the same page?
-  // TODO: check if `next` is `page->block_size` aligned?
-  if (mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next))) {
-    _mi_error_message(EFAULT, "corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next);
-    next = NULL;
-  }
-  return next;
-  #else
-  MI_UNUSED(page);
-  return mi_block_nextx(page,block,NULL);
-  #endif
-}
-
-static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
-  #ifdef MI_ENCODE_FREELIST
-  mi_block_set_nextx(page,block,next, page->keys);
-  #else
-  MI_UNUSED(page);
-  mi_block_set_nextx(page,block,next,NULL);
-  #endif
-}
-
-
-// -------------------------------------------------------------------
-// commit mask
-// -------------------------------------------------------------------
-
-static inline void mi_commit_mask_create_empty(mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    cm->mask[i] = 0;
-  }
-}
-
-static inline void mi_commit_mask_create_full(mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    cm->mask[i] = ~((size_t)0);
-  }
-}
-
-static inline bool mi_commit_mask_is_empty(const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    if (cm->mask[i] != 0) return false;
-  }
-  return true;
-}
-
-static inline bool mi_commit_mask_is_full(const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    if (cm->mask[i] != ~((size_t)0)) return false;
-  }
-  return true;
-}
-
-// defined in `segment.c`:
-size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total);
-size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx);
-
-#define mi_commit_mask_foreach(cm,idx,count) \
-  idx = 0; \
-  while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) { 
-        
-#define mi_commit_mask_foreach_end() \
-    idx += count; \
-  }
-      
-
-
-
-// -------------------------------------------------------------------
-// Fast "random" shuffle
-// -------------------------------------------------------------------
-
-static inline uintptr_t _mi_random_shuffle(uintptr_t x) {
-  if (x==0) { x = 17; }   // ensure we don't get stuck in generating zeros
-#if (MI_INTPTR_SIZE==8)
-  // by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
-  x ^= x >> 30;
-  x *= 0xbf58476d1ce4e5b9UL;
-  x ^= x >> 27;
-  x *= 0x94d049bb133111ebUL;
-  x ^= x >> 31;
-#elif (MI_INTPTR_SIZE==4)
-  // by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
-  x ^= x >> 16;
-  x *= 0x7feb352dUL;
-  x ^= x >> 15;
-  x *= 0x846ca68bUL;
-  x ^= x >> 16;
-#endif
-  return x;
-}
-
-// -------------------------------------------------------------------
-// Optimize numa node access for the common case (= one node)
-// -------------------------------------------------------------------
-
-int    _mi_os_numa_node_get(mi_os_tld_t* tld);
-size_t _mi_os_numa_node_count_get(void);
-
-extern _Atomic(size_t) _mi_numa_node_count;
-static inline int _mi_os_numa_node(mi_os_tld_t* tld) {
-  if (mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1)) return 0;
-  else return _mi_os_numa_node_get(tld);
-}
-static inline size_t _mi_os_numa_node_count(void) {
-  const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count);
-  if (mi_likely(count>0)) return count;
-  else return _mi_os_numa_node_count_get();
-}
-
-
-// -------------------------------------------------------------------
-// Getting the thread id should be performant as it is called in the
-// fast path of `_mi_free` and we specialize for various platforms.
-// We only require _mi_threadid() to return a unique id for each thread.
-// -------------------------------------------------------------------
-#if defined(_WIN32)
-
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
-  // Windows: works on Intel and ARM in both 32- and 64-bit
-  return (uintptr_t)NtCurrentTeb();
-}
-
-// We use assembly for a fast thread id on the main platforms. The TLS layout depends on 
-// both the OS and libc implementation so we use specific tests for each main platform.
-// If you test on another platform and it works please send a PR :-)
-// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
-#elif defined(__GNUC__) && ( \
-           (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
-        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
-        || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
-        || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
-        || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
-      )
-
-static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
-  void* res;
-  const size_t ofs = (slot*sizeof(void*));
-  #if defined(__i386__)
-    __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86 32-bit always uses GS
-  #elif defined(__APPLE__) && defined(__x86_64__)
-    __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
-  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
-    __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x32 ABI
-  #elif defined(__x86_64__)
-    __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
-  #elif defined(__arm__)
-    void** tcb; MI_UNUSED(ofs);
-    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
-    res = tcb[slot];
-  #elif defined(__aarch64__)
-    void** tcb; MI_UNUSED(ofs);
-    #if defined(__APPLE__) // M1, issue #343
-    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
-    #else
-    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
-    #endif
-    res = tcb[slot];
-  #endif
-  return res;
-}
-
-// setting a tls slot is only used on macOS for now
-static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
-  const size_t ofs = (slot*sizeof(void*));
-  #if defined(__i386__)
-    __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
-  #elif defined(__APPLE__) && defined(__x86_64__)
-    __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOS uses GS
-  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
-    __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x32 ABI
-  #elif defined(__x86_64__)
-    __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
-  #elif defined(__arm__)
-    void** tcb; MI_UNUSED(ofs);
-    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
-    tcb[slot] = value;
-  #elif defined(__aarch64__)
-    void** tcb; MI_UNUSED(ofs);
-    #if defined(__APPLE__) // M1, issue #343
-    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
-    #else
-    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
-    #endif
-    tcb[slot] = value;
-  #endif
-}
-
-static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
-  #if defined(__BIONIC__)
-    // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
-    // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
-    return (uintptr_t)mi_tls_slot(1);
-  #else
-    // in all our other targets, slot 0 is the thread id
-    // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
-    // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
-    return (uintptr_t)mi_tls_slot(0);
-  #endif
-}
-
-#else
-
-// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
-static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
-  return (uintptr_t)&_mi_heap_default;
-}
-
-#endif
-
-
-// -----------------------------------------------------------------------
-// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero)
-// -----------------------------------------------------------------------
-
-#if defined(__GNUC__)
-
-#include <limits.h>       // LONG_MAX
-#define MI_HAVE_FAST_BITSCAN
-static inline size_t mi_clz(uintptr_t x) {
-  if (x==0) return MI_INTPTR_BITS;
-#if (INTPTR_MAX == LONG_MAX)
-  return __builtin_clzl(x);
-#else
-  return __builtin_clzll(x);
-#endif
-}
-static inline size_t mi_ctz(uintptr_t x) {
-  if (x==0) return MI_INTPTR_BITS;
-#if (INTPTR_MAX == LONG_MAX)
-  return __builtin_ctzl(x);
-#else
-  return __builtin_ctzll(x);
-#endif
-}
-
-#elif defined(_MSC_VER) 
-
-#include <limits.h>       // LONG_MAX
-#define MI_HAVE_FAST_BITSCAN
-static inline size_t mi_clz(uintptr_t x) {
-  if (x==0) return MI_INTPTR_BITS;
-  unsigned long idx;
-#if (INTPTR_MAX == LONG_MAX)
-  _BitScanReverse(&idx, x);
-#else
-  _BitScanReverse64(&idx, x);
-#endif  
-  return ((MI_INTPTR_BITS - 1) - idx);
-}
-static inline size_t mi_ctz(uintptr_t x) {
-  if (x==0) return MI_INTPTR_BITS;
-  unsigned long idx;
-#if (INTPTR_MAX == LONG_MAX)
-  _BitScanForward(&idx, x);
-#else
-  _BitScanForward64(&idx, x);
-#endif  
-  return idx;
-}
-
-#else
-static inline size_t mi_ctz32(uint32_t x) {
-  // de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
-  static const unsigned char debruijn[32] = {
-    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
-    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
-  };
-  if (x==0) return 32;
-  return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27];
-}
-static inline size_t mi_clz32(uint32_t x) {
-  // de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
-  static const uint8_t debruijn[32] = {
-    31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
-    23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
-  };
-  if (x==0) return 32;
-  x |= x >> 1;
-  x |= x >> 2;
-  x |= x >> 4;
-  x |= x >> 8;
-  x |= x >> 16;
-  return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27];
-}
-
-static inline size_t mi_clz(uintptr_t x) {
-  if (x==0) return MI_INTPTR_BITS;  
-#if (MI_INTPTR_BITS <= 32)
-  return mi_clz32((uint32_t)x);
-#else
-  size_t count = mi_clz32((uint32_t)(x >> 32));
-  if (count < 32) return count;
-  return (32 + mi_clz32((uint32_t)x));
-#endif
-}
-static inline size_t mi_ctz(uintptr_t x) {
-  if (x==0) return MI_INTPTR_BITS;
-#if (MI_INTPTR_BITS <= 32)
-  return mi_ctz32((uint32_t)x);
-#else
-  size_t count = mi_ctz32((uint32_t)x);
-  if (count < 32) return count;
-  return (32 + mi_ctz32((uint32_t)(x>>32)));
-#endif
-}
-
-#endif
-
-// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero)
-static inline size_t mi_bsr(uintptr_t x) {
-  return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x));
-}
-
-
-// ---------------------------------------------------------------------------------
-// Provide our own `_mi_memcpy` for potential performance optimizations.
-//
-// For now, only on Windows with msvc/clang-cl we optimize to `rep movsb` if 
-// we happen to run on x86/x64 cpu's that have "fast short rep movsb" (FSRM) support 
-// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253. 
-// ---------------------------------------------------------------------------------
-
-#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
-#include <intrin.h>
-#include <string.h>
-extern bool _mi_cpu_has_fsrm;
-static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
-  if (_mi_cpu_has_fsrm) {
-    __movsb((unsigned char*)dst, (const unsigned char*)src, n);
-  }
-  else {
-    memcpy(dst, src, n); // todo: use noinline?
-  }
-}
-#else
-#include <string.h>
-static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
-  memcpy(dst, src, n);
-}
-#endif
-
-
-// -------------------------------------------------------------------------------
-// The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned 
-// This is used for example in `mi_realloc`.
-// -------------------------------------------------------------------------------
-
-#if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__)
-// On GCC/CLang we provide a hint that the pointers are word aligned.
-#include <string.h>
-static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
-  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
-  void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);
-  const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE);
-  _mi_memcpy(adst, asrc, n);
-}
-#else
-// Default fallback on `_mi_memcpy`
-static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
-  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
-  _mi_memcpy(dst, src, n);
-}
-#endif
-
-
-#endif
diff --git a/ext/src/mimalloc/include/mimalloc-new-delete.h b/ext/src/mimalloc/include/mimalloc-new-delete.h
index 2749a0be94..c16f4a6653 100644
--- a/ext/src/mimalloc/include/mimalloc-new-delete.h
+++ b/ext/src/mimalloc/include/mimalloc-new-delete.h
@@ -22,17 +22,26 @@ terms of the MIT license. A copy of the license can be found in the file
   #include <new>
   #include <mimalloc.h>
 
+  #if defined(_MSC_VER) && defined(_Ret_notnull_) && defined(_Post_writable_byte_size_)
+  // MSVC with the annotation macros available: stay consistent with VCRT definitions (`n` is the size argument of the annotated operator)
+  #define mi_decl_new(n)          mi_decl_nodiscard mi_decl_restrict _Ret_notnull_ _Post_writable_byte_size_(n)
+  #define mi_decl_new_nothrow(n)  mi_decl_nodiscard mi_decl_restrict _Ret_maybenull_ _Success_(return != NULL) _Post_writable_byte_size_(n)
+  #else  // annotation macros not available: `n` is ignored
+  #define mi_decl_new(n)          mi_decl_nodiscard mi_decl_restrict
+  #define mi_decl_new_nothrow(n)  mi_decl_nodiscard mi_decl_restrict
+  #endif  // _MSC_VER && annotation macros
+
   void operator delete(void* p) noexcept              { mi_free(p); };
   void operator delete[](void* p) noexcept            { mi_free(p); };
 
   void operator delete  (void* p, const std::nothrow_t&) noexcept { mi_free(p); }
   void operator delete[](void* p, const std::nothrow_t&) noexcept { mi_free(p); }
 
-  void* operator new(std::size_t n) noexcept(false)   { return mi_new(n); }
-  void* operator new[](std::size_t n) noexcept(false) { return mi_new(n); }
+  mi_decl_new(n) void* operator new(std::size_t n) noexcept(false) { return mi_new(n); }
+  mi_decl_new(n) void* operator new[](std::size_t n) noexcept(false) { return mi_new(n); }
 
-  void* operator new  (std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); }
-  void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); }
+  mi_decl_new_nothrow(n) void* operator new  (std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); }
+  mi_decl_new_nothrow(n) void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); }
 
   #if (__cplusplus >= 201402L || _MSC_VER >= 1916)
   void operator delete  (void* p, std::size_t n) noexcept { mi_free_size(p,n); };
@@ -44,9 +53,9 @@ terms of the MIT license. A copy of the license can be found in the file
   void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
   void operator delete  (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
   void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
-  void operator delete  (void* p, std::align_val_t al, const std::nothrow_t& tag) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
-  void operator delete[](void* p, std::align_val_t al, const std::nothrow_t& tag) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
-  
+  void operator delete  (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+  void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+
   void* operator new  (std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
   void* operator new[](std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
   void* operator new  (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
diff --git a/ext/src/mimalloc/include/mimalloc-override.h b/ext/src/mimalloc/include/mimalloc-override.h
index c63b0b91a7..48a8a6226a 100644
--- a/ext/src/mimalloc/include/mimalloc-override.h
+++ b/ext/src/mimalloc/include/mimalloc-override.h
@@ -24,7 +24,7 @@ not accidentally mix pointers from different allocators).
 #define free(p)                 mi_free(p)
 
 #define strdup(s)               mi_strdup(s)
-#define strndup(s,n)              mi_strndup(s,n)
+#define strndup(s,n)            mi_strndup(s,n)
 #define realpath(f,n)           mi_realpath(f,n)
 
 // Microsoft extensions
@@ -43,6 +43,7 @@ not accidentally mix pointers from different allocators).
 #define reallocf(p,n)           mi_reallocf(p,n)
 #define malloc_size(p)          mi_usable_size(p)
 #define malloc_usable_size(p)   mi_usable_size(p)
+#define malloc_good_size(sz)    mi_malloc_good_size(sz)
 #define cfree(p)                mi_free(p)
 
 #define valloc(n)               mi_valloc(n)
diff --git a/ext/src/mimalloc/include/mimalloc-stats.h b/ext/src/mimalloc/include/mimalloc-stats.h
new file mode 100644
index 0000000000..04fcb131b8
--- /dev/null
+++ b/ext/src/mimalloc/include/mimalloc-stats.h
@@ -0,0 +1,157 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2024-2025, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_STATS_H
+#define MIMALLOC_STATS_H
+
+#include <mimalloc.h>
+#include <stdint.h>
+
+#define MI_STAT_VERSION   4  // increased on every backward incompatible change
+
+// alignment for atomic fields: `mi_decl_align(a)` requests `a`-byte alignment using the syntax of the detected compiler
+#if defined(_MSC_VER)
+#define mi_decl_align(a)        __declspec(align(a))
+#elif defined(__GNUC__)
+#define mi_decl_align(a)        __attribute__((aligned(a)))
+#elif __cplusplus >= 201103L
+#define mi_decl_align(a)        alignas(a)
+#else
+#define mi_decl_align(a)        // no known alignment syntax: expands to nothing
+#endif
+
+
+// count allocation over time
+typedef struct mi_stat_count_s {
+  int64_t total;                              // total allocated
+  int64_t peak;                               // peak allocation
+  int64_t current;                            // current allocation
+} mi_stat_count_t;
+
+// counters only increase
+typedef struct mi_stat_counter_s {
+  int64_t total;                              // total count
+} mi_stat_counter_t;
+
+#define MI_STAT_FIELDS()  /* field list: define MI_STAT_COUNT and MI_STAT_COUNTER before expanding */ \
+  MI_STAT_COUNT(pages)                      /* count of mimalloc pages */ \
+  MI_STAT_COUNT(reserved)                   /* reserved memory bytes */ \
+  MI_STAT_COUNT(committed)                  /* committed bytes */ \
+  MI_STAT_COUNTER(reset)                    /* reset bytes */ \
+  MI_STAT_COUNTER(purged)                   /* purged bytes */ \
+  MI_STAT_COUNT(page_committed)             /* committed memory inside pages */ \
+  MI_STAT_COUNT(pages_abandoned)            /* abandoned pages count */ \
+  MI_STAT_COUNT(threads)                    /* number of threads */ \
+  MI_STAT_COUNT(malloc_normal)              /* allocated bytes <= MI_LARGE_OBJ_SIZE_MAX */ \
+  MI_STAT_COUNT(malloc_huge)                /* allocated bytes in huge pages */ \
+  MI_STAT_COUNT(malloc_requested)           /* malloc requested bytes */ \
+  \
+  MI_STAT_COUNTER(mmap_calls) \
+  MI_STAT_COUNTER(commit_calls) \
+  MI_STAT_COUNTER(reset_calls) \
+  MI_STAT_COUNTER(purge_calls) \
+  MI_STAT_COUNTER(arena_count)              /* number of memory arenas */ \
+  MI_STAT_COUNTER(malloc_normal_count)      /* number of blocks <= MI_LARGE_OBJ_SIZE_MAX */ \
+  MI_STAT_COUNTER(malloc_huge_count)        /* number of huge blocks */ \
+  MI_STAT_COUNTER(malloc_guarded_count)     /* number of allocations with guard pages */ \
+  \
+  /* internal statistics */ \
+  MI_STAT_COUNTER(arena_rollback_count) \
+  MI_STAT_COUNTER(arena_purges) \
+  MI_STAT_COUNTER(pages_extended)           /* number of page extensions */ \
+  MI_STAT_COUNTER(pages_retire)             /* number of pages that are retired */ \
+  MI_STAT_COUNTER(page_searches)            /* total pages searched for a fresh page */ \
+  MI_STAT_COUNTER(page_searches_count)      /* searched count for a fresh page */ \
+  /* only on v1 and v2 */ \
+  MI_STAT_COUNT(segments) \
+  MI_STAT_COUNT(segments_abandoned) \
+  MI_STAT_COUNT(segments_cache) \
+  MI_STAT_COUNT(_segments_reserved) \
+  /* only on v3 */ \
+  MI_STAT_COUNT(heaps) \
+  MI_STAT_COUNTER(pages_reclaim_on_alloc) \
+  MI_STAT_COUNTER(pages_reclaim_on_free) \
+  MI_STAT_COUNTER(pages_reabandon_full) \
+  MI_STAT_COUNTER(pages_unabandon_busy_wait)
+
+// Size bins for chunks
+typedef enum mi_chunkbin_e {
+  MI_CBIN_SMALL,    // slice_count == 1
+  MI_CBIN_OTHER,    // slice_count: any count not covered by the other bins, with 1 <= slice_count <= MI_BCHUNK_BITS
+  MI_CBIN_MEDIUM,   // slice_count == 8
+  MI_CBIN_LARGE,    // slice_count == MI_SIZE_BITS  (only used if MI_ENABLE_LARGE_PAGES is 1)
+  MI_CBIN_HUGE,     // slice_count > MI_BCHUNK_BITS
+  MI_CBIN_NONE,     // no bin assigned yet (the chunk is completely free)
+  MI_CBIN_COUNT     // number of enumerators above (not a bin itself); sizes `chunk_bins` in `mi_stats_t`
+} mi_chunkbin_t;
+
+
+// Define the statistics structure: every MI_STAT_FIELDS entry expands to one struct member via the two macros below
+#define MI_BIN_HUGE             (73U)   // see types.h
+#define MI_STAT_COUNT(stat)     mi_stat_count_t stat;
+#define MI_STAT_COUNTER(stat)   mi_stat_counter_t stat;
+
+typedef struct mi_stats_s
+{
+  size_t size;          // size of the mi_stats_t structure
+  size_t version;       // `mi_stats_t_decl` initializes this to MI_STAT_VERSION
+
+  mi_decl_align(8)  MI_STAT_FIELDS()   // one member per listed statistic; the alignment applies to the first one (`pages`)
+
+  // future extension
+  mi_stat_count_t   _stat_reserved[4];
+  mi_stat_counter_t _stat_counter_reserved[4];
+
+  // size segregated statistics
+  mi_stat_count_t   malloc_bins[MI_BIN_HUGE+1];   // allocation per size bin
+  mi_stat_count_t   page_bins[MI_BIN_HUGE+1];     // pages allocated per size bin
+  mi_stat_count_t   chunk_bins[MI_CBIN_COUNT];    // chunks per page sizes
+} mi_stats_t;
+
+#undef MI_STAT_COUNT
+#undef MI_STAT_COUNTER
+
+// helper: declares a zero-initialized `mi_stats_t name` and sets its `size` and `version` fields (expands to three statements, so it is only usable at block scope)
+#define mi_stats_t_decl(name)  mi_stats_t name = { 0 }; name.size = sizeof(mi_stats_t); name.version = MI_STAT_VERSION;
+
+// Exported definitions
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// stats from a heap
+mi_decl_export bool    mi_heap_stats_get(mi_heap_t* heap, mi_stats_t* stats) mi_attr_noexcept;
+mi_decl_export char*   mi_heap_stats_get_json(mi_heap_t* heap, size_t buf_size, char* buf) mi_attr_noexcept;      // use mi_free to free the result if the input buf == NULL
+mi_decl_export void    mi_heap_stats_print_out(mi_heap_t* heap, mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+// stats from a subprocess and its heaps aggregated
+mi_decl_export bool    mi_subproc_stats_get(mi_subproc_id_t subproc_id, mi_stats_t* stats) mi_attr_noexcept;
+mi_decl_export char*   mi_subproc_stats_get_json(mi_subproc_id_t subproc_id, size_t buf_size, char* buf) mi_attr_noexcept;      // use mi_free to free the result if the input buf == NULL
+mi_decl_export void    mi_subproc_stats_print_out(mi_subproc_id_t subproc_id, mi_output_fun* out, void* arg) mi_attr_noexcept;
+// print subprocess and all its heap stats segregated
+mi_decl_export void    mi_subproc_heap_stats_print_out(mi_subproc_id_t subproc_id, mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+// stats aggregated for the current subprocess and all its heaps.
+mi_decl_export bool    mi_stats_get(mi_stats_t* stats) mi_attr_noexcept;
+mi_decl_export char*   mi_stats_get_json(size_t buf_size, char* buf) mi_attr_noexcept;      // use mi_free to free the result if the input buf == NULL
+mi_decl_export void    mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+// add the stats of the heap to the subprocess and clear the heap stats
+mi_decl_export void    mi_heap_stats_merge_to_subproc(mi_heap_t* heap);
+
+// stats from the subprocess without aggregating its heaps
+mi_decl_export bool    mi_subproc_stats_get_exclusive(mi_subproc_id_t subproc_id, mi_stats_t* stats) mi_attr_noexcept;
+
+mi_decl_export char*   mi_stats_as_json(mi_stats_t* stats, size_t buf_size, char* buf) mi_attr_noexcept;      // use mi_free to free the result if the input buf == NULL
+mi_decl_export size_t  mi_stats_get_bin_size(size_t bin) mi_attr_noexcept;
+mi_decl_export size_t  mi_stats_total_mem(void) mi_attr_noexcept; // SPADES_LOCAL
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // MIMALLOC_STATS_H
diff --git a/ext/src/mimalloc/include/mimalloc-types.h b/ext/src/mimalloc/include/mimalloc-types.h
deleted file mode 100644
index 32492c4feb..0000000000
--- a/ext/src/mimalloc/include/mimalloc-types.h
+++ /dev/null
@@ -1,599 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-#pragma once
-#ifndef MIMALLOC_TYPES_H
-#define MIMALLOC_TYPES_H
-
-#include <stddef.h>   // ptrdiff_t
-#include <stdint.h>   // uintptr_t, uint16_t, etc
-#include "mimalloc-atomic.h"  // _Atomic
-
-#ifdef _MSC_VER
-#pragma warning(disable:4214) // bitfield is not int
-#endif 
-
-// Minimal alignment necessary. On most platforms 16 bytes are needed
-// due to SSE registers for example. This must be at least `sizeof(void*)`
-#ifndef MI_MAX_ALIGN_SIZE
-#define MI_MAX_ALIGN_SIZE  16   // sizeof(max_align_t)
-#endif
-
-// ------------------------------------------------------
-// Variants
-// ------------------------------------------------------
-
-// Define NDEBUG in the release version to disable assertions.
-// #define NDEBUG
-#define MI_DEBUG 0
-
-// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
-#define MI_STAT 1
-
-// Define MI_SECURE to enable security mitigations
-// #define MI_SECURE 1  // guard page around metadata
-// #define MI_SECURE 2  // guard page around each mimalloc page
-// #define MI_SECURE 3  // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
-// #define MI_SECURE 4  // checks for double free. (may be more expensive)
-
-#if !defined(MI_SECURE)
-#define MI_SECURE 0
-#endif
-
-// Define MI_DEBUG for debug mode
-// #define MI_DEBUG 1  // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free.
-// #define MI_DEBUG 2  // + internal assertion checks
-// #define MI_DEBUG 3  // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
-#if !defined(MI_DEBUG)
-#if !defined(NDEBUG) || defined(_DEBUG)
-#define MI_DEBUG 2
-#else
-#define MI_DEBUG 0
-#endif
-#endif
-
-// Reserve extra padding at the end of each block to be more resilient against heap block overflows.
-// The padding can detect byte-precise buffer overflow on free.
-#if !defined(MI_PADDING) && (MI_DEBUG>=1)
-#define MI_PADDING  1
-#endif
-
-
-// Encoded free lists allow detection of corrupted free lists
-// and can detect buffer overflows, modify after free, and double `free`s.
-#if (MI_SECURE>=3 || MI_DEBUG>=1 || MI_PADDING > 0)
-#define MI_ENCODE_FREELIST  1
-#endif
-
-
-// ------------------------------------------------------
-// Platform specific values
-// ------------------------------------------------------
-
-// ------------------------------------------------------
-// Size of a pointer.
-// We assume that `sizeof(void*)==sizeof(intptr_t)`
-// and it holds for all platforms we know of.
-//
-// However, the C standard only requires that:
-//  p == (void*)((intptr_t)p))
-// but we also need:
-//  i == (intptr_t)((void*)i)
-// or otherwise one might define an intptr_t type that is larger than a pointer...
-// ------------------------------------------------------
-
-#if INTPTR_MAX > INT64_MAX
-# define MI_INTPTR_SHIFT (4)  // assume 128-bit  (as on arm CHERI for example)
-#elif INTPTR_MAX == INT64_MAX
-# define MI_INTPTR_SHIFT (3)
-#elif INTPTR_MAX == INT32_MAX
-# define MI_INTPTR_SHIFT (2)
-#else
-#error platform pointers must be 32, 64, or 128 bits
-#endif
-
-#if SIZE_MAX == UINT64_MAX
-# define MI_SIZE_SHIFT (3)
-typedef int64_t  mi_ssize_t;
-#elif SIZE_MAX == UINT32_MAX
-# define MI_SIZE_SHIFT (2)
-typedef int32_t  mi_ssize_t;
-#else
-#error platform objects must be 32 or 64 bits
-#endif
-
-#if (SIZE_MAX/2) > LONG_MAX
-# define MI_ZU(x)  x##ULL
-# define MI_ZI(x)  x##LL
-#else
-# define MI_ZU(x)  x##UL
-# define MI_ZI(x)  x##L
-#endif
-
-#define MI_INTPTR_SIZE  (1<<MI_INTPTR_SHIFT)
-#define MI_INTPTR_BITS  (MI_INTPTR_SIZE*8)
-
-#define MI_SIZE_SIZE  (1<<MI_SIZE_SHIFT)
-#define MI_SIZE_BITS  (MI_SIZE_SIZE*8)
-
-#define MI_KiB     (MI_ZU(1024))
-#define MI_MiB     (MI_KiB*MI_KiB)
-#define MI_GiB     (MI_MiB*MI_KiB)
-
-
-// ------------------------------------------------------
-// Main internal data-structures
-// ------------------------------------------------------
-
-// Main tuning parameters for segment and page sizes
-// Sizes for 64-bit (usually divide by two for 32-bit)
-#define MI_SEGMENT_SLICE_SHIFT            (13 + MI_INTPTR_SHIFT)         // 64KiB  (32KiB on 32-bit)
-
-#if MI_INTPTR_SIZE > 4
-#define MI_SEGMENT_SHIFT                  (10 + MI_SEGMENT_SLICE_SHIFT)  // 64MiB
-#else
-#define MI_SEGMENT_SHIFT                  ( 7 + MI_SEGMENT_SLICE_SHIFT)  // 4MiB on 32-bit
-#endif
-
-#define MI_SMALL_PAGE_SHIFT               (MI_SEGMENT_SLICE_SHIFT)       // 64KiB
-#define MI_MEDIUM_PAGE_SHIFT              ( 3 + MI_SMALL_PAGE_SHIFT)     // 512KiB
-
-
-// Derived constants
-#define MI_SEGMENT_SIZE                   (MI_ZU(1)<<MI_SEGMENT_SHIFT)
-#define MI_SEGMENT_ALIGN                  MI_SEGMENT_SIZE
-#define MI_SEGMENT_MASK                   (MI_SEGMENT_SIZE - 1)
-#define MI_SEGMENT_SLICE_SIZE             (MI_ZU(1)<< MI_SEGMENT_SLICE_SHIFT)
-#define MI_SLICES_PER_SEGMENT             (MI_SEGMENT_SIZE / MI_SEGMENT_SLICE_SIZE) // 1024
-
-#define MI_SMALL_PAGE_SIZE                (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
-#define MI_MEDIUM_PAGE_SIZE               (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
-
-#define MI_SMALL_OBJ_SIZE_MAX             (MI_SMALL_PAGE_SIZE/4)   // 8KiB on 64-bit
-#define MI_MEDIUM_OBJ_SIZE_MAX            (MI_MEDIUM_PAGE_SIZE/4)  // 128KiB on 64-bit
-#define MI_MEDIUM_OBJ_WSIZE_MAX           (MI_MEDIUM_OBJ_SIZE_MAX/MI_INTPTR_SIZE)   
-#define MI_LARGE_OBJ_SIZE_MAX             (MI_SEGMENT_SIZE/2)      // 32MiB on 64-bit
-#define MI_LARGE_OBJ_WSIZE_MAX            (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
-
-// Maximum number of size classes. (spaced exponentially in 12.5% increments)
-#define MI_BIN_HUGE  (73U)
-
-#if (MI_MEDIUM_OBJ_WSIZE_MAX >= 655360)
-#error "mimalloc internal: define more bins"
-#endif
-#if (MI_ALIGNMENT_MAX > MI_SEGMENT_SIZE/2)
-#error "mimalloc internal: the max aligned boundary is too large for the segment size"
-#endif
-#if (MI_ALIGNED_MAX % MI_SEGMENT_SLICE_SIZE != 0)
-#error "mimalloc internal: the max aligned boundary must be an integral multiple of the segment slice size"
-#endif
-
-// Maximum slice offset (15)
-#define MI_MAX_SLICE_OFFSET               ((MI_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)
-
-// Used as a special value to encode block sizes in 32 bits.
-#define MI_HUGE_BLOCK_SIZE                ((uint32_t)(2*MI_GiB))
-
-// blocks up to this size are always allocated aligned
-#define MI_MAX_ALIGN_GUARANTEE            (8*MI_MAX_ALIGN_SIZE)  
-
-
-
-
-// ------------------------------------------------------
-// Mimalloc pages contain allocated blocks
-// ------------------------------------------------------
-
-// The free lists use encoded next fields
-// (Only actually encodes when MI_ENCODED_FREELIST is defined.)
-typedef uintptr_t  mi_encoded_t;
-
-// thread id's
-typedef size_t     mi_threadid_t;
-
-// free lists contain blocks
-typedef struct mi_block_s {
-  mi_encoded_t next;
-} mi_block_t;
-
-
-// The delayed flags are used for efficient multi-threaded free-ing
-typedef enum mi_delayed_e {
-  MI_USE_DELAYED_FREE   = 0, // push on the owning heap thread delayed list
-  MI_DELAYED_FREEING    = 1, // temporary: another thread is accessing the owning heap
-  MI_NO_DELAYED_FREE    = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
-  MI_NEVER_DELAYED_FREE = 3  // sticky, only resets on page reclaim
-} mi_delayed_t;
-
-
-// The `in_full` and `has_aligned` page flags are put in a union to efficiently
-// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
-#if !MI_TSAN
-typedef union mi_page_flags_s {
-  uint8_t full_aligned;
-  struct {
-    uint8_t in_full : 1;
-    uint8_t has_aligned : 1;
-  } x;
-} mi_page_flags_t;
-#else
-// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
-typedef union mi_page_flags_s {
-  uint16_t full_aligned;
-  struct {
-    uint8_t in_full;
-    uint8_t has_aligned;
-  } x;
-} mi_page_flags_t;
-#endif
-
-// Thread free list.
-// We use the bottom 2 bits of the pointer for mi_delayed_t flags
-typedef uintptr_t mi_thread_free_t;
-
-// A page contains blocks of one specific size (`block_size`).
-// Each page has three list of free blocks:
-// `free` for blocks that can be allocated,
-// `local_free` for freed blocks that are not yet available to `mi_malloc`
-// `thread_free` for freed blocks by other threads
-// The `local_free` and `thread_free` lists are migrated to the `free` list
-// when it is exhausted. The separate `local_free` list is necessary to
-// implement a monotonic heartbeat. The `thread_free` list is needed for
-// avoiding atomic operations in the common case.
-//
-//
-// `used - |thread_free|` == actual blocks that are in use (alive)
-// `used - |thread_free| + |free| + |local_free| == capacity`
-//
-// We don't count `freed` (as |free|) but use `used` to reduce
-// the number of memory accesses in the `mi_page_all_free` function(s).
-//
-// Notes: 
-// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`)
-// - Using `uint16_t` does not seem to slow things down
-// - The size is 8 words on 64-bit which helps the page index calculations
-//   (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10 
-//    and 12 are still good for address calculation)
-// - To limit the structure size, the `xblock_size` is 32-bits only; for 
-//   blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size
-// - `thread_free` uses the bottom bits as a delayed-free flags to optimize
-//   concurrent frees where only the first concurrent free adds to the owning
-//   heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`).
-//   The invariant is that no-delayed-free is only set if there is
-//   at least one block that will be added, or as already been added, to 
-//   the owning heap `thread_delayed_free` list. This guarantees that pages
-//   will be freed correctly even if only other threads free blocks.
-typedef struct mi_page_s {
-  // "owned" by the segment
-  uint32_t              slice_count;       // slices in this page (0 if not a page)
-  uint32_t              slice_offset;      // distance from the actual page data slice (0 if a page)
-  uint8_t               is_reset : 1;        // `true` if the page memory was reset
-  uint8_t               is_committed : 1;    // `true` if the page virtual memory is committed
-  uint8_t               is_zero_init : 1;    // `true` if the page was zero initialized
-
-  // layout like this to optimize access in `mi_malloc` and `mi_free`
-  uint16_t              capacity;          // number of blocks committed, must be the first field, see `segment.c:page_clear`
-  uint16_t              reserved;          // number of blocks reserved in memory
-  mi_page_flags_t       flags;             // `in_full` and `has_aligned` flags (8 bits)
-  uint8_t               is_zero : 1;         // `true` if the blocks in the free list are zero initialized
-  uint8_t               retire_expire : 7;   // expiration count for retired blocks
-
-  mi_block_t*           free;              // list of available free blocks (`malloc` allocates from this list)
-  #ifdef MI_ENCODE_FREELIST
-  uintptr_t             keys[2];           // two random keys to encode the free lists (see `_mi_block_next`)
-  #endif
-  uint32_t              used;              // number of blocks in use (including blocks in `local_free` and `thread_free`)
-  uint32_t              xblock_size;       // size available in each block (always `>0`) 
-
-  mi_block_t* local_free;                  // list of deferred free blocks by this thread (migrates to `free`)
-  _Atomic(mi_thread_free_t) xthread_free;  // list of deferred free blocks freed by other threads
-  _Atomic(uintptr_t)        xheap;
-
-  struct mi_page_s* next;                  // next page owned by this thread with the same `block_size`
-  struct mi_page_s* prev;                  // previous page owned by this thread with the same `block_size`
-
-  // 64-bit 9 words, 32-bit 12 words, (+2 for secure)
-  #if MI_INTPTR_SIZE==8
-  uintptr_t padding[1];
-  #endif
-} mi_page_t;
-
-
-
-typedef enum mi_page_kind_e {
-  MI_PAGE_SMALL,    // small blocks go into 64KiB pages inside a segment
-  MI_PAGE_MEDIUM,   // medium blocks go into medium pages inside a segment
-  MI_PAGE_LARGE,    // larger blocks go into a page of just one block
-  MI_PAGE_HUGE,     // huge blocks (> 16 MiB) are put into a single page in a single segment.
-} mi_page_kind_t;
-
-typedef enum mi_segment_kind_e {
-  MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside.
-  MI_SEGMENT_HUGE,   // > MI_LARGE_SIZE_MAX segment with just one huge page inside.
-} mi_segment_kind_t;
-
-// ------------------------------------------------------
-// A segment holds a commit mask where a bit is set if
-// the corresponding MI_COMMIT_SIZE area is committed.
-// The MI_COMMIT_SIZE must be a multiple of the slice
-// size. If it is equal we have the most fine grained 
-// decommit (but setting it higher can be more efficient).
-// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will
-// be committed in one go which can be set higher than
-// MI_COMMIT_SIZE for efficiency (while the decommit mask
-// is still tracked in fine-grained MI_COMMIT_SIZE chunks)
-// ------------------------------------------------------
-
-#define MI_MINIMAL_COMMIT_SIZE      (2*MI_MiB)
-#define MI_COMMIT_SIZE              (MI_SEGMENT_SLICE_SIZE)              // 64KiB
-#define MI_COMMIT_MASK_BITS         (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)  
-#define MI_COMMIT_MASK_FIELD_BITS    MI_SIZE_BITS
-#define MI_COMMIT_MASK_FIELD_COUNT  (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS)
-
-#if (MI_COMMIT_MASK_BITS != (MI_COMMIT_MASK_FIELD_COUNT * MI_COMMIT_MASK_FIELD_BITS))
-#error "the segment size must be exactly divisible by the (commit size * size_t bits)"
-#endif
-
-typedef struct mi_commit_mask_s {
-  size_t mask[MI_COMMIT_MASK_FIELD_COUNT];
-} mi_commit_mask_t;
-
-typedef mi_page_t  mi_slice_t;
-typedef int64_t    mi_msecs_t;
-
-
-// Segments are large allocated memory blocks (8mb on 64 bit) from
-// the OS. Inside segments we allocated fixed size _pages_ that
-// contain blocks.
-typedef struct mi_segment_s {
-  size_t            memid;              // memory id for arena allocation
-  bool              mem_is_pinned;      // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages)    
-  bool              mem_is_large;       // in large/huge os pages?
-  bool              mem_is_committed;   // `true` if the whole segment is eagerly committed
-
-  bool              allow_decommit;     
-  mi_msecs_t        decommit_expire;
-  mi_commit_mask_t  decommit_mask;
-  mi_commit_mask_t  commit_mask;
-
-  _Atomic(struct mi_segment_s*) abandoned_next;
-
-  // from here is zero initialized
-  struct mi_segment_s* next;            // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`)
-  
-  size_t            abandoned;          // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
-  size_t            abandoned_visits;   // count how often this segment is visited in the abandoned list (to force reclaim it it is too long)
-  size_t            used;               // count of pages in use
-  uintptr_t         cookie;             // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`  
-
-  size_t            segment_slices;      // for huge segments this may be different from `MI_SLICES_PER_SEGMENT`
-  size_t            segment_info_slices; // initial slices we are using segment info and possible guard pages.
-
-  // layout like this to optimize access in `mi_free`
-  mi_segment_kind_t kind;
-  _Atomic(mi_threadid_t) thread_id;      // unique id of the thread owning this segment
-  size_t            slice_entries;       // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT`
-  mi_slice_t        slices[MI_SLICES_PER_SEGMENT];
-} mi_segment_t;
-
-
-// ------------------------------------------------------
-// Heaps
-// Provide first-class heaps to allocate from.
-// A heap just owns a set of pages for allocation and
-// can only be allocate/reallocate from the thread that created it.
-// Freeing blocks can be done from any thread though.
-// Per thread, the segments are shared among its heaps.
-// Per thread, there is always a default heap that is
-// used for allocation; it is initialized to statically
-// point to an empty heap to avoid initialization checks
-// in the fast path.
-// ------------------------------------------------------
-
-// Thread local data
-typedef struct mi_tld_s mi_tld_t;
-
-// Pages of a certain block size are held in a queue.
-typedef struct mi_page_queue_s {
-  mi_page_t* first;
-  mi_page_t* last;
-  size_t     block_size;
-} mi_page_queue_t;
-
-#define MI_BIN_FULL  (MI_BIN_HUGE+1)
-
-// Random context
-typedef struct mi_random_cxt_s {
-  uint32_t input[16];
-  uint32_t output[16];
-  int      output_available;
-} mi_random_ctx_t;
-
-
-// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows
-#if (MI_PADDING)
-typedef struct mi_padding_s {
-  uint32_t canary; // encoded block value to check validity of the padding (in case of overflow)
-  uint32_t delta;  // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes)
-} mi_padding_t;
-#define MI_PADDING_SIZE   (sizeof(mi_padding_t))
-#define MI_PADDING_WSIZE  ((MI_PADDING_SIZE + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE)
-#else
-#define MI_PADDING_SIZE   0
-#define MI_PADDING_WSIZE  0
-#endif
-
-#define MI_PAGES_DIRECT   (MI_SMALL_WSIZE_MAX + MI_PADDING_WSIZE + 1)
-
-
-// A heap owns a set of pages.
-struct mi_heap_s {
-  mi_tld_t*             tld;
-  mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
-  mi_page_queue_t       pages[MI_BIN_FULL + 1];              // queue of pages for each size class (or "bin")
-  _Atomic(mi_block_t*)  thread_delayed_free;
-  mi_threadid_t         thread_id;                           // thread this heap belongs too
-  uintptr_t             cookie;                              // random cookie to verify pointers (see `_mi_ptr_cookie`)
-  uintptr_t             keys[2];                             // two random keys used to encode the `thread_delayed_free` list
-  mi_random_ctx_t       random;                              // random number context used for secure allocation
-  size_t                page_count;                          // total number of pages in the `pages` queues.
-  size_t                page_retired_min;                    // smallest retired index (retired pages are fully free, but still in the page queues)
-  size_t                page_retired_max;                    // largest retired index into the `pages` array.
-  mi_heap_t*            next;                                // list of heaps per thread
-  bool                  no_reclaim;                          // `true` if this heap should not reclaim abandoned pages
-};
-
-
-
-// ------------------------------------------------------
-// Debug
-// ------------------------------------------------------
-
-#if !defined(MI_DEBUG_UNINIT)
-#define MI_DEBUG_UNINIT     (0xD0)
-#endif
-#if !defined(MI_DEBUG_FREED)
-#define MI_DEBUG_FREED      (0xDF)
-#endif
-#if !defined(MI_DEBUG_PADDING)
-#define MI_DEBUG_PADDING    (0xDE)
-#endif
-
-#if (MI_DEBUG)
-// use our own assertion to print without memory allocation
-void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
-#define mi_assert(expr)     ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
-#else
-#define mi_assert(x)
-#endif
-
-#if (MI_DEBUG>1)
-#define mi_assert_internal    mi_assert
-#else
-#define mi_assert_internal(x)
-#endif
-
-#if (MI_DEBUG>2)
-#define mi_assert_expensive   mi_assert
-#else
-#define mi_assert_expensive(x)
-#endif
-
-// ------------------------------------------------------
-// Statistics
-// ------------------------------------------------------
-
-#ifndef MI_STAT
-#if (MI_DEBUG>0)
-#define MI_STAT 2
-#else
-#define MI_STAT 0
-#endif
-#endif
-
-typedef struct mi_stat_count_s {
-  int64_t allocated;
-  int64_t freed;
-  int64_t peak;
-  int64_t current;
-} mi_stat_count_t;
-
-typedef struct mi_stat_counter_s {
-  int64_t total;
-  int64_t count;
-} mi_stat_counter_t;
-
-typedef struct mi_stats_s {
-  mi_stat_count_t segments;
-  mi_stat_count_t pages;
-  mi_stat_count_t reserved;
-  mi_stat_count_t committed;
-  mi_stat_count_t reset;
-  mi_stat_count_t page_committed;
-  mi_stat_count_t segments_abandoned;
-  mi_stat_count_t pages_abandoned;
-  mi_stat_count_t threads;
-  mi_stat_count_t normal;
-  mi_stat_count_t huge;
-  mi_stat_count_t large;
-  mi_stat_count_t malloc;
-  mi_stat_count_t segments_cache;
-  mi_stat_counter_t pages_extended;
-  mi_stat_counter_t mmap_calls;
-  mi_stat_counter_t commit_calls;
-  mi_stat_counter_t page_no_retire;
-  mi_stat_counter_t searches;
-  mi_stat_counter_t normal_count;
-  mi_stat_counter_t huge_count;
-  mi_stat_counter_t large_count;
-#if MI_STAT>1
-  mi_stat_count_t normal_bins[MI_BIN_HUGE+1];
-#endif
-} mi_stats_t;
-
-
-void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
-void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
-void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
-
-#if (MI_STAT)
-#define mi_stat_increase(stat,amount)         _mi_stat_increase( &(stat), amount)
-#define mi_stat_decrease(stat,amount)         _mi_stat_decrease( &(stat), amount)
-#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount)
-#else
-#define mi_stat_increase(stat,amount)         (void)0
-#define mi_stat_decrease(stat,amount)         (void)0
-#define mi_stat_counter_increase(stat,amount) (void)0
-#endif
-
-#define mi_heap_stat_counter_increase(heap,stat,amount)  mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
-#define mi_heap_stat_increase(heap,stat,amount)  mi_stat_increase( (heap)->tld->stats.stat, amount)
-#define mi_heap_stat_decrease(heap,stat,amount)  mi_stat_decrease( (heap)->tld->stats.stat, amount)
-
-// ------------------------------------------------------
-// Thread Local data
-// ------------------------------------------------------
-
-// A "span" is is an available range of slices. The span queues keep
-// track of slice spans of at most the given `slice_count` (but more than the previous size class).
-typedef struct mi_span_queue_s {
-  mi_slice_t* first;
-  mi_slice_t* last;
-  size_t      slice_count;
-} mi_span_queue_t;
-
-#define MI_SEGMENT_BIN_MAX (35)     // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT)
-
-// OS thread local data
-typedef struct mi_os_tld_s {
-  size_t                region_idx;   // start point for next allocation
-  mi_stats_t*           stats;        // points to tld stats
-} mi_os_tld_t;
-
-
-// Segments thread local data
-typedef struct mi_segments_tld_s {
-  mi_span_queue_t     spans[MI_SEGMENT_BIN_MAX+1];  // free slice spans inside segments
-  size_t              count;        // current number of segments;
-  size_t              peak_count;   // peak number of segments
-  size_t              current_size; // current size of all segments
-  size_t              peak_size;    // peak size of all segments
-  mi_stats_t*         stats;        // points to tld stats
-  mi_os_tld_t*        os;           // points to os stats
-} mi_segments_tld_t;
-
-// Thread local data
-struct mi_tld_s {
-  unsigned long long  heartbeat;     // monotonic heartbeat count
-  bool                recurse;       // true if deferred was called; used to prevent infinite recursion.
-  mi_heap_t*          heap_backing;  // backing heap of this thread (cannot be deleted)
-  mi_heap_t*          heaps;         // list of heaps in this thread (so we can abandon all when the thread terminates)
-  mi_segments_tld_t   segments;      // segment tld
-  mi_os_tld_t         os;            // os tld
-  mi_stats_t          stats;         // statistics
-};
-
-#endif
diff --git a/ext/src/mimalloc/include/mimalloc.h b/ext/src/mimalloc/include/mimalloc.h
index 915d06b36b..f8fff1ad87 100644
--- a/ext/src/mimalloc/include/mimalloc.h
+++ b/ext/src/mimalloc/include/mimalloc.h
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2026, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #ifndef MIMALLOC_H
 #define MIMALLOC_H
 
-#define MI_MALLOC_VERSION 206   // major + 2 digits minor
+#define MI_MALLOC_VERSION 3208   // major + minor + 2 digits patch
 
 // ------------------------------------------------------
 // Compiler specific attributes
@@ -28,6 +28,8 @@ terms of the MIT license. A copy of the license can be found in the file
   #define mi_decl_nodiscard    [[nodiscard]]
 #elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__)  // includes clang, icc, and clang-cl
   #define mi_decl_nodiscard    __attribute__((warn_unused_result))
+#elif defined(_HAS_NODISCARD)
+  #define mi_decl_nodiscard    _NODISCARD
 #elif (_MSC_VER >= 1700)
   #define mi_decl_nodiscard    _Check_return_
 #else
@@ -115,7 +117,7 @@ mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strndup(const char* s
 mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc;
 
 // ------------------------------------------------------
-// Extended functionality
+// Extended allocation functions
 // ------------------------------------------------------
 #define MI_SMALL_WSIZE_MAX  (128)
 #define MI_SMALL_SIZE_MAX   (MI_SMALL_WSIZE_MAX*sizeof(void*))
@@ -132,8 +134,44 @@ mi_decl_nodiscard mi_decl_export size_t mi_usable_size(const void* p) mi_attr_no
 mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size)     mi_attr_noexcept;
 
 
+// -------------------------------------------------------------------------------------
+// Aligned allocation
+// Note that `alignment` always follows `size` for consistency with unaligned
+// allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`.
+// -------------------------------------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1, 2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1, 2);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);
+
+
+// ------------------------------------------------------
+// Typed allocation, the type is always the first parameter
+// ------------------------------------------------------
+
+#define mi_malloc_tp(tp)                 ((tp*)mi_malloc(sizeof(tp)))
+#define mi_zalloc_tp(tp)                 ((tp*)mi_zalloc(sizeof(tp)))
+#define mi_calloc_tp(tp,n)               ((tp*)mi_calloc(n,sizeof(tp)))
+#define mi_mallocn_tp(tp,n)              ((tp*)mi_mallocn(n,sizeof(tp)))
+#define mi_reallocn_tp(tp,p,n)           ((tp*)mi_reallocn(p,n,sizeof(tp)))
+#define mi_recalloc_tp(tp,p,n)           ((tp*)mi_recalloc(p,n,sizeof(tp)))
+
+#define mi_heap_malloc_tp(tp,hp)         ((tp*)mi_heap_malloc(hp,sizeof(tp)))
+#define mi_heap_zalloc_tp(tp,hp)         ((tp*)mi_heap_zalloc(hp,sizeof(tp)))
+#define mi_heap_calloc_tp(tp,hp,n)       ((tp*)mi_heap_calloc(hp,n,sizeof(tp)))
+#define mi_heap_mallocn_tp(tp,hp,n)      ((tp*)mi_heap_mallocn(hp,n,sizeof(tp)))
+#define mi_heap_reallocn_tp(tp,hp,p,n)   ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp)))
+#define mi_heap_recalloc_tp(tp,hp,p,n)   ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp)))
+
+
 // ------------------------------------------------------
 // Internals
+// See also `mimalloc-stats.h` for statistics
 // ------------------------------------------------------
 
 typedef void (mi_cdecl mi_deferred_free_fun)(bool force, unsigned long long heartbeat, void* arg);
@@ -145,64 +183,70 @@ mi_decl_export void mi_register_output(mi_output_fun* out, void* arg) mi_attr_no
 typedef void (mi_cdecl mi_error_fun)(int err, void* arg);
 mi_decl_export void mi_register_error(mi_error_fun* fun, void* arg);
 
-mi_decl_export void mi_collect(bool force)    mi_attr_noexcept;
-mi_decl_export int  mi_version(void)          mi_attr_noexcept;
-mi_decl_export void mi_stats_reset(void)      mi_attr_noexcept;
-mi_decl_export void mi_stats_merge(void)      mi_attr_noexcept;
-mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept;  // backward compatibility: `out` is ignored and should be NULL
-mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
-mi_decl_export size_t mi_stats_total_mem(void) mi_attr_noexcept;
+mi_decl_export void mi_collect(bool force)      mi_attr_noexcept;
+mi_decl_export int  mi_version(void)            mi_attr_noexcept;
+mi_decl_export void mi_options_print(void)      mi_attr_noexcept;
+mi_decl_export void mi_process_info_print(void) mi_attr_noexcept;
+mi_decl_export void mi_options_print_out(mi_output_fun* out, void* arg)      mi_attr_noexcept;
+mi_decl_export void mi_process_info_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs,
+                                    size_t* current_rss, size_t* peak_rss,
+                                    size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept;
+
 
+
+// Generally do not use the following as these are usually called automatically
 mi_decl_export void mi_process_init(void)     mi_attr_noexcept;
+mi_decl_export void mi_cdecl mi_process_done(void) mi_attr_noexcept;
 mi_decl_export void mi_thread_init(void)      mi_attr_noexcept;
 mi_decl_export void mi_thread_done(void)      mi_attr_noexcept;
-mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+mi_decl_export void mi_thread_set_in_threadpool(void) mi_attr_noexcept; // communicate that a thread is in a threadpool
 
-mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, 
-                                    size_t* current_rss, size_t* peak_rss, 
-                                    size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept;
 
-// -------------------------------------------------------------------------------------
-// Aligned allocation
-// Note that `alignment` always follows `size` for consistency with unaligned
-// allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`.
-// -------------------------------------------------------------------------------------
-#define MI_ALIGNMENT_MAX   (1024*1024UL)    // maximum supported alignment is 1MiB
+// -----------------------------------------------------------------
+// Return allocated block size (if the return value is not NULL)
+// -----------------------------------------------------------------
 
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3);
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
-mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
-mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_umalloc(size_t size, size_t* block_size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_ucalloc(size_t count, size_t size, size_t* block_size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_urealloc(void* p, size_t newsize, size_t* block_size_pre, size_t* block_size_post) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_export void mi_ufree(void* p, size_t* block_size) mi_attr_noexcept;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_umalloc_aligned(size_t size, size_t alignment, size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_uzalloc_aligned(size_t size, size_t alignment, size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_umalloc_small(size_t size, size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_uzalloc_small(size_t size, size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
 
 
 // -------------------------------------------------------------------------------------
-// Heaps: first-class, but can only allocate from the same thread that created it.
+// Heaps: first-class. Can allocate from any thread (and be free'd from any thread)
+// Heaps keep allocations in separate pages from each other (but share the arenas and free'd pages)
 // -------------------------------------------------------------------------------------
 
 struct mi_heap_s;
 typedef struct mi_heap_s mi_heap_t;
 
 mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new(void);
-mi_decl_export void       mi_heap_delete(mi_heap_t* heap);
-mi_decl_export void       mi_heap_destroy(mi_heap_t* heap);
-mi_decl_export mi_heap_t* mi_heap_set_default(mi_heap_t* heap);
-mi_decl_export mi_heap_t* mi_heap_get_default(void);
-mi_decl_export mi_heap_t* mi_heap_get_backing(void);
-mi_decl_export void       mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept;
-
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_export void mi_heap_delete(mi_heap_t* heap);            // move live blocks to the main heap
+mi_decl_export void mi_heap_destroy(mi_heap_t* heap);           // free all live blocks
+mi_decl_export void mi_heap_set_numa_affinity(mi_heap_t* heap, int numa_node);
+mi_decl_export void mi_heap_collect(mi_heap_t* heap, bool force);
+
+mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_main(void);
+mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_of(const void* p);
+mi_decl_nodiscard mi_decl_export bool       mi_heap_contains(const mi_heap_t* heap, const void* p);
+mi_decl_nodiscard mi_decl_export bool       mi_any_heap_contains(const void* p);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* theap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
 mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
-mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* theap, size_t size)   mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
 
 mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize)              mi_attr_noexcept mi_attr_alloc_size(3);
-mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(3,4);
-mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize)             mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(3, 4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* theap, void* p, size_t newsize)            mi_attr_noexcept mi_attr_alloc_size(3);
 
 mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s)            mi_attr_noexcept mi_attr_malloc;
 mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept mi_attr_malloc;
@@ -234,69 +278,150 @@ mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned(void* p, size_t newco
 mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(2,3);
 
 mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize)                mi_attr_noexcept mi_attr_alloc_size(3);
-mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(3,4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(3, 4);
 
 mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4);
 mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3);
-mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(3,4) mi_attr_alloc_align(5);
-mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3,4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(3, 4) mi_attr_alloc_align(5);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3, 4);
+
 
 
 // ------------------------------------------------------
-// Analysis
+// Visiting pages and individual blocks in a heap.
 // ------------------------------------------------------
 
-mi_decl_export bool mi_heap_contains_block(mi_heap_t* heap, const void* p);
-mi_decl_export bool mi_heap_check_owned(mi_heap_t* heap, const void* p);
-mi_decl_export bool mi_check_owned(const void* p);
-
 // An area of heap space contains blocks of a single size.
 typedef struct mi_heap_area_s {
-  void*  blocks;      // start of the area containing heap blocks
+  void*  blocks;      // start of the area containing theap blocks
   size_t reserved;    // bytes reserved for this area (virtual)
   size_t committed;   // current available bytes for this area
   size_t used;        // number of allocated blocks
   size_t block_size;  // size in bytes of each block
   size_t full_block_size; // size in bytes of a full block including padding and metadata.
+  void*  reserved1;   // internal
 } mi_heap_area_t;
 
 typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg);
 
-mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg);
+mi_decl_export bool   mi_heap_visit_blocks(mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);
+mi_decl_export bool   mi_heap_visit_abandoned_blocks(mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);
+
+
+// ------------------------------------------------------
+// Arena memory management
+// Arenas are larger memory areas provided by the OS or user
+// ------------------------------------------------------
 
-// Experimental
-mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
 mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept;
 
-mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
-mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
+mi_decl_export int    mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
+mi_decl_export int    mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
 
-mi_decl_export int  mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
-mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept;
+mi_decl_export int    mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
+mi_decl_export bool   mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept;
 
-mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept;
+mi_decl_export void   mi_debug_show_arenas(void) mi_attr_noexcept;
+mi_decl_export void   mi_arenas_print(void) mi_attr_noexcept;
+mi_decl_export size_t mi_arena_min_alignment(void);
+mi_decl_export size_t mi_arena_min_size(void);
 
-// deprecated
-mi_decl_export int  mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
+typedef void* mi_arena_id_t;
+mi_decl_export void*  mi_arena_area(mi_arena_id_t arena_id, size_t* size);
+mi_decl_export int    mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+mi_decl_export int    mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+mi_decl_export bool   mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+mi_decl_export bool   mi_arena_contains(mi_arena_id_t arena_id, const void* p);
+
+// Create a heap that only allocates in the specified arena
+mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id);
+
+
+// ------------------------------------------------------
+// Subprocesses
+// Advanced: allow sub-processes whose memory arenas stay fully separated (and no reclamation between them).
+// Used for example for separate interpreters in one process.
+// ------------------------------------------------------
+
+typedef void* mi_subproc_id_t;                        
+mi_decl_export mi_subproc_id_t mi_subproc_main(void);
+mi_decl_export mi_subproc_id_t mi_subproc_current(void);
+mi_decl_export mi_subproc_id_t mi_subproc_new(void);
+mi_decl_export void mi_subproc_destroy(mi_subproc_id_t subproc);
+mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet)
+
+typedef bool (mi_cdecl mi_heap_visit_fun)(mi_heap_t* heap, void* arg);
+mi_decl_export bool mi_subproc_visit_heaps(mi_subproc_id_t subproc, mi_heap_visit_fun* visitor, void* arg);
+
+
+// -------------------------------------------------------------------------------------
+// A "theap" is a thread-local heap. This API is only provided for special circumstances like runtimes
+// that already have a thread-local context and can store the theap there for (slightly) faster allocations.
+// This also allows setting a default theap for the current thread so that `malloc` etc. allocate from
+// that theap (instead of the main (t)heap).
+// Theaps are first-class, but each can only allocate from the same thread that created it.
+// Allocation through a `theap` may be a tiny bit faster than using plain malloc
+// (as we don't need to lookup the thread local variable).
+// -------------------------------------------------------------------------------------
+
+struct mi_theap_s;
+typedef struct mi_theap_s mi_theap_t;
+
+mi_decl_export mi_theap_t* mi_heap_theap(mi_heap_t* heap);
+mi_decl_export mi_theap_t* mi_theap_set_default(mi_theap_t* theap);
+mi_decl_export mi_theap_t* mi_theap_get_default(void);
+mi_decl_export void        mi_theap_collect(mi_theap_t* theap, bool force) mi_attr_noexcept;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_malloc(mi_theap_t* theap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_zalloc(mi_theap_t* theap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_calloc(mi_theap_t* theap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_malloc_small(mi_theap_t* theap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_malloc_aligned(mi_theap_t* theap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export                  void* mi_theap_realloc(mi_theap_t* theap, void* p, size_t newsize)              mi_attr_noexcept mi_attr_alloc_size(3);
 
 
 // ------------------------------------------------------
-// Convenience
+// Experimental
 // ------------------------------------------------------
 
-#define mi_malloc_tp(tp)                ((tp*)mi_malloc(sizeof(tp)))
-#define mi_zalloc_tp(tp)                ((tp*)mi_zalloc(sizeof(tp)))
-#define mi_calloc_tp(tp,n)              ((tp*)mi_calloc(n,sizeof(tp)))
-#define mi_mallocn_tp(tp,n)             ((tp*)mi_mallocn(n,sizeof(tp)))
-#define mi_reallocn_tp(p,tp,n)          ((tp*)mi_reallocn(p,n,sizeof(tp)))
-#define mi_recalloc_tp(p,tp,n)          ((tp*)mi_recalloc(p,n,sizeof(tp)))
+// Experimental: objects followed by a guard page.
+// Setting the sample rate on a specific theap can be used to test parts of the program more
+// specifically (in combination with `mi_theap_set_default`).
+// A sample rate of 0 disables guarded objects, while 1 uses a guard page for every object.
+// A seed of 0 uses a random start point. Only objects within the size bound are eligible for guard pages.
+mi_decl_export void mi_theap_guarded_set_sample_rate(mi_theap_t* theap, size_t sample_rate, size_t seed);
+mi_decl_export void mi_theap_guarded_set_size_bound(mi_theap_t* theap, size_t min, size_t max);
+
+// very experimental
+typedef bool (mi_cdecl mi_commit_fun_t)(bool commit, void* start, size_t size, bool* is_zero, void* user_arg);
+mi_decl_export bool  mi_manage_memory(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive,
+                                      mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id) mi_attr_noexcept;
+
+//mi_decl_export bool  mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size);
+//mi_decl_export bool  mi_arena_reload(void* start, size_t size, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id);
+//mi_decl_export bool  mi_theap_reload(mi_theap_t* theap, mi_arena_id_t arena);
+//mi_decl_export void  mi_theap_unload(mi_theap_t* theap);
+
+
+// ------------------------------------------------------
+// Deprecated
+// ------------------------------------------------------
+
+mi_decl_export bool mi_check_owned(const void* p);
+
+mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
+mi_decl_export bool mi_theap_visit_blocks(const mi_theap_t* theap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);
+
+mi_decl_export int  mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
+mi_decl_export void mi_collect_reduce(size_t target_thread_owned) mi_attr_noexcept;
+
+mi_decl_export void mi_stats_reset(void)      mi_attr_noexcept;
+mi_decl_export void mi_stats_merge(void)      mi_attr_noexcept;
+mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept;  // backward compatibility: `out` is ignored and should be NULL
+
+mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;  // not deprecated but declared in `mimalloc-stats.h` now.
 
-#define mi_heap_malloc_tp(hp,tp)        ((tp*)mi_heap_malloc(hp,sizeof(tp)))
-#define mi_heap_zalloc_tp(hp,tp)        ((tp*)mi_heap_zalloc(hp,sizeof(tp)))
-#define mi_heap_calloc_tp(hp,tp,n)      ((tp*)mi_heap_calloc(hp,n,sizeof(tp)))
-#define mi_heap_mallocn_tp(hp,tp,n)     ((tp*)mi_heap_mallocn(hp,n,sizeof(tp)))
-#define mi_heap_reallocn_tp(hp,p,tp,n)  ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp)))
-#define mi_heap_recalloc_tp(hp,p,tp,n)  ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp)))
 
 
 // ------------------------------------------------------
@@ -305,34 +430,60 @@ mi_decl_export int  mi_reserve_huge_os_pages(size_t pages, double max_secs, size
 
 typedef enum mi_option_e {
   // stable options
-  mi_option_show_errors,
-  mi_option_show_stats,
-  mi_option_verbose,
-  // some of the following options are experimental
-  // (deprecated options are kept for binary backward compatibility with v1.x versions)
-  mi_option_eager_commit,
-  mi_option_deprecated_eager_region_commit,
-  mi_option_deprecated_reset_decommits,
-  mi_option_large_os_pages,           // use large (2MiB) OS pages, implies eager commit
-  mi_option_reserve_huge_os_pages,    // reserve N huge OS pages (1GiB) at startup
-  mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node
-  mi_option_reserve_os_memory,        // reserve specified amount of OS memory at startup
+  mi_option_show_errors,                // print error messages
+  mi_option_show_stats,                 // print statistics on termination
+  mi_option_verbose,                    // print verbose messages
+  // advanced options
+  mi_option_deprecated_eager_commit,    
+  mi_option_arena_eager_commit,         // eager commit arenas? Use 2 to enable just on overcommit systems (=2)
+  mi_option_purge_decommits,            // should a memory purge decommit? (=1). Set to 0 to use memory reset on a purge (instead of decommit)
+  mi_option_allow_large_os_pages,       // allow use of large (2 or 4 MiB) OS pages, implies eager commit.
+  mi_option_reserve_huge_os_pages,      // reserve N huge OS pages (1GiB pages) at startup
+  mi_option_reserve_huge_os_pages_at,   // reserve huge OS pages at a specific NUMA node
+  mi_option_reserve_os_memory,          // reserve specified amount of OS memory in an arena at startup (internally, this value is in KiB; use `mi_option_get_size`)
   mi_option_deprecated_segment_cache,
-  mi_option_page_reset,
-  mi_option_abandoned_page_decommit,
+  mi_option_deprecated_page_reset,
+  mi_option_deprecated_abandoned_page_purge,
   mi_option_deprecated_segment_reset,
-  mi_option_eager_commit_delay,
-  mi_option_decommit_delay,
-  mi_option_use_numa_nodes,           // 0 = use available numa nodes, otherwise use at most N nodes.
-  mi_option_limit_os_alloc,           // 1 = do not use OS memory for allocation (but only reserved arenas)
-  mi_option_os_tag,
-  mi_option_max_errors,
-  mi_option_max_warnings,
-  mi_option_max_segment_reclaim,
-  mi_option_allow_decommit,
-  mi_option_segment_decommit_delay,  
-  mi_option_decommit_extend_delay,
-  _mi_option_last
+  mi_option_deprecated_eager_commit_delay, 
+  mi_option_purge_delay,                // memory purging is delayed by N milliseconds; use 0 for immediate purging or -1 for no purging at all. (=10)
+  mi_option_use_numa_nodes,             // 0 = use all available numa nodes, otherwise use at most N nodes.
+  mi_option_disallow_os_alloc,          // 1 = do not use OS memory for allocation (but only programmatically reserved arenas)
+  mi_option_os_tag,                     // tag used for OS logging (macOS only for now) (=100)
+  mi_option_max_errors,                 // issue at most N error messages
+  mi_option_max_warnings,               // issue at most N warning messages
+  mi_option_deprecated_max_segment_reclaim,  // max. percentage of the abandoned segments that can be reclaimed per try (=10%)
+  mi_option_destroy_on_exit,            // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe
+  mi_option_arena_reserve,              // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`)
+  mi_option_arena_purge_mult,           // multiplier for `purge_delay` for the purging delay for arenas (=10)
+  mi_option_deprecated_purge_extend_delay,
+  mi_option_disallow_arena_alloc,       // 1 = do not use arenas for allocation (except if using specific arena ids)
+  mi_option_retry_on_oom,               // retry on out-of-memory for N milliseconds (=400), set to 0 to disable retries. (only on Windows)
+  mi_option_visit_abandoned,            // allow visiting theap blocks from abandoned threads (=0)
+  mi_option_guarded_min,                // only used when building with MI_GUARDED: minimal rounded object size for guarded objects (=0)
+  mi_option_guarded_max,                // only used when building with MI_GUARDED: maximal rounded object size for guarded objects (=0)
+  mi_option_guarded_precise,            // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0)
+  mi_option_guarded_sample_rate,        // 1 out of N allocations in the min/max range will be guarded (=1000)
+  mi_option_guarded_sample_seed,        // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0)
+  mi_option_generic_collect,            // collect theaps every N (=10000) generic allocation calls
+  mi_option_page_reclaim_on_free,       // reclaim abandoned pages on a free (=0). -1 disallows always, 0 allows if the page originated from the current theap, 1 allows always
+  mi_option_page_full_retain,           // retain N full (small) pages per size class (=2)
+  mi_option_page_max_candidates,        // max candidate pages to consider for allocation (=4)
+  mi_option_max_vabits,                 // max user space virtual address bits to consider (=48)
+  mi_option_pagemap_commit,             // commit the full pagemap (to always catch invalid pointer uses) (=0)
+  mi_option_page_commit_on_demand,      // commit page memory on-demand
+  mi_option_page_max_reclaim,           // don't reclaim pages of the same originating theap if we already own N pages (in that size class) (=-1 (unlimited))
+  mi_option_page_cross_thread_max_reclaim, // don't reclaim pages across threads if we already own N pages (in that size class) (=16)
+  mi_option_allow_thp,                  // allow transparent huge pages? (=1) (on Android =0 by default). Set to 0 to disable THP for the process.
+  mi_option_minimal_purge_size,         // set minimal purge size (in KiB) (=0). By default set to either 64 or 2048 if THP is enabled.
+  mi_option_arena_max_object_size,      // set maximal object size that can be allocated in an arena (in KiB) (=2GiB on 64-bit). 
+  _mi_option_last,
+  // legacy option names
+  mi_option_large_os_pages = mi_option_allow_large_os_pages,
+  mi_option_eager_region_commit = mi_option_arena_eager_commit,
+  mi_option_reset_decommits = mi_option_purge_decommits,
+  mi_option_reset_delay = mi_option_purge_delay,
+  mi_option_limit_os_alloc = mi_option_disallow_os_alloc
 } mi_option_t;
 
 
@@ -342,8 +493,9 @@ mi_decl_export void mi_option_disable(mi_option_t option);
 mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable);
 mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable);
 
-mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option);
-mi_decl_nodiscard mi_decl_export long mi_option_get_clamp(mi_option_t option, long min, long max);
+mi_decl_nodiscard mi_decl_export long   mi_option_get(mi_option_t option);
+mi_decl_nodiscard mi_decl_export long   mi_option_get_clamp(mi_option_t option, long min, long max);
+mi_decl_nodiscard mi_decl_export size_t mi_option_get_size(mi_option_t option);
 mi_decl_export void mi_option_set(mi_option_t option, long value);
 mi_decl_export void mi_option_set_default(mi_option_t option, long value);
 
@@ -351,7 +503,7 @@ mi_decl_export void mi_option_set_default(mi_option_t option, long value);
 // -------------------------------------------------------------------------------------------------------
 // "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions.
 // (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.)
-// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our heap before free-ing.
+// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our theap before free-ing.
 // -------------------------------------------------------------------------------------------------------
 
 mi_decl_export void  mi_cfree(void* p) mi_attr_noexcept;
@@ -390,6 +542,9 @@ mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_n(size_t count, s
 mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize)                mi_attr_alloc_size(2);
 mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_alloc_size2(2, 3);
 
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size)                 mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3);
+
 #ifdef __cplusplus
 }
 #endif
@@ -407,7 +562,7 @@ mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount,
 #include <utility>     // std::forward
 #endif
 
-template<class T> struct mi_stl_allocator {
+template<class T> struct _mi_stl_allocator_common {
   typedef T                 value_type;
   typedef std::size_t       size_type;
   typedef std::ptrdiff_t    difference_type;
@@ -415,6 +570,27 @@ template<class T> struct mi_stl_allocator {
   typedef value_type const& const_reference;
   typedef value_type*       pointer;
   typedef value_type const* const_pointer;
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using propagate_on_container_copy_assignment = std::true_type;
+  using propagate_on_container_move_assignment = std::true_type;
+  using propagate_on_container_swap            = std::true_type;
+  template <class U, class ...Args> void construct(U* p, Args&& ...args) { ::new(p) U(std::forward<Args>(args)...); }
+  template <class U> void destroy(U* p) mi_attr_noexcept { p->~U(); }
+  #else
+  void construct(pointer p, value_type const& val) { ::new(p) value_type(val); }
+  void destroy(pointer p) { p->~value_type(); }
+  #endif
+
+  size_type     max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); }
+  pointer       address(reference x) const        { return &x; }
+  const_pointer address(const_reference x) const  { return &x; }
+};
+
+template<class T> struct mi_stl_allocator : public _mi_stl_allocator_common<T> {
+  using typename _mi_stl_allocator_common<T>::size_type;
+  using typename _mi_stl_allocator_common<T>::value_type;
+  using typename _mi_stl_allocator_common<T>::pointer;
   template <class U> struct rebind { typedef mi_stl_allocator<U> other; };
 
   mi_stl_allocator()                                             mi_attr_noexcept = default;
@@ -431,24 +607,91 @@ template<class T> struct mi_stl_allocator {
   #endif
 
   #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
-  using propagate_on_container_copy_assignment = std::true_type;
-  using propagate_on_container_move_assignment = std::true_type;
-  using propagate_on_container_swap            = std::true_type;
-  using is_always_equal                        = std::true_type;
-  template <class U, class ...Args> void construct(U* p, Args&& ...args) { ::new(p) U(std::forward<Args>(args)...); }
-  template <class U> void destroy(U* p) mi_attr_noexcept { p->~U(); }
-  #else
-  void construct(pointer p, value_type const& val) { ::new(p) value_type(val); }
-  void destroy(pointer p) { p->~value_type(); }
+  using is_always_equal = std::true_type;
   #endif
-
-  size_type     max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); }
-  pointer       address(reference x) const        { return &x; }
-  const_pointer address(const_reference x) const  { return &x; }
 };
 
 template<class T1,class T2> bool operator==(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return true; }
 template<class T1,class T2> bool operator!=(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return false; }
+
+
+#if (__cplusplus >= 201103L) || (_MSC_VER >= 1900)  // C++11
+#define MI_HAS_HEAP_STL_ALLOCATOR 1
+
+#include <memory>      // std::shared_ptr
+
+// Common base class for STL allocators in a specific theap
+template<class T, bool _mi_destroy> struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common<T> {
+  using typename _mi_stl_allocator_common<T>::size_type;
+  using typename _mi_stl_allocator_common<T>::value_type;
+  using typename _mi_stl_allocator_common<T>::pointer;
+
+  _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp, [](mi_heap_t*) {}) {}    /* will not delete nor destroy the passed in heap */
+
+  #if (__cplusplus >= 201703L)  // C++17
+  mi_decl_nodiscard T* allocate(size_type count) { return static_cast<T*>(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(T))); }
+  mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); }
+  #else
+  mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(value_type))); }
+  #endif
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using is_always_equal = std::false_type;
+  #endif
+
+  void collect(bool force) { mi_heap_collect(this->heap.get(), force); }
+  template<class U> bool is_equal(const _mi_heap_stl_allocator_common<U, _mi_destroy>& x) const { return (this->heap == x.heap); }
+
+protected:
+  std::shared_ptr<mi_heap_t> heap;
+  template<class U, bool D> friend struct _mi_heap_stl_allocator_common;
+
+  _mi_heap_stl_allocator_common() {
+    mi_heap_t* hp = mi_heap_new();
+    this->heap.reset(hp, (_mi_destroy ? &heap_destroy : &heap_delete));  /* calls heap_delete/destroy when the refcount drops to zero */
+  }
+  _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common& x) mi_attr_noexcept : heap(x.heap) { }
+  template<class U> _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common<U, _mi_destroy>& x) mi_attr_noexcept : heap(x.heap) { }
+
+private:
+  static void heap_delete(mi_heap_t* hp)  { if (hp != NULL) { mi_heap_delete(hp); } }
+  static void heap_destroy(mi_heap_t* hp) { if (hp != NULL) { mi_heap_destroy(hp); } }
+};
+
+// STL allocator allocation in a specific heap
+template<class T> struct mi_heap_stl_allocator : public _mi_heap_stl_allocator_common<T, false> {
+  using typename _mi_heap_stl_allocator_common<T, false>::size_type;
+  mi_heap_stl_allocator() : _mi_heap_stl_allocator_common<T, false>() { } // creates fresh heap that is deleted when the destructor is called
+  mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, false>(hp) { }  // no delete nor destroy on the passed in heap
+  template<class U> mi_heap_stl_allocator(const mi_heap_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, false>(x) { }
+
+  mi_heap_stl_allocator select_on_container_copy_construction() const { return *this; }
+  void deallocate(T* p, size_type) { mi_free(p); }
+  template<class U> struct rebind { typedef mi_heap_stl_allocator<U> other; };
+};
+
+template<class T1, class T2> bool operator==(const mi_heap_stl_allocator<T1>& x, const mi_heap_stl_allocator<T2>& y) mi_attr_noexcept { return (x.is_equal(y)); }
+template<class T1, class T2> bool operator!=(const mi_heap_stl_allocator<T1>& x, const mi_heap_stl_allocator<T2>& y) mi_attr_noexcept { return (!x.is_equal(y)); }
+
+
+// STL allocator allocation in a specific heap, where `free` does nothing and
+// the heap is destroyed in one go on destruction -- use with care!
+template<class T> struct mi_heap_destroy_stl_allocator : public _mi_heap_stl_allocator_common<T, true> {
+  using typename _mi_heap_stl_allocator_common<T, true>::size_type;
+  mi_heap_destroy_stl_allocator() : _mi_heap_stl_allocator_common<T, true>() { } // creates fresh heap that is destroyed when the destructor is called
+  mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, true>(hp) { }  // no delete nor destroy on the passed in heap
+  template<class U> mi_heap_destroy_stl_allocator(const mi_heap_destroy_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, true>(x) { }
+
+  mi_heap_destroy_stl_allocator select_on_container_copy_construction() const { return *this; }
+  void deallocate(T*, size_type) { /* do nothing as we destroy the heap on destruct. */ }
+  template<class U> struct rebind { typedef mi_heap_destroy_stl_allocator<U> other; };
+};
+
+template<class T1, class T2> bool operator==(const mi_heap_destroy_stl_allocator<T1>& x, const mi_heap_destroy_stl_allocator<T2>& y) mi_attr_noexcept { return (x.is_equal(y)); }
+template<class T1, class T2> bool operator!=(const mi_heap_destroy_stl_allocator<T1>& x, const mi_heap_destroy_stl_allocator<T2>& y) mi_attr_noexcept { return (!x.is_equal(y)); }
+
+#endif // C++11
+
 #endif // __cplusplus
 
 #endif
diff --git a/ext/src/mimalloc/include/mimalloc-atomic.h b/ext/src/mimalloc/include/mimalloc/atomic.h
similarity index 58%
rename from ext/src/mimalloc/include/mimalloc-atomic.h
rename to ext/src/mimalloc/include/mimalloc/atomic.h
index 7ad5da5851..699c78b458 100644
--- a/ext/src/mimalloc/include/mimalloc-atomic.h
+++ b/ext/src/mimalloc/include/mimalloc/atomic.h
@@ -1,45 +1,64 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021 Microsoft Research, Daan Leijen
+Copyright (c) 2018-2024 Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 #pragma once
-#ifndef MIMALLOC_ATOMIC_H
-#define MIMALLOC_ATOMIC_H
+#ifndef MI_ATOMIC_H
+#define MI_ATOMIC_H
+
+// include windows.h or pthreads.h
+#if defined(_WIN32)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#elif !defined(__wasi__) && (!defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__))
+#define  MI_USE_PTHREADS
+#include <pthread.h>
+#endif
 
 // --------------------------------------------------------------------------------------------
 // Atomics
 // We need to be portable between C, C++, and MSVC.
-// We base the primitives on the C/C++ atomics and create a mimimal wrapper for MSVC in C compilation mode. 
-// This is why we try to use only `uintptr_t` and `<type>*` as atomic types. 
-// To gain better insight in the range of used atomics, we use explicitly named memory order operations 
+// We base the primitives on the C/C++ atomics and create a minimal wrapper for MSVC in C compilation mode.
+// This is why we try to use only `uintptr_t` and `<type>*` as atomic types.
+// To gain better insight in the range of used atomics, we use explicitly named memory order operations
 // instead of passing the memory order as a parameter.
 // -----------------------------------------------------------------------------------------------
 
 #if defined(__cplusplus)
 // Use C++ atomics
 #include <atomic>
-#define  _Atomic(tp)            std::atomic<tp>
-#define  mi_atomic(name)        std::atomic_##name
-#define  mi_memory_order(name)  std::memory_order_##name
-#if !defined(ATOMIC_VAR_INIT) || (__cplusplus >= 202002L) // c++20, see issue #571
- #define MI_ATOMIC_VAR_INIT(x)  x
+#define  _Atomic(tp)              std::atomic<tp>
+#define  mi_atomic(name)          std::atomic_##name
+#define  mi_memory_order(name)    std::memory_order_##name
+#if (__cplusplus >= 202002L)      // c++20, see issue #571
+ #define MI_ATOMIC_VAR_INIT(x)    x
+#elif !defined(ATOMIC_VAR_INIT)
+ #define MI_ATOMIC_VAR_INIT(x)    x
 #else
- #define MI_ATOMIC_VAR_INIT(x)  ATOMIC_VAR_INIT(x)
+ #define MI_ATOMIC_VAR_INIT(x)    ATOMIC_VAR_INIT(x)
 #endif
 #elif defined(_MSC_VER)
 // Use MSVC C wrapper for C11 atomics
-#define  _Atomic(tp)            tp 
-#define  MI_ATOMIC_VAR_INIT(x)  x
-#define  mi_atomic(name)        mi_atomic_##name
-#define  mi_memory_order(name)  mi_memory_order_##name
+#define  _Atomic(tp)              tp
+#define  MI_ATOMIC_VAR_INIT(x)    x
+#define  mi_atomic(name)          mi_atomic_##name
+#define  mi_memory_order(name)    mi_memory_order_##name
 #else
 // Use C11 atomics
 #include <stdatomic.h>
-#define  mi_atomic(name)        atomic_##name
-#define  mi_memory_order(name)  memory_order_##name
-#define  MI_ATOMIC_VAR_INIT(x)  ATOMIC_VAR_INIT(x)
+#define  mi_atomic(name)          atomic_##name
+#define  mi_memory_order(name)    memory_order_##name
+#if (__STDC_VERSION__ >= 201710L) // c17, see issue #735
+ #define MI_ATOMIC_VAR_INIT(x)    x
+#elif !defined(ATOMIC_VAR_INIT)
+ #define MI_ATOMIC_VAR_INIT(x)    x
+#else
+ #define MI_ATOMIC_VAR_INIT(x)    ATOMIC_VAR_INIT(x)
+#endif
 #endif
 
 // Various defines for all used memory orders in mimalloc
@@ -53,18 +72,24 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_atomic_load_relaxed(p)                mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
 #define mi_atomic_store_release(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(release))
 #define mi_atomic_store_relaxed(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_exchange_relaxed(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_exchange_release(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
 #define mi_atomic_exchange_acq_rel(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
+
+#define mi_atomic_cas_weak_relaxed(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
 #define mi_atomic_cas_weak_release(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
 #define mi_atomic_cas_weak_acq_rel(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+#define mi_atomic_cas_strong_relaxed(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
 #define mi_atomic_cas_strong_release(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
 #define mi_atomic_cas_strong_acq_rel(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
 
 #define mi_atomic_add_relaxed(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
-#define mi_atomic_sub_relaxed(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_add_acq_rel(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_sub_relaxed(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_sub_acq_rel(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_and_relaxed(p,x)               mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_and_acq_rel(p,x)               mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_or_relaxed(p,x)                mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_or_acq_rel(p,x)                mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
 
 #define mi_atomic_increment_relaxed(p)           mi_atomic_add_relaxed(p,(uintptr_t)1)
@@ -91,6 +116,8 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
 #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,(tp*)des)
 #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
 #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,(tp*)des)
+#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des)  mi_atomic_cas_strong_acq_rel(p,exp,(tp*)des)
+#define mi_atomic_exchange_ptr_relaxed(tp,p,x)          mi_atomic_exchange_relaxed(p,(tp*)x)
 #define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,(tp*)x)
 #define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,(tp*)x)
 #else
@@ -99,6 +126,8 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
 #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,des)
 #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,des)
 #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,des)
+#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des)  mi_atomic_cas_strong_acq_rel(p,exp,des)
+#define mi_atomic_exchange_ptr_relaxed(tp,p,x)          mi_atomic_exchange_relaxed(p,x)
 #define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,x)
 #define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,x)
 #endif
@@ -107,24 +136,30 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
 static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
   return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
 }
+static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) {  // add `*padd` to `*p` with relaxed ordering; the previous value is discarded
+  const int64_t add = mi_atomic_load_relaxed((_Atomic(int64_t)*)padd);
+  if (add != 0) {  // skip the atomic read-modify-write when there is nothing to add
+    mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
+  }
+}
 static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
   int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
   while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
 }
 
 // Used by timers
-#define mi_atomic_loadi64_acquire(p)    mi_atomic(load_explicit)(p,mi_memory_order(acquire))
-#define mi_atomic_loadi64_relaxed(p)    mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
-#define mi_atomic_storei64_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
-#define mi_atomic_storei64_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_loadi64_acquire(p)            mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_loadi64_relaxed(p)            mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_storei64_release(p,x)         mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_storei64_relaxed(p,x)         mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
 
+#define mi_atomic_casi64_strong_acq_rel(p,e,d)  mi_atomic_cas_strong_acq_rel(p,e,d)
+#define mi_atomic_addi64_acq_rel(p,i)           mi_atomic_add_acq_rel(p,i)
 
 
 #elif defined(_MSC_VER)
 
-// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics.
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
+// Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics.
 #include <intrin.h>
 #ifdef _WIN64
 typedef LONG64   msc_intptr_t;
@@ -189,7 +224,7 @@ static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_
 #else
   uintptr_t x = *p;
   if (mo > mi_memory_order_relaxed) {
-    while (!mi_atomic_compare_exchange_weak_explicit(p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
+    while (!mi_atomic_compare_exchange_weak_explicit((_Atomic(uintptr_t)*)p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
   }
   return x;
 #endif
@@ -238,6 +273,14 @@ static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int6
   return current;
 #endif
 }
+
+static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) {  // add `*padd` to `*p`; the previous value is discarded
+  const int64_t add = *padd;  // plain volatile read of the addend
+  if (add != 0) {  // skip the add when there is nothing to add
+    mi_atomic_addi64_relaxed((volatile _Atomic(int64_t)*)p, add);
+  }
+}
+
 static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) {
   int64_t current;
   do {
@@ -245,6 +288,21 @@ static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t
   } while (current < x && _InterlockedCompareExchange64(p, x, current) != current);
 }
 
+static inline void mi_atomic_addi64_acq_rel(volatile _Atomic(int64_t)*p, int64_t i) {  // add `i` to `*p`; the previous value is discarded
+  mi_atomic_addi64_relaxed(p, i);  // NOTE(review): assumes the underlying add is at least acq_rel in this MSVC wrapper -- confirm
+}
+
+// Strong 64-bit CAS: stores `des` and returns true if `*p == *exp`; otherwise writes the observed value to `*exp` and returns false.
+static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t)*p, int64_t* exp, int64_t des) {
+  const int64_t expected = *exp;
+  const int64_t read = _InterlockedCompareExchange64(p, des, expected);
+  if (read == expected) {
+    return true;
+  }
+  *exp = read;  // report the value actually observed back to the caller
+  return false;
+}
+
 // The pointer macros cast to `uintptr_t`.
 #define mi_atomic_load_ptr_acquire(tp,p)                (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p))
 #define mi_atomic_load_ptr_relaxed(tp,p)                (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p))
@@ -253,6 +311,8 @@ static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t
 #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
 #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
 #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des)  mi_atomic_cas_strong_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_exchange_ptr_relaxed(tp,p,x)          (tp*)mi_atomic_exchange_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
 #define mi_atomic_exchange_ptr_release(tp,p,x)          (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
 #define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
 
@@ -275,15 +335,41 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) {
   return (intptr_t)mi_atomic_addi(p, -sub);
 }
 
-// Yield 
+
+// ----------------------------------------------------------------------
+// Once and Guard
+// ----------------------------------------------------------------------
+
+typedef _Atomic(uintptr_t) mi_atomic_once_t;
+
+// Returns true only on the first invocation, i.e. for the single caller that changes `*once` from 0 to 1
+static inline bool mi_atomic_once( mi_atomic_once_t* once ) {
+  if (mi_atomic_load_relaxed(once) != 0) return false;     // quick test
+  uintptr_t expected = 0;
+  return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1
+}
+
+typedef _Atomic(uintptr_t) mi_atomic_guard_t;
+
+// Allows only one thread to execute the guarded statement at a time; a thread that finds the guard taken skips the statement (it does not wait)
+#define mi_atomic_guard(guard) \
+  uintptr_t _mi_guard_expected = 0; \
+  for(bool _mi_guard_once = true; \
+      _mi_guard_once && mi_atomic_cas_strong_acq_rel(guard,&_mi_guard_expected,(uintptr_t)1); \
+      (mi_atomic_store_release(guard,(uintptr_t)0), _mi_guard_once = false) )
+
+
+
+// ----------------------------------------------------------------------
+// Yield
+// ----------------------------------------------------------------------
+
 #if defined(__cplusplus)
 #include <thread>
 static inline void mi_atomic_yield(void) {
   std::this_thread::yield();
 }
 #elif defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
 static inline void mi_atomic_yield(void) {
   YieldProcessor();
 }
@@ -293,8 +379,9 @@ static inline void mi_atomic_yield(void) {
   _mm_pause();
 }
 #elif (defined(__GNUC__) || defined(__clang__)) && \
-      (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \
-       defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))
+      (defined(__x86_64__) || defined(__i386__) || \
+       defined(__aarch64__) || defined(__arm__) || \
+       defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__))
 #if defined(__x86_64__) || defined(__i386__)
 static inline void mi_atomic_yield(void) {
   __asm__ volatile ("pause" ::: "memory");
@@ -303,19 +390,27 @@ static inline void mi_atomic_yield(void) {
 static inline void mi_atomic_yield(void) {
   __asm__ volatile("wfe");
 }
-#elif (defined(__arm__) && __ARM_ARCH__ >= 7)
+#elif defined(__arm__)
+#if __ARM_ARCH >= 7
 static inline void mi_atomic_yield(void) {
   __asm__ volatile("yield" ::: "memory");
 }
-#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
+#else
 static inline void mi_atomic_yield(void) {
-  __asm__ __volatile__ ("or 27,27,27" ::: "memory");
+  __asm__ volatile ("nop" ::: "memory");
 }
-#elif defined(__armel__) || defined(__ARMEL__)
+#endif
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__)
+#ifdef __APPLE__
 static inline void mi_atomic_yield(void) {
-  __asm__ volatile ("nop" ::: "memory");
+  __asm__ volatile ("or r27,r27,r27" ::: "memory");
+}
+#else
+static inline void mi_atomic_yield(void) {
+  __asm__ __volatile__ ("or 27,27,27" ::: "memory");
 }
 #endif
+#endif
 #elif defined(__sun)
 // Fallback for other archs
 #include <synch.h>
@@ -335,4 +430,137 @@ static inline void mi_atomic_yield(void) {
 #endif
 
 
-#endif // __MIMALLOC_ATOMIC_H
+// ----------------------------------------------------------------------
+// Locks
+// These should be light-weight in-process only locks.
+// Only used for reserving arenas and to maintain the abandoned list.
+// ----------------------------------------------------------------------
+#if defined(_MSC_VER)
+#pragma warning(disable:26110)  // code analysis: caller failing to hold the lock before releasing it
+#endif
+
+#define mi_lock(lock)    for(bool _go = (mi_lock_acquire(lock),true); _go; (mi_lock_release(lock), _go=false) )  // runs the following statement once while holding `lock`; leaving it via `break`/`return` skips the release
+
+#if defined(_WIN32)
+
+#if 1
+#define mi_lock_t  SRWLOCK   // slim reader-writer lock
+
+static inline bool mi_lock_try_acquire(mi_lock_t* lock) {  // non-blocking attempt; true if the lock was taken
+  return TryAcquireSRWLockExclusive(lock);
+}
+static inline void mi_lock_acquire(mi_lock_t* lock) {  // takes the lock in exclusive mode
+  AcquireSRWLockExclusive(lock);
+}
+static inline void mi_lock_release(mi_lock_t* lock) {  // releases an exclusive hold
+  ReleaseSRWLockExclusive(lock);
+}
+static inline void mi_lock_init(mi_lock_t* lock) {
+  InitializeSRWLock(lock);
+}
+static inline void mi_lock_done(mi_lock_t* lock) {  // no teardown is performed for this lock type
+  (void)(lock);
+}
+
+#else
+#define mi_lock_t  CRITICAL_SECTION
+
+static inline bool mi_lock_try_acquire(mi_lock_t* lock) {
+  return TryEnterCriticalSection(lock);
+}
+static inline void mi_lock_acquire(mi_lock_t* lock) {
+  EnterCriticalSection(lock);
+}
+static inline void mi_lock_release(mi_lock_t* lock) {
+  LeaveCriticalSection(lock);
+}
+static inline void mi_lock_init(mi_lock_t* lock) {
+  InitializeCriticalSection(lock);
+}
+static inline void mi_lock_done(mi_lock_t* lock) {
+  DeleteCriticalSection(lock);
+}
+
+#endif
+
+#elif defined(MI_USE_PTHREADS)
+
+#include <string.h> // memcpy
+void _mi_error_message(int err, const char* fmt, ...);
+
+#define mi_lock_t  pthread_mutex_t
+
+static inline bool mi_lock_try_acquire(mi_lock_t* lock) {  // non-blocking attempt; true if the mutex was taken
+  return (pthread_mutex_trylock(lock) == 0);
+}
+static inline void mi_lock_acquire(mi_lock_t* lock) {
+  const int err = pthread_mutex_lock(lock);
+  if (err != 0) {  // a failure is only reported through `_mi_error_message`; there is no retry
+    _mi_error_message(err, "internal error: lock cannot be acquired (err %i)\n", err);
+  }
+}
+static inline void mi_lock_release(mi_lock_t* lock) {
+  pthread_mutex_unlock(lock);  // the return code is ignored
+}
+static inline void mi_lock_init(mi_lock_t* lock) {
+  if(lock==NULL) return;
+  // copy a statically initialized mutex instead of calling pthread_mutex_init, since that can cause allocation on some platforms (and recursively initialize)
+  const mi_lock_t temp_lock = PTHREAD_MUTEX_INITIALIZER;  
+  memcpy(lock,&temp_lock,sizeof(temp_lock));
+}
+static inline void mi_lock_done(mi_lock_t* lock) {
+  pthread_mutex_destroy(lock);
+}
+
+#elif defined(__cplusplus)
+
+#include <mutex>
+#define mi_lock_t  std::mutex
+
+static inline bool mi_lock_try_acquire(mi_lock_t* lock) {
+  return lock->try_lock();
+}
+static inline void mi_lock_acquire(mi_lock_t* lock) {
+  lock->lock();
+}
+static inline void mi_lock_release(mi_lock_t* lock) {
+  lock->unlock();
+}
+static inline void mi_lock_init(mi_lock_t* lock) {
+  (void)(lock);
+}
+static inline void mi_lock_done(mi_lock_t* lock) {
+  (void)(lock);
+}
+
+#else
+
+// fall back to poor man's locks.
+// this should only be the case in a single-threaded environment (like __wasi__)
+
+#define mi_lock_t  _Atomic(uintptr_t)
+
+static inline bool mi_lock_try_acquire(mi_lock_t* lock) {  // the lock word is 0 when free and 1 when held
+  uintptr_t expected = 0;
+  return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1);
+}
+static inline void mi_lock_acquire(mi_lock_t* lock) {
+  for (int i = 0; i < 1000; i++) {  // for at most 1000 tries?
+    if (mi_lock_try_acquire(lock)) return;
+    mi_atomic_yield();
+  }  // NOTE: after 1000 failed tries this falls through and returns WITHOUT holding the lock
+}
+static inline void mi_lock_release(mi_lock_t* lock) {
+  mi_atomic_store_release(lock, (uintptr_t)0);
+}
+static inline void mi_lock_init(mi_lock_t* lock) {
+  mi_lock_release(lock);  // initializing is the same as storing 0 (free)
+}
+static inline void mi_lock_done(mi_lock_t* lock) {
+  (void)(lock);
+}
+
+#endif
+
+
+#endif // MI_ATOMIC_H
diff --git a/ext/src/mimalloc/include/mimalloc/bits.h b/ext/src/mimalloc/include/mimalloc/bits.h
new file mode 100644
index 0000000000..c40b32f6b0
--- /dev/null
+++ b/ext/src/mimalloc/include/mimalloc/bits.h
@@ -0,0 +1,342 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+  Bit operation, and platform dependent definition (MI_INTPTR_SIZE etc)
+---------------------------------------------------------------------------- */
+
+#pragma once
+#ifndef MI_BITS_H
+#define MI_BITS_H
+
+#include <stddef.h>   // size_t
+#include <stdint.h>   // int64_t etc
+#include <stdbool.h>  // bool
+
+// ------------------------------------------------------
+// Size of a pointer.
+// We assume that `sizeof(void*)==sizeof(intptr_t)`
+// and it holds for all platforms we know of.
+//
+// However, the C standard only requires that:
+//  p == (void*)((intptr_t)p)
+// but we also need:
+//  i == (intptr_t)((void*)i)
+// or otherwise one might define an intptr_t type that is larger than a pointer...
+// ------------------------------------------------------
+
+#if INTPTR_MAX > INT64_MAX
+# define MI_INTPTR_SHIFT (4)  // assume 128-bit  (as on arm CHERI for example)
+#elif INTPTR_MAX == INT64_MAX
+# define MI_INTPTR_SHIFT (3)
+#elif INTPTR_MAX == INT32_MAX
+# define MI_INTPTR_SHIFT (2)
+#else
+#error platform pointers must be 32, 64, or 128 bits
+#endif
+
+#if (INTPTR_MAX) > LONG_MAX
+# define MI_PU(x)  x##ULL
+#else
+# define MI_PU(x)  x##UL
+#endif
+
+#if SIZE_MAX == UINT64_MAX
+# define MI_SIZE_SHIFT (3)
+typedef int64_t  mi_ssize_t;
+#elif SIZE_MAX == UINT32_MAX
+# define MI_SIZE_SHIFT (2)
+typedef int32_t  mi_ssize_t;
+#else
+#error platform objects must be 32 or 64 bits in size
+#endif
+
+#if (SIZE_MAX/2) > LONG_MAX
+# define MI_ZU(x)  x##ULL
+#else
+# define MI_ZU(x)  x##UL
+#endif
+
+#define MI_INTPTR_SIZE  (1<<MI_INTPTR_SHIFT)
+#define MI_INTPTR_BITS  (MI_INTPTR_SIZE*8)
+
+#define MI_SIZE_SIZE  (1<<MI_SIZE_SHIFT)
+#define MI_SIZE_BITS  (MI_SIZE_SIZE*8)
+
+#define MI_KiB     (MI_ZU(1024))
+#define MI_MiB     (MI_KiB*MI_KiB)
+#define MI_GiB     (MI_MiB*MI_KiB)
+
+
+/* --------------------------------------------------------------------------------
+  Architecture
+-------------------------------------------------------------------------------- */
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)  // consider arm64ec as arm64
+#define MI_ARCH_ARM64     1
+#elif defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+#define MI_ARCH_X64       1
+#elif defined(__i386__) || defined(__i386) || defined(_M_IX86) || defined(_X86_) || defined(__X86__)
+#define MI_ARCH_X86       1
+#elif defined(__arm__) || defined(_ARM) || defined(_M_ARM)  || defined(_M_ARMT) || defined(__arm)
+#define MI_ARCH_ARM32     1
+#elif defined(__riscv) || defined(_M_RISCV)
+#define MI_ARCH_RISCV     1
+#if (LONG_MAX == INT32_MAX)
+#define MI_ARCH_RISCV32   1
+#else
+#define MI_ARCH_RISCV64   1
+#endif
+#endif
+
+#if MI_ARCH_X64 && defined(__AVX2__)
+#include <immintrin.h>
+#elif MI_ARCH_ARM64 && MI_OPT_SIMD
+#include <arm_neon.h>
+#endif
+#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+#include <intrin.h>
+#endif
+
+#if MI_ARCH_X64 && defined(__AVX2__) && !defined(__BMI2__) // msvc
+#define __BMI2__  1
+#endif
+#if MI_ARCH_X64 && (defined(__AVX2__) || defined(__BMI2__)) && !defined(__BMI1__) // msvc
+#define __BMI1__  1
+#endif
+
+// Define big endian if needed
+// #define MI_BIG_ENDIAN  1
+
+// maximum virtual address bits in a user-space pointer
+#if MI_DEFAULT_VIRTUAL_ADDRESS_BITS > 0 
+#define MI_MAX_VABITS     MI_DEFAULT_VIRTUAL_ADDRESS_BITS
+#elif   MI_ARCH_X64
+#define MI_MAX_VABITS     (47)
+#elif MI_INTPTR_SIZE > 4
+#define MI_MAX_VABITS     (48)
+#else
+#define MI_MAX_VABITS     (32)
+#endif
+
+// use a flat page-map (or a 2-level one)
+#ifndef MI_PAGE_MAP_FLAT
+#if MI_MAX_VABITS <= 40 && !defined(__APPLE__) 
+#define MI_PAGE_MAP_FLAT  1
+#else
+#define MI_PAGE_MAP_FLAT  0
+#endif
+#endif
+
+
+/* --------------------------------------------------------------------------------
+  Builtins
+-------------------------------------------------------------------------------- */
+
+#ifndef __has_builtin
+#define __has_builtin(x)  0
+#endif
+
+#define mi_builtin(name)        __builtin_##name
+#define mi_has_builtin(name)    __has_builtin(__builtin_##name)
+
+#if (LONG_MAX == INT32_MAX)
+#define mi_builtin32(name)       mi_builtin(name##l)
+#define mi_has_builtin32(name)   mi_has_builtin(name##l)
+#else
+#define mi_builtin32(name)       mi_builtin(name)
+#define mi_has_builtin32(name)   mi_has_builtin(name)
+#endif
+#if (LONG_MAX == INT64_MAX)
+#define mi_builtin64(name)       mi_builtin(name##l)
+#define mi_has_builtin64(name)   mi_has_builtin(name##l)
+#else
+#define mi_builtin64(name)       mi_builtin(name##ll)
+#define mi_has_builtin64(name)   mi_has_builtin(name##ll)
+#endif
+
+#if (MI_SIZE_BITS == 32)
+#define mi_builtinz(name)        mi_builtin32(name)
+#define mi_has_builtinz(name)    mi_has_builtin32(name)
+#define mi_msc_builtinz(name)    name
+#elif (MI_SIZE_BITS == 64)
+#define mi_builtinz(name)        mi_builtin64(name)
+#define mi_has_builtinz(name)    mi_has_builtin64(name)
+#define mi_msc_builtinz(name)    name##64
+#endif
+
+/* --------------------------------------------------------------------------------
+  Popcount and count trailing/leading zeros
+-------------------------------------------------------------------------------- */
+
+size_t _mi_popcount_generic(size_t x);
+
+static inline size_t mi_popcount(size_t x) {  // number of set bits in `x`
+  #if mi_has_builtinz(popcount)
+    return mi_builtinz(popcount)(x);
+  #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+    return mi_msc_builtinz(__popcnt)(x);
+  #elif MI_ARCH_X64 && defined(__BMI1__)
+    return (size_t)_mm_popcnt_u64(x);
+  #else
+    #define MI_HAS_FAST_POPCOUNT  0
+    return (x<=1 ? x : _mi_popcount_generic(x));  // 0 and 1 are their own popcount; otherwise use the generic routine
+  #endif
+}
+
+#ifndef MI_HAS_FAST_POPCOUNT
+#define MI_HAS_FAST_POPCOUNT 1
+#endif
+
+
+
+size_t _mi_clz_generic(size_t x);
+size_t _mi_ctz_generic(size_t x);
+
+static inline size_t mi_ctz(size_t x) {  // count trailing zero bits; yields MI_SIZE_BITS for `x==0`
+  #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0
+    size_t r;
+    __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
+    return r;
+  #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) 
+    return _tzcnt_u64(x);
+  #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+    unsigned long idx;
+    return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS);
+  #elif mi_has_builtinz(ctz)
+    return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS);
+  #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86)
+    size_t r = MI_SIZE_BITS;  // bsf leaves destination unmodified if the argument is 0 (see <https://github.com/llvm/llvm-project/pull/102885>)
+    __asm ("bsf\t%1, %0" : "+r"(r) : "r"(x) : "cc");
+    return r;
+  #elif MI_HAS_FAST_POPCOUNT
+    return (x!=0 ? (mi_popcount(x^(x-1))-1) : MI_SIZE_BITS);  // `x^(x-1)` sets the lowest set bit and every bit below it
+  #else
+    #define MI_HAS_FAST_BITSCAN  0
+    return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS);
+  #endif
+}
+
+static inline size_t mi_clz(size_t x) {  // count leading zero bits; yields MI_SIZE_BITS for `x==0`
+  #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0
+    size_t r;
+    __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
+    return r;
+  #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) 
+    return _lzcnt_u64(x);
+  #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+    unsigned long idx;
+    return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS);
+  #elif mi_has_builtinz(clz)
+    return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS);
+  #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86)
+    if (x==0) return MI_SIZE_BITS;
+    size_t r;
+    __asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc");
+    return (MI_SIZE_BITS - 1 - r);  // bsr gives the index of the highest set bit; convert it to a leading-zero count
+  #else
+    #define MI_HAS_FAST_BITSCAN  0
+    return (x!=0 ? _mi_clz_generic(x) : MI_SIZE_BITS);
+  #endif
+}
+
+#ifndef MI_HAS_FAST_BITSCAN
+#define MI_HAS_FAST_BITSCAN 1
+#endif
+
+/* --------------------------------------------------------------------------------
+  find trailing/leading zero  (bit scan forward/reverse)
+-------------------------------------------------------------------------------- */
+
+// Bit scan forward: find the least significant bit that is set (i.e. count trailing zeros).
+// Returns false if `x==0` (with `*idx` undefined) and true otherwise,
+// with `*idx` set to the index of that bit (`0 <= *idx < MI_SIZE_BITS`).
+static inline bool mi_bsf(size_t x, size_t* idx) {
+  #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9)
+    // on x64 the carry flag is set on zero which gives better codegen
+    bool is_zero;
+    __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" );
+    return !is_zero;
+  #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+    unsigned long i;
+    return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false);
+  #else
+    return (x!=0 ? (*idx = mi_ctz(x), true) : false);
+  #endif
+}
+
+// Bit scan reverse: find the most significant bit that is set.
+// Returns false if `x==0` (with `*idx` undefined) and true otherwise,
+// with `*idx` set to the index of that bit (`0 <= *idx < MI_SIZE_BITS`).
+static inline bool mi_bsr(size_t x, size_t* idx) {
+  #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__)  && (!defined(__clang_major__) || __clang_major__ >= 9)
+    // on x64 the carry flag is set on zero which gives better codegen
+    bool is_zero;
+    size_t lzeros;
+    __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(lzeros) : "r"(x) : "cc");
+    // `lzcnt` yields the number of leading zero bits, not a bit index: convert it so this path matches the fallback below
+    *idx = MI_SIZE_BITS - 1 - lzeros;
+    return !is_zero;
+  #else
+    return (x!=0 ? (*idx = MI_SIZE_BITS - 1 - mi_clz(x), true) : false);
+  #endif
+}
+
+
+/* --------------------------------------------------------------------------------
+  rotate
+-------------------------------------------------------------------------------- */
+
+static inline size_t mi_rotr(size_t x, size_t r) {  // rotate the bits of `x` right by `r` positions
+  #if (mi_has_builtin(rotateright64) && MI_SIZE_BITS==64)
+    return mi_builtin(rotateright64)(x,r);
+  #elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32)
+    return mi_builtin(rotateright32)(x,r);
+  #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
+    return _rotr64(x, (int)r);
+  #elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
+    return _lrotr(x,(int)r);
+  #else
+    // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
+    // avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
+    const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1);  // reduce the count modulo the word size
+    return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1))));
+  #endif
+}
+
+static inline size_t mi_rotl(size_t x, size_t r) {  // rotate the bits of `x` left by `r` positions
+  #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64)
+    return mi_builtin(rotateleft64)(x,r);
+  #elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32)
+    return mi_builtin(rotateleft32)(x,r);
+  #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
+    return _rotl64(x, (int)r);
+  #elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
+    return _lrotl(x, (int)r);
+  #else
+    // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
+    // avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
+    const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1);  // reduce the count modulo the word size
+    return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1))));
+  #endif
+}
+
+static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) {  // rotate the 32 bits of `x` left by `r` positions
+  #if mi_has_builtin(rotateleft32)
+    return mi_builtin(rotateleft32)(x,r);
+  #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+    return _lrotl(x, (int)r);
+  #else
+    // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
+    // avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
+    const unsigned int rshift = (unsigned int)(r) & 31;  // reduce the count modulo 32
+    return ((x << rshift) | (x >> ((-rshift) & 31)));
+  #endif
+}
+
+
+#endif // MI_BITS_H
diff --git a/ext/src/mimalloc/include/mimalloc/internal.h b/ext/src/mimalloc/include/mimalloc/internal.h
new file mode 100644
index 0000000000..bf0b6975aa
--- /dev/null
+++ b/ext/src/mimalloc/include/mimalloc/internal.h
@@ -0,0 +1,1238 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MI_INTERNAL_H
+#define MI_INTERNAL_H
+
+// --------------------------------------------------------------------------
+// This file contains the internal API's of mimalloc and various utility
+// functions and macros.
+// --------------------------------------------------------------------------
+
+#include "types.h"
+#include "track.h"
+#include "bits.h"
+
+
+// --------------------------------------------------------------------------
+// Compiler defines
+// --------------------------------------------------------------------------
+
+#if (MI_DEBUG>0)
+#define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
+#else
+#define mi_trace_message(...)
+#endif
+
+#define mi_decl_cache_align     mi_decl_align(64)
+
+#if defined(_MSC_VER)
+#pragma warning(disable:4127)   // suppress constant conditional warning (due to MI_SECURE paths)
+#pragma warning(disable:26812)  // unscoped enum warning
+#define mi_decl_forceinline     __forceinline
+#define mi_decl_noinline        __declspec(noinline)
+#define mi_decl_thread          __declspec(thread)
+#define mi_decl_noreturn        __declspec(noreturn)
+#define mi_decl_weak
+#define mi_decl_hidden
+#define mi_decl_cold
+#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
+#if !MI_TRACK_ASAN
+#define mi_decl_forceinline     __attribute__((always_inline)) inline
+#else
+#define mi_decl_forceinline     inline
+#endif
+#define mi_decl_noinline        __attribute__((noinline))
+#define mi_decl_thread          __thread
+#define mi_decl_noreturn        __attribute__((noreturn))
+#define mi_decl_weak            __attribute__((weak))
+#define mi_decl_hidden          __attribute__((visibility("hidden")))
+#if (__GNUC__ >= 4) || defined(__clang__)
+#define mi_decl_cold            __attribute__((cold))
+#else
+#define mi_decl_cold
+#endif
+#elif __cplusplus >= 201103L    // c++11
+#define mi_decl_forceinline     inline
+#define mi_decl_noinline
+#define mi_decl_thread          thread_local
+#define mi_decl_noreturn        [[noreturn]]
+#define mi_decl_weak
+#define mi_decl_hidden
+#define mi_decl_cold
+#else
+#define mi_decl_forceinline     inline
+#define mi_decl_noinline
+#define mi_decl_thread          __thread        // hope for the best :-)
+#define mi_decl_noreturn
+#define mi_decl_weak
+#define mi_decl_hidden
+#define mi_decl_cold
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define mi_unlikely(x)     (__builtin_expect(!!(x),false))
+#define mi_likely(x)       (__builtin_expect(!!(x),true))
+#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
+#define mi_unlikely(x)     (x) [[unlikely]]
+#define mi_likely(x)       (x) [[likely]]
+#else
+#define mi_unlikely(x)     (x)
+#define mi_likely(x)       (x)
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x)    0
+#endif
+
+#if defined(__cplusplus)
+#define mi_decl_externc     extern "C"
+#else
+#define mi_decl_externc
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 7)) || defined(__clang__) // includes clang and icc
+#define mi_decl_maybe_unused    __attribute__((unused))
+#elif __cplusplus >= 201703L    // c++17
+#define mi_decl_maybe_unused    [[maybe_unused]]
+#else
+#define mi_decl_maybe_unused
+#endif
+
+#if defined(__cplusplus)
+#define mi_decl_externc         extern "C"
+#else
+#define mi_decl_externc
+#endif
+
+
+#if defined(__EMSCRIPTEN__) && !defined(__wasi__)
+#define __wasi__
+#endif
+
+
+// --------------------------------------------------------------------------
+// Internal functions
+// --------------------------------------------------------------------------
+
+
+// "libc.c"
+#include <stdarg.h>
+int           _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args);
+int           _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...);
+char          _mi_toupper(char c);
+int           _mi_strnicmp(const char* s, const char* t, size_t n);
+void          _mi_strlcpy(char* dest, const char* src, size_t dest_size);
+void          _mi_strlcat(char* dest, const char* src, size_t dest_size);
+size_t        _mi_strlen(const char* s);
+size_t        _mi_strnlen(const char* s, size_t max_len);
+char*         _mi_strnstr(char* s, size_t max_len, const char* pat);
+bool          _mi_getenv(const char* name, char* result, size_t result_size);
+
+// "options.c"
+void          _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
+void          _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...);
+void          _mi_raw_message(const char* fmt, ...);
+void          _mi_message(const char* fmt, ...);
+void          _mi_warning_message(const char* fmt, ...);
+void          _mi_verbose_message(const char* fmt, ...);
+void          _mi_trace_message(const char* fmt, ...);
+void          _mi_options_init(void);
+void          _mi_options_post_init(void);
+long          _mi_option_get_fast(mi_option_t option);
+void          _mi_error_message(int err, const char* fmt, ...);
+
+// random.c
+void          _mi_random_init(mi_random_ctx_t* ctx);
+void          _mi_random_init_weak(mi_random_ctx_t* ctx);
+void          _mi_random_reinit_if_weak(mi_random_ctx_t * ctx);
+void          _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
+uintptr_t     _mi_random_next(mi_random_ctx_t* ctx);
+uintptr_t     _mi_theap_random_next(mi_theap_t* theap);
+uintptr_t     _mi_os_random_weak(uintptr_t extra_seed);
+static inline uintptr_t _mi_random_shuffle(uintptr_t x);
+
+// init.c
+extern mi_decl_hidden mi_decl_cache_align const mi_page_t  _mi_page_empty;
+void          _mi_auto_process_init(void);
+void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept;
+bool          _mi_is_redirected(void);
+bool          _mi_allocator_init(const char** message);
+void          _mi_allocator_done(void);
+bool          _mi_is_main_thread(void);
+bool          _mi_preloading(void);           // true while the C runtime is not initialized yet
+void          _mi_thread_done(mi_theap_t* theap);
+
+mi_subproc_t* _mi_subproc(void);
+mi_subproc_t* _mi_subproc_main(void);
+mi_heap_t*    _mi_subproc_heap_main(mi_subproc_t* subproc);
+mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id);
+
+mi_threadid_t _mi_thread_id(void) mi_attr_noexcept;
+size_t        _mi_thread_seq_id(void) mi_attr_noexcept;
+bool          _mi_is_heap_main(const mi_heap_t* heap);
+void          _mi_theap_guarded_init(mi_theap_t* theap);
+void          _mi_theap_options_init(mi_theap_t* theap);
+mi_theap_t*   _mi_theap_default_safe(void);             // ensure the returned theap is initialized
+
+// os.c
+void          _mi_os_init(void);                                            // called from process init
+void*         _mi_os_alloc(size_t size, mi_memid_t* memid);
+void*         _mi_os_zalloc(size_t size, mi_memid_t* memid);
+void          _mi_os_free(void* p, size_t size, mi_memid_t memid);
+void          _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_subproc_t* subproc );
+
+size_t        _mi_os_page_size(void);
+size_t        _mi_os_guard_page_size(void);
+size_t        _mi_os_good_alloc_size(size_t size);
+bool          _mi_os_has_overcommit(void);
+bool          _mi_os_has_virtual_reserve(void);
+size_t        _mi_os_virtual_address_bits(void);
+size_t        _mi_os_minimal_purge_size(void);
+
+bool          _mi_os_reset(void* addr, size_t size);
+bool          _mi_os_decommit(void* addr, size_t size);
+void          _mi_os_reuse(void* p, size_t size);
+mi_decl_nodiscard bool _mi_os_commit(void* p, size_t size, bool* is_zero);
+mi_decl_nodiscard bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size);
+mi_decl_nodiscard bool _mi_os_protect(void* addr, size_t size);
+bool          _mi_os_unprotect(void* addr, size_t size);
+bool          _mi_os_purge(void* p, size_t size);
+bool          _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stats_size, mi_commit_fun_t* commit_fun, void* commit_fun_arg);
+
+size_t        _mi_os_secure_guard_page_size(void);
+bool          _mi_os_secure_guard_page_set_at(void* addr, mi_memid_t memid);
+bool          _mi_os_secure_guard_page_set_before(void* addr, mi_memid_t memid);
+bool          _mi_os_secure_guard_page_reset_at(void* addr, mi_memid_t memid);
+bool          _mi_os_secure_guard_page_reset_before(void* addr, mi_memid_t memid);
+
+int           _mi_os_numa_node(void);
+int           _mi_os_numa_node_count(void);
+
+void*         _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid);
+void*         _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid);
+
+void*         _mi_os_get_aligned_hint(size_t try_alignment, size_t size);
+bool          _mi_os_canuse_large_page(size_t size, size_t alignment);
+size_t        _mi_os_large_page_size(void);
+void*         _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid);
+
+// threadlocal.c
+
+mi_thread_local_t _mi_thread_local_create(void);
+void          _mi_thread_local_free( mi_thread_local_t key );
+bool          _mi_thread_local_set(  mi_thread_local_t key, void* val );
+void*         _mi_thread_local_get(  mi_thread_local_t key );
+void          _mi_thread_locals_init(void);
+void          _mi_thread_locals_done(void);
+void          _mi_thread_locals_thread_done(void);
+
+// arena.c
+mi_arena_id_t _mi_arena_id_none(void);
+mi_arena_t*   _mi_arena_from_id(mi_arena_id_t id);
+bool          _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena);
+
+void*         _mi_arenas_alloc(mi_heap_t* heap, size_t size, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid);
+void*         _mi_arenas_alloc_aligned(mi_heap_t* heap, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid);
+void          _mi_arenas_free(void* p, size_t size, mi_memid_t memid);
+bool          _mi_arenas_contain(const void* p);
+void          _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld);
+void          _mi_arenas_unsafe_destroy_all(mi_subproc_t* subproc);
+
+mi_page_t*    _mi_arenas_page_alloc(mi_theap_t* theap, size_t block_size, size_t page_alignment);
+void          _mi_arenas_page_free(mi_page_t* page, mi_theap_t* current_theapx /* can be NULL */);
+void          _mi_arenas_page_abandon(mi_page_t* page, mi_theap_t* current_theap);
+void          _mi_arenas_page_unabandon(mi_page_t* page, mi_theap_t* current_theapx /* can be NULL */);
+bool          _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page);
+
+// arena-meta.c
+void*         _mi_meta_zalloc( size_t size, mi_memid_t* memid );
+void          _mi_meta_free(void* p, size_t size, mi_memid_t memid);
+bool          _mi_meta_is_meta_page(void* p);
+
+// "page-map.c"
+bool          _mi_page_map_init(void);
+mi_decl_nodiscard bool _mi_page_map_register(mi_page_t* page);
+void          _mi_page_map_unregister(mi_page_t* page);
+void          _mi_page_map_unregister_range(void* start, size_t size);
+mi_page_t*    _mi_safe_ptr_page(const void* p);
+void          _mi_page_map_unsafe_destroy(mi_subproc_t* subproc);
+
+// "page.c"
+void*         _mi_malloc_generic(mi_theap_t* theap, size_t size, size_t zero_huge_alignment, size_t* usable)  mi_attr_noexcept mi_attr_malloc;
+
+void          _mi_page_retire(mi_page_t* page) mi_attr_noexcept;       // free the page if there are no other pages with many free blocks
+void          _mi_page_unfull(mi_page_t* page);
+void          _mi_page_free(mi_page_t* page, mi_page_queue_t* pq);     // free the page
+void          _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq);  // abandon the page, to be picked up by another thread...
+
+size_t        _mi_page_queue_append(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_queue_t* append);
+void          _mi_deferred_free(mi_theap_t* theap, bool force);
+
+void          _mi_page_free_collect(mi_page_t* page, bool force);
+void          _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head);
+mi_decl_nodiscard bool _mi_page_init(mi_theap_t* theap, mi_page_t* page);
+bool          _mi_page_queue_is_valid(mi_theap_t* theap, const mi_page_queue_t* pq);
+
+size_t        _mi_page_stats_bin(const mi_page_t* page); // for stats
+size_t        _mi_bin_size(size_t bin);                  // for stats
+size_t        _mi_bin(size_t size);                      // for stats
+
+// "theap.c"
+mi_theap_t*   _mi_theap_create(mi_heap_t* heap, mi_tld_t* tld);
+void          _mi_theap_delete(mi_theap_t* theap);
+void          _mi_theap_default_set(mi_theap_t* theap);
+void          _mi_theap_cached_set(mi_theap_t* theap);
+void          _mi_theap_collect_retired(mi_theap_t* theap, bool force);
+bool          _mi_theap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg);
+void          _mi_theap_page_reclaim(mi_theap_t* theap, mi_page_t* page);
+void          _mi_theap_free(mi_theap_t* theap);
+
+// "heap.c"
+void          _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page);
+mi_decl_cold  mi_theap_t* _mi_heap_theap_get_or_init(const mi_heap_t* heap);  // get (and possibly create) the theap belonging to a heap
+mi_decl_cold  mi_theap_t* _mi_heap_theap_get_peek(const mi_heap_t* heap);     // get the theap for a heap without initializing (and return NULL in that case)
+void          _mi_heap_move_pages(mi_heap_t* heap_from, mi_heap_t* heap_to);  // in "arena.c"
+void          _mi_heap_destroy_pages(mi_heap_t* heap_from);                   // in "arena.c"
+
+
+// "stats.c"
+void          _mi_stats_init(void);
+void          _mi_stats_merge_into(mi_stats_t* to, mi_stats_t* from);
+
+mi_msecs_t    _mi_clock_now(void);
+mi_msecs_t    _mi_clock_end(mi_msecs_t start);
+mi_msecs_t    _mi_clock_start(void);
+
+// "alloc.c"
+void*         _mi_page_malloc_zero(mi_theap_t* theap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept;                  // called from `_mi_theap_malloc_aligned`
+void*         _mi_theap_malloc_zero(mi_theap_t* theap, size_t size, bool zero, size_t* usable) mi_attr_noexcept;
+void*         _mi_theap_malloc_zero_ex(mi_theap_t* theap, size_t size, bool zero, size_t huge_alignment, size_t* usable) mi_attr_noexcept;     // called from `_mi_theap_malloc_aligned`
+void*         _mi_theap_realloc_zero(mi_theap_t* theap, void* p, size_t newsize, bool zero, size_t* usable_pre, size_t* usable_post) mi_attr_noexcept;
+mi_block_t*   _mi_page_ptr_unalign(const mi_page_t* page, const void* p);
+void          _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size);
+
+#if MI_DEBUG>1
+bool          _mi_page_is_valid(mi_page_t* page);
+#endif
+
+
+// ------------------------------------------------------
+// Assertions
+// ------------------------------------------------------
+
+#if (MI_DEBUG)
+// use our own assertion to print without memory allocation
+mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func) mi_attr_noexcept;
+#define mi_assert(expr)     ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
+#else
+#define mi_assert(x)
+#endif
+
+#if (MI_DEBUG>1)
+#define mi_assert_internal    mi_assert
+#else
+#define mi_assert_internal(x)
+#endif
+
+#if (MI_DEBUG>2)
+#define mi_assert_expensive   mi_assert
+#else
+#define mi_assert_expensive(x)
+#endif
+
+
+/* -----------------------------------------------------------
+  Statistics (in `stats.c`)
+----------------------------------------------------------- */
+
+// add to stat keeping track of the peak
+void __mi_stat_increase(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount);
+
+// adjust stat in special cases to compensate for double counting (and does not adjust peak values and can decrease the total)
+void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount);
+
+// counters can just be increased
+void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
+void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount);
+
+#define mi_heap_stat_counter_increase(heap,stat,amount)         __mi_stat_counter_increase_mt( &(heap)->stats.stat, amount)
+#define mi_heap_stat_increase(heap,stat,amount)                 __mi_stat_increase_mt( &(heap)->stats.stat, amount)
+#define mi_heap_stat_decrease(heap,stat,amount)                 __mi_stat_decrease_mt( &(heap)->stats.stat, amount)
+#define mi_heap_stat_adjust_increase(heap,stat,amnt)            __mi_stat_adjust_increase_mt( &(heap)->stats.stat, amnt)
+#define mi_heap_stat_adjust_decrease(heap,stat,amnt)            __mi_stat_adjust_decrease_mt( &(heap)->stats.stat, amnt)
+
+#define mi_subproc_stat_counter_increase(subproc,stat,amount)   __mi_stat_counter_increase_mt( &(subproc)->stats.stat, amount)
+#define mi_subproc_stat_increase(subproc,stat,amount)           __mi_stat_increase_mt( &(subproc)->stats.stat, amount)
+#define mi_subproc_stat_decrease(subproc,stat,amount)           __mi_stat_decrease_mt( &(subproc)->stats.stat, amount)
+#define mi_subproc_stat_adjust_increase(subproc,stat,amount)    __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amount)
+#define mi_subproc_stat_adjust_decrease(subproc,stat,amount)    __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amount)
+
+#define mi_os_stat_counter_increase(stat,amount)                mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount)
+#define mi_os_stat_increase(stat,amount)                        mi_subproc_stat_increase(_mi_subproc(),stat,amount)
+#define mi_os_stat_decrease(stat,amount)                        mi_subproc_stat_decrease(_mi_subproc(),stat,amount)
+
+#define mi_theap_stat_counter_increase(theap,stat,amount)       __mi_stat_counter_increase( &(theap)->stats.stat, amount)
+#define mi_theap_stat_increase(theap,stat,amount)               __mi_stat_increase( &(theap)->stats.stat, amount)
+#define mi_theap_stat_decrease(theap,stat,amount)               __mi_stat_decrease( &(theap)->stats.stat, amount)
+#define mi_theap_stat_adjust_increase(theap,stat,amnt)          __mi_stat_adjust_increase( &(theap)->stats.stat, amnt)
+#define mi_theap_stat_adjust_decrease(theap,stat,amnt)          __mi_stat_adjust_decrease( &(theap)->stats.stat, amnt)
+
+
+/* -----------------------------------------------------------
+  Options (exposed for the debugger)
+----------------------------------------------------------- */
+typedef enum mi_option_init_e {
+  MI_OPTION_UNINIT,       // not yet initialized
+  MI_OPTION_DEFAULTED,    // not found in the environment, use default value
+  MI_OPTION_INITIALIZED   // found in environment or set explicitly
+} mi_option_init_t;
+
+typedef struct mi_option_desc_s {
+  long              value;  // the value
+  mi_option_init_t  init;   // is it initialized yet? (from the environment)
+  mi_option_t       option; // for debugging: the option index should match the option
+  const char*       name;   // option name without `mimalloc_` prefix
+  const char*       legacy_name; // potential legacy option name
+} mi_option_desc_t;
+
+
+
+/* -----------------------------------------------------------
+  Inlined definitions
+----------------------------------------------------------- */
+#define MI_UNUSED(x)     (void)(x)
+#ifndef NDEBUG
+#define MI_UNUSED_RELEASE(x)
+#else
+#define MI_UNUSED_RELEASE(x)  MI_UNUSED(x)
+#endif
+
+#define MI_INIT4(x)   x(),x(),x(),x()
+#define MI_INIT8(x)   MI_INIT4(x),MI_INIT4(x)
+#define MI_INIT16(x)  MI_INIT8(x),MI_INIT8(x)
+#define MI_INIT32(x)  MI_INIT16(x),MI_INIT16(x)
+#define MI_INIT64(x)  MI_INIT32(x),MI_INIT32(x)
+#define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x)
+#define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x)
+
+#define MI_INIT74(x)  MI_INIT64(x),MI_INIT8(x),x(),x()
+#define MI_INIT5(x)   MI_INIT4(x),x()
+
+#include <string.h>
+// initialize a local variable to zero; use memset as compilers optimize constant sized memset's (the argument is parenthesized so any lvalue expression expands safely)
+#define _mi_memzero_var(x)  memset(&(x),0,sizeof(x))
+
+// Is `x` a power of two? Clearing the lowest set bit must leave nothing (so 0 also qualifies).
+static inline bool _mi_is_power_of_two(uintptr_t x) {
+  return !(x & (x - 1));
+}
+
+// Is the address of `p` a multiple of `alignment`? (`alignment` must be non-zero)
+static inline bool _mi_is_aligned(void* p, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  return !((uintptr_t)p % alignment);
+}
+
+// Align upwards: round `sz` up to the next multiple of `alignment` (which must be non-zero).
+static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  const uintptr_t mask = alignment - 1;
+  const uintptr_t bumped = sz + mask;
+  if ((alignment & mask) != 0) {
+    // not a power of two: divide and multiply back
+    return ((bumped / alignment) * alignment);
+  }
+  return (bumped & ~mask);   // power of two: simply clear the low bits
+}
+
+// Align a pointer upwards: round the address of `p` up to a multiple of `alignment`
+static inline uint8_t* _mi_align_up_ptr(void* p, size_t alignment) {
+  const uintptr_t aligned = _mi_align_up((uintptr_t)p, alignment);
+  return (uint8_t*)aligned;
+}
+
+// Align downwards: round `sz` down to a multiple of `alignment` (which must be non-zero).
+static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  const uintptr_t mask = alignment - 1;
+  if ((alignment & mask) != 0) {
+    // not a power of two: divide and multiply back
+    return ((sz / alignment) * alignment);
+  }
+  // power of two: simply clear the low bits
+  return (sz & ~mask);
+}
+
+static inline void* mi_align_down_ptr(void* p, size_t alignment) {
+  const uintptr_t addr = (uintptr_t)p;   // align the address value downwards, then convert back to a pointer
+  return (void*)_mi_align_down(addr, alignment);
+}
+// Divide rounding upwards: `s <= _mi_divide_up(s,d)*d < s+d`; a zero `divider` is asserted against (if MI_DEBUG>1) and otherwise yields `size`.
+static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
+  mi_assert_internal(divider != 0);
+  return (divider != 0 ? ((size + divider - 1) / divider) : size);
+}
+
+
+// clamp an integer to the range [`min`,`max`];
+// the lower bound is tested first, so `min` is returned whenever `sz < min`.
+static inline size_t _mi_clamp(size_t sz, size_t min, size_t max) {
+  const size_t capped = (sz > max ? max : sz);
+  return (sz < min ? min : capped);
+}
+
+// Is memory zero initialized? i.e. are all `size` bytes at `p` zero (trivially true for `size == 0`)
+static inline bool mi_mem_is_zero(const void* p, size_t size) {
+  for (size_t i = 0; i < size; i++) {
+    if (((const uint8_t*)p)[i] != 0) return false;   // read through a `const` view: do not cast the qualifier away
+  }
+  return true;
+}
+
+// Convert a byte size to a size in _machine words_, rounding up (so `wsize*sizeof(uintptr_t) >= size`).
+static inline size_t _mi_wsize_from_size(size_t size) {
+  mi_assert_internal(size <= SIZE_MAX - sizeof(uintptr_t));
+  const size_t word_size = sizeof(uintptr_t);
+  return (size + word_size - 1) / word_size;
+}
+
+// Overflow detecting multiply: store `count*size` in `*total` and return `true` if the product overflowed
+#if __has_builtin(__builtin_umul_overflow) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#include <limits.h>      // UINT_MAX, ULONG_MAX
+#if defined(_CLOCK_T)    // for Illumos
+#undef _CLOCK_T
+#endif
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+  #if (SIZE_MAX == ULONG_MAX)   // select the builtin whose operand type matches the range of `size_t`
+    return __builtin_umull_overflow(count, size, (unsigned long *)total);
+  #elif (SIZE_MAX == UINT_MAX)
+    return __builtin_umul_overflow(count, size, (unsigned int *)total);
+  #else
+    return __builtin_umulll_overflow(count, size, (unsigned long long *)total);
+  #endif
+}
+#else /* __builtin_umul_overflow is unavailable */
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+  *total = count*size;   // always stored, even if the (wrapped) product overflowed
+  if mi_likely(((size|count)>>(4*MI_SIZE_SIZE))==0) {  // did size and count fit both in the lower half bits of a size_t?
+    return false;
+  }
+  else {
+    return (size!=0 && (SIZE_MAX / size) < count);   // otherwise check with a division
+  }
+}
+#endif
+
+// Safe multiply `count*size` into `total`; return `true` on overflow.
+// On overflow `*total` is set to `SIZE_MAX` (and an error is reported if `MI_DEBUG > 0`).
+static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* total) {
+  // quick check for the case where count is one (common for C++ allocators)
+  if (count==1) {
+    *total = size;
+    return false;
+  }
+  if mi_likely(!mi_mul_overflow(count, size, total)) {
+    return false;
+  }
+  #if MI_DEBUG > 0
+  _mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size);
+  #endif
+  *total = SIZE_MAX;
+  return true;
+}
+
+
+/*----------------------------------------------------------------------------------------
+  Heap functions
+------------------------------------------------------------------------------------------- */
+
+extern mi_decl_hidden const mi_theap_t _mi_theap_empty;       // read-only empty theap, initial value of the thread local default theap (in the MI_TLS_MODEL_THREAD_LOCAL)
+extern mi_decl_hidden const mi_theap_t _mi_theap_empty_wrong; // read-only empty theap used to signal that a theap for a heap could not be allocated
+
+static inline bool mi_theap_is_initialized(const mi_theap_t* theap) {
+  return !(theap == NULL || theap->heap == NULL);   // needs both a theap and a non-NULL `heap` field
+}
+
+static inline mi_page_t* _mi_theap_get_free_small_page(mi_theap_t* theap, size_t size) {
+  mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE));
+  const size_t idx = _mi_wsize_from_size(size);   // `pages_free_direct` is indexed by the size in machine words
+  mi_assert_internal(idx < MI_PAGES_DIRECT);
+  return theap->pages_free_direct[idx];
+}
+
+
+//static inline uintptr_t _mi_ptr_cookie(const void* p) {
+//  extern mi_theap_t _mi_theap_main;
+//  mi_assert_internal(_mi_theap_main.cookie != 0);
+//  return ((uintptr_t)p ^ _mi_theap_main.cookie);
+//}
+
+
+/* -----------------------------------------------------------
+  The page map maps addresses to `mi_page_t` pointers
+----------------------------------------------------------- */
+
+#if MI_PAGE_MAP_FLAT
+
+// flat page-map committed on demand, using one byte per slice (64 KiB).
+// single indirection and low commit, but large initial virtual reserve (4 GiB with 48 bit virtual addresses)
+// used by default on <= 40 bit virtual address spaces.
+extern mi_decl_hidden uint8_t* _mi_page_map;
+
+static inline size_t _mi_page_map_index(const void* p) {
+  return (size_t)((uintptr_t)p >> MI_ARENA_SLICE_SHIFT);   // one map entry per slice: drop the offset-within-slice bits
+}
+
+static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) {
+  const size_t ofs = _mi_page_map[_mi_page_map_index(p)];   // an entry of 0 is reported as invalid
+  if (valid != NULL) { *valid = (ofs != 0); }
+  const uintptr_t slice = ((uintptr_t)p >> MI_ARENA_SLICE_SHIFT);
+  return (mi_page_t*)((slice + 1 - ofs) << MI_ARENA_SLICE_SHIFT);   // step back `ofs - 1` slices to reach the page
+}
+
+static inline mi_page_t* _mi_checked_ptr_page(const void* p) {
+  bool is_valid;
+  mi_page_t* const candidate = _mi_ptr_page_ex(p, &is_valid);
+  return (is_valid ? candidate : NULL);   // NULL when the map entry for `p` is not valid
+}
+
+static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) {
+  return _mi_ptr_page_ex(p, NULL);   // passing NULL skips reporting whether the map entry was valid
+}
+
+#else
+
+// 2-level page map:
+// double indirection, but low commit and low virtual reserve.
+//
+// the page-map is usually 4 MiB (for 48 bit virtual addresses) and points to sub maps of 64 KiB.
+// the page-map is committed on-demand (in 64 KiB parts) (and sub-maps are committed on-demand as well)
+// one sub page-map = 64 KiB => covers 2^(16-3) * 2^16 = 2^29 = 512 MiB address space
+// the page-map needs 48-(16+13) = 19 bits => 2^19 sub map pointers = 2^22 bytes = 4 MiB reserved size.
+#define MI_PAGE_MAP_SUB_SHIFT     (13)
+#define MI_PAGE_MAP_SUB_COUNT     (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT)
+#define MI_PAGE_MAP_SHIFT         (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT)
+#define MI_PAGE_MAP_COUNT         (MI_ZU(1) << MI_PAGE_MAP_SHIFT)
+
+typedef mi_page_t**   mi_submap_t;
+extern mi_decl_hidden _Atomic(mi_submap_t)* _mi_page_map;
+
+static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) {
+  const size_t slice = (size_t)((uintptr_t)p / MI_ARENA_SLICE_SIZE);   // slice number of the address
+  if (sub_idx != NULL) { *sub_idx = slice % MI_PAGE_MAP_SUB_COUNT; }   // optional out: position within the sub-map
+  return (slice / MI_PAGE_MAP_SUB_COUNT);                              // position in the top-level map
+}
+
+static inline mi_submap_t _mi_page_map_at(size_t idx) {
+  return mi_atomic_load_ptr_relaxed(mi_page_t*, &_mi_page_map[idx]);   // load the sub-map pointer at `idx` (relaxed ordering, going by the helper's name)
+}
+
+static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) {
+  size_t slot;
+  mi_submap_t const submap = _mi_page_map_at(_mi_page_map_index(p, &slot));   // the sub-map pointer is not checked here
+  return submap[slot];  // NULL if p==NULL
+}
+
+static inline mi_page_t* _mi_checked_ptr_page(const void* p) {
+  size_t slot;
+  const size_t top = _mi_page_map_index(p, &slot);
+  mi_submap_t const submap = _mi_page_map_at(top);
+  if mi_unlikely(submap == NULL) return NULL;   // no sub-map at this top-level index
+  return submap[slot];
+}
+
+#endif
+
+// Get the page for a pointer `p`: the checked lookup is used if MI_DEBUG or MI_SECURE is set or on Apple, the unchecked one otherwise.
+static inline mi_page_t* _mi_ptr_page(const void* p) {
+  mi_assert_internal(p==NULL || mi_is_in_heap_region(p));
+  #if MI_DEBUG || MI_SECURE || defined(__APPLE__)
+  return _mi_checked_ptr_page(p);
+  #else
+  return _mi_unchecked_ptr_page(p);
+  #endif
+}
+
+
+// Get the block size of a page (asserted to be non-zero when internal assertions are enabled)
+static inline size_t mi_page_block_size(const mi_page_t* page) {
+  mi_assert_internal(page->block_size > 0);
+  return page->block_size;
+}
+
+// Page start: returns the `page_start` field of the page
+static inline uint8_t* mi_page_start(const mi_page_t* page) {
+  return page->page_start;
+}
+
+static inline size_t mi_page_size(const mi_page_t* page) {
+  return page->reserved * mi_page_block_size(page);   // `reserved` times the block size
+}
+
+static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) {
+  if (size != NULL) { *size = mi_page_size(page); }   // the `size` out-parameter is optional
+  return mi_page_start(page);
+}
+
+static inline size_t mi_page_info_size(void) {
+  return _mi_align_up(sizeof(mi_page_t), MI_MAX_ALIGN_SIZE);   // `sizeof(mi_page_t)` rounded up to a multiple of MI_MAX_ALIGN_SIZE
+}
+
+static inline bool mi_page_contains_address(const mi_page_t* page, const void* p) {
+  size_t area_size;
+  const uint8_t* const first = mi_page_area(page, &area_size);
+  return ((const uint8_t*)p >= first && (const uint8_t*)p < first + area_size);   // half-open range [first, first+area_size)
+}
+
+static inline bool mi_page_is_in_arena(const mi_page_t* page) {
+  return (page->memid.memkind == MI_MEM_ARENA);   // decided purely by the memory kind recorded in the page's memid
+}
+
+static inline bool mi_page_is_singleton(const mi_page_t* page) {
+  return (page->reserved == 1);   // `reserved == 1`: the page size equals a single block size (see `mi_page_size`)
+}
+
+// Usable block size of a page: the block size minus the fixed padding (`MI_PADDING_SIZE`); this may still include internal padding due to alignment and rounding up size classes.
+static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
+  const size_t bsize = mi_page_block_size(page);
+  return bsize - MI_PADDING_SIZE;
+}
+
+// Start of the page's slices; currently the page pointer itself. This may change if we locate page info outside the page data slices
+static inline uint8_t* mi_page_slice_start(const mi_page_t* page) {
+  return (uint8_t*)page;
+}
+
+// Translate an offset relative to `page_start` into an offset relative to the start slice of the page.
+// This may change if we ever locate page info outside the page-data itself.
+static inline size_t mi_page_slice_offset_of(const mi_page_t* page, size_t offset_relative_to_page_start) {
+  return offset_relative_to_page_start + (page->page_start - mi_page_slice_start(page));
+}
+
+static inline size_t mi_page_committed(const mi_page_t* page) {
+  return (page->slice_committed != 0 ? page->slice_committed - (page->page_start - mi_page_slice_start(page)) : mi_page_size(page));   // a `slice_committed` of 0 yields the full page size
+}
+
+// are all blocks in a page freed?
+// note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_free_collect`.
+static inline bool mi_page_all_free(const mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  return (page->used == 0);
+}
+
+// are there immediately available blocks, i.e. is the page's `free` list non-empty?
+static inline bool mi_page_immediate_available(const mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  return (page->free != NULL);
+}
+
+
+// can the page still grow, i.e. is its `capacity` still below its `reserved` space?
+static inline bool mi_page_is_expandable(const mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_assert_internal(page->capacity <= page->reserved);
+  return (page->reserved > page->capacity);
+}
+
+
+static inline bool mi_page_is_full(const mi_page_t* page) {  // are all reserved blocks in use? (`const` page, like the other read-only predicates)
+  const bool full = (page->reserved == page->used);
+  mi_assert_internal(!full || page->free == NULL);  // a full page cannot have blocks left on its free list
+  return full;
+}
+
+// is more than 7/8th of a page in use? (i.e. are at most `reserved/8` blocks not in use)
+static inline bool mi_page_is_mostly_used(const mi_page_t* page) {
+  // a NULL page counts as mostly used
+  const uint16_t max_unused = (page==NULL ? 0 : page->reserved / 8U);
+  return (page==NULL || page->reserved - page->used <= max_unused);
+}
+
+// is more than (n-1)/n'th of a page in use? (i.e. are at most `reserved/n` blocks not in use; `n` must be non-zero)
+static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) {
+  // a NULL page counts as used at any fraction
+  const uint16_t max_unused = (page==NULL ? 0 : page->reserved / n);
+  return (page==NULL || page->reserved - page->used <= max_unused);
+}
+
+
+static inline bool mi_page_is_huge(const mi_page_t* page) {
+  if (!mi_page_is_singleton(page)) return false;               // only singleton pages can be huge
+  if (page->block_size > MI_LARGE_MAX_OBJ_SIZE) return true;   // block is larger than the largest "large" object
+  return (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page);  // or: OS memory whose base lies below the page itself
+}
+
+static inline mi_page_queue_t* mi_page_queue(const mi_theap_t* theap, size_t size) {  // page queue in `theap->pages` for the bin of `size`
+  mi_page_queue_t* const pq = &((mi_theap_t*)theap)->pages[_mi_bin(size)];  // casts away `const` to return a mutable queue
+  if (size <= MI_LARGE_MAX_OBJ_SIZE) { mi_assert_internal(pq->block_size <= MI_LARGE_MAX_OBJ_SIZE); }
+  return pq;
+}
+
+
+//-----------------------------------------------------------
+// Page thread id and flags
+//-----------------------------------------------------------
+
+// Thread id of thread that owns this page (with flags in the bottom 2 bits); read with a relaxed atomic load.
+static inline mi_threadid_t mi_page_xthread_id(const mi_page_t* page) {
+  return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id);  // `const` is cast away for the atomic access
+}
+
+// Plain thread id of the thread that owns this page (the flag bits in `MI_PAGE_FLAG_MASK` are masked out)
+static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) {
+  return (mi_page_xthread_id(page) & ~MI_PAGE_FLAG_MASK);
+}
+
+static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) {  // only the flag bits (`MI_PAGE_FLAG_MASK`) of `xthread_id`
+  return (mi_page_xthread_id(page) & MI_PAGE_FLAG_MASK);
+}
+
+static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) {
+  if (!set) { mi_atomic_and_relaxed(&page->xthread_id, ~newflag); return; }  // clear: atomically `and` with the inverted flag
+  mi_atomic_or_relaxed(&page->xthread_id, newflag);                          // set: atomically `or` the flag in
+}
+
+static inline bool mi_page_is_in_full(const mi_page_t* page) {  // is the `MI_PAGE_IN_FULL_QUEUE` flag set?
+  return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0);
+}
+
+static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {  // set (or clear) the `MI_PAGE_IN_FULL_QUEUE` flag
+  mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE);
+}
+
+static inline bool mi_page_has_interior_pointers(const mi_page_t* page) {  // is the `MI_PAGE_HAS_INTERIOR_POINTERS` flag set?
+  return ((mi_page_flags(page) & MI_PAGE_HAS_INTERIOR_POINTERS) != 0);
+}
+
+static inline void mi_page_set_has_interior_pointers(mi_page_t* page, bool has_aligned) {  // set (or clear) the `MI_PAGE_HAS_INTERIOR_POINTERS` flag
+  mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_INTERIOR_POINTERS);
+}
+
+static inline void mi_page_set_theap(mi_page_t* page, mi_theap_t* theap) {  // set the owning theap and thread id of a page; a NULL `theap` marks it as abandoned
+  // mi_assert_internal(!mi_page_is_in_full(page));  // can happen when destroying pages on theap_destroy
+  page->theap = theap;
+  const mi_threadid_t tid = (theap == NULL ? MI_THREADID_ABANDONED : theap->tld->thread_id);
+  mi_assert_internal((tid & MI_PAGE_FLAG_MASK) == 0);  // the thread id may not overlap with the flag bits
+
+  // we need to use an atomic cas since a concurrent thread may still set the MI_PAGE_HAS_INTERIOR_POINTERS flag (see `alloc_aligned.c`).
+  mi_threadid_t xtid_old = mi_page_xthread_id(page);
+  mi_threadid_t xtid;
+  do {
+    xtid = tid | (xtid_old & MI_PAGE_FLAG_MASK);  // new thread id combined with the current flag bits
+  } while (!mi_atomic_cas_weak_release(&page->xthread_id, &xtid_old, xtid));  // retry until the exchange succeeds
+}
+
+static inline bool mi_page_is_abandoned(const mi_page_t* page) {
+  // note: the xtheap field of an abandoned theap is set to the subproc (for fast reclaim-on-free)
+  return (mi_page_thread_id(page) <= MI_THREADID_ABANDONED_MAPPED);  // thread ids up to `MI_THREADID_ABANDONED_MAPPED` denote an abandoned page
+}
+
+static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) {  // is the plain thread id exactly `MI_THREADID_ABANDONED_MAPPED`?
+  return (mi_page_thread_id(page) == MI_THREADID_ABANDONED_MAPPED);
+}
+
+static inline void mi_page_set_abandoned_mapped(mi_page_t* page) {  // pre: the page is abandoned
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_atomic_or_relaxed(&page->xthread_id, MI_THREADID_ABANDONED_MAPPED);  // atomically `or` the mapped marker into `xthread_id`
+}
+
+static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) {  // pre: the page is abandoned-mapped
+  mi_assert_internal(mi_page_is_abandoned_mapped(page));
+  mi_atomic_and_relaxed(&page->xthread_id, MI_PAGE_FLAG_MASK);  // keep only the flag bits; all thread id bits are cleared
+}
+
+
+static inline mi_theap_t* mi_page_theap(const mi_page_t* page) {  // the theap of a page; pre: the page is not abandoned and has a theap
+  mi_assert_internal(!mi_page_is_abandoned(page));
+  mi_assert_internal(page->theap != NULL);
+  return page->theap;
+}
+
+static inline mi_tld_t* mi_page_tld(const mi_page_t* page) {  // the `tld` of the theap of a page; pre: the page is not abandoned and has a theap
+  mi_assert_internal(!mi_page_is_abandoned(page));
+  mi_assert_internal(page->theap != NULL);
+  return page->theap->tld;
+}
+
+
+static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {  // the heap of a page, where a NULL `page->heap` stands for the main heap
+  mi_heap_t* heap = page->heap;
+  // we use NULL for the main heap to make `_mi_page_get_associated_theap` fast in `free.c:mi_abandoned_page_try_reclaim`.
+  if mi_likely(heap==NULL) heap = mi_heap_main();
+  mi_assert_internal(heap != NULL);
+  return heap;
+}
+
+//-----------------------------------------------------------
+// Thread free list and ownership
+//-----------------------------------------------------------
+
+// Thread free flag helpers
+static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {  // the block pointer of a thread-free value
+  return (mi_block_t*)(tf & ~1);  // clear the lowest bit (the owned flag, see `mi_tf_is_owned`)
+}
+static inline bool mi_tf_is_owned(mi_thread_free_t tf) {  // the owned flag is the lowest bit of a thread-free value
+  return ((tf & 1) == 1);
+}
+static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) {  // combine a block pointer with the owned flag in the lowest bit
+  return (mi_thread_free_t)((uintptr_t)block | (owned ? 1 : 0));
+}
+
+// Thread free access: the block pointer in `xthread_free` (without the owned flag), read with a relaxed atomic load
+static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
+  return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free));
+}
+
+// are there any available blocks? (either not all reserved blocks are in use, or the thread free list is non-empty)
+static inline bool mi_page_has_any_available(const mi_page_t* page) {
+  mi_assert_internal(page != NULL && page->reserved > 0);
+  return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
+}
+
+// Owned? (i.e. is the owned flag set in `xthread_free`; read with a relaxed atomic load)
+static inline bool mi_page_is_owned(const mi_page_t* page) {
+  return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free));
+}
+
+// get ownership; returns true if the page was not owned before (i.e. the owned flag was clear in the previous value).
+static inline bool mi_page_claim_ownership(mi_page_t* page) {
+  const uintptr_t old = mi_atomic_or_acq_rel(&page->xthread_free, 1);  // atomically set the owned flag (lowest bit)
+  return ((old&1)==0);
+}
+
+
+/* -------------------------------------------------------------------
+  Guarded objects
+------------------------------------------------------------------- */
+#if MI_GUARDED
+
+// we always align guarded pointers in a block at an offset
+// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones
+#define MI_BLOCK_TAG_ALIGNED   ((mi_encoded_t)(0))
+#define MI_BLOCK_TAG_GUARDED   (~MI_BLOCK_TAG_ALIGNED)
+
+static inline bool mi_block_ptr_is_guarded(const mi_block_t* block, const void* p) {  // is `p` a guarded pointer inside `block`?
+  const ptrdiff_t offset = (uint8_t*)p - (uint8_t*)block;  // byte offset of `p` from the start of the block
+  return (offset >= (ptrdiff_t)(sizeof(mi_block_t)) && block->next == MI_BLOCK_TAG_GUARDED);  // `p` lies past the `mi_block_t` header and `next` holds the guarded tag
+}
+
+static inline bool mi_theap_malloc_use_guarded(mi_theap_t* theap, size_t size) {  // should an allocation of `size` be guarded? (also updates the sample count)
+  // this code is written to result in fast assembly as it is on the hot path for allocation
+  const size_t count = theap->guarded_sample_count - 1;  // if the rate was 0, this will underflow and count for a long time..
+  if mi_likely(count != 0) {
+    // no sample: only store the decremented count
+    theap->guarded_sample_count = count;
+    return false;
+  }
+  else if (size >= theap->guarded_size_min && size <= theap->guarded_size_max) {
+    // use guarded allocation (but only if the sample rate is non-zero)
+    theap->guarded_sample_count = theap->guarded_sample_rate;  // reset
+    return (theap->guarded_sample_rate != 0);
+  }
+  else {
+    // failed size criteria, rewind count (but don't write to an empty theap)
+    if (theap->guarded_sample_rate != 0) { theap->guarded_sample_count = 1; }
+    return false;
+  }
+}
+
+mi_decl_restrict void* _mi_theap_malloc_guarded(mi_theap_t* theap, size_t size, bool zero) mi_attr_noexcept;
+
+#endif
+
+
+/* -------------------------------------------------------------------
+Encoding/Decoding the free list next pointers
+
+This is to protect against buffer overflow exploits where the
+free list is mutated. Many hardened allocators xor the next pointer `p`
+with a secret key `k1`, as `p^k1`. This prevents overwriting with known
+values but might be still too weak: if the attacker can guess
+the pointer `p` this can reveal `k1` (since `p^k1^p == k1`).
+Moreover, if multiple blocks can be read as well, the attacker can
+xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
+about the pointers (and subsequently `k1`).
+
+Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
+Since these operations are not associative, the above approaches do not
+work so well any more even if the `p` can be guesstimated. For example,
+for the read case we can subtract two entries to discard the `+k1` term,
+but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
+We include the left-rotation since xor and addition are otherwise linear
+in the lowest bit. Finally, both keys are unique per page which reduces
+the re-use of keys by a large factor.
+
+We also pass a separate `null` value to be used as `NULL` or otherwise
+`(k2<<<k1)+k1` would appear (too) often as a sentinel value.
+------------------------------------------------------------------- */
+
+static inline bool mi_is_in_same_page(const void* p, const void* q) {
+  // look up the page of `p` and check `q` against it with `mi_page_contains_address`
+  // (alternative: `return (_mi_ptr_page(p) == _mi_ptr_page(q));`)
+  return mi_page_contains_address(_mi_ptr_page(p), q);
+}
+
+static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) {
+  void* const decoded = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]);  // undo `mi_ptr_encode`: subtract `keys[0]`, rotate right by `keys[0]`, xor with `keys[1]`
+  if (decoded == null) { return NULL; } else { return decoded; }           // the `null` stand-in decodes to NULL
+}
+
+static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) {
+  const uintptr_t plain = (p != NULL ? (uintptr_t)p : (uintptr_t)null);  // a NULL pointer is encoded via the `null` stand-in
+  return mi_rotl(plain ^ keys[1], keys[0]) + keys[0];                    // xor with `keys[1]`, rotate left by `keys[0]`, add `keys[0]`
+}
+
+static inline uint32_t mi_ptr_encode_canary(const void* null, const void* p, const uintptr_t* keys) {
+  // the canary is the encoded pointer truncated to 32 bits, with the lowest byte made 0
+  // to prevent spurious read overflows which could be a security issue (issue #951)
+  #if MI_BIG_ENDIAN
+  return ((uint32_t)(mi_ptr_encode(null,p,keys)) & 0x00FFFFFF);
+  #else
+  return ((uint32_t)(mi_ptr_encode(null,p,keys)) & 0xFFFFFF00);
+  #endif
+}
+
+static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) {  // read the `next` field of a block
+  mi_track_mem_defined(block,sizeof(mi_block_t));   // memory tracking annotation before the read
+  mi_block_t* next;
+  #ifdef MI_ENCODE_FREELIST
+  next = (mi_block_t*)mi_ptr_decode(null, block->next, keys);  // the stored value is encoded with the keys
+  #else
+  MI_UNUSED(keys); MI_UNUSED(null);
+  next = (mi_block_t*)block->next;                  // the stored value is the plain pointer
+  #endif
+  mi_track_mem_noaccess(block,sizeof(mi_block_t));  // memory tracking annotation after the read
+  return next;
+}
+
+static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) {  // write the `next` field of a block
+  mi_track_mem_undefined(block,sizeof(mi_block_t));  // memory tracking annotation before the write
+  #ifdef MI_ENCODE_FREELIST
+  block->next = mi_ptr_encode(null, next, keys);     // store the pointer encoded with the keys
+  #else
+  MI_UNUSED(keys); MI_UNUSED(null);
+  block->next = (mi_encoded_t)next;                  // store the plain pointer
+  #endif
+  mi_track_mem_noaccess(block,sizeof(mi_block_t));   // memory tracking annotation after the write
+}
+
+static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {  // next block in a free list of `page`
+  #ifdef MI_ENCODE_FREELIST
+  mi_block_t* next = mi_block_nextx(page,block,page->keys);  // the page pointer itself serves as the `null` value
+  // check for free list corruption: is `next` at least in the same page?
+  // TODO: check if `next` is `page->block_size` aligned?
+  if mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next)) {
+    _mi_error_message(EFAULT, "corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next);
+    next = NULL;  // report the corruption and treat the list as ended here
+  }
+  return next;
+  #else
+  MI_UNUSED(page);
+  return mi_block_nextx(page,block,NULL);  // no encoding: no keys needed
+  #endif
+}
+
+static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {  // set the next block in a free list of `page`
+  #ifdef MI_ENCODE_FREELIST
+  mi_block_set_nextx(page,block,next, page->keys);  // encode with the per-page keys; the page pointer serves as the `null` value
+  #else
+  MI_UNUSED(page);
+  mi_block_set_nextx(page,block,next,NULL);         // no encoding: no keys needed
+  #endif
+}
+
+/* -----------------------------------------------------------
+  arena blocks
+----------------------------------------------------------- */
+
+// Number of arena slices needed for a given byte size (`size` divided by `MI_ARENA_SLICE_SIZE`, rounded up)
+static inline size_t mi_slice_count_of_size(size_t size) {
+  return _mi_divide_up(size, MI_ARENA_SLICE_SIZE);
+}
+
+// Byte size of a number of arena slices (`bcount` times `MI_ARENA_SLICE_SIZE`)
+static inline size_t mi_size_of_slices(size_t bcount) {
+  return (bcount * MI_ARENA_SLICE_SIZE);
+}
+
+
+/* -----------------------------------------------------------
+  memory id's
+----------------------------------------------------------- */
+
+static inline mi_memid_t _mi_memid_create(mi_memkind_t memkind) {  // a zero-initialized memid of the given kind
+  mi_memid_t memid;
+  _mi_memzero_var(memid);   // zero all fields first
+  memid.memkind = memkind;
+  return memid;
+}
+
+static inline mi_memid_t _mi_memid_none(void) {  // a zero-initialized memid of kind `MI_MEM_NONE`
+  return _mi_memid_create(MI_MEM_NONE);
+}
+
+static inline mi_memid_t _mi_memid_create_os(void* base, size_t size, bool committed, bool is_zero, bool is_large) {
+  mi_memid_t id = _mi_memid_create(MI_MEM_OS);   // start from a zeroed memid of kind `MI_MEM_OS`
+  id.is_pinned = is_large;                       // memory reported as large is marked as pinned
+  id.initially_zero = is_zero;
+  id.initially_committed = committed;
+  id.mem.os.size = size;                         // record the OS range (size and base)
+  id.mem.os.base = base;
+  return id;
+}
+
+static inline mi_memid_t _mi_memid_create_meta(void* mpage, size_t block_idx, size_t block_count) {
+  mi_memid_t id = _mi_memid_create(MI_MEM_META);  // start from a zeroed memid of kind `MI_MEM_META`
+  id.is_pinned = true;                            // meta memory is always recorded as pinned,
+  id.initially_zero = true;                       // initially zero,
+  id.initially_committed = true;                  // and initially committed
+  id.mem.meta.block_count = (uint32_t)block_count;  // index and count are stored as 32-bit values
+  id.mem.meta.block_index = (uint32_t)block_idx;
+  id.mem.meta.meta_page = mpage;
+  return id;
+}
+
+
+// -------------------------------------------------------------------
+// Fast "random" shuffle
+// -------------------------------------------------------------------
+
+static inline uintptr_t _mi_random_shuffle(uintptr_t x) {  // mix the bits of `x` (xor-shift and multiply steps)
+  if (x==0) { x = 17; }   // ensure we don't get stuck in generating zeros
+#if (MI_INTPTR_SIZE>=8)
+  // by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
+  x ^= x >> 30;
+  x *= 0xbf58476d1ce4e5b9UL;
+  x ^= x >> 27;
+  x *= 0x94d049bb133111ebUL;
+  x ^= x >> 31;
+#elif (MI_INTPTR_SIZE==4)
+  // by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
+  x ^= x >> 16;
+  x *= 0x7feb352dUL;
+  x ^= x >> 15;
+  x *= 0x846ca68bUL;
+  x ^= x >> 16;
+#endif
+  return x;  // note: for any other `MI_INTPTR_SIZE` no mixing is applied
+}
+
+
+// ---------------------------------------------------------------------------------
+// Provide our own `_mi_memcpy/set` for potential performance optimizations.
+//
+// For now, only on x64/x86 we optimize to `rep movsb/stosb`.
+// Generally, we check for "fast short rep movsb/stosb" (FSRM/FSRS) or "fast enhanced rep movsb" (ERMS) support
+// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)). See also issue #201 and pr #253.
+// Todo: we see improvements on win32 but less with glibc; we might want to only enable this on windows.
+// ---------------------------------------------------------------------------------
+
+#if !MI_TRACK_ENABLED && (MI_ARCH_X64 || MI_ARCH_X86) && (defined(_WIN32) || defined(__GNUC__))
+
+extern mi_decl_hidden size_t _mi_cpu_movsb_max;  // in init.c
+extern mi_decl_hidden size_t _mi_cpu_stosb_max;
+
+static inline void mi_rep_movsb(void* dst, const void* src, size_t n) {  // copy `n` bytes from `src` to `dst` with `rep movsb`
+  #if defined(__GNUC__)
+  __asm volatile("rep movsb" : "+D"(dst), "+c"(n), "+S"(src) : : "memory");  // inline assembly; `dst`, `n`, and `src` are read-write operands
+  #else
+  __movsb((unsigned char*)dst, (const unsigned char*)src, n);  // otherwise use the `__movsb` intrinsic
+  #endif
+}
+
+static inline void mi_rep_stosb(void* dst, uint8_t val, size_t n) {  // set `n` bytes at `dst` to `val` with `rep stosb`
+  #if defined(__GNUC__)
+  __asm volatile("rep stosb" : "+D"(dst), "+c"(n) : "a"(val) : "memory");  // inline assembly; `dst` and `n` are read-write operands, `val` is an input
+  #else
+  __stosb((unsigned char*)dst, val, n);  // otherwise use the `__stosb` intrinsic
+  #endif
+}
+
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
+  if mi_unlikely(n > _mi_cpu_movsb_max) {  // above the `rep movsb` threshold: use the regular `memcpy`
+    memcpy(dst, src, n);
+  }
+  else {                                   // has fsrm && n <= 127  (todo: and maybe has erms?)
+    mi_rep_movsb(dst, src, n);
+  }
+}
+
+static inline void _mi_memset(void* dst, int val, size_t n) {
+  if mi_unlikely(n > _mi_cpu_stosb_max) {  // above the `rep stosb` threshold: use the regular `memset`
+    memset(dst, val, n);
+  }
+  else {                                   // has fsrs && n <= 127
+    mi_rep_stosb(dst, (uint8_t)val, n);
+  }
+}
+
+#else
+
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {  // fallback: plain `memcpy`
+  memcpy(dst, src, n);
+}
+
+static inline void _mi_memset(void* dst, int val, size_t n) {  // fallback: plain `memset`
+  memset(dst, val, n);
+}
+
+#endif
+
+// -------------------------------------------------------------------------------
+// The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned
+// This is used for example in `mi_realloc`.
+// -------------------------------------------------------------------------------
+
+#if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__)
+
+// On GCC/Clang we provide a hint to the compiler that both pointers are word aligned (`MI_INTPTR_SIZE`).
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+  dst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);  // same pointers, now carrying the alignment hint
+  src = __builtin_assume_aligned(src, MI_INTPTR_SIZE);
+  _mi_memcpy(dst, src, n);
+}
+
+static inline void _mi_memset_aligned(void* dst, int val, size_t n) {
+  mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0);
+  // pass the destination on with a word-alignment hint for the compiler
+  _mi_memset(__builtin_assume_aligned(dst, MI_INTPTR_SIZE), val, n);
+}
+
+#else
+
+// Default fallback on `_mi_memcpy` (no alignment hint; the word alignment is only asserted)
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+  _mi_memcpy(dst, src, n);
+}
+
+static inline void _mi_memset_aligned(void* dst, int val, size_t n) {  // fallback on `_mi_memset` (the word alignment is only asserted)
+  mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0);
+  _mi_memset(dst, val, n);
+}
+
+#endif
+
+static inline void _mi_memzero(void* dst, size_t n) {  // set `n` bytes at `dst` to zero
+  _mi_memset(dst, 0, n);
+}
+
+static inline void _mi_memzero_aligned(void* dst, size_t n) {  // set `n` bytes at a word-aligned `dst` to zero
+  _mi_memset_aligned(dst, 0, n);
+}
+
+
+
+#endif  // MI_INTERNAL_H
diff --git a/ext/src/mimalloc/include/mimalloc/prim.h b/ext/src/mimalloc/include/mimalloc/prim.h
new file mode 100644
index 0000000000..75ad758e83
--- /dev/null
+++ b/ext/src/mimalloc/include/mimalloc/prim.h
@@ -0,0 +1,535 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_PRIM_H
+#define MIMALLOC_PRIM_H
+#include "internal.h"             // mi_decl_hidden
+
+// --------------------------------------------------------------------------
+// This file specifies the primitive portability API.
+// Each OS/host needs to implement these primitives, see `src/prim`
+// for implementations on Windows, macOS, WASI, and Linux/Unix.
+//
+// note: on all primitive functions, we always have result parameters != NULL, and:
+//  addr != NULL and page aligned
+//  size > 0     and page aligned
+//  the return value is an error code as an `int` where 0 is success
+// --------------------------------------------------------------------------
+
+// OS memory configuration (a pointer to this structure is passed to `_mi_prim_mem_init`)
+typedef struct mi_os_mem_config_s {
+  size_t  page_size;              // OS page size; defaults to 4KiB
+  size_t  large_page_size;        // 0 if not supported, usually 2MiB (4MiB on Windows)
+  size_t  alloc_granularity;      // smallest allocation size (usually 4KiB, on Windows 64KiB)
+  size_t  physical_memory_in_kib; // physical memory size in KiB
+  size_t  virtual_address_bits;   // usually 48 or 56 bits on 64-bit systems. (used to determine secure randomization)
+  bool    has_overcommit;         // can we reserve more memory than can be actually committed?
+  bool    has_partial_free;       // can allocated blocks be freed partially? (true for mmap, false for VirtualAlloc)
+  bool    has_virtual_reserve;    // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory)
+  bool    has_transparent_huge_pages;  // true if transparent huge pages are enabled (on Linux)
+} mi_os_mem_config_t;
+
+// Initialize
+void _mi_prim_mem_init( mi_os_mem_config_t* config );
+
+// Free OS memory
+int _mi_prim_free(void* addr, size_t size );
+
+// Allocate OS memory. Return NULL on error.
+// The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
+// If `commit` is false, the virtual memory range only needs to be reserved (with no access)
+// which will later be committed explicitly using `_mi_prim_commit`.
+// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
+// The `hint_addr` address is either `NULL` or a preferred allocation address but can be ignored.
+// pre: !commit => !allow_large
+//      try_alignment >= _mi_os_page_size() and a power of 2
+int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr);
+
+// Commit memory. Returns error code or 0 on success.
+// For example, on Linux this would make the memory PROT_READ|PROT_WRITE.
+// `is_zero` is set to true if the memory was zero initialized (e.g. on Windows)
+int _mi_prim_commit(void* addr, size_t size, bool* is_zero);
+
+// Decommit memory. Returns error code or 0 on success. The `needs_recommit` result is true
+// if the memory would need to be re-committed. For example, on Windows this is always true,
+// but on Linux we could use MADV_DONTNEED to decommit which does not need a recommit.
+// pre: needs_recommit != NULL
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit);
+
+// Reset memory. The range keeps being accessible but the content might be reset to zero at any moment.
+// Returns error code or 0 on success.
+int _mi_prim_reset(void* addr, size_t size);
+
+// Reuse memory. This is called for memory that is already committed but
+// may have been reset (`_mi_prim_reset`) or decommitted (`_mi_prim_decommit`) where `needs_recommit` was false.
+// Returns error code or 0 on success. On most platforms this is a no-op.
+int _mi_prim_reuse(void* addr, size_t size);
+
+// Protect memory. Returns error code or 0 on success.
+int _mi_prim_protect(void* addr, size_t size, bool protect);
+
+// Allocate huge (1GiB) pages possibly associated with a NUMA node.
+// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
+// pre: size > 0  and a multiple of 1GiB.
+//      numa_node is either negative (don't care), or a numa node number.
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr);
+
+// Return the current NUMA node
+size_t _mi_prim_numa_node(void);
+
+// Return the number of logical NUMA nodes
+size_t _mi_prim_numa_node_count(void);
+
+// Clock ticks
+mi_msecs_t _mi_prim_clock_now(void);
+
+// Process information (only for statistics); a pointer to this structure is passed to `_mi_prim_process_info`
+typedef struct mi_process_info_s {
+  mi_msecs_t  elapsed;         // NOTE(review): presumably the elapsed wall-clock time -- confirm
+  mi_msecs_t  utime;           // NOTE(review): presumably the user cpu time -- confirm
+  mi_msecs_t  stime;           // NOTE(review): presumably the system cpu time -- confirm
+  size_t      current_rss;     // NOTE(review): presumably the resident set size (unit not visible here) -- confirm
+  size_t      peak_rss;
+  size_t      current_commit;
+  size_t      peak_commit;
+  size_t      page_faults;
+} mi_process_info_t;
+
+void _mi_prim_process_info(mi_process_info_t* pinfo);
+
+// Default stderr output. (only for warnings etc. with verbose enabled)
+// msg != NULL && _mi_strlen(msg) > 0
+void _mi_prim_out_stderr( const char* msg );
+
+// Get an environment variable. (only for options)
+// name != NULL, result != NULL, result_size >= 64
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size);
+
+
+// Fill a buffer with strong randomness; return `false` on error or if
+// there is no strong randomization available.
+bool _mi_prim_random_buf(void* buf, size_t buf_len);
+
+// Called on the first thread start, and should ensure `_mi_thread_done` is called on thread termination.
+void _mi_prim_thread_init_auto_done(void);
+
+// Called on process exit and may take action to clean up resources associated with the thread auto done.
+void _mi_prim_thread_done_auto_done(void);
+
+// Called when the default theap for a thread changes
+void _mi_prim_thread_associate_default_theap(mi_theap_t* theap);
+
+// Is this thread part of a thread pool?
+bool _mi_prim_thread_is_in_threadpool(void);
+
+
+//-------------------------------------------------------------------
+// Access to TLS (thread local storage) slots.
+// We need fast access to both a unique thread id (in `free.c:mi_free`) and
+// to a thread-local theap pointer (in `alloc.c:mi_malloc`).
+// To achieve this we use specialized code for various platforms.
+//-------------------------------------------------------------------
+
+// On some libc + platform combinations we can directly access a thread-local storage (TLS) slot.
+// The TLS layout depends on both the OS and libc implementation so we use specific tests for each main platform.
+// If you test on another platform and it works please send a PR :-)
+// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
+//
+// Note: we would like to prefer `__builtin_thread_pointer()` nowadays instead of using assembly,
+// but unfortunately we can not detect support reliably (see issue #883)
+// We also use it on Apple OS as we use a TLS slot for the default theap there.
+#if (defined(_WIN32)) || \
+    (defined(__GNUC__) && ( \
+           (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
+        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \
+        || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
+        || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+        || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+      ))
+
+static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {  // read the pointer stored in TLS slot `slot` (platform specific)
+  void* res;
+  const size_t ofs = (slot*sizeof(void*));  // byte offset of the slot
+  #if defined(_WIN32)
+    #if (_M_X64 || _M_AMD64) && !defined(_M_ARM64EC)
+      res = (void*)__readgsqword((unsigned long)ofs);   // direct load at offset from gs
+    #elif _M_IX86 && !defined(_M_ARM64EC)
+      res = (void*)__readfsdword((unsigned long)ofs);   // direct load at offset from fs
+    #else
+      res = ((void**)NtCurrentTeb())[slot]; MI_UNUSED(ofs);
+    #endif
+  #elif defined(__i386__)
+    __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86 32-bit always uses GS
+  #elif defined(__APPLE__) && defined(__x86_64__)
+    __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
+  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+    __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x32 ABI
+  #elif defined(__x86_64__)
+    __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
+  #elif defined(__arm__)
+    void** tcb; MI_UNUSED(ofs);
+    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));  // read the thread register and clear the bottom 2 bits
+    res = tcb[slot];
+  #elif defined(__aarch64__)
+    void** tcb; MI_UNUSED(ofs);
+    #if defined(__APPLE__) // M1, issue #343
+    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));  // read `tpidrro_el0` and clear the bottom 3 bits
+    #else
+    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+    #endif
+    res = tcb[slot];
+  #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
+    MI_UNUSED(ofs);
+    res = pthread_getspecific(slot);
+  #else
+    #define MI_HAS_TLS_SLOT 0
+    MI_UNUSED(ofs);
+    res = NULL;  // no direct TLS slot access on this platform (`MI_HAS_TLS_SLOT` is defined as 0)
+  #endif
+  return res;
+}
+
+#ifndef MI_HAS_TLS_SLOT
+#define MI_HAS_TLS_SLOT 1
+#endif
+
+// Store `value` in TLS slot `slot` (platform specific). Setting a tls slot is only used on macOS for now
+static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+  const size_t ofs = (slot*sizeof(void*));  // byte offset of the slot
+  #if defined(_WIN32)
+    ((void**)NtCurrentTeb())[slot] = value; MI_UNUSED(ofs);
+  #elif defined(__i386__)
+    __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
+  #elif defined(__APPLE__) && defined(__x86_64__)
+    __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOS uses GS
+  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+    __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x32 ABI
+  #elif defined(__x86_64__)
+    __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
+  #elif defined(__arm__)
+    void** tcb; MI_UNUSED(ofs);
+    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));  // read the thread register and clear the bottom 2 bits
+    tcb[slot] = value;
+  #elif defined(__aarch64__)
+    void** tcb; MI_UNUSED(ofs);
+    #if defined(__APPLE__) // M1, issue #343
+    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));  // read `tpidrro_el0` and clear the bottom 3 bits
+    #else
+    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+    #endif
+    tcb[slot] = value;
+  #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
+    MI_UNUSED(ofs);
+    pthread_setspecific(slot, value);
+  #else
+    MI_UNUSED(ofs); MI_UNUSED(value);  // no direct TLS slot access: the value is not stored
+  #endif
+}
+
+#endif
+
+
+// defined in `init.c`; do not use these directly
+extern mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_main;     // theap belonging to the main heap
+extern mi_decl_hidden bool _mi_process_is_initialized;                // has mi_process_init been called?
+
+
+//-------------------------------------------------------------------
+// Get a fast unique thread id.
+//
+// Getting the thread id should be performant as it is called in the
+// fast path of `_mi_free` and we specialize for various platforms as
+// inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
+// We only require _mi_prim_thread_id() to return a unique id
+// for each thread (unequal to zero).
+//-------------------------------------------------------------------
+
+
+// Do we have __builtin_thread_pointer? This would be the preferred way to get a unique thread id
+// but unfortunately, it seems we cannot test for this reliably at this time (see issue #883)
+// Nevertheless, it seems needed on older graviton platforms (see issue #851).
+// For now, we only enable this for specific platforms.
+#if !defined(__APPLE__)  /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly (<https://github.com/microsoft/mimalloc/issues/343#issuecomment-763272369>)*/ \
+    && !defined(__CYGWIN__) \
+    && !defined(MI_LIBC_MUSL) \
+    && (!defined(__clang_major__) || __clang_major__ >= 14)  /* older clang versions emit bad code; fall back to using the TLS slot (<https://lore.kernel.org/linux-arm-kernel/202110280952.352F66D8@keescook/T/>) */
+  #if    (defined(__GNUC__) && (__GNUC__ >= 7)  && defined(__aarch64__)) /* aarch64 for older gcc versions (issue #851) */ \
+      || (defined(__GNUC__) && (__GNUC__ >= 11) && defined(__x86_64__)) \
+      || (defined(__clang_major__) && (__clang_major__ >= 14) && (defined(__aarch64__) || defined(__x86_64__)))
+    #define MI_USE_BUILTIN_THREAD_POINTER  1
+  #endif
+#endif
+
+static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept;
+
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {  // checked wrapper around the platform specific `__mi_prim_thread_id`
+  const mi_threadid_t tid = __mi_prim_thread_id();
+  mi_assert_internal(tid > 1);                         // the id must be larger than 1
+  mi_assert_internal((tid & MI_PAGE_FLAG_MASK) == 0);  // bottom 2 bits are clear?
+  return tid;
+}
+
+// Get a unique id for the current thread.
+#if defined(MI_PRIM_THREAD_ID)
+
+static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
+  return MI_PRIM_THREAD_ID();  // used for example by CPython for a free threaded build (see python/cpython#115488)
+}
+
+#elif defined(_WIN32)
+
+static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
+  // Windows: works on Intel and ARM in both 32- and 64-bit
+  return (uintptr_t)NtCurrentTeb();
+}
+
+#elif MI_USE_BUILTIN_THREAD_POINTER
+
+static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
+  // Works on most Unix based platforms with recent compilers
+  return (uintptr_t)__builtin_thread_pointer();
+}
+
+#elif MI_HAS_TLS_SLOT
+
+static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
+  #if defined(__BIONIC__)
+    // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
+    // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
+    return (uintptr_t)mi_prim_tls_slot(1);
+  #else
+    // in all our other targets, slot 0 is the thread id
+    // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
+    // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
+    return (uintptr_t)mi_prim_tls_slot(0);
+  #endif
+}
+
+#else
+
+// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
+static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
+  return (uintptr_t)&__mi_theap_main;
+}
+
+#endif
+
+
+
+/* ----------------------------------------------------------------------------------------
+Get the thread local default theap: `_mi_theap_default()` (and the cached heap `_mi_theap_cached`).
+
+This is inlined here as it is on the fast path for allocation functions.
+We have 4 models:
+
+- MI_TLS_MODEL_THREAD_LOCAL: use regular thread local (default on Linux, FreeBSD, etc)
+    On most platforms (Linux, FreeBSD, NetBSD, etc), this just returns a
+    thread local variable (`__mi_theap_default`). With the initial-exec TLS model this ensures
+    that the storage will always be available and properly initialized (with an empty theap).
+
+    On some platforms the underlying TLS implementation (or the loader) will call itself `malloc`
+    on a first access to a thread local and recurse in the MI_TLS_MODEL_THREAD_LOCAL.
+    A way around this is to define MI_TLS_RECURSE_GUARD which adds an extra check if the process
+    is initialized before accessing the thread-local. This is a check in the fast path though
+    so this should be avoided.
+
+- MI_TLS_MODEL_FIXED_SLOT: use a fixed slot in the TLS block (default on macOS)
+    This reserves an unused and fixed TLS slot. This is fast and avoids the problem
+    where the underlying TLS implementation (or the loader) will call itself `malloc`
+    on a first access to a thread local (and recurse in the MI_TLS_MODEL_THREAD_LOCAL).
+    This goes wrong though if the OS or a library uses the same fixed slot.
+
+- MI_TLS_MODEL_DYNAMIC_WIN32: use a dynamically allocated slot with TlsAlloc. (default on Windows)
+    Windows has somewhat slow thread locals so by default we use TlsAlloc'd slots which
+    can be more efficient. First tries to use one of the "direct" first 64 slots which 
+    are the fastest, but falls back to using "expansion" slots when needed (up to 1088 slots).
+    (If the allocated slot happens to always be under 64 for a particular program,
+    one might use cmake with `-DMI_WIN_DIRECT_TLS=ON` to skip the expansion slot test in the fast path.)
+
+- MI_TLS_MODEL_DYNAMIC_PTHREADS: use `pthread_getspecific`. (default on OpenBSD, maybe good for Android as well?)
+    Use pthread local storage. Somewhat slow but can work well depending on the platform.
+
+Each model should define `MI_THEAP_INITASNULL` to signify that the initial value
+returned from `_mi_theap_default()` can be `NULL` (instead of the address of the empty heap).
+This incurs an extra check in the fast path (but can often be combined in an existing check).
+------------------------------------------------------------------------------------------- */
+
+static inline mi_theap_t* _mi_theap_default(void);
+static inline mi_theap_t* _mi_theap_cached(void);
+
+#if defined(_WIN32)
+  #define MI_TLS_MODEL_DYNAMIC_WIN32        1    
+#elif defined(__APPLE__)  // macOS
+  // #define MI_TLS_MODEL_DYNAMIC_PTHREADS  1    // also works but a bit slower
+  #define MI_TLS_MODEL_FIXED_SLOT           1
+  #define MI_TLS_MODEL_FIXED_SLOT_DEFAULT   108  // seems unused. @apple: it would be great to get 2 official slots for custom allocators :-)
+  #define MI_TLS_MODEL_FIXED_SLOT_CACHED    109
+  // we used before __PTK_FRAMEWORK_OLDGC_KEY9 (89) but that seems used now.
+  // see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
+#elif defined(__OpenBSD__) || defined(__ANDROID__)
+  #define MI_TLS_MODEL_DYNAMIC_PTHREADS     1
+  // #define MI_TLS_MODEL_DYNAMIC_PTHREADS_DEFAULT_ENTRY_IS_NULL  1
+#else
+  #define MI_TLS_MODEL_THREAD_LOCAL         1
+#endif
+
+// Declared this way to optimize register spills and branches
+mi_decl_cold mi_decl_noinline mi_theap_t* _mi_theap_empty_get(void);
+
+static inline mi_theap_t* __mi_theap_empty(void) {  // pointer to the empty theap
+  #if __GNUC__
+  __asm("");  // prevent conditional load
+  return (mi_theap_t*)&_mi_theap_empty;  // GNU-compatible compilers: take the address of `_mi_theap_empty` inline
+  #else
+  return _mi_theap_empty_get();  // otherwise call the out-of-line (cold, noinline) getter; presumably yields the same address -- TODO confirm
+  #endif
+}
+
+#if MI_TLS_MODEL_THREAD_LOCAL
+// Thread local with an initial value (default on Linux). Very efficient.
+
+extern mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_default;  // default theap to allocate from
+extern mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_cached;   // theap from the last used heap
+
+static inline mi_theap_t* _mi_theap_default(void) {  // thread-local model: read the default theap from `__mi_theap_default`
+  #if defined(MI_TLS_RECURSE_GUARD)
+  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_theap_empty_get();  // do not access the thread local before the process is initialized
+  #endif
+  return __mi_theap_default;
+}
+
+static inline mi_theap_t* _mi_theap_cached(void) {
+  return __mi_theap_cached;
+}
+
+#elif MI_TLS_MODEL_FIXED_SLOT
+// Fixed TLS slot (default on macOS).
+#define MI_THEAP_INITASNULL  1
+
+static inline mi_theap_t* _mi_theap_default(void) {
+  return (mi_theap_t*)mi_prim_tls_slot(MI_TLS_MODEL_FIXED_SLOT_DEFAULT);
+}
+
+static inline mi_theap_t* _mi_theap_cached(void) {
+  return (mi_theap_t*)mi_prim_tls_slot(MI_TLS_MODEL_FIXED_SLOT_CACHED);
+}
+
+#elif MI_TLS_MODEL_DYNAMIC_WIN32
+// Dynamic TLS slot (default on Windows)
+#define MI_THEAP_INITASNULL  1
+
+// We try to use direct slots (64), but can also use the expansion slots (up to 1024 extra available)
+// See <https://www.geoffchappell.com/studies/windows/km/ntoskrnl/inc/api/pebteb/teb/index.htm> for the offsets.
+#if MI_SIZE_SIZE==4
+#define MI_TLS_EXPANSION_SLOT    (0x0F94 / MI_SIZE_SIZE)
+#else
+#define MI_TLS_EXPANSION_SLOT    (0x1780 / MI_SIZE_SIZE)
+#endif
+
+extern mi_decl_hidden size_t _mi_theap_default_slot;
+extern mi_decl_hidden size_t _mi_theap_cached_slot;
+extern mi_decl_hidden size_t _mi_theap_default_expansion_slot;
+extern mi_decl_hidden size_t _mi_theap_cached_expansion_slot;
+
+static inline mi_theap_t* _mi_theap_default(void) {  // Win32 model: read the default theap from a TLS slot (NULL if not yet set)
+  const size_t slot = _mi_theap_default_slot;  // slot index kept in a global; presumably obtained through TlsAlloc at startup -- TODO confirm
+  mi_theap_t* theap  = (mi_theap_t*)mi_prim_tls_slot(slot);
+  #if !MI_WIN_DIRECT_TLS  // the expansion-slot test is compiled out when MI_WIN_DIRECT_TLS is set
+  if mi_unlikely(slot==MI_TLS_EXPANSION_SLOT) { // in TlsExpansionSlots ?
+    if mi_likely(theap!=NULL) {                 // initialized (on this thread)?
+      theap = ((mi_theap_t**)theap)[_mi_theap_default_expansion_slot];  // the slot value is a pointer to an array; index it with the expansion slot
+    }
+  }
+  #endif
+  return theap;
+}
+
+static inline mi_theap_t* _mi_theap_cached(void) {  // Win32 model: read the cached theap from a TLS slot (NULL if not yet set)
+  const size_t slot = _mi_theap_cached_slot;  // slot index kept in a global; presumably obtained through TlsAlloc at startup -- TODO confirm
+  mi_theap_t* theap = (mi_theap_t*)mi_prim_tls_slot(slot);
+  #if !MI_WIN_DIRECT_TLS  // the expansion-slot test is compiled out when MI_WIN_DIRECT_TLS is set
+  if mi_unlikely(slot==MI_TLS_EXPANSION_SLOT) { // in TlsExpansionSlots ?
+    if mi_likely(theap!=NULL) {                 // initialized (on this thread)?
+      theap = ((mi_theap_t**)theap)[_mi_theap_cached_expansion_slot];  // the slot value is a pointer to an array; index it with the expansion slot
+    }
+  }
+  #endif
+  return theap;
+}
+
+#elif MI_TLS_MODEL_DYNAMIC_PTHREADS
+// Dynamic pthread slot on less common platforms. This is not too bad. (default on OpenBSD)
+#define MI_THEAP_INITASNULL  1
+
+extern mi_decl_hidden pthread_key_t _mi_theap_default_key;
+extern mi_decl_hidden pthread_key_t _mi_theap_cached_key;
+
+static inline mi_theap_t* _mi_theap_default(void) {
+  #if !MI_TLS_MODEL_DYNAMIC_PTHREADS_DEFAULT_ENTRY_IS_NULL
+  // we can skip this check if using the initial key will return NULL from pthread_getspecific
+  if mi_unlikely(_mi_theap_default_key==0) { return NULL; }
+  #endif
+  return (mi_theap_t*)pthread_getspecific(_mi_theap_default_key);
+}
+
+static inline mi_theap_t* _mi_theap_cached(void) {
+  #if !MI_TLS_MODEL_DYNAMIC_PTHREADS_DEFAULT_ENTRY_IS_NULL
+  // we can skip this check if using the initial key will return NULL from pthread_getspecific
+  if mi_unlikely(_mi_theap_cached_key==0) { return NULL; }
+  #endif
+  return (mi_theap_t*)pthread_getspecific(_mi_theap_cached_key);
+}
+
+#else
+#error "no TLS model is defined for this platform?"
+#endif
+
+
+// A thread is initialized when its default theap is (no thread-local is touched when using fixed slots)
+static inline bool _mi_thread_is_initialized(void) {
+  return mi_theap_is_initialized(_mi_theap_default());
+}
+
+// Get (and possibly create) the theap belonging to a heap.
+// We cache the last accessed theap in `_mi_theap_cached` for better performance.
+static inline mi_theap_t* _mi_heap_theap(const mi_heap_t* heap) {
+  mi_theap_t* theap = _mi_theap_cached();
+  #if MI_THEAP_INITASNULL  // in these TLS models the cached theap can be NULL, so test it before dereferencing
+  if mi_likely(theap!=NULL && theap->heap==heap) return theap;
+  #else
+  if mi_likely(theap->heap==heap) return theap;
+  #endif
+  return _mi_heap_theap_get_or_init(heap);  // cache miss: take the out-of-line path
+}
+
+// Get the theap belonging to a heap without creating it if it is not yet initialized (the result can be NULL).
+static inline mi_theap_t* _mi_heap_theap_peek(const mi_heap_t* heap) {
+  mi_theap_t* theap = _mi_theap_cached();
+  #if MI_THEAP_INITASNULL  // in these TLS models the cached theap can be NULL, so test it before dereferencing
+  if mi_unlikely(theap==NULL || theap->heap!=heap)
+  #else
+  if mi_unlikely(theap->heap!=heap)
+  #endif
+  {
+    theap = _mi_heap_theap_get_peek(heap);  // don't update the cache on a query (?)
+  }
+  mi_assert(theap==NULL || theap->heap==heap);  // either nothing was found, or the theap belongs to `heap`
+  return theap;
+}
+
+// Find the associated theap or NULL if it does not exist (during shutdown)
+// Should be fast as it is called in `free.c:mi_free_try_collect`.
+static inline mi_theap_t* _mi_page_associated_theap_peek(mi_page_t* page) {
+  mi_heap_t* const heap = page->heap;  // a NULL `page->heap` selects the thread local `__mi_theap_main` below
+  mi_theap_t* theap;
+  if mi_likely(heap==NULL) { theap = __mi_theap_main; }  // note: on macOS accessing the thread_local can cause allocation during thread shutdown (and reinitialize the thread)!
+                      else { theap = _mi_heap_theap_peek(heap); }  // otherwise peek (without creating) the theap of that heap
+  mi_assert_internal(theap==NULL || _mi_thread_id()==theap->tld->thread_id);  // a theap that was found must belong to the calling thread
+  return theap;
+}
+
+#endif  // MI_PRIM_H
diff --git a/ext/src/mimalloc/include/mimalloc/track.h b/ext/src/mimalloc/include/mimalloc/track.h
new file mode 100644
index 0000000000..8f8b93f9aa
--- /dev/null
+++ b/ext/src/mimalloc/include/mimalloc/track.h
@@ -0,0 +1,145 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MI_TRACK_H
+#define MI_TRACK_H
+
+/* ------------------------------------------------------------------------------------------------------
+Track memory ranges with macros for tools like Valgrind, address sanitizer, or other memory checkers.
+These can be defined for tracking allocation:
+
+  #define mi_track_malloc_size(p,reqsize,size,zero)
+  #define mi_track_free_size(p,_size)
+
+The macros are set up such that the size passed to `mi_track_free_size`
+always matches the size of `mi_track_malloc_size`. (currently, `size == mi_usable_size(p)`).
+The `reqsize` is what the user requested, and `size >= reqsize`.
+The `size` is either byte precise (and `size==reqsize`) if `MI_PADDING` is enabled,
+or otherwise it is the usable block size which may be larger than the original request.
+Use `_mi_block_size_of(void* p)` to get the full block size that was allocated (including padding etc).
+The `zero` parameter is `true` if the allocated block is zero initialized.
+
+Optional:
+
+  #define mi_track_align(p,alignedp,offset,size)
+  #define mi_track_resize(p,oldsize,newsize)
+  #define mi_track_init()
+
+The `mi_track_align` is called right after a `mi_track_malloc` for aligned pointers in a block.
+The corresponding `mi_track_free` still uses the block start pointer and original size (corresponding to the `mi_track_malloc`).
+The `mi_track_resize` is currently unused but could be called on reallocations within a block.
+`mi_track_init` is called at program start.
+
+The following macros are for tools like asan and valgrind to track whether memory is
+defined, undefined, or not accessible at all:
+
+  #define mi_track_mem_defined(p,size)
+  #define mi_track_mem_undefined(p,size)
+  #define mi_track_mem_noaccess(p,size)
+
+-------------------------------------------------------------------------------------------------------*/
+
+#if MI_TRACK_VALGRIND
+// valgrind tool
+
+#define MI_TRACK_ENABLED      1
+#define MI_TRACK_HEAP_DESTROY 1           // track free of individual blocks on theap_destroy
+#define MI_TRACK_TOOL         "valgrind"
+
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+
+#define mi_track_malloc_size(p,reqsize,size,zero) VALGRIND_MALLOCLIKE_BLOCK(p,size,MI_PADDING_SIZE /*red zone*/,zero)
+#define mi_track_free_size(p,_size)               VALGRIND_FREELIKE_BLOCK(p,MI_PADDING_SIZE /*red zone*/)
+#define mi_track_resize(p,oldsize,newsize)        VALGRIND_RESIZEINPLACE_BLOCK(p,oldsize,newsize,MI_PADDING_SIZE /*red zone*/)
+#define mi_track_mem_defined(p,size)              VALGRIND_MAKE_MEM_DEFINED(p,size)
+#define mi_track_mem_undefined(p,size)            VALGRIND_MAKE_MEM_UNDEFINED(p,size)
+#define mi_track_mem_noaccess(p,size)             VALGRIND_MAKE_MEM_NOACCESS(p,size)
+
+#elif MI_TRACK_ASAN
+// address sanitizer
+
+#define MI_TRACK_ENABLED      1
+#define MI_TRACK_HEAP_DESTROY 0
+#define MI_TRACK_TOOL         "asan"
+
+#include <sanitizer/asan_interface.h>
+
+#define mi_track_malloc_size(p,reqsize,size,zero) ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_free_size(p,size)                ASAN_POISON_MEMORY_REGION(p,size)
+#define mi_track_mem_defined(p,size)              ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_mem_undefined(p,size)            ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_mem_noaccess(p,size)             ASAN_POISON_MEMORY_REGION(p,size)
+
+#elif MI_TRACK_ETW
+// windows event tracing
+
+#define MI_TRACK_ENABLED      1
+#define MI_TRACK_HEAP_DESTROY 1
+#define MI_TRACK_TOOL         "ETW"
+
+#include "../src/prim/windows/etw.h"
+
+#define mi_track_init()                           EventRegistermicrosoft_windows_mimalloc();
+#define mi_track_malloc_size(p,reqsize,size,zero) EventWriteETW_MI_ALLOC((UINT64)(p), size)
+#define mi_track_free_size(p,size)                EventWriteETW_MI_FREE((UINT64)(p), size)
+
+#else
+// no tracking
+
+#define MI_TRACK_ENABLED      0
+#define MI_TRACK_HEAP_DESTROY 0
+#define MI_TRACK_TOOL         "none"
+
+#define mi_track_malloc_size(p,reqsize,size,zero)
+#define mi_track_free_size(p,_size)
+
+#endif
+
+// -------------------
+// Utility definitions
+
+#ifndef mi_track_resize
+#define mi_track_resize(p,oldsize,newsize)      mi_track_free_size(p,oldsize); mi_track_malloc(p,newsize,false)
+#endif
+
+#ifndef mi_track_align
+#define mi_track_align(p,alignedp,offset,size)  mi_track_mem_noaccess(p,offset)
+#endif
+
+#ifndef mi_track_init
+#define mi_track_init()
+#endif
+
+#ifndef mi_track_mem_defined
+#define mi_track_mem_defined(p,size)
+#endif
+
+#ifndef mi_track_mem_undefined
+#define mi_track_mem_undefined(p,size)
+#endif
+
+#ifndef mi_track_mem_noaccess
+#define mi_track_mem_noaccess(p,size)
+#endif
+
+
+#if MI_PADDING  // with padding the tracked size is byte precise: the usable size must equal the requested size
+#define mi_track_malloc(p,reqsize,zero) \
+  if ((p)!=NULL) { \
+    mi_assert_internal(mi_usable_size(p)==(reqsize)); \
+    mi_track_malloc_size(p,reqsize,reqsize,zero); \
+  }
+#else  // without padding the usable size may exceed the request, so the usable size is what gets reported
+#define mi_track_malloc(p,reqsize,zero) \
+  if ((p)!=NULL) { \
+    mi_assert_internal(mi_usable_size(p)>=(reqsize)); \
+    mi_track_malloc_size(p,reqsize,mi_usable_size(p),zero); \
+  }
+#endif
+
+#endif // MI_TRACK_H
diff --git a/ext/src/mimalloc/include/mimalloc/types.h b/ext/src/mimalloc/include/mimalloc/types.h
new file mode 100644
index 0000000000..1583b39692
--- /dev/null
+++ b/ext/src/mimalloc/include/mimalloc/types.h
@@ -0,0 +1,707 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MI_TYPES_H
+#define MI_TYPES_H
+
+// --------------------------------------------------------------------------
+// This file contains the main type definitions for mimalloc:
+// mi_heap_t      : all data for a heap; usually there is just one main default heap.
+// mi_theap_t     : a thread local heap belonging to a specific heap:
+//                  maintains lists of thread-local heap pages that have free space.
+// mi_page_t      : a mimalloc page (usually 64KiB or 512KiB) from
+//                  where objects of a single size are allocated.
+//                  Note: we write "OS page" for OS memory pages while
+//                  using plain "page" for mimalloc pages (`mi_page_t`).
+// mi_arena_t     : a large memory area where pages are allocated (process shared)
+// mi_tld_t       : thread local data
+// mi_subproc_t   : all heaps belong to a sub-process (usually just the main one)
+// --------------------------------------------------------------------------
+
+
+#include <mimalloc-stats.h>
+#include <stddef.h>   // ptrdiff_t
+#include <stdint.h>   // uintptr_t, uint16_t, etc
+#include <stdbool.h>  // bool
+#include <limits.h>   // SIZE_MAX etc.
+#include <errno.h>    // error codes
+#include "bits.h"     // size defines (MI_INTPTR_SIZE etc), bit operations
+#include "atomic.h"   // _Atomic primitives
+
+// Minimal alignment necessary. On most platforms 16 bytes are needed
+// due to SSE registers for example. This must be at least `sizeof(void*)`
+#ifndef MI_MAX_ALIGN_SIZE
+#define MI_MAX_ALIGN_SIZE  16   // sizeof(max_align_t)
+#endif
+
+
+// ------------------------------------------------------
+// Variants
+// ------------------------------------------------------
+
+// Define NDEBUG in the release version to disable assertions.
+// #define NDEBUG
+
+// Define MI_TRACK_<tool> to enable tracking support
+// #define MI_TRACK_VALGRIND 1
+// #define MI_TRACK_ASAN     1
+// #define MI_TRACK_ETW      1
+
+#ifndef MI_STAT  // default to 1 (maintain statistics) unless the build already set it; 2 gives detailed statistics (but costs some performance)
+#define MI_STAT 1
+#endif
+// Define MI_SECURE to enable security mitigations. Level 1 has minimal performance impact,
+// but protects most metadata with guard pages:
+//   #define MI_SECURE 1  // guard page around metadata; check pointer validity on free
+//
+// Level 2 has more performance impact but protect well against various buffer overflows
+// by surrounding all mimalloc pages with guard pages:
+//   #define MI_SECURE 2  // guard page around each mimalloc page (can fragment VMA's with large theaps..)
+//
+// The next two levels can have more performance cost:
+//   #define MI_SECURE 3  // randomize allocations, encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
+//   #define MI_SECURE 4  // checks for double free. (may be more expensive)
+
+#if !defined(MI_SECURE)
+#define MI_SECURE 0
+#endif
+
+// Define MI_DEBUG for assertion and invariant checking
+// #define MI_DEBUG 1  // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free. (cmake -DMI_DEBUG=ON)
+// #define MI_DEBUG 2  // + internal assertion checks (cmake -DMI_DEBUG_INTERNAL=ON)
+// #define MI_DEBUG 3  // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
+#if !defined(MI_DEBUG)
+#if defined(MI_BUILD_RELEASE) || defined(NDEBUG)
+#define MI_DEBUG 0
+#else
+#define MI_DEBUG 2
+#endif
+#endif
+
+// Statistics (0=only essential, 1=normal, 2=more fine-grained (expensive) tracking)
+#ifndef MI_STAT
+#if (MI_DEBUG>0)
+#define MI_STAT 2
+#else
+#define MI_STAT 0
+#endif
+#endif
+
+// Use guard pages behind objects of a certain size (set by the MIMALLOC_DEBUG_GUARDED_MIN/MAX options)
+// Padding should be disabled when using guard pages
+// #define MI_GUARDED 1
+#if MI_GUARDED
+#define MI_PADDING  0
+#endif
+
+// Reserve extra padding at the end of each block to be more resilient against heap block overflows.
+// The padding can detect buffer overflow on free.
+#if !defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1 || (MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_TRACK_ETW))
+#define MI_PADDING  1
+#endif
+
+// Check padding bytes; allows byte-precise buffer overflow detection
+#if !defined(MI_PADDING_CHECK) && MI_PADDING && (MI_SECURE>=3 || MI_DEBUG>=1)
+#define MI_PADDING_CHECK 1
+#endif
+
+
+// Encoded free lists allow detection of corrupted free lists
+// and can detect buffer overflows, modify after free, and double `free`s.
+#if (MI_SECURE>=3 || MI_DEBUG>=1)
+#define MI_ENCODE_FREELIST  1
+#endif
+
+// Enable large pages for objects between 64KiB and 512KiB.
+// This should perhaps be disabled by default as for many workloads the block sizes above 64 KiB
+// are quite random which can lead to too many partially used large pages (but see issue #1104).
+#ifndef MI_ENABLE_LARGE_PAGES
+#define MI_ENABLE_LARGE_PAGES  1
+#endif
+
+// --------------------------------------------------------------
+// Sizes of internal data-structures
+// (comments specify sizes on 64-bit, usually 32-bit is halved)
+// --------------------------------------------------------------
+
+// Main size parameter; determines max arena sizes and max arena object sizes etc.
+#ifndef MI_ARENA_SLICE_SHIFT
+  #ifdef  MI_SMALL_PAGE_SHIFT   // backward compatibility
+  #define MI_ARENA_SLICE_SHIFT              MI_SMALL_PAGE_SHIFT
+  #elif MI_SECURE && __APPLE__ && MI_ARCH_ARM64
+  #define MI_ARENA_SLICE_SHIFT              (17)                        // 128 KiB to not waste too much due to 16 KiB guard pages
+  #else
+  #define MI_ARENA_SLICE_SHIFT              (13 + MI_SIZE_SHIFT)        // 64 KiB (32 KiB on 32-bit)
+  #endif
+#endif
+#if MI_ARENA_SLICE_SHIFT < 12
+#error Arena slices should be at least 4KiB
+#endif
+
+#ifndef MI_BCHUNK_BITS_SHIFT
+  #if MI_ARENA_SLICE_SHIFT <= 13    // <= 8KiB
+  #define MI_BCHUNK_BITS_SHIFT              (7)   // 128 bits
+  #elif MI_ARENA_SLICE_SHIFT < 16   // <= 32KiB
+  #define MI_BCHUNK_BITS_SHIFT              (8)   // 256 bits
+  #else
+  #define MI_BCHUNK_BITS_SHIFT              (6 + MI_SIZE_SHIFT)       // 512 bits (or 256 on 32-bit)
+  #endif
+#endif
+
+#define MI_BCHUNK_BITS                    (1 << MI_BCHUNK_BITS_SHIFT)         // sub-bitmaps in arena's are "bchunks" of 512 bits
+#define MI_ARENA_SLICE_SIZE               (MI_ZU(1) << MI_ARENA_SLICE_SHIFT)  // arena's allocate in slices of 64 KiB
+#define MI_ARENA_SLICE_ALIGN              (MI_ARENA_SLICE_SIZE)
+
+#define MI_ARENA_MIN_OBJ_SLICES           (1)
+#define MI_ARENA_MAX_CHUNK_OBJ_SLICES     (MI_BCHUNK_BITS)                    // 32 MiB (or 8 MiB on 32-bit)
+
+#define MI_ARENA_MIN_OBJ_SIZE             (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE)
+#define MI_ARENA_MAX_CHUNK_OBJ_SIZE       (MI_ARENA_MAX_CHUNK_OBJ_SLICES * MI_ARENA_SLICE_SIZE)
+
+#if MI_ARENA_MAX_CHUNK_OBJ_SIZE < MI_SIZE_SIZE*1024
+#error maximum object size may be too small to hold local thread data
+#endif
+
+#define MI_SMALL_PAGE_SIZE                MI_ARENA_MIN_OBJ_SIZE                    // 64 KiB
+#define MI_MEDIUM_PAGE_SIZE               (8*MI_SMALL_PAGE_SIZE)                   // 512 KiB  (=byte in the bchunk bitmap)
+#define MI_LARGE_PAGE_SIZE                (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE)       // 4 MiB    (=word in the bchunk bitmap)
+
+
+// Maximum number of size classes. (spaced exponentially in 12.5% increments)
+#if MI_BIN_HUGE != 73U
+#error "mimalloc internal: expecting 73 bins"
+#endif
+#define MI_BIN_FULL  (MI_BIN_HUGE+1)
+#define MI_BIN_COUNT (MI_BIN_FULL+1)
+
+// We never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
+#define MI_MAX_ALLOC_SIZE        PTRDIFF_MAX
+
+// Minimal commit for a page on-demand commit (should be >= OS page size)
+#define MI_PAGE_MIN_COMMIT_SIZE  MI_ARENA_SLICE_SIZE
+
+
+// ------------------------------------------------------
+// Arena's are large reserved areas of memory allocated from
+// the OS that are managed by mimalloc to efficiently
+// allocate MI_ARENA_SLICE_SIZE slices of memory for the
+// mimalloc pages.
+// ------------------------------------------------------
+
+// A large memory arena where pages are allocated in.
+typedef struct mi_arena_s mi_arena_t;     // defined below
+
+
+// ------------------------------------------------------
+// Heaps contain allocated blocks. Heaps are self-contained
+// but share the (sub-process) memory in the arena's.
+// ------------------------------------------------------
+
+// A first-class heap.
+typedef struct mi_heap_s mi_heap_t;       // heaps
+
+// ------------------------------------------------------
+// We can have sub-processes that are fully separated
+// from each other (for running multiple Python interpreters
+// for example). A sub-process holds the memory arenas and heaps.
+// ------------------------------------------------------
+
+// A sub-process
+typedef struct mi_subproc_s mi_subproc_t;
+
+
+// ---------------------------------------------------------------
+// a memory id tracks the provenance of arena/OS allocated memory
+// ---------------------------------------------------------------
+
+// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated.
+// The memid keeps track of this.
+typedef enum mi_memkind_e {  // note: the declaration order matters, the predicates below use range comparisons on these values
+  MI_MEM_NONE,      // not allocated
+  MI_MEM_EXTERNAL,  // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
+  MI_MEM_STATIC,    // allocated in a static area and should not be freed (the initial main theap data for example (`init.c`))
+  MI_MEM_META,      // allocated with the meta data allocator (`arena-meta.c`)
+  MI_MEM_OS,        // allocated from the OS
+  MI_MEM_OS_HUGE,   // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
+  MI_MEM_OS_REMAP,  // allocated in a remappable area (i.e. using `mremap`)
+  MI_MEM_ARENA,     // allocated from an arena (the usual case) (`arena.c`)
+  MI_MEM_HEAP_MAIN  // allocated in the main heap (for theaps)
+} mi_memkind_t;
+
+static inline bool mi_memkind_is_os(mi_memkind_t memkind) {  // true for MI_MEM_OS, MI_MEM_OS_HUGE and MI_MEM_OS_REMAP
+  return !(memkind < MI_MEM_OS || memkind > MI_MEM_OS_REMAP);
+}
+
+static inline bool mi_memkind_needs_no_free(mi_memkind_t memkind) {  // true for MI_MEM_NONE, MI_MEM_EXTERNAL and MI_MEM_STATIC
+  return !(memkind > MI_MEM_STATIC);
+}
+
+
+typedef struct mi_memid_os_info {
+  void*         base;               // actual base address of the block (used for offset aligned allocations)
+  size_t        size;               // allocated full size
+  // size_t        alignment;       // alignment at allocation
+} mi_memid_os_info_t;
+
+typedef struct mi_memid_arena_info {
+  mi_arena_t*   arena;              // arena that contains this memory
+  uint32_t      slice_index;        // slice index in the arena
+  uint32_t      slice_count;        // allocated slices
+} mi_memid_arena_info_t;
+
+typedef struct mi_memid_meta_info {
+  void*         meta_page;          // meta-page that contains the block
+  uint32_t      block_index;        // block index in the meta-data page
+  uint32_t      block_count;        // allocated blocks
+} mi_memid_meta_info_t;
+
+typedef struct mi_memid_s {
+  union {
+    mi_memid_os_info_t    os;       // only used for MI_MEM_OS
+    mi_memid_arena_info_t arena;    // only used for MI_MEM_ARENA
+    mi_memid_meta_info_t  meta;     // only used for MI_MEM_META
+  } mem;
+  mi_memkind_t  memkind;            // selects which member of `mem` (if any) is meaningful
+  bool          is_pinned;          // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2MiB) or huge (1GiB) OS pages)
+  bool          initially_committed;// `true` if the memory was originally allocated as committed
+  bool          initially_zero;     // `true` if the memory was originally zero initialized
+} mi_memid_t;
+
+
+static inline bool mi_memid_is_os(mi_memid_t memid) {
+  return mi_memkind_is_os(memid.memkind);
+}
+
+static inline bool mi_memid_needs_no_free(mi_memid_t memid) {
+  return mi_memkind_needs_no_free(memid.memkind);
+}
+
+static inline mi_arena_t* mi_memid_arena(mi_memid_t memid) {  // the arena holding this memory, or NULL when it is not arena memory
+  return (memid.memkind != MI_MEM_ARENA ? NULL : memid.mem.arena.arena);
+}
+
+
+// ------------------------------------------------------
+// Mimalloc pages contain allocated blocks
+// ------------------------------------------------------
+
+// The free lists use encoded next fields
+// (Only actually encodes when MI_ENCODE_FREELIST is defined.)
+typedef uintptr_t  mi_encoded_t;
+
+// thread id's
+typedef size_t     mi_threadid_t;
+
+// free lists contain blocks
+typedef struct mi_block_s {
+  mi_encoded_t next;
+} mi_block_t;
+
+
+// The page flags are put in the bottom 2 bits of the thread_id (for a fast test in `mi_free`)
+// `has_interior_pointers` is true if the page has pointers at an offset in a block (so we have to unalign to the block start before free-ing)
+// `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing)
+#define MI_PAGE_IN_FULL_QUEUE           MI_ZU(0x01)
+#define MI_PAGE_HAS_INTERIOR_POINTERS   MI_ZU(0x02)
+#define MI_PAGE_FLAG_MASK               MI_ZU(0x03)
+typedef size_t mi_page_flags_t;
+
+// There are two special threadid's: 0 for pages that are abandoned (and not in a theap queue),
+// and 4 for abandoned & mapped pages -- abandoned-mapped pages are abandoned but also mapped
+// in an arena (in `mi_heap_t.arena_pages.pages_abandoned`) so these can be quickly found for reuse.
+// Abandoning partially used pages allows for sharing of this memory between threads (in particular if threads are blocked)
+#define MI_THREADID_ABANDONED           MI_ZU(0)
+#define MI_THREADID_ABANDONED_MAPPED    (MI_PAGE_FLAG_MASK + 1)
+
+// Thread free list.
+// Points to a list of blocks that are freed by other threads.
+// The least-bit is set if the page is owned by the current thread. (`mi_page_is_owned`).
+// Ownership is required before we can read any non-atomic fields in the page.
+// This way we can push a block on the thread free list and try to claim ownership atomically in `free.c:mi_free_block_mt`.
+typedef uintptr_t mi_thread_free_t;
+
+// A page contains blocks of one specific size (`block_size`).
+// Each page has three lists of free blocks:
+// `free` for blocks that can be allocated,
+// `local_free` for freed blocks that are not yet available to `mi_malloc`
+// `thread_free` for freed blocks by other threads
+// The `local_free` and `thread_free` lists are migrated to the `free` list
+// when it is exhausted. The separate `local_free` list is necessary to
+// implement a monotonic heartbeat. The `thread_free` list is needed for
+// avoiding atomic operations when allocating from the owning thread.
+//
+// `used - |thread_free|` == actual blocks that are in use (alive)
+// `used - |thread_free| + |free| + |local_free| == capacity`
+//
+// We don't count "freed" (as |free|) but use only the `used` field to reduce
+// the number of memory accesses in the `mi_page_all_free` function(s).
+// Use `_mi_page_free_collect` to collect the thread_free list and update the `used` count.
+//
+// Notes:
+// - Non-atomic fields can only be accessed if having _ownership_ (low bit of `xthread_free` is 1).
+//   Combining the `thread_free` list with an ownership bit allows a concurrent `free` to atomically
+//   free an object and (re)claim ownership if the page was abandoned.
+// - If a page is not part of a theap it is called "abandoned"  (`theap==NULL`) -- in
+//   that case the `xthreadid` is 0 or 4 (4 is for abandoned pages that
+//   are in the `pages_abandoned` lists of an arena, these are called "mapped" abandoned pages).
+// - page flags are in the bottom 2 bits of `xthread_id` (see `MI_PAGE_FLAG_MASK`) for the fast path in `mi_free`.
+// - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
+// - Using `uint16_t` does not seem to slow things down
+
+typedef struct mi_page_s {
+  _Atomic(mi_threadid_t)    xthread_id;        // thread this page belongs to. (= `theap->thread_id (or 0 or 4 if abandoned) | page_flags`)
+
+  mi_block_t*               free;              // list of available free blocks (`malloc` allocates from this list)
+  uint16_t                  used;              // number of blocks in use (including blocks in `thread_free`)
+  uint16_t                  capacity;          // number of blocks committed
+  uint16_t                  reserved;          // number of blocks reserved in memory
+  uint8_t                   retire_expire;     // expiration count for retired blocks
+  bool                      free_is_zero;      // `true` if the blocks in the free list are zero initialized
+
+  mi_block_t*               local_free;        // list of deferred free blocks by this thread (migrates to `free`)
+  _Atomic(mi_thread_free_t) xthread_free;      // list of deferred free blocks freed by other threads (= `mi_block_t* | (1 if owned)`)
+
+  size_t                    block_size;        // const: size available in each block (always `>0`)
+  uint8_t*                  page_start;        // const: start of the blocks
+
+  #if (MI_ENCODE_FREELIST || MI_PADDING)
+  uintptr_t                 keys[2];           // const: two random keys to encode the free lists (see `_mi_block_next`) or padding canary
+  #endif
+
+  mi_theap_t*               theap;             // the theap owning this page (may not be valid or NULL for abandoned pages)
+  mi_heap_t*                heap;              // const: the heap owning this page
+
+  struct mi_page_s*         next;              // next page owned by the theap with the same `block_size`
+  struct mi_page_s*         prev;              // previous page owned by the theap with the same `block_size`
+  size_t                    slice_committed;   // committed size relative to the first arena slice of the page data (or 0 if the page is fully committed already)
+  mi_memid_t                memid;             // const: provenance of the page memory
+} mi_page_t;
+
+
+// ------------------------------------------------------
+// Object sizes
+// ------------------------------------------------------
+
+#define MI_PAGE_ALIGN                     MI_ARENA_SLICE_ALIGN      // pages must be aligned on this for the page map.
+#define MI_PAGE_MIN_START_BLOCK_ALIGN     MI_MAX_ALIGN_SIZE         // minimal block alignment for the first block in a page (16b)
+#define MI_PAGE_MAX_START_BLOCK_ALIGN2    (4*MI_KiB)                // maximal block alignment for "power of 2"-sized blocks (such that we guarantee natural alignment)
+#define MI_PAGE_OSPAGE_BLOCK_ALIGN2       (4*MI_KiB)                // also aligns any multiple of this size to avoid TLB misses.
+#define MI_PAGE_MAX_OVERALLOC_ALIGN       MI_ARENA_SLICE_SIZE       // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation
+
+#if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8
+#define MI_PAGE_INFO_SIZE                 ((MI_INTPTR_SHIFT+2)*32)  // 160    >= sizeof(mi_page_t)
+#else
+#define MI_PAGE_INFO_SIZE                 ((MI_INTPTR_SHIFT+1)*32)  // 128/96 >= sizeof(mi_page_t)
+#endif
+
+// The max object sizes are intended to not waste more than ~ 12.5% internally over the page sizes.
+#define MI_SMALL_MAX_OBJ_SIZE             ((MI_SMALL_PAGE_SIZE-MI_PAGE_OSPAGE_BLOCK_ALIGN2)/6)   // = 10 KiB
+#if MI_ENABLE_LARGE_PAGES
+#define MI_MEDIUM_MAX_OBJ_SIZE            ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_OSPAGE_BLOCK_ALIGN2)/6)  // ~ 84 KiB
+#define MI_LARGE_MAX_OBJ_SIZE             (MI_LARGE_PAGE_SIZE/8)    // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
+#else
+#define MI_MEDIUM_MAX_OBJ_SIZE            (MI_MEDIUM_PAGE_SIZE/8)   // <= 64 KiB
+#define MI_LARGE_MAX_OBJ_SIZE             MI_MEDIUM_MAX_OBJ_SIZE    // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
+#endif
+#define MI_LARGE_MAX_OBJ_WSIZE            (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE)
+
+#if (MI_LARGE_MAX_OBJ_WSIZE >= 655360)
+#error "mimalloc internal: define more bins"
+#endif
+
+
+// ------------------------------------------------------
+// Page kinds
+// ------------------------------------------------------
+
+typedef enum mi_page_kind_e {
+  MI_PAGE_SMALL,      // small blocks go into 64KiB pages
+  MI_PAGE_MEDIUM,     // medium blocks go into 512KiB pages
+  MI_PAGE_LARGE,      // larger blocks go into 4MiB pages (if `MI_ENABLE_LARGE_PAGES==1`)
+  MI_PAGE_SINGLETON   // page containing a single block.
+                      // used for blocks `> MI_LARGE_MAX_OBJ_SIZE` or an alignment `> MI_PAGE_MAX_OVERALLOC_ALIGN`.
+} mi_page_kind_t;
+
+
+
+// ------------------------------------------------------
+// A "theap" is a thread local heap which owns pages.
+// (making them thread-local avoids atomic operations)
+//
+// All theaps belong to a (non-thread-local) heap.
+// A theap just owns a set of pages for allocation and
+// can only be used to allocate/reallocate from the thread that created it.
+// Freeing blocks can be done from any thread though.
+//
+// Per thread, there is always a default theap that belongs
+// to the default heap. It is statically initialized to
+// point to an empty theap to avoid initialization
+// checks in the fast path.
+// ------------------------------------------------------
+
+// Thread local data
+typedef struct mi_tld_s mi_tld_t;   // defined below
+
+// Pages of a certain block size are held in a queue.
+typedef struct mi_page_queue_s {
+  mi_page_t* first;       // first page in the queue
+  mi_page_t* last;        // last page in the queue
+  size_t     count;       // presumably the number of pages in the queue -- confirm against the page queue code
+  size_t     block_size;  // block size of the pages held in this queue
+} mi_page_queue_t;
+
+// Random context
+typedef struct mi_random_cxt_s {
+  uint32_t input[16];
+  uint32_t output[16];
+  int      output_available;
+  bool     weak;
+} mi_random_ctx_t;
+
+
+// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows
+#if MI_PADDING
+typedef struct mi_padding_s {
+  uint32_t canary; // encoded block value to check validity of the padding (in case of overflow)
+  uint32_t delta;  // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes)
+} mi_padding_t;
+#define MI_PADDING_SIZE   (sizeof(mi_padding_t))
+#define MI_PADDING_WSIZE  ((MI_PADDING_SIZE + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE)
+#else
+#define MI_PADDING_SIZE   0
+#define MI_PADDING_WSIZE  0
+#endif
+
+#define MI_PAGES_DIRECT   (MI_SMALL_WSIZE_MAX + MI_PADDING_WSIZE + 1)
+
+
+// A thread-local heap ("theap") owns a set of thread-local pages.
+struct mi_theap_s {
+  mi_tld_t*             tld;                                 // thread-local data
+  mi_heap_t*            heap;                                // the heap this theap belongs to.
+  unsigned long long    heartbeat;                           // monotonic heartbeat count
+  uintptr_t             cookie;                              // random cookie to verify pointers (see `_mi_ptr_cookie`)
+  mi_random_ctx_t       random;                              // random number context used for secure allocation
+  size_t                page_count;                          // total number of pages in the `pages` queues.
+  size_t                page_retired_min;                    // smallest retired index (retired pages are fully free, but still in the page queues)
+  size_t                page_retired_max;                    // largest retired index into the `pages` array.
+  long                  generic_count;                       // how often is `_mi_malloc_generic` called?
+  long                  generic_collect_count;               // how often is `_mi_malloc_generic` called without collecting?
+
+  mi_theap_t*           tnext;                               // list of theaps in this thread
+  mi_theap_t*           tprev;
+  mi_theap_t*           hnext;                               // list of theaps of the owning `heap`
+  mi_theap_t*           hprev;
+
+  long                  page_full_retain;                    // how many full pages can be retained per queue (before abandoning them)
+  bool                  allow_page_reclaim;                  // `true` if this theap can reclaim abandoned pages
+  bool                  allow_page_abandon;                  // `true` if this theap can abandon pages to reduce memory footprint
+  #if MI_GUARDED
+  size_t                guarded_size_min;                    // minimal size for guarded objects
+  size_t                guarded_size_max;                    // maximal size for guarded objects
+  size_t                guarded_sample_rate;                 // sample rate (set to 0 to disable guarded pages)
+  size_t                guarded_sample_count;                // current sample count (counting down to 0)
+  #endif
+  mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points to a page with possibly free blocks in the corresponding queue for that size.
+  mi_page_queue_t       pages[MI_BIN_COUNT];                 // queue of pages for each size class (or "bin")
+  mi_memid_t            memid;                               // provenance of the theap struct itself (meta or os)
+  mi_stats_t            stats;                               // thread-local statistics
+};
+
+
+
+
+// ------------------------------------------------------
+// Heaps contain allocated blocks. Heaps are self-contained
+// but share the (sub-process) memory in the arena's.
+// ------------------------------------------------------
+
+// Keep track of all owned and abandoned pages in the arena's
+struct mi_arena_pages_s;
+typedef struct mi_arena_pages_s mi_arena_pages_t;
+
+#define MI_MAX_ARENAS   (160)   // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`)
+                                // 160 arenas is enough for ~2 TiB memory
+
+// A dynamic thread-local variable; 0 for an invalid thread-local
+typedef size_t mi_thread_local_t;
+
+typedef struct mi_heap_s {
+  mi_subproc_t*         subproc;                        // a heap belongs to a subprocess
+  size_t                heap_seq;                       // unique sequence number for heaps in this subprocess
+  mi_heap_t*            next;                           // list of heaps in this subprocess
+  mi_heap_t*            prev;
+  mi_thread_local_t     theap;                          // dynamic thread local for the thread-local theaps of this heap
+
+  mi_arena_t*           exclusive_arena;                // if the heap should only allocate from a specific arena (or NULL)
+  int                   numa_node;                      // if >=0, prefer this numa node for allocations
+
+  mi_theap_t*           theaps;                         // list of all thread-local theaps belonging to this heap (using the `hnext`/`hprev` fields)
+  mi_lock_t             theaps_lock;                    // lock for the theaps list operations
+
+  _Atomic(size_t)       abandoned_count[MI_BIN_COUNT];  // total count of abandoned pages in this heap
+  mi_page_t*            os_abandoned_pages;             // list of pages that are OS allocated and not in an arena
+  mi_lock_t             os_abandoned_pages_lock;        // lock for the os abandoned pages list (this lock protects list operations)
+
+  _Atomic(mi_arena_pages_t*) arena_pages[MI_MAX_ARENAS]; // track owned and abandoned pages in the arenas (entries can be NULL)
+  mi_lock_t             arena_pages_lock;                // lock to update the arena_pages array
+
+  mi_stats_t            stats;                           // statistics for this heap; periodically updated by merging from each theap
+} mi_heap_t;
+
+
+// ------------------------------------------------------
+// Sub processes do not reclaim or visit pages from other sub processes.
+// These are essentially the static variables of a process, and
+// usually there is only one subprocess. This can be used for example
+// by CPython to have separate interpreters within one process.
+// Each thread can only belong to one subprocess
+// (and needs to call `mi_subproc_add_current_thread` before any allocations).
+// ------------------------------------------------------
+
+struct mi_subproc_s {
+  size_t                subproc_seq;                    // unique id for sub-processes
+  mi_subproc_t*         next;                           // list of all sub-processes
+  mi_subproc_t*         prev;
+
+  _Atomic(size_t)       arena_count;                    // current count of arena's
+  _Atomic(mi_arena_t*)  arenas[MI_MAX_ARENAS];          // arena's of this sub-process
+  mi_lock_t             arena_reserve_lock;             // lock to ensure arena's get reserved one at a time
+  mi_decl_align(8)                                      // needed on some 32-bit platforms
+  _Atomic(int64_t)      purge_expire;                   // expiration is set if any arenas can be purged
+
+  _Atomic(mi_heap_t*)   heap_main;                      // main heap for this sub process
+  mi_heap_t*            heaps;                          // heaps belonging to this sub-process
+  mi_lock_t             heaps_lock;
+
+  _Atomic(size_t)       thread_count;                   // current threads associated with this sub-process
+  _Atomic(size_t)       thread_total_count;             // total created threads associated with this sub-process
+  _Atomic(size_t)       heap_count;                     // current heaps in this sub-process (== |heaps|)
+  _Atomic(size_t)       heap_total_count;               // total created heaps in this sub-process
+
+  mi_memid_t            memid;                          // provenance of this memory block (meta or static)
+  mi_decl_align(8)                                      // needed on some 32-bit platforms
+  mi_stats_t            stats;                          // subprocess statistics; updated for arena/OS stats like committed,
+                                                        // and otherwise merged with heap stats when those are deleted
+};
+
+
+// ------------------------------------------------------
+// Thread Local data
+// ------------------------------------------------------
+
+// Milliseconds as in `int64_t` to avoid overflows
+typedef int64_t  mi_msecs_t;
+
+// Thread local data
+struct mi_tld_s {
+  mi_threadid_t         thread_id;            // thread id of this thread
+  size_t                thread_seq;           // thread sequence id (linear count of created threads)
+  int                   numa_node;            // thread preferred numa node
+  mi_subproc_t*         subproc;              // sub-process this thread belongs to.
+  mi_theap_t*           theaps;               // list of theaps in this thread (so we can abandon all when the thread terminates)
+  bool                  recurse;              // true if deferred was called; used to prevent infinite recursion.
+  bool                  is_in_threadpool;     // true if this thread is part of a threadpool (and can run arbitrary tasks)
+  mi_memid_t            memid;                // provenance of the tld memory itself (meta or OS)
+};
+
+
+/* ----------------------------------------------------------------------------
+  Arenas are fixed area's of OS memory from which we can allocate
+  large blocks (>= MI_ARENA_MIN_BLOCK_SIZE).
+  In contrast to the rest of mimalloc, the arenas are shared between
+  threads and need to be accessed using atomic operations (using atomic `mi_bitmap_t`'s).
+
+  Arenas are also used for huge OS page (1GiB) reservations or for reserving
+  OS memory upfront which can improve performance or is sometimes needed
+  on embedded devices. We can also employ this with WASI or `sbrk` systems
+  to reserve large arenas upfront and be able to reuse the memory more effectively.
+-----------------------------------------------------------------------------*/
+
+#define MI_ARENA_BIN_COUNT      (MI_BIN_COUNT)
+#define MI_ARENA_MIN_SIZE       (MI_BCHUNK_BITS * MI_ARENA_SLICE_SIZE)           // 32 MiB (or 8 MiB on 32-bit)
+#define MI_ARENA_MAX_SIZE       (MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE)
+
+typedef struct mi_bitmap_s  mi_bitmap_t;    // atomic bitmap  (defined in `src/bitmap.h`)
+typedef struct mi_bbitmap_s mi_bbitmap_t;   // atomic binned bitmap (defined in `src/bitmap.h`)
+
+typedef struct mi_arena_pages_s {
+  mi_bitmap_t* pages;                // all registered pages (abandoned and owned)
+  mi_bitmap_t* pages_abandoned[MI_ARENA_BIN_COUNT];  // abandoned pages per size bin (a set bit means the start of the page)
+  // followed by the bitmaps (whose sizes depend on the arena size)
+} mi_arena_pages_t;
+
+
+// A memory arena
+typedef struct mi_arena_s {
+  mi_memid_t          memid;                // provenance of the memory area
+  mi_subproc_t*       subproc;              // subprocess this arena belongs to (`this 'element-of' this->subproc->arenas`)
+  size_t              arena_idx;            // index in the arenas array
+
+  size_t              slice_count;          // total size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`)
+  size_t              info_slices;          // initial slices reserved for the arena bitmaps
+  int                 numa_node;            // associated NUMA node
+  bool                is_exclusive;         // only allow allocations if specifically for this arena
+  mi_decl_align(8)                          // needed on some 32-bit platforms
+  _Atomic(mi_msecs_t) purge_expire;         // expiration time when slices can be purged from `slices_purge`.
+  mi_commit_fun_t*    commit_fun;           // custom commit/decommit memory
+  void*               commit_fun_arg;       // user argument for a custom commit function
+
+  size_t              total_size;           // for (user given) memory more than MI_ARENA_MAX_SIZE, we use N arena's to cover it. The first (parent) has the total size (and the other sub-arena's 0).
+  mi_arena_t*         parent;               // if this is a sub arena, this points to the first one in the memory area.
+
+  mi_bbitmap_t*       slices_free;          // is the slice free? (a binned bitmap with size classes)
+  mi_bitmap_t*        slices_committed;     // is the slice committed? (i.e. accessible)
+  mi_bitmap_t*        slices_dirty;         // is the slice potentially non-zero?
+  mi_bitmap_t*        slices_purge;         // slices that can be purged
+  mi_arena_pages_t    pages_main;           // arena page bitmaps for the main heap are allocated up front as well
+
+  // followed by the bitmaps (whose sizes depend on the arena size)
+  // note: when adding bitmaps revise `mi_arena_info_slices_needed`
+} mi_arena_t;
+
+
+
+/* -----------------------------------------------------------
+  Error codes passed to `_mi_fatal_error`
+  All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
+  For portability define undefined error codes using common Unix codes:
+  <https://www-numi.fnal.gov/offline_software/srt_public_context/WebDocs/Errors/unix_system_errors.html>
+----------------------------------------------------------- */
+
+#ifndef EAGAIN         // double free
+#define EAGAIN (11)
+#endif
+#ifndef ENOMEM         // out of memory
+#define ENOMEM (12)
+#endif
+#ifndef EFAULT         // corrupted free-list or meta-data
+#define EFAULT (14)
+#endif
+#ifndef EINVAL         // trying to free an invalid pointer
+#define EINVAL (22)
+#endif
+#ifndef EOVERFLOW      // count*size overflow
+#define EOVERFLOW (75)
+#endif
+
+/* -----------------------------------------------------------
+  Debug constants
+----------------------------------------------------------- */
+
+#if !defined(MI_DEBUG_UNINIT)
+#define MI_DEBUG_UNINIT     (0xD0)
+#endif
+#if !defined(MI_DEBUG_FREED)
+#define MI_DEBUG_FREED      (0xDF)
+#endif
+#if !defined(MI_DEBUG_PADDING)
+#define MI_DEBUG_PADDING    (0xDE)
+#endif
+
+
+#endif // MI_TYPES_H
diff --git a/ext/src/mimalloc/mimalloc.pc.in b/ext/src/mimalloc/mimalloc.pc.in
new file mode 100644
index 0000000000..80922256ae
--- /dev/null
+++ b/ext/src/mimalloc/mimalloc.pc.in
@@ -0,0 +1,11 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@mi_pc_libdir@
+includedir=@mi_pc_includedir@
+
+Name: @PROJECT_NAME@
+Description: A compact general purpose allocator with excellent performance
+Version: @PACKAGE_VERSION@
+URL: https://github.com/microsoft/mimalloc/
+Libs: -L${libdir} -l@mi_libname@
+Libs.private: @mi_pc_libraries@
+Cflags: -I${includedir}
diff --git a/ext/src/mimalloc/readme.md b/ext/src/mimalloc/readme.md
index dc22597be1..3c312d2dcd 100644
--- a/ext/src/mimalloc/readme.md
+++ b/ext/src/mimalloc/readme.md
@@ -1,7 +1,7 @@
 
 <img align="left" width="100" height="100" src="doc/mimalloc-logo.png"/>
 
-[<img align="right" src="https://dev.azure.com/Daan0324/mimalloc/_apis/build/status/microsoft.mimalloc?branchName=dev"/>](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
+[<img align="right" src="https://dev.azure.com/Daan0324/mimalloc/_apis/build/status/microsoft.mimalloc?branchName=dev3"/>](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
 
 # mimalloc
 
@@ -9,24 +9,30 @@
 
 mimalloc (pronounced "me-malloc")
 is a general purpose allocator with excellent [performance](#performance) characteristics.
-Initially developed by Daan Leijen for the run-time systems of the
+Initially developed by Daan Leijen for the runtime systems of the
 [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
 
-Latest release tag: `v2.0.6` (2022-04-14).  
-Latest stable  tag: `v1.7.6` (2022-02-14).
+Latest release   : `v3.2.8` (2026-02-03) release candidate 3, please report any issues.  
+Latest v2 release: `v2.2.7` (2026-01-15).  
+Latest v1 release: `v1.9.7` (2026-01-15).
 
 mimalloc is a drop-in replacement for `malloc` and can be used in other programs
 without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
 ```
 > LD_PRELOAD=/usr/lib/libmimalloc.so  myprogram
 ```
-It also has an easy way to override the default allocator in [Windows](#override_on_windows). Notable aspects of the design include:
+It also includes a way to dynamically override the default allocator in [Windows](#override_on_windows). 
+Notable aspects of the design include:
 
-- __small and consistent__: the library is about 8k LOC using simple and
+- __small and consistent__: the library is about 10k LOC using simple and
   consistent data structures. This makes it very suitable
   to integrate and adapt in other projects. For runtime systems it
   provides hooks for a monotonic _heartbeat_ and deferred freeing (for
   bounded worst-case times with reference counting).
+  Partly due to its simplicity, mimalloc has been ported to many systems (Windows, macOS,
+  Linux, WASM, various BSD's, Haiku, MUSL, etc) and has excellent support for dynamic overriding.
+  At the same time, it is an industrial strength allocator that runs (very) large scale
+  distributed services on thousands of machines with excellent worst case latencies.
 - __free list sharding__: instead of one big free list (per size class) we have
   many smaller lists per "mimalloc page" which reduces fragmentation and
   increases locality --
@@ -36,13 +42,13 @@ It also has an easy way to override the default allocator in [Windows](#override
   per mimalloc page, but for each page we have multiple free lists. In particular, there
   is one list for thread-local `free` operations, and another one for concurrent `free`
   operations. Free-ing from another thread can now be a single CAS without needing
-  sophisticated coordination between threads. Since there will be 
+  sophisticated coordination between threads. Since there will be
   thousands of separate free lists, contention is naturally distributed over the heap,
   and the chance of contending on a single location will be low -- this is quite
   similar to randomized algorithms like skip lists where adding
   a random oracle removes the need for a more complex algorithm.
-- __eager page reset__: when a "page" becomes empty (with increased chance
-  due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged")
+- __eager page purging__: when a "page" becomes empty (with increased chance
+  due to free list sharding) the memory is marked to the OS as unused (reset or decommitted)
   reducing (real) memory pressure and fragmentation, especially in long running
   programs.
 - __secure__: _mimalloc_ can be built in secure mode, adding guard pages,
@@ -50,71 +56,81 @@ It also has an easy way to override the default allocator in [Windows](#override
   heap vulnerabilities. The performance penalty is usually around 10% on average
   over our benchmarks.
 - __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions.
-  A heap can be destroyed at once instead of deallocating each object separately.  
+  A heap can be destroyed at once instead of deallocating each object separately.
+  New: v3 has true first-class heaps where one can allocate in a heap from any thread.   
 - __bounded__: it does not suffer from _blowup_ \[1\], has bounded worst-case allocation
-  times (_wcat_), bounded space overhead (~0.2% meta-data, with low internal fragmentation),
-  and has no internal points of contention using only atomic operations.
+  times (_wcat_) (up to OS primitives), bounded space overhead (~0.2% meta-data, with low
+  internal fragmentation), and has no internal points of contention using only atomic operations.
 - __fast__: In our benchmarks (see [below](#performance)),
   _mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc),
-  and often uses less memory. A nice property
-  is that it does consistently well over a wide range of benchmarks. There is also good huge OS page
-  support for larger server programs.
+  and often uses less memory. A nice property is that it does consistently well over a wide range
+  of benchmarks. There is also good huge OS page support for larger server programs.
 
 The [documentation](https://microsoft.github.io/mimalloc) gives a full overview of the API.
-You can read more on the design of _mimalloc_ in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results.   
+You can read more on the design of mimalloc in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results.
 
-Enjoy!  
+Enjoy!
 
-### Branches
+### Versions
 
-* `master`: latest stable release (based on `dev-slice`).
-* `dev`: development branch for mimalloc v1. Use this branch for submitting PR's.
-* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev`.
+There are three maintained versions of mimalloc. These are mostly equal except for how the OS memory is handled. 
+New development is mostly on v3, while v1 and v2 are maintained with security and bug fixes. 
 
-### Releases
-
-Note: the `v2.x` version has a new algorithm for managing internal mimalloc pages that tends to use reduce memory usage
-  and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance
-  (see [below](#performance)); please report if you observe any significant performance regression.
-
-* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation
-  even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix
-  warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object
-  allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes.
-
-* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on
-  Windows 11, fix compilation with musl, potentially reduced
-  committed memory, add `bin/minject` for Windows, 
-  improved wasm support, faster aligned allocation,
-  various small fixes.
-
-* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including
-  M1), improved performance for v2 for large objects, Python integration improvements, more standard
-  installation directories, various small fixes.
+- __v1__: initial design of mimalloc (release tags: `v1.9.x`, development branch `dev`). Send PR's against this version if possible.
+- __v2__: main mimalloc version. Uses thread-local segments to reduce fragmentation. (release tags: `v2.2.x`, development branch `dev2` and `main`)
+- __v3__: simplifies the lock-free design of previous versions and improves sharing of 
+        memory between threads. On certain large workloads this version may use 
+        (much) less memory. Also supports true first-class heaps (that can allocate from any thread) 
+        and has more efficient heap-walking (for the CPython GC for example).
+        (release tags: `v3.2.x`, development branch `dev3`).
 
-* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix
-  thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes.
+### Releases
 
-* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental).
-  
-* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages.
-  
-* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics,
-  improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes.
+* 2026-02-03, `v3.2.8` (rc3): Fix thread reinitialize issue on macOS. Fix SIMD codegen bug on older
+  GCC versions. Extend Windows TLS slot limit from 64 to 1088. Report commit statistics more precisely.
+  Fixes issue in free-page search in arenas.
+* 2026-01-15, `v1.9.7`, `v2.2.7`, `v3.2.7` (rc2): Fix zero initializing blocks that were OS allocated.  
+  For v3 various bug and performance fixes. Fix Debian 32-bit compilation.
+* 2026-01-08, `v1.9.6`, `v2.2.6`, `v3.2.6` (rc1): Important bug fixes. Many improvements to v3 including 
+  true first-class heaps where one can allocate in a heap from any thread, and track statistics per heap as well.
+  Added `MIMALLOC_ALLOW_THP` option. This is by default enabled except on Android. When THP is detected on v3,
+  mimalloc will set the `MIMALLOC_MINIMAL_PURGE_SIZE` to 2MiB to avoid breaking up potential THP huge pages.
+  v3 uses faster TLS access on Windows, and has improved performance for `mi_calloc` and aligned allocations.
+  Fixed rare race condition on older v3, fixed potential buffer overflow in debug statistics, add API for returning
+  allocated sizes on allocation and free.
+* 2025-06-09, `v1.9.4`, `v2.2.4`, `v3.1.4` (beta) : Some important bug fixes, including a case where OS memory
+  was not always fully released. Improved v3 performance, build on XBox, fix build on Android, support interpose 
+  for older macOS versions, use MADV_FREE_REUSABLE on macOS, always check commit success, better support for Windows 
+  fixed TLS offset, etc.
+* 2025-03-28, `v1.9.3`, `v2.2.3`, `v3.0.3` (beta) : Various small bug and build fixes, including:
+  fix arm32 pre v7 builds, fix mingw build, get runtime statistics, improve statistic commit counts, 
+  fix execution on non BMI1 x64 systems. 
+* 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes. 
+  Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`. 
+  Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile. 
+  Upgrade mimalloc-redirect to v1.3.2. `MI_OPT_ARCH` is off by default now but still assumes armv8.1-a on arm64
+  for fast atomic operations. Add QNX support.
+* 2025-01-03, `v1.8.9`, `v2.1.9`, `v3.0.1-alpha`: Interim release. Support Windows arm64. New [guarded](#guarded) build that can place OS 
+  guard pages behind objects to catch buffer overflows as they occur. 
+  Many small fixes: build on Windows arm64, cygwin, riscV, and dragonfly; fix Windows static library initialization to account for
+  thread local destructors (in Rust/C++); macOS tag change; macOS TLS slot fix; improve stats; 
+  consistent `mimalloc.dll` on Windows (instead of `mimalloc-override.dll`); fix mimalloc-redirect on Win11 H2; 
+  add 0-byte to canary; upstream CPython fixes; reduce .bss size; allow fixed TLS slot on Windows for improved performance.
 
 * [Older release notes](#older-release-notes)
 
 Special thanks to:
 
+* Sergiy Kuryata for his contributions on reducing memory commit -- especially on Windows with the Windows thread pool (now implemented in v3).
 * [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his many contributions, and making
   mimalloc work better on many less common operating systems, like Haiku, Dragonfly, etc.
 * Mary Feofanova (@mary3000), Evgeniy Moiseenko, and Manuel Pöter (@mpoeter) for making mimalloc TSAN checkable, and finding
   memory model bugs using the [genMC] model checker.
 * Weipeng Liu (@pongba), Zhuowei Li, Junhua Wang, and Jakub Szymanski, for their early support of mimalloc and deployment
   at large scale services, leading to many improvements in the mimalloc algorithms for large workloads.
-* Jason Gibson (@jasongibson) for exhaustive testing on large scale workloads and server environments, and finding complex bugs 
+* Jason Gibson (@jasongibson) for exhaustive testing on large scale workloads and server environments, and finding complex bugs
   in (early versions of) `mimalloc`.
-* Manuel Pöter (@mpoeter) and Sam Gross(@colesbury) for finding an ABA concurrency issue in abandoned segment reclamation. Sam also created the [no GIL](https://github.com/colesbury/nogil) Python fork which 
+* Manuel Pöter (@mpoeter) and Sam Gross(@colesbury) for finding an ABA concurrency issue in abandoned segment reclamation. Sam also created the [no GIL](https://github.com/colesbury/nogil) Python fork which
   uses mimalloc internally.
 
 
@@ -128,21 +144,21 @@ mimalloc is used in various large scale low-latency services and programs, for e
 <a href="https://azure.microsoft.com/"><img height="50" align="left" src="https://upload.wikimedia.org/wikipedia/commons/a/a8/Microsoft_Azure_Logo.svg"></a>
 <a href="https://deathstrandingpc.505games.com"><img height="100" src="doc/ds-logo.png"></a>
 <a href="https://docs.unrealengine.com/4.26/en-US/WhatsNew/Builds/ReleaseNotes/4_25/"><img height="100" src="doc/unreal-logo.svg"></a>
-<a href="https://github.com/ablab/spades"><img height="100" src="doc/spades-logo.png"></a>
+<a href="https://cab.spbu.ru/software/spades/"><img height="100" src="doc/spades-logo.png"></a>
 
 
 # Building
 
 ## Windows
 
-Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build.
-The `mimalloc` project builds a static library (in `out/msvc-x64`), while the
-`mimalloc-override` project builds a DLL for overriding malloc
+Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build.
+The `mimalloc-lib` project builds a static library (in `out/msvc-x64`), while the
+`mimalloc-override-dll` project builds a DLL for overriding malloc
 in the entire program.
 
-## macOS, Linux, BSD, etc.
+## Linux, macOS, BSD, etc.
 
-We use [`cmake`](https://cmake.org)<sup>1</sup> as the build system:
+We use [`cmake`](https://cmake.org) as the build system:
 
 ```
 > mkdir -p out/release
@@ -165,32 +181,58 @@ maintains detailed statistics as:
 > cmake -DCMAKE_BUILD_TYPE=Debug ../..
 > make
 ```
+
 This will name the shared library as `libmimalloc-debug.so`.
 
-Finally, you can build a _secure_ version that uses guard pages, encrypted
-free lists, etc., as:
+Finally, you can build a _secure_ version that uses guard pages, encrypted free lists, etc., as:
+
 ```
 > mkdir -p out/secure
 > cd out/secure
 > cmake -DMI_SECURE=ON ../..
 > make
 ```
+
 This will name the shared library as `libmimalloc-secure.so`.
-Use `ccmake`<sup>2</sup> instead of `cmake`
-to see and customize all the available build options.
+Use `cmake ../.. -LH` to see all the available build options.
+
+The examples use the default compiler. If you would like to use another, use:
+
+```
+> CC=clang CXX=clang++ cmake ../..
+```
 
-Notes:
-1. Install CMake: `sudo apt-get install cmake`
-2. Install CCMake: `sudo apt-get install cmake-curses-gui`
+## Cmake with Visual Studio
 
+You can also use cmake on Windows. Open a Visual Studio 2022 development prompt 
+and invoke `cmake` with the right [generator](https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2017%202022.html) 
+and architecture, like:
 
-## Single source
+```
+> cmake ..\.. -G "Visual Studio 17 2022" -A x64 -DMI_OVERRIDE=ON
+```
+
+The cmake build type is specified when actually building, for example:
+
+```
+> cmake --build . --config Release
+```
+
+You can also install the [LLVM toolset](https://learn.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170#install-1) 
+on Windows to build with the `clang-cl` compiler directly:
+
+```
+> cmake ../.. -G "Visual Studio 17 2022" -T ClangCl
+```
+
+
+## Single Source
 
 You can also directly build the single `src/static.c` file as part of your project without
 needing `cmake` at all. Make sure to also add the mimalloc `include` directory to the include path.
 
 
-# Using the library
+# Using the Library
 
 The preferred usage is including `<mimalloc.h>`, linking with
 the shared- or static library, and using the `mi_malloc` API exclusively for allocation. For example,
@@ -202,7 +244,7 @@ mimalloc uses only safe OS calls (`mmap` and `VirtualAlloc`) and can co-exist
 with other allocators linked to the same program.
 If you use `cmake`, you can simply use:
 ```
-find_package(mimalloc 1.4 REQUIRED)
+find_package(mimalloc 1.8 REQUIRED)
 ```
 in your `CMakeLists.txt` to find a locally installed mimalloc. Then use either:
 ```
@@ -215,8 +257,8 @@ target_link_libraries(myapp PUBLIC mimalloc-static)
 to link with the static library. See `test\CMakeLists.txt` for an example.
 
 For best performance in C++ programs, it is also recommended to override the
-global `new` and `delete` operators. For convience, mimalloc provides
-[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project.
+global `new` and `delete` operators. For convenience, mimalloc provides
+[`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project.
 In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator`
 interface.
 
@@ -227,33 +269,52 @@ and statistics (`MIMALLOC_SHOW_STATS=1`) (in the debug version):
 
 175451865205073170563711388363 = 374456281610909315237213 * 468551
 
-heap stats:     peak      total      freed       unit
-normal   2:    16.4 kb    17.5 mb    17.5 mb      16 b   ok
-normal   3:    16.3 kb    15.2 mb    15.2 mb      24 b   ok
-normal   4:      64 b      4.6 kb     4.6 kb      32 b   ok
-normal   5:      80 b    118.4 kb   118.4 kb      40 b   ok
-normal   6:      48 b       48 b       48 b       48 b   ok
-normal  17:     960 b      960 b      960 b      320 b   ok
-
-heap stats:     peak      total      freed       unit
-    normal:    33.9 kb    32.8 mb    32.8 mb       1 b   ok
-      huge:       0 b        0 b        0 b        1 b   ok
-     total:    33.9 kb    32.8 mb    32.8 mb       1 b   ok
-malloc requested:         32.8 mb
-
- committed:    58.2 kb    58.2 kb    58.2 kb       1 b   ok
-  reserved:     2.0 mb     2.0 mb     2.0 mb       1 b   ok
-     reset:       0 b        0 b        0 b        1 b   ok
-  segments:       1          1          1
--abandoned:       0
-     pages:       6          6          6
--abandoned:       0
-     mmaps:       3
- mmap fast:       0
- mmap slow:       1
-   threads:       0
-   elapsed:     2.022s
-   process: user: 1.781s, system: 0.016s, faults: 756, reclaims: 0, rss: 2.7 mb
+subproc 0
+ blocks          peak       total     current       block      total#
+  bin S    4:    75.3 KiB    55.2 MiB     0          32   B       1.8 M    ok
+  bin S    6:    31.0 KiB   180.4 KiB     0          48   B       3.8 K    ok
+  bin S    8:    64   B      64   B       0          64   B       1        ok
+  bin S    9:   160   B     160   B       0          80   B       2        ok
+  bin S   17:     1.2 KiB     1.2 KiB     0         320   B       4        ok
+  bin S   21:   640   B       3.1 KiB     0         640   B       5        ok
+  bin S   33:     5.0 KiB     5.0 KiB     0           5.0 KiB     1        ok
+
+  binned    :    84.2 Ki     41.5 Mi      0                                ok
+  huge      :     0           0           0                                ok
+  total     :    84.2 KiB    41.5 MiB     0
+  malloc req:                29.7 MiB
+
+ pages           peak       total     current       block      total#
+  touched   :   152.8 KiB   152.8 KiB   152.8 KiB
+  pages     :     8          14           0                                ok
+  abandoned :     1         249           0                                ok
+  reclaima  :     0
+  reclaimf  :   249
+  reabandon :     0
+  waits     :     0
+  extended  :    38
+  retire    :    35
+  searches  :     0.7 avg
+
+ arenas          peak       total     current       block      total#
+  reserved  :     1.0 GiB     1.0 GiB     1.0 GiB
+  committed :     4.8 MiB     4.8 MiB     4.4 MiB
+  reset     :     0
+  purged    :   385.5 Ki
+  arenas    :     1
+  rollback  :     0
+  mmaps     :     3
+  commits   :     0
+  resets    :     1
+  purges    :     2
+  guarded   :     0
+  heaps     :     1           1           1
+
+ process         peak       total     current       block      total#
+  threads   :     1           1           1
+  numa nodes:     1
+  elapsed   :     0.553 s
+  process   : user: 0.557 s, system: 0.013 s, faults: 29, peak rss: 2.1 MiB, peak commit: 4.8 MiB
 ```
 
 The above model of using the `mi_` prefixed API is not always possible
@@ -263,46 +324,57 @@ completely and redirect all calls to the _mimalloc_ library instead .
 
 ## Environment Options
 
-You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)),
-or via environment variables:
+You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), or via environment variables:
 
 - `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates.
-- `MIMALLOC_VERBOSE=1`: show verbose messages.
+- `MIMALLOC_VERBOSE=1`: show verbose messages (including statistics).
 - `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages.
-- `MIMALLOC_PAGE_RESET=0`: by default, mimalloc will reset (or purge) OS pages that are not in use, to signal to the OS
-   that the underlying physical memory can be reused. This can reduce memory fragmentation in long running (server)
-   programs. By setting it to `0` this will no longer be done which can improve performance for batch-like programs.
-   As an alternative, the `MIMALLOC_RESET_DELAY=`<msecs> can be set higher (100ms by default) to make the page
-   reset occur less frequently instead of turning it off completely.
+
+Advanced options:
+
+- `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc
+   allocates segments and pages. Set this to 2 (default) to
+   only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems
+   as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once).
+   Note that eager commit only increases the commit but not the actual peak resident set
+   (rss) so it is generally ok to enable this.
+- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milliseconds (by default `1000` in v3) after which mimalloc will purge
+   OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which
+   can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when
+   a page becomes unused which can improve memory usage but also decreases performance.
+   Setting it to `-1` disables purging completely.
+- `MIMALLOC_PURGE_DECOMMITS=1`: By default "purging" memory means unused memory is decommitted (`MEM_DECOMMIT` on Windows,
+   `MADV_DONTNEED` (which decreases rss immediately) on `mmap` systems). Set this to 0 to instead "reset" unused
+   memory on a purge (`MEM_RESET` on Windows, generally `MADV_FREE` (which does not decrease rss immediately) on `mmap` systems).
+   Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual
+   address ranges and decommits within those ranges (to make the underlying physical memory available to other processes).
+
+Further options for large workloads and services:
+
+- `MIMALLOC_ALLOW_THP=1`: By default, transparent huge pages (THP) are always allowed on Linux systems; only on Android is this
+   off by default. When set to `0`, THP is disabled for the process that mimalloc runs in. If enabled, mimalloc also sets
+   the `MIMALLOC_MINIMAL_PURGE_SIZE` in v3 to 2MiB to avoid potentially breaking up transparent huge pages.
 - `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected
    at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than
    the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA
    nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed).
-- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly
-   improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs
-   to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes
+- `MIMALLOC_ALLOW_LARGE_OS_PAGES=0`: Set to 1 to use large OS pages (2 or 4MiB) when available; for some workloads this can
+   significantly improve performance. However, large OS pages cannot be purged or shared with other processes so may lead
+   to increased memory usage in some cases.
+   Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs
+   to explicitly give permissions for large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes
    the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that
    can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible).
-   <!--
-   - `MIMALLOC_EAGER_REGION_COMMIT=1`: on Windows, commit large (256MiB) regions eagerly. On Windows, these regions
-   show in the working set even though usually just a small part is committed to physical memory. This is why it
-   turned off by default on Windows as it looks not good in the task manager. However, turning it on has no
-   real drawbacks and may improve performance by a little.
-   -->
-- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB _huge_ OS pages. This reserves the huge pages at
+- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at
    startup and sometimes this can give a large (latency) performance improvement on big workloads.
-   Usually it is better to not use
-   `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
+   Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. Just like large
+   OS pages, use with care as reserving
    contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at
    startup only once).
-   Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])).
-   With huge OS pages, it may be beneficial to set the setting
-   `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB)
-   of a thread to not allocate in the huge OS pages; this prevents threads that are short lived
-   and allocate just a little to take up space in the huge OS page area (which cannot be reset).
+   Note that we usually need to explicitly give permission for huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]).
    The huge pages are usually allocated evenly among NUMA nodes.
-   We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all 
-   the huge pages at a specific numa node instead. 
+   We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all
+   the huge pages at a specific numa node instead.
 
 Use caution when using `fork` in combination with either large or huge OS pages: on a fork, the OS uses copy-on-write
 for all pages in the original process including the huge OS pages. When any memory is now written in that area, the
@@ -330,13 +402,39 @@ As always, evaluate with care as part of an overall security strategy as all of
 
 ## Debug Mode
 
-When _mimalloc_ is built using debug mode, various checks are done at runtime to catch development errors.
+When _mimalloc_ is built using debug mode (`-DCMAKE_BUILD_TYPE=Debug`),
+various checks are done at runtime to catch development errors.
 
 - Statistics are maintained in detail for each object size. They can be shown using `MIMALLOC_SHOW_STATS=1` at runtime.
 - All objects have padding at the end to detect (byte precise) heap block overflows.
 - Double free's, and freeing invalid heap pointers are detected.
 - Corrupted free-lists and some forms of use-after-free are detected.
 
+## Guarded Mode
+
+<span id="guarded">_mimalloc_ can be built in guarded mode using the `-DMI_GUARDED=ON` flag in `cmake`.</span>
+This enables placing OS guard pages behind certain object allocations to catch buffer overflows as they occur.
+This can be invaluable to catch buffer-overflow bugs in large programs. However, it also means that any object
+allocated with a guard page takes at least 8 KiB memory for the guard page and its alignment. As such, allocating
+a guard page for every allocation may be too expensive both in terms of memory, and in terms of performance with
+many system calls. Therefore, there are various environment variables (and options) to tune this:
+
+- `MIMALLOC_GUARDED_SAMPLE_RATE=N`: Set the sample rate to `N` (by default 4000). This mode places a guard page
+  behind every `N` suitable object allocations (per thread). Since the performance in guarded mode without placing
+  guard pages is close to release mode, this can be used to enable guard pages even in production to catch latent 
+  buffer overflow bugs. Set the sample rate to `1` to guard every object, and to `0` to place no guard pages at all.
+
+- `MIMALLOC_GUARDED_SAMPLE_SEED=N`: Start sampling at `N` (by default random). Can be used to reproduce a buffer
+  overflow if needed.
+
+- `MIMALLOC_GUARDED_MIN=N`, `MIMALLOC_GUARDED_MAX=N`: Minimal and maximal _rounded_ object sizes for which a guard 
+  page is considered (`0` and `1GiB` respectively). If you suspect a buffer overflow occurs with an object of size
+  141, set the minimum and maximum to `148` and the sample rate to `1` to have all of those guarded.
+
+- `MIMALLOC_GUARDED_PRECISE=1`: If we have an object of size 13, we would usually place it at an aligned 16 bytes in
+  front of the guard page. Using `MIMALLOC_GUARDED_PRECISE` places it exactly 13 bytes before a page so that even
+  a 1 byte overflow is detected. This violates the C/C++ minimal alignment guarantees though so use with care.
+
 
 # Overriding Standard Malloc
 
@@ -346,7 +444,7 @@ Overriding the standard `malloc` (and `new`) can be done either _dynamically_ or
 
 This is the recommended way to override the standard malloc interface.
 
-### Override on Linux, BSD
+### Dynamic Override on Linux, BSD
 
 On these ELF-based systems we preload the mimalloc shared
 library so all calls to the standard `malloc` interface are
@@ -365,7 +463,7 @@ or run with the debug version to get detailed statistics:
 > env MIMALLOC_SHOW_STATS=1 LD_PRELOAD=/usr/lib/libmimalloc-debug.so myprogram
 ```
 
-### Override on MacOS
+### Dynamic Override on MacOS
 
 On macOS we can also preload the mimalloc shared
 library so all calls to the standard `malloc` interface are
@@ -378,55 +476,163 @@ Note that certain security restrictions may apply when doing this from
 the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash).
 
 
-### Override on Windows
+### Dynamic Override on Windows
 
-<span id="override_on_windows">Overriding on Windows</span> is robust and has the
-particular advantage to be able to redirect all malloc/free calls that go through
-the (dynamic) C runtime allocator, including those from other DLL's or libraries.
+<span id="override_on_windows">We use a separate redirection DLL to override mimalloc on Windows</span> 
+such that we redirect all malloc/free calls that go through the (dynamic) C runtime allocator, 
+including those from other DLL's or libraries. As it intercepts all allocation calls on a low level, 
+it can be used on large programs that include other 3rd party components.
+There are four requirements to make the overriding work well:
 
-The overriding on Windows requires that you link your program explicitly with
-the mimalloc DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
-Also, the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) must be put
-in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency).
-The redirection DLL ensures that all calls to the C runtime malloc API get redirected to
-mimalloc (in `mimalloc-override.dll`).
+1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
 
-To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some
-call to the mimalloc API in the `main` function, like `mi_version()`
-(or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project
-for an example on how to use this. For best performance on Windows with C++, it
-is also recommended to also override the `new`/`delete` operations (by including
-[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) a single(!) source file in your project).
+2. Link your program explicitly with the `mimalloc.dll.lib` export library for the `mimalloc.dll`
+   (which must be compiled with `-DMI_OVERRIDE=ON`; this is the default).
+   To ensure the `mimalloc.dll` is actually loaded at run-time it is easiest 
+   to insert some call to the mimalloc API in the `main` function, like `mi_version()`
+   (or use the `/include:mi_version` switch on the linker command, or
+   similarly, `#pragma comment(linker, "/include:mi_version")` in some source file). 
+   See the `mimalloc-test-override` project for an example on how to use this. 
+
+3. The `mimalloc-redirect.dll` must be put in the same directory as the main 
+   `mimalloc.dll` at runtime (as it is a dependency of that DLL).
+   The redirection DLL ensures that all calls to the C runtime malloc API get 
+   redirected to mimalloc functions (which reside in `mimalloc.dll`).
+
+4. Ensure the `mimalloc.dll` comes as early as possible in the import
+   list of the final executable (so it can intercept all potential allocations).
+   You can use `minject -l <exe>` to check this if needed.
+
+For best performance on Windows with C++, it is also recommended to override
+the `new`/`delete` operations (by including [`mimalloc-new-delete.h`](include/mimalloc-new-delete.h)
+in a single(!) source file in your project).
 
 The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic
-overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected.
+overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully 
+redirected.
 
-(Note: in principle, it is possible to even patch existing executables without any recompilation
-if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll`
-into the import table (and put `mimalloc-redirect.dll` in the same folder)
-Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)).
+For different platforms than x64, you may need a specific [redirection dll](bin).
+Furthermore, we cannot always re-link an executable or ensure `mimalloc.dll` comes
+first in the import table. In such cases the [`minject`](bin) tool can be used
+to patch the executable's import tables.
 
 
 ## Static override
 
 On Unix-like systems, you can also statically link with _mimalloc_ to override the standard
 malloc interface. The recommended way is to link the final program with the
-_mimalloc_ single object file (`mimalloc-override.o`). We use
+_mimalloc_ single object file (`mimalloc.o`). We use
 an object file instead of a library file as linkers give preference to
 that over archives to resolve symbols. To ensure that the standard
 malloc interface resolves to the _mimalloc_ library, link it as the first
 object file. For example:
+
 ```
-> gcc -o myprogram mimalloc-override.o  myfile1.c ...
+> gcc -o myprogram mimalloc.o  myfile1.c ...
 ```
 
 Another way to override statically that works on all platforms, is to
 link statically to mimalloc (as shown in the introduction) and include a
 header file in each source file that re-defines `malloc` etc. to `mi_malloc`.
-This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h). This only works reliably though if all sources are
+This is provided by [`mimalloc-override.h`](include/mimalloc-override.h). This only works 
+reliably though if all sources are
 under your control or otherwise mixing of pointers from different heaps may occur!
 
 
+# Tools
+
+Generally, we recommend using the standard allocator with memory tracking tools, but mimalloc
+can also be built to support the [address sanitizer][asan] or the excellent [Valgrind] tool.
+Moreover, it can be built to support Windows event tracing ([ETW]).
+This has a small performance overhead but does allow detecting memory leaks and byte-precise
+buffer overflows directly on final executables. See also the `test/test-wrong.c` file to test with various tools.
+
+## Valgrind
+
+To build with [valgrind] support, use the `MI_TRACK_VALGRIND=ON` cmake option:
+
+```
+> cmake ../.. -DMI_TRACK_VALGRIND=ON
+```
+
+This can also be combined with secure mode or debug mode.
+You can then run your programs directly under valgrind:
+
+```
+> valgrind <myprogram>
+```
+
+If you rely on overriding `malloc`/`free` by mimalloc (instead of using the `mi_malloc`/`mi_free` API directly),
+you also need to tell `valgrind` to not intercept those calls itself, and use:
+
+```
+> MIMALLOC_SHOW_STATS=1 valgrind  --soname-synonyms=somalloc=*mimalloc* -- <myprogram>
+```
+
+By setting the `MIMALLOC_SHOW_STATS` environment variable you can check that mimalloc is indeed
+used and not the standard allocator. Even though the [Valgrind option][valgrind-soname]
+is called `--soname-synonyms`, this also works when overriding with a static library or object file.
+To dynamically override mimalloc using `LD_PRELOAD` together with `valgrind`, use:
+
+```
+> valgrind --trace-children=yes --soname-synonyms=somalloc=*mimalloc* /usr/bin/env LD_PRELOAD=/usr/lib/libmimalloc.so -- <myprogram>
+```
+
+See also the `test/test-wrong.c` file to test with `valgrind`.
+
+Valgrind support is in its initial development -- please report any issues.
+
+[Valgrind]: https://valgrind.org/
+[valgrind-soname]: https://valgrind.org/docs/manual/manual-core.html#opt.soname-synonyms
+
+## ASAN
+
+To build with the address sanitizer, use the `-DMI_TRACK_ASAN=ON` cmake option:
+
+```
+> cmake ../.. -DMI_TRACK_ASAN=ON
+```
+
+This can also be combined with secure mode or debug mode.
+You can then run your programs as:
+
+```
+> ASAN_OPTIONS=verbosity=1 <myprogram>
+```
+
+When you link a program with an address sanitizer build of mimalloc, you should
+generally compile that program too with the address sanitizer enabled.
+For example, assuming you build mimalloc in `out/debug`:
+
+```
+clang -g -o test-wrong -Iinclude test/test-wrong.c out/debug/libmimalloc-asan-debug.a -lpthread -fsanitize=address -fsanitize-recover=address
+```
+
+Since the address sanitizer redirects the standard allocation functions, on some platforms (macOSX for example)
+it is required to compile mimalloc with `-DMI_OVERRIDE=OFF`.
+Address sanitizer support is in its initial development -- please report any issues.
+
+[asan]: https://github.com/google/sanitizers/wiki/AddressSanitizer
+
+## ETW
+
+Event tracing for Windows ([ETW]) provides a high performance way to capture all allocations through
+mimalloc and analyze them later. To build with ETW support, use the `-DMI_TRACK_ETW=ON` cmake option.
+
+You can then capture an allocation trace using the Windows performance recorder (WPR), using the
+`src/prim/windows/etw-mimalloc.wprp` profile. In an admin prompt, you can use:
+```
+> wpr -start src\prim\windows\etw-mimalloc.wprp -filemode
+> <my_mimalloc_program>
+> wpr -stop <my_mimalloc_program>.etl
+```
+and then open `<my_mimalloc_program>.etl` in the Windows Performance Analyzer (WPA), or
+use a tool like [TraceControl] that is specialized for analyzing mimalloc traces.
+
+[ETW]: https://learn.microsoft.com/en-us/windows-hardware/test/wpt/event-tracing-for-windows
+[TraceControl]: https://github.com/xinglonghe/TraceControl
+
+
 # Performance
 
 Last update: 2021-01-30
@@ -465,7 +671,7 @@ as [mimalloc-bench](https://github.com/daanx/mimalloc-bench).
 ## Benchmark Results on a 16-core AMD 5950x (Zen3)
 
 Testing on the 16-core AMD 5950x processor at 3.4Ghz (4.9Ghz boost), with
-with 32GiB memory at 3600Mhz, running	Ubuntu 20.04 with glibc 2.31 and GCC 9.3.0.
+32GiB memory at 3600Mhz, running Ubuntu 20.04 with glibc 2.31 and GCC 9.3.0.
 
 We measure three versions of _mimalloc_: the main version `mi` (tag:v1.7.0),
 the new v2.0 beta version as `xmi` (tag:v2.0.0), and the main version in secure mode as `smi` (tag:v1.7.0).
@@ -532,7 +738,7 @@ The _alloc-test_, by
 [OLogN Technologies AG](http://ithare.com/testing-memory-allocators-ptmalloc2-tcmalloc-hoard-jemalloc-while-trying-to-simulate-real-world-loads/), is a very allocation intensive benchmark doing millions of
 allocations in various size classes. The test is scaled such that when an
 allocator performs almost identically on _alloc-test1_ as _alloc-testN_ it
-means that it scales linearly. 
+means that it scales linearly.
 
 The _sh6bench_ and _sh8bench_ benchmarks are
 developed by [MicroQuill](http://www.microquill.com/) as part of SmartHeap.
@@ -687,6 +893,60 @@ provided by the bot. You will only need to do this once across all repos using o
 
 # Older Release Notes
 
+* 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches
+  from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches.
+* 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation.
+* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds.
+  Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size
+  directly available (and new `block_size_shift` to improve aligned block free-ing).
+  New approach to collection of abandoned segments: When
+  a thread terminates the segments it owns are abandoned (containing still live objects) and these can be
+  reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arenas
+  which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in
+  an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim`
+  gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%).
+
+* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity
+  by removing regions and segment caches and only use arenas with improved memory purging -- this may improve memory
+  usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking.
+
+* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms.
+
+* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support for dynamic overriding on Windows 11. Improved tracing precision
+  with [asan](#asan) and [Valgrind](#valgrind), and added Windows event tracing [ETW](#ETW) (contributed by Xinglong He). Created an OS
+  abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes.
+
+* 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support.
+  Support arbitrary large alignments (in particular for `std::pmr` pools).
+  Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev).
+  Heap walks now visit all objects (including huge objects). Support Windows nano server containers (by Johannes Schindelin, @dscho).
+  Various small bug fixes.
+
+* 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow
+  detection. Initial
+  support for attaching heaps to a specific memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`.
+
+* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation
+  even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix
+  warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object
+  allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes.
+
+* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on
+  Windows 11, fix compilation with musl, potentially reduced
+  committed memory, add `bin/minject` for Windows,
+  improved wasm support, faster aligned allocation,
+  various small fixes.
+
+* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including
+  M1), improved performance for v2 for large objects, Python integration improvements, more standard
+  installation directories, various small fixes.
+* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix
+  thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes.
+* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental).
+* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages.
+* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics,
+  improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes.
+
 * 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved
   handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call.
 * 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations,
@@ -708,9 +968,9 @@ provided by the bot. You will only need to do this once across all repos using o
 more eager concurrent free, addition of STL allocator, fixed potential memory leak.
 * 2020-01-15, `v1.3.0`: stable release 1.3: bug fixes, improved randomness and [stronger
 free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) in secure mode.
+
 * 2019-12-22, `v1.2.2`: stable release 1.2: minor updates.
 * 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows.
 * 2019-10-07, `v1.1.0`: stable release 1.1.
 * 2019-09-01, `v1.0.8`: pre-release 8: more robust windows dynamic overriding, initial huge page support.
 * 2019-08-10, `v1.0.6`: pre-release 6: various performance improvements.
-
diff --git a/ext/src/mimalloc/src/alloc-aligned.c b/ext/src/mimalloc/src/alloc-aligned.c
index fce0fd7498..fe8ae1725d 100644
--- a/ext/src/mimalloc/src/alloc-aligned.c
+++ b/ext/src/mimalloc/src/alloc-aligned.c
@@ -1,167 +1,324 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"  // _mi_theap_default
 
-#include <string.h>  // memset
+#include <string.h>     // memset
 
 // ------------------------------------------------------
 // Aligned Allocation
 // ------------------------------------------------------
 
-// Fallback primitive aligned allocation -- split out for better codegen
-static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
-{
-  mi_assert_internal(size <= PTRDIFF_MAX);
-  mi_assert_internal(alignment!=0 && _mi_is_power_of_two(alignment) && alignment <= MI_ALIGNMENT_MAX);
+static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) {
+  // certain blocks are always allocated at a certain natural alignment.
+  // (see also `arena.c:mi_arenas_page_alloc_fresh`).
+  mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0));
+  if (alignment > size) return false;
+  const size_t bsize = mi_good_size(size);
+  const bool ok = (bsize <= MI_PAGE_MAX_START_BLOCK_ALIGN2 && _mi_is_power_of_two(bsize)) ||             // power-of-two under N
+                  (alignment==MI_PAGE_OSPAGE_BLOCK_ALIGN2 && (bsize % MI_PAGE_OSPAGE_BLOCK_ALIGN2)==0);  // or multiple of N
+  if (ok) { mi_assert_internal((bsize & (alignment-1)) == 0); } // since both power of 2 and alignment <= size
+  return ok;
+}
 
-  const uintptr_t align_mask = alignment-1;  // for any x, `(x & align_mask) == (x % alignment)`
-  const size_t padsize = size + MI_PADDING_SIZE;
+#if MI_GUARDED
+static mi_decl_restrict void* mi_theap_malloc_guarded_aligned(mi_theap_t* theap, size_t size, size_t alignment, bool zero) mi_attr_noexcept {
+  // use over-allocation for guarded blocks
+  #if MI_THEAP_INITASNULL
+  if mi_unlikely(theap==NULL) { theap = _mi_theap_empty_get(); }
+  #endif
+  mi_assert_internal(alignment > 0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN);
+  const size_t oversize = size + alignment - 1;
+  void* base = _mi_theap_malloc_guarded(theap, oversize, zero);
+  void* p = _mi_align_up_ptr(base, alignment);
+  mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size);
+  mi_assert_internal(mi_usable_size(p) >= size);
+  mi_assert_internal(_mi_is_aligned(p, alignment));
+  return p;
+}
 
-  // use regular allocation if it is guaranteed to fit the alignment constraints
-  if (offset==0 && alignment<=padsize && padsize<=MI_MAX_ALIGN_GUARANTEE && (padsize&align_mask)==0) {
-    void* p = _mi_heap_malloc_zero(heap, size, zero);
-    mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0);
-    return p;
-  }
+static void* mi_theap_malloc_zero_no_guarded(mi_theap_t* theap, size_t size, bool zero, size_t* usable) {
+  #if MI_THEAP_INITASNULL
+  if mi_unlikely(theap==NULL) { theap = _mi_theap_empty_get(); }
+  #endif
+  const size_t rate = theap->guarded_sample_rate;
+  // only write if `rate!=0` so we don't write to the constant `_mi_theap_empty`
+  if (rate != 0) { theap->guarded_sample_rate = 0; }
+  void* p = _mi_theap_malloc_zero(theap, size, zero, usable);
+  if (rate != 0) { theap->guarded_sample_rate = rate; }
+  return p;
+}
+#else
+static void* mi_theap_malloc_zero_no_guarded(mi_theap_t* theap, size_t size, bool zero, size_t* usable) {
+  return _mi_theap_malloc_zero(theap, size, zero, usable);
+}
+#endif
 
-  // otherwise over-allocate
-  void* p = _mi_heap_malloc_zero(heap, size + alignment - 1, zero);
-  if (p == NULL) return NULL;
+// Fallback aligned allocation that over-allocates -- split out for better codegen
+static mi_decl_noinline void* mi_theap_malloc_zero_aligned_at_overalloc(mi_theap_t* const theap, const size_t size, const size_t alignment, const size_t offset, const bool zero, size_t* usable) mi_attr_noexcept
+{
+  mi_assert_internal(size <= (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE));
+  mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment));
+
+  void* p;
+  size_t oversize;
+  if mi_unlikely(alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
+    // use OS allocation for large alignments and allocate inside a singleton page (not in an arena)
+    // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned
+    // in the first (and single) page such that the page info is `MI_PAGE_ALIGN` bytes before it (and can be found in the _mi_page_map).
+    if mi_unlikely(offset != 0) {
+      // todo: cannot support offset alignment for very large alignments yet
+      #if MI_DEBUG > 0
+      _mi_error_message(EOVERFLOW, "aligned allocation with a large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
+      #endif
+      return NULL;
+    }
+    oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
+    // note: no guarded as alignment > 0
+    p = _mi_theap_malloc_zero_ex(theap, oversize, zero, alignment, usable); // the page block size should be large enough to align in the single huge page block
+    if (p == NULL) return NULL;
+  }
+  else {
+    // otherwise over-allocate
+    oversize = (size < MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : size) + alignment - 1;  // adjust for size <= 16; with size 0 and aligment 64k, we would allocate a 64k block and pointing just beyond that.
+    p = mi_theap_malloc_zero_no_guarded(theap, oversize, zero, usable);
+    if (p == NULL) return NULL;
+  }
 
   // .. and align within the allocation
-  uintptr_t adjust = alignment - (((uintptr_t)p + offset) & align_mask);
-  mi_assert_internal(adjust <= alignment);
-  void* aligned_p = (adjust == alignment ? p : (void*)((uintptr_t)p + adjust));
-  if (aligned_p != p) mi_page_set_has_aligned(_mi_ptr_page(p), true);
+  const uintptr_t align_mask = alignment - 1;  // for any x, `(x & align_mask) == (x % alignment)`
+  const uintptr_t poffset = ((uintptr_t)p + offset) & align_mask;
+  const uintptr_t adjust  = (poffset == 0 ? 0 : alignment - poffset);
+  mi_assert_internal(adjust < alignment);
+  void* aligned_p = (void*)((uintptr_t)p + adjust);
+
+  // note: after the above allocation, the page may be abandoned now (as it became full, see `page.c:_mi_malloc_generic`)
+  // and we no longer own it. We should be careful to only read constant fields in the page,
+  // or use safe atomic access as in `mi_page_set_has_interior_pointers`.
+  // (we can access the page though since the just allocated pointer keeps it alive)
+  mi_page_t* page = _mi_ptr_page(p);
+  if (aligned_p != p) {
+    mi_page_set_has_interior_pointers(page, true);
+    #if MI_GUARDED
+    // set tag to aligned so mi_usable_size works with guard pages
+    if (adjust >= sizeof(mi_block_t)) {
+      mi_block_t* const block = (mi_block_t*)p;
+      block->next = MI_BLOCK_TAG_ALIGNED;
+    }
+    #endif
+    _mi_padding_shrink(page, (mi_block_t*)p, adjust + size);
+  }
+  // todo: expand padding if overallocated ?
+
+  mi_assert_internal(mi_page_usable_block_size(page) >= adjust + size);
   mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
-  mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p));
+  mi_assert_internal(mi_usable_size(aligned_p)>=size);
+  mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust);
+  #if MI_DEBUG > 1
+  mi_page_t* const apage = _mi_ptr_page(aligned_p);
+  void* unalign_p = _mi_page_ptr_unalign(apage, aligned_p);
+  mi_assert_internal(p == unalign_p);
+  #endif
+
+  // now zero the block if needed
+  //if (alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
+  //  // for the tracker, on huge aligned allocations only from the start of the large block is defined
+  //  mi_track_mem_undefined(aligned_p, size);
+  //  if (zero) {
+  //    _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
+  //  }
+  //}
+
+  if (p != aligned_p) {
+    mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p));
+    #if MI_GUARDED
+    mi_track_mem_defined(p, sizeof(mi_block_t));
+    #endif
+  }
   return aligned_p;
 }
 
-// Primitive aligned allocation
-static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
+// Generic primitive aligned allocation -- split out for better codegen
+static mi_decl_noinline void* mi_theap_malloc_zero_aligned_at_generic(mi_theap_t* const theap, const size_t size, const size_t alignment, const size_t offset, const bool zero, size_t* usable) mi_attr_noexcept
 {
-  // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size.
-  mi_assert(alignment > 0);
-  if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) { // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
+  mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment));
+  // we don't allocate more than MI_MAX_ALLOC_SIZE (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
+  if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) {
     #if MI_DEBUG > 0
-    _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment);
+    _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment);
     #endif
     return NULL;
   }
-  if (mi_unlikely(alignment > MI_ALIGNMENT_MAX)) {  // we cannot align at a boundary larger than this (or otherwise we cannot find segment headers)
-    #if MI_DEBUG > 0
-    _mi_error_message(EOVERFLOW, "aligned allocation has a maximum alignment of %zu (size %zu, alignment %zu)\n", MI_ALIGNMENT_MAX, size, alignment);
-    #endif
-    return NULL;
+
+  // use regular allocation if it is guaranteed to fit the alignment constraints.
+  // this is important to try as the fast path in `mi_theap_malloc_zero_aligned` only works when there exists
+  // a page with the right block size, and if we always use the over-alloc fallback that would never happen.
+  if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) {
+    void* p = mi_theap_malloc_zero_no_guarded(theap, size, zero, usable);
+    mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0);
+    const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0;
+    if mi_likely(is_aligned_or_null) {
+      return p;
+    }
+    else {
+      // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct..
+      mi_assert(false);
+      mi_free(p);
+    }
   }
-  if (mi_unlikely(size > PTRDIFF_MAX)) {          // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)                                                    
+
+  // fall back to over-allocation
+  return mi_theap_malloc_zero_aligned_at_overalloc(theap,size,alignment,offset,zero,usable);
+}
+
+
+// Primitive aligned allocation
+static inline void* mi_theap_malloc_zero_aligned_at(mi_theap_t* const theap, const size_t size, const size_t alignment, const size_t offset, const bool zero, size_t* usable) mi_attr_noexcept
+{
+  // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size.
+  if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
     #if MI_DEBUG > 0
-    _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment);
+    _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment);
     #endif
     return NULL;
   }
-  const uintptr_t align_mask = alignment-1;       // for any x, `(x & align_mask) == (x % alignment)`
-  const size_t padsize = size + MI_PADDING_SIZE;  // note: cannot overflow due to earlier size > PTRDIFF_MAX check
+
+  #if MI_GUARDED
+  #if MI_THEAP_INITASNULL
+  if mi_likely(theap!=NULL)
+  #endif
+  if (offset==0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN && mi_theap_malloc_use_guarded(theap,size)) {
+    return mi_theap_malloc_guarded_aligned(theap, size, alignment, zero);
+  }
+  #endif
 
   // try first if there happens to be a small block available with just the right alignment
-  if (mi_likely(padsize <= MI_SMALL_SIZE_MAX)) {
-    mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize);
-    const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0;
-    if (mi_likely(page->free != NULL && is_aligned))
-    {
-      #if MI_STAT>1
-      mi_heap_stat_increase(heap, malloc, size);
-      #endif
-      void* p = _mi_page_malloc(heap, page, padsize); // TODO: inline _mi_page_malloc
-      mi_assert_internal(p != NULL);
-      mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
-      if (zero) { _mi_block_zero_init(page, p, size); }
-      return p;
+  // since most small power-of-2 blocks (under MI_PAGE_MAX_START_BLOCK_ALIGN2) are already
+  // naturally aligned this can be often the case.
+  #if MI_THEAP_INITASNULL
+  if mi_likely(theap!=NULL)
+  #endif
+  {
+    if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) {
+      const uintptr_t align_mask = alignment-1;       // for any x, `(x & align_mask) == (x % alignment)`
+      const size_t padsize = size + MI_PADDING_SIZE;
+      mi_page_t* page = _mi_theap_get_free_small_page(theap, padsize);
+      if mi_likely(page->free != NULL) {
+        const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0;
+        if mi_likely(is_aligned)
+        {
+          if (usable!=NULL) { *usable = mi_page_usable_block_size(page); }
+          void* p = _mi_page_malloc_zero(theap, page, padsize, zero);
+          mi_assert_internal(p != NULL);
+          mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
+          mi_track_malloc(p, size, zero);
+          return p;
+        }
+      }
     }
   }
-  // fallback
-  return mi_heap_malloc_zero_aligned_at_fallback(heap, size, alignment, offset, zero);
+
+  // fallback to generic aligned allocation
+  return mi_theap_malloc_zero_aligned_at_generic(theap, size, alignment, offset, zero, usable);
 }
 
 
 // ------------------------------------------------------
-// Optimized mi_heap_malloc_aligned / mi_malloc_aligned
+// Internal mi_theap_malloc_aligned / mi_malloc_aligned
 // ------------------------------------------------------
 
-mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false);
+static mi_decl_restrict void* mi_theap_malloc_aligned_at(mi_theap_t* theap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_malloc_zero_aligned_at(theap, size, alignment, offset, false, NULL);
 }
 
-mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
-  #if !MI_PADDING
-  // without padding, any small sized allocation is naturally aligned (see also `_mi_segment_page_start`)
-  if (!_mi_is_power_of_two(alignment)) return NULL;
-  if (mi_likely(_mi_is_power_of_two(size) && size >= alignment && size <= MI_SMALL_SIZE_MAX))
-  #else
-  // with padding, we can only guarantee this for fixed alignments
-  if (mi_likely((alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2)))
-                && size <= MI_SMALL_SIZE_MAX))
-  #endif
-  {
-    // fast path for common alignment and size
-    return mi_heap_malloc_small(heap, size);
-  }
-  else {
-    return mi_heap_malloc_aligned_at(heap, size, alignment, 0);
-  }
+mi_decl_nodiscard mi_decl_restrict void* mi_theap_malloc_aligned(mi_theap_t* theap, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_malloc_aligned_at(theap, size, alignment, 0);
+}
+
+static mi_decl_restrict void* mi_theap_zalloc_aligned_at(mi_theap_t* theap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_malloc_zero_aligned_at(theap, size, alignment, offset, true, NULL);
 }
 
+static mi_decl_restrict void* mi_theap_zalloc_aligned(mi_theap_t* theap, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_zalloc_aligned_at(theap, size, alignment, 0);
+}
+
+static mi_decl_restrict void* mi_theap_calloc_aligned_at(mi_theap_t* theap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_theap_zalloc_aligned_at(theap, total, alignment, offset);
+}
+
+static mi_decl_restrict void* mi_theap_calloc_aligned(mi_theap_t* theap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_calloc_aligned_at(theap, count, size, alignment, 0);
+}
+
+
 // ------------------------------------------------------
 // Aligned Allocation
 // ------------------------------------------------------
 
-mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true);
+mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_malloc_aligned_at(_mi_theap_default(), size, alignment, offset);
 }
 
-mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_zalloc_aligned_at(heap, size, alignment, 0);
+mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_malloc_aligned(_mi_theap_default(), size, alignment);
 }
 
-mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  size_t total;
-  if (mi_count_size_overflow(count, size, &total)) return NULL;
-  return mi_heap_zalloc_aligned_at(heap, total, alignment, offset);
+mi_decl_nodiscard mi_decl_restrict void* mi_umalloc_aligned(size_t size, size_t alignment, size_t* block_size) mi_attr_noexcept {
+  return mi_theap_malloc_zero_aligned_at(_mi_theap_default(), size, alignment, 0, false, block_size);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_zalloc_aligned_at(_mi_theap_default(), size, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_zalloc_aligned(_mi_theap_default(), size, alignment);
 }
 
-mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_calloc_aligned_at(heap,count,size,alignment,0);
+mi_decl_nodiscard mi_decl_restrict void* mi_uzalloc_aligned(size_t size, size_t alignment, size_t* block_size) mi_attr_noexcept {
+  return mi_theap_malloc_zero_aligned_at(_mi_theap_default(), size, alignment, 0, true, block_size);
 }
 
-mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_calloc_aligned_at(_mi_theap_default(), count, size, alignment, offset);
 }
 
-mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment);
+mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_calloc_aligned(_mi_theap_default(), count, size, alignment);
 }
 
-mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_malloc_aligned_at(_mi_heap_theap(heap), size, alignment, offset);
 }
 
-mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment);
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_malloc_aligned(_mi_heap_theap(heap), size, alignment);
 }
 
-mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset);
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_zalloc_aligned_at(_mi_heap_theap(heap), size, alignment, offset);
 }
 
-mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment);
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_zalloc_aligned(_mi_heap_theap(heap), size, alignment);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_calloc_aligned_at(_mi_heap_theap(heap), count, size, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_calloc_aligned(_mi_heap_theap(heap), count, size, alignment);
 }
 
 
@@ -169,29 +326,23 @@ mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t align
 // Aligned re-allocation
 // ------------------------------------------------------
 
-static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
+static void* mi_theap_realloc_zero_aligned_at(mi_theap_t* theap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
   mi_assert(alignment > 0);
-  if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
-  if (p == NULL) return mi_heap_malloc_zero_aligned_at(heap,newsize,alignment,offset,zero);
+  if (alignment <= sizeof(uintptr_t)) return _mi_theap_realloc_zero(theap,p,newsize,zero,NULL,NULL);
+  if (p == NULL) return mi_theap_malloc_zero_aligned_at(theap,newsize,alignment,offset,zero,NULL);
   size_t size = mi_usable_size(p);
   if (newsize <= size && newsize >= (size - (size / 2))
       && (((uintptr_t)p + offset) % alignment) == 0) {
     return p;  // reallocation still fits, is aligned and not more than 50% waste
   }
   else {
-    void* newp = mi_heap_malloc_aligned_at(heap,newsize,alignment,offset);
+    // note: we don't zero allocate upfront so we only zero initialize the expanded part
+    void* newp = mi_theap_malloc_aligned_at(theap,newsize,alignment,offset);
     if (newp != NULL) {
       if (zero && newsize > size) {
-        const mi_page_t* page = _mi_ptr_page(newp);
-        if (page->is_zero) {
-          // already zero initialized
-          mi_assert_expensive(mi_mem_is_zero(newp,newsize));
-        }
-        else {
-          // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
-          size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
-          memset((uint8_t*)newp + start, 0, newsize - start);
-        }
+        // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
+        size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
+        _mi_memzero((uint8_t*)newp + start, newsize - start);
       }
       _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
       mi_free(p); // only free if successful
@@ -200,62 +351,89 @@ static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t ne
   }
 }
 
-static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept {
+static void* mi_theap_realloc_zero_aligned(mi_theap_t* theap, void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept {
   mi_assert(alignment > 0);
-  if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
+  if (alignment <= sizeof(uintptr_t)) return _mi_theap_realloc_zero(theap,p,newsize,zero,NULL,NULL);
   size_t offset = ((uintptr_t)p % alignment); // use offset of previous allocation (p can be NULL)
-  return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero);
+  return mi_theap_realloc_zero_aligned_at(theap,p,newsize,alignment,offset,zero);
 }
 
-void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false);
+static void* mi_theap_realloc_aligned_at(mi_theap_t* theap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_realloc_zero_aligned_at(theap,p,newsize,alignment,offset,false);
 }
 
-void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
-  return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false);
+static void* mi_theap_realloc_aligned(mi_theap_t* theap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_theap_realloc_zero_aligned(theap,p,newsize,alignment,false);
 }
 
-void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true);
+static void* mi_theap_rezalloc_aligned_at(mi_theap_t* theap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_realloc_zero_aligned_at(theap, p, newsize, alignment, offset, true);
 }
 
-void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
-  return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true);
+static void* mi_theap_rezalloc_aligned(mi_theap_t* theap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_theap_realloc_zero_aligned(theap, p, newsize, alignment, true);
 }
 
-void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+static void* mi_theap_recalloc_aligned_at(mi_theap_t* theap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
   size_t total;
   if (mi_count_size_overflow(newcount, size, &total)) return NULL;
-  return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset);
+  return mi_theap_rezalloc_aligned_at(theap, p, total, alignment, offset);
 }
 
-void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+static void* mi_theap_recalloc_aligned(mi_theap_t* theap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
   size_t total;
   if (mi_count_size_overflow(newcount, size, &total)) return NULL;
-  return mi_heap_rezalloc_aligned(heap, p, total, alignment);
+  return mi_theap_rezalloc_aligned(theap, p, total, alignment);
 }
 
-void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+
+mi_decl_nodiscard void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_realloc_aligned_at(_mi_theap_default(), p, newsize, alignment, offset);
 }
 
-void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
-  return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+mi_decl_nodiscard void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_theap_realloc_aligned(_mi_theap_default(), p, newsize, alignment);
 }
 
-void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+mi_decl_nodiscard void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_rezalloc_aligned_at(_mi_theap_default(), p, newsize, alignment, offset);
 }
 
-void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
-  return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+mi_decl_nodiscard void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_theap_rezalloc_aligned(_mi_theap_default(), p, newsize, alignment);
 }
 
-void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset);
+mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_recalloc_aligned_at(_mi_theap_default(), p, newcount, size, alignment, offset);
 }
 
-void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment);
+mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_recalloc_aligned(_mi_theap_default(), p, newcount, size, alignment);
 }
 
+
+mi_decl_nodiscard void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_realloc_aligned_at(_mi_heap_theap(heap), p, newsize, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_theap_realloc_aligned(_mi_heap_theap(heap), p, newsize, alignment);
+}
+
+mi_decl_nodiscard void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_rezalloc_aligned_at(_mi_heap_theap(heap), p, newsize, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_theap_rezalloc_aligned(_mi_heap_theap(heap), p, newsize, alignment);
+}
+
+mi_decl_nodiscard void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_theap_recalloc_aligned_at(_mi_heap_theap(heap), p, newcount, size, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_theap_recalloc_aligned(_mi_heap_theap(heap), p, newcount, size, alignment);
+}
+
+
diff --git a/ext/src/mimalloc/src/alloc-override.c b/ext/src/mimalloc/src/alloc-override.c
index e29cb4b231..882e0607a8 100644
--- a/ext/src/mimalloc/src/alloc-override.c
+++ b/ext/src/mimalloc/src/alloc-override.c
@@ -13,7 +13,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)"
 #endif
 
-#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) 
+#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32))
 
 #if defined(__APPLE__)
 #include <AvailabilityMacros.h>
@@ -23,13 +23,13 @@ mi_decl_externc size_t malloc_good_size(size_t size);
 #endif
 
 // helper definition for C override of C++ new
-typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
+typedef void* mi_nothrow_t;
 
 // ------------------------------------------------------
 // Override system malloc
 // ------------------------------------------------------
 
-#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) && !defined(MI_VALGRIND)
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) && !MI_TRACK_ENABLED
   // gcc, clang: use aliasing to alias the exported function to one of our `mi_` functions
   #if (defined(__GNUC__) && __GNUC__ >= 9)
     #pragma GCC diagnostic ignored "-Wattributes"  // or we get warnings that nodiscard is ignored on a forward
@@ -43,7 +43,7 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
   #define MI_FORWARD0(fun,x)      MI_FORWARD(fun)
   #define MI_FORWARD02(fun,x,y)   MI_FORWARD(fun)
 #else
-  // otherwise use forwarding by calling our `mi_` function 
+  // otherwise use forwarding by calling our `mi_` function
   #define MI_FORWARD1(fun,x)      { return fun(x); }
   #define MI_FORWARD2(fun,x,y)    { return fun(x,y); }
   #define MI_FORWARD3(fun,x,y,z)  { return fun(x,y,z); }
@@ -51,11 +51,17 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
   #define MI_FORWARD02(fun,x,y)   { fun(x,y); }
 #endif
 
-#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE)    
-  // define MI_OSX_IS_INTERPOSED as we should not provide forwarding definitions for 
+
+#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE)
+  // define MI_OSX_IS_INTERPOSED as we should not provide forwarding definitions for
   // functions that are interposed (or the interposing does not work)
   #define MI_OSX_IS_INTERPOSED
 
+  mi_decl_externc size_t mi_malloc_size_checked(void *p) {
+    if (!mi_is_in_heap_region(p)) return 0;
+    return mi_usable_size(p);
+  }
+
   // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
   // See: <https://books.google.com/books?id=K8vUkpOXhN4C&pg=PA73>
   struct mi_interpose_s {
@@ -64,23 +70,21 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
   };
   #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
   #define MI_INTERPOSE_MI(fun)            MI_INTERPOSE_FUN(fun,mi_##fun)
-  
-  __attribute__((used)) static struct mi_interpose_s _mi_interposes[]  __attribute__((section("__DATA, __interpose"))) =
+
+  #define MI_INTERPOSE_DECLS(name)        __attribute__((used)) static struct mi_interpose_s name[]  __attribute__((section("__DATA, __interpose")))
+
+  MI_INTERPOSE_DECLS(_mi_interposes) =
   {
     MI_INTERPOSE_MI(malloc),
     MI_INTERPOSE_MI(calloc),
     MI_INTERPOSE_MI(realloc),
     MI_INTERPOSE_MI(strdup),
-    MI_INTERPOSE_MI(strndup),
     MI_INTERPOSE_MI(realpath),
     MI_INTERPOSE_MI(posix_memalign),
     MI_INTERPOSE_MI(reallocf),
     MI_INTERPOSE_MI(valloc),
-    MI_INTERPOSE_MI(malloc_size),
+    MI_INTERPOSE_FUN(malloc_size,mi_malloc_size_checked),
     MI_INTERPOSE_MI(malloc_good_size),
-    #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 
-    MI_INTERPOSE_MI(aligned_alloc),
-    #endif
     #ifdef MI_OSX_ZONE
     // we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely
     MI_INTERPOSE_MI(free),
@@ -91,6 +95,12 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
     MI_INTERPOSE_FUN(vfree,mi_cfree),
     #endif
   };
+  MI_INTERPOSE_DECLS(_mi_interposes_10_7) __OSX_AVAILABLE(10.7) = {
+    MI_INTERPOSE_MI(strndup),
+  };
+  MI_INTERPOSE_DECLS(_mi_interposes_10_15) __OSX_AVAILABLE(10.15) = {
+    MI_INTERPOSE_MI(aligned_alloc),
+  };
 
   #ifdef __cplusplus
   extern "C" {
@@ -122,11 +132,19 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
   // cannot override malloc unless using a dll.
   // we just override new/delete which does work in a static library.
 #else
-  // On all other systems forward to our API  
-  void* malloc(size_t size)              MI_FORWARD1(mi_malloc, size)
-  void* calloc(size_t size, size_t n)    MI_FORWARD2(mi_calloc, size, n)
-  void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize)
-  void  free(void* p)                    MI_FORWARD0(mi_free, p)
+  // On all other systems forward allocation primitives to our API
+  mi_decl_export void* malloc(size_t size)              MI_FORWARD1(mi_malloc, size)
+  mi_decl_export void* calloc(size_t size, size_t n)    MI_FORWARD2(mi_calloc, size, n)
+  mi_decl_export void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize)
+  mi_decl_export void  free(void* p)                    MI_FORWARD0(mi_free, p)
+  // In principle we do not need to forward `strdup`/`strndup` but on some systems these do not use `malloc` internally (but a more primitive call)
+  // We only override if `strdup` is not a macro (as on some older libc's, see issue #885)
+  #if !defined(strdup)
+  mi_decl_export char* strdup(const char* str)             MI_FORWARD1(mi_strdup, str)
+  #endif
+  #if !defined(strndup) && (!defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7))
+  mi_decl_export char* strndup(const char* str, size_t n)  MI_FORWARD2(mi_strndup, str, n)
+  #endif
 #endif
 
 #if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__)
@@ -168,34 +186,40 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
   void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
   void operator delete  (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
   void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
-  
+
   void* operator new( std::size_t n, std::align_val_t al)   noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
   void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
   void* operator new  (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
   void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
   #endif
 
-#elif (defined(__GNUC__) || defined(__clang__)) 
+#elif (defined(__GNUC__) || defined(__clang__))
   // ------------------------------------------------------
   // Override by defining the mangled C++ names of the operators (as
   // used by GCC and CLang).
   // See <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling>
   // ------------------------------------------------------
-  
+
   void _ZdlPv(void* p)            MI_FORWARD0(mi_free,p) // delete
   void _ZdaPv(void* p)            MI_FORWARD0(mi_free,p) // delete[]
   void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n)
   void _ZdaPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n)
+
   void _ZdlPvSt11align_val_t(void* p, size_t al)            { mi_free_aligned(p,al); }
   void _ZdaPvSt11align_val_t(void* p, size_t al)            { mi_free_aligned(p,al); }
   void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }
   void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }
-  
+
+  void _ZdlPvRKSt9nothrow_t(void* p, mi_nothrow_t tag)      { MI_UNUSED(tag); mi_free(p); }  // operator delete(void*, std::nothrow_t const&)
+  void _ZdaPvRKSt9nothrow_t(void* p, mi_nothrow_t tag)      { MI_UNUSED(tag); mi_free(p); }  // operator delete[](void*, std::nothrow_t const&)
+  void _ZdlPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete(void*, std::align_val_t, std::nothrow_t const&)
+  void _ZdaPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete[](void*, std::align_val_t, std::nothrow_t const&)
+
   #if (MI_INTPTR_SIZE==8)
     void* _Znwm(size_t n)                             MI_FORWARD1(mi_new,n)  // new 64-bit
     void* _Znam(size_t n)                             MI_FORWARD1(mi_new,n)  // new[] 64-bit
     void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
-    void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }     
+    void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
     void* _ZnwmSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
     void* _ZnamSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
     void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
@@ -204,7 +228,7 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
     void* _Znwj(size_t n)                             MI_FORWARD1(mi_new,n)  // new 64-bit
     void* _Znaj(size_t n)                             MI_FORWARD1(mi_new,n)  // new[] 64-bit
     void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
-    void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }   
+    void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
     void* _ZnwjSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
     void* _ZnajSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
     void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
@@ -226,7 +250,7 @@ extern "C" {
   // Forward Posix/Unix calls as well
   void*  reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize)
   size_t malloc_size(const void* p)        MI_FORWARD1(mi_usable_size,p)
-  #if !defined(__ANDROID__) && !defined(__FreeBSD__)
+  #if !defined(__ANDROID__) && !defined(__FreeBSD__) && !defined(__DragonFly__)
   size_t malloc_usable_size(void *p)       MI_FORWARD1(mi_usable_size,p)
   #else
   size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p)
@@ -234,30 +258,41 @@ extern "C" {
 
   // No forwarding here due to aliasing/name mangling issues
   void*  valloc(size_t size)               { return mi_valloc(size); }
-  void   vfree(void* p)                    { mi_free(p); }                
+  void   vfree(void* p)                    { mi_free(p); }
   size_t malloc_good_size(size_t size)     { return mi_malloc_good_size(size); }
   int    posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); }
-  
+
   // `aligned_alloc` is only available when __USE_ISOC11 is defined.
+  // Note: it seems __USE_ISOC11 is not defined in musl (and perhaps other libc's) so we only check
+  // for it if using glibc.
   // Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot
   // override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9.
   // Fortunately, in the case where `aligned_alloc` is declared as `static inline` it
   // uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we  can avoid overriding it ourselves.
-  #if __USE_ISOC11 
+  #if !defined(__GLIBC__) || __USE_ISOC11
   void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
   #endif
 #endif
 
 // no forwarding here due to aliasing/name mangling issues
-void  cfree(void* p)                                    { mi_free(p); } 
+void  cfree(void* p)                                    { mi_free(p); }
 void* pvalloc(size_t size)                              { return mi_pvalloc(size); }
-void* reallocarray(void* p, size_t count, size_t size)  { return mi_reallocarray(p, count, size); }
-int   reallocarr(void* p, size_t count, size_t size)    { return mi_reallocarr(p, count, size); }
 void* memalign(size_t alignment, size_t size)           { return mi_memalign(alignment, size); }
 void* _aligned_malloc(size_t alignment, size_t size)    { return mi_aligned_alloc(alignment, size); }
+void* reallocarray(void* p, size_t count, size_t size)  { return mi_reallocarray(p, count, size); }
+// some systems define reallocarr so mark it as a weak symbol (#751)
+mi_decl_weak int reallocarr(void* p, size_t count, size_t size)    { return mi_reallocarr(p, count, size); }
+
+#if defined(__wasi__)
+  // forward __libc interface (see PR #667)
+  void* __libc_malloc(size_t size)                      MI_FORWARD1(mi_malloc, size)
+  void* __libc_calloc(size_t count, size_t size)        MI_FORWARD2(mi_calloc, count, size)
+  void* __libc_realloc(void* p, size_t size)            MI_FORWARD2(mi_realloc, p, size)
+  void  __libc_free(void* p)                            MI_FORWARD0(mi_free, p)
+  void* __libc_memalign(size_t alignment, size_t size)  { return mi_memalign(alignment, size); }
 
-#if defined(__GLIBC__) && defined(__linux__)
-  // forward __libc interface (needed for glibc-based Linux distributions)
+#elif defined(__linux__)
+  // forward __libc interface (needed for glibc-based and musl-based Linux distributions)
   void* __libc_malloc(size_t size)                      MI_FORWARD1(mi_malloc,size)
   void* __libc_calloc(size_t count, size_t size)        MI_FORWARD2(mi_calloc,count,size)
   void* __libc_realloc(void* p, size_t size)            MI_FORWARD2(mi_realloc,p,size)
diff --git a/ext/src/mimalloc/src/alloc-posix.c b/ext/src/mimalloc/src/alloc-posix.c
index 176e7ec307..225752fd87 100644
--- a/ext/src/mimalloc/src/alloc-posix.c
+++ b/ext/src/mimalloc/src/alloc-posix.c
@@ -10,7 +10,7 @@ terms of the MIT license. A copy of the license can be found in the file
 // for convenience and used when overriding these functions.
 // ------------------------------------------------------------------------
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
 
 // ------------------------------------------------------
 // Posix & Unix functions definitions
@@ -33,12 +33,12 @@ terms of the MIT license. A copy of the license can be found in the file
 
 
 mi_decl_nodiscard size_t mi_malloc_size(const void* p) mi_attr_noexcept {
-  //if (!mi_is_in_heap_region(p)) return 0;
+  // if (!mi_is_in_heap_region(p)) return 0;
   return mi_usable_size(p);
 }
 
 mi_decl_nodiscard size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept {
-  //if (!mi_is_in_heap_region(p)) return 0;
+  // if (!mi_is_in_heap_region(p)) return 0;
   return mi_usable_size(p);
 }
 
@@ -56,7 +56,8 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept
   // Note: The spec dictates we should not modify `*p` on an error. (issue#27)
   // <http://man7.org/linux/man-pages/man3/posix_memalign.3.html>
   if (p == NULL) return EINVAL;
-  if (alignment % sizeof(void*) != 0) return EINVAL;                   // natural alignment
+  if ((alignment % sizeof(void*)) != 0) return EINVAL;                 // natural alignment
+  // it is also required that alignment is a power of 2 and > 0; `mi_malloc_aligned` checks this too, but we test it here so we can return EINVAL
   if (alignment==0 || !_mi_is_power_of_two(alignment)) return EINVAL;  // not a power of 2
   void* q = mi_malloc_aligned(size, alignment);
   if (q==NULL && size != 0) return ENOMEM;
@@ -83,13 +84,16 @@ mi_decl_nodiscard mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcep
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept {
-  if (mi_unlikely((size&(alignment-1)) != 0)) { // C11 requires alignment>0 && integral multiple, see <https://en.cppreference.com/w/c/memory/aligned_alloc>
-    #if MI_DEBUG > 0
-    _mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment);
-    #endif
-    return NULL;
-  }
-  // C11 also requires alignment to be a power-of-two which is checked in mi_malloc_aligned
+  // C11 requires the size to be an integral multiple of the alignment, see <https://en.cppreference.com/w/c/memory/aligned_alloc>.
+  // Unfortunately, quite a few programs pass a size that is not an integral multiple of the alignment, so we skip this check.
+  /* if mi_unlikely((size & (alignment - 1)) != 0) { // C11 requires alignment>0 && integral multiple, see <https://en.cppreference.com/w/c/memory/aligned_alloc>
+      #if MI_DEBUG > 0
+      _mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment);
+      #endif
+      return NULL;
+    }
+  */
+  // C11 also requires alignment to be a power-of-two (and > 0) which is checked in mi_malloc_aligned
   void* p = mi_malloc_aligned(size, alignment);
   mi_assert_internal(((uintptr_t)p % alignment) == 0);
   return p;
@@ -107,9 +111,9 @@ mi_decl_nodiscard int mi_reallocarr( void* p, size_t count, size_t size ) mi_att
     errno = EINVAL;
     return EINVAL;
   }
-  void** op = (void**)p;  
+  void** op = (void**)p;
   void* newp = mi_reallocarray(*op, count, size);
-  if (mi_unlikely(newp == NULL)) return errno;
+  if mi_unlikely(newp == NULL) { return errno; }
   *op = newp;
   return 0;
 }
@@ -146,7 +150,7 @@ int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept {
   else {
     *buf = mi_strdup(p);
     if (*buf==NULL) return ENOMEM;
-    if (size != NULL) *size = strlen(p);
+    if (size != NULL) *size = _mi_strlen(p);
   }
   return 0;
 }
diff --git a/ext/src/mimalloc/src/alloc.c b/ext/src/mimalloc/src/alloc.c
index 3e20bd5c6b..d7b34e8cb7 100644
--- a/ext/src/mimalloc/src/alloc.c
+++ b/ext/src/mimalloc/src/alloc.c
@@ -1,5 +1,6 @@
+
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -9,14 +10,16 @@ terms of the MIT license. A copy of the license can be found in the file
 #endif
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"   // _mi_prim_thread_id()
 
-#include <string.h>  // memset, strlen
-#include <stdlib.h>  // malloc, exit
+#include <string.h>      // memset, strlen (for mi_strdup)
+#include <stdlib.h>      // malloc, abort
 
 #define MI_IN_ALLOC_C
 #include "alloc-override.c"
+#include "free.c"
 #undef MI_IN_ALLOC_C
 
 // ------------------------------------------------------
@@ -25,668 +28,447 @@ terms of the MIT license. A copy of the license can be found in the file
 
 // Fast allocation in a page: just pop from the free list.
 // Fall back to generic allocation only if the list is empty.
-extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept {
-  mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size);
+// Note: in release mode the (inlined) routine is about 7 instructions with a single test.
+static mi_decl_forceinline void* mi_page_malloc_zero(mi_theap_t* theap, mi_page_t* page, size_t size, bool zero, size_t* usable) mi_attr_noexcept
+{
+  if (page->block_size != 0) { // not the empty theap
+    mi_assert_internal(mi_page_block_size(page) >= size);
+    mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+    mi_assert_internal(_mi_ptr_page(page)==page);
+  }
+
+  // check the free list
   mi_block_t* const block = page->free;
-  if (mi_unlikely(block == NULL)) {
-    return _mi_malloc_generic(heap, size); 
+  if mi_unlikely(block == NULL) {
+    return _mi_malloc_generic(theap, size, (zero ? 1 : 0), usable);
   }
   mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
+  if (usable != NULL) { *usable = mi_page_usable_block_size(page); };
+
   // pop from the free list
-  page->used++;
   page->free = mi_block_next(page, block);
+  page->used++;
   mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
+  mi_assert_internal(page->block_size < MI_MAX_ALIGN_SIZE || _mi_is_aligned(block, MI_MAX_ALIGN_SIZE));
 
-#if (MI_DEBUG>0)
-  if (!page->is_zero) { memset(block, MI_DEBUG_UNINIT, size); }
-#elif (MI_SECURE!=0)
-  block->next = 0;  // don't leak internal data
-#endif
-
-#if (MI_STAT>0)
-  const size_t bsize = mi_page_usable_block_size(page);
-  if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
-    mi_heap_stat_increase(heap, normal, bsize);
-    mi_heap_stat_counter_increase(heap, normal_count, 1);
-#if (MI_STAT>1)
-    const size_t bin = _mi_bin(bsize);
-    mi_heap_stat_increase(heap, normal_bins[bin], 1);
-#endif
+  #if MI_DEBUG>3
+  if (page->free_is_zero && size > sizeof(*block)) {
+    mi_assert_expensive(mi_mem_is_zero(block+1,size - sizeof(*block)));
   }
-#endif
-
-#if (MI_PADDING > 0) && defined(MI_ENCODE_FREELIST)
-  mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page));
-  ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE));
-  mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta));
-  padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys));
-  padding->delta  = (uint32_t)(delta);
-  uint8_t* fill = (uint8_t*)padding - delta;
-  const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes
-  for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; }
-#endif
+  #endif
 
-  return block;
-}
+  // allow use of the block internally
+  // note: when tracking we need to avoid ever touching the MI_PADDING since
+  // that is tracked by valgrind etc. as non-accessible (through the red-zone, see `mimalloc/track.h`)
+  const size_t bsize = mi_page_usable_block_size(page);
+  mi_track_mem_undefined(block, bsize);
 
-// allocate a small block
-extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
-  mi_assert(heap!=NULL);
-  mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
-  mi_assert(size <= MI_SMALL_SIZE_MAX);
-  #if (MI_PADDING)
-  if (size == 0) {
-    size = sizeof(void*);
-  }
-  #endif
-  mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE);
-  void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE);
-  mi_assert_internal(p==NULL || mi_usable_size(p) >= size);
-  #if MI_STAT>1
-  if (p != NULL) {
-    if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
-    mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
+  #if (MI_STAT>0)
+  if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
+    mi_theap_stat_increase(theap, malloc_normal, bsize);
+    #if (MI_STAT>1)
+    mi_theap_stat_counter_increase(theap, malloc_normal_count, 1);
+    const size_t bin = _mi_bin(bsize);
+    mi_theap_stat_increase(theap, malloc_bins[bin], 1);
+    mi_theap_stat_increase(theap, malloc_requested, size - MI_PADDING_SIZE);
+    #endif
   }
   #endif
-  return p;
-}
-
-extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept {
-  return mi_heap_malloc_small(mi_get_default_heap(), size);
-}
 
-// The main allocation function
-extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
-  if (mi_likely(size <= MI_SMALL_SIZE_MAX)) {
-    return mi_heap_malloc_small(heap, size);
+  // zero the block? note: we need to zero the full block size (issue #63)
+  if mi_likely(!zero) {
+    // #if MI_SECURE
+    block->next = 0;  // don't leak internal data
+    // #endif
+    #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN
+      if (!mi_page_is_huge(page)) { memset(block, MI_DEBUG_UNINIT, bsize); }
+    #endif    
   }
   else {
-    mi_assert(heap!=NULL);
-    mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
-    void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE);      // note: size can overflow but it is detected in malloc_generic
-    mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
-    #if MI_STAT>1
-    if (p != NULL) {
-      if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
-      mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
+    if (!page->free_is_zero) {
+      _mi_memzero_aligned(block, bsize);
+    }
+    else {
+      block->next = 0;
+      mi_track_mem_defined(block, bsize);
     }
-    #endif
-    return p;
   }
-}
 
-extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept {
-  return mi_heap_malloc(mi_get_default_heap(), size);
-}
-
-
-void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) {
-  // note: we need to initialize the whole usable block size to zero, not just the requested size,
-  // or the recalloc/rezalloc functions cannot safely expand in place (see issue #63)
-  MI_UNUSED(size);
-  mi_assert_internal(p != NULL);
-  mi_assert_internal(mi_usable_size(p) >= size); // size can be zero
-  mi_assert_internal(_mi_ptr_page(p)==page);
-  if (page->is_zero && size > sizeof(mi_block_t)) {
-    // already zero initialized memory
-    ((mi_block_t*)p)->next = 0;  // clear the free list pointer
-    mi_assert_expensive(mi_mem_is_zero(p, mi_usable_size(p)));
-  }
-  else {
-    // otherwise memset
-    memset(p, 0, mi_usable_size(p));
-  }
-}
+  #if MI_PADDING // && !MI_TRACK_ENABLED
+    mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + bsize);
+    ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE));
+    #if (MI_DEBUG>=2)
+    mi_assert_internal(delta >= 0 && bsize >= (size - MI_PADDING_SIZE + delta));
+    #endif
+    mi_track_mem_defined(padding,sizeof(mi_padding_t));  // note: re-enable since mi_page_usable_block_size may set noaccess
+    padding->canary = mi_ptr_encode_canary(page,block,page->keys);
+    padding->delta  = (uint32_t)(delta);
+    #if MI_PADDING_CHECK
+    if (!mi_page_is_huge(page)) {
+      uint8_t* fill = (uint8_t*)padding - delta;
+      const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes
+      for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; }
+    }
+    #endif
+  #endif
 
-// zero initialized small block
-mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept {
-  void* p = mi_malloc_small(size);
-  if (p != NULL) {
-    _mi_block_zero_init(_mi_ptr_page(p), p, size);  // todo: can we avoid getting the page again?
-  }
-  return p;
+  return block;
 }
 
-void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept {
-  void* p = mi_heap_malloc(heap,size);
-  if (zero && p != NULL) {
-    _mi_block_zero_init(_mi_ptr_page(p),p,size);  // todo: can we avoid getting the page again?
-  }
-  return p;
+// extra entries for improved efficiency in `alloc-aligned.c` (and in `page.c:mi_malloc_generic`).
+extern void* _mi_page_malloc_zero(mi_theap_t* theap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept {
+  return mi_page_malloc_zero(theap, page, size, zero, NULL);
 }
 
-extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
-  return _mi_heap_malloc_zero(heap, size, true);
-}
+#if MI_GUARDED
+mi_decl_restrict void* _mi_theap_malloc_guarded(mi_theap_t* theap, size_t size, bool zero) mi_attr_noexcept;
+#endif
 
-mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept {
-  return mi_heap_zalloc(mi_get_default_heap(),size);
-}
+// main allocation primitives for small and generic allocation
 
+// internal small size allocation
+static mi_decl_forceinline mi_decl_restrict void* mi_theap_malloc_small_zero_nonnull(mi_theap_t* theap, size_t size, bool zero, size_t* usable) mi_attr_noexcept
+{
+  mi_assert(theap != NULL);
+  mi_assert(size <= MI_SMALL_SIZE_MAX);
+  #if MI_DEBUG
+  const uintptr_t tid = _mi_thread_id();
+  mi_assert(theap->tld->thread_id == 0 || theap->tld->thread_id == tid); // theaps are thread local
+  #endif
+  #if (MI_PADDING || MI_GUARDED)
+  if mi_unlikely(size == 0) { size = sizeof(void*); }
+  #endif
+  #if MI_GUARDED
+  if mi_unlikely(mi_theap_malloc_use_guarded(theap,size)) {
+    return _mi_theap_malloc_guarded(theap, size, zero);
+  }
+  #endif
 
-// ------------------------------------------------------
-// Check for double free in secure and debug mode
-// This is somewhat expensive so only enabled for secure mode 4
-// ------------------------------------------------------
+  // get page in constant time, and allocate from it
+  mi_page_t* page = _mi_theap_get_free_small_page(theap, size + MI_PADDING_SIZE);
+  void* const p = mi_page_malloc_zero(theap, page, size + MI_PADDING_SIZE, zero, usable);
+  mi_track_malloc(p,size,zero);
 
-#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0))
-// linear check if the free list contains a specific element
-static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) {
-  while (list != NULL) {
-    if (elem==list) return true;
-    list = mi_block_next(page, list);
+  #if MI_DEBUG>3
+  if (p != NULL && zero) {
+    mi_assert_expensive(mi_mem_is_zero(p, size));
   }
-  return false;
+  #endif
+  return p;
 }
 
-static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) {
-  // The decoded value is in the same page (or NULL).
-  // Walk the free lists to verify positively if it is already freed
-  if (mi_list_contains(page, page->free, block) ||
-      mi_list_contains(page, page->local_free, block) ||
-      mi_list_contains(page, mi_page_thread_free(page), block))
-  {
-    _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page));
-    return true;
+// internal generic allocation
+static mi_decl_forceinline void* mi_theap_malloc_generic(mi_theap_t* theap, size_t size, bool zero, size_t huge_alignment, size_t* usable) mi_attr_noexcept
+{
+  #if MI_GUARDED
+  #if MI_THEAP_INITASNULL
+  if (theap!=NULL)
+  #endif
+  if (huge_alignment==0 && mi_theap_malloc_use_guarded(theap, size)) {
+    return _mi_theap_malloc_guarded(theap, size, zero);
   }
-  return false;
-}
+  #endif
+  #if !MI_THEAP_INITASNULL
+  mi_assert(theap!=NULL);
+  #endif
+  mi_assert(theap==NULL || theap->tld->thread_id == 0 || theap->tld->thread_id == _mi_thread_id());   // theaps are thread local
+  mi_assert((huge_alignment & 1)==0);
+  void* const p = _mi_malloc_generic(theap, size + MI_PADDING_SIZE, (zero ? 1 : 0) | huge_alignment, usable);  // note: size can overflow but it is detected in malloc_generic
+  mi_track_malloc(p, size, zero);
 
-static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
-  mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field
-  if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 &&  // quick check: aligned pointer?
-      (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL?
-  {
-    // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free?
-    // (continue in separate function to improve code generation)
-    return mi_check_is_double_freex(page, block);
+  #if MI_DEBUG>3
+  if (p != NULL && zero) {
+    mi_assert_expensive(mi_mem_is_zero(p, size));
   }
-  return false;
-}
-#else
-static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
-  MI_UNUSED(page);
-  MI_UNUSED(block);
-  return false;
+  #endif
+  return p;
 }
-#endif
 
-// ---------------------------------------------------------------------------
-// Check for heap block overflow by setting up padding at the end of the block
-// ---------------------------------------------------------------------------
-
-#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST)
-static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) {
-  *bsize = mi_page_usable_block_size(page);
-  const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize);
-  *delta = padding->delta;
-  return ((uint32_t)mi_ptr_encode(page,block,page->keys) == padding->canary && *delta <= *bsize);
-}
-
-// Return the exact usable size of a block.
-static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
-  size_t bsize;
-  size_t delta;
-  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
-  mi_assert_internal(ok); mi_assert_internal(delta <= bsize);
-  return (ok ? bsize - delta : 0);
-}
-
-static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) {
-  size_t bsize;
-  size_t delta;
-  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
-  *size = *wrong = bsize;
-  if (!ok) return false;
-  mi_assert_internal(bsize >= delta);
-  *size = bsize - delta;
-  uint8_t* fill = (uint8_t*)block + bsize - delta;
-  const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes
-  for (size_t i = 0; i < maxpad; i++) {
-    if (fill[i] != MI_DEBUG_PADDING) {
-      *wrong = bsize - delta + i;
-      return false;
-    }
+// internal small allocation
+static mi_decl_forceinline mi_decl_restrict void* mi_theap_malloc_small_zero(mi_theap_t* theap, size_t size, bool zero, size_t* usable) mi_attr_noexcept {
+  #if MI_THEAP_INITASNULL
+  if (theap!=NULL) {
+    return mi_theap_malloc_small_zero_nonnull(theap, size, zero, usable);
+  }
+  else {
+    return mi_theap_malloc_generic(theap, size, zero, 0, usable); // tailcall
   }
-  return true;
+  #else
+  return mi_theap_malloc_small_zero_nonnull(theap, size, zero, usable);
+  #endif
 }
 
-static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
-  size_t size;
-  size_t wrong;
-  if (!mi_verify_padding(page,block,&size,&wrong)) {
-    _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong );
-  }
-}
-
-// When a non-thread-local block is freed, it becomes part of the thread delayed free
-// list that is freed later by the owning heap. If the exact usable size is too small to
-// contain the pointer for the delayed list, then shrink the padding (by decreasing delta)
-// so it will later not trigger an overflow error in `mi_free_block`.
-static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
-  size_t bsize;
-  size_t delta;
-  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
-  mi_assert_internal(ok);
-  if (!ok || (bsize - delta) >= min_size) return;  // usually already enough space
-  mi_assert_internal(bsize >= min_size);
-  if (bsize < min_size) return;  // should never happen
-  size_t new_delta = (bsize - min_size);
-  mi_assert_internal(new_delta < bsize);
-  mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize);
-  padding->delta = (uint32_t)new_delta;
-}
-#else
-static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
-  MI_UNUSED(page);
-  MI_UNUSED(block);
+
+// allocate a small block
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_theap_malloc_small(mi_theap_t* theap, size_t size) mi_attr_noexcept {
+  return mi_theap_malloc_small_zero(theap, size, false, NULL);
 }
 
-static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
-  MI_UNUSED(block);
-  return mi_page_usable_block_size(page);
+mi_decl_nodiscard mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept {
+  return mi_theap_malloc_small(_mi_theap_default(), size);
 }
 
-static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
-  MI_UNUSED(page);
-  MI_UNUSED(block);
-  MI_UNUSED(min_size);
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+  return mi_theap_malloc_small_zero_nonnull(_mi_heap_theap(heap), size, false, NULL);
 }
-#endif
 
-// only maintain stats for smaller objects if requested
-#if (MI_STAT>0)
-static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
-  #if (MI_STAT < 2)  
-  MI_UNUSED(block);
-  #endif
-  mi_heap_t* const heap = mi_heap_get_default();
-  const size_t bsize = mi_page_usable_block_size(page);
-  #if (MI_STAT>1)
-  const size_t usize = mi_page_usable_size_of(page, block);
-  mi_heap_stat_decrease(heap, malloc, usize);
-  #endif  
-  if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
-    mi_heap_stat_decrease(heap, normal, bsize);
-    #if (MI_STAT > 1)
-    mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1);
-    #endif
-  }
-  else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
-    mi_heap_stat_decrease(heap, large, bsize);
+// The main internal allocation functions
+static mi_decl_forceinline void* mi_theap_malloc_zero_nonnull(mi_theap_t* theap, size_t size, bool zero, size_t huge_alignment, size_t* usable) mi_attr_noexcept {
+  // fast path for small objects
+  if mi_likely(size <= MI_SMALL_SIZE_MAX) {
+    mi_assert_internal(huge_alignment == 0);
+    return mi_theap_malloc_small_zero_nonnull(theap, size, zero, usable);
   }
   else {
-    mi_heap_stat_decrease(heap, huge, bsize);
+    return mi_theap_malloc_generic(theap, size, zero, huge_alignment, usable);
   }
 }
-#else
-static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
-  MI_UNUSED(page); MI_UNUSED(block);
-}
-#endif
 
-// ------------------------------------------------------
-// Free
-// ------------------------------------------------------
-
-// multi-threaded free
-static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block)
-{
-  // The padding check may access the non-thread-owned page for the key values.
-  // that is safe as these are constant and the page won't be freed (as the block is not freed yet).
-  mi_check_padding(page, block);
-  mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
-  #if (MI_DEBUG!=0)
-  memset(block, MI_DEBUG_FREED, mi_usable_size(block));
+extern mi_decl_forceinline void* _mi_theap_malloc_zero_ex(mi_theap_t* theap, size_t size, bool zero, size_t huge_alignment, size_t* usable) mi_attr_noexcept {
+  // fast path for small objects
+  #if MI_THEAP_INITASNULL
+  if mi_likely(theap!=NULL && size <= MI_SMALL_SIZE_MAX)
+  #else
+  if mi_likely(size <= MI_SMALL_SIZE_MAX)
   #endif
-
-  // huge page segments are always abandoned and can be freed immediately
-  mi_segment_t* segment = _mi_page_segment(page);
-  if (segment->kind==MI_SEGMENT_HUGE) {
-    _mi_segment_huge_page_free(segment, page, block);
-    return;
-  }
-
-  // Try to put the block on either the page-local thread free list, or the heap delayed free list.
-  mi_thread_free_t tfreex;
-  bool use_delayed;
-  mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
-  do {
-    use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE);
-    if (mi_unlikely(use_delayed)) {
-      // unlikely: this only happens on the first concurrent free in a page that is in the full list
-      tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING);
-    }
-    else {
-      // usual: directly add to page thread_free list
-      mi_block_set_next(page, block, mi_tf_block(tfree));
-      tfreex = mi_tf_set_block(tfree,block);
-    }
-  } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
-
-  if (mi_unlikely(use_delayed)) {
-    // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
-    mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page);
-    mi_assert_internal(heap != NULL);
-    if (heap != NULL) {
-      // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
-      mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
-      do {
-        mi_block_set_nextx(heap,block,dfree, heap->keys);
-      } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
-    }
-
-    // and reset the MI_DELAYED_FREEING flag
-    tfree = mi_atomic_load_relaxed(&page->xthread_free);
-    do {
-      tfreex = tfree;
-      mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
-      tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
-    } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
-  }
-}
-
-// regular free
-static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block)
-{
-  // and push it on the free list
-  if (mi_likely(local)) {
-    // owning thread can free a block directly
-    if (mi_unlikely(mi_check_is_double_free(page, block))) return;
-    mi_check_padding(page, block);
-    #if (MI_DEBUG!=0)
-    memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
-    #endif
-    mi_block_set_next(page, block, page->local_free);
-    page->local_free = block;
-    page->used--;
-    if (mi_unlikely(mi_page_all_free(page))) {
-      _mi_page_retire(page);
-    }
-    else if (mi_unlikely(mi_page_is_in_full(page))) {
-      _mi_page_unfull(page);
-    }
+  {
+    mi_assert_internal(huge_alignment == 0);
+    return mi_theap_malloc_small_zero_nonnull(theap, size, zero, usable);
   }
   else {
-    _mi_free_block_mt(page,block);
+    return mi_theap_malloc_generic(theap, size, zero, huge_alignment, usable);
   }
 }
 
-
-// Adjust a block that was allocated aligned, to the actual start of the block in the page.
-mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) {
-  mi_assert_internal(page!=NULL && p!=NULL);
-  const size_t diff   = (uint8_t*)p - _mi_page_start(segment, page, NULL);
-  const size_t adjust = (diff % mi_page_block_size(page));
-  return (mi_block_t*)((uintptr_t)p - adjust);
+void* _mi_theap_malloc_zero(mi_theap_t* theap, size_t size, bool zero, size_t* usable) mi_attr_noexcept {
+  return _mi_theap_malloc_zero_ex(theap, size, zero, 0, usable);
 }
 
 
-static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool local, void* p) mi_attr_noexcept {
-  mi_page_t* const page = _mi_segment_page_of(segment, p);
-  mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
-  mi_stat_free(page, block);
-  _mi_free_block(page, local, block);
-}
+// Main allocation functions
 
-// Get the segment data belonging to a pointer
-// This is just a single `and` in assembly but does further checks in debug mode
-// (and secure mode) if this was a valid pointer.
-static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) 
-{
-  MI_UNUSED(msg);
-#if (MI_DEBUG>0)
-  if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) {
-    _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
-    return NULL;
-  }
-#endif
-
-  mi_segment_t* const segment = _mi_ptr_segment(p);
-  if (mi_unlikely(segment == NULL)) return NULL;  // checks also for (p==NULL)
-
-#if (MI_DEBUG>0)
-  if (mi_unlikely(!mi_is_in_heap_region(p))) {
-    _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
-      "(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
-    if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) {
-      _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
-    }
-  }
-#endif
-#if (MI_DEBUG>0 || MI_SECURE>=4)
-  if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) {
-    _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
-    return NULL;
-  }
-#endif
-  return segment;
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_theap_malloc(mi_theap_t* theap, size_t size) mi_attr_noexcept {
+  return _mi_theap_malloc_zero(theap, size, false, NULL);
 }
 
-// Free a block 
-void mi_free(void* p) mi_attr_noexcept
-{
-  mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
-  if (mi_unlikely(segment == NULL)) return; 
-
-  mi_threadid_t tid = _mi_thread_id();
-  mi_page_t* const page = _mi_segment_page_of(segment, p);
-  
-  if (mi_likely(tid == mi_atomic_load_relaxed(&segment->thread_id) && page->flags.full_aligned == 0)) {  // the thread id matches and it is not a full page, nor has aligned blocks
-    // local, and not full or aligned
-    mi_block_t* block = (mi_block_t*)(p);
-    if (mi_unlikely(mi_check_is_double_free(page,block))) return;
-    mi_check_padding(page, block);
-    mi_stat_free(page, block);
-    #if (MI_DEBUG!=0)
-    memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
-    #endif
-    mi_block_set_next(page, block, page->local_free);
-    page->local_free = block;
-    if (mi_unlikely(--page->used == 0)) {   // using this expression generates better code than: page->used--; if (mi_page_all_free(page))    
-      _mi_page_retire(page);
-    }
-  }
-  else {
-    // non-local, aligned blocks, or a full page; use the more generic path
-    // note: recalc page in generic to improve code generation
-    mi_free_generic(segment, tid == segment->thread_id, p);
-  }
+mi_decl_nodiscard mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept {
+   return mi_theap_malloc(_mi_theap_default(), size);
 }
 
-bool _mi_free_delayed_block(mi_block_t* block) {
-  // get segment and page
-  const mi_segment_t* const segment = _mi_ptr_segment(block);
-  mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
-  mi_assert_internal(_mi_thread_id() == segment->thread_id);
-  mi_page_t* const page = _mi_segment_page_of(segment, block);
-
-  // Clear the no-delayed flag so delayed freeing is used again for this page.
-  // This must be done before collecting the free lists on this page -- otherwise
-  // some blocks may end up in the page `thread_free` list with no blocks in the
-  // heap `thread_delayed_free` list which may cause the page to be never freed!
-  // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
-  _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */);
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+  return mi_theap_malloc_zero_nonnull(_mi_heap_theap(heap), size, false, 0, NULL);
+}
 
-  // collect all other non-local frees to ensure up-to-date `used` count
-  _mi_page_free_collect(page, false);
 
-  // and free the block (possibly freeing the page as well since used is updated)
-  _mi_free_block(page, true, block);
-  return true;
+// zero initialized small block
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept {
+  return mi_theap_malloc_small_zero(_mi_theap_default(), size, true, NULL);
 }
 
-// Bytes available in a block
-mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept {
-  const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p);
-  const size_t size = mi_page_usable_size_of(page, block);
-  const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block;
-  mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
-  return (size - adjust);
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_theap_zalloc(mi_theap_t* theap, size_t size) mi_attr_noexcept {
+  return _mi_theap_malloc_zero(theap, size, true, NULL);
 }
 
-static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
-  const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg);
-  if (segment==NULL) return 0;  // also returns 0 if `p == NULL`
-  const mi_page_t* const page = _mi_segment_page_of(segment, p);  
-  if (mi_likely(!mi_page_has_aligned(page))) {
-    const mi_block_t* block = (const mi_block_t*)p;
-    return mi_page_usable_size_of(page, block);
-  }
-  else {
-    // split out to separate routine for improved code generation
-    return mi_page_usable_aligned_size_of(segment, page, p);
-  }
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept {
+  return _mi_theap_malloc_zero(_mi_theap_default(), size, true, NULL);
 }
 
-size_t mi_usable_size(const void* p) mi_attr_noexcept {
-  return _mi_usable_size(p, "mi_usable_size");
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+  return mi_theap_malloc_zero_nonnull(_mi_heap_theap(heap), size, true, 0, NULL);
 }
 
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_theap_calloc(mi_theap_t* theap, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count,size,&total)) return NULL;
+  return mi_theap_zalloc(theap,total);
+}
 
-// ------------------------------------------------------
-// ensure explicit external inline definitions are emitted!
-// ------------------------------------------------------
-
-#ifdef __cplusplus
-void* _mi_externs[] = {
-  (void*)&_mi_page_malloc,
-  (void*)&mi_malloc,
-  (void*)&mi_malloc_small,
-  (void*)&mi_zalloc_small,
-  (void*)&mi_heap_malloc,
-  (void*)&mi_heap_zalloc,
-  (void*)&mi_heap_malloc_small
-};
-#endif
+mi_decl_nodiscard mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
+  return mi_theap_calloc(_mi_theap_default(),count,size);
+}
 
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_heap_zalloc(heap, total);
+}
 
-// ------------------------------------------------------
-// Allocation extensions
-// ------------------------------------------------------
+// Allocation variants that additionally return the usable size through `usable`
+mi_decl_nodiscard mi_decl_restrict void* mi_umalloc_small(size_t size, size_t* usable) mi_attr_noexcept {
+  return mi_theap_malloc_small_zero(_mi_theap_default(), size, false, usable);
+}
 
-void mi_free_size(void* p, size_t size) mi_attr_noexcept {
-  MI_UNUSED_RELEASE(size);
-  mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size"));
-  mi_free(p);
+mi_decl_nodiscard mi_decl_restrict void* mi_theap_umalloc(mi_theap_t* theap, size_t size, size_t* usable) mi_attr_noexcept {
+  return _mi_theap_malloc_zero_ex(theap, size, false, 0, usable);
 }
 
-void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept {
-  MI_UNUSED_RELEASE(alignment);
-  mi_assert(((uintptr_t)p % alignment) == 0);
-  mi_free_size(p,size);
+mi_decl_nodiscard mi_decl_restrict void* mi_umalloc(size_t size, size_t* usable) mi_attr_noexcept {
+  return mi_theap_umalloc(_mi_theap_default(), size, usable);
 }
 
-void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept {
-  MI_UNUSED_RELEASE(alignment);
-  mi_assert(((uintptr_t)p % alignment) == 0);
-  mi_free(p);
+mi_decl_nodiscard mi_decl_restrict void* mi_uzalloc(size_t size, size_t* usable) mi_attr_noexcept {
+  return _mi_theap_malloc_zero_ex(_mi_theap_default(), size, true, 0, usable);
 }
 
-extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+mi_decl_nodiscard mi_decl_restrict void* mi_ucalloc(size_t count, size_t size, size_t* usable) mi_attr_noexcept {
   size_t total;
   if (mi_count_size_overflow(count,size,&total)) return NULL;
-  return mi_heap_zalloc(heap,total);
+  return mi_uzalloc(total, usable);
 }
 
-mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
-  return mi_heap_calloc(mi_get_default_heap(),count,size);
+// Uninitialized `calloc`
+static mi_decl_restrict void* mi_theap_mallocn(mi_theap_t* theap, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_theap_malloc(theap, total);
 }
 
-// Uninitialized `calloc`
-extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+mi_decl_nodiscard mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
+  return mi_theap_mallocn(_mi_theap_default(),count,size);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
   size_t total;
   if (mi_count_size_overflow(count, size, &total)) return NULL;
   return mi_heap_malloc(heap, total);
 }
 
-mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
-  return mi_heap_mallocn(mi_get_default_heap(),count,size);
-}
 
 // Expand (or shrink) in place (or fail)
 void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
   #if MI_PADDING
-  // we do not shrink/expand with padding enabled 
+  // we do not shrink/expand with padding enabled
   MI_UNUSED(p); MI_UNUSED(newsize);
   return NULL;
   #else
   if (p == NULL) return NULL;
-  const size_t size = _mi_usable_size(p,"mi_expand");
+  const mi_page_t* const page = mi_validate_ptr_page(p,"mi_expand");
+  const size_t size = _mi_usable_size(p,page);
   if (newsize > size) return NULL;
   return p; // it fits
   #endif
 }
 
-void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept {
-  const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL
-  if (mi_unlikely(newsize <= size && newsize >= (size / 2))) {
-    // todo: adjust potential padding to reflect the new size?
+void* _mi_theap_realloc_zero(mi_theap_t* theap, void* p, size_t newsize, bool zero, size_t* usable_pre, size_t* usable_post) mi_attr_noexcept {
+  // if p == NULL then behave as malloc.
+  // else if newsize == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)).
+  // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.)
+  const mi_page_t* page;
+  size_t size;
+  if (p==NULL) {
+    page = NULL;
+    size = 0;
+    if (usable_pre!=NULL) { *usable_pre = 0; }
+  }
+  else {
+    page = mi_validate_ptr_page(p,"mi_realloc");
+    size = _mi_usable_size(p,page);
+    if (usable_pre!=NULL) { *usable_pre = mi_page_usable_block_size(page); }
+  }
+  if mi_unlikely(newsize<=size && newsize>=(size/2) && newsize>0  // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0)
+                  && mi_page_heap(page)==theap->heap)             // and within the same heap
+  {
+    mi_assert_internal(p!=NULL);
+    // todo: do not track as the usable size is still the same in the free; adjust potential padding?
+    // mi_track_resize(p,size,newsize)
+    // if (newsize < size) { mi_track_mem_noaccess((uint8_t*)p + newsize, size - newsize); }
+    if (usable_post!=NULL) { *usable_post = mi_page_usable_block_size(page); }
     return p;  // reallocation still fits and not more than 50% waste
   }
-  void* newp = mi_heap_malloc(heap,newsize);
-  if (mi_likely(newp != NULL)) {
+  void* newp = mi_theap_umalloc(theap,newsize,usable_post);
+  if mi_likely(newp != NULL) {
     if (zero && newsize > size) {
       // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
       const size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
-      memset((uint8_t*)newp + start, 0, newsize - start);
+      _mi_memzero((uint8_t*)newp + start, newsize - start);
+    }
+    else if (newsize == 0) {
+      ((uint8_t*)newp)[0] = 0; // work around for applications that expect zero-reallocation to be zero initialized (issue #725)
     }
-    if (mi_likely(p != NULL)) {
-      _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
-      mi_free(p); // only free the original pointer if successful
+    if mi_likely(p != NULL) {
+      const size_t copysize = (newsize > size ? size : newsize);
+      mi_track_mem_defined(p,copysize);  // _mi_usable_size may be too large for byte precise memory tracking..
+      _mi_memcpy(newp, p, copysize);
+      mi_free(p); // only free the original pointer if successful  // todo: optimize since page is known?
     }
   }
   return newp;
 }
 
-void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
-  return _mi_heap_realloc_zero(heap, p, newsize, false);  
+mi_decl_nodiscard void* mi_theap_realloc(mi_theap_t* theap, void* p, size_t newsize) mi_attr_noexcept {
+  return _mi_theap_realloc_zero(theap, p, newsize, false, NULL, NULL);
 }
 
-void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+static void* mi_theap_reallocn(mi_theap_t* theap, void* p, size_t count, size_t size) mi_attr_noexcept {
   size_t total;
   if (mi_count_size_overflow(count, size, &total)) return NULL;
-  return mi_heap_realloc(heap, p, total);
+  return mi_theap_realloc(theap, p, total);
 }
 
 
 // Reallocate but free `p` on errors
-void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
-  void* newp = mi_heap_realloc(heap, p, newsize);
+static void* mi_theap_reallocf(mi_theap_t* theap, void* p, size_t newsize) mi_attr_noexcept {
+  void* newp = mi_theap_realloc(theap, p, newsize);
   if (newp==NULL && p!=NULL) mi_free(p);
   return newp;
 }
 
-void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
-  return _mi_heap_realloc_zero(heap, p, newsize, true);
+static void* mi_theap_rezalloc(mi_theap_t* theap, void* p, size_t newsize) mi_attr_noexcept {
+  return _mi_theap_realloc_zero(theap, p, newsize, true, NULL, NULL);
 }
 
-void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+static void* mi_theap_recalloc(mi_theap_t* theap, void* p, size_t count, size_t size) mi_attr_noexcept {
   size_t total;
   if (mi_count_size_overflow(count, size, &total)) return NULL;
-  return mi_heap_rezalloc(heap, p, total);
+  return mi_theap_rezalloc(theap, p, total);
 }
 
 
-void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
-  return mi_heap_realloc(mi_get_default_heap(),p,newsize);
+mi_decl_nodiscard void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
+  return mi_theap_realloc(_mi_theap_default(),p,newsize);
 }
 
-void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
-  return mi_heap_reallocn(mi_get_default_heap(),p,count,size);
+mi_decl_nodiscard void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
+  return mi_theap_reallocn(_mi_theap_default(),p,count,size);
+}
+
+mi_decl_nodiscard void* mi_urealloc(void* p, size_t newsize, size_t* usable_pre, size_t* usable_post) mi_attr_noexcept {
+  return _mi_theap_realloc_zero(_mi_theap_default(),p,newsize, false, usable_pre, usable_post);
 }
 
 // Reallocate but free `p` on errors
-void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
-  return mi_heap_reallocf(mi_get_default_heap(),p,newsize);
+mi_decl_nodiscard void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
+  return mi_theap_reallocf(_mi_theap_default(),p,newsize);
+}
+
+mi_decl_nodiscard void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
+  return mi_theap_rezalloc(_mi_theap_default(), p, newsize);
 }
 
-void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
-  return mi_heap_rezalloc(mi_get_default_heap(), p, newsize);
+mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
+  return mi_theap_recalloc(_mi_theap_default(), p, count, size);
 }
 
-void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
-  return mi_heap_recalloc(mi_get_default_heap(), p, count, size);
+
+mi_decl_nodiscard void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+  return mi_theap_realloc(_mi_heap_theap(heap), p, newsize);
+}
+
+mi_decl_nodiscard void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+  return mi_theap_reallocn(_mi_heap_theap(heap), p, count, size);
+}
+
+// Reallocate but free `p` on errors
+mi_decl_nodiscard void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+  return mi_theap_reallocf(_mi_heap_theap(heap), p, newsize);
+}
+
+mi_decl_nodiscard void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+  return mi_theap_rezalloc(_mi_heap_theap(heap), p, newsize);
+}
+
+mi_decl_nodiscard void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+  return mi_theap_recalloc(_mi_heap_theap(heap), p, count, size);
 }
 
 
@@ -696,33 +478,41 @@ void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
 // ------------------------------------------------------
 
 // `strdup` using mi_malloc
-mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
+mi_decl_nodiscard static mi_decl_restrict char* mi_theap_strdup(mi_theap_t* theap, const char* s) mi_attr_noexcept {
   if (s == NULL) return NULL;
-  size_t n = strlen(s);
-  char* t = (char*)mi_heap_malloc(heap,n+1);
-  if (t != NULL) _mi_memcpy(t, s, n + 1);
+  size_t len = _mi_strlen(s);
+  char* t = (char*)mi_theap_malloc(theap,len+1);
+  if (t == NULL) return NULL;
+  _mi_memcpy(t, s, len);
+  t[len] = 0;
   return t;
 }
 
-mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept {
-  return mi_heap_strdup(mi_get_default_heap(), s);
+mi_decl_nodiscard mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept {
+  return mi_theap_strdup(_mi_theap_default(), s);
+}
+
+mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
+  return mi_theap_strdup(_mi_heap_theap(heap), s);
 }
 
 // `strndup` using mi_malloc
-mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept {
+mi_decl_nodiscard static mi_decl_restrict char* mi_theap_strndup(mi_theap_t* theap, const char* s, size_t n) mi_attr_noexcept {
   if (s == NULL) return NULL;
-  const char* end = (const char*)memchr(s, 0, n);  // find end of string in the first `n` characters (returns NULL if not found)
-  const size_t m = (end != NULL ? (size_t)(end - s) : n);  // `m` is the minimum of `n` or the end-of-string
-  mi_assert_internal(m <= n);
-  char* t = (char*)mi_heap_malloc(heap, m+1);
+  const size_t len = _mi_strnlen(s,n);  // len <= n
+  char* t = (char*)mi_theap_malloc(theap, len+1);
   if (t == NULL) return NULL;
-  _mi_memcpy(t, s, m);
-  t[m] = 0;
+  _mi_memcpy(t, s, len);
+  t[len] = 0;
   return t;
 }
 
-mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
-  return mi_heap_strndup(mi_get_default_heap(),s,n);
+mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
+  return mi_theap_strndup(_mi_theap_default(),s,n);
+}
+
+mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept {
+  return mi_theap_strndup(_mi_heap_theap(heap), s, n);
 }
 
 #ifndef __wasi__
@@ -731,8 +521,8 @@ mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
 #ifndef PATH_MAX
 #define PATH_MAX MAX_PATH
 #endif
-#include <windows.h>
-mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
+
+mi_decl_nodiscard static mi_decl_restrict char* mi_theap_realpath(mi_theap_t* theap, const char* fname, char* resolved_name) mi_attr_noexcept {
   // todo: use GetFullPathNameW to allow longer file names
   char buf[PATH_MAX];
   DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? buf : resolved_name), NULL);
@@ -746,40 +536,31 @@ mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char
     return resolved_name;
   }
   else {
-    return mi_heap_strndup(heap, buf, PATH_MAX);
+    return mi_theap_strndup(theap, buf, PATH_MAX);
   }
 }
 #else
-#include <unistd.h>  // pathconf
-static size_t mi_path_max(void) {
-  static size_t path_max = 0;
-  if (path_max <= 0) {
-    long m = pathconf("/",_PC_PATH_MAX);
-    if (m <= 0) path_max = 4096;      // guess
-    else if (m < 256) path_max = 256; // at least 256
-    else path_max = m;
-  }
-  return path_max;
-}
-
-char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
+char* mi_theap_realpath(mi_theap_t* theap, const char* fname, char* resolved_name) mi_attr_noexcept {
   if (resolved_name != NULL) {
     return realpath(fname,resolved_name);
   }
   else {
-    size_t n  = mi_path_max();
-    char* buf = (char*)mi_malloc(n+1);
-    if (buf==NULL) return NULL;
-    char* rname  = realpath(fname,buf);
-    char* result = mi_heap_strndup(heap,rname,n); // ok if `rname==NULL`
-    mi_free(buf);
+    char* rname = realpath(fname, NULL);
+    if (rname == NULL) return NULL;
+    char* result = mi_theap_strdup(theap, rname);
+    mi_cfree(rname);  // use checked free (which may be redirected to our free but that's ok)
+    // note: with ASAN realpath is intercepted and mi_cfree may leak the returned pointer :-(
     return result;
   }
 }
 #endif
 
-mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
-  return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name);
+mi_decl_nodiscard mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
+  return mi_theap_realpath(_mi_theap_default(),fname,resolved_name);
+}
+
+mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
+  return mi_theap_realpath(_mi_heap_theap(heap), fname, resolved_name);
 }
 #endif
 
@@ -800,12 +581,16 @@ static bool mi_try_new_handler(bool nothrow) {
   #else
     std::new_handler h = std::set_new_handler();
     std::set_new_handler(h);
-  #endif  
+  #endif
   if (h==NULL) {
-    _mi_error_message(ENOMEM, "out of memory in 'new'");      
+    _mi_error_message(ENOMEM, "out of memory in 'new'");
+    #if defined(_CPPUNWIND) || defined(__cpp_exceptions)  // exceptions are not always enabled
     if (!nothrow) {
       throw std::bad_alloc();
     }
+    #else
+    MI_UNUSED(nothrow);
+    #endif
     return false;
   }
   else {
@@ -816,8 +601,8 @@ static bool mi_try_new_handler(bool nothrow) {
 #else
 typedef void (*std_new_handler_t)(void);
 
-#if (defined(__GNUC__) || defined(__clang__))
-std_new_handler_t __attribute((weak)) _ZSt15get_new_handlerv(void) {
+#if (defined(__GNUC__) || (defined(__clang__) && !defined(_MSC_VER)))  // exclude clang-cl, see issue #631
+std_new_handler_t __attribute__((weak)) _ZSt15get_new_handlerv(void) {
   return NULL;
 }
 static std_new_handler_t mi_get_new_handler(void) {
@@ -825,7 +610,7 @@ static std_new_handler_t mi_get_new_handler(void) {
 }
 #else
 // note: on windows we could dynamically link to `?get_new_handler@std@@YAP6AXXZXZ`.
-static std_new_handler_t mi_get_new_handler() {
+static std_new_handler_t mi_get_new_handler(void) {
   return NULL;
 }
 #endif
@@ -833,7 +618,7 @@ static std_new_handler_t mi_get_new_handler() {
 static bool mi_try_new_handler(bool nothrow) {
   std_new_handler_t h = mi_get_new_handler();
   if (h==NULL) {
-    _mi_error_message(ENOMEM, "out of memory in 'new'");       
+    _mi_error_message(ENOMEM, "out of memory in 'new'");
     if (!nothrow) {
       abort();  // cannot throw in plain C, use abort
     }
@@ -846,27 +631,67 @@ static bool mi_try_new_handler(bool nothrow) {
 }
 #endif
 
-static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow ) {
+static mi_decl_noinline void* mi_theap_try_new(mi_theap_t* theap, size_t size, bool nothrow ) {
   void* p = NULL;
   while(p == NULL && mi_try_new_handler(nothrow)) {
-    p = mi_malloc(size);
+    p = mi_theap_malloc(theap,size);
   }
   return p;
 }
 
-mi_decl_restrict void* mi_new(size_t size) {
-  void* p = mi_malloc(size);
-  if (mi_unlikely(p == NULL)) return mi_try_new(size,false);
+static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow) {
+  return mi_theap_try_new(_mi_theap_default(), size, nothrow);
+}
+
+static mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow) {
+  return mi_theap_try_new(_mi_heap_theap(heap), size, nothrow);
+}
+
+
+mi_decl_nodiscard static mi_decl_restrict void* mi_theap_alloc_new(mi_theap_t* theap, size_t size) {
+  void* p = mi_theap_malloc(theap,size);
+  if mi_unlikely(p == NULL) return mi_theap_try_new(theap, size, false);
+  return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t size) {
+  return mi_theap_alloc_new(_mi_theap_default(), size);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) {
+  void* p = mi_heap_malloc(heap, size);
+  if mi_unlikely(p == NULL) return mi_heap_try_new(heap, size, false);
   return p;
 }
 
-mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept {
+
+mi_decl_nodiscard static mi_decl_restrict void* mi_theap_alloc_new_n(mi_theap_t* theap, size_t count, size_t size) {
+  size_t total;
+  if mi_unlikely(mi_count_size_overflow(count, size, &total)) {
+    mi_try_new_handler(false);  // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
+    return NULL;
+  }
+  else {
+    return mi_theap_alloc_new(theap,total);
+  }
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) {
+  return mi_theap_alloc_new_n(_mi_theap_default(), count, size);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) {
+  return mi_theap_alloc_new_n(_mi_heap_theap(heap), count, size);
+}
+
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept {
   void* p = mi_malloc(size);
-  if (mi_unlikely(p == NULL)) return mi_try_new(size, true);
+  if mi_unlikely(p == NULL) return mi_try_new(size, true);
   return p;
 }
 
-mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) {
+mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) {
   void* p;
   do {
     p = mi_malloc_aligned(size, alignment);
@@ -875,7 +700,7 @@ mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) {
   return p;
 }
 
-mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept {
+mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept {
   void* p;
   do {
     p = mi_malloc_aligned(size, alignment);
@@ -884,18 +709,7 @@ mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_
   return p;
 }
 
-mi_decl_restrict void* mi_new_n(size_t count, size_t size) {
-  size_t total;
-  if (mi_unlikely(mi_count_size_overflow(count, size, &total))) {
-    mi_try_new_handler(false);  // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
-    return NULL;
-  }
-  else {
-    return mi_new(total);
-  }
-}
-
-void* mi_new_realloc(void* p, size_t newsize) {
+mi_decl_nodiscard void* mi_new_realloc(void* p, size_t newsize) {
   void* q;
   do {
     q = mi_realloc(p, newsize);
@@ -903,9 +717,9 @@ void* mi_new_realloc(void* p, size_t newsize) {
   return q;
 }
 
-void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
+mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
   size_t total;
-  if (mi_unlikely(mi_count_size_overflow(newcount, size, &total))) {
+  if mi_unlikely(mi_count_size_overflow(newcount, size, &total)) {
     mi_try_new_handler(false);  // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
     return NULL;
   }
@@ -913,3 +727,109 @@ void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
     return mi_new_realloc(p, total);
   }
 }
+
+#if MI_GUARDED
+// We always allocate a guarded allocation at an offset (`mi_page_has_interior_pointers` will be true).
+// We then set the first word of the block to `0` for regular offset aligned allocations (in `alloc-aligned.c`)
+// and the first word to `~0` for guarded allocations to have a correct `mi_usable_size`
+
+static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
+  // TODO: we can still make padding work by moving it out of the guard page area
+  mi_page_t* const page = _mi_ptr_page(block);
+  mi_page_set_has_interior_pointers(page, true);
+  block->next = MI_BLOCK_TAG_GUARDED;
+
+  // set guard page at the end of the block
+  const size_t block_size = mi_page_block_size(page);  // must use `block_size` to match `mi_free_local`
+  const size_t os_page_size = _mi_os_page_size();
+  mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t));
+  if (block_size < obj_size + os_page_size + sizeof(mi_block_t)) {
+    // should never happen
+    mi_free(block);
+    return NULL;
+  }
+  uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size;
+  // note: the alignment of the guard page relies on blocks being os_page_size aligned which
+  // is ensured in `mi_arena_page_alloc_fresh`.
+  mi_assert_internal(_mi_is_aligned(block, os_page_size));
+  mi_assert_internal(_mi_is_aligned(guard_page, os_page_size));
+  if (!page->memid.is_pinned && _mi_is_aligned(guard_page, os_page_size)) {
+    const bool ok = _mi_os_protect(guard_page, os_page_size);
+    if mi_unlikely(!ok) {
+      _mi_warning_message("failed to set a guard page behind an object (object %p of size %zu)\n", block, block_size);
+    }
+  }
+  else {
+    _mi_warning_message("unable to set a guard page behind an object due to pinned memory (large OS pages?) (object %p of size %zu)\n", block, block_size);
+  }
+
+  // align pointer just in front of the guard page
+  size_t offset = block_size - os_page_size - obj_size;
+  mi_assert_internal(offset > sizeof(mi_block_t));
+  if (offset > MI_PAGE_MAX_OVERALLOC_ALIGN) {
+    // give up on placing it right in front of the guard page if the offset is too large for unalignment
+    offset = MI_PAGE_MAX_OVERALLOC_ALIGN;
+  }
+  void* p = (uint8_t*)block + offset;
+  mi_track_align(block, p, offset, obj_size);
+  mi_track_mem_defined(block, sizeof(mi_block_t));
+  return p;
+}
+
+mi_decl_restrict void* _mi_theap_malloc_guarded(mi_theap_t* theap, size_t size, bool zero) mi_attr_noexcept
+{
+  #if defined(MI_PADDING_SIZE)
+  mi_assert(MI_PADDING_SIZE==0);
+  #endif
+  // allocate multiple of page size ending in a guard page
+  // ensure minimal alignment requirement?
+  const size_t os_page_size = _mi_os_page_size();
+  const size_t obj_size = (mi_option_is_enabled(mi_option_guarded_precise) ? size : _mi_align_up(size, MI_MAX_ALIGN_SIZE));
+  const size_t bsize    = _mi_align_up(_mi_align_up(obj_size, MI_MAX_ALIGN_SIZE) + sizeof(mi_block_t), MI_MAX_ALIGN_SIZE);
+  const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size);
+  mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(theap, req_size, (zero ? 1 : 0), NULL);
+  if (block==NULL) return NULL;
+  void* const p = mi_block_ptr_set_guarded(block, obj_size);
+
+  // stats
+  mi_track_malloc(p, size, zero);
+  if (p != NULL) {
+    if (!mi_theap_is_initialized(theap)) { theap = _mi_theap_default(); }
+    #if MI_STAT>1
+    mi_theap_stat_adjust_decrease(theap, malloc_requested, req_size);
+    mi_theap_stat_increase(theap, malloc_requested, size);
+    #endif
+    mi_theap_stat_counter_increase(theap, malloc_guarded_count, 1);
+  }
+  #if MI_DEBUG>3
+  if (p != NULL && zero) {
+    mi_assert_expensive(mi_mem_is_zero(p, size));
+  }
+  #endif
+  return p;
+}
+#endif
+
+// ------------------------------------------------------
+// ensure explicit external inline definitions are emitted!
+// ------------------------------------------------------
+
+#ifdef __cplusplus
+void* _mi_externs[] = {
+  (void*)&_mi_page_malloc_zero,
+  (void*)&_mi_theap_malloc_zero,
+  (void*)&_mi_theap_malloc_zero_ex,
+  (void*)&mi_theap_malloc,
+  (void*)&mi_theap_zalloc,
+  (void*)&mi_theap_malloc_small,
+  (void*)&mi_malloc,
+  (void*)&mi_malloc_small,
+  (void*)&mi_zalloc,
+  (void*)&mi_zalloc_small,
+  (void*)&mi_heap_malloc,
+  (void*)&mi_heap_malloc_small,
+  (void*)&mi_malloc_aligned
+  // (void*)&mi_theap_alloc_new,
+  // (void*)&mi_theap_alloc_new_n
+};
+#endif
diff --git a/ext/src/mimalloc/src/arena-meta.c b/ext/src/mimalloc/src/arena-meta.c
new file mode 100644
index 0000000000..7ff336e8c2
--- /dev/null
+++ b/ext/src/mimalloc/src/arena-meta.c
@@ -0,0 +1,179 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2024, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+  We have a special "mini" allocator just for allocation of meta-data like
+  the theap (`mi_theap_t`) or thread-local data (`mi_tld_t`).
+
+  We reuse the bitmap of the arenas for allocation of small fixed-size
+  blocks (`MI_META_BLOCK_SIZE`) inside an arena slice (64KiB).
+  We always ensure that meta data is zero'd (we zero on `free`)
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "bitmap.h"
+
+/* -----------------------------------------------------------
+  Meta data allocation
+----------------------------------------------------------- */
+
+#define MI_META_PAGE_SIZE         MI_ARENA_SLICE_SIZE
+#define MI_META_PAGE_ALIGN        MI_ARENA_SLICE_ALIGN
+
+// large enough such that MI_META_MAX_SIZE > 4KiB (even on 32-bit)
+#define MI_META_BLOCK_SIZE        (1 << (16 - MI_BCHUNK_BITS_SHIFT))        // 128 on 64-bit
+#define MI_META_BLOCK_ALIGN       MI_META_BLOCK_SIZE
+#define MI_META_BLOCKS_PER_PAGE   (MI_META_PAGE_SIZE / MI_META_BLOCK_SIZE)  // 512
+#define MI_META_MAX_SIZE          (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE)
+
+#if MI_META_MAX_SIZE <= 4096
+#error "max meta object size should be at least 4KiB"
+#endif
+
+typedef struct mi_meta_page_s  {
+  _Atomic(struct mi_meta_page_s*)  next;    // a linked list of meta-data pages (never released)
+  mi_memid_t                       memid;   // provenance of the meta-page memory itself
+  mi_bbitmap_t                     blocks_free;  // a small bitmap with 1 bit per block.
+} mi_meta_page_t;
+
+static mi_decl_cache_align _Atomic(mi_meta_page_t*)  mi_meta_pages = MI_ATOMIC_VAR_INIT(NULL);
+
+
+#if MI_DEBUG > 1
+static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) {
+  mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + _mi_os_secure_guard_page_size());
+  if (block_idx != NULL) {
+    *block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE;
+  }
+  return mpage;
+}
+#endif
+
+static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) {
+  return mi_atomic_load_ptr_acquire(mi_meta_page_t, &mpage->next);
+}
+
+static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) {
+  mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - _mi_os_secure_guard_page_size(), MI_META_PAGE_ALIGN));
+  mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE);
+  void* p = ((uint8_t*)mpage - _mi_os_secure_guard_page_size() + (block_idx * MI_META_BLOCK_SIZE));
+  mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL));
+  return p;
+}
+
+// allocate a fresh meta page and add it to the global list.
+static mi_meta_page_t* mi_meta_page_zalloc(void) {
+  // allocate a fresh arena slice
+  // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. (same with _mi_os_numa_node()...)
+  mi_memid_t memid;
+  uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(mi_heap_main(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0,
+                                                                    true /* commit*/, (MI_SECURE==0) /* allow large? */,
+                                                                    NULL /* req arena */, 0 /* thread_seq */, -1 /* numa node */, &memid);
+  if (base == NULL) return NULL;
+  mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN));
+  if (!memid.initially_zero) {
+    _mi_memzero_aligned(base, MI_ARENA_SLICE_SIZE);
+  }
+
+  // guard pages
+  #if MI_SECURE >= 1
+  _mi_os_secure_guard_page_set_at(base, memid);
+  _mi_os_secure_guard_page_set_before(base + MI_META_PAGE_SIZE, memid);
+  #endif
+
+  // initialize the page and free block bitmap
+  mi_meta_page_t* mpage = (mi_meta_page_t*)(base + _mi_os_secure_guard_page_size());
+  mpage->memid = memid;
+  mi_bbitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */);
+  const size_t mpage_size  = offsetof(mi_meta_page_t,blocks_free) + mi_bbitmap_size(MI_META_BLOCKS_PER_PAGE, NULL);
+  const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE);
+  const size_t guard_blocks = _mi_divide_up(_mi_os_secure_guard_page_size(), MI_META_BLOCK_SIZE);
+  mi_assert_internal(info_blocks + 2*guard_blocks < MI_META_BLOCKS_PER_PAGE);
+  mi_bbitmap_unsafe_setN(&mpage->blocks_free, info_blocks + guard_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks - 2*guard_blocks);
+
+  // push atomically in front of the meta page list
+  // (note: there is no ABA issue since we never free meta-pages)
+  mi_meta_page_t* old = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages);
+  do {
+    mi_atomic_store_ptr_release(mi_meta_page_t, &mpage->next, old);
+  } while(!mi_atomic_cas_ptr_weak_acq_rel(mi_meta_page_t,&mi_meta_pages,&old,mpage));
+  return mpage;
+}
+
+
+// allocate meta-data
+mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid )
+{
+  mi_assert_internal(pmemid != NULL);
+  size = _mi_align_up(size,MI_META_BLOCK_SIZE);
+  if (size == 0 || size > MI_META_MAX_SIZE) return NULL;
+  const size_t block_count = _mi_divide_up(size,MI_META_BLOCK_SIZE);
+  mi_assert_internal(block_count > 0 && block_count < MI_BCHUNK_BITS);
+  mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages);
+  mi_meta_page_t* mpage = mpage0;
+  while (mpage != NULL) {
+    size_t block_idx;
+    if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) {
+      // found and claimed `block_count` blocks
+      *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count);
+      return mi_meta_block_start(mpage,block_idx);
+    }
+    else {
+      mpage = mi_meta_page_next(mpage);
+    }
+  }
+  // failed to find space in existing pages
+  if (mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages) != mpage0) {
+    // the page list was updated by another thread in the meantime, retry
+    return _mi_meta_zalloc(size,pmemid);
+  }
+  // otherwise, allocate a fresh metapage and try once more
+  mpage = mi_meta_page_zalloc();
+  if (mpage != NULL) {
+    size_t block_idx;
+    if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) {
+      // found and claimed `block_count` blocks
+      *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count);
+      return mi_meta_block_start(mpage,block_idx);
+    }
+  }
+  // if all this failed, allocate from the OS
+  return _mi_os_alloc(size, pmemid);
+}
+
+// free meta-data
+mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) {
+  if (p==NULL) return;
+  if (memid.memkind == MI_MEM_META) {
+    mi_assert_internal(_mi_divide_up(size, MI_META_BLOCK_SIZE) == memid.mem.meta.block_count);
+    const size_t block_count = memid.mem.meta.block_count;
+    const size_t block_idx   = memid.mem.meta.block_index;
+    mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page;
+    mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage);
+    mi_assert_internal(block_idx + block_count <= MI_META_BLOCKS_PER_PAGE);
+    mi_assert_internal(mi_bbitmap_is_clearN(&mpage->blocks_free, block_idx, block_count));
+    // we zero on free (and on the initial page allocation) so we don't need a "dirty" map
+    _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE);
+    mi_bbitmap_setN(&mpage->blocks_free, block_idx, block_count);
+  }
+  else {
+    _mi_arenas_free(p,size,memid);
+  }
+}
+
+// used for debug output
+bool _mi_meta_is_meta_page(void* p)
+{
+  mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t, &mi_meta_pages);
+  mi_meta_page_t* mpage = mpage0;
+  while (mpage != NULL) {
+    if ((void*)mpage == p) return true;
+    mpage = mi_meta_page_next(mpage);
+  }
+  return false;
+}
diff --git a/ext/src/mimalloc/src/arena.c b/ext/src/mimalloc/src/arena.c
index 6b1e951f34..1de14dbe37 100644
--- a/ext/src/mimalloc/src/arena.c
+++ b/ext/src/mimalloc/src/arena.c
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2019-2021, Microsoft Research, Daan Leijen
+Copyright (c) 2019-2025, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -7,108 +7,188 @@ terms of the MIT license. A copy of the license can be found in the file
 
 /* ----------------------------------------------------------------------------
 "Arenas" are fixed area's of OS memory from which we can allocate
-large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB).
+large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 64KiB).
 In contrast to the rest of mimalloc, the arenas are shared between
 threads and need to be accessed using atomic operations.
 
-Currently arenas are only used to for huge OS page (1GiB) reservations,
-or direct OS memory reservations -- otherwise it delegates to direct allocation from the OS.
-In the future, we can expose an API to manually add more kinds of arenas
-which is sometimes needed for embedded devices or shared memory for example.
-(We can also employ this with WASI or `sbrk` systems to reserve large arenas
- on demand and be able to reuse them efficiently).
+Arenas are also used for huge OS page (1GiB) reservations or for reserving
+OS memory upfront, which can improve performance or is sometimes needed
+on embedded devices. We can also employ this with WASI or `sbrk` systems
+to reserve large arenas upfront and be able to reuse the memory more effectively.
 
 The arena allocation needs to be thread safe and we use an atomic bitmap to allocate.
 -----------------------------------------------------------------------------*/
+
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"
+#include "bitmap.h"
 
-#include <string.h>  // memset
-#include <errno.h> // ENOMEM
+/* -----------------------------------------------------------
+  Arena id's
+----------------------------------------------------------- */
 
-#include "bitmap.h"  // atomic bitmap
+mi_arena_id_t _mi_arena_id_none(void) {
+  return NULL;  // the "none" arena id is represented by NULL
+}
 
+mi_arena_t* _mi_arena_from_id(mi_arena_id_t id) {
+  mi_arena_t* const arena = (mi_arena_t*)id;  // an arena id is converted to its arena by a plain cast
+  mi_assert_internal(arena==NULL || arena->parent==NULL); // id's should never point to sub-arena's
+  return arena;
+}
 
-// os.c
-void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* stats);
-void  _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);
+mi_arena_id_t mi_arena_id_from_arena(mi_arena_t* arena) {
+  mi_assert_internal(arena==NULL || arena->parent==NULL);  // only top-level arenas (no parent) are converted to an id
+  if (arena != NULL) { return (mi_arena_id_t)arena; } else { return _mi_arena_id_none(); }
+}
 
-void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize);
-void  _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats);
 
-bool  _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-bool  _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+static bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena) {
+  if (arena == req_arena) return true;                         // exact match (this includes both being NULL)
+  if (arena == NULL) return false;                             // a NULL arena cannot satisfy a specific (non-NULL) request
+  if (req_arena == NULL && !arena->is_exclusive) return true;  // no specific arena was requested and this arena is not exclusive
+  if (arena->parent != NULL && arena->parent == req_arena) return true;  // sub-arena of the requested arena (note that req_arena is never a sub arena)
+  return false;
+}
 
+bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena) {
+  // memory that is not of the arena kind is checked as if it came from the NULL arena
+  mi_arena_t* owner = NULL;
+  if (memid.memkind == MI_MEM_ARENA) {
+    owner = memid.mem.arena.arena;
+  }
+  return mi_arena_is_suitable(owner, request_arena);
+}
 
-/* -----------------------------------------------------------
-  Arena allocation
------------------------------------------------------------ */
+size_t mi_arenas_get_count(mi_subproc_t* subproc) {
+  return mi_atomic_load_relaxed(&subproc->arena_count);  // relaxed atomic load of the sub-process arena counter
+}
+
+mi_arena_t* mi_arena_from_index(mi_subproc_t* subproc, size_t idx) {
+  mi_assert_internal(idx < mi_arenas_get_count(subproc));  // the index must be below the current arena count
+  return mi_atomic_load_ptr_relaxed(mi_arena_t, &subproc->arenas[idx]);  // relaxed atomic load of the arena pointer in slot `idx`
+}
+
+static size_t mi_arena_info_slices(mi_arena_t* arena) {
+  return arena->info_slices;
+}
+
+#if MI_DEBUG > 1
+// debug only: is `page` an arena page of `arena` whose slice is set in the heap's page bitmap for that arena?
+static bool mi_heap_has_page(mi_heap_t* heap, mi_arena_t* arena, mi_page_t* page) {
+  mi_assert(arena->arena_idx < MI_MAX_ARENAS);
+  mi_arena_pages_t* const apages = heap->arena_pages[arena->arena_idx];
+  if (page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.arena != arena) return false;
+  if (apages == NULL) return false;
+  return mi_bitmap_is_setN(apages->pages, page->memid.mem.arena.slice_index, 1);
+}
+#endif
 
+size_t mi_arena_min_alignment(void) {
+  return MI_ARENA_SLICE_ALIGN;
+}
 
-// Block info: bit 0 contains the `in_use` bit, the upper bits the
-// size in count of arena blocks.
-typedef uintptr_t mi_block_info_t;
-#define MI_ARENA_BLOCK_SIZE   (MI_SEGMENT_SIZE)        // 8MiB  (must be at least MI_SEGMENT_ALIGN)
-#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2)  // 4MiB
-#define MI_MAX_ARENAS         (64)                     // not more than 256 (since we use 8 bits in the memid)
+size_t mi_arena_min_size(void) {
+  return MI_ARENA_MIN_SIZE;
+}
 
-// A memory arena descriptor
-typedef struct mi_arena_s {
-  _Atomic(uint8_t*) start;                // the start of the memory area
-  size_t   block_count;                   // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
-  size_t   field_count;                   // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
-  int      numa_node;                     // associated NUMA node
-  bool     is_zero_init;                  // is the arena zero initialized?
-  bool     allow_decommit;                // is decommit allowed? if true, is_large should be false and blocks_committed != NULL
-  bool     is_large;                      // large- or huge OS pages (always committed)
-  _Atomic(size_t) search_idx;             // optimization to start the search for free blocks
-  mi_bitmap_field_t* blocks_dirty;        // are the blocks potentially non-zero?
-  mi_bitmap_field_t* blocks_committed;    // are the blocks committed? (can be NULL for memory that cannot be decommitted)
-  mi_bitmap_field_t  blocks_inuse[1];     // in-place bitmap of in-use blocks (of size `field_count`)
-} mi_arena_t;
+static size_t mi_arena_max_object_size(void) {
+  // take the configured maximum and round it up to a whole number of slices
+  const size_t requested = _mi_align_up(mi_option_get_size(mi_option_arena_max_object_size), MI_ARENA_SLICE_SIZE);
+  // never report less than the minimal arena object size
+  if (requested <= MI_ARENA_MIN_OBJ_SIZE) {
+    return MI_ARENA_MIN_OBJ_SIZE;
+  }
+  // cap at the maximal arena size minus a bchunk to accommodate meta info
+  if (requested >= MI_ARENA_MAX_SIZE - MI_BCHUNK_SIZE) {
+    return (MI_ARENA_MAX_SIZE - MI_BCHUNK_SIZE);
+  }
+  return requested;
+}
 
+mi_decl_nodiscard static bool mi_arena_commit(mi_arena_t* arena, void* start, size_t size, bool* is_zero, size_t already_committed) {
+  // an arena with its own commit callback handles the commit itself
+  if (arena != NULL && arena->commit_fun != NULL) {
+    return (*arena->commit_fun)(true, start, size, is_zero, arena->commit_fun_arg);
+  }
+  // otherwise commit through the OS, passing `already_committed` along when it is non-zero
+  if (already_committed > 0) {
+    return _mi_os_commit_ex(start, size, is_zero, already_committed);
+  }
+  return _mi_os_commit(start, size, is_zero);
+}
 
-// The available arenas
-static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS];
-static mi_decl_cache_align _Atomic(size_t)      mi_arena_count; // = 0
 
 
 /* -----------------------------------------------------------
-  Arena allocations get a memory id where the lower 8 bits are
-  the arena index +1, and the upper bits the block index.
+  Util
 ----------------------------------------------------------- */
 
-// Use `0` as a special id for direct OS allocated memory.
-#define MI_MEMID_OS   0
 
-static size_t mi_arena_id_create(size_t arena_index, mi_bitmap_index_t bitmap_index) {
-  mi_assert_internal(arena_index < 0xFE);
-  mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow?
-  return ((bitmap_index << 8) | ((arena_index+1) & 0xFF));
+// Size of an arena
+static size_t mi_arena_size(mi_arena_t* arena) {
+  return mi_size_of_slices(arena->slice_count);
 }
 
-static void mi_arena_id_indices(size_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) {
-  mi_assert_internal(memid != MI_MEMID_OS);
-  *arena_index = (memid & 0xFF) - 1;
-  *bitmap_index = (memid >> 8);
+// Start of the arena memory area
+static uint8_t* mi_arena_start(mi_arena_t* arena) {
+  return ((uint8_t*)arena);
 }
 
-static size_t mi_block_count_of_size(size_t size) {
-  return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
+// Start of a slice
+uint8_t* mi_arena_slice_start(mi_arena_t* arena, size_t slice_index) {
+  return (mi_arena_start(arena) + mi_size_of_slices(slice_index));
 }
 
-/* -----------------------------------------------------------
-  Thread safe allocation in an arena
------------------------------------------------------------ */
-static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
-{
-  size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx);  // start from last search; ok to be relaxed as the exact start does not matter
-  if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) {
-    mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx));  // start search from found location next time around
-    return true;
-  };
-  return false;
+// Arena area: returns the start of the arena's memory (or NULL when the id
+// yields no arena) and, if `size` is non-NULL, stores the arena's total size there.
+void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
+  if (size != NULL) { *size = 0; }  // reported size when there is no arena
+  mi_arena_t* const arena = _mi_arena_from_id(arena_id);
+  if (arena == NULL) { return NULL; }
+  // sanity check (only when a size is requested): the slices fit in the total size
+  mi_assert_internal(size == NULL || mi_size_of_slices(arena->slice_count) <= arena->total_size);
+  if (size != NULL) { *size = arena->total_size; }
+  return mi_arena_start(arena);
+}
+
+
+// Create an arena memid that records the arena and the slice range (`slice_index`, `slice_count`)
+static mi_memid_t mi_memid_create_arena(mi_arena_t* arena, size_t slice_index, size_t slice_count) {
+  mi_assert_internal(slice_index < UINT32_MAX);  // index and count are stored as `uint32_t` below
+  mi_assert_internal(slice_count < UINT32_MAX);
+  mi_assert_internal(slice_count > 0);
+  mi_assert_internal(slice_index < arena->slice_count);  // the range must start inside the arena
+  mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA);
+  memid.mem.arena.arena = arena;
+  memid.mem.arena.slice_index = (uint32_t)slice_index;
+  memid.mem.arena.slice_count = (uint32_t)slice_count;
+  return memid;
+}
+
+// get the arena of an arena `memid`; the slice span is written to `slice_index`/`slice_count` (either may be NULL to skip it)
+static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* slice_index, size_t* slice_count) {
+  mi_assert_internal(memid.memkind == MI_MEM_ARENA);  // only valid for memids of the arena kind
+  mi_arena_t* arena = memid.mem.arena.arena;
+  if (slice_index!=NULL) { *slice_index = memid.mem.arena.slice_index; }
+  if (slice_count!=NULL) { *slice_count = memid.mem.arena.slice_count; }
+  return arena;
+}
+
+static size_t mi_page_full_size(mi_page_t* page) {
+  if (page->memid.memkind == MI_MEM_ARENA) {
+    return page->memid.mem.arena.slice_count * MI_ARENA_SLICE_SIZE;
+  }
+  else if (mi_memid_is_os(page->memid) || page->memid.memkind == MI_MEM_EXTERNAL) {
+    mi_assert_internal((uint8_t*)page->memid.mem.os.base <= (uint8_t*)page);
+    const ptrdiff_t presize = (uint8_t*)page - (uint8_t*)page->memid.mem.os.base;
+    mi_assert_internal((ptrdiff_t)page->memid.mem.os.size >= presize);
+    return (presize > (ptrdiff_t)page->memid.mem.os.size ? 0 : page->memid.mem.os.size - presize);
+  }
+  else {
+    return 0;
+  }
 }
 
 
@@ -116,331 +196,2223 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t*
   Arena Allocation
 ----------------------------------------------------------- */
 
-static mi_decl_noinline void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
-                                                  bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+static mi_decl_noinline void* mi_arena_try_alloc_at(
+  mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid)
 {
-  mi_bitmap_index_t bitmap_index;
-  if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL;
-
-  // claimed it! set the dirty bits (todo: no need for an atomic op here?)
-  void* p    = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE);
-  *memid     = mi_arena_id_create(arena_index, bitmap_index);
-  *is_zero   = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
-  *large     = arena->is_large;
-  *is_pinned = (arena->is_large || !arena->allow_decommit);
-  if (arena->blocks_committed == NULL) {
-    // always committed
-    *commit = true;
-  }
-  else if (*commit) {
-    // arena not committed as a whole, but commit requested: ensure commit now
-    bool any_uncommitted;
-    _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
-    if (any_uncommitted) {
-      bool commit_zero;
-      _mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats);
-      if (commit_zero) *is_zero = true;
+  size_t slice_index;
+  if (!mi_bbitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL;
+
+  // claimed it!
+  void* p = mi_arena_slice_start(arena, slice_index);
+  *memid = mi_memid_create_arena(arena, slice_index, slice_count);
+  memid->is_pinned = arena->memid.is_pinned;
+
+  // set the dirty bits and track which slices become accessible
+  size_t touched_slices = slice_count;
+  if (arena->memid.initially_zero) {
+    size_t already_dirty = 0;
+    memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, &already_dirty);
+    mi_assert_internal(already_dirty <= touched_slices);
+    touched_slices -= already_dirty;
+  }
+
+  // set commit state
+  if (commit) {
+    // commit requested, but the range may not be committed as a whole: ensure it is committed now
+    const size_t already_committed = mi_bitmap_popcountN(arena->slices_committed, slice_index, slice_count);
+    if (already_committed < slice_count) {
+      // not all committed, try to commit now
+      bool commit_zero = false;
+      if (!_mi_os_commit_ex(p, mi_size_of_slices(slice_count), &commit_zero, mi_size_of_slices(slice_count - already_committed))) {
+        // if the commit fails, release ownership, and return NULL;
+        // note: this does not roll back dirty bits but that is ok.
+        mi_bbitmap_setN(arena->slices_free, slice_index, slice_count);
+        return NULL;
+      }
+      if (commit_zero) {
+        memid->initially_zero = true;
+      }
+
+      // set the commit bits
+      mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, NULL);
+
+      // committed
+      #if MI_DEBUG > 1
+      if (memid->initially_zero) {
+        if (!mi_mem_is_zero(p, mi_size_of_slices(slice_count))) {
+          _mi_error_message(EFAULT, "internal error: arena allocation was not zero-initialized!\n");
+          memid->initially_zero = false;
+        }
+      }
+      #endif
+    }
+    else {
+      // already fully committed.
+      _mi_os_reuse(p, mi_size_of_slices(slice_count));
+      // if the OS has overcommit, and this is the first time we access these pages, then
+      // count the commit now (as at arena reserve we didn't count those commits as these are on-demand)
+      if (_mi_os_has_overcommit() && touched_slices > 0) {
+        mi_subproc_stat_increase( arena->subproc, committed, mi_size_of_slices(touched_slices));
+      }
+    }
+
+    mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count));
+    memid->initially_committed = true;
+
+    // tool support
+    if (memid->initially_zero) {
+      mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE);
+    }
+    else {
+      mi_track_mem_undefined(p, slice_count * MI_ARENA_SLICE_SIZE);
     }
   }
   else {
-    // no need to commit, but check if already fully committed
-    *commit = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
+    // no need to commit, but check if it is already fully committed
+    memid->initially_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count);
+    if (!memid->initially_committed) {
+      // partly committed.. adjust stats
+      size_t already_committed_count = 0;
+      mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count);
+      mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count);
+      mi_subproc_stat_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count));
+    }
   }
+
+  mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count));
+  if (commit) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); }
+  if (commit) { mi_assert_internal(memid->initially_committed); }
+  mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count));
+
   return p;
 }
 
-static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
-{  
-  MI_UNUSED_RELEASE(alignment);
-  mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
-  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);  
-  const size_t bcount = mi_block_count_of_size(size);
-  if (mi_likely(max_arena == 0)) return NULL;
-  mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
 
-  // try numa affine allocation
-  for (size_t i = 0; i < max_arena; i++) {
-    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
-    if (arena==NULL) break; // end reached
-    if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local?
-      (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
-    {
-      void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, memid, tld);
-      mi_assert_internal((uintptr_t)p % alignment == 0);
-      if (p != NULL) {
-        return p;
-      }
-    }
+static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id);
+
+// try to reserve a fresh arena space
+static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t* arena_id)
+{
+  const size_t arena_count = mi_arenas_get_count(subproc);
+  if (arena_count > (MI_MAX_ARENAS - 4)) return false;
+
+  // calc reserve
+  size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve);
+  if (arena_reserve == 0) return false;
+
+  if (!_mi_os_has_virtual_reserve()) {
+    arena_reserve = arena_reserve/4;  // be conservative if virtual reserve is not supported (for WASM for example)
   }
+  arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE);
 
-  // try from another numa node instead..
-  for (size_t i = 0; i < max_arena; i++) {
-    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
-    if (arena==NULL) break; // end reached
-    if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local!
-      (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
-    {
-      void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, memid, tld);
-      mi_assert_internal((uintptr_t)p % alignment == 0);
-      if (p != NULL) {
-        return p;
-      }
+  if (arena_count >= 1 && arena_count <= 128) {
+    // scale up the arena sizes exponentially every 8 entries
+    const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16);
+    size_t reserve = 0;
+    if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) {
+      arena_reserve = reserve;
     }
   }
-  return NULL;
-}
 
+  // try to accommodate the requested size for huge allocations
+  req_size = _mi_align_up(req_size + MI_ARENA_MAX_CHUNK_OBJ_SIZE, MI_ARENA_MAX_CHUNK_OBJ_SIZE); // over-reserve for meta-info
+  if (arena_reserve < req_size) {
+    arena_reserve = req_size;
+  }
 
-void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero,
-                              size_t* memid, mi_os_tld_t* tld)
-{
-  mi_assert_internal(commit != NULL && is_pinned != NULL && is_zero != NULL && memid != NULL && tld != NULL);
-  mi_assert_internal(size > 0);
-  *memid   = MI_MEMID_OS;
-  *is_zero = false;
-  *is_pinned = false;
+  // check arena bounds
+  const size_t min_reserve = MI_ARENA_MIN_SIZE;
+  const size_t max_reserve = MI_ARENA_MAX_SIZE;   // 16 GiB
+  if (arena_reserve < min_reserve) {
+    arena_reserve = min_reserve;
+  }
+  else if (arena_reserve > max_reserve) {
+    arena_reserve = max_reserve;
+  }
 
-  bool default_large = false;
-  if (large==NULL) large = &default_large;     // ensure `large != NULL`
-  const int numa_node = _mi_os_numa_node(tld); // current numa node
+  // should be able to at least handle the current allocation size
+  if (arena_reserve < req_size) return false;
 
-  // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
-  if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN) {
-    void* p = mi_arena_allocate(numa_node, size, alignment, commit, large, is_pinned, is_zero, memid, tld);
-    if (p != NULL) return p;
-  }
+  // commit eagerly?
+  bool arena_commit = false;
+  const bool overcommit = _mi_os_has_overcommit();
+  if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = overcommit; }
+  else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
 
-  // finally, fall back to the OS
-  if (mi_option_is_enabled(mi_option_limit_os_alloc)) {
-    errno = ENOMEM;
-    return NULL;
+  // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice
+  // is actually allocated for the first time it will be counted.
+  const bool adjust = (overcommit && arena_commit);
+  if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve); }
+  // and try to reserve the arena
+  int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
+  if (err != 0) {
+    if (adjust) { mi_subproc_stat_adjust_increase( subproc, committed, arena_reserve); } // roll back
+    // failed to allocate: try a smaller size arena as fallback?
+    const size_t small_arena_reserve = 4 * MI_ARENA_MIN_SIZE; // 128 MiB (or 32 MiB on 32-bit)
+    if (arena_reserve > small_arena_reserve && small_arena_reserve > req_size) {
+      // try again
+      if (adjust) { mi_subproc_stat_adjust_decrease(subproc, committed, small_arena_reserve); }
+      err = mi_reserve_os_memory_ex2(subproc, small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
+      if (err != 0 && adjust) { mi_subproc_stat_adjust_increase( subproc, committed, small_arena_reserve); } // roll back
+    }
   }
-  *is_zero = true;
-  *memid   = MI_MEMID_OS;  
-  void* p = _mi_os_alloc_aligned(size, alignment, *commit, large, tld->stats);
-  if (p != NULL) *is_pinned = *large;
-  return p;
+  return (err==0);
 }
 
-void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
-{
-  return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, commit, large, is_pinned, is_zero, memid, tld);
-}
+
+
 
 /* -----------------------------------------------------------
-  Arena free
+  Arena iteration
 ----------------------------------------------------------- */
 
-void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_os_tld_t* tld) {
-  mi_assert_internal(size > 0 && tld->stats != NULL);
-  if (p==NULL) return;
-  if (size==0) return;
+static inline bool mi_arena_is_suitable_ex(mi_arena_t* arena, mi_arena_t* req_arena, bool match_numa, int numa_node, bool allow_pinned) {
+  // pinned arenas are only usable when the caller allows pinned memory
+  if (!allow_pinned && arena->memid.is_pinned) { return false; }
+  if (!mi_arena_is_suitable(arena, req_arena)) { return false; }
+  // a specifically requested arena is accepted without looking at numa affinity
+  if (req_arena != NULL) { return true; }
+  const bool numa_ok = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node);
+  // `match_numa` selects the numa-suitable arenas; without it exactly the non-suitable ones
+  return (match_numa == numa_ok);
+}
 
-  if (memid == MI_MEMID_OS) {
-    // was a direct OS allocation, pass through
-    _mi_os_free_ex(p, size, all_committed, tld->stats);
+// determine the start of search; important to keep heaps and threads
+// into their own memory regions to reduce contention.
+static size_t mi_arena_start_idx(mi_heap_t* heap, size_t tseq, size_t arena_cycle) {
+  const size_t hseq   = heap->heap_seq;
+  const size_t hcount = mi_atomic_load_relaxed(&heap->subproc->heap_count);
+  if (arena_cycle <= 1)     return 0;
+  if (hseq==0 || hcount<=1) return (tseq % arena_cycle); // common for single heap programs
+
+  // spread heaps evenly among arena's, and then evenly for threads in their fraction
+  size_t start;
+  mi_assert_internal(arena_cycle <= 0x8FF);             // prevent overflow on 32-bit
+  const size_t frac = (arena_cycle * 256) / hcount;     // fraction in the arena_cycle; at most: arena_cycle * 0x100
+  if (frac==0) {
+    // many heaps (> 256 per arena)
+    start = (hseq % arena_cycle);
   }
   else {
-    // allocated in an arena
-    size_t arena_idx;
-    size_t bitmap_idx;
-    mi_arena_id_indices(memid, &arena_idx, &bitmap_idx);
-    mi_assert_internal(arena_idx < MI_MAX_ARENAS);
-    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]);
-    mi_assert_internal(arena != NULL);
-    const size_t blocks = mi_block_count_of_size(size);
-    // checks
-    if (arena == NULL) {
-      _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
-      return;
-    }
-    mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx));
-    if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) {
-      _mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
-      return;
-    }
-    // potentially decommit
-    if (!arena->allow_decommit || arena->blocks_committed == NULL) {
-      mi_assert_internal(all_committed); // note: may be not true as we may "pretend" to be not committed (in segment.c)
-    }
-    else {
-      mi_assert_internal(arena->blocks_committed != NULL);
-      _mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, tld->stats); // ok if this fails
-      _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
+    const size_t hspot = (hseq % hcount);
+    start = (frac * hspot) / 256;
+    if (frac >= 512) {  // at least 2 arena's per heap?
+      start = start + (tseq % (frac/256));
     }
-    // and make it available to others again 
-    bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
-    if (!all_inuse) {
-      _mi_error_message(EAGAIN, "trying to free an already freed block: %p, size %zu\n", p, size);
-      return;
-    };
   }
+  mi_assert_internal(start < arena_cycle);
+  return start;
 }
 
+#define mi_forall_arenas(heap, req_arena, tseq, name_arena) { \
+  const size_t _arena_count = mi_arenas_get_count(heap->subproc); /* the count is read once, before the loop */ \
+  const size_t _arena_cycle = (_arena_count == 0 ? 0 : _arena_count - 1); /* first search the arenas below the last one */ \
+  /* the rotating part of the search starts at an index chosen by mi_arena_start_idx */ \
+  const size_t _start = mi_arena_start_idx(heap,tseq,_arena_cycle); \
+  for (size_t _i = 0; _i < _arena_count; _i++) { \
+    mi_arena_t* name_arena; \
+    if (req_arena != NULL) { \
+      name_arena = req_arena; /* if there is a specific req_arena, only search that one */\
+      if (_i > 0) break;      /* only once */ \
+    } \
+    else { \
+      size_t _idx; \
+      if (_i < _arena_cycle) { \
+        _idx = _i + _start; \
+        if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* wrap around so we rotate through the cycle */ \
+      } \
+      else { \
+        _idx = _i; /* remaining arena's after the cycle */ \
+      } \
+      name_arena = mi_arena_from_index(heap->subproc,_idx); \
+    } \
+    if (name_arena != NULL) /* the body only runs for non-NULL arenas */ \
+    {
+
+#define mi_forall_arenas_end()  \
+    } \
+  } \
+  }
+
+#define mi_forall_suitable_arenas(heap, req_arena, tseq, match_numa, numa_node, allow_large, name_arena) \
+  mi_forall_arenas(heap, req_arena,tseq,name_arena) { \
+    if (mi_arena_is_suitable_ex(name_arena, req_arena, match_numa, numa_node, allow_large)) { /* `allow_large` is passed as the `allow_pinned` argument */ \
+
+#define mi_forall_suitable_arenas_end() \
+  }} \
+  mi_forall_arenas_end()
+
 /* -----------------------------------------------------------
-  Add an arena.
+  Arena allocation
 ----------------------------------------------------------- */
 
-static bool mi_arena_add(mi_arena_t* arena) {
-  mi_assert_internal(arena != NULL);
-  mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0);
-  mi_assert_internal(arena->block_count > 0);
+// allocate slices from the arenas
+static mi_decl_noinline void* mi_arenas_try_find_free(
+  mi_heap_t* heap, size_t slice_count, size_t alignment,
+  bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
+{
+  // mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_CHUNK_OBJ_SIZE));
+  mi_assert(alignment <= MI_ARENA_SLICE_ALIGN);
+  if (alignment > MI_ARENA_SLICE_ALIGN) return NULL;
 
-  size_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
-  if (i >= MI_MAX_ARENAS) {
-    mi_atomic_decrement_acq_rel(&mi_arena_count);
-    return false;
+  // search arena's
+  mi_forall_suitable_arenas(heap, req_arena, tseq, true /* only numa matching */, numa_node, allow_large, arena)
+  {
+    void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid);
+    if (p != NULL) return p;
   }
-  mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena);
-  return true;
+  mi_forall_suitable_arenas_end();
+  if (numa_node < 0) return NULL;
+
+  // search again but now regardless of preferred numa affinity
+  mi_forall_suitable_arenas(heap, req_arena, tseq, false /* numa non-matching now */, numa_node, allow_large, arena)
+  {
+    void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid);
+    if (p != NULL) return p;
+  }
+  mi_forall_suitable_arenas_end();
+  return NULL;
 }
 
-bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept
+// Allocate `slice_count` slices from the arena's -- potentially reserving a fresh arena. Returns NULL on failure.
+static mi_decl_noinline void* mi_arenas_try_alloc(
+  mi_heap_t* heap,
+  size_t slice_count, size_t alignment,
+  bool commit, bool allow_large,
+  mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
 {
-  if (size < MI_ARENA_BLOCK_SIZE) return false;
+  // mi_assert(slice_count <= MI_ARENA_MAX_CHUNK_OBJ_SLICES);
+  mi_assert(alignment <= MI_ARENA_SLICE_ALIGN);
+  void* p;
+
+  // not too large? (the request must fit within the maximum arena size)
+  if (slice_count * MI_ARENA_SLICE_SIZE > MI_ARENA_MAX_SIZE) return NULL;
+
+  // try to find free slices in the existing arena's
+  p = mi_arenas_try_find_free(heap, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid);
+  if (p != NULL) return p;
+
+  // did we need a specific arena? (then we never reserve a new one)
+  if (req_arena != NULL) return NULL;
+
+  // don't create arena's while preloading (todo: or should we?)
+  if (_mi_preloading()) return NULL;
+
+  // don't create arena's if OS allocation is disallowed
+  if (mi_option_is_enabled(mi_option_disallow_os_alloc)) return NULL;
 
-  if (is_large) {
-    mi_assert_internal(is_committed);
-    is_committed = true;
+  // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?)
+  mi_subproc_t* const subproc = heap->subproc;
+  const size_t arena_count = mi_arenas_get_count(subproc);  // sampled before taking the lock
+  mi_lock(&subproc->arena_reserve_lock) {
+    if (arena_count == mi_arenas_get_count(subproc)) {
+      // the count did not change since we sampled it: we are the first to enter the lock, reserve a fresh arena
+      mi_arena_id_t arena_id = _mi_arena_id_none();
+      mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, &arena_id);  // result is not checked here; the retry below decides
+    }
+    else {
+      // another thread already reserved a new arena
+    }
   }
-  
-  const size_t bcount = size / MI_ARENA_BLOCK_SIZE; 
-  const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);
-  const size_t bitmaps = (is_committed ? 2 : 3);
-  const size_t asize  = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t));
-  mi_arena_t* arena   = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
-  if (arena == NULL) return false;
+  // try once more to allocate, now that a new arena may have been reserved (by us or by another thread)
+  mi_assert_internal(req_arena == NULL);
+  p = mi_arenas_try_find_free(heap, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid);
+  if (p != NULL) return p;
 
-  arena->block_count = bcount;
-  arena->field_count = fields;
-  arena->start = (uint8_t*)start;
-  arena->numa_node    = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
-  arena->is_large     = is_large;
-  arena->is_zero_init = is_zero;
-  arena->allow_decommit = !is_large && !is_committed; // only allow decommit for initially uncommitted memory
-  arena->search_idx   = 0;
-  arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
-  arena->blocks_committed = (!arena->allow_decommit ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
-  // the bitmaps are already zero initialized due to os_alloc
-  // initialize committed bitmap?
-  if (arena->blocks_committed != NULL && is_committed) {
-    memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
-  }
-  // and claim leftover blocks if needed (so we never allocate there)
-  ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
-  mi_assert_internal(post >= 0);
-  if (post > 0) {
-    // don't use leftover bits at the end
-    mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
-    _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
-  }
-
-  mi_arena_add(arena);
-  return true;
+  return NULL;
 }
 
-// Reserve a range of regular OS memory
-int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept 
+// Allocate directly from the OS (if allowed); otherwise sets `errno` to ENOMEM and returns NULL.
+static void* mi_arena_os_alloc_aligned(
+  size_t size, size_t alignment, size_t align_offset,
+  bool commit, bool allow_large,
+  mi_arena_id_t req_arena_id, mi_memid_t* memid)
 {
-  size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block
-  bool large = allow_large;
-  void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, &large, &_mi_stats_main);
-  if (start==NULL) return ENOMEM;
-  if (!mi_manage_os_memory(start, size, (large || commit), large, true, -1)) {
-    _mi_os_free_ex(start, size, commit, &_mi_stats_main);
-    _mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size,1024));
-    return ENOMEM;
+  // if we cannot use OS allocation (disallowed by option, or a specific arena was requested), return NULL
+  if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) {
+    errno = ENOMEM;
+    return NULL;
   }
-  _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size,1024), large ? " (in large os pages)" : "");
-  return 0;
-}
 
-static size_t mi_debug_show_bitmap(const char* prefix, mi_bitmap_field_t* fields, size_t field_count ) {
-  size_t inuse_count = 0;
-  for (size_t i = 0; i < field_count; i++) {
-    char buf[MI_BITMAP_FIELD_BITS + 1];
-    uintptr_t field = mi_atomic_load_relaxed(&fields[i]);
-    for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++) {
-      bool inuse = ((((uintptr_t)1 << bit) & field) != 0);
-      if (inuse) inuse_count++;
-      buf[MI_BITMAP_FIELD_BITS - 1 - bit] = (inuse ? 'x' : '.');
-    }
-    buf[MI_BITMAP_FIELD_BITS] = 0;
-    _mi_verbose_message("%s%s\n", prefix, buf);
+  if (align_offset > 0) {  // a non-zero offset uses the at-offset variant
+    return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid);
+  }
+  else {
+    return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid);
   }
-  return inuse_count;
 }
 
-void mi_debug_show_arenas(void) mi_attr_noexcept {
-  size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count);
-  for (size_t i = 0; i < max_arenas; i++) {
-    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
-    if (arena == NULL) break;
-    size_t inuse_count = 0;
-    _mi_verbose_message("arena %zu: %zu blocks with %zu fields\n", i, arena->block_count, arena->field_count);
-    inuse_count += mi_debug_show_bitmap("  ", arena->blocks_inuse, arena->field_count);
-    _mi_verbose_message("  blocks in use ('x'): %zu\n", inuse_count);
+
+// Allocate large sized memory: try the arena's first when the request qualifies, and otherwise (or on failure) fall back to the OS
+void* _mi_arenas_alloc_aligned( mi_heap_t* heap,
+  size_t size, size_t alignment, size_t align_offset,
+  bool commit, bool allow_large,
+  mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
+{
+  mi_assert_internal(memid != NULL);
+  mi_assert_internal(size > 0);
+
+  // try to allocate in an arena if the alignment is small enough and the object is not too small (as for theap meta data)
+  if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) &&                // is arena allocation allowed?
+      size >= MI_ARENA_MIN_OBJ_SIZE && size <= mi_arena_max_object_size() &&  // and not too small or too large
+      alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0)                 // and good alignment
+  {
+    const size_t slice_count = mi_slice_count_of_size(size);
+    void* p = mi_arenas_try_alloc(heap, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid);
+    if (p != NULL) return p;
   }
+
+  // fall back to the OS (this yields NULL if OS allocation is disallowed or a specific arena was requested)
+  void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena, memid);
+  return p;
+}
+
+void* _mi_arenas_alloc(mi_heap_t* heap, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
+{  // convenience wrapper: same as `_mi_arenas_alloc_aligned` with MI_ARENA_SLICE_SIZE alignment and a zero alignment offset
+  return _mi_arenas_alloc_aligned(heap, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, numa_node, memid);
 }
 
+
+
 /* -----------------------------------------------------------
-  Reserve a huge page arena.
+  Arena page allocation
 ----------------------------------------------------------- */
-// reserve at a specific numa node
-int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {
-  if (pages==0) return 0;
-  if (numa_node < -1) numa_node = -1;
-  if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count();
-  size_t hsize = 0;
-  size_t pages_reserved = 0;
-  void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize);
-  if (p==NULL || pages_reserved==0) {
-    _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages);
-    return ENOMEM;
+
+// release ownership of a page. This may free the page if all blocks were concurrently
+// freed in the meantime. Returns true if the page was freed (and false if it was only unowned).
+static bool mi_abandoned_page_unown(mi_page_t* page, mi_theap_t* current_theap) {
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(_mi_thread_id()==current_theap->tld->thread_id);
+  mi_thread_free_t tf_new;
+  mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free);
+  do {  // retry until the compare-and-swap below succeeds
+    mi_assert_internal(mi_tf_is_owned(tf_old));
+    while mi_unlikely(mi_tf_block(tf_old) != NULL) {  // `xthread_free` still holds a block: collect first
+      _mi_page_free_collect(page, false);  // update used
+      if (mi_page_all_free(page)) {        // it may become free just before unowning it
+        _mi_arenas_page_unabandon(page, current_theap);
+        _mi_arenas_page_free(page, current_theap);
+        return true;
+      }
+      tf_old = mi_atomic_load_relaxed(&page->xthread_free);  // reload and check again
+    }
+    mi_assert_internal(mi_tf_block(tf_old)==NULL);
+    tf_new = mi_tf_create(NULL, false);  // presumably (block=NULL, owned=false), i.e. the unowned state -- confirm against `mi_tf_create`
+  } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new));
+  return false;  // unowned, but not freed
+}
+
+// Try to claim ownership of the abandoned page at `slice_index` in `arena`; returns true if claimed and always sets `*keep_abandoned`.
+static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, bool* keep_abandoned) {
+  // found an abandoned page of the right size
+  mi_page_t* const page  = (mi_page_t*)mi_arena_slice_start(arena, slice_index);
+  // can we claim ownership?
+  if (!mi_page_claim_ownership(page)) {
+    // there was a concurrent free that reclaims this page ..
+    // we need to keep it in the abandoned map as the free will call `mi_arena_page_unabandon`,
+    // and wait for readers (us!) to finish. This is why it is very important to set the abandoned
+    // bit again (or otherwise the unabandon will never stop waiting).
+    *keep_abandoned = true;
+    return false;
   }
-  _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages);
+  else {
+    // yes, we can reclaim it, keep the abandoned map entry clear
+    *keep_abandoned = false;
+    return true;
+  }
+}
 
-  if (!mi_manage_os_memory(p, hsize, true, true, true, numa_node)) {
-    _mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
-    return ENOMEM;
+// allocate initial arena_pages from the main heap (forward declaration; the definition is not in this part of the file)
+static mi_arena_pages_t* mi_arena_pages_alloc(mi_arena_t* arena);
+// Return the page info that `heap` keeps for `arena` (the entry at `arena->arena_idx`); callers in this file treat NULL as "not created yet".
+static mi_arena_pages_t* mi_heap_arena_pages(mi_heap_t* heap, mi_arena_t* arena) {
+  mi_assert_internal(arena!=NULL);
+  mi_assert_internal(heap!=NULL);
+  mi_assert(arena->arena_idx < MI_MAX_ARENAS);
+  return mi_atomic_load_ptr_relaxed(mi_arena_pages_t, &heap->arena_pages[arena->arena_idx]);
+}
+// Return the arena of an owned page (from its memid); if `parena_pages` is non-NULL it is set to the page info of the page's heap for that arena.
+static mi_arena_t* mi_page_arena_pages(mi_page_t* page, size_t* slice_index, size_t* slice_count, mi_arena_pages_t** parena_pages) {
+  // todo: maybe store the arena* directly in the page?
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_arena_t* const arena = mi_arena_from_memid(page->memid, slice_index, slice_count);  // `slice_index`/`slice_count` are passed on as out parameters
+  mi_assert_internal(arena != NULL);
+  if (parena_pages != NULL) {
+    mi_arena_pages_t* const arena_pages = mi_heap_arena_pages(mi_page_heap(page), arena);
+    mi_assert_internal(arena_pages != NULL);
+    mi_assert_internal(slice_index==NULL || mi_bitmap_is_set(arena_pages->pages, *slice_index));
+    *parena_pages = arena_pages;
   }
-  return 0;
+  return arena;
 }
 
+static mi_arena_pages_t* mi_heap_ensure_arena_pages(mi_heap_t* heap, mi_arena_t* arena) {  // get, or create on first use, the page info of `heap` for `arena`
+  mi_assert_internal(arena!=NULL);
+  mi_assert_internal(heap!=NULL);
+  mi_assert(arena->arena_idx < MI_MAX_ARENAS);
+  mi_arena_pages_t* arena_pages = mi_heap_arena_pages(heap, arena);
+  if (arena_pages==NULL) {  // not there yet: create it while holding the lock
+    mi_lock(&heap->arena_pages_lock) {
+      arena_pages = mi_atomic_load_ptr_acquire(mi_arena_pages_t, &heap->arena_pages[arena->arena_idx]);  // re-read under the lock
+      if (arena_pages == NULL) {  // still NULL?
+        if (_mi_is_heap_main(heap)) {
+          // the page info for the main heap is always allocated as part of an arena
+          arena_pages = &arena->pages_main;
+        }
+        else {
+          // always allocate the arena pages info from the main heap
+          // todo: allocate into the current arena?
+          arena_pages = mi_arena_pages_alloc(arena);
+        }
+        mi_atomic_store_ptr_release(mi_arena_pages_t, &heap->arena_pages[arena->arena_idx], arena_pages);
+      }
+    }
+  }
+  if (_mi_is_heap_main(heap)) { mi_assert(arena_pages != NULL); }  // can never fail
+  return arena_pages;  // may be NULL for a non-main heap (presumably when `mi_arena_pages_alloc` fails -- confirm)
+}
 
-// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected)
-int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept {
-  if (pages == 0) return 0;
+static mi_page_t* mi_arenas_page_try_find_abandoned(mi_theap_t* theap, size_t slice_count, size_t block_size)
+{  // search the suitable arena's for an abandoned page in the bin of `block_size` and claim ownership of it; returns NULL if none is found
+  mi_heap_t* const heap = theap->heap;
+  const size_t tseq = theap->tld->thread_seq;
+  mi_arena_t* const req_arena = heap->exclusive_arena;
 
-  // pages per numa node
-  size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
-  if (numa_count <= 0) numa_count = 1;
-  const size_t pages_per = pages / numa_count;
-  const size_t pages_mod = pages % numa_count;
-  const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);
+  MI_UNUSED(slice_count);  // `slice_count` is only used in the assertions below
+  const size_t bin = _mi_bin(block_size);
+  mi_assert_internal(bin < MI_BIN_COUNT);
 
-  // reserve evenly among numa nodes
-  for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
-    size_t node_pages = pages_per;  // can be 0
-    if (numa_node < pages_mod) node_pages++;
-    int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
-    if (err) return err;
-    if (pages < node_pages) {
-      pages = 0;
+  // any abandoned in our size class? (quick check of the per-heap counter before searching the bitmaps)
+  mi_assert_internal(heap != NULL);
+  if (mi_atomic_load_relaxed(&heap->abandoned_count[bin]) == 0) {
+    return NULL;
+  }
+
+  // search arena's
+  const bool allow_large = true;
+  const int  any_numa = -1;
+  const bool match_numa = true;
+  mi_forall_suitable_arenas(heap, req_arena, tseq, match_numa, any_numa, allow_large, arena)
+  {
+    mi_arena_pages_t* const arena_pages = mi_heap_arena_pages(heap, arena);
+    if (arena_pages != NULL) {  // skip arena's for which this heap has no page info
+      size_t slice_index;
+      mi_bitmap_t* const bitmap = arena_pages->pages_abandoned[bin];
+
+      if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena)) {
+        // found an abandoned page of the right size
+        // and claimed ownership.
+        mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index);
+        mi_assert_internal(mi_page_is_owned(page));
+        mi_assert_internal(mi_page_is_abandoned(page));
+        mi_assert_internal(mi_heap_has_page(heap, arena, page));
+        mi_atomic_decrement_relaxed(&heap->abandoned_count[bin]);  // one abandoned page less in this bin
+        mi_theap_stat_decrease(theap, pages_abandoned, 1);
+        mi_theap_stat_counter_increase(theap, pages_reclaim_on_alloc, 1);
+
+        _mi_page_free_collect(page, false);  // update `used` count
+        mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count));
+        mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count));
+        mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count));
+        mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+        mi_assert_internal(_mi_ptr_page(page)==page);
+        mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page);
+        mi_assert_internal(mi_page_block_size(page) == block_size);
+        mi_assert_internal(!mi_page_is_full(page));
+        return page;
+      }
+    }
+  }
+  mi_forall_suitable_arenas_end();
+  return NULL;
+}
+
+// Allocate a fresh page of `slice_count` slices for blocks of `block_size`: first from free arena slices (if allowed), else from the OS.
+// On success the page fields are set, the page is owned, and it is registered in the page map; returns NULL on failure.
+static mi_page_t* mi_arenas_page_alloc_fresh(mi_theap_t* theap, size_t slice_count, size_t block_size, size_t block_alignment, bool commit)
+{
+  const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page
+  const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN);
+  const size_t page_alignment = MI_ARENA_SLICE_ALIGN;
+
+  mi_heap_t*  const heap = theap->heap;
+  mi_tld_t*   const tld  = theap->tld;
+  mi_arena_t* const req_arena = heap->exclusive_arena;
+  const int numa_node = (heap->numa_node >= 0 ? heap->numa_node : tld->numa_node);  // a numa node set on the heap takes precedence over the thread's
+
+
+  // try to allocate from free space in arena's
+  mi_memid_t memid = _mi_memid_none();
+  mi_page_t* page = NULL;
+  const size_t alloc_size = mi_size_of_slices(slice_count);
+  if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) &&       // allowed to allocate from arena's?
+      !os_align &&                                                   // not large alignment
+      slice_count <= mi_arena_max_object_size()/MI_ARENA_SLICE_SIZE) // and not too large
+  {
+    page = (mi_page_t*)mi_arenas_try_alloc(heap, slice_count, page_alignment, commit, allow_large, req_arena, tld->thread_seq, numa_node, &memid);
+    if (page != NULL) {
+      mi_arena_pages_t* const arena_pages = mi_heap_ensure_arena_pages(heap, memid.mem.arena.arena);
+      if (arena_pages==NULL) {
+        _mi_arenas_free(page, mi_size_of_slices(slice_count), memid); // roll back; use the local `memid` as `page->memid` is only assigned further below
+        page = NULL;
+      }
+      else {
+        mi_assert_internal(mi_bitmap_is_clearN(arena_pages->pages, memid.mem.arena.slice_index, memid.mem.arena.slice_count));
+        mi_bitmap_set(arena_pages->pages, memid.mem.arena.slice_index);  // record the page start in the heap's page info for this arena
+      }
+    }
+  }
+
+  // otherwise fall back to the OS
+  if (page == NULL) {
+    if (os_align) {
+      // note: slice_count already includes the page
+      mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment));
+      page = (mi_page_t*)mi_arena_os_alloc_aligned(alloc_size, block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena, &memid);
     }
     else {
-      pages -= node_pages;
+      page = (mi_page_t*)mi_arena_os_alloc_aligned(alloc_size, page_alignment, 0 /* align offset */, commit, allow_large, req_arena, &memid);
     }
   }
 
-  return 0;
-}
+  if (page == NULL) return NULL;
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment));
 
-int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
-  MI_UNUSED(max_secs);
-  _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
-  if (pages_reserved != NULL) *pages_reserved = 0;
-  int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));
-  if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
-  return err;
-}
+  // guard page at the end of mimalloc page?
+  #if MI_SECURE < 2
+  const size_t page_noguard_size = alloc_size;
+  #else
+  mi_assert(alloc_size > _mi_os_secure_guard_page_size());
+  const size_t page_noguard_size = alloc_size - _mi_os_secure_guard_page_size();
+  if (memid.initially_committed) {
+    _mi_os_secure_guard_page_set_at((uint8_t*)page + page_noguard_size, memid);
+  }
+  #endif
+
+  // claimed free slices: initialize the page partly
+  if (!memid.initially_zero && memid.initially_committed) {
+    mi_track_mem_undefined(page, slice_count * MI_ARENA_SLICE_SIZE);
+    _mi_memzero_aligned(page, sizeof(*page));  // only the page header is zeroed here
+  }
+  else if (memid.initially_committed) {
+    mi_track_mem_defined(page, slice_count * MI_ARENA_SLICE_SIZE);
+  }
+  #if MI_DEBUG > 1
+  if (memid.initially_zero && memid.initially_committed) {
+    if (!mi_mem_is_zero(page, page_noguard_size)) {
+      _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n");
+      memid.initially_zero = false;
+      _mi_memzero_aligned(page, sizeof(*page));
+    }
+  }
+  #endif
+  mi_assert(MI_PAGE_INFO_SIZE >= mi_page_info_size());
+
+  size_t block_start;  // offset of the first block from the start of the page
+  #if MI_GUARDED
+  // in a guarded build, we align pages with blocks a multiple of an OS page size, to the OS page size
+  // this ensures that all blocks in such pages are OS page size aligned (which is needed for the guard pages)
+  const size_t os_page_size = _mi_os_page_size();
+  mi_assert_internal(MI_PAGE_ALIGN >= os_page_size);
+  if (!os_align && block_size % os_page_size == 0 && block_size > os_page_size /* at least 2 or more */ ) {
+    block_start = _mi_align_up(mi_page_info_size(), os_page_size);
+  }
+  else
+  #endif
+  if (os_align) {
+    block_start = MI_PAGE_ALIGN;
+  }
+  else if (_mi_is_power_of_two(block_size) && block_size <= MI_PAGE_MAX_START_BLOCK_ALIGN2) {
+    // naturally align power-of-2 blocks up to MI_PAGE_MAX_START_BLOCK_ALIGN2 size (4KiB)
+    block_start = _mi_align_up(mi_page_info_size(), block_size);
+  }
+  else if (block_size != 0 && (block_size % MI_PAGE_OSPAGE_BLOCK_ALIGN2) == 0) {
+    // also align large pages that are a multiple of MI_PAGE_OSPAGE_BLOCK_ALIGN2 (4KiB)
+    block_start = _mi_align_up(mi_page_info_size(), MI_PAGE_OSPAGE_BLOCK_ALIGN2);
+  }
+  else {
+    // otherwise start after the info
+    block_start = mi_page_info_size();
+  }
+  const size_t reserved    = (os_align ? 1 : (page_noguard_size - block_start) / block_size);  // number of blocks that fit (exactly one for an os-aligned page)
+  mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX);
+
+  // commit first block?
+  size_t commit_size = 0;
+  if (!memid.initially_committed) {
+    commit_size = _mi_align_up(block_start + block_size, MI_PAGE_MIN_COMMIT_SIZE);
+    if (commit_size > page_noguard_size) { commit_size = page_noguard_size; }
+    bool is_zero;
+    if mi_unlikely(!mi_arena_commit( mi_memid_arena(memid), page, commit_size, &is_zero, 0)) {
+      _mi_arenas_free(page, alloc_size, memid);  // NOTE(review): a bit set in `arena_pages->pages` above is not cleared on this path (nor on the page-map failure below) -- confirm
+      return NULL;
+    }
+    if (!memid.initially_zero && !is_zero) {
+      _mi_memzero_aligned(page, commit_size);
+    }
+  }
+
+  // initialize
+  page->reserved = (uint16_t)reserved;
+  page->page_start = (uint8_t*)page + block_start;
+  page->block_size = block_size;
+  page->slice_committed = commit_size;
+  page->memid = memid;
+  page->free_is_zero = memid.initially_zero;
+
+  // and own it
+  mi_page_claim_ownership(page);
+
+  // register in the page map
+  if mi_unlikely(!_mi_page_map_register(page)) {
+    _mi_arenas_free( page, alloc_size, memid );
+    return NULL;
+  }
+
+  // stats
+  mi_theap_stat_increase(theap, pages, 1);
+  mi_theap_stat_increase(theap, page_bins[_mi_page_stats_bin(page)], 1);
+
+  mi_assert_internal(_mi_ptr_page(page)==page);
+  mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page);
+  mi_assert_internal(mi_page_block_size(page) == block_size);
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(mi_page_is_owned(page));
+
+  return page;
+}
+
+// Allocate a regular small/medium/large page: reuse an abandoned page of the same bin if one is found, else allocate a fresh page. Returns NULL on failure.
+static mi_page_t* mi_arenas_page_regular_alloc(mi_theap_t* theap, size_t slice_count, size_t block_size)
+{
+  // 1. look for an abandoned page
+  mi_page_t* page = mi_arenas_page_try_find_abandoned(theap, slice_count, block_size);
+  if (page != NULL) {
+    return page;  // return as abandoned
+  }
+
+  // 2. find a free block, potentially allocating a new arena
+  const long commit_on_demand = mi_option_get(mi_option_page_commit_on_demand);
+  const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) ||  // always commit small pages
+                       (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 0));  // else commit up-front only for option value 0, or 2 combined with OS overcommit
+  page = mi_arenas_page_alloc_fresh(theap, slice_count, block_size, 1, commit);  // block alignment 1: no special alignment
+  if (page == NULL) return NULL;
+
+  mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count);
+  if (!_mi_page_init(theap, page)) {
+    _mi_arenas_free( page, mi_page_full_size(page), page->memid);  // initialization failed: release the memory again
+    return NULL;
+  }
+
+  return page;
+}
+
+// Allocate a page containing one block (very large, or with large alignment). Returns NULL on failure.
+static mi_page_t* mi_arenas_page_singleton_alloc(mi_theap_t* theap, size_t block_size, size_t block_alignment)
+{
+  const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN);
+  const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size());  // with os alignment a full MI_PAGE_ALIGN is set aside in front of the block
+  #if MI_SECURE < 2
+  const size_t slice_count = mi_slice_count_of_size(info_size + block_size);
+  #else
+  const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, _mi_os_secure_guard_page_size()) + _mi_os_secure_guard_page_size());  // plus room for a guard page
+  #endif
+
+  mi_page_t* page = mi_arenas_page_alloc_fresh(theap, slice_count, block_size, block_alignment, true /* commit singletons always */);
+  if (page == NULL) return NULL;
+
+  mi_assert(page->reserved == 1);
+  if (!_mi_page_init(theap, page)) {
+    _mi_arenas_free( page, mi_page_full_size(page), page->memid);  // initialization failed: release the memory again
+    return NULL;
+  }
+
+  return page;
+}
+
+// Allocate a page for blocks of `block_size`: over-aligned or very large blocks get a singleton page, others a regular page by size class. Returns NULL on failure.
+mi_page_t* _mi_arenas_page_alloc(mi_theap_t* theap, size_t block_size, size_t block_alignment) {
+  mi_page_t* page;
+  if mi_unlikely(block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
+    mi_assert_internal(_mi_is_power_of_two(block_alignment));
+    page = mi_arenas_page_singleton_alloc(theap, block_size, block_alignment);
+  }
+  else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) {
+    page = mi_arenas_page_regular_alloc(theap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size);
+  }
+  else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) {
+    page = mi_arenas_page_regular_alloc(theap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size);
+  }
+  #if MI_ENABLE_LARGE_PAGES
+  else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) {
+    page = mi_arenas_page_regular_alloc(theap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size);
+  }
+  #endif
+  else {  // larger than the largest regular size class: one block in its own page
+    page = mi_arenas_page_singleton_alloc(theap, block_size, block_alignment);
+  }
+  if mi_unlikely(page == NULL) {
+    return NULL;
+  }
+  // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc);
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(_mi_ptr_page(page)==page);
+  mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page);
+  mi_assert_internal(block_alignment <= MI_PAGE_MAX_OVERALLOC_ALIGN || _mi_is_aligned(mi_page_start(page), block_alignment));
+
+  return page;
+}
+
+void _mi_arenas_page_free(mi_page_t* page, mi_theap_t* current_theapx) {  // free an owned, abandoned page whose blocks are all free; `current_theapx` may be NULL
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(_mi_ptr_page(page)==page);
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_all_free(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(page->next==NULL && page->prev==NULL);
+  mi_assert_internal(current_theapx == NULL || _mi_thread_id()==current_theapx->tld->thread_id);
+
+  if (current_theapx != NULL) {  // update the statistics in the given theap ...
+    mi_theap_stat_decrease(current_theapx, page_bins[_mi_page_stats_bin(page)], 1);
+    mi_theap_stat_decrease(current_theapx, pages, 1);
+  }
+  else {  // ... or, without a theap, in the heap of the page
+    mi_heap_t* const heap = mi_page_heap(page);
+    mi_heap_stat_decrease(heap, page_bins[_mi_page_stats_bin(page)], 1);
+    mi_heap_stat_decrease(heap, pages, 1);
+  }
+
+  #if MI_DEBUG>1
+  if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) {
+    size_t bin = _mi_bin(mi_page_block_size(page));
+    size_t slice_index;
+    size_t slice_count;
+    mi_arena_pages_t* arena_pages = NULL;
+    mi_arena_t* const arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages);
+    mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count));
+    mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count));
+    mi_assert_internal(mi_bitmap_is_clearN(arena_pages->pages_abandoned[bin], slice_index, 1));
+    mi_assert_internal(mi_bitmap_is_setN(arena_pages->pages, slice_index, 1));
+    // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may
+    // be (temporarily) not true if the free happens while trying to reclaim
+    // see `mi_arena_try_claim_abandoned`
+  }
+  #endif
+
+  // recommit guard page at the end?
+  // we must do this since we may later allocate large spans over this page and cannot have a guard page in between
+  #if MI_SECURE >= 2
+  if (!page->memid.is_pinned) {
+    _mi_os_secure_guard_page_reset_before((uint8_t*)page + mi_page_full_size(page), page->memid);
+  }
+  #endif
+
+  // unregister page
+  _mi_page_map_unregister(page);
+  if (page->memid.memkind == MI_MEM_ARENA) {
+    mi_arena_pages_t* arena_pages;
+    size_t slice_index;
+    size_t slice_count; MI_UNUSED(slice_count);
+    mi_arena_t* const arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages);
+    mi_assert_internal(arena_pages!=NULL);
+    mi_bitmap_clear(arena_pages->pages, slice_index);  // the heap no longer has a page starting at this slice
+    if (page->slice_committed > 0) {
+      // if committed on-demand, set the commit bits to account commit properly
+      mi_assert_internal(mi_page_full_size(page) >= page->slice_committed);
+      const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE;  // conservative
+      //mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, slice_index, total_slices));
+      mi_assert_internal(slice_count >= total_slices);
+      if (total_slices > 0) {
+        mi_bitmap_setN(arena->slices_committed, slice_index, total_slices, NULL);
+      }
+      // any left over? (committed bytes beyond the last whole slice)
+      const size_t extra = page->slice_committed % MI_ARENA_SLICE_SIZE;
+      if (extra > 0) {
+        // pretend it was decommitted already
+        mi_subproc_stat_decrease(arena->subproc, committed, extra);
+      }
+    }
+    else {
+      mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count));
+    }
+  }
+  _mi_arenas_free(page, mi_page_full_size(page), page->memid);  // finally release the memory of the page itself
+}
+
+/* -----------------------------------------------------------
+  Arena abandon
+----------------------------------------------------------- */
+
+void _mi_arenas_page_abandon(mi_page_t* page, mi_theap_t* current_theap) {  // abandon an owned page (that is not all-free) and release ownership of it
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(_mi_ptr_page(page)==page);
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(!mi_page_all_free(page));
+  mi_assert_internal(page->next==NULL && page->prev == NULL);
+  mi_assert_internal(_mi_thread_id()==current_theap->tld->thread_id);
+  // mi_assert_internal(current_theap == _mi_page_associated_theap(page));
+
+  mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(heap==current_theap->heap);
+  if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) {
+    // make available for allocations: enter the page in the abandoned bitmap of its bin
+    size_t bin = _mi_bin(mi_page_block_size(page));
+    size_t slice_index;
+    size_t slice_count;
+    mi_arena_pages_t* arena_pages = NULL;
+    mi_arena_t* const arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages); MI_UNUSED(arena);
+
+    mi_assert_internal(!mi_page_is_singleton(page));
+    mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count));
+    mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count));
+    mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count));
+
+    mi_page_set_abandoned_mapped(page);
+    const bool was_clear = mi_bitmap_set(arena_pages->pages_abandoned[bin], slice_index);
+    MI_UNUSED(was_clear); mi_assert_internal(was_clear);
+    mi_atomic_increment_relaxed(&heap->abandoned_count[bin]);  // per-bin counter checked before searching for abandoned pages
+    mi_theap_stat_increase(current_theap, pages_abandoned, 1);
+  }
+  else {
+    // page is full (or a singleton), or the page is OS/externally allocated
+    // leave as is; it will be reclaimed when an object is free'd in the page
+    // but for non-arena pages, add to the subproc list so these can be visited
+    if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) {
+      mi_lock(&heap->os_abandoned_pages_lock) {
+        // push in front
+        page->prev = NULL;
+        page->next = heap->os_abandoned_pages;
+        if (page->next != NULL) { page->next->prev = page; }
+        heap->os_abandoned_pages = page;
+      }
+    }
+    mi_theap_stat_increase(current_theap, pages_abandoned, 1);
+  }
+  mi_abandoned_page_unown(page, current_theap);  // release ownership; its result (whether the page got freed) is not used here
+}
+
+
+// Try to re-abandon an owned, abandoned (but not yet mapped) page so that it ends up in the
+// abandoned bitmap of its arena. Returns `true` if `_mi_arenas_page_abandon` was called for the
+// page, and `false` if the page does not qualify or has no associated theap.
+// This is called from `free.c:mi_free_try_collect_mt` only.
+bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page) {
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(_mi_ptr_page(page)==page);
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(!mi_page_is_abandoned_mapped(page));
+  mi_assert_internal(!mi_page_is_full(page));
+  mi_assert_internal(!mi_page_all_free(page));
+  mi_assert_internal(!mi_page_is_singleton(page));
+
+  // only arena pages that are neither full nor already mapped can be re-abandoned
+  const bool eligible = (!mi_page_is_full(page) && !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA);
+  if (!eligible) return false;
+
+  // only peek at the theap: do not use _mi_heap_theap as this may run during thread shutdown
+  // and we do not want to reinitialize the theap
+  mi_theap_t* const owner_theap = _mi_page_associated_theap_peek(page);
+  if (owner_theap == NULL) return false;
+
+  mi_theap_stat_counter_increase(owner_theap, pages_reabandon_full, 1);
+  mi_theap_stat_adjust_decrease(owner_theap, pages_abandoned, 1);  // adjust as we are not abandoning fresh
+  _mi_arenas_page_abandon(page, owner_theap);
+  return true;
+}
+
+// Undo the abandonment bookkeeping for an owned, abandoned `page`
+// (called from `mi_free` if trying to unabandon an abandoned page).
+// - A mapped page is removed from the `pages_abandoned` bitmap of its bin and the per-bin
+//   `abandoned_count` of its heap is decremented.
+// - Otherwise, a non-arena page is unlinked from the heap's `os_abandoned_pages` list when
+//   `mi_option_visit_abandoned` is enabled.
+// `current_theapx` may be NULL; the `pages_abandoned` statistic is then decreased on the heap
+// instead of on the theap.
+void _mi_arenas_page_unabandon(mi_page_t* page, mi_theap_t* current_theapx) {
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(_mi_ptr_page(page)==page);
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(current_theapx==NULL || _mi_thread_id()==current_theapx->tld->thread_id);
+
+  mi_heap_t* const heap = mi_page_heap(page);
+  if (mi_page_is_abandoned_mapped(page)) {
+    mi_assert_internal(page->memid.memkind==MI_MEM_ARENA);
+    // remove from the abandoned map
+    size_t bin = _mi_bin(mi_page_block_size(page));
+    size_t slice_index;
+    size_t slice_count;
+    mi_arena_pages_t* arena_pages;
+    mi_arena_t* arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages);  MI_UNUSED(arena);
+
+    mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count));
+    mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count));
+
+    // this busy waits until a concurrent reader (from alloc_abandoned) is done
+    mi_bitmap_clear_once_set(arena_pages->pages_abandoned[bin], slice_index);
+    mi_page_clear_abandoned_mapped(page);
+    mi_atomic_decrement_relaxed(&heap->abandoned_count[bin]);
+  }
+  else {
+    // page is full (or a singleton), page is OS allocated
+    // if not an arena page, remove from the heap's `os_abandoned_pages` list
+    if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) {
+      mi_lock(&heap->os_abandoned_pages_lock) {
+        // unlink from the doubly-linked list, fixing up the list head if needed
+        if (page->prev != NULL) { page->prev->next = page->next; }
+        if (page->next != NULL) { page->next->prev = page->prev; }
+        if (heap->os_abandoned_pages == page) { heap->os_abandoned_pages = page->next; }
+        page->next = NULL;
+        page->prev = NULL;
+      }
+    }
+  }
+  // without a current theap, account the statistic on the heap of the page
+  if (current_theapx!=NULL) {
+    mi_theap_stat_decrease(current_theapx, pages_abandoned, 1);
+  }
+  else {
+    mi_heap_stat_decrease(heap, pages_abandoned, 1);
+  }
+}
+
+
+/* -----------------------------------------------------------
+  Arena free
+----------------------------------------------------------- */
+static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices);
+static void mi_arenas_try_purge(bool force, bool visit_all, mi_subproc_t* subproc, size_t tseq);
+
+// Free the memory at `p` of `size` bytes that was allocated with the given `memid`.
+// - OS memory is passed through to `_mi_os_free`.
+// - Arena memory is scheduled for a (delayed) purge unless the arena is pinned, and its slices
+//   are marked as free again in `arena->slices_free`.
+// - Meta memory is passed to `_mi_meta_free`.
+// - Any other memory kind needs no free (asserted through `mi_memid_needs_no_free`).
+// A NULL `p` or a zero `size` is ignored. An invalid arena, an out-of-range block, or a double
+// free is reported through `_mi_error_message` and the call returns without freeing.
+void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) {
+  if (p==NULL) return;
+  if (size==0) return;
+
+  // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.)
+  mi_track_mem_undefined(p, size);
+
+  if (mi_memkind_is_os(memid.memkind)) {
+    // was a direct OS allocation, pass through
+    _mi_os_free(p, size, memid);
+  }
+  else if (memid.memkind == MI_MEM_ARENA) {
+    // allocated in an arena
+    size_t slice_count;
+    size_t slice_index;
+    mi_arena_t* arena = mi_arena_from_memid(memid, &slice_index, &slice_count);
+
+    // validate the arena first: all further checks dereference it
+    // (note: `memid` is a struct and cannot be passed to a `%zx` conversion)
+    if (arena == NULL) {
+      _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu\n", p, size);
+      return;
+    }
+
+    // the block must lie fully inside the arena and beyond the arena meta-info slices
+    mi_assert_internal(slice_index < arena->slice_count);
+    mi_assert_internal(slice_index >= mi_arena_info_slices(arena));
+    if (slice_index < mi_arena_info_slices(arena) || slice_index >= arena->slice_count || slice_count > arena->slice_count - slice_index) {
+      _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, slice index %zu, slice count %zu\n", p, size, slice_index, slice_count);
+      return;
+    }
+    mi_assert_internal((size%MI_ARENA_SLICE_SIZE)==0);
+    mi_assert_internal((slice_count*MI_ARENA_SLICE_SIZE)==size);
+    mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= (uint8_t*)p);
+    mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > (uint8_t*)p);
+
+    // potentially decommit
+    if (!arena->memid.is_pinned /* && !arena->memid.initially_committed */) { // todo: allow decommit even if initially committed?
+      // (delay) purge the page
+      mi_arena_schedule_purge(arena, slice_index, slice_count);
+    }
+
+    // and make it available to others again
+    bool all_inuse = mi_bbitmap_setN(arena->slices_free, slice_index, slice_count);
+    if (!all_inuse) {
+      _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count));
+      return;
+    }
+  }
+  else if (memid.memkind == MI_MEM_META) {
+    _mi_meta_free(p, size, memid);
+  }
+  else {
+    // arena was none, external, or static; nothing to do
+    mi_assert_internal(mi_memid_needs_no_free(memid));
+  }
+
+  // try to purge expired decommits
+  // mi_arenas_try_purge(false, false, NULL);
+}
+
+// Purge the arenas of the sub-process of `tld`; if `force_purge` is true, amenable parts are purged
+// even if not yet expired. `visit_all` and the thread sequence number of `tld` are forwarded
+// unchanged to `mi_arenas_try_purge`.
+void _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld) {
+  mi_arenas_try_purge(force_purge, visit_all, tld->subproc, tld->thread_seq);
+}
+
+
+// Does `p` point into the memory area of exactly this `arena` (sub-arenas are not considered)?
+// A NULL `arena` contains nothing; the range is `[start, start + size of all slices)`.
+static bool mi_arena_strictly_contains(mi_arena_t* arena, const void* p) {
+  if (arena == NULL) return false;
+  const uint8_t* const q        = (const uint8_t*)p;
+  const uint8_t* const first    = mi_arena_start(arena);
+  const uint8_t* const past_end = first + mi_size_of_slices(arena->slice_count);
+  return (first <= q && past_end > q);
+}
+
+// Is `p` inside one of the arenas of the current sub-process?
+// With a non-NULL `parent` only `parent` itself and the arenas whose `parent` field equals it
+// are considered; with a NULL `parent` every arena is considered.
+static bool mi_arenas_contain_ex(const void* p, mi_arena_t* parent) {
+  mi_subproc_t* const sp = _mi_subproc();
+  const size_t n = mi_arenas_get_count(sp);
+  for (size_t idx = 0; idx < n; idx++) {
+    mi_arena_t* const candidate = mi_atomic_load_ptr_acquire(mi_arena_t, &sp->arenas[idx]);
+    if (candidate == NULL) continue;   // empty slot
+    const bool in_scope = (parent==NULL || candidate==parent || candidate->parent==parent);
+    if (in_scope && mi_arena_strictly_contains(candidate, p)) return true;
+  }
+  return false;
+}
+
+// Is a pointer inside any of the arenas of the current sub-process?
+// (no restriction to a particular parent arena)
+bool _mi_arenas_contain(const void* p) {
+  return mi_arenas_contain_ex(p, NULL);
+}
+
+// Is `p` contained in the arena identified by `arena_id`, either directly or in one of the
+// arenas that have it as their parent? Returns false if the id does not resolve to an arena.
+bool mi_arena_contains(mi_arena_id_t arena_id, const void* p) {
+  mi_arena_t* const arena = _mi_arena_from_id(arena_id);
+  if (arena==NULL) return false;
+  // check the arena itself first, and only then its sub-arenas
+  return (mi_arena_strictly_contains(arena, p) || mi_arenas_contain_ex(p, arena));
+}
+
+
+/* -----------------------------------------------------------
+  Remove an arena.
+----------------------------------------------------------- */
+
+// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
+// for dynamic libraries that are unloaded and need to release all their allocated memory.
+// Every arena slot of `subproc` is cleared; the memory of an arena is only returned to the OS
+// when its `memid` is of an OS memory kind. Finally the arena count is reset to 0, but only
+// if it still equals the count that was read at entry.
+static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) {
+  mi_assert_internal(subproc != NULL);
+  const size_t arena_count = mi_arenas_get_count(subproc);
+  for (size_t i = 0; i < arena_count; i++) {
+    mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]);
+    if (arena != NULL) {
+      // mi_lock_done(&arena->abandoned_visit_lock);
+      // clear the slot before freeing: the arena meta-data lives at the start of the freed range
+      mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL);
+      if (mi_memkind_is_os(arena->memid.memkind)) {
+        _mi_os_free_ex(mi_arena_start(arena), mi_arena_size(arena), true, arena->memid, subproc); // pass `subproc` to avoid accessing the theap pointer (in `_mi_subproc()`)
+      }
+    }
+  }
+  // try to lower the max arena.
+  size_t expected = arena_count;
+  mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, 0);
+}
+
+
+// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
+// for dynamic libraries that are unloaded and need to release all their allocated memory.
+// Currently this only forwards to `mi_arenas_unsafe_destroy`; non-owned arenas are not purged
+// (that call is disabled below).
+void _mi_arenas_unsafe_destroy_all(mi_subproc_t* subproc) {
+  mi_arenas_unsafe_destroy(subproc);
+  // mi_arenas_try_purge(true /* force purge */, true /* visit all*/, subproc, 0 /* thread seq */);  // purge non-owned arenas
+}
+
+
+/* -----------------------------------------------------------
+  Add an arena.
+----------------------------------------------------------- */
+
+// Register `arena` in the arena array of `subproc`.
+// First tries to claim an existing NULL slot below the current arena count; otherwise tries to
+// claim a fresh slot by incrementing `subproc->arena_count` (up to `MI_MAX_ARENAS`).
+// On success `arena->arena_idx` holds the slot index, `*arena_id` (if non-NULL) is set to the
+// id of the arena, and true is returned. On failure `*arena_id` is the "none" id, the
+// `arena_idx` and `subproc` fields of the arena are reset, and false is returned.
+// Note: the `arena_count` statistic is only increased when a fresh slot is used.
+static bool mi_arenas_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id)
+{
+  mi_assert_internal(arena != NULL);
+  mi_assert_internal(arena->slice_count > 0);
+  if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); }
+
+  // try to find a NULL entry
+  mi_arena_t* expected;
+  size_t count = mi_arenas_get_count(subproc);
+  for( size_t i = 0; i < count; i++) {
+    if (mi_arena_from_index(subproc,i) == NULL) {
+      // set the index before publishing the arena pointer
+      arena->arena_idx = i;
+      expected = NULL;
+      if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &subproc->arenas[i], &expected, arena)) {
+        // success
+        if (arena_id != NULL) { *arena_id = mi_arena_id_from_arena(arena); }
+        return true;
+      }
+    }
+  }
+
+  // otherwise, try to allocate a fresh slot
+  // (presumably a failed CAS refreshes `count` like a C11 compare-exchange, so the loop retries
+  //  with the latest arena count -- TODO confirm against the atomic wrappers)
+  while(count<MI_MAX_ARENAS) {
+    if (mi_atomic_cas_strong_release(&subproc->arena_count, &count, count+1)) {
+      arena->arena_idx = count;
+      expected = NULL;
+      if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &subproc->arenas[count], &expected, arena)) {
+        mi_subproc_stat_counter_increase(arena->subproc, arena_count, 1);
+        if (arena_id != NULL) { *arena_id = mi_arena_id_from_arena(arena); }
+        return true;
+      }
+    }
+  }
+
+  // failed
+  arena->arena_idx = 0;
+  arena->subproc = NULL;
+  return false;
+}
+
+// Size in bytes of a `mi_arena_pages_t` including its trailing bitmaps for `slice_count` slices
+// (a `slice_count` of 0 is treated as `MI_BCHUNK_BITS`). The bitmaps are: one for the pages plus
+// one abandoned bitmap per bin. If `bitmap_base` is non-NULL it receives the offset of the first
+// bitmap, i.e. the size of the structure aligned up to `MI_BCHUNK_SIZE`.
+static size_t mi_arena_pages_size(size_t slice_count, size_t* bitmap_base) {
+  const size_t slices = (slice_count == 0 ? MI_BCHUNK_BITS : slice_count);
+  mi_assert_internal((slices % MI_BCHUNK_BITS) == 0);
+  const size_t header_size  = _mi_align_up(sizeof(mi_arena_pages_t), MI_BCHUNK_SIZE);
+  const size_t bitmap_count = 1 + MI_ARENA_BIN_COUNT; // pages, and abandoned
+  const size_t total_size   = header_size + (bitmap_count * mi_bitmap_size(slices, NULL));
+  if (bitmap_base != NULL) { *bitmap_base = header_size; }
+  return total_size;
+}
+
+// Number of slices needed at the start of an arena of `slice_count` slices to hold the arena
+// meta-data: the `mi_arena_t` structure (aligned up to `MI_BCHUNK_SIZE`), its bitmaps, and the
+// secure guard page size, rounded up to whole OS pages and then to whole slices.
+// A `slice_count` of 0 is treated as `MI_BCHUNK_BITS`. If `bitmap_base` is non-NULL it receives
+// the offset of the first bitmap relative to the start of the arena.
+static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_base) {
+  if (slice_count == 0) slice_count = MI_BCHUNK_BITS;
+  mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0);
+  const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BCHUNK_SIZE);
+  const size_t bitmaps_count = 4 + MI_ARENA_BIN_COUNT; // commit, dirty, purge, pages, and abandoned
+  const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count, NULL) + mi_bbitmap_size(slice_count, NULL); // + free
+  const size_t size = base_size + bitmaps_size;
+
+  // round up to OS pages and add room for the guard page
+  const size_t os_page_size = _mi_os_page_size();
+  const size_t info_size = _mi_align_up(size, os_page_size) + _mi_os_secure_guard_page_size();
+  const size_t info_slices = mi_slice_count_of_size(info_size);
+
+  if (bitmap_base != NULL) *bitmap_base = base_size;
+  return info_slices;
+}
+
+// Initialize a bitmap for `slice_count` bits in place at `*base` (the memory is taken to be
+// zero already) and advance `*base` by the value returned from `mi_bitmap_init`.
+// Returns the bitmap that now lives at the old `*base`.
+static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) {
+  mi_bitmap_t* const bm = (mi_bitmap_t*)(*base);
+  *base += mi_bitmap_init(bm, slice_count, true /* already zero */);
+  return bm;
+}
+
+// Initialize a binned bitmap for `slice_count` bits in place at `*base` (the memory is taken to
+// be zero already) and advance `*base` by the value returned from `mi_bbitmap_init`.
+// Returns the binned bitmap that now lives at the old `*base`.
+static mi_bbitmap_t* mi_arena_bbitmap_init(size_t slice_count, uint8_t** base) {
+  mi_bbitmap_t* const bbm = (mi_bbitmap_t*)(*base);
+  *base += mi_bbitmap_init(bbm, slice_count, true /* already zero */);
+  return bbm;
+}
+
+// Allocate and initialize a separate `mi_arena_pages_t` (with a pages bitmap and one abandoned
+// bitmap per bin) sized for the slices of `arena`. The memory is zero-allocated from the main
+// heap, aligned to `MI_BCHUNK_SIZE`. Returns NULL if the allocation fails.
+static mi_arena_pages_t* mi_arena_pages_alloc(mi_arena_t* arena) {
+  const size_t slice_count = arena->slice_count;
+  size_t bitmap_base = 0;
+  const size_t size = mi_arena_pages_size(slice_count, &bitmap_base);
+  mi_arena_pages_t* arena_pages = (mi_arena_pages_t*)mi_heap_zalloc_aligned(mi_heap_main(), size, MI_BCHUNK_SIZE);
+  if (arena_pages==NULL) return NULL;
+  // the bitmaps are laid out consecutively after the (aligned) structure itself
+  uint8_t* base = (uint8_t*)arena_pages + bitmap_base;
+  mi_assert_internal(_mi_is_aligned(base, MI_BCHUNK_SIZE));
+  arena_pages->pages = mi_arena_bitmap_init(slice_count, &base);
+  for (size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) {
+    arena_pages->pages_abandoned[i] = mi_arena_bitmap_init(slice_count, &base);
+  }
+  return arena_pages;
+}
+
+// Initialize an arena in place at `start` for `slice_count` slices and add it to `subproc`.
+// The arena meta-data (the `mi_arena_t` and its bitmaps) occupies the first `info_slices`
+// slices of the memory itself.
+//   parent          : the owning arena for a sub-arena, or NULL
+//   total_size      : stored in `arena->total_size`
+//   memid           : describes the memory (initially committed / zero, pinned)
+//   commit_fun(_arg): optional custom commit function used to commit the meta-data
+//   arena_id        : optional out parameter, set by `mi_arenas_add`
+// Returns the arena, or NULL (after a warning where applicable) if the area is too large or too
+// small, the meta-data is too large, the meta-data cannot be committed, or the arena cannot be
+// added to the sub-process.
+static mi_arena_t* mi_arena_initialize(mi_subproc_t* subproc, void* start,
+                                        size_t slice_count, mi_arena_t* parent, size_t total_size,
+                                        int numa_node, bool exclusive,
+                                        mi_memid_t memid, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id)
+{
+  mi_assert_internal(_mi_is_aligned(start,MI_ARENA_SLICE_ALIGN));
+  mi_assert_internal(mi_size_of_slices(slice_count)>=MI_ARENA_MIN_SIZE);
+
+  if (slice_count > MI_BITMAP_MAX_BIT_COUNT) {  // 16 GiB for now
+    // note: this should never happen if called from `mi_manage_os_memory` (as that allocates sub-arenas when needed)
+    _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", mi_size_of_slices(slice_count)/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB);
+    return NULL;
+  }
+
+  // the area needs at least one usable slice beyond the meta-data
+  size_t bitmap_base;
+  const size_t info_slices = mi_arena_info_slices_needed(slice_count, &bitmap_base);
+  if (slice_count < info_slices+1) {
+    _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", mi_size_of_slices(slice_count)/MI_KiB, mi_size_of_slices(info_slices+1)/MI_KiB);
+    return NULL;
+  }
+  else if (info_slices >= MI_ARENA_MAX_CHUNK_OBJ_SLICES) {
+    _mi_warning_message("cannot use OS memory since it is too large with respect to the maximum object size (size %zu MiB, meta-info slices %zu, maximum object slices are %zu)", mi_size_of_slices(slice_count)/MI_MiB, info_slices, MI_ARENA_MAX_CHUNK_OBJ_SLICES);
+    return NULL;
+  }
+
+  mi_arena_t* arena = (mi_arena_t*)start;
+
+  // commit & zero if needed
+  if (!memid.initially_committed) {
+    size_t commit_size = mi_size_of_slices(info_slices);
+    // leave a guard OS page decommitted at the end?
+    if (!memid.is_pinned) { commit_size -= _mi_os_secure_guard_page_size(); }
+    bool ok = false;
+    if (commit_fun != NULL) {
+      ok = (*commit_fun)(true /* commit */, arena, commit_size, NULL, commit_fun_arg);
+    }
+    else {
+      ok = _mi_os_commit(arena, commit_size, NULL);
+    }
+    if (!ok) {
+      _mi_warning_message("unable to commit meta-data for OS memory");
+      return NULL;
+    }
+  }
+  else if (!memid.is_pinned) {
+    // if MI_SECURE, set a guard page at the end
+    // todo: this does not respect the commit_fun as the memid is of external memory
+    _mi_os_secure_guard_page_set_before((uint8_t*)arena + mi_size_of_slices(info_slices), memid);
+  }
+  if (!memid.initially_zero) {
+    // zero the meta-data area (excluding the guard page size at its end)
+    _mi_memzero(arena, mi_size_of_slices(info_slices) - _mi_os_secure_guard_page_size());
+  }
+
+  // init
+  arena->subproc = subproc;
+  arena->memid = memid;
+  arena->is_exclusive = exclusive;
+  arena->slice_count = slice_count;
+  arena->info_slices = info_slices;
+  arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
+  arena->purge_expire = 0;
+  arena->commit_fun = commit_fun;
+  arena->commit_fun_arg = commit_fun_arg;
+  arena->parent = parent;
+  arena->total_size = total_size;
+
+  // init bitmaps (laid out consecutively starting at `bitmap_base` inside the meta-data area)
+  uint8_t* base = mi_arena_start(arena) + bitmap_base;
+  arena->slices_free = mi_arena_bbitmap_init(slice_count, &base);
+  arena->slices_committed = mi_arena_bitmap_init(slice_count, &base);
+  arena->slices_dirty = mi_arena_bitmap_init(slice_count, &base);
+  arena->slices_purge = mi_arena_bitmap_init(slice_count, &base);
+  arena->pages_main.pages = mi_arena_bitmap_init(slice_count, &base);
+  for (size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) {
+    arena->pages_main.pages_abandoned[i] = mi_arena_bitmap_init(slice_count, &base);
+  }
+  mi_assert_internal(mi_size_of_slices(info_slices) >= (size_t)(base - mi_arena_start(arena)));
+
+  // reserve our meta info (and reserve slices outside the memory area)
+  // i.e. only the slices after the meta-info are marked as free
+  mi_bbitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices);
+  if (memid.initially_committed) {
+    mi_bitmap_unsafe_setN(arena->slices_committed, 0, arena->slice_count);
+  }
+  if (!memid.initially_zero) {
+    mi_bitmap_unsafe_setN(arena->slices_dirty, 0, arena->slice_count);
+  }
+  
+  // publish the arena last, once it is fully initialized
+  if (!mi_arenas_add(subproc, arena, arena_id)) { return NULL;  }
+  return arena;
+}
+
+// Let `subproc` manage the memory range `[start, start+size)` as one or more arenas.
+// The start is aligned up to `MI_ARENA_SLICE_SIZE` if needed and the usable size is rounded down
+// to a multiple of `MI_BCHUNK_BITS` slices. A range with more slices than a single bitmap can
+// hold (`MI_BITMAP_MAX_BIT_COUNT`) is split: the first arena is the owner (it receives `memid`,
+// the total size and `*arena_id`), each following sub-arena points to it through `parent` and
+// gets memory kind `MI_MEM_NONE`.
+// Returns false if `start` is NULL, the range is too small, or the first arena cannot be
+// initialized. If only a later sub-arena fails, the total size of the parent is reduced by the
+// unused remainder and true is returned (partial success).
+static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, int numa_node, bool exclusive,
+  mi_memid_t memid, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id) mi_attr_noexcept
+{
+  // checks
+  mi_assert(_mi_is_aligned(start, MI_ARENA_SLICE_SIZE));
+  mi_assert(start!=NULL);
+  if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); }
+  if (start==NULL) return false;
+  if (!_mi_is_aligned(start, MI_ARENA_SLICE_SIZE)) {
+    // we can align the start since the memid tracks the real base of the memory.
+    void* const aligned_start = _mi_align_up_ptr(start, MI_ARENA_SLICE_SIZE);
+    const size_t diff = (uint8_t*)aligned_start - (uint8_t*)start;
+    if (diff >= size || (size - diff) < MI_ARENA_SLICE_SIZE) {
+      _mi_warning_message("after alignment, the size of the arena becomes too small (memory at %p with size %zu)\n", start, size);
+      return false;
+    }
+    start = aligned_start;
+    size = size - diff;
+  }
+
+  // allocate enough arena's to span the full memory area
+  // the first arena is the owner, the rest are "sub-arena" (with `parent` pointing to the first one)
+  size_t total_slice_count = _mi_align_down(size / MI_ARENA_SLICE_SIZE, MI_BCHUNK_BITS);
+  size_t total_size = mi_size_of_slices(total_slice_count);
+  if (total_size < MI_ARENA_MIN_SIZE) {
+    _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, MI_ARENA_MIN_SIZE/MI_KiB);
+    return false;
+  }
+
+  mi_arena_t* parent = NULL;
+  do {
+    // counting down on the total_slice_count
+    size_t slice_count = total_slice_count;
+    if (slice_count > MI_BITMAP_MAX_BIT_COUNT) {  // 16 GiB for now (with 64KiB slices)
+      slice_count = MI_BITMAP_MAX_BIT_COUNT;
+    }
+    
+    // initialize
+    // (only the first arena records the total size and reports its id to the caller)
+    mi_arena_t* arena = mi_arena_initialize( subproc, start, slice_count, parent,
+                                              (parent==NULL ? total_size : 0), numa_node, exclusive,
+                                              memid, commit_fun, commit_fun_arg,
+                                              (parent==NULL ? arena_id : NULL));
+    if (arena==NULL) {
+      // failed to initialize due to failing commit or too many arena's
+      if (parent==NULL) {
+        return false;
+      }
+      else {
+        // partial success, but failed to use the full area..
+        // todo: roll-back in this case? that requires a lock on the arena's array though
+        mi_assert(mi_size_of_slices(total_slice_count) <= parent->total_size);
+        parent->total_size -= mi_size_of_slices(total_slice_count);
+        return true;
+      }
+    }
+
+    // success
+    if (parent==NULL) { 
+      parent = arena; 
+      // all following sub-arenas get memory kind `MI_MEM_NONE`; only the first arena keeps the original kind
+      memid.memkind = MI_MEM_NONE;
+    }
+    mi_assert(slice_count <= total_slice_count);
+    total_slice_count -= slice_count;
+    start = (uint8_t*)start + mi_size_of_slices(slice_count);
+  } 
+  while (total_slice_count > 0);
+
+  return true;
+}
+ 
+// Let the current sub-process manage externally provided memory `[start, start+size)` as an arena.
+// The memory is tagged as `MI_MEM_EXTERNAL` with the given committed / pinned / zero state and no
+// custom commit function is used. `*arena_id` (if non-NULL) receives the id of the new arena.
+// Returns true on success.
+// Note the flag order here is (is_committed, is_pinned, is_zero), which differs from `mi_manage_memory`.
+bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
+  mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL);
+  memid.mem.os.base = start;
+  memid.mem.os.size = size;
+  memid.initially_committed = is_committed;
+  memid.initially_zero = is_zero;
+  memid.is_pinned = is_pinned;
+  return mi_manage_os_memory_ex2(_mi_subproc(), start, size, numa_node, exclusive, memid, NULL, NULL, arena_id);
+}
+
+// Like `mi_manage_os_memory_ex`, but additionally takes a custom commit function `commit_fun`
+// (with its argument `commit_fun_arg`) that is passed on to the arena.
+// Note the flag order here is (is_committed, is_zero, is_pinned), which differs from `mi_manage_os_memory_ex`.
+// Returns true on success; `*arena_id` (if non-NULL) receives the id of the new arena.
+bool mi_manage_memory(void* start, size_t size, bool is_committed, bool is_zero, bool is_pinned, int numa_node, bool exclusive, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id) mi_attr_noexcept
+{
+  mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL);
+  memid.mem.os.base = start;
+  memid.mem.os.size = size;
+  memid.initially_committed = is_committed;
+  memid.initially_zero = is_zero;
+  memid.is_pinned = is_pinned;
+  return mi_manage_os_memory_ex2(_mi_subproc(), start, size, numa_node, exclusive, memid, commit_fun, commit_fun_arg, arena_id);
+}
+
+
+// Reserve a range of regular OS memory of (at least) `size` bytes and manage it as an arena of `subproc`.
+// Returns 0 on success, or ENOMEM if the OS allocation fails or the memory cannot be managed
+// (in which case the memory is freed again). `*arena_id` (if non-NULL) receives the arena id.
+static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) {
+  if (arena_id != NULL) *arena_id = _mi_arena_id_none();
+  size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // round up to a whole number of slices
+  mi_memid_t memid;
+  void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid);
+  if (start == NULL) return ENOMEM;
+  if (!mi_manage_os_memory_ex2(subproc, start, size, -1 /* numa node */, exclusive, memid, NULL, NULL, arena_id)) {
+    // could not turn the memory into an arena: give it back to the OS
+    _mi_os_free_ex(start, size, commit, memid, NULL);
+    _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024));
+    return ENOMEM;
+  }
+  _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), memid.is_pinned ? " (in large os pages)" : "");
+  // mi_debug_show_arenas(true, true, false);
+
+  return 0;
+}
+
+// Reserve a range of regular OS memory as an arena of the current sub-process.
+// Returns 0 on success or ENOMEM on failure; `*arena_id` (if non-NULL) receives the arena id.
+int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
+  return mi_reserve_os_memory_ex2(_mi_subproc(), size, commit, allow_large, exclusive, arena_id);
+}
+
+// Manage a range of regular OS memory as a non-exclusive arena, without reporting an arena id.
+// `is_large` is passed on as the `is_pinned` flag of `mi_manage_os_memory_ex`.
+bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept {
+  return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL);
+}
+
+// Reserve a range of regular OS memory as a non-exclusive arena, without reporting an arena id.
+// Returns 0 on success or ENOMEM on failure.
+int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept {
+  return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL);
+}
+
+
+/* -----------------------------------------------------------
+  Debugging
+----------------------------------------------------------- */
+
+// Return the index of the slice just past the last used slice of `arena`.
+// If `mi_bbitmap_bsr_inv` finds no such slice in `slices_free`, the number of meta-info
+// slices of the arena is returned instead.
+static size_t mi_arena_used_slices(mi_arena_t* arena) {
+  size_t last_used;
+  const bool found = mi_bbitmap_bsr_inv(arena->slices_free, &last_used);
+  return (found ? (last_used + 1) : mi_arena_info_slices(arena));
+}
+
+// Render the bits of `field` into `buf` starting at index `*k`, lowest bit first:
+// an 'x' for a set bit and a '.' for a clear bit. Advances `*k` by `MI_BFIELD_BITS`
+// and returns the number of set bits.
+static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf, size_t* k) {
+  size_t ones = 0;
+  size_t pos  = *k;
+  for (int bit = 0; bit < MI_BFIELD_BITS; bit++) {
+    const bool on = ((field & ((mi_bfield_t)1 << bit)) != 0);
+    buf[pos++] = (on ? 'x' : '.');
+    if (on) { ones++; }
+  }
+  *k = pos;
+  return ones;
+}
+
+// ANSI terminal color codes; `mi_debug_color` emits a value as the escape sequence `ESC [ <value> m`.
+// The first group counts up from 30 and the second group from 90.
+typedef enum mi_ansi_color_e {
+  MI_BLACK = 30,
+  MI_MAROON,
+  MI_DARKGREEN,
+  MI_ORANGE,
+  MI_NAVY,
+  MI_PURPLE,
+  MI_TEAL,
+  MI_GRAY,
+  MI_DARKGRAY = 90,
+  MI_RED,
+  MI_GREEN,
+  MI_YELLOW,
+  MI_BLUE,
+  MI_MAGENTA,
+  MI_CYAN,
+  MI_WHITE
+} mi_ansi_color_t;
+
+// Append the ANSI escape sequence that selects `color` to `buf` at index `*k`, and advance `*k`
+// by the value returned from `_mi_snprintf`.
+// NOTE(review): the write is limited to 32 bytes, not to the remaining space in `buf`;
+// callers must size `buf` accordingly.
+static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) {
+  *k += _mi_snprintf(buf + *k, 32, "\x1B[%dm", (int)color);
+}
+
+// Return how much of the committed memory of `page` is in use, as a percentage:
+// (used blocks * block size) * 100 / committed size.
+// NOTE(review): assumes `mi_page_committed(page)` is never 0 (otherwise this divides by zero) -- confirm.
+static int mi_page_commit_usage(mi_page_t* page) {
+  // if (mi_page_size(page) <= MI_PAGE_MIN_COMMIT_SIZE) return 100;
+  const size_t committed_size = mi_page_committed(page);
+  const size_t used_size = page->used * mi_page_block_size(page);
+  return (int)(used_size * 100 / committed_size);
+}
+
+// Render `MI_BFIELD_BITS` consecutive slices of `arena`, starting at `slice_index`, into `buf`
+// at index `*k` (one character per slice, with ANSI color changes in between):
+//   p/s/f : start of an abandoned page (regular / singleton / full); upper case if not abandoned
+//   -     : continuation of the preceding page; '>' if a page continues past this field
+//   m, i  : meta-data page, arena-info slice
+//   ~ _ . : free slice that is scheduled for purge / committed / neither
+//   ?     : none of the above
+// Page starts are colored by their commit usage (<25%, <50%, <75%, otherwise).
+// `*pbit_of_page` (remaining slices of the current page) and `*pcolor_of_page` carry state
+// between successive calls. Returns the number of page starts seen.
+static size_t mi_debug_show_page_bfield(char* buf, size_t* k, mi_arena_t* arena, size_t slice_index, long* pbit_of_page, mi_ansi_color_t* pcolor_of_page ) {
+  size_t bit_set_count = 0;
+  long bit_of_page = *pbit_of_page;
+  mi_ansi_color_t color = *pcolor_of_page;
+  mi_ansi_color_t prev_color = MI_GRAY;
+  for (int bit = 0; bit < MI_BFIELD_BITS; bit++, bit_of_page--) {
+    // bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0);
+    void* start = mi_arena_slice_start(arena, slice_index + bit);
+    mi_page_t* page = _mi_safe_ptr_page(start);
+    char c = ' ';
+    if (start==page) {
+      // this slice is the start of a page
+      mi_assert_internal(bit_of_page <= 0);
+      bit_set_count++;
+      c = 'p';
+      color = MI_GRAY;
+      if (mi_page_is_singleton(page)) { c = 's'; }
+      else if (mi_page_is_full(page)) { c = 'f'; }
+      if (!mi_page_is_abandoned(page)) { c = _mi_toupper(c); }
+      int commit_usage = mi_page_commit_usage(page);
+      if (commit_usage < 25) { color = MI_MAROON; }
+      else if (commit_usage < 50) { color = MI_ORANGE; }
+      else if (commit_usage < 75) { color = MI_TEAL; }
+      else color = MI_DARKGREEN;
+      // count down the slices that belong to this page
+      bit_of_page = (long)page->memid.mem.arena.slice_count;
+    }
+    else {
+      c = '?';
+      if (bit_of_page > 0) { c = '-'; }
+      else if (_mi_meta_is_meta_page(start)) { c = 'm'; color = MI_GRAY; }
+      else if (slice_index + bit < arena->info_slices) { c = 'i'; color = MI_GRAY; }
+      // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; }
+      else if (mi_bbitmap_is_setN(arena->slices_free, slice_index+bit,1)) {
+        if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; color = MI_ORANGE; }
+        else if (mi_bitmap_is_set(arena->slices_committed, slice_index + bit)) { c = '_'; color = MI_GRAY; }
+        else { c = '.'; color = MI_GRAY; }
+      }
+      if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; }
+    }
+    // only emit an escape sequence when the color actually changes
+    if (color != prev_color) {
+      mi_debug_color(buf, k, color);
+      prev_color = color;
+    }
+    buf[*k] = c; *k += 1;
+  }
+  mi_debug_color(buf, k, MI_GRAY);
+  *pbit_of_page = bit_of_page;
+  *pcolor_of_page = color;
+  return bit_set_count;
+}
+
+// Print a bitmap-like overview, one chunk at a time, through `_mi_raw_message`.
+// Each chunk is prefixed by its index (for indices below 1000) and, if `chunk_bins` is given,
+// by a letter for its bin. With a non-NULL `chunks` the raw bit fields are shown (optionally
+// inverted); with a NULL `chunks` the page view of `arena` is shown instead
+// (see `mi_debug_show_page_bfield`). `narrow` selects 2 instead of 4 fields per output line.
+// Once past the used slices of the arena, the remaining chunks are skipped up to the last one.
+// Returns the total count that is also printed as "total pages".
+static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3,
+                                   size_t slice_count, size_t chunk_count,
+                                   mi_bchunk_t* chunks, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow)
+{
+  _mi_raw_message("\x1B[37m%s%s%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header1, header2, header3);
+  const size_t fields_per_line = (narrow ? 2 : 4);
+  const size_t used_slice_count = mi_arena_used_slices(arena);
+  size_t bit_count = 0;
+  size_t bit_set_count = 0;
+  long bit_of_page = 0;
+  mi_ansi_color_t color_of_page = MI_GRAY;
+  for (size_t i = 0; i < chunk_count && bit_count < slice_count; i++) {
+    char buf[5*MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf));
+    // skip the unused tail: jump to the last chunk and account for the skipped bits
+    if (bit_count > used_slice_count && i+2 < chunk_count) {
+      const size_t diff = chunk_count - 1 - i;
+      bit_count += diff*MI_BCHUNK_BITS;
+      _mi_raw_message("  |\n");
+      i = chunk_count-1;
+    }
+
+    size_t k = 0;
+
+    // chunk index as a left-aligned, 3 character wide label (no label for indices >= 1000)
+    if (i<10)        { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; }
+    else if (i<100)  { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; }
+    else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); }
+
+    char chunk_kind = ' ';
+    if (chunk_bins != NULL) {
+      switch (mi_bbitmap_debug_get_bin(chunk_bins,i)) {
+        case MI_CBIN_SMALL:  chunk_kind = 'S'; break;
+        case MI_CBIN_MEDIUM: chunk_kind = 'M'; break;
+        case MI_CBIN_LARGE:  chunk_kind = 'L'; break;
+        case MI_CBIN_HUGE:   chunk_kind = 'H'; break;
+        case MI_CBIN_OTHER:  chunk_kind = 'X'; break;
+        default: chunk_kind = ' '; break; // suppress warning
+        // case MI_CBIN_NONE: chunk_kind = 'N'; break;
+      }
+    }
+    buf[k++] = chunk_kind;
+    buf[k++] = ' ';
+
+    for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
+      if (j > 0 && (j % fields_per_line) == 0) {
+        // flush the current line and indent the continuation line
+        // buf[k++] = '\n'; _mi_memset(buf+k,' ',7); k += 7;
+        _mi_raw_message("  %s\n\x1B[37m", buf);
+        _mi_memzero(buf, sizeof(buf));
+        _mi_memset(buf, ' ', 5); k = 5;
+      }
+      if (bit_count < slice_count) {
+        mi_bfield_t bfield = 0;
+        if (chunks!=NULL) {
+          bfield = chunks[i].bfields[j];
+        }
+        if (invert) bfield = ~bfield;
+        size_t xcount = (chunks==NULL ? mi_debug_show_page_bfield(buf, &k, arena, bit_count, &bit_of_page, &color_of_page)
+                                      : mi_debug_show_bfield(bfield, buf, &k));
+        if (invert) xcount = MI_BFIELD_BITS - xcount;
+        bit_set_count += xcount;
+        buf[k++] = ' ';
+      }
+      else {
+        // beyond the last slice: fill with 'o'
+        _mi_memset(buf + k, 'o', MI_BFIELD_BITS);
+        k += MI_BFIELD_BITS;
+      }
+      bit_count += MI_BFIELD_BITS;
+    }
+    _mi_raw_message("  %s\n\x1B[37m", buf);
+  }
+  _mi_raw_message("\x1B[0m  total pages: %zu\n", bit_set_count);
+  return bit_set_count;
+}
+
+//static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count,
+//                                           mi_bitmap_t* bitmap, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) {
+//  return mi_debug_show_chunks(header1, header2, header3, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena, narrow);
+//}
+
+// Print an overview of all arenas in the sub-process of `heap` through `_mi_raw_message`:
+// one summary line per arena and, if `show_pages` is true, a per-chunk page map of the arena
+// followed by the total page count over all arenas. `narrow` selects the narrow page map layout.
+static void mi_debug_show_arenas_ex(mi_heap_t* heap, bool show_pages, bool narrow) mi_attr_noexcept {
+  mi_subproc_t* subproc = heap->subproc;
+  size_t max_arenas = mi_arenas_get_count(subproc);
+  size_t page_total = 0;
+  for (size_t i = 0; i < max_arenas; i++) {
+    mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]);
+    // skip empty slots instead of stopping: the arena array can contain NULL entries
+    // below the arena count (`mi_arenas_add` looks for such entries to reuse them)
+    if (arena == NULL) continue;
+    mi_assert(arena->subproc == subproc);
+    _mi_raw_message("%sarena %zu at %p: %zu slices (%zu MiB)%s%s, subproc: %p, numa: %i\n", 
+        (arena->parent==NULL ? "" : "(sub)"), i, arena, arena->slice_count, (size_t)(mi_size_of_slices(arena->slice_count)/MI_MiB), 
+        (arena->memid.is_pinned ? ", pinned" : ""), (arena->is_exclusive ? ", exclusive" : ""), 
+        arena->subproc, arena->numa_node);
+    if (show_pages) {
+      // the legend lists every chunk bin letter that `mi_debug_show_chunks` can print (including H)
+      const char* header1 = "chunks (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:meta-data, ~:free-purgable, _:free-committed, .:free-reserved)";
+      const char* header2 = (narrow ? "\n       " : " ");
+      const char* header3 = "(chunk bin: S:small, M : medium, L : large, H : huge, X : other)";
+      page_total += mi_debug_show_chunks(header1, header2, header3, arena->slice_count,
+                                         mi_bbitmap_chunk_count(arena->slices_free), NULL,
+                                         arena->slices_free->chunkmap_bins, false, arena, narrow);
+    }
+  }
+  if (show_pages) _mi_raw_message("total pages in arenas: %zu\n", page_total);
+}
+
+void mi_debug_show_arenas(void) mi_attr_noexcept {  // print the arenas of the main heap, including their page maps, in the narrow layout
+  mi_heap_t* const main_heap = mi_heap_main();  mi_debug_show_arenas_ex(main_heap, /* show_pages */ true, /* narrow */ true);
+}
+
+void mi_arenas_print(void) mi_attr_noexcept {
+  mi_debug_show_arenas();  // same output as `mi_debug_show_arenas`, offered under a second name
+}
+
+
+/* -----------------------------------------------------------
+  Reserve a huge page arena.
+----------------------------------------------------------- */
+// Reserve `pages` huge OS pages at a specific numa node (values below -1 are normalized to -1) and hand the memory to
+// `mi_manage_os_memory_ex2`, which reports the arena in `*arena_id` (if not NULL). Returns 0 on success (also for `pages==0`), else `ENOMEM`.
+int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
+  if (arena_id != NULL) *arena_id = NULL;
+  if (pages==0) return 0;
+  if (numa_node < -1) numa_node = -1;
+  if (numa_node >= 0) { const int numa_count = _mi_os_numa_node_count(); numa_node = (numa_count > 0 ? numa_node % numa_count : 0); }  // wrap around, but never divide by zero
+  size_t hsize = 0;
+  size_t pages_reserved = 0;
+  mi_memid_t memid;
+  void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid);
+  if (p==NULL || pages_reserved==0) {
+    _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages);
+    return ENOMEM;
+  }
+  _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages);
+  if (!mi_manage_os_memory_ex2(_mi_subproc(), p, hsize, numa_node, exclusive, memid, NULL, NULL, arena_id)) {
+    _mi_os_free(p, hsize, memid);  // could not be managed as an arena: give the memory back
+    return ENOMEM;
+  }
+  return 0;
+}
+
+int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {  // non-exclusive variant that does not report the arena id
+  const bool exclusive = false;  return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, exclusive, NULL);
+}
+
+// Reserve `pages` huge OS pages spread evenly over `numa_nodes` numa nodes; the detected node count is
+// used instead when `numa_nodes` is 0 or larger than `INT_MAX`. Each node gets a share of the timeout.
+// Returns 0 on success (also for `pages==0`), or the first error returned by a per-node reservation.
+int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept {
+  if (pages == 0) return 0;
+
+  // determine the node count and the share of each node
+  int node_count;
+  if (numa_nodes > 0 && numa_nodes <= INT_MAX) { node_count = (int)numa_nodes; }
+  else { node_count = _mi_os_numa_node_count(); }
+  if (node_count <= 0) { node_count = 1; }
+  const size_t base_share   = pages / node_count;
+  const size_t extra        = pages % node_count;   // the first `extra` nodes reserve one page more
+  const size_t node_timeout = (timeout_msecs==0 ? 0 : (timeout_msecs / node_count) + 50);
+
+  // reserve node by node until nothing remains
+  size_t remaining = pages;
+  for (int node = 0; node < node_count && remaining > 0; node++) {
+    size_t share = base_share;  // can be 0
+    if ((size_t)node < extra) { share++; }
+    const int err = mi_reserve_huge_os_pages_at(share, node, node_timeout);
+    if (err) return err;
+    remaining = (remaining < share ? 0 : remaining - share);
+  }
+
+  return 0;
+}
+
+int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
+  // Deprecated: warns and forwards to `mi_reserve_huge_os_pages_interleave`; `*pages_reserved` (if not NULL) becomes `pages` on success and 0 otherwise.
+  MI_UNUSED(max_secs);
+  _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
+  const int err = mi_reserve_huge_os_pages_interleave(pages, 0 /* use the detected numa nodes */, (size_t)(max_secs * 1000.0));
+  if (pages_reserved != NULL) { *pages_reserved = (err==0 ? pages : 0); }
+  return err;
+}
+
+
+
+
+
+/* -----------------------------------------------------------
+  Arena purge
+----------------------------------------------------------- */
+
+static long mi_arena_purge_delay(void) {
+  // product of the `purge_delay` and `arena_purge_mult` options: <0 = no purging allowed, 0 = immediate purging, >0 = delay in milli-seconds
+  return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult));
+}
+
+// Purge (reset or decommit) `slice_count` slices of `arena` starting at `slice_index` and update the commit bitmap.
+// Assumes we own the range (i.e. the slices are claimed by us in `slices_free`).
+// Returns `true` if the memory is no longer committed (versus a reset which keeps the commit).
+static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) {
+  mi_assert_internal(!arena->memid.is_pinned);
+  mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count));
+
+  const size_t byte_size = mi_size_of_slices(slice_count);
+  void* const start = mi_arena_slice_start(arena, slice_index);
+
+  // pretend that the whole range is committed in order to learn how many slices already were
+  // (as we lack a clearN call that counts the already set bits..)
+  size_t committed_before;
+  mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &committed_before);
+  const bool fully_committed = (committed_before == slice_count);
+
+  const bool decommitted = _mi_os_purge_ex(start, byte_size, fully_committed /* allow reset? */,
+                                           mi_size_of_slices(committed_before), arena->commit_fun, arena->commit_fun_arg);
+
+  // The range may only stay marked as committed when it was fully committed and merely reset:
+  //  - after a decommit nothing in the range is committed any longer;
+  //  - with a partial commit, the `setN` above may have marked uncommitted slices as committed.
+  if (decommitted || !fully_committed) {
+    mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count);
+    // note: the `committed` statistic is not adjusted here
+    // (the former `mi_subproc_stat_decrease` calls are disabled)
+  }
+
+  return decommitted;
+}
+
+
+// Schedule a purge of the given slice range. This is usually delayed to avoid repeated decommit/commit calls.
+// Note: assumes we (still) own the range as we may purge immediately.
+static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) {
+  const long delay = mi_arena_purge_delay();
+  const bool can_purge = (!arena->memid.is_pinned && delay >= 0 && !_mi_preloading());
+  if (!can_purge) return;  // pinned memory, purging disabled, or still preloading
+
+  mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count));
+  if (delay == 0) {
+    // no delay: purge right away
+    mi_arena_purge(arena, slice_index, slice_count);
+    return;
+  }
+
+  // delayed purge: set the expiration time of the arena unless one was set already
+  const mi_msecs_t expire = _mi_clock_now() + delay;
+  mi_msecs_t expected = 0;
+  const bool was_unset = mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expected, expire);
+  if (was_unset) {
+    // maybe set the expiration of the sub-process as well (if it wasn't set already)
+    mi_assert_internal(expected==0);
+    mi_atomic_casi64_strong_acq_rel(&arena->subproc->purge_expire, &expected, expire);
+  }
+
+  // and remember the range in the purge bitmap
+  mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL);
+}
+
+typedef struct mi_purge_visit_info_s {  // state handed to `mi_arena_try_purge_visitor` through its `arg` parameter
+  mi_msecs_t now;    // the `now` time stamp that `mi_arena_try_purge` received
+  mi_msecs_t delay;  // value of `mi_arena_purge_delay()` when the visit was set up
+  bool all_purged;   // initialized to `true`; updated by the visitor
+  bool any_purged;   // initialized to `false`; set by the visitor once a range was purged
+} mi_purge_visit_info_t;
+
+static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size_t slice_count) {
+  // Try to claim the free slice range, purge it, and mark it as free again.
+  // Returns `false` if the range could not be claimed (it was allocated again already).
+  mi_assert(slice_count < MI_BCHUNK_BITS);
+  if (!mi_bbitmap_try_clearNC(arena->slices_free, slice_index, slice_count)) {
+    return false;
+  }
+  // we own the range now: purge it
+  const bool no_longer_committed = mi_arena_purge(arena, slice_index, slice_count);
+  MI_UNUSED(no_longer_committed);
+  mi_assert_internal(!no_longer_committed || mi_bitmap_is_clearN(arena->slices_committed, slice_index, slice_count));
+  // and reset the free range
+  mi_bbitmap_setN(arena->slices_free, slice_index, slice_count);
+  return true;
+}
+
+static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) {
+  // Range visitor: try to purge the range as a whole (else slice by slice) and record the outcome in the `mi_purge_visit_info_t` passed as `arg`.
+  mi_purge_visit_info_t* vinfo = (mi_purge_visit_info_t*)arg;
+  if (mi_arena_try_purge_range(arena, slice_index, slice_count)) {
+    vinfo->any_purged = true;  // leave `all_purged` as is: it must stay `false` once an earlier range failed
+  }
+  else if (slice_count > 1) {
+    // failed to claim the full range, try per slice instead
+    for (size_t i = 0; i < slice_count; i++) {
+      const bool purged = mi_arena_try_purge_range(arena, slice_index + i, 1);
+      vinfo->any_purged = vinfo->any_purged || purged;
+      vinfo->all_purged = vinfo->all_purged && purged;
+    }
+  }
+  else {
+    vinfo->all_purged = false;  // the range could not be claimed and there is nothing smaller to retry
+  }
+  // don't clear the purge bits here: the traversal in `mi_arena_try_purge` already clears the visited ranges atomically
+  return true; // continue
+}
+
+// Purge the scheduled ranges of `arena` once its purge expiration has passed (or always if `force`); returns true if anything was purged
+static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force)
+{
+  // pinned memory is never purged
+  if (arena->memid.is_pinned) return false;
+
+  // without `force`, only continue when an expiration was set and has passed
+  const mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
+  const bool expired = (expire != 0 && expire <= now);
+  if (!force && !expired) return false;
+
+  // reset the expiration and count this purge in the statistics
+  mi_atomic_storei64_release(&arena->purge_expire, (mi_msecs_t)0);
+  mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1);
+
+  // visit all ranges in the purge bitmap; the traversal also clears those ranges atomically
+  // (so any newly freed blocks will get purged next time around)
+  mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/};
+
+  // we purge by at least `min_slices` to not fragment transparent huge pages for example
+  const size_t min_slices = mi_slice_count_of_size(_mi_os_minimal_purge_size());
+  _mi_bitmap_forall_setc_rangesn(arena->slices_purge, min_slices, &mi_arena_try_purge_visitor, arena, &vinfo);
+
+  return vinfo.any_purged;
+}
+
+
+static void mi_arenas_try_purge(bool force, bool visit_all, mi_subproc_t* subproc, size_t tseq)
+{
+  // Try to purge expired ranges in the arenas of `subproc`, starting at arena `tseq % count`. Unless `visit_all` is set,
+  // at most `(count/4)+1` arenas are purged per call. This can be called often, so return early when there is nothing to do.
+  const long delay = mi_arena_purge_delay();
+  if (_mi_preloading() || delay <= 0) return;  // nothing will be scheduled
+
+  // is the expiration of the sub-process due?
+  const mi_msecs_t now = _mi_clock_now();
+  const mi_msecs_t arenas_expire = mi_atomic_loadi64_acquire(&subproc->purge_expire);
+  const bool due = (arenas_expire != 0 && arenas_expire <= now);
+  if (!visit_all && !force && !due) return;
+
+  const size_t arena_count = mi_arenas_get_count(subproc);
+  if (arena_count == 0) return;
+
+  // allow only one thread to purge at a time (todo: allow concurrent purging?)
+  static mi_atomic_guard_t purge_guard;
+  mi_atomic_guard(&purge_guard)
+  {
+    // increase global expire: at most one purge per delay cycle
+    if (arenas_expire > now) { mi_atomic_storei64_release(&subproc->purge_expire, now + (delay/10)); }
+    const size_t first = tseq % arena_count;
+    size_t budget = (visit_all ? arena_count : (arena_count/4)+1);  // number of arenas that may still be purged
+    bool all_visited = true;
+    bool any_purged = false;
+    for (size_t n = 0; n < arena_count; n++) {
+      size_t idx = first + n;  if (idx >= arena_count) { idx -= arena_count; }  // wrap around
+      mi_arena_t* const arena = mi_arena_from_index(subproc, idx);
+      if (arena == NULL) continue;
+      if (!mi_arena_try_purge(arena, now, force)) continue;
+      any_purged = true;
+      if (budget <= 1) {
+        all_visited = false;  // budget exhausted before all arenas were seen
+        break;
+      }
+      budget--;
+    }
+    if (all_visited && !any_purged) {
+      // every arena was visited and nothing was purged: clear the expiration of the sub-process
+      mi_atomic_storei64_release(&subproc->purge_expire, 0);
+    }
+  }
+}
+
+
+/* -----------------------------------------------------------
+  Visit all pages and blocks in a heap
+----------------------------------------------------------- */
+
+typedef struct mi_heap_visit_info_s {  // state handed to `mi_heap_visit_page` (and to `mi_heap_visit_page_at` through `arg`)
+  mi_heap_t* heap;  // the heap whose pages are visited
+  mi_block_visit_fun* visitor;  // callback that is invoked for the area of every visited page
+  void* arg;  // passed on to `visitor` unchanged
+  bool visit_blocks;  // when set, the blocks of every page are visited as well
+} mi_heap_visit_info_t;
+
+static bool mi_heap_visit_page(mi_page_t* page, mi_heap_visit_info_t* vinfo) {
+  // Visit a single page: returns `false` if the visitor returns `false` for the area of the page; otherwise
+  // `true`, or the result of `_mi_theap_area_visit_blocks` when `visit_blocks` is set.
+  mi_heap_area_t area;
+  _mi_heap_area_init(&area, page);
+  mi_assert_internal(vinfo->heap == mi_page_heap(page));
+
+  // first the area itself (with a NULL block)
+  const bool proceed = vinfo->visitor(vinfo->heap, &area, NULL, area.block_size, vinfo->arg);
+  if (!proceed) return false;
+  // and then optionally the blocks in the page
+  if (!vinfo->visit_blocks) return true;
+  return _mi_theap_area_visit_blocks(&area, page, vinfo->visitor, vinfo->arg);
+}
+
+static bool mi_heap_visit_page_at(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) {
+  // Bitmap visitor: treats the start of slice `slice_index` in `arena` as the page to visit; `slice_count` is not used.
+  MI_UNUSED(slice_count);
+  mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index);
+  return mi_heap_visit_page(page, (mi_heap_visit_info_t*)arg);
+}
+
+bool _mi_heap_visit_blocks(mi_heap_t* heap, bool abandoned_only, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
+  mi_assert(visitor!=NULL);
+  if (visitor==NULL) return false;
+  if (heap==NULL) { heap = mi_heap_main(); }  // a NULL heap selects the main heap
+  // Visit the pages of `heap` in every arena (only the abandoned pages if `abandoned_only` is set), followed by the
+  // pages on its `os_abandoned_pages` list. Returns `false` if `visitor` is NULL or as soon as `ok` turns `false`.
+  // We don't claim pages as we assume we are the only thread running with this heap (we could claim atomically by doing abandoned_reclaim first and reabandoning afterwards).
+  mi_heap_visit_info_t visit_info = { heap, visitor, arg, visit_blocks };
+  bool ok = true;
+  mi_forall_arenas(heap, NULL, 0, arena) {
+    mi_arena_pages_t* arena_pages = mi_heap_arena_pages(heap, arena);
+    if (ok && arena_pages != NULL) {  // once `ok` is false the remaining arenas are skipped
+      if (abandoned_only) {
+        for (size_t bin = 0; ok && bin < MI_BIN_COUNT; bin++) {
+          // todo: if we had a single abandoned page map as well, this can be faster.
+          if (mi_atomic_load_relaxed(&heap->abandoned_count[bin]) > 0) {  // only walk bins that have abandoned pages
+            ok = _mi_bitmap_forall_set(arena_pages->pages_abandoned[bin], &mi_heap_visit_page_at, arena, &visit_info);
+          }
+        }
+      }
+      else {
+        ok = _mi_bitmap_forall_set(arena_pages->pages, &mi_heap_visit_page_at, arena, &visit_info);
+      }
+    }
+  }
+  mi_forall_arenas_end();
+  if (!ok) return false;
+
+  // visit abandoned pages in OS allocated memory
+  // (technically we don't need the initial lock as we assume we are the only thread running in this subproc)
+  mi_page_t* page = NULL;
+  mi_lock(&heap->os_abandoned_pages_lock) {
+    page = heap->os_abandoned_pages;  // only the list head is read under the lock
+  }
+  while (ok && page != NULL) {
+    mi_page_t* next = page->next;  // read upfront in case the visitor frees the page
+    ok = mi_heap_visit_page(page, &visit_info);
+    page = next;
+  }
+
+  return ok;
+}
+
+bool mi_heap_visit_blocks(mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {  // visit all pages of `heap`
+  const bool abandoned_only = false;  return _mi_heap_visit_blocks(heap, abandoned_only, visit_blocks, visitor, arg);
+}
+
+bool mi_heap_visit_abandoned_blocks(mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {  // visit only the abandoned pages of `heap`
+  const bool abandoned_only = true;  return _mi_heap_visit_blocks(heap, abandoned_only, visit_blocks, visitor, arg);
+}
+
+
+typedef struct mi_heap_delete_visit_info_s {  // state handed to `mi_heap_delete_page` through its `arg` parameter
+  mi_heap_t*  heap_target;   // heap that receives pages that are still in use, or NULL to destroy those pages
+  mi_theap_t* theap_target;  // theap of `heap_target` (NULL if `heap_target` is NULL)
+  mi_theap_t* theap;         // theap of the heap whose pages are being deleted
+} mi_heap_delete_visit_info_t;
+
+static bool mi_heap_delete_page(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) {
+  MI_UNUSED(block); MI_UNUSED(block_size); MI_UNUSED(heap);  // area visitor used by `mi_heap_delete_pages`; `heap` is only used in an assertion
+  mi_heap_delete_visit_info_t* info = (mi_heap_delete_visit_info_t*)arg;
+  mi_heap_t*  heap_target           = info->heap_target;
+  mi_theap_t* const theap           = info->theap;       mi_assert_internal(theap->heap == heap);
+  mi_page_t*  const page            = (mi_page_t*)area->reserved1;
+  // Take over the page, then either free it (unused), destroy it (no target heap), or move it to the target heap. Always returns `true`.
+  mi_page_claim_ownership(page);       // claim ownership
+  if (mi_page_is_abandoned(page)) {
+    _mi_arenas_page_unabandon(page,theap);
+  }
+  else {
+    page->next = page->prev = NULL;    // yikes.. better not to try to access this from a thread later on..
+    mi_page_set_theap(page,NULL);      // set threadid to abandoned
+  }
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(mi_page_is_owned(page));
+
+  if (page->used==0) {
+    // free the page
+    _mi_arenas_page_free(page, theap);
+  }
+  else if (heap_target==NULL) {
+    // destroy the page
+    page->used=0;                        // note: invariant `|local_free| + |free| == reserved - used`  does not hold in this case
+    _mi_arenas_page_free(page, theap);
+  }
+  else {
+    // move the page to `heap_target` as an abandoned page
+    // first remove it from the current heap
+    const size_t sbin = _mi_page_stats_bin(page);
+    size_t slice_index;
+    size_t slice_count;
+    mi_arena_pages_t* arena_pages = NULL;
+    mi_arena_t* const arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages);
+    mi_assert_internal(mi_bitmap_is_set(arena_pages->pages, slice_index));
+    mi_bitmap_clear(arena_pages->pages, slice_index);
+    mi_theap_stat_decrease(theap, page_bins[sbin], 1);
+    mi_theap_stat_decrease(theap, pages, 1);
+    mi_theap_t* theap_target = info->theap_target;
+
+    // and then add it to the new target heap
+    mi_arena_pages_t* arena_pages_target = mi_heap_ensure_arena_pages(heap_target, arena);
+    if mi_unlikely(arena_pages_target==NULL) {
+      // if we cannot allocate this, we move it to the main heap instead (which does not require allocation)
+      heap_target = mi_heap_main();
+      theap_target = mi_heap_theap(heap_target);
+      arena_pages_target = mi_heap_ensure_arena_pages(heap_target, arena);
+      mi_assert_internal(arena_pages_target!=NULL);
+    }
+    mi_assert_internal(mi_bitmap_is_clear(arena_pages_target->pages, slice_index));
+    mi_bitmap_set(arena_pages_target->pages, slice_index);
+    page->heap = heap_target;
+    mi_theap_stat_increase(theap_target, page_bins[sbin], 1);
+    mi_theap_stat_increase(theap_target, pages, 1);
+
+    // and abandon in the new heap
+    _mi_arenas_page_abandon(page,theap_target);
+  }
+  return true;  // always continue with the next page
+}
+
+static void mi_heap_delete_pages(mi_heap_t* heap, mi_heap_t* heap_target) {
+  mi_theap_t* const theap_target = (heap_target != NULL ? _mi_heap_theap(heap_target) : NULL);
+  mi_theap_t* const theap = _mi_heap_theap(heap);
+  mi_heap_delete_visit_info_t info = { heap_target, theap_target, theap };
+  _mi_heap_visit_blocks(heap, false, false, &mi_heap_delete_page, &info);  // runs `mi_heap_delete_page` on every page of `heap` (areas only, no blocks)
+  #if MI_DEBUG>1
+  // no more arena pages?
+  for (size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) {
+    mi_arena_pages_t* const arena_pages = mi_atomic_load_relaxed(&heap->arena_pages[i]);
+    if (arena_pages!=NULL) {
+      mi_assert_internal(mi_bitmap_is_all_clear(arena_pages->pages));
+    }
+  }
+  // nor os abandoned pages?
+  mi_lock(&heap->os_abandoned_pages_lock) {
+    // the list of abandoned OS pages is expected to be empty by now
+    mi_assert_internal(heap->os_abandoned_pages == NULL);
+  }
+  // nor arena abandoned pages?
+  for (size_t i = 0; i < MI_BIN_COUNT; i++) {
+    mi_assert_internal(mi_atomic_load_relaxed(&heap->abandoned_count[i])==0);
+  }
+  #endif
+}
+
+void _mi_heap_move_pages(mi_heap_t* heap_from, mi_heap_t* heap_to) {
+  // Hand the pages of `heap_from` to `heap_to` (or to the main heap if `heap_to` is NULL); does nothing if `heap_from` is the main heap.
+  if (_mi_is_heap_main(heap_from)) return;
+  mi_heap_delete_pages(heap_from, (heap_to==NULL ? mi_heap_main() : heap_to));
+}
+
+void _mi_heap_destroy_pages(mi_heap_t* heap_from) {
+  // Delete the pages of `heap_from` without a target heap; does nothing if `heap_from` is the main heap.
+  if (!_mi_is_heap_main(heap_from)) { mi_heap_delete_pages(heap_from, NULL); }
+}
+
+/* -----------------------------------------------------------
+  Unloading and reloading an arena.
+----------------------------------------------------------- */
+/*
+static bool mi_arena_page_register(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) {
+  MI_UNUSED(arg); MI_UNUSED(slice_count);
+  mi_assert_internal(slice_count == 1);
+  mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index);
+  mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1));
+  if (!_mi_page_map_register(page)) return false; // break
+  mi_assert_internal(_mi_ptr_page(page)==page);
+  return true;
+}
+
+mi_decl_nodiscard static bool mi_arena_pages_reregister(mi_arena_t* arena) {
+  return _mi_bitmap_forall_set(arena->pages, &mi_arena_page_register, arena, NULL);
+}
+
+mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* full_size) {
+  mi_arena_t* arena = _mi_arena_from_id(arena_id);
+  if (arena==NULL) {
+    return false;
+  }
+  else if (!arena->is_exclusive) {
+    _mi_warning_message("cannot unload a non-exclusive arena (id %zu at %p)\n", arena_id, arena);
+    return false;
+  }
+  else if (arena->memid.memkind != MI_MEM_EXTERNAL) {
+    _mi_warning_message("can only unload managed arena's for external memory (id %zu at %p)\n", arena_id, arena);
+    return false;
+  }
+
+  // find accessed size
+  const size_t asize = mi_size_of_slices(mi_arena_used_slices(arena));
+  if (base != NULL) { *base = (void*)arena; }
+  if (full_size != NULL) { *full_size = arena->memid.mem.os.size;  }
+  if (accessed_size != NULL) { *accessed_size = asize; }
+
+  // adjust abandoned page count
+  mi_subproc_t* const subproc = arena->subproc;
+  for (size_t bin = 0; bin < MI_BIN_COUNT; bin++) {
+    const size_t count = mi_bitmap_popcount(arena->pages_abandoned[bin]);
+    if (count > 0) { mi_atomic_decrement_acq_rel(&subproc->abandoned_count[bin]); }
+  }
+
+  // unregister the pages
+  _mi_page_map_unregister_range(arena, asize);
+
+  // set arena entry to NULL
+  const size_t count = mi_arenas_get_count(subproc);
+  for(size_t i = 0; i < count; i++) {
+    if (mi_arena_from_index(subproc, i) == arena) {
+      mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL);
+      if (i + 1 == count) { // try adjust the count?
+        size_t expected = count;
+        mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, count-1);
+      }
+      break;
+    }
+  }
+  return true;
+}
+
+mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id) {
+  // assume the memory area is already containing the arena
+  if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); }
+  if (start == NULL || size == 0) return false;
+  mi_arena_t* arena = (mi_arena_t*)start;
+  mi_memid_t memid = arena->memid;
+  if (memid.memkind != MI_MEM_EXTERNAL) {
+    _mi_warning_message("can only reload arena's from external memory (%p)\n", arena);
+    return false;
+  }
+  if (memid.mem.os.base != start) {
+    _mi_warning_message("the reloaded arena base address differs from the external memory (arena: %p, external: %p)\n", arena, start);
+    return false;
+  }
+  if (memid.mem.os.size != size) {
+    _mi_warning_message("the reloaded arena size differs from the external memory (arena size: %zu, external size: %zu)\n", arena->memid.mem.os.size, size);
+    return false;
+  }
+  if (!arena->is_exclusive) {
+    _mi_warning_message("the reloaded arena is not exclusive\n");
+    return false;
+  }
+
+  // re-initialize
+  arena->is_exclusive = true;
+  arena->commit_fun = commit_fun;
+  arena->commit_fun_arg = commit_fun_arg;
+  arena->subproc = _mi_subproc();
+  if (!mi_arenas_add(arena->subproc, arena, arena_id)) {
+    return false;
+  }
+  if (!mi_arena_pages_reregister(arena)) {
+    // todo: clear arena entry in the subproc?
+    return false;
+  }
+
+  // adjust abandoned page count
+  for (size_t bin = 0; bin < MI_BIN_COUNT; bin++) {
+    const size_t count = mi_bitmap_popcount(arena->pages_abandoned[bin]);
+    if (count > 0) { mi_atomic_decrement_acq_rel(&arena->subproc->abandoned_count[bin]); }
+  }
+
+  return true;
+}
+
+*/
diff --git a/ext/src/mimalloc/src/bitmap.c b/ext/src/mimalloc/src/bitmap.c
index af6de0a12c..6960503725 100644
--- a/ext/src/mimalloc/src/bitmap.c
+++ b/ext/src/mimalloc/src/bitmap.c
@@ -1,395 +1,1930 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2019-2021 Microsoft Research, Daan Leijen
+Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 
 /* ----------------------------------------------------------------------------
-Concurrent bitmap that can set/reset sequences of bits atomically,
-represeted as an array of fields where each field is a machine word (`size_t`)
-
-There are two api's; the standard one cannot have sequences that cross
-between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
-(this is used in region allocation)
-
-The `_across` postfixed functions do allow sequences that can cross over
-between the fields. (This is used in arena allocation)
+Concurrent bitmap that can set/reset sequences of bits atomically
 ---------------------------------------------------------------------------- */
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/bits.h"
 #include "bitmap.h"
 
-/* -----------------------------------------------------------
-  Bitmap definition
------------------------------------------------------------ */
+#ifndef MI_OPT_SIMD
+#define MI_OPT_SIMD   0
+#endif
+
+/* --------------------------------------------------------------------------------
+  bfields
+-------------------------------------------------------------------------------- */
 
-// The bit mask for a given number of blocks at a specified bit index.
-static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) {
-  mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
-  mi_assert_internal(count > 0);
-  if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
-  if (count == 0) return 0;
-  return ((((size_t)1 << count) - 1) << bitidx);
+static inline size_t mi_bfield_ctz(mi_bfield_t x) {  // bfield wrapper around `mi_ctz` (count trailing zeros)
+  return mi_ctz(x);
 }
 
+static inline size_t mi_bfield_clz(mi_bfield_t x) {  // bfield wrapper around `mi_clz` (count leading zeros)
+  return mi_clz(x);
+}
 
-/* -----------------------------------------------------------
-  Claim a bit sequence atomically
------------------------------------------------------------ */
+static inline size_t mi_bfield_popcount(mi_bfield_t x) {  // bfield wrapper around `mi_popcount` (number of set bits)
+  return mi_popcount(x);
+}
 
-// Try to atomically claim a sequence of `count` bits in a single
-// field at `idx` in `bitmap`. Returns `true` on success.
-inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
-{
-  mi_assert_internal(bitmap_idx != NULL);
-  mi_assert_internal(count <= MI_BITMAP_FIELD_BITS);
-  mi_assert_internal(count > 0);
-  mi_bitmap_field_t* field = &bitmap[idx];
-  size_t map  = mi_atomic_load_relaxed(field);
-  if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
-
-  // search for 0-bit sequence of length count
-  const size_t mask = mi_bitmap_mask_(count, 0);
-  const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
-
-#ifdef MI_HAVE_FAST_BITSCAN
-  size_t bitidx = mi_ctz(~map);    // quickly find the first zero bit if possible
-#else
-  size_t bitidx = 0;               // otherwise start at 0
+static inline mi_bfield_t mi_bfield_clear_least_bit(mi_bfield_t x) {  // `x` with its least significant set bit cleared (0 stays 0)
+  return (x & (x-1));
+}
+
+// Find the least significant bit that is set (i.e. count trailing zero's).
+// Returns false if `x==0` (with `*idx` undefined) and true otherwise,
+// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
+static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) {
+  return mi_bsf(x,idx);
+}
+
+// Find the most significant bit that is set.
+// Returns false if `x==0` (with `*idx` undefined) and true otherwise,
+// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
+static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) {
+  return mi_bsr(x, idx);
+}
+
+
+
+// Iteration step over the set bits of `*x`: report the least significant set bit in `*idx` and clear it in `*x`; returns false once `*x` was zero.
+static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) {
+  const bool has_bit = mi_bfield_find_least_bit(*x, idx);
+  *x = (*x & (*x - 1));  // clear the least significant set bit (as `mi_bfield_clear_least_bit` does)
+  return has_bit;
+}
+
+static inline mi_bfield_t mi_bfield_zero(void) {  // a bfield with no bits set
+  return (mi_bfield_t)0;
+}
+
+static inline mi_bfield_t mi_bfield_one(void) {  // a bfield with only bit 0 set
+  return (mi_bfield_t)1;
+}
+
+static inline mi_bfield_t mi_bfield_all_set(void) {  // a bfield with every bit set
+  return ~mi_bfield_zero();
+}
+
+// Mask of `bit_count` consecutive set bits, shifted to the left by `shiftl` (a full-width mask is special cased to avoid shifting by `MI_BFIELD_BITS`)
+static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) {
+  mi_assert_internal(bit_count > 0);
+  mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS);
+  mi_assert_internal(shiftl < MI_BFIELD_BITS);
+  const bool full_width = (bit_count >= MI_BFIELD_BITS);
+  return ((full_width ? mi_bfield_all_set() : (mi_bfield_one() << bit_count)-1) << shiftl);
+}
+
+
+// ------- mi_bfield_atomic_set ---------------------------------------
+// the `_set` functions return also the count of bits that were already set (for commit statistics)
+// the `_clear` functions return also whether the new bfield is all clear or not (for the chunk_map)
+
+// Atomically set bit `idx` in `*b`. Returns `true` if the bit transitioned from 0 to 1
+static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) {
+  mi_assert_internal(idx < MI_BFIELD_BITS);
+  const mi_bfield_t bit = mi_bfield_mask(1, idx);
+  const mi_bfield_t before = mi_atomic_or_acq_rel(b, bit);
+  return ((before & bit) == 0);
+}
+
+// Atomically clear bit `idx` in `*b`. Returns `true` if the bit transitioned from 1 to 0.
+// `*all_clear` (if `all_clear` is not NULL) is set to whether the new bfield is zero.
+static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) {
+  mi_assert_internal(idx < MI_BFIELD_BITS);
+  const mi_bfield_t bit = mi_bfield_mask(1, idx);
+  const mi_bfield_t before = mi_atomic_and_acq_rel(b, ~bit);
+  if (all_clear != NULL) { *all_clear = ((before & ~bit) == 0); }
+  return ((before & bit) != 0);
+}
+
+// Clear a bit but only when/once it is set. This is used by concurrent free's while
+// the page is abandoned and mapped. This can incur a busy wait :-( but it should
+// happen almost never (and is accounted for in the stats)
+static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_t idx) {
+  mi_assert_internal(idx < MI_BFIELD_BITS);
+  const mi_bfield_t mask = mi_bfield_mask(1, idx);;
+  mi_bfield_t old = mi_atomic_load_relaxed(b);
+  do {
+    if mi_unlikely((old&mask) == 0) {  // the bit is not (yet) set
+      old = mi_atomic_load_acquire(b);  // re-read before deciding to wait
+      if ((old&mask)==0) {
+        mi_subproc_stat_counter_increase(_mi_subproc(), pages_unabandon_busy_wait, 1);  // counted once per wait, not per spin
+      }
+      while ((old&mask)==0) { // busy wait
+        mi_atomic_yield();
+        old = mi_atomic_load_acquire(b);
+      }
+    }
+  } while (!mi_atomic_cas_weak_acq_rel(b,&old, (old&~mask)));  // on failure `old` holds the current value and the bit is checked again
+  mi_assert_internal((old&mask)==mask);  // we should only clear when it was set
+}
+
+// Atomically turn on all bits of `mask` (which must be non-zero) in `*b`.
+// Returns `true` if every mask bit went from 0 to 1, i.e. none of them was set before.
+// If `already_set` is not NULL it receives the number of mask bits that were set before the
+// update (used when committing ranges to keep the statistics correct).
+static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) {
+  mi_assert_internal(mask != 0);
+  mi_bfield_t prev = mi_atomic_load_relaxed(b);
+  for (;;) {
+    // retry until the cas succeeds; `prev` then holds the value that was replaced
+    if (mi_atomic_cas_weak_acq_rel(b, &prev, prev|mask)) break;
+  }
+  const mi_bfield_t overlap = (prev & mask);   // the mask bits that were set before
+  if (already_set != NULL) { *already_set = mi_bfield_popcount(overlap); }
+  return (overlap == 0);
+}
+
+// Atomically turn off all bits of `mask` (which must be non-zero) in `*b`.
+// Returns `true` if every mask bit went from 1 to 0, i.e. all of them were set before.
+// If `all_clear` is not NULL it is set to `true` iff the resulting bfield is zero.
+static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) {
+  mi_assert_internal(mask != 0);
+  mi_bfield_t prev = mi_atomic_load_relaxed(b);
+  for (;;) {
+    // retry until the cas succeeds; `prev` then holds the value that was replaced
+    if (mi_atomic_cas_weak_acq_rel(b, &prev, prev&~mask)) break;
+  }
+  if (all_clear != NULL) {
+    const mi_bfield_t now = (prev & ~mask);   // the value that was stored
+    *all_clear = (now == 0);
+  }
+  return ((prev & mask) == mask);
+}
+
+// Atomically set all bits of the bfield `*b` by exchanging it with an all-set value.
+// Returns `true` if the bfield was zero before the exchange. If `already_set` is not NULL
+// it receives the number of bits that were set before the exchange.
+static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already_set) {
+  const mi_bfield_t prev = mi_atomic_exchange_release(b, mi_bfield_all_set());
+  if (already_set != NULL) {
+    *already_set = mi_bfield_popcount(prev);
+  }
+  const bool was_all_clear = (prev == 0);
+  return was_all_clear;
+}
+
+// static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) {
+//   const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero());
+//   if (all_clear!=NULL) { *all_clear = true; }
+//   return (~old==0);
+// }
+
+// ------- mi_bfield_atomic_try_clear ---------------------------------------
+
+
+// Try to atomically clear all bits of `mask` (non-zero) in `*b`, starting from the previously
+// observed value `expect`. Returns `true` if the mask bits atomically went from all set to all
+// clear, and `false` (leaving the bfield as is) as soon as not all mask bits are set.
+// If `all_clear` is not NULL it is set to `true` iff the stored value (on success) or the
+// last observed value (on failure) is zero.
+static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) {
+  mi_assert_internal(mask != 0);
+  // keep trying for as long as all mask bits are (still) set in the value we expect
+  while ((expect & mask) == mask) {
+    const mi_bfield_t cleared = (expect & ~mask);
+    if (mi_atomic_cas_weak_acq_rel(b, &expect, cleared)) {
+      if (all_clear != NULL) { *all_clear = (cleared == 0); }
+      return true;
+    }
+    // the cas failed: go around and re-check `expect`
+  }
+  // at least one of the mask bits is not set: give up without modifying `*b`
+  if (all_clear != NULL) { *all_clear = (expect == 0); }
+  return false;
+}
+
+// Try to atomically clear all bits of `mask` (non-zero) in `*b`, using a relaxed load of the
+// current value as the starting point. Returns and `all_clear` are as for
+// `mi_bfield_atomic_try_clear_mask_of`.
+static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask, bool* all_clear) {
+  mi_assert_internal(mask != 0);
+  const mi_bfield_t current = mi_atomic_load_relaxed(b);
+  const bool cleared = mi_bfield_atomic_try_clear_mask_of(b, mask, current, all_clear);
+  return cleared;
+}
+
+// Try to atomically clear bit `idx` (with `idx < MI_BFIELD_BITS`) of `*b`.
+// Returns `true` if the bit went from 1 to 0, and `false` otherwise, in which case
+// the bfield `b` is left as-is.
+// `all_clear` (if not NULL) is set to true if the resulting bfield is zero (and false otherwise)
+mi_decl_maybe_unused static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) {
+  mi_assert_internal(idx < MI_BFIELD_BITS);
+  return mi_bfield_atomic_try_clear_mask(b, (mi_bfield_one()<<idx), all_clear);
+}
+
+// Try to atomically clear the byte that starts at bit `idx` (a multiple of 8, below
+// `MI_BFIELD_BITS`) of `*b`. Returns `true` if the byte atomically went from 0xFF to 0.
+// `all_clear` (if not NULL) is set to true if the resulting bfield is zero (and false otherwise)
+mi_decl_maybe_unused static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) {
+  mi_assert_internal(idx < MI_BFIELD_BITS);
+  mi_assert_internal((idx%8)==0);
+  const mi_bfield_t byte_mask = ((mi_bfield_t)0xFF)<<idx;
+  const bool cleared = mi_bfield_atomic_try_clear_mask(b, byte_mask, all_clear);
+  return cleared;
+}
+
+// Try to clear a full field of bits atomically, and return true if all bits transitioned from
+// all 1's to 0's, and false otherwise leaving the bit field as-is.
+// `all_clear` (if not NULL) is always written: it is set to true if the bfield is zero afterwards,
+// which is always the case on success; on failure it reflects the value observed by the failed cas.
+static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) {
+  mi_bfield_t old = mi_bfield_all_set();
+  if (mi_atomic_cas_strong_acq_rel(b, &old, mi_bfield_zero())) {
+    if (all_clear != NULL) { *all_clear = true; }
+    return true;
+  }
+  // the cas failed and stored the observed value in `old`; report whether that value is zero so
+  // `*all_clear` is never left uninitialized (same convention as `mi_bfield_atomic_try_clear_mask_of`).
+  if (all_clear != NULL) { *all_clear = (old == 0); }
+  return false;
+}
+
+
+// ------- mi_bfield_atomic_is_set ---------------------------------------
+
+// Is bit `idx` currently set in `*b`? (uses a relaxed load)
+static inline bool mi_bfield_atomic_is_set(const _Atomic(mi_bfield_t)*b, const size_t idx) {
+  const mi_bfield_t bit = mi_bfield_mask(1,idx);
+  const mi_bfield_t cur = mi_atomic_load_relaxed(b);
+  return ((cur & bit) != 0);
+}
+
+// Is bit `idx` currently clear in `*b`? (uses a relaxed load)
+static inline bool mi_bfield_atomic_is_clear(const _Atomic(mi_bfield_t)*b, const size_t idx) {
+  const mi_bfield_t bit = mi_bfield_mask(1, idx);
+  const mi_bfield_t cur = mi_atomic_load_relaxed(b);
+  return ((cur & bit) == 0);
+}
+
+// Check if bit `idx` of `*b` is set (when `set` is true) or clear (when `set` is false).
+static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, const _Atomic(mi_bfield_t)*b, const size_t idx) {
+  return (set ? mi_bfield_atomic_is_set(b, idx)
+              : mi_bfield_atomic_is_clear(b, idx));
+}
+
+// Are all bits selected by `mask` (non-zero) currently set in `*b`? (uses a relaxed load)
+static inline bool mi_bfield_atomic_is_set_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
+  mi_assert_internal(mask != 0);
+  const mi_bfield_t selected = (mi_atomic_load_relaxed(b) & mask);
+  return (selected == mask);
+}
+
+// Are all bits selected by `mask` (non-zero) currently clear in `*b`? (uses a relaxed load)
+static inline bool mi_bfield_atomic_is_clear_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
+  mi_assert_internal(mask != 0);
+  const mi_bfield_t selected = (mi_atomic_load_relaxed(b) & mask);
+  return (selected == 0);
+}
+
+// Check if all bits selected by `mask` (non-zero) are set (when `set` is true)
+// or all clear (when `set` is false).
+static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
+  mi_assert_internal(mask != 0);
+  return (set ? mi_bfield_atomic_is_set_mask(b, mask)
+              : mi_bfield_atomic_is_clear_mask(b, mask));
+}
+
+// Count how many of the bits selected by `mask` are currently set in `*b` (uses a relaxed load).
+static inline size_t mi_bfield_atomic_popcount_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) {
+  const mi_bfield_t selected = (mi_atomic_load_relaxed(b) & mask);
+  return mi_bfield_popcount(selected);
+}
+
+
+/* --------------------------------------------------------------------------------
+ bitmap chunks
+-------------------------------------------------------------------------------- */
+
+// ------- mi_bchunk_set ---------------------------------------
+
+// Atomically set the single bit `cidx` (with `cidx < MI_BCHUNK_BITS`) in a chunk.
+// Returns `true` if the bit went from 0 to 1. If `already_set` is not NULL it receives
+// 1 if the bit was set before and 0 otherwise.
+static inline bool mi_bchunk_set(mi_bchunk_t* chunk, size_t cidx, size_t* already_set) {
+  mi_assert_internal(cidx < MI_BCHUNK_BITS);
+  const size_t field = cidx / MI_BFIELD_BITS;
+  const size_t bit   = cidx % MI_BFIELD_BITS;
+  const bool transitioned = mi_bfield_atomic_set(&chunk->bfields[field], bit);
+  if (already_set != NULL) {
+    if (transitioned) { *already_set = 0; }
+                 else { *already_set = 1; }
+  }
+  return transitioned;
+}
+
+// Atomically set `n` bits (with `0 < n <= MI_BFIELD_BITS`) starting at `cidx`; the range may
+// cross over into the next bfield. Returns `true` if all bits went from 0 to 1.
+// If `already_set` is not NULL it receives the number of bits that were set before
+// (used when committing ranges to keep the statistics correct).
+static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) {
+  mi_assert_internal(cidx < MI_BCHUNK_BITS);
+  mi_assert_internal(n > 0 && n <= MI_BFIELD_BITS);
+  const size_t field = cidx / MI_BFIELD_BITS;
+  const size_t start = cidx % MI_BFIELD_BITS;
+  if mi_likely(start + n <= MI_BFIELD_BITS) {
+    // common case: the range lies within a single field
+    const mi_bfield_t mask = mi_bfield_mask(n,start);
+    return mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, already_set);
+  }
+
+  // the range spills over into the next field
+  const size_t head = MI_BFIELD_BITS - start;  // bits to set in the first field
+  mi_assert_internal(head < n);
+  mi_assert_internal(field < MI_BCHUNK_FIELDS - 1);
+  mi_assert_internal(start + head <= MI_BFIELD_BITS);
+  size_t head_already_set;
+  const bool head_ok = mi_bfield_atomic_set_mask(&chunk->bfields[field], mi_bfield_mask(head, start), &head_already_set);
+  const size_t tail = n - head;                // bits to set in the second field
+  mi_assert_internal(tail > 0);
+  mi_assert_internal(tail < MI_BFIELD_BITS);
+  size_t tail_already_set;
+  const bool tail_ok = mi_bfield_atomic_set_mask(&chunk->bfields[field+1], mi_bfield_mask(tail, 0), &tail_already_set);
+  if (already_set != NULL) { *already_set = head_already_set + tail_already_set; }
+  return (head_ok && tail_ok);
+}
+
+// Set or clear (depending on `set`) a sequence of `n > 0` bits within a chunk; the sequence
+// can span any number of bfields (`cidx + n <= MI_BCHUNK_BITS`).
+// Returns true if all bits transitioned from 0 to 1 (or 1 to 0).
+// `*palready_set` (if not NULL) receives the count of bits that were already set; this is only
+// computed when setting and is 0 when clearing.
+// `*pmaybe_all_clear` (if not NULL) is true if every bfield that was touched became zero; this is
+// only computed when clearing and is false when setting.
+mi_decl_noinline static bool mi_bchunk_xsetNC(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_set, bool* pmaybe_all_clear) {
+  mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
+  mi_assert_internal(n>0);
+  bool all_transition = true;
+  bool maybe_all_clear = true;
+  size_t total_already_set = 0;
+  size_t idx   = cidx % MI_BFIELD_BITS;
+  size_t field = cidx / MI_BFIELD_BITS;
+  while (n > 0) {
+    size_t m = MI_BFIELD_BITS - idx;   // m is the bits to xset in this field
+    if (m > n) { m = n; }
+    mi_assert_internal(idx + m <= MI_BFIELD_BITS);
+    mi_assert_internal(field < MI_BCHUNK_FIELDS);
+    const mi_bfield_t mask = mi_bfield_mask(m, idx);
+    size_t already_set = 0;
+    bool all_clear = false;
+    const bool transition = (set ? mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, &already_set)
+                                 : mi_bfield_atomic_clear_mask(&chunk->bfields[field], mask, &all_clear));
+    // `already_set` is only reported when setting; when clearing it stays 0 even if not all bits
+    // were set before (no transition), so this invariant can only be checked for the set case.
+    mi_assert_internal(!set || (transition && already_set == 0) || (!transition && already_set > 0));
+    all_transition = all_transition && transition;
+    total_already_set += already_set;
+    maybe_all_clear = maybe_all_clear && all_clear;
+    // next field
+    field++;
+    idx = 0;
+    mi_assert_internal(m <= n);
+    n -= m;
+  }
+  if (palready_set!=NULL) { *palready_set = total_already_set; }
+  if (pmaybe_all_clear!=NULL) { *pmaybe_all_clear = maybe_all_clear; }
+  return all_transition;
+}
+
+// Set `n` bits (with `0 < n <= MI_BCHUNK_BITS`) starting at `cidx`, dispatching on `n`:
+// a single bit, a range of at most `MI_BFIELD_BITS` bits, or the general multi-field case.
+// Returns `true` if all bits went from 0 to 1; if `already_set` is not NULL it receives the
+// count of bits that were set before.
+static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) {
+  mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS);
+  if (n == 1) {
+    return mi_bchunk_set(chunk, cidx, already_set);
+  }
+  else if (n <= MI_BFIELD_BITS) {
+    return mi_bchunk_setNX(chunk, cidx, n, already_set);
+  }
+  else {
+    return mi_bchunk_xsetNC(MI_BIT_SET, chunk, cidx, n, already_set, NULL);
+  }
+}
+
+// ------- mi_bchunk_clear ---------------------------------------
+
+// Atomically clear the single bit `cidx` (with `cidx < MI_BCHUNK_BITS`) in a chunk.
+// Returns `true` if the bit went from 1 to 0. `all_clear` (if not NULL) is set to `true`
+// iff the bfield that contains the bit became zero.
+static inline bool mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* all_clear) {
+  mi_assert_internal(cidx < MI_BCHUNK_BITS);
+  const size_t field = cidx / MI_BFIELD_BITS;
+  const size_t bit   = cidx % MI_BFIELD_BITS;
+  const bool transitioned = mi_bfield_atomic_clear(&chunk->bfields[field], bit, all_clear);
+  return transitioned;
+}
+
+// Clear `n` bits (with `0 < n <= MI_BCHUNK_BITS`) starting at `cidx`.
+// Returns `true` if all bits went from 1 to 0. `maybe_all_clear` (if not NULL) is set to
+// `true` if all bfields that were touched became zero.
+static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) {
+  mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS);
+  if (n != 1) {
+    // TODO: implement mi_bchunk_xsetNX instead of setNX
+    return mi_bchunk_xsetNC(MI_BIT_CLEAR, chunk, cidx, n, NULL, maybe_all_clear);
+  }
+  return mi_bchunk_clear(chunk, cidx, maybe_all_clear);
+}
+
+// Count the set bits in a sequence of `n` bits that starts at bit `idx` of the bfield
+// with index `field_idx`. The sequence can cross bfield's.
+mi_decl_noinline static size_t mi_bchunk_popcountNC(mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) {
+  mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS);
+  size_t total = 0;
+  size_t start = idx;        // first bit to look at in the current field
+  size_t remaining = n;      // bits that still need to be counted
+  for (size_t fi = field_idx; remaining > 0; fi++) {
+    const size_t avail = MI_BFIELD_BITS - start;
+    const size_t m = (avail > remaining ? remaining : avail);   // bits to count in this field
+    mi_assert_internal(start + m <= MI_BFIELD_BITS);
+    mi_assert_internal(fi < MI_BCHUNK_FIELDS);
+    const size_t mask = mi_bfield_mask(m, start);
+    total += mi_bfield_atomic_popcount_mask(&chunk->bfields[fi], mask);
+    start = 0;               // following fields are counted from their first bit
+    remaining -= m;
+  }
+  return total;
+}
+
+// Count the set bits in a sequence of `n` bits starting at `cidx` (`cidx + n <= MI_BCHUNK_BITS`).
+static inline size_t mi_bchunk_popcountN(mi_bchunk_t* chunk, size_t cidx, size_t n) {
+  mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
+  mi_assert_internal(n>0);
+  if (n == 0) { return 0; }
+  const size_t field = cidx / MI_BFIELD_BITS;
+  const size_t start = cidx % MI_BFIELD_BITS;
+  if (n == 1) {
+    const bool is_set = mi_bfield_atomic_is_set(&chunk->bfields[field], start);
+    return (is_set ? 1 : 0);
+  }
+  if (start + n > MI_BFIELD_BITS) {
+    // the range crosses into following fields
+    return mi_bchunk_popcountNC(chunk, field, start, n);
+  }
+  // the range lies within one field
+  return mi_bfield_atomic_popcount_mask(&chunk->bfields[field], mi_bfield_mask(n, start));
+}
+
+
+// ------- mi_bchunk_is_xset ---------------------------------------
+
+// Check if a sequence of `n` bits, starting at bit `idx` of the bfield with index `field_idx`,
+// is all set (when `set` is true) or all clear (when `set` is false).
+// The sequence can cross bfield's; the check stops at the first field that does not match.
+mi_decl_noinline static bool mi_bchunk_is_xsetNC(mi_xset_t set, const mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) {
+  mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS);
+  size_t start = idx;        // first bit to check in the current field
+  size_t remaining = n;      // bits that still need to be checked
+  for (size_t fi = field_idx; remaining > 0; fi++) {
+    const size_t avail = MI_BFIELD_BITS - start;
+    const size_t m = (avail > remaining ? remaining : avail);   // bits to check in this field
+    mi_assert_internal(start + m <= MI_BFIELD_BITS);
+    mi_assert_internal(fi < MI_BCHUNK_FIELDS);
+    const size_t mask = mi_bfield_mask(m, start);
+    const bool matches = mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[fi], mask);
+    if (!matches) { return false; }
+    start = 0;               // following fields are checked from their first bit
+    remaining -= m;
+  }
+  return true;
+}
+
+// Check if a sequence of `n` bits starting at `cidx` (`cidx + n <= MI_BCHUNK_BITS`) is
+// all set (when `set` is true) or all clear (when `set` is false).
+static inline bool mi_bchunk_is_xsetN(mi_xset_t set, const mi_bchunk_t* chunk, size_t cidx, size_t n) {
+  mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
+  mi_assert_internal(n>0);
+  if (n == 0) { return true; }
+  const size_t field = cidx / MI_BFIELD_BITS;
+  const size_t start = cidx % MI_BFIELD_BITS;
+  if (n == 1) {
+    return mi_bfield_atomic_is_xset(set, &chunk->bfields[field], start);
+  }
+  if (start + n > MI_BFIELD_BITS) {
+    // the range crosses into following fields
+    return mi_bchunk_is_xsetNC(set, chunk, field, start, n);
+  }
+  // the range lies within one field
+  return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mi_bfield_mask(n, start));
+}
+
+
+// ------- mi_bchunk_try_clear  ---------------------------------------
+
+// Try to atomically clear `n` bits (with `n <= MI_BFIELD_BITS`) starting at `cidx`; the range
+// can cross over a bfield boundary. Returns `true` if all bits went from 1 to 0 and `false`
+// otherwise. When the range spans two fields and the second field cannot be cleared, the bits
+// that were cleared in the first field are set again before returning `false`.
+// In the two-field case `*pmaybe_all_clear` (if not NULL) is only written on success, and is
+// `true` iff both fields became zero.
+static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) {
+  mi_assert_internal(cidx < MI_BCHUNK_BITS);
+  mi_assert_internal(n <= MI_BFIELD_BITS);
+  const size_t field = cidx / MI_BFIELD_BITS;
+  const size_t start = cidx % MI_BFIELD_BITS;
+  if mi_likely(start + n <= MI_BFIELD_BITS) {
+    // the whole range lives in one field: a single atomic operation suffices
+    const mi_bfield_t mask = mi_bfield_mask(n, start);
+    return mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask, pmaybe_all_clear);
+  }
+
+  // the range spans two fields (todo: use double-word atomic ops?)
+  const size_t head = MI_BFIELD_BITS - start;  // bits to clear in the first field
+  mi_assert_internal(head < n);
+  mi_assert_internal(field < MI_BCHUNK_FIELDS - 1);
+  const mi_bfield_t head_mask = mi_bfield_mask(head, start);
+  bool head_is_clear;
+  if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], head_mask, &head_is_clear)) {
+    return false;
+  }
+  // the first field is cleared; now try the second one as well
+  const size_t tail = n - head;                // bits to clear in the second field
+  mi_assert_internal(tail > 0);
+  mi_assert_internal(tail < MI_BFIELD_BITS);
+  bool tail_is_clear;
+  if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field+1], mi_bfield_mask(tail, 0), &tail_is_clear)) {
+    // we failed to clear the second field, restore the first one
+    mi_bfield_atomic_set_mask(&chunk->bfields[field], head_mask, NULL);
+    return false;
+  }
+  if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = (head_is_clear && tail_is_clear); }
+  return true;
+}
+
+// Clear a full aligned bfield.
+// static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) {
+//   mi_assert_internal(cidx < MI_BCHUNK_BITS);
+//   mi_assert_internal((cidx%MI_BFIELD_BITS) == 0);
+//   const size_t i = cidx / MI_BFIELD_BITS;
+//   return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear);
+// }
+
+// Try to atomically clear a sequence of `n` bits within a chunk.
+// Returns true if all bits transitioned from 1 to 0,
+// and false otherwise leaving all bit fields as is.
+// Note: this is the complex one as we need to unwind partial atomic operations if we fail halfway..
+// `maybe_all_clear` is set to `true` if all the bfields involved become zero.
+// Note: `*pmaybe_all_clear` (if not NULL) is initialized to `true` on entry and only overwritten
+// on success, so it should only be relied upon when this function returns `true`.
+// The fields are cleared one at a time (first field, full mid fields, last field); the sequence
+// as a whole is therefore not cleared in one atomic step.
+mi_decl_noinline static bool mi_bchunk_try_clearNC(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) {
+  mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
+  mi_assert_internal(n>0);
+  if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = true; }
+  if (n==0) return true;
+
+  // first field
+  const size_t start_idx = cidx % MI_BFIELD_BITS;
+  const size_t start_field = cidx / MI_BFIELD_BITS;
+  size_t field = start_field;
+  size_t m = MI_BFIELD_BITS - start_idx;   // m are the bits to clear in this field
+  if (m > n) { m = n; }
+  mi_assert_internal(start_idx + m <= MI_BFIELD_BITS);
+  mi_assert_internal(start_field < MI_BCHUNK_FIELDS);
+  const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx);
+  bool maybe_all_clear;
+  // nothing has been modified yet, so a failure here needs no unwinding
+  if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &maybe_all_clear)) return false;
+
+  // done?
+  mi_assert_internal(m <= n);
+  n -= m;
+
+  // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields
+  // mid fields?
+  while (n >= MI_BFIELD_BITS) {
+    field++;
+    mi_assert_internal(field < MI_BCHUNK_FIELDS);
+    bool field_is_clear;
+    if (!mi_bfield_atomic_try_clearX(&chunk->bfields[field], &field_is_clear)) goto restore;
+    maybe_all_clear = maybe_all_clear && field_is_clear;
+    n -= MI_BFIELD_BITS;
+  }
+
+  // last field?
+  if (n > 0) {
+    mi_assert_internal(n < MI_BFIELD_BITS);
+    field++;
+    mi_assert_internal(field < MI_BCHUNK_FIELDS);
+    const mi_bfield_t mask_end = mi_bfield_mask(n, 0);
+    bool field_is_clear;
+    if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_end, &field_is_clear)) goto restore;
+    maybe_all_clear = maybe_all_clear && field_is_clear;
+  }
+
+  if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; }
+  return true;
+
+restore:
+  // `field` is the index of the field that failed to clear atomically; we need to restore all previous fields
+  // (walking back from the failed field to the start field and setting the cleared bits again)
+  mi_assert_internal(field > start_field);
+  while( field > start_field) {
+    field--;
+    if (field == start_field) {
+      // first field: only the bits of `mask_start` were cleared
+      mi_bfield_atomic_set_mask(&chunk->bfields[field], mask_start, NULL);
+    }
+    else {
+      mi_bfield_atomic_setX(&chunk->bfields[field], NULL);  // mid-field: set all bits again
+    }
+  }
+  return false;
+}
+
+
+// Try to atomically clear `n > 0` bits starting at `cidx`, dispatching on `n`: ranges of at
+// most `MI_BFIELD_BITS` bits span at most two fields; larger ranges use the general variant.
+// Returns `true` if all bits went from 1 to 0.
+static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) {
+  mi_assert_internal(n>0);
+  if (n > MI_BFIELD_BITS) {
+    return mi_bchunk_try_clearNC(chunk, cidx, n, maybe_all_clear);
+  }
+  return mi_bchunk_try_clearNX(chunk, cidx, n, maybe_all_clear);
+}
+
+
+// ------- mi_bchunk_try_find_and_clear ---------------------------------------
+
+#if MI_OPT_SIMD && defined(__AVX2__)
+// A 256-bit vector with all bits clear.
+mi_decl_maybe_unused static inline __m256i mi_mm256_zero(void) {
+  const __m256i zero = _mm256_setzero_si256();
+  return zero;
+}
+// A 256-bit vector with all bits set (`~0` broadcast to each 64-bit lane).
+mi_decl_maybe_unused static inline __m256i mi_mm256_ones(void) {
+  const __m256i ones = _mm256_set1_epi64x(~0);
+  return ones;
+}
+// Are all bits of `vec` set? (tests `vec` against an all-ones vector made by comparing `vec` with itself)
+mi_decl_maybe_unused static inline bool mi_mm256_is_ones(__m256i vec) {
+  const __m256i all_ones = _mm256_cmpeq_epi32(vec, vec);
+  return _mm256_testc_si256(vec, all_ones);
+}
+// Are all bits of `vec` clear?
+mi_decl_maybe_unused static inline bool mi_mm256_is_zero( __m256i vec) {
+  const bool is_zero = _mm256_testz_si256(vec,vec);
+  return is_zero;
+}
 #endif
-  size_t m = (mask << bitidx);     // invariant: m == mask shifted by bitidx
-
-  // scan linearly for a free range of zero bits
-  while (bitidx <= bitidx_max) {
-    const size_t mapm = map & m;
-    if (mapm == 0) {  // are the mask bits free at bitidx?
-      mi_assert_internal((m >> bitidx) == mask); // no overflow?
-      const size_t newmap = map | m;
-      mi_assert_internal((newmap^map) >> bitidx == mask);
-      if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) {  // TODO: use strong cas here?
-        // no success, another thread claimed concurrently.. keep going (with updated `map`)
-        continue;
+
+// Try to find the least set bit in the bfield with index `chunk_idx` of the chunk and clear
+// it atomically. On success `*pidx` is set to the bit index within the chunk
+// (`*pidx < MI_BCHUNK_BITS`) and `true` is returned. Returns `false` (leaving `*pidx` as is)
+// if the field has no set bit or if the bit could not be cleared atomically.
+static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) {
+  mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+  // note: this must be acquire (and not relaxed), or otherwise the AVX code below can loop forever
+  // as the compiler won't reload the registers vec1 and vec2 from memory again.
+  const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]);
+  size_t idx;
+  if (!mi_bfield_find_least_bit(b, &idx)) {
+    return false;   // no bit is set in this field
+  }
+  const mi_bfield_t bit = mi_bfield_mask(1,idx);
+  if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], bit, b, NULL)) {  // clear it atomically
+    const size_t cidx = (chunk_idx*MI_BFIELD_BITS) + idx;
+    *pidx = cidx;
+    mi_assert_internal(cidx < MI_BCHUNK_BITS);
+    return true;
+  }
+  return false;
+}
+
+// Find least 1-bit in a chunk and try to clear it atomically
+// set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
+// This is used to find free slices and abandoned pages and should be efficient.
+// Returns `false` if no set bit was found, or if clearing kept failing: the SIMD variants
+// give up after 4 tries, and the generic variant visits every bfield once.
+// todo: try neon version
+static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) {
+  #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
+  // AVX2, 256-bit chunk: a single vector load covers all bfields
+  for(int tries=0; tries<4; tries++) {   // paranoia: at most 4 tries
+    const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+    const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF  : 0)
+    const uint32_t mask = ~_mm256_movemask_epi8(vcmp);  // mask of most significant bit of each byte (so each 8 bits are all set or clear)
+    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
+    if (mask==0) return false;
+    mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24
+    const size_t chunk_idx = _tzcnt_u32(mask) / 8;
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
+    // try again
+    // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
+    // we add an explicit memory barrier as older gcc compilers do not reload the registers even with an atomic acquire (issue #1206)
+    #if defined(__GNUC__)
+    __asm __volatile ("" : : "g"(chunk) : "memory");
+    #endif
+  }
+  #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+  // AVX2, 512-bit chunk: two vector loads cover all bfields
+  for(int tries=0; tries<4; tries++) {   // paranoia: at most 4 tries
+    size_t chunk_idx = 0;
+    #if 0
+    // one vector at a time
+    __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+    if (mi_mm256_is_zero(vec)) {
+      chunk_idx += 4;
+      vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1);
+    }
+    const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF  : 0)
+    const uint32_t mask = ~_mm256_movemask_epi8(vcmp);  // mask of most significant bit of each byte (so each 8 bits are all set or clear)
+    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
+    if (mask==0) return false;
+    mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24
+    chunk_idx += _tzcnt_u32(mask) / 8;
+    #else
+    // a cache line is 64b so we can just as well load all at the same time
+    const __m256i vec1  = _mm256_load_si256((const __m256i*)chunk->bfields);
+    const __m256i vec2  = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
+    const __m256i cmpv  = mi_mm256_zero();
+    const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == 0 ? 0xFF  : 0)
+    const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == 0 ? 0xFF  : 0)
+    const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1);  // mask of most significant bit of each byte (so each 8 bits are all set or clear)
+    const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp2);  // mask of most significant bit of each byte (so each 8 bits are all set or clear)
+    const uint64_t mask = ((uint64_t)mask2 << 32) | mask1;
+    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
+    if (mask==0) return false;
+    mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
+    chunk_idx = mi_ctz(mask) / 8;
+    #endif
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
+    // try again
+    // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
+    // we add an explicit memory barrier as older gcc compilers do not reload the registers even with an atomic acquire (issue #1206)
+    #if defined(__GNUC__)
+    __asm __volatile ("" : : "g"(chunk) : "memory");
+    #endif
+  }
+  #elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64
+  // ARM64 NEON, 512-bit chunk: reduce the 8 bfields to one byte each (see `mask` below)
+  for(int tries=0; tries<4; tries++) {   // paranoia: at most 4 tries
+    // a cache line is 64b so we can just as well load all at the same time (?)
+    const uint64x2_t vzero1_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields));        // 2x64 bit is_zero
+    const uint64x2_t vzero1_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 2));    // 2x64 bit is_zero
+    const uint64x2_t vzero2_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 4));    // 2x64 bit is_zero
+    const uint64x2_t vzero2_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 6));    // 2x64 bit is_zero
+    const uint32x4_t vzero1    = vuzp1q_u32(vreinterpretq_u32_u64(vzero1_lo),vreinterpretq_u32_u64(vzero1_hi)); // unzip even elements: narrow to 4x32 bit is_zero ()
+    const uint32x4_t vzero2    = vuzp1q_u32(vreinterpretq_u32_u64(vzero2_lo),vreinterpretq_u32_u64(vzero2_hi)); // unzip even elements: narrow to 4x32 bit is_zero ()
+    const uint32x4_t vzero1x   = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero1), 24));        // shift-right 2x32bit elem by 24: lo 16 bits contain the 2 lo bytes
+    const uint32x4_t vzero2x   = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero2), 24));
+    const uint16x8_t vzero12   = vreinterpretq_u16_u32(vuzp1q_u32(vzero1x,vzero2x));                           // unzip even 32-bit elements into one vector
+    const uint8x8_t  vzero     = vmovn_u16(vzero12);                                                           // narrow the bottom 16-bits
+    const uint64_t mask = ~vget_lane_u64(vreinterpret_u64_u8(vzero), 0);  // 1 byte for each bfield (0xFF => bfield has a bit set)
+    if (mask==0) return false;
+    mi_assert_internal((mi_ctz(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
+    const size_t chunk_idx = mi_ctz(mask) / 8;
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
+    // try again
+    // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
+    // we add an explicit memory barrier as older gcc compilers do not reload the registers even with an atomic acquire (issue #1206)
+    #if defined(__GNUC__)
+    __asm __volatile ("" : : "g"(chunk) : "memory");
+    #endif
+  }
+  #else
+  // generic fallback: try each bfield in order
+  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx)) return true;
+  }
+  #endif
+  return false;  
+}
+
+// Variant of `mi_bchunk_try_find_and_clear` that also takes a bit count `n`;
+// `n` must be 1 and is otherwise unused.
+static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+  mi_assert_internal(n==1);
+  MI_UNUSED(n);
+  const bool found = mi_bchunk_try_find_and_clear(chunk, pidx);
+  return found;
+}
+
+// Try to find the least aligned byte with all bits set in the bfield with index `chunk_idx`,
+// and clear it atomically. On success `*pidx` is set to the bit index of that byte within the
+// chunk (`*pidx + 8 <= MI_BCHUNK_BITS`) and `true` is returned; otherwise returns `false`.
+mi_decl_maybe_unused static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) {
+  const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
+  // compute `has_set8`, which has the low bit in each byte set if that byte in `b` == 0xFF
+  const mi_bfield_t below  = (~b - MI_BFIELD_LO_BIT8);   // high bit set if byte in b is 0xFF or < 0x7F
+  const mi_bfield_t hi_set = (b  & MI_BFIELD_HI_BIT8);   // high bit set if byte in b is >= 0x80
+  const mi_bfield_t has_set8 = (below & hi_set) >> 7;    // shift high bit to low bit
+  size_t idx;
+  if (!mi_bfield_find_least_bit(has_set8, &idx)) {  // find least 1-bit
+    return false;
+  }
+  mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
+  mi_assert_internal((idx%8)==0);
+  const mi_bfield_t byte_mask = (mi_bfield_t)0xFF << idx;
+  if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], byte_mask, b, NULL)) {  // unset the byte atomically
+    const size_t cidx = (chunk_idx*MI_BFIELD_BITS) + idx;
+    *pidx = cidx;
+    mi_assert_internal(cidx + 8 <= MI_BCHUNK_BITS);
+    return true;
+  }
+  return false;
+}
+
+// find least aligned byte in a chunk with all bits set, and try unset it atomically
+// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
+// Used to find medium size pages in the free blocks.
+// Returns `false` if no aligned byte with all bits set was found.
+// todo: try neon version
+static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) {
+  #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+  while (true) {
+    // since a cache-line is 64b, load all at once
+    const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
+    const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1);
+    const __m256i cmpv = mi_mm256_ones();
+    const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0)
+    const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 0xFF : 0)
+    const uint32_t mask1 = _mm256_movemask_epi8(vcmp1);    // mask of most significant bit of each byte
+    const uint32_t mask2 = _mm256_movemask_epi8(vcmp2);    // mask of most significant bit of each byte
+    const uint64_t mask = ((uint64_t)mask2 << 32) | mask1;
+    // each bit in `mask` is set iff the corresponding byte in the chunk has all bits set (and thus can be cleared)
+    if (mask==0) return false;
+    const size_t bidx = _tzcnt_u64(mask);          // byte-idx of the byte in the chunk
+    const size_t chunk_idx = bidx / 8;
+    const size_t idx = (bidx % 8)*8;
+    mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+    if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) {  // clear it atomically
+      *pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
+      mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
+      return true;
+    }
+    // try again
+    // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded.
+    // `mi_bfield_atomic_try_clear8` can fail after just a relaxed load (without performing a cas), so we add
+    // an explicit compiler memory barrier to force `vec1` and `vec2` to be reloaded from memory; older gcc
+    // compilers do not reload the registers even with an atomic acquire (issue #1206). Without it this
+    // unbounded loop could spin forever on stale register contents.
+    #if defined(__GNUC__)
+    __asm __volatile ("" : : "g"(chunk) : "memory");
+    #endif
+  }
+  #else
+    for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+      if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true;
+    }
+    return false;
+  #endif
+}
+
+// Variant of `mi_bchunk_try_find_and_clear8` that also takes a bit count `n`;
+// `n` must be 8 and is otherwise unused.
+static inline bool mi_bchunk_try_find_and_clear_8(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+  mi_assert_internal(n==8);
+  MI_UNUSED(n);
+  const bool found = mi_bchunk_try_find_and_clear8(chunk, pidx);
+  return found;
+}
+
+
+// find a sequence of `n` bits in a chunk with `0 < n <= MI_BFIELD_BITS` with all bits set,
+// and try to clear them atomically.
+// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
+// will cross bfield boundaries.
+// Returns `false` if `n` is out of range or if no sequence could be found and cleared.
+// Each bfield is first searched for a run of `n` set bits inside the field; if none is claimed,
+// the set bits at the top of the field are combined with the set bits at the bottom of the next field.
+mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+  if (n == 0 || n > MI_BFIELD_BITS) return false;
+  const mi_bfield_t mask = mi_bfield_mask(n, 0);
+  // for all fields in the chunk
+  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    // `b0` is the value as loaded from memory (used as the expected value when clearing);
+    // `b` is a working copy from which runs of ones that are too short are removed.
+    mi_bfield_t b0 = mi_atomic_load_relaxed(&chunk->bfields[i]);
+    mi_bfield_t b = b0;
+    size_t idx;
+
+    // is there a range inside the field?
+    while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit
+      if (idx + n > MI_BFIELD_BITS) break; // too short: maybe cross over, or continue with the next field
+
+      const size_t bmask = mask<<idx;
+      mi_assert_internal(bmask>>idx == mask);
+      if ((b&bmask) == bmask) { // found a match with all bits set, try clearing atomically
+        if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[i], bmask, b0, NULL)) {
+          *pidx = (i*MI_BFIELD_BITS) + idx;
+          mi_assert_internal(*pidx < MI_BCHUNK_BITS);
+          mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS);
+          return true;
+        }
+        else {
+          // if we failed to atomically commit, reload b and try again from the start
+          b = b0 = mi_atomic_load_acquire(&chunk->bfields[i]);
+        }
       }
       else {
-        // success, we claimed the bits!
-        *bitmap_idx = mi_bitmap_index_create(idx, bitidx);
+        // advance by clearing the least run of ones, for example, with n>=4, idx=2:
+        // b             = 1111 1101 1010 1100
+        // .. + (1<<idx) = 1111 1101 1011 0000
+        // .. & b        = 1111 1101 1010 0000
+        b = b & (b + (mi_bfield_one() << idx));
+      }
+    }
+
+    // check if we can cross into the next bfield
+    if (b!=0 && i < MI_BCHUNK_FIELDS-1) {
+      // `post` is the count of leading one bits of `b` (via the leading zeros of `~b`), and
+      // `pre` is the count of trailing one bits of the next field (via the trailing zeros of its complement)
+      const size_t post = mi_bfield_clz(~b);
+      if (post > 0) {
+        const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1]));
+        if (post + pre >= n) {
+          // it fits -- try to claim it atomically
+          const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post);
+          if (mi_bchunk_try_clearNX(chunk, cidx, n, NULL)) {
+            // we cleared all atomically
+            *pidx = cidx;
+            mi_assert_internal(*pidx < MI_BCHUNK_BITS);
+            mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS);
+            return true;
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+// find a sequence of `n` bits in a chunk with `n <= MI_BCHUNK_BITS` with all bits set,
+// and try to clear them atomically.
+// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
+// This can cross bfield boundaries. Only ranges that begin at the run of set bits at the top of a bfield are considered.
+static mi_decl_noinline bool mi_bchunk_try_find_and_clearNC(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+  if (n == 0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
+
+  // we first scan ahead to see if there is a range of `n` set bits, and only then try to clear atomically
+  mi_assert_internal(n>0);
+  const size_t skip_count = (n-1)/MI_BFIELD_BITS;  // a range of `n` bits cannot start in the last `skip_count` fields
+  size_t cidx;
+  for (size_t i = 0; i < MI_BCHUNK_FIELDS - skip_count; i++)
+  {
+    size_t m = n;   // bits to go
+
+    // first field
+    mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
+    size_t ones = mi_bfield_clz(~b);  // number of consecutive set bits at the top of the field
+
+    cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - ones);  // start index
+    if (ones >= m) {
+      // we found enough bits already!
+      m = 0;
+    }
+    else if (ones > 0) {
+      // keep scanning further fields until we have enough bits
+      m -= ones;
+      size_t j = 1;   // field count from i
+      while (i+j < MI_BCHUNK_FIELDS) {
+        mi_assert_internal(m > 0);
+        b = mi_atomic_load_relaxed(&chunk->bfields[i+j]);
+        ones = mi_bfield_ctz(~b);  // number of consecutive set bits at the bottom of the field
+        if (ones >= m) {
+          // we found enough bits
+          m = 0;
+          break;
+        }
+        else if (ones == MI_BFIELD_BITS) {
+          // not enough yet, proceed to the next field
+          j++;
+          m -= MI_BFIELD_BITS;
+        }
+        else {
+          // the range was not enough, start from scratch
+          i = i + j - 1;  // no need to re-scan previous fields, except the last one (with clz this time)
+          mi_assert_internal(m>0);
+          break;
+        }
+      }
+    }
+
+    // did we find a range?
+    if (m==0) {
+      if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) {
+        // we cleared all atomically
+        *pidx = cidx;
+        mi_assert_internal(*pidx < MI_BCHUNK_BITS);
+        mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS);
         return true;
       }
+      // note: if we fail for a small `n` on the first field, we don't rescan that field (as `i` is incremented)
     }
-    else {
-      // on to the next bit range
-#ifdef MI_HAVE_FAST_BITSCAN
-      const size_t shift = (count == 1 ? 1 : mi_bsr(mapm) - bitidx + 1);
-      mi_assert_internal(shift > 0 && shift <= count);
+    // otherwise continue searching
+  }
+  return false;
+}
+
+
+
+// ------- mi_bchunk_clear_once_set ---------------------------------------
+
+static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) {  // applies `mi_bfield_atomic_clear_once_set` to bit `cidx` of the chunk
+  mi_assert_internal(cidx < MI_BCHUNK_BITS);
+  const size_t i = cidx / MI_BFIELD_BITS;    // field that holds the bit
+  const size_t idx = cidx % MI_BFIELD_BITS;  // bit index inside that field
+  mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx);
+}
+
+
+// ------- mi_bitmap_all_are_clear ---------------------------------------
+
+
+// are all bits in a bitmap chunk clear? (the fallback path reads the fields one at a time with relaxed loads)
+static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
+  #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
+  const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+  return mi_mm256_is_zero(vec);
+  #elif MI_OPT_SIMD &&  defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+  // a 64b cache-line contains the entire chunk anyway so load both at once
+  const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
+  const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
+  return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2)));  // the OR of both halves is zero only if both halves are zero
+  #elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64
+  const uint64x2_t v0 = vld1q_u64((uint64_t*)chunk->bfields);
+  const uint64x2_t v1 = vld1q_u64((uint64_t*)chunk->bfields + 2);
+  const uint64x2_t v2 = vld1q_u64((uint64_t*)chunk->bfields + 4);
+  const uint64x2_t v3 = vld1q_u64((uint64_t*)chunk->bfields + 6);
+  const uint64x2_t v  = vorrq_u64(vorrq_u64(v0,v1),vorrq_u64(v2,v3));
+  return (vmaxvq_u32(vreinterpretq_u32_u64(v)) == 0);  // the largest 32-bit lane of the OR is zero only if everything is zero
+  #else
+  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false;
+  }
+  return true;
+  #endif
+}
+
+// are all bits in a bitmap chunk set? (the fallback path reads the fields one at a time with relaxed loads)
+static inline bool mi_bchunk_all_are_set_relaxed(mi_bchunk_t* chunk) {
+#if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
+  const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+  return mi_mm256_is_ones(vec);
+#elif MI_OPT_SIMD &&  defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+  // a 64b cache-line contains the entire chunk anyway so load both at once
+  const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
+  const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
+  return (mi_mm256_is_ones(_mm256_and_si256(vec1, vec2)));  // the AND of both halves is all ones only if both halves are all ones
+#elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64
+  const uint64x2_t v0 = vld1q_u64((uint64_t*)chunk->bfields);
+  const uint64x2_t v1 = vld1q_u64((uint64_t*)chunk->bfields + 2);
+  const uint64x2_t v2 = vld1q_u64((uint64_t*)chunk->bfields + 4);
+  const uint64x2_t v3 = vld1q_u64((uint64_t*)chunk->bfields + 6);
+  const uint64x2_t v  = vandq_u64(vandq_u64(v0,v1),vandq_u64(v2,v3));
+  return (vminvq_u32(vreinterpretq_u32_u64(v)) == 0xFFFFFFFFUL);  // the smallest 32-bit lane of the AND is all ones only if everything is set
 #else
-      const size_t shift = 1;
+  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    if (~mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false;
+  }
+  return true;
 #endif
-      bitidx += shift;
-      m <<= shift;
+}
+
+
+static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) {  // find the highest set bit in the chunk; scans the fields from high to low
+  for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) {
+    i--;  // decrement inside the body so the unsigned index also visits field 0
+    mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
+    size_t idx;
+    if (mi_bsr(b, &idx)) {
+      *pidx = (i*MI_BFIELD_BITS) + idx;
+      return true;
     }
   }
-  // no bits found
   return false;
 }
 
-// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-// `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
-bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
-  size_t idx = start_field_idx;
-  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
-    if (idx >= bitmap_fields) idx = 0; // wrap
-    if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+static bool mi_bchunk_bsr_inv(mi_bchunk_t* chunk, size_t* pidx) {  // find the highest clear bit in the chunk (bit scan on the inverted fields)
+  for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) {
+    i--;  // decrement inside the body so the unsigned index also visits field 0
+    mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
+    size_t idx;
+    if (mi_bsr(~b, &idx)) {
+      *pidx = (i*MI_BFIELD_BITS) + idx;
       return true;
     }
   }
   return false;
 }
 
-/*
-// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
-// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
-bool _mi_bitmap_try_find_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t count, mi_bitmap_index_t* bitmap_idx) {
-  return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, 0, count, bitmap_idx);
+static size_t mi_bchunk_popcount(mi_bchunk_t* chunk) {  // sum of `mi_bfield_popcount` over all fields of the chunk
+  size_t popcount = 0;
+  for (size_t i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);  // each field is loaded separately (relaxed)
+    popcount += mi_bfield_popcount(b);
+  }
+  return popcount;
+}
+
+
+/* --------------------------------------------------------------------------------
+ bitmap chunkmap
+-------------------------------------------------------------------------------- */
+
+static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) {  // set the chunkmap bit for chunk `chunk_idx`
+  mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap));
+  mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL);  // the chunkmap is itself a chunk with one bit per bitmap chunk
+}
+
+static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) {  // returns `true` only if the chunk was observed all clear both before and after clearing its chunkmap bit
+  mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap));
+  // check if the corresponding chunk is all clear
+  if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) return false;
+  // clear the chunkmap bit
+  mi_bchunk_clear(&bitmap->chunkmap, chunk_idx, NULL);
+  // .. but a concurrent set may have happened in between our all-clear test and the clearing of the
+  // bit in the mask. We check again to catch this situation (and restore the chunkmap bit if so).
+  if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) {
+    mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL);
+    return false;
+  }
+  return true;
 }
-*/
 
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
-  const size_t idx = mi_bitmap_index_field(bitmap_idx);
-  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
-  const size_t mask = mi_bitmap_mask_(count, bitidx);
-  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
-  // mi_assert_internal((bitmap[idx] & mask) == mask);
-  size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
-  return ((prev & mask) == mask);
+
+/* --------------------------------------------------------------------------------
+  bitmap
+-------------------------------------------------------------------------------- */
+
+size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) {  // size of a bitmap with `bit_count` bits (header up to `chunks` plus the chunks); optionally reports the chunk count
+  mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0);
+  bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS);  // in builds without assertions an unaligned count is rounded up
+  mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT);
+  mi_assert_internal(bit_count > 0);
+  const size_t chunk_count = bit_count / MI_BCHUNK_BITS;
+  mi_assert_internal(chunk_count >= 1);
+  const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE);
+  mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 );
+  if (pchunk_count != NULL) { *pchunk_count = chunk_count;  }  // `pchunk_count` may be NULL
+  return size;
 }
 
 
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
-  const size_t idx = mi_bitmap_index_field(bitmap_idx);
-  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
-  const size_t mask = mi_bitmap_mask_(count, bitidx);
-  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
-  //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
-  size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
-  if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
-  return ((prev & mask) == 0);
+// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true
+// returns the size of the bitmap (as computed by `mi_bitmap_size` for `bit_count`)
+size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) {
+  size_t chunk_count;
+  const size_t size = mi_bitmap_size(bit_count, &chunk_count);
+  if (!already_zero) {
+    _mi_memzero_aligned(bitmap, size);  // zeroes the whole bitmap, header included
+  }
+  mi_atomic_store_release(&bitmap->chunk_count, chunk_count);  // stored after the zeroing so it is not overwritten
+  mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT);
+  return size;
 }
 
-// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
-static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
-  const size_t idx = mi_bitmap_index_field(bitmap_idx);
-  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
-  const size_t mask = mi_bitmap_mask_(count, bitidx);
-  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
-  size_t field = mi_atomic_load_relaxed(&bitmap[idx]);
-  if (any_ones != NULL) *any_ones = ((field & mask) != 0);
-  return ((field & mask) == mask);
+
+// Set a sequence of `n` bits in the bitmap (and can cross chunks) and mark every touched chunk in the chunkmap `cmap`. Not atomic so only use if local to a thread.
+static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) {
+  mi_assert_internal(n>0);
+
+  // start chunk and index
+  size_t chunk_idx = idx / MI_BCHUNK_BITS;
+  const size_t cidx = idx % MI_BCHUNK_BITS;
+  const size_t ccount = _mi_divide_up(cidx + n, MI_BCHUNK_BITS);  // chunks touched: include the start offset as an unaligned range can spill into one more chunk
+
+  // first update the chunkmap
+  mi_bchunk_setN(cmap, chunk_idx, ccount, NULL);
+
+  // first chunk
+  size_t m = MI_BCHUNK_BITS - cidx;  // bits available in the first chunk from `cidx` on
+  if (m > n) { m = n; }
+  mi_bchunk_setN(&chunks[chunk_idx], cidx, m, NULL);
+
+  // n can be large so use memset for efficiency for all in-between chunks
+  chunk_idx++;
+  n -= m;
+  const size_t mid_chunks = n / MI_BCHUNK_BITS;  // number of chunks that are covered entirely
+  if (mid_chunks > 0) {
+    _mi_memset(&chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE);
+    chunk_idx += mid_chunks;
+    n -= (mid_chunks * MI_BCHUNK_BITS);
+  }
+
+  // last chunk
+  if (n > 0) {
+    mi_assert_internal(n < MI_BCHUNK_BITS);
+    mi_assert_internal(chunk_idx < MI_BCHUNK_BITS);  // a chunk index (not a field index): bounded by the number of bits in the chunkmap
+    mi_bchunk_setN(&chunks[chunk_idx], 0, n, NULL);
+  }
 }
 
-bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
-  return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+// Set a sequence of `n` bits in the bitmap (and can cross chunks), updating the bitmap's chunkmap as well. Not atomic so only use if local to a thread.
+void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
+  mi_assert_internal(n>0);
+  mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap));
+  mi_bchunks_unsafe_setN(&bitmap->chunks[0], &bitmap->chunkmap, idx, n);
 }
 
-bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
-  bool any_ones;
-  mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
-  return any_ones;
+
+
+
+// ------- mi_bitmap_xset ---------------------------------------
+
+// Set a sequence of `n` bits in the bitmap (chunk by chunk); returns `true` if all bits transitioned from 0's to 1's. If `palready_set` is not NULL it receives the summed counts reported by `mi_bchunk_setN`.
+bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* palready_set) {
+  mi_assert_internal(n>0);
+  const size_t maxbits = mi_bitmap_max_bits(bitmap);
+  mi_assert_internal(idx + n <= maxbits);
+  if (idx+n > maxbits) { // paranoia: clamp the range to the bitmap (or bail out if it starts beyond it)
+    if (idx >= maxbits) return false;
+    n = maxbits - idx;
+  }
+
+  // iterate through the chunks
+  size_t chunk_idx = idx / MI_BCHUNK_BITS;
+  size_t cidx = idx % MI_BCHUNK_BITS;
+  bool were_allclear = true;
+  size_t already_set = 0;
+  while (n > 0) {
+    const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);  // bits to set in this chunk
+    size_t _already_set = 0;
+    were_allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, m, &_already_set) && were_allclear;
+    already_set += _already_set;
+    mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards
+    mi_assert_internal(m <= n);
+    n -= m;
+    cidx = 0;  // every following chunk starts at its first bit
+    chunk_idx++;
+  }
+  if (palready_set != NULL) { *palready_set = already_set;  }
+  return were_allclear;
+}
+
+// Clear a sequence of `n` bits in the bitmap (chunk by chunk); returns `true` if atomically transitioned from 1's to 0's.
+bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
+  mi_assert_internal(n>0);
+  const size_t maxbits = mi_bitmap_max_bits(bitmap);
+  mi_assert_internal(idx + n <= maxbits);
+  if (idx+n > maxbits) { // paranoia: clamp the range to the bitmap (or bail out if it starts beyond it)
+    if (idx >= maxbits) return false;
+    n = maxbits - idx;
+  }
+
+  // iterate through the chunks
+  size_t chunk_idx = idx / MI_BCHUNK_BITS;
+  size_t cidx = idx % MI_BCHUNK_BITS;
+  bool were_allset = true;
+  while (n > 0) {
+    const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);  // bits to clear in this chunk
+    bool maybe_all_clear = false;
+    were_allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, m, &maybe_all_clear) && were_allset;
+    if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); }  // the chunk may be empty now: try to clear its chunkmap bit as well
+    mi_assert_internal(m <= n);
+    n -= m;
+    cidx = 0;  // every following chunk starts at its first bit
+    chunk_idx++;
+  }
+  return were_allset;
 }
 
+// Count bits set in a range of `n` bits starting at `idx` (summed chunk by chunk; returns 0 if `idx` is beyond the bitmap).
+size_t mi_bitmap_popcountN( mi_bitmap_t* bitmap, size_t idx, size_t n) {
+  mi_assert_internal(n>0);
+  const size_t maxbits = mi_bitmap_max_bits(bitmap);
+  mi_assert_internal(idx + n <= maxbits);
+  if (idx+n > maxbits) { // paranoia: clamp the range to the bitmap
+    if (idx >= maxbits) return 0;
+    n = maxbits - idx;
+  }
 
-//--------------------------------------------------------------------------
-// the `_across` functions work on bitmaps where sequences can cross over
-// between the fields. This is used in arena allocation
-//--------------------------------------------------------------------------
+  // iterate through the chunks
+  size_t chunk_idx = idx / MI_BCHUNK_BITS;
+  size_t cidx = idx % MI_BCHUNK_BITS;
+  size_t popcount = 0;
+  while (n > 0) {
+    const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);  // bits to count in this chunk
+    popcount += mi_bchunk_popcountN(&bitmap->chunks[chunk_idx], cidx, m);
+    mi_assert_internal(m <= n);
+    n -= m;
+    cidx = 0;  // every following chunk starts at its first bit
+    chunk_idx++;
+  }
+  return popcount;
+}
 
-// Try to atomically claim a sequence of `count` bits starting from the field 
-// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success.
-static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx)
+
+// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0). Both are the `n == 1` case of `mi_bitmap_setN`/`mi_bitmap_clearN`.
+bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) {
+  return mi_bitmap_setN(bitmap, idx, 1, NULL);
+}
+
+bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) {  // clear a single bit: `mi_bitmap_clearN` with `n == 1`
+  return mi_bitmap_clearN(bitmap, idx, 1);
+}
+
+
+
+// ------- mi_bitmap_is_xset ---------------------------------------
+
+// Is a sequence of n bits already all set/cleared? (`set` selects which; checked chunk by chunk, stopping at the first chunk that fails)
+bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) {
+  mi_assert_internal(n>0);
+  const size_t maxbits = mi_bitmap_max_bits(bitmap);
+  mi_assert_internal(idx + n <= maxbits);
+  if (idx+n > maxbits) { // paranoia: clamp the range to the bitmap (a start beyond the bitmap yields `false`)
+    if (idx >= maxbits) return false;
+    n = maxbits - idx;
+  }
+
+  // iterate through the chunks
+  size_t chunk_idx = idx / MI_BCHUNK_BITS;
+  size_t cidx = idx % MI_BCHUNK_BITS;
+  bool xset = true;
+  while (n > 0 && xset) {
+    const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);  // bits to check in this chunk
+    xset = mi_bchunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m) && xset;
+    mi_assert_internal(m <= n);
+    n -= m;
+    cidx = 0;  // every following chunk starts at its first bit
+    chunk_idx++;
+  }
+  return xset;
+}
+
+bool mi_bitmap_is_all_clear(mi_bitmap_t* bitmap) {  // are all `mi_bitmap_max_bits` bits of the bitmap clear?
+  return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, 0, mi_bitmap_max_bits(bitmap));
+}
+
+/* --------------------------------------------------------------------------------
+  Iterate through a bfield
+-------------------------------------------------------------------------------- */
+
+// Cycle iteration through a bitfield. This is used to space out threads
+// so there is less chance of contention. When searching for a free page we
+// like to first search only the accessed part (so we reuse better). This
+// high point is called the `cycle`.
+//
+// We then iterate through the bitfield as:
+// first: [start, cycle>
+// then : [0, start>
+// then : [cycle, MI_BFIELD_BITS>
+//
+// The start is determined usually as `tseq % cycle` to have each thread
+// start at a different spot. (`[a, b>` denotes the half-open range a <= i < b.)
+// - We use `popcount` to improve branch prediction (maybe not needed? can we simplify?)
+// - The `cycle_mask` is the part `[start, cycle>`; `name_idx` must be a `size_t` variable declared by the caller, and `SUF` is pasted onto the macro's local names.
+#define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \
+  mi_assert_internal(start <= cycle); \
+  mi_assert_internal(start < MI_BFIELD_BITS); \
+  mi_assert_internal(cycle <= MI_BFIELD_BITS); \
+  const mi_bfield_t _cycle_mask##SUF = mi_bfield_mask(cycle - start, start); \
+  size_t _bcount##SUF = mi_bfield_popcount(bfield); \
+  mi_bfield_t _b##SUF = bfield & _cycle_mask##SUF; /* process [start, cycle> first*/\
+  while(_bcount##SUF > 0) { \
+    _bcount##SUF--;\
+    if (_b##SUF==0) { _b##SUF = bfield & ~_cycle_mask##SUF; } /* process [0,start> + [cycle, MI_BFIELD_BITS> next */ \
+    /* size_t name_idx; (not declared here: the caller declares it) */ \
+    const bool _found##SUF = mi_bfield_find_least_bit(_b##SUF,&name_idx); \
+    _b##SUF = mi_bfield_clear_least_bit(_b##SUF); /* clear early so `continue` works */ \
+    mi_assert_internal(_found##SUF); MI_UNUSED(_found##SUF); \
+    { \
+
+#define mi_bfield_iterate_end(SUF) \
+    } \
+  } \
+}
+
+
+#define mi_bfield_cycle_iterate(bfield,tseq,cycle,name_idx,SUF) { \
+  const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); /* `cycle` must be non-zero here; or: 0 to always search from the start? */\
+  mi_bfield_iterate(bfield,_start##SUF,cycle,name_idx,SUF)
+
+#define mi_bfield_cycle_iterate_end(SUF) \
+  mi_bfield_iterate_end(SUF); \
+}
+
+
+/* --------------------------------------------------------------------------------
+  mi_bitmap_find
+  (used to find free pages)
+-------------------------------------------------------------------------------- */
+
+typedef bool (mi_bitmap_visit_fun_t)(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* idx, void* arg1, void* arg2);  // visitor for `mi_bitmap_find`; returning `true` stops the search
+
+// Go through the bitmap and call the visitor function `on_find` for every chunk that is marked in the chunkmap (it is up to the visitor to search that chunk for `n` set bits).
+// If it returns `true` stop the search (and return `true`); returns `false` if no visitor call succeeded. `pidx`, `arg1` and `arg2` are passed through to the visitor.
+static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bitmap_visit_fun_t* on_find, void* arg1, void* arg2)
 {
-  mi_assert_internal(bitmap_idx != NULL);
-  
-  // check initial trailing zeros
-  mi_bitmap_field_t* field = &bitmap[idx];
-  size_t map = mi_atomic_load_relaxed(field);  
-  const size_t initial = mi_clz(map);  // count of initial zeros starting at idx
-  mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
-  if (initial == 0)     return false;
-  if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx);     // no need to cross fields
-  if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries
-
-  // scan ahead
-  size_t found = initial;
-  size_t mask = 0;     // mask bits for the final field
-  while(found < count) {
-    field++;
-    map = mi_atomic_load_relaxed(field);
-    const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
-    mask = mi_bitmap_mask_(mask_bits, 0);
-    if ((map & mask) != 0) return false;
-    found += mask_bits;
-  }
-  mi_assert_internal(field < &bitmap[bitmap_fields]);
-
-  // found range of zeros up to the final field; mask contains mask in the final field
-  // now claim it atomically
-  mi_bitmap_field_t* const final_field = field;
-  const size_t final_mask = mask;
-  mi_bitmap_field_t* const initial_field = &bitmap[idx];
-  const size_t initial_mask = mi_bitmap_mask_(initial, MI_BITMAP_FIELD_BITS - initial);
-
-  // initial field
-  size_t newmap;
-  field = initial_field;
-  map = mi_atomic_load_relaxed(field);
-  do {
-    newmap = map | initial_mask;
-    if ((map & initial_mask) != 0) { goto rollback; };
-  } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
-  
-  // intermediate fields
-  while (++field < final_field) {
-    newmap = MI_BITMAP_FIELD_FULL;
-    map = 0;
-    if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; }
-  }
-  
-  // final field
-  mi_assert_internal(field == final_field);
-  map = mi_atomic_load_relaxed(field);
-  do {
-    newmap = map | final_mask;
-    if ((map & final_mask) != 0) { goto rollback; }
-  } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+  const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);  // number of chunkmap fields in use
+  for (size_t i = 0; i < chunkmap_max; i++) {
+    // and for each chunkmap entry we iterate over its bits to find the chunks
+    const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
+    size_t hi;
+    if (mi_bfield_find_highest_bit(cmap_entry, &hi)) {  // only entered when the entry is non-zero; `hi+1` becomes the `cycle`
+      size_t eidx = 0;
+      mi_bfield_cycle_iterate(cmap_entry, tseq%8, hi+1, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`)
+      {
+        mi_assert_internal(eidx < MI_BFIELD_BITS);  // `eidx` is a bit index inside the entry, so it is strictly below the field width
+        const size_t chunk_idx = i*MI_BFIELD_BITS + eidx;
+        mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap));
+        if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) {
+          return true;
+        }
+      }
+      mi_bfield_cycle_iterate_end(Y);
+    }
+  }
+  return false;
+}
+
+
+/* --------------------------------------------------------------------------------
+  Bitmap: try_find_and_claim  -- used to allocate abandoned pages
+  note: the compiler will fully inline the indirect function call
+-------------------------------------------------------------------------------- */
 
-  // claimed!
-  *bitmap_idx = mi_bitmap_index_create(idx, MI_BITMAP_FIELD_BITS - initial);
+typedef struct mi_claim_fun_data_s {
+  mi_arena_t*   arena;  // passed through to the claim function
+} mi_claim_fun_data_t;
+
+static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2)
+{  // visitor for `mi_bitmap_find`: clear one set bit in the chunk and offer it to the claim function (`arg1`) with its data (`arg2`); sets `*pidx` and returns `true` on a successful claim
+  mi_assert_internal(n==1); MI_UNUSED(n);
+  mi_claim_fun_t* claim_fun = (mi_claim_fun_t*)arg1;
+  mi_claim_fun_data_t* claim_data = (mi_claim_fun_data_t*)arg2;
+  size_t cidx;
+  if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) {
+    const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx;
+    mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap));
+    bool keep_set = true;
+    if ((*claim_fun)(slice_index, claim_data->arena, &keep_set)) {
+      // success!
+      mi_assert_internal(!keep_set);
+      *pidx = slice_index;
+      return true;
+    }
+    else {
+      // failed to claim it, set abandoned mapping again (unless the page was freed)
+      if (keep_set) {
+        const bool wasclear = mi_bitmap_set(bitmap, slice_index);  // re-set through the bitmap so the chunkmap bit is restored too: it may have been cleared concurrently while our bit was 0
+        mi_assert_internal(wasclear); MI_UNUSED(wasclear);
+      }
+    }
+  }
+  else {
+    // we may find that all are cleared only on a second iteration but that is ok as
+    // the chunkmap is a conservative approximation.
+    mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx);
+  }
+  return false;
+}
+
+// Find a set bit in the bitmap and try to atomically clear it and claim it (through the `claim` function, which also receives `arena`).
+// (Used to find pages in the pages_abandoned bitmaps.) On success `*pidx` is the index of the claimed bit.
+mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx,
+  mi_claim_fun_t* claim, mi_arena_t* arena )
+{
+  mi_claim_fun_data_t claim_data = { arena };
+  return mi_bitmap_find(bitmap, tseq, 1, pidx, &mi_bitmap_try_find_and_claim_visit, (void*)claim, &claim_data);  // n == 1: the visitor claims single bits
+}
+
+
+bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx) {  // find the highest set bit in the bitmap and store its index in `*idx`; returns `false` if no set bit was found
+  const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
+  for (size_t i = chunkmap_max; i > 0; ) {
+    i--;  // decrement inside the body so the unsigned index also visits field 0
+    mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
+    size_t cmap_idx;
+    while (mi_bsr(cmap,&cmap_idx)) {  // the chunkmap is conservative (a marked chunk can be empty), so try every marked chunk from high to low
+      cmap &= ~(mi_bfield_one() << cmap_idx);  // drop this chunk from the local copy so a miss moves on to the next lower marked chunk
+      const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
+      size_t cidx;
+      if (mi_bchunk_bsr(&bitmap->chunks[chunk_idx], &cidx)) {
+        *idx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Return count of all set bits in a bitmap (only chunks marked in the chunkmap are counted).
+size_t mi_bitmap_popcount(mi_bitmap_t* bitmap) {
+  // for all chunkmap entries
+  size_t popcount = 0;
+  const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
+  for (size_t i = 0; i < chunkmap_max; i++) {
+    mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);  // local copy, handed to `mi_bfield_foreach_bit` by address
+    size_t cmap_idx;
+    // for each chunk (corresponding to a set bit in a chunkmap entry)
+    while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) {
+      const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
+      // count bits in a chunk
+      popcount += mi_bchunk_popcount(&bitmap->chunks[chunk_idx]);
+    }
+  }
+  return popcount;
+}
+
+
+
+// Clear a bit once it is set (delegates to `mi_bchunk_clear_once_set` on the chunk that holds bit `idx`).
+void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) {
+  mi_assert_internal(idx < mi_bitmap_max_bits(bitmap));
+  const size_t chunk_idx = idx / MI_BCHUNK_BITS;  // chunk that holds the bit
+  const size_t cidx = idx % MI_BCHUNK_BITS;       // bit index inside that chunk
+  mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap));
+  mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx);
+}
+
+
+// Visit all set bits in a bitmap: `visit` is called with each bit index and a count of 1; if it returns `false` the iteration stops and `false` is returned.
+// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore?
+bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) {
+  // for all chunkmap entries
+  const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
+  for(size_t i = 0; i < chunkmap_max; i++) {
+    mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
+    size_t cmap_idx;
+    // for each chunk (corresponding to a set bit in a chunkmap entry)
+    while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) {
+      const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
+      // for each chunk field
+      mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx];
+      for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
+        const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS);  // bitmap index of bit 0 of field `j`
+        mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]);  // the bits are visited from this local copy of the field
+        size_t bidx;
+        while (mi_bfield_foreach_bit(&b, &bidx)) {
+          const size_t idx = base_idx + bidx;
+          if (!visit(idx, 1, arena, arg)) return false;
+        }
+      }
+    }
+  }
   return true;
+}
 
-rollback: 
-  // roll back intermediate fields
-  while (--field > initial_field) {
-    newmap = 0;
-    map = MI_BITMAP_FIELD_FULL;
-    mi_assert_internal(mi_atomic_load_relaxed(field) == map);
-    mi_atomic_store_release(field, newmap);
+// Visit all set bits in a bitmap but try to return ranges (within bfields) if possible.
+// Also clear those ranges atomically (each bfield is exchanged with 0 before its ranges are visited).
+// Used by purging to purge larger ranges when possible; returns `false` as soon as a visit returns `false`.
+// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore?
+bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) {
+  // for all chunkmap entries
+  const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
+  for (size_t i = 0; i < chunkmap_max; i++) {
+    mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
+    size_t cmap_idx;
+    // for each chunk (corresponding to a set bit in a chunkmap entry)
+    while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) {
+      const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
+      // for each chunk field
+      mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx];
+      for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
+        const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS);
+        mi_bfield_t b = mi_atomic_exchange_relaxed(&chunk->bfields[j], 0);  // take all set bits and clear the field in one atomic step
+#if MI_DEBUG > 1
+        const size_t bpopcount = mi_popcount(b);
+        size_t rngcount = 0;
+#endif
+        size_t bidx;
+        while (mi_bfield_find_least_bit(b, &bidx)) {
+          size_t rng = mi_ctz(~(b>>bidx)); // length of the run of consecutive set bits starting at bidx
+#if MI_DEBUG > 1
+          rngcount += rng;
+#endif
+          const size_t idx = base_idx + bidx;
+          mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS);
+          mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= MI_BFIELD_BITS);
+          mi_assert_internal((idx / MI_BCHUNK_BITS) < mi_bitmap_chunk_count(bitmap));
+          if (!visit(idx, rng, arena, arg)) return false;  // NOTE(review): bits still left in `b` were already cleared by the exchange and are not restored on this path
+          // clear rng bits in b
+          b = b & ~mi_bfield_mask(rng, bidx);
+        }
+        mi_assert_internal(rngcount == bpopcount);
+      }
+    }
   }
-  if (field == initial_field) {
-    map = mi_atomic_load_relaxed(field);
-    do {
-      mi_assert_internal((map & initial_mask) == initial_mask);
-      newmap = map & ~initial_mask;
-    } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
-  }  
-  // retry? (we make a recursive call instead of goto to be able to use const declarations)
-  if (retries < 4) {
-    return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx);
+  return true;
+}
+
+// Visit the set bits of a bitmap, but only as full ranges of `rngslices` slices that are aligned at
+// `rngslices` within a bfield. Each bfield is cleared atomically up front; set bits that are not part
+// of a full aligned range are restored afterwards. `rngslices` is capped at `MI_BFIELD_BITS` at most.
+// Returns `false` as soon as `visit` returns `false` (the iteration stops there), and `true` otherwise.
+// Used by purging to purge larger ranges when possible. With transparent huge pages we only
+// want to purge whole huge pages (2 MiB) at a time which is what the `rngslices` parameter achieves.
+bool _mi_bitmap_forall_setc_rangesn(mi_bitmap_t* bitmap, size_t rngslices, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg)
+{
+  // use the generic routine for `rngslices<=1` (as that one finds longest ranges at a time)
+  if (rngslices<=1) {
+    return _mi_bitmap_forall_setc_ranges(bitmap, visit, arena, arg);
   }
-  else {
-    return false;
+  // mi_assert_internal(rngslices <= MI_BFIELD_BITS);
+  if (rngslices > MI_BFIELD_BITS) { rngslices = MI_BFIELD_BITS;  } // cap at MI_BFIELD_BITS at most
+
+  // for all chunkmap entries
+  const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
+  for (size_t i = 0; i < chunkmap_max; i++) {
+    mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
+    size_t cmap_idx;
+    // for each chunk (corresponding to a set bit in a chunkmap entry)
+    while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) {
+      const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
+      // for each chunk field
+      mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx];
+      for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
+        const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS);
+        mi_bfield_t b = mi_atomic_exchange_relaxed(&chunk->bfields[j], 0);                // atomic clear
+        mi_bfield_t skipped = 0;                                                          // but track which bits we skip so we can restore them
+        for(size_t shift = 0; rngslices + shift <= MI_BFIELD_BITS; shift += rngslices) {  // per `rngslices` to keep alignment
+          const mi_bfield_t rngmask = mi_bfield_mask(rngslices, shift);
+          if ((b & rngmask) == rngmask) {
+            const size_t idx = base_idx + shift;
+            if (!visit(idx, rngslices, arena, arg)) {
+              // break early: restore the bits skipped so far (if any) and always stop the iteration
+              if (skipped != 0) {
+                mi_atomic_or_relaxed(&chunk->bfields[j], skipped);
+              }
+              return false;
+            }
+          }
+          else {
+            skipped = skipped | (b & rngmask);
+          }
+        }
+        // restore the set bits that were not part of a full aligned range
+        if (skipped != 0) {
+          mi_atomic_or_relaxed(&chunk->bfields[j], skipped);
+        }
+      }
+    }
   }
+  return true;
 }
 
 
-// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
-  mi_assert_internal(count > 0);
-  if (count==1) return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
-  size_t idx = start_field_idx;
-  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
-    if (idx >= bitmap_fields) idx = 0; // wrap
-    // try to claim inside the field
-    if (count <= MI_BITMAP_FIELD_BITS) {
-      if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+/* --------------------------------------------------------------------------------
+  binned bitmaps
+-------------------------------------------------------------------------------- */
+
+
+size_t mi_bbitmap_size(size_t bit_count, size_t* pchunk_count) {  // byte size of a bbitmap for `bit_count` bits; optionally reports the chunk count
+  // mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0);
+  bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS);  // round up to a whole number of chunks
+  mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT);
+  mi_assert_internal(bit_count > 0);
+  const size_t chunk_count = bit_count / MI_BCHUNK_BITS;
+  mi_assert_internal(chunk_count >= 1);
+  const size_t size = offsetof(mi_bbitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE);  // header up to `chunks` plus the chunk array
+  mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 );
+  if (pchunk_count != NULL) { *pchunk_count = chunk_count;  }  // `pchunk_count` may be NULL
+  return size;
+}
+
+// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true
+// returns the size of the bitmap (in bytes, as computed by `mi_bbitmap_size`)
+size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero) {
+  size_t chunk_count;
+  const size_t size = mi_bbitmap_size(bit_count, &chunk_count);
+  if (!already_zero) {
+    _mi_memzero_aligned(bbitmap, size);  // zeroes `size` bytes starting at the bbitmap itself
+  }
+  mi_atomic_store_release(&bbitmap->chunk_count, chunk_count);  // stored after the (optional) zeroing above
+  mi_assert_internal(mi_atomic_load_relaxed(&bbitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT);
+  return size;
+}
+
+void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {  // set `n` bits starting at `idx`; thin wrapper
+  mi_assert_internal(n>0);
+  mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap));
+  mi_bchunks_unsafe_setN(&bbitmap->chunks[0], &bbitmap->chunkmap, idx, n);  // presumably non-atomic ("unsafe") -- confirm in mi_bchunks_unsafe_setN
+}
+
+bool mi_bbitmap_bsr_inv(mi_bbitmap_t* bbitmap, size_t* idx) {  // scans chunkmap entries from last to first; on success stores a bit index in `*idx`
+  const size_t chunkmap_max = _mi_divide_up(mi_bbitmap_chunk_count(bbitmap), MI_BFIELD_BITS);
+  for (size_t i = chunkmap_max; i > 0; ) {
+    i--;  // decrement inside the body so the unsigned index can reach 0 without wrapping
+    mi_bfield_t cmap = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[i]);
+    size_t cmap_idx;
+    if (mi_bsr(~cmap, &cmap_idx)) {  // NOTE(review): `~cmap` may also have bits set at chunk indices >= chunk_count in the last entry -- confirm chunk_idx stays in range
+      // chunk selected by the inverted chunkmap entry (presumably the highest clear chunkmap bit)
+      const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
+      size_t cidx;
+      if (mi_bchunk_bsr_inv(&bbitmap->chunks[chunk_idx], &cidx)) {
+        *idx = (chunk_idx * MI_BCHUNK_BITS) + cidx;  // convert (chunk, offset) to a bitmap-wide index
         return true;
       }
     }
-    // try to claim across fields
-    if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) {
-      return true;
+  }
+  return false;
+}
+
+
+/* --------------------------------------------------------------------------------
+ binned bitmap used to track free slices
+-------------------------------------------------------------------------------- */
+
+// Assign a specific size bin to a chunk: sets the chunk's bit in `chunkmap_bins[bin]` and clears it in every other bin below MI_CBIN_NONE.
+static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_chunkbin_t bin) {
+  mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
+  for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin < MI_CBIN_NONE; ibin = mi_chunkbin_inc(ibin)) {  // with `bin == MI_CBIN_NONE` no bin matches, so all are cleared
+    if (ibin == bin) {
+      const bool was_clear = mi_bchunk_set(& bbitmap->chunkmap_bins[ibin], chunk_idx, NULL);
+      if (was_clear) { mi_os_stat_increase(chunk_bins[ibin],1); }  // update the statistic only on an actual transition
+    }
+    else {
+      const bool was_set = mi_bchunk_clear(&bbitmap->chunkmap_bins[ibin], chunk_idx, NULL);
+      if (was_set) { mi_os_stat_decrease(chunk_bins[ibin],1); }  // update the statistic only on an actual transition
+    }
+  }
+}
+
+mi_chunkbin_t mi_bbitmap_debug_get_bin(const mi_bchunkmap_t* chunkmap_bins, size_t chunk_idx) {  // returns the first bin whose map has the chunk's bit set
+  for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin < MI_CBIN_NONE; ibin = mi_chunkbin_inc(ibin)) {
+    if (mi_bchunk_is_xsetN(MI_BIT_SET, &chunkmap_bins[ibin], chunk_idx, 1)) {
+      return ibin;
+    }
+  }
+  return MI_CBIN_NONE;  // the chunk's bit is not set in any bin map
+}
+
+// Track the index of the highest chunk that is accessed (raises `chunk_max_accessed` when `chunk_idx` exceeds it).
+static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) {
+  size_t oldmax = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed);
+  if mi_unlikely(chunk_idx > oldmax) {
+    mi_atomic_cas_strong_relaxed(&bbitmap->chunk_max_accessed, &oldmax, chunk_idx);  // single attempt: the result is ignored and a failed CAS is not retried
+  }
+}
+
+// Set a bit in the chunkmap; with `check_all_set` a chunk whose bits are all set is first moved back to the NONE bin
+static void mi_bbitmap_chunkmap_set(mi_bbitmap_t* bbitmap, size_t chunk_idx, bool check_all_set) {
+  mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
+  if (check_all_set) {
+    if (mi_bchunk_all_are_set_relaxed(&bbitmap->chunks[chunk_idx])) {
+      // all slices are free in this chunk: return back to the NONE bin
+      mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, MI_CBIN_NONE);
+    }
+  }
+  mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL);  // set unconditionally, whatever `check_all_set` is
+  mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx);  // also record the chunk as accessed
+}
+
+static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_idx) {  // returns true only if the chunk was all clear and its chunkmap bit stays cleared
+  mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
+  // check if the corresponding chunk is all clear
+  if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) return false;
+  // clear the chunkmap bit
+  mi_bchunk_clear(&bbitmap->chunkmap, chunk_idx, NULL);
+  // .. but a concurrent set may have happened in between our all-clear test and the clearing of the
+  // bit in the mask. We check again to catch this situation. (note: mi_bchunk_clear must be acq-rel)
+  if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) {
+    mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL);  // undo: put the chunkmap bit back
+    return false;
+  }
+  mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx);  // record the chunk as accessed on the success path
+  return true;
+}
+
+
+/* --------------------------------------------------------------------------------
+  mi_bbitmap_setN, try_clearN, and is_xsetN
+  (used to find free pages)
+-------------------------------------------------------------------------------- */
+
+// Set a sequence of `n` bits in the bitmap (may span chunks); returns `true` only if every per-chunk `mi_bchunk_setN` returned `true` (presumably: all bits were clear before).
+bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
+  mi_assert_internal(n>0);
+  const size_t maxbits = mi_bbitmap_max_bits(bbitmap);
+  mi_assert_internal(idx + n <= maxbits);
+  if (idx+n > maxbits) { // paranoia: outside of assertion builds, clamp the range to the bitmap
+    if (idx >= maxbits) return false;
+    n = maxbits - idx;
+  }
+
+  // iterate through the chunks
+  size_t chunk_idx = idx / MI_BCHUNK_BITS;
+  size_t cidx = idx % MI_BCHUNK_BITS;
+  bool were_allclear = true;
+  while (n > 0) {
+    const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);  // number of bits that fall inside the current chunk
+    were_allclear = mi_bchunk_setN(&bbitmap->chunks[chunk_idx], cidx, m, NULL) && were_allclear;  // the call comes first so it is never short-circuited away
+    mi_bbitmap_chunkmap_set(bbitmap, chunk_idx, true); // set afterwards
+    mi_assert_internal(m <= n);
+    n -= m;
+    cidx = 0;  // every following chunk starts at its first bit
+    chunk_idx++;
+  }
+  return were_allclear;
+}
+
+// ------- mi_bbitmap_try_clearNC ---------------------------------------
+
+// Try to clear `n` bits at `idx` where `n <= MI_BCHUNK_BITS`; returns `false` for a chunk-crossing range, else the result of `mi_bchunk_try_clearN`.
+bool mi_bbitmap_try_clearNC(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
+  mi_assert_internal(n>0);
+  mi_assert_internal(n<=MI_BCHUNK_BITS);
+  mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap));
+
+  const size_t chunk_idx = idx / MI_BCHUNK_BITS;
+  const size_t cidx = idx % MI_BCHUNK_BITS;
+  mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);  // don't cross chunks (for now)
+  mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
+  if (cidx + n > MI_BCHUNK_BITS) return false;  // same condition as the assertion above, enforced when assertions are compiled out
+  bool maybe_all_clear = false;
+  const bool cleared = mi_bchunk_try_clearN(&bbitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear);
+  if (cleared && maybe_all_clear) { mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); }  // result ignored: the chunkmap bit is cleared only if the chunk really is all clear
+  // note: we don't set the size class for an explicit try_clearN (only used by purging)
+  return cleared;
+}
+
+
+
+// ------- mi_bbitmap_is_xset ---------------------------------------
+
+// Is a sequence of n bits already all set/cleared? (`set` selects which; the range may span chunks and the check stops at the first failing chunk)
+bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
+  mi_assert_internal(n>0);
+  const size_t maxbits = mi_bbitmap_max_bits(bbitmap);
+  mi_assert_internal(idx + n <= maxbits);
+  if (idx+n > maxbits) { // paranoia: outside of assertion builds, clamp the range to the bitmap
+    if (idx >= maxbits) return false;
+    n = maxbits - idx;
+  }
+
+  // iterate through the chunks
+  size_t chunk_idx = idx / MI_BCHUNK_BITS;
+  size_t cidx = idx % MI_BCHUNK_BITS;
+  bool xset = true;
+  while (n > 0 && xset) {
+    const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);  // number of bits that fall inside the current chunk
+    xset = mi_bchunk_is_xsetN(set, &bbitmap->chunks[chunk_idx], cidx, m) && xset;
+    mi_assert_internal(m <= n);
+    n -= m;
+    cidx = 0;  // every following chunk starts at its first bit
+    chunk_idx++;
+  }
+  return xset;
+}
+
+
+
+
+/* --------------------------------------------------------------------------------
+  mi_bbitmap_find
+  (used to find free pages)
+-------------------------------------------------------------------------------- */
+
+typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx);
+
+// Go through the bbitmap and call `on_find` on each candidate chunk to find (and clear) a sequence of `n` set bits.
+// If it returns `true` stop the search: the bitmap-wide index is stored in `*pidx` and `true` is returned.
+//
+// This is used for finding free blocks and it is important to be efficient (with 2-level bitscan)
+// but also reduce fragmentation (through size bins).
+static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* on_find)
+{
+  // we space out threads to reduce contention
+  const size_t cmap_max_count  = _mi_divide_up(mi_bbitmap_chunk_count(bbitmap),MI_BFIELD_BITS);
+  const size_t chunk_acc       = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed);
+  const size_t cmap_acc        = chunk_acc / MI_BFIELD_BITS;        // chunkmap entry that contains chunk `chunk_acc`
+  const size_t cmap_acc_bits   = 1 + (chunk_acc % MI_BFIELD_BITS);  // bit count in that entry up to and including `chunk_acc`
+
+  // create a mask over the chunkmap entries to iterate over them efficiently
+  mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS);
+  const mi_bfield_t cmap_mask  = mi_bfield_mask(cmap_max_count,0);
+  const size_t cmap_cycle      = cmap_acc+1;
+  const mi_chunkbin_t bbin = mi_chunkbin_of(n);  // size bin that corresponds to a request of `n` bits
+  // visit each cmap entry
+  size_t cmap_idx = 0;
+  mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X)
+  {
+    // and for each chunkmap entry we iterate over its bits to find the chunks
+    const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]);
+    const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits);
+    if (cmap_entry == 0) {
+      continue;
+    }
+
+    // get size bin masks
+    mi_bfield_t cmap_bins[MI_CBIN_COUNT] = { 0 };
+    cmap_bins[MI_CBIN_NONE] = cmap_entry;
+    for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin < MI_CBIN_NONE; ibin = mi_chunkbin_inc(ibin)) {
+      const mi_bfield_t cmap_bin = mi_atomic_load_relaxed(&bbitmap->chunkmap_bins[ibin].bfields[cmap_idx]);
+      cmap_bins[ibin] = cmap_bin & cmap_entry;   // only chunks that the chunkmap entry also marks as set
+      cmap_bins[MI_CBIN_NONE] &= ~cmap_bin;      // clear bits that are in an assigned size bin
+    }
+
+    // consider only chunks for a particular size bin at a time
+    // this picks the best bin only within a cmap entry (~ 1GiB address space), but avoids multiple
+    // iterations through all entries.
+    mi_assert_internal(bbin < MI_CBIN_NONE);
+    for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin <= MI_CBIN_NONE;
+          // skip from bbin to NONE (so, say, a SMALL will never be placed in an OTHER, MEDIUM, or LARGE chunk to reduce fragmentation)
+          ibin = (ibin == bbin ? MI_CBIN_NONE : mi_chunkbin_inc(ibin)))
+    {
+      mi_assert_internal(ibin < MI_CBIN_COUNT);
+      const mi_bfield_t cmap_bin = cmap_bins[ibin];
+      size_t eidx = 0;
+      mi_bfield_cycle_iterate(cmap_bin, tseq, cmap_entry_cycle, eidx, Y)
+      {
+        // assertion doesn't quite hold as the max_accessed may be out-of-date
+        // mi_assert_internal(cmap_entry_cycle > eidx || ibin == MI_CBIN_NONE);
+
+        // get the chunk
+        const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx;
+        mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx];
+
+        size_t cidx;
+        if ((*on_find)(chunk, n, &cidx)) {
+          if (cidx==0 && ibin == MI_CBIN_NONE) { // only the first block determines the size bin
+            // this chunk is now reserved for the `bbin` size class
+            mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin);
+          }
+          *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;  // convert (chunk, offset) to a bitmap-wide index
+          mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap));
+          return true;
+        }
+        else {
+          // todo: should _on_find_ return a boolean if there is a chance all are clear to avoid calling `try_clear?`
+          // we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation.
+          mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx);
+        }
+      }
+      mi_bfield_cycle_iterate_end(Y);
     }
   }
+  mi_bfield_cycle_iterate_end(X);
   return false;
 }
 
-// Helper for masks across fields; returns the mid count, post_mask may be 0
-static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) {
-  MI_UNUSED_RELEASE(bitmap_fields);
-  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
-  if (mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS)) {
-    *pre_mask = mi_bitmap_mask_(count, bitidx);
-    *mid_mask = 0;
-    *post_mask = 0;
-    mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields);
-    return 0;
+/* --------------------------------------------------------------------------------
+  mi_bbitmap_try_find_and_clear -- used to find free pages
+  note: the compiler will fully inline the indirect function calls
+-------------------------------------------------------------------------------- */
+
+bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) {  // generic search specialised for n == 1
+  return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1);
+}
+
+bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) {  // generic search specialised for n == 8
+  return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8);
+}
+
+// bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) {
+//   return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X);
+// }
+
+bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) {  // generic search for `n <= MI_BFIELD_BITS` bits
+  mi_assert_internal(n<=MI_BFIELD_BITS);
+  return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX);
+}
+
+bool mi_bbitmap_try_find_and_clearNC(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) {  // generic search for `n <= MI_BCHUNK_BITS` bits
+  mi_assert_internal(n<=MI_BCHUNK_BITS);
+  return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNC);
+}
+
+
+/* --------------------------------------------------------------------------------
+  mi_bbitmap_try_find_and_clear for huge objects spanning multiple chunks
+-------------------------------------------------------------------------------- */
+
+// Try to atomically clear `n` bits starting at `chunk_idx` where `n` can span over multiple chunks
+static bool mi_bchunk_try_clearN_(mi_bbitmap_t* bbitmap, size_t chunk_idx, size_t n) {
+  mi_assert_internal((chunk_idx * MI_BCHUNK_BITS) + n <= mi_bbitmap_max_bits(bbitmap));
+
+  size_t m = n;      // bits to go
+  size_t count = 0;  // chunk count
+  while (m > 0) {
+    mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx + count];
+    if (!mi_bchunk_try_clearN(chunk, 0, (m > MI_BCHUNK_BITS ? MI_BCHUNK_BITS : m), NULL)) {
+      goto rollback;
+    }
+    m = (m <= MI_BCHUNK_BITS ? 0 : m - MI_BCHUNK_BITS);
+    count++;
   }
-  else {
-    const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx;
-    mi_assert_internal(pre_bits < count);
-    *pre_mask = mi_bitmap_mask_(pre_bits, bitidx);
-    count -= pre_bits;
-    const size_t mid_count = (count / MI_BITMAP_FIELD_BITS);
-    *mid_mask = MI_BITMAP_FIELD_FULL;
-    count %= MI_BITMAP_FIELD_BITS;
-    *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0));
-    mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields);
-    return mid_count;
-  }
-}
-
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
-  size_t idx = mi_bitmap_index_field(bitmap_idx);
-  size_t pre_mask;
-  size_t mid_mask;
-  size_t post_mask;
-  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);  
-  bool all_one = true;
-  mi_bitmap_field_t* field = &bitmap[idx];
-  size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask);
-  if ((prev & pre_mask) != pre_mask) all_one = false;
-  while(mid_count-- > 0) {
-    prev = mi_atomic_and_acq_rel(field++, ~mid_mask);
-    if ((prev & mid_mask) != mid_mask) all_one = false;
-  }
-  if (post_mask!=0) {
-    prev = mi_atomic_and_acq_rel(field, ~post_mask);
-    if ((prev & post_mask) != post_mask) all_one = false;
-  }
-  return all_one;  
-}
-
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
-  size_t idx = mi_bitmap_index_field(bitmap_idx);
-  size_t pre_mask;
-  size_t mid_mask;
-  size_t post_mask;
-  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
-  bool all_zero = true;
-  bool any_zero = false;
-  _Atomic(size_t)*field = &bitmap[idx];
-  size_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
-  if ((prev & pre_mask) != 0) all_zero = false;
-  if ((prev & pre_mask) != pre_mask) any_zero = true;
-  while (mid_count-- > 0) {
-    prev = mi_atomic_or_acq_rel(field++, mid_mask);
-    if ((prev & mid_mask) != 0) all_zero = false;
-    if ((prev & mid_mask) != mid_mask) any_zero = true;
-  }
-  if (post_mask!=0) {
-    prev = mi_atomic_or_acq_rel(field, post_mask);
-    if ((prev & post_mask) != 0) all_zero = false;
-    if ((prev & post_mask) != post_mask) any_zero = true;
-  }
-  if (pany_zero != NULL) *pany_zero = any_zero;
-  return all_zero;
-}
-
-
-// Returns `true` if all `count` bits were 1. 
-// `any_ones` is `true` if there was at least one bit set to one.
-static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
-  size_t idx = mi_bitmap_index_field(bitmap_idx);
-  size_t pre_mask;
-  size_t mid_mask;
-  size_t post_mask;
-  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
-  bool all_ones = true;
-  bool any_ones = false;
-  mi_bitmap_field_t* field = &bitmap[idx];
-  size_t prev = mi_atomic_load_relaxed(field++);
-  if ((prev & pre_mask) != pre_mask) all_ones = false;
-  if ((prev & pre_mask) != 0) any_ones = true;
-  while (mid_count-- > 0) {
-    prev = mi_atomic_load_relaxed(field++);
-    if ((prev & mid_mask) != mid_mask) all_ones = false;
-    if ((prev & mid_mask) != 0) any_ones = true;
-  }
-  if (post_mask!=0) {
-    prev = mi_atomic_load_relaxed(field);
-    if ((prev & post_mask) != post_mask) all_ones = false;
-    if ((prev & post_mask) != 0) any_ones = true;
-  }  
-  if (pany_ones != NULL) *pany_ones = any_ones;
-  return all_ones;
-}
-
-bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
-  return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL);
-}
-
-bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
-  bool any_ones;
-  mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
-  return any_ones;
+  return true;
+
+rollback:
+  // we only need to reset the chunks that we just fully cleared
+  while (count > 0) {
+    count--;
+    mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx + count];
+    mi_bchunk_setN(chunk, 0, MI_BCHUNK_BITS, NULL);
+  }
+  return false;
+}
+
+// Go through the bbitmap to find a sequence of `n` bits and clear them atomically where `n > MI_ARENA_MAX_CHUNK_OBJ_SIZE`
+// Since these are very large object allocations we always search from the start and only consider starting at the start
+// of a chunk (for fragmentation and efficiency). On success `*pidx` is the index of the first bit and `true` is returned.
+// Todo: for now we try to find full empty chunks to cover `n` but we can allow a partial chunk at the end
+// Todo: This scans directly through the chunks -- we might want to consult the cmap as well?
+bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) {
+  MI_UNUSED(tseq);
+  mi_assert(n > 0); if (n==0) { return false; }
+
+  const size_t chunk_max = mi_bbitmap_chunk_count(bbitmap);
+  const size_t chunk_req = _mi_divide_up(n, MI_BCHUNK_BITS);  // minimal number of chunks needed
+  if (chunk_max < chunk_req) { return false; }  // also keeps `chunk_max - chunk_req` below from wrapping
+
+  // iterate through the chunks
+  size_t chunk_idx = 0;
+  while (chunk_idx <= chunk_max - chunk_req)
+  {
+    size_t count = 0;  // chunk count
+    do {
+      mi_assert_internal(chunk_idx + count < chunk_max);
+      mi_bchunk_t* const chunk = &bbitmap->chunks[chunk_idx + count];
+      if (!mi_bchunk_all_are_set_relaxed(chunk)) {
+        break;
+      }
+      else {
+        count++;
+      }
+    }
+    while (count < chunk_req);
+
+    // did we find a suitable range?
+    if (count == chunk_req) {
+      // now try to claim it!
+      if (mi_bchunk_try_clearN_(bbitmap, chunk_idx, n)) {
+        *pidx = (chunk_idx * MI_BCHUNK_BITS);
+        for (size_t i = 0; i < count; i++) {
+          mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx + i, MI_CBIN_HUGE);  // mark every claimed chunk as HUGE
+        }
+        mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap));
+        return true;
+      }
+    }
+
+    // keep searching but skip the scanned range
+    chunk_idx += count+1;  // NOTE(review): when count == chunk_req (claim failed) this also skips chunk `chunk_idx+count`, which was not examined
+  }
+  return false;
 }
+
+
+
+
+
diff --git a/ext/src/mimalloc/src/bitmap.h b/ext/src/mimalloc/src/bitmap.h
index 7bd3106c9c..2ecc3141d3 100644
--- a/ext/src/mimalloc/src/bitmap.h
+++ b/ext/src/mimalloc/src/bitmap.h
@@ -1,107 +1,343 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2019-2020 Microsoft Research, Daan Leijen
+Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 
 /* ----------------------------------------------------------------------------
-Concurrent bitmap that can set/reset sequences of bits atomically,
-represeted as an array of fields where each field is a machine word (`size_t`)
-
-There are two api's; the standard one cannot have sequences that cross
-between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
-(this is used in region allocation)
-
-The `_across` postfixed functions do allow sequences that can cross over
-between the fields. (This is used in arena allocation)
+Concurrent bitmap that can set/reset sequences of bits atomically
 ---------------------------------------------------------------------------- */
 #pragma once
 #ifndef MI_BITMAP_H
 #define MI_BITMAP_H
 
-/* -----------------------------------------------------------
-  Bitmap definition
------------------------------------------------------------ */
+/* --------------------------------------------------------------------------------
+  Atomic bitmaps with release/acquire guarantees:
+
+  `mi_bfield_t`: is a single machine word that can efficiently be bit counted (usually `size_t`)
+      each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB).
+      We need 16K bits to represent a 1GiB arena.
+
+  `mi_bchunk_t`: a chunk of bfield's of a total of MI_BCHUNK_BITS (= 512 on 64-bit, 256 on 32-bit)
+      allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number
+      of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB).
+      These chunks are cache-aligned and we can use AVX2/AVX512/NEON/SVE/SVE2/etc. instructions
+      to scan for bits (perhaps) more efficiently.
+
+      We allocate byte-sized ranges aligned to bytes in the bfield, and bfield-sized
+      ranges aligned to a bfield.
+
+    Searching linearly through the chunks would be too slow (16K bits per GiB).
+    Instead we add a "chunkmap" to do a two-level search (more or less a btree of depth 2).
+
+   `mi_bchunkmap_t` (== `mi_bchunk_t`): for each chunk we track if it has (potentially) any bit set.
+      The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set.
+      This is used to avoid scanning every chunk. (and thus strictly an optimization)
+      It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out
+      to have no bits set. It is also allowed to briefly have a clear bit even if the
+      chunk has bits set -- as long as we guarantee that the bit will be set later on;
+      (this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk).
+
+      However, when we clear a bit in a chunk, and the chunk is indeed all clear, we
+      cannot safely clear the bit corresponding to the chunk in the chunkmap since it
+      may race with another thread setting a bit in the same chunk. Therefore, when
+      clearing, we first test if a chunk is clear, then clear the chunkmap bit, and
+      then test again to catch any set bits that we may have missed.
+
+      Since the chunkmap may thus be briefly out-of-sync, this means that we may sometimes
+      not find a free page even though it's there (but we accept this as we avoid taking
+      full locks). (Another way to do this is to use an epoch but we like to avoid that complexity
+      for now).
+
+   `mi_bitmap_t`: a bitmap with N chunks. A bitmap has a chunkmap of MI_BCHUNK_BITS (512)
+      and thus has at most 512 chunks (=2^18 bits x 64 KiB slices = 16 GiB max arena size).
+      The minimum is 1 chunk which is a 32 MiB arena.
+
+   For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count
+   and pop-count (but we think it can be adapted to work reasonably well on older hardware too)
+--------------------------------------------------------------------------------------------- */
+
+// A word-size bit field.
+typedef size_t mi_bfield_t;
+
+#define MI_BFIELD_BITS_SHIFT         (MI_SIZE_SHIFT+3)                  // presumably log2(sizeof(size_t)) + log2(8 bits per byte) -- confirm MI_SIZE_SHIFT
+#define MI_BFIELD_BITS               (1 << MI_BFIELD_BITS_SHIFT)        // number of bits in a bfield
+#define MI_BFIELD_SIZE               (MI_BFIELD_BITS/8)                 // number of bytes in a bfield
+#define MI_BFIELD_LO_BIT8            (((~(mi_bfield_t)0))/0xFF)         // 0x01010101 ..  (lowest bit of every byte)
+#define MI_BFIELD_HI_BIT8            (MI_BFIELD_LO_BIT8 << 7)           // 0x80808080 ..  (highest bit of every byte)
+
+#define MI_BCHUNK_SIZE               (MI_BCHUNK_BITS / 8)               // number of bytes in a chunk
+#define MI_BCHUNK_FIELDS             (MI_BCHUNK_BITS / MI_BFIELD_BITS)  // 8 on both 64- and 32-bit
+
+
+// some compilers (msvc in C mode) cannot have expressions in the alignment attribute, so use a literal for the known chunk sizes
+#if MI_BCHUNK_SIZE==64
+#define mi_decl_bchunk_align  mi_decl_align(64)
+#elif MI_BCHUNK_SIZE==32
+#define mi_decl_bchunk_align  mi_decl_align(32)
+#else
+#define mi_decl_bchunk_align  mi_decl_align(MI_BCHUNK_SIZE)
+#endif
+
+
+// A bitmap chunk contains 512 bits on 64-bit  (256 on 32-bit); it is declared with the chunk alignment attribute
+typedef mi_decl_bchunk_align struct mi_bchunk_s {
+  _Atomic(mi_bfield_t) bfields[MI_BCHUNK_FIELDS];   // MI_BCHUNK_FIELDS atomic words, MI_BCHUNK_BITS bits in total
+} mi_bchunk_t;
+
+
+// The chunkmap has one bit per corresponding chunk that is set if the chunk potentially has bits set.
+// The chunkmap is itself a chunk.
+typedef mi_bchunk_t mi_bchunkmap_t;
+
+#define MI_BCHUNKMAP_BITS             MI_BCHUNK_BITS
+
+#define MI_BITMAP_MAX_CHUNK_COUNT     (MI_BCHUNKMAP_BITS)
+#define MI_BITMAP_MIN_CHUNK_COUNT     (1)
+#if MI_SIZE_BITS > 32
+#define MI_BITMAP_DEFAULT_CHUNK_COUNT     (64)  // 2 GiB on 64-bit -- this is for the page map
+#else
+#define MI_BITMAP_DEFAULT_CHUNK_COUNT      (1)
+#endif
+#define MI_BITMAP_MAX_BIT_COUNT       (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS)  // 16 GiB arena
+#define MI_BITMAP_MIN_BIT_COUNT       (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS)  // 32 MiB arena
+#define MI_BITMAP_DEFAULT_BIT_COUNT   (MI_BITMAP_DEFAULT_CHUNK_COUNT * MI_BCHUNK_BITS)  // 2 GiB arena
+
+
+// An atomic bitmap: a chunk count, a chunkmap with one bit per chunk, and the chunks that hold the actual bits
+typedef mi_decl_bchunk_align struct mi_bitmap_s {
+  _Atomic(size_t)  chunk_count;         // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
+  size_t           _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1];    // manual padding after `chunk_count` (suppress warning on msvc)
+  mi_bchunkmap_t   chunkmap;                                     // bit is set if the corresponding chunk potentially has bits set
+  mi_bchunk_t      chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT];        // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
+} mi_bitmap_t;
+
+
+static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) {    // number of chunks in `bitmap`
+  return mi_atomic_load_relaxed(&((mi_bitmap_t*)bitmap)->chunk_count);     // relaxed atomic read; the cast only drops `const` for the load
+}
+
+static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) {       // total number of bits in `bitmap`
+  return (mi_bitmap_chunk_count(bitmap) * MI_BCHUNK_BITS);                 // chunk count times bits per chunk
+}
+
+
+
+/* --------------------------------------------------------------------------------
+  Atomic bitmap operations
+-------------------------------------------------------------------------------- */
+
+// Many operations are generic over setting or clearing the bit sequence: we use `mi_xset_t` for this (true if setting, false if clearing)
+typedef bool  mi_xset_t;
+#define MI_BIT_SET    (true)
+#define MI_BIT_CLEAR  (false)
+
+
+// Required size of a bitmap to represent `bit_count` bits.
+size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count);
+
+// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true
+// returns the size of the bitmap.
+size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero);
+
+// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks).
+// Not atomic so only use if still local to a thread.
+void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n);
+
 
-#define MI_BITMAP_FIELD_BITS   (8*MI_SIZE_SIZE)
-#define MI_BITMAP_FIELD_FULL   (~((size_t)0))   // all bits set
+// Set a bit in the bitmap; returns `true` if it atomically transitioned from 0 to 1
+bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx);
 
-// An atomic bitmap of `size_t` fields
-typedef _Atomic(size_t)  mi_bitmap_field_t;
-typedef mi_bitmap_field_t*  mi_bitmap_t;
+// Clear a bit in the bitmap; returns `true` if it atomically transitioned from 1 to 0
+bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx);
 
-// A bitmap index is the index of the bit in a bitmap.
-typedef size_t mi_bitmap_index_t;
+// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's
+// If `already_set` is not NULL, it is set to the count of bits that were already set.
+// (this is used for correct statistics if committing over a partially committed area)
+bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set);
 
-// Create a bit index.
-static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
-  mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
-  return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
+// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's
+bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n);
+
+
+// Is a sequence of n bits already all set/cleared?
+bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
+
+// Is the bitmap completely clear?
+bool mi_bitmap_is_all_clear(mi_bitmap_t* bitmap);
+
+// Are the `n` bits starting at `idx` all set already?
+// (Used to check if a memory range is already committed)
+static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
+  return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n);   // generic test, specialized to "set"
+}
+
+// Are the `n` bits starting at `idx` all clear already?
+static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
+  return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n);   // generic test, specialized to "clear"
+}
+
+static inline bool mi_bitmap_is_set(mi_bitmap_t* bitmap, size_t idx) {     // is the single bit at `idx` set?
+  return mi_bitmap_is_setN(bitmap, idx, 1);                                // a sequence of length 1
+}
+
+static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) {   // is the single bit at `idx` clear?
+  return mi_bitmap_is_clearN(bitmap, idx, 1);                              // a sequence of length 1
+}
+
+// Called once a bit is cleared to see if the memory slice can be claimed.
+typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, bool* keep_set);
+
+// Find a set bit in the bitmap, atomically clear it, and check if `claim` returns true.
+// If not claimed, continue on (potentially setting the bit again depending on `keep_set`).
+// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
+mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx,
+                                                    mi_claim_fun_t* claim, mi_arena_t* arena );
+
+
+// Atomically clear a bit but only if it is set. Will block otherwise until the bit is set.
+// This is used to delay free-ing a page that is at the same time being considered to be
+// allocated from `mi_arena_try_abandoned` (and is in the `claim` function of `mi_bitmap_try_find_and_claim`).
+void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx);
+
+
+// If a bit is set in the bitmap, return `true` and set `idx` to the index of the highest bit.
+// Otherwise return `false` (and `*idx` is undefined).
+// Used for unloading arena's
+bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx);
+
+// Return count of all set bits in a bitmap.
+size_t mi_bitmap_popcount(mi_bitmap_t* bitmap);
+
+
+typedef bool (mi_forall_set_fun_t)(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg2);
+
+// Visit all set bits in a bitmap (`slice_count == 1`)
+bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg);
+
+// Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`)
+// Ranges will never cross chunk boundaries though (and `slice_count <= MI_BCHUNK_BITS`)
+bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg);
+
+// Visit all set bits in a bitmap with at least `rngslices` at a time (and aligned to `rngslices`). 
+// This is used by purging to not break up transparent huge pages for example.
+// Ranges will never cross chunk boundaries (and `slice_count <= MI_BCHUNK_BITS`).
+bool _mi_bitmap_forall_setc_rangesn(mi_bitmap_t* bitmap, size_t rngslices, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg);
+
+// Count all set bits in given range in the bitmap.
+size_t mi_bitmap_popcountN( mi_bitmap_t* bitmap, size_t idx, size_t n);
+
+/* ----------------------------------------------------------------------------
+  Binned concurrent bitmap
+  Assigns a size class to each chunk such that small blocks don't cause too
+  much fragmentation since we keep chunks for larger blocks separate.
+---------------------------------------------------------------------------- */
+
+// mi_chunkbin_t is defined in mimalloc-stats.h
+
+static inline mi_chunkbin_t mi_chunkbin_inc(mi_chunkbin_t bbin) {   // the bin value following `bbin`
+  mi_assert_internal(bbin < MI_CBIN_COUNT);
+  return (mi_chunkbin_t)((int)bbin + 1);                            // note: yields MI_CBIN_COUNT when `bbin` is the last bin
 }
 
-// Create a bit index.
-static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) {  
-  return mi_bitmap_index_create(full_bitidx / MI_BITMAP_FIELD_BITS, full_bitidx % MI_BITMAP_FIELD_BITS);
+static inline mi_chunkbin_t mi_chunkbin_dec(mi_chunkbin_t bbin) {   // the bin value preceding `bbin`
+  mi_assert_internal(bbin > MI_CBIN_NONE);                          // there is no bin before MI_CBIN_NONE
+  return (mi_chunkbin_t)((int)bbin - 1);
 }
 
-// Get the field index from a bit index.
-static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
-  return (bitmap_idx / MI_BITMAP_FIELD_BITS);
+static inline mi_chunkbin_t mi_chunkbin_of(size_t slice_count) {    // map a slice count to its chunk bin
+  if (slice_count==1) return MI_CBIN_SMALL;                         // exactly one slice
+  if (slice_count==8) return MI_CBIN_MEDIUM;                        // exactly eight slices
+  #if MI_ENABLE_LARGE_PAGES
+  if (slice_count==MI_BFIELD_BITS) return MI_CBIN_LARGE;            // exactly one bfield worth of slices (only if large pages are enabled)
+  #endif
+  if (slice_count > MI_BCHUNK_BITS) return MI_CBIN_HUGE;            // more slices than fit in a single chunk
+  return MI_CBIN_OTHER;                                             // every other count (including exactly MI_BCHUNK_BITS)
 }
 
-// Get the bit index in a bitmap field
-static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
-  return (bitmap_idx % MI_BITMAP_FIELD_BITS);
+// An atomic "binned" bitmap for the free slices where we keep chunks reserved for particular size classes
+typedef mi_decl_bchunk_align struct mi_bbitmap_s {
+  _Atomic(size_t)  chunk_count;         // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
+  _Atomic(size_t)  chunk_max_accessed;  // max chunk index that was once cleared or set
+  #if (MI_BCHUNK_SIZE / MI_SIZE_SIZE) > 2                        // padding is only declared if its array length would be positive
+  size_t           _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2];    // suppress warning on msvc by aligning manually
+  #endif
+  mi_bchunkmap_t   chunkmap;                                     // bit is set if the corresponding chunk potentially has bits set
+  mi_bchunkmap_t   chunkmap_bins[MI_CBIN_COUNT - 1];             // chunkmaps with bit set if the chunk is in that size class (excluding MI_CBIN_NONE)
+  mi_bchunk_t      chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT];        // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
+} mi_bbitmap_t;
+
+
+static inline size_t mi_bbitmap_chunk_count(const mi_bbitmap_t* bbitmap) {   // number of chunks in `bbitmap`
+  return mi_atomic_load_relaxed(&((mi_bbitmap_t*)bbitmap)->chunk_count);     // relaxed atomic read; the cast only drops `const` for the load
 }
 
-// Get the full bit index
-static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
-  return bitmap_idx;
+static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) {      // total number of bits in `bbitmap`
+  return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS);                 // chunk count times bits per chunk
 }
 
-/* -----------------------------------------------------------
-  Claim a bit sequence atomically
------------------------------------------------------------ */
+mi_chunkbin_t mi_bbitmap_debug_get_bin(const mi_bchunk_t* chunkmap_bins, size_t chunk_idx);
 
-// Try to atomically claim a sequence of `count` bits in a single
-// field at `idx` in `bitmap`. Returns `true` on success.
-bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count);
 
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
-bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+// If a bit is clear in the bitmap, return `true` and set `idx` to the index of the highest bit that was clear.
+// Otherwise return `false` (and `*idx` is undefined).
+// Used for debug output.
+bool mi_bbitmap_bsr_inv(mi_bbitmap_t* bbitmap, size_t* idx);
 
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true
+// returns the size of the bitmap.
+size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero);
 
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
+// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks).
+// Not atomic so only use if still local to a thread.
+void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
 
-bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
 
+// Set a sequence of `n` bits in the bbitmap; returns `true` if atomically transitioned from all 0's to 1's
+bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
 
-//--------------------------------------------------------------------------
-// the `_across` functions work on bitmaps where sequences can cross over
-// between the fields. This is used in arena allocation
-//--------------------------------------------------------------------------
 
-// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+// Is a sequence of n bits already all set/cleared?
+bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n);
 
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+// Are the `n` bits starting at `idx` all set already?
+// (Used to check if a memory range is already committed)
+static inline bool mi_bbitmap_is_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
+  return mi_bbitmap_is_xsetN(MI_BIT_SET, bbitmap, idx, n);   // generic test, specialized to "set"
+}
 
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
+// Are the `n` bits starting at `idx` all clear already?
+static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
+  return mi_bbitmap_is_xsetN(MI_BIT_CLEAR, bbitmap, idx, n);   // generic test, specialized to "clear"
+}
 
-bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
 
-#endif
+// Try to atomically transition `n` bits from all set to all clear. Returns `true` on success.
+// `n` cannot cross chunk boundaries, where `n <= MI_BCHUNK_BITS`.
+bool mi_bbitmap_try_clearNC(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
+
+
+// Specialized versions for common bit sequence sizes (note: `tseq` comes before `n`, which is the order `mi_bbitmap_try_find_and_clearN` passes them in)
+bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx);  // 1-bit
+bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits
+// bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS
+bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx); // n <= MI_BFIELD_BITS
+bool mi_bbitmap_try_find_and_clearNC(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx); // MI_BFIELD_BITS < n <= MI_BCHUNK_BITS
+bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx); // n > MI_BCHUNK_BITS
+
+// Find a sequence of `n` bits in the bbitmap with all bits set, and try to atomically clear all (dispatches on `n` to a specialized search).
+// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
+mi_decl_nodiscard static inline bool mi_bbitmap_try_find_and_clearN(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx) {
+  if (n==1) return mi_bbitmap_try_find_and_clear(bbitmap, tseq, pidx);               // small pages
+  if (n==8) return mi_bbitmap_try_find_and_clear8(bbitmap, tseq, pidx);              // medium pages
+  // if (n==MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearX(bbitmap, tseq, pidx); // large pages
+  if (n==0) return false;                                                            // nothing to search for
+  if (n<=MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearNX(bbitmap, tseq, n, pidx);   // at most one bfield of bits (note: `tseq` is passed before `n` here)
+  if (n<=MI_BCHUNK_BITS) return mi_bbitmap_try_find_and_clearNC(bbitmap, tseq, n, pidx);   // more than a bfield, at most one chunk of bits
+  return mi_bbitmap_try_find_and_clearN_(bbitmap, tseq, n, pidx);                          // more than one chunk of bits
+}
+
+
+#endif // MI_BITMAP_H
diff --git a/ext/src/mimalloc/src/free.c b/ext/src/mimalloc/src/free.c
new file mode 100644
index 0000000000..9be8f26c35
--- /dev/null
+++ b/ext/src/mimalloc/src/free.c
@@ -0,0 +1,622 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#if !defined(MI_IN_ALLOC_C)
+#error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)"
+// adding the includes helps an IDE
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"   // _mi_prim_thread_id()
+#endif
+
+// forward declarations
+static void   mi_check_padding(const mi_page_t* page, const mi_block_t* block);
+static bool   mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block);
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block);
+static void   mi_stat_free(const mi_page_t* page, const mi_block_t* block);
+
+
+// ------------------------------------------------------
+// Free
+// ------------------------------------------------------
+
+// regular free of a (thread local) block pointer; `track_stats` enables the statistics/tracking calls and `check_full` the test for a page in the full state
+// fast path written carefully to prevent spilling on the stack
+static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full)
+{
+  // checks (a detected double free returns without freeing anything)
+  if mi_unlikely(mi_check_is_double_free(page, block)) return;
+  mi_check_padding(page, block);
+  if (track_stats) { mi_stat_free(page, block); }
+  #if (MI_DEBUG>0) && !MI_TRACK_ENABLED  && !MI_TSAN && !MI_GUARDED   // debug builds only: overwrite the freed block with MI_DEBUG_FREED
+  memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
+  #endif
+  if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block)); } // faster than mi_usable_size as we already know the page and that p is unaligned
+
+  // actual free: push on the local free list
+  mi_block_set_next(page, block, page->local_free);
+  page->local_free = block;
+  if mi_unlikely(--page->used == 0) {   // this was the last used block in the page
+    if (page->retire_expire==0) { // no need to re-retire retired pages (happens when we alloc/free one block repeatedly in an empty page)
+      _mi_page_retire(page); 
+    }
+  }
+  else if mi_unlikely(check_full && mi_page_is_in_full(page)) {   // the page has a free block again
+    _mi_page_unfull(page);
+  }
+}
+
+// Forward declaration for multi-threaded collect
+static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept;
+
+// Free a block multi-threaded: push it atomically on the page `xthread_free` list and, if this push set the owned flag, try to collect the page
+static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_noexcept
+{
+  // adjust stats and tracking first (before the block contents are overwritten below)
+  mi_stat_free(page, block);    // stat_free may access the padding
+  mi_track_free_size(block, mi_page_usable_size_of(page, block));
+
+  // _mi_padding_shrink(page, block, sizeof(mi_block_t));
+#if (MI_DEBUG>0) && !MI_TRACK_ENABLED  && !MI_TSAN       // note: when tracking, cannot use mi_usable_size with multi-threading
+  size_t dbgsize = mi_usable_size(block);
+  if (dbgsize > MI_MiB) { dbgsize = MI_MiB; }             // overwrite at most MI_MiB bytes of the block
+  _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize);
+#endif
+
+  // push atomically on the page thread free list
+  mi_thread_free_t tf_new;
+  mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free);
+  do {
+    mi_block_set_next(page, block, mi_tf_block(tf_old));   // link the current list head behind our block
+    tf_new = mi_tf_create(block, true /* always use owned: try to claim it if the page is abandoned */);
+  } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); // todo: release is enough?
+
+  // and atomically try to collect the page if it was abandoned
+  const bool is_owned_now = !mi_tf_is_owned(tf_old);       // the owned flag was clear before our push, so our push is the one that set it
+  if (is_owned_now) {
+    mi_assert_internal(mi_page_is_abandoned(page));
+    mi_free_try_collect_mt(page,block);
+  }
+}
+
+
+// Adjust a block that was allocated aligned, to the actual start of the block in the page.
+// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the
+// `page_start` and `block_size` fields; however these are constant and the page won't be
+// deallocated (as the block we are freeing keeps it alive) and thus safe to read concurrently.
+mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) {
+  mi_assert_internal(page!=NULL && p!=NULL);
+
+  const size_t diff = (uint8_t*)p - mi_page_start(page);   // byte offset of `p` from the page start
+  const size_t block_size = mi_page_block_size(page);
+  const size_t adjust = (_mi_is_power_of_two(block_size) ? diff & (block_size - 1) : diff % block_size);   // offset of `p` inside its block (mask instead of modulo for power-of-two sizes)
+  return (mi_block_t*)((uintptr_t)p - adjust);              // step back to the start of the block
+}
+
+// `mi_block_check_unguard`: in a MI_GUARDED build unguard the block if `p` is a guarded pointer; otherwise a no-op
+#if MI_GUARDED
+static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p); // forward declaration
+static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) {
+  if (mi_block_ptr_is_guarded(block, p)) { mi_block_unguard(page, block, p); }
+}
+#else
+static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) {
+  MI_UNUSED(page); MI_UNUSED(block); MI_UNUSED(p);   // nothing to do without guarded blocks
+}
+#endif
+
+static inline mi_block_t* mi_validate_block_from_ptr( const mi_page_t* page, void* p ) {   // get the block for a pointer that is expected to be a block start
+  mi_assert(_mi_page_ptr_unalign(page,p) == (mi_block_t*)p); // should never be an interior pointer
+  #if MI_SECURE > 0
+  // in secure mode we always unalign to guard against free-ing interior pointers
+  return _mi_page_ptr_unalign(page,p);
+  #else
+  MI_UNUSED(page);
+  return (mi_block_t*)p;   // otherwise the pointer is used as the block start as-is
+  #endif
+}
+
+
+// free a local pointer through the generic path  (page parameter comes first for better codegen)
+static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept {
+  mi_assert_internal(p!=NULL && page != NULL);
+  mi_block_t* const block = (mi_page_has_interior_pointers(page) ? _mi_page_ptr_unalign(page, p) : mi_validate_block_from_ptr(page,p));   // `p` may point inside a block: find the block start
+  mi_block_check_unguard(page, block, p);
+  mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */);
+}
+
+// free a pointer owned by another thread through the generic path (page parameter comes first for better codegen)
+static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept {
+  mi_assert_internal(p!=NULL && page != NULL);
+  mi_block_t* const block = (mi_page_has_interior_pointers(page) ? _mi_page_ptr_unalign(page, p) : mi_validate_block_from_ptr(page,p));   // `p` may point inside a block: find the block start
+  mi_block_check_unguard(page, block, p);
+  mi_free_block_mt(page, block);
+}
+
+// generic free (for runtime integration): `is_local` selects the thread-local or the multi-threaded generic path
+void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
+  if (is_local) mi_free_generic_local(page,p);
+           else mi_free_generic_mt(page,p);
+}
+
+
+// Get the page belonging to a pointer (`msg` is only used as a prefix for the debug error messages)
+// Does further checks in debug mode to see if this was a valid pointer.
+static inline mi_page_t* mi_validate_ptr_page(const void* p, const char* msg)
+{
+  MI_UNUSED_RELEASE(msg);
+  #if MI_DEBUG
+  if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) {   // not MI_INTPTR_SIZE aligned (tolerated when the guarded_precise option is on)
+    _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
+    return NULL;
+  }
+  mi_page_t* page = _mi_safe_ptr_page(p);
+  if (p != NULL && page == NULL) {   // a non-NULL pointer for which no page was found is reported
+    _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p);
+  }
+  return page;
+  #else
+  return _mi_ptr_page(p);   // non-debug: direct lookup without the checks above
+  #endif
+}
+
+// Free a block; if `usable` is not NULL it receives the usable block size of the page of `p` (it is left untouched when no page is found)
+// Fast path written carefully to prevent register spilling on the stack
+static mi_decl_forceinline void mi_free_ex(void* p, size_t* usable)  
+{
+  mi_page_t* const page = mi_validate_ptr_page(p,"mi_free");
+  if mi_unlikely(page==NULL) return;  // page will be NULL if p==NULL
+  mi_assert_internal(p!=NULL && page!=NULL);
+  if (usable!=NULL) { *usable = mi_page_usable_block_size(page); }
+
+  const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page));   // xor of our thread id with the page's combined thread-id/flags word
+  if mi_likely(xtid == 0) {                        // `tid == mi_page_thread_id(page) && mi_page_flags(page) == 0`
+    // thread-local, aligned, and not a full page
+    mi_block_t* const block = mi_validate_block_from_ptr(page,p);
+    mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
+  }
+  else if (xtid <= MI_PAGE_FLAG_MASK) {            // `tid == mi_page_thread_id(page) && mi_page_flags(page) != 0`
+    // page is local, but is full or contains (inner) aligned blocks; use generic path
+    mi_free_generic_local(page, p);
+  }
+  // free-ing in a page owned by a theap in another thread, or an abandoned page (not belonging to a theap)
+  else if ((xtid & MI_PAGE_FLAG_MASK) == 0) {      // `tid != mi_page_thread_id(page) && mi_page_flags(page) == 0`
+    // blocks are aligned (and not a full page); push on the thread_free list
+    mi_block_t* const block = mi_validate_block_from_ptr(page,p);
+    mi_free_block_mt(page,block);
+  }
+  else {                                           // remaining case: another thread (or abandoned) and page flags set
+    // page is full or contains (inner) aligned blocks; use generic multi-thread path
+    mi_free_generic_mt(page, p);
+  }
+}
+
+void mi_free(void* p) mi_attr_noexcept {   // free `p` (a NULL `p` finds no page and returns without doing anything)
+  mi_free_ex(p, NULL);                     // no usable size requested
+}
+
+void mi_ufree(void* p, size_t* usable) mi_attr_noexcept {   // free `p` and report the usable block size through `usable` (if not NULL)
+  mi_free_ex(p, usable);                                    // note: `*usable` is not written when no page is found for `p` (e.g. `p==NULL`)
+}
+
+// --------------------------------------------------------------------------------------------
+// `mi_free_try_collect_mt`: Potentially collect a page in a free in an abandoned page.
+// 1. if the page becomes empty, free it
+// 2. if it can be reclaimed, reclaim it in our theap
+// 3. if it went to < 7/8th used, re-abandon to be mapped (so it can be found by theaps looking for free pages)
+// --------------------------------------------------------------------------------------------
+
+// Helper for mi_free_try_collect_mt: free if the page has no more used blocks (this is updated by `_mi_page_free_collect(_partly)`); returns `true` if the page was freed
+static bool mi_abandoned_page_try_free(mi_page_t* page)
+{
+  if (!mi_page_all_free(page)) return false;   // still has used blocks: keep the page
+  // first remove it from the abandoned pages in the arena (if mapped, this might wait for any readers to finish)
+  _mi_arenas_page_unabandon(page,NULL);
+  _mi_arenas_page_free(page,NULL); // we can now free the page directly
+  return true;
+}
+
+// Helper for mi_free_try_collect_mt: try if we can reabandon a previously abandoned mostly full page to be mapped
+static bool mi_abandoned_page_try_reabandon_to_mapped(mi_page_t* page)
+{
+  // if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations
+  // We only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page
+  if (mi_page_is_mostly_used(page)) return false;   // not too full
+  if (page->memid.memkind != MI_MEM_ARENA || mi_page_is_abandoned_mapped(page)) return false;  // and not already mapped (or unmappable)
+
+  mi_assert(!mi_page_is_full(page));
+  return _mi_arenas_page_try_reabandon_to_mapped(page);
+}
+
+// Release ownership of a page. This may free or reabandon the page if other blocks are concurrently
+// freed in the meantime. Nothing is returned: the function simply returns early once the page was freed or reabandoned.
+// By passing the captured `expected_thread_free`, we can often avoid calling `_mi_page_free_collect`.
+static void mi_abandoned_page_unown_from_free(mi_page_t* page, mi_block_t* expected_thread_free) {
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(!mi_page_all_free(page));
+  // try to cas atomically the original free list (`mt_free`) back with the ownership cleared.
+  mi_thread_free_t tf_expect = mi_tf_create(expected_thread_free, true);
+  mi_thread_free_t tf_new    = mi_tf_create(expected_thread_free, false);
+  while mi_unlikely(!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_expect, tf_new)) {
+    mi_assert_internal(mi_tf_is_owned(tf_expect));
+    // while the xthread_free list is not empty..
+    while (mi_tf_block(tf_expect) != NULL) {
+      // if there were concurrent updates to the thread-free list, we retry to free or reabandon to mapped (if it became !mostly_used).
+      _mi_page_free_collect(page,false);  // update used count
+      if (mi_abandoned_page_try_free(page)) return;
+      if (mi_abandoned_page_try_reabandon_to_mapped(page)) return;
+      // otherwise continue un-owning
+      tf_expect = mi_atomic_load_relaxed(&page->xthread_free);
+    }
+    // and try again to release ownership
+    mi_assert_internal(mi_tf_block(tf_expect)==NULL);
+    tf_new = mi_tf_create(NULL, false);
+  }
+}
+
+static inline bool mi_page_queue_len_is_atmost( mi_theap_t* theap, size_t block_size, long atmost) {
+  if (atmost < 0) return false;
+  mi_page_queue_t* const pq = mi_page_queue(theap,block_size);
+  mi_assert_internal(pq!=NULL);
+  return (pq->count <= (size_t)atmost);
+}
+
+// Helper for mi_free_try_collect_mt:  try to reclaim the page for ourselves
+static mi_decl_noinline bool mi_abandoned_page_try_reclaim(mi_page_t* page, long reclaim_on_free) mi_attr_noexcept
+{
+  // note: reclaiming can improve benchmarks like `larson` or `rbtree-ck` a lot even in the single-threaded case,
+  // since free-ing from an owned page avoids atomic operations. However, if we reclaim too eagerly in
+  // a multi-threaded scenario we may start to hold on to too much memory and reduce reuse among threads.
+  // If the current theap is where the page originally came from, we reclaim much more eagerly while
+  // 'cross-thread' reclaiming on free is by default off (and we only 'reclaim' these by finding the abandoned
+  // pages when we allocate a fresh page).
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(!mi_page_all_free(page));
+  mi_assert_internal(page->block_size <= MI_SMALL_SIZE_MAX);
+  mi_assert_internal(reclaim_on_free >= 0);
+
+  // don't reclaim if this thread has just terminated, as we should
+  // not reinitialize the theap for this thread. (can happen due to thread-local destructors for example -- issue #944)
+  if (!_mi_thread_is_initialized()) return false;
+
+  // get our theap
+  mi_theap_t* const theap = _mi_page_associated_theap_peek(page);
+  if (theap==NULL || !theap->allow_page_reclaim) return false;
+  
+  // todo: cache `is_in_threadpool` and `exclusive_arena` directly in the theap for performance?
+  // set max_reclaim limit
+  long max_reclaim = 0;
+  if mi_likely(theap == page->theap) {  // did this page originate from the current theap? (and thus allocated from this thread)
+    // originating theap
+    max_reclaim = _mi_option_get_fast(theap->tld->is_in_threadpool ? mi_option_page_cross_thread_max_reclaim : mi_option_page_max_reclaim);
+  }
+  else if (reclaim_on_free == 1 &&               // if cross-thread is allowed
+            !theap->tld->is_in_threadpool &&      // and we are not part of a threadpool
+            !mi_page_is_mostly_used(page) &&     // and the page is not too full
+            _mi_arena_memid_is_suitable(page->memid, theap->heap->exclusive_arena)) {   // and it fits our memory
+    // across threads
+    max_reclaim = _mi_option_get_fast(mi_option_page_cross_thread_max_reclaim);
+  }
+
+  // are we within the reclaim limit?
+  if (max_reclaim >= 0 && !mi_page_queue_len_is_atmost(theap, page->block_size, max_reclaim)) {
+    return false;
+  }
+
+  // reclaim the page into this theap
+  // first remove it from the abandoned pages in the arena -- this might wait for any readers to finish
+  _mi_arenas_page_unabandon(page, theap);
+  _mi_theap_page_reclaim(theap, page);
+  mi_theap_stat_counter_increase(theap, pages_reclaim_on_free, 1);
+  return true;
+}
+
+
+// We freed a block in an abandoned page (that was not owned). Try to collect
+static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept
+{
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(mt_free != NULL);
+  // we own the page now, and it is safe to collect the thread atomic free list
+  if (page->block_size <= MI_SMALL_SIZE_MAX) {
+    // use the `_partly` version to avoid atomic operations since we already have the `mt_free` pointing into the thread free list
+    // (after this the `used` count might be too high (as some blocks may have been concurrently added to the thread free list and are yet uncounted).
+    //  however, if the page became completely free, the used count is guaranteed to be 0.)
+    mi_assert_internal(page->reserved>=16); // below this even one freed block goes from full to no longer mostly used.
+    _mi_page_free_collect_partly(page, mt_free);    
+  }
+  else {
+    // for larger blocks we use the regular collect 
+    _mi_page_free_collect(page,false /* no force */);
+    mt_free = NULL; // expected page->xthread_free value after collection
+  }
+  const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free);
+  #if MI_DEBUG > 1
+  if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); }
+  if (mi_page_is_full(page))      { mi_assert(mi_page_is_mostly_used(page)); }
+  #endif
+
+  // try to: 1. free it, 2. reclaim it, or 3. reabandon it to be mapped
+  if (mi_abandoned_page_try_free(page)) return;
+  if (page->block_size <= MI_SMALL_SIZE_MAX && reclaim_on_free >= 0) {  // early test for better codegen
+    if (mi_abandoned_page_try_reclaim(page, reclaim_on_free)) return;
+  }
+  if (mi_abandoned_page_try_reabandon_to_mapped(page)) return;
+  
+  // otherwise unown the page again
+  mi_abandoned_page_unown_from_free(page, mt_free);
+}
+
+
+// ------------------------------------------------------
+// Usable size
+// ------------------------------------------------------
+
+// Bytes available in a block
+static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* page, const void* p) mi_attr_noexcept {
+  const mi_block_t* block = _mi_page_ptr_unalign(page, p);
+  const size_t size = mi_page_usable_size_of(page, block);
+  const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block;
+  mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
+  const size_t aligned_size = (size - adjust);
+  #if MI_GUARDED
+  if (mi_block_ptr_is_guarded(block, p)) {
+    return aligned_size - _mi_os_page_size();
+  }
+  #endif
+  return aligned_size;
+}
+
+static inline size_t _mi_usable_size(const void* p, const mi_page_t* page) mi_attr_noexcept {
+  if mi_unlikely(page==NULL) return 0;
+  if mi_likely(!mi_page_has_interior_pointers(page)) {
+    const mi_block_t* block = (const mi_block_t*)p;
+    return mi_page_usable_size_of(page, block);
+  }
+  else {
+    // split out to separate routine for improved code generation
+    return mi_page_usable_aligned_size_of(page, p);
+  }
+}
+
+mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept {
+  const mi_page_t* const page = mi_validate_ptr_page(p,"mi_usable_size");
+  return _mi_usable_size(p,page);
+}
+
+
+// ------------------------------------------------------
+// Free variants
+// ------------------------------------------------------
+
+void mi_free_size(void* p, size_t size) mi_attr_noexcept {
+  MI_UNUSED_RELEASE(size);
+  #if MI_DEBUG
+  const mi_page_t* const page = mi_validate_ptr_page(p,"mi_free_size");  
+  const size_t available = _mi_usable_size(p,page);
+  mi_assert(p == NULL || size <= available || available == 0 /* invalid pointer */ );
+  #endif
+  mi_free(p);
+}
+
+void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept {
+  MI_UNUSED_RELEASE(alignment);
+  mi_assert(((uintptr_t)p % alignment) == 0);
+  mi_free_size(p,size);
+}
+
+void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept {
+  MI_UNUSED_RELEASE(alignment);
+  mi_assert(((uintptr_t)p % alignment) == 0);
+  mi_free(p);
+}
+
+
+// ------------------------------------------------------
+// Check for double free in secure and debug mode
+// This is somewhat expensive so only enabled for secure mode 4 (or higher) or in debug builds (and only with an encoded free list)
+// ------------------------------------------------------
+
+#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0))
+// linear check if the free list contains a specific element
+static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) {
+  while (list != NULL) {
+    if (elem==list) return true;
+    list = mi_block_next(page, list);
+  }
+  return false;
+}
+
+static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) {
+  // The decoded value is in the same page (or NULL).
+  // Walk the free lists to verify positively if it is already freed
+  if (mi_list_contains(page, page->free, block) ||
+      mi_list_contains(page, page->local_free, block) ||
+      mi_list_contains(page, mi_page_thread_free(page), block))
+  {
+    _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page));
+    return true;
+  }
+  return false;
+}
+
+#define mi_track_page(page,access)  { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); }
+
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+  bool is_double_free = false;
+  mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field
+  if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 &&  // quick check: aligned pointer?
+      (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL?
+  {
+    // Suspicious: decoded value a in block is in the same page (or NULL) -- maybe a double free?
+    // (continue in separate function to improve code generation)
+    is_double_free = mi_check_is_double_freex(page, block);
+  }
+  return is_double_free;
+}
+#else
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+  return false;
+}
+#endif
+
+
+// ---------------------------------------------------------------------------
+// Check for heap block overflow by setting up padding at the end of the block
+// ---------------------------------------------------------------------------
+
+#if MI_PADDING // && !MI_TRACK_ENABLED
+static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) {
+  *bsize = mi_page_usable_block_size(page);
+  const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize);
+  mi_track_mem_defined(padding,sizeof(mi_padding_t));
+  *delta = padding->delta;
+  uint32_t canary = padding->canary;
+  uintptr_t keys[2];
+  keys[0] = page->keys[0];
+  keys[1] = page->keys[1];
+  bool ok = (mi_ptr_encode_canary(page,block,keys) == canary && *delta <= *bsize);
+  mi_track_mem_noaccess(padding,sizeof(mi_padding_t));
+  return ok;
+}
+
+// Return the exact usable size of a block.
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  mi_assert_internal(ok); mi_assert_internal(delta <= bsize);
+  return (ok ? bsize - delta : 0);
+}
+
+// When a non-thread-local block is freed, it becomes part of the thread delayed free
+// list that is freed later by the owning theap. If the exact usable size is too small to
+// contain the pointer for the delayed list, then shrink the padding (by decreasing delta)
+// so it will later not trigger an overflow error in `mi_free_block`.
+void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  mi_assert_internal(ok);
+  if (!ok || (bsize - delta) >= min_size) return;  // usually already enough space
+  mi_assert_internal(bsize >= min_size);
+  if (bsize < min_size) return;  // should never happen
+  size_t new_delta = (bsize - min_size);
+  mi_assert_internal(new_delta < bsize);
+  mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize);
+  mi_track_mem_defined(padding,sizeof(mi_padding_t));
+  padding->delta = (uint32_t)new_delta;
+  mi_track_mem_noaccess(padding,sizeof(mi_padding_t));
+}
+#else
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(block);
+  return mi_page_usable_block_size(page);
+}
+
+void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+  MI_UNUSED(min_size);
+}
+#endif
+
+#if MI_PADDING && MI_PADDING_CHECK
+
+static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  *size = *wrong = bsize;
+  if (!ok) return false;
+  mi_assert_internal(bsize >= delta);
+  *size = bsize - delta;
+  if (!mi_page_is_huge(page)) {
+    uint8_t* fill = (uint8_t*)block + bsize - delta;
+    const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes
+    mi_track_mem_defined(fill, maxpad);
+    for (size_t i = 0; i < maxpad; i++) {
+      if (fill[i] != MI_DEBUG_PADDING) {
+        *wrong = bsize - delta + i;
+        ok = false;
+        break;
+      }
+    }
+    mi_track_mem_noaccess(fill, maxpad);
+  }
+  return ok;
+}
+
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+  size_t size;
+  size_t wrong;
+  if (!mi_verify_padding(page,block,&size,&wrong)) {
+    _mi_error_message(EFAULT, "buffer overflow in theap block %p of size %zu: write after %zu bytes\n", block, size, wrong );
+  }
+}
+
+#else
+
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+}
+
+#endif
+
+// only maintain stats for smaller objects if requested
+#if (MI_STAT>0)
+static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(block);
+  mi_theap_t* const theap = _mi_theap_default();
+  if (!mi_theap_is_initialized(theap)) return; // (for now) skip statistics if free'd after thread_done was called (usually a thread cleanup call by the OS)
+
+  const size_t bsize = mi_page_usable_block_size(page);
+  // #if (MI_STAT>1)
+  // const size_t usize = mi_page_usable_size_of(page, block);
+  // mi_theap_stat_decrease(theap, malloc_requested, usize);
+  // #endif
+  if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
+    mi_theap_stat_decrease(theap, malloc_normal, bsize);
+    #if (MI_STAT > 1)
+    mi_theap_stat_decrease(theap, malloc_bins[_mi_bin(bsize)], 1);
+    #endif
+  }
+  else {
+    const size_t bpsize = mi_page_block_size(page);  // match stat in page.c:mi_huge_page_alloc
+    mi_theap_stat_decrease(theap, malloc_huge, bpsize);
+  }
+}
+#else
+static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page); MI_UNUSED(block);  // no-op variant used when `MI_STAT>0` does not hold; kept `static` like the statistics-enabled variant
+}
+#endif
+
+
+// Remove guard page when building with MI_GUARDED
+#if MI_GUARDED
+static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) {
+  MI_UNUSED(p);
+  mi_assert_internal(mi_block_ptr_is_guarded(block, p));
+  mi_assert_internal(mi_page_has_interior_pointers(page));
+  mi_assert_internal((uint8_t*)p - (uint8_t*)block >= (ptrdiff_t)sizeof(mi_block_t));
+  mi_assert_internal(block->next == MI_BLOCK_TAG_GUARDED);
+
+  const size_t bsize = mi_page_block_size(page);
+  const size_t psize = _mi_os_page_size();
+  mi_assert_internal(bsize > psize);
+  mi_assert_internal(!page->memid.is_pinned);
+  void* gpage = (uint8_t*)block + bsize - psize;
+  mi_assert_internal(_mi_is_aligned(gpage, psize));
+  _mi_os_unprotect(gpage, psize);
+}
+#endif
diff --git a/ext/src/mimalloc/src/heap.c b/ext/src/mimalloc/src/heap.c
index 816d961ae9..080cfd1270 100644
--- a/ext/src/mimalloc/src/heap.c
+++ b/ext/src/mimalloc/src/heap.c
@@ -1,580 +1,219 @@
 /*----------------------------------------------------------------------------
-Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"  // _mi_theap_default
 
-#include <string.h>  // memset, memcpy
-
-#if defined(_MSC_VER) && (_MSC_VER < 1920)
-#pragma warning(disable:4204)  // non-constant aggregate initializer
-#endif
 
 /* -----------------------------------------------------------
-  Helpers
+  Heap's
 ----------------------------------------------------------- */
 
-// return `true` if ok, `false` to break
-typedef bool (heap_page_visitor_fun)(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2);
-
-// Visit all pages in a heap; returns `false` if break was called.
-static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void* arg1, void* arg2)
-{
-  if (heap==NULL || heap->page_count==0) return 0;
-
-  // visit all pages
-  #if MI_DEBUG>1
-  size_t total = heap->page_count;
-  #endif
-  size_t count = 0;
-  for (size_t i = 0; i <= MI_BIN_FULL; i++) {
-    mi_page_queue_t* pq = &heap->pages[i];
-    mi_page_t* page = pq->first;
-    while(page != NULL) {
-      mi_page_t* next = page->next; // save next in case the page gets removed from the queue
-      mi_assert_internal(mi_page_heap(page) == heap);
-      count++;
-      if (!fn(heap, pq, page, arg1, arg2)) return false;
-      page = next; // and continue
-    }
-  }
-  mi_assert_internal(count == total);
-  return true;
+mi_theap_t* mi_heap_theap(mi_heap_t* heap) {
+  return _mi_heap_theap(heap);
 }
 
-
-#if MI_DEBUG>=2
-static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
-  MI_UNUSED(arg1);
-  MI_UNUSED(arg2);
-  MI_UNUSED(pq);
-  mi_assert_internal(mi_page_heap(page) == heap);
-  mi_segment_t* segment = _mi_page_segment(page);
-  mi_assert_internal(segment->thread_id == heap->thread_id);
-  mi_assert_expensive(_mi_page_is_valid(page));
-  return true;
-}
-#endif
-#if MI_DEBUG>=3
-static bool mi_heap_is_valid(mi_heap_t* heap) {
-  mi_assert_internal(heap!=NULL);
-  mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL);
-  return true;
+void mi_heap_set_numa_affinity(mi_heap_t* heap, int numa_node) {
+  if (heap==NULL) { heap = mi_heap_main(); }
+  heap->numa_node = (numa_node < 0 ? -1 : numa_node % _mi_os_numa_node_count());
 }
-#endif
-
-
-
-
-/* -----------------------------------------------------------
-  "Collect" pages by migrating `local_free` and `thread_free`
-  lists and freeing empty pages. This is done when a thread
-  stops (and in that case abandons pages if there are still
-  blocks alive)
------------------------------------------------------------ */
-
-typedef enum mi_collect_e {
-  MI_NORMAL,
-  MI_FORCE,
-  MI_ABANDON
-} mi_collect_t;
-
 
-static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) {
-  MI_UNUSED(arg2);
-  MI_UNUSED(heap);
-  mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL));
-  mi_collect_t collect = *((mi_collect_t*)arg_collect);
-  _mi_page_free_collect(page, collect >= MI_FORCE);
-  if (mi_page_all_free(page)) {
-    // no more used blocks, free the page. 
-    // note: this will free retired pages as well.
-    _mi_page_free(page, pq, collect >= MI_FORCE);
-  }
-  else if (collect == MI_ABANDON) {
-    // still used blocks but the thread is done; abandon the page
-    _mi_page_abandon(page, pq);
-  }
-  return true; // don't break
+void mi_heap_stats_merge_to_subproc(mi_heap_t* heap) {
+  if (heap==NULL) { heap = mi_heap_main(); }
+  _mi_stats_merge_into(&heap->subproc->stats, &heap->stats);
 }
 
-static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
-  MI_UNUSED(arg1);
-  MI_UNUSED(arg2);
-  MI_UNUSED(heap);
-  MI_UNUSED(pq);
-  _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
-  return true; // don't break
+void mi_heap_stats_merge_to_main(mi_heap_t* heap) {
+  if (heap==NULL) return;
+  _mi_stats_merge_into(&mi_heap_main()->stats, &heap->stats);
 }
 
-static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
+static mi_theap_t* mi_heap_init_theap(const mi_heap_t* const_heap)
 {
-  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
-
-  const bool force = collect >= MI_FORCE;  
-  _mi_deferred_free(heap, force);
-
-  // note: never reclaim on collect but leave it to threads that need storage to reclaim 
-  const bool force_main = 
-    #ifdef NDEBUG
-      collect == MI_FORCE
-    #else
-      collect >= MI_FORCE
-    #endif
-      && _mi_is_main_thread() && mi_heap_is_backing(heap) && !heap->no_reclaim;
+  mi_heap_t* heap = (mi_heap_t*)const_heap;
+  mi_assert_internal(heap!=NULL);
 
-  if (force_main) {
-    // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
-    // if all memory is freed by now, all segments should be freed.
-    _mi_abandoned_reclaim_all(heap, &heap->tld->segments);
-  }
-  
-  // if abandoning, mark all pages to no longer add to delayed_free
-  if (collect == MI_ABANDON) {
-    mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
+  if (_mi_is_heap_main(heap)) {
+    // this can be called if the (main) thread is not yet initialized (as no allocation happened)
+    mi_thread_init();
+    mi_theap_t* theap = _mi_heap_theap(heap);
+    mi_assert_internal(theap!=NULL);
+    return theap;
+  }
+
+  // otherwise initialize the theap for this heap
+  // get the thread local
+  mi_theap_t* theap = NULL;
+  if (heap->theap==0) {
+    // initialize thread locals
+    heap->theap = _mi_thread_local_create();
+    if (heap->theap==0) {
+      _mi_error_message(EFAULT, "unable to dynamically create a thread local for a heap\n");
+      return NULL;
+    }
   }
-
-  // free thread delayed blocks.
-  // (if abandoning, after this there are no more thread-delayed references into the pages.)
-  _mi_heap_delayed_free(heap);
-
-  // collect retired pages
-  _mi_heap_collect_retired(heap, force);
-
-  // collect all pages owned by this thread
-  mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
-  mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
-
-  // collect abandoned segments (in particular, decommit expired parts of segments in the abandoned segment list)
-  // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
-  _mi_abandoned_collect(heap, collect == MI_FORCE /* force? */, &heap->tld->segments);
-
-  // collect segment local caches
-  if (force) {
-    _mi_segment_thread_collect(&heap->tld->segments);
+  else {
+    // get current thread local
+    theap = (mi_theap_t*)_mi_thread_local_get(heap->theap);
   }
 
-  // decommit in global segment caches
-  // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
-  _mi_segment_cache_collect( collect == MI_FORCE, &heap->tld->os);  
-
-  // collect regions on program-exit (or shared library unload)
-  if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) {
-    //_mi_mem_collect(&heap->tld->os);
+  // create a fresh theap?
+  if (theap==NULL) {
+    theap = _mi_theap_create(heap, _mi_theap_default_safe()->tld);
+    if (theap==NULL) {
+      _mi_error_message(EFAULT, "unable to allocate memory for a thread local heap\n");
+      return NULL;
+    }
+    if (!_mi_thread_local_set(heap->theap, theap)) {
+      _mi_error_message(EFAULT, "unable to allocate memory for a thread local storage\n");
+      return NULL;
+    }
   }
+  return theap;
 }
 
-void _mi_heap_collect_abandon(mi_heap_t* heap) {
-  mi_heap_collect_ex(heap, MI_ABANDON);
-}
 
-void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept {
-  mi_heap_collect_ex(heap, (force ? MI_FORCE : MI_NORMAL));
-}
-
-void mi_collect(bool force) mi_attr_noexcept {
-  mi_heap_collect(mi_get_default_heap(), force);
+// get the theap for a heap without initializing (and return NULL in that case)
+mi_theap_t* _mi_heap_theap_get_peek(const mi_heap_t* heap) {
+  if (heap==NULL || _mi_is_heap_main(heap)) {
+    return __mi_theap_main;  // don't call _mi_theap_main as it may still be NULL
+  }
+  else {
+    return (mi_theap_t*)_mi_thread_local_get(heap->theap);
+  }
 }
 
-
-/* -----------------------------------------------------------
-  Heap new
------------------------------------------------------------ */
-
-mi_heap_t* mi_heap_get_default(void) {
-  mi_thread_init();
-  return mi_get_default_heap();
+// get (and possibly create) the theap belonging to a heap
+mi_theap_t* _mi_heap_theap_get_or_init(const mi_heap_t* heap)
+{
+  mi_theap_t* theap = _mi_heap_theap_peek(heap);
+  if mi_unlikely(theap==NULL) {
+    theap = mi_heap_init_theap(heap);
+    if (theap==NULL) { return (mi_theap_t*)&_mi_theap_empty_wrong; }  // this will return NULL from page.c:_mi_malloc_generic
+  }
+  _mi_theap_cached_set(theap);
+  return theap;
 }
 
-mi_heap_t* mi_heap_get_backing(void) {
-  mi_heap_t* heap = mi_heap_get_default();
-  mi_assert_internal(heap!=NULL);
-  mi_heap_t* bheap = heap->tld->heap_backing;
-  mi_assert_internal(bheap!=NULL);
-  mi_assert_internal(bheap->thread_id == _mi_thread_id());
-  return bheap;
-}
 
-mi_heap_t* mi_heap_new(void) {
-  mi_heap_t* bheap = mi_heap_get_backing();
-  mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);  // todo: OS allocate in secure mode?
+mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t exclusive_arena_id) {
+  // always allocate heap data in the (subprocess) main heap
+  mi_heap_t* heap_main = mi_heap_main();
+  // todo: allocate heap data in the exclusive arena ?
+  mi_heap_t* heap = (mi_heap_t*)mi_heap_zalloc( heap_main, sizeof(mi_heap_t) );
   if (heap==NULL) return NULL;
-  _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
-  heap->tld = bheap->tld;
-  heap->thread_id = _mi_thread_id();
-  _mi_random_split(&bheap->random, &heap->random);
-  heap->cookie  = _mi_heap_random_next(heap) | 1;
-  heap->keys[0] = _mi_heap_random_next(heap);
-  heap->keys[1] = _mi_heap_random_next(heap);
-  heap->no_reclaim = true;  // don't reclaim abandoned pages or otherwise destroy is unsafe
-  // push on the thread local heaps list
-  heap->next = heap->tld->heaps;
-  heap->tld->heaps = heap;
-  return heap;
-}
 
-uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
-  return _mi_random_next(&heap->random);
+  // init fields
+  heap->subproc = heap_main->subproc;
+  heap->heap_seq = mi_atomic_increment_relaxed(&heap_main->subproc->heap_total_count);
+  heap->exclusive_arena = _mi_arena_from_id(exclusive_arena_id);
+  heap->numa_node = -1; // no initial affinity
+
+  mi_lock_init(&heap->theaps_lock);
+  mi_lock_init(&heap->os_abandoned_pages_lock);
+  mi_lock_init(&heap->arena_pages_lock);
+
+  // push onto the subproc heaps
+  mi_lock(&heap->subproc->heaps_lock) {
+    mi_heap_t* head = heap->subproc->heaps;
+    heap->prev = NULL;
+    heap->next = head;
+    if (head!=NULL) { head->prev = heap;  }
+    heap->subproc->heaps = heap;
+  }
+  mi_atomic_increment_relaxed(&heap_main->subproc->heap_count);
+  mi_subproc_stat_increase(heap_main->subproc, heaps, 1);
+  return heap;
 }
 
-// zero out the page queues
-static void mi_heap_reset_pages(mi_heap_t* heap) {
-  mi_assert_internal(heap != NULL);
-  mi_assert_internal(mi_heap_is_initialized(heap));
-  // TODO: copy full empty heap instead?
-  memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
-#ifdef MI_MEDIUM_DIRECT
-  memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium));
-#endif
-  _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
-  heap->thread_delayed_free = NULL;
-  heap->page_count = 0;
+mi_heap_t* mi_heap_new(void) {
+  return mi_heap_new_in_arena(0);
 }
 
-// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
+// free the heap resources (assuming the pages are already moved/destroyed): theaps, arena page infos, subproc list entry, thread local, and locks
 static void mi_heap_free(mi_heap_t* heap) {
-  mi_assert(heap != NULL);
-  mi_assert_internal(mi_heap_is_initialized(heap));
-  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
-  if (mi_heap_is_backing(heap)) return; // dont free the backing heap
-
-  // reset default
-  if (mi_heap_is_default(heap)) {
-    _mi_heap_set_default_direct(heap->tld->heap_backing);
+  mi_assert_internal(heap!=NULL && !_mi_is_heap_main(heap));
+
+  // free all theaps belonging to this heap
+  mi_theap_t* theap = NULL;
+  mi_lock(&heap->theaps_lock) { theap = heap->theaps; }
+  while(theap != NULL) {
+    mi_theap_t* next = NULL;
+    mi_lock(&heap->theaps_lock) { next = theap->hnext; }
+    _mi_theap_free(theap);
+    theap = next;
+  }
+  mi_lock(&heap->theaps_lock) { theap = heap->theaps; }
+  mi_assert_internal(theap==NULL);
+
+  // free all arena pages infos
+  mi_lock(&heap->arena_pages_lock) {
+    for (size_t i = 0; i < MI_MAX_ARENAS; i++) {
+      mi_arena_pages_t* arena_pages = mi_atomic_load_ptr_relaxed(mi_arena_pages_t, &heap->arena_pages[i]);
+      if (arena_pages!=NULL) {
+        mi_atomic_store_ptr_relaxed(mi_arena_pages_t, &heap->arena_pages[i], NULL);
+        mi_free(arena_pages);
+      }
+    }
   }
 
-  // remove ourselves from the thread local heaps list
-  // linear search but we expect the number of heaps to be relatively small
-  mi_heap_t* prev = NULL;
-  mi_heap_t* curr = heap->tld->heaps; 
-  while (curr != heap && curr != NULL) {
-    prev = curr;
-    curr = curr->next;
-  }
-  mi_assert_internal(curr == heap);
-  if (curr == heap) {
-    if (prev != NULL) { prev->next = heap->next; }
-                 else { heap->tld->heaps = heap->next; }
+  // remove the heap from the subproc
+  mi_heap_stats_merge_to_main(heap);
+  mi_atomic_decrement_relaxed(&heap->subproc->heap_count);
+  mi_subproc_stat_decrease(heap->subproc, heaps, 1);
+  mi_lock(&heap->subproc->heaps_lock) {
+    if (heap->next!=NULL) { heap->next->prev = heap->prev; }
+    if (heap->prev!=NULL) { heap->prev->next = heap->next; }
+                     else { heap->subproc->heaps = heap->next; }
   }
-  mi_assert_internal(heap->tld->heaps != NULL);
 
-  // and free the used memory
+  _mi_thread_local_free(heap->theap);
+  mi_lock_done(&heap->theaps_lock);
+  mi_lock_done(&heap->os_abandoned_pages_lock); mi_lock_done(&heap->arena_pages_lock);  // release every lock that `mi_heap_new_in_arena` set up with `mi_lock_init`
   mi_free(heap);
 }
 
-
-/* -----------------------------------------------------------
-  Heap destroy
------------------------------------------------------------ */
-
-static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
-  MI_UNUSED(arg1);
-  MI_UNUSED(arg2);
-  MI_UNUSED(heap);
-  MI_UNUSED(pq);
-
-  // ensure no more thread_delayed_free will be added
-  _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
-
-  // stats
-  const size_t bsize = mi_page_block_size(page);
-  if (bsize > MI_MEDIUM_OBJ_SIZE_MAX) {
-    if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
-      mi_heap_stat_decrease(heap, large, bsize);
-    }
-    else {
-      mi_heap_stat_decrease(heap, huge, bsize);
-    }
+void mi_heap_delete(mi_heap_t* heap) {
+  if (heap==NULL) return;
+  if (_mi_is_heap_main(heap)) {
+    _mi_warning_message("cannot delete the main heap\n");
+    return;
   }
-#if (MI_STAT)
-  _mi_page_free_collect(page, false);  // update used count
-  const size_t inuse = page->used;
-  if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
-    mi_heap_stat_decrease(heap, normal, bsize * inuse);
-#if (MI_STAT>1)
-    mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse);
-#endif
-  }
-  mi_heap_stat_decrease(heap, malloc, bsize * inuse);  // todo: off for aligned blocks...
-#endif
-
-  /// pretend it is all free now
-  mi_assert_internal(mi_page_thread_free(page) == NULL);
-  page->used = 0;
-
-  // and free the page
-  // mi_page_free(page,false);
-  page->next = NULL;
-  page->prev = NULL;
-  _mi_segment_page_free(page,false /* no force? */, &heap->tld->segments);
-
-  return true; // keep going
-}
-
-void _mi_heap_destroy_pages(mi_heap_t* heap) {
-  mi_heap_visit_pages(heap, &_mi_heap_page_destroy, NULL, NULL);
-  mi_heap_reset_pages(heap);
+  _mi_heap_move_pages(heap, mi_heap_main());
+  mi_heap_free(heap);
 }
 
 void mi_heap_destroy(mi_heap_t* heap) {
-  mi_assert(heap != NULL);
-  mi_assert(mi_heap_is_initialized(heap));
-  mi_assert(heap->no_reclaim);
-  mi_assert_expensive(mi_heap_is_valid(heap));
-  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
-  if (!heap->no_reclaim) {
-    // don't free in case it may contain reclaimed pages
-    mi_heap_delete(heap);
-  }
-  else {
-    // free all pages
-    _mi_heap_destroy_pages(heap);
-    mi_heap_free(heap);
+  if (heap==NULL) return;
+  if (_mi_is_heap_main(heap)) {
+    _mi_warning_message("cannot destroy the main heap\n");
+    return;
   }
-}
-
-
-
-/* -----------------------------------------------------------
-  Safe Heap delete
------------------------------------------------------------ */
-
-// Transfer the pages from one heap to the other
-static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
-  mi_assert_internal(heap!=NULL);
-  if (from==NULL || from->page_count == 0) return;
-
-  // reduce the size of the delayed frees
-  _mi_heap_delayed_free(from);
-  
-  // transfer all pages by appending the queues; this will set a new heap field 
-  // so threads may do delayed frees in either heap for a while.
-  // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state
-  // so after this only the new heap will get delayed frees
-  for (size_t i = 0; i <= MI_BIN_FULL; i++) {
-    mi_page_queue_t* pq = &heap->pages[i];
-    mi_page_queue_t* append = &from->pages[i];
-    size_t pcount = _mi_page_queue_append(heap, pq, append);
-    heap->page_count += pcount;
-    from->page_count -= pcount;
-  }
-  mi_assert_internal(from->page_count == 0);
-
-  // and do outstanding delayed frees in the `from` heap  
-  // note: be careful here as the `heap` field in all those pages no longer point to `from`,
-  // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a 
-  // the regular `_mi_free_delayed_block` which is safe.
-  _mi_heap_delayed_free(from);  
-  #if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353
-  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL);
-  #endif
-
-  // and reset the `from` heap
-  mi_heap_reset_pages(from);  
-}
-
-// Safe delete a heap without freeing any still allocated blocks in that heap.
-void mi_heap_delete(mi_heap_t* heap)
-{
-  mi_assert(heap != NULL);
-  mi_assert(mi_heap_is_initialized(heap));
-  mi_assert_expensive(mi_heap_is_valid(heap));
-  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
-
-  if (!mi_heap_is_backing(heap)) {
-    // tranfer still used pages to the backing heap
-    mi_heap_absorb(heap->tld->heap_backing, heap);
-  }
-  else {
-    // the backing heap abandons its pages
-    _mi_heap_collect_abandon(heap);
-  }
-  mi_assert_internal(heap->page_count==0);
+  _mi_heap_destroy_pages(heap);
   mi_heap_free(heap);
 }
 
-mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
-  mi_assert(heap != NULL);
-  mi_assert(mi_heap_is_initialized(heap));
-  if (heap==NULL || !mi_heap_is_initialized(heap)) return NULL;
-  mi_assert_expensive(mi_heap_is_valid(heap));
-  mi_heap_t* old = mi_get_default_heap();
-  _mi_heap_set_default_direct(heap);
-  return old;
-}
-
-
-
-
-/* -----------------------------------------------------------
-  Analysis
------------------------------------------------------------ */
-
-// static since it is not thread safe to access heaps from other threads.
-static mi_heap_t* mi_heap_of_block(const void* p) {
-  if (p == NULL) return NULL;
-  mi_segment_t* segment = _mi_ptr_segment(p);
-  bool valid = (_mi_ptr_cookie(segment) == segment->cookie);
-  mi_assert_internal(valid);
-  if (mi_unlikely(!valid)) return NULL;
-  return mi_page_heap(_mi_segment_page_of(segment,p));
-}
-
-bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
-  mi_assert(heap != NULL);
-  if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
-  return (heap == mi_heap_of_block(p));
+mi_heap_t* mi_heap_of(const void* p) {
+  // look up the page for `p`; when there is none, no heap is reported
+  mi_page_t* const owner = _mi_safe_ptr_page(p);
+  return (owner == NULL ? NULL : mi_page_heap(owner));
 }
 
-
-static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) {
-  MI_UNUSED(heap);
-  MI_UNUSED(pq);
-  bool* found = (bool*)vfound;
-  mi_segment_t* segment = _mi_page_segment(page);
-  void* start = _mi_page_start(segment, page, NULL);
-  void* end   = (uint8_t*)start + (page->capacity * mi_page_block_size(page));
-  *found = (p >= start && p < end);
-  return (!*found); // continue if not found
+bool mi_any_heap_contains(const void* p) {
+  return (NULL != mi_heap_of(p));  // true exactly when `mi_heap_of` finds a heap for `p`
 }
 
-bool mi_heap_check_owned(mi_heap_t* heap, const void* p) {
-  mi_assert(heap != NULL);
-  if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
-  if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false;  // only aligned pointers
-  bool found = false;
-  mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found);
-  return found;
+bool mi_heap_contains(const mi_heap_t* heap, const void* p) {
+  const mi_heap_t* const target = (heap != NULL ? heap : mi_heap_main());  // a NULL `heap` stands for the main heap
+  return (mi_heap_of(p) == target);
 }
 
+// deprecated
 bool mi_check_owned(const void* p) {
-  return mi_heap_check_owned(mi_get_default_heap(), p);
-}
-
-/* -----------------------------------------------------------
-  Visit all heap blocks and areas
-  Todo: enable visiting abandoned pages, and
-        enable visiting all blocks of all heaps across threads
------------------------------------------------------------ */
-
-// Separate struct to keep `mi_page_t` out of the public interface
-typedef struct mi_heap_area_ex_s {
-  mi_heap_area_t area;
-  mi_page_t*     page;
-} mi_heap_area_ex_t;
-
-static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) {
-  mi_assert(xarea != NULL);
-  if (xarea==NULL) return true;
-  const mi_heap_area_t* area = &xarea->area;
-  mi_page_t* page = xarea->page;
-  mi_assert(page != NULL);
-  if (page == NULL) return true;
-
-  _mi_page_free_collect(page,true);
-  mi_assert_internal(page->local_free == NULL);
-  if (page->used == 0) return true;
-
-  const size_t bsize = mi_page_block_size(page);
-  const size_t ubsize = mi_page_usable_block_size(page); // without padding
-  size_t   psize;
-  uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize);
-
-  if (page->capacity == 1) {
-    // optimize page with one block
-    mi_assert_internal(page->used == 1 && page->free == NULL);
-    return visitor(mi_page_heap(page), area, pstart, ubsize, arg);
-  }
-
-  // create a bitmap of free blocks.
-  #define MI_MAX_BLOCKS   (MI_SMALL_PAGE_SIZE / sizeof(void*))
-  uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)];
-  memset(free_map, 0, sizeof(free_map));
-
-  size_t free_count = 0;
-  for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
-    free_count++;
-    mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize));
-    size_t offset = (uint8_t*)block - pstart;
-    mi_assert_internal(offset % bsize == 0);
-    size_t blockidx = offset / bsize;  // Todo: avoid division?
-    mi_assert_internal( blockidx < MI_MAX_BLOCKS);
-    size_t bitidx = (blockidx / sizeof(uintptr_t));
-    size_t bit = blockidx - (bitidx * sizeof(uintptr_t));
-    free_map[bitidx] |= ((uintptr_t)1 << bit);
-  }
-  mi_assert_internal(page->capacity == (free_count + page->used));
-
-  // walk through all blocks skipping the free ones
-  size_t used_count = 0;
-  for (size_t i = 0; i < page->capacity; i++) {
-    size_t bitidx = (i / sizeof(uintptr_t));
-    size_t bit = i - (bitidx * sizeof(uintptr_t));
-    uintptr_t m = free_map[bitidx];
-    if (bit == 0 && m == UINTPTR_MAX) {
-      i += (sizeof(uintptr_t) - 1); // skip a run of free blocks
-    }
-    else if ((m & ((uintptr_t)1 << bit)) == 0) {
-      used_count++;
-      uint8_t* block = pstart + (i * bsize);
-      if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false;
-    }
-  }
-  mi_assert_internal(page->used == used_count);
-  return true;
-}
-
-typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg);
-
-
-static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) {
-  MI_UNUSED(heap);
-  MI_UNUSED(pq);
-  mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun;
-  mi_heap_area_ex_t xarea;
-  const size_t bsize = mi_page_block_size(page);
-  const size_t ubsize = mi_page_usable_block_size(page);
-  xarea.page = page;
-  xarea.area.reserved = page->reserved * bsize;
-  xarea.area.committed = page->capacity * bsize;
-  xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL);
-  xarea.area.used = page->used * bsize;
-  xarea.area.block_size = ubsize;
-  xarea.area.full_block_size = bsize;
-  return fun(heap, &xarea, arg);
-}
-
-// Visit all heap pages as areas
-static bool mi_heap_visit_areas(const mi_heap_t* heap, mi_heap_area_visit_fun* visitor, void* arg) {
-  if (visitor == NULL) return false;
-  return mi_heap_visit_pages((mi_heap_t*)heap, &mi_heap_visit_areas_page, (void*)(visitor), arg); // note: function pointer to void* :-{
-}
-
-// Just to pass arguments
-typedef struct mi_visit_blocks_args_s {
-  bool  visit_blocks;
-  mi_block_visit_fun* visitor;
-  void* arg;
-} mi_visit_blocks_args_t;
-
-static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* xarea, void* arg) {
-  mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg;
-  if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false;
-  if (args->visit_blocks) {
-    return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg);
-  }
-  else {
-    return true;
-  }
-}
-
-// Visit all blocks in a heap
-bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
-  mi_visit_blocks_args_t args = { visit_blocks, visitor, arg };
-  return mi_heap_visit_areas(heap, &mi_heap_area_visitor, &args);
+  return mi_any_heap_contains(p);
 }
diff --git a/ext/src/mimalloc/src/init.c b/ext/src/mimalloc/src/init.c
index 19124afef9..3161450fdf 100644
--- a/ext/src/mimalloc/src/init.c
+++ b/ext/src/mimalloc/src/init.c
@@ -1,36 +1,40 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"
 
 #include <string.h>  // memcpy, memset
 #include <stdlib.h>  // atexit
 
+#define MI_MEMID_INIT(kind)   {{{NULL,0}}, kind, true /* pinned */, true /* committed */, false /* zero */ }
+#define MI_MEMID_STATIC       MI_MEMID_INIT(MI_MEM_STATIC)
+
 // Empty page used to initialize the small free pages array
 const mi_page_t _mi_page_empty = {
-  0, false, false, false, false,
-  0,       // capacity
-  0,       // reserved capacity
-  { 0 },   // flags
-  false,   // is_zero
-  0,       // retire_expire
-  NULL,    // free
-  #if MI_ENCODE_FREELIST
-  { 0, 0 },
-  #endif
-  0,       // used
-  0,       // xblock_size
-  NULL,    // local_free
-  MI_ATOMIC_VAR_INIT(0), // xthread_free
-  MI_ATOMIC_VAR_INIT(0), // xheap
-  NULL, NULL
-  #if MI_INTPTR_SIZE==8
-  , { 0 }  // padding
+  MI_ATOMIC_VAR_INIT(0),  // xthread_id
+  NULL,                   // free
+  0,                      // used
+  0,                      // capacity
+  0,                      // reserved capacity
+  0,                      // retire_expire
+  false,                  // is_zero
+  NULL,                   // local_free
+  MI_ATOMIC_VAR_INIT(0),  // xthread_free
+  0,                      // block_size
+  NULL,                   // page_start
+  #if (MI_PADDING || MI_ENCODE_FREELIST)
+  { 0, 0 },               // keys
   #endif
+  NULL,                   // theap
+  NULL,                   // heap
+  NULL, NULL,             // next, prev
+  MI_ARENA_SLICE_SIZE,    // page_committed
+  MI_MEMID_STATIC         // memid
 };
 
 #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
@@ -45,7 +49,7 @@ const mi_page_t _mi_page_empty = {
 
 
 // Empty page queues for every bin
-#define QNULL(sz)  { NULL, NULL, (sz)*sizeof(uintptr_t) }
+#define QNULL(sz)  { NULL, NULL, 0, (sz)*sizeof(uintptr_t) }
 #define MI_PAGE_QUEUES_EMPTY \
   { QNULL(1), \
     QNULL(     1), QNULL(     2), QNULL(     3), QNULL(     4), QNULL(     5), QNULL(     6), QNULL(     7), QNULL(     8), /* 8 */ \
@@ -57,285 +61,536 @@ const mi_page_t _mi_page_empty = {
     QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
     QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
     QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \
-    QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 1  /* 655360, Huge queue */), \
-    QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 2) /* Full queue */ }
+    QNULL(MI_LARGE_MAX_OBJ_WSIZE + 1  /* 655360, Huge queue */), \
+    QNULL(MI_LARGE_MAX_OBJ_WSIZE + 2) /* Full queue */ }
 
-#define MI_STAT_COUNT_NULL()  {0,0,0,0}
+#define MI_STAT_COUNT_NULL()  {0,0,0}
 
 // Empty statistics
-#if MI_STAT>1
-#define MI_STAT_COUNT_END_NULL()  , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) }
-#else
-#define MI_STAT_COUNT_END_NULL()
-#endif
-
 #define MI_STATS_NULL  \
-  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
-  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
-  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
-  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
-  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
-  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
-  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
-  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },     \
-  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
-  MI_STAT_COUNT_END_NULL()
-
-
-// Empty slice span queues for every bin
-#define SQNULL(sz)  { NULL, NULL, sz }
-#define MI_SEGMENT_SPAN_QUEUES_EMPTY \
-  { SQNULL(1), \
-    SQNULL(     1), SQNULL(     2), SQNULL(     3), SQNULL(     4), SQNULL(     5), SQNULL(     6), SQNULL(     7), SQNULL(    10), /*  8 */ \
-    SQNULL(    12), SQNULL(    14), SQNULL(    16), SQNULL(    20), SQNULL(    24), SQNULL(    28), SQNULL(    32), SQNULL(    40), /* 16 */ \
-    SQNULL(    48), SQNULL(    56), SQNULL(    64), SQNULL(    80), SQNULL(    96), SQNULL(   112), SQNULL(   128), SQNULL(   160), /* 24 */ \
-    SQNULL(   192), SQNULL(   224), SQNULL(   256), SQNULL(   320), SQNULL(   384), SQNULL(   448), SQNULL(   512), SQNULL(   640), /* 32 */ \
-    SQNULL(   768), SQNULL(   896), SQNULL(  1024) /* 35 */ }
-
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  { 0 }, { 0 }, \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  { 0 }, { 0 }, { 0 }, { 0 }, \
+  { 0 }, { 0 }, { 0 }, { 0 }, \
+  \
+  { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, \
+  MI_INIT5(MI_STAT_COUNT_NULL), \
+  { 0 }, { 0 }, { 0 }, { 0 },  \
+  \
+  { MI_INIT4(MI_STAT_COUNT_NULL) }, \
+  { { 0 }, { 0 }, { 0 }, { 0 } }, \
+  \
+  { MI_INIT74(MI_STAT_COUNT_NULL) }, \
+  { MI_INIT74(MI_STAT_COUNT_NULL) }, \
+  { MI_INIT5(MI_STAT_COUNT_NULL) }
 
 // --------------------------------------------------------
-// Statically allocate an empty heap as the initial
-// thread local value for the default heap,
-// and statically allocate the backing heap for the main
+// Statically allocate an empty theap as the initial
+// thread local value for the default theap,
+// and statically allocate the backing theap for the main
 // thread so it can function without doing any allocation
 // itself (as accessing a thread local for the first time
 // may lead to allocation itself on some platforms)
 // --------------------------------------------------------
 
-mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
-  NULL,
+static mi_decl_cache_align mi_subproc_t subproc_main
+#if __cplusplus
+= { };     // empty initializer to prevent running the constructor (with msvc)
+#else
+= { 0 };   // C zero initialize
+#endif
+
+static mi_subproc_t* subprocs = &subproc_main;
+static mi_lock_t     subprocs_lock;
+
+static mi_decl_cache_align mi_tld_t tld_empty = {
+  0,                      // thread_id
+  0,                      // thread_seq
+  0,                      // default numa node
+  &subproc_main,          // subproc
+  NULL,                   // theaps list
+  false,                  // recurse
+  false,                  // is_in_threadpool
+  MI_MEMID_STATIC         // memid
+};
+
+mi_decl_cache_align const mi_theap_t _mi_theap_empty = {
+  &tld_empty,             // tld
+  NULL,                   // heap
+  0,                      // heartbeat
+  0,                      // cookie
+  { {0}, {0}, 0, true },  // random
+  0,                      // page count
+  MI_BIN_FULL, 0,         // page retired min/max
+  0, 0,                   // generic count
+  NULL, NULL,             // tnext, tprev
+  NULL, NULL,             // hnext, hprev
+  0,                      // full page retain
+  false,                  // allow reclaim
+  true,                   // allow abandon
+  #if MI_GUARDED
+  0, 0, 0, 1,             // sample count is 1 so we never write to it (see `internal.h:mi_theap_malloc_use_guarded`)
+  #endif
   MI_SMALL_PAGES_EMPTY,
   MI_PAGE_QUEUES_EMPTY,
-  MI_ATOMIC_VAR_INIT(NULL),
-  0,                // tid
-  0,                // cookie
-  { 0, 0 },         // keys
-  { {0}, {0}, 0 },
-  0,                // page count
-  MI_BIN_FULL, 0,   // page retired min/max
-  NULL,             // next
-  false
+  MI_MEMID_STATIC,
+  { sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL },      // stats
 };
 
-#define tld_empty_stats  ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats)))
-#define tld_empty_os     ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os)))
-
-mi_decl_cache_align static const mi_tld_t tld_empty = {
-  0,
-  false,
-  NULL, NULL,
-  { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, tld_empty_stats, tld_empty_os }, // segments
-  { 0, tld_empty_stats }, // os
-  { MI_STATS_NULL }       // stats
+mi_decl_cache_align const mi_theap_t _mi_theap_empty_wrong = {
+  &tld_empty,             // tld
+  NULL,                   // heap
+  0,                      // heartbeat
+  0,                      // cookie
+  { {0}, {0}, 0, true },  // random
+  0,                      // page count
+  MI_BIN_FULL, 0,         // page retired min/max
+  0, 0,                   // generic count
+  NULL, NULL,             // tnext, tprev
+  NULL, NULL,             // hnext, hprev
+  0,                      // full page retain
+  false,                  // allow reclaim
+  true,                   // allow abandon
+  #if MI_GUARDED
+  0, 0, 0, 1,             // sample count is 1 so we never write to it (see `internal.h:mi_theap_malloc_use_guarded`)
+  #endif
+  MI_SMALL_PAGES_EMPTY,
+  MI_PAGE_QUEUES_EMPTY,
+  MI_MEMID_STATIC,
+  { sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL },      // stats
 };
 
-// the thread-local default heap for allocation
-mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
+// Heap for the main thread
 
-extern mi_heap_t _mi_heap_main;
+extern mi_decl_hidden mi_decl_cache_align mi_theap_t theap_main;
+extern mi_decl_hidden mi_decl_cache_align mi_heap_t  heap_main;
 
-static mi_tld_t tld_main = {
-  0, false,
-  &_mi_heap_main, & _mi_heap_main,
-  { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments
-  { 0, &tld_main.stats },  // os
-  { MI_STATS_NULL }       // stats
+static mi_decl_cache_align mi_tld_t tld_main = {
+  0,                      // thread_id
+  0,                      // thread_seq
+  0,                      // numa node
+  &subproc_main,          // subproc
+  &theap_main,            // theaps list
+  false,                  // recurse
+  false,                  // is_in_threadpool
+  MI_MEMID_STATIC         // memid
 };
 
-mi_heap_t _mi_heap_main = {
-  &tld_main,
+mi_decl_cache_align mi_theap_t theap_main = {
+  &tld_main,              // thread local data
+  &heap_main,             // main heap
+  0,                      // heartbeat
+  0,                      // initial cookie
+  { {0x846ca68b}, {0}, 0, true },  // random
+  0,                      // page count
+  MI_BIN_FULL, 0,         // page retired min/max
+  0, 0,                   // generic count
+  NULL, NULL,             // tnext, tprev
+  NULL, NULL,             // hnext, hprev
+  2,                      // full page retain
+  true,                   // allow page reclaim
+  true,                   // allow page abandon
+  #if MI_GUARDED
+  0, 0, 0, 0,
+  #endif
   MI_SMALL_PAGES_EMPTY,
   MI_PAGE_QUEUES_EMPTY,
-  MI_ATOMIC_VAR_INIT(NULL),
-  0,                // thread id
-  0,                // initial cookie
-  { 0, 0 },         // the key of the main heap can be fixed (unlike page keys that need to be secure!)
-  { {0x846ca68b}, {0}, 0 },  // random
-  0,                // page count
-  MI_BIN_FULL, 0,   // page retired min/max
-  NULL,             // next heap
-  false             // can reclaim
+  MI_MEMID_STATIC,
+  { sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL },      // stats
 };
 
+mi_decl_cache_align mi_heap_t heap_main
+#if __cplusplus
+  = { };     // empty initializer to prevent running the constructor (with msvc)
+#else
+  = { 0 };   // C zero initialize
+#endif
+
+mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+  return _mi_prim_thread_id();
+}
+
+// the theap belonging to the main heap
+mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_main = NULL;
+
+#if MI_TLS_MODEL_THREAD_LOCAL
+// the thread-local main theap for allocation
+mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_default = (mi_theap_t*)&_mi_theap_empty;
+// the last used non-main theap
+mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_cached = (mi_theap_t*)&_mi_theap_empty;
+#endif
+
 bool _mi_process_is_initialized = false;  // set to `true` in `mi_process_init`.
 
-mi_stats_t _mi_stats_main = { MI_STATS_NULL };
+mi_stats_t _mi_stats_main = { sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL };
 
+#if MI_GUARDED
+mi_decl_export void mi_theap_guarded_set_sample_rate(mi_theap_t* theap, size_t sample_rate, size_t seed) {
+  // store the rate; by default the countdown starts at the full rate
+  theap->guarded_sample_rate  = sample_rate;
+  theap->guarded_sample_count = sample_rate;
+  if (sample_rate <= 1) return;  // rates of 0 or 1 keep the default countdown
+  // a zero seed means: draw one from the theap's random generator
+  const size_t start = (seed != 0 ? seed : _mi_theap_random_next(theap));
+  // begin at a count between 1 and `sample_rate` derived from that value
+  theap->guarded_sample_count = (start % sample_rate) + 1;
+}
 
-static void mi_heap_main_init(void) {
-  if (_mi_heap_main.cookie == 0) {
-    _mi_heap_main.thread_id = _mi_thread_id();
-    _mi_heap_main.cookie = _mi_os_random_weak((uintptr_t)&mi_heap_main_init);
-    _mi_random_init(&_mi_heap_main.random);
-    _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
-    _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
+mi_decl_export void mi_theap_guarded_set_size_bound(mi_theap_t* theap, size_t min, size_t max) {
+  if (max < min) { max = min; }  // the stored upper bound is never below the lower bound
+  theap->guarded_size_min = min;  theap->guarded_size_max = max;
+}
+
+void _mi_theap_guarded_init(mi_theap_t* theap) {
+  // the sampling rate and seed are read from the runtime options
+  const size_t rate = (size_t)mi_option_get_clamp(mi_option_guarded_sample_rate, 0, LONG_MAX);
+  mi_theap_guarded_set_sample_rate(theap, rate, (size_t)mi_option_get(mi_option_guarded_sample_seed));
+  // and so are the lower and upper size bounds
+  const size_t lower = (size_t)mi_option_get_clamp(mi_option_guarded_min, 0, LONG_MAX);
+  mi_theap_guarded_set_size_bound(theap, lower, (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX));
+}
+#else
+mi_decl_export void mi_theap_guarded_set_sample_rate(mi_theap_t* theap, size_t sample_rate, size_t seed) {
+  MI_UNUSED(theap); MI_UNUSED(sample_rate); MI_UNUSED(seed);
+}
+
+mi_decl_export void mi_theap_guarded_set_size_bound(mi_theap_t* theap, size_t min, size_t max) {
+  MI_UNUSED(theap); MI_UNUSED(min); MI_UNUSED(max);
+}
+void _mi_theap_guarded_init(mi_theap_t* theap) {
+  MI_UNUSED(theap);
+}
+#endif
+
+/* -----------------------------------------------------------
+  Initialization
+  Note: on some platforms lock_init or just a thread local access
+  can cause allocation and induce recursion during initialization.
+----------------------------------------------------------- */
+
+
+// Initialize the statically allocated main subproc (`subproc_main`); the guard below makes repeated calls a no-op.
+static void mi_subproc_main_init(void) {
+  if (subproc_main.memid.memkind != MI_MEM_STATIC) {   // `subproc_main` starts zero-initialized; presumably memkind becomes MI_MEM_STATIC on the next line
+    subproc_main.memid = _mi_memid_create(MI_MEM_STATIC);
+    subproc_main.heaps = &heap_main;                    // the heaps list starts with just the main heap
+    subproc_main.heap_total_count = 1;
+    subproc_main.heap_count = 1;
+    mi_atomic_store_ptr_release(mi_heap_t, &subproc_main.heap_main, &heap_main);
+    __mi_stat_increase_mt(&subproc_main.stats.heaps, 1);  // count the main heap in the `heaps` statistic
+    mi_lock_init(&subproc_main.arena_reserve_lock);
+    mi_lock_init(&subproc_main.heaps_lock);
+    mi_lock_init(&subprocs_lock);                       // file-level lock declared alongside `subprocs`
+  }
+}
+
+// Initialize the main tld: record the calling thread's id once (a zero `thread_id` means it is not set yet).
+static void mi_tld_main_init(void) {
+  if (tld_main.thread_id == 0) {
+    tld_main.thread_id = _mi_prim_thread_id();
   }
 }
 
-mi_heap_t* _mi_heap_main_get(void) {
-  mi_heap_main_init();
-  return &_mi_heap_main;
+void _mi_theap_options_init(mi_theap_t* theap) {  // copy the page reclaim/abandon/retain options into `theap`
+  theap->allow_page_reclaim = (mi_option_get(mi_option_page_reclaim_on_free) >= 0);   // on when the option is non-negative
+  theap->allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0);       // note: derived from `page_full_retain`, the same option as the next line
+  theap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32);  // clamped with bounds -1 and 32
+}
+
+// Initialize the (statically allocated) main theap once: its random state, cookie, options and guarded settings (the main tld and subproc have their own init functions).
+static void mi_theap_main_init(void) {
+  if mi_unlikely(theap_main.cookie == 0) {   // `theap_main.memid` is statically MI_MEM_STATIC, so the zero cookie serves as the "not yet initialized" marker
+    // theap
+    theap_main.cookie = 1;                   // mark first, so a re-entrant call during the steps below is a no-op
+    #if defined(__APPLE__) || (defined(_WIN32) && !defined(MI_SHARED_LIB))
+      _mi_random_init_weak(&theap_main.random);    // prevent allocation failure during bcrypt dll initialization with static linking (issue #1185)
+    #else
+      _mi_random_init(&theap_main.random);
+    #endif
+    theap_main.cookie  = _mi_theap_random_next(&theap_main);
+    _mi_theap_options_init(&theap_main);
+    _mi_theap_guarded_init(&theap_main);
+  }
+}
+
+// Initialize the statically allocated main heap once, together with the main theap, subproc and tld.
+static void mi_heap_main_init(void) {
+  if mi_unlikely(heap_main.subproc == NULL) {   // a NULL subproc marks "not yet initialized"
+    heap_main.subproc = &subproc_main;          // set before anything else, so a nested call sees the heap as initialized
+    heap_main.theaps = &theap_main;
+
+    mi_theap_main_init();
+    mi_subproc_main_init();
+    mi_tld_main_init();
+
+    mi_lock_init(&heap_main.theaps_lock);
+    mi_lock_init(&heap_main.os_abandoned_pages_lock);
+    mi_lock_init(&heap_main.arena_pages_lock);
   }
 }
 
 
 /* -----------------------------------------------------------
-  Initialization and freeing of the thread local heaps
+  Thread local data
 ----------------------------------------------------------- */
 
-// note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size).
-typedef struct mi_thread_data_s {
-  mi_heap_t  heap;  // must come first due to cast in `_mi_heap_done`
-  mi_tld_t   tld;
-} mi_thread_data_t;
-
-
-// Thread meta-data is allocated directly from the OS. For
-// some programs that do not use thread pools and allocate and
-// destroy many OS threads, this may causes too much overhead 
-// per thread so we maintain a small cache of recently freed metadata.
-
-#define TD_CACHE_SIZE (8)
-static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
-
-static mi_thread_data_t* mi_thread_data_alloc(void) {
-  // try to find thread metadata in the cache
-  mi_thread_data_t* td;
-  for (int i = 0; i < TD_CACHE_SIZE; i++) {
-    td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
-    if (td != NULL) {
-      td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); 
-      if (td != NULL) {
-        return td;
-      }
-    }
+// Allocate fresh tld
+static mi_tld_t* mi_tld_alloc(void) {
+  if (_mi_is_main_thread()) {
+    mi_atomic_increment_relaxed(&tld_main.subproc->thread_count);
+    return &tld_main;
   }
-  // if that fails, allocate directly from the OS
-  td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
-  if (td == NULL) {
-    // if this fails, try once more. (issue #257)
-    td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
-    if (td == NULL) {
-      // really out of memory
-      _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
+  else {
+    // allocate tld meta-data
+    // note: we need to be careful to not access the tld from `_mi_meta_zalloc`
+    // (and in turn from `_mi_arena_alloc_aligned` and `_mi_os_alloc_aligned`).
+    mi_memid_t memid;
+    mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid);
+    if (tld==NULL) {
+      _mi_error_message(ENOMEM, "unable to allocate memory for thread local data\n");
+      return NULL;
     }
+    tld->memid = memid;
+    tld->theaps = NULL;
+    tld->subproc = &subproc_main;
+    tld->numa_node = _mi_os_numa_node();
+    tld->thread_id = _mi_prim_thread_id();
+    tld->thread_seq = mi_atomic_increment_relaxed(&tld->subproc->thread_total_count);
+    tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool();
+    mi_atomic_increment_relaxed(&tld->subproc->thread_count);
+    return tld;
   }
-  return td;
 }
 
-static void mi_thread_data_free( mi_thread_data_t* tdfree ) {
-  // try to add the thread metadata to the cache
-  for (int i = 0; i < TD_CACHE_SIZE; i++) {
-    mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
-    if (td == NULL) {
-      mi_thread_data_t* expected = NULL;
-      if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) {
-        return;
-      }
-    }
+#define MI_TLD_INVALID  ((mi_tld_t*)1)
+
+mi_decl_noinline static void mi_tld_free(mi_tld_t* tld) {
+  if (tld != NULL && tld != MI_TLD_INVALID) {
+    mi_atomic_decrement_relaxed(&tld->subproc->thread_count);
+    _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid);
   }
-  // if that fails, just free it directly
-  _mi_os_free(tdfree, sizeof(mi_thread_data_t), &_mi_stats_main);
+  #if 0
+  // do not read/write to `thread_tld` on older macOS <= 14 as that will re-initialize the thread local storage
+  // (since we are calling this during pthread shutdown)
+  // (and this could happen on other systems as well, so let's never do it)
+  thread_tld = MI_TLD_INVALID;
+  #endif
 }
 
-static void mi_thread_data_collect(void) {
-  // free all thread metadata from the cache
-  for (int i = 0; i < TD_CACHE_SIZE; i++) {
-    mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
-    if (td != NULL) {
-      td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
-      if (td != NULL) {
-        _mi_os_free( td, sizeof(mi_thread_data_t), &_mi_stats_main );
-      }
-    }
+// return the thread local theap, ensuring it is initialized (and not `NULL` or `&_mi_theap_empty`)
+mi_theap_t* _mi_theap_default_safe(void) {
+  mi_theap_t* theap = _mi_theap_default();
+  if mi_likely(mi_theap_is_initialized(theap)) return theap;
+  mi_thread_init();
+  mi_assert_internal(mi_theap_is_initialized(_mi_theap_default()));
+  return _mi_theap_default();
+}
+
+
+mi_subproc_t* _mi_subproc_main(void) {
+  return &subproc_main;
+}
+
+mi_subproc_t* _mi_subproc(void) {
+  // should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()`)
+  // todo: this will still fail on operating systems where the first access to a thread-local causes allocation.
+  //       on such systems we can check for this with the _mi_prim_get_default_theap as those are protected (by being
+  //       stored in a TLS slot for example)
+  mi_theap_t* theap = _mi_theap_default();
+  if (theap == NULL) {
+    return _mi_subproc_main();
+  }
+  else {
+    return theap->tld->subproc;  // avoid using thread local storage (`thread_tld`)
   }
 }
 
-// Initialize the thread local default heap, called from `mi_thread_init`
-static bool _mi_heap_init(void) {
-  if (mi_heap_is_initialized(mi_get_default_heap())) return true;
-  if (_mi_is_main_thread()) {
-    // mi_assert_internal(_mi_heap_main.thread_id != 0);  // can happen on freeBSD where alloc is called before any initialization
-    // the main heap is statically allocated
-    mi_heap_main_init();
-    _mi_heap_set_default_direct(&_mi_heap_main);
-    //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
+mi_heap_t* _mi_subproc_heap_main(mi_subproc_t* subproc) {
+  mi_heap_t* heap = mi_atomic_load_ptr_relaxed(mi_heap_t,&subproc->heap_main);
+  if mi_likely(heap!=NULL) {
+    return heap;
   }
   else {
-    // use `_mi_os_alloc` to allocate directly from the OS
-    mi_thread_data_t* td = mi_thread_data_alloc();
-    if (td == NULL) return false;
-
-    // OS allocated so already zero initialized
-    mi_tld_t*  tld = &td->tld;
-    mi_heap_t* heap = &td->heap;
-    _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
-    _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap));
-    heap->thread_id = _mi_thread_id();
-    _mi_random_init(&heap->random);
-    heap->cookie  = _mi_heap_random_next(heap) | 1;
-    heap->keys[0] = _mi_heap_random_next(heap);
-    heap->keys[1] = _mi_heap_random_next(heap);
-    heap->tld = tld;
-    tld->heap_backing = heap;
-    tld->heaps = heap;
-    tld->segments.stats = &tld->stats;
-    tld->segments.os = &tld->os;
-    tld->os.stats = &tld->stats;
-    _mi_heap_set_default_direct(heap);    
+    mi_heap_main_init();
+    mi_assert_internal(mi_atomic_load_relaxed(&subproc->heap_main) != NULL);
+    return mi_atomic_load_ptr_relaxed(mi_heap_t,&subproc->heap_main);
   }
-  return false;
 }
 
-// Free the thread local default heap (called from `mi_thread_done`)
-static bool _mi_heap_done(mi_heap_t* heap) {
-  if (!mi_heap_is_initialized(heap)) return true;
+mi_heap_t* mi_heap_main(void) {
+  return _mi_subproc_heap_main(_mi_subproc()); // don't use _mi_theap_main() so this call works during process_init
+}
 
-  // reset default heap
-  _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
+bool _mi_is_heap_main(const mi_heap_t* heap) {
+  mi_assert_internal(heap!=NULL);
+  return (_mi_subproc_heap_main(heap->subproc) == heap);
+}
 
-  // switch to backing heap
-  heap = heap->tld->heap_backing;
-  if (!mi_heap_is_initialized(heap)) return false;
+/* -----------------------------------------------------------
+  Sub process
+----------------------------------------------------------- */
 
-  // delete all non-backing heaps in this thread
-  mi_heap_t* curr = heap->tld->heaps;
-  while (curr != NULL) {
-    mi_heap_t* next = curr->next; // save `next` as `curr` will be freed
-    if (curr != heap) {
-      mi_assert_internal(!mi_heap_is_backing(curr));
-      mi_heap_delete(curr);
+mi_subproc_id_t mi_subproc_main(void) {
+  return _mi_subproc_main();
+}
+
+mi_subproc_id_t mi_subproc_current(void) {
+  return _mi_subproc();
+}
+
+mi_subproc_id_t mi_subproc_new(void) {
+  static _Atomic(size_t) subproc_total_count;
+  mi_memid_t memid;
+  mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid);
+  if (subproc == NULL) return NULL;
+  subproc->memid = memid;
+  subproc->subproc_seq = mi_atomic_increment_relaxed(&subproc_total_count) + 1;
+  mi_lock_init(&subproc->arena_reserve_lock);
+  mi_lock_init(&subproc->heaps_lock);
+  mi_lock(&subprocs_lock) {
+    // push on subproc list
+    subproc->next = subprocs;
+    if (subprocs!=NULL) { subprocs->prev = subproc; }
+    subprocs = subproc;
+  }
+  return subproc;
+}
+
+mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) {
+  return (subproc_id == NULL ? &subproc_main : (mi_subproc_t*)subproc_id);
+}
+
+// destroy all subproc resources including arenas, heaps, etc.
+static void mi_subproc_unsafe_destroy(mi_subproc_t* subproc)
+{
+  // remove from the subproc list
+  mi_lock(&subprocs_lock) {
+    if (subproc->next!=NULL) { subproc->next->prev = subproc->prev;  }
+    if (subproc->prev!=NULL) { subproc->prev->next = subproc->next;  }
+                        else { mi_assert_internal(subprocs==subproc);  subprocs = subproc->next; }
+  }
+
+  // destroy all subproc heaps
+  mi_lock(&subproc->heaps_lock) {
+    mi_heap_t* heap = subproc->heaps;
+    while (heap != NULL) {
+      mi_heap_t* next = heap->next;
+      if (heap!=subproc->heap_main) {mi_heap_destroy(heap); }
+      heap = next;
     }
-    curr = next;
+    mi_assert_internal(subproc->heaps == subproc->heap_main);
+    mi_heap_destroy(subproc->heap_main);
   }
-  mi_assert_internal(heap->tld->heaps == heap && heap->next == NULL);
-  mi_assert_internal(mi_heap_is_backing(heap));
 
-  // collect if not the main thread
-  if (heap != &_mi_heap_main) {
-    _mi_heap_collect_abandon(heap);
+  // merge stats back into the main subproc?
+  if (subproc!=&subproc_main) {
+    _mi_arenas_unsafe_destroy_all(subproc);
+    _mi_stats_merge_into(&subproc_main.stats, &subproc->stats);
+
+    // safe to release
+    // todo: should we refcount subprocesses?
+    mi_lock_done(&subproc->arena_reserve_lock);
+    mi_lock_done(&subproc->heaps_lock);
+    _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid);
   }
-  
-  // merge stats
-  _mi_stats_done(&heap->tld->stats);  
+}
 
-  // free if not the main thread
-  if (heap != &_mi_heap_main) {
-    // the following assertion does not always hold for huge segments as those are always treated
-    // as abondened: one may allocate it in one thread, but deallocate in another in which case
-    // the count can be too large or negative. todo: perhaps not count huge segments? see issue #363
-    // mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id());
-    mi_thread_data_free((mi_thread_data_t*)heap);
+void mi_subproc_destroy(mi_subproc_id_t subproc_id) {
+  if (subproc_id == NULL) return;
+  mi_subproc_unsafe_destroy(_mi_subproc_from_id(subproc_id));
+}
+
+static void mi_subprocs_unsafe_destroy_all(void) {
+  mi_lock(&subprocs_lock) {
+    mi_subproc_t* subproc = subprocs;
+    while (subproc!=NULL) {
+      mi_subproc_t* next = subproc->next;
+      if (subproc!=&subproc_main) {
+        mi_subproc_unsafe_destroy(subproc);
+      }
+      subproc = next;
+    }
+  }
+  mi_subproc_unsafe_destroy(&subproc_main);
+}
+
+
+void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) {
+  mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id);
+  mi_tld_t* const tld = _mi_theap_default_safe()->tld;
+  mi_assert(tld->subproc== &subproc_main);
+  if (tld->subproc != &subproc_main) {
+    _mi_warning_message("unable to add thread to the subprocess as it was already in another subprocess (id: %p)\n", subproc);
+    return;
+  }
+  tld->subproc = subproc;
+  tld->thread_seq = mi_atomic_increment_relaxed(&subproc->thread_total_count);
+  mi_atomic_decrement_relaxed(&subproc_main.thread_count);
+  mi_atomic_increment_relaxed(&subproc->thread_count);
+}
+
+
+bool mi_subproc_visit_heaps(mi_subproc_id_t subproc_id, mi_heap_visit_fun* visitor, void* arg) {
+  mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id);
+  if (subproc==NULL) return false;
+  bool ok = true;
+  mi_lock(&subproc->heaps_lock) {
+    for (mi_heap_t* heap = subproc->heaps; heap!=NULL && ok; heap = heap->next) {
+      ok = (*visitor)(heap, arg);
+    }
+  }
+  return ok;
+}
+
+
+/* -----------------------------------------------------------
+  Allocate theap data
+----------------------------------------------------------- */
+
+// Initialize the thread local default theap, called from `mi_thread_init`
+static mi_theap_t* _mi_thread_init_theap_default(void) {
+  mi_theap_t* theap = _mi_theap_default();
+  if (mi_theap_is_initialized(theap)) return theap;
+  if (_mi_is_main_thread()) {
+    mi_heap_main_init();
+    theap = &theap_main;
   }
   else {
-    mi_thread_data_collect(); // free cached thread metadata  
-    #if 0  
-    // never free the main thread even in debug mode; if a dll is linked statically with mimalloc,
+    // allocates tld data
+    // note: we cannot access thread-locals yet as that can cause (recursive) allocation
+    // (on macOS <= 14 for example where the loader allocates thread-local data on demand).
+    mi_tld_t* tld = mi_tld_alloc();
+    // allocate and initialize the theap for the main heap
+    theap = _mi_theap_create(mi_heap_main(), tld);
+  }
+  // associate the theap with this thread
+  // (this is safe, on macOS for example, the theap is set in a dedicated TLS slot and thus does not cause recursive allocation)
+  _mi_theap_default_set(theap);
+  return theap;
+}
+
+
+// Free the thread local theaps
+static void mi_thread_theaps_done(mi_tld_t* tld)
+{
+  // reset the thread local theaps
+  __mi_theap_main = NULL;
+  _mi_theap_default_set((mi_theap_t*)&_mi_theap_empty);
+  _mi_theap_cached_set((mi_theap_t*)&_mi_theap_empty);
+
+  // delete all theaps in this thread
+  mi_theap_t* curr = tld->theaps;
+  while (curr != NULL) {
+    mi_theap_t* next = curr->tnext; // save `tnext` as `curr` will be freed
+    // never destroy theaps; if a dll is linked statically with mimalloc,
     // there may still be delete/free calls after the mi_fls_done is called. Issue #207
-    _mi_heap_destroy_pages(heap);
-    mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main);
-    #endif
+    _mi_theap_delete(curr);
+    curr = next;
   }
-  return false;
+  mi_assert(_mi_theap_default()==(mi_theap_t*)&_mi_theap_empty); // careful to not re-initialize the default theap during theap_delete
+  mi_assert(!mi_theap_is_initialized(_mi_theap_default()));
 }
 
 
@@ -348,7 +603,7 @@ static bool _mi_heap_done(mi_heap_t* heap) {
 // 1. windows dynamic library:
 //     call from DllMain on DLL_THREAD_DETACH
 // 2. windows static library:
-//     use `FlsAlloc` to call a destructor when the thread is done
+//     use special linker section to call a destructor when the thread is done
 // 3. unix, pthreads:
 //     use a pthread key to call a destructor when a pthread is done
 //
@@ -356,257 +611,375 @@ static bool _mi_heap_done(mi_heap_t* heap) {
 // to set up the thread local keys.
 // --------------------------------------------------------
 
-static void _mi_thread_done(mi_heap_t* default_heap);
-
-#if defined(_WIN32) && defined(MI_SHARED_LIB)
-  // nothing to do as it is done in DllMain
-#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
-  // use thread local storage keys to detect thread ending
-  #include <windows.h>
-  #include <fibersapi.h>
-  #if (_WIN32_WINNT < 0x600)  // before Windows Vista 
-  WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
-  WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex );
-  WINBASEAPI BOOL  WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData );
-  WINBASEAPI BOOL  WINAPI FlsFree(_In_ DWORD dwFlsIndex);
-  #endif
-  static DWORD mi_fls_key = (DWORD)(-1);
-  static void NTAPI mi_fls_done(PVOID value) {
-    if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
-  }
-#elif defined(MI_USE_PTHREADS)
-  // use pthread local storage keys to detect thread ending
-  // (and used with MI_TLS_PTHREADS for the default heap)
-  pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
-  static void mi_pthread_done(void* value) {
-    if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
-  }
-#elif defined(__wasi__)
-// no pthreads in the WebAssembly Standard Interface
-#else
-  #pragma message("define a way to call mi_thread_done when a thread is done")
-#endif
-
 // Set up handlers so `mi_thread_done` is called automatically
 static void mi_process_setup_auto_thread_done(void) {
   static bool tls_initialized = false; // fine if it races
   if (tls_initialized) return;
   tls_initialized = true;
-  #if defined(_WIN32) && defined(MI_SHARED_LIB)
-    // nothing to do as it is done in DllMain
-  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
-    mi_fls_key = FlsAlloc(&mi_fls_done);
-  #elif defined(MI_USE_PTHREADS)
-    mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
-    pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
-  #endif
-  _mi_heap_set_default_direct(&_mi_heap_main);
+  _mi_prim_thread_init_auto_done();
+  _mi_theap_default_set(&theap_main);
 }
 
 
 bool _mi_is_main_thread(void) {
-  return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id());
+  return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id());
 }
 
-static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
 
-size_t  _mi_current_thread_count(void) {
-  return mi_atomic_load_relaxed(&thread_count);
-}
-
-// This is called from the `mi_malloc_generic`
+// Initialize thread
 void mi_thread_init(void) mi_attr_noexcept
 {
   // ensure our process has started already
   mi_process_init();
-  
-  // initialize the thread local default heap
-  // (this will call `_mi_heap_set_default_direct` and thus set the
-  //  fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
-  if (_mi_heap_init()) return;  // returns true if already initialized
+  // if the theap_default is already set we have already initialized
+  if (_mi_thread_is_initialized()) return;
+
+  // initialize the default theap
+  _mi_thread_init_theap_default();
 
-  _mi_stat_increase(&_mi_stats_main.threads, 1);
-  mi_atomic_increment_relaxed(&thread_count);
-  //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
+  mi_heap_stat_increase(mi_heap_main(), threads, 1);
+  // _mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
 }
 
 void mi_thread_done(void) mi_attr_noexcept {
-  _mi_thread_done(mi_get_default_heap());
-}
-
-static void _mi_thread_done(mi_heap_t* heap) {
-  mi_atomic_decrement_relaxed(&thread_count);
-  _mi_stat_decrease(&_mi_stats_main.threads, 1);
-
-  // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
-  if (heap->thread_id != _mi_thread_id()) return;
-  
-  // abandon the thread local heap
-  if (_mi_heap_done(heap)) return;  // returns true if already ran
-}
-
-void _mi_heap_set_default_direct(mi_heap_t* heap)  {
-  mi_assert_internal(heap != NULL);
-  #if defined(MI_TLS_SLOT)
-  mi_tls_slot_set(MI_TLS_SLOT,heap);
-  #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
-  *mi_tls_pthread_heap_slot() = heap;
-  #elif defined(MI_TLS_PTHREAD)
-  // we use _mi_heap_default_key
-  #else
-  _mi_heap_default = heap;
-  #endif
+  _mi_thread_done(NULL);
+}
 
-  // ensure the default heap is passed to `_mi_thread_done`
-  // setting to a non-NULL value also ensures `mi_thread_done` is called.
-  #if defined(_WIN32) && defined(MI_SHARED_LIB)
-    // nothing to do as it is done in DllMain
-  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
-    mi_assert_internal(mi_fls_key != 0);
-    FlsSetValue(mi_fls_key, heap);
-  #elif defined(MI_USE_PTHREADS)
-  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // can happen during recursive invocation on freeBSD
-    pthread_setspecific(_mi_heap_default_key, heap);
+void _mi_thread_done(mi_theap_t* _theap_main)
+{
+  // NULL can be passed on some platforms
+  if (_theap_main==NULL) {
+    _theap_main = __mi_theap_main;
   }
-  #endif
-}
 
+  // prevent re-entrancy through theap_done/theap_set_default_direct (issue #699)
+  if (!mi_theap_is_initialized(_theap_main)) {
+    return;
+  }
 
-// --------------------------------------------------------
-// Run functions on process init/done, and thread init/done
-// --------------------------------------------------------
-static void mi_process_done(void);
+  // release dynamic thread_local's
+  _mi_thread_locals_thread_done();
 
-static bool os_preloading = true;    // true until this module is initialized
-static bool mi_redirected = false;   // true if malloc redirects to mi_malloc
+  // note: we store the tld as we should avoid reading `thread_tld` at this point (to avoid reinitializing the thread local storage)
+  mi_tld_t* const tld = _theap_main->tld;
 
-// Returns true if this module has not been initialized; Don't use C runtime routines until it returns false.
-bool _mi_preloading(void) {
-  return os_preloading;
+  // adjust stats
+  mi_heap_stat_decrease(_mi_subproc_heap_main(tld->subproc), threads, 1);  // todo: or `_theap_main->heap`?
+
+  // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local theaps...
+  if (tld->thread_id != _mi_prim_thread_id()) return;
+
+  // delete the thread local theaps
+  mi_thread_theaps_done(tld);
+
+  // free thread local data
+  mi_tld_free(tld);
 }
 
-mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept {
-  return mi_redirected;
+
+mi_decl_cold mi_decl_noinline mi_theap_t* _mi_theap_empty_get(void) {
+  return (mi_theap_t*)&_mi_theap_empty;
 }
 
-// Communicate with the redirection module on Windows
-#if defined(_WIN32) && defined(MI_SHARED_LIB)
-#ifdef __cplusplus
-extern "C" {
+#if MI_TLS_MODEL_DYNAMIC_WIN32
+
+// If we can, we use one of the 64 direct TLS slots (but fall back to expansion slots if needed)
+// See <https://en.wikipedia.org/wiki/Win32_Thread_Information_Block> for the offsets.
+#if MI_SIZE_SIZE==4
+#define MI_TLS_DIRECT_FIRST             (0x0E10 / MI_SIZE_SIZE)
+#else
+#define MI_TLS_DIRECT_FIRST             (0x1480 / MI_SIZE_SIZE)
+#endif
+#define MI_TLS_DIRECT_SLOTS             (64)
+#define MI_TLS_EXPANSION_SLOTS          (1024)
+
+#if !MI_WIN_DIRECT_TLS
+#define MI_TLS_INITIAL_SLOT             MI_TLS_EXPANSION_SLOT
+#define MI_TLS_INITIAL_EXPANSION_SLOT   (MI_TLS_EXPANSION_SLOTS-1)
+#else
+// with only direct entries, use the "arbitrary user data" field 
+// and assume it is NULL (see also <http://www.nynaeve.net/?p=98>)
+#define MI_TLS_INITIAL_EXPANSION_SLOT   (0)
+#define MI_TLS_INITIAL_SLOT             (5)
 #endif
-mi_decl_export void _mi_redirect_entry(DWORD reason) {
-  // called on redirection; careful as this may be called before DllMain
-  if (reason == DLL_PROCESS_ATTACH) {
-    mi_redirected = true;
+
+// we initially use the last of the expansion slots as the default NULL.
+// note: this will fail if the program allocates exactly 1024+64 slots with TlsAlloc (which is quite unlikely)
+mi_decl_hidden mi_decl_cache_align size_t _mi_theap_default_slot = MI_TLS_INITIAL_SLOT;
+mi_decl_hidden size_t _mi_theap_default_expansion_slot = MI_TLS_INITIAL_EXPANSION_SLOT;
+mi_decl_hidden size_t _mi_theap_cached_slot            = MI_TLS_INITIAL_SLOT;
+mi_decl_hidden size_t _mi_theap_cached_expansion_slot  = MI_TLS_INITIAL_EXPANSION_SLOT;
+
+static size_t mi_win_tls_slot_alloc(size_t* extended) {
+  const DWORD slot = TlsAlloc();
+  if (slot==TLS_OUT_OF_INDEXES || slot >= MI_TLS_DIRECT_SLOTS + MI_TLS_EXPANSION_SLOTS - 1) {
+    // note: we also fail if the program already allocated the maximum number of expansion slots (as we use the last one as the default)
+    *extended = 0;
+    return 0;
   }
-  else if (reason == DLL_PROCESS_DETACH) {
-    mi_redirected = false;
+  else if (slot<MI_TLS_DIRECT_SLOTS) {
+    *extended = 0;
+    return (slot + MI_TLS_DIRECT_FIRST);
   }
-  else if (reason == DLL_THREAD_DETACH) {
-    mi_thread_done();
+  else {
+    #if MI_WIN_DIRECT_TLS
+    *extended = 0;
+    return 0;
+    #else
+    *extended = (slot - MI_TLS_DIRECT_SLOTS);
+    return MI_TLS_EXPANSION_SLOT;
+    #endif
   }
 }
-__declspec(dllimport) bool mi_allocator_init(const char** message);
-__declspec(dllimport) void mi_allocator_done(void);
-#ifdef __cplusplus
+
+mi_decl_cold mi_theap_t* _mi_win_tls_slots_init(void) {
+  static mi_atomic_once_t tls_slots_init;
+  if (mi_atomic_once(&tls_slots_init)) {
+    _mi_theap_default_slot = mi_win_tls_slot_alloc(&_mi_theap_default_expansion_slot);
+    _mi_theap_cached_slot = mi_win_tls_slot_alloc(&_mi_theap_cached_expansion_slot);
+    if (_mi_theap_cached_slot==0) {
+      _mi_error_message(EFAULT, "unable to allocate fast TLS user slot (0x%zx)\n", _mi_theap_cached_slot);
+    }
+  }
+  return (mi_theap_t*)&_mi_theap_empty;
 }
-#endif
-#else
-static bool mi_allocator_init(const char** message) {
-  if (message != NULL) *message = NULL;
-  return true;
+
+static void mi_win_tls_slot_set(size_t slot, size_t extended_slot, void* value) {
+  mi_assert_internal((slot >= MI_TLS_DIRECT_FIRST && slot < MI_TLS_DIRECT_FIRST + MI_TLS_DIRECT_SLOTS) || slot == MI_TLS_EXPANSION_SLOT);
+  if (slot < MI_TLS_DIRECT_FIRST + MI_TLS_DIRECT_SLOTS) {
+    mi_prim_tls_slot_set(slot, value);
+  }
+  else {
+    mi_assert_internal(extended_slot < MI_TLS_EXPANSION_SLOTS);
+    TlsSetValue((DWORD)(extended_slot + MI_TLS_DIRECT_SLOTS), value);  // use TlsSetValue to initialize the TlsExpansion array if needed
+  }
 }
-static void mi_allocator_done(void) {
-  // nothing to do
+
+#elif MI_TLS_MODEL_DYNAMIC_PTHREADS
+
+// only for pthreads for now
+mi_decl_hidden pthread_key_t _mi_theap_default_key = 0;
+mi_decl_hidden pthread_key_t _mi_theap_cached_key = 0;
+
+mi_decl_cold mi_theap_t* _mi_tls_keys_init(void) {
+  static mi_atomic_once_t tls_keys_init;
+  if (mi_atomic_once(&tls_keys_init)) {
+    pthread_key_create(&_mi_theap_default_key, NULL);
+    pthread_key_create(&_mi_theap_cached_key, NULL);
+  }
+  return (mi_theap_t*)&_mi_theap_empty;
 }
+
 #endif
 
-// Called once by the process loader
-static void mi_process_load(void) {
-  mi_heap_main_init();
-  #if defined(MI_TLS_RECURSE_GUARD)
-  volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
-  MI_UNUSED(dummy);
+void _mi_theap_cached_set(mi_theap_t* theap) {
+  #if MI_TLS_MODEL_THREAD_LOCAL
+    __mi_theap_cached = theap;
+  #elif MI_TLS_MODEL_FIXED_SLOT
+    mi_prim_tls_slot_set(MI_TLS_MODEL_FIXED_SLOT_CACHED, theap);
+  #elif MI_TLS_MODEL_DYNAMIC_WIN32
+    _mi_win_tls_slots_init();
+    mi_win_tls_slot_set(_mi_theap_cached_slot, _mi_theap_cached_expansion_slot, theap);
+  #elif MI_TLS_MODEL_DYNAMIC_PTHREADS
+    _mi_tls_keys_init();
+    if (_mi_theap_cached_key!=0) pthread_setspecific(_mi_theap_cached_key, theap);
   #endif
-  os_preloading = false;
-  #if !(defined(_WIN32) && defined(MI_SHARED_LIB))  // use Dll process detach (see below) instead of atexit (issue #521)
-  atexit(&mi_process_done);  
+}
+
+void _mi_theap_default_set(mi_theap_t* theap)  {
+  mi_assert_internal(theap != NULL);
+  mi_assert_internal(theap->tld->thread_id==0 || theap->tld->thread_id==_mi_thread_id());
+  #if MI_TLS_MODEL_THREAD_LOCAL
+    __mi_theap_default = theap;
+  #elif MI_TLS_MODEL_FIXED_SLOT
+    mi_prim_tls_slot_set(MI_TLS_MODEL_FIXED_SLOT_DEFAULT, theap);
+  #elif MI_TLS_MODEL_DYNAMIC_WIN32
+    _mi_win_tls_slots_init();
+    mi_win_tls_slot_set(_mi_theap_default_slot, _mi_theap_default_expansion_slot, theap);
+  #elif MI_TLS_MODEL_DYNAMIC_PTHREADS
+    _mi_tls_keys_init();
+    if (_mi_theap_default_key!=0) pthread_setspecific(_mi_theap_default_key, theap);
   #endif
-  _mi_options_init();
+
+  // set theap main if needed
+  if (mi_theap_is_initialized(theap)) {
+    // ensure the default theap is passed to `_mi_thread_done` as on some platforms we cannot access TLS at thread termination (as it would allocate again)
+    _mi_prim_thread_associate_default_theap(theap);
+    if (_mi_is_heap_main(theap->heap)) {
+      __mi_theap_main = theap;
+    }
+  }
+}
+
+void mi_thread_set_in_threadpool(void) mi_attr_noexcept {
+  mi_theap_t* theap = _mi_theap_default_safe();
+  theap->tld->is_in_threadpool = true;
+}
+
+// --------------------------------------------------------
+// Run functions on process init/done, and thread init/done
+// --------------------------------------------------------
+static bool os_preloading = true;    // true until this module is initialized
+
+// Returns true if this module has not been initialized; Don't use C runtime routines until it returns false.
+bool mi_decl_noinline _mi_preloading(void) {
+  return os_preloading;
+}
+
+// Returns true if mimalloc was redirected
+mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept {
+  return _mi_is_redirected();
+}
+
+// Called once by the process loader from `src/prim/prim.c`
+void _mi_auto_process_init(void) {
+  // mi_heap_main_init();
+  // #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
+  // volatile mi_theap_t* dummy = __mi_theap_default; // access TLS to allocate it before setting tls_initialized to true;
+  // if (dummy == NULL) return;                       // use dummy or otherwise the access may get optimized away (issue #697)
+  // #endif
+
+  os_preloading = false;
+  mi_assert_internal(_mi_is_main_thread());
+
   mi_process_init();
-  //mi_stats_reset();-
-  if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");
+  mi_process_setup_auto_thread_done();
+  _mi_thread_locals_init();
+  _mi_options_post_init();  // now we can print to stderr
+  if (_mi_is_redirected()) _mi_verbose_message("malloc is redirected.\n");
 
   // show message from the redirector (if present)
   const char* msg = NULL;
-  mi_allocator_init(&msg);
+  _mi_allocator_init(&msg);
   if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) {
     _mi_fputs(NULL,NULL,NULL,msg);
   }
+
+  // reseed random
+  _mi_random_reinit_if_weak(&theap_main.random);
 }
 
-#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
-#include <intrin.h>
-mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
+// CPU features
+mi_decl_cache_align size_t _mi_cpu_movsb_max = 0;  // for size <= max, rep movsb is fast
+mi_decl_cache_align size_t _mi_cpu_stosb_max = 0;  // for size <= max, rep stosb is fast
+mi_decl_cache_align bool _mi_cpu_has_popcnt = false;
+
+#if (MI_ARCH_X64 || MI_ARCH_X86)
+#if defined(__GNUC__)
+// #include <cpuid.h>
+static bool mi_cpuid(uint32_t* regs4, uint32_t level, uint32_t sublevel) {
+  // note: use explicit assembly instead of __get_cpuid as we need the sublevel (in ecx)
+  // (on Ubuntu 22 with WSL the __get_cpuid does not clear ecx for level 7 which is incorrect).
+  uint32_t eax, ebx, ecx, edx;
+  __asm __volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(level), "c"(sublevel) : );
+  regs4[0] = eax;
+  regs4[1] = ebx;
+  regs4[2] = ecx;
+  regs4[3] = edx;
+  return true;
+}
+
+#elif defined(_MSC_VER)
+static bool mi_cpuid(uint32_t* regs4, uint32_t level, uint32_t sublevel) {
+  __cpuidex((int32_t*)regs4, (int32_t)level, (int32_t)sublevel);
+  return true;
+}
+#else
+static bool mi_cpuid(uint32_t* regs4, uint32_t level, uint32_t sublevel) {
+  MI_UNUSED(regs4); MI_UNUSED(level); MI_UNUSED(sublevel);
+  return false;
+}
+#endif
 
 static void mi_detect_cpu_features(void) {
-  // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
-  int32_t cpu_info[4];
-  __cpuid(cpu_info, 7);
-  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https ://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+  // FSRM for fast short rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
+  // ERMS for fast enhanced rep movsb/stosb support (not used at the moment, memcpy always seems faster?)
+  // FSRS for fast short rep stosb
+  bool amd = false;
+  bool fsrm = false;
+  // bool erms = false;
+  bool fsrs = false;
+  uint32_t cpu_info[4];
+  if (mi_cpuid(cpu_info, 0, 0)) {
+    amd = (cpu_info[2]==0x444d4163); // (Auth enti cAMD)
+  }
+  if (mi_cpuid(cpu_info, 7, 0)) {
+    fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+    // erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+  }
+  if (mi_cpuid(cpu_info, 7, 1)) {
+    fsrs = ((cpu_info[1] & (1 << 11)) != 0); // bit 11 of EBX: see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=1:_Extended_Features>
+  }
+  if (mi_cpuid(cpu_info, 1, 0)) {
+    _mi_cpu_has_popcnt = ((cpu_info[2] & (1 << 23)) != 0); // bit 23 of ECX : see <https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits>
+  }
+
+  if (fsrm) {
+    _mi_cpu_movsb_max = 127;
+  }
+  if (fsrs || (amd && fsrm)) {  // fsrm on amd implies fsrs, see: https://marc.info/?l=git-commits-head&m=168186277717803
+    _mi_cpu_stosb_max = 127;
+  }
 }
+
 #else
 static void mi_detect_cpu_features(void) {
-  // nothing
+  #if MI_ARCH_ARM64
+  _mi_cpu_has_popcnt = true;
+  #endif
 }
 #endif
 
+
 // Initialize the process; called by thread_init or the process loader
 void mi_process_init(void) mi_attr_noexcept {
   // ensure we are called once
-  if (_mi_process_is_initialized) return;
+  static mi_atomic_once_t process_init;
+	// #if _MSC_VER < 1920
+	// mi_heap_main_init(); // vs2017 can dynamically re-initialize theap_main
+	// #endif
+  if (!mi_atomic_once(&process_init)) return;
   _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
-  _mi_process_is_initialized = true;
-  mi_process_setup_auto_thread_done();
 
-  
   mi_detect_cpu_features();
+  _mi_options_init();
+  _mi_stats_init();
   _mi_os_init();
-  mi_heap_main_init();
-  #if (MI_DEBUG)
-  _mi_verbose_message("debug level : %d\n", MI_DEBUG);
-  #endif
-  _mi_verbose_message("secure level: %d\n", MI_SECURE);
+  // the following can potentially allocate (on freeBSD for pthread keys)
+  // todo: do 2-phase so we can use stats at first, then later init the keys?
+  mi_heap_main_init(); // before page_map_init so stats are working
+  _mi_page_map_init(); // todo: this could fail.. should we abort in that case?
   mi_thread_init();
+  _mi_process_is_initialized = true;
 
-  #if defined(_WIN32) && !defined(MI_SHARED_LIB)
-  // When building as a static lib the FLS cleanup happens to early for the main thread.
+  #if defined(_WIN32) && defined(MI_WIN_USE_FLS)
+  // On Windows, when building as a static lib the FLS cleanup happens too early for the main thread.
   // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup
   // will not call _mi_thread_done on the (still executing) main thread. See issue #508.
-  FlsSetValue(mi_fls_key, NULL);
+  _mi_prim_thread_associate_default_theap(NULL);
   #endif
 
-  mi_stats_reset();  // only call stat reset *after* thread init (or the heap tld == NULL)
-
+  // mi_stats_reset();  // only call stat reset *after* thread init (or the theap tld == NULL)
+  mi_track_init();
   if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
     size_t pages = mi_option_get_clamp(mi_option_reserve_huge_os_pages, 0, 128*1024);
-    long reserve_at = mi_option_get(mi_option_reserve_huge_os_pages_at);
+    int reserve_at  = (int)mi_option_get_clamp(mi_option_reserve_huge_os_pages_at, -1, INT_MAX);
     if (reserve_at != -1) {
       mi_reserve_huge_os_pages_at(pages, reserve_at, pages*500);
     } else {
       mi_reserve_huge_os_pages_interleave(pages, 0, pages*500);
     }
-  } 
+  }
   if (mi_option_is_enabled(mi_option_reserve_os_memory)) {
     long ksize = mi_option_get(mi_option_reserve_os_memory);
     if (ksize > 0) {
-      mi_reserve_os_memory((size_t)ksize*MI_KiB, true /* commit? */, true /* allow large pages? */);
+      mi_reserve_os_memory((size_t)ksize*MI_KiB, true, true);
     }
   }
 }
 
-// Called when the process is done (through `at_exit`)
-static void mi_process_done(void) {
+// Called when the process is done (cdecl as it is used with `atexit` on some platforms)
+void mi_cdecl mi_process_done(void) mi_attr_noexcept {
   // only shutdown if we were initialized
   if (!_mi_process_is_initialized) return;
   // ensure we are called once
@@ -614,80 +987,42 @@ static void mi_process_done(void) {
   if (process_done) return;
   process_done = true;
 
-  #if defined(_WIN32) && !defined(MI_SHARED_LIB)
-  FlsFree(mi_fls_key);  // call thread-done on all threads (except the main thread) to prevent dangling callback pointer if statically linked with a DLL; Issue #208
-  #endif
-  
+  // free dynamic thread locals (if used at all)
+  _mi_thread_locals_done();
+
+  // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread
+  _mi_prim_thread_done_auto_done();
+
   #ifndef MI_SKIP_COLLECT_ON_EXIT
-    #if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB)  
+    #if (MI_DEBUG || !defined(MI_SHARED_LIB))
     // free all memory if possible on process exit. This is not needed for a stand-alone process
     // but should be done if mimalloc is statically linked into another shared library which
     // is repeatedly loaded/unloaded, see issue #281.
-    mi_collect(true /* force */ );
+    mi_theap_collect(_mi_theap_default(), true /* force */);
     #endif
   #endif
 
-  if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
-    mi_stats_print(NULL);
-  }
-  mi_allocator_done();  
-  _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
-  os_preloading = true; // don't call the C runtime anymore
-}
-
-
-
-#if defined(_WIN32) && defined(MI_SHARED_LIB)
-  // Windows DLL: easy to hook into process_init and thread_done
-  __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
-    MI_UNUSED(reserved);
-    MI_UNUSED(inst);
-    if (reason==DLL_PROCESS_ATTACH) {
-      mi_process_load();
-    }
-    else if (reason==DLL_PROCESS_DETACH) {
-      mi_process_done();
-    }
-    else if (reason==DLL_THREAD_DETACH) {
-      if (!mi_is_redirected()) {
-        mi_thread_done();
-      }
-    }    
-    return TRUE;
-  }
-
-#elif defined(_MSC_VER)
-  // MSVC: use data section magic for static libraries
-  // See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
-  static int _mi_process_init(void) {
-    mi_process_load();
-    return 0;
+  // Forcefully release all retained memory; this can be dangerous in general if overriding regular malloc/free
+  // since after process_done there might still be other code running that calls `free` (like at_exit routines,
+  // or C-runtime termination code).
+  if (mi_option_is_enabled(mi_option_destroy_on_exit)) {
+    mi_subprocs_unsafe_destroy_all();
+    _mi_page_map_unsafe_destroy(_mi_subproc_main());
   }
-  typedef int(*_mi_crt_callback_t)(void);
-  #if defined(_M_X64) || defined(_M_ARM64)
-    __pragma(comment(linker, "/include:" "_mi_msvc_initu"))
-    #pragma section(".CRT$XIU", long, read)
-  #else
-    __pragma(comment(linker, "/include:" "__mi_msvc_initu"))
-  #endif
-  #pragma data_seg(".CRT$XIU")
-  mi_decl_externc _mi_crt_callback_t _mi_msvc_initu[] = { &_mi_process_init };
-  #pragma data_seg()
-
-#elif defined(__cplusplus)
-  // C++: use static initialization to detect process start
-  static bool _mi_process_init(void) {
-    mi_process_load();
-    return (_mi_heap_main.thread_id != 0);
+  else {
+    mi_heap_stats_merge_to_subproc(mi_heap_main());
   }
-  static bool mi_initialized = _mi_process_init();
 
-#elif defined(__GNUC__) || defined(__clang__)
-  // GCC,Clang: use the constructor attribute
-  static void __attribute__((constructor)) _mi_process_init(void) {
-    mi_process_load();
+  if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
+    mi_subproc_stats_print_out(NULL, NULL, NULL);
   }
+  mi_lock_done(&subprocs_lock);
+  _mi_allocator_done();
+  _mi_verbose_message("process done: 0x%zx\n", tld_main.thread_id);
+  os_preloading = true; // don't call the C runtime anymore
+}
 
-#else
-#pragma message("define a way to call mi_process_load on your platform")
-#endif
+void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept {
+  // Run `mi_process_done`, unless the `destroy_on_exit` option is set to a value above 1.
+  if (_mi_option_get_fast(mi_option_destroy_on_exit) <= 1) { mi_process_done(); }
+}
diff --git a/ext/src/mimalloc/src/libc.c b/ext/src/mimalloc/src/libc.c
new file mode 100644
index 0000000000..4c891c1e65
--- /dev/null
+++ b/ext/src/mimalloc/src/libc.c
@@ -0,0 +1,432 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// --------------------------------------------------------
+// This module defines various std libc functions to reduce
+// the dependency on libc, and also prevent errors caused
+// by some libc implementations when called before `main`
+// executes (due to malloc redirection)
+// --------------------------------------------------------
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"      // mi_prim_getenv
+
+// Locale-independent ASCII upper-casing: maps 'a'..'z' to 'A'..'Z'; any other character is returned as-is.
+char _mi_toupper(char c) {
+  return ((c < 'a' || c > 'z') ? c : (char)(c - ('a' - 'A')));
+}
+
+int _mi_strnicmp(const char* s, const char* t, size_t n) {
+  // Case-insensitive (ASCII) comparison of at most `n` characters; returns 0 when equal (or when `n == 0`).
+  for (; n > 0 && *s != 0 && *t != 0; s++, t++, n--) {
+    if (_mi_toupper(*s) != _mi_toupper(*t)) break;
+  }
+  return (n == 0 ? 0 : (int)_mi_toupper(*s) - (int)_mi_toupper(*t));  // sign follows the upper-cased chars so "a" < "B"
+}
+
+void _mi_strlcpy(char* dest, const char* src, size_t dest_size) {
+  // Bounded string copy into `dest` (capacity `dest_size` bytes); the result is truncated if needed.
+  if (dest == NULL || src == NULL || dest_size == 0) return;   // nothing to copy, or no room to write
+  size_t i = 0;
+  // copy at most `dest_size - 1` characters so one byte is always left for the terminator
+  for (; i + 1 < dest_size && src[i] != 0; i++) {
+    dest[i] = src[i];
+  }
+  dest[i] = 0;  // the destination is always zero terminated
+}
+
+void _mi_strlcat(char* dest, const char* src, size_t dest_size) {
+  // Append `src` to the string in `dest`, where `dest_size` is the total capacity of the `dest` buffer.
+  if (dest == NULL || src == NULL || dest_size == 0) return;
+  // skip over the current contents, but never beyond the last byte of the buffer
+  size_t used = 0;
+  while (used + 1 < dest_size && dest[used] != 0) {
+    used++;
+  }
+  _mi_strlcpy(dest + used, src, dest_size - used);  // copies into the remaining space and zero terminates
+}
+
+size_t _mi_strlen(const char* s) {   // length of `s` excluding the terminator; NULL counts as empty
+  if (s == NULL) return 0;
+  const char* p = s;
+  while (*p != 0) { p++; }
+  return (size_t)(p - s);
+}
+
+size_t _mi_strnlen(const char* s, size_t max_len) {  // length of `s`, but at most `max_len`; NULL counts as empty
+  if (s==NULL) return 0;
+  size_t len = 0;
+  while(len < max_len && s[len] != 0) { len++; }  // check the bound first: never read `s[max_len]` (may be out of bounds)
+  return len;
+}
+
+char* _mi_strnstr(char* s, size_t max_len, const char* pat) {
+  // Find the first occurrence of `pat` within the first `max_len` characters of `s` (NULL if there is none).
+  if (s==NULL) return NULL;
+  if (pat==NULL) return s;    // no pattern: trivially matches at the start
+  const size_t slen = _mi_strnlen(s, max_len);
+  const size_t plen = _mi_strlen(pat);
+  // try every start position at which the pattern still fits in the (bounded) string
+  for (size_t pos = 0; pos + plen <= slen; pos++) {
+    size_t k = 0;
+    while (k < plen && s[pos + k] == pat[k]) { k++; }
+    if (k == plen) return &s[pos];
+  }
+  return NULL;
+}
+
+#ifdef MI_NO_GETENV
+bool _mi_getenv(const char* name, char* result, size_t result_size) {  // MI_NO_GETENV build: every lookup fails
+  MI_UNUSED(name);
+  MI_UNUSED(result);
+  MI_UNUSED(result_size);
+  return false;
+}
+#else
+bool _mi_getenv(const char* name, char* result, size_t result_size) {  // forwards the lookup to the platform primitive `_mi_prim_getenv`
+  if (name==NULL || result == NULL || result_size < 64) return false;  // rejects NULL arguments and result buffers smaller than 64 bytes
+  return _mi_prim_getenv(name,result,result_size);
+}
+#endif
+
+// --------------------------------------------------------
+// Define our own limited `_mi_vsnprintf` and `_mi_snprintf`
+// This is mostly to avoid calling these when libc is not yet
+// initialized (and to reduce dependencies)
+//
+// format:      d i, p x u, s
+// prec:        z t l ll L   (size of the integer argument)
+// width:       10
+// align-left:  -
+// fill:        0
+// plus:        +
+// --------------------------------------------------------
+
+static void mi_outc(char c, char** out, char* end) {
+  // Append a single character at the write cursor `*out`; it is silently dropped once `end` is reached.
+  if (*out >= end) return;
+  **out = c;
+  (*out)++;
+}
+
+static void mi_outs(const char* s, char** out, char* end) {  // append string `s` at `*out`, truncated at `end`
+  if (s == NULL) return;   // a NULL string outputs nothing
+  char* dst = *out;
+  for (; dst < end && *s != 0; dst++, s++) {
+    *dst = *s;
+  }
+  *out = dst;
+}
+
+static void mi_out_fill(char fill, size_t len, char** out, char* end) {  // append `len` times `fill`, truncated at `end`
+  char* dst = *out;
+  while (len > 0 && dst < end) {
+    *dst++ = fill; len--;
+  }
+  *out = dst;
+}
+
+static void mi_out_alignright(char fill, char* start, size_t len, size_t extra, char* end) {
+  // Right-align the `len` characters at `start` inside a field of `len + extra` characters, padding on the left with `fill`.
+  if (len == 0 || extra == 0 || start > end) return;
+  if (len + extra > (size_t)(end - start)) return;  // field must fit before `end`; one ending exactly at `end` fits (was skipped: "%03d" of 5 gave "500")
+  // move `len` characters to the right (in reverse since it can overlap)
+  for (size_t i = 1; i <= len; i++) {
+    start[len + extra - i] = start[len - i];
+  }
+  for (size_t i = 0; i < extra; i++) {  // and fill the start
+    start[i] = fill;
+  }
+}
+
+
+static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end)
+{
+  // Append `x` in the given `base` (2..16, upper-case digits) at `*out`, preceded by `prefix` (e.g. a sign) if that is not 0.
+  // A zero value or an unsupported base is output as "0"; this includes base 1 for which the digit loop would never terminate.
+  if (x == 0 || base < 2 || base > 16) {
+    if (prefix != 0) { mi_outc(prefix, out, end); }
+    mi_outc('0',out,end);
+  }
+  else {
+    // output digits in reverse
+    char* start = *out;
+    while (x > 0) {
+      char digit = (char)(x % base);
+      mi_outc((digit <= 9 ? '0' + digit : 'A' + digit - 10),out,end);
+      x = x / base;
+    }
+    if (prefix != 0) { mi_outc(prefix, out, end); }   // the prefix goes last since the output is reversed below
+    size_t len = *out - start;
+    // and reverse in-place
+    for (size_t i = 0; i < (len / 2); i++) {
+      char c = start[len - i - 1];
+      start[len - i - 1] = start[i];
+      start[i] = c;
+    }
+  }
+}
+
+
+#define MI_NEXTC()  c = *in; if (c==0) break; in++;
+
+int _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {  // limited `vsnprintf`: formats `fmt` into `buf`, truncating to fit
+  if (buf == NULL || bufsize == 0 || fmt == NULL) return 0;
+  buf[bufsize - 1] = 0;  // terminate up front
+  char* const end = buf + (bufsize - 1);  // output is written to `[buf,end)`; `end` is the position of the last byte
+  const char* in = fmt;
+  char* out = buf;
+  while (true) {
+    if (out >= end) break;
+    char c;
+    MI_NEXTC();  // reads the next format character into `c`, and breaks out of the loop at the end of the format
+    if (c != '%') {
+      if (c == '\\') {  // a backslash character in the format followed by `e`, `t`, `n`, `r`, or a backslash
+        MI_NEXTC();
+        switch (c) {
+        case 'e': mi_outc('\x1B', &out, end); break;
+        case 't': mi_outc('\t', &out, end); break;
+        case 'n': mi_outc('\n', &out, end); break;
+        case 'r': mi_outc('\r', &out, end); break;
+        case '\\': mi_outc('\\', &out, end); break;
+        default: /* ignore */ break;
+        }
+      }
+      else if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t' || c=='\x1b') { // output visible ascii or standard control only
+        mi_outc(c, &out, end);
+      }
+    }
+    else {
+      MI_NEXTC();
+      char   fill = ' ';          // padding character (`0` flag selects '0')
+      size_t width = 0;           // minimal field width (0 = none given)
+      char   numtype = 'd';       // size modifier: 'z', 't', 'l', or 'L' (also used for `ll`)
+      char   numplus = 0;         // `+` or space flag: prefix for non-negative numbers
+      bool   alignright = true;   // cleared by the `-` flag
+      if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); }
+      if (c == '-') { alignright = false; MI_NEXTC(); }
+      if (c == '0') { fill = '0'; MI_NEXTC(); }
+      if (c >= '1' && c <= '9') {
+        width = (c - '0'); MI_NEXTC();
+        while (c >= '0' && c <= '9') {
+          width = (10 * width) + (c - '0'); MI_NEXTC();
+        }
+        if (c == 0) break;  // extra check due to while (the `break` in MI_NEXTC only left the inner loop)
+      }
+      if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); }
+      else if (c == 'l') {
+        numtype = c; MI_NEXTC();
+        if (c == 'l') { numtype = 'L'; MI_NEXTC(); }
+      }
+
+      char* start = out;  // start of the output of this format specifier (used for the fill and alignment below)
+      if (c == '%') {
+        mi_outc('%', &out, end);
+      }
+      else if (c == 's') {
+        // string
+        const char* s = va_arg(args, const char*);
+        mi_outs(s, &out, end);
+      }
+      else if (c == 'p' || c == 'x' || c == 'u') {
+        // unsigned
+        uintmax_t x = 0;
+        if (c == 'x' || c == 'u') {
+          if (numtype == 'z')       x = va_arg(args, size_t);
+          else if (numtype == 't')  x = va_arg(args, uintptr_t); // unsigned ptrdiff_t
+          else if (numtype == 'L')  x = va_arg(args, unsigned long long);
+          else if (numtype == 'l')  x = va_arg(args, unsigned long);
+                               else x = va_arg(args, unsigned int);
+        }
+        else if (c == 'p') {
+          x = va_arg(args, uintptr_t);
+          mi_outs("0x", &out, end);
+          start = out;
+          width = (width >= 2 ? width - 2 : 0);  // the "0x" prefix counts towards a given width
+        }
+        if (width == 0 && (c == 'x' || c == 'p')) {
+          if (c == 'p')   { width = 2 * (x <= UINT32_MAX ? 4 : ((x >> 16) <= UINT32_MAX ? 6 : sizeof(void*))); }  // 8, 12, or 2*sizeof(void*) hex digits
+          if (width == 0) { width = 2; }
+          fill = '0';
+        }
+        mi_out_num(x, (c == 'x' || c == 'p' ? 16 : 10), numplus, &out, end);
+      }
+      else if (c == 'i' || c == 'd') {
+        // signed
+        intmax_t x = 0;
+        if (numtype == 'z')       x = va_arg(args, intptr_t );
+        else if (numtype == 't')  x = va_arg(args, ptrdiff_t);
+        else if (numtype == 'L')  x = va_arg(args, long long);
+        else if (numtype == 'l')  x = va_arg(args, long);
+                             else x = va_arg(args, int);
+        char pre = 0;
+        if (x < 0) {
+          pre = '-';
+          if (x > INTMAX_MIN) { x = -x; }  // INTMAX_MIN cannot be negated; the cast to `uintmax_t` below yields its magnitude
+        }
+        else if (numplus != 0) {
+          pre = numplus;
+        }
+        mi_out_num((uintmax_t)x, 10, pre, &out, end);
+      }
+      else if (c >= ' ' && c <= '~') {
+        // unknown format
+        mi_outc('%', &out, end);
+        mi_outc(c, &out, end);
+      }
+
+      // fill & align
+      mi_assert_internal(out <= end);
+      mi_assert_internal(out >= start);
+      const size_t len = out - start;
+      if (len < width) {
+        mi_out_fill(fill, width - len, &out, end);  // first append the padding after the output ...
+        if (alignright && out <= end) {
+          mi_out_alignright(fill, start, len, width - len, end);  // ... and then move the output to the right of it
+        }
+      }
+    }
+  }
+  mi_assert_internal(out <= end);
+  *out = 0;
+  return (int)(out - buf);  // the number of characters actually written (excluding the terminating zero)
+}
+
+int _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) {  // variadic front-end for `_mi_vsnprintf`
+  va_list ap;
+  va_start(ap, fmt);
+  const int count = _mi_vsnprintf(buf, buflen, fmt, ap);   // forward the collected arguments
+  va_end(ap);
+  return count;
+}
+
+
+
+// --------------------------------------------------------
+// generic trailing and leading zero count, and popcount
+// --------------------------------------------------------
+
+#if !MI_HAS_FAST_BITSCAN
+
+static size_t mi_ctz_generic32(uint32_t x) {
+  // Count the trailing zero bits of `x` with a de Bruijn multiplication, see <http://keithandkatie.com/keith/papers/debruijn.html>
+  static const uint8_t debruijn[32] = {
+    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+  };
+  if (x==0) return 32;  // no bit is set
+  return debruijn[(uint32_t)((x & ((uint32_t)0 - x)) * (uint32_t)(0x077CB531U)) >> 27];  // isolate the lowest set bit with *unsigned* negation (negating `(int32_t)x` overflows for 0x80000000)
+}
+
+static size_t mi_clz_generic32(uint32_t x) {
+  // Count the leading zero bits of `x` (32 if `x==0`) with a
+  // de Bruijn multiplication, see <http://keithandkatie.com/keith/papers/debruijn.html>
+  static const uint8_t debruijn[32] = {
+    31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
+    23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
+  };
+  if (x==0) return 32;
+  // smear the highest set bit into all lower bit positions (shifting by 1, 2, 4, 8, and 16),
+  // the top 5 bits of the product with the constant then index the table
+  for (int shift = 1; shift <= 16; shift *= 2) { x |= (x >> shift); }
+  const uint32_t index = (uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27;
+  return debruijn[index];
+}
+
+size_t _mi_ctz_generic(size_t x) {
+  // Count the trailing zero bits of `x` (`MI_SIZE_BITS` if `x==0`), built from the 32-bit generic version.
+  if (x==0) return MI_SIZE_BITS;
+  #if (MI_SIZE_BITS <= 32)
+    return mi_ctz_generic32((uint32_t)x);
+  #else
+    // scan the low half first; only if that is empty is the lowest set bit in the high half
+    const uint32_t low = (uint32_t)x;
+    if (low == 0) {
+      return (32 + mi_ctz_generic32((uint32_t)(x>>32)));
+    }
+    return mi_ctz_generic32(low);
+  #endif
+}
+
+size_t _mi_clz_generic(size_t x) {
+  // Count the leading zero bits of `x` (`MI_SIZE_BITS` if `x==0`), built from the 32-bit generic version.
+  if (x==0) return MI_SIZE_BITS;
+  #if (MI_SIZE_BITS <= 32)
+    return mi_clz_generic32((uint32_t)x);
+  #else
+    // scan the high half first; if that is empty its 32 zero bits are added to the count of the low half
+    const uint32_t high = (uint32_t)(x>>32);
+    if (high == 0) {
+      return 32 + mi_clz_generic32((uint32_t)x);
+    }
+    return mi_clz_generic32(high);
+  #endif
+}
+
+#endif // bit scan
+
+
+#if MI_SIZE_SIZE == 4
+#define mi_mask_even_bits32      (0x55555555)
+#define mi_mask_even_pairs32     (0x33333333)
+#define mi_mask_even_nibbles32   (0x0F0F0F0F)
+
+// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
+static size_t mi_byte_sum32(uint32_t x) {
+  // multiply by 0x01010101 (= (1 + 2^8) * (1 + 2^16)): every byte is added
+  // into the highest byte of the product, which therefore holds the sum of all bytes.
+  const uint32_t product = x * (uint32_t)0x01010101;
+  return (size_t)(product >> 24);
+}
+
+static size_t mi_popcount_generic32(uint32_t x) {
+  // Count the bits of `x` by summing in parallel over ever wider groups: bit pairs, nibbles, and finally bytes.
+  // Step 1: the count of a 2-bit group `a` is `a - (a>>1)` (0b00 -> 0, 0b01 -> 1, 0b10 -> 1, 0b11 -> 2);
+  // the mask prevents the high bit of a neighbouring pair from spilling into the pair below it.
+  const uint32_t pairs = x - ((x >> 1) & mi_mask_even_bits32);
+  // Step 2: add neighbouring pair counts, giving a count (0..4) per nibble
+  const uint32_t nibbles = (pairs & mi_mask_even_pairs32) + ((pairs >> 2) & mi_mask_even_pairs32);
+  // Step 3: add neighbouring nibble counts, giving a count (0..8) per byte
+  const uint32_t bytes = (nibbles + (nibbles >> 4)) & mi_mask_even_nibbles32;
+  // Step 4: the total is the sum of the four byte counts (at most 32, so it fits in a byte)
+  return mi_byte_sum32(bytes);
+}
+
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {  // generic bit count of `x`, with shortcuts for trivial values
+  if (x == 0 || x == 1) { return x; }                      // the value is its own bit count
+  else if (x == ~((size_t)0)) { return MI_SIZE_BITS; }     // all bits are set
+  else { return mi_popcount_generic32(x); }
+}
+
+#else
+#define mi_mask_even_bits64      (0x5555555555555555)
+#define mi_mask_even_pairs64     (0x3333333333333333)
+#define mi_mask_even_nibbles64   (0x0F0F0F0F0F0F0F0F)
+
+// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
+static size_t mi_byte_sum64(uint64_t x) {
+  // multiply by 0x0101010101010101 (= (1 + 2^8) * (1 + 2^16) * (1 + 2^32)): every byte
+  // is added into the highest byte of the product, which therefore holds the sum of all bytes.
+  const uint64_t product = x * (uint64_t)0x0101010101010101;
+  return (size_t)(product >> 56);
+}
+
+static size_t mi_popcount_generic64(uint64_t x) {  // count the bits of `x` by summing in parallel over ever wider bit groups
+  const uint64_t pairs   = x - ((x >> 1) & mi_mask_even_bits64);                                     // bit count per 2-bit pair
+  const uint64_t nibbles = (pairs & mi_mask_even_pairs64) + ((pairs >> 2) & mi_mask_even_pairs64);  // bit count per nibble
+  const uint64_t bytes   = (nibbles + (nibbles >> 4)) & mi_mask_even_nibbles64;                      // bit count per byte
+  return mi_byte_sum64(bytes);  // the total (at most 64) is the sum of the eight byte counts
+}
+
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {  // generic bit count of `x`, with shortcuts for trivial values
+  if (x == 0 || x == 1) { return x; }                      // the value is its own bit count
+  else if (x == ~((size_t)0)) { return MI_SIZE_BITS; }     // all bits are set
+  else { return mi_popcount_generic64(x); }
+}
+#endif
+
diff --git a/ext/src/mimalloc/src/options.c b/ext/src/mimalloc/src/options.c
index 30610074d3..aa642c13cd 100644
--- a/ext/src/mimalloc/src/options.c
+++ b/ext/src/mimalloc/src/options.c
@@ -1,23 +1,16 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
-
-#include <stdio.h>
-#include <stdlib.h> // strtol
-#include <string.h> // strncpy, strncat, strlen, strstr
-#include <ctype.h>  // toupper
-#include <stdarg.h>
-
-#ifdef _MSC_VER
-#pragma warning(disable:4996)   // strncpy, strncat
-#endif
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"  // mi_prim_out_stderr
 
+#include <stdio.h>      // stdin/stdout
+#include <stdlib.h>     // abort
 
 static long mi_max_error_count   = 16; // stop outputting errors after this (use < 0 for no limit)
 static long mi_max_warning_count = 16; // stop outputting warnings after this (use < 0 for no limit)
@@ -28,9 +21,6 @@ int mi_version(void) mi_attr_noexcept {
   return MI_MALLOC_VERSION;
 }
 
-#ifdef _WIN32
-#include <conio.h>
-#endif
 
 // --------------------------------------------------------
 // Options
@@ -38,89 +28,251 @@ int mi_version(void) mi_attr_noexcept {
 // concurrently initialized, but an initializing data race
 // is ok since they resolve to the same value.
 // --------------------------------------------------------
-typedef enum mi_init_e {
-  UNINIT,       // not yet initialized
-  DEFAULTED,    // not found in the environment, use default value
-  INITIALIZED   // found in environment or set explicitly
-} mi_init_t;
-
-typedef struct mi_option_desc_s {
-  long        value;  // the value
-  mi_init_t   init;   // is it initialized yet? (from the environment)
-  mi_option_t option; // for debugging: the option index should match the option
-  const char* name;   // option name without `mimalloc_` prefix
-  const char* legacy_name; // potential legacy v1.x option name
-} mi_option_desc_t;
+
 
 #define MI_OPTION(opt)                  mi_option_##opt, #opt, NULL
 #define MI_OPTION_LEGACY(opt,legacy)    mi_option_##opt, #opt, #legacy
 
-static mi_option_desc_t options[_mi_option_last] =
+// Some options can be set at build time for statically linked libraries
+// (use `-DMI_EXTRA_CPPDEFS="opt1=val1;opt2=val2"`)
+//
+// This is useful if we cannot pass them as environment variables
+// (and setting them programmatically would be too late)
+
+#ifndef MI_DEFAULT_VERBOSE
+#define MI_DEFAULT_VERBOSE 0
+#endif
+
+#ifndef MI_DEFAULT_ARENA_EAGER_COMMIT
+#define MI_DEFAULT_ARENA_EAGER_COMMIT 2
+#endif
+
+// default arena reservation in KiB (parenthesized so the macro expands safely inside larger expressions)
+#ifndef MI_DEFAULT_ARENA_RESERVE
+ #if (MI_INTPTR_SIZE>4)
+  #define MI_DEFAULT_ARENA_RESERVE (1024L*1024L)   /* 1 GiB */
+ #else
+  #define MI_DEFAULT_ARENA_RESERVE (128L*1024L)    /* 128 MiB */
+ #endif
+#endif
+
+#ifndef MI_DEFAULT_ARENA_MAX_OBJECT_SIZE
+#define MI_DEFAULT_ARENA_MAX_OBJECT_SIZE   ((MI_SIZE_BITS * MI_ARENA_MAX_CHUNK_OBJ_SIZE)/MI_KiB)  /* 2 GiB (or 256 MiB on 32-bit), larger than this is alloc'd by the OS */
+#endif
+
+#ifndef MI_DEFAULT_DISALLOW_ARENA_ALLOC
+#define MI_DEFAULT_DISALLOW_ARENA_ALLOC 0
+#endif
+
+#ifndef MI_DEFAULT_ALLOW_LARGE_OS_PAGES
+#define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 0
+#endif
+
+#ifndef MI_DEFAULT_RESERVE_HUGE_OS_PAGES
+#define MI_DEFAULT_RESERVE_HUGE_OS_PAGES 0
+#endif
+
+#ifndef MI_DEFAULT_RESERVE_OS_MEMORY
+#define MI_DEFAULT_RESERVE_OS_MEMORY 0
+#endif
+
+#ifndef MI_DEFAULT_GUARDED_SAMPLE_RATE
+#if MI_GUARDED
+#define MI_DEFAULT_GUARDED_SAMPLE_RATE 4000
+#else
+#define MI_DEFAULT_GUARDED_SAMPLE_RATE 0
+#endif
+#endif
+
+#ifndef MI_DEFAULT_PAGEMAP_COMMIT
+#if defined(__APPLE__)  // when overloading malloc, we still get mixed pointers sometimes on macOS; this avoids a bad access
+#define MI_DEFAULT_PAGEMAP_COMMIT 1
+#else
+#define MI_DEFAULT_PAGEMAP_COMMIT 0
+#endif
+#endif
+
+#ifndef MI_DEFAULT_PAGE_MAX_RECLAIM
+#define MI_DEFAULT_PAGE_MAX_RECLAIM  (-1)               // unlimited
+#endif
+
+#ifndef MI_DEFAULT_PAGE_CROSS_THREAD_MAX_RECLAIM
+#define MI_DEFAULT_PAGE_CROSS_THREAD_MAX_RECLAIM  32
+#endif
+
+#ifndef MI_DEFAULT_ALLOW_THP
+#if defined(__ANDROID__)
+#define MI_DEFAULT_ALLOW_THP  0
+#else
+#define MI_DEFAULT_ALLOW_THP  1
+#endif
+#endif
+
+// Static options
+static mi_option_desc_t mi_options[_mi_option_last] =
 {
   // stable options
-  #if MI_DEBUG || defined(MI_SHOW_ERRORS)
-  { 1, UNINIT, MI_OPTION(show_errors) },
-  #else
-  { 0, UNINIT, MI_OPTION(show_errors) },
-  #endif
-  { 0, UNINIT, MI_OPTION(show_stats) },
-  { 0, UNINIT, MI_OPTION(verbose) },
-
-  // Some of the following options are experimental and not all combinations are valid. Use with care.
-  { 1, UNINIT, MI_OPTION(eager_commit) },        // commit per segment directly (8MiB)  (but see also `eager_commit_delay`)
-  { 0, UNINIT, MI_OPTION(deprecated_eager_region_commit) },
-  { 0, UNINIT, MI_OPTION(deprecated_reset_decommits) },
-  { 0, UNINIT, MI_OPTION(large_os_pages) },      // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
-  { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },  // per 1GiB huge pages
-  { -1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N
-  { 0, UNINIT, MI_OPTION(reserve_os_memory)     },
-  { 0, UNINIT, MI_OPTION(deprecated_segment_cache) },  // cache N segments per thread
-  { 0, UNINIT, MI_OPTION(page_reset) },          // reset page memory on free
-  { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_decommit, abandoned_page_reset) },// decommit free page memory when a thread terminates  
-  { 0, UNINIT, MI_OPTION(deprecated_segment_reset) },
-  #if defined(__NetBSD__)
-  { 0, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
-  #elif defined(_WIN32)
-  { 4, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
-  #else
-  { 1, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
-  #endif
-  { 25,   UNINIT, MI_OPTION_LEGACY(decommit_delay, reset_delay) }, // page decommit delay in milli-seconds
-  { 0,    UNINIT, MI_OPTION(use_numa_nodes) },    // 0 = use available numa nodes, otherwise use at most N nodes. 
-  { 0,    UNINIT, MI_OPTION(limit_os_alloc) },    // 1 = do not use OS memory for allocation (but only reserved arenas)
-  { 100,  UNINIT, MI_OPTION(os_tag) },            // only apple specific for now but might serve more or less related purpose
-  { 16,   UNINIT, MI_OPTION(max_errors) },        // maximum errors that are output
-  { 16,   UNINIT, MI_OPTION(max_warnings) },      // maximum warnings that are output
-  { 8,    UNINIT, MI_OPTION(max_segment_reclaim)},// max. number of segment reclaims from the abandoned segments per try.  
-  { 1,    UNINIT, MI_OPTION(allow_decommit) },    // decommit slices when no longer used (after decommit_delay milli-seconds)
-  { 500,  UNINIT, MI_OPTION(segment_decommit_delay) }, // decommit delay in milli-seconds for freed segments
-  { 2,    UNINIT, MI_OPTION(decommit_extend_delay) }
+#if MI_DEBUG || defined(MI_SHOW_ERRORS)
+  { 1, MI_OPTION_UNINIT, MI_OPTION(show_errors) },
+#else
+  { 0, MI_OPTION_UNINIT, MI_OPTION(show_errors) },
+#endif
+  { 0, MI_OPTION_UNINIT, MI_OPTION(show_stats) },
+  { MI_DEFAULT_VERBOSE, MI_OPTION_UNINIT, MI_OPTION(verbose) },
+
+  // some of the following options are experimental and not all combinations are allowed.
+  { 1, MI_OPTION_UNINIT, MI_OPTION(deprecated_eager_commit) },  
+  { MI_DEFAULT_ARENA_EAGER_COMMIT,
+       MI_OPTION_UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. linux)
+  { 1, MI_OPTION_UNINIT, MI_OPTION_LEGACY(purge_decommits,reset_decommits) },        // purge decommits memory (instead of reset) (note: on linux this uses MADV_DONTNEED for decommit)
+  { MI_DEFAULT_ALLOW_LARGE_OS_PAGES,
+       MI_OPTION_UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) },    // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
+  { MI_DEFAULT_RESERVE_HUGE_OS_PAGES,
+       MI_OPTION_UNINIT, MI_OPTION(reserve_huge_os_pages) },      // per 1GiB huge pages
+  {-1, MI_OPTION_UNINIT, MI_OPTION(reserve_huge_os_pages_at) },   // reserve huge pages at node N
+  { MI_DEFAULT_RESERVE_OS_MEMORY,
+       MI_OPTION_UNINIT, MI_OPTION(reserve_os_memory)     },      // reserve N KiB OS memory in advance (use `option_get_size`)
+  { 0, MI_OPTION_UNINIT, MI_OPTION(deprecated_segment_cache) },   // cache N segments per thread
+  { 0, MI_OPTION_UNINIT, MI_OPTION(deprecated_page_reset) },      // reset page memory on free
+  { 0, MI_OPTION_UNINIT, MI_OPTION(deprecated_abandoned_page_purge) }, 
+  { 0, MI_OPTION_UNINIT, MI_OPTION(deprecated_segment_reset) },   // reset segment memory on free (needs eager commit)
+  { 1, MI_OPTION_UNINIT, MI_OPTION(deprecated_eager_commit_delay) },  
+  { 1000,MI_OPTION_UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) },  // purge delay in milli-seconds
+  { 0,   MI_OPTION_UNINIT, MI_OPTION(use_numa_nodes) },           // 0 = use available numa nodes, otherwise use at most N nodes.
+  { 0,   MI_OPTION_UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) },           // 1 = do not use OS memory for allocation (but only reserved arenas)
+  { 100, MI_OPTION_UNINIT, MI_OPTION(os_tag) },                   // only apple specific for now but might serve more or less related purpose
+  { 32,  MI_OPTION_UNINIT, MI_OPTION(max_errors) },               // maximum errors that are output
+  { 32,  MI_OPTION_UNINIT, MI_OPTION(max_warnings) },             // maximum warnings that are output
+  { 10,  MI_OPTION_UNINIT, MI_OPTION(deprecated_max_segment_reclaim)},       // max. percentage of the abandoned segments to be reclaimed per try.
+  { 0,   MI_OPTION_UNINIT, MI_OPTION(destroy_on_exit)},           // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
+  { MI_DEFAULT_ARENA_RESERVE, MI_OPTION_UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`)
+  { 1,   MI_OPTION_UNINIT, MI_OPTION(arena_purge_mult) },         // purge delay multiplier for arena's
+  { 1,   MI_OPTION_UNINIT, MI_OPTION_LEGACY(deprecated_purge_extend_delay, decommit_extend_delay) },
+  { MI_DEFAULT_DISALLOW_ARENA_ALLOC,   MI_OPTION_UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's)
+  { 400, MI_OPTION_UNINIT, MI_OPTION(retry_on_oom) },             // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries.
+#if defined(MI_VISIT_ABANDONED)
+  { 1,   MI_OPTION_INITIALIZED, MI_OPTION(visit_abandoned) },     // allow visiting theap blocks in abandoned segments; requires taking locks during reclaim.
+#else
+  { 0,   MI_OPTION_UNINIT, MI_OPTION(visit_abandoned) },
+#endif
+  { 0,   MI_OPTION_UNINIT, MI_OPTION(guarded_min) },              // only used when building with MI_GUARDED: minimal rounded object size for guarded objects
+  { MI_GiB, MI_OPTION_UNINIT, MI_OPTION(guarded_max) },           // only used when building with MI_GUARDED: maximal rounded object size for guarded objects
+  { 0,   MI_OPTION_UNINIT, MI_OPTION(guarded_precise) },          // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0)
+  { MI_DEFAULT_GUARDED_SAMPLE_RATE,
+         MI_OPTION_UNINIT, MI_OPTION(guarded_sample_rate)},       // 1 out of N allocations in the min/max range will be guarded (=4000)
+  { 0,   MI_OPTION_UNINIT, MI_OPTION(guarded_sample_seed)},
+  { 10000, MI_OPTION_UNINIT, MI_OPTION(generic_collect) },        // collect theaps every N (=10000) generic allocation calls
+  { 0,   MI_OPTION_UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim abandoned (small) pages on a free: -1 = disable completely, 0 = only reclaim into the originating theap, 1 = reclaim on free across theaps
+  { 2,   MI_OPTION_UNINIT, MI_OPTION(page_full_retain) },         // number of (small) pages to retain in the free page queues
+  { 4,   MI_OPTION_UNINIT, MI_OPTION(page_max_candidates) },      // max search to find a best page candidate
+  { 0,   MI_OPTION_UNINIT, MI_OPTION(max_vabits) },               // max virtual address space bits
+  { MI_DEFAULT_PAGEMAP_COMMIT,
+         MI_OPTION_UNINIT, MI_OPTION(pagemap_commit) },           // commit the full pagemap upfront?
+  { 0,   MI_OPTION_UNINIT, MI_OPTION(page_commit_on_demand) },    // commit pages on-demand (2 disables this only on overcommit systems (like Linux))
+  { MI_DEFAULT_PAGE_MAX_RECLAIM,
+         MI_OPTION_UNINIT, MI_OPTION(page_max_reclaim) },         // don't reclaim (small) pages of the same originating theap if we already own N pages in that size class
+  { MI_DEFAULT_PAGE_CROSS_THREAD_MAX_RECLAIM,
+         MI_OPTION_UNINIT, MI_OPTION(page_cross_thread_max_reclaim) }, // don't reclaim (small) pages across threads if we already own N pages in that size class
+  { MI_DEFAULT_ALLOW_THP,
+         MI_OPTION_UNINIT, MI_OPTION(allow_thp) },                // allow transparent huge pages? (=1) (on Android =0 by default). Set to 0 to disable THP for the process.
+  { 0,   MI_OPTION_UNINIT, MI_OPTION(minimal_purge_size) },       // set minimal purge size (in KiB) (=0). By default set to either 64 or 2048 if THP is enabled.
+  { MI_DEFAULT_ARENA_MAX_OBJECT_SIZE,   
+         MI_OPTION_UNINIT, MI_OPTION(arena_max_object_size) },    // set maximal object size that can be allocated in an arena (in KiB) (=2GiB on 64-bit). 
 };
 
 static void mi_option_init(mi_option_desc_t* desc);
 
+static bool mi_option_has_size_in_kib(mi_option_t option) {  // true for the options whose value is a size in KiB
+  if (option == mi_option_reserve_os_memory || option == mi_option_arena_reserve) return true;
+  return (option == mi_option_minimal_purge_size || option == mi_option_arena_max_object_size);
+}
+
 void _mi_options_init(void) {
-  // called on process load; should not be called before the CRT is initialized!
-  // (e.g. do not call this from process_init as that may run before CRT initialization)
-  mi_add_stderr_output(); // now it safe to use stderr for output
+  // called on process load
   for(int i = 0; i < _mi_option_last; i++ ) {
     mi_option_t option = (mi_option_t)i;
     long l = mi_option_get(option); MI_UNUSED(l); // initialize
-    if (option != mi_option_verbose) {
-      mi_option_desc_t* desc = &options[option];
-      _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
-    }
   }
   mi_max_error_count = mi_option_get(mi_option_max_errors);
   mi_max_warning_count = mi_option_get(mi_option_max_warnings);
+  #if MI_GUARDED
+  if (mi_option_get(mi_option_guarded_sample_rate) > 0) {
+    if (mi_option_is_enabled(mi_option_allow_large_os_pages)) {
+      mi_option_disable(mi_option_allow_large_os_pages);
+      _mi_warning_message("option 'allow_large_os_pages' is disabled to allow for guarded objects\n");
+    }
+  }
+  #endif  
+}
+
+// called at actual process load, it should be safe to print now
+void _mi_options_post_init(void) {
+  mi_add_stderr_output(); // now it is safe to use stderr for output
+  if (mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); }  // verbose mode: show the version, all options, and the build configuration
+}
+
+#define mi_stringifyx(str)  #str                // and stringify
+#define mi_stringify(str)   mi_stringifyx(str)  // expand
+
+mi_decl_export void mi_options_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept
+{ // Print the version, the current value of every option, and the build configuration through `_mi_fprintf(out, arg, ...)`.
+  // show version (MI_MALLOC_VERSION is decoded as: major*1000 + minor*100 + patch)
+  const int vermajor = MI_MALLOC_VERSION/1000;
+  const int verminor = (MI_MALLOC_VERSION%1000)/100;
+  const int verpatch = (MI_MALLOC_VERSION%100);
+  _mi_fprintf(out, arg, "v%i.%i.%i%s%s (built on %s, %s)\n", vermajor, verminor, verpatch,
+      #if defined(MI_CMAKE_BUILD_TYPE)
+      ", " mi_stringify(MI_CMAKE_BUILD_TYPE)
+      #else
+      ""
+      #endif
+      ,
+      #if defined(MI_GIT_DESCRIBE)
+      ", git " mi_stringify(MI_GIT_DESCRIBE)
+      #else
+      ""
+      #endif
+      , __DATE__, __TIME__);
+
+  // show options
+  for (int i = 0; i < _mi_option_last; i++) {
+    mi_option_t option = (mi_option_t)i;
+    long l = mi_option_get(option); MI_UNUSED(l); // possibly initialize
+    mi_option_desc_t* desc = &mi_options[option];
+    _mi_fprintf(out, arg, "option '%s': %ld %s\n", desc->name, desc->value, (mi_option_has_size_in_kib(option) ? "KiB" : ""));
+  }
+
+  // show build configuration
+  _mi_fprintf(out, arg, "debug level : %d\n", MI_DEBUG );
+  _mi_fprintf(out, arg, "secure level: %d\n", MI_SECURE );
+  _mi_fprintf(out, arg, "mem tracking: %s\n", MI_TRACK_TOOL);
+  #if MI_GUARDED
+  _mi_fprintf(out, arg, "guarded build: %s\n", mi_option_get(mi_option_guarded_sample_rate) != 0 ? "enabled" : "disabled");
+  #endif
+  #if MI_TSAN
+  _mi_fprintf(out, arg, "thread sanitizer enabled\n");  // (the message used to be misspelled as "santizer")
+  #endif
 }
 
+mi_decl_export void mi_options_print(void) mi_attr_noexcept {  // convenience wrapper around `mi_options_print_out`
+  mi_options_print_out(NULL, NULL);  // passes NULL for both the output function and its argument
+}
+
+long _mi_option_get_fast(mi_option_t option) {  // returns the stored value as-is: unlike `mi_option_get` it never calls `mi_option_init`
+  mi_assert(option >= 0 && option < _mi_option_last);  // the range is only checked by this assertion
+  mi_option_desc_t* desc = &mi_options[option];
+  mi_assert(desc->option == option);  // index should match the option
+  //mi_assert(desc->init != MI_OPTION_UNINIT);
+  return desc->value;
+}
+
+
 mi_decl_nodiscard long mi_option_get(mi_option_t option) {
   mi_assert(option >= 0 && option < _mi_option_last);
   if (option < 0 || option >= _mi_option_last) return 0;
-  mi_option_desc_t* desc = &options[option];
+  mi_option_desc_t* desc = &mi_options[option];
   mi_assert(desc->option == option);  // index should match the option
-  if (mi_unlikely(desc->init == UNINIT)) {
+  if mi_unlikely(desc->init == MI_OPTION_UNINIT) {
     mi_option_init(desc);
   }
   return desc->value;
@@ -131,20 +283,36 @@ mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long ma
   return (x < min ? min : (x > max ? max : x));
 }
 
+mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) {
+  // Option value in bytes: negative values yield 0 and KiB-based options are scaled by MI_KiB.
+  // The scaling saturates at MI_MAX_ALLOC_SIZE instead of wrapping (values set via `mi_option_set` are not clamped).
+  const long x = mi_option_get(option);
+  size_t size = (x < 0 ? 0 : (size_t)x);
+  if (mi_option_has_size_in_kib(option) && mi_mul_overflow(size, MI_KiB, &size)) { size = MI_MAX_ALLOC_SIZE; }
+  return size;
+}
+
 void mi_option_set(mi_option_t option, long value) {
   mi_assert(option >= 0 && option < _mi_option_last);
   if (option < 0 || option >= _mi_option_last) return;
-  mi_option_desc_t* desc = &options[option];
+  mi_option_desc_t* desc = &mi_options[option];
   mi_assert(desc->option == option);  // index should match the option
   desc->value = value;
-  desc->init = INITIALIZED;
+  desc->init = MI_OPTION_INITIALIZED;  // marked as explicitly set, so `mi_option_set_default` will no longer overwrite the value
+  // keep guarded_min <= guarded_max: the other bound is moved to `value` too; that nested call compares equal values and stops there.
+  if (desc->option == mi_option_guarded_min && _mi_option_get_fast(mi_option_guarded_max) < value) {
+    mi_option_set(mi_option_guarded_max, value);
+  }
+  else if (desc->option == mi_option_guarded_max && _mi_option_get_fast(mi_option_guarded_min) > value) {
+    mi_option_set(mi_option_guarded_min, value);
+  }
 }
 
 void mi_option_set_default(mi_option_t option, long value) {
   mi_assert(option >= 0 && option < _mi_option_last);
   if (option < 0 || option >= _mi_option_last) return;
-  mi_option_desc_t* desc = &options[option];
-  if (desc->init != INITIALIZED) {
+  mi_option_desc_t* desc = &mi_options[option];
+  if (desc->init != MI_OPTION_INITIALIZED) {
     desc->value = value;
   }
 }
@@ -169,28 +337,11 @@ void mi_option_disable(mi_option_t option) {
   mi_option_set_enabled(option,false);
 }
 
-
-static void mi_out_stderr(const char* msg, void* arg) {
+static void mi_cdecl mi_out_stderr(const char* msg, void* arg) {
   MI_UNUSED(arg);
-  if (msg == NULL) return;
-  #ifdef _WIN32
-  // on windows with redirection, the C runtime cannot handle locale dependent output
-  // after the main thread closes so we use direct console output.
-  if (!_mi_preloading()) { 
-    // _cputs(msg);  // _cputs cannot be used at is aborts if it fails to lock the console
-    static HANDLE hcon = INVALID_HANDLE_VALUE;
-    if (hcon == INVALID_HANDLE_VALUE) {
-      hcon = GetStdHandle(STD_ERROR_HANDLE);
-    }
-    const size_t len = strlen(msg);
-    if (hcon != INVALID_HANDLE_VALUE && len > 0 && len < UINT32_MAX) {
-      DWORD written = 0;
-      WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL);
-    }
+  if (msg != NULL && msg[0] != 0) {
+    _mi_prim_out_stderr(msg);
   }
-  #else
-  fputs(msg, stderr);
-  #endif
 }
 
 // Since an output function can be registered earliest in the `main`
@@ -198,16 +349,16 @@ static void mi_out_stderr(const char* msg, void* arg) {
 // an output function is registered it is called immediately with
 // the output up to that point.
 #ifndef MI_MAX_DELAY_OUTPUT
-#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024))
+#define MI_MAX_DELAY_OUTPUT ((size_t)(16*1024))
 #endif
-static char out_buf[MI_MAX_DELAY_OUTPUT+1];
+static char mi_output_buffer[MI_MAX_DELAY_OUTPUT+1];
 static _Atomic(size_t) out_len;
 
-static void mi_out_buf(const char* msg, void* arg) {
+static void mi_cdecl mi_out_buf(const char* msg, void* arg) {
   MI_UNUSED(arg);
   if (msg==NULL) return;
   if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
-  size_t n = strlen(msg);
+  size_t n = _mi_strlen(msg);
   if (n==0) return;
   // claim space
   size_t start = mi_atomic_add_acq_rel(&out_len, n);
@@ -216,7 +367,8 @@ static void mi_out_buf(const char* msg, void* arg) {
   if (start+n >= MI_MAX_DELAY_OUTPUT) {
     n = MI_MAX_DELAY_OUTPUT-start-1;
   }
-  _mi_memcpy(&out_buf[start], msg, n);
+  mi_assert_internal(start + n <= MI_MAX_DELAY_OUTPUT);
+  _mi_memcpy(&mi_output_buffer[start], msg, n);
 }
 
 static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) {
@@ -225,17 +377,17 @@ static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) {
   size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
   // and output the current contents
   if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
-  out_buf[count] = 0;
-  out(out_buf,arg);
+  mi_output_buffer[count] = 0;
+  out(mi_output_buffer,arg);
   if (!no_more_buf) {
-    out_buf[count] = '\n'; // if continue with the buffer, insert a newline
+    mi_output_buffer[count] = '\n'; // if continue with the buffer, insert a newline
   }
 }
 
 
 // Once this module is loaded, switch to this routine
 // which outputs to stderr and the delayed output buffer.
-static void mi_out_buf_stderr(const char* msg, void* arg) {
+static void mi_cdecl mi_out_buf_stderr(const char* msg, void* arg) {
   mi_out_stderr(msg,arg);
   mi_out_buf(msg,arg);
 }
@@ -264,10 +416,12 @@ void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept {
 }
 
 // add stderr to the delayed output after the module is loaded
-static void mi_add_stderr_output() {
+static void mi_add_stderr_output(void) {
   mi_assert_internal(mi_out_default == NULL);
-  mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr
-  mi_out_default = &mi_out_buf_stderr;           // and add stderr to the delayed output
+  if (mi_out_default==NULL) {
+    mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr
+    mi_out_default = &mi_out_buf_stderr;           // and add stderr to the delayed output
+  }
 }
 
 // --------------------------------------------------------
@@ -280,11 +434,11 @@ static _Atomic(size_t) warning_count; // = 0;  // when >= max_warning_count stop
 // inside the C runtime causes another message.
 // In some cases (like on macOS) the loader already allocates which
 // calls into mimalloc; if we then access thread locals (like `recurse`)
-// this may crash as the access may call _tlv_bootstrap that tries to 
+// this may crash as the access may call _tlv_bootstrap that tries to
 // (recursively) invoke malloc again to allocate space for the thread local
 // variables on demand. This is why we use a _mi_preloading test on such
 // platforms. However, C code generator may move the initial thread local address
-// load before the `if` and we therefore split it out in a separate funcion.
+// load before the `if` and we therefore split it out in a separate function.
 static mi_decl_thread bool recurse = false;
 
 static mi_decl_noinline bool mi_recurse_enter_prim(void) {
@@ -298,21 +452,21 @@ static mi_decl_noinline void mi_recurse_exit_prim(void) {
 }
 
 static bool mi_recurse_enter(void) {
-  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
-  if (_mi_preloading()) return true;
+  #if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD)
+  if (_mi_preloading()) return false;  // while preloading, refuse entry so callers return early without touching the thread-local `recurse`
   #endif
   return mi_recurse_enter_prim();
 }
 
 static void mi_recurse_exit(void) {
-  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
+  #if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD)
   if (_mi_preloading()) return;
   #endif
   mi_recurse_exit_prim();
 }
 
 void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
-  if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr?
+  if (out==NULL || (void*)out==(void*)stdout || (void*)out==(void*)stderr) { // TODO: use mi_out_stderr for stderr?
     if (!mi_recurse_enter()) return;
     out = mi_out_get_default(&arg);
     if (prefix != NULL) out(prefix, arg);
@@ -326,12 +480,12 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me
 }
 
 // Define our own limited `fprintf` that avoids memory allocation.
-// We do this using `snprintf` with a limited buffer.
+// We do this using `_mi_vsnprintf` with a limited buffer.
 static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
-  char buf[512];
+  char buf[992];
   if (fmt==NULL) return;
   if (!mi_recurse_enter()) return;
-  vsnprintf(buf,sizeof(buf)-1,fmt,args);
+  _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args);
   mi_recurse_exit();
   _mi_fputs(out,arg,prefix,buf);
 }
@@ -344,9 +498,9 @@ void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) {
 }
 
 static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args) {
-  if (prefix != NULL && strlen(prefix) <= 32 && !_mi_is_main_thread()) {
+  if (prefix != NULL && _mi_strnlen(prefix,33) <= 32 && !_mi_is_main_thread()) {
     char tprefix[64];
-    snprintf(tprefix, sizeof(tprefix), "%sthread 0x%zx: ", prefix, _mi_thread_id());
+    _mi_snprintf(tprefix, sizeof(tprefix), "%sthread 0x%tx: ", prefix, (uintptr_t)_mi_thread_id());
     mi_vfprintf(out, arg, tprefix, fmt, args);
   }
   else {
@@ -354,6 +508,20 @@ static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix
   }
 }
 
+void _mi_raw_message(const char* fmt, ...) {
+  va_list ap;                              // forward the variadic arguments as-is
+  va_start(ap, fmt);
+  mi_vfprintf(NULL, NULL, NULL, fmt, ap);  // NULL output function and argument, and no prefix
+  va_end(ap);
+}
+
+void _mi_message(const char* fmt, ...) {
+  va_list ap;
+  va_start(ap, fmt);
+  mi_vfprintf_thread(NULL, NULL, "mimalloc: ", fmt, ap);  // "mimalloc: " prefix; a thread id is appended to it off the main thread
+  va_end(ap);
+}
+
 void _mi_trace_message(const char* fmt, ...) {
   if (mi_option_get(mi_option_verbose) <= 1) return;  // only with verbose level 2 or higher
   va_list args;
@@ -391,7 +559,7 @@ void _mi_warning_message(const char* fmt, ...) {
 
 
 #if MI_DEBUG
-void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) {
+mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) mi_attr_noexcept {
   _mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n  assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion);
   abort();
 }
@@ -406,7 +574,7 @@ static _Atomic(void*) mi_error_arg;     // = NULL
 
 static void mi_error_default(int err) {
   MI_UNUSED(err);
-#if (MI_DEBUG>0) 
+#if (MI_DEBUG>0)
   if (err==EFAULT) {
     #ifdef _MSC_VER
     __debugbreak();
@@ -421,6 +589,7 @@ static void mi_error_default(int err) {
 #endif
 #if defined(MI_XMALLOC)
   if (err==ENOMEM || err==EOVERFLOW) { // abort on memory allocation fails in xmalloc mode
+    // SPADES_LOCAL begin: show process info
     size_t elapsed, user_time, sys_time;
     size_t current_rss, peak_rss, current_commit, peak_commit, page_faults;
     mi_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
@@ -430,7 +599,7 @@ static void mi_error_default(int err) {
                  user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults );
     _mi_fprintf(NULL, NULL, "%10s: current: %lu, peak: %lu\n", "rss", current_rss, peak_rss);
     _mi_fprintf(NULL, NULL, "%10s: current: %lu, peak: %lu\n", "commit", current_commit, peak_commit);
-
+    // SPADES_LOCAL end
     abort();
   }
 #endif
@@ -460,178 +629,80 @@ void _mi_error_message(int err, const char* fmt, ...) {
 // Initialize options by checking the environment
 // --------------------------------------------------------
 
-static void mi_strlcpy(char* dest, const char* src, size_t dest_size) {
-  if (dest==NULL || src==NULL || dest_size == 0) return;
-  // copy until end of src, or when dest is (almost) full
-  while (*src != 0 && dest_size > 1) {
-    *dest++ = *src++;
-    dest_size--;
-  }
-  // always zero terminate
-  *dest = 0;
-}
+// TODO: implement ourselves to reduce dependencies on the C runtime
+#include <stdlib.h> // strtol
+#include <string.h> // strstr
 
-static void mi_strlcat(char* dest, const char* src, size_t dest_size) {
-  if (dest==NULL || src==NULL || dest_size == 0) return;
-  // find end of string in the dest buffer
-  while (*dest != 0 && dest_size > 1) {
-    dest++;
-    dest_size--;
-  }
-  // and catenate
-  mi_strlcpy(dest, src, dest_size);
-}
 
-#ifdef MI_NO_GETENV
-static bool mi_getenv(const char* name, char* result, size_t result_size) {
-  MI_UNUSED(name);
-  MI_UNUSED(result);
-  MI_UNUSED(result_size);
-  return false;
-}
-#else
-static inline int mi_strnicmp(const char* s, const char* t, size_t n) {
-  if (n==0) return 0;
-  for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) {
-    if (toupper(*s) != toupper(*t)) break;
-  }
-  return (n==0 ? 0 : *s - *t);
-}
-#if defined _WIN32
-// On Windows use GetEnvironmentVariable instead of getenv to work
-// reliably even when this is invoked before the C runtime is initialized.
-// i.e. when `_mi_preloading() == true`.
-// Note: on windows, environment names are not case sensitive.
-#include <windows.h>
-static bool mi_getenv(const char* name, char* result, size_t result_size) {
-  result[0] = 0;
-  size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size);
-  return (len > 0 && len < result_size);
-}
-#elif !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0)
-// On Posix systemsr use `environ` to acces environment variables 
-// even before the C runtime is initialized.
-#if defined(__APPLE__) && defined(__has_include) && __has_include(<crt_externs.h>)
-#include <crt_externs.h>
-static char** mi_get_environ(void) {
-  return (*_NSGetEnviron());
-}
-#else 
-extern char** environ;
-static char** mi_get_environ(void) {
-  return environ;
-}
-#endif
-static bool mi_getenv(const char* name, char* result, size_t result_size) {
-  if (name==NULL) return false;  
-  const size_t len = strlen(name);
-  if (len == 0) return false;  
-  char** env = mi_get_environ();
-  if (env == NULL) return false;
-  // compare up to 256 entries
-  for (int i = 0; i < 256 && env[i] != NULL; i++) {
-    const char* s = env[i];
-    if (mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive
-      // found it
-      mi_strlcpy(result, s + len + 1, result_size);
-      return true;
-    }
-  }
-  return false;
-}
-#else  
-// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime
-static bool mi_getenv(const char* name, char* result, size_t result_size) {
-  // cannot call getenv() when still initializing the C runtime.
-  if (_mi_preloading()) return false;
-  const char* s = getenv(name);
-  if (s == NULL) {
-    // we check the upper case name too.
-    char buf[64+1];
-    size_t len = strlen(name);
-    if (len >= sizeof(buf)) len = sizeof(buf) - 1;
-    for (size_t i = 0; i < len; i++) {
-      buf[i] = toupper(name[i]);
-    }
-    buf[len] = 0;
-    s = getenv(buf);
-  }
-  if (s != NULL && strlen(s) < result_size) {
-    mi_strlcpy(result, s, result_size);
-    return true;
-  }
-  else {
-    return false;
-  }
-}
-#endif  // !MI_USE_ENVIRON
-#endif  // !MI_NO_GETENV
-
-static void mi_option_init(mi_option_desc_t* desc) {  
+static void mi_option_init(mi_option_desc_t* desc) {
   // Read option value from the environment
-  char s[64+1];
+  char s[64 + 1];
   char buf[64+1];
-  mi_strlcpy(buf, "mimalloc_", sizeof(buf));
-  mi_strlcat(buf, desc->name, sizeof(buf));
-  bool found = mi_getenv(buf,s,sizeof(s));
+  _mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+  _mi_strlcat(buf, desc->name, sizeof(buf));
+  bool found = _mi_getenv(buf, s, sizeof(s));
   if (!found && desc->legacy_name != NULL) {
-    mi_strlcpy(buf, "mimalloc_", sizeof(buf));
-    mi_strlcat(buf, desc->legacy_name, sizeof(buf));
-    found = mi_getenv(buf,s,sizeof(s));
+    _mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+    _mi_strlcat(buf, desc->legacy_name, sizeof(buf));
+    found = _mi_getenv(buf, s, sizeof(s));
     if (found) {
-      _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name );
-    }    
+      _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name);
+    }
   }
 
   if (found) {
-    size_t len = strlen(s);
-    if (len >= sizeof(buf)) len = sizeof(buf) - 1;
+    size_t len = _mi_strnlen(s, sizeof(buf) - 1);
     for (size_t i = 0; i < len; i++) {
-      buf[i] = (char)toupper(s[i]);
+      buf[i] = _mi_toupper(s[i]);
     }
     buf[len] = 0;
-    if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) {
+    if (buf[0] == 0 || strstr("1;TRUE;YES;ON", buf) != NULL) {
       desc->value = 1;
-      desc->init = INITIALIZED;
+      desc->init = MI_OPTION_INITIALIZED;
     }
     else if (strstr("0;FALSE;NO;OFF", buf) != NULL) {
       desc->value = 0;
-      desc->init = INITIALIZED;
+      desc->init = MI_OPTION_INITIALIZED;
     }
     else {
       char* end = buf;
       long value = strtol(buf, &end, 10);
-      if (desc->option == mi_option_reserve_os_memory) {
-        // this option is interpreted in KiB to prevent overflow of `long`
+      if (mi_option_has_size_in_kib(desc->option)) {
+        // this option is interpreted in KiB to prevent overflow of `long` for large allocations
+        // (long is 32-bit on 64-bit windows, which allows for just under 2TiB max, i.e. LONG_MAX KiB.)
+        size_t size = (value < 0 ? 0 : (size_t)value);
+        bool overflow = false;
         if (*end == 'K') { end++; }
-        else if (*end == 'M') { value *= MI_KiB; end++; }
-        else if (*end == 'G') { value *= MI_MiB; end++; }
-        else { value = (value + MI_KiB - 1) / MI_KiB; }
-        if (end[0] == 'I' && end[1] == 'B') { end += 2; }
-        else if (*end == 'B') { end++; }
+        else if (*end == 'M') { overflow = mi_mul_overflow(size,MI_KiB,&size); end++; }
+        else if (*end == 'G') { overflow = mi_mul_overflow(size,MI_MiB,&size); end++; }
+        else if (*end == 'T') { overflow = mi_mul_overflow(size,MI_GiB,&size); end++; }
+        else { size = (size + MI_KiB - 1) / MI_KiB; }
+        if (end[0] == 'I' && end[1] == 'B') { end += 2; } // KiB, MiB, GiB, TiB
+        else if (*end == 'B') { end++; }                  // Kb, Mb, Gb, Tb
+        if (overflow || size > MI_MAX_ALLOC_SIZE) { size = (MI_MAX_ALLOC_SIZE / MI_KiB); }
+        value = (size > LONG_MAX ? LONG_MAX : (long)size);
       }
       if (*end == 0) {
-        desc->value = value;
-        desc->init = INITIALIZED;
+        mi_option_set(desc->option, value);
       }
       else {
         // set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose.
-        desc->init = DEFAULTED;
+        desc->init = MI_OPTION_DEFAULTED;
         if (desc->option == mi_option_verbose && desc->value == 0) {
           // if the 'mimalloc_verbose' env var has a bogus value we'd never know
           // (since the value defaults to 'off') so in that case briefly enable verbose
           desc->value = 1;
-          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name);
           desc->value = 0;
         }
         else {
-          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name);
         }
       }
     }
-    mi_assert_internal(desc->init != UNINIT);
+    mi_assert_internal(desc->init != MI_OPTION_UNINIT);
   }
   else if (!_mi_preloading()) {
-    desc->init = DEFAULTED;
+    desc->init = MI_OPTION_DEFAULTED;
   }
 }
diff --git a/ext/src/mimalloc/src/os.c b/ext/src/mimalloc/src/os.c
index 52b2500712..45d1d8ea7f 100644
--- a/ext/src/mimalloc/src/os.c
+++ b/ext/src/mimalloc/src/os.c
@@ -1,118 +1,87 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
-#ifndef _DEFAULT_SOURCE
-#define _DEFAULT_SOURCE   // ensure mmap flags are defined
-#endif
-
-#if defined(__sun)
-// illumos provides new mman.h api when any of these are defined
-// otherwise the old api based on caddr_t which predates the void pointers one.
-// stock solaris provides only the former, chose to atomically to discard those
-// flags only here rather than project wide tough.
-#undef _XOPEN_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
-
-#include <string.h>  // strerror
-
-#ifdef _MSC_VER
-#pragma warning(disable:4996)  // strerror
-#endif
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
 
-#if defined(__wasi__)
-#define MI_USE_SBRK
-#endif
-
-#if defined(_WIN32)
-#include <windows.h>
-#elif defined(__wasi__)
-#include <unistd.h>    // sbrk
-#else
-#include <sys/mman.h>  // mmap
-#include <unistd.h>    // sysconf
-#if defined(__linux__)
-#include <features.h>
-#include <fcntl.h>
-#if defined(__GLIBC__)
-#include <linux/mman.h> // linux mmap flags
+/* -----------------------------------------------------------
+  Initialization.
+----------------------------------------------------------- */
+#ifndef MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB
+#if MI_INTPTR_SIZE < 8
+#define MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB   (4*MI_MiB)    // 4 GiB, expressed in KiB; parenthesized so the macro expands as one operand
 #else
-#include <sys/mman.h>
-#endif
-#endif
-#if defined(__APPLE__)
-#include <TargetConditionals.h>
-#if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
-#include <mach/vm_statistics.h>
-#endif
-#endif
-#if defined(__FreeBSD__) || defined(__DragonFly__)
-#include <sys/param.h>
-#if __FreeBSD_version >= 1200000
-#include <sys/cpuset.h>
-#include <sys/domainset.h>
-#endif
-#include <sys/sysctl.h>
+#define MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB   (32*MI_MiB)   // 32 GiB, expressed in KiB; parenthesized so the macro expands as one operand
 #endif
 #endif
 
-/* -----------------------------------------------------------
-  Initialization.
-  On windows initializes support for aligned allocation and
-  large OS pages (if MIMALLOC_LARGE_OS_PAGES is true).
------------------------------------------------------------ */
-bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
-bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats);
+static mi_os_mem_config_t mi_os_mem_config = {
+  4096,     // page size
+  0,        // large page size (usually 2MiB)
+  4096,     // allocation granularity
+  MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB,
+  MI_MAX_VABITS, // in `bits.h`
+  true,     // has overcommit?  (if true we use MAP_NORESERVE on mmap systems)
+  false,    // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span)
+  true,     // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory)
+  false     // has transparent huge pages? (if true we purge in (aligned) large page size chunks only to not fragment such pages)
+};
 
-static void* mi_align_up_ptr(void* p, size_t alignment) {
-  return (void*)_mi_align_up((uintptr_t)p, alignment);
+bool _mi_os_has_overcommit(void) {
+  return mi_os_mem_config.has_overcommit;
 }
 
-static void* mi_align_down_ptr(void* p, size_t alignment) {
-  return (void*)_mi_align_down((uintptr_t)p, alignment);
+bool _mi_os_has_virtual_reserve(void) {
+  return mi_os_mem_config.has_virtual_reserve;
 }
 
 
-// page size (initialized properly in `os_init`)
-static size_t os_page_size = 4096;
-
-// minimal allocation granularity
-static size_t os_alloc_granularity = 4096;
-
-// if non-zero, use large page allocation
-static size_t large_os_page_size = 0;
-
-// is memory overcommit allowed? 
-// set dynamically in _mi_os_init (and if true we use MAP_NORESERVE)
-static bool os_overcommit = true;
-
-bool _mi_os_has_overcommit(void) {
-  return os_overcommit;
-}
-
 // OS (small) page size
 size_t _mi_os_page_size(void) {
-  return os_page_size;
+  return mi_os_mem_config.page_size;
 }
 
 // if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB)
 size_t _mi_os_large_page_size(void) {
-  return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size());
+  return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size());
+}
+
+// Minimal size to purge at once. This can be larger than the OS page size, either
+// because a larger minimum was configured or because transparent huge pages are
+// in use (so that such pages are not fragmented by small purges).
+size_t _mi_os_minimal_purge_size(void) {
+  const size_t configured = mi_option_get_size(mi_option_minimal_purge_size);
+  if (configured != 0) {
+    // an explicit `minimal_purge_size` option wins; round it up to whole OS pages
+    return _mi_align_up(configured, _mi_os_page_size());
+  }
+  // otherwise use large-page units only if the OS has transparent huge pages and the option allows them
+  const bool thp = (mi_os_mem_config.has_transparent_huge_pages && mi_option_is_enabled(mi_option_allow_thp));
+  return (thp ? _mi_os_large_page_size() : _mi_os_page_size());
+}
+
+size_t _mi_os_guard_page_size(void) {
+  const size_t gsize = _mi_os_page_size();
+  mi_assert(gsize <= (MI_ARENA_SLICE_SIZE/4)); // issue #1166
+  return gsize;
 }
 
-#if !defined(MI_USE_SBRK) && !defined(__wasi__)
-static bool use_large_os_page(size_t size, size_t alignment) {
+size_t _mi_os_virtual_address_bits(void) {
+  const size_t vbits = mi_os_mem_config.virtual_address_bits;  // statically initialized to MI_MAX_VABITS in `mi_os_mem_config`
+  mi_assert(vbits <= MI_MAX_VABITS);                           // the configured value should not exceed that maximum
+  return vbits;
+}
+
+bool _mi_os_canuse_large_page(size_t size, size_t alignment) {
   // if we have access, check the size and alignment requirements
-  if (large_os_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false;
-  return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0);
+  if (mi_os_mem_config.large_page_size == 0) return false;
+  return ((size % mi_os_mem_config.large_page_size) == 0 && (alignment % mi_os_mem_config.large_page_size) == 0);
 }
-#endif
 
 // round to a good OS allocation size (bounded by max 12.5% waste)
 size_t _mi_os_good_alloc_size(size_t size) {
@@ -122,579 +91,151 @@ size_t _mi_os_good_alloc_size(size_t size) {
   else if (size < 8*MI_MiB) align_size = 256*MI_KiB;
   else if (size < 32*MI_MiB) align_size = 1*MI_MiB;
   else align_size = 4*MI_MiB;
-  if (mi_unlikely(size >= (SIZE_MAX - align_size))) return size; // possible overflow?
+  if mi_unlikely(size >= (SIZE_MAX - align_size)) return size; // possible overflow?
   return _mi_align_up(size, align_size);
 }
 
-#if defined(_WIN32)
-// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
-// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
-// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB)
-// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's.
-typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E {
-  MiMemExtendedParameterInvalidType = 0,
-  MiMemExtendedParameterAddressRequirements,
-  MiMemExtendedParameterNumaNode,
-  MiMemExtendedParameterPartitionHandle,
-  MiMemExtendedParameterUserPhysicalHandle,
-  MiMemExtendedParameterAttributeFlags,
-  MiMemExtendedParameterMax
-} MI_MEM_EXTENDED_PARAMETER_TYPE; 
-
-typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S {
-  struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type;
-  union  { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg;
-} MI_MEM_EXTENDED_PARAMETER;
-
-typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {
-  PVOID  LowestStartingAddress;
-  PVOID  HighestEndingAddress;
-  SIZE_T Alignment;
-} MI_MEM_ADDRESS_REQUIREMENTS;
-
-#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE   0x00000010
-
-#include <winternl.h>
-typedef PVOID    (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
-typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
-static PVirtualAlloc2 pVirtualAlloc2 = NULL;
-static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
-
-// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7
-typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
-
-typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
-typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber);
-typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
-static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
-static PGetNumaProcessorNodeEx      pGetNumaProcessorNodeEx = NULL;
-static PGetNumaNodeProcessorMaskEx  pGetNumaNodeProcessorMaskEx = NULL;
-
-static bool mi_win_enable_large_os_pages(void)
-{
-  if (large_os_page_size > 0) return true;
-
-  // Try to see if large OS pages are supported
-  // To use large pages on Windows, we first need access permission
-  // Set "Lock pages in memory" permission in the group policy editor
-  // <https://devblogs.microsoft.com/oldnewthing/20110128-00/?p=11643>
-  unsigned long err = 0;
-  HANDLE token = NULL;
-  BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
-  if (ok) {
-    TOKEN_PRIVILEGES tp;
-    ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid);
-    if (ok) {
-      tp.PrivilegeCount = 1;
-      tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
-      ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
-      if (ok) {
-        err = GetLastError();
-        ok = (err == ERROR_SUCCESS);
-        if (ok) {
-          large_os_page_size = GetLargePageMinimum();
-        }
-      }
-    }
-    CloseHandle(token);
-  }
-  if (!ok) {
-    if (err == 0) err = GetLastError();
-    _mi_warning_message("cannot enable large OS page support, error %lu\n", err);
-  }
-  return (ok!=0);
-}
-
-void _mi_os_init(void) 
-{
-  os_overcommit = false;
-  // get the page size
-  SYSTEM_INFO si;
-  GetSystemInfo(&si);
-  if (si.dwPageSize > 0) os_page_size = si.dwPageSize;
-  if (si.dwAllocationGranularity > 0) os_alloc_granularity = si.dwAllocationGranularity;
-  // get the VirtualAlloc2 function
-  HINSTANCE  hDll;
-  hDll = LoadLibrary(TEXT("kernelbase.dll"));
-  if (hDll != NULL) {
-    // use VirtualAlloc2FromApp if possible as it is available to Windows store apps
-    pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp");
-    if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2");
-    FreeLibrary(hDll);
-  }
-  // NtAllocateVirtualMemoryEx is used for huge page allocation
-  hDll = LoadLibrary(TEXT("ntdll.dll"));
-  if (hDll != NULL) {
-    pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
-    FreeLibrary(hDll);
-  }
-  // Try to use Win7+ numa API
-  hDll = LoadLibrary(TEXT("kernel32.dll"));
-  if (hDll != NULL) {
-    pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx");
-    pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
-    pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
-    FreeLibrary(hDll);
-  }
-  if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
-    mi_win_enable_large_os_pages();
-  }
-}
-#elif defined(__wasi__)
-void _mi_os_init(void) {
-  os_overcommit = false;
-  os_page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
-  os_alloc_granularity = 16;
-}
-
-#else  // generic unix
-
-static void os_detect_overcommit(void) {
-#if defined(__linux__)
-  int fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
-	if (fd < 0) return;
-  char buf[32];
-  ssize_t nread = read(fd, &buf, sizeof(buf));
-	close(fd);
-  // <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
-  // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE)
-  if (nread >= 1) {
-    os_overcommit = (buf[0] == '0' || buf[0] == '1');
-  }
-#elif defined(__FreeBSD__)
-  int val = 0;
-  size_t olen = sizeof(val);
-  if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) {
-    os_overcommit = (val != 0);
-  }  
-#else
-  // default: overcommit is true  
-#endif
-}
-
 void _mi_os_init(void) {
-  // get the page size
-  long result = sysconf(_SC_PAGESIZE);
-  if (result > 0) {
-    os_page_size = (size_t)result;
-    os_alloc_granularity = os_page_size;
-  }
-  large_os_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
-  os_detect_overcommit();
-}
-#endif
-
-
-#if defined(MADV_NORMAL)
-static int mi_madvise(void* addr, size_t length, int advice) {
-  #if defined(__sun)
-  return madvise((caddr_t)addr, length, advice);  // Solaris needs cast (issue #520)
-  #else
-  return madvise(addr, length, advice);
-  #endif
+  _mi_prim_mem_init(&mi_os_mem_config);
 }
-#endif
 
 
 /* -----------------------------------------------------------
-  aligned hinting
+  Util
 -------------------------------------------------------------- */
+bool _mi_os_decommit(void* addr, size_t size);
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero);
 
-// On 64-bit systems, we can do efficient aligned allocation by using
-// the 2TiB to 30TiB area to allocate those.
-#if (MI_INTPTR_SIZE >= 8)
-static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
-
-// Return a MI_SEGMENT_SIZE aligned address that is probably available.
-// If this returns NULL, the OS will determine the address but on some OS's that may not be 
-// properly aligned which can be more costly as it needs to be adjusted afterwards.
-// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; 
-// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses 
-//  in the middle of the 2TiB - 6TiB address range (see issue #372))
-
-#define MI_HINT_BASE ((uintptr_t)2 << 40)  // 2TiB start
-#define MI_HINT_AREA ((uintptr_t)4 << 40)  // upto 6TiB   (since before win8 there is "only" 8TiB available to processes)
-#define MI_HINT_MAX  ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
-
-static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
-{
-  if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
-  size = _mi_align_up(size, MI_SEGMENT_SIZE);
-  if (size > 1*MI_GiB) return NULL;  // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
-  #if (MI_SECURE>0)
-  size += MI_SEGMENT_SIZE;        // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas.
-  #endif
-
-  uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
-  if (hint == 0 || hint > MI_HINT_MAX) {   // wrap or initialize
-    uintptr_t init = MI_HINT_BASE;
-    #if (MI_SECURE>0 || MI_DEBUG==0)       // security: randomize start of aligned allocations unless in debug mode
-    uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
-    init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA);  // (randomly 20 bits)*4MiB == 0 to 4TiB
-    #endif
-    uintptr_t expected = hint + size;
-    mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
-    hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
-  }
-  if (hint%try_alignment != 0) return NULL;
-  return (void*)hint;
-}
-#else
-static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
+void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
   MI_UNUSED(try_alignment); MI_UNUSED(size);
   return NULL;
 }
-#endif
-
-/* -----------------------------------------------------------
-  Free memory
--------------------------------------------------------------- */
-
-static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats)
-{
-  if (addr == NULL || size == 0) return true; // || _mi_os_is_huge_reserved(addr)
-  bool err = false;
-#if defined(_WIN32)
-  DWORD errcode = 0;
-  err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
-  if (err) { errcode = GetLastError(); }
-  if (errcode == ERROR_INVALID_ADDRESS) {
-    // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside
-    // the memory region returned by VirtualAlloc; in that case we need to free using
-    // the start of the region.
-    MEMORY_BASIC_INFORMATION info = { 0, 0 };
-    VirtualQuery(addr, &info, sizeof(info));
-    if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < MI_SEGMENT_SIZE) {
-      errcode = 0;
-      err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
-      if (err) { errcode = GetLastError(); }
-    }
-  }
-  if (errcode != 0) {
-    _mi_warning_message("unable to release OS memory: error code 0x%x, addr: %p, size: %zu\n", errcode, addr, size);
-  }
-#elif defined(MI_USE_SBRK) || defined(__wasi__)
-  err = false; // sbrk heap cannot be shrunk
-#else
-  err = (munmap(addr, size) == -1);
-  if (err) {
-    _mi_warning_message("unable to release OS memory: %s, addr: %p, size: %zu\n", strerror(errno), addr, size);
-  }
-#endif
-  if (was_committed) { _mi_stat_decrease(&stats->committed, size); }
-  _mi_stat_decrease(&stats->reserved, size);
-  return !err;  
-}
 
 
 /* -----------------------------------------------------------
-  Raw allocation on Windows (VirtualAlloc) 
--------------------------------------------------------------- */
-
-#ifdef _WIN32
- 
-#define MEM_COMMIT_RESERVE  (MEM_COMMIT|MEM_RESERVE)
+  Guard page allocation
+----------------------------------------------------------- */
 
-static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
-#if (MI_INTPTR_SIZE >= 8)
-  // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
-  if (addr == NULL) {
-    void* hint = mi_os_get_aligned_hint(try_alignment,size);
-    if (hint != NULL) {
-      void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
-      if (p != NULL) return p;
-      _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags);
-      // fall through on error
-    }
-  } 
-#endif
-  // on modern Windows try use VirtualAlloc2 for aligned allocation
-  if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
-    MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
-    reqs.Alignment = try_alignment;
-    MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
-    param.Type.Type = MiMemExtendedParameterAddressRequirements;
-    param.Arg.Pointer = &reqs;
-    void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
-    if (p != NULL) return p;
-    _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
-    // fall through on error
-  }
-  // last resort
-  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+// In secure mode, return the size of a guard page, otherwise 0
+size_t _mi_os_secure_guard_page_size(void) {
+  #if MI_SECURE > 0
+  return _mi_os_guard_page_size();
+  #else
+  return 0;
+  #endif
 }
 
-static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
-  mi_assert_internal(!(large_only && !allow_large));
-  static _Atomic(size_t) large_page_try_ok; // = 0;
-  void* p = NULL;
-  // Try to allocate large OS pages (2MiB) if allowed or required.
-  if ((large_only || use_large_os_page(size, try_alignment))
-      && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
-    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
-    if (!large_only && try_ok > 0) {
-      // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
-      // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
-      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+// In secure mode, try to decommit a guard page at `addr` and report an error if this fails (a NULL `addr` is ignored).
+bool _mi_os_secure_guard_page_set_at(void* addr, mi_memid_t memid) {
+  if (addr == NULL) return true;
+  #if MI_SECURE > 0
+  bool ok = false;
+  if (!memid.is_pinned) {
+    mi_arena_t* const arena = mi_memid_arena(memid);
+    if (arena != NULL && arena->commit_fun != NULL) {
+      ok = (*(arena->commit_fun))(false /* decommit */, addr, _mi_os_secure_guard_page_size(), NULL, arena->commit_fun_arg);
     }
     else {
-      // large OS pages must always reserve and commit.
-      *is_large = true;
-      p = mi_win_virtual_allocx(addr, size, try_alignment, flags | MEM_LARGE_PAGES);
-      if (large_only) return p;
-      // fall back to non-large page allocation on error (`p == NULL`).
-      if (p == NULL) {
-        mi_atomic_store_release(&large_page_try_ok,10UL);  // on error, don't try again for the next N allocations
-      }
+      ok = _mi_os_decommit(addr, _mi_os_secure_guard_page_size());
     }
   }
-  // Fall back to regular page allocation
-  if (p == NULL) {
-    *is_large = ((flags&MEM_LARGE_PAGES) != 0);
-    p = mi_win_virtual_allocx(addr, size, try_alignment, flags);
-  }
-  if (p == NULL) {
-    _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large);
+  if (!ok) {
+    _mi_error_message(EINVAL, "secure level %d, but failed to commit guard page (at %p of size %zu)\n", MI_SECURE, addr, _mi_os_secure_guard_page_size());
   }
-  return p;
+  return ok;
+  #else
+  MI_UNUSED(memid);
+  return true;
+  #endif
 }
 
-/* -----------------------------------------------------------
-  Raw allocation using `sbrk` or `wasm_memory_grow`
--------------------------------------------------------------- */
-
-#elif defined(MI_USE_SBRK) || defined(__wasi__)
-#if defined(MI_USE_SBRK) 
-  static void* mi_memory_grow( size_t size ) {
-    void* p = sbrk(size);
-    if (p == (void*)(-1)) return NULL;
-    #if !defined(__wasi__) // on wasi this is always zero initialized already (?)
-    memset(p,0,size); 
-    #endif
-    return p;
-  }
-#elif defined(__wasi__)
-  static void* mi_memory_grow( size_t size ) {
-    size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size()))
-                            : __builtin_wasm_memory_size(0));
-    if (base == SIZE_MAX) return NULL;     
-    return (void*)(base * _mi_os_page_size());    
-  }
-#endif
-
-#if defined(MI_USE_PTHREADS)
-static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER;
-#endif
+// In secure mode, try to decommit the guard page that ends directly before `addr` and report an error if this fails.
+bool _mi_os_secure_guard_page_set_before(void* addr, mi_memid_t memid) {
+  return _mi_os_secure_guard_page_set_at((uint8_t*)addr - _mi_os_secure_guard_page_size(), memid);
+}
 
-static void* mi_heap_grow(size_t size, size_t try_alignment) {
-  void* p = NULL;
-  if (try_alignment <= 1) {
-    // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now)
-    #if defined(MI_USE_PTHREADS) 
-    pthread_mutex_lock(&mi_heap_grow_mutex);
-    #endif
-    p = mi_memory_grow(size);
-    #if defined(MI_USE_PTHREADS)
-    pthread_mutex_unlock(&mi_heap_grow_mutex);
-    #endif
-  }
-  else {
-    void* base = NULL;
-    size_t alloc_size = 0;
-    // to allocate aligned use a lock to try to avoid thread interaction
-    // between getting the current size and actual allocation
-    // (also, `sbrk` is not thread safe in general)
-    #if defined(MI_USE_PTHREADS)
-    pthread_mutex_lock(&mi_heap_grow_mutex);
-    #endif
-    {
-      void* current = mi_memory_grow(0);  // get current size
-      if (current != NULL) {
-        void* aligned_current = mi_align_up_ptr(current, try_alignment);  // and align from there to minimize wasted space
-        alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size());
-        base = mi_memory_grow(alloc_size);        
-      }
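+// In secure mode, try to recommit the guard page at `addr`; returns true when there is nothing to recommit (NULL `addr`, pinned memory, or a non-secure build).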
+bool _mi_os_secure_guard_page_reset_at(void* addr, mi_memid_t memid) {
+  if (addr == NULL) return true;
+  #if MI_SECURE > 0
+  if (!memid.is_pinned) {
+    mi_arena_t* const arena = mi_memid_arena(memid);
+    if (arena != NULL && arena->commit_fun != NULL) {
+      return (*(arena->commit_fun))(true, addr, _mi_os_secure_guard_page_size(), NULL, arena->commit_fun_arg);
     }
-    #if defined(MI_USE_PTHREADS)
-    pthread_mutex_unlock(&mi_heap_grow_mutex);
-    #endif
-    if (base != NULL) {
-      p = mi_align_up_ptr(base, try_alignment);
-      if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) {
-        // another thread used wasm_memory_grow/sbrk in-between and we do not have enough
-        // space after alignment. Give up (and waste the space as we cannot shrink :-( )
-        // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align)
-        p = NULL;
-      }
+    else {
+      return _mi_os_commit(addr, _mi_os_secure_guard_page_size(), NULL);
     }
   }
-  if (p == NULL) {
-    _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment);    
-    errno = ENOMEM;
-    return NULL;
-  }
-  mi_assert_internal( try_alignment == 0 || (uintptr_t)p % try_alignment == 0 );
-  return p;
+  #else
+  MI_UNUSED(memid);
+  #endif
+  return true;
+}
+
+// In secure mode, try to recommit the guard page that ends directly before `addr`.
+bool _mi_os_secure_guard_page_reset_before(void* addr, mi_memid_t memid) {
+  return _mi_os_secure_guard_page_reset_at((uint8_t*)addr - _mi_os_secure_guard_page_size(), memid);
 }
 
+
 /* -----------------------------------------------------------
-  Raw allocation on Unix's (mmap)
+  Free memory
 -------------------------------------------------------------- */
-#else 
-#define MI_OS_USE_MMAP
-static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
-  MI_UNUSED(try_alignment);  
-  #if defined(MAP_ALIGNED)  // BSD
-  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
-    size_t n = mi_bsr(try_alignment);
-    if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) {  // alignment is a power of 2 and 4096 <= alignment <= 1GiB
-      flags |= MAP_ALIGNED(n);
-      void* p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
-      if (p!=MAP_FAILED) return p;
-      // fall back to regular mmap
-    }
-  }
-  #elif defined(MAP_ALIGN)  // Solaris
-  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
-    void* p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0);  // addr parameter is the required alignment
-    if (p!=MAP_FAILED) return p;
-    // fall back to regular mmap
+
+static void mi_os_free_huge_os_pages(void* p, size_t size, mi_subproc_t* subproc);
+
+static void mi_os_prim_free(void* addr, size_t size, size_t commit_size, mi_subproc_t* subproc) {
+  mi_assert_internal((size % _mi_os_page_size()) == 0);
+  if (addr == NULL) return; // || _mi_os_is_huge_reserved(addr)
+  int err = _mi_prim_free(addr, size);  // allow size==0 (issue #1041)
+  if (err != 0) {
+    _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr);
   }
-  #endif
-  #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
-  // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations
-  if (addr == NULL) {
-    void* hint = mi_os_get_aligned_hint(try_alignment, size);
-    if (hint != NULL) {
-      void* p = mmap(hint, size, protect_flags, flags, fd, 0);
-      if (p!=MAP_FAILED) return p;
-      // fall back to regular mmap
-    }
+  if (subproc == NULL) { subproc = _mi_subproc(); } // from `mi_arenas_unsafe_destroy` we pass subproc_main explicitly as we can no longer use the theap pointer
+  if (commit_size > 0) {
+    mi_subproc_stat_decrease(subproc, committed, commit_size);
   }
-  #endif
-  // regular mmap
-  void* p = mmap(addr, size, protect_flags, flags, fd, 0);
-  if (p!=MAP_FAILED) return p;  
-  // failed to allocate
-  return NULL;
-}
-
-static int mi_unix_mmap_fd(void) {
-#if defined(VM_MAKE_TAG)
-  // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
-  int os_tag = (int)mi_option_get(mi_option_os_tag);
-  if (os_tag < 100 || os_tag > 255) os_tag = 100;
-  return VM_MAKE_TAG(os_tag);
-#else
-  return -1;
-#endif
+  mi_subproc_stat_decrease(subproc, reserved, size);
 }
 
-static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {
-  void* p = NULL;
-  #if !defined(MAP_ANONYMOUS)
-  #define MAP_ANONYMOUS  MAP_ANON
-  #endif
-  #if !defined(MAP_NORESERVE)
-  #define MAP_NORESERVE  0
-  #endif
-  const int fd = mi_unix_mmap_fd();
-  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
-  if (_mi_os_has_overcommit()) {
-    flags |= MAP_NORESERVE;
-  }  
-  #if defined(PROT_MAX)
-  protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
-  #endif    
-  // huge page allocation
-  if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) {
-    static _Atomic(size_t) large_page_try_ok; // = 0;
-    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
-    if (!large_only && try_ok > 0) {
-      // If the OS is not configured for large OS pages, or the user does not have
-      // enough permission, the `mmap` will always fail (but it might also fail for other reasons).
-      // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
-      // to avoid too many failing calls to mmap.
-      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
-    }
-    else {
-      int lflags = flags & ~MAP_NORESERVE;  // using NORESERVE on huge pages seems to fail on Linux
-      int lfd = fd;
-      #ifdef MAP_ALIGNED_SUPER
-      lflags |= MAP_ALIGNED_SUPER;
-      #endif
-      #ifdef MAP_HUGETLB
-      lflags |= MAP_HUGETLB;
-      #endif
-      #ifdef MAP_HUGE_1GB
-      static bool mi_huge_pages_available = true;
-      if ((size % MI_GiB) == 0 && mi_huge_pages_available) {
-        lflags |= MAP_HUGE_1GB;
+void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_subproc_t* subproc /* can be NULL */) {
+  if (mi_memkind_is_os(memid.memkind)) {
+    size_t csize = memid.mem.os.size;
+    if (csize==0) { csize = _mi_os_good_alloc_size(size); }
+    mi_assert_internal(csize >= size);
+    size_t commit_size = (still_committed ? csize : 0);
+    void* base = addr;
+    // different base? (due to alignment)
+    if (memid.mem.os.base != base) {
+      mi_assert(memid.mem.os.base <= addr);
+      base = memid.mem.os.base;
+      const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base;
+      if (memid.mem.os.size==0) {
+        csize += diff;
       }
-      else
-      #endif
-      {
-        #ifdef MAP_HUGE_2MB
-        lflags |= MAP_HUGE_2MB;
-        #endif
-      }
-      #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB
-      lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
-      #endif
-      if (large_only || lflags != flags) {
-        // try large OS page allocation
-        *is_large = true;
-        p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
-        #ifdef MAP_HUGE_1GB
-        if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) {
-          mi_huge_pages_available = false; // don't try huge 1GiB pages again
-          _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno);
-          lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
-          p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
-        }
-        #endif
-        if (large_only) return p;
-        if (p == NULL) {
-          mi_atomic_store_release(&large_page_try_ok, (size_t)8);  // on error, don't try again for the next N allocations
-        }
+      if (still_committed) {
+        commit_size -= diff;  // the (addr-base) part was already un-committed
       }
     }
-  }
-  // regular allocation
-  if (p == NULL) {
-    *is_large = false;
-    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
-    if (p != NULL) {
-      #if defined(MADV_HUGEPAGE)
-      // Many Linux systems don't allow MAP_HUGETLB but they support instead
-      // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE
-      // though since properly aligned allocations will already use large pages if available
-      // in that case -- in particular for our large regions (in `memory.c`).
-      // However, some systems only allow THP if called with explicit `madvise`, so
-      // when large OS pages are enabled for mimalloc, we call `madvise` anyways.
-      if (allow_large && use_large_os_page(size, try_alignment)) {
-        if (mi_madvise(p, size, MADV_HUGEPAGE) == 0) {
-          *is_large = true; // possibly
-        };
-      }
-      #elif defined(__sun)
-      if (allow_large && use_large_os_page(size, try_alignment)) {
-        struct memcntl_mha cmd = {0};
-        cmd.mha_pagesize = large_os_page_size;
-        cmd.mha_cmd = MHA_MAPSIZE_VA;
-        if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
-          *is_large = true;
-        }
-      }      
-      #endif
+    // free it
+    if (memid.memkind == MI_MEM_OS_HUGE) {
+      mi_assert(memid.is_pinned);
+      mi_os_free_huge_os_pages(base, csize, subproc);
+    }
+    else {
+      mi_os_prim_free(base, csize, (still_committed ? commit_size : 0), subproc);
     }
   }
-  if (p == NULL) {
-    _mi_error_message(errno, "unable to allocate OS memory (%zu bytes, error code: %i [%s], address: %p, large only: %d, allow large: %d)\n", size, errno, strerror(errno), addr, large_only, allow_large);
+  else {
+    // nothing to do
+    mi_assert(memid.memkind < MI_MEM_OS);
   }
-  return p;
 }
-#endif
+
+void  _mi_os_free(void* p, size_t size, mi_memid_t memid) {
+  _mi_os_free_ex(p, size, true, memid, NULL);
+}
 
 
 /* -----------------------------------------------------------
@@ -702,148 +243,223 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
 -------------------------------------------------------------- */
 
 // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
-static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+// Also `hint_addr` is a hint and may be ignored.
+static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) {
   mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  mi_assert_internal(is_zero != NULL);
+  mi_assert_internal(is_large != NULL);
   if (size == 0) return NULL;
-  if (!commit) allow_large = false;
-  if (try_alignment == 0) try_alignment = 1; // avoid 0 to ensure there will be no divide by zero when aligning
-
+  if (!commit) { allow_large = false; }
+  if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning
+  *is_zero = false;
   void* p = NULL;
-  /*
-  if (commit && allow_large) {
-    p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment);
-    if (p != NULL) {
-      *is_large = true;
-      return p;
-    }
+  int err = _mi_prim_alloc(hint_addr, size, try_alignment, commit, allow_large, is_large, is_zero, &p);
+  if (err != 0) {
+    _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large);
   }
-  */
 
-  #if defined(_WIN32)
-    int flags = MEM_RESERVE;
-    if (commit) { flags |= MEM_COMMIT; }
-    p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large);
-  #elif defined(MI_USE_SBRK) || defined(__wasi__)
-    MI_UNUSED(allow_large);
-    *is_large = false;
-    p = mi_heap_grow(size, try_alignment);
-  #else
-    int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
-    p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
-  #endif
-  mi_stat_counter_increase(stats->mmap_calls, 1);
+  mi_os_stat_counter_increase(mmap_calls, 1);
   if (p != NULL) {
-    _mi_stat_increase(&stats->reserved, size);
-    if (commit) { _mi_stat_increase(&stats->committed, size); }
+    mi_os_stat_increase(reserved, size);
+    if (commit) {
+      mi_os_stat_increase(committed, size);
+      // seems needed for asan (or `mimalloc-test-api` fails)
+      #ifdef MI_TRACK_ASAN
+      if (*is_zero) { mi_track_mem_defined(p,size); }
+               else { mi_track_mem_undefined(p,size); }
+      #endif
+    }
   }
   return p;
 }
 
+static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) {  // `mi_os_prim_alloc_at` without an address hint
+  return mi_os_prim_alloc_at(NULL, size, try_alignment, commit, allow_large, is_large, is_zero);  // NULL: no placement hint is passed on
+}
+
 
 // Primitive aligned allocation from the OS.
 // This function guarantees the allocated memory is aligned.
-static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base) {
   mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0));
   mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
   mi_assert_internal(is_large != NULL);
+  mi_assert_internal(is_zero != NULL);
+  mi_assert_internal(base != NULL);
   if (!commit) allow_large = false;
   if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
   size = _mi_align_up(size, _mi_os_page_size());
 
-  // try first with a hint (this will be aligned directly on Win 10+ or BSD)
-  void* p = mi_os_mem_alloc(size, alignment, commit, allow_large, is_large, stats);
-  if (p == NULL) return NULL;
-  
-  // if not aligned, free it, overallocate, and unmap around it
-  if (((uintptr_t)p % alignment != 0)) {
-    mi_os_mem_free(p, size, commit, stats);
-    _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (%zu bytes, address: %p, alignment: %zu, commit: %d)\n", size, p, alignment, commit);
+  // try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size.
+  const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8);
+
+  void* p = NULL;
+  if (try_direct_alloc) {
+    p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
+  }
+
+  // aligned already?
+  if (p != NULL && ((uintptr_t)p % alignment) == 0) {
+    *base = p;
+  }
+  else {
+    // if not aligned, free it, overallocate, and unmap around it
+    #if !MI_TRACK_ASAN
+    if (try_direct_alloc) {
+      _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
+    }
+    #endif
+    if (p != NULL) { mi_os_prim_free(p, size, (commit ? size : 0), NULL); }
     if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
     const size_t over_size = size + alignment;
 
-#if _WIN32
-    // over-allocate uncommitted (virtual) memory
-    p = mi_os_mem_alloc(over_size, 0 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, stats);
-    if (p == NULL) return NULL;
-    
-    // set p to the aligned part in the full region
-    // note: this is dangerous on Windows as VirtualFree needs the actual region pointer
-    // but in mi_os_mem_free we handle this (hopefully exceptional) situation.
-    p = mi_align_up_ptr(p, alignment);
-
-    // explicitly commit only the aligned part
-    if (commit) {
-      _mi_os_commit(p, size, NULL, stats);
+    if (!mi_os_mem_config.has_partial_free) {  // win32 virtualAlloc cannot free parts of an allocated block
+      // over-allocate uncommitted (virtual) memory
+      p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero);
+      if (p == NULL) return NULL;
+
+      // set p to the aligned part in the full region
+      // note: on Windows VirtualFree needs the actual base pointer
+      // this is handled by having the `base` field in the memid.
+      *base = p; // remember the base
+      p = _mi_align_up_ptr(p, alignment);
+
+      // explicitly commit only the aligned part
+      if (commit) {
+        if (!_mi_os_commit(p, size, NULL)) {
+          mi_os_prim_free(*base, over_size, 0, NULL);
+          return NULL;
+        }
+      }
+    }
+    else  { // mmap can free inside an allocation
+      // overallocate...
+      p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero);
+      if (p == NULL) return NULL;
+
+      // and selectively unmap parts around the over-allocated area.
+      void* aligned_p = _mi_align_up_ptr(p, alignment);
+      size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
+      size_t mid_size = _mi_align_up(size, _mi_os_page_size());
+      size_t post_size = over_size - pre_size - mid_size;
+      mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size);
+      if (pre_size > 0)  { mi_os_prim_free(p, pre_size, (commit ? pre_size : 0), NULL); }
+      if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, (commit ? post_size : 0), NULL); }
+      // we can return the aligned pointer on `mmap` systems
+      p = aligned_p;
+      *base = aligned_p; // since we freed the pre part, `*base == p`.
     }
-#else
-    // overallocate...
-    p = mi_os_mem_alloc(over_size, 1, commit, false, is_large, stats);
-    if (p == NULL) return NULL;
-    // and selectively unmap parts around the over-allocated area. (noop on sbrk)
-    void* aligned_p = mi_align_up_ptr(p, alignment);
-    size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
-    size_t mid_size = _mi_align_up(size, _mi_os_page_size());
-    size_t post_size = over_size - pre_size - mid_size;
-    mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size);
-    if (pre_size > 0)  mi_os_mem_free(p, pre_size, commit, stats);
-    if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats);
-    // we can return the aligned pointer on `mmap` (and sbrk) systems
-    p = aligned_p;
-#endif
   }
 
-  mi_assert_internal(p == NULL || (p != NULL && ((uintptr_t)p % alignment) == 0));
+  mi_assert_internal(p == NULL || (p != NULL && *base != NULL && ((uintptr_t)p % alignment) == 0));
   return p;
 }
 
 
 /* -----------------------------------------------------------
-  OS API: alloc, free, alloc_aligned
+  OS API: alloc and alloc_aligned
 ----------------------------------------------------------- */
 
-void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
+void* _mi_os_alloc(size_t size, mi_memid_t* memid) {
+  *memid = _mi_memid_none();
   if (size == 0) return NULL;
   size = _mi_os_good_alloc_size(size);
-  bool is_large = false;
-  return mi_os_mem_alloc(size, 0, true, false, &is_large, stats);
-}
-
-void  _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
-  if (size == 0 || p == NULL) return;
-  size = _mi_os_good_alloc_size(size);
-  mi_os_mem_free(p, size, was_committed, stats);
-}
+  bool os_is_large = false;
+  bool os_is_zero  = false;
+  void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero);
+  if (p == NULL) return NULL;
 
-void  _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
-  _mi_os_free_ex(p, size, true, stats);
+  *memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large);
+  mi_assert_internal(memid->mem.os.size >= size);
+  mi_assert_internal(memid->initially_committed);
+  return p;
 }
 
-void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats)
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid)
 {
-  MI_UNUSED(&mi_os_get_aligned_hint); // suppress unused warnings
-  MI_UNUSED(tld_stats);
+  MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings
+  *memid = _mi_memid_none();
   if (size == 0) return NULL;
   size = _mi_os_good_alloc_size(size);
   alignment = _mi_align_up(alignment, _mi_os_page_size());
-  bool allow_large = false;
-  if (large != NULL) {
-    allow_large = *large;
-    *large = false;
+
+  bool os_is_large = false;
+  bool os_is_zero  = false;
+  void* os_base = NULL;
+  void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base );
+  if (p == NULL) return NULL;
+
+  *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large);
+  memid->mem.os.base = os_base;
+  memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base);  // todo: return from prim_alloc_aligned?
+
+  mi_assert_internal(memid->mem.os.size >= size);
+  mi_assert_internal(_mi_is_aligned(p,alignment));
+  if (commit) { mi_assert_internal(memid->initially_committed); }
+  return p;
+}
+
+
+mi_decl_nodiscard static void* mi_os_ensure_zero(void* p, size_t size, mi_memid_t* memid) {  // returns `p` committed and zero-filled, or NULL if committing fails
+  if (p==NULL || size==0) return p;  // nothing to commit or clear; `p` is passed through unchanged
+  // ensure committed
+  if (!memid->initially_committed) {
+    bool is_zero = false;  // filled in by the commit but not consulted below; zeroing is decided by `memid->initially_zero`
+    if (!_mi_os_commit(p, size, &is_zero)) {
+      _mi_os_free(p, size, *memid);  // commit failed: release the block before reporting failure
+      return NULL;
+    }
+    memid->initially_committed = true;
+  }
+  // ensure zero'd
+  if (memid->initially_zero) return p;  // already recorded as zero: skip the explicit clear
+  _mi_memzero_aligned(p,size);
+  memid->initially_zero = true;  // record the new state in the memid
+  return p;
+}
+
+void*  _mi_os_zalloc(size_t size, mi_memid_t* memid) {  // `_mi_os_alloc` followed by `mi_os_ensure_zero`
+  void* p = _mi_os_alloc(size,memid);
+  return mi_os_ensure_zero(p, size, memid);  // a NULL `p` (failed or zero-sized allocation) is returned as-is
 }
 
+/* -----------------------------------------------------------
+  OS aligned allocation with an offset. This is used
+  for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc
+  page where the object can be aligned at an offset from the start of the segment.
+  As we may need to overallocate, we need to free such pointers using `mi_free_aligned`
+  to use the actual start of the memory region.
+----------------------------------------------------------- */
 
+void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) {  // returns `p` such that `p + offset` is `alignment`-aligned, or NULL
+  mi_assert(offset <= size);
+  mi_assert((alignment % _mi_os_page_size()) == 0);
+  *memid = _mi_memid_none();  // start from an empty memid; the calls below overwrite it
+  if (offset == 0) {
+    // regular aligned allocation
+    return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid);
+  }
+  else {
+    // overallocate to align at an offset
+    const size_t extra = _mi_align_up(offset, alignment) - offset;  // padding that moves `p + offset` onto an `alignment` boundary
+    const size_t oversize = size + extra;
+    void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid);
+    if (start == NULL) return NULL;
+
+    void* const p = (uint8_t*)start + extra;  // the returned pointer lies `extra` bytes past the aligned `start`
+    mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment));
+    // decommit the overallocation at the start
+    if (commit && extra > _mi_os_page_size()) {  // skipped when the padding is at most one OS page; the decommit result is ignored
+      _mi_os_decommit(start, extra);
+    }
+    return p;
+  }
+}
 
 /* -----------------------------------------------------------
   OS memory API: reset, commit, decommit, protect, unprotect.
 ----------------------------------------------------------- */
 
-
 // OS page align within a given area, either conservative (pages inside the area only),
 // or not (straddling pages outside the area is possible)
 static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) {
@@ -851,11 +467,11 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size,
   if (newsize != NULL) *newsize = 0;
   if (size == 0 || addr == NULL) return NULL;
 
-  // page align conservatively within the range
-  void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size())
+  // page align conservatively within the range, or liberally straddling pages outside the range
+  void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size())
     : mi_align_down_ptr(addr, _mi_os_page_size()));
   void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
-    : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
+    : _mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
   ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
   if (diff <= 0) return NULL;
 
@@ -868,188 +484,132 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t*
   return mi_os_page_align_areax(true, addr, size, newsize);
 }
 
-static void mi_mprotect_hint(int err) {
-#if defined(MI_OS_USE_MMAP) && (MI_SECURE>=2) // guard page around every mimalloc page
-  if (err == ENOMEM) {
-    _mi_warning_message("the previous warning may have been caused by a low memory map limit.\n"
-                        "  On Linux this is controlled by the vm.max_map_count. For example:\n"
-                        "  > sudo sysctl -w vm.max_map_count=262144\n");
-  }
-#else
-  MI_UNUSED(err);
-#endif
-}
-
-// Commit/Decommit memory.
-// Usually commit is aligned liberal, while decommit is aligned conservative.
-// (but not for the reset version where we want commit to be conservative as well)
-static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservative, bool* is_zero, mi_stats_t* stats) {
-  // page align in the range, commit liberally, decommit conservative
+bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size) {
   if (is_zero != NULL) { *is_zero = false; }
+  mi_os_stat_counter_increase(commit_calls, 1);
+
+  // page align range
   size_t csize;
-  void* start = mi_os_page_align_areax(conservative, addr, size, &csize);
-  if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr))
-  int err = 0;
-  if (commit) {
-    _mi_stat_increase(&stats->committed, size);  // use size for precise commit vs. decommit
-    _mi_stat_counter_increase(&stats->commit_calls, 1);
-  }
-  else {
-    _mi_stat_decrease(&stats->committed, size);
-  }
+  void* start = mi_os_page_align_areax(false /* conservative? */, addr, size, &csize);
+  if (csize == 0) return true;
 
-  #if defined(_WIN32)
-  if (commit) {
-    // *is_zero = true;  // note: if the memory was already committed, the call succeeds but the memory is not zero'd
-    void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE);
-    err = (p == start ? 0 : GetLastError());
-  }
-  else {
-    BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT);
-    err = (ok ? 0 : GetLastError());
-  }
-  #elif defined(__wasi__)
-  // WebAssembly guests can't control memory protection
-  #elif 0 && defined(MAP_FIXED) && !defined(__APPLE__)
-  // Linux: disabled for now as mmap fixed seems much more expensive than MADV_DONTNEED (and splits VMA's?)
-  if (commit) {
-    // commit: just change the protection
-    err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
-    if (err != 0) { err = errno; }
-  } 
-  else {
-    // decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss)
-    const int fd = mi_unix_mmap_fd();
-    void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
-    if (p != start) { err = errno; }
+  // commit
+  bool os_is_zero = false;
+  int err = _mi_prim_commit(start, csize, &os_is_zero);
+  if (err != 0) {
+    _mi_warning_message("cannot commit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
+    return false;
   }
-  #else
-  // Linux, macOSX and others.
-  if (commit) {
-    // commit: ensure we can access the area    
-    err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
-    if (err != 0) { err = errno; }
-  } 
-  else {
-    #if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0
-    // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
-    // (on the other hand, MADV_FREE would be good enough.. it is just not reflected in the stats :-( )
-    err = madvise(start, csize, MADV_DONTNEED);
-    #else
-    // decommit: just disable access (also used in debug and secure mode to trap on illegal access)
-    err = mprotect(start, csize, PROT_NONE);
-    if (err != 0) { err = errno; }
-    #endif
-    //#if defined(MADV_FREE_REUSE)
-    //  while ((err = mi_madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; }
-    //#endif
+  if (os_is_zero && is_zero != NULL) {
+    *is_zero = true;
+    mi_assert_expensive(mi_mem_is_zero(start, csize));
   }
+  // note: the following seems required for asan (otherwise `mimalloc-test-stress` fails)
+  #ifdef MI_TRACK_ASAN
+  if (os_is_zero) { mi_track_mem_defined(start,csize); }
+             else { mi_track_mem_undefined(start,csize); }
   #endif
+  mi_os_stat_increase(committed, stat_size);  // use size for precise commit vs. decommit
+  return true;
+}
+
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero) {  // `_mi_os_commit_ex` with the statistics size equal to `size`
+  return _mi_os_commit_ex(addr, size, is_zero, size);  // last argument is `stat_size`, the amount added to the `committed` statistic on success
+}
+
+static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, size_t stat_size) {  // decommit the whole OS pages inside [addr, addr+size); returns true on success
+  mi_assert_internal(needs_recommit!=NULL);
+  mi_os_stat_decrease(committed, stat_size);  // the statistic is lowered up front, also when the early return below is taken
+
+  // page align
+  size_t csize;
+  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
+  if (csize == 0) return true;  // no whole page inside the range; `*needs_recommit` is left unwritten on this path
+
+  // decommit
+  *needs_recommit = true;  // default; `_mi_prim_decommit` receives the pointer and may overwrite it
+  int err = _mi_prim_decommit(start,csize,needs_recommit);
   if (err != 0) {
-    _mi_warning_message("%s error: start: %p, csize: 0x%zx, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
-    mi_mprotect_hint(err);
+    _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
   }
   mi_assert_internal(err == 0);
   return (err == 0);
 }
 
-bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
-  return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats);
-}
-
-bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
-  bool is_zero;
-  return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats);
+bool _mi_os_decommit(void* addr, size_t size) {  // `mi_os_decommit_ex` with the statistics size equal to `size`
+  bool needs_recommit;  // required because `mi_os_decommit_ex` asserts a non-NULL pointer; the value is not used here
+  return mi_os_decommit_ex(addr, size, &needs_recommit, size);
 }
 
-/*
-static bool mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {  
-  return mi_os_commitx(addr, size, true, true // conservative
-                      , is_zero, stats);
-}
-*/
 
 // Signal to the OS that the address range is no longer in use
 // but may be used later again. This will release physical memory
 // pages and reduce swapping while keeping the memory committed.
 // We page align to a conservative area inside the range to reset.
-static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) {
+bool _mi_os_reset(void* addr, size_t size) {
   // page align conservatively within the range
   size_t csize;
   void* start = mi_os_page_align_area_conservative(addr, size, &csize);
   if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr)
-  if (reset) _mi_stat_increase(&stats->reset, csize);
-        else _mi_stat_decrease(&stats->reset, csize);
-  if (!reset) return true; // nothing to do on unreset!
+  mi_os_stat_counter_increase(reset, csize);
+  mi_os_stat_counter_increase(reset_calls, 1);
 
-  #if (MI_DEBUG>1)
-  if (MI_SECURE==0) {
-    memset(start, 0, csize); // pretend it is eagerly reset
-  }
+  #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN
+  memset(start, 0, csize); // pretend it is eagerly reset
   #endif
 
-#if defined(_WIN32)
-  // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory
-  void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE);
-  mi_assert_internal(p == start);
-  #if 1
-  if (p == start && start != NULL) {
-    VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set
-  }
-  #endif
-  if (p != start) return false;
-#else
-#if defined(MADV_FREE)
-  static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
-  int oadvice = (int)mi_atomic_load_relaxed(&advice);
-  int err;
-  while ((err = mi_madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0;  };
-  if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {  
-    // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
-    mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED);
-    err = mi_madvise(start, csize, MADV_DONTNEED);
-  }
-#elif defined(__wasi__)
-  int err = 0;
-#else
-  int err = mi_madvise(start, csize, MADV_DONTNEED);
-#endif
+  int err = _mi_prim_reset(start, csize);
   if (err != 0) {
-    _mi_warning_message("madvise reset error: start: %p, csize: 0x%zx, errno: %i\n", start, csize, errno);
+    _mi_warning_message("cannot reset OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
   }
-  //mi_assert(err == 0);
-  if (err != 0) return false;
-#endif
-  return true;
+  return (err == 0);
 }
 
-// Signal to the OS that the address range is no longer in use
-// but may be used later again. This will release physical memory
-// pages and reduce swapping while keeping the memory committed.
-// We page align to a conservative area inside the range to reset.
-bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
-  return mi_os_resetx(addr, size, true, stats);
+
+void _mi_os_reuse( void* addr, size_t size ) {  // passes the page-aligned interior of the range to `_mi_prim_reuse`; a failure is only reported as a warning
+  // page align conservatively within the range
+  size_t csize = 0;
+  void* const start = mi_os_page_align_area_conservative(addr, size, &csize);
+  if (csize == 0) return;  // no whole OS page inside the range: nothing to do
+  const int err = _mi_prim_reuse(start, csize);
+  if (err != 0) {
+    _mi_warning_message("cannot reuse OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
+  }
 }
 
-/*
-bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
-  if (mi_option_is_enabled(mi_option_reset_decommits)) {
-    return mi_os_commit_unreset(addr, size, is_zero, stats);  // re-commit it (conservatively!)
+// either resets or decommits memory, returns true if the memory needs
+// to be recommitted if it is to be re-used later on.
+bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size, mi_commit_fun_t* commit_fun, void* commit_fun_arg)
+{
+  if (mi_option_get(mi_option_purge_delay) < 0) return false;  // is purging allowed?
+  mi_os_stat_counter_increase(purge_calls, 1);
+  mi_os_stat_counter_increase(purged, size);  // counts the requested `size`; `stat_size` is only forwarded to the decommit path
+
+  if (commit_fun != NULL) {
+    bool decommitted = (*commit_fun)(false, p, size, NULL, commit_fun_arg);  // caller-supplied hook; NOTE(review): leading `false` presumably requests a decommit -- confirm against `mi_commit_fun_t`
+    return decommitted; // needs_recommit?
+  }
+  else if (mi_option_is_enabled(mi_option_purge_decommits) &&   // should decommit?
+           !_mi_preloading())                                   // don't decommit during preloading (unsafe)
+  {
+    bool needs_recommit = true;
+    mi_os_decommit_ex(p, size, &needs_recommit, stat_size);  // success/failure result is ignored; only `needs_recommit` is reported
+    return needs_recommit;
   }
   else {
-    *is_zero = false;
-    return mi_os_resetx(addr, size, false, stats);
+    if (allow_reset) {  // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory)
+      _mi_os_reset(p, size);  // result ignored here; `_mi_os_reset` itself warns on failure
+    }
+    return false;  // needs no recommit
   }
 }
-*/
+
+// either resets or decommits memory, returns true if the memory needs
+// to be recommitted if it is to be re-used later on.
+bool _mi_os_purge(void* p, size_t size) {  // `_mi_os_purge_ex` with default arguments
+  return _mi_os_purge_ex(p, size, true, size, NULL, NULL);  // allow_reset = true, stat_size = size, no custom commit function or argument
+}
+
 
 // Protect a region in memory to be not accessible.
 static  bool mi_os_protectx(void* addr, size_t size, bool protect) {
@@ -1062,20 +622,9 @@ static  bool mi_os_protectx(void* addr, size_t size, bool protect) {
 	  _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
   }
   */
-  int err = 0;
-#ifdef _WIN32
-  DWORD oldprotect = 0;
-  BOOL ok = VirtualProtect(start, csize, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect);
-  err = (ok ? 0 : GetLastError());
-#elif defined(__wasi__)
-  err = 0;
-#else
-  err = mprotect(start, csize, protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
-  if (err != 0) { err = errno; }
-#endif
+  int err = _mi_prim_protect(start,csize,protect);
   if (err != 0) {
-    _mi_warning_message("mprotect error: start: %p, csize: 0x%zx, err: %i\n", start, csize, err);
-    mi_mprotect_hint(err);
+    _mi_warning_message("cannot %s OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", (protect ? "protect" : "unprotect"), err, err, start, csize);
   }
   return (err == 0);
 }
@@ -1090,115 +639,12 @@ bool _mi_os_unprotect(void* addr, size_t size) {
 
 
 
-bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
-  // page align conservatively within the range
-  mi_assert_internal(oldsize > newsize && p != NULL);
-  if (oldsize < newsize || p == NULL) return false;
-  if (oldsize == newsize) return true;
-
-  // oldsize and newsize should be page aligned or we cannot shrink precisely
-  void* addr = (uint8_t*)p + newsize;
-  size_t size = 0;
-  void* start = mi_os_page_align_area_conservative(addr, oldsize - newsize, &size);
-  if (size == 0 || start != addr) return false;
-
-#ifdef _WIN32
-  // we cannot shrink on windows, but we can decommit
-  return _mi_os_decommit(start, size, stats);
-#else
-  return mi_os_mem_free(start, size, true, stats);
-#endif
-}
-
-
 /* ----------------------------------------------------------------------------
 Support for allocating huge OS pages (1Gib) that are reserved up-front
 and possibly associated with a specific NUMA node. (use `numa_node>=0`)
 -----------------------------------------------------------------------------*/
 #define MI_HUGE_OS_PAGE_SIZE  (MI_GiB)
 
-#if defined(_WIN32) && (MI_INTPTR_SIZE >= 8)
-static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
-{
-  mi_assert_internal(size%MI_GiB == 0);
-  mi_assert_internal(addr != NULL);
-  const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
-
-  mi_win_enable_large_os_pages();
-
-  MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} };
-  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
-  static bool mi_huge_pages_available = true;
-  if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
-    params[0].Type.Type = MiMemExtendedParameterAttributeFlags;
-    params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
-    ULONG param_count = 1;
-    if (numa_node >= 0) {
-      param_count++;
-      params[1].Type.Type = MiMemExtendedParameterNumaNode;
-      params[1].Arg.ULong = (unsigned)numa_node;
-    }
-    SIZE_T psize = size;
-    void* base = addr;
-    NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
-    if (err == 0 && base != NULL) {
-      return base;
-    }
-    else {
-      // fall back to regular large pages
-      mi_huge_pages_available = false; // don't try further huge pages
-      _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
-    }
-  }
-  // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
-  if (pVirtualAlloc2 != NULL && numa_node >= 0) {
-    params[0].Type.Type = MiMemExtendedParameterNumaNode;
-    params[0].Arg.ULong = (unsigned)numa_node;
-    return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1);
-  }
-  
-  // otherwise use regular virtual alloc on older windows
-  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
-}
-
-#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__)
-#include <sys/syscall.h>
-#ifndef MPOL_PREFERRED
-#define MPOL_PREFERRED 1
-#endif
-#if defined(SYS_mbind)
-static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
-  return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);
-}
-#else
-static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
-  MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags);
-  return 0;
-}
-#endif
-static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
-  mi_assert_internal(size%MI_GiB == 0);
-  bool is_large = true;
-  void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
-  if (p == NULL) return NULL;
-  if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
-    unsigned long numa_mask = (1UL << numa_node);
-    // TODO: does `mbind` work correctly for huge OS pages? should we
-    // use `set_mempolicy` before calling mmap instead?
-    // see: <https://lkml.org/lkml/2017/2/9/875>
-    long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
-    if (err != 0) {
-      _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d: %s\n", numa_node, strerror(errno));
-    }
-  }
-  return p;
-}
-#else
-static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
-  MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(numa_node);
-  return NULL;
-}
-#endif
 
 #if (MI_INTPTR_SIZE >= 8)
 // To ensure proper alignment, use our own area for huge OS pages
@@ -1216,15 +662,14 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
     start = huge_start;
     if (start == 0) {
       // Initialize the start address after the 32TiB area
-      start = ((uintptr_t)32 << 40);  // 32TiB virtual start address
-#if (MI_SECURE>0 || MI_DEBUG==0)      // security: randomize start of huge pages unless in debug mode
-      uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+      start = ((uintptr_t)8 << 40);   // 8TiB virtual start address
+    #if (MI_SECURE>0 || MI_DEBUG==0)  // security: randomize start of huge pages unless in debug mode
+      uintptr_t r = _mi_theap_random_next(_mi_theap_default());
       start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF));  // (randomly 12bits)*1GiB == between 0 to 4TiB
-#endif
+    #endif
     }
     end = start + size;
-    mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
-  } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
+  } while (!mi_atomic_cas_weak_acq_rel(&mi_huge_start, &huge_start, end));
 
   if (total_size != NULL) *total_size = size;
   return (uint8_t*)start;
@@ -1237,37 +682,47 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
 }
 #endif
 
-// Allocate MI_SEGMENT_SIZE aligned huge pages
-void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize) {
+// Allocate MI_ARENA_SLICE_ALIGN aligned huge pages
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) {
+  *memid = _mi_memid_none();
   if (psize != NULL) *psize = 0;
   if (pages_reserved != NULL) *pages_reserved = 0;
   size_t size = 0;
-  uint8_t* start = mi_os_claim_huge_pages(pages, &size);
+  uint8_t* const start = mi_os_claim_huge_pages(pages, &size);
   if (start == NULL) return NULL; // or 32-bit systems
 
   // Allocate one page at the time but try to place them contiguously
   // We allocate one page at the time to be able to abort if it takes too long
   // or to at least allocate as many as available on the system.
   mi_msecs_t start_t = _mi_clock_start();
-  size_t page;
-  for (page = 0; page < pages; page++) {
+  size_t page = 0;
+  bool all_zero = true;
+  while (page < pages) {
     // allocate a page
+    bool is_zero = false;
     void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE);
-    void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node);
+    void* p = NULL;
+    int err = _mi_prim_alloc_huge_os_pages(addr, MI_HUGE_OS_PAGE_SIZE, numa_node, &is_zero, &p);
+    if (!is_zero) { all_zero = false;  }
+    if (err != 0) {
+      _mi_warning_message("unable to allocate huge OS page (error: %d (0x%x), address: %p, size: %zx bytes)\n", err, err, addr, MI_HUGE_OS_PAGE_SIZE);
+      break;
+    }
 
     // Did we succeed at a contiguous address?
     if (p != addr) {
       // no success, issue a warning and break
       if (p != NULL) {
-        _mi_warning_message("could not allocate contiguous huge page %zu at %p\n", page, addr);
-        _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main);
+        _mi_warning_message("could not allocate contiguous huge OS page %zu at %p\n", page, addr);
+        mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, MI_HUGE_OS_PAGE_SIZE, NULL);
       }
       break;
     }
 
     // success, record it
-    _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
-    _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
+    page++;  // increase before timeout check (see issue #711)
+    mi_os_stat_increase(committed, MI_HUGE_OS_PAGE_SIZE);
+    mi_os_stat_increase(reserved, MI_HUGE_OS_PAGE_SIZE);
 
     // check for timeout
     if (max_msecs > 0) {
@@ -1279,7 +734,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
         }
       }
       if (elapsed > max_msecs) {
-        _mi_warning_message("huge page allocation timed out\n");
+        _mi_warning_message("huge OS page allocation timed out (after allocating %zu page(s))\n", page);
         break;
       }
     }
@@ -1287,157 +742,117 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
   mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size);
   if (pages_reserved != NULL) { *pages_reserved = page; }
   if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
+  if (page != 0) {
+    mi_assert(start != NULL);
+    *memid = _mi_memid_create_os(start, size, true /* is committed */, all_zero, true /* is_large */);
+    memid->memkind = MI_MEM_OS_HUGE;
+    mi_assert(memid->is_pinned);
+    #ifdef MI_TRACK_ASAN
+    if (all_zero) { mi_track_mem_defined(start,size); }
+    #endif
+  }
   return (page == 0 ? NULL : start);
 }
 
 // free every huge page in a range individually (as we allocated per page)
 // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems.
-void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
+static void mi_os_free_huge_os_pages(void* p, size_t size, mi_subproc_t* subproc) {
   if (p==NULL || size==0) return;
   uint8_t* base = (uint8_t*)p;
   while (size >= MI_HUGE_OS_PAGE_SIZE) {
-    _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats);
+    mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, MI_HUGE_OS_PAGE_SIZE, subproc);
     size -= MI_HUGE_OS_PAGE_SIZE;
     base += MI_HUGE_OS_PAGE_SIZE;
   }
 }
 
+
 /* ----------------------------------------------------------------------------
 Support NUMA aware allocation
 -----------------------------------------------------------------------------*/
-#ifdef _WIN32  
-static size_t mi_os_numa_nodex(void) {
-  USHORT numa_node = 0;
-  if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) {
-    // Extended API is supported
-    MI_PROCESSOR_NUMBER pnum;
-    (*pGetCurrentProcessorNumberEx)(&pnum);
-    USHORT nnode = 0;
-    BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode);
-    if (ok) numa_node = nnode;
-  }
-  else {
-    // Vista or earlier, use older API that is limited to 64 processors. Issue #277
-    DWORD pnum = GetCurrentProcessorNumber();
-    UCHAR nnode = 0;
-    BOOL ok = GetNumaProcessorNode((UCHAR)pnum, &nnode);
-    if (ok) numa_node = nnode;    
-  }
-  return numa_node;
-}
 
-static size_t mi_os_numa_node_countx(void) {
-  ULONG numa_max = 0;
-  GetNumaHighestNodeNumber(&numa_max);
-  // find the highest node number that has actual processors assigned to it. Issue #282
-  while(numa_max > 0) {
-    if (pGetNumaNodeProcessorMaskEx != NULL) {
-      // Extended API is supported
-      GROUP_AFFINITY affinity;
-      if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) {
-        if (affinity.Mask != 0) break;  // found the maximum non-empty node
-      }
+static _Atomic(size_t) mi_numa_node_count; // = 0   // cache the node count
+
+int _mi_os_numa_node_count(void) {
+  size_t count = mi_atomic_load_acquire(&mi_numa_node_count);
+  if mi_unlikely(count == 0) {
+    long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
+    if (ncount > 0 && ncount < INT_MAX) {
+      count = (size_t)ncount;
     }
     else {
-      // Vista or earlier, use older API that is limited to 64 processors.
-      ULONGLONG mask;
-      if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
-        if (mask != 0) break; // found the maximum non-empty node
-      };
+      const size_t n = _mi_prim_numa_node_count(); // or detect dynamically
+      if (n == 0 || n > INT_MAX) { count = 1; }
+                            else { count = n; }
     }
-    // max node was invalid or had no processor assigned, try again
-    numa_max--;
+    mi_atomic_store_release(&mi_numa_node_count, count); // save it
+    if (count>1) { _mi_verbose_message("using %zd numa regions\n", count); }
   }
-  return ((size_t)numa_max + 1);
+  mi_assert_internal(count > 0 && count <= INT_MAX);
+  return (int)count;
 }
-#elif defined(__linux__)
-#include <sys/syscall.h>  // getcpu
-#include <stdio.h>        // access
-
-static size_t mi_os_numa_nodex(void) {
-#ifdef SYS_getcpu
-  unsigned long node = 0;
-  unsigned long ncpu = 0;
-  long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
-  if (err != 0) return 0;
-  return node;
-#else
-  return 0;
-#endif
+
+static int mi_os_numa_node_get(void) {
+  int numa_count = _mi_os_numa_node_count();
+  if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
+  // never more than the node count and >= 0
+  const size_t n = _mi_prim_numa_node();
+  int numa_node = (n < INT_MAX ? (int)n : 0);
+  if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
+  return numa_node;
 }
-static size_t mi_os_numa_node_countx(void) {
-  char buf[128];
-  unsigned node = 0;
-  for(node = 0; node < 256; node++) {
-    // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation)
-    snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1);
-    if (access(buf,R_OK) != 0) break;
+
+int _mi_os_numa_node(void) {
+  if mi_likely(mi_atomic_load_relaxed(&mi_numa_node_count) == 1) {
+    return 0;
   }
-  return (node+1);
-}
-#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000
-static size_t mi_os_numa_nodex(void) {
-  domainset_t dom;
-  size_t node;
-  int policy;
-  if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul;
-  for (node = 0; node < MAXMEMDOM; node++) {
-    if (DOMAINSET_ISSET(node, &dom)) return node;
+  else {
+    return mi_os_numa_node_get();
   }
-  return 0ul;
 }
-static size_t mi_os_numa_node_countx(void) {
-  size_t ndomains = 0;
-  size_t len = sizeof(ndomains);
-  if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul;
-  return ndomains;
-}
-#elif defined(__DragonFly__)
-static size_t mi_os_numa_nodex(void) {
-  // TODO: DragonFly does not seem to provide any userland means to get this information.
-  return 0ul;
-}
-static size_t mi_os_numa_node_countx(void) {
-  size_t ncpus = 0, nvirtcoresperphys = 0;
-  size_t len = sizeof(size_t);
-  if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul;
-  if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul;
-  return nvirtcoresperphys * ncpus;
+
+
+/* ----------------------------------------------------------------------------
+  Public API
+-----------------------------------------------------------------------------*/
+#if 0
+mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size) {
+  return mi_os_alloc_aligned(size, mi_os_mem_config.alloc_granularity, commit, NULL, full_size);
+}
+
+static void* mi_os_alloc_aligned_ex(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) {
+  mi_memid_t memid = _mi_memid_none();
+  void* p = _mi_os_alloc_aligned(size, alignment, commit, allow_large, &memid);
+  if (p == NULL) return p;
+  if (is_committed != NULL) { *is_committed = memid.initially_committed;  }
+  if (is_pinned != NULL) { *is_pinned = memid.is_pinned;  }
+  if (base != NULL) { *base = memid.mem.os.base;  }
+  if (full_size != NULL) { *full_size = memid.mem.os.size;  }
+  if (!memid.initially_zero && memid.initially_committed) {
+    _mi_memzero_aligned(memid.mem.os.base, memid.mem.os.size);
+  }
+  return p;
 }
-#else
-static size_t mi_os_numa_nodex(void) {
-  return 0;
+
+mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size) {
+  return mi_os_alloc_aligned_ex(size, alignment, commit, false, NULL, NULL, base, full_size);
 }
-static size_t mi_os_numa_node_countx(void) {
-  return 1;
+
+mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) {
+  return mi_os_alloc_aligned_ex(size, alignment, commit, true, is_committed, is_pinned, base, full_size);
 }
-#endif
 
-_Atomic(size_t)  _mi_numa_node_count; // = 0   // cache the node count
+mi_decl_export void  mi_os_free(void* p, size_t size) {
+  if (p==NULL || size == 0) return;
+  mi_memid_t memid = _mi_memid_create_os(p, size, true, false, false);
+  _mi_os_free(p, size, memid);
+}
 
-size_t _mi_os_numa_node_count_get(void) {
-  size_t count = mi_atomic_load_acquire(&_mi_numa_node_count);
-  if (count <= 0) {
-    long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
-    if (ncount > 0) {
-      count = (size_t)ncount;
-    }
-    else {
-      count = mi_os_numa_node_countx(); // or detect dynamically
-      if (count == 0) count = 1;
-    }    
-    mi_atomic_store_release(&_mi_numa_node_count, count); // save it
-    _mi_verbose_message("using %zd numa regions\n", count);
-  }
-  return count;
+mi_decl_export void  mi_os_commit(void* p, size_t size) {
+  _mi_os_commit(p, size, NULL);
 }
 
-int _mi_os_numa_node_get(mi_os_tld_t* tld) {
-  MI_UNUSED(tld);
-  size_t numa_count = _mi_os_numa_node_count();
-  if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
-  // never more than the node count and >= 0
-  size_t numa_node = mi_os_numa_nodex();
-  if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
-  return (int)numa_node;
+mi_decl_export void  mi_os_decommit(void* p, size_t size) {
+  _mi_os_decommit(p, size);
 }
+#endif
diff --git a/ext/src/mimalloc/src/page-map.c b/ext/src/mimalloc/src/page-map.c
new file mode 100644
index 0000000000..d8393e72ba
--- /dev/null
+++ b/ext/src/mimalloc/src/page-map.c
@@ -0,0 +1,441 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2023-2025, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "bitmap.h"
+
+static void mi_page_map_cannot_commit(void) {
+  _mi_warning_message("unable to commit the allocation page-map on-demand\n" );
+}
+
+#if MI_PAGE_MAP_FLAT
+
+// The page-map contains a byte for each 64kb slice in the address space.
+// For an address `a` where `ofs = _mi_page_map[a >> 16]`:
+// 0 = unused
+// 1 = the slice at `a & ~0xFFFF` is a mimalloc page.
+// 1 < ofs <= 127 = the slice is part of a page, starting at `(((a>>16) - ofs + 1) << 16)`.
+//
+// 1 byte per slice => 1 TiB address space needs a 2^40 / 2^16 = 2^24 bytes = 16 MiB page map.
+// A full 256 TiB address space (48 bit) needs a 4 GiB page map.
+// A full 4 GiB address space (32 bit) needs only a 64 KiB page map.
+
+mi_decl_cache_align uint8_t* _mi_page_map = NULL;
+static void*        mi_page_map_max_address = NULL;
+static mi_memid_t   mi_page_map_memid;
+
+#define MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT   MI_ARENA_SLICE_SIZE
+static mi_bitmap_t* mi_page_map_commit; // one bit per committed 64 KiB entries
+
+mi_decl_nodiscard static bool mi_page_map_ensure_committed(size_t idx, size_t slice_count);
+
+bool _mi_page_map_init(void) {
+  size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS);
+  if (vbits == 0) {
+    vbits = _mi_os_virtual_address_bits();
+    #if MI_ARCH_X64  // canonical address is limited to the first 128 TiB
+    if (vbits >= 48) { vbits = 47; }
+    #endif
+  }
+
+  // Allocate the page map and commit bits
+  mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits));
+  const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT));
+  const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_pagemap_commit)); // _mi_os_has_overcommit(); // commit on-access on Linux systems?
+  const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT);
+  const size_t bitmap_size = (commit ? 0 : mi_bitmap_size(commit_bits, NULL));
+  const size_t reserve_size = bitmap_size + page_map_size;
+  uint8_t* const base = (uint8_t*)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid);
+  if (base==NULL) {
+    _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB);
+    return false;
+  }
+  if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) {
+    _mi_warning_message("internal: the page map was committed but not zero initialized!\n");
+    _mi_memzero_aligned(base, reserve_size);
+  }
+  if (bitmap_size > 0) {
+    mi_page_map_commit = (mi_bitmap_t*)base;
+    if (!_mi_os_commit(mi_page_map_commit, bitmap_size, NULL)) {
+      mi_page_map_cannot_commit();
+      return false;
+    }
+    mi_bitmap_init(mi_page_map_commit, commit_bits, true);
+  }
+  _mi_page_map = base + bitmap_size;
+
+  // commit the first part so NULL pointers get resolved without an access violation
+  if (!commit) {
+    if (!mi_page_map_ensure_committed(0, 1)) {
+      mi_page_map_cannot_commit();
+      return false;
+    }
+  }
+  _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL
+  mi_assert_internal(_mi_ptr_page(NULL)==NULL);
+  return true;
+}
+
+void _mi_page_map_unsafe_destroy(mi_subproc_t* subproc) {
+  mi_assert_internal(subproc != NULL);
+  mi_assert_internal(_mi_page_map != NULL);
+  if (_mi_page_map == NULL) return;
+  _mi_os_free_ex(mi_page_map_memid.mem.os.base, mi_page_map_memid.mem.os.size, true, mi_page_map_memid, subproc);
+  _mi_page_map = NULL;
+  mi_page_map_commit = NULL;
+  mi_page_map_max_address = NULL;
+  mi_page_map_memid = _mi_memid_none();
+}
+
+
+static bool mi_page_map_ensure_committed(size_t idx, size_t slice_count) {
+  // is the page map area that contains the page address committed?
+  // we always set the commit bits so we can track what ranges are in-use.
+  // we only actually commit if the map wasn't committed fully already.
+  if (mi_page_map_commit != NULL) {
+    const size_t commit_idx = idx / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
+    const size_t commit_idx_hi = (idx + slice_count - 1) / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
+    for (size_t i = commit_idx; i <= commit_idx_hi; i++) {  // per bit to avoid crossing over bitmap chunks
+      if (mi_bitmap_is_clear(mi_page_map_commit, i)) {
+        // this may race, in which case we do multiple commits (which is ok)
+        bool is_zero;
+        uint8_t* const start = _mi_page_map + (i * MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT);
+        const size_t   size  = MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
+        if (!_mi_os_commit(start, size, &is_zero)) {
+          mi_page_map_cannot_commit();
+          return false;
+        }
+        if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); }
+        mi_bitmap_set(mi_page_map_commit, i);
+      }
+    }
+  }
+  #if MI_DEBUG > 0
+  _mi_page_map[idx] = 0;
+  _mi_page_map[idx+slice_count-1] = 0;
+  #endif
+  return true;
+}
+
+
+static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) {
+  size_t page_size;
+  *page_start = mi_page_area(page, &page_size);
+  if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; }  // furthest interior pointer
+  *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks
+  return _mi_page_map_index(page);
+}
+
+bool _mi_page_map_register(mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(_mi_page_map != NULL);  // should be initialized before multi-thread access!
+  if mi_unlikely(_mi_page_map == NULL) {
+    if (!_mi_page_map_init()) return false;
+  }
+  mi_assert(_mi_page_map!=NULL);
+  uint8_t* page_start;
+  size_t   slice_count;
+  const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count);
+
+  if (!mi_page_map_ensure_committed(idx, slice_count)) {
+    return false;
+  }
+
+  // set the offsets
+  for (size_t i = 0; i < slice_count; i++) {
+    mi_assert_internal(i < 128);
+    _mi_page_map[idx + i] = (uint8_t)(i+1);
+  }
+  return true;
+}
+
+void _mi_page_map_unregister(mi_page_t* page) {
+  mi_assert_internal(_mi_page_map != NULL);
+  // get index and count
+  uint8_t* page_start;
+  size_t   slice_count;
+  const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count);
+  // unset the offsets
+  _mi_memzero(_mi_page_map + idx, slice_count);
+}
+
+void _mi_page_map_unregister_range(void* start, size_t size) {
+  const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE);
+  const uintptr_t index = _mi_page_map_index(start);
+  // todo: scan the commit bits and clear only those ranges?
+  if (!mi_page_map_ensure_committed(index, slice_count)) { // we commit the range in total;
+    return;
+  }
+  _mi_memzero(&_mi_page_map[index], slice_count);
+}
+
+
+mi_page_t* _mi_safe_ptr_page(const void* p) {
+  if mi_unlikely(p >= mi_page_map_max_address) return NULL;
+  const uintptr_t idx = _mi_page_map_index(p);
+  if mi_unlikely(mi_page_map_commit != NULL && !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL;
+  const uintptr_t ofs = _mi_page_map[idx];
+  if mi_unlikely(ofs == 0) return NULL;
+  return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) - ofs + 1) << MI_ARENA_SLICE_SHIFT);
+}
+
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
+  return (_mi_safe_ptr_page(p) != NULL);
+}
+
+#else
+
+// A 2-level page map
+#define MI_PAGE_MAP_SUB_SIZE          (MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*))
+#define MI_PAGE_MAP_ENTRIES_PER_CBIT  (MI_PAGE_MAP_COUNT / MI_BFIELD_BITS)
+
+mi_decl_cache_align _Atomic(mi_submap_t)* _mi_page_map;
+static size_t       mi_page_map_count;
+static void*        mi_page_map_max_address;
+static mi_memid_t   mi_page_map_memid;
+static mi_lock_t    mi_page_map_lock;
+
+// divide the main map into 64 (`MI_BFIELD_BITS`) parts and commit those parts on demand
+static _Atomic(mi_bfield_t)  mi_page_map_commit;
+
+mi_decl_nodiscard static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) {
+  mi_bfield_t commit = mi_atomic_load_relaxed(&mi_page_map_commit);
+  const size_t bit_idx = idx/MI_PAGE_MAP_ENTRIES_PER_CBIT;
+  mi_assert_internal(bit_idx < MI_BFIELD_BITS);
+  if (pbit_idx != NULL) { *pbit_idx = bit_idx; }
+  return ((commit & (MI_ZU(1) << bit_idx)) != 0);
+}
+
+mi_decl_nodiscard static bool mi_page_map_ensure_committed(size_t idx, mi_submap_t* submap) {
+  mi_assert_internal(submap!=NULL && *submap==NULL);
+  size_t bit_idx;
+  if mi_unlikely(!mi_page_map_is_committed(idx, &bit_idx)) {
+    uint8_t* start = (uint8_t*)&_mi_page_map[bit_idx * MI_PAGE_MAP_ENTRIES_PER_CBIT];
+    if (!_mi_os_commit(start, MI_PAGE_MAP_ENTRIES_PER_CBIT * sizeof(mi_submap_t), NULL)) {
+      mi_page_map_cannot_commit();
+      return false;
+    }
+    mi_atomic_or_acq_rel(&mi_page_map_commit, MI_ZU(1) << bit_idx);
+  }
+  *submap = mi_atomic_load_ptr_acquire(mi_page_t*, &_mi_page_map[idx]); // acquire _mi_page_map_at(idx);
+  return true;
+}
+
+// initialize the page map
+bool _mi_page_map_init(void) {
+  size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS);
+  if (vbits == 0) {
+    vbits = _mi_os_virtual_address_bits();
+    #if MI_ARCH_X64  // canonical address is limited to the first 128 TiB
+    if (vbits >= 48) { vbits = 47; }
+    #endif
+  }
+
+  // Allocate the page map and commit bits
+  mi_assert(MI_MAX_VABITS >= vbits);
+  mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits));
+  mi_page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT));
+  mi_assert(mi_page_map_count <= MI_PAGE_MAP_COUNT);
+  const size_t os_page_size = _mi_os_page_size();
+  const size_t page_map_size = _mi_align_up( mi_page_map_count * sizeof(mi_page_t**), os_page_size);
+  const size_t submap_size = MI_PAGE_MAP_SUB_SIZE;
+  const size_t reserve_size = page_map_size + submap_size;
+  #if MI_SECURE
+  const bool commit = true;  // the whole page map is valid and we can reliably check any pointer
+  #else
+  const bool commit = page_map_size <= 64*MI_KiB ||
+                      mi_option_is_enabled(mi_option_pagemap_commit) || _mi_os_has_overcommit();
+  #endif
+  _mi_page_map = (_Atomic(mi_page_t**)*)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid);
+  if (_mi_page_map==NULL) {
+    _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB);
+    return false;
+  }
+  if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) {
+    _mi_warning_message("internal: the page map was committed but not zero initialized!\n");
+    _mi_memzero_aligned(_mi_page_map, page_map_size);
+  }
+  mi_atomic_store_release(&mi_page_map_commit, (mi_page_map_memid.initially_committed ? ~MI_ZU(0) : MI_ZU(0)));
+
+  // ensure there is a submap for the NULL address
+  mi_page_t** const sub0 = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size);  // we reserved a submap part at the end already
+  if (!mi_page_map_memid.initially_committed) {
+    if (!_mi_os_commit(sub0, submap_size, NULL)) {  // commit full submap (issue #1087)
+      mi_page_map_cannot_commit();
+      return false;
+    }
+  }
+  if (!mi_page_map_memid.initially_zero) {     // initialize low addresses with NULL
+    _mi_memzero_aligned(sub0, submap_size);
+  }
+  mi_submap_t nullsub = NULL;
+  if (!mi_page_map_ensure_committed(0,&nullsub)) {
+    mi_page_map_cannot_commit();
+    return false;
+  }
+  mi_atomic_store_ptr_release(mi_page_t*, &_mi_page_map[0], sub0);
+  mi_lock_init(&mi_page_map_lock);             // initialize late in case the lock init causes allocation
+  
+  mi_assert_internal(_mi_ptr_page(NULL)==NULL);
+  return true;
+}
+
+
+void _mi_page_map_unsafe_destroy(mi_subproc_t* subproc) {
+  mi_assert_internal(subproc != NULL);
+  mi_assert_internal(_mi_page_map != NULL);
+  if (_mi_page_map == NULL) return;
+  mi_lock_done(&mi_page_map_lock);  
+  for (size_t idx = 1; idx < mi_page_map_count; idx++) {  // skip entry 0 (as we allocate that submap at the end of the page_map)
+    // free all sub-maps
+    if (mi_page_map_is_committed(idx, NULL)) {
+      mi_submap_t sub = _mi_page_map_at(idx);
+      if (sub != NULL) {
+        mi_memid_t memid = _mi_memid_create_os(sub, MI_PAGE_MAP_SUB_SIZE, true, false, false);
+        _mi_os_free_ex(memid.mem.os.base, memid.mem.os.size, true, memid, subproc);
+        mi_atomic_store_ptr_release(mi_page_t*, &_mi_page_map[idx], NULL);
+      }
+    }
+  }
+  _mi_os_free_ex(_mi_page_map, mi_page_map_memid.mem.os.size, true, mi_page_map_memid, subproc);
+  _mi_page_map = NULL;
+  mi_page_map_count = 0;
+  mi_page_map_memid = _mi_memid_none();
+  mi_page_map_max_address = NULL;
+  mi_atomic_store_release(&mi_page_map_commit, 0);
+}
+
+
+mi_decl_nodiscard static bool mi_page_map_ensure_submap_at(size_t idx, mi_submap_t* submap) {
+  mi_assert_internal(submap!=NULL && *submap==NULL);
+  mi_submap_t sub = NULL;
+  if (!mi_page_map_ensure_committed(idx, &sub)) {
+    return false;
+  }
+  if mi_unlikely(sub == NULL) {
+    // sub map not yet allocated, alloc now
+    mi_lock(&mi_page_map_lock) 
+    {
+      sub = mi_atomic_load_ptr_acquire(mi_page_t*, &_mi_page_map[idx]); // reload
+      if (sub==NULL) // not yet allocated by another thread?      
+      {
+        mi_memid_t memid;
+        const size_t submap_size = MI_PAGE_MAP_SUB_SIZE;
+        sub = (mi_submap_t)_mi_os_zalloc(submap_size, &memid);        
+        if (sub==NULL) {
+          _mi_warning_message("internal error: unable to extend the page map\n");          
+        }
+        else {
+          mi_submap_t expect = NULL;
+          if (!mi_atomic_cas_ptr_strong_acq_rel(mi_page_t*, &_mi_page_map[idx], &expect, sub)) {
+            // another thread already allocated it.. free and continue
+            _mi_os_free(sub, submap_size, memid);
+            sub = expect;
+          }
+        }
+      }
+    }
+    if (sub==NULL) return false; // unable to allocate the submap..
+  }
+  mi_assert_internal(sub!=NULL);
+  *submap = sub;
+  return true;
+}
+
+static bool mi_page_map_set_range_prim(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count) {
+  // is the page map area that contains the page address committed?
+  while (slice_count > 0) {
+    mi_submap_t sub = NULL;
+    if (!mi_page_map_ensure_submap_at(idx, &sub)) {
+      return false;
+    };
+    mi_assert_internal(sub!=NULL);
+    // set the offsets for the page
+    while (slice_count > 0 && sub_idx < MI_PAGE_MAP_SUB_COUNT) {
+      sub[sub_idx] = page;
+      slice_count--;
+      sub_idx++;
+    }
+    idx++; // potentially wrap around to the next idx
+    sub_idx = 0;
+  }
+  return true;
+}
+
+static bool mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count) {
+  if mi_unlikely(!mi_page_map_set_range_prim(page,idx,sub_idx,slice_count)) {
+    // failed to commit, call again to reset the page pointer if needed
+    if (page!=NULL) {
+      mi_page_map_set_range_prim(NULL,idx,sub_idx,slice_count);
+    }
+    return false;
+  }
+  return true;
+}
+
+static size_t mi_page_map_get_idx(mi_page_t* page, size_t* sub_idx, size_t* slice_count) {
+  size_t page_size;
+  uint8_t* page_start = mi_page_area(page, &page_size);
+  if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; }  // furthest interior pointer
+  *slice_count = mi_slice_count_of_size(page_size) + ((page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks
+  return _mi_page_map_index(page, sub_idx);
+}
+
+bool _mi_page_map_register(mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(_mi_page_map != NULL);  // should be initialized before multi-thread access!
+  if mi_unlikely(_mi_page_map == NULL) {
+    if (!_mi_page_map_init()) return false;
+  }
+  mi_assert(_mi_page_map!=NULL);
+  size_t   slice_count;
+  size_t   sub_idx;
+  const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count);
+  return mi_page_map_set_range(page, idx, sub_idx, slice_count);
+}
+
+void _mi_page_map_unregister(mi_page_t* page) {
+  mi_assert_internal(_mi_page_map != NULL);
+  mi_assert_internal(page != NULL);
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  if mi_unlikely(_mi_page_map == NULL) return;
+  // get index and count
+  size_t slice_count;
+  size_t sub_idx;
+  const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count);
+  // unset the offsets
+  mi_page_map_set_range(NULL, idx, sub_idx, slice_count);
+}
+
+void _mi_page_map_unregister_range(void* start, size_t size) {
+  if mi_unlikely(_mi_page_map == NULL) return;
+  const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE);
+  size_t sub_idx;
+  const uintptr_t idx = _mi_page_map_index(start, &sub_idx);
+  mi_page_map_set_range(NULL, idx, sub_idx, slice_count);  // todo: avoid committing if not already committed?
+}
+
+// Return NULL for invalid pointers
+mi_page_t* _mi_safe_ptr_page(const void* p) {
+  if (p==NULL) return NULL;
+  if mi_unlikely(p >= mi_page_map_max_address) return NULL;
+  size_t sub_idx;
+  const size_t idx = _mi_page_map_index(p,&sub_idx);
+  if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL;
+  mi_page_t** const sub = _mi_page_map[idx];
+  if mi_unlikely(sub==NULL) return NULL;
+  return sub[sub_idx];
+}
+
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
+  return (_mi_safe_ptr_page(p) != NULL);
+}
+
+#endif
diff --git a/ext/src/mimalloc/src/page-queue.c b/ext/src/mimalloc/src/page-queue.c
index 92f933c2a0..4c54cb9a20 100644
--- a/ext/src/mimalloc/src/page-queue.c
+++ b/ext/src/mimalloc/src/page-queue.c
@@ -1,5 +1,5 @@
 /*----------------------------------------------------------------------------
-Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -11,6 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file
 
 #ifndef MI_IN_PAGE_C
 #error "this file should be included from 'page.c'"
+// include to help an IDE
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
 #endif
 
 /* -----------------------------------------------------------
@@ -34,15 +38,19 @@ terms of the MIT license. A copy of the license can be found in the file
 
 
 static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) {
-  return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+sizeof(uintptr_t)));
+  return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+sizeof(uintptr_t)));  // the huge queue is recognized by a block_size of exactly the large-object maximum plus one word
 }
 
 static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) {
-  return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+(2*sizeof(uintptr_t))));
+  return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+(2*sizeof(uintptr_t))));  // the full queue is recognized by a block_size of exactly the large-object maximum plus two words
 }
 
 static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
-  return (pq->block_size > MI_MEDIUM_OBJ_SIZE_MAX);
+  return (pq->block_size > MI_LARGE_MAX_OBJ_SIZE);  // any block_size above the large-object maximum (this covers both the huge and the full queue)
+}
+
+static inline size_t mi_page_queue_count(const mi_page_queue_t* pq) {
+  return pq->count;  // the queue's stored page counter (incremented on push, decremented on remove in this file)
 }
 
 /* -----------------------------------------------------------
@@ -53,27 +61,23 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
 // Returns MI_BIN_HUGE if the size is too large.
 // We use `wsize` for the size in "machine word sizes",
 // i.e. byte size == `wsize*sizeof(void*)`.
-static inline uint8_t mi_bin(size_t size) {
+static mi_decl_noinline size_t mi_bin(size_t size) {
   size_t wsize = _mi_wsize_from_size(size);
-  uint8_t bin;
-  if (wsize <= 1) {
-    bin = 1;
+#if defined(MI_ALIGN4W)
+  if mi_likely(wsize <= 4) {
+    return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes
   }
-  #if defined(MI_ALIGN4W)
-  else if (wsize <= 4) {
-    bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
+#elif defined(MI_ALIGN2W)
+  if mi_likely(wsize <= 8) {
+    return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes
   }
-  #elif defined(MI_ALIGN2W)
-  else if (wsize <= 8) {
-    bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
-  }
-  #else
-  else if (wsize <= 8) {
-    bin = (uint8_t)wsize;
+#else
+  if mi_likely(wsize <= 8) {
+    return (wsize == 0 ? 1 : wsize);
   }
-  #endif
-  else if (wsize > MI_MEDIUM_OBJ_WSIZE_MAX) {
-    bin = MI_BIN_HUGE;
+#endif
+  else if mi_unlikely(wsize > MI_LARGE_MAX_OBJ_WSIZE) {
+    return MI_BIN_HUGE;
   }
   else {
     #if defined(MI_ALIGN4W)
@@ -81,15 +85,14 @@ static inline uint8_t mi_bin(size_t size) {
     #endif
     wsize--;
     // find the highest bit
-    uint8_t b = (uint8_t)mi_bsr(wsize);  // note: wsize != 0
+    const size_t b = (MI_SIZE_BITS - 1 - mi_clz(wsize));  // note: wsize != 0
     // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
     // - adjust with 3 because we use do not round the first 8 sizes
     //   which each get an exact bin
-    bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
-    mi_assert_internal(bin < MI_BIN_HUGE);
+    const size_t bin = ((b << 2) + ((wsize >> (b - 2)) & 0x03)) - 3;
+    mi_assert_internal(bin > 0 && bin < MI_BIN_HUGE);
+    return bin;
   }
-  mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE);
-  return bin;
 }
 
 
@@ -98,21 +101,22 @@ static inline uint8_t mi_bin(size_t size) {
   Queue of pages with free blocks
 ----------------------------------------------------------- */
 
-uint8_t _mi_bin(size_t size) {
+size_t _mi_bin(size_t size) {  // non-static wrapper around the file-local `mi_bin`
   return mi_bin(size);
 }
 
-size_t _mi_bin_size(uint8_t bin) {
-  return _mi_heap_empty.pages[bin].block_size;
+size_t _mi_bin_size(size_t bin) {  // block size associated with bin index `bin`
+  mi_assert_internal(bin <= MI_BIN_HUGE);  // only bins up to and including the huge bin are accepted
+  return _mi_theap_empty.pages[bin].block_size;  // read from the queue table of the empty theap
 }
 
 // Good size for allocation
-size_t mi_good_size(size_t size) mi_attr_noexcept {
-  if (size <= MI_MEDIUM_OBJ_SIZE_MAX) {
-    return _mi_bin_size(mi_bin(size));
+mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept {
+  if (size <= MI_LARGE_MAX_OBJ_SIZE) {  // NOTE(review): the guard tests `size` alone, while the bin below is chosen for `size + MI_PADDING_SIZE` -- confirm the boundary case is intended
+    return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE));  // block size of the bin selected for the padded size
   }
   else {
-    return _mi_align_up(size,_mi_os_page_size());
+    return _mi_align_up(size + MI_PADDING_SIZE,_mi_os_page_size());  // above the large-object maximum: round the padded size up to the OS page size
   }
 }
 
@@ -132,26 +136,65 @@ static bool mi_page_queue_contains(mi_page_queue_t* queue, const mi_page_t* page
 #endif
 
 #if (MI_DEBUG>1)
-static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* pq) {
-  return (pq >= &heap->pages[0] && pq <= &heap->pages[MI_BIN_FULL]);
+static bool mi_theap_contains_queue(const mi_theap_t* theap, const mi_page_queue_t* pq) {
+  return (pq >= &theap->pages[0] && pq <= &theap->pages[MI_BIN_FULL]);  // address-range test: `pq` lies within theap->pages[0..MI_BIN_FULL]
 }
 #endif
 
-static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
-  uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
-  mi_heap_t* heap = mi_page_heap(page);
-  mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL);
-  mi_page_queue_t* pq = &heap->pages[bin];
-  mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size);
-  mi_assert_expensive(mi_page_queue_contains(pq, page));
-  return pq;
+bool _mi_page_queue_is_valid(mi_theap_t* theap, const mi_page_queue_t* pq) {  // walks `pq` asserting its link, size, owner and count invariants; returns false only for a NULL queue
+  MI_UNUSED_RELEASE(theap);
+  if (pq==NULL) return false;
+  size_t count = 0; MI_UNUSED_RELEASE(count);
+  mi_page_t* prev = NULL; MI_UNUSED_RELEASE(prev);
+  for (mi_page_t* page = pq->first; page != NULL; page = page->next) {
+    mi_assert_internal(page->prev == prev);  // back link must point at the page visited just before
+    if (mi_page_is_in_full(page)) {
+      mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 2);  // pages flagged in-full must sit in the queue whose word size is the large maximum + 2
+    }
+    else if (mi_page_is_huge(page)) {
+      mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 1);  // huge pages must sit in the queue whose word size is the large maximum + 1
+    }
+    else {
+      mi_assert_internal(mi_page_block_size(page) == pq->block_size);  // every other page must match the queue's block size exactly
+    }
+    mi_assert_internal(page->theap == theap);  // each page in the queue must be owned by the given theap
+    if (page->next == NULL) {
+      mi_assert_internal(pq->last == page);  // the final page reached must be the recorded tail
+    }
+    count++;
+    prev = page;
+  }
+  mi_assert_internal(pq->count == count);  // stored counter must equal the number of pages actually linked
+  return true;
 }
 
-static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
-  uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
+static size_t mi_page_bin(const mi_page_t* page) {  // queue index for `page`: the in-full flag wins, then huge, otherwise the bin of its block size
+  const size_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page))));
   mi_assert_internal(bin <= MI_BIN_FULL);
-  mi_page_queue_t* pq = &heap->pages[bin];
-  mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size);
+  return bin;
+}
+
+// returns the page bin without using MI_BIN_FULL for statistics (the in-full flag is ignored here)
+size_t _mi_page_stats_bin(const mi_page_t* page) {
+  const size_t bin = (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page)));  // huge pages map to MI_BIN_HUGE, all others to the bin of their block size
+  mi_assert_internal(bin <= MI_BIN_HUGE);
+  return bin;
+}
+
+static mi_page_queue_t* mi_theap_page_queue_of(mi_theap_t* theap, const mi_page_t* page) {  // the queue inside `theap` that corresponds to `page`'s bin
+  mi_assert_internal(theap!=NULL);
+  const size_t bin = mi_page_bin(page);
+  mi_page_queue_t* pq = &theap->pages[bin];
+  mi_assert_internal((mi_page_block_size(page) == pq->block_size) ||  // accepted: exact block-size match,
+                       (mi_page_is_huge(page) && mi_page_queue_is_huge(pq)) ||  // or a huge page in the huge queue,
+                         (mi_page_is_in_full(page) && mi_page_queue_is_full(pq)));  // or an in-full page in the full queue
+  return pq;
+}
+
+static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {  // like mi_theap_page_queue_of, but takes the theap from the page itself
+  mi_theap_t* theap = mi_page_theap(page);
+  mi_page_queue_t* pq = mi_theap_page_queue_of(theap, page);
+  mi_assert_expensive(mi_page_queue_contains(pq, page));  // expensive check: the page really is linked in the queue found
   return pq;
 }
 
@@ -160,8 +203,8 @@ static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t*
 // size without having to compute the bin. This means when the
 // current free page queue is updated for a small bin, we need to update a
 // range of entries in `_mi_page_small_free`.
-static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_queue_t* pq) {
-  mi_assert_internal(mi_heap_contains_queue(heap,pq));
+static inline void mi_theap_queue_first_update(mi_theap_t* theap, const mi_page_queue_t* pq) {
+  mi_assert_internal(mi_theap_contains_queue(theap,pq));
   size_t size = pq->block_size;
   if (size > MI_SMALL_SIZE_MAX) return;
 
@@ -171,7 +214,7 @@ static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_que
   // find index in the right direct page array
   size_t start;
   size_t idx = _mi_wsize_from_size(size);
-  mi_page_t** pages_free = heap->pages_free_direct;
+  mi_page_t** pages_free = theap->pages_free_direct;
 
   if (pages_free[idx] == page) return;  // already set
 
@@ -181,9 +224,9 @@ static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_que
   }
   else {
     // find previous size; due to minimal alignment upto 3 previous bins may need to be skipped
-    uint8_t bin = mi_bin(size);
+    size_t bin = mi_bin(size);
     const mi_page_queue_t* prev = pq - 1;
-    while( bin == mi_bin(prev->block_size) && prev > &heap->pages[0]) {
+    while( bin == mi_bin(prev->block_size) && prev > &theap->pages[0]) {
       prev--;
     }
     start = 1 + _mi_wsize_from_size(prev->block_size);
@@ -206,37 +249,40 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
 static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
   mi_assert_internal(page != NULL);
   mi_assert_expensive(mi_page_queue_contains(queue, page));
-  mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue))  || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
-  mi_heap_t* heap = mi_page_heap(page);
-
+  mi_assert_internal(queue->count >= 1);  // the queue must hold at least the page being removed
+  mi_assert_internal(mi_page_block_size(page) == queue->block_size ||  // accepted: exact block-size match,
+                      (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||  // or a huge page in the huge queue,
+                        (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));  // or an in-full page in the full queue
+  mi_theap_t* theap = mi_page_theap(page);  // owner whose page_count and direct-page table are updated below
   if (page->prev != NULL) page->prev->next = page->next;
   if (page->next != NULL) page->next->prev = page->prev;
   if (page == queue->last)  queue->last = page->prev;
   if (page == queue->first) {
     queue->first = page->next;
     // update first
-    mi_assert_internal(mi_heap_contains_queue(heap, queue));
-    mi_heap_queue_first_update(heap,queue);
+    mi_assert_internal(mi_theap_contains_queue(theap, queue));
+    mi_theap_queue_first_update(theap,queue);  // the head changed, so refresh the entries derived from queue->first
   }
-  heap->page_count--;
+  theap->page_count--;
+  queue->count--;  // keep the per-queue counter in step with the theap-wide page_count
   page->next = NULL;
   page->prev = NULL;
-  // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
   mi_page_set_in_full(page,false);
 }
 
 
-static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
-  mi_assert_internal(mi_page_heap(page) == heap);
+static void mi_page_queue_push(mi_theap_t* theap, mi_page_queue_t* queue, mi_page_t* page) {
+  mi_assert_internal(mi_page_theap(page) == theap);
   mi_assert_internal(!mi_page_queue_contains(queue, page));
-
-  mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
-  mi_assert_internal(page->xblock_size == queue->block_size ||
-                      (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX) ||
+  #if MI_HUGE_PAGE_ABANDON
+  mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
+  #endif
+  mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
+                      (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
                         (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
 
   mi_page_set_in_full(page, mi_page_queue_is_full(queue));
-  // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
+
   page->next = queue->first;
   page->prev = NULL;
   if (queue->first != NULL) {
@@ -247,77 +293,153 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
   else {
     queue->first = queue->last = page;
   }
+  queue->count++;
 
   // update direct
-  mi_heap_queue_first_update(heap, queue);
-  heap->page_count++;
+  mi_theap_queue_first_update(theap, queue);
+  theap->page_count++;
 }
 
+static void mi_page_queue_push_at_end(mi_theap_t* theap, mi_page_queue_t* queue, mi_page_t* page) {
+  mi_assert_internal(mi_page_theap(page) == theap);
+  mi_assert_internal(!mi_page_queue_contains(queue, page));
+
+  mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
+                      (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
+                       (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
 
-static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) {
+  mi_page_set_in_full(page, mi_page_queue_is_full(queue));
+
+  page->prev = queue->last;
+  page->next = NULL;
+  if (queue->last != NULL) {
+    mi_assert_internal(queue->last->next == NULL);
+    queue->last->next = page;
+    queue->last = page;
+  }
+  else {
+    queue->first = queue->last = page;
+  }
+  queue->count++;
+
+  // update direct
+  if (queue->first == page) {
+    mi_theap_queue_first_update(theap, queue);
+  }
+  theap->page_count++;
+}
+
+static void mi_page_queue_move_to_front(mi_theap_t* theap, mi_page_queue_t* queue, mi_page_t* page) {  // make `page` the head of the queue it is already in
+  mi_assert_internal(mi_page_theap(page) == theap);
+  mi_assert_internal(mi_page_queue_contains(queue, page));
+  if (queue->first == page) return;  // already at the front
+  mi_page_queue_remove(queue, page);  // unlink (decrements both counters) ...
+  mi_page_queue_push(theap, queue, page);  // ... and relink at the head (increments them again)
+  mi_assert_internal(queue->first == page);
+}
+
+static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* from, bool enqueue_at_end, mi_page_t* page) {
   mi_assert_internal(page != NULL);
+  mi_assert_internal(from->count >= 1);
   mi_assert_expensive(mi_page_queue_contains(from, page));
   mi_assert_expensive(!mi_page_queue_contains(to, page));
+  const size_t bsize = mi_page_block_size(page);
+  MI_UNUSED(bsize);
+  mi_assert_internal((bsize == to->block_size && bsize == from->block_size) ||
+                     (bsize == to->block_size && mi_page_queue_is_full(from)) ||
+                     (bsize == from->block_size && mi_page_queue_is_full(to)) ||
+                     (mi_page_is_huge(page) && mi_page_queue_is_huge(to)) ||
+                     (mi_page_is_huge(page) && mi_page_queue_is_full(to)));
 
-  mi_assert_internal((page->xblock_size == to->block_size && page->xblock_size == from->block_size) ||
-                     (page->xblock_size == to->block_size && mi_page_queue_is_full(from)) ||
-                     (page->xblock_size == from->block_size && mi_page_queue_is_full(to)) ||
-                     (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) ||
-                     (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to)));
+  mi_theap_t* theap = mi_page_theap(page);
 
-  mi_heap_t* heap = mi_page_heap(page);
+  // delete from `from`
   if (page->prev != NULL) page->prev->next = page->next;
   if (page->next != NULL) page->next->prev = page->prev;
   if (page == from->last)  from->last = page->prev;
   if (page == from->first) {
     from->first = page->next;
     // update first
-    mi_assert_internal(mi_heap_contains_queue(heap, from));
-    mi_heap_queue_first_update(heap, from);
+    mi_assert_internal(mi_theap_contains_queue(theap, from));
+    mi_theap_queue_first_update(theap, from);
   }
-
-  page->prev = to->last;
-  page->next = NULL;
-  if (to->last != NULL) {
-    mi_assert_internal(heap == mi_page_heap(to->last));
-    to->last->next = page;
-    to->last = page;
+  from->count--;
+
+  // insert into `to`
+  to->count++;
+  if (enqueue_at_end) {
+    // enqueue at the end
+    page->prev = to->last;
+    page->next = NULL;
+    if (to->last != NULL) {
+      mi_assert_internal(theap == mi_page_theap(to->last));
+      to->last->next = page;
+      to->last = page;
+    }
+    else {
+      to->first = page;
+      to->last = page;
+      mi_theap_queue_first_update(theap, to);
+    }
   }
   else {
-    to->first = page;
-    to->last = page;
-    mi_heap_queue_first_update(heap, to);
+    if (to->first != NULL) {
+      // enqueue at 2nd place
+      mi_assert_internal(theap == mi_page_theap(to->first));
+      mi_page_t* next = to->first->next;
+      page->prev = to->first;
+      page->next = next;
+      to->first->next = page;
+      if (next != NULL) {
+        next->prev = page;
+      }
+      else {
+        to->last = page;
+      }
+    }
+    else {
+      // enqueue at the head (singleton list)
+      page->prev = NULL;
+      page->next = NULL;
+      to->first = page;
+      to->last = page;
+      mi_theap_queue_first_update(theap, to);
+    }
   }
 
   mi_page_set_in_full(page, mi_page_queue_is_full(to));
 }
 
-// Only called from `mi_heap_absorb`.
-size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) {
-  mi_assert_internal(mi_heap_contains_queue(heap,pq));
+static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) {  // move `page` from queue `from` to the tail of queue `to`
+  mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end */, page);
+}
+
+static void mi_page_queue_enqueue_from_full(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) {  // currently passes the same arguments as mi_page_queue_enqueue_from; kept separate so the placement policy can be changed here
+  // note: we could insert at the front to increase reuse, but it slows down certain benchmarks (like `alloc-test`)
+  mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end of the `to` queue? */, page);
+}
+
+// Only called from `mi_theap_absorb`.
+size_t _mi_page_queue_append(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_queue_t* append) {
+  mi_assert_internal(mi_theap_contains_queue(theap,pq));
   mi_assert_internal(pq->block_size == append->block_size);
 
   if (append->first==NULL) return 0;
 
-  // set append pages to new heap and count
+  // set append pages to new theap and count
   size_t count = 0;
   for (mi_page_t* page = append->first; page != NULL; page = page->next) {
-    // inline `mi_page_set_heap` to avoid wrong assertion during absorption;
-    // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
-    mi_atomic_store_release(&page->xheap, (uintptr_t)heap); 
-    // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
-    // side effect that it spins until any DELAYED_FREEING is finished. This ensures
-    // that after appending only the new heap will be used for delayed free operations.
-    _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
+    mi_page_set_theap(page, theap);
     count++;
   }
+  mi_assert_internal(count == append->count);
 
   if (pq->last==NULL) {
     // take over afresh
     mi_assert_internal(pq->first==NULL);
     pq->first = append->first;
     pq->last = append->last;
-    mi_heap_queue_first_update(heap, pq);
+    mi_theap_queue_first_update(theap, pq);
   }
   else {
     // append to end
@@ -327,5 +449,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
     append->first->prev = pq->last;
     pq->last = append->last;
   }
+  pq->count += append->count;
+
   return count;
 }
diff --git a/ext/src/mimalloc/src/page.c b/ext/src/mimalloc/src/page.c
index fd6c5397d0..8f9fbab284 100644
--- a/ext/src/mimalloc/src/page.c
+++ b/ext/src/mimalloc/src/page.c
@@ -1,5 +1,5 @@
 /*----------------------------------------------------------------------------
-Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -12,8 +12,9 @@ terms of the MIT license. A copy of the license can be found in the file
 ----------------------------------------------------------- */
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
 
 /* -----------------------------------------------------------
   Definition of page queues for each block size
@@ -36,14 +37,14 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta
   return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
 }
 
-static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
-static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
+static bool mi_page_extend_free(mi_theap_t* theap, mi_page_t* page);
 
 #if (MI_DEBUG>=3)
 static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
+  mi_assert_internal(_mi_ptr_page(page) == page);
   size_t count = 0;
   while (head != NULL) {
-    mi_assert_internal(page == _mi_ptr_page(head));
+    mi_assert_internal((uint8_t*)head - (uint8_t*)page > (ptrdiff_t)MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head));
     count++;
     head = mi_block_next(page, head);
   }
@@ -59,32 +60,38 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) {
 
 static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
   size_t psize;
-  uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize);
+  uint8_t* page_area = mi_page_area(page, &psize);  // start of the page's block area; `psize` is used below as its length in bytes
   mi_block_t* start = (mi_block_t*)page_area;
   mi_block_t* end   = (mi_block_t*)(page_area + psize);
   while(p != NULL) {
     if (p < start || p >= end) return false;
     p = mi_block_next(page, p);
   }
+#if MI_DEBUG>3 // generally too expensive to check this
+  if (page->free_is_zero) {  // when the flag is set, every free block past its header must still be all zero
+    const size_t ubsize = mi_page_usable_block_size(page);
+    for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) {  // NOTE(review): walks page->free rather than the list `p` passed in -- confirm intended
+      mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
+    }
+  }
+#endif
   return true;
 }
 
 static bool mi_page_is_valid_init(mi_page_t* page) {
-  mi_assert_internal(page->xblock_size > 0);
+  mi_assert_internal(mi_page_block_size(page) > 0);
   mi_assert_internal(page->used <= page->capacity);
   mi_assert_internal(page->capacity <= page->reserved);
 
-  mi_segment_t* segment = _mi_page_segment(page);
-  uint8_t* start = _mi_page_start(segment,page,NULL);
-  mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
-  //const size_t bsize = mi_page_block_size(page);
+  // const size_t bsize = mi_page_block_size(page);
+  // uint8_t* start = mi_page_start(page);
   //mi_assert_internal(start + page->capacity*page->block_size == page->top);
 
   mi_assert_internal(mi_page_list_is_valid(page,page->free));
   mi_assert_internal(mi_page_list_is_valid(page,page->local_free));
 
   #if MI_DEBUG>3 // generally too expensive to check this
-  if (page->is_zero) {
+  if (page->free_is_zero) {
     const size_t ubsize = mi_page_usable_block_size(page);
     for(mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
       mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
@@ -92,10 +99,12 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
   }
   #endif
 
+  #if !MI_TRACK_ENABLED && !MI_TSAN
   mi_block_t* tfree = mi_page_thread_free(page);
   mi_assert_internal(mi_page_list_is_valid(page, tfree));
   //size_t tfree_count = mi_page_list_count(page, tfree);
   //mi_assert_internal(tfree_count <= page->thread_freed + 1);
+  #endif
 
   size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free);
   mi_assert_internal(page->used + free_count == page->capacity);
@@ -103,78 +112,45 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
   return true;
 }
 
+extern mi_decl_hidden bool _mi_process_is_initialized;             // has mi_process_init been called?
+
 bool _mi_page_is_valid(mi_page_t* page) {
   mi_assert_internal(mi_page_is_valid_init(page));
   #if MI_SECURE
   mi_assert_internal(page->keys[0] != 0);
   #endif
-  if (mi_page_heap(page)!=NULL) {
-    mi_segment_t* segment = _mi_page_segment(page);
-
-    mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id);
-    if (segment->kind != MI_SEGMENT_HUGE) {
+  if (!mi_page_is_abandoned(page)) {  // queue membership is only checked for pages that are not abandoned
+    //mi_assert_internal(!_mi_process_is_initialized);
+    {
       mi_page_queue_t* pq = mi_page_queue_of(page);
       mi_assert_internal(mi_page_queue_contains(pq, page));
-      mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page));
-      mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
+      mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page));  // block sizes must agree unless the page is huge or in-full
+      // mi_assert_internal(mi_theap_contains_queue(mi_page_theap(page),pq));
     }
   }
   return true;
 }
 #endif
 
-void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
-  mi_thread_free_t tfreex;
-  mi_delayed_t     old_delay;
-  mi_thread_free_t tfree;  
-  do {
-    tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
-    tfreex = mi_tf_set_delayed(tfree, delay);
-    old_delay = mi_tf_delayed(tfree);
-    if (mi_unlikely(old_delay == MI_DELAYED_FREEING)) {
-      mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
-      // tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
-    }
-    else if (delay == old_delay) {
-      break; // avoid atomic operation if already equal
-    }
-    else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) {
-      break; // leave never-delayed flag set
-    }
-  } while ((old_delay == MI_DELAYED_FREEING) ||
-           !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
-}
 
 /* -----------------------------------------------------------
   Page collect the `local_free` and `thread_free` lists
 ----------------------------------------------------------- */
 
-// Collect the local `thread_free` list using an atomic exchange.
-// Note: The exchange must be done atomically as this is used right after
-// moving to the full list in `mi_page_collect_ex` and we need to
-// ensure that there was no race where the page became unfull just before the move.
-static void _mi_page_thread_free_collect(mi_page_t* page)
+static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head)
 {
-  mi_block_t* head;
-  mi_thread_free_t tfreex;
-  mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
-  do {
-    head = mi_tf_block(tfree);
-    tfreex = mi_tf_set_block(tfree,NULL);
-  } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));
-
-  // return if the list is empty
   if (head == NULL) return;
 
-  // find the tail -- also to get a proper count (without data races)
-  uint32_t max_count = page->capacity; // cannot collect more than capacity
-  uint32_t count = 1;
-  mi_block_t* tail = head;
+  // find the last block in the list -- also to get a proper use count (without data races)
+  size_t max_count = page->capacity; // cannot collect more than capacity
+  size_t count = 1;
+  mi_block_t* last = head;
   mi_block_t* next;
-  while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) {
+  while ((next = mi_block_next(page, last)) != NULL && count <= max_count) {
     count++;
-    tail = next;
+    last = next;
   }
+
   // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
   if (count > max_count) {
     _mi_error_message(EFAULT, "corrupted thread-free list\n");
@@ -182,28 +158,57 @@ static void _mi_page_thread_free_collect(mi_page_t* page)
   }
 
   // and append the current local free list
-  mi_block_set_next(page,tail, page->local_free);
+  mi_block_set_next(page, last, page->local_free);
   page->local_free = head;
 
   // update counts now
-  page->used -= count;
+  mi_assert_internal(count <= UINT16_MAX);
+  mi_assert_internal(page->used >= (uint16_t)count);
+  page->used = page->used - (uint16_t)count;
+}
+
+// Collect the local `thread_free` list using an atomic exchange.
+static void mi_page_thread_free_collect(mi_page_t* page)
+{
+  // atomically capture the thread free list
+  mi_block_t* head;
+  mi_thread_free_t tfreex;
+  mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);  // initial snapshot; presumably refreshed by the CAS below when it fails -- TODO confirm
+  do {
+    head = mi_tf_block(tfree);
+    if mi_likely(head == NULL) return; // return if the list is empty
+    tfreex = mi_tf_create(NULL,mi_tf_is_owned(tfree));  // set the thread free list to NULL, carrying over the current owned flag
+  } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));  // release is enough?
+  mi_assert_internal(head != NULL);
+
+  // and move it to the local list
+  mi_page_thread_collect_to_local(page, head);
+}
+
+// returns `true` if after collection `mi_page_immediate_available` is true.
+static bool mi_page_free_quick_collect(mi_page_t* page) {
+  if (page->free != NULL) return true;  // blocks are already available; nothing to move
+  if (page->local_free == NULL) return false;  // neither list has blocks (the thread-free list is not examined here)
+  // move local_free to free
+  page->free = page->local_free;
+  page->local_free = NULL;
+  page->free_is_zero = false;  // the flag is cleared whenever local_free is handed over to free
+  return true;
 }
 
 void _mi_page_free_collect(mi_page_t* page, bool force) {
   mi_assert_internal(page!=NULL);
 
   // collect the thread free list
-  if (force || mi_page_thread_free(page) != NULL) {  // quick test to avoid an atomic operation
-    _mi_page_thread_free_collect(page);
-  }
+  mi_page_thread_free_collect(page);
 
   // and the local free list
   if (page->local_free != NULL) {
-    if (mi_likely(page->free == NULL)) {
+    if mi_likely(page->free == NULL) {
       // usual case
       page->free = page->local_free;
       page->local_free = NULL;
-      page->is_zero = false;
+      page->free_is_zero = false;
     }
     else if (force) {
       // append -- only on shutdown (force) as this is a linear operation
@@ -215,101 +220,156 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
       mi_block_set_next(page, tail, page->free);
       page->free = page->local_free;
       page->local_free = NULL;
-      page->is_zero = false;
+      page->free_is_zero = false;
     }
   }
 
   mi_assert_internal(!force || page->local_free == NULL);
 }
 
+// Collect elements in the thread-free list starting at `head`. This is an optimized
+// version of `_mi_page_free_collect` to be used from `free.c:_mi_free_collect_mt` that avoids atomic access to `xthread_free`.
+//
+// `head` must be in the `xthread_free` list. It will not collect `head` itself
+// so the `used` count is not fully updated in general. However, if the `head` is
+// the last remaining element, it will be collected and the used count will become `0` (so `mi_page_all_free` becomes true).
+void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) {
+  if (head == NULL) return;
+  mi_block_t* next = mi_block_next(page,head);  // we cannot collect the head element itself as `page->thread_free` may point to it (and we want to avoid atomic ops)
+  if (next != NULL) {
+    mi_block_set_next(page, head, NULL);
+    mi_page_thread_collect_to_local(page, next);
+    if (page->local_free != NULL && page->free == NULL) {
+      page->free = page->local_free;
+      page->local_free = NULL;
+      page->free_is_zero = false;
+    }
+  }
+  if (page->used == 1) {
+    // all elements are free'd since we skipped the `head` element itself
+    mi_assert_internal(mi_tf_block(mi_atomic_load_relaxed(&page->xthread_free)) == head);
+    mi_assert_internal(mi_block_next(page,head) == NULL);
+    _mi_page_free_collect(page, false);  // collect the final element
+  }
+}
 
 
 /* -----------------------------------------------------------
   Page fresh and retire
 ----------------------------------------------------------- */
 
+/*
 // called from segments when reclaiming abandoned pages
-void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
+void _mi_page_reclaim(mi_theap_t* theap, mi_page_t* page) {
+  // mi_page_set_theap(page, theap);
+  // _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after theap is set)
+  _mi_page_free_collect(page, false); // ensure used count is up to date
+
   mi_assert_expensive(mi_page_is_valid_init(page));
+  // mi_assert_internal(mi_page_theap(page) == theap);
+  // mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
 
-  mi_assert_internal(mi_page_heap(page) == heap);
-  mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
-  mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
-  mi_assert_internal(!page->is_reset);
   // TODO: push on full queue immediately if it is full?
-  mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
-  mi_page_queue_push(heap, pq, page);
+  mi_page_queue_t* pq = mi_theap_page_queue_of(theap, page);
+  mi_page_queue_push(theap, pq, page);
   mi_assert_expensive(_mi_page_is_valid(page));
 }
+*/
 
-// allocate a fresh page from a segment
-static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size) {
-  mi_assert_internal(pq==NULL||mi_heap_contains_queue(heap, pq));
-  mi_page_t* page = _mi_segment_page_alloc(heap, block_size, &heap->tld->segments, &heap->tld->os);
+// called from `mi_free` on a reclaim, and fresh_alloc if we get an abandoned page
+void _mi_theap_page_reclaim(mi_theap_t* theap, mi_page_t* page)
+{
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(_mi_ptr_page(page)==page);
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+
+  mi_page_set_theap(page,theap);
+  _mi_page_free_collect(page, false); // ensure used count is up to date
+  mi_page_queue_t* pq = mi_theap_page_queue_of(theap, page);
+  mi_page_queue_push_at_end(theap, pq, page);
+  mi_assert_expensive(_mi_page_is_valid(page));
+}
+
+void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
+  _mi_page_free_collect(page, false); // ensure used count is up to date
+  if (mi_page_all_free(page)) {
+    _mi_page_free(page, pq);
+  }
+  else {
+    mi_page_queue_remove(pq, page);
+    mi_theap_t* theap = page->theap;
+    mi_page_set_theap(page, NULL);
+    page->theap = theap; // don't actually set theap to NULL so we can reclaim_on_free within the same theap
+    _mi_arenas_page_abandon(page, theap);
+    _mi_arenas_collect(false, false, theap->tld); // allow purging
+  }
+}
+
+
+// allocate a fresh page from an arena
+static mi_page_t* mi_page_fresh_alloc(mi_theap_t* theap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) {
+  #if !MI_HUGE_PAGE_ABANDON
+  mi_assert_internal(pq != NULL);
+  mi_assert_internal(mi_theap_contains_queue(theap, pq));
+  mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size);
+  #endif
+  mi_page_t* page = _mi_arenas_page_alloc(theap, block_size, page_alignment);
   if (page == NULL) {
-    // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue)
+    // out-of-memory
     return NULL;
   }
-  mi_assert_internal(pq==NULL || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
-  mi_page_init(heap, page, block_size, heap->tld);
-  mi_heap_stat_increase(heap, pages, 1);
-  if (pq!=NULL) mi_page_queue_push(heap, pq, page); // huge pages use pq==NULL
+  if (mi_page_is_abandoned(page)) {
+    _mi_theap_page_reclaim(theap, page);
+    if (!mi_page_immediate_available(page)) {
+      if (mi_page_is_expandable(page)) {
+        if (!mi_page_extend_free(theap, page)) {
+          return NULL; // cannot commit
+        };
+      }
+      else {
+        mi_assert(false); // should not happen?
+        return NULL;
+      }
+    }
+  }
+  else if (pq != NULL) {
+    mi_page_queue_push(theap, pq, page);
+  }
+  mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
   mi_assert_expensive(_mi_page_is_valid(page));
   return page;
 }
 
 // Get a fresh page to use
-static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
-  mi_assert_internal(mi_heap_contains_queue(heap, pq));
-  mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size);
+static mi_page_t* mi_page_fresh(mi_theap_t* theap, mi_page_queue_t* pq) {
+  mi_assert_internal(mi_theap_contains_queue(theap, pq));
+  mi_page_t* page = mi_page_fresh_alloc(theap, pq, pq->block_size, 0);
   if (page==NULL) return NULL;
   mi_assert_internal(pq->block_size==mi_page_block_size(page));
-  mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page)));
+  mi_assert_internal(pq==mi_theap_page_queue_of(theap, page));
   return page;
 }
 
-/* -----------------------------------------------------------
-   Do any delayed frees
-   (put there by other threads if they deallocated in a full page)
------------------------------------------------------------ */
-void _mi_heap_delayed_free(mi_heap_t* heap) {
-  // take over the list (note: no atomic exchange since it is often NULL)
-  mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
-  while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
-
-  // and free them all
-  while(block != NULL) {
-    mi_block_t* next = mi_block_nextx(heap,block, heap->keys);
-    // use internal free instead of regular one to keep stats etc correct
-    if (!_mi_free_delayed_block(block)) {
-      // we might already start delayed freeing while another thread has not yet
-      // reset the delayed_freeing flag; in that case delay it further by reinserting.
-      mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
-      do {
-        mi_block_set_nextx(heap, block, dfree, heap->keys);
-      } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
-    }
-    block = next;
-  }
-}
 
 /* -----------------------------------------------------------
   Unfull, abandon, free and retire
 ----------------------------------------------------------- */
 
-// Move a page from the full list back to a regular list
+// Move a page from the full list back to a regular list (called from thread-local mi_free)
 void _mi_page_unfull(mi_page_t* page) {
   mi_assert_internal(page != NULL);
   mi_assert_expensive(_mi_page_is_valid(page));
   mi_assert_internal(mi_page_is_in_full(page));
+  mi_assert_internal(!mi_page_theap(page)->allow_page_abandon);
   if (!mi_page_is_in_full(page)) return;
 
-  mi_heap_t* heap = mi_page_heap(page);
-  mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
+  mi_theap_t* theap = mi_page_theap(page);
+  mi_page_queue_t* pqfull = &theap->pages[MI_BIN_FULL];
   mi_page_set_in_full(page, false); // to get the right queue
-  mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
+  mi_page_queue_t* pq = mi_theap_page_queue_of(theap, page);
   mi_page_set_in_full(page, true);
-  mi_page_queue_enqueue_from(pq, pqfull, page);
+  mi_page_queue_enqueue_from_full(pq, pqfull, page);
 }
 
 static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
@@ -317,75 +377,48 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
   mi_assert_internal(!mi_page_immediate_available(page));
   mi_assert_internal(!mi_page_is_in_full(page));
 
-  if (mi_page_is_in_full(page)) return;
-  mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
-  _mi_page_free_collect(page,false);  // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
-}
-
-
-// Abandon a page with used blocks at the end of a thread.
-// Note: only call if it is ensured that no references exist from
-// the `page->heap->thread_delayed_free` into this page.
-// Currently only called through `mi_heap_collect_ex` which ensures this.
-void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
-  mi_assert_internal(page != NULL);
-  mi_assert_expensive(_mi_page_is_valid(page));
-  mi_assert_internal(pq == mi_page_queue_of(page));
-  mi_assert_internal(mi_page_heap(page) != NULL);
-
-  mi_heap_t* pheap = mi_page_heap(page);
-
-  // remove from our page list
-  mi_segments_tld_t* segments_tld = &pheap->tld->segments;
-  mi_page_queue_remove(pq, page);
-
-  // page is no longer associated with our heap
-  mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
-  mi_page_set_heap(page, NULL);
-
-#if MI_DEBUG>1
-  // check there are no references left..
-  for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) {
-    mi_assert_internal(_mi_ptr_page(block) != page);
+  mi_theap_t* theap = mi_page_theap(page);
+  if (theap->allow_page_abandon) {
+    // abandon full pages (this is the usual case in order to allow for sharing of memory between theaps)
+    _mi_page_abandon(page, pq);
+  }
+  else if (!mi_page_is_in_full(page)) {
+    // put full pages in a theap local queue (this is for theaps that cannot abandon, for example, if the theap can be destroyed)
+    mi_page_queue_enqueue_from(&mi_page_theap(page)->pages[MI_BIN_FULL], pq, page);
+    _mi_page_free_collect(page, false);  // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
   }
-#endif
-
-  // and abandon it
-  mi_assert_internal(mi_page_heap(page) == NULL);
-  _mi_segment_page_abandon(page,segments_tld);
 }
 
 
 // Free a page with no more free blocks
-void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
+void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) {
   mi_assert_internal(page != NULL);
   mi_assert_expensive(_mi_page_is_valid(page));
   mi_assert_internal(pq == mi_page_queue_of(page));
   mi_assert_internal(mi_page_all_free(page));
-  mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
+  // mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
 
   // no more aligned blocks in here
-  mi_page_set_has_aligned(page, false);
-
-  mi_heap_t* heap = mi_page_heap(page);
+  mi_page_set_has_interior_pointers(page, false);
 
   // remove from the page list
-  // (no need to do _mi_heap_delayed_free first as all blocks are already free)
-  mi_segments_tld_t* segments_tld = &heap->tld->segments;
+  // (no need to do _mi_theap_delayed_free first as all blocks are already free)
   mi_page_queue_remove(pq, page);
 
   // and free it
-  mi_page_set_heap(page,NULL);
-  _mi_segment_page_free(page, force, segments_tld);
+  mi_theap_t* theap = mi_page_theap(page); mi_assert_internal(theap!=NULL);
+  mi_page_set_theap(page,NULL);
+  _mi_arenas_page_free(page, theap);
+  _mi_arenas_collect(false, false, theap->tld);  // allow purging
 }
 
-// Retire parameters
-#define MI_MAX_RETIRE_SIZE    MI_MEDIUM_OBJ_SIZE_MAX  
-#define MI_RETIRE_CYCLES      (8)
+#define MI_MAX_RETIRE_SIZE    MI_LARGE_OBJ_SIZE_MAX   // should be less than size for MI_BIN_HUGE
+#define MI_RETIRE_CYCLES      (16)
 
 // Retire a page with no more used blocks
 // Important to not retire too quickly though as new
 // allocations might coming.
+//
 // Note: called from `mi_free` and benchmarks often
 // trigger this due to freeing everything and then
 // allocating again so careful when changing this.
@@ -393,8 +426,9 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
   mi_assert_internal(page != NULL);
   mi_assert_expensive(_mi_page_is_valid(page));
   mi_assert_internal(mi_page_all_free(page));
-  
-  mi_page_set_has_aligned(page, false);
+
+  if (page->retire_expire!=0) return;  // already retired, just keep it retired
+  mi_page_set_has_interior_pointers(page, false);
 
   // don't retire too often..
   // (or we end up retiring and re-allocating most of the time)
@@ -403,36 +437,41 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
   // how to check this efficiently though...
   // for now, we don't retire if it is the only page left of this size class.
   mi_page_queue_t* pq = mi_page_queue_of(page);
-  if (mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_is_in_full(page))) {
+  #if MI_RETIRE_CYCLES > 0
+  const size_t bsize = mi_page_block_size(page);
+  if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) {  // not full or huge queue?
     if (pq->last==page && pq->first==page) { // the only page in the queue?
-      mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
-      page->retire_expire = 1 + (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);      
-      mi_heap_t* heap = mi_page_heap(page);
-      mi_assert_internal(pq >= heap->pages);
-      const size_t index = pq - heap->pages;
+      mi_theap_t* theap = mi_page_theap(page);
+      #if MI_STAT>0
+      mi_theap_stat_counter_increase(theap, pages_retire, 1);
+      #endif
+      page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
+      mi_assert_internal(pq >= theap->pages);
+      const size_t index = pq - theap->pages;
       mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
-      if (index < heap->page_retired_min) heap->page_retired_min = index;
-      if (index > heap->page_retired_max) heap->page_retired_max = index;
+      if (index < theap->page_retired_min) theap->page_retired_min = index;
+      if (index > theap->page_retired_max) theap->page_retired_max = index;
       mi_assert_internal(mi_page_all_free(page));
-      return; // dont't free after all
+      return; // don't free after all
     }
   }
-  _mi_page_free(page, pq, false);
+  #endif
+  _mi_page_free(page, pq);
 }
 
 // free retired pages: we don't need to look at the entire queues
 // since we only retire pages that are at the head position in a queue.
-void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
+void _mi_theap_collect_retired(mi_theap_t* theap, bool force) {
   size_t min = MI_BIN_FULL;
   size_t max = 0;
-  for(size_t bin = heap->page_retired_min; bin <= heap->page_retired_max; bin++) {
-    mi_page_queue_t* pq   = &heap->pages[bin];
+  for(size_t bin = theap->page_retired_min; bin <= theap->page_retired_max; bin++) {
+    mi_page_queue_t* pq   = &theap->pages[bin];
     mi_page_t*       page = pq->first;
     if (page != NULL && page->retire_expire != 0) {
       if (mi_page_all_free(page)) {
         page->retire_expire--;
-        if (force || page->retire_expire == 0) {
-          _mi_page_free(pq->first, pq, force);
+        if (page->retire_expire == 0 || force) {
+          _mi_page_free(page, pq);
         }
         else {
           // keep retired, update min/max
@@ -445,9 +484,32 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
       }
     }
   }
-  heap->page_retired_min = min;
-  heap->page_retired_max = max;
+  theap->page_retired_min = min;
+  theap->page_retired_max = max;
+}
+
+/*
+static void mi_theap_collect_full_pages(mi_theap_t* theap) {
+  // note: normally full pages get immediately abandoned and the full queue is always empty
+  // this path is only used if abandoning is disabled due to a destroy-able theap or options
+  // set by the user.
+  mi_page_queue_t* pq = &theap->pages[MI_BIN_FULL];
+  for (mi_page_t* page = pq->first; page != NULL; ) {
+    mi_page_t* next = page->next;         // get next in case we free the page
+    _mi_page_free_collect(page, false);   // register concurrent free's
+    // no longer full?
+    if (!mi_page_is_full(page)) {
+      if (mi_page_all_free(page)) {
+        _mi_page_free(page, pq);
+      }
+      else {
+        _mi_page_unfull(page);
+      }
+    }
+    page = next;
+  }
 }
+*/
 
 
 /* -----------------------------------------------------------
@@ -460,15 +522,14 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
 #define MI_MAX_SLICES       (1UL << MI_MAX_SLICE_SHIFT)
 #define MI_MIN_SLICES       (2)
 
-static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
-  MI_UNUSED(stats);
-  #if (MI_SECURE<=2)
+static void mi_page_free_list_extend_secure(mi_theap_t* const theap, mi_page_t* const page, const size_t bsize, const size_t extend) {
+  #if (MI_SECURE<3)
   mi_assert_internal(page->free == NULL);
   mi_assert_internal(page->local_free == NULL);
   #endif
   mi_assert_internal(page->capacity + extend <= page->reserved);
   mi_assert_internal(bsize == mi_page_block_size(page));
-  void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
+  void* const page_area = mi_page_start(page);
 
   // initialize a randomized free list
   // set up `slice_count` slices to alternate between
@@ -489,7 +550,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co
 
   // and initialize the free list by randomly threading through them
   // set up first element
-  const uintptr_t r = _mi_heap_random_next(heap);
+  const uintptr_t r = _mi_theap_random_next(theap);
   size_t current = r % slice_count;
   counts[current]--;
   mi_block_t* const free_start = blocks[current];
@@ -517,16 +578,15 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co
   page->free = free_start;
 }
 
-static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
+static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend)
 {
-  MI_UNUSED(stats);
-  #if (MI_SECURE <= 2)
+  #if (MI_SECURE<3)
   mi_assert_internal(page->free == NULL);
   mi_assert_internal(page->local_free == NULL);
   #endif
   mi_assert_internal(page->capacity + extend <= page->reserved);
   mi_assert_internal(bsize == mi_page_block_size(page));
-  void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
+  void* const page_area = mi_page_start(page);
 
   mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity);
 
@@ -548,7 +608,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
 ----------------------------------------------------------- */
 
 #define MI_MAX_EXTEND_SIZE    (4*1024)      // heuristic, one OS page seems to work well.
-#if (MI_SECURE>0)
+#if (MI_SECURE>=3)
 #define MI_MIN_EXTEND         (8*MI_SECURE) // extend at least by this many
 #else
 #define MI_MIN_EXTEND         (1)
@@ -559,29 +619,31 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
 // Note: we also experimented with "bump" allocation on the first
 // allocations but this did not speed up any benchmark (due to an
 // extra test in malloc? or cache effects?)
-static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
-  MI_UNUSED(tld); 
+static bool mi_page_extend_free(mi_theap_t* theap, mi_page_t* page) {
   mi_assert_expensive(mi_page_is_valid_init(page));
-  #if (MI_SECURE<=2)
+  #if (MI_SECURE<3)
   mi_assert(page->free == NULL);
   mi_assert(page->local_free == NULL);
-  if (page->free != NULL) return;
+  if (page->free != NULL) return true;
   #endif
-  if (page->capacity >= page->reserved) return;
+  if (page->capacity >= page->reserved) return true;
 
   size_t page_size;
-  _mi_page_start(_mi_page_segment(page), page, &page_size);
-  mi_stat_counter_increase(tld->stats.pages_extended, 1);
+  //uint8_t* page_start =
+  mi_page_area(page, &page_size);
+  #if MI_STAT>0
+  mi_theap_stat_counter_increase(theap, pages_extended, 1);
+  #endif
 
   // calculate the extend count
-  const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size);
-  size_t extend = page->reserved - page->capacity;
+  const size_t bsize = mi_page_block_size(page);
+  size_t extend = (size_t)page->reserved - page->capacity;
   mi_assert_internal(extend > 0);
 
-  size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize);
+  size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize);
   if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; }
   mi_assert_internal(max_extend > 0);
-    
+
   if (extend > max_extend) {
     // ensure we don't touch memory beyond the page to reduce page commit.
     // the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%.
@@ -591,68 +653,79 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
   mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
   mi_assert_internal(extend < (1UL<<16));
 
+  // commit on demand?
+  if (page->slice_committed > 0) {
+    const size_t needed_size = (page->capacity + extend)*bsize;
+    const size_t needed_commit = _mi_align_up( mi_page_slice_offset_of(page, needed_size), MI_PAGE_MIN_COMMIT_SIZE );
+    if (needed_commit > page->slice_committed) {
+      mi_assert_internal(((needed_commit - page->slice_committed) % _mi_os_page_size()) == 0);
+      if (!_mi_os_commit(mi_page_slice_start(page) + page->slice_committed, needed_commit - page->slice_committed, NULL)) {
+        return false;
+      }
+      page->slice_committed = needed_commit;
+    }
+  }
+
   // and append the extend the free list
-  if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) {
-    mi_page_free_list_extend(page, bsize, extend, &tld->stats );
+  if (extend < MI_MIN_SLICES || MI_SECURE<3) { //!mi_option_is_enabled(mi_option_secure)) {
+    mi_page_free_list_extend(page, bsize, extend );
   }
   else {
-    mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats);
+    mi_page_free_list_extend_secure(theap, page, bsize, extend);
   }
   // enable the new free list
   page->capacity += (uint16_t)extend;
-  mi_stat_increase(tld->stats.page_committed, extend * bsize);
-
-  // extension into zero initialized memory preserves the zero'd free list
-  if (!page->is_zero_init) {
-    page->is_zero = false;
-  }
+  #if MI_STAT>0
+  mi_theap_stat_increase(theap, page_committed, extend * bsize);
+  #endif
   mi_assert_expensive(mi_page_is_valid_init(page));
+  return true;
 }
 
-// Initialize a fresh page
-static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) {
+// Initialize a fresh page (that is already partially initialized)
+mi_decl_nodiscard bool _mi_page_init(mi_theap_t* theap, mi_page_t* page) {
   mi_assert(page != NULL);
-  mi_segment_t* segment = _mi_page_segment(page);
-  mi_assert(segment != NULL);
-  mi_assert_internal(block_size > 0);
-  // set fields
-  mi_page_set_heap(page, heap);
-  page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? (uint32_t)block_size : MI_HUGE_BLOCK_SIZE); // initialize before _mi_segment_page_start
+  mi_assert(theap!=NULL);
+  page->heap = (_mi_is_heap_main(theap->heap) ? NULL : theap->heap); // faster for `mi_page_associated_theap`
+  mi_page_set_theap(page, theap);
+
   size_t page_size;
-  _mi_segment_page_start(segment, page, &page_size);
-  mi_assert_internal(mi_page_block_size(page) <= page_size);
-  mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE);
-  mi_assert_internal(page_size / block_size < (1L<<16));
-  page->reserved = (uint16_t)(page_size / block_size);
-  #ifdef MI_ENCODE_FREELIST
-  page->keys[0] = _mi_heap_random_next(heap);
-  page->keys[1] = _mi_heap_random_next(heap);
+  uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start);
+  mi_track_mem_noaccess(page_start,page_size);
+  mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16));
+  mi_assert_internal(page->reserved > 0);
+  #if (MI_PADDING || MI_ENCODE_FREELIST)
+  page->keys[0] = _mi_theap_random_next(theap);
+  page->keys[1] = _mi_theap_random_next(theap);
   #endif
-  #if MI_DEBUG > 0
-  page->is_zero = false; // ensure in debug mode we initialize with MI_DEBUG_UNINIT, see issue #501
-  #else
-  page->is_zero = page->is_zero_init;
+  #if MI_DEBUG>2
+  if (page->memid.initially_zero) {
+    mi_track_mem_defined(page->page_start, mi_page_committed(page));
+    mi_assert_expensive(mi_mem_is_zero(page_start, mi_page_committed(page)));
+  }
   #endif
 
-  mi_assert_internal(page->is_committed);
-  mi_assert_internal(!page->is_reset);
+  mi_assert_internal(page->theap!=NULL);
+  mi_assert_internal(page->theap == mi_page_theap(page));
   mi_assert_internal(page->capacity == 0);
   mi_assert_internal(page->free == NULL);
   mi_assert_internal(page->used == 0);
-  mi_assert_internal(page->xthread_free == 0);
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(page->xthread_free == 1);
   mi_assert_internal(page->next == NULL);
   mi_assert_internal(page->prev == NULL);
   mi_assert_internal(page->retire_expire == 0);
-  mi_assert_internal(!mi_page_has_aligned(page));
-  #if (MI_ENCODE_FREELIST)
+  mi_assert_internal(!mi_page_has_interior_pointers(page));
+  #if (MI_PADDING || MI_ENCODE_FREELIST)
   mi_assert_internal(page->keys[0] != 0);
   mi_assert_internal(page->keys[1] != 0);
   #endif
   mi_assert_expensive(mi_page_is_valid_init(page));
 
   // initialize an initial free list
-  mi_page_extend_free(heap,page,tld);
+  if (!mi_page_extend_free(theap,page)) return false;
   mi_assert(mi_page_immediate_available(page));
+  return true;
 }
 
 
@@ -661,81 +734,140 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
 -------------------------------------------------------------*/
 
 // Find a page with free blocks of `page->block_size`.
-static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
+static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_theap_t* theap, mi_page_queue_t* pq, bool first_try)
 {
   // search through the pages in "next fit" order
   size_t count = 0;
+  long candidate_limit = 0;          // we reset this on the first candidate to limit the search
+  long page_full_retain = (pq->block_size > MI_SMALL_MAX_OBJ_SIZE ? 0 : theap->page_full_retain); // only retain small pages
+  mi_page_t* page_candidate = NULL;  // a page with free space
   mi_page_t* page = pq->first;
+
   while (page != NULL)
   {
-    mi_page_t* next = page->next; // remember next
+    mi_page_t* next = page->next; // remember next (as this page can move to another queue)
     count++;
+    candidate_limit--;
 
-    // 0. collect freed blocks by us and other threads
-    _mi_page_free_collect(page, false);
+    // search up to N pages for a best candidate
 
-    // 1. if the page contains free blocks, we are done
-    if (mi_page_immediate_available(page)) {
-      break;  // pick this one
+    // is the local free list non-empty?
+    bool immediate_available = mi_page_immediate_available(page);
+    if (!immediate_available) {
+      // collect freed blocks by us and other threads so we get a proper use count
+      _mi_page_free_collect(page, false);
+      immediate_available = mi_page_immediate_available(page);
     }
 
-    // 2. Try to extend
-    if (page->capacity < page->reserved) {
-      mi_page_extend_free(heap, page, heap->tld);
-      mi_assert_internal(mi_page_immediate_available(page));
-      break;
+    // if the page is completely full, move it to the `mi_pages_full`
+    // queue so we don't visit long-lived pages too often.
+    if (!immediate_available && !mi_page_is_expandable(page)) {
+      page_full_retain--;
+      if (page_full_retain < 0) {
+        mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
+        mi_page_to_full(page, pq);
+      }
+    }
+    else {
+      // the page has free space, make it a candidate
+      // we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages)
+      if (page_candidate == NULL) {
+        page_candidate = page;
+        candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates);
+      }
+      else if (mi_page_all_free(page_candidate)) {
+        _mi_page_free(page_candidate, pq);
+        page_candidate = page;
+      }
+      // prefer to reuse fuller pages (in the hope the less used page gets freed)
+      else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { // && !mi_page_is_expandable(page)) {
+        page_candidate = page;
+      }
+      // if we find a non-expandable candidate, or searched for N pages, return with the best candidate
+      if (immediate_available || candidate_limit <= 0) {
+        mi_assert_internal(page_candidate!=NULL);
+        break;
+      }
+    }
+
+  #if 0
+    // first-fit algorithm without candidates
+    // If the page contains free blocks, we are done
+    if (mi_page_immediate_available(page) || mi_page_is_expandable(page)) {
+      break;  // pick this one
     }
 
-    // 3. If the page is completely full, move it to the `mi_pages_full`
+    // If the page is completely full, move it to the `mi_pages_full`
     // queue so we don't visit long-lived pages too often.
     mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
     mi_page_to_full(page, pq);
+  #endif
 
     page = next;
   } // for each page
 
-  mi_heap_stat_counter_increase(heap, searches, count);
+  mi_theap_stat_counter_increase(theap, page_searches, count);
+  mi_theap_stat_counter_increase(theap, page_searches_count, 1);
+
+  // set the page to the best candidate
+  if (page_candidate != NULL) {
+    page = page_candidate;
+  }
+  if (page != NULL) {
+    if (!mi_page_immediate_available(page)) {
+      mi_assert_internal(mi_page_is_expandable(page));
+      if (!mi_page_extend_free(theap, page)) {
+        page = NULL; // failed to extend
+      }
+    }
+    mi_assert_internal(page == NULL || mi_page_immediate_available(page));
+  }
 
   if (page == NULL) {
-    _mi_heap_collect_retired(heap, false); // perhaps make a page available?
-    page = mi_page_fresh(heap, pq);
+    _mi_theap_collect_retired(theap, false); // perhaps make a page available
+    page = mi_page_fresh(theap, pq);
+    mi_assert_internal(page == NULL || mi_page_immediate_available(page));
     if (page == NULL && first_try) {
       // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
-      page = mi_page_queue_find_free_ex(heap, pq, false);      
+      page = mi_page_queue_find_free_ex(theap, pq, false);
+      mi_assert_internal(page == NULL || mi_page_immediate_available(page));
     }
   }
   else {
-    mi_assert(pq->first == page);
+    mi_assert_internal(page == NULL || mi_page_immediate_available(page));
+    // move the page to the front of the queue
+    mi_page_queue_move_to_front(theap, pq, page);
     page->retire_expire = 0;
+    // _mi_theap_collect_retired(theap, false); // update retire counts; note: increases rss on MemoryLoad bench so don't do this
   }
   mi_assert_internal(page == NULL || mi_page_immediate_available(page));
+
+
   return page;
 }
 
 
 
 // Find a page with free blocks of `size`.
-static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
-  mi_page_queue_t* pq = mi_page_queue(heap,size);
+static mi_page_t* mi_find_free_page(mi_theap_t* theap, mi_page_queue_t* pq) {
+  // mi_page_queue_t* pq = mi_page_queue(theap, size);
+  mi_assert_internal(!mi_page_queue_is_huge(pq));
+
+  // check the first page: we even do this with candidate search or otherwise we re-search every time
   mi_page_t* page = pq->first;
-  if (page != NULL) {
-   #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness      
-    if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
-      mi_page_extend_free(heap, page, heap->tld);
+  if mi_likely(page != NULL && mi_page_free_quick_collect(page)) {
+    #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
+    if (page->capacity < page->reserved && ((_mi_theap_random_next(theap) & 1) == 1)) {
+      (void)mi_page_extend_free(theap, page);  // ok if this fails
       mi_assert_internal(mi_page_immediate_available(page));
     }
-    else 
-   #endif
-    {
-      _mi_page_free_collect(page,false);
-    }
-    
-    if (mi_page_immediate_available(page)) {
-      page->retire_expire = 0;
-      return page; // fast path
-    }
+    #endif
+    page->retire_expire = 0;
+    return page; // fast path
+  }
+  else {
+    return mi_page_queue_find_free_ex(theap, pq, true);
   }
-  return mi_page_queue_find_free_ex(heap, pq, true);
 }
 
 
@@ -749,12 +881,12 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
 static mi_deferred_free_fun* volatile deferred_free = NULL;
 static _Atomic(void*) deferred_arg; // = NULL
 
-void _mi_deferred_free(mi_heap_t* heap, bool force) {
-  heap->tld->heartbeat++;
-  if (deferred_free != NULL && !heap->tld->recurse) {
-    heap->tld->recurse = true;
-    deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg));
-    heap->tld->recurse = false;
+void _mi_deferred_free(mi_theap_t* theap, bool force) {
+  theap->heartbeat++;
+  if (deferred_free != NULL && !theap->tld->recurse) {
+    theap->tld->recurse = true;
+    deferred_free(force, theap->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg));
+    theap->tld->recurse = false;
   }
 }
 
@@ -768,40 +900,30 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex
   General allocation
 ----------------------------------------------------------- */
 
-// Large and huge page allocation.
-// Huge pages are allocated directly without being in a queue.
-// Because huge pages contain just one block, and the segment contains
-// just that page, we always treat them as abandoned and any thread
-// that frees the block can free the whole page and segment directly.
-static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size) {
-  size_t block_size = _mi_os_good_alloc_size(size);
-  mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE);
-  bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX);
-  mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size));
-  mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size);
+// Huge pages contain just one block, and the segment contains just that page.
+// Huge pages are also used if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX)
+// so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`.
+static mi_page_t* mi_huge_page_alloc(mi_theap_t* theap, size_t size, size_t page_alignment, mi_page_queue_t* pq) {
+  const size_t block_size = _mi_os_good_alloc_size(size);
+  // mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0);
+  #if MI_HUGE_PAGE_ABANDON
+  #error todo.
+  #else
+  // mi_page_queue_t* pq = mi_page_queue(theap, MI_LARGE_MAX_OBJ_SIZE+1);  // always in the huge queue regardless of the block size
+  mi_assert_internal(mi_page_queue_is_huge(pq));
+  #endif
+  mi_page_t* page = mi_page_fresh_alloc(theap, pq, block_size, page_alignment);
   if (page != NULL) {
+    mi_assert_internal(mi_page_block_size(page) >= size);
     mi_assert_internal(mi_page_immediate_available(page));
-    
-    if (pq == NULL) {
-      // huge pages are directly abandoned
-      mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
-      mi_assert_internal(_mi_page_segment(page)->used==1);
-      mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
-      mi_page_set_heap(page, NULL);
-    }
-    else {
-      mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
-    }
-    
-    const size_t bsize = mi_page_usable_block_size(page);  // note: not `mi_page_block_size` to account for padding
-    if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
-      mi_heap_stat_increase(heap, large, bsize);
-      mi_heap_stat_counter_increase(heap, large_count, 1);
-    }
-    else {
-      mi_heap_stat_increase(heap, huge, bsize);
-      mi_heap_stat_counter_increase(heap, huge_count, 1);
-    }
+    mi_assert_internal(mi_page_is_huge(page));
+    mi_assert_internal(mi_page_is_singleton(page));
+    #if MI_HUGE_PAGE_ABANDON
+    mi_assert_internal(mi_page_is_abandoned(page));
+    mi_page_set_theap(page, NULL);
+    #endif
+    mi_theap_stat_increase(theap, malloc_huge, mi_page_block_size(page));
+    mi_theap_stat_counter_increase(theap, malloc_huge_count, 1);
   }
   return page;
 }
@@ -809,61 +931,99 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size) {
 
 // Allocate a page
 // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
-static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+static mi_page_t* mi_find_page(mi_theap_t* theap, size_t size, size_t huge_alignment) mi_attr_noexcept {
+  const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`
+  if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) {
+    _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
+    return NULL;
+  }
+  mi_page_queue_t* pq = mi_page_queue(theap, (huge_alignment > 0 ? MI_LARGE_MAX_OBJ_SIZE+1 : size));
   // huge allocation?
-  const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`  
-  if (mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) )) {
-    if (mi_unlikely(req_size > PTRDIFF_MAX)) {  // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
-      _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
-      return NULL;
-    }
-    else {
-      return mi_large_huge_page_alloc(heap,size);
-    }
+  if mi_unlikely(mi_page_queue_is_huge(pq) || req_size > MI_MAX_ALLOC_SIZE) {
+    return mi_huge_page_alloc(theap,size,huge_alignment,pq);
   }
   else {
     // otherwise find a page with free blocks in our size segregated queues
+    #if MI_PADDING
     mi_assert_internal(size >= MI_PADDING_SIZE);
-    return mi_find_free_page(heap, size);
+    #endif
+    return mi_find_free_page(theap, pq);
   }
 }
 
+
 // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
 // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
-void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
+// The `huge_alignment` is normally 0 but is set to a multiple of MI_SLICE_SIZE for
+// very large requested alignments in which case we use a huge singleton page.
+// Note: we put `bool zero, size_t huge_alignment` into one parameter (with zero in the low bit)
+// to use 4 parameters which compiles better on msvc for the malloc fast path.
+void* _mi_malloc_generic(mi_theap_t* theap, size_t size, size_t zero_huge_alignment, size_t* usable) mi_attr_noexcept
 {
-  mi_assert_internal(heap != NULL);
+  const bool zero = ((zero_huge_alignment & 1) != 0);
+  const size_t huge_alignment = (zero_huge_alignment & ~1);
+
+  #if !MI_THEAP_INITASNULL
+  mi_assert_internal(theap != NULL);
+  #endif
 
   // initialize if necessary
-  if (mi_unlikely(!mi_heap_is_initialized(heap))) {
-    mi_thread_init(); // calls `_mi_heap_init` in turn
-    heap = mi_get_default_heap();
-    if (mi_unlikely(!mi_heap_is_initialized(heap))) { return NULL; }
+  if mi_unlikely(!mi_theap_is_initialized(theap)) {
+    if (theap==&_mi_theap_empty_wrong) {
+      // we were unable to allocate a theap for a first-class heap
+      return NULL;
+    }
+    // otherwise we initialize the thread and its default theap
+    mi_thread_init();
+    theap = _mi_theap_default();
+    if mi_unlikely(!mi_theap_is_initialized(theap)) { return NULL; }
+    mi_assert_internal(_mi_theap_default()==theap);
+  }
+  mi_assert_internal(mi_theap_is_initialized(theap));
+
+  // do administrative tasks every N generic mallocs
+  if mi_unlikely(++theap->generic_count >= 1000) {
+    theap->generic_collect_count += theap->generic_count;
+    theap->generic_count = 0;
+    // call potential deferred free routines
+    _mi_deferred_free(theap, false);
+    // free retired pages
+    _mi_theap_collect_retired(theap, false);
+
+    // collect every once in a while (10000 by default)
+    const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L);
+    if (theap->generic_collect_count >= generic_collect) {
+      theap->generic_collect_count = 0;
+      mi_theap_collect(theap, false /* force? */);
+    }
   }
-  mi_assert_internal(mi_heap_is_initialized(heap));
-
-  // call potential deferred free routines
-  _mi_deferred_free(heap, false);
-
-  // free delayed frees from other threads
-  _mi_heap_delayed_free(heap);
 
   // find (or allocate) a page of the right size
-  mi_page_t* page = mi_find_page(heap, size);
-  if (mi_unlikely(page == NULL)) { // first time out of memory, try to collect and retry the allocation once more
-    mi_heap_collect(heap, true /* force */);
-    page = mi_find_page(heap, size);
+  mi_page_t* page = mi_find_page(theap, size, huge_alignment);
+  if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more
+    mi_theap_collect(theap, true /* force? */);
+    page = mi_find_page(theap, size, huge_alignment);
   }
 
-  if (mi_unlikely(page == NULL)) { // out of memory
-    const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`  
+  if mi_unlikely(page == NULL) { // out of memory
+    const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`
     _mi_error_message(ENOMEM, "unable to allocate memory (%zu bytes)\n", req_size);
     return NULL;
   }
 
   mi_assert_internal(mi_page_immediate_available(page));
   mi_assert_internal(mi_page_block_size(page) >= size);
+  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+  mi_assert_internal(_mi_ptr_page(page)==page);
 
-  // and try again, this time succeeding! (i.e. this should never recurse)
-  return _mi_page_malloc(heap, page, size);
+  // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc)
+  if (usable!=NULL) { *usable = mi_page_usable_block_size(page); }
+  void* const p = _mi_page_malloc_zero(theap,page,size,zero);
+  mi_assert_internal(p != NULL);
+
+  // move full pages to the full queue
+  if (mi_page_block_size(page) > MI_SMALL_MAX_OBJ_SIZE && mi_page_is_full(page)) {
+    mi_page_to_full(page, mi_page_queue_of(page));
+  }
+  return p;
 }
diff --git a/ext/src/mimalloc/src/prim/emscripten/prim.c b/ext/src/mimalloc/src/prim/emscripten/prim.c
new file mode 100644
index 0000000000..ab3d59eed8
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/emscripten/prim.c
@@ -0,0 +1,252 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen, Alon Zakai
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// This file is included in `src/prim/prim.c`
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
+
+// Design
+// ======
+//
+// mimalloc is built on top of emmalloc. emmalloc is a minimal allocator on top
+// of sbrk. The reason for having three layers here is that we want mimalloc to
+// be able to allocate and release system memory properly, the same way it would
+// when using VirtualAlloc on Windows or mmap on POSIX, and sbrk is too limited.
+// Specifically, sbrk can only go up and down, and not "skip" over regions, and
+// so we end up either never freeing memory to the system, or we can get stuck
+// with holes.
+//
+// Atm wasm generally does *not* free memory back to the system: once grown, we do
+// not shrink back down (https://github.com/WebAssembly/design/issues/1397).
+// However, that is expected to improve
+// (https://github.com/WebAssembly/memory-control/blob/main/proposals/memory-control/Overview.md)
+// and so we do not want to bake those limitations in here.
+//
+// Even without that issue, we want our system allocator to handle holes, that
+// is, it should merge freed regions and allow allocating new content there of
+// the full size, etc., so that we do not waste space. That means that the
+// system allocator really does need to handle the general problem of allocating
+// and freeing variable-sized chunks of memory in a random order, like malloc/
+// free do. And so it makes sense to layer mimalloc on top of such an
+// implementation.
+//
+// emmalloc makes sense for the lower level because it is small and simple while
+// still fully handling merging of holes etc. It is not the most efficient
+// allocator, but our assumption is that mimalloc needs to be fast while the
+// system allocator underneath it is called much less frequently.
+//
+
+//---------------------------------------------
+// init
+//---------------------------------------------
+
+void _mi_prim_mem_init( mi_os_mem_config_t* config) {
+  config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
+  config->alloc_granularity = 16;
+  config->has_overcommit = false;
+  config->has_partial_free = false;
+  config->has_virtual_reserve = false;
+}
+
+extern void emmalloc_free(void*);
+
+int _mi_prim_free(void* addr, size_t size) {  // release `addr` through emmalloc_free; always returns 0
+  if (size==0) return 0;  // zero-sized request: emmalloc_free is not called
+  emmalloc_free(addr);
+  return 0;
+}
+
+
+//---------------------------------------------
+// Allocation
+//---------------------------------------------
+
+extern void* emmalloc_memalign(size_t alignment, size_t size);
+
+// Allocate via `emmalloc_memalign`: stores the pointer in `*addr` and returns 0, or ENOMEM if it is NULL. Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
+int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  MI_UNUSED(try_alignment); MI_UNUSED(allow_large); MI_UNUSED(commit); MI_UNUSED(hint_addr);  // note: `try_alignment` is still read below; the other three are ignored
+  *is_large = false;  // always reported as false here
+  // TODO: Track the highest address ever seen; first uses of it are zeroes.
+  //       That assumes no one else uses sbrk but us (they could go up,
+  //       scribble, and then down), but we could assert on that perhaps.
+  *is_zero = false;  // conservatively never claims zero-initialized memory (see TODO)
+  // emmalloc has a minimum alignment size.
+  #define MIN_EMMALLOC_ALIGN           8
+  if (try_alignment < MIN_EMMALLOC_ALIGN) {
+    try_alignment = MIN_EMMALLOC_ALIGN;  // raise smaller requests to the minimum
+  }
+  void* p = emmalloc_memalign(try_alignment, size);
+  *addr = p;  // stored even when the allocation failed (NULL)
+  if (p == 0) {
+    return ENOMEM;
+  }
+  return 0;
+}
+
+
+//---------------------------------------------
+// Commit/Reset
+//---------------------------------------------
+
+int _mi_prim_commit(void* addr, size_t size, bool* is_zero) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  // See TODO above.
+  *is_zero = false;
+  return 0;
+}
+
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  *needs_recommit = false;
+  return 0;
+}
+
+int _mi_prim_reset(void* addr, size_t size) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  return 0;
+}
+
+int _mi_prim_reuse(void* addr, size_t size) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  return 0;
+}
+
+int _mi_prim_protect(void* addr, size_t size, bool protect) {
+  MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect);
+  return 0;
+}
+
+
+//---------------------------------------------
+// Huge pages and NUMA nodes
+//---------------------------------------------
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {  // stub: always fails with ENOSYS
+  MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node);
+  *is_zero = true;
+  *addr = NULL;  // no memory is returned
+  return ENOSYS;
+}
+
+size_t _mi_prim_numa_node(void) {
+  return 0;
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  return 1;
+}
+
+
+//----------------------------------------------------------------
+// Clock
+//----------------------------------------------------------------
+
+#include <emscripten/html5.h>
+
+mi_msecs_t _mi_prim_clock_now(void) {
+  return emscripten_date_now();
+}
+
+
+//----------------------------------------------------------------
+// Process info
+//----------------------------------------------------------------
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)
+{
+  // use defaults
+  MI_UNUSED(pinfo);
+}
+
+
+//----------------------------------------------------------------
+// Output
+//----------------------------------------------------------------
+
+#include <emscripten/console.h>
+
+void _mi_prim_out_stderr( const char* msg) {
+  emscripten_console_error(msg);
+}
+
+
+//----------------------------------------------------------------
+// Environment
+//----------------------------------------------------------------
+
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
+  // For code size reasons, do not support environ customization for now.
+  MI_UNUSED(name);
+  MI_UNUSED(result);
+  MI_UNUSED(result_size);
+  return false;
+}
+
+
+//----------------------------------------------------------------
+// Random
+//----------------------------------------------------------------
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {  // fill `buf` with `buf_len` bytes obtained from getentropy
+  int err = getentropy(buf, buf_len);
+  return !err;  // true only when getentropy returned 0
+}
+
+
+//----------------------------------------------------------------
+// Thread init/done
+//----------------------------------------------------------------
+
+#if defined(MI_USE_PTHREADS)
+
+// use pthread local storage keys to detect thread ending
+// (and used with MI_TLS_PTHREADS for the default theap)
+pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
+
+static void mi_pthread_done(void* value) {
+  if (value!=NULL) {
+    _mi_thread_done((mi_theap_t*)value);
+  }
+}
+
+void _mi_prim_thread_init_auto_done(void) {
+  mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
+  pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing to do
+}
+
+void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) {
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // can happen during recursive invocation on freeBSD
+    pthread_setspecific(_mi_heap_default_key, theap);
+  }
+}
+
+#else
+
+void _mi_prim_thread_init_auto_done(void) {
+  // nothing
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing
+}
+
+void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) {
+  MI_UNUSED(theap);
+}
+#endif
+
+bool _mi_prim_thread_is_in_threadpool(void) {
+  return false;
+}
diff --git a/ext/src/mimalloc/src/alloc-override-osx.c b/ext/src/mimalloc/src/prim/osx/alloc-override-zone.c
similarity index 93%
rename from ext/src/mimalloc/src/alloc-override-osx.c
rename to ext/src/mimalloc/src/prim/osx/alloc-override-zone.c
index 41d0a386e7..aa971c39fa 100644
--- a/ext/src/mimalloc/src/alloc-override-osx.c
+++ b/ext/src/mimalloc/src/prim/osx/alloc-override-zone.c
@@ -6,7 +6,7 @@ terms of the MIT license. A copy of the license can be found in the file
 -----------------------------------------------------------------------------*/
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
 
 #if defined(MI_MALLOC_OVERRIDE)
 
@@ -19,8 +19,8 @@ terms of the MIT license. A copy of the license can be found in the file
    This is done through the malloc zone interface.
    It seems to be most robust in combination with interposing
    though or otherwise we may get zone errors as there are could
-   be allocations done by the time we take over the 
-   zone. 
+   be allocations done by the time we take over the
+   zone.
 ------------------------------------------------------ */
 
 #include <AvailabilityMacros.h>
@@ -64,7 +64,8 @@ static void* zone_valloc(malloc_zone_t* zone, size_t size) {
 
 static void zone_free(malloc_zone_t* zone, void* p) {
   MI_UNUSED(zone);
-  mi_cfree(p);
+  // mi_cfree(p);  // checked free as `zone_free` may be called with invalid pointers
+  mi_free(p); // with the page_map and pagemap_commit=1 we can use the regular free
 }
 
 static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) {
@@ -83,7 +84,7 @@ static void zone_destroy(malloc_zone_t* zone) {
 }
 
 static unsigned zone_batch_malloc(malloc_zone_t* zone, size_t size, void** ps, unsigned count) {
-  size_t i;
+  unsigned i;
   for (i = 0; i < count; i++) {
     ps[i] = zone_malloc(zone, size);
     if (ps[i] == NULL) break;
@@ -195,7 +196,7 @@ static malloc_introspection_t mi_introspect = {
   .log = &intro_log,
   .force_lock = &intro_force_lock,
   .force_unlock = &intro_force_unlock,
-#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
+#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) && !defined(__ppc__)
   .statistics = &intro_statistics,
   .zone_locked = &intro_zone_locked,
 #endif
@@ -215,8 +216,8 @@ static malloc_zone_t mi_malloc_zone = {
   .zone_name = "mimalloc",
   .batch_malloc = &zone_batch_malloc,
   .batch_free = &zone_batch_free,
-  .introspect = &mi_introspect,  
-#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
+  .introspect = &mi_introspect,
+#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) && !defined(__ppc__)
   #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14)
   .version = 10,
   #else
@@ -225,7 +226,9 @@ static malloc_zone_t mi_malloc_zone = {
   // switch to version 9+ on OSX 10.6 to support memalign.
   .memalign = &zone_memalign,
   .free_definite_size = &zone_free_definite_size,
+  #if defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)
   .pressure_relief = &zone_pressure_relief,
+  #endif
   #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14)
   .claimed_address = &zone_claimed_address,
   #endif
@@ -242,7 +245,7 @@ static malloc_zone_t mi_malloc_zone = {
 #if defined(MI_OSX_INTERPOSE) && defined(MI_SHARED_LIB_EXPORT)
 
 // ------------------------------------------------------
-// Override malloc_xxx and malloc_zone_xxx api's to use only 
+// Override malloc_xxx and malloc_zone_xxx api's to use only
 // our mimalloc zone. Since even the loader uses malloc
 // on macOS, this ensures that all allocations go through
 // mimalloc (as all calls are interposed).
@@ -254,7 +257,7 @@ static malloc_zone_t mi_malloc_zone = {
 static inline malloc_zone_t* mi_get_default_zone(void)
 {
   static bool init;
-  if (mi_unlikely(!init)) { 
+  if mi_unlikely(!init) {
     init = true;
     malloc_zone_register(&mi_malloc_zone);  // by calling register we avoid a zone error on free (see <http://eatmyrandom.blogspot.com/2010/03/mallocfree-interception-on-mac-os-x.html>)
   }
@@ -272,7 +275,7 @@ static malloc_zone_t* mi_malloc_create_zone(vm_size_t size, unsigned flags) {
   return mi_get_default_zone();
 }
 
-static malloc_zone_t* mi_malloc_default_zone (void) {   
+static malloc_zone_t* mi_malloc_default_zone (void) {
   return mi_get_default_zone();
 }
 
@@ -292,11 +295,11 @@ static kern_return_t mi_malloc_get_all_zones (task_t task, memory_reader_t mr, v
   return KERN_SUCCESS;
 }
 
-static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) {  
+static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) {
   return (zone == NULL ? mi_malloc_zone.zone_name : zone->zone_name);
 }
 
-static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) {  
+static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) {
   MI_UNUSED(zone); MI_UNUSED(name);
 }
 
@@ -306,7 +309,7 @@ static int mi_malloc_jumpstart(uintptr_t cookie) {
 }
 
 static void mi__malloc_fork_prepare(void) {
-  // nothing  
+  // nothing
 }
 static void mi__malloc_fork_parent(void) {
   // nothing
@@ -367,13 +370,13 @@ __attribute__((used)) static const struct mi_interpose_s _mi_zone_interposes[]
   MI_INTERPOSE_MI(malloc_destroy_zone),
   MI_INTERPOSE_MI(malloc_get_all_zones),
   MI_INTERPOSE_MI(malloc_get_zone_name),
-  MI_INTERPOSE_MI(malloc_jumpstart),  
+  MI_INTERPOSE_MI(malloc_jumpstart),
   MI_INTERPOSE_MI(malloc_printf),
   MI_INTERPOSE_MI(malloc_set_zone_name),
   MI_INTERPOSE_MI(_malloc_fork_child),
   MI_INTERPOSE_MI(_malloc_fork_parent),
   MI_INTERPOSE_MI(_malloc_fork_prepare),
-  
+
   MI_INTERPOSE_ZONE(zone_batch_free),
   MI_INTERPOSE_ZONE(zone_batch_malloc),
   MI_INTERPOSE_ZONE(zone_calloc),
@@ -416,11 +419,12 @@ static inline malloc_zone_t* mi_get_default_zone(void)
 }
 
 #if defined(__clang__)
-__attribute__((constructor(0))) 
+__attribute__((constructor(101))) // highest priority
 #else
-__attribute__((constructor))      // seems not supported by g++-11 on the M1
+__attribute__((constructor))      // priority level is not supported by gcc
 #endif
-static void _mi_macos_override_malloc() {
+__attribute__((used))
+static void _mi_macos_override_malloc(void) {
   malloc_zone_t* purgeable_zone = NULL;
 
   #if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
diff --git a/ext/src/mimalloc/src/prim/osx/prim.c b/ext/src/mimalloc/src/prim/osx/prim.c
new file mode 100644
index 0000000000..8a2f4e8aa4
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/osx/prim.c
@@ -0,0 +1,9 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// We use the unix/prim.c with the mmap API on macOSX
+#include "../unix/prim.c"
diff --git a/ext/src/mimalloc/src/prim/prim.c b/ext/src/mimalloc/src/prim/prim.c
new file mode 100644
index 0000000000..5147bae81f
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/prim.c
@@ -0,0 +1,76 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// Select the implementation of the primitives
+// depending on the OS.
+
+#if defined(_WIN32)
+#include "windows/prim.c"  // VirtualAlloc (Windows)
+
+#elif defined(__APPLE__)
+#include "osx/prim.c"      // macOSX (actually defers to mmap in unix/prim.c)
+
+#elif defined(__wasi__)
+#define MI_USE_SBRK
+#include "wasi/prim.c"     // memory-grow or sbrk (Wasm)
+
+#elif defined(__EMSCRIPTEN__)
+#include "emscripten/prim.c" // emmalloc_*, + pthread support
+
+#else
+#include "unix/prim.c"     // mmap() (Linux, macOSX, BSD, Illumos, Haiku, DragonFly, etc.)
+
+#endif
+
+// Generic process initialization
+#ifndef MI_PRIM_HAS_PROCESS_ATTACH
+#if defined(__GNUC__) || defined(__clang__)
+  // gcc,clang: use the constructor/destructor attribute
+  // which for both seem to run before regular constructors/destructors
+  #if defined(__clang__)
+    #define mi_attr_constructor __attribute__((constructor(101)))
+    #define mi_attr_destructor  __attribute__((destructor(101)))
+  #else
+    #define mi_attr_constructor __attribute__((constructor))
+    #define mi_attr_destructor  __attribute__((destructor))
+  #endif
+  static void mi_attr_constructor mi_process_attach(void) {
+    _mi_auto_process_init();
+  }
+  static void mi_attr_destructor mi_process_detach(void) {
+    _mi_auto_process_done();
+  }
+#elif defined(__cplusplus)
+  // C++: use static initialization to detect process start/end
+  // This is not guaranteed to be first/last but the best we can generally do?
+  struct mi_init_done_t {
+    mi_init_done_t() {
+      _mi_auto_process_init();
+    }
+    ~mi_init_done_t() {
+      _mi_auto_process_done();
+    }
+  };
+  static mi_init_done_t mi_init_done;
+ #else
+  #pragma message("define a way to call _mi_auto_process_init/done on your platform")
+#endif
+#endif
+
+// Generic allocator init/done callback
+#ifndef MI_PRIM_HAS_ALLOCATOR_INIT
+bool _mi_is_redirected(void) {
+  return false;
+}
+bool _mi_allocator_init(const char** message) {
+  if (message != NULL) { *message = NULL; }
+  return true;
+}
+void _mi_allocator_done(void) {
+  // nothing to do
+}
+#endif
diff --git a/ext/src/mimalloc/src/prim/readme.md b/ext/src/mimalloc/src/prim/readme.md
new file mode 100644
index 0000000000..380dd3a717
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/readme.md
@@ -0,0 +1,9 @@
+## Portability Primitives
+
+This is the portability layer where all primitives needed from the OS are defined.
+
+- `include/mimalloc/prim.h`: primitive portability API definition.
+- `prim.c`: Selects one of `unix/prim.c`, `wasi/prim.c`, `emscripten/prim.c`, or `windows/prim.c` depending on the host platform
+            (and on macOS, `osx/prim.c` defers to `unix/prim.c`).
+
+Note: this is still a work in progress; there may be places in the sources that still depend on OS ifdef's.
\ No newline at end of file
diff --git a/ext/src/mimalloc/src/prim/unix/prim.c b/ext/src/mimalloc/src/prim/unix/prim.c
new file mode 100644
index 0000000000..6fa03a461f
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/unix/prim.c
@@ -0,0 +1,997 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// This file is included in `src/prim/prim.c`
+
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE   // ensure mmap flags and syscall are defined
+#endif
+
+#if defined(__sun)
+// illumos provides the new mman.h API when any of these are defined;
+// otherwise it provides the old API based on caddr_t, which predates the void pointer one.
+// Stock Solaris provides only the former, so we chose to discard those
+// flags only here rather than project-wide.
+#undef _XOPEN_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"
+
+#include <sys/mman.h>  // mmap
+#include <unistd.h>    // sysconf
+#include <fcntl.h>     // open, close, read, access
+#include <stdlib.h>    // getenv, arc4random_buf
+
+#if defined(__linux__)
+  #include <features.h>
+  #include <sys/prctl.h>    // THP disable, PR_SET_VMA
+  #include <sys/sysinfo.h>  // sysinfo
+  #if defined(__GLIBC__) && !defined(PR_SET_VMA)
+  #include <linux/prctl.h>
+  #endif
+  #if defined(__GLIBC__)
+  #include <linux/mman.h>   // linux mmap flags
+  #else
+  #include <sys/mman.h>
+  #endif
+#elif defined(__APPLE__)
+  #include <AvailabilityMacros.h>
+  #include <TargetConditionals.h>
+  #if !defined(TARGET_OS_OSX) || TARGET_OS_OSX   // see issue #879, used to be (!TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR)
+  #include <mach/vm_statistics.h>    // VM_MAKE_TAG, VM_FLAGS_SUPERPAGE_SIZE_2MB, etc.
+  #endif
+  #if !defined(MAC_OS_X_VERSION_10_7)
+  #define MAC_OS_X_VERSION_10_7   1070
+  #endif
+  #include <sys/sysctl.h>
+#elif defined(__FreeBSD__) || defined(__DragonFly__)
+  #include <sys/param.h>
+  #if __FreeBSD_version >= 1200000
+  #include <sys/cpuset.h>
+  #include <sys/domainset.h>
+  #endif
+  #include <sys/sysctl.h>
+#endif
+
+#if (defined(__linux__) && !defined(__ANDROID__)) || defined(__FreeBSD__)
+  #define MI_HAS_SYSCALL_H
+  #include <sys/syscall.h>
+#endif
+
+#if !defined(MADV_DONTNEED) && defined(POSIX_MADV_DONTNEED)  // QNX
+#define MADV_DONTNEED  POSIX_MADV_DONTNEED
+#endif
+#if !defined(MADV_FREE) && defined(POSIX_MADV_FREE)  // QNX
+#define MADV_FREE  POSIX_MADV_FREE
+#endif
+
+#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this?
+
+//------------------------------------------------------------------------------------
+// Use syscalls for some primitives to allow for libraries that override open/read/close etc.
+// and do allocation themselves; using syscalls prevents recursion when mimalloc is
+// still initializing (issue #713)
+// Declare inline to avoid unused function warnings.
+//------------------------------------------------------------------------------------
+
+#if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access)
+// Variant 1: all four syscall numbers are available, so issue the system calls directly through `syscall`.
+static inline int mi_prim_open(const char* fpath, int open_flags) {
+  return syscall(SYS_open,fpath,open_flags,0);  // open `fpath`; the trailing 0 is passed as the third (mode) argument
+}
+static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
+  return syscall(SYS_read,fd,buf,bufsize);      // read at most `bufsize` bytes from `fd` into `buf`
+}
+static inline int mi_prim_close(int fd) {
+  return syscall(SYS_close,fd);                 // close `fd`
+}
+static inline int mi_prim_access(const char *fpath, int mode) {
+  return syscall(SYS_access,fpath,mode);        // check accessibility of `fpath` for `mode`
+}
+
+#else
+// Variant 2: no usable syscall numbers, so forward to the regular `open`/`read`/`close`/`access` functions.
+static inline int mi_prim_open(const char* fpath, int open_flags) {
+  return open(fpath,open_flags);
+}
+static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
+  return read(fd,buf,bufsize);
+}
+static inline int mi_prim_close(int fd) {
+  return close(fd);
+}
+static inline int mi_prim_access(const char *fpath, int mode) {
+  return access(fpath,mode);
+}
+
+#endif
+
+
+
+//---------------------------------------------
+// init
+//---------------------------------------------
+
+static bool unix_detect_overcommit(void) {
+  bool overcommits = true;  // assumed unless the OS setting says otherwise
+  #if defined(__linux__)
+    const int setting_fd = mi_prim_open("/proc/sys/vm/overcommit_memory", O_RDONLY);
+    if (setting_fd >= 0) {
+      char mode[32];
+      const ssize_t got = mi_prim_read(setting_fd, &mode, sizeof(mode));
+      mi_prim_close(setting_fd);
+      // <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
+      // the first character is the mode: 0 = heuristic overcommit, 1 = always overcommit, 2 = never overcommit (ignore NORESERVE)
+      if (got > 0) {
+        overcommits = !(mode[0] != '0' && mode[0] != '1');
+      }
+    }
+  #elif defined(__FreeBSD__)
+    int setting = 0;
+    size_t setting_size = sizeof(setting);
+    if (sysctlbyname("vm.overcommit", &setting, &setting_size, NULL, 0) == 0) {
+      overcommits = (setting != 0);
+    }
+  #else
+    // other platforms: keep the default (overcommit)
+  #endif
+  return overcommits;
+}
+
+static bool unix_detect_thp(void) {  // is transparent huge page (THP) support enabled in the kernel? (probed on Linux only)
+  bool thp_enabled = false;          // stays false on other platforms or when the setting cannot be read
+  #if defined(__linux__)
+  int fd = mi_prim_open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
+  if (fd >= 0) {
+    char buf[32];
+    ssize_t nread = mi_prim_read(fd, &buf, sizeof(buf));
+    mi_prim_close(fd);
+    // <https://www.kernel.org/doc/html/latest/admin-guide/mm/transhuge.html>
+    // between brackets is the current value, for example: always [madvise] never
+    if (nread >= 1) {
+      thp_enabled = (_mi_strnstr(buf,(size_t)nread,"[never]") == NULL);  // search only the bytes read: `buf` is uninitialized and not zero-terminated
+    }
+  }
+  #endif
+  return thp_enabled;
+}
+
+// try to detect the physical memory dynamically (if possible); `*physical_memory_in_kib` is only written on success
+static void unix_detect_physical_memory( size_t page_size, size_t* physical_memory_in_kib ) {
+  #if defined(CTL_HW) && (defined(HW_PHYSMEM64) || defined(HW_MEMSIZE))  // freeBSD, macOS
+    MI_UNUSED(page_size);
+    int64_t physical_memory = 0;
+    size_t length = sizeof(int64_t);
+    #if defined(HW_PHYSMEM64)
+    int mib[2] = { CTL_HW, HW_PHYSMEM64 };
+    #else
+    int mib[2] = { CTL_HW, HW_MEMSIZE };
+    #endif
+    const int err = sysctl(mib, 2, &physical_memory, &length, NULL, 0);
+    if (err==0 && physical_memory > 0) {
+      const int64_t phys_in_kib = physical_memory / MI_KiB;
+      if (phys_in_kib > 0 && (uint64_t)phys_in_kib <= SIZE_MAX) {
+        *physical_memory_in_kib = (size_t)phys_in_kib;
+      }
+    }
+  #elif defined(__linux__)
+    MI_UNUSED(page_size); struct sysinfo info; _mi_memzero_var(info);
+    if (sysinfo(&info)==0 && info.totalram > 0) {  // `totalram` is counted in units of `mem_unit` bytes (see sysinfo(2))
+      const uint64_t unit = (info.mem_unit > 0 ? (uint64_t)info.mem_unit : 1), total = (uint64_t)info.totalram;
+      const uint64_t kib = ((total / MI_KiB) * unit) + (((total % MI_KiB) * unit) / MI_KiB);  // floor(total*unit / KiB) without forming total*unit
+      if (kib <= SIZE_MAX) { *physical_memory_in_kib = (size_t)kib; }
+    }
+  #elif defined(_SC_PHYS_PAGES)  // do not use by default as it might cause allocation (by using `fopen` to parse /proc/meminfo) (issue #1100)
+    const long pphys = sysconf(_SC_PHYS_PAGES);
+    const size_t psize_in_kib = page_size / MI_KiB;
+    if (psize_in_kib > 0 && pphys > 0 && (unsigned long)pphys <= SIZE_MAX && (size_t)pphys <= (SIZE_MAX/psize_in_kib)) {
+      *physical_memory_in_kib = (size_t)pphys * psize_in_kib;
+    }
+  #endif
+}
+
+void _mi_prim_mem_init( mi_os_mem_config_t* config )  // fill in the OS memory configuration for unix-like systems
+{
+  long psize = sysconf(_SC_PAGESIZE);
+  if (psize > 0 && (unsigned long)psize < SIZE_MAX) {  // otherwise the page size fields of `config` are left untouched
+    config->page_size = (size_t)psize;
+    config->alloc_granularity = (size_t)psize;
+    unix_detect_physical_memory(config->page_size, &config->physical_memory_in_kib);
+  }
+  config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE;
+  config->has_overcommit = unix_detect_overcommit();
+  config->has_partial_free = true;    // mmap can free in parts
+  config->has_virtual_reserve = true; // todo: check if this true for NetBSD?  (for anonymous mmap with PROT_NONE)
+  config->has_transparent_huge_pages = unix_detect_thp();
+
+  // disable transparent huge pages for this process?
+  #if (defined(__linux__) || defined(__ANDROID__)) && defined(PR_GET_THP_DISABLE)
+  #if defined(MI_NO_THP)
+  if (true)
+  #else
+  if (!mi_option_is_enabled(mi_option_allow_thp)) // disable THP if requested through an option
+  #endif
+  {
+    config->has_transparent_huge_pages = false;
+    // PR_GET_THP_DISABLE returns the flag as the result of the call and requires arg2..arg5 to be zero (see prctl(2)).
+    if (prctl(PR_GET_THP_DISABLE, 0UL, 0UL, 0UL, 0UL) != 1) {
+      // Most likely since distros often come with always/madvise settings.
+      // Disabling only for the mimalloc process rather than touching system wide settings;
+      // PR_SET_THP_DISABLE takes the new flag value directly in arg2 (non-zero sets the flag).
+      (void)prctl(PR_SET_THP_DISABLE, 1UL, 0UL, 0UL, 0UL);
+    }
+  }
+  #endif
+}
+
+
+//---------------------------------------------
+// free
+//---------------------------------------------
+
+int _mi_prim_free(void* addr, size_t size ) {
+  if (size == 0) { return 0; }                 // an empty range needs no unmapping
+  if (munmap(addr, size) != -1) { return 0; }  // unmapped successfully
+  return errno;                                // failure: hand back the reason
+}
+
+
+//---------------------------------------------
+// mmap
+//---------------------------------------------
+
+static int unix_madvise(void* addr, size_t size, int advice) {
+  // Pass `advice` for the range `addr`..`addr+size` to the OS; returns 0 on success and an error number otherwise.
+  #if defined(__sun)
+  return (madvise((caddr_t)addr, size, advice) == 0 ? 0 : errno);  // Solaris needs cast (issue #520)
+  #elif defined(__QNX__)
+  return posix_madvise(addr, size, advice);  // posix_madvise returns the error number itself and does not set `errno`
+  #else
+  return (madvise(addr, size, advice) == 0 ? 0 : errno);
+  #endif
+}
+
+static void* unix_mmap_prim(void* addr, size_t size, int protect_flags, int flags, int fd) {
+  void* const mapped = mmap(addr, size, protect_flags, flags, fd, 0 /* offset */);
+  #if defined(__linux__) && defined(PR_SET_VMA)
+  if (mapped != NULL && mapped != MAP_FAILED) {  // name successful mappings "mimalloc" (the result of the call is ignored)
+    prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, mapped, size, "mimalloc");
+  }
+  #endif
+  return mapped;  // the raw `mmap` result, i.e. possibly MAP_FAILED
+}
+
+static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {  // mmap that tries (without guarantee) to honor `try_alignment`; returns NULL on failure
+  MI_UNUSED(try_alignment);  // not every platform branch below uses the alignment
+  void* p = NULL;
+  #if defined(MAP_ALIGNED)  // BSD
+  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+    size_t n = 0;
+    mi_bsr(try_alignment, &n);  // presumably the index of the highest set bit (checked on the next line) -- TODO confirm
+    if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) {  // alignment is a power of 2 and 4096 <= alignment <= 1GiB
+      p = unix_mmap_prim(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd);
+      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
+        int err = errno;
+        _mi_trace_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr);
+      }
+      if (p!=MAP_FAILED) return p;  // note: also returned when the mapping is not aligned (only traced above)
+      // fall back to regular mmap
+    }
+  }
+  #elif defined(MAP_ALIGN)  // Solaris
+  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+    p = unix_mmap_prim((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd);  // addr parameter is the required alignment
+    if (p!=MAP_FAILED) return p;
+    // fall back to regular mmap
+  }
+  #endif
+  #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
+  // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations
+  if (addr == NULL) {
+    void* hint = _mi_os_get_aligned_hint(try_alignment, size);  // NULL means no hint is available
+    if (hint != NULL) {
+      p = unix_mmap_prim(hint, size, protect_flags, flags, fd);
+      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
+        #if MI_TRACK_ENABLED  // asan sometimes does not instrument errno correctly?
+        int err = 0;
+        #else
+        int err = errno;
+        #endif
+        _mi_trace_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint);
+      }
+      if (p!=MAP_FAILED) return p;  // note: also returned when the mapping is not aligned (only traced above)
+      // fall back to regular mmap
+    }
+  }
+  #endif
+  // regular mmap
+  p = unix_mmap_prim(addr, size, protect_flags, flags, fd);
+  if (p!=MAP_FAILED) return p;
+  // failed to allocate
+  return NULL;  // MAP_FAILED is translated to NULL for the callers
+}
+
+static int unix_mmap_fd(void) {
+  #if !defined(VM_MAKE_TAG)
+  return -1;  // no tagging available: use -1 as the file descriptor
+  #else
+  // macOS: tracking anonymous page with a specific ID, handed out as the fd value. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
+  const int requested = (int)mi_option_get(mi_option_os_tag);
+  const int tag = ((requested >= 100 && requested <= 255) ? requested : 254);  // out-of-range requests fall back to 254
+  return VM_MAKE_TAG(tag);
+  #endif
+}
+
+static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {  // map anonymous memory, preferring large/huge OS pages when allowed; returns NULL on failure
+  #if !defined(MAP_ANONYMOUS)
+  #define MAP_ANONYMOUS  MAP_ANON
+  #endif
+  #if !defined(MAP_NORESERVE)
+  #define MAP_NORESERVE  0
+  #endif
+  void* p = NULL;
+  const int fd = unix_mmap_fd();
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+  if (_mi_os_has_overcommit()) {
+    flags |= MAP_NORESERVE;
+  }
+  #if defined(PROT_MAX)
+  protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
+  #endif
+  // huge page allocation
+  if (allow_large && (large_only || (_mi_os_canuse_large_page(size, try_alignment) && mi_option_is_enabled(mi_option_allow_large_os_pages)))) {
+    static _Atomic(size_t) large_page_try_ok; // = 0;
+    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+    if (!large_only && try_ok > 0) {
+      // If the OS is not configured for large OS pages, or the user does not have
+      // enough permission, the `mmap` will always fail (but it might also fail for other reasons).
+      // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
+      // to avoid too many failing calls to mmap.
+      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);  // count one skipped attempt; the result of the exchange is not checked
+    }
+    else {
+      int lflags = flags & ~MAP_NORESERVE;  // using NORESERVE on huge pages seems to fail on Linux
+      int lfd = fd;
+      #ifdef MAP_ALIGNED_SUPER
+      lflags |= MAP_ALIGNED_SUPER;
+      #endif
+      #ifdef MAP_HUGETLB
+      lflags |= MAP_HUGETLB;
+      #endif
+      #ifdef MAP_HUGE_1GB
+      static bool mi_huge_pages_available = true;  // cleared below once a 1GiB page request has failed
+      if (large_only && (size % MI_GiB) == 0 && mi_huge_pages_available) {
+        lflags |= MAP_HUGE_1GB;
+      }
+      else
+      #endif
+      {
+        #ifdef MAP_HUGE_2MB
+        lflags |= MAP_HUGE_2MB;
+        #endif
+      }
+      #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB
+      lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
+      #endif
+      if (large_only || lflags != flags) {  // skipped when no large page flag could be added (unless large pages are mandatory)
+        // try large OS page allocation
+        *is_large = true;
+        p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, lflags, lfd);
+        #ifdef MAP_HUGE_1GB
+        if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) {
+          mi_huge_pages_available = false; // don't try huge 1GiB pages again
+          if (large_only) {
+            _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno);
+          }
+          lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
+          p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, lflags, lfd);
+        }
+        #endif
+        if (large_only) return p;  // no fall back to regular pages: `p` may be NULL here
+        if (p == NULL) {
+          mi_atomic_store_release(&large_page_try_ok, (size_t)8);  // on error, don't try again for the next N allocations
+        }
+      }
+    }
+  }
+  // regular allocation
+  if (p == NULL) {
+    *is_large = false;
+    p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, flags, fd);
+    #if !defined(MI_NO_THP)
+    if (p != NULL && allow_large && mi_option_is_enabled(mi_option_allow_thp) && _mi_os_canuse_large_page(size, try_alignment)) {
+      #if defined(MADV_HUGEPAGE)
+      // Many Linux systems don't allow MAP_HUGETLB but they support instead
+      // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE
+      // though since properly aligned allocations will already use large pages if available
+      // in that case -- in particular for our large regions (in `memory.c`).
+      // However, some systems only allow THP if called with explicit `madvise`, so
+      // when large OS pages are enabled for mimalloc, we call `madvise` anyways.
+      if (unix_madvise(p, size, MADV_HUGEPAGE) == 0) {
+        // *is_large = true; // possibly
+      };
+      #elif defined(__sun)
+      struct memcntl_mha cmd = {0};
+      cmd.mha_pagesize = _mi_os_large_page_size();
+      cmd.mha_cmd = MHA_MAPSIZE_VA;
+      if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
+        // *is_large = true; // possibly
+      }
+      #endif
+    }
+    #endif
+  }
+  return p;
+}
+
+// Allocate OS memory through mmap. Note: `try_alignment` is only a hint, so the pointer stored in `*addr` may not be aligned to it.
+int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  mi_assert_internal(commit || !allow_large);
+  mi_assert_internal(try_alignment > 0);
+  if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE) {  // larger allocations without a hint: prefer large page alignment
+    if (try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) { try_alignment = MI_UNIX_LARGE_PAGE_SIZE; }
+  }
+  const int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);  // uncommitted memory is mapped without access
+  *is_zero = true;
+  void* const mapped = unix_mmap(hint_addr, size, try_alignment, protect_flags, false, allow_large, is_large);
+  *addr = mapped;
+  return (mapped == NULL ? errno : 0);
+}
+
+
+//---------------------------------------------
+// Commit/Reset
+//---------------------------------------------
+
+static void unix_mprotect_hint(int err) {  // print a hint for a failed `mprotect`; a no-op unless on Linux with MI_SECURE>=2
+  #if defined(__linux__) && (MI_SECURE>=2) // guard page around every mimalloc page
+  if (err == ENOMEM) {  // only this error is explained
+    _mi_warning_message("The next warning may be caused by a low memory map limit.\n"
+                        "  On Linux this is controlled by the vm.max_map_count -- maybe increase it?\n"
+                        "  For example: sudo sysctl -w vm.max_map_count=262144\n");
+  }
+  #else
+  MI_UNUSED(err);
+  #endif
+}
+
+
+
+
+
+int _mi_prim_commit(void* start, size_t size, bool* is_zero) {
+  // commit: ensure we can access the area; returns 0 on success or the `mprotect` error number.
+  // note: we may think that *is_zero can be true since the memory
+  // was either from mmap PROT_NONE, or from decommit MADV_DONTNEED, but
+  // we sometimes call commit on a range with still partially committed
+  // memory and `mprotect` does not zero the range.
+  *is_zero = false;
+  int err = mprotect(start, size, (PROT_READ | PROT_WRITE));
+  if (err != 0) {
+    err = errno;
+    unix_mprotect_hint(err);
+  }
+
+  #if defined(__APPLE__) && defined(MADV_FREE_REUSE)  // same guard as in `_mi_prim_reuse`: the advice value may not be defined
+    // MADV_FREE_REUSABLE is paired with MADV_FREE_REUSE for accounting
+    // if this memory was not marked as MADV_FREE_REUSABLE, this call is noop
+    madvise(start, size, MADV_FREE_REUSE);  // best effort: the result does not affect the returned error
+  #endif
+  return err;
+}
+
+int _mi_prim_reuse(void* start, size_t size) {
+  #if defined(__APPLE__) && defined(MADV_FREE_REUSE)
+  return unix_madvise(start, size, MADV_FREE_REUSE);  // macOS: advise the range with MADV_FREE_REUSE
+  #else
+  MI_UNUSED(start); MI_UNUSED(size); return 0;        // elsewhere there is nothing to do
+  #endif
+}
+
+int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {  // returns 0 or the error number of the `madvise` call
+  int err = 0;
+  #if defined(__APPLE__) && defined(MADV_FREE_REUSABLE)
+    // decommit on macOS: use MADV_FREE_REUSABLE as it does immediate rss accounting (issue #1097)
+    err = unix_madvise(start, size, MADV_FREE_REUSABLE);
+    if (err) { err = unix_madvise(start, size, MADV_DONTNEED); }  // fall back to MADV_DONTNEED if that failed
+  #else
+    // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
+    err = unix_madvise(start, size, MADV_DONTNEED);
+  #endif
+  #if !MI_DEBUG && MI_SECURE<=2
+    *needs_recommit = false;  // the range stays accessible, so no commit is needed before reuse
+  #else
+    *needs_recommit = true;   // debug or high-security builds additionally remove all access to the range
+    mprotect(start, size, PROT_NONE);  // the result of this call is not checked
+  #endif
+  /*
+  // decommit: use mmap with MAP_FIXED and PROT_NONE to discard the existing memory (and reduce rss)
+  *needs_recommit = true;
+  const int fd = unix_mmap_fd();
+  void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
+  if (p != start) { err = errno; }
+  */
+  return err;
+}
+
+int _mi_prim_reset(void* start, size_t size) {  // advise the OS that the contents of the range are no longer needed; returns 0 or an error number
+  int err = 0;
+
+  // on macOS can use MADV_FREE_REUSABLE (but we disable this for now as it seems slower)
+  #if 0 && defined(__APPLE__) && defined(MADV_FREE_REUSABLE)
+  err = unix_madvise(start, size, MADV_FREE_REUSABLE);
+  if (err==0) return 0;
+  // fall through
+  #endif
+
+  #if defined(MADV_FREE)
+  // Otherwise, we try to use `MADV_FREE` as that is the fastest. A drawback though is that it
+  // will not reduce the `rss` stats in tools like `top` even though the memory is available
+  // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by
+  // default `MADV_DONTNEED` is used though.
+  static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);  // the advice to use; switched to MADV_DONTNEED below when MADV_FREE is rejected
+  int oadvice = (int)mi_atomic_load_relaxed(&advice);
+  while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0;  };  // retry for as long as the call fails with EAGAIN
+  if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
+    // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
+    mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED);
+    err = unix_madvise(start, size, MADV_DONTNEED);
+  }
+  #else
+  err = unix_madvise(start, size, MADV_DONTNEED);
+  #endif
+  return err;
+}
+
+int _mi_prim_protect(void* start, size_t size, bool protect) {
+  const int access = (protect ? PROT_NONE : (PROT_READ | PROT_WRITE));  // protecting removes all access
+  const int err = (mprotect(start, size, access) == 0 ? 0 : errno);
+  unix_mprotect_hint(err);  // called on success as well, where `err` is 0
+  return err;
+}
+
+
+
+//---------------------------------------------
+// Huge page allocation
+//---------------------------------------------
+
+#if (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__) && !defined(__CYGWIN__)
+
+#ifndef MPOL_PREFERRED
+#define MPOL_PREFERRED 1
+#endif
+
+#if defined(MI_HAS_SYSCALL_H) && defined(SYS_mbind)  // issue the `mbind` system call directly when its number is known
+static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);  // arguments are forwarded unchanged
+}
+#else  // otherwise binding is silently skipped
+static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags);
+  return 0;  // reports success without doing anything
+}
+#endif
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {  // returns 0 on success, else an error number
+  bool is_large = true;
+  *is_zero = true;
+  *addr = unix_mmap(hint_addr, size, MI_ARENA_SLICE_ALIGN, PROT_READ | PROT_WRITE, true, true, &is_large);  // large pages only
+  if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
+    unsigned long numa_mask = (1UL << numa_node);
+    // TODO: does `mbind` work correctly for huge OS pages? should we
+    // use `set_mempolicy` before calling mmap instead?
+    // see: <https://lkml.org/lkml/2017/2/9/875>
+    const long res = mi_prim_mbind(*addr, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
+    if (res != 0) {  // a failed bind is only reported; the allocation itself is kept
+      const int err = errno;  // an `int`, so that the argument matches the `%d` and `%x` conversions below
+      _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d (error: %d (0x%x))\n", numa_node, err, err);
+    }
+  }
+  return (*addr != NULL ? 0 : errno);
+}
+
+#else
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
+  MI_UNUSED(numa_node); MI_UNUSED(size); MI_UNUSED(hint_addr);
+  *addr = NULL;      // nothing is allocated in this configuration
+  *is_zero = false;
+  return ENOMEM;     // always reported as out-of-memory
+}
+
+#endif
+
+//---------------------------------------------
+// NUMA nodes
+//---------------------------------------------
+
+#if defined(__linux__)
+
+size_t _mi_prim_numa_node(void) {  // NUMA node of the calling thread; 0 if it cannot be determined
+  #if defined(MI_HAS_SYSCALL_H) && defined(SYS_getcpu)
+    // `getcpu` stores `unsigned int` values (see getcpu(2)); with wider variables the result is wrong on big-endian 64-bit targets
+    unsigned int node = 0;
+    unsigned int ncpu = 0;
+    long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
+    return (err != 0 ? 0 : (size_t)node);
+  #else
+    return 0;
+  #endif
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  char path[128];
+  unsigned count = 1;  // node0 is never probed: counting starts at one
+  for (; count <= 256; count++) {
+    // probe the entry of the next node -- todo: is there a more efficient way to do this? (but ensure there is no allocation)
+    _mi_snprintf(path, 127, "/sys/devices/system/node/node%u", count);
+    if (mi_prim_access(path,R_OK) != 0) break;  // first missing entry ends the count
+  }
+  return count;
+}
+
+#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000
+
+size_t _mi_prim_numa_node(void) {  // FreeBSD: first memory domain set in the domain set queried below; 0 on error or if none is set
+  domainset_t dom;
+  size_t node;
+  int policy;  // filled in by the call but not used further
+  if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul;
+  for (node = 0; node < MAXMEMDOM; node++) {
+    if (DOMAINSET_ISSET(node, &dom)) return node;
+  }
+  return 0ul;
+}
+
+size_t _mi_prim_numa_node_count(void) {  // FreeBSD: value of the `vm.ndomains` sysctl
+  size_t ndomains = 0;
+  size_t len = sizeof(ndomains);
+  if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul;  // note: 0 (not 1) is returned when the query fails
+  return ndomains;
+}
+
+#elif defined(__DragonFly__)
+
+size_t _mi_prim_numa_node(void) {  // DragonFly: always node 0
+  // TODO: DragonFly does not seem to provide any userland means to get this information.
+  return 0ul;
+}
+
+size_t _mi_prim_numa_node_count(void) {  // DragonFly: product of the two sysctl values read below; 0 if either query fails
+  size_t ncpus = 0, nvirtcoresperphys = 0;
+  size_t len = sizeof(size_t);  // NOTE(review): `len` is reused for the second query after the first call may have updated it -- confirm
+  if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul;
+  if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul;
+  return nvirtcoresperphys * ncpus;
+}
+
+#else
+
+size_t _mi_prim_numa_node(void) {  // other platforms: always node 0
+  return 0;
+}
+
+size_t _mi_prim_numa_node_count(void) {  // other platforms: a single node
+  return 1;
+}
+
+#endif
+
+// ----------------------------------------------------------------
+// Clock
+// ----------------------------------------------------------------
+
+#include <time.h>
+
+#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
+
+mi_msecs_t _mi_prim_clock_now(void) {
+  #ifdef CLOCK_MONOTONIC
+  const clockid_t clock_id = CLOCK_MONOTONIC;  // used whenever it is defined
+  #else
+  const clockid_t clock_id = CLOCK_REALTIME;
+  #endif
+  struct timespec now; clock_gettime(clock_id, &now);
+  return ((mi_msecs_t)now.tv_nsec / 1000000) + ((mi_msecs_t)now.tv_sec * 1000);  // nanoseconds and seconds, both converted to milliseconds
+}
+
+#else
+
+// low resolution timer: based on `clock()`, scaled to milliseconds using CLOCKS_PER_SEC
+mi_msecs_t _mi_prim_clock_now(void) {
+  #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0)
+  return (mi_msecs_t)clock();  // ticks are used as-is
+  #elif (CLOCKS_PER_SEC < 1000)
+  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);  // fewer than 1000 ticks per second: scale up
+  #else
+  return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000);  // more than 1000 ticks per second: scale down
+  #endif
+}
+
+#endif
+
+
+
+
+//----------------------------------------------------------------
+// Process info
+//----------------------------------------------------------------
+
+#if defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__)
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/resource.h>
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <kernel/OS.h>
+#endif
+
+static mi_msecs_t timeval_secs(const struct timeval* tv) {  // note: despite the name, the result is in milliseconds
+  return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L);  // seconds*1000 + microseconds/1000
+}
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)  // fill in times, page faults and rss where the platform provides them
+{
+  struct rusage rusage;
+  getrusage(RUSAGE_SELF, &rusage);
+  pinfo->utime = timeval_secs(&rusage.ru_utime);  // in milliseconds
+  pinfo->stime = timeval_secs(&rusage.ru_stime);  // in milliseconds
+#if !defined(__HAIKU__)
+  pinfo->page_faults = rusage.ru_majflt;
+#endif
+#if defined(__HAIKU__)
+  // Haiku does not have (yet?) a way to
+  // get these stats per process
+  thread_info tid;
+  area_info mem;
+  ssize_t c = 0;  // iteration cookie: must be 0 before the first `get_next_area_info` call
+  get_thread_info(find_thread(0), &tid);
+  while (get_next_area_info(tid.team, &c, &mem) == B_OK) {
+    pinfo->peak_rss += mem.ram_size;
+  }
+  pinfo->page_faults = 0;
+#elif defined(__APPLE__)
+  pinfo->peak_rss = rusage.ru_maxrss;         // macos reports in bytes
+  #ifdef MACH_TASK_BASIC_INFO
+  struct mach_task_basic_info info;
+  mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+  if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
+    pinfo->current_rss = (size_t)info.resident_size;
+  }
+  #else
+  struct task_basic_info info;
+  mach_msg_type_number_t infoCount = TASK_BASIC_INFO_COUNT;
+  if (task_info(mach_task_self(), TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
+    pinfo->current_rss = (size_t)info.resident_size;
+  }
+  #endif
+#else
+  pinfo->peak_rss = (size_t)rusage.ru_maxrss * 1024;  // Linux/BSD report in KiB (widen first: a 32-bit `long` product can overflow)
+#endif
+  // use defaults for commit
+}
+
+#else
+
+#ifndef __wasi__
+// WebAssembly instances are not processes
+#pragma message("define a way to get process info")
+#endif
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)  // fallback when no way to query process info is known
+{
+  // use defaults: `pinfo` is left exactly as the caller passed it
+  MI_UNUSED(pinfo);
+}
+
+#endif
+
+
+//----------------------------------------------------------------
+// Output
+//----------------------------------------------------------------
+
+void _mi_prim_out_stderr( const char* msg ) {  // write `msg` to standard error
+  fputs(msg,stderr);  // no newline is appended and the result is not checked
+}
+
+
+//----------------------------------------------------------------
+// Environment
+//----------------------------------------------------------------
+
+#if !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0)
+// On Posix systems use `environ` to access environment variables
+// even before the C runtime is initialized.
+#if defined(__APPLE__) && defined(__has_include) && __has_include(<crt_externs.h>)
+#include <crt_externs.h>
+static char** mi_get_environ(void) {  // Apple: obtain the environment array through `_NSGetEnviron`
+  return (*_NSGetEnviron());
+}
+#else
+extern char** environ;
+static char** mi_get_environ(void) {  // elsewhere: the global `environ` array
+  return environ;
+}
+#endif
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
+  // a missing or empty name never matches
+  if (name == NULL) { return false; }
+  const size_t name_len = _mi_strlen(name);
+  if (name_len == 0) { return false; }
+  char** const entries = mi_get_environ();
+  if (entries == NULL) { return false; }
+  for (int idx = 0; idx < 10000; idx++) {  // look at no more than 10000 entries
+    const char* entry = entries[idx];
+    if (entry == NULL) { break; }  // end of the environment
+    // an entry matches when it reads `<name>=...`; the name is compared case insensitively
+    if (_mi_strnicmp(name, entry, name_len) != 0 || entry[name_len] != '=') { continue; }
+    _mi_strlcpy(result, entry + name_len + 1, result_size);  // copy what follows the '='
+    return true;
+  }
+  return false;
+}
+#else
+// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {  // returns true only when a value was found and fits in `result`
+  // cannot call getenv() when still initializing the C runtime.
+  if (_mi_preloading()) return false;
+  const char* s = getenv(name);
+  if (s == NULL) {
+    // we check the upper case name too.
+    char buf[64+1];
+    size_t len = _mi_strnlen(name,sizeof(buf)-1);  // names longer than 64 characters are cut off for this second lookup
+    for (size_t i = 0; i < len; i++) {
+      buf[i] = _mi_toupper(name[i]);
+    }
+    buf[len] = 0;
+    s = getenv(buf);
+  }
+  if (s == NULL || _mi_strnlen(s,result_size) >= result_size)  return false;  // not found, or the value does not fit
+  _mi_strlcpy(result, s, result_size);
+  return true;
+}
+#endif  // !MI_USE_ENVIRON
+
+
+//----------------------------------------------------------------
+// Random
+//----------------------------------------------------------------
+
+#if defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_15) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_15)
+#include <CommonCrypto/CommonCryptoError.h>
+#include <CommonCrypto/CommonRandom.h>
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {  // fill `buf` with `buf_len` random bytes; false on failure
+  // We prefer CCRandomGenerateBytes as it returns an error code while arc4random_buf
+  // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
+  return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
+}
+
+#elif defined(__ANDROID__) || defined(__DragonFly__) || \
+      defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+      defined(__sun) || \
+      (defined(__APPLE__) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7))
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {  // fill `buf` with `buf_len` random bytes
+  arc4random_buf(buf, buf_len);  // has no return value, so no failure can be detected here
+  return true;
+}
+
+#elif defined(__APPLE__) || defined(__linux__) || defined(__HAIKU__)   // also for old apple versions < 10.7 (issue #829)
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {  // fill `buf` with `buf_len` random bytes; false if that was not (fully) possible
+  // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h`
+  // and for the latter the actual `getrandom` call is not always defined.
+  // (see <https://stackoverflow.com/questions/45237324/why-doesnt-getrandom-compile>)
+  // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed.
+  #if defined(MI_HAS_SYSCALL_H) && defined(SYS_getrandom)
+    #ifndef GRND_NONBLOCK
+    #define GRND_NONBLOCK (1)
+    #endif
+    static _Atomic(uintptr_t) no_getrandom; // = 0
+    if (mi_atomic_load_acquire(&no_getrandom)==0) {
+      ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
+      if (ret >= 0) return (buf_len == (size_t)ret);  // a short result counts as failure
+      if (errno != ENOSYS) return false;
+      mi_atomic_store_release(&no_getrandom, (uintptr_t)1); // don't call again, and fall back to /dev/urandom
+    }
+  #endif
+  int flags = O_RDONLY;
+  #if defined(O_CLOEXEC)
+  flags |= O_CLOEXEC;
+  #endif
+  int fd = mi_prim_open("/dev/urandom", flags);
+  if (fd < 0) return false;
+  size_t count = 0;
+  while(count < buf_len) {
+    ssize_t ret = mi_prim_read(fd, (char*)buf + count, buf_len - count);
+    if (ret<=0) {
+      if (ret==0 || (errno!=EAGAIN && errno!=EINTR)) break;  // `errno` is only meaningful for ret < 0; stop on end-of-file instead of retrying on a stale value
+    }
+    else {
+      count += ret;
+    }
+  }
+  mi_prim_close(fd);
+  return (count==buf_len);
+}
+
+#else
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  MI_UNUSED(buf); MI_UNUSED(buf_len); return false;  // no random source is known for this platform: always report failure
+}
+
+#endif
+
+
+//----------------------------------------------------------------
+// Thread init/done
+//----------------------------------------------------------------
+
+#if defined(MI_USE_PTHREADS)
+
+// use pthread local storage keys to detect thread ending
+// (and used with MI_TLS_PTHREADS for the default theap)
+pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
+
+static void mi_pthread_done(void* value) {
+  // registered as the destructor of `_mi_heap_default_key`; `value` is what was stored under that key
+  if (value == NULL) { return; }
+  _mi_thread_done((mi_theap_t*)value);
+}
+
+void _mi_prim_thread_init_auto_done(void) {  // create the key whose destructor (`mi_pthread_done`) detects thread ending
+  mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));  // the key is expected not to exist yet
+  pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);      // the result of the call is not checked
+}
+
+void _mi_prim_thread_done_auto_done(void) {  // delete the key created by `_mi_prim_thread_init_auto_done`
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // do not leak the key, see issue #809
+    pthread_key_delete(_mi_heap_default_key);  // note: the variable itself is not reset to -1 here
+  }
+}
+
+void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) {  // store `theap` under the key for the calling thread
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // can happen during recursive invocation on freeBSD
+    pthread_setspecific(_mi_heap_default_key, theap);  // this value is what `mi_pthread_done` receives later
+  }
+}
+
+#else
+
+void _mi_prim_thread_init_auto_done(void) {  // variant used when MI_USE_PTHREADS is not defined
+  // nothing
+}
+
+void _mi_prim_thread_done_auto_done(void) {  // variant used when MI_USE_PTHREADS is not defined
+  // nothing
+}
+
+void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) {  // variant used when MI_USE_PTHREADS is not defined: `theap` is ignored
+  MI_UNUSED(theap);
+}
+
+#endif
+
+bool _mi_prim_thread_is_in_threadpool(void) {  // the unix layer never reports a thread as a thread pool thread
+  return false;
+}
diff --git a/ext/src/mimalloc/src/prim/wasi/prim.c b/ext/src/mimalloc/src/prim/wasi/prim.c
new file mode 100644
index 0000000000..4e6270ddea
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/wasi/prim.c
@@ -0,0 +1,288 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// This file is included in `src/prim/prim.c`
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"
+
+#include <stdio.h>   // fputs
+#include <stdlib.h>  // getenv
+
+//---------------------------------------------
+// Initialize
+//---------------------------------------------
+
+void _mi_prim_mem_init( mi_os_mem_config_t* config ) {
+  config->has_virtual_reserve = false;
+  config->has_partial_free = false;
+  config->has_overcommit = false;
+  config->alloc_granularity = 16;
+  config->page_size = 64*MI_KiB;  // the WebAssembly page size is fixed at 64KiB
+}
+
+//---------------------------------------------
+// Free
+//---------------------------------------------
+
+int _mi_prim_free(void* addr, size_t size ) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  // the wasi heap cannot be shrunk, so freeing is a no-op that reports success
+  return 0;
+}
+
+
+//---------------------------------------------
+// Allocation: sbrk or memory_grow
+//---------------------------------------------
+
+#if defined(MI_USE_SBRK)
+  #include <unistd.h>  // for sbrk
+
+  static void* mi_memory_grow( size_t size ) {
+    void* p = sbrk(size);               // previous program break, i.e. the start of the newly added region
+    if (p == (void*)(-1)) return NULL;  // sbrk reports failure as (void*)-1
+    #if !defined(__wasi__) // on wasi this is always zero initialized already (?)
+    memset(p,0,size);
+    #endif
+    return p;
+  }
+#elif defined(__wasi__)
+  static void* mi_memory_grow( size_t size ) {
+    size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size()))
+                            : __builtin_wasm_memory_size(0));  // size == 0 only queries the current size
+    if (base == SIZE_MAX) return NULL;          // the grow request failed
+    return (void*)(base * _mi_os_page_size());  // `base` is counted in pages; convert it to a byte address
+  }
+#endif
+
+#if defined(MI_USE_PTHREADS)
+static pthread_mutex_t mi_theap_grow_mutex = PTHREAD_MUTEX_INITIALIZER;  // serializes the `mi_memory_grow` calls below
+#endif
+
+static void* mi_prim_mem_grow(size_t size, size_t try_alignment) {  // returns NULL when no usable memory was obtained
+  void* p = NULL;
+  if (try_alignment <= 1) {  // no alignment requested: a single grow of `size` suffices
+    // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now)
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_lock(&mi_theap_grow_mutex);
+    #endif
+    p = mi_memory_grow(size);
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_unlock(&mi_theap_grow_mutex);
+    #endif
+  }
+  else {  // aligned request: grow by enough to reach the next aligned address plus `size`
+    void* base = NULL;
+    size_t alloc_size = 0;
+    // to allocate aligned use a lock to try to avoid thread interaction
+    // between getting the current size and actual allocation
+    // (also, `sbrk` is not thread safe in general)
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_lock(&mi_theap_grow_mutex);
+    #endif
+    {
+      void* current = mi_memory_grow(0);  // get current size
+      if (current != NULL) {
+        void* aligned_current = mi_align_up_ptr(current, try_alignment);  // and align from there to minimize wasted space
+        alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size());
+        base = mi_memory_grow(alloc_size);
+      }
+    }
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_unlock(&mi_theap_grow_mutex);
+    #endif
+    if (base != NULL) {
+      p = mi_align_up_ptr(base, try_alignment);  // align again from the address that was actually returned
+      if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) {
+        // another thread used wasm_memory_grow/sbrk in-between and we do not have enough
+        // space after alignment. Give up (and waste the space as we cannot shrink :-( )
+        // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align)
+        p = NULL;
+      }
+    }
+  }
+  /*
+  if (p == NULL) {
+    _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment);
+    errno = ENOMEM;
+    return NULL;
+  }
+  */
+  mi_assert_internal( p == NULL || try_alignment == 0 || (uintptr_t)p % try_alignment == 0 );
+  return p;
+}
+
+// `try_alignment` is only a hint: callers must not assume the returned address is aligned.
+int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  MI_UNUSED(hint_addr); MI_UNUSED(commit); MI_UNUSED(allow_large);
+  void* const grown = mi_prim_mem_grow(size, try_alignment);
+  *is_zero = false;
+  *is_large = false;  // neither zeroed nor large memory is ever reported here
+  *addr = grown;
+  return (grown == NULL ? ENOMEM : 0);
+}
+
+
+//---------------------------------------------
+// Commit/Reset/Protect
+//---------------------------------------------
+
+int _mi_prim_commit(void* addr, size_t size, bool* is_zero) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  *is_zero = false;  // the range is not reported as zeroed
+  return 0;          // no work is done; always reports success
+}
+
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  *needs_recommit = false;  // nothing is released here, so no recommit is requested
+  return 0;                 // always reports success
+}
+
+int _mi_prim_reset(void* addr, size_t size) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  return 0;  // no-op on this backend; always reports success
+}
+
+int _mi_prim_reuse(void* addr, size_t size) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  return 0;  // no-op on this backend; always reports success
+}
+
+int _mi_prim_protect(void* addr, size_t size, bool protect) {
+  MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect);
+  return 0;  // protection is not changed; success is reported regardless
+}
+
+
+//---------------------------------------------
+// Huge pages and NUMA nodes
+//---------------------------------------------
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
+  MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node);
+  *is_zero = true;
+  *addr = NULL;    // no memory is ever returned
+  return ENOSYS;   // huge OS page allocation is not implemented by this backend
+}
+
+size_t _mi_prim_numa_node(void) {
+  return 0;  // always node 0
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  return 1;  // a single NUMA node is reported
+}
+
+
+//----------------------------------------------------------------
+// Clock
+//----------------------------------------------------------------
+
+#include <time.h>
+
+#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
+
+mi_msecs_t _mi_prim_clock_now(void) {
+  struct timespec ts;
+  #if defined(CLOCK_MONOTONIC)
+  clock_gettime(CLOCK_MONOTONIC, &ts);  // used in preference to the realtime clock when defined
+  #else
+  clock_gettime(CLOCK_REALTIME, &ts);
+  #endif
+  return ((mi_msecs_t)ts.tv_nsec / 1000000) + ((mi_msecs_t)ts.tv_sec * 1000);  // nanoseconds and seconds, both as ms
+}
+
+#else
+
+// low resolution timer: converts `clock()` ticks to milliseconds
+mi_msecs_t _mi_prim_clock_now(void) {
+  #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0)
+  return (mi_msecs_t)clock();  // ticks are used as milliseconds directly
+  #elif (CLOCKS_PER_SEC < 1000)
+  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);  // fewer than 1000 ticks per second: scale up
+  #else
+  return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000);  // more than 1000 ticks per second: scale down
+  #endif
+}
+
+#endif
+
+
+//----------------------------------------------------------------
+// Process info
+//----------------------------------------------------------------
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)
+{
+  // use defaults: `pinfo` is left exactly as the caller passed it in
+  MI_UNUSED(pinfo);
+}
+
+
+//----------------------------------------------------------------
+// Output
+//----------------------------------------------------------------
+
+void _mi_prim_out_stderr( const char* msg ) {
+  fputs(msg,stderr);  // written as-is: fputs appends no newline
+}
+
+
+//----------------------------------------------------------------
+// Environment
+//----------------------------------------------------------------
+
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
+  // getenv() is off limits while the C runtime is still being initialized
+  if (_mi_preloading()) return false;
+  const char* value = getenv(name);
+  if (value == NULL) {
+    // retry with the name upper-cased (only the first 64 characters are used)
+    char upper[64+1];
+    const size_t n = _mi_strnlen(name,sizeof(upper)-1);
+    for (size_t k = 0; k < n; k++) { upper[k] = _mi_toupper(name[k]); }
+    upper[n] = 0;
+    value = getenv(upper);
+  }
+  // fail when the variable is unset or its value plus terminator does not fit in `result`
+  if (value == NULL) return false;
+  if (_mi_strnlen(value,result_size) >= result_size) return false;
+  _mi_strlcpy(result, value, result_size);
+  return true;
+}
+
+
+//----------------------------------------------------------------
+// Random
+//----------------------------------------------------------------
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  MI_UNUSED(buf); MI_UNUSED(buf_len);  // silence unused-parameter warnings, like the other stubs in this file
+  return false;                        // this backend never writes anything into `buf`
+}
+
+//----------------------------------------------------------------
+// Thread init/done
+//----------------------------------------------------------------
+
+void _mi_prim_thread_init_auto_done(void) {
+  // nothing: this backend registers no thread-exit hook
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing: there is no thread-exit hook to remove
+}
+
+void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) {
+  MI_UNUSED(theap);  // the theap is not recorded anywhere by this backend
+}
+
+bool _mi_prim_thread_is_in_threadpool(void) {
+  return false;  // this backend never reports the current thread as a thread-pool thread
+}
diff --git a/ext/src/mimalloc/src/prim/windows/etw-mimalloc.wprp b/ext/src/mimalloc/src/prim/windows/etw-mimalloc.wprp
new file mode 100644
index 0000000000..b00cd7adf2
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/windows/etw-mimalloc.wprp
@@ -0,0 +1,61 @@
+<WindowsPerformanceRecorder Version="1.0">
+  <Profiles>
+    <SystemCollector Id="WPR_initiated_WprApp_WPR_System_Collector" Name="WPR_initiated_WprApp_WPR System Collector">
+      <BufferSize Value="1024" />
+      <Buffers Value="100" />
+    </SystemCollector>
+    <EventCollector Id="Mimalloc_Collector" Name="Mimalloc Collector">
+      <BufferSize Value="1024" />
+      <Buffers Value="100" />
+    </EventCollector>
+    <SystemProvider Id="WPR_initiated_WprApp_WPR_System_Collector_Provider">
+      <Keywords>
+        <Keyword Value="Loader" />
+      </Keywords>
+    </SystemProvider>
+    <EventProvider Id="MimallocEventProvider" Name="138f4dbb-ee04-4899-aa0a-572ad4475779" NonPagedMemory="true" Stack="true">
+      <EventFilters FilterIn="true">
+        <EventId Value="100" />
+        <EventId Value="101" />
+      </EventFilters>
+    </EventProvider>
+    <Profile Id="CustomHeap.Verbose.File" Name="CustomHeap" Description="RunningProfile:CustomHeap.Verbose.File" LoggingMode="File" DetailLevel="Verbose">
+      <ProblemCategories>
+        <ProblemCategory Value="Resource Analysis" />
+      </ProblemCategories>
+      <Collectors>
+        <SystemCollectorId Value="WPR_initiated_WprApp_WPR_System_Collector">
+          <SystemProviderId Value="WPR_initiated_WprApp_WPR_System_Collector_Provider" />
+        </SystemCollectorId>
+        <EventCollectorId Value="Mimalloc_Collector">
+          <EventProviders>
+            <EventProviderId Value="MimallocEventProvider" >
+              <Keywords>
+                <Keyword Value="100"/>
+                <Keyword Value="101"/>
+              </Keywords>
+            </EventProviderId>
+          </EventProviders>
+        </EventCollectorId>
+      </Collectors>
+      <TraceMergeProperties>
+        <TraceMergeProperty Id="BaseVerboseTraceMergeProperties" Name="BaseTraceMergeProperties">
+          <DeletePreMergedTraceFiles Value="true" />
+          <FileCompression Value="false" />
+          <InjectOnly Value="false" />
+          <SkipMerge Value="false" />
+          <CustomEvents>
+            <CustomEvent Value="ImageId" />
+            <CustomEvent Value="BuildInfo" />
+            <CustomEvent Value="VolumeMapping" />
+            <CustomEvent Value="EventMetadata" />
+            <CustomEvent Value="PerfTrackMetadata" />
+            <CustomEvent Value="WinSAT" />
+            <CustomEvent Value="NetworkInterface" />
+          </CustomEvents>
+        </TraceMergeProperty>
+      </TraceMergeProperties>
+    </Profile>
+  </Profiles>
+</WindowsPerformanceRecorder>
+
diff --git a/ext/src/mimalloc/src/prim/windows/etw.h b/ext/src/mimalloc/src/prim/windows/etw.h
new file mode 100644
index 0000000000..4e0a092a10
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/windows/etw.h
@@ -0,0 +1,905 @@
+//**********************************************************************`
+//* This is an include file generated by Message Compiler.             *`
+//*                                                                    *`
+//* Copyright (c) Microsoft Corporation. All Rights Reserved.          *`
+//**********************************************************************`
+#pragma once
+
+//*****************************************************************************
+//
+// Notes on the ETW event code generated by MC:
+//
+// - Structures and arrays of structures are treated as an opaque binary blob.
+//   The caller is responsible for packing the data for the structure into a
+//   single region of memory, with no padding between values. The macro will
+//   have an extra parameter for the length of the blob.
+// - Arrays of nul-terminated strings must be packed by the caller into a
+//   single binary blob containing the correct number of strings, with a nul
+//   after each string. The size of the blob is specified in characters, and
+//   includes the final nul.
+// - Arrays of SID are treated as a single binary blob. The caller is
+//   responsible for packing the SID values into a single region of memory with
+//   no padding.
+// - The length attribute on the data element in the manifest is significant
+//   for values with intype win:UnicodeString, win:AnsiString, or win:Binary.
+//   The length attribute must be specified for win:Binary, and is optional for
+//   win:UnicodeString and win:AnsiString (if no length is given, the strings
+//   are assumed to be nul-terminated). For win:UnicodeString, the length is
+//   measured in characters, not bytes.
+// - For an array of win:UnicodeString, win:AnsiString, or win:Binary, the
+//   length attribute applies to every value in the array, so every value in
+//   the array must have the same length. The values in the array are provided
+//   to the macro via a single pointer -- the caller is responsible for packing
+//   all of the values into a single region of memory with no padding between
+//   values.
+// - Values of type win:CountedUnicodeString, win:CountedAnsiString, and
+//   win:CountedBinary can be generated and collected on Vista or later.
+//   However, they may not decode properly without the Windows 10 2018 Fall
+//   Update.
+// - Arrays of type win:CountedUnicodeString, win:CountedAnsiString, and
+//   win:CountedBinary must be packed by the caller into a single region of
+//   memory. The format for each item is a UINT16 byte-count followed by that
+//   many bytes of data. When providing the array to the generated macro, you
+//   must provide the total size of the packed array data, including the UINT16
+//   sizes for each item. In the case of win:CountedUnicodeString, the data
+//   size is specified in WCHAR (16-bit) units. In the case of
+//   win:CountedAnsiString and win:CountedBinary, the data size is specified in
+//   bytes.
+//
+//*****************************************************************************
+
+#include <wmistr.h>
+#include <evntrace.h>
+#include <evntprov.h>
+
+#ifndef ETW_INLINE
+  #ifdef _ETW_KM_
+    // In kernel mode, save stack space by never inlining templates.
+    #define ETW_INLINE DECLSPEC_NOINLINE __inline
+  #else
+    // In user mode, save code size by inlining templates as appropriate.
+    #define ETW_INLINE __inline
+  #endif
+#endif // ETW_INLINE
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+//
+// MCGEN_DISABLE_PROVIDER_CODE_GENERATION macro:
+// Define this macro to have the compiler skip the generated functions in this
+// header.
+//
+#ifndef MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//
+// MCGEN_USE_KERNEL_MODE_APIS macro:
+// Controls whether the generated code uses kernel-mode or user-mode APIs.
+// - Set to 0 to use Windows user-mode APIs such as EventRegister.
+// - Set to 1 to use Windows kernel-mode APIs such as EtwRegister.
+// Default is based on whether the _ETW_KM_ macro is defined (i.e. by wdm.h).
+// Note that the APIs can also be overridden directly, e.g. by setting the
+// MCGEN_EVENTWRITETRANSFER or MCGEN_EVENTREGISTER macros.
+//
+#ifndef MCGEN_USE_KERNEL_MODE_APIS
+  #ifdef _ETW_KM_
+    #define MCGEN_USE_KERNEL_MODE_APIS 1
+  #else
+    #define MCGEN_USE_KERNEL_MODE_APIS 0
+  #endif
+#endif // MCGEN_USE_KERNEL_MODE_APIS
+
+//
+// MCGEN_HAVE_EVENTSETINFORMATION macro:
+// Controls how McGenEventSetInformation uses the EventSetInformation API.
+// - Set to 0 to disable the use of EventSetInformation
+//   (McGenEventSetInformation will always return an error).
+// - Set to 1 to directly invoke MCGEN_EVENTSETINFORMATION.
+// - Set to 2 to locate EventSetInformation at runtime via GetProcAddress
+//   (user-mode) or MmGetSystemRoutineAddress (kernel-mode).
+// Default is determined as follows:
+// - If MCGEN_EVENTSETINFORMATION has been customized, set to 1
+//   (i.e. use MCGEN_EVENTSETINFORMATION).
+// - Else if the target OS version has EventSetInformation, set to 1
+//   (i.e. use MCGEN_EVENTSETINFORMATION).
+// - Else set to 2 (i.e. try to dynamically locate EventSetInformation).
+// Note that an McGenEventSetInformation function will only be generated if one
+// or more provider in a manifest has provider traits.
+//
+#ifndef MCGEN_HAVE_EVENTSETINFORMATION
+  #ifdef MCGEN_EVENTSETINFORMATION             // if MCGEN_EVENTSETINFORMATION has been customized,
+    #define MCGEN_HAVE_EVENTSETINFORMATION   1 //   directly invoke MCGEN_EVENTSETINFORMATION(...).
+  #elif MCGEN_USE_KERNEL_MODE_APIS             // else if using kernel-mode APIs,
+    #if NTDDI_VERSION >= 0x06040000            //   if target OS is Windows 10 or later,
+      #define MCGEN_HAVE_EVENTSETINFORMATION 1 //     directly invoke MCGEN_EVENTSETINFORMATION(...).
+    #else                                      //   else
+      #define MCGEN_HAVE_EVENTSETINFORMATION 2 //     find "EtwSetInformation" via MmGetSystemRoutineAddress.
+    #endif                                     // else (using user-mode APIs)
+  #else                                        //   if target OS and SDK is Windows 8 or later,
+    #if WINVER >= 0x0602 && defined(EVENT_FILTER_TYPE_SCHEMATIZED)
+      #define MCGEN_HAVE_EVENTSETINFORMATION 1 //     directly invoke MCGEN_EVENTSETINFORMATION(...).
+    #else                                      //   else
+      #define MCGEN_HAVE_EVENTSETINFORMATION 2 //     find "EventSetInformation" via GetModuleHandleExW/GetProcAddress.
+    #endif
+  #endif
+#endif // MCGEN_HAVE_EVENTSETINFORMATION
+
+//
+// MCGEN Override Macros
+//
+// The following override macros may be defined before including this header
+// to control the APIs used by this header:
+//
+// - MCGEN_EVENTREGISTER
+// - MCGEN_EVENTUNREGISTER
+// - MCGEN_EVENTSETINFORMATION
+// - MCGEN_EVENTWRITETRANSFER
+//
+// If the macro is undefined, the MC implementation will default to the
+// corresponding ETW APIs. For example, if the MCGEN_EVENTREGISTER macro is
+// undefined, the EventRegister[MyProviderName] macro will use EventRegister
+// in user mode and will use EtwRegister in kernel mode.
+//
+// To prevent issues from conflicting definitions of these macros, the value
+// of the override macro will be used as a suffix in certain internal function
+// names. Because of this, the override macros must follow certain rules:
+//
+// - The macro must be defined before any MC-generated header is included and
+//   must not be undefined or redefined after any MC-generated header is
+//   included. Different translation units (i.e. different .c or .cpp files)
+//   may set the macros to different values, but within a translation unit
+//   (within a single .c or .cpp file), the macro must be set once and not
+//   changed.
+// - The override must be an object-like macro, not a function-like macro
+//   (i.e. the override macro must not have a parameter list).
+// - The override macro's value must be a simple identifier, i.e. must be
+//   something that starts with a letter or '_' and contains only letters,
+//   numbers, and '_' characters.
+// - If the override macro's value is the name of a second object-like macro,
+//   the second object-like macro must follow the same rules. (The override
+//   macro's value can also be the name of a function-like macro, in which
+//   case the function-like macro does not need to follow the same rules.)
+//
+// For example, the following will cause compile errors:
+//
+//   #define MCGEN_EVENTWRITETRANSFER MyNamespace::MyClass::MyFunction // Value has non-identifier characters (colon).
+//   #define MCGEN_EVENTWRITETRANSFER GetEventWriteFunctionPointer(7)  // Value has non-identifier characters (parentheses).
+//   #define MCGEN_EVENTWRITETRANSFER(h,e,a,r,c,d) EventWrite(h,e,c,d) // Override is defined as a function-like macro.
+//   #define MY_OBJECT_LIKE_MACRO     MyNamespace::MyClass::MyEventWriteFunction
+//   #define MCGEN_EVENTWRITETRANSFER MY_OBJECT_LIKE_MACRO // Evaluates to something with non-identifier characters (colon).
+//
+// The following would be ok:
+//
+//   #define MCGEN_EVENTWRITETRANSFER  MyEventWriteFunction1  // OK, suffix will be "MyEventWriteFunction1".
+//   #define MY_OBJECT_LIKE_MACRO      MyEventWriteFunction2
+//   #define MCGEN_EVENTWRITETRANSFER  MY_OBJECT_LIKE_MACRO   // OK, suffix will be "MyEventWriteFunction2".
+//   #define MY_FUNCTION_LIKE_MACRO(h,e,a,r,c,d) MyNamespace::MyClass::MyEventWriteFunction3(h,e,c,d)
+//   #define MCGEN_EVENTWRITETRANSFER  MY_FUNCTION_LIKE_MACRO // OK, suffix will be "MY_FUNCTION_LIKE_MACRO".
+//
+#ifndef MCGEN_EVENTREGISTER
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define MCGEN_EVENTREGISTER        EtwRegister
+  #else
+    #define MCGEN_EVENTREGISTER        EventRegister
+  #endif
+#endif // MCGEN_EVENTREGISTER
+#ifndef MCGEN_EVENTUNREGISTER
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define MCGEN_EVENTUNREGISTER      EtwUnregister
+  #else
+    #define MCGEN_EVENTUNREGISTER      EventUnregister
+  #endif
+#endif // MCGEN_EVENTUNREGISTER
+#ifndef MCGEN_EVENTSETINFORMATION
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define MCGEN_EVENTSETINFORMATION  EtwSetInformation
+  #else
+    #define MCGEN_EVENTSETINFORMATION  EventSetInformation
+  #endif
+#endif // MCGEN_EVENTSETINFORMATION
+#ifndef MCGEN_EVENTWRITETRANSFER
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define MCGEN_EVENTWRITETRANSFER   EtwWriteTransfer
+  #else
+    #define MCGEN_EVENTWRITETRANSFER   EventWriteTransfer
+  #endif
+#endif // MCGEN_EVENTWRITETRANSFER
+
+//
+// MCGEN_EVENT_ENABLED macro:
+// Override to control how the EventWrite[EventName] macros determine whether
+// an event is enabled. The default behavior is for EventWrite[EventName] to
+// use the EventEnabled[EventName] macros.
+//
+#ifndef MCGEN_EVENT_ENABLED
+#define MCGEN_EVENT_ENABLED(EventName) EventEnabled##EventName()
+#endif
+
+//
+// MCGEN_EVENT_ENABLED_FORCONTEXT macro:
+// Override to control how the EventWrite[EventName]_ForContext macros
+// determine whether an event is enabled. The default behavior is for
+// EventWrite[EventName]_ForContext to use the
+// EventEnabled[EventName]_ForContext macros.
+//
+#ifndef MCGEN_EVENT_ENABLED_FORCONTEXT
+#define MCGEN_EVENT_ENABLED_FORCONTEXT(pContext, EventName) EventEnabled##EventName##_ForContext(pContext)
+#endif
+
+//
+// MCGEN_ENABLE_CHECK macro:
+// Determines whether the specified event would be considered as enabled
+// based on the state of the specified context. Slightly faster than calling
+// McGenEventEnabled directly.
+//
+#ifndef MCGEN_ENABLE_CHECK
+#define MCGEN_ENABLE_CHECK(Context, Descriptor) (Context.IsEnabled && McGenEventEnabled(&Context, &Descriptor))
+#endif
+
+#if !defined(MCGEN_TRACE_CONTEXT_DEF)
+#define MCGEN_TRACE_CONTEXT_DEF
+// This structure is for use by MC-generated code and should not be used directly.
+typedef struct _MCGEN_TRACE_CONTEXT
+{
+    TRACEHANDLE            RegistrationHandle;
+    TRACEHANDLE            Logger;      // Used as pointer to provider traits.
+    ULONGLONG              MatchAnyKeyword; // Set by the control callback on enable, zeroed on disable.
+    ULONGLONG              MatchAllKeyword; // Set by the control callback on enable, zeroed on disable.
+    ULONG                  Flags;
+    ULONG                  IsEnabled;   // Holds the last enable/disable control code received.
+    UCHAR                  Level;       // Set by the control callback on enable, zeroed on disable.
+    UCHAR                  Reserve;
+    USHORT                 EnableBitsCount; // Number of entries in EnableKeyWords/EnableLevel.
+    PULONG                 EnableBitMask;   // One bit per entry, packed 32 per ULONG.
+    const ULONGLONG*       EnableKeyWords;
+    const UCHAR*           EnableLevel;
+} MCGEN_TRACE_CONTEXT, *PMCGEN_TRACE_CONTEXT;
+#endif // MCGEN_TRACE_CONTEXT_DEF
+
+#if !defined(MCGEN_LEVEL_KEYWORD_ENABLED_DEF)
+#define MCGEN_LEVEL_KEYWORD_ENABLED_DEF
+//
+// Determines whether an event with a given Level and Keyword would be
+// considered as enabled based on the state of the specified context.
+// Note that you may want to use MCGEN_ENABLE_CHECK instead of calling this
+// function directly.
+//
+FORCEINLINE
+BOOLEAN
+McGenLevelKeywordEnabled(
+    _In_ PMCGEN_TRACE_CONTEXT EnableInfo,
+    _In_ UCHAR Level,
+    _In_ ULONGLONG Keyword
+    )
+{
+    //
+    // Check if the event Level is lower than or equal to the level at
+    // which the channel is enabled.
+    // If the event Level is 0 or the channel is enabled at level 0,
+    // the level check always passes.
+    //
+
+    if ((Level <= EnableInfo->Level) || // This also covers the case of Level == 0.
+        (EnableInfo->Level == 0)) {
+
+        //
+        // Check if Keyword is enabled: zero always matches, otherwise it must
+        // share a bit with MatchAnyKeyword and contain all of MatchAllKeyword.
+
+        if ((Keyword == (ULONGLONG)0) ||
+            ((Keyword & EnableInfo->MatchAnyKeyword) &&
+             ((Keyword & EnableInfo->MatchAllKeyword) == EnableInfo->MatchAllKeyword))) {
+            return TRUE;
+        }
+    }
+
+    return FALSE;
+}
+#endif // MCGEN_LEVEL_KEYWORD_ENABLED_DEF
+
+#if !defined(MCGEN_EVENT_ENABLED_DEF)
+#define MCGEN_EVENT_ENABLED_DEF
+//
+// Determines whether the specified event would be considered as enabled based
+// on the state of the specified context. Note that you may want to use
+// MCGEN_ENABLE_CHECK instead of calling this function directly.
+//
+FORCEINLINE
+BOOLEAN
+McGenEventEnabled(
+    _In_ PMCGEN_TRACE_CONTEXT EnableInfo,
+    _In_ PCEVENT_DESCRIPTOR EventDescriptor
+    )
+{
+    return McGenLevelKeywordEnabled(EnableInfo, EventDescriptor->Level, EventDescriptor->Keyword); // uses the descriptor's own Level and Keyword
+}
+#endif // MCGEN_EVENT_ENABLED_DEF
+
+#if !defined(MCGEN_CONTROL_CALLBACK)
+#define MCGEN_CONTROL_CALLBACK
+
+// This function is for use by MC-generated code and should not be used directly.
+DECLSPEC_NOINLINE __inline
+VOID
+__stdcall
+McGenControlCallbackV2(
+    _In_ LPCGUID SourceId,
+    _In_ ULONG ControlCode,
+    _In_ UCHAR Level,
+    _In_ ULONGLONG MatchAnyKeyword,
+    _In_ ULONGLONG MatchAllKeyword,
+    _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData,
+    _Inout_opt_ PVOID CallbackContext
+    )
+/*++
+
+Routine Description:
+
+    This is the notification callback for Windows Vista and later.
+
+Arguments:
+
+    SourceId - The GUID that identifies the session that enabled the provider.
+
+    ControlCode - The parameter indicates whether the provider
+                  is being enabled or disabled.
+
+    Level - The level at which the event is enabled.
+
+    MatchAnyKeyword - The bitmask of keywords that the provider uses to
+                      determine the category of events that it writes.
+
+    MatchAllKeyword - This bitmask additionally restricts the category
+                      of events that the provider writes.
+
+    FilterData - The provider-defined data.
+
+    CallbackContext - The context of the callback that is defined when the provider
+                      called EtwRegister to register itself.
+
+Remarks:
+
+    ETW calls this function to notify provider of enable/disable. The context's
+    level, keywords and per-entry enable bits are updated; a NULL context is ignored.
+--*/
+{
+    PMCGEN_TRACE_CONTEXT Ctx = (PMCGEN_TRACE_CONTEXT)CallbackContext;
+    ULONG Ix;
+#ifndef MCGEN_PRIVATE_ENABLE_CALLBACK_V2
+    UNREFERENCED_PARAMETER(SourceId);
+    UNREFERENCED_PARAMETER(FilterData);
+#endif
+
+    if (Ctx == NULL) {
+        return;
+    }
+
+    switch (ControlCode) {
+
+        case EVENT_CONTROL_CODE_ENABLE_PROVIDER:
+            Ctx->Level = Level;
+            Ctx->MatchAnyKeyword = MatchAnyKeyword;
+            Ctx->MatchAllKeyword = MatchAllKeyword;
+            Ctx->IsEnabled = EVENT_CONTROL_CODE_ENABLE_PROVIDER;
+
+            for (Ix = 0; Ix < Ctx->EnableBitsCount; Ix += 1) {
+                if (McGenLevelKeywordEnabled(Ctx, Ctx->EnableLevel[Ix], Ctx->EnableKeyWords[Ix]) != FALSE) {
+                    Ctx->EnableBitMask[Ix >> 5] |= (1u << (Ix % 32));  // unsigned literal: setting bit 31 stays well defined
+                } else {
+                    Ctx->EnableBitMask[Ix >> 5] &= ~(1u << (Ix % 32)); // unsigned literal: clearing bit 31 stays well defined
+                }
+            }
+            break;
+
+        case EVENT_CONTROL_CODE_DISABLE_PROVIDER:
+            Ctx->IsEnabled = EVENT_CONTROL_CODE_DISABLE_PROVIDER;
+            Ctx->Level = 0;
+            Ctx->MatchAnyKeyword = 0;
+            Ctx->MatchAllKeyword = 0;
+            if (Ctx->EnableBitsCount > 0) {
+#pragma warning(suppress: 26451) // Arithmetic overflow cannot occur, no matter the value of EnableBitCount
+                RtlZeroMemory(Ctx->EnableBitMask, (((Ctx->EnableBitsCount - 1) / 32) + 1) * sizeof(ULONG));
+            }
+            break;
+
+        default:
+            break;
+    }
+
+#ifdef MCGEN_PRIVATE_ENABLE_CALLBACK_V2
+    //
+    // Call user defined callback
+    //
+    MCGEN_PRIVATE_ENABLE_CALLBACK_V2(
+        SourceId,
+        ControlCode,
+        Level,
+        MatchAnyKeyword,
+        MatchAllKeyword,
+        FilterData,
+        CallbackContext
+        );
+#endif // MCGEN_PRIVATE_ENABLE_CALLBACK_V2
+
+    return;
+}
+
+#endif // MCGEN_CONTROL_CALLBACK
+
+#ifndef _mcgen_PENABLECALLBACK
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define _mcgen_PENABLECALLBACK      PETWENABLECALLBACK
+  #else
+    #define _mcgen_PENABLECALLBACK      PENABLECALLBACK
+  #endif
+#endif // _mcgen_PENABLECALLBACK
+
+#if !defined(_mcgen_PASTE2)
+// This macro is for use by MC-generated code and should not be used directly.
+#define _mcgen_PASTE2(a, b) _mcgen_PASTE2_imp(a, b)
+#define _mcgen_PASTE2_imp(a, b) a##b
+#endif // _mcgen_PASTE2
+
+#if !defined(_mcgen_PASTE3)
+// This macro is for use by MC-generated code and should not be used directly.
+#define _mcgen_PASTE3(a, b, c) _mcgen_PASTE3_imp(a, b, c)
+#define _mcgen_PASTE3_imp(a, b, c) a##b##_##c
+#endif // _mcgen_PASTE3
+
+//
+// Macro validation
+//
+
+// Validate MCGEN_EVENTREGISTER:
+
+// Trigger an error if MCGEN_EVENTREGISTER is not an unqualified (simple) identifier:
+struct _mcgen_PASTE2(MCGEN_EVENTREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTREGISTER);
+
+// Trigger an error if MCGEN_EVENTREGISTER is redefined:
+typedef struct _mcgen_PASTE2(MCGEN_EVENTREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTREGISTER)
+    MCGEN_EVENTREGISTER_must_not_be_redefined_between_headers;
+
+// Trigger an error if MCGEN_EVENTREGISTER is defined as a function-like macro:
+typedef void MCGEN_EVENTREGISTER_must_not_be_a_functionLike_macro_MCGEN_EVENTREGISTER;
+typedef int _mcgen_PASTE2(MCGEN_EVENTREGISTER_must_not_be_a_functionLike_macro_, MCGEN_EVENTREGISTER);
+
+// Validate MCGEN_EVENTUNREGISTER:
+
+// Trigger an error if MCGEN_EVENTUNREGISTER is not an unqualified (simple) identifier:
+struct _mcgen_PASTE2(MCGEN_EVENTUNREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTUNREGISTER);
+
+// Trigger an error if MCGEN_EVENTUNREGISTER is redefined:
+typedef struct _mcgen_PASTE2(MCGEN_EVENTUNREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTUNREGISTER)
+    MCGEN_EVENTUNREGISTER_must_not_be_redefined_between_headers;
+
+// Trigger an error if MCGEN_EVENTUNREGISTER is defined as a function-like macro:
+typedef void MCGEN_EVENTUNREGISTER_must_not_be_a_functionLike_macro_MCGEN_EVENTUNREGISTER;
+typedef int _mcgen_PASTE2(MCGEN_EVENTUNREGISTER_must_not_be_a_functionLike_macro_, MCGEN_EVENTUNREGISTER);
+
+// Validate MCGEN_EVENTSETINFORMATION:
+
+// Trigger an error if MCGEN_EVENTSETINFORMATION is not an unqualified (simple) identifier:
+struct _mcgen_PASTE2(MCGEN_EVENTSETINFORMATION_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTSETINFORMATION);
+
+// Trigger an error if MCGEN_EVENTSETINFORMATION is redefined:
+typedef struct _mcgen_PASTE2(MCGEN_EVENTSETINFORMATION_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTSETINFORMATION)
+    MCGEN_EVENTSETINFORMATION_must_not_be_redefined_between_headers;
+
+// Trigger an error if MCGEN_EVENTSETINFORMATION is defined as a function-like macro:
+typedef void MCGEN_EVENTSETINFORMATION_must_not_be_a_functionLike_macro_MCGEN_EVENTSETINFORMATION;
+typedef int _mcgen_PASTE2(MCGEN_EVENTSETINFORMATION_must_not_be_a_functionLike_macro_, MCGEN_EVENTSETINFORMATION);
+
+// Validate MCGEN_EVENTWRITETRANSFER:
+
+// Trigger an error if MCGEN_EVENTWRITETRANSFER is not an unqualified (simple) identifier:
+struct _mcgen_PASTE2(MCGEN_EVENTWRITETRANSFER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTWRITETRANSFER);
+
+// Trigger an error if MCGEN_EVENTWRITETRANSFER is redefined:
+typedef struct _mcgen_PASTE2(MCGEN_EVENTWRITETRANSFER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTWRITETRANSFER)
+    MCGEN_EVENTWRITETRANSFER_must_not_be_redefined_between_headers;;
+
+// Trigger an error if MCGEN_EVENTWRITETRANSFER is defined as a function-like macro:
+typedef void MCGEN_EVENTWRITETRANSFER_must_not_be_a_functionLike_macro_MCGEN_EVENTWRITETRANSFER;
+typedef int _mcgen_PASTE2(MCGEN_EVENTWRITETRANSFER_must_not_be_a_functionLike_macro_, MCGEN_EVENTWRITETRANSFER);
+
+#ifndef McGenEventWrite_def
+#define McGenEventWrite_def
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define McGenEventWrite _mcgen_PASTE2(McGenEventWrite_, MCGEN_EVENTWRITETRANSFER)
+
+// Fills EventData[0] with the provider traits and forwards the event to MCGEN_EVENTWRITETRANSFER. For use by MC-generated code only; do not call directly.
+DECLSPEC_NOINLINE __inline
+ULONG __stdcall
+McGenEventWrite(
+    _In_ PMCGEN_TRACE_CONTEXT Context, // supplies Logger (traits pointer) and RegistrationHandle
+    _In_ PCEVENT_DESCRIPTOR Descriptor, // forwarded unchanged
+    _In_opt_ LPCGUID ActivityId, // forwarded unchanged
+    _In_range_(1, 128) ULONG EventDataCount, // entry count of EventData, forwarded unchanged
+    _Pre_cap_(EventDataCount) EVENT_DATA_DESCRIPTOR* EventData // entry 0 is overwritten by this function
+    )
+{
+    const USHORT UNALIGNED* Traits;
+
+    // Some customized MCGEN_EVENTWRITETRANSFER macros might ignore ActivityId.
+    UNREFERENCED_PARAMETER(ActivityId);
+
+    Traits = (const USHORT UNALIGNED*)(UINT_PTR)Context->Logger;
+    // Entry 0 describes the provider traits: zeroed when there are none, otherwise it points at the traits data.
+    if (Traits == NULL) {
+        EventData[0].Ptr = 0;
+        EventData[0].Size = 0;
+        EventData[0].Reserved = 0;
+    } else {
+        EventData[0].Ptr = (ULONG_PTR)Traits;
+        EventData[0].Size = *Traits; // the leading USHORT of the traits is used as their size
+        EventData[0].Reserved = 2; // EVENT_DATA_DESCRIPTOR_TYPE_PROVIDER_METADATA
+    }
+    // Hand the completed descriptor array to the configured write routine and return its status.
+    return MCGEN_EVENTWRITETRANSFER(
+        Context->RegistrationHandle,
+        Descriptor,
+        ActivityId,
+        NULL,
+        EventDataCount,
+        EventData);
+}
+#endif // McGenEventWrite_def
+
+#if !defined(McGenEventRegisterUnregister)
+#define McGenEventRegisterUnregister
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define McGenEventRegister _mcgen_PASTE2(McGenEventRegister_, MCGEN_EVENTREGISTER)
+
+#pragma warning(push)
+#pragma warning(disable:6103)
+// This function is for use by MC-generated code and should not be used directly.
+DECLSPEC_NOINLINE __inline
+ULONG __stdcall
+McGenEventRegister(
+    _In_ LPCGUID ProviderId,
+    _In_opt_ _mcgen_PENABLECALLBACK EnableCallback,
+    _In_opt_ PVOID CallbackContext,
+    _Inout_ PREGHANDLE RegHandle
+    )
+/*++
+
+Routine Description:
+
+    Registers the provider with ETW unless *RegHandle already holds a handle.
+
+Arguments:
+
+    ProviderId - GUID of the provider being registered.
+
+    EnableCallback - Optional callback passed through to the registration.
+
+    CallbackContext - Optional context passed through with the callback.
+
+    RegHandle - Pointer to the registration handle. It is checked first and
+        then handed to MCGEN_EVENTREGISTER when a registration is needed.
+
+Return Value:
+
+    0 (ERROR_SUCCESS) if *RegHandle was already non-zero (kept for
+    compatibility; registering twice is a bug and may indicate a race),
+    otherwise the status returned by MCGEN_EVENTREGISTER.
+
+--*/
+{
+    // Already registered: report success without registering again.
+    if (*RegHandle != 0)
+    {
+        return 0; // ERROR_SUCCESS
+    }
+
+    // Not yet registered: delegate and pass the resulting status through.
+    return MCGEN_EVENTREGISTER(
+        ProviderId,
+        EnableCallback,
+        CallbackContext,
+        RegHandle);
+}
+#pragma warning(pop)
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define McGenEventUnregister _mcgen_PASTE2(McGenEventUnregister_, MCGEN_EVENTUNREGISTER)
+
+// This function is for use by MC-generated code and should not be used directly.
+DECLSPEC_NOINLINE __inline
+ULONG __stdcall
+McGenEventUnregister(_Inout_ PREGHANDLE RegHandle)
+/*++
+
+Routine Description:
+
+    Unregisters the provider from ETW and resets *RegHandle to 0.
+
+Arguments:
+
+    RegHandle - Pointer to the provider registration handle.
+
+Remarks:
+
+    A zero *RegHandle means there is nothing to unregister and yields
+    SUCCESS, so calling this after a failed McGenEventRegister is safe.
+    Otherwise the status of MCGEN_EVENTUNREGISTER is returned.
+
+--*/
+{
+    ULONG Status = 0; // ERROR_SUCCESS
+
+    // Only a non-zero handle denotes a live registration; a zero handle
+    // needs no work and reports success.
+    if (*RegHandle != 0)
+    {
+        Status = MCGEN_EVENTUNREGISTER(*RegHandle);
+
+        // The handle is cleared regardless of the unregister status.
+        *RegHandle = (REGHANDLE)0;
+    }
+
+    return Status;
+}
+
+#endif // McGenEventRegisterUnregister
+
+#ifndef _mcgen_EVENT_BIT_SET
+  #if defined(_M_IX86) || defined(_M_X64)
+    // This macro is for use by MC-generated code and should not be used directly.
+    #define _mcgen_EVENT_BIT_SET(EnableBits, BitPosition) ((((const unsigned char*)EnableBits)[BitPosition >> 3] & (1u << (BitPosition & 7))) != 0)
+  #else // CPU type
+    // This macro is for use by MC-generated code and should not be used directly.
+    #define _mcgen_EVENT_BIT_SET(EnableBits, BitPosition) ((EnableBits[BitPosition >> 5] & (1u << (BitPosition & 31))) != 0)
+  #endif // CPU type
+#endif // _mcgen_EVENT_BIT_SET
+
+#endif // MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+// Provider "microsoft-windows-mimalloc" event count 2
+//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+// Provider GUID = 138f4dbb-ee04-4899-aa0a-572ad4475779
+EXTERN_C __declspec(selectany) const GUID ETW_MI_Provider = {0x138f4dbb, 0xee04, 0x4899, {0xaa, 0x0a, 0x57, 0x2a, 0xd4, 0x47, 0x57, 0x79}};
+
+#ifndef ETW_MI_Provider_Traits
+#define ETW_MI_Provider_Traits NULL
+#endif // ETW_MI_Provider_Traits
+
+//
+// Event Descriptors
+//
+EXTERN_C __declspec(selectany) const EVENT_DESCRIPTOR ETW_MI_ALLOC = {0x64, 0x1, 0x0, 0x4, 0x0, 0x0, 0x0};
+#define ETW_MI_ALLOC_value 0x64
+EXTERN_C __declspec(selectany) const EVENT_DESCRIPTOR ETW_MI_FREE = {0x65, 0x1, 0x0, 0x4, 0x0, 0x0, 0x0};
+#define ETW_MI_FREE_value 0x65
+
+//
+// MCGEN_DISABLE_PROVIDER_CODE_GENERATION macro:
+// Define this macro to have the compiler skip the generated functions in this
+// header.
+//
+#ifndef MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//
+// Event Enablement Bits
+// These variables are for use by MC-generated code and should not be used directly.
+//
+EXTERN_C __declspec(selectany) DECLSPEC_CACHEALIGN ULONG microsoft_windows_mimallocEnableBits[1];
+EXTERN_C __declspec(selectany) const ULONGLONG microsoft_windows_mimallocKeywords[1] = {0x0};
+EXTERN_C __declspec(selectany) const unsigned char microsoft_windows_mimallocLevels[1] = {4};
+
+//
+// Provider context
+//
+EXTERN_C __declspec(selectany) MCGEN_TRACE_CONTEXT ETW_MI_Provider_Context = {0, (ULONG_PTR)ETW_MI_Provider_Traits, 0, 0, 0, 0, 0, 0, 1, microsoft_windows_mimallocEnableBits, microsoft_windows_mimallocKeywords, microsoft_windows_mimallocLevels};
+
+//
+// Provider REGHANDLE
+//
+#define microsoft_windows_mimallocHandle (ETW_MI_Provider_Context.RegistrationHandle)
+
+//
+// This macro is set to 0, indicating that the EventWrite[Name] macros do not
+// have an Activity parameter. This is controlled by the -km and -um options.
+//
+#define ETW_MI_Provider_EventWriteActivity 0
+
+//
+// Register with ETW using the control GUID specified in the manifest.
+// Invoke this macro during module initialization (i.e. program startup,
+// DLL process attach, or driver load) to initialize the provider.
+// Note that if this function returns an error, the error means that tracing
+// will not work, but no action needs to be taken -- even if EventRegister
+// returns an error, it is generally safe to use EventWrite and
+// EventUnregister macros (they will be no-ops if EventRegister failed).
+//
+#ifndef EventRegistermicrosoft_windows_mimalloc
+#define EventRegistermicrosoft_windows_mimalloc() McGenEventRegister(&ETW_MI_Provider, McGenControlCallbackV2, &ETW_MI_Provider_Context, &microsoft_windows_mimallocHandle)
+#endif
+
+//
+// Register with ETW using a specific control GUID (i.e. a GUID other than what
+// is specified in the manifest). Advanced scenarios only.
+//
+#ifndef EventRegisterByGuidmicrosoft_windows_mimalloc
+#define EventRegisterByGuidmicrosoft_windows_mimalloc(Guid) McGenEventRegister(&(Guid), McGenControlCallbackV2, &ETW_MI_Provider_Context, &microsoft_windows_mimallocHandle)
+#endif
+
+//
+// Unregister with ETW and close the provider.
+// Invoke this macro during module shutdown (i.e. program exit, DLL process
+// detach, or driver unload) to unregister the provider.
+// Note that you MUST call EventUnregister before DLL or driver unload
+// (not optional): failure to unregister a provider before DLL or driver unload
+// will result in crashes.
+//
+#ifndef EventUnregistermicrosoft_windows_mimalloc
+#define EventUnregistermicrosoft_windows_mimalloc() McGenEventUnregister(&microsoft_windows_mimallocHandle)
+#endif
+
+//
+// MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION macro:
+// Define this macro to enable support for caller-allocated provider context.
+//
+#ifdef MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION
+
+//
+// Advanced scenarios: Caller-allocated provider context.
+// Use when multiple differently-configured provider handles are needed,
+// e.g. for container-aware drivers, one context per container.
+//
+// Usage:
+//
+// - Caller enables the feature before including this header, e.g.
+//   #define MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION 1
+// - Caller allocates memory, e.g. pContext = malloc(sizeof(McGenContext_microsoft_windows_mimalloc));
+// - Caller registers the provider, e.g. EventRegistermicrosoft_windows_mimalloc_ForContext(pContext);
+// - Caller writes events, e.g. EventWriteMyEvent_ForContext(pContext, ...);
+// - Caller unregisters, e.g. EventUnregistermicrosoft_windows_mimalloc_ForContext(pContext);
+// - Caller frees memory, e.g. free(pContext);
+//
+
+typedef struct tagMcGenContext_microsoft_windows_mimalloc {
+    // The fields of this structure are subject to change and should
+    // not be accessed directly. To access the provider's REGHANDLE,
+    // use microsoft_windows_mimallocHandle_ForContext(pContext).
+    MCGEN_TRACE_CONTEXT Context;
+    ULONG EnableBits[1];
+} McGenContext_microsoft_windows_mimalloc;
+
+#define EventRegistermicrosoft_windows_mimalloc_ForContext(pContext)             _mcgen_PASTE2(_mcgen_RegisterForContext_microsoft_windows_mimalloc_, MCGEN_EVENTREGISTER)(&ETW_MI_Provider, pContext)
+#define EventRegisterByGuidmicrosoft_windows_mimalloc_ForContext(Guid, pContext) _mcgen_PASTE2(_mcgen_RegisterForContext_microsoft_windows_mimalloc_, MCGEN_EVENTREGISTER)(&(Guid), pContext)
+#define EventUnregistermicrosoft_windows_mimalloc_ForContext(pContext)           McGenEventUnregister(&(pContext)->Context.RegistrationHandle)
+
+//
+// Provider REGHANDLE for caller-allocated context.
+//
+#define microsoft_windows_mimallocHandle_ForContext(pContext) ((pContext)->Context.RegistrationHandle)
+
+// This function is for use by MC-generated code and should not be used directly.
+// Zero-initializes the caller-allocated context, points it at this provider's tables, registers it, and returns the McGenEventRegister status.
+__inline
+ULONG __stdcall
+_mcgen_PASTE2(_mcgen_RegisterForContext_microsoft_windows_mimalloc_, MCGEN_EVENTREGISTER)(
+    _In_ LPCGUID pProviderId,
+    _Out_ McGenContext_microsoft_windows_mimalloc* pContext)
+{
+    RtlZeroMemory(pContext, sizeof(*pContext)); // clears both the trace context and the enable bits
+    pContext->Context.Logger = (ULONG_PTR)ETW_MI_Provider_Traits;
+    pContext->Context.EnableBitsCount = 1;
+    pContext->Context.EnableBitMask = pContext->EnableBits; // per-context bits rather than the global microsoft_windows_mimallocEnableBits
+    pContext->Context.EnableKeyWords = microsoft_windows_mimallocKeywords; // keyword and level tables are the shared constants
+    pContext->Context.EnableLevel = microsoft_windows_mimallocLevels;
+    return McGenEventRegister(
+        pProviderId,
+        McGenControlCallbackV2,
+        &pContext->Context, // passed as the callback context
+        &pContext->Context.RegistrationHandle);
+}
+
+// This function is for use by MC-generated code and should not be used directly.
+// Identity function: returns pContext unchanged; its typed parameter makes the _ForContext macros fail to compile on a wrong argument type.
+FORCEINLINE
+_Ret_ McGenContext_microsoft_windows_mimalloc*
+_mcgen_CheckContextType_microsoft_windows_mimalloc(_In_ McGenContext_microsoft_windows_mimalloc* pContext)
+{
+    return pContext;
+}
+
+#endif // MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION
+
+//
+// Enablement check macro for event "ETW_MI_ALLOC"
+//
+#define EventEnabledETW_MI_ALLOC() _mcgen_EVENT_BIT_SET(microsoft_windows_mimallocEnableBits, 0)
+#define EventEnabledETW_MI_ALLOC_ForContext(pContext) _mcgen_EVENT_BIT_SET(_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->EnableBits, 0)
+
+//
+// Event write macros for event "ETW_MI_ALLOC"
+//
+#define EventWriteETW_MI_ALLOC(Address, Size) \
+        MCGEN_EVENT_ENABLED(ETW_MI_ALLOC) \
+        ? _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&ETW_MI_Provider_Context, &ETW_MI_ALLOC, Address, Size) : 0
+#define EventWriteETW_MI_ALLOC_AssumeEnabled(Address, Size) \
+        _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&ETW_MI_Provider_Context, &ETW_MI_ALLOC, Address, Size)
+#define EventWriteETW_MI_ALLOC_ForContext(pContext, Address, Size) \
+        MCGEN_EVENT_ENABLED_FORCONTEXT(pContext, ETW_MI_ALLOC) \
+        ? _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&(pContext)->Context, &ETW_MI_ALLOC, Address, Size) : 0
+#define EventWriteETW_MI_ALLOC_ForContextAssumeEnabled(pContext, Address, Size) \
+        _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->Context, &ETW_MI_ALLOC, Address, Size)
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC _mcgen_PASTE2(McTemplateU0xx_, MCGEN_EVENTWRITETRANSFER)
+
+//
+// Enablement check macro for event "ETW_MI_FREE"
+//
+#define EventEnabledETW_MI_FREE() _mcgen_EVENT_BIT_SET(microsoft_windows_mimallocEnableBits, 0)
+#define EventEnabledETW_MI_FREE_ForContext(pContext) _mcgen_EVENT_BIT_SET(_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->EnableBits, 0)
+
+//
+// Event write macros for event "ETW_MI_FREE"
+//
+#define EventWriteETW_MI_FREE(Address, Size) \
+        MCGEN_EVENT_ENABLED(ETW_MI_FREE) \
+        ? _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&ETW_MI_Provider_Context, &ETW_MI_FREE, Address, Size) : 0
+#define EventWriteETW_MI_FREE_AssumeEnabled(Address, Size) \
+        _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&ETW_MI_Provider_Context, &ETW_MI_FREE, Address, Size)
+#define EventWriteETW_MI_FREE_ForContext(pContext, Address, Size) \
+        MCGEN_EVENT_ENABLED_FORCONTEXT(pContext, ETW_MI_FREE) \
+        ? _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&(pContext)->Context, &ETW_MI_FREE, Address, Size) : 0
+#define EventWriteETW_MI_FREE_ForContextAssumeEnabled(pContext, Address, Size) \
+        _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->Context, &ETW_MI_FREE, Address, Size)
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define _mcgen_TEMPLATE_FOR_ETW_MI_FREE _mcgen_PASTE2(McTemplateU0xx_, MCGEN_EVENTWRITETRANSFER)
+
+#endif // MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//
+// MCGEN_DISABLE_PROVIDER_CODE_GENERATION macro:
+// Define this macro to have the compiler skip the generated functions in this
+// header.
+//
+#ifndef MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//
+// Template Functions
+//
+
+//
+// Function for template "ETW_CUSTOM_HEAP_ALLOC_DATA" (and possibly others).
+// This function is for use by MC-generated code and should not be used directly.
+// Packs two 64-bit arguments into event data descriptors and writes the event through McGenEventWrite.
+#ifndef McTemplateU0xx_def
+#define McTemplateU0xx_def
+ETW_INLINE
+ULONG
+_mcgen_PASTE2(McTemplateU0xx_, MCGEN_EVENTWRITETRANSFER)(
+    _In_ PMCGEN_TRACE_CONTEXT Context,
+    _In_ PCEVENT_DESCRIPTOR Descriptor,
+    _In_ const unsigned __int64  _Arg0, // "Address" in the EventWriteETW_MI_* macros
+    _In_ const unsigned __int64  _Arg1 // "Size" in the EventWriteETW_MI_* macros
+    )
+{
+#define McTemplateU0xx_ARGCOUNT 2
+    // One extra descriptor: entry 0 is left for McGenEventWrite, which fills it with the provider traits.
+    EVENT_DATA_DESCRIPTOR EventData[McTemplateU0xx_ARGCOUNT + 1];
+    // Entry 1 describes the first argument.
+    EventDataDescCreate(&EventData[1],&_Arg0, sizeof(const unsigned __int64)  );
+    // Entry 2 describes the second argument.
+    EventDataDescCreate(&EventData[2],&_Arg1, sizeof(const unsigned __int64)  );
+    // No activity id is supplied (NULL); the status of the write is returned.
+    return McGenEventWrite(Context, Descriptor, NULL, McTemplateU0xx_ARGCOUNT + 1, EventData);
+}
+#endif // McTemplateU0xx_def
+
+#endif // MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/ext/src/mimalloc/src/prim/windows/etw.man b/ext/src/mimalloc/src/prim/windows/etw.man
new file mode 100644
index 0000000000..cfd1f8a9ea
Binary files /dev/null and b/ext/src/mimalloc/src/prim/windows/etw.man differ
diff --git a/ext/src/mimalloc/src/prim/windows/prim.c b/ext/src/mimalloc/src/prim/windows/prim.c
new file mode 100644
index 0000000000..27d0a38ed3
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/windows/prim.c
@@ -0,0 +1,921 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// This file is included in `src/prim/prim.c`
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"
+#include <stdio.h>   // fputs, stderr
+
+// xbox has no console IO
+#if !defined(WINAPI_FAMILY_PARTITION) || WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM)
+#define MI_HAS_CONSOLE_IO
+#endif
+
+//---------------------------------------------
+// Dynamically bind Windows API points for portability
+//---------------------------------------------
+
+#if defined(_MSC_VER)
+#pragma warning(disable:4996)   // don't use GetVersionExW
+#endif
+
+static DWORD win_major_version = 6;
+static DWORD win_minor_version = 0;
+
+// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
+// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
+// NtAllocateVirtualMemoryEx is used for huge OS page allocation (1GiB)
+// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's.
+typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E {  // selector stored in MI_MEM_EXTENDED_PARAMETER.Type.Type
+  MiMemExtendedParameterInvalidType = 0,
+  MiMemExtendedParameterAddressRequirements,     // used below for aligned allocation through VirtualAlloc2
+  MiMemExtendedParameterNumaNode,
+  MiMemExtendedParameterPartitionHandle,
+  MiMemExtendedParameterUserPhysicalHandle,
+  MiMemExtendedParameterAttributeFlags,
+  MiMemExtendedParameterMax
+} MI_MEM_EXTENDED_PARAMETER_TYPE;
+
+typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S {  // locally declared parameter record taken by the PVirtualAlloc2 / PNtAllocateVirtualMemoryEx signatures
+  struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type;  // Type.Type holds an MI_MEM_EXTENDED_PARAMETER_TYPE value
+  union  { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg;  // payload; Pointer is used for address requirements
+} MI_MEM_EXTENDED_PARAMETER;
+
+typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {  // payload referenced by an address-requirements extended parameter
+  PVOID  LowestStartingAddress;   // left 0 by win_virtual_alloc_prim_once
+  PVOID  HighestEndingAddress;    // left 0 by win_virtual_alloc_prim_once
+  SIZE_T Alignment;               // set to the requested alignment by win_virtual_alloc_prim_once
+} MI_MEM_ADDRESS_REQUIREMENTS;
+
+#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE   0x00000010
+
+#include <winternl.h>
+typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+typedef LONG  (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);  // avoid NTSTATUS as it is not defined on xbox (pr #1084)
+static PVirtualAlloc2 pVirtualAlloc2 = NULL;  // bound in _mi_prim_mem_init from kernelbase.dll ("VirtualAlloc2FromApp", else "VirtualAlloc2")
+static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;  // bound in _mi_prim_mem_init from ntdll.dll
+
+// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7  (and GetNumaNodeProcessorMask is not supported on xbox)
+typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
+
+typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
+typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber);
+typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
+typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber);
+typedef BOOL (__stdcall* PGetNumaNodeProcessorMask)(UCHAR Node, PULONGLONG ProcessorMask);
+typedef BOOL (__stdcall* PGetNumaHighestNodeNumber)(PULONG Node);
+static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;  // this and the pointers below are bound in _mi_prim_mem_init from kernel32.dll
+static PGetNumaProcessorNodeEx      pGetNumaProcessorNodeEx = NULL;
+static PGetNumaNodeProcessorMaskEx  pGetNumaNodeProcessorMaskEx = NULL;
+static PGetNumaProcessorNode        pGetNumaProcessorNode = NULL;
+static PGetNumaNodeProcessorMask    pGetNumaNodeProcessorMask = NULL;
+static PGetNumaHighestNodeNumber    pGetNumaHighestNodeNumber = NULL;
+
+// Not available on xbox
+typedef SIZE_T(__stdcall* PGetLargePageMinimum)(VOID);
+static PGetLargePageMinimum pGetLargePageMinimum = NULL;  // stays NULL when kernel32.dll does not export it; checked by win_enable_large_os_pages
+
+// Available after Windows XP (both are looked up and called only inside _mi_prim_mem_init)
+typedef BOOL (__stdcall *PGetPhysicallyInstalledSystemMemory)( PULONGLONG TotalMemoryInKilobytes );
+typedef BOOL (__stdcall* PGetVersionExW)(LPOSVERSIONINFOW lpVersionInformation);
+
+
+//---------------------------------------------
+// Enable large page support dynamically (if possible)
+//---------------------------------------------
+
+// Enable large OS page support by acquiring "SeLockMemoryPrivilege" for this process; only the first call does the work
+// (later calls just report `_mi_os_large_page_size() > 0`). On success stores the minimum large page size in `*large_page_size`.
+static bool win_enable_large_os_pages(size_t* large_page_size)
+{
+  static bool large_initialized = false;
+  if (large_initialized) return (_mi_os_large_page_size() > 0);
+  large_initialized = true;
+  if (pGetLargePageMinimum==NULL) return false;  // no large page support (xbox etc.)
+
+  // Needs the "Lock pages in memory" permission (group policy editor), see <https://devblogs.microsoft.com/oldnewthing/20110128-00/?p=11643>
+  unsigned long err = 0;
+  HANDLE token = NULL;
+  BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
+  if (ok) {
+    TOKEN_PRIVILEGES tp;
+    ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid);
+    if (ok) {
+      tp.PrivilegeCount = 1;
+      tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+      ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
+      if (ok) {
+        err = GetLastError();  // the call may succeed without assigning the privilege; the last-error code tells
+        ok = (err == ERROR_SUCCESS);
+        if (ok && large_page_size != NULL && pGetLargePageMinimum != NULL) {
+          *large_page_size = (*pGetLargePageMinimum)();
+        }
+      }
+    }
+    if (!ok && err == 0) { err = GetLastError(); }  // read the failure code before CloseHandle can overwrite it
+    CloseHandle(token);
+  }
+  if (!ok) {
+    if (err == 0) err = GetLastError();  // OpenProcessToken itself failed (or no code was captured above)
+    _mi_warning_message("cannot enable large OS page support, error %lu\n", err);
+  }
+  return (ok!=0);
+}
+
+
+//---------------------------------------------
+// Initialize
+//---------------------------------------------
+
+static DWORD win_allocation_granularity = 64*MI_KiB;  // default; replaced in _mi_prim_mem_init by the value GetSystemInfo reports
+// Fill in `config` from the running system and dynamically bind the optional Windows APIs used in this file.
+void _mi_prim_mem_init( mi_os_mem_config_t* config )
+{
+  config->has_overcommit = false;
+  config->has_partial_free = false;
+  config->has_virtual_reserve = true;
+
+  // get the page size
+  SYSTEM_INFO si; _mi_memzero_var(si);
+  GetSystemInfo(&si);
+  if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; }
+  if (si.dwAllocationGranularity > 0) {
+    config->alloc_granularity = si.dwAllocationGranularity;
+    win_allocation_granularity = si.dwAllocationGranularity;
+  }
+  // get virtual address bits
+  if ((uintptr_t)si.lpMaximumApplicationAddress > 0) {
+    const size_t vbits = MI_SIZE_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress);  // presumably mi_clz counts leading zero bits -- TODO confirm
+    config->virtual_address_bits = vbits;
+  }
+
+  // get the VirtualAlloc2 function
+  HINSTANCE hDll = LoadLibrary(TEXT("kernelbase.dll"));
+  if (hDll != NULL) {
+    // use VirtualAlloc2FromApp if possible as it is available to Windows store apps
+    pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp");
+    if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2");
+    FreeLibrary(hDll);  // NOTE(review): the pointer is used after this call; presumably the system DLL stays loaded -- confirm
+  }
+  // NtAllocateVirtualMemoryEx is used for huge page allocation
+  hDll = LoadLibrary(TEXT("ntdll.dll"));
+  if (hDll != NULL) {
+    pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
+    FreeLibrary(hDll);
+  }
+  // Try to use Win7+ numa API
+  hDll = LoadLibrary(TEXT("kernel32.dll"));
+  if (hDll != NULL) {
+    pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx");
+    pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
+    pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
+    pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode");
+    pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMask");
+    pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)(void (*)(void))GetProcAddress(hDll, "GetNumaHighestNodeNumber");
+    pGetLargePageMinimum = (PGetLargePageMinimum)(void (*)(void))GetProcAddress(hDll, "GetLargePageMinimum");
+    // Get physical memory (not available on XP, so check dynamically)
+    PGetPhysicallyInstalledSystemMemory pGetPhysicallyInstalledSystemMemory = (PGetPhysicallyInstalledSystemMemory)(void (*)(void))GetProcAddress(hDll,"GetPhysicallyInstalledSystemMemory");
+    if (pGetPhysicallyInstalledSystemMemory != NULL) {
+      ULONGLONG memInKiB = 0;
+      if ((*pGetPhysicallyInstalledSystemMemory)(&memInKiB)) {
+        if (memInKiB > 0 && memInKiB <= SIZE_MAX) {  // only store a value that fits in size_t
+          config->physical_memory_in_kib = (size_t)memInKiB;
+        }
+      }
+    }
+    // Get Windows version
+    PGetVersionExW pGetVersionExW = (PGetVersionExW)(void (*)(void))GetProcAddress(hDll, "GetVersionExW");
+    if (pGetVersionExW != NULL) {
+      OSVERSIONINFOW version; _mi_memzero_var(version);
+      version.dwOSVersionInfoSize = sizeof(version);
+      if ((*pGetVersionExW)(&version)) {
+        win_major_version = version.dwMajorVersion;  // otherwise the static defaults (6.0) remain
+        win_minor_version = version.dwMinorVersion;
+      }
+    }
+    FreeLibrary(hDll);
+  }
+  // Enable large/huge OS page support?
+  if (mi_option_is_enabled(mi_option_allow_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
+    win_enable_large_os_pages(&config->large_page_size);  // the boolean result is not used here
+  }
+}
+
+
+//---------------------------------------------
+// Free
+//---------------------------------------------
+
+int _mi_prim_free(void* addr, size_t size ) {
+  MI_UNUSED(size);
+  DWORD errcode = 0;
+  bool err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
+  if (err) { errcode = GetLastError(); }
+  if (errcode == ERROR_INVALID_ADDRESS) {
+    // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside
+    // the memory region returned by VirtualAlloc; in that case we need to free using
+    // the start of the region.
+    MEMORY_BASIC_INFORMATION info; _mi_memzero_var(info);
+    VirtualQuery(addr, &info, sizeof(info));
+    if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)(4*MI_MiB)) {
+      errcode = 0;
+      err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
+      if (err) { errcode = GetLastError(); }
+    }
+  }
+  return (int)errcode;
+}
+
+
+//---------------------------------------------
+// VirtualAlloc
+//---------------------------------------------
+// One allocation attempt: a hinted address first (64-bit only), then VirtualAlloc2 for large alignments, then plain VirtualAlloc. Returns NULL on failure.
+static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_alignment, DWORD flags) {
+  #if (MI_INTPTR_SIZE >= 8)
+  // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
+  if (addr == NULL) {
+    void* hint = _mi_os_get_aligned_hint(try_alignment,size);
+    if (hint != NULL) {
+      void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
+      if (p != NULL) return p;
+      _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags);
+      // fall through on error
+    }
+  }
+  #endif
+  // on modern Windows try use VirtualAlloc2 for aligned allocation (only without a fixed address and when the alignment exceeds the allocation granularity)
+  if (addr == NULL && try_alignment > win_allocation_granularity && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
+    MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
+    reqs.Alignment = try_alignment;
+    MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
+    param.Type.Type = MiMemExtendedParameterAddressRequirements;
+    param.Arg.Pointer = &reqs;
+    void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);  // a single extended parameter carrying the alignment
+    if (p != NULL) return p;
+    _mi_warning_message("unable to allocate aligned OS memory (0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
+    // fall through on error
+  }
+  // last resort: no alignment is requested here, so the result may not satisfy `try_alignment`
+  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+}
+
+static bool win_is_out_of_memory_error(DWORD err) {
+  switch (err) {
+    case ERROR_COMMITMENT_MINIMUM:
+    case ERROR_COMMITMENT_LIMIT:
+    case ERROR_PAGEFILE_QUOTA:
+    case ERROR_NOT_ENOUGH_MEMORY:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Allocate through win_virtual_alloc_prim_once; when committing regular pages fails with an out-of-memory
+// error, wait and retry (at most 10 attempts, bounded by the `mi_option_retry_on_oom` budget). See issue #894.
+static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignment, DWORD flags) {
+  long budget_msecs = mi_option_get_clamp(mi_option_retry_on_oom, 0, 2000);  // at most 2 seconds
+  if (budget_msecs == 1) { budget_msecs = 100; }  // if one sets the option to "true"
+
+  // retrying is only considered for committed, non-large-page requests with an alignment of at most 8MiB
+  const bool retryable = (try_alignment <= 8*MI_MiB) && (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0;
+
+  long attempt = 1;
+  while (attempt <= 10) {
+    void* const p = win_virtual_alloc_prim_once(addr, size, try_alignment, flags);
+    if (p != NULL) { return p; }  // success, return the address
+
+    // stop when the budget is used up, the request is not retryable, or the failure is not out-of-memory
+    if (budget_msecs <= 0 || !retryable || !win_is_out_of_memory_error(GetLastError())) { break; }
+
+    _mi_warning_message("out-of-memory on OS allocation, try again... (attempt %lu, 0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", attempt, size, GetLastError(), addr, try_alignment, flags);
+    long pause_msecs = attempt*40;  // increasing waits
+    if (pause_msecs > budget_msecs) { pause_msecs = budget_msecs; }
+    budget_msecs -= pause_msecs;
+    Sleep(pause_msecs);
+    attempt++;
+  }
+  // either a non-retryable failure or all attempts were used
+  return NULL;
+}
+
+// Allocate OS memory, trying large OS pages first when required or allowed; `*is_large` is set to true for the large-page attempt and to `(flags&MEM_LARGE_PAGES) != 0` for the regular fallback.
+static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
+  mi_assert_internal(!(large_only && !allow_large));
+  static _Atomic(size_t) large_page_try_ok; // = 0;  (number of upcoming allocations that skip the large-page attempt)
+  void* p = NULL;
+  // Try to allocate large OS pages (2MiB) if allowed or required.
+  if ((large_only || (_mi_os_canuse_large_page(size, try_alignment) && mi_option_is_enabled(mi_option_allow_large_os_pages)))
+      && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0)
+  {
+    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+    if (!large_only && try_ok > 0) {
+      // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
+      // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
+      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);  // best-effort decrement: the result of the CAS is not checked
+    }
+    else {
+      // large OS pages must always reserve and commit.
+      *is_large = true;
+      p = win_virtual_alloc_prim(addr, size, try_alignment, flags | MEM_LARGE_PAGES);
+      if (large_only) return p;  // no fallback when only large pages are acceptable (p may be NULL)
+      // fall back to non-large page allocation on error (`p == NULL`).
+      if (p == NULL) {
+        mi_atomic_store_release(&large_page_try_ok,10UL);  // on error, don't try again for the next N allocations
+      }
+    }
+  }
+  // Fall back to regular page allocation
+  if (p == NULL) {
+    *is_large = ((flags&MEM_LARGE_PAGES) != 0);
+    p = win_virtual_alloc_prim(addr, size, try_alignment, flags);
+  }
+  return p;
+}
+
+int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  mi_assert_internal(commit || !allow_large);
+  mi_assert_internal(try_alignment > 0);
+  const int alloc_flags = (commit ? (MEM_RESERVE | MEM_COMMIT) : MEM_RESERVE);  // always reserve; commit only on request
+  *is_zero = true;  // the result is always reported as zero-initialized
+  void* const p = win_virtual_alloc(hint_addr, size, try_alignment, alloc_flags, false, allow_large, is_large);
+  *addr = p;
+  return (p == NULL ? (int)GetLastError() : 0);  // 0 on success, otherwise the last Win32 error code
+}
+
+
+//---------------------------------------------
+// Commit/Reset/Protect
+//---------------------------------------------
+#ifdef _MSC_VER
+#pragma warning(disable:6250)   // suppress warning calling VirtualFree without MEM_RELEASE (for decommit)
+#endif
+
+int _mi_prim_commit(void* addr, size_t size, bool* is_zero) {
+  *is_zero = false;  // conservative: never claim the range is zero (the check that could tell is disabled below)
+  /*
+  // zero'ing only happens on an initial commit... but checking upfront seems expensive..
+  _MEMORY_BASIC_INFORMATION meminfo; _mi_memzero_var(meminfo);
+  if (VirtualQuery(addr, &meminfo, size) > 0) {
+    if ((meminfo.State & MEM_COMMIT) == 0) {
+      *is_zero = true;
+    }
+  }
+  */
+  // commit the range with read/write access
+  void* p = VirtualAlloc(addr, size, MEM_COMMIT, PAGE_READWRITE);
+  if (p == NULL) return (int)GetLastError();  // failure: return the Win32 error code
+  return 0;  // success
+}
+
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {
+  *needs_recommit = true;  // be conservative: report the range as decommitted even if the call below fails
+  const BOOL decommitted = VirtualFree(addr, size, MEM_DECOMMIT);
+  return (decommitted ? 0 : (int)GetLastError());
+}
+
+int _mi_prim_reset(void* addr, size_t size) {
+  void* p = VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);  // MEM_RESET: the contents of the range are no longer of interest
+  mi_assert_internal(p == addr);
+  #if 0
+  if (p != NULL) {
+    VirtualUnlock(addr,size); // VirtualUnlock after MEM_RESET removes the memory directly from the working set
+  }
+  #endif
+  return (p != NULL ? 0 : (int)GetLastError());  // 0 on success, otherwise the last Win32 error code
+}
+
+int _mi_prim_reuse(void* addr, size_t size) {
+  MI_UNUSED(addr); MI_UNUSED(size);  // no work is done for this primitive in this Windows implementation
+  return 0;  // always reports success
+}
+
+int _mi_prim_protect(void* addr, size_t size, bool protect) {
+  const DWORD wanted = (protect ? PAGE_NOACCESS : PAGE_READWRITE);  // protect: no access at all; unprotect: read/write
+  DWORD previous = 0;  // receives the old protection; the value is not used afterwards
+  return (VirtualProtect(addr, size, wanted, &previous) ? 0 : (int)GetLastError());
+}
+
+
+//---------------------------------------------
+// Huge page allocation
+//---------------------------------------------
+
+static void* _mi_prim_alloc_huge_os_pagesx(void* hint_addr, size_t size, int numa_node)
+{
+  const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;  // every path below reserves and commits large pages
+
+  win_enable_large_os_pages(NULL);
+
+  MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} };  // zeroed parameter slots; at most two are filled in below
+  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
+  static bool mi_huge_pages_available = true;  // cleared after the first failed attempt (plain static, no synchronization)
+  if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
+    params[0].Type.Type = MiMemExtendedParameterAttributeFlags;
+    params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
+    ULONG param_count = 1;
+    if (numa_node >= 0) {  // a negative node adds no NUMA parameter
+      param_count++;
+      params[1].Type.Type = MiMemExtendedParameterNumaNode;
+      params[1].Arg.ULong = (unsigned)numa_node;
+    }
+    SIZE_T psize = size;
+    void* base = hint_addr;
+    LONG err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
+    if (err == 0 && base != NULL) {
+      return base;  // huge page allocation succeeded
+    }
+    else {
+      // fall back to regular large pages
+      mi_huge_pages_available = false; // don't try further huge pages
+      _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
+    }
+  }
+  // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
+  if (pVirtualAlloc2 != NULL && numa_node >= 0) {
+    params[0].Type.Type = MiMemExtendedParameterNumaNode;  // slot 0 is overwritten here; only this one parameter is passed
+    params[0].Arg.ULong = (unsigned)numa_node;
+    return (*pVirtualAlloc2)(GetCurrentProcess(), hint_addr, size, flags, PAGE_READWRITE, params, 1);
+  }
+
+  // otherwise use regular virtual alloc on older windows
+  return VirtualAlloc(hint_addr, size, flags, PAGE_READWRITE);
+}
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
+  *is_zero = true;  // the result is always reported as zero-initialized
+  *addr = _mi_prim_alloc_huge_os_pagesx(hint_addr,size,numa_node);
+  return (*addr != NULL ? 0 : (int)GetLastError());  // 0 on success, otherwise the last Win32 error code
+}
+
+
+//---------------------------------------------
+// Numa nodes
+//---------------------------------------------
+
+size_t _mi_prim_numa_node(void) {
+  USHORT numa_node = 0;  // stays 0 when no API is available or the query fails
+  if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) {
+    // Extended API is supported
+    MI_PROCESSOR_NUMBER pnum;
+    (*pGetCurrentProcessorNumberEx)(&pnum);
+    USHORT nnode = 0;
+    BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode);
+    if (ok) { numa_node = nnode; }  // only accept the node on success
+  }
+  else if (pGetNumaProcessorNode != NULL) {
+    // Vista or earlier, use older API that is limited to 64 processors. Issue #277
+    DWORD pnum = GetCurrentProcessorNumber();
+    UCHAR nnode = 0;
+    BOOL ok = pGetNumaProcessorNode((UCHAR)pnum, &nnode);  // the processor number is narrowed to a UCHAR for this API
+    if (ok) { numa_node = nnode; }
+  }
+  return numa_node;
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  ULONG numa_max = 0;  // highest node number; stays 0 when the API is unavailable
+  if (pGetNumaHighestNodeNumber!=NULL) {
+    (*pGetNumaHighestNodeNumber)(&numa_max);
+  }
+  // find the highest node number that has actual processors assigned to it. Issue #282
+  while (numa_max > 0) {
+    if (pGetNumaNodeProcessorMaskEx != NULL) {
+      // Extended API is supported
+      GROUP_AFFINITY affinity;  // only read when the call below succeeds
+      if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) {
+        if (affinity.Mask != 0) break;  // found the maximum non-empty node
+      }
+    }
+    else {
+      // Vista or earlier, use older API that is limited to 64 processors.
+      ULONGLONG mask;  // only read when the call below succeeds
+      if (pGetNumaNodeProcessorMask != NULL) {
+        if ((*pGetNumaNodeProcessorMask)((UCHAR)numa_max, &mask)) {
+          if (mask != 0) break; // found the maximum non-empty node
+        }
+      };
+    }
+    // max node was invalid or had no processor assigned, try again
+    numa_max--;
+  }
+  return ((size_t)numa_max + 1);  // the count is the highest usable node number plus one, so at least 1
+}
+
+
+//----------------------------------------------------------------
+// Clock
+//----------------------------------------------------------------
+
+static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
+  static LARGE_INTEGER mfreq; // = 0; cached number of counter ticks per millisecond
+  if (mfreq.QuadPart == 0LL) {  // not computed yet
+    LARGE_INTEGER f;
+    QueryPerformanceFrequency(&f);
+    mfreq.QuadPart = f.QuadPart/1000LL;  // ticks per second -> ticks per millisecond
+    if (mfreq.QuadPart == 0) mfreq.QuadPart = 1;  // guard the division below against zero
+  }
+  return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart);
+}
+
+mi_msecs_t _mi_prim_clock_now(void) {
+  LARGE_INTEGER t;
+  QueryPerformanceCounter(&t);  // raw performance counter value, in counter ticks
+  return mi_to_msecs(t);        // converted to milliseconds
+}
+
+
+//----------------------------------------------------------------
+// Process Info
+//----------------------------------------------------------------
+
+#include <psapi.h>
+
+static mi_msecs_t filetime_msecs(const FILETIME* ftime) {
+  // combine the two 32-bit halves into one 64-bit count of 100ns ticks
+  ULARGE_INTEGER ticks;
+  ticks.HighPart = ftime->dwHighDateTime;
+  ticks.LowPart = ftime->dwLowDateTime;
+  return (mi_msecs_t)(ticks.QuadPart / 10000);  // 10000 ticks of 100ns make one millisecond
+}
+
+typedef BOOL (WINAPI *PGetProcessMemoryInfo)(HANDLE, PPROCESS_MEMORY_COUNTERS, DWORD);
+static PGetProcessMemoryInfo pGetProcessMemoryInfo = NULL;
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)
+{ // Fill in the user/kernel times and the memory counters assigned below; no other field of `pinfo` is touched.
+  FILETIME ct = { 0, 0 };  // creation time (unused); all four start at zero so a failing `GetProcessTimes` yields 0
+  FILETIME ut = { 0, 0 };  // user time
+  FILETIME st = { 0, 0 };  // kernel time
+  FILETIME et = { 0, 0 };  // exit time (unused)
+  GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
+  pinfo->utime = filetime_msecs(&ut);
+  pinfo->stime = filetime_msecs(&st);
+
+  // load psapi on demand (tried again on a later call if the lookup failed)
+  if (pGetProcessMemoryInfo == NULL) {
+    HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll"));
+    if (hDll != NULL) {
+      pGetProcessMemoryInfo = (PGetProcessMemoryInfo)(void (*)(void))GetProcAddress(hDll, "GetProcessMemoryInfo");
+    }
+  }
+
+  // get process info; `info` is zeroed first so every counter reads 0 if it cannot be queried
+  PROCESS_MEMORY_COUNTERS info; _mi_memzero_var(info);
+  if (pGetProcessMemoryInfo != NULL) {
+    pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
+  }
+  pinfo->current_rss    = (size_t)info.WorkingSetSize;
+  pinfo->peak_rss       = (size_t)info.PeakWorkingSetSize;
+  pinfo->current_commit = (size_t)info.PagefileUsage;
+  pinfo->peak_commit    = (size_t)info.PeakPagefileUsage;
+  pinfo->page_faults    = (size_t)info.PageFaultCount;
+}
+
+//----------------------------------------------------------------
+// Output
+//----------------------------------------------------------------
+
+void _mi_prim_out_stderr( const char* msg )
+{
+  // on windows with redirection, the C runtime cannot handle locale dependent output
+  // after the main thread closes so we use direct console output.
+  if (!_mi_preloading()) {  // nothing is written while still preloading
+    // _cputs(msg);  // _cputs cannot be used as it aborts when failing to lock the console
+    static HANDLE hcon = INVALID_HANDLE_VALUE;  // stderr handle, looked up on first use and then reused
+    static bool hconIsConsole = false;          // only ever set when MI_HAS_CONSOLE_IO is defined
+    if (hcon == INVALID_HANDLE_VALUE) {
+      hcon = GetStdHandle(STD_ERROR_HANDLE);
+      #ifdef MI_HAS_CONSOLE_IO
+      CONSOLE_SCREEN_BUFFER_INFO sbi;
+      hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi));
+      #endif
+    }
+    const size_t len = _mi_strlen(msg);
+    if (len > 0 && len < UINT32_MAX) {  // empty messages are skipped; the length is cast to DWORD below
+      DWORD written = 0;  // receives the number of bytes written; not inspected afterwards
+      if (hconIsConsole) {
+        #ifdef MI_HAS_CONSOLE_IO
+        WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL);
+        #endif
+      }
+      else if (hcon != INVALID_HANDLE_VALUE) {
+        // use direct write if stderr was redirected
+        WriteFile(hcon, msg, (DWORD)len, &written, NULL);
+      }
+      else {
+        // finally fall back to fputs after all
+        fputs(msg, stderr);
+      }
+    }
+  }
+}
+
+
+//----------------------------------------------------------------
+// Environment
+//----------------------------------------------------------------
+
+// Look up environment variable `name` with GetEnvironmentVariable instead of getenv so this works reliably
+// even before the C runtime is initialized, i.e. when `_mi_preloading() == true`. Returns true only when a
+// non-empty value fit into `result`. Note: on windows, environment names are not case sensitive.
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
+  if (result == NULL || result_size == 0) return false;  // no room for even the terminator: do not write out of bounds
+  result[0] = 0;
+  size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size);
+  return (len > 0 && len < result_size);  // a length >= result_size means the buffer was too small
+}
+
+
+//----------------------------------------------------------------
+// Random
+//----------------------------------------------------------------
+
+#if defined(MI_USE_RTLGENRANDOM) // || defined(__cplusplus)
+// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using
+// dynamic overriding, we observed it can raise an exception when compiled with C++, and
+// sometimes deadlocks when also running under the VS debugger.
+// In contrast, issue #623 implies that on Windows Server 2019 we need to use BCryptGenRandom.
+// To be continued..
+#pragma comment (lib,"advapi32.lib")
+#define RtlGenRandom  SystemFunction036
+mi_decl_externc BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength);
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  return (RtlGenRandom(buf, (ULONG)buf_len) != 0);  // true when RtlGenRandom returns non-zero; buf_len is narrowed to ULONG
+}
+
+#else
+
+#ifndef BCRYPT_USE_SYSTEM_PREFERRED_RNG
+#define BCRYPT_USE_SYSTEM_PREFERRED_RNG 0x00000002
+#endif
+
+typedef LONG (NTAPI *PBCryptGenRandom)(HANDLE, PUCHAR, ULONG, ULONG);
+static  PBCryptGenRandom pBCryptGenRandom = NULL;  // resolved lazily from bcrypt.dll on first use
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  if (pBCryptGenRandom == NULL) {
+    HINSTANCE hDll = LoadLibrary(TEXT("bcrypt.dll"));
+    if (hDll != NULL) {
+      pBCryptGenRandom = (PBCryptGenRandom)(void (*)(void))GetProcAddress(hDll, "BCryptGenRandom");
+    }
+    if (pBCryptGenRandom == NULL) return false;  // library or symbol unavailable: report failure to the caller
+  }
+  return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);  // a non-negative status counts as success
+}
+
+#endif  // MI_USE_RTLGENRANDOM
+
+
+//----------------------------------------------------------------
+// Thread pool?
+//----------------------------------------------------------------
+
+bool _mi_prim_thread_is_in_threadpool(void) {
+#if (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64)
+  if (win_major_version >= 6) {
+    // check if this thread belongs to a windows threadpool
+    // see: <https://www.geoffchappell.com/studies/windows/km/ntoskrnl/inc/api/pebteb/teb/index.htm>
+    struct _TEB* const teb = NtCurrentTeb();
+    void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778)));  // pointer read at a hard-coded TEB offset (32-bit : otherwise)
+    return (pool_data != NULL);  // a non-NULL pointer is taken to mean a threadpool thread
+  }
+#endif
+  return false;  // other architectures or major version < 6: always answer "no"
+}
+
+
+//----------------------------------------------------------------
+// Process & Thread Init/Done
+//----------------------------------------------------------------
+
+#if MI_WIN_USE_FIXED_TLS==1
+mi_decl_cache_align size_t _mi_win_tls_offset = 0;
+#endif
+
+//static void mi_debug_out(const char* s) {
+//  HANDLE h = GetStdHandle(STD_ERROR_HANDLE);
+//  WriteConsole(h, s, (DWORD)_mi_strlen(s), NULL, NULL);
+//}
+
+static void mi_win_tls_init(DWORD reason) {
+  if (reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) {  // detach notifications need no TLS setup
+    #if MI_WIN_USE_FIXED_TLS==1  // we must allocate a TLS slot dynamically
+    if (_mi_win_tls_offset == 0 && reason == DLL_PROCESS_ATTACH) {
+      const DWORD tls_slot = TlsAlloc();  // usually returns slot 1
+      if (tls_slot == TLS_OUT_OF_INDEXES) {
+        _mi_error_message(EFAULT, "unable to allocate the a TLS slot (rebuild without MI_WIN_USE_FIXED_TLS?)\n");
+      }
+      _mi_win_tls_offset = (size_t)tls_slot * sizeof(void*);  // NOTE(review): also stored after a failed TlsAlloc if the error call returns -- confirm
+    }
+    #endif
+    #if MI_HAS_TLS_SLOT >= 2  // we must initialize the TLS slot before any allocation
+    if (_mi_theap_default() == NULL) {
+      _mi_theap_default_set((mi_theap_t*)&_mi_theap_empty);  // start out with the empty theap as the default
+      #if MI_DEBUG && MI_WIN_USE_FIXED_TLS==1
+      void* const p = TlsGetValue((DWORD)(_mi_win_tls_offset / sizeof(void*)));  // debug only: read the slot back through the API
+      mi_assert_internal(p == (void*)&_mi_theap_empty);
+      #endif
+    }
+    #endif
+  }
+}
+
+static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
+  MI_UNUSED(module);
+  MI_UNUSED(reserved);
+  mi_win_tls_init(reason);  // TLS setup always runs first, whatever the notification is
+  switch (reason) {
+    case DLL_PROCESS_ATTACH: _mi_auto_process_init(); break;
+    case DLL_PROCESS_DETACH: _mi_auto_process_done(); break;
+    case DLL_THREAD_DETACH:
+      // end the thread here only when malloc is not redirected
+      if (!_mi_is_redirected()) { _mi_thread_done(NULL); }
+      break;
+    default: break;  // e.g. DLL_THREAD_ATTACH: nothing beyond the TLS setup above
+  }
+}
+
+
+#if defined(MI_SHARED_LIB)
+  #define MI_PRIM_HAS_PROCESS_ATTACH  1
+
+  // Windows DLL: easy to hook into process_init and thread_done
+  BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
+    mi_win_main((PVOID)inst,reason,reserved);  // forward every attach/detach notification unchanged
+    return TRUE;  // always report success
+  }
+
+  // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event.
+  void _mi_prim_thread_init_auto_done(void) { }  // intentionally empty in this configuration
+  void _mi_prim_thread_done_auto_done(void) { }  // intentionally empty in this configuration
+  void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) {
+    MI_UNUSED(theap);  // no per-thread registration is needed in this configuration
+  }
+
+#elif !defined(MI_WIN_USE_FLS)
+  #define MI_PRIM_HAS_PROCESS_ATTACH  1
+
+  static void NTAPI mi_win_main_attach(PVOID module, DWORD reason, LPVOID reserved) {
+    if (reason == DLL_PROCESS_ATTACH || reason == DLL_THREAD_ATTACH) {  // this callback forwards attach events only
+      mi_win_main(module, reason, reserved);
+    }
+  }
+  static void NTAPI mi_win_main_detach(PVOID module, DWORD reason, LPVOID reserved) {
+    if (reason == DLL_PROCESS_DETACH || reason == DLL_THREAD_DETACH) {  // this callback forwards detach events only
+      mi_win_main(module, reason, reserved);
+    }
+  }
+
+  // Set up TLS callbacks in a statically linked library by using special data sections.
+  // See <https://stackoverflow.com/questions/14538159/tls-callback-in-windows>
+  // We use 2 entries to ensure we call attach events before constructors
+  // are called, and detach events after destructors are called.
+  #if defined(__cplusplus)
+  extern "C" {
+  #endif
+
+  #if defined(_WIN64)
+    #pragma comment(linker, "/INCLUDE:_tls_used")
+    #pragma comment(linker, "/INCLUDE:_mi_tls_callback_pre")
+    #pragma comment(linker, "/INCLUDE:_mi_tls_callback_post")
+    #pragma const_seg(".CRT$XLB")
+    extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[];
+    const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach };
+    #pragma const_seg()
+    #pragma const_seg(".CRT$XLY")
+    extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[];
+    const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach };
+    #pragma const_seg()
+  #else
+    #pragma comment(linker, "/INCLUDE:__tls_used")
+    #pragma comment(linker, "/INCLUDE:__mi_tls_callback_pre")
+    #pragma comment(linker, "/INCLUDE:__mi_tls_callback_post")
+    #pragma data_seg(".CRT$XLB")
+    PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach };
+    #pragma data_seg()
+    #pragma data_seg(".CRT$XLY")
+    PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach };
+    #pragma data_seg()
+  #endif
+
+  #if defined(__cplusplus)
+  }
+  #endif
+
+  // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event.
+  void _mi_prim_thread_init_auto_done(void) { }  // intentionally empty in this configuration
+  void _mi_prim_thread_done_auto_done(void) { }  // intentionally empty in this configuration
+  void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) {
+    MI_UNUSED(theap);  // no per-thread registration is needed in this configuration
+  }
+
+#else // deprecated: statically linked, use fiber api
+
+  #if defined(_MSC_VER) // on clang/gcc use the constructor attribute (in `src/prim/prim.c`)
+    // MSVC: use data section magic for static libraries
+    // See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
+    #define MI_PRIM_HAS_PROCESS_ATTACH 1
+
+    static int mi_process_attach(void) {
+      mi_win_main(NULL,DLL_PROCESS_ATTACH,NULL);  // simulate the process-attach notification
+      atexit(&_mi_auto_process_done);             // and arrange for the matching process-done call at exit
+      return 0;
+    }
+    typedef int(*mi_crt_callback_t)(void);
+    #if defined(_WIN64)
+      #pragma comment(linker, "/INCLUDE:_mi_tls_callback")
+      #pragma section(".CRT$XIU", long, read)
+    #else
+      #pragma comment(linker, "/INCLUDE:__mi_tls_callback")
+    #endif
+    #pragma data_seg(".CRT$XIU")
+    mi_decl_externc mi_crt_callback_t _mi_tls_callback[] = { &mi_process_attach };
+    #pragma data_seg()
+  #endif
+
+  // use the fiber api for calling `_mi_thread_done`.
+  #include <fibersapi.h>
+  #if (_WIN32_WINNT < 0x600)  // before Windows Vista
+  WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
+  WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex );
+  WINBASEAPI BOOL  WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData );
+  WINBASEAPI BOOL  WINAPI FlsFree(_In_ DWORD dwFlsIndex);
+  #endif
+
+  static DWORD mi_fls_key = (DWORD)(-1);  // (DWORD)(-1) marks "no FLS key allocated yet"
+
+  static void NTAPI mi_fls_done(PVOID value) {
+    mi_theap_t* theap = (mi_theap_t*)value;  // the value stored by `_mi_prim_thread_associate_default_theap`
+    if (theap != NULL) {
+      _mi_thread_done(theap);
+      FlsSetValue(mi_fls_key, NULL);  // prevent recursion as _mi_thread_done may set it back to the main theap, issue #672
+    }
+  }
+
+  void _mi_prim_thread_init_auto_done(void) {
+    mi_fls_key = FlsAlloc(&mi_fls_done);  // register `mi_fls_done` as the callback for this key
+  }
+
+  void _mi_prim_thread_done_auto_done(void) {
+    // call thread-done on all threads (except the main thread) to prevent
+    // dangling callback pointer if statically linked with a DLL; Issue #208
+    FlsFree(mi_fls_key);
+  }
+
+  void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) {
+    mi_assert_internal(mi_fls_key != (DWORD)(-1));  // the key must have been allocated before
+    FlsSetValue(mi_fls_key, theap);  // this is the value later handed to `mi_fls_done`
+  }
+#endif
+
+// ----------------------------------------------------
+// Communicate with the redirection module on Windows
+// ----------------------------------------------------
+#if defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT)
+  #define MI_PRIM_HAS_ALLOCATOR_INIT 1
+
+  static bool mi_redirected = false;   // true if malloc redirects to mi_malloc
+
+  bool _mi_is_redirected(void) {
+    return mi_redirected;
+  }
+
+  #ifdef __cplusplus
+  extern "C" {
+  #endif
+  mi_decl_export void _mi_redirect_entry(DWORD reason) {
+    // called on redirection; careful as this may be called before DllMain
+    mi_win_tls_init(reason);  // so the TLS slot is set up here as well
+    if (reason == DLL_PROCESS_ATTACH) {
+      mi_redirected = true;   // from now on `_mi_is_redirected()` returns true
+    }
+    else if (reason == DLL_PROCESS_DETACH) {
+      mi_redirected = false;
+    }
+    else if (reason == DLL_THREAD_DETACH) {
+      _mi_thread_done(NULL);  // `mi_win_main` skips this call while redirected
+    }
+  }
+  __declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message);
+  __declspec(dllimport) void mi_cdecl mi_allocator_done(void);
+  #ifdef __cplusplus
+  }
+  #endif
+  bool _mi_allocator_init(const char** message) {
+    return mi_allocator_init(message);  // thin wrapper around the imported (dllimport) function
+  }
+  void _mi_allocator_done(void) {
+    mi_allocator_done();  // thin wrapper around the imported (dllimport) function
+  }
+#endif
+
diff --git a/ext/src/mimalloc/src/prim/windows/readme.md b/ext/src/mimalloc/src/prim/windows/readme.md
new file mode 100644
index 0000000000..217c3d174d
--- /dev/null
+++ b/ext/src/mimalloc/src/prim/windows/readme.md
@@ -0,0 +1,17 @@
+## Primitives:
+
+- `prim.c` contains Windows primitives for OS allocation.
+
+## Event Tracing for Windows (ETW)
+
+- `etw.h` is generated from `etw.man`, which contains the manifest for mimalloc events
+  (100 is for an allocation, 101 is for a free).
+
+- `etw-mimalloc.wprp` is a profile for the Windows Performance Recorder (WPR).
+  In an admin prompt, you can use:
+  ```
+  > wpr -start src\prim\windows\etw-mimalloc.wprp -filemode
+  > <my mimalloc program>
+  > wpr -stop test.etl
+  ``` 
+  and then open `test.etl` in the Windows Performance Analyzer (WPA).
\ No newline at end of file
diff --git a/ext/src/mimalloc/src/random.c b/ext/src/mimalloc/src/random.c
index d474a53a04..990e4894f3 100644
--- a/ext/src/mimalloc/src/random.c
+++ b/ext/src/mimalloc/src/random.c
@@ -4,14 +4,9 @@ This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
-#ifndef _DEFAULT_SOURCE
-#define _DEFAULT_SOURCE   // for syscall() on Linux
-#endif
-
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-
-#include <string.h> // memset
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"    // _mi_prim_random_buf
 
 /* ----------------------------------------------------------------------------
 We use our own PRNG to keep predictable performance of random number generation
@@ -37,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil
 (gcc x64 has no register spills, and clang 6+ uses SSE instructions)
 -----------------------------------------------------------------------------*/
 
-static inline uint32_t rotl(uint32_t x, uint32_t shift) {
-  return (x << shift) | (x >> (32 - shift));
-}
-
 static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16);  // add, xor, rotate left by 16
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12);  // add, xor, rotate left by 12
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8);   // add, xor, rotate left by 8
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7);   // add, xor, rotate left by 7
 }
 
 static void chacha_block(mi_random_ctx_t* ctx)
@@ -103,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
   // since we only use chacha for randomness (and not encryption) we
   // do not _need_ to read 32-bit values as little endian but we do anyways
   // just for being compatible :-)
-  memset(ctx, 0, sizeof(*ctx));
+  _mi_memzero(ctx, sizeof(*ctx));
   for (size_t i = 0; i < 4; i++) {
     const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
     ctx->input[i] = read32(sigma,i);
@@ -118,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
 }
 
 static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
-  memset(ctx_new, 0, sizeof(*ctx_new));
+  _mi_memzero(ctx_new, sizeof(*ctx_new));
   _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
   ctx_new->input[12] = 0;
   ctx_new->input[13] = 0;
@@ -147,177 +138,72 @@ void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* ctx_new) {
 
 uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
   mi_assert_internal(mi_random_is_initialized(ctx));
-  #if MI_INTPTR_SIZE <= 4
-    return chacha_next32(ctx);
-  #elif MI_INTPTR_SIZE == 8
-    return (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx));
-  #else
-  # error "define mi_random_next for this platform"
-  #endif
+  uintptr_t r;  // candidate value; redrawn below until it is non-zero
+  do {
+    #if MI_INTPTR_SIZE <= 4
+    r = chacha_next32(ctx);
+    #elif MI_INTPTR_SIZE == 8
+    r = (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx));
+    #else
+    # error "define mi_random_next for this platform"
+    #endif
+  } while (r==0);  // never hand out 0: draw again
+  return r;        // always non-zero
 }
 
 
 /* ----------------------------------------------------------------------------
-To initialize a fresh random context we rely on the OS:
-- Windows     : BCryptGenRandom (or RtlGenRandom)
-- macOS       : CCRandomGenerateBytes, arc4random_buf
-- bsd,wasi    : arc4random_buf
-- Linux       : getrandom,/dev/urandom
+To initialize a fresh random context.
 If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR.
 -----------------------------------------------------------------------------*/
 
-#if defined(_WIN32)
-
-#if defined(MI_USE_RTLGENRANDOM) || defined(__cplusplus)
-// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using 
-// dynamic overriding, we observed it can raise an exception when compiled with C++, and 
-// sometimes deadlocks when also running under the VS debugger.
-#pragma comment (lib,"advapi32.lib")
-#define RtlGenRandom  SystemFunction036
-#ifdef __cplusplus
-extern "C" {
-#endif
-BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength);
-#ifdef __cplusplus
-}
-#endif
-static bool os_random_buf(void* buf, size_t buf_len) {
-  return (RtlGenRandom(buf, (ULONG)buf_len) != 0);
-}
-#else
-#pragma comment (lib,"bcrypt.lib")
-#include <bcrypt.h>
-static bool os_random_buf(void* buf, size_t buf_len) {
-  return (BCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);
-}
-#endif
-
-#elif defined(__APPLE__)
-#include <AvailabilityMacros.h>
-#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10
-#include <CommonCrypto/CommonCryptoError.h>
-#include <CommonCrypto/CommonRandom.h>
-#endif
-static bool os_random_buf(void* buf, size_t buf_len) {
-  #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
-    // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
-    // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>      
-    return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
-  #else
-    // fall back on older macOS
-    arc4random_buf(buf, buf_len);
-    return true;
-  #endif
-}
-
-#elif defined(__ANDROID__) || defined(__DragonFly__) || \
-      defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
-      defined(__sun) // todo: what to use with __wasi__?
-#include <stdlib.h>
-static bool os_random_buf(void* buf, size_t buf_len) {
-  arc4random_buf(buf, buf_len);
-  return true;
-}
-#elif defined(__linux__) || defined(__HAIKU__)
-#if defined(__linux__)
-#include <sys/syscall.h>
-#endif
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-static bool os_random_buf(void* buf, size_t buf_len) {
-  // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h`
-  // and for the latter the actual `getrandom` call is not always defined.
-  // (see <https://stackoverflow.com/questions/45237324/why-doesnt-getrandom-compile>)
-  // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed.
-#ifdef SYS_getrandom
-  #ifndef GRND_NONBLOCK
-  #define GRND_NONBLOCK (1)
-  #endif
-  static _Atomic(uintptr_t) no_getrandom; // = 0
-  if (mi_atomic_load_acquire(&no_getrandom)==0) {
-    ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
-    if (ret >= 0) return (buf_len == (size_t)ret);
-    if (errno != ENOSYS) return false;
-    mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom
-  }
-#endif
-  int flags = O_RDONLY;
-  #if defined(O_CLOEXEC)
-  flags |= O_CLOEXEC;
-  #endif
-  int fd = open("/dev/urandom", flags, 0);
-  if (fd < 0) return false;
-  size_t count = 0;
-  while(count < buf_len) {
-    ssize_t ret = read(fd, (char*)buf + count, buf_len - count);
-    if (ret<=0) {
-      if (errno!=EAGAIN && errno!=EINTR) break;
-    }
-    else {
-      count += ret;
-    }
-  }
-  close(fd);
-  return (count==buf_len);
-}
-#else
-static bool os_random_buf(void* buf, size_t buf_len) {
-  return false;
-}
-#endif
-
-#if defined(_WIN32)
-#include <windows.h>
-#elif defined(__APPLE__)
-#include <mach/mach_time.h>
-#else
-#include <time.h>
-#endif
-
 uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
   uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
-  
-  #if defined(_WIN32)
-    LARGE_INTEGER pcount;
-    QueryPerformanceCounter(&pcount);
-    x ^= (uintptr_t)(pcount.QuadPart);
-  #elif defined(__APPLE__)
-    x ^= (uintptr_t)mach_absolute_time();
-  #else
-    struct timespec time;
-    clock_gettime(CLOCK_MONOTONIC, &time);
-    x ^= (uintptr_t)time.tv_sec;
-    x ^= (uintptr_t)time.tv_nsec;
-  #endif
+  x ^= _mi_prim_clock_now();  // mix in the current clock value from the platform primitive
   // and do a few randomization steps
   uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
-  for (uintptr_t i = 0; i < max; i++) {
+  for (uintptr_t i = 0; i < max || x==0; i++, x++) {  // 1..16 rounds, and keep going while x is 0; x++ perturbs x between rounds
     x = _mi_random_shuffle(x);
   }
   mi_assert_internal(x != 0);
   return x;
 }
 
-void _mi_random_init(mi_random_ctx_t* ctx) {
+// Seed `ctx` from the OS random source, or from the weak timer/ASLR source when `use_weak` is set or the OS source fails.
+static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) {
   uint8_t key[32];
-  if (!os_random_buf(key, sizeof(key))) {
+  const bool weak = (use_weak || !_mi_prim_random_buf(key, sizeof(key)));  // true when the key comes from the weak source
+  if (weak) {
     // if we fail to get random data from the OS, we fall back to a
     // weak random source based on the current time
     #if !defined(__wasi__)
-    _mi_warning_message("unable to use secure randomness\n");
+    if (!use_weak) { _mi_warning_message("unable to use secure randomness\n"); }
     #endif
     uintptr_t x = _mi_os_random_weak(0);
-    for (size_t i = 0; i < 8; i++) {  // key is eight 32-bit words.
+    for (size_t i = 0; i < 8; i++, x++) {  // key is eight 32-bit words.
       x = _mi_random_shuffle(x);
       ((uint32_t*)key)[i] = (uint32_t)x;
     }
   }
   chacha_init(ctx, key, (uintptr_t)ctx /*nonce*/ );
+  // record the key quality only now: `chacha_init` starts by zeroing the whole context, `weak` included
+  ctx->weak = weak;
 }
 
+void _mi_random_init(mi_random_ctx_t* ctx) {
+  mi_random_init_ex(ctx, false);  // try the OS random source first; the weak source is only the fallback
+}
+
+void _mi_random_init_weak(mi_random_ctx_t * ctx) {
+  mi_random_init_ex(ctx, true);   // skip the OS random source and seed from the weak source directly
+}
+
+void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx) {
+  if (ctx->weak) {                // only contexts flagged as weak are re-seeded
+    _mi_random_init(ctx);
+  }
+}
+
 /* --------------------------------------------------------
 test vectors from <https://tools.ietf.org/html/rfc8439>
 ----------------------------------------------------------- */
diff --git a/ext/src/mimalloc/src/region.c b/ext/src/mimalloc/src/region.c
deleted file mode 100644
index 72ce84947d..0000000000
--- a/ext/src/mimalloc/src/region.c
+++ /dev/null
@@ -1,505 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2019-2020, Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-
-/* ----------------------------------------------------------------------------
-This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
-and the segment and huge object allocation by mimalloc. There may be multiple
-implementations of this (one could be the identity going directly to the OS,
-another could be a simple cache etc), but the current one uses large "regions".
-In contrast to the rest of mimalloc, the "regions" are shared between threads and
-need to be accessed using atomic operations.
-We need this memory layer between the raw OS calls because of:
-1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
-   to reuse memory effectively.
-2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
-   an OS allocation/free is still (much) too expensive relative to the accesses 
-   in that object :-( (`malloc-large` tests this). This means we need a cheaper 
-   way to reuse memory.
-3. This layer allows for NUMA aware allocation.
-
-Possible issues:
-- (2) can potentially be addressed too with a small cache per thread which is much
-  simpler. Generally though that requires shrinking of huge pages, and may overuse
-  memory per thread. (and is not compatible with `sbrk`).
-- Since the current regions are per-process, we need atomic operations to
-  claim blocks which may be contended
-- In the worst case, we need to search the whole region map (16KiB for 256GiB)
-  linearly. At what point will direct OS calls be faster? Is there a way to
-  do this better without adding too much complexity?
------------------------------------------------------------------------------*/
-#include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
-
-#include <string.h>  // memset
-
-#include "bitmap.h"
-
-// Internal raw OS interface
-size_t  _mi_os_large_page_size(void);
-bool    _mi_os_protect(void* addr, size_t size);
-bool    _mi_os_unprotect(void* addr, size_t size);
-bool    _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-bool    _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
-bool    _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
-bool    _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-
-// arena.c
-void    _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_stats_t* stats);
-void*   _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
-void*   _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
-
-
-
-// Constants
-#if (MI_INTPTR_SIZE==8)
-#define MI_HEAP_REGION_MAX_SIZE    (256 * MI_GiB)  // 64KiB for the region map 
-#elif (MI_INTPTR_SIZE==4)
-#define MI_HEAP_REGION_MAX_SIZE    (3 * MI_GiB)    // ~ KiB for the region map
-#else
-#error "define the maximum heap space allowed for regions on this platform"
-#endif
-
-#define MI_SEGMENT_ALIGN          MI_SEGMENT_SIZE
-
-#define MI_REGION_MAX_BLOCKS      MI_BITMAP_FIELD_BITS
-#define MI_REGION_SIZE            (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS)    // 256MiB  (64MiB on 32 bits)
-#define MI_REGION_MAX             (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)  // 1024  (48 on 32 bits)
-#define MI_REGION_MAX_OBJ_BLOCKS  (MI_REGION_MAX_BLOCKS/4)                    // 64MiB
-#define MI_REGION_MAX_OBJ_SIZE    (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE)  
-
-// Region info 
-typedef union mi_region_info_u {
-  size_t value;      
-  struct {
-    bool  valid;        // initialized?
-    bool  is_large:1;   // allocated in fixed large/huge OS pages
-    bool  is_pinned:1;  // pinned memory cannot be decommitted
-    short numa_node;    // the associated NUMA node (where -1 means no associated node)
-  } x;
-} mi_region_info_t;
-
-
-// A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with
-// a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block.
-typedef struct mem_region_s {
-  _Atomic(size_t)           info;        // mi_region_info_t.value
-  _Atomic(void*)            start;       // start of the memory area 
-  mi_bitmap_field_t         in_use;      // bit per in-use block
-  mi_bitmap_field_t         dirty;       // track if non-zero per block
-  mi_bitmap_field_t         commit;      // track if committed per block
-  mi_bitmap_field_t         reset;       // track if reset per block
-  _Atomic(size_t)           arena_memid; // if allocated from a (huge page) arena
-  _Atomic(size_t)           padding;     // round to 8 fields (needs to be atomic for msvc, see issue #508)
-} mem_region_t;
-
-// The region map
-static mem_region_t regions[MI_REGION_MAX];
-
-// Allocated regions
-static _Atomic(size_t) regions_count; // = 0;        
-
-
-/* ----------------------------------------------------------------------------
-Utility functions
------------------------------------------------------------------------------*/
-
-// Blocks (of 4MiB) needed for the given size.
-static size_t mi_region_block_count(size_t size) {
-  return _mi_divide_up(size, MI_SEGMENT_SIZE);
-}
-
-/*
-// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones.
-static size_t mi_good_commit_size(size_t size) {
-  if (size > (SIZE_MAX - _mi_os_large_page_size())) return size;
-  return _mi_align_up(size, _mi_os_large_page_size());
-}
-*/
-
-// Return if a pointer points into a region reserved by us.
-mi_decl_nodiscard bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
-  if (p==NULL) return false;
-  size_t count = mi_atomic_load_relaxed(&regions_count);
-  for (size_t i = 0; i < count; i++) {
-    uint8_t* start = (uint8_t*)mi_atomic_load_ptr_relaxed(uint8_t, &regions[i].start);
-    if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
-  }
-  return false;
-}
-
-
-static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) {
-  uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t, &((mem_region_t*)region)->start);
-  mi_assert_internal(start != NULL);
-  return (start + (bit_idx * MI_SEGMENT_SIZE));  
-}
-
-static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) {
-  mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS);
-  size_t idx = region - regions;
-  mi_assert_internal(&regions[idx] == region);
-  return (idx*MI_BITMAP_FIELD_BITS + bit_idx)<<1;
-}
-
-static size_t mi_memid_create_from_arena(size_t arena_memid) {
-  return (arena_memid << 1) | 1;
-}
-
-
-static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) {
-  if ((id&1)==1) {
-    if (arena_memid != NULL) *arena_memid = (id>>1);
-    return true;
-  }
-  else {
-    size_t idx = (id >> 1) / MI_BITMAP_FIELD_BITS;
-    *bit_idx   = (mi_bitmap_index_t)(id>>1) % MI_BITMAP_FIELD_BITS;
-    *region    = &regions[idx];
-    return false;
-  }
-}
-
-
-/* ----------------------------------------------------------------------------
-  Allocate a region is allocated from the OS (or an arena)
------------------------------------------------------------------------------*/
-
-static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
-{
-  // not out of regions yet?
-  if (mi_atomic_load_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;
-
-  // try to allocate a fresh region from the OS
-  bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit));
-  bool region_large = (commit && allow_large);
-  bool is_zero = false;
-  bool is_pinned = false;
-  size_t arena_memid = 0;
-  void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_pinned, &is_zero, &arena_memid, tld);
-  if (start == NULL) return false;
-  mi_assert_internal(!(region_large && !allow_large));
-  mi_assert_internal(!region_large || region_commit);
-
-  // claim a fresh slot
-  const size_t idx = mi_atomic_increment_acq_rel(&regions_count);
-  if (idx >= MI_REGION_MAX) {
-    mi_atomic_decrement_acq_rel(&regions_count);
-    _mi_arena_free(start, MI_REGION_SIZE, arena_memid, region_commit, tld->stats);
-    _mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, MI_GiB));
-    return false;
-  }
-
-  // allocated, initialize and claim the initial blocks
-  mem_region_t* r = &regions[idx];
-  r->arena_memid  = arena_memid;
-  mi_atomic_store_release(&r->in_use, (size_t)0);
-  mi_atomic_store_release(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
-  mi_atomic_store_release(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
-  mi_atomic_store_release(&r->reset, (size_t)0);
-  *bit_idx = 0;
-  _mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
-  mi_atomic_store_ptr_release(void,&r->start, start);
-
-  // and share it 
-  mi_region_info_t info;
-  info.value = 0;                        // initialize the full union to zero
-  info.x.valid = true;
-  info.x.is_large = region_large;
-  info.x.is_pinned = is_pinned;
-  info.x.numa_node = (short)_mi_os_numa_node(tld);
-  mi_atomic_store_release(&r->info, info.value); // now make it available to others
-  *region = r;
-  return true;
-}
-
-/* ----------------------------------------------------------------------------
-  Try to claim blocks in suitable regions
------------------------------------------------------------------------------*/
-
-static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) {
-  // initialized at all?
-  mi_region_info_t info;
-  info.value = mi_atomic_load_relaxed(&((mem_region_t*)region)->info);
-  if (info.value==0) return false;
-
-  // numa correct
-  if (numa_node >= 0) {  // use negative numa node to always succeed
-    int rnode = info.x.numa_node;
-    if (rnode >= 0 && rnode != numa_node) return false;
-  }
-
-  // check allow-large
-  if (!allow_large && info.x.is_large) return false;
-
-  return true;
-}
-
-
-static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
-{
-  // try all regions for a free slot  
-  const size_t count = mi_atomic_load_relaxed(&regions_count); // monotonic, so ok to be relaxed
-  size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? Starting at 0 seems to increase latency though
-  for (size_t visited = 0; visited < count; visited++, idx++) {
-    if (idx >= count) idx = 0;  // wrap around
-    mem_region_t* r = &regions[idx];
-    // if this region suits our demand (numa node matches, large OS page matches)
-    if (mi_region_is_suitable(r, numa_node, allow_large)) {
-      // then try to atomically claim a segment(s) in this region
-      if (_mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) {
-        tld->region_idx = idx;    // remember the last found position
-        *region = r;
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-
-static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
-{
-  mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS);
-  mem_region_t* region;
-  mi_bitmap_index_t bit_idx;
-  const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld));
-  // try to claim in existing regions
-  if (!mi_region_try_claim(numa_node, blocks, *large, &region, &bit_idx, tld)) {
-    // otherwise try to allocate a fresh region and claim in there
-    if (!mi_region_try_alloc_os(blocks, *commit, *large, &region, &bit_idx, tld)) {
-      // out of regions or memory
-      return NULL;
-    }
-  }
-  
-  // ------------------------------------------------
-  // found a region and claimed `blocks` at `bit_idx`, initialize them now
-  mi_assert_internal(region != NULL);
-  mi_assert_internal(_mi_bitmap_is_claimed(&region->in_use, 1, blocks, bit_idx));
-
-  mi_region_info_t info;
-  info.value = mi_atomic_load_acquire(&region->info);
-  uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&region->start);
-  mi_assert_internal(!(info.x.is_large && !*large));
-  mi_assert_internal(start != NULL);
-
-  *is_zero   = _mi_bitmap_claim(&region->dirty, 1, blocks, bit_idx, NULL);  
-  *large     = info.x.is_large;
-  *is_pinned = info.x.is_pinned;
-  *memid     = mi_memid_create(region, bit_idx);
-  void* p = start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
-
-  // commit
-  if (*commit) {
-    // ensure commit
-    bool any_uncommitted;
-    _mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted);
-    if (any_uncommitted) {
-      mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
-      bool commit_zero = false;
-      if (!_mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld)) {
-        // failed to commit! unclaim and return
-        mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
-        return NULL;
-      }
-      if (commit_zero) *is_zero = true;      
-    }
-  }
-  else {
-    // no need to commit, but check if already fully committed
-    *commit = _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx);
-  }  
-  mi_assert_internal(!*commit || _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx));
-
-  // unreset reset blocks
-  if (_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) {
-    // some blocks are still reset
-    mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
-    mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit || mi_option_get(mi_option_eager_commit_delay) > 0); 
-    mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx);
-    if (*commit || !mi_option_is_enabled(mi_option_reset_decommits)) { // only if needed
-      bool reset_zero = false;
-      _mi_mem_unreset(p, blocks * MI_SEGMENT_SIZE, &reset_zero, tld);
-      if (reset_zero) *is_zero = true;
-    }
-  }
-  mi_assert_internal(!_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx));
-  
-  #if (MI_DEBUG>=2)
-  if (*commit) { ((uint8_t*)p)[0] = 0; }
-  #endif
-  
-  // and return the allocation  
-  mi_assert_internal(p != NULL);  
-  return p;
-}
-
-
-/* ----------------------------------------------------------------------------
- Allocation
------------------------------------------------------------------------------*/
-
-// Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`.
-// (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`)
-void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
-{
-  mi_assert_internal(memid != NULL && tld != NULL);
-  mi_assert_internal(size > 0);
-  *memid = 0;
-  *is_zero = false;
-  *is_pinned = false;
-  bool default_large = false;
-  if (large==NULL) large = &default_large;  // ensure `large != NULL`  
-  if (size == 0) return NULL;
-  size = _mi_align_up(size, _mi_os_page_size());
-
-  // allocate from regions if possible
-  void* p = NULL;
-  size_t arena_memid;
-  const size_t blocks = mi_region_block_count(size);
-  if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN) {
-    p = mi_region_try_alloc(blocks, commit, large, is_pinned, is_zero, memid, tld);    
-    if (p == NULL) {
-      _mi_warning_message("unable to allocate from region: size %zu\n", size);
-    }
-  }
-  if (p == NULL) {
-    // and otherwise fall back to the OS
-    p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_pinned, is_zero, &arena_memid, tld);
-    *memid = mi_memid_create_from_arena(arena_memid);
-  }
-
-  if (p != NULL) {
-    mi_assert_internal((uintptr_t)p % alignment == 0);
-#if (MI_DEBUG>=2)
-    if (*commit) { ((uint8_t*)p)[0] = 0; } // ensure the memory is committed
-#endif
-  }
-  return p;
-}
-
-
-
-/* ----------------------------------------------------------------------------
-Free
------------------------------------------------------------------------------*/
-
-// Free previously allocated memory with a given id.
-void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) {
-  mi_assert_internal(size > 0 && tld != NULL);
-  if (p==NULL) return;
-  if (size==0) return;
-  size = _mi_align_up(size, _mi_os_page_size());
-  
-  size_t arena_memid = 0;
-  mi_bitmap_index_t bit_idx;
-  mem_region_t* region;
-  if (mi_memid_is_arena(id,&region,&bit_idx,&arena_memid)) {
-   // was a direct arena allocation, pass through
-    _mi_arena_free(p, size, arena_memid, full_commit, tld->stats);
-  }
-  else {
-    // allocated in a region
-    mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return;
-    const size_t blocks = mi_region_block_count(size);
-    mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS);
-    mi_region_info_t info;
-    info.value = mi_atomic_load_acquire(&region->info);
-    mi_assert_internal(info.value != 0);
-    void* blocks_start = mi_region_blocks_start(region, bit_idx);
-    mi_assert_internal(blocks_start == p); // not a pointer in our area?
-    mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS);
-    if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`?
-
-    // committed?
-    if (full_commit && (size % MI_SEGMENT_SIZE) == 0) {
-      _mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, NULL);
-    }
-
-    if (any_reset) {
-      // set the is_reset bits if any pages were reset
-      _mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, NULL);
-    }
-
-    // reset the blocks to reduce the working set.
-    if (!info.x.is_large && !info.x.is_pinned && mi_option_is_enabled(mi_option_segment_reset) 
-       && (mi_option_is_enabled(mi_option_eager_commit) ||
-           mi_option_is_enabled(mi_option_reset_decommits))) // cannot reset halfway committed segments, use only `option_page_reset` instead            
-    {
-      bool any_unreset;
-      _mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, &any_unreset);
-      if (any_unreset) {
-        _mi_abandoned_await_readers(); // ensure no more pending write (in case reset = decommit)
-        _mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld);
-      }
-    }    
-
-    // and unclaim
-    bool all_unclaimed = mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
-    mi_assert_internal(all_unclaimed); MI_UNUSED(all_unclaimed);
-  }
-}
-
-
-/* ----------------------------------------------------------------------------
-  collection
------------------------------------------------------------------------------*/
-void _mi_mem_collect(mi_os_tld_t* tld) {
-  // free every region that has no segments in use.
-  size_t rcount = mi_atomic_load_relaxed(&regions_count);
-  for (size_t i = 0; i < rcount; i++) {
-    mem_region_t* region = &regions[i];
-    if (mi_atomic_load_relaxed(&region->info) != 0) {
-      // if no segments used, try to claim the whole region
-      size_t m = mi_atomic_load_relaxed(&region->in_use);
-      while (m == 0 && !mi_atomic_cas_weak_release(&region->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ };
-      if (m == 0) {
-        // on success, free the whole region
-        uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&regions[i].start);
-        size_t arena_memid = mi_atomic_load_relaxed(&regions[i].arena_memid);
-        size_t commit = mi_atomic_load_relaxed(&regions[i].commit);
-        memset((void*)&regions[i], 0, sizeof(mem_region_t));  // cast to void* to avoid atomic warning
-        // and release the whole region
-        mi_atomic_store_release(&region->info, (size_t)0);
-        if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) {         
-          _mi_abandoned_await_readers(); // ensure no pending reads
-          _mi_arena_free(start, MI_REGION_SIZE, arena_memid, (~commit == 0), tld->stats);
-        }
-      }
-    }
-  }
-}
-
-
-/* ----------------------------------------------------------------------------
-  Other
------------------------------------------------------------------------------*/
-
-bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) {
-  return _mi_os_reset(p, size, tld->stats);
-}
-
-bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
-  return _mi_os_unreset(p, size, is_zero, tld->stats);
-}
-
-bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
-  return _mi_os_commit(p, size, is_zero, tld->stats);
-}
-
-bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) {
-  return _mi_os_decommit(p, size, tld->stats);
-}
-
-bool _mi_mem_protect(void* p, size_t size) {
-  return _mi_os_protect(p, size);
-}
-
-bool _mi_mem_unprotect(void* p, size_t size) {
-  return _mi_os_unprotect(p, size);
-}
diff --git a/ext/src/mimalloc/src/segment-cache.c b/ext/src/mimalloc/src/segment-cache.c
deleted file mode 100644
index aacdbc11d6..0000000000
--- a/ext/src/mimalloc/src/segment-cache.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2020, Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-
-/* ----------------------------------------------------------------------------
-  Implements a cache of segments to avoid expensive OS calls and to reuse
-  the commit_mask to optimize the commit/decommit calls.
-  The full memory map of all segments is also implemented here.
------------------------------------------------------------------------------*/
-#include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
-
-#include "bitmap.h"  // atomic bitmap
-
-//#define MI_CACHE_DISABLE 1    // define to completely disable the segment cache
-
-#define MI_CACHE_FIELDS     (16)
-#define MI_CACHE_MAX        (MI_BITMAP_FIELD_BITS*MI_CACHE_FIELDS)       // 1024 on 64-bit
-
-#define BITS_SET()          MI_ATOMIC_VAR_INIT(UINTPTR_MAX)
-#define MI_CACHE_BITS_SET   MI_INIT16(BITS_SET)                          // note: update if MI_CACHE_FIELDS changes
-
-typedef struct mi_cache_slot_s {
-  void*               p;
-  size_t              memid;
-  bool                is_pinned;
-  mi_commit_mask_t    commit_mask;
-  mi_commit_mask_t    decommit_mask;
-  _Atomic(mi_msecs_t) expire;
-} mi_cache_slot_t;
-
-static mi_decl_cache_align mi_cache_slot_t cache[MI_CACHE_MAX];    // = 0
-
-static mi_decl_cache_align mi_bitmap_field_t cache_available[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET };        // zero bit = available!
-static mi_decl_cache_align mi_bitmap_field_t cache_available_large[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET };
-static mi_decl_cache_align mi_bitmap_field_t cache_inuse[MI_CACHE_FIELDS];   // zero bit = free
-
-
-mi_decl_noinline void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
-{
-#ifdef MI_CACHE_DISABLE
-  return NULL;
-#else
-
-  // only segment blocks
-  if (size != MI_SEGMENT_SIZE) return NULL;
-
-  // numa node determines start field
-  const int numa_node = _mi_os_numa_node(tld);
-  size_t start_field = 0;
-  if (numa_node > 0) {
-    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
-    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
-  }
-
-  // find an available slot
-  mi_bitmap_index_t bitidx = 0;
-  bool claimed = false;
-  if (*large) {  // large allowed?
-    claimed = _mi_bitmap_try_find_from_claim(cache_available_large, MI_CACHE_FIELDS, start_field, 1, &bitidx);
-    if (claimed) *large = true;
-  }
-  if (!claimed) {
-    claimed = _mi_bitmap_try_find_from_claim(cache_available, MI_CACHE_FIELDS, start_field, 1, &bitidx);
-    if (claimed) *large = false;
-  }
-
-  if (!claimed) return NULL;
-
-  // found a slot
-  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
-  void* p = slot->p;
-  *memid = slot->memid;
-  *is_pinned = slot->is_pinned;
-  *is_zero = false;
-  *commit_mask = slot->commit_mask;     
-  *decommit_mask = slot->decommit_mask;
-  slot->p = NULL;
-  mi_atomic_storei64_release(&slot->expire,(mi_msecs_t)0);
-  
-  // mark the slot as free again
-  mi_assert_internal(_mi_bitmap_is_claimed(cache_inuse, MI_CACHE_FIELDS, 1, bitidx));
-  _mi_bitmap_unclaim(cache_inuse, MI_CACHE_FIELDS, 1, bitidx);
-  return p;
-#endif
-}
-
-static mi_decl_noinline void mi_commit_mask_decommit(mi_commit_mask_t* cmask, void* p, size_t total, mi_stats_t* stats)
-{
-  if (mi_commit_mask_is_empty(cmask)) {
-    // nothing
-  }
-  else if (mi_commit_mask_is_full(cmask)) {
-    _mi_os_decommit(p, total, stats);
-  }
-  else {
-    // todo: one call to decommit the whole at once?
-    mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
-    size_t part = total/MI_COMMIT_MASK_BITS;
-    size_t idx;
-    size_t count;    
-    mi_commit_mask_foreach(cmask, idx, count) {
-      void*  start = (uint8_t*)p + (idx*part);
-      size_t size = count*part;
-      _mi_os_decommit(start, size, stats);
-    }
-    mi_commit_mask_foreach_end()
-  }
-  mi_commit_mask_create_empty(cmask);
-}
-
-#define MI_MAX_PURGE_PER_PUSH  (4)
-
-static mi_decl_noinline void mi_segment_cache_purge(bool force, mi_os_tld_t* tld)
-{
-  MI_UNUSED(tld);
-  if (!mi_option_is_enabled(mi_option_allow_decommit)) return;
-  mi_msecs_t now = _mi_clock_now();
-  size_t purged = 0;
-  const size_t max_visits = (force ? MI_CACHE_MAX /* visit all */ : MI_CACHE_FIELDS /* probe at most N (=16) slots */);
-  size_t idx              = (force ? 0 : _mi_random_shuffle((uintptr_t)now) % MI_CACHE_MAX /* random start */ );
-  for (size_t visited = 0; visited < max_visits; visited++,idx++) {  // visit N slots
-    if (idx >= MI_CACHE_MAX) idx = 0; // wrap
-    mi_cache_slot_t* slot = &cache[idx];
-    mi_msecs_t expire = mi_atomic_loadi64_relaxed(&slot->expire);
-    if (expire != 0 && (force || now >= expire)) {  // racy read
-      // seems expired, first claim it from available
-      purged++;
-      mi_bitmap_index_t bitidx = mi_bitmap_index_create_from_bit(idx);
-      if (_mi_bitmap_claim(cache_available, MI_CACHE_FIELDS, 1, bitidx, NULL)) {
-        // was available, we claimed it
-        expire = mi_atomic_loadi64_acquire(&slot->expire);
-        if (expire != 0 && (force || now >= expire)) {  // safe read
-          // still expired, decommit it
-          mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
-          mi_assert_internal(!mi_commit_mask_is_empty(&slot->commit_mask) && _mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
-          _mi_abandoned_await_readers();  // wait until safe to decommit
-          // decommit committed parts
-          // TODO: instead of decommit, we could also free to the OS?
-          mi_commit_mask_decommit(&slot->commit_mask, slot->p, MI_SEGMENT_SIZE, tld->stats);
-          mi_commit_mask_create_empty(&slot->decommit_mask);
-        }
-        _mi_bitmap_unclaim(cache_available, MI_CACHE_FIELDS, 1, bitidx); // make it available again for a pop
-      }
-      if (!force && purged > MI_MAX_PURGE_PER_PUSH) break;  // bound to no more than N purge tries per push
-    }
-  }
-}
-
-void _mi_segment_cache_collect(bool force, mi_os_tld_t* tld) {
-  mi_segment_cache_purge(force, tld );
-}
-
-mi_decl_noinline bool _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld)
-{
-#ifdef MI_CACHE_DISABLE
-  return false;
-#else
-
-  // only for normal segment blocks
-  if (size != MI_SEGMENT_SIZE || ((uintptr_t)start % MI_SEGMENT_ALIGN) != 0) return false;
-
-  // numa node determines start field
-  int numa_node = _mi_os_numa_node(NULL);
-  size_t start_field = 0;
-  if (numa_node > 0) {
-    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
-    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
-  }
-
-  // purge expired entries
-  mi_segment_cache_purge(false /* force? */, tld);
-
-  // find an available slot
-  mi_bitmap_index_t bitidx;
-  bool claimed = _mi_bitmap_try_find_from_claim(cache_inuse, MI_CACHE_FIELDS, start_field, 1, &bitidx);
-  if (!claimed) return false;
-
-  mi_assert_internal(_mi_bitmap_is_claimed(cache_available, MI_CACHE_FIELDS, 1, bitidx));
-  mi_assert_internal(_mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
-#if MI_DEBUG>1
-  if (is_pinned || is_large) {
-    mi_assert_internal(mi_commit_mask_is_full(commit_mask));
-  }
-#endif
-
-  // set the slot
-  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
-  slot->p = start;
-  slot->memid = memid;
-  slot->is_pinned = is_pinned;
-  mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
-  slot->commit_mask = *commit_mask;
-  slot->decommit_mask = *decommit_mask;
-  if (!mi_commit_mask_is_empty(commit_mask) && !is_large && !is_pinned && mi_option_is_enabled(mi_option_allow_decommit)) {
-    long delay = mi_option_get(mi_option_segment_decommit_delay);
-    if (delay == 0) {
-      _mi_abandoned_await_readers(); // wait until safe to decommit
-      mi_commit_mask_decommit(&slot->commit_mask, start, MI_SEGMENT_SIZE, tld->stats);
-      mi_commit_mask_create_empty(&slot->decommit_mask);
-    }
-    else {
-      mi_atomic_storei64_release(&slot->expire, _mi_clock_now() + delay);
-    }
-  }
-
-  // make it available
-  _mi_bitmap_unclaim((is_large ? cache_available_large : cache_available), MI_CACHE_FIELDS, 1, bitidx);
-  return true;
-#endif
-}
-
-
-/* -----------------------------------------------------------
-  The following functions are to reliably find the segment or
-  block that encompasses any pointer p (or NULL if it is not
-  in any of our segments).
-  We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB)
-  set to 1 if it contains the segment meta data.
------------------------------------------------------------ */
-
-
-#if (MI_INTPTR_SIZE==8)
-#define MI_MAX_ADDRESS    ((size_t)20 << 40)  // 20TB
-#else
-#define MI_MAX_ADDRESS    ((size_t)2 << 30)   // 2Gb
-#endif
-
-#define MI_SEGMENT_MAP_BITS  (MI_MAX_ADDRESS / MI_SEGMENT_SIZE)
-#define MI_SEGMENT_MAP_SIZE  (MI_SEGMENT_MAP_BITS / 8)
-#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE)
-
-static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1];  // 2KiB per TB with 64MiB segments
-
-static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) {
-  mi_assert_internal(_mi_ptr_segment(segment) == segment); // is it aligned on MI_SEGMENT_SIZE?
-  if ((uintptr_t)segment >= MI_MAX_ADDRESS) {
-    *bitidx = 0;
-    return MI_SEGMENT_MAP_WSIZE;
-  }
-  else {
-    const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE;
-    *bitidx = segindex % MI_INTPTR_BITS;
-    const size_t mapindex = segindex / MI_INTPTR_BITS;
-    mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE);
-    return mapindex;
-  }
-}
-
-void _mi_segment_map_allocated_at(const mi_segment_t* segment) {
-  size_t bitidx;
-  size_t index = mi_segment_map_index_of(segment, &bitidx);
-  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
-  if (index==MI_SEGMENT_MAP_WSIZE) return;
-  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
-  uintptr_t newmask;
-  do {
-    newmask = (mask | ((uintptr_t)1 << bitidx));
-  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
-}
-
-void _mi_segment_map_freed_at(const mi_segment_t* segment) {
-  size_t bitidx;
-  size_t index = mi_segment_map_index_of(segment, &bitidx);
-  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
-  if (index == MI_SEGMENT_MAP_WSIZE) return;
-  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
-  uintptr_t newmask;
-  do {
-    newmask = (mask & ~((uintptr_t)1 << bitidx));
-  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
-}
-
-// Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
-static mi_segment_t* _mi_segment_of(const void* p) {
-  mi_segment_t* segment = _mi_ptr_segment(p);
-  if (segment == NULL) return NULL; 
-  size_t bitidx;
-  size_t index = mi_segment_map_index_of(segment, &bitidx);
-  // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge
-  const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
-  if (mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0)) {
-    return segment; // yes, allocated by us
-  }
-  if (index==MI_SEGMENT_MAP_WSIZE) return NULL;
-
-  // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers?
-
-  // search downwards for the first segment in case it is an interior pointer
-  // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough
-  // valid huge objects
-  // note: we could maintain a lowest index to speed up the path for invalid pointers?
-  size_t lobitidx;
-  size_t loindex;
-  uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1);
-  if (lobits != 0) {
-    loindex = index;
-    lobitidx = mi_bsr(lobits);    // lobits != 0
-  }
-  else if (index == 0) {
-    return NULL;
-  }
-  else {
-    mi_assert_internal(index > 0);
-    uintptr_t lomask = mask;
-    loindex = index;
-    do {
-      loindex--;  
-      lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]);      
-    } while (lomask != 0 && loindex > 0);
-    if (lomask == 0) return NULL;
-    lobitidx = mi_bsr(lomask);    // lomask != 0
-  }
-  mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE);
-  // take difference as the addresses could be larger than the MAX_ADDRESS space.
-  size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE;
-  segment = (mi_segment_t*)((uint8_t*)segment - diff);
-
-  if (segment == NULL) return NULL;
-  mi_assert_internal((void*)segment < p);
-  bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie);
-  mi_assert_internal(cookie_ok);
-  if (mi_unlikely(!cookie_ok)) return NULL;
-  if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range
-  mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment));
-  return segment;
-}
-
-// Is this a valid pointer in our heap?
-static bool  mi_is_valid_pointer(const void* p) {
-  return (_mi_segment_of(p) != NULL);
-}
-
-mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
-  return mi_is_valid_pointer(p);
-}
-
-/*
-// Return the full segment range belonging to a pointer
-static void* mi_segment_range_of(const void* p, size_t* size) {
-  mi_segment_t* segment = _mi_segment_of(p);
-  if (segment == NULL) {
-    if (size != NULL) *size = 0;
-    return NULL;
-  }
-  else {
-    if (size != NULL) *size = segment->segment_size;
-    return segment;
-  }
-  mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
-  mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= block_size);
-  mi_reset_delayed(tld);
-  mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld));
-  return page;
-}
-*/
diff --git a/ext/src/mimalloc/src/segment.c b/ext/src/mimalloc/src/segment.c
deleted file mode 100644
index 800d4fc31f..0000000000
--- a/ext/src/mimalloc/src/segment.c
+++ /dev/null
@@ -1,1544 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-#include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
-
-#include <string.h>  // memset
-#include <stdio.h>
-
-#define MI_PAGE_HUGE_ALIGN  (256*1024)
-
-static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats);
-
-
-// -------------------------------------------------------------------
-// commit mask 
-// -------------------------------------------------------------------
-
-static bool mi_commit_mask_all_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    if ((commit->mask[i] & cm->mask[i]) != cm->mask[i]) return false;
-  }
-  return true;
-}
-
-static bool mi_commit_mask_any_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    if ((commit->mask[i] & cm->mask[i]) != 0) return true;
-  }
-  return false;
-}
-
-static void mi_commit_mask_create_intersect(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm, mi_commit_mask_t* res) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    res->mask[i] = (commit->mask[i] & cm->mask[i]);
-  }
-}
-
-static void mi_commit_mask_clear(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    res->mask[i] &= ~(cm->mask[i]);
-  }
-}
-
-static void mi_commit_mask_set(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    res->mask[i] |= cm->mask[i];
-  }
-}
-
-static void mi_commit_mask_create(size_t bitidx, size_t bitcount, mi_commit_mask_t* cm) {
-  mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
-  mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
-  if (bitcount == MI_COMMIT_MASK_BITS) {
-    mi_assert_internal(bitidx==0);
-    mi_commit_mask_create_full(cm);
-  }
-  else if (bitcount == 0) {
-    mi_commit_mask_create_empty(cm);
-  }
-  else {
-    mi_commit_mask_create_empty(cm);
-    size_t i = bitidx / MI_COMMIT_MASK_FIELD_BITS;
-    size_t ofs = bitidx % MI_COMMIT_MASK_FIELD_BITS;
-    while (bitcount > 0) {
-      mi_assert_internal(i < MI_COMMIT_MASK_FIELD_COUNT);
-      size_t avail = MI_COMMIT_MASK_FIELD_BITS - ofs;
-      size_t count = (bitcount > avail ? avail : bitcount);
-      size_t mask = (count >= MI_COMMIT_MASK_FIELD_BITS ? ~((size_t)0) : (((size_t)1 << count) - 1) << ofs);
-      cm->mask[i] = mask;
-      bitcount -= count;
-      ofs = 0;
-      i++;
-    }
-  }
-}
-
-size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total) {
-  mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
-  size_t count = 0;
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    size_t mask = cm->mask[i];
-    if (~mask == 0) {
-      count += MI_COMMIT_MASK_FIELD_BITS;
-    }
-    else {
-      for (; mask != 0; mask >>= 1) {  // todo: use popcount
-        if ((mask&1)!=0) count++;
-      }
-    }
-  }
-  // we use total since for huge segments each commit bit may represent a larger size
-  return ((total / MI_COMMIT_MASK_BITS) * count);
-}
-
-
-size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) {
-  size_t i = (*idx) / MI_COMMIT_MASK_FIELD_BITS;
-  size_t ofs = (*idx) % MI_COMMIT_MASK_FIELD_BITS;
-  size_t mask = 0;
-  // find first ones
-  while (i < MI_COMMIT_MASK_FIELD_COUNT) {
-    mask = cm->mask[i];
-    mask >>= ofs;
-    if (mask != 0) {
-      while ((mask&1) == 0) {
-        mask >>= 1;
-        ofs++;
-      }
-      break;
-    }
-    i++;
-    ofs = 0;
-  }
-  if (i >= MI_COMMIT_MASK_FIELD_COUNT) {
-    // not found
-    *idx = MI_COMMIT_MASK_BITS;
-    return 0;
-  }
-  else {
-    // found, count ones
-    size_t count = 0;
-    *idx = (i*MI_COMMIT_MASK_FIELD_BITS) + ofs;
-    do {
-      mi_assert_internal(ofs < MI_COMMIT_MASK_FIELD_BITS && (mask&1) == 1);
-      do {
-        count++;
-        mask >>= 1;
-      } while ((mask&1) == 1);
-      if ((((*idx + count) % MI_COMMIT_MASK_FIELD_BITS) == 0)) {
-        i++;
-        if (i >= MI_COMMIT_MASK_FIELD_COUNT) break;
-        mask = cm->mask[i];
-        ofs = 0;
-      }
-    } while ((mask&1) == 1);
-    mi_assert_internal(count > 0);
-    return count;
-  }
-}
-
-
-/* --------------------------------------------------------------------------------
-  Segment allocation
-
-  If a  thread ends, it "abandons" pages with used blocks
-  and there is an abandoned segment list whose segments can
-  be reclaimed by still running threads, much like work-stealing.
--------------------------------------------------------------------------------- */
-
-
-/* -----------------------------------------------------------
-   Slices
------------------------------------------------------------ */
-
-
-static const mi_slice_t* mi_segment_slices_end(const mi_segment_t* segment) {
-  return &segment->slices[segment->slice_entries];
-}
-
-static uint8_t* mi_slice_start(const mi_slice_t* slice) {
-  mi_segment_t* segment = _mi_ptr_segment(slice);
-  mi_assert_internal(slice >= segment->slices && slice < mi_segment_slices_end(segment));
-  return ((uint8_t*)segment + ((slice - segment->slices)*MI_SEGMENT_SLICE_SIZE));
-}
-
-
-/* -----------------------------------------------------------
-   Bins
------------------------------------------------------------ */
-// Use bit scan forward to quickly find the first zero bit if it is available
-
-static inline size_t mi_slice_bin8(size_t slice_count) {
-  if (slice_count<=1) return slice_count;
-  mi_assert_internal(slice_count <= MI_SLICES_PER_SEGMENT);
-  slice_count--;
-  size_t s = mi_bsr(slice_count);  // slice_count > 1
-  if (s <= 2) return slice_count + 1;
-  size_t bin = ((s << 2) | ((slice_count >> (s - 2))&0x03)) - 4;
-  return bin;
-}
-
-static inline size_t mi_slice_bin(size_t slice_count) {
-  mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_SEGMENT_SIZE);
-  mi_assert_internal(mi_slice_bin8(MI_SLICES_PER_SEGMENT) <= MI_SEGMENT_BIN_MAX);
-  size_t bin = mi_slice_bin8(slice_count);
-  mi_assert_internal(bin <= MI_SEGMENT_BIN_MAX);
-  return bin;
-}
-
-static inline size_t mi_slice_index(const mi_slice_t* slice) {
-  mi_segment_t* segment = _mi_ptr_segment(slice);
-  ptrdiff_t index = slice - segment->slices;
-  mi_assert_internal(index >= 0 && index < (ptrdiff_t)segment->slice_entries);
-  return index;
-}
-
-
-/* -----------------------------------------------------------
-   Slice span queues
------------------------------------------------------------ */
-
-static void mi_span_queue_push(mi_span_queue_t* sq, mi_slice_t* slice) {
-  // todo: or push to the end?
-  mi_assert_internal(slice->prev == NULL && slice->next==NULL);
-  slice->prev = NULL; // paranoia
-  slice->next = sq->first;
-  sq->first = slice;
-  if (slice->next != NULL) slice->next->prev = slice;
-                     else sq->last = slice;
-  slice->xblock_size = 0; // free
-}
-
-static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* tld) {
-  size_t bin = mi_slice_bin(slice_count);
-  mi_span_queue_t* sq = &tld->spans[bin];
-  mi_assert_internal(sq->slice_count >= slice_count);
-  return sq;
-}
-
-static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) {
-  mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
-  // should work too if the queue does not contain slice (which can happen during reclaim)
-  if (slice->prev != NULL) slice->prev->next = slice->next;
-  if (slice == sq->first) sq->first = slice->next;
-  if (slice->next != NULL) slice->next->prev = slice->prev;
-  if (slice == sq->last) sq->last = slice->prev;
-  slice->prev = NULL;
-  slice->next = NULL;
-  slice->xblock_size = 1; // no more free
-}
-
-
-/* -----------------------------------------------------------
- Invariant checking
------------------------------------------------------------ */
-
-static bool mi_slice_is_used(const mi_slice_t* slice) {
-  return (slice->xblock_size > 0);
-}
-
-
-#if (MI_DEBUG>=3)
-static bool mi_span_queue_contains(mi_span_queue_t* sq, mi_slice_t* slice) {
-  for (mi_slice_t* s = sq->first; s != NULL; s = s->next) {
-    if (s==slice) return true;
-  }
-  return false;
-}
-
-static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) {
-  mi_assert_internal(segment != NULL);
-  mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
-  mi_assert_internal(segment->abandoned <= segment->used);
-  mi_assert_internal(segment->thread_id == 0 || segment->thread_id == _mi_thread_id());
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask)); // can only decommit committed blocks
-  //mi_assert_internal(segment->segment_info_size % MI_SEGMENT_SLICE_SIZE == 0);
-  mi_slice_t* slice = &segment->slices[0];
-  const mi_slice_t* end = mi_segment_slices_end(segment);
-  size_t used_count = 0;
-  mi_span_queue_t* sq;
-  while(slice < end) {
-    mi_assert_internal(slice->slice_count > 0);
-    mi_assert_internal(slice->slice_offset == 0);
-    size_t index = mi_slice_index(slice);
-    size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? segment->slice_entries : index + slice->slice_count) - 1;
-    if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET valid back offsets
-      used_count++;
-      for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET && index + i <= maxindex; i++) {
-        mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t));
-        mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0);
-        mi_assert_internal(i==0 || segment->slices[index + i].xblock_size == 1);
-      }
-      // and the last entry as well (for coalescing)
-      const mi_slice_t* last = slice + slice->slice_count - 1;
-      if (last > slice && last < mi_segment_slices_end(segment)) {
-        mi_assert_internal(last->slice_offset == (slice->slice_count-1)*sizeof(mi_slice_t));
-        mi_assert_internal(last->slice_count == 0);
-        mi_assert_internal(last->xblock_size == 1);
-      }
-    }
-    else {  // free range of slices; only last slice needs a valid back offset
-      mi_slice_t* last = &segment->slices[maxindex];
-      if (segment->kind != MI_SEGMENT_HUGE || slice->slice_count <= (segment->slice_entries - segment->segment_info_slices)) {
-        mi_assert_internal((uint8_t*)slice == (uint8_t*)last - last->slice_offset);
-      }
-      mi_assert_internal(slice == last || last->slice_count == 0 );
-      mi_assert_internal(last->xblock_size == 0 || (segment->kind==MI_SEGMENT_HUGE && last->xblock_size==1));
-      if (segment->kind != MI_SEGMENT_HUGE && segment->thread_id != 0) { // segment is not huge or abandoned
-        sq = mi_span_queue_for(slice->slice_count,tld);
-        mi_assert_internal(mi_span_queue_contains(sq,slice));
-      }
-    }
-    slice = &segment->slices[maxindex+1];
-  }
-  mi_assert_internal(slice == end);
-  mi_assert_internal(used_count == segment->used + 1);
-  return true;
-}
-#endif
-
-/* -----------------------------------------------------------
- Segment size calculations
------------------------------------------------------------ */
-
-static size_t mi_segment_info_size(mi_segment_t* segment) {
-  return segment->segment_info_slices * MI_SEGMENT_SLICE_SIZE;
-}
-
-static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, const mi_slice_t* slice, size_t xblock_size, size_t* page_size)
-{
-  ptrdiff_t idx = slice - segment->slices;
-  size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE;
-  // make the start not OS page aligned for smaller blocks to avoid page/cache effects
-  size_t start_offset = (xblock_size >= MI_INTPTR_SIZE && xblock_size <= 1024 ? MI_MAX_ALIGN_GUARANTEE : 0); 
-  if (page_size != NULL) { *page_size = psize - start_offset; }
-  return (uint8_t*)segment + ((idx*MI_SEGMENT_SLICE_SIZE) + start_offset);
-}
-
-// Start of the page available memory; can be used on uninitialized pages
-uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size)
-{
-  const mi_slice_t* slice = mi_page_to_slice((mi_page_t*)page);
-  uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, page->xblock_size, page_size);  
-  mi_assert_internal(page->xblock_size > 0 || _mi_ptr_page(p) == page);
-  mi_assert_internal(_mi_ptr_segment(p) == segment);
-  return p;
-}
-
-
-static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, size_t* info_slices) {
-  size_t page_size = _mi_os_page_size();
-  size_t isize     = _mi_align_up(sizeof(mi_segment_t), page_size);
-  size_t guardsize = 0;
-
-  if (MI_SECURE>0) {
-    // in secure mode, we set up a protected page in between the segment info
-    // and the page data (and one at the end of the segment)
-    guardsize =  page_size;
-    required  = _mi_align_up(required, page_size);
-  }
-
-  if (pre_size != NULL) *pre_size = isize;
-  isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE);
-  if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE;
-  size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) );  
-  mi_assert_internal(segment_size % MI_SEGMENT_SLICE_SIZE == 0);
-  return (segment_size / MI_SEGMENT_SLICE_SIZE);
-}
-
-
-/* ----------------------------------------------------------------------------
-Segment caches
-We keep a small segment cache per thread to increase local
-reuse and avoid setting/clearing guard pages in secure mode.
-------------------------------------------------------------------------------- */
-
-static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) {
-  if (segment_size>=0) _mi_stat_increase(&tld->stats->segments,1);
-                  else _mi_stat_decrease(&tld->stats->segments,1);
-  tld->count += (segment_size >= 0 ? 1 : -1);
-  if (tld->count > tld->peak_count) tld->peak_count = tld->count;
-  tld->current_size += segment_size;
-  if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size;
-}
-
-static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
-  segment->thread_id = 0;
-  _mi_segment_map_freed_at(segment);
-  mi_segments_track_size(-((long)mi_segment_size(segment)),tld);
-  if (MI_SECURE>0) {
-    // _mi_os_unprotect(segment, mi_segment_size(segment)); // ensure no more guard pages are set
-    // unprotect the guard pages; we cannot just unprotect the whole segment size as part may be decommitted
-    size_t os_pagesize = _mi_os_page_size();
-    _mi_os_unprotect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
-    uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
-    _mi_os_unprotect(end, os_pagesize);
-  }
-
-  // purge delayed decommits now? (no, leave it to the cache)
-  // mi_segment_delayed_decommit(segment,true,tld->stats);
-  
-  // _mi_os_free(segment, mi_segment_size(segment), /*segment->memid,*/ tld->stats);
-  const size_t size = mi_segment_size(segment);
-  if (size != MI_SEGMENT_SIZE || !_mi_segment_cache_push(segment, size, segment->memid, &segment->commit_mask, &segment->decommit_mask, segment->mem_is_large, segment->mem_is_pinned, tld->os)) {
-    const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size);
-    if (csize > 0 && !segment->mem_is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize);
-    _mi_abandoned_await_readers();  // wait until safe to free
-    _mi_arena_free(segment, mi_segment_size(segment), segment->memid, segment->mem_is_pinned /* pretend not committed to not double count decommits */, tld->os);
-  }
-}
-
-// called by threads that are terminating 
-void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
-  MI_UNUSED(tld);
-  // nothing to do
-}
-
-
-/* -----------------------------------------------------------
-   Span management
------------------------------------------------------------ */
-
-static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uint8_t* p, size_t size, uint8_t** start_p, size_t* full_size, mi_commit_mask_t* cm) {
-  mi_assert_internal(_mi_ptr_segment(p) == segment);
-  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
-  mi_commit_mask_create_empty(cm);
-  if (size == 0 || size > MI_SEGMENT_SIZE || segment->kind == MI_SEGMENT_HUGE) return;
-  const size_t segstart = mi_segment_info_size(segment);
-  const size_t segsize = mi_segment_size(segment);
-  if (p >= (uint8_t*)segment + segsize) return;
-
-  size_t pstart = (p - (uint8_t*)segment);
-  mi_assert_internal(pstart + size <= segsize);
-
-  size_t start;
-  size_t end;
-  if (conservative) {
-    // decommit conservative
-    start = _mi_align_up(pstart, MI_COMMIT_SIZE);
-    end   = _mi_align_down(pstart + size, MI_COMMIT_SIZE);
-    mi_assert_internal(start >= segstart);
-    mi_assert_internal(end <= segsize);
-  }
-  else {
-    // commit liberal
-    start = _mi_align_down(pstart, MI_MINIMAL_COMMIT_SIZE);
-    end   = _mi_align_up(pstart + size, MI_MINIMAL_COMMIT_SIZE);
-  }
-  if (pstart >= segstart && start < segstart) {  // note: the mask is also calculated for an initial commit of the info area
-    start = segstart;
-  }
-  if (end > segsize) {
-    end = segsize;
-  }
-
-  mi_assert_internal(start <= pstart && (pstart + size) <= end);
-  mi_assert_internal(start % MI_COMMIT_SIZE==0 && end % MI_COMMIT_SIZE == 0);
-  *start_p   = (uint8_t*)segment + start;
-  *full_size = (end > start ? end - start : 0);
-  if (*full_size == 0) return;
-
-  size_t bitidx = start / MI_COMMIT_SIZE;
-  mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
-  
-  size_t bitcount = *full_size / MI_COMMIT_SIZE; // can be 0
-  if (bitidx + bitcount > MI_COMMIT_MASK_BITS) {
-    _mi_warning_message("commit mask overflow: idx=%zu count=%zu start=%zx end=%zx p=0x%p size=%zu fullsize=%zu\n", bitidx, bitcount, start, end, p, size, *full_size);
-  }
-  mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
-  mi_commit_mask_create(bitidx, bitcount, cm);
-}
-
-
-static bool mi_segment_commitx(mi_segment_t* segment, bool commit, uint8_t* p, size_t size, mi_stats_t* stats) {    
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
-
-  // try to commit in at least MI_MINIMAL_COMMIT_SIZE sizes.
-  /*
-  if (commit && size > 0) {
-    const size_t csize = _mi_align_up(size, MI_MINIMAL_COMMIT_SIZE);
-    if (p + csize <= mi_segment_end(segment)) {
-      size = csize;
-    }
-  }
-  */
-  // commit liberal, but decommit conservative
-  uint8_t* start = NULL;
-  size_t   full_size = 0;
-  mi_commit_mask_t mask;
-  mi_segment_commit_mask(segment, !commit/*conservative*/, p, size, &start, &full_size, &mask);
-  if (mi_commit_mask_is_empty(&mask) || full_size==0) return true;
-
-  if (commit && !mi_commit_mask_all_set(&segment->commit_mask, &mask)) {
-    bool is_zero = false;
-    mi_commit_mask_t cmask;
-    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
-    _mi_stat_decrease(&_mi_stats_main.committed, _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
-    if (!_mi_os_commit(start,full_size,&is_zero,stats)) return false;    
-    mi_commit_mask_set(&segment->commit_mask, &mask);     
-  }
-  else if (!commit && mi_commit_mask_any_set(&segment->commit_mask, &mask)) {
-    mi_assert_internal((void*)start != (void*)segment);
-    //mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &mask));
-
-    mi_commit_mask_t cmask;
-    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
-    _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
-    if (segment->allow_decommit) { 
-      _mi_os_decommit(start, full_size, stats); // ok if this fails
-    } 
-    mi_commit_mask_clear(&segment->commit_mask, &mask);
-  }
-  // increase expiration of reusing part of the delayed decommit
-  if (commit && mi_commit_mask_any_set(&segment->decommit_mask, &mask)) {
-    segment->decommit_expire = _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
-  }
-  // always undo delayed decommits
-  mi_commit_mask_clear(&segment->decommit_mask, &mask);
-  return true;
-}
-
-static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
-  // note: assumes commit_mask is always full for huge segments as otherwise the commit mask bits can overflow
-  if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->decommit_mask)) return true; // fully committed
-  return mi_segment_commitx(segment,true,p,size,stats);
-}
-
-static void mi_segment_perhaps_decommit(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
-  if (!segment->allow_decommit) return;
-  if (mi_option_get(mi_option_decommit_delay) == 0) {
-    mi_segment_commitx(segment, false, p, size, stats);
-  }
-  else {
-    // register for future decommit in the decommit mask
-    uint8_t* start = NULL;
-    size_t   full_size = 0;
-    mi_commit_mask_t mask; 
-    mi_segment_commit_mask(segment, true /*conservative*/, p, size, &start, &full_size, &mask);
-    if (mi_commit_mask_is_empty(&mask) || full_size==0) return;
-    
-    // update delayed commit
-    mi_assert_internal(segment->decommit_expire > 0 || mi_commit_mask_is_empty(&segment->decommit_mask));      
-    mi_commit_mask_t cmask;
-    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);  // only decommit what is committed; span_free may try to decommit more
-    mi_commit_mask_set(&segment->decommit_mask, &cmask);
-    mi_msecs_t now = _mi_clock_now();    
-    if (segment->decommit_expire == 0) {
-      // no previous decommits, initialize now
-      segment->decommit_expire = now + mi_option_get(mi_option_decommit_delay);
-    }
-    else if (segment->decommit_expire <= now) {
-      // previous decommit mask already expired
-      // mi_segment_delayed_decommit(segment, true, stats);
-      segment->decommit_expire = now + mi_option_get(mi_option_decommit_extend_delay); // (mi_option_get(mi_option_decommit_delay) / 8); // wait a tiny bit longer in case there is a series of free's
-    }
-    else {
-      // previous decommit mask is not yet expired, increase the expiration by a bit.
-      segment->decommit_expire += mi_option_get(mi_option_decommit_extend_delay);
-    }
-  }  
-}
-
-static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats) {
-  if (!segment->allow_decommit || mi_commit_mask_is_empty(&segment->decommit_mask)) return;
-  mi_msecs_t now = _mi_clock_now();
-  if (!force && now < segment->decommit_expire) return;
-
-  mi_commit_mask_t mask = segment->decommit_mask;
-  segment->decommit_expire = 0;
-  mi_commit_mask_create_empty(&segment->decommit_mask);
-
-  size_t idx;
-  size_t count;
-  mi_commit_mask_foreach(&mask, idx, count) {
-    // if found, decommit that sequence
-    if (count > 0) {
-      uint8_t* p = (uint8_t*)segment + (idx*MI_COMMIT_SIZE);
-      size_t size = count * MI_COMMIT_SIZE;
-      mi_segment_commitx(segment, false, p, size, stats);
-    }
-  }
-  mi_commit_mask_foreach_end()
-  mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
-}
-
-
-static bool mi_segment_is_abandoned(mi_segment_t* segment) {
-  return (segment->thread_id == 0);
-}
-
-// note: can be called on abandoned segments
-static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) {
-  mi_assert_internal(slice_index < segment->slice_entries);
-  mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment) 
-                          ? NULL : mi_span_queue_for(slice_count,tld));
-  if (slice_count==0) slice_count = 1;
-  mi_assert_internal(slice_index + slice_count - 1 < segment->slice_entries);
-
-  // set first and last slice (the intermediates can be undetermined)
-  mi_slice_t* slice = &segment->slices[slice_index];
-  slice->slice_count = (uint32_t)slice_count;
-  mi_assert_internal(slice->slice_count == slice_count); // no overflow?
-  slice->slice_offset = 0;
-  if (slice_count > 1) {
-    mi_slice_t* last = &segment->slices[slice_index + slice_count - 1];
-    last->slice_count = 0;
-    last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1));
-    last->xblock_size = 0;
-  }
-
-  // perhaps decommit
-  mi_segment_perhaps_decommit(segment,mi_slice_start(slice),slice_count*MI_SEGMENT_SLICE_SIZE,tld->stats);
-  
-  // and push it on the free page queue (if it was not a huge page)
-  if (sq != NULL) mi_span_queue_push( sq, slice );
-             else slice->xblock_size = 0; // mark huge page as free anyways
-}
-
-/*
-// called from reclaim to add existing free spans
-static void mi_segment_span_add_free(mi_slice_t* slice, mi_segments_tld_t* tld) {
-  mi_segment_t* segment = _mi_ptr_segment(slice);
-  mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
-  size_t slice_index = mi_slice_index(slice);
-  mi_segment_span_free(segment,slice_index,slice->slice_count,tld);
-}
-*/
-
-static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld_t* tld) {
-  mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->xblock_size==0);
-  mi_assert_internal(_mi_ptr_segment(slice)->kind != MI_SEGMENT_HUGE);
-  mi_span_queue_t* sq = mi_span_queue_for(slice->slice_count, tld);
-  mi_span_queue_delete(sq, slice);
-}
-
-// note: can be called on abandoned segments
-static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) {
-  mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0);
-  mi_segment_t* segment = _mi_ptr_segment(slice);
-  bool is_abandoned = mi_segment_is_abandoned(segment);
-
-  // for huge pages, just mark as free but don't add to the queues
-  if (segment->kind == MI_SEGMENT_HUGE) {
-    mi_assert_internal(segment->used == 1);  // decreased right after this call in `mi_segment_page_clear`
-    slice->xblock_size = 0;  // mark as free anyways
-    // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to 
-    // avoid a possible cache miss (and the segment is about to be freed)
-    return slice;
-  }
-
-  // otherwise coalesce the span and add to the free span queues
-  size_t slice_count = slice->slice_count;
-  mi_slice_t* next = slice + slice->slice_count;
-  mi_assert_internal(next <= mi_segment_slices_end(segment));
-  if (next < mi_segment_slices_end(segment) && next->xblock_size==0) {
-    // free next block -- remove it from free and merge
-    mi_assert_internal(next->slice_count > 0 && next->slice_offset==0);
-    slice_count += next->slice_count; // extend
-    if (!is_abandoned) { mi_segment_span_remove_from_queue(next, tld); }
-  }
-  if (slice > segment->slices) {
-    mi_slice_t* prev = mi_slice_first(slice - 1);
-    mi_assert_internal(prev >= segment->slices);
-    if (prev->xblock_size==0) {
-      // free previous slice -- remove it from free and merge
-      mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0);
-      slice_count += prev->slice_count;
-      if (!is_abandoned) { mi_segment_span_remove_from_queue(prev, tld); }
-      slice = prev;
-    }
-  }
-
-  // and add the new free page
-  mi_segment_span_free(segment, mi_slice_index(slice), slice_count, tld);
-  return slice;
-}
-
-
-static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) {
-  mi_assert_internal(_mi_ptr_segment(slice)==segment);
-  mi_assert_internal(slice->slice_count >= slice_count);
-  mi_assert_internal(slice->xblock_size > 0); // no more in free queue
-  if (slice->slice_count <= slice_count) return;
-  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
-  size_t next_index = mi_slice_index(slice) + slice_count;
-  size_t next_count = slice->slice_count - slice_count;
-  mi_segment_span_free(segment, next_index, next_count, tld);
-  slice->slice_count = (uint32_t)slice_count;
-}
-
-// Note: may still return NULL if committing the memory failed
-static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) {
-  mi_assert_internal(slice_index < segment->slice_entries);
-  mi_slice_t* slice = &segment->slices[slice_index];
-  mi_assert_internal(slice->xblock_size==0 || slice->xblock_size==1);
-
-  // commit before changing the slice data
-  if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats)) {
-    return NULL;  // commit failed!
-  }
-
-  // convert the slices to a page
-  slice->slice_offset = 0;
-  slice->slice_count = (uint32_t)slice_count;
-  mi_assert_internal(slice->slice_count == slice_count);
-  const size_t bsize = slice_count * MI_SEGMENT_SLICE_SIZE;
-  slice->xblock_size = (uint32_t)(bsize >= MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : bsize);
-  mi_page_t*  page = mi_slice_to_page(slice);
-  mi_assert_internal(mi_page_block_size(page) == bsize);
-
-  // set slice back pointers for the first MI_MAX_SLICE_OFFSET entries
-  size_t extra = slice_count-1;
-  if (extra > MI_MAX_SLICE_OFFSET) extra = MI_MAX_SLICE_OFFSET;
-  if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1;  // huge objects may have more slices than avaiable entries in the segment->slices
-  slice++;
-  for (size_t i = 1; i <= extra; i++, slice++) {
-    slice->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i);
-    slice->slice_count = 0;
-    slice->xblock_size = 1;
-  }
-
-  // and also for the last one (if not set already) (the last one is needed for coalescing)
-  // note: the cast is needed for ubsan since the index can be larger than MI_SLICES_PER_SEGMENT for huge allocations (see #543)
-  mi_slice_t* last = &((mi_slice_t*)segment->slices)[slice_index + slice_count - 1]; 
-  if (last < mi_segment_slices_end(segment) && last >= slice) {
-    last->slice_offset = (uint32_t)(sizeof(mi_slice_t)*(slice_count-1));
-    last->slice_count = 0;
-    last->xblock_size = 1;
-  }
-  
-  // and initialize the page
-  page->is_reset = false;
-  page->is_committed = true;
-  segment->used++;
-  return page;
-}
-
-static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_segments_tld_t* tld) {
-  mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_LARGE_OBJ_SIZE_MAX);
-  // search from best fit up
-  mi_span_queue_t* sq = mi_span_queue_for(slice_count, tld);
-  if (slice_count == 0) slice_count = 1;
-  while (sq <= &tld->spans[MI_SEGMENT_BIN_MAX]) {
-    for (mi_slice_t* slice = sq->first; slice != NULL; slice = slice->next) {
-      if (slice->slice_count >= slice_count) {
-        // found one
-        mi_span_queue_delete(sq, slice);
-        mi_segment_t* segment = _mi_ptr_segment(slice);
-        if (slice->slice_count > slice_count) {
-          mi_segment_slice_split(segment, slice, slice_count, tld);
-        }
-        mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->xblock_size > 0);
-        mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld);
-        if (page == NULL) {
-          // commit failed; return NULL but first restore the slice
-          mi_segment_span_free_coalesce(slice, tld);
-          return NULL;
-        }
-        return page;        
-      }
-    }
-    sq++;
-  }
-  // could not find a page..
-  return NULL;
-}
-
-
-/* -----------------------------------------------------------
-   Segment allocation
------------------------------------------------------------ */
-
-// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` .
-static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page)
-{
-  mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL));
-  mi_assert_internal((segment==NULL) || (segment!=NULL && required==0));
-  // calculate needed sizes first
-  size_t info_slices;
-  size_t pre_size;
-  const size_t segment_slices = mi_segment_calculate_slices(required, &pre_size, &info_slices);
-  const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? MI_SLICES_PER_SEGMENT : segment_slices);
-  const size_t segment_size = segment_slices * MI_SEGMENT_SLICE_SIZE;
-
-  // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little)
-  const bool eager_delay = (// !_mi_os_has_overcommit() &&             // never delay on overcommit systems
-                            _mi_current_thread_count() > 1 &&       // do not delay for the first N threads
-                            tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
-  const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit);
-  bool commit = eager || (required > 0); 
-  
-  // Try to get from our cache first
-  bool is_zero = false;
-  const bool commit_info_still_good = (segment != NULL);
-  mi_commit_mask_t commit_mask;
-  mi_commit_mask_t decommit_mask;
-  if (segment != NULL) {
-    commit_mask = segment->commit_mask;
-    decommit_mask = segment->decommit_mask;
-  }
-  else {
-    mi_commit_mask_create_empty(&commit_mask);
-    mi_commit_mask_create_empty(&decommit_mask);
-  }
-  if (segment==NULL) {
-    // Allocate the segment from the OS
-    bool mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy    
-    bool is_pinned = false;
-    size_t memid = 0;
-    segment = (mi_segment_t*)_mi_segment_cache_pop(segment_size, &commit_mask, &decommit_mask, &mem_large, &is_pinned, &is_zero, &memid, os_tld);
-    if (segment==NULL) {
-      segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &commit, &mem_large, &is_pinned, &is_zero, &memid, os_tld);
-      if (segment == NULL) return NULL;  // failed to allocate
-      if (commit) {
-        mi_commit_mask_create_full(&commit_mask);
-      }
-      else {
-        mi_commit_mask_create_empty(&commit_mask);
-      }
-    }    
-    mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
-
-    const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
-    mi_assert_internal(commit_needed>0);
-    mi_commit_mask_t commit_needed_mask;
-    mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
-    if (!mi_commit_mask_all_set(&commit_mask, &commit_needed_mask)) {
-      // at least commit the info slices
-      mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= info_slices*MI_SEGMENT_SLICE_SIZE);
-      bool ok = _mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, &is_zero, tld->stats);
-      if (!ok) return NULL; // failed to commit   
-      mi_commit_mask_set(&commit_mask, &commit_needed_mask); 
-    }
-    segment->memid = memid;
-    segment->mem_is_pinned = is_pinned;
-    segment->mem_is_large = mem_large;
-    segment->mem_is_committed = mi_commit_mask_is_full(&commit_mask);
-    mi_segments_track_size((long)(segment_size), tld);
-    _mi_segment_map_allocated_at(segment);
-  }
-
-  // zero the segment info? -- not always needed as it is zero initialized from the OS 
-  mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);  // tsan
-  if (!is_zero) {
-    ptrdiff_t ofs = offsetof(mi_segment_t, next);
-    size_t    prefix = offsetof(mi_segment_t, slices) - ofs;
-    memset((uint8_t*)segment+ofs, 0, prefix + sizeof(mi_slice_t)*segment_slices);
-  }
-
-  if (!commit_info_still_good) {
-    segment->commit_mask = commit_mask; // on lazy commit, the initial part is always committed
-    segment->allow_decommit = (mi_option_is_enabled(mi_option_allow_decommit) && !segment->mem_is_pinned && !segment->mem_is_large);    
-    if (segment->allow_decommit) {
-      segment->decommit_expire = _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
-      segment->decommit_mask = decommit_mask;
-      mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
-      #if MI_DEBUG>2
-      const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
-      mi_commit_mask_t commit_needed_mask;
-      mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
-      mi_assert_internal(!mi_commit_mask_any_set(&segment->decommit_mask, &commit_needed_mask));
-      #endif
-    }    
-    else {
-      mi_assert_internal(mi_commit_mask_is_empty(&decommit_mask));
-      segment->decommit_expire = 0;
-      mi_commit_mask_create_empty( &segment->decommit_mask );
-      mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
-    }
-  }
-  
-
-  // initialize segment info
-  segment->segment_slices = segment_slices;
-  segment->segment_info_slices = info_slices;
-  segment->thread_id = _mi_thread_id();
-  segment->cookie = _mi_ptr_cookie(segment);
-  segment->slice_entries = slice_entries;
-  segment->kind = (required == 0 ? MI_SEGMENT_NORMAL : MI_SEGMENT_HUGE);
-
-  // memset(segment->slices, 0, sizeof(mi_slice_t)*(info_slices+1));
-  _mi_stat_increase(&tld->stats->page_committed, mi_segment_info_size(segment));
-
-  // set up guard pages
-  size_t guard_slices = 0;
-  if (MI_SECURE>0) {
-    // in secure mode, we set up a protected page in between the segment info
-    // and the page data, and at the end of the segment.
-    size_t os_pagesize = _mi_os_page_size();    
-    mi_assert_internal(mi_segment_info_size(segment) - os_pagesize >= pre_size);
-    _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
-    uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
-    mi_segment_ensure_committed(segment, end, os_pagesize, tld->stats);
-    _mi_os_protect(end, os_pagesize);
-    if (slice_entries == segment_slices) segment->slice_entries--; // don't use the last slice :-(
-    guard_slices = 1;
-  }
-
-  // reserve first slices for segment info
-  mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices, tld);
-  mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance  
-  mi_assert_internal(segment->used == 1);
-  segment->used = 0; // don't count our internal slices towards usage
-  
-  // initialize initial free pages
-  if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page
-    mi_assert_internal(huge_page==NULL);
-    mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, tld);
-  }
-  else {
-    mi_assert_internal(huge_page!=NULL);
-    mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
-    mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask));
-    *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices, tld);
-    mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance 
-  }
-
-  mi_assert_expensive(mi_segment_is_valid(segment,tld));
-  return segment;
-}
-
-
-// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` .
-static mi_segment_t* mi_segment_alloc(size_t required, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page) {
-  return mi_segment_init(NULL, required, tld, os_tld, huge_page);
-}
-
-
-static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) {
-  MI_UNUSED(force);
-  mi_assert_internal(segment != NULL);
-  mi_assert_internal(segment->next == NULL);
-  mi_assert_internal(segment->used == 0);
-
-  // Remove the free pages
-  mi_slice_t* slice = &segment->slices[0];
-  const mi_slice_t* end = mi_segment_slices_end(segment);
-  size_t page_count = 0;
-  while (slice < end) {
-    mi_assert_internal(slice->slice_count > 0);
-    mi_assert_internal(slice->slice_offset == 0);
-    mi_assert_internal(mi_slice_index(slice)==0 || slice->xblock_size == 0); // no more used pages ..
-    if (slice->xblock_size == 0 && segment->kind != MI_SEGMENT_HUGE) {
-      mi_segment_span_remove_from_queue(slice, tld);
-    }
-    page_count++;
-    slice = slice + slice->slice_count;
-  }
-  mi_assert_internal(page_count == 2); // first page is allocated by the segment itself
-
-  // stats
-  _mi_stat_decrease(&tld->stats->page_committed, mi_segment_info_size(segment));
-
-  // return it to the OS
-  mi_segment_os_free(segment, tld);
-}
-
-
-/* -----------------------------------------------------------
-   Page Free
------------------------------------------------------------ */
-
-static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld);
-
-// note: can be called on abandoned pages
-static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld) {
-  mi_assert_internal(page->xblock_size > 0);
-  mi_assert_internal(mi_page_all_free(page));
-  mi_segment_t* segment = _mi_ptr_segment(page);
-  mi_assert_internal(segment->used > 0);
-  
-  size_t inuse = page->capacity * mi_page_block_size(page);
-  _mi_stat_decrease(&tld->stats->page_committed, inuse);
-  _mi_stat_decrease(&tld->stats->pages, 1);
-
-  // reset the page memory to reduce memory pressure?
-  if (!segment->mem_is_pinned && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) {
-    size_t psize;
-    uint8_t* start = _mi_page_start(segment, page, &psize);
-    page->is_reset = true;
-    _mi_os_reset(start, psize, tld->stats);
-  }
-
-  // zero the page data, but not the segment fields
-  page->is_zero_init = false;
-  ptrdiff_t ofs = offsetof(mi_page_t, capacity);
-  memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);
-  page->xblock_size = 1;
-
-  // and free it
-  mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld);  
-  segment->used--;
-  // cannot assert segment valid as it is called during reclaim
-  // mi_assert_expensive(mi_segment_is_valid(segment, tld));
-  return slice;
-}
-
-void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld)
-{
-  mi_assert(page != NULL);
-
-  mi_segment_t* segment = _mi_page_segment(page);
-  mi_assert_expensive(mi_segment_is_valid(segment,tld));
-
-  // mark it as free now
-  mi_segment_page_clear(page, tld);
-  mi_assert_expensive(mi_segment_is_valid(segment, tld));
-
-  if (segment->used == 0) {
-    // no more used pages; remove from the free list and free the segment
-    mi_segment_free(segment, force, tld);
-  }
-  else if (segment->used == segment->abandoned) {
-    // only abandoned pages; remove from free list and abandon
-    mi_segment_abandon(segment,tld);
-  }
-}
-
-
-/* -----------------------------------------------------------
-Abandonment
-
-When threads terminate, they can leave segments with
-live blocks (reachable through other threads). Such segments
-are "abandoned" and will be reclaimed by other threads to
-reuse their pages and/or free them eventually
-
-We maintain a global list of abandoned segments that are
-reclaimed on demand. Since this is shared among threads
-the implementation needs to avoid the A-B-A problem on
-popping abandoned segments: <https://en.wikipedia.org/wiki/ABA_problem>
-We use tagged pointers to avoid accidentially identifying
-reused segments, much like stamped references in Java.
-Secondly, we maintain a reader counter to avoid resetting
-or decommitting segments that have a pending read operation.
-
-Note: the current implementation is one possible design;
-another way might be to keep track of abandoned segments
-in the arenas/segment_cache's. This would have the advantage of keeping
-all concurrent code in one place and not needing to deal
-with ABA issues. The drawback is that it is unclear how to
-scan abandoned segments efficiently in that case as they
-would be spread among all other segments in the arenas.
------------------------------------------------------------ */
-
-// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers
-// to put in a tag that increments on update to avoid the A-B-A problem.
-#define MI_TAGGED_MASK   MI_SEGMENT_MASK
-typedef uintptr_t        mi_tagged_segment_t;
-
-static mi_segment_t* mi_tagged_segment_ptr(mi_tagged_segment_t ts) {
-  return (mi_segment_t*)(ts & ~MI_TAGGED_MASK);
-}
-
-static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_segment_t ts) {
-  mi_assert_internal(((uintptr_t)segment & MI_TAGGED_MASK) == 0);
-  uintptr_t tag = ((ts & MI_TAGGED_MASK) + 1) & MI_TAGGED_MASK;
-  return ((uintptr_t)segment | tag);
-}
-
-// This is a list of visited abandoned pages that were full at the time.
-// this list migrates to `abandoned` when that becomes NULL. The use of
-// this list reduces contention and the rate at which segments are visited.
-static mi_decl_cache_align _Atomic(mi_segment_t*)       abandoned_visited; // = NULL
-
-// The abandoned page list (tagged as it supports pop)
-static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned;         // = NULL
-
-// Maintain these for debug purposes (these counts may be a bit off)
-static mi_decl_cache_align _Atomic(size_t)           abandoned_count; 
-static mi_decl_cache_align _Atomic(size_t)           abandoned_visited_count;
-
-// We also maintain a count of current readers of the abandoned list
-// in order to prevent resetting/decommitting segment memory if it might
-// still be read.
-static mi_decl_cache_align _Atomic(size_t)           abandoned_readers; // = 0
-
-// Push on the visited list
-static void mi_abandoned_visited_push(mi_segment_t* segment) {
-  mi_assert_internal(segment->thread_id == 0);
-  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL);
-  mi_assert_internal(segment->next == NULL);
-  mi_assert_internal(segment->used > 0);
-  mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited);
-  do {
-    mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext);
-  } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment));
-  mi_atomic_increment_relaxed(&abandoned_visited_count);
-}
-
-// Move the visited list to the abandoned list.
-static bool mi_abandoned_visited_revisit(void)
-{
-  // quick check if the visited list is empty
-  if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false;
-
-  // grab the whole visited list
-  mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL);
-  if (first == NULL) return false;
-
-  // first try to swap directly if the abandoned list happens to be NULL
-  mi_tagged_segment_t afirst;
-  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
-  if (mi_tagged_segment_ptr(ts)==NULL) {
-    size_t count = mi_atomic_load_relaxed(&abandoned_visited_count);
-    afirst = mi_tagged_segment(first, ts);
-    if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) {
-      mi_atomic_add_relaxed(&abandoned_count, count);
-      mi_atomic_sub_relaxed(&abandoned_visited_count, count);
-      return true;
-    }
-  }
-
-  // find the last element of the visited list: O(n)
-  mi_segment_t* last = first;
-  mi_segment_t* next;
-  while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) {
-    last = next;
-  }
-
-  // and atomically prepend to the abandoned list
-  // (no need to increase the readers as we don't access the abandoned segments)
-  mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned);
-  size_t count;
-  do {
-    count = mi_atomic_load_relaxed(&abandoned_visited_count);
-    mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext));
-    afirst = mi_tagged_segment(first, anext);
-  } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst));
-  mi_atomic_add_relaxed(&abandoned_count, count);
-  mi_atomic_sub_relaxed(&abandoned_visited_count, count);
-  return true;
-}
-
-// Push on the abandoned list.
-static void mi_abandoned_push(mi_segment_t* segment) {
-  mi_assert_internal(segment->thread_id == 0);
-  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
-  mi_assert_internal(segment->next == NULL);
-  mi_assert_internal(segment->used > 0);
-  mi_tagged_segment_t next;
-  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
-  do {
-    mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts));
-    next = mi_tagged_segment(segment, ts);
-  } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next));
-  mi_atomic_increment_relaxed(&abandoned_count);
-}
-
-// Wait until there are no more pending reads on segments that used to be in the abandoned list
-// called for example from `arena.c` before decommitting
-void _mi_abandoned_await_readers(void) {
-  size_t n;
-  do {
-    n = mi_atomic_load_acquire(&abandoned_readers);
-    if (n != 0) mi_atomic_yield();
-  } while (n != 0);
-}
-
-// Pop from the abandoned list
-static mi_segment_t* mi_abandoned_pop(void) {
-  mi_segment_t* segment;
-  // Check efficiently if it is empty (or if the visited list needs to be moved)
-  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
-  segment = mi_tagged_segment_ptr(ts);
-  if (mi_likely(segment == NULL)) {
-    if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL
-      return NULL;
-    }
-  }
-
-  // Do a pop. We use a reader count to prevent
-  // a segment to be decommitted while a read is still pending,
-  // and a tagged pointer to prevent A-B-A link corruption.
-  // (this is called from `region.c:_mi_mem_free` for example)
-  mi_atomic_increment_relaxed(&abandoned_readers);  // ensure no segment gets decommitted
-  mi_tagged_segment_t next = 0;
-  ts = mi_atomic_load_acquire(&abandoned);
-  do {
-    segment = mi_tagged_segment_ptr(ts);
-    if (segment != NULL) {
-      mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next);
-      next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted
-    }
-  } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next));
-  mi_atomic_decrement_relaxed(&abandoned_readers);  // release reader lock
-  if (segment != NULL) {
-    mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
-    mi_atomic_decrement_relaxed(&abandoned_count);
-  }
-  return segment;
-}
-
-/* -----------------------------------------------------------
-   Abandon segment/page
------------------------------------------------------------ */
-
-static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
-  mi_assert_internal(segment->used == segment->abandoned);
-  mi_assert_internal(segment->used > 0);
-  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
-  mi_assert_internal(segment->abandoned_visits == 0);
-  mi_assert_expensive(mi_segment_is_valid(segment,tld));
-  
-  // remove the free pages from the free page queues
-  mi_slice_t* slice = &segment->slices[0];
-  const mi_slice_t* end = mi_segment_slices_end(segment);
-  while (slice < end) {
-    mi_assert_internal(slice->slice_count > 0);
-    mi_assert_internal(slice->slice_offset == 0);
-    if (slice->xblock_size == 0) { // a free page
-      mi_segment_span_remove_from_queue(slice,tld);
-      slice->xblock_size = 0; // but keep it free
-    }
-    slice = slice + slice->slice_count;
-  }
-
-  // perform delayed decommits
-  mi_segment_delayed_decommit(segment, mi_option_is_enabled(mi_option_abandoned_page_decommit) /* force? */, tld->stats);    
-  
-  // all pages in the segment are abandoned; add it to the abandoned list
-  _mi_stat_increase(&tld->stats->segments_abandoned, 1);
-  mi_segments_track_size(-((long)mi_segment_size(segment)), tld);
-  segment->thread_id = 0;
-  mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
-  segment->abandoned_visits = 1;   // from 0 to 1 to signify it is abandoned
-  mi_abandoned_push(segment);
-}
-
-void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
-  mi_assert(page != NULL);
-  mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
-  mi_assert_internal(mi_page_heap(page) == NULL);
-  mi_segment_t* segment = _mi_page_segment(page);
-
-  mi_assert_expensive(mi_segment_is_valid(segment,tld));
-  segment->abandoned++;  
-
-  _mi_stat_increase(&tld->stats->pages_abandoned, 1);
-  mi_assert_internal(segment->abandoned <= segment->used);
-  if (segment->used == segment->abandoned) {
-    // all pages are abandoned, abandon the entire segment
-    mi_segment_abandon(segment, tld);
-  }
-}
-
-/* -----------------------------------------------------------
-  Reclaim abandoned pages
------------------------------------------------------------ */
-
-static mi_slice_t* mi_slices_start_iterate(mi_segment_t* segment, const mi_slice_t** end) {
-  mi_slice_t* slice = &segment->slices[0];
-  *end = mi_segment_slices_end(segment);
-  mi_assert_internal(slice->slice_count>0 && slice->xblock_size>0); // segment allocated page
-  slice = slice + slice->slice_count; // skip the first segment allocated page
-  return slice;
-}
-
-// Possibly free pages and check if free space is available
-static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld) 
-{
-  mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
-  mi_assert_internal(mi_segment_is_abandoned(segment));
-  bool has_page = false;
-  
-  // for all slices
-  const mi_slice_t* end;
-  mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
-  while (slice < end) {
-    mi_assert_internal(slice->slice_count > 0);
-    mi_assert_internal(slice->slice_offset == 0);
-    if (mi_slice_is_used(slice)) { // used page
-      // ensure used count is up to date and collect potential concurrent frees
-      mi_page_t* const page = mi_slice_to_page(slice);
-      _mi_page_free_collect(page, false);
-      if (mi_page_all_free(page)) {
-        // if this page is all free now, free it without adding to any queues (yet) 
-        mi_assert_internal(page->next == NULL && page->prev==NULL);
-        _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
-        segment->abandoned--;
-        slice = mi_segment_page_clear(page, tld); // re-assign slice due to coalesce!
-        mi_assert_internal(!mi_slice_is_used(slice));
-        if (slice->slice_count >= slices_needed) {
-          has_page = true;
-        }
-      }
-      else {
-        if (page->xblock_size == block_size && mi_page_has_any_available(page)) {
-          // a page has available free blocks of the right size
-          has_page = true;
-        }
-      }      
-    }
-    else {
-      // empty span
-      if (slice->slice_count >= slices_needed) {
-        has_page = true;
-      }
-    }
-    slice = slice + slice->slice_count;
-  }
-  return has_page;
-}
-
-// Reclaim an abandoned segment; returns NULL if the segment was freed
-// set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full.
-static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) {
-  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
-  mi_assert_expensive(mi_segment_is_valid(segment, tld));
-  if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; }
-
-  segment->thread_id = _mi_thread_id();
-  segment->abandoned_visits = 0;
-  mi_segments_track_size((long)mi_segment_size(segment), tld);
-  mi_assert_internal(segment->next == NULL);
-  _mi_stat_decrease(&tld->stats->segments_abandoned, 1);
-  
-  // for all slices
-  const mi_slice_t* end;
-  mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
-  while (slice < end) {
-    mi_assert_internal(slice->slice_count > 0);
-    mi_assert_internal(slice->slice_offset == 0);
-    if (mi_slice_is_used(slice)) {
-      // in use: reclaim the page in our heap
-      mi_page_t* page = mi_slice_to_page(slice);
-      mi_assert_internal(!page->is_reset);
-      mi_assert_internal(page->is_committed);
-      mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
-      mi_assert_internal(mi_page_heap(page) == NULL);
-      mi_assert_internal(page->next == NULL && page->prev==NULL);
-      _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
-      segment->abandoned--;
-      // set the heap again and allow delayed free again
-      mi_page_set_heap(page, heap);
-      _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set)
-      _mi_page_free_collect(page, false); // ensure used count is up to date
-      if (mi_page_all_free(page)) {
-        // if everything free by now, free the page
-        slice = mi_segment_page_clear(page, tld);   // set slice again due to coalesceing
-      }
-      else {
-        // otherwise reclaim it into the heap
-        _mi_page_reclaim(heap, page);
-        if (requested_block_size == page->xblock_size && mi_page_has_any_available(page)) {
-          if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; }
-        }
-      }
-    }
-    else {
-      // the span is free, add it to our page queues
-      slice = mi_segment_span_free_coalesce(slice, tld); // set slice again due to coalesceing
-    }
-    mi_assert_internal(slice->slice_count>0 && slice->slice_offset==0);
-    slice = slice + slice->slice_count;
-  }
-
-  mi_assert(segment->abandoned == 0);
-  if (segment->used == 0) {  // due to page_clear
-    mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed));
-    mi_segment_free(segment, false, tld);
-    return NULL;
-  }
-  else {
-    return segment;
-  }
-}
-
-
-void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) {
-  mi_segment_t* segment;
-  while ((segment = mi_abandoned_pop()) != NULL) {
-    mi_segment_reclaim(segment, heap, 0, NULL, tld);
-  }
-}
-
-static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slices, size_t block_size, bool* reclaimed, mi_segments_tld_t* tld)
-{
-  *reclaimed = false;
-  mi_segment_t* segment;
-  long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 8, 1024);     // limit the work to bound allocation times  
-  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
-    segment->abandoned_visits++;
-    bool has_page = mi_segment_check_free(segment,needed_slices,block_size,tld); // try to free up pages (due to concurrent frees)
-    if (segment->used == 0) {
-      // free the segment (by forced reclaim) to make it available to other threads.
-      // note1: we prefer to free a segment as that might lead to reclaiming another
-      // segment that is still partially used.
-      // note2: we could in principle optimize this by skipping reclaim and directly
-      // freeing but that would violate some invariants temporarily)
-      mi_segment_reclaim(segment, heap, 0, NULL, tld);
-    }
-    else if (has_page) {
-      // found a large enough free span, or a page of the right block_size with free space 
-      // we return the result of reclaim (which is usually `segment`) as it might free
-      // the segment due to concurrent frees (in which case `NULL` is returned).
-      return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld);
-    }
-    else if (segment->abandoned_visits > 3) {  
-      // always reclaim on 3rd visit to limit the abandoned queue length.
-      mi_segment_reclaim(segment, heap, 0, NULL, tld);
-    }
-    else {
-      // otherwise, push on the visited list so it gets not looked at too quickly again
-      mi_segment_delayed_decommit(segment, true /* force? */, tld->stats); // forced decommit if needed as we may not visit soon again
-      mi_abandoned_visited_push(segment);
-    }
-  }
-  return NULL;
-}
-
-
-void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld)
-{
-  mi_segment_t* segment;
-  int max_tries = (force ? 16*1024 : 1024); // limit latency
-  if (force) {
-    mi_abandoned_visited_revisit(); 
-  }
-  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
-    mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees)
-    if (segment->used == 0) {
-      // free the segment (by forced reclaim) to make it available to other threads.
-      // note: we could in principle optimize this by skipping reclaim and directly
-      // freeing but that would violate some invariants temporarily)
-      mi_segment_reclaim(segment, heap, 0, NULL, tld);
-    }
-    else {
-      // otherwise, decommit if needed and push on the visited list 
-      // note: forced decommit can be expensive if many threads are destroyed/created as in mstress.
-      mi_segment_delayed_decommit(segment, force, tld->stats);
-      mi_abandoned_visited_push(segment);
-    }
-  }
-}
-
-/* -----------------------------------------------------------
-   Reclaim or allocate
------------------------------------------------------------ */
-
-static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) 
-{
-  mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
-  mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX);
-  
-  // 1. try to reclaim an abandoned segment
-  bool reclaimed;
-  mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld);
-  if (reclaimed) {
-    // reclaimed the right page right into the heap
-    mi_assert_internal(segment != NULL);
-    return NULL; // pretend out-of-memory as the page will be in the page queue of the heap with available blocks
-  }
-  else if (segment != NULL) {
-    // reclaimed a segment with a large enough empty span in it
-    return segment;
-  }
-  // 2. otherwise allocate a fresh segment
-  return mi_segment_alloc(0, tld, os_tld, NULL);  
-}
-
-
-/* -----------------------------------------------------------
-   Page allocation
------------------------------------------------------------ */
-
-static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_kind, size_t required, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
-{
-  mi_assert_internal(required <= MI_LARGE_OBJ_SIZE_MAX && page_kind <= MI_PAGE_LARGE);
-
-  // find a free page
-  size_t page_size = _mi_align_up(required, (required > MI_MEDIUM_PAGE_SIZE ? MI_MEDIUM_PAGE_SIZE : MI_SEGMENT_SLICE_SIZE));
-  size_t slices_needed = page_size / MI_SEGMENT_SLICE_SIZE;
-  mi_assert_internal(slices_needed * MI_SEGMENT_SLICE_SIZE == page_size);
-  mi_page_t* page = mi_segments_page_find_and_allocate(slices_needed, tld); //(required <= MI_SMALL_SIZE_MAX ? 0 : slices_needed), tld);
-  if (page==NULL) {
-    // no free page, allocate a new segment and try again
-    if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld, os_tld) == NULL) {
-      // OOM or reclaimed a good page in the heap
-      return NULL;  
-    }
-    else {
-      // otherwise try again
-      return mi_segments_page_alloc(heap, page_kind, required, block_size, tld, os_tld);
-    }
-  }
-  mi_assert_internal(page != NULL && page->slice_count*MI_SEGMENT_SLICE_SIZE == page_size);
-  mi_assert_internal(_mi_ptr_segment(page)->thread_id == _mi_thread_id());
-  mi_segment_delayed_decommit(_mi_ptr_segment(page), false, tld->stats);
-  return page;
-}
-
-
-
-/* -----------------------------------------------------------
-   Huge page allocation
------------------------------------------------------------ */
-
-static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
-{
-  mi_page_t* page = NULL;
-  mi_segment_t* segment = mi_segment_alloc(size,tld,os_tld,&page);
-  if (segment == NULL || page==NULL) return NULL;
-  mi_assert_internal(segment->used==1);
-  mi_assert_internal(mi_page_block_size(page) >= size);  
-  segment->thread_id = 0; // huge segments are immediately abandoned
-  return page;
-}
-
-// free huge block from another thread
-void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
-  // huge page segments are always abandoned and can be freed immediately by any thread
-  mi_assert_internal(segment->kind==MI_SEGMENT_HUGE);
-  mi_assert_internal(segment == _mi_page_segment(page));
-  mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0);
-
-  // claim it and free
-  mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized.
-  // paranoia: if this it the last reference, the cas should always succeed
-  size_t expected_tid = 0;
-  if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) {
-    mi_block_set_next(page, block, page->free);
-    page->free = block;
-    page->used--;
-    page->is_zero = false;
-    mi_assert(page->used == 0);
-    mi_tld_t* tld = heap->tld;
-    _mi_segment_page_free(page, true, &tld->segments);
-  }
-#if (MI_DEBUG!=0)
-  else {
-    mi_assert_internal(false);
-  }
-#endif
-}
-
-/* -----------------------------------------------------------
-   Page allocation and free
------------------------------------------------------------ */
-mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
-  mi_page_t* page;
-  if (block_size <= MI_SMALL_OBJ_SIZE_MAX) {
-    page = mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld,os_tld);
-  }
-  else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) {
-    page = mi_segments_page_alloc(heap,MI_PAGE_MEDIUM,MI_MEDIUM_PAGE_SIZE,block_size,tld, os_tld);
-  }
-  else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) {
-    page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld, os_tld);
-  }
-  else {
-    page = mi_segment_huge_page_alloc(block_size,tld,os_tld);
-  }
-  mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
-  return page;
-}
-
-
diff --git a/ext/src/mimalloc/src/static.c b/ext/src/mimalloc/src/static.c
index 5b34ddbb6c..2383f65961 100644
--- a/ext/src/mimalloc/src/static.c
+++ b/ext/src/mimalloc/src/static.c
@@ -14,26 +14,30 @@ terms of the MIT license. A copy of the license can be found in the file
 #endif
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
 
 // For a static override we create a single object file
 // containing the whole library. If it is linked first
 // it will override all the standard library allocation
 // functions (on Unix's).
-#include "stats.c"
-#include "random.c"
-#include "os.c"
-#include "bitmap.c"
-#include "arena.c"
-#include "segment-cache.c"
-#include "segment.c"
-#include "page.c"
-#include "heap.c"
-#include "alloc.c"
+#include "alloc.c"          // includes alloc-override.c and free.c
 #include "alloc-aligned.c"
 #include "alloc-posix.c"
-#if MI_OSX_ZONE
-#include "alloc-override-osx.c"
-#endif
+#include "arena.c"
+#include "arena-meta.c"
+#include "bitmap.c"
+#include "heap.c"
 #include "init.c"
+#include "libc.c"
 #include "options.c"
+#include "os.c"
+#include "page.c"           // includes page-queue.c
+#include "page-map.c"
+#include "random.c"
+#include "stats.c"
+#include "theap.c"
+#include "threadlocal.c"
+#include "prim/prim.c"
+#if MI_OSX_ZONE
+#include "prim/osx/alloc-override-zone.c"
+#endif
diff --git a/ext/src/mimalloc/src/stats.c b/ext/src/mimalloc/src/stats.c
index 10d3deebc4..e3af054ec1 100644
--- a/ext/src/mimalloc/src/stats.c
+++ b/ext/src/mimalloc/src/stats.c
@@ -1,14 +1,15 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc-stats.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
 
-#include <stdio.h>  // fputs, stderr
 #include <string.h> // memset
 
 #if defined(_MSC_VER) && (_MSC_VER < 1920)
@@ -19,121 +20,130 @@ terms of the MIT license. A copy of the license can be found in the file
   Statistics operations
 ----------------------------------------------------------- */
 
-static bool mi_is_in_main(void* stat) {
-  return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main
-         && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t)));  
+static void mi_stat_update_mt(mi_stat_count_t* stat, int64_t amount) {
+  if (amount == 0) return;
+  // add atomically
+  int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
+  mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
+  if (amount > 0) {
+    mi_atomic_addi64_relaxed(&stat->total, amount);
+  }
 }
 
 static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
   if (amount == 0) return;
-  if (mi_is_in_main(stat))
-  {
-    // add atomically (for abandoned pages)
-    int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
-    mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
-    if (amount > 0) {
-      mi_atomic_addi64_relaxed(&stat->allocated,amount);
-    }
-    else {
-      mi_atomic_addi64_relaxed(&stat->freed, -amount);
-    }
-  }
-  else {
-    // add thread local
-    stat->current += amount;
-    if (stat->current > stat->peak) stat->peak = stat->current;
-    if (amount > 0) {
-      stat->allocated += amount;
-    }
-    else {
-      stat->freed += -amount;
-    }
-  }
+  // add thread local
+  stat->current += amount;
+  if (stat->current > stat->peak) { stat->peak = stat->current; }
+  if (amount > 0) { stat->total += amount; }
 }
 
-void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {  
-  if (mi_is_in_main(stat)) {
-    mi_atomic_addi64_relaxed( &stat->count, 1 );
-    mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
-  }
-  else {
-    stat->count++;
-    stat->total += amount;
-  }
+
+void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) {
+  mi_atomic_addi64_relaxed(&stat->total, (int64_t)amount);
+}
+
+void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
+  stat->total += amount;
 }
 
-void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
+void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_update_mt(stat, (int64_t)amount);
+}
+void __mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
   mi_stat_update(stat, (int64_t)amount);
 }
 
-void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
+void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_update_mt(stat, -((int64_t)amount));
+}
+void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
   mi_stat_update(stat, -((int64_t)amount));
 }
 
+
+// Adjust stats to compensate; for example before committing a range,
+// first adjust downwards with parts that were already committed so
+// we avoid double counting.
+static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount) {
+  if (amount == 0) return;
+  // adjust atomically
+  mi_atomic_addi64_relaxed(&stat->current, amount);
+  mi_atomic_addi64_relaxed(&stat->total, amount);
+}
+
+static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) {
+  if (amount == 0) return;
+  stat->current += amount;
+  stat->total += amount;
+}
+
+void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_adjust_mt(stat, (int64_t)amount);
+}
+void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_adjust(stat, (int64_t)amount);
+}
+void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_adjust_mt(stat, -((int64_t)amount));
+}
+void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_adjust(stat, -((int64_t)amount));
+}
+
+
 // must be thread safe as it is called from stats_merge
-static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
+static void mi_stat_count_add_mt(mi_stat_count_t* stat, const mi_stat_count_t* src) {
   if (stat==src) return;
-  if (src->allocated==0 && src->freed==0) return;
-  mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit);
-  mi_atomic_addi64_relaxed( &stat->current, src->current * unit);
-  mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit);
-  // peak scores do not work across threads.. 
-  mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit);
+  mi_atomic_void_addi64_relaxed(&stat->total, &src->total);
+  const int64_t prev_current = mi_atomic_addi64_relaxed(&stat->current, src->current);
+
+  // Global current plus thread peak approximates new global peak
+  // note: peak scores do really not work across threads.
+  // we used to just add them together but that often overestimates in practice.
+  // similarly, max does not seem to work well. The current approach
+  // by Artem Kharytoniuk (@artem-lunarg) seems to work better, see PR#1112
+  // for a longer description.
+  mi_atomic_maxi64_relaxed(&stat->peak, prev_current + src->peak);
 }
 
-static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) {
+static void mi_stat_counter_add_mt(mi_stat_counter_t* stat, const mi_stat_counter_t* src) {
   if (stat==src) return;
-  mi_atomic_addi64_relaxed( &stat->total, src->total * unit);
-  mi_atomic_addi64_relaxed( &stat->count, src->count * unit);
+  mi_atomic_void_addi64_relaxed(&stat->total, &src->total);
 }
 
+#define MI_STAT_COUNT(stat)    mi_stat_count_add_mt(&stats->stat, &src->stat);
+#define MI_STAT_COUNTER(stat)  mi_stat_counter_add_mt(&stats->stat, &src->stat);
+
 // must be thread safe as it is called from stats_merge
 static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
   if (stats==src) return;
-  mi_stat_add(&stats->segments, &src->segments,1);
-  mi_stat_add(&stats->pages, &src->pages,1);
-  mi_stat_add(&stats->reserved, &src->reserved, 1);
-  mi_stat_add(&stats->committed, &src->committed, 1);
-  mi_stat_add(&stats->reset, &src->reset, 1);
-  mi_stat_add(&stats->page_committed, &src->page_committed, 1);
-
-  mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
-  mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1);
-  mi_stat_add(&stats->threads, &src->threads, 1);
-
-  mi_stat_add(&stats->malloc, &src->malloc, 1);
-  mi_stat_add(&stats->segments_cache, &src->segments_cache, 1);
-  mi_stat_add(&stats->normal, &src->normal, 1);
-  mi_stat_add(&stats->huge, &src->huge, 1);
-  mi_stat_add(&stats->large, &src->large, 1);
-
-  mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1);
-  mi_stat_counter_add(&stats->mmap_calls, &src->mmap_calls, 1);
-  mi_stat_counter_add(&stats->commit_calls, &src->commit_calls, 1);
-
-  mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
-  mi_stat_counter_add(&stats->searches, &src->searches, 1);
-  mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1);
-  mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
-  mi_stat_counter_add(&stats->large_count, &src->large_count, 1);
-#if MI_STAT>1
+
+  // copy all fields
+  MI_STAT_FIELDS()
+
+  #if MI_STAT>1
   for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
-    if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) {
-      mi_stat_add(&stats->normal_bins[i], &src->normal_bins[i], 1);
-    }
+    mi_stat_count_add_mt(&stats->malloc_bins[i], &src->malloc_bins[i]);
+  }
+  #endif
+  for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
+    mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]);
   }
-#endif
 }
 
+#undef MI_STAT_COUNT
+#undef MI_STAT_COUNTER
+
 /* -----------------------------------------------------------
   Display statistics
 ----------------------------------------------------------- */
 
-// unit > 0 : size in binary bytes 
+// unit > 0 : size in binary bytes
 // unit == 0: count as decimal
 // unit < 0 : count in binary
 static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) {
-  char buf[32]; buf[0] = 0;  
+  char buf[32]; _mi_memzero_var(buf);
   int  len = 32;
   const char* suffix = (unit <= 0 ? " " : "B");
   const int64_t base = (unit == 0 ? 1000 : 1024);
@@ -142,11 +152,11 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void*
   const int64_t pos = (n < 0 ? -n : n);
   if (pos < base) {
     if (n!=1 || suffix[0] != 'B') {  // skip printing 1 B for the unit column
-      snprintf(buf, len, "%d %-3s", (int)n, (n==0 ? "" : suffix));
+      _mi_snprintf(buf, len, "%lld   %-3s", (long long)n, (n==0 ? "" : suffix));
     }
   }
   else {
-    int64_t divider = base;    
+    int64_t divider = base;
     const char* magnitude = "K";
     if (pos >= divider*base) { divider *= base; magnitude = "M"; }
     if (pos >= divider*base) { divider *= base; magnitude = "G"; }
@@ -154,10 +164,10 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void*
     const long whole = (long)(tens/10);
     const long frac1 = (long)(tens%10);
     char unitdesc[8];
-    snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix);
-    snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc);
+    _mi_snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix);
+    _mi_snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc);
   }
-  _mi_fprintf(out, arg, (fmt==NULL ? "%11s" : fmt), buf);
+  _mi_fprintf(out, arg, (fmt==NULL ? "%12s" : fmt), buf);
 }
 
 
@@ -166,84 +176,104 @@ static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out, void* a
 }
 
 static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
-  if (unit==1) _mi_fprintf(out, arg, "%11s"," ");
+  if (unit==1) _mi_fprintf(out, arg, "%12s"," ");
           else mi_print_amount(n,0,out,arg);
 }
 
-static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg ) {
-  _mi_fprintf(out, arg,"%10s:", msg);
-  if (unit>0) {
-    mi_print_amount(stat->peak, unit, out, arg);
-    mi_print_amount(stat->allocated, unit, out, arg);
-    mi_print_amount(stat->freed, unit, out, arg);
-    mi_print_amount(stat->current, unit, out, arg);
-    mi_print_amount(unit, 1, out, arg);
-    mi_print_count(stat->allocated, unit, out, arg);
-    if (stat->allocated > stat->freed)
-      _mi_fprintf(out, arg, "  not all freed!\n");
-    else
-      _mi_fprintf(out, arg, "  ok\n");
-  }
-  else if (unit<0) {
-    mi_print_amount(stat->peak, -1, out, arg);
-    mi_print_amount(stat->allocated, -1, out, arg);
-    mi_print_amount(stat->freed, -1, out, arg);
-    mi_print_amount(stat->current, -1, out, arg);
-    if (unit==-1) {
-      _mi_fprintf(out, arg, "%22s", "");
+static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) {
+  _mi_fprintf(out, arg,"  %-10s:", msg);
+  if (unit != 0) {
+    if (unit > 0) {
+      mi_print_amount(stat->peak, unit, out, arg);
+      mi_print_amount(stat->total, unit, out, arg);
+      // mi_print_amount(stat->freed, unit, out, arg);
+      mi_print_amount(stat->current, unit, out, arg);
+      mi_print_amount(unit, 1, out, arg);
+      mi_print_count(stat->total, unit, out, arg);
     }
     else {
-      mi_print_amount(-unit, 1, out, arg);
-      mi_print_count((stat->allocated / -unit), 0, out, arg);
+      mi_print_amount(stat->peak, -1, out, arg);
+      mi_print_amount(stat->total, -1, out, arg);
+      // mi_print_amount(stat->freed, -1, out, arg);
+      mi_print_amount(stat->current, -1, out, arg);
+      if (unit == -1) {
+        _mi_fprintf(out, arg, "%24s", "");
+      }
+      else {
+        mi_print_amount(-unit, 1, out, arg);
+        mi_print_count((stat->total / -unit), 0, out, arg);
+      }
+    }
+    if (stat->current != 0) {
+      _mi_fprintf(out, arg, "  ");
+      _mi_fprintf(out, arg, (notok == NULL ? "not all freed" : notok));
+      _mi_fprintf(out, arg, "\n");
     }
-    if (stat->allocated > stat->freed)
-      _mi_fprintf(out, arg, "  not all freed!\n");
-    else
+    else {
       _mi_fprintf(out, arg, "  ok\n");
+    }
   }
   else {
     mi_print_amount(stat->peak, 1, out, arg);
-    mi_print_amount(stat->allocated, 1, out, arg);
-    _mi_fprintf(out, arg, "%11s", " ");  // no freed 
+    mi_print_amount(stat->total, 1, out, arg);
+    _mi_fprintf(out, arg, "%11s", " ");  // no freed
     mi_print_amount(stat->current, 1, out, arg);
     _mi_fprintf(out, arg, "\n");
   }
 }
 
+static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) {
+  mi_stat_print_ex(stat, msg, unit, out, arg, NULL);
+}
+
+#if MI_STAT>1
+static void mi_stat_total_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) {
+  _mi_fprintf(out, arg, "  %-10s:", msg);
+  _mi_fprintf(out, arg, "%12s", " ");  // no peak
+  mi_print_amount(stat->total, unit, out, arg);
+  _mi_fprintf(out, arg, "\n");
+}
+#endif
+
 static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) {
-  _mi_fprintf(out, arg, "%10s:", msg);
+  _mi_fprintf(out, arg, "  %-10s:", msg);
   mi_print_amount(stat->total, -1, out, arg);
   _mi_fprintf(out, arg, "\n");
 }
 
-static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) {
-  const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count)); 
+
+static void mi_stat_average_print(size_t count, size_t total, const char* msg, mi_output_fun* out, void* arg) {
+  const int64_t avg_tens = (count == 0 ? 0 : (total*10 / count));
   const long avg_whole = (long)(avg_tens/10);
   const long avg_frac1 = (long)(avg_tens%10);
-  _mi_fprintf(out, arg, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1);
+  _mi_fprintf(out, arg, "  %-10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1);
 }
 
 
-static void mi_print_header(mi_output_fun* out, void* arg ) {
-  _mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s %10s\n", "heap stats", "peak   ", "total   ", "freed   ", "current   ", "unit   ", "count   ");
+static void mi_print_header(const char* name,mi_output_fun* out, void* arg ) {
+  _mi_fprintf(out, arg, " %-11s %11s %11s %11s %11s %11s\n",
+                        name, "peak   ", "total   ", "current   ", "block   ", "total#   ");
 }
 
 #if MI_STAT>1
-static void mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out, void* arg) {
+static bool mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, mi_output_fun* out, void* arg) {
   bool found = false;
   char buf[64];
   for (size_t i = 0; i <= max; i++) {
-    if (bins[i].allocated > 0) {
+    if (bins[i].total > 0) {
       found = true;
-      int64_t unit = _mi_bin_size((uint8_t)i);
-      snprintf(buf, 64, "%s %3lu", fmt, (long)i);
-      mi_stat_print(&bins[i], buf, unit, out, arg);
+      const size_t unit = _mi_bin_size((uint8_t)i);
+      const char* pagekind = (unit <= MI_SMALL_MAX_OBJ_SIZE ? "S" :
+                               (unit <= MI_MEDIUM_MAX_OBJ_SIZE ? "M" :
+                                 (unit <= MI_LARGE_MAX_OBJ_SIZE ? "L" : "H")));
+      _mi_snprintf(buf, 64, "bin%2s  %3lu", pagekind, (long)i);
+      mi_stat_print(&bins[i], buf, (int64_t)unit, out, arg);
     }
   }
   if (found) {
     _mi_fprintf(out, arg, "\n");
-    mi_print_header(out, arg);
   }
+  return found;
 }
 #endif
 
@@ -257,7 +287,7 @@ typedef struct buffered_s {
   mi_output_fun* out;   // original output function
   void*          arg;   // and state
   char*          buf;   // local buffer of at least size `count+1`
-  size_t         used;  // currently used chars `used <= count`  
+  size_t         used;  // currently used chars `used <= count`
   size_t         count; // total chars available for output
 } buffered_t;
 
@@ -267,7 +297,7 @@ static void mi_buffered_flush(buffered_t* buf) {
   buf->used = 0;
 }
 
-static void mi_buffered_out(const char* msg, void* arg) {
+static void mi_cdecl mi_buffered_out(const char* msg, void* arg) {
   buffered_t* buf = (buffered_t*)arg;
   if (msg==NULL || buf==NULL) return;
   for (const char* src = msg; *src != 0; src++) {
@@ -283,166 +313,208 @@ static void mi_buffered_out(const char* msg, void* arg) {
 // Print statistics
 //------------------------------------------------------------
 
-static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults);
-
-static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
-  // wrap the output function to be line buffered
-  char buf[256];
-  buffered_t buffer = { out0, arg0, NULL, 0, 255 };
-  buffer.buf = buf;
-  mi_output_fun* out = &mi_buffered_out;
-  void* arg = &buffer;
-
-  // and print using that
-  mi_print_header(out,arg);
-  #if MI_STAT>1
-  mi_stats_print_bins(stats->normal_bins, MI_BIN_HUGE, "normal",out,arg);
-  #endif
-  #if MI_STAT
-  mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg);
-  mi_stat_print(&stats->large, "large", (stats->large_count.count == 0 ? 1 : -(stats->large.allocated / stats->large_count.count)), out, arg);
-  mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
-  mi_stat_count_t total = { 0,0,0,0 };
-  mi_stat_add(&total, &stats->normal, 1);
-  mi_stat_add(&total, &stats->large, 1);
-  mi_stat_add(&total, &stats->huge, 1);
-  mi_stat_print(&total, "total", 1, out, arg);
-  #endif
-  #if MI_STAT>1
-  mi_stat_print(&stats->malloc, "malloc req", 1, out, arg);
-  _mi_fprintf(out, arg, "\n");
-  #endif
-  mi_stat_print(&stats->reserved, "reserved", 1, out, arg);
-  mi_stat_print(&stats->committed, "committed", 1, out, arg);
-  mi_stat_print(&stats->reset, "reset", 1, out, arg);
-  mi_stat_print(&stats->page_committed, "touched", 1, out, arg);
-  mi_stat_print(&stats->segments, "segments", -1, out, arg);
-  mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
-  mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
-  mi_stat_print(&stats->pages, "pages", -1, out, arg);
-  mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg);
-  mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg);
-  mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg);
-  mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg);
-  mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
-  mi_stat_print(&stats->threads, "threads", -1, out, arg);
-  mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
-  _mi_fprintf(out, arg, "%10s: %7zu\n", "numa nodes", _mi_os_numa_node_count());
-  
-  mi_msecs_t elapsed;
-  mi_msecs_t user_time;
-  mi_msecs_t sys_time;
+mi_decl_export void mi_process_info_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept
+{
+  size_t elapsed;
+  size_t user_time;
+  size_t sys_time;
   size_t current_rss;
   size_t peak_rss;
   size_t current_commit;
   size_t peak_commit;
   size_t page_faults;
-  mi_stat_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
-  _mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
-  _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process",
-              user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults );
+  mi_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
+  _mi_fprintf(out, arg, "  %-10s: %5zu.%03zu s\n", "elapsed", elapsed/1000, elapsed%1000);
+  _mi_fprintf(out, arg, "  %-10s: user: %zu.%03zu s, system: %zu.%03zu s, faults: %zu, peak rss: ", "process",
+    user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, page_faults);
   mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s");
   if (peak_commit > 0) {
-    _mi_fprintf(out, arg, ", commit: ");
+    _mi_fprintf(out, arg, ", peak commit: ");
     mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s");
   }
-  _mi_fprintf(out, arg, "\n");  
+  _mi_fprintf(out, arg, "\n");
 }
 
+void _mi_stats_print(const char* name, size_t id, mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
+  // wrap the output function to be line buffered
+  char buf[256]; _mi_memzero_var(buf);
+  buffered_t buffer = { out0, arg0, NULL, 0, 255 };
+  buffer.buf = buf;
+  mi_output_fun* out = &mi_buffered_out;
+  void* arg = &buffer;
+
+  // and print using that
+  _mi_fprintf(out, arg, "%s %zu\n", name, id);
+
+  if (stats->malloc_normal.total + stats->malloc_huge.total != 0) {
+    #if MI_STAT>1
+    mi_print_header("blocks", out, arg);
+    mi_stats_print_bins(stats->malloc_bins, MI_BIN_HUGE, out, arg);
+    #endif
+    #if MI_STAT
+    mi_stat_print(&stats->malloc_normal, "binned", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg);
+    mi_stat_print(&stats->malloc_huge, "huge", (stats->malloc_huge_count.total == 0 ? 1 : -1), out, arg);
+    mi_stat_count_t total = { 0,0,0 };
+    mi_stat_count_add_mt(&total, &stats->malloc_normal);
+    mi_stat_count_add_mt(&total, &stats->malloc_huge);
+    mi_stat_print_ex(&total, "total", 1, out, arg, "");
+    #if MI_STAT>1
+    mi_stat_total_print(&stats->malloc_requested, "malloc req", 1, out, arg);
+    #endif
+    _mi_fprintf(out, arg, "\n");
+    #endif
+  }
+
+  if (stats->pages.total != 0) {
+    mi_print_header("pages", out, arg);
+    mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, "");
+    // mi_stat_print(&stats->segments, "segments", -1, out, arg);
+    // mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
+    // mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
+    mi_stat_print(&stats->pages, "pages", -1, out, arg);
+    mi_stat_print(&stats->pages_abandoned, "abandoned", -1, out, arg);
+    mi_stat_counter_print(&stats->pages_reclaim_on_alloc, "reclaima", out, arg);
+    mi_stat_counter_print(&stats->pages_reclaim_on_free, "reclaimf", out, arg);
+    mi_stat_counter_print(&stats->pages_reabandon_full, "reabandon", out, arg);
+    mi_stat_counter_print(&stats->pages_unabandon_busy_wait, "waits", out, arg);
+    mi_stat_counter_print(&stats->pages_extended, "extended", out, arg);
+    mi_stat_counter_print(&stats->pages_retire, "retire", out, arg);
+    mi_stat_average_print(stats->page_searches_count.total, stats->page_searches.total, "searches", out, arg);
+    _mi_fprintf(out, arg, "\n");
+  }
+
+  if (stats->arena_count.total > 0) {
+    mi_print_header("arenas", out, arg);
+    mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, "");
+    mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, "");
+    mi_stat_counter_print(&stats->reset, "reset", out, arg);
+    mi_stat_counter_print(&stats->purged, "purged", out, arg);
+
+    mi_stat_counter_print(&stats->arena_count, "arenas", out, arg);
+    mi_stat_counter_print(&stats->arena_rollback_count, "rollback", out, arg);
+    mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg);
+    mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
+    mi_stat_counter_print(&stats->reset_calls, "resets", out, arg);
+    mi_stat_counter_print(&stats->purge_calls, "purges", out, arg);
+    mi_stat_counter_print(&stats->malloc_guarded_count, "guarded", out, arg);
+    mi_stat_print_ex(&stats->heaps, "heaps", -1, out, arg, "");
+    _mi_fprintf(out, arg, "\n");
+
+    mi_print_header("process", out, arg);
+    mi_stat_print_ex(&stats->threads, "threads", -1, out, arg, "");
+    _mi_fprintf(out, arg, "  %-10s: %5i\n", "numa nodes", _mi_os_numa_node_count());
+    mi_process_info_print_out(out, arg);
+  }
+  _mi_fprintf(out, arg, "\n");
+}
+
+
 static mi_msecs_t mi_process_start; // = 0
 
-static mi_stats_t* mi_stats_get_default(void) {
-  mi_heap_t* heap = mi_heap_get_default();
-  return &heap->tld->stats;
+// called on process init
+void _mi_stats_init(void) {
+  if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
 }
 
-static void mi_stats_merge_from(mi_stats_t* stats) {
-  if (stats != &_mi_stats_main) {
-    mi_stats_add(&_mi_stats_main, stats);
-    memset(stats, 0, sizeof(mi_stats_t));
-  }
+static void mi_stats_add_into(mi_stats_t* to, mi_stats_t* from) {
+  // Accumulate `from` into `to` via `mi_stats_add`; nothing is cleared here.
+  mi_assert_internal(to != NULL && from != NULL);
+  if (to != from) { mi_stats_add(to, from); }  // adding a stats object to itself is skipped
+}
+
+void _mi_stats_merge_into(mi_stats_t* to, mi_stats_t* from) {  // move the counts of `from` into `to`
+  mi_assert_internal(to != NULL && from != NULL);
+  if (to != from) {
+    mi_stats_add(to, from); _mi_memzero(from, sizeof(mi_stats_t));  // add, then leave `from` zeroed
+  }
+}
+
+static mi_stats_t* mi_stats_merge_theap_to_heap(mi_theap_t* theap) mi_attr_noexcept {
+  // Fold `theap->stats` into the stats of its heap (which zeroes the `theap` copy); returns the heap stats.
+  mi_stats_t* const target = &theap->heap->stats;
+  _mi_stats_merge_into(target, &theap->stats);
+  return target;
+}
+
+static mi_stats_t* mi_heap_get_stats(mi_heap_t* heap) {
+  // Stats of `heap` (the main heap when NULL); a theap found by `_mi_heap_theap_peek` is merged in first.
+  if (heap==NULL) { heap = mi_heap_main(); }
+  mi_theap_t* const theap = _mi_heap_theap_peek(heap);
+  return (theap==NULL ? &heap->stats : mi_stats_merge_theap_to_heap(theap));
 }
 
+// deprecated
 void mi_stats_reset(void) mi_attr_noexcept {
-  mi_stats_t* stats = mi_stats_get_default();
-  if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
-  memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
-  if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
+  if (!mi_theap_is_initialized(_mi_theap_default())) return;
+  mi_heap_get_stats(mi_heap_main());
+  mi_heap_stats_merge_to_subproc(mi_heap_main());
 }
 
-void mi_stats_merge(void) mi_attr_noexcept {
-  mi_stats_merge_from( mi_stats_get_default() );
+
+void mi_heap_stats_print_out(mi_heap_t* heap, mi_output_fun* out, void* arg) mi_attr_noexcept {
+  if (heap==NULL) { heap = mi_heap_main(); }
+  _mi_stats_print("heap", heap->heap_seq, mi_heap_get_stats(heap), out, arg);
 }
 
-void _mi_stats_done(mi_stats_t* stats) {  // called from `mi_thread_done`
-  mi_stats_merge_from(stats);
+typedef struct mi_heap_print_visit_info_s {
+  mi_output_fun* out;
+  void* out_arg;
+} mi_heap_print_visit_info_t;
+
+static bool mi_cdecl mi_heap_print_visitor(mi_heap_t* heap, void* arg) {  // heap visitor: print the stats of `heap`
+  const mi_heap_print_visit_info_t* const info = (const mi_heap_print_visit_info_t*)arg;  // output function + its state
+  mi_heap_stats_print_out(heap, info->out, info->out_arg);
+  return true;
+}
+
+
+// Print the statistics of each heap of the sub-process, followed by the sub-process' own statistics.
+void mi_subproc_heap_stats_print_out(mi_subproc_id_t subproc_id, mi_output_fun* out, void* arg) mi_attr_noexcept {
+  mi_subproc_t* const sp = _mi_subproc_from_id(subproc_id);
+  if (sp==NULL) return;
+  mi_heap_print_visit_info_t info; info.out = out; info.out_arg = arg;
+  mi_subproc_visit_heaps(sp, &mi_heap_print_visitor, &info);
+  _mi_stats_print("subproc", sp->subproc_seq, &sp->stats, out, arg);
+}
+
+
+// Print a single report with the statistics that `mi_subproc_stats_get` aggregates for the
+// sub-process (its heaps plus the sub-process itself); prints nothing if that fails.
+void mi_subproc_stats_print_out(mi_subproc_id_t subproc_id, mi_output_fun* out, void* arg) mi_attr_noexcept {
+  mi_subproc_t* const sp = _mi_subproc_from_id(subproc_id);
+  if (sp==NULL) return;
+  mi_stats_t_decl(stats);
+  if (!mi_subproc_stats_get(subproc_id, &stats)) return;
+  _mi_stats_print("subproc", sp->subproc_seq, &stats, out, arg);
 }
 
 void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
-  mi_stats_merge_from(mi_stats_get_default());
-  _mi_stats_print(&_mi_stats_main, out, arg);
+  mi_subproc_stats_print_out(mi_subproc_current(),out, arg);
 }
 
+// deprecated
 void mi_stats_print(void* out) mi_attr_noexcept {
   // for compatibility there is an `out` parameter (which can be `stdout` or `stderr`)
   mi_stats_print_out((mi_output_fun*)out, NULL);
 }
 
+// deprecated
 void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
-  _mi_stats_print(mi_stats_get_default(), out, arg);
-}
-
-size_t mi_stats_total_mem() mi_attr_noexcept {
-  mi_stat_count_t total = { 0,0,0,0 };
-  mi_stat_add(&total, &_mi_stats_main.normal, 1);
-  mi_stat_add(&total, &_mi_stats_main.huge, 1);
-  mi_stat_add(&total, &_mi_stats_main.large, 1);
-  return total.current < 0 ? 0 : total.current;
+  mi_theap_t* theap = _mi_theap_default();
+  if (theap==NULL || !mi_theap_is_initialized(theap)) return;
+  _mi_stats_print("heap", theap->heap->heap_seq, &theap->stats, out, arg);
+  mi_stats_merge_theap_to_heap(_mi_theap_default());
 }
 
 // ----------------------------------------------------------------
 // Basic timer for convenience; use milli-seconds to avoid doubles
 // ----------------------------------------------------------------
-#ifdef _WIN32
-#include <windows.h>
-static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
-  static LARGE_INTEGER mfreq; // = 0
-  if (mfreq.QuadPart == 0LL) {
-    LARGE_INTEGER f;
-    QueryPerformanceFrequency(&f);
-    mfreq.QuadPart = f.QuadPart/1000LL;
-    if (mfreq.QuadPart == 0) mfreq.QuadPart = 1;
-  }
-  return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart);  
-}
+
+static mi_msecs_t mi_clock_diff;
 
 mi_msecs_t _mi_clock_now(void) {
-  LARGE_INTEGER t;
-  QueryPerformanceCounter(&t);
-  return mi_to_msecs(t);
-}
-#else
-#include <time.h>
-#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
-mi_msecs_t _mi_clock_now(void) {
-  struct timespec t;
-  #ifdef CLOCK_MONOTONIC
-  clock_gettime(CLOCK_MONOTONIC, &t);
-  #else  
-  clock_gettime(CLOCK_REALTIME, &t);
-  #endif
-  return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000);
-}
-#else
-// low resolution timer
-mi_msecs_t _mi_clock_now(void) {
-  return ((mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000));
+  return _mi_prim_clock_now();
 }
-#endif
-#endif
-
-
-static mi_msecs_t mi_clock_diff;
 
 mi_msecs_t _mi_clock_start(void) {
   if (mi_clock_diff == 0.0) {
@@ -462,130 +534,280 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) {
 // Basic process statistics
 // --------------------------------------------------------
 
-#if defined(_WIN32)
-#include <windows.h>
-#include <psapi.h>
-#pragma comment(lib,"psapi.lib")
+mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
+{
+  mi_subproc_t* subproc = _mi_subproc_main();
+  mi_process_info_t pinfo;
+  _mi_memzero_var(pinfo);
+  pinfo.elapsed        = _mi_clock_end(mi_process_start);
+  pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.current)));
+  pinfo.peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.peak)));
+  pinfo.current_rss    = pinfo.current_commit;
+  pinfo.peak_rss       = pinfo.peak_commit;
+  pinfo.utime          = 0;
+  pinfo.stime          = 0;
+  pinfo.page_faults    = 0;
 
-static mi_msecs_t filetime_msecs(const FILETIME* ftime) {
-  ULARGE_INTEGER i;
-  i.LowPart = ftime->dwLowDateTime;
-  i.HighPart = ftime->dwHighDateTime;
-  mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds
-  return msecs;
+  _mi_prim_process_info(&pinfo);
+
+  if (elapsed_msecs!=NULL)  *elapsed_msecs  = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX));
+  if (user_msecs!=NULL)     *user_msecs     = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX));
+  if (system_msecs!=NULL)   *system_msecs   = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.stime : PTRDIFF_MAX));
+  if (current_rss!=NULL)    *current_rss    = pinfo.current_rss;
+  if (peak_rss!=NULL)       *peak_rss       = pinfo.peak_rss;
+  if (current_commit!=NULL) *current_commit = pinfo.current_commit;
+  if (peak_commit!=NULL)    *peak_commit    = pinfo.peak_commit;
+  if (page_faults!=NULL)    *page_faults    = pinfo.page_faults;
 }
 
-static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) 
-{
-  *elapsed = _mi_clock_end(mi_process_start);
-  FILETIME ct;
-  FILETIME ut;
-  FILETIME st;
-  FILETIME et;
-  GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
-  *utime = filetime_msecs(&ut);
-  *stime = filetime_msecs(&st);
-  PROCESS_MEMORY_COUNTERS info;
-  GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
-  *current_rss    = (size_t)info.WorkingSetSize;
-  *peak_rss       = (size_t)info.PeakWorkingSetSize;
-  *current_commit = (size_t)info.PagefileUsage;
-  *peak_commit    = (size_t)info.PeakPagefileUsage;
-  *page_faults    = (size_t)info.PageFaultCount;  
-}
-
-#elif !defined(__wasi__) && (defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__))
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/resource.h>
-
-#if defined(__APPLE__)
-#include <mach/mach.h>
-#endif
+mi_decl_export void mi_process_info_print(void) mi_attr_noexcept {
+  mi_process_info_print_out(NULL, NULL);
+}
 
-#if defined(__HAIKU__)
-#include <kernel/OS.h>
-#endif
 
-static mi_msecs_t timeval_secs(const struct timeval* tv) {
-  return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L);
+// --------------------------------------------------------
+// Return statistics
+// --------------------------------------------------------
+
+size_t mi_stats_get_bin_size(size_t bin) mi_attr_noexcept {
+  if (bin > MI_BIN_HUGE) return 0;
+  return _mi_bin_size(bin);
 }
 
-static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
-{
-  *elapsed = _mi_clock_end(mi_process_start);
-  struct rusage rusage;
-  getrusage(RUSAGE_SELF, &rusage);
-  *utime = timeval_secs(&rusage.ru_utime);
-  *stime = timeval_secs(&rusage.ru_stime);
-#if !defined(__HAIKU__)
-  *page_faults = rusage.ru_majflt;
-#endif
-  // estimate commit using our stats
-  *peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
-  *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
-  *current_rss    = *current_commit;  // estimate 
-#if defined(__HAIKU__)
-  // Haiku does not have (yet?) a way to
-  // get these stats per process
-  thread_info tid;
-  area_info mem;
-  ssize_t c;
-  get_thread_info(find_thread(0), &tid);
-  while (get_next_area_info(tid.team, &c, &mem) == B_OK) {
-    *peak_rss += mem.ram_size;
+static bool _mi_stats_get(mi_stats_t* stats_in, mi_stats_t* stats_out) mi_attr_noexcept {  // copy `stats_in` into a caller-initialized `stats_out`
+  const bool out_ok = (stats_out != NULL && stats_out->size == sizeof(mi_stats_t) && stats_out->version == MI_STAT_VERSION);
+  if (!out_ok || stats_in == NULL || stats_in->size != stats_out->size) return false;  // also rejects a size mismatch
+  _mi_memcpy(stats_out, stats_in, stats_out->size);
+  return true;
+}
+
+bool mi_subproc_stats_get_exclusive(mi_subproc_id_t subproc_id, mi_stats_t* stats) mi_attr_noexcept {
+  return _mi_stats_get(&_mi_subproc_from_id(subproc_id)->stats, stats);
+}
+
+bool mi_heap_stats_get(mi_heap_t* heap, mi_stats_t* stats) mi_attr_noexcept {
+  return _mi_stats_get(mi_heap_get_stats(heap), stats);
+}
+
+
+static bool mi_cdecl mi_heap_aggregate_visitor(mi_heap_t* heap, void* arg) {
+  // Heap visitor callback: `arg` is the `mi_stats_t` accumulator to which the stats of `heap` are added.
+  mi_stats_add_into((mi_stats_t*)arg, mi_heap_get_stats(heap));
+  return true;
+}
+
+bool mi_subproc_stats_get(mi_subproc_id_t subproc_id, mi_stats_t* stats) mi_attr_noexcept {  // aggregate: the heaps + the sub-process itself
+  mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id);
+  if (stats == NULL || stats->size != sizeof(mi_stats_t) || stats->version != MI_STAT_VERSION) return false;
+  _mi_memzero(stats, sizeof(mi_stats_t)); stats->size = sizeof(mi_stats_t); stats->version = MI_STAT_VERSION;  // zero the counters but keep the size/version header valid so the result passes later version checks
+  mi_subproc_visit_heaps(subproc, &mi_heap_aggregate_visitor, stats);
+  mi_stats_add_into(stats, &subproc->stats);
+  return true;
+}
+
+bool mi_stats_get(mi_stats_t* stats) mi_attr_noexcept {
+  return mi_subproc_stats_get(mi_subproc_current(), stats);
+}
+
+// SPADES_LOCAL: add some aggregation primitives
+// Returns the sum of the `current` counts of the `malloc_normal` and `malloc_huge` statistics as
+// aggregated by `mi_stats_get` (clamped to 0 when negative); returns 0 if the statistics are unavailable.
+size_t mi_stats_total_mem(void) mi_attr_noexcept {
+  mi_stat_count_t total = { 0,0,0 };
+  mi_stats_t_decl(stats);
+  if (mi_stats_get(&stats)) {
+    mi_stat_count_add_mt(&total, &stats.malloc_normal);
+    mi_stat_count_add_mt(&total, &stats.malloc_huge);
+  }
+  return (total.current < 0 ? 0 : (size_t)total.current);  // explicit signed -> unsigned conversion
+}
+
+// --------------------------------------------------------
+// Statistics in json format
+// --------------------------------------------------------
+
+typedef struct mi_json_buf_s {
+  char*   buf;
+  size_t  size;
+  size_t  used;
+  bool    can_realloc;
+} mi_json_buf_t;
+
+static bool mi_json_buf_expand(mi_json_buf_t* hbuf) {  // grow the buffer; returns `false` if it cannot grow
+  if (hbuf==NULL) return false;
+  if (hbuf->buf != NULL && hbuf->size>0) {
+    hbuf->buf[hbuf->size-1] = 0;  // zero-terminate the current contents, also when growing fails below
+  }
+  if (hbuf->size > SIZE_MAX/2 || !hbuf->can_realloc) return false;  // doubling would overflow, or the buffer may not be reallocated
+  const size_t newsize = (hbuf->size == 0 ? mi_good_size(12*MI_KiB) : 2*hbuf->size);  // initial size, doubled afterwards
+  char* const  newbuf  = (char*)mi_rezalloc(hbuf->buf, newsize);
+  if (newbuf == NULL) return false;
+  hbuf->buf = newbuf;
+  hbuf->size = newsize;
+  return true;
+}
+
+static void mi_json_buf_print(mi_json_buf_t* hbuf, const char* msg) {
+  if (msg==NULL || hbuf==NULL) return;
+  if (hbuf->used + 1 >= hbuf->size && !hbuf->can_realloc) return;
+  for (const char* src = msg; *src != 0; src++) {
+    char c = *src;
+    if (hbuf->used + 1 >= hbuf->size) {
+      if (!mi_json_buf_expand(hbuf)) return;
+    }
+    mi_assert_internal(hbuf->used < hbuf->size);
+    hbuf->buf[hbuf->used++] = c;
   }
-  *page_faults = 0;
-#elif defined(__APPLE__)
-  *peak_rss = rusage.ru_maxrss;         // BSD reports in bytes
-  struct mach_task_basic_info info;
-  mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
-  if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
-    *current_rss = (size_t)info.resident_size;
+  mi_assert_internal(hbuf->used < hbuf->size);
+  hbuf->buf[hbuf->used] = 0;
+}
+
+static void mi_json_buf_print_count_bin(mi_json_buf_t* hbuf, const char* prefix, mi_stat_count_t* stat, size_t bin, bool add_comma) {
+  // Append one JSON object for size bin `bin`: the counts of `stat`, the bin's block size, and a page size (0 beyond the large object size).
+  const size_t binsize = mi_stats_get_bin_size(bin);
+  const size_t pagesize = (binsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_SMALL_PAGE_SIZE : (binsize <= MI_MEDIUM_MAX_OBJ_SIZE ? MI_MEDIUM_PAGE_SIZE :
+                            (binsize <= MI_LARGE_MAX_OBJ_SIZE ? MI_LARGE_PAGE_SIZE : 0)));
+  char buf[256];  // roughly 70 characters of fixed text plus five numbers of up to 20 characters each do not always fit in 128
+  _mi_snprintf(buf, sizeof(buf), "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld, \"block_size\": %zu, \"page_size\": %zu }%s\n", prefix, (long long)stat->total, (long long)stat->peak, (long long)stat->current, binsize, pagesize, (add_comma ? "," : ""));
+  buf[sizeof(buf)-1] = 0;
+  mi_json_buf_print(hbuf, buf);
+}
+
+static void mi_json_buf_print_count_cbin(mi_json_buf_t* hbuf, const char* prefix, mi_stat_count_t* stat, mi_chunkbin_t bin, bool add_comma) {
+  const char* cbin = " ";
+  switch(bin) {
+    case MI_CBIN_SMALL:  cbin = "S"; break;
+    case MI_CBIN_MEDIUM: cbin = "M"; break;
+    case MI_CBIN_LARGE:  cbin = "L"; break;
+    case MI_CBIN_HUGE:   cbin = "H"; break;
+    case MI_CBIN_OTHER:  cbin = "X"; break;
+    default: cbin = " "; break;
   }
-#else
-  *peak_rss = rusage.ru_maxrss * 1024;  // Linux reports in KiB
-#endif  
+  char buf[128];
+  _mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld, \"bin\": \"%s\" }%s\n", prefix, stat->total, stat->peak, stat->current, cbin, (add_comma ? "," : ""));
+  buf[127] = 0;
+  mi_json_buf_print(hbuf, buf);
 }
 
-#else
-#ifndef __wasi__
-// WebAssembly instances are not processes
-#pragma message("define a way to get process info")
-#endif
+static void mi_json_buf_print_count(mi_json_buf_t* hbuf, const char* prefix, mi_stat_count_t* stat, bool add_comma) {
+  char buf[128];
+  _mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld }%s\n", prefix, stat->total, stat->peak, stat->current, (add_comma ? "," : ""));
+  buf[127] = 0;
+  mi_json_buf_print(hbuf, buf);
+}
 
-static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
-{
-  *elapsed = _mi_clock_end(mi_process_start);
-  *peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
-  *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
-  *peak_rss    = *peak_commit;
-  *current_rss = *current_commit;
-  *page_faults = 0;
-  *utime = 0;
-  *stime = 0;
+static void mi_json_buf_print_count_value(mi_json_buf_t* hbuf, const char* name, mi_stat_count_t* stat) {
+  char buf[128];
+  _mi_snprintf(buf, 128, "  \"%s\": ", name);
+  buf[127] = 0;
+  mi_json_buf_print(hbuf, buf);
+  mi_json_buf_print_count(hbuf, "", stat, true);
 }
-#endif
 
+static void mi_json_buf_print_value(mi_json_buf_t* hbuf, const char* name, int64_t val) {
+  char buf[128];
+  _mi_snprintf(buf, 128, "  \"%s\": %lld,\n", name, val);
+  buf[127] = 0;
+  mi_json_buf_print(hbuf, buf);
+}
 
-mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
-{
-  mi_msecs_t elapsed = 0;
-  mi_msecs_t utime = 0;
-  mi_msecs_t stime = 0;
-  size_t current_rss0 = 0;
-  size_t peak_rss0 = 0;
-  size_t current_commit0 = 0;
-  size_t peak_commit0 = 0;
-  size_t page_faults0 = 0;  
-  mi_stat_process_info(&elapsed,&utime, &stime, &current_rss0, &peak_rss0, &current_commit0, &peak_commit0, &page_faults0);
-  if (elapsed_msecs!=NULL)  *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX));
-  if (user_msecs!=NULL)     *user_msecs     = (utime < 0 ? 0 : (utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)utime : PTRDIFF_MAX));
-  if (system_msecs!=NULL)   *system_msecs   = (stime < 0 ? 0 : (stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)stime : PTRDIFF_MAX));
-  if (current_rss!=NULL)    *current_rss    = current_rss0;
-  if (peak_rss!=NULL)       *peak_rss       = peak_rss0;
-  if (current_commit!=NULL) *current_commit = current_commit0;
-  if (peak_commit!=NULL)    *peak_commit    = peak_commit0;
-  if (page_faults!=NULL)    *page_faults    = page_faults0;
+static void mi_json_buf_print_size(mi_json_buf_t* hbuf, const char* name, size_t val, bool add_comma) {
+  char buf[128];
+  _mi_snprintf(buf, 128, "    \"%s\": %zu%s\n", name, val, (add_comma ? "," : ""));
+  buf[127] = 0;
+  mi_json_buf_print(hbuf, buf);
 }
 
+static void mi_json_buf_print_counter_value(mi_json_buf_t* hbuf, const char* name, mi_stat_counter_t* stat) {
+  mi_json_buf_print_value(hbuf, name, stat->total);
+}
+
+#define MI_STAT_COUNT(stat)    mi_json_buf_print_count_value(&hbuf, #stat, &stats->stat);
+#define MI_STAT_COUNTER(stat)  mi_json_buf_print_counter_value(&hbuf, #stat, &stats->stat);
+
+static char* mi_stats_get_json_from(mi_stats_t* stats, size_t output_size, char* output_buf) mi_attr_noexcept {
+  if (stats==NULL || stats->size!=sizeof(mi_stats_t) || stats->version!=MI_STAT_VERSION) return NULL;
+  mi_json_buf_t hbuf = { NULL, 0, 0, true };
+  if (output_size > 0 && output_buf != NULL) {
+    _mi_memzero(output_buf, output_size);
+    hbuf.buf = output_buf;
+    hbuf.size = output_size;
+    hbuf.can_realloc = false;
+  }
+  else {
+    if (!mi_json_buf_expand(&hbuf)) return NULL;
+  }
+  mi_json_buf_print(&hbuf, "{\n");
+  mi_json_buf_print_value(&hbuf, "stat_version", MI_STAT_VERSION);
+  mi_json_buf_print_value(&hbuf, "mimalloc_version", MI_MALLOC_VERSION);
+
+  // process info
+  mi_json_buf_print(&hbuf, "  \"process\": {\n");
+  size_t elapsed;
+  size_t user_time;
+  size_t sys_time;
+  size_t current_rss;
+  size_t peak_rss;
+  size_t current_commit;
+  size_t peak_commit;
+  size_t page_faults;
+  mi_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
+  mi_json_buf_print_size(&hbuf, "elapsed_msecs", elapsed, true);
+  mi_json_buf_print_size(&hbuf, "user_msecs", user_time, true);
+  mi_json_buf_print_size(&hbuf, "system_msecs", sys_time, true);
+  mi_json_buf_print_size(&hbuf, "page_faults", page_faults, true);
+  mi_json_buf_print_size(&hbuf, "rss_current", current_rss, true);
+  mi_json_buf_print_size(&hbuf, "rss_peak", peak_rss, true);
+  mi_json_buf_print_size(&hbuf, "commit_current", current_commit, true);
+  mi_json_buf_print_size(&hbuf, "commit_peak", peak_commit, false);
+  mi_json_buf_print(&hbuf, "  },\n");
+
+  // statistics
+  MI_STAT_FIELDS()
+
+  // size bins
+  mi_json_buf_print(&hbuf, "  \"malloc_bins\": [\n");
+  for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
+    mi_json_buf_print_count_bin(&hbuf, "    ", &stats->malloc_bins[i], i, i!=MI_BIN_HUGE);
+  }
+  mi_json_buf_print(&hbuf, "  ],\n");
+  mi_json_buf_print(&hbuf, "  \"page_bins\": [\n");
+  for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
+    mi_json_buf_print_count_bin(&hbuf, "    ", &stats->page_bins[i], i, i!=MI_BIN_HUGE);
+  }
+  mi_json_buf_print(&hbuf, "  ],\n");
+  mi_json_buf_print(&hbuf, "  \"chunk_bins\": [\n");
+  for (size_t i = 0; i < MI_CBIN_COUNT; i++) {
+    mi_json_buf_print_count_cbin(&hbuf, "    ", &stats->chunk_bins[i], (mi_chunkbin_t)i, i!=MI_CBIN_COUNT-1);
+  }
+  mi_json_buf_print(&hbuf, "  ]\n");  
+  mi_json_buf_print(&hbuf, "}\n");
+  if (hbuf.used >= hbuf.size) {
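+    // failed -- NOTE(review): `mi_json_buf_print` keeps `used < size`, so this test looks unreachable and truncated output would be returned as success; confirm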
+    if (hbuf.can_realloc) { mi_free(hbuf.buf); }
+    return NULL;
+  }
+  else {
+    return hbuf.buf;
+  }
+}
+
+char* mi_subproc_stats_get_json(mi_subproc_id_t subproc_id, size_t buf_size, char* buf) mi_attr_noexcept {
+  if (_mi_subproc_from_id(subproc_id)==NULL) return NULL;
+  mi_stats_t_decl(stats);  // receives the aggregate over the heaps and the sub-process, as in `mi_subproc_stats_print_out`
+  if (!mi_subproc_stats_get(subproc_id,&stats)) return NULL;
+  stats.size = sizeof(mi_stats_t); stats.version = MI_STAT_VERSION;  // make sure the header passes the size/version check in `mi_stats_get_json_from`
+  return mi_stats_get_json_from(&stats, buf_size, buf);  // serialise the aggregate rather than only `subproc->stats`
+}
+
+char* mi_heap_stats_get_json(mi_heap_t* heap, size_t buf_size, char* buf) mi_attr_noexcept {
+  return mi_stats_get_json_from(mi_heap_get_stats(heap), buf_size, buf);
+}
+
+char* mi_stats_get_json(size_t buf_size, char* buf) mi_attr_noexcept {
+  return mi_subproc_stats_get_json(mi_subproc_current(), buf_size, buf);
+}
+
+char* mi_stats_as_json(mi_stats_t* stats, size_t buf_size, char* buf) mi_attr_noexcept {
+  return mi_stats_get_json_from(stats, buf_size, buf);
+}
diff --git a/ext/src/mimalloc/src/theap.c b/ext/src/mimalloc/src/theap.c
new file mode 100644
index 0000000000..47961afd39
--- /dev/null
+++ b/ext/src/mimalloc/src/theap.c
@@ -0,0 +1,674 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"  // _mi_theap_default
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920)
+#pragma warning(disable:4204)  // non-constant aggregate initializer
+#endif
+
+/* -----------------------------------------------------------
+  Helpers
+----------------------------------------------------------- */
+
+// return `true` if ok, `false` to break
+typedef bool (theap_page_visitor_fun)(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2);
+
+// Apply `fn` to every page in each page queue of `theap` (bins 0..MI_BIN_FULL).
+// Returns `false` if `fn` asked to stop, and also for a NULL theap or a theap without pages.
+static bool mi_theap_visit_pages(mi_theap_t* theap, theap_page_visitor_fun* fn, void* arg1, void* arg2)
+{
+  if (theap == NULL) return false;
+  if (theap->page_count == 0) return false;
+
+  #if MI_DEBUG>1
+  const size_t expected = theap->page_count;  // page count before any visitor ran
+  size_t visited = 0;
+  #endif
+  for (size_t bin = 0; bin <= MI_BIN_FULL; bin++) {
+    mi_page_queue_t* const queue = &theap->pages[bin];
+    mi_page_t* follow = NULL;
+    for (mi_page_t* page = queue->first; page != NULL; page = follow) {
+      follow = page->next;  // read the successor first: the visitor may remove `page` from the queue
+      mi_assert_internal(mi_page_theap(page) == theap);
+      #if MI_DEBUG>1
+      visited++;
+      #endif
+      const bool keep_going = fn(theap, queue, page, arg1, arg2);
+      if (!keep_going) return false;
+    }
+  }
+  mi_assert_internal(visited == expected);
+  return true;
+}
+
+
+#if MI_DEBUG>=2
+// Page visitor (only compiled for MI_DEBUG>=2): asserts that `page` belongs to `theap` and is valid.
+static bool mi_theap_page_is_valid(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+  MI_UNUSED(pq); MI_UNUSED(arg1); MI_UNUSED(arg2);
+  mi_assert_internal(mi_page_theap(page) == theap);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  // problems are reported through the assertions; the visitor itself never asks to stop
+  return true;
+}
+#endif
+#if MI_DEBUG>=3
+// Expensive self-check (only compiled for MI_DEBUG>=3): validates every page, then every page queue of `theap`.
+static bool mi_theap_is_valid(mi_theap_t* theap) {
+  mi_assert_internal(theap != NULL);
+  mi_theap_visit_pages(theap, &mi_theap_page_is_valid, NULL, NULL);
+  size_t q = 0;
+  while (q < MI_BIN_COUNT) { mi_assert_internal(_mi_page_queue_is_valid(theap, &theap->pages[q])); q++; }
+  return true;  // failures surface through the assertions above
+}
+#endif
+
+
+
+
+/* -----------------------------------------------------------
+  "Collect" pages by migrating `local_free` and `thread_free`
+  lists and freeing empty pages. This is done when a thread
+  stops (and in that case abandons pages if there are still
+  blocks alive)
+----------------------------------------------------------- */
+
+typedef enum mi_collect_e {
+  MI_NORMAL,   // regular collection (`mi_theap_collect` with `force == false`)
+  MI_FORCE,    // forced collection (`mi_theap_collect` with `force == true`)
+  MI_ABANDON   // used by `_mi_theap_collect_abandon`: pages that still hold live blocks are abandoned
+} mi_collect_t;
+
+
+// Page visitor used by `mi_theap_collect_ex`; `arg_collect` points to the `mi_collect_t` mode.
+// Collects the free lists of `page`, then frees or abandons the page depending on that mode.
+static bool mi_theap_page_collect(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) {
+  MI_UNUSED(theap);
+  MI_UNUSED(arg2);
+  mi_assert_internal(mi_theap_page_is_valid(theap, pq, page, NULL, NULL));
+  const mi_collect_t mode = *((mi_collect_t*)arg_collect);
+  const bool forced = (mode >= MI_FORCE);   // holds for MI_FORCE and for MI_ABANDON
+  _mi_page_free_collect(page, forced);
+  if (!mi_page_all_free(page)) {
+    // blocks are still in use: only a finishing thread gives the page up
+    if (mode == MI_ABANDON) { _mi_page_abandon(page, pq); }
+  }
+  else if (forced || page->retire_expire == 0) {
+    // no used blocks left: free when forced/abandoning, or when the page is not already retired
+    _mi_page_free(page, pq);   // note: this will potentially free retired pages as well
+  }
+  return true;  // always continue with the next page
+}
+
+static void mi_theap_merge_stats(mi_theap_t* theap) {
+  mi_assert_internal(mi_theap_is_initialized(theap));
+  _mi_stats_merge_into(&theap->heap->stats, &theap->stats);  // merge this theap's statistics into those of its owning heap
+}
+
+static void mi_theap_collect_ex(mi_theap_t* theap, mi_collect_t collect)  // shared implementation behind `mi_theap_collect` and `_mi_theap_collect_abandon`
+{
+  if (theap==NULL || !mi_theap_is_initialized(theap)) return;  // nothing to collect
+  mi_assert_expensive(mi_theap_is_valid(theap));
+
+  const bool force = (collect >= MI_FORCE);  // true for MI_FORCE and MI_ABANDON
+  _mi_deferred_free(theap, force);
+
+  // python/cpython#112532: we may be called from a thread that is not the owner of the theap
+  // const bool is_main_thread = (_mi_is_main_thread() && theap->thread_id == _mi_thread_id());
+
+  // collect retired pages
+  _mi_theap_collect_retired(theap, force);
+
+  // collect all pages of this theap (frees empty pages; abandons used ones when `collect == MI_ABANDON`)
+  mi_theap_visit_pages(theap, &mi_theap_page_collect, &collect, NULL);
+
+  // collect arenas (this is program wide so don't force purges on abandonment of threads)
+  //mi_atomic_storei64_release(&theap->tld->subproc->purge_expire, 1);
+  _mi_arenas_collect(collect == MI_FORCE /* force purge? */, collect >= MI_FORCE /* visit all? */, theap->tld);
+
+  // merge statistics into the owning heap
+  mi_theap_merge_stats(theap);
+}
+
+void _mi_theap_collect_abandon(mi_theap_t* theap) {
+  mi_theap_collect_ex(theap, MI_ABANDON);  // MI_ABANDON: pages that still hold live blocks are abandoned instead of kept
+}
+
+void mi_theap_collect(mi_theap_t* theap, bool force) mi_attr_noexcept {
+  mi_theap_collect_ex(theap, (force ? MI_FORCE : MI_NORMAL));  // map the public boolean onto the internal collect mode
+}
+
+void mi_collect(bool force) mi_attr_noexcept {
+  // cannot really collect process wide, just a theap.. so this collects the theap returned by `_mi_theap_default()`
+  mi_theap_collect(_mi_theap_default(), force);
+}
+
+void mi_heap_collect(mi_heap_t* heap, bool force) {
+  // cannot really collect a heap, just a theap.. so this collects the theap that `mi_heap_theap(heap)` returns
+  mi_theap_collect(mi_heap_theap(heap), force);
+}
+
+/* -----------------------------------------------------------
+  Heap new
+----------------------------------------------------------- */
+
+mi_theap_t* mi_theap_get_default(void) {
+  mi_theap_t* result = _mi_theap_default();
+  if mi_unlikely(!mi_theap_is_initialized(result)) {
+    mi_thread_init();                // the default theap is not initialized yet: run thread initialization ...
+    result = _mi_theap_default();    // ... and read the default again
+    mi_assert_internal(mi_theap_is_initialized(result));
+  }
+  return result;
+}
+
+// todo: make order of parameters consistent (but would that break compat with CPython?)
+void _mi_theap_init(mi_theap_t* theap, mi_heap_t* heap, mi_tld_t* tld)
+{
+  mi_assert_internal(theap!=NULL);
+  mi_memid_t memid = theap->memid;  // save the memid: the copy below overwrites the whole struct
+  _mi_memcpy_aligned(theap, &_mi_theap_empty, sizeof(mi_theap_t));
+  theap->memid = memid;
+  theap->heap  = heap;
+  theap->tld   = tld;  // avoid reading the thread-local tld during initialization
+
+  _mi_theap_options_init(theap);
+  if (theap->tld->is_in_threadpool) {
+    // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our theap.
+    // this is checked in `free.c:mi_free_try_collect_mt`
+    // .. but abandoning is good in this case: reduce the full page retain to a quarter (possibly to 0)
+    // (so blocked threads do not hold on to too much memory)
+    if (theap->page_full_retain > 0) {
+      theap->page_full_retain = theap->page_full_retain / 4;
+    }
+  }
+
+  // push on the thread local theaps list (no lock is taken for this list, unlike the heap's list below)
+  mi_theap_t* head = theap->tld->theaps;
+  theap->tprev = NULL;
+  theap->tnext = head;
+  if (head!=NULL) { head->tprev = theap; }
+  theap->tld->theaps = theap;
+
+  // initialize random
+  if (head == NULL) {  // first theap in this thread?
+    #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+      _mi_random_init_weak(&theap->random);    // prevent allocation failure during bcrypt dll initialization with static linking (issue #1185)
+    #else
+      _mi_random_init(&theap->random);
+    #endif
+  }
+  else {
+    _mi_random_split(&head->random, &theap->random);  // later theaps derive their state from the previous list head
+  }
+  theap->cookie  = _mi_theap_random_next(theap) | 1;  // `| 1` keeps the cookie odd and therefore non-zero
+  _mi_theap_guarded_init(theap);
+
+  // push on the heap's theap list (under the heap's `theaps_lock`)
+  mi_lock(&heap->theaps_lock) {
+    head = heap->theaps;
+    theap->hprev = NULL;
+    theap->hnext = head;
+    if (head!=NULL) { head->hprev = theap; }
+    heap->theaps = theap;
+  }
+}
+
+// Allocate a theap for `heap` on the thread described by `tld` and initialize it; returns NULL (after reporting ENOMEM) on failure.
+mi_theap_t* _mi_theap_create(mi_heap_t* heap, mi_tld_t* tld) {
+  mi_assert_internal(tld!=NULL); mi_assert_internal(heap!=NULL);
+  // allocate and initialize a theap
+  mi_memid_t memid;
+  mi_theap_t* theap;
+  //if (!_mi_is_heap_main(heap)) {
+  //  theap = (mi_theap_t*)mi_heap_zalloc(mi_heap_main(),sizeof(mi_theap_t));
+  //  memid = _mi_memid_create(MI_MEM_HEAP_MAIN);
+  //  memid.initially_zero = memid.initially_committed = true;
+  //}
+  //else
+  if (heap->exclusive_arena == NULL) {
+    theap = (mi_theap_t*)_mi_meta_zalloc(sizeof(mi_theap_t), &memid);
+  }
+  else {
+    // theaps associated with a specific arena are allocated in that arena
+    // note: takes up at least one slice which is quite wasteful...
+    const size_t size = _mi_align_up(sizeof(mi_theap_t),MI_ARENA_MIN_OBJ_SIZE);
+    theap = (mi_theap_t*)_mi_arenas_alloc(heap, size, true, true, heap->exclusive_arena, tld->thread_seq, tld->numa_node, &memid);
+    mi_assert_internal(theap==NULL || memid.mem.os.size >= size);  // check the size only on success; failure is reported below
+  }
+  if (theap==NULL) {
+    _mi_error_message(ENOMEM, "unable to allocate theap meta-data\n");
+    return NULL;
+  }
+  theap->memid = memid;  // `_mi_theap_init` preserves this field while resetting the rest
+  _mi_theap_init(theap, heap, tld);
+  return theap;
+}
+
+uintptr_t _mi_theap_random_next(mi_theap_t* theap) {
+  return _mi_random_next(&theap->random);  // draws from the random state stored in this theap
+}
+
+// called from `_mi_theap_delete` to free the internal theap resources: merges stats, unlinks the theap from both lists, frees its memory.
+void _mi_theap_free(mi_theap_t* theap) {
+  mi_assert(theap != NULL);
+  mi_assert_internal(mi_theap_is_initialized(theap));
+  if (theap==NULL || !mi_theap_is_initialized(theap)) return;  // same conditions again for builds where the assertions compile away
+
+  // merge stats to the owning heap
+  mi_theap_merge_stats(theap);
+
+  // remove ourselves from the heap theaps list (under the heap's `theaps_lock`)
+  mi_lock(&theap->heap->theaps_lock) {
+    if (theap->hnext != NULL) { theap->hnext->hprev = theap->hprev; }
+    if (theap->hprev != NULL) { theap->hprev->hnext = theap->hnext; }
+                         else { mi_assert_internal(theap->heap->theaps == theap); theap->heap->theaps = theap->hnext; }
+  }
+
+  // remove ourselves from the thread local theaps list (no lock is taken for this list)
+  if (theap->tnext != NULL) { theap->tnext->tprev = theap->tprev;  }
+  if (theap->tprev != NULL) { theap->tprev->tnext = theap->tnext;  }
+                       else { mi_assert_internal(theap->tld->theaps == theap); theap->tld->theaps = theap->tnext; }
+
+  // and free the used memory, using the release routine that matches `memid.memkind`
+  if (theap->memid.memkind == MI_MEM_HEAP_MAIN) {  // note: for now unused as it would access theap_default stats in mi_free of the current theap
+    mi_assert_internal(_mi_is_heap_main(mi_heap_of(theap)));
+    mi_free(theap);
+  }
+  else if (theap->memid.memkind == MI_MEM_META) {
+    _mi_meta_free(theap, sizeof(*theap), theap->memid);
+  }
+  else {
+    _mi_arenas_free(theap, _mi_align_up(sizeof(*theap),MI_ARENA_MIN_OBJ_SIZE), theap->memid ); // issue #1168, avoid assertion failure
+  }
+}
+
+
+/* -----------------------------------------------------------
+  Heap destroy
+----------------------------------------------------------- */
+/*
+
+// zero out the page queues
+static void mi_theap_reset_pages(mi_theap_t* theap) {
+  mi_assert_internal(theap != NULL);
+  mi_assert_internal(mi_theap_is_initialized(theap));
+  // TODO: copy full empty theap instead?
+  _mi_memset(&theap->pages_free_direct, 0, sizeof(theap->pages_free_direct));
+  _mi_memcpy_aligned(&theap->pages, &_mi_theap_empty.pages, sizeof(theap->pages));
+  // theap->thread_delayed_free = NULL;
+  theap->page_count = 0;
+}
+
+static bool _mi_theap_page_destroy(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+  MI_UNUSED(arg1);
+  MI_UNUSED(arg2);
+  MI_UNUSED(pq);
+
+  // ensure no more thread_delayed_free will be added
+  //_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
+
+  // stats
+  const size_t bsize = mi_page_block_size(page);
+  if (bsize > MI_LARGE_MAX_OBJ_SIZE) {
+    mi_theap_stat_decrease(theap, malloc_huge, bsize);
+  }
+  #if (MI_STAT>0)
+  _mi_page_free_collect(page, false);  // update used count
+  const size_t inuse = page->used;
+  if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
+    mi_theap_stat_decrease(theap, malloc_normal, bsize * inuse);
+    #if (MI_STAT>1)
+    mi_theap_stat_decrease(theap, malloc_bins[_mi_bin(bsize)], inuse);
+    #endif
+  }
+  // mi_theap_stat_decrease(theap, malloc_requested, bsize * inuse);  // todo: off for aligned blocks...
+  #endif
+
+  /// pretend it is all free now
+  mi_assert_internal(mi_page_thread_free(page) == NULL);
+  page->used = 0;
+
+  // and free the page
+  // mi_page_free(page,false);
+  page->next = NULL;
+  page->prev = NULL;
+  mi_page_set_theap(page, NULL);
+  _mi_arenas_page_free(page, theap);
+
+  return true; // keep going
+}
+
+void _mi_theap_destroy_pages(mi_theap_t* theap) {
+  mi_theap_visit_pages(theap, &_mi_theap_page_destroy, NULL, NULL);
+  mi_theap_reset_pages(theap);
+}
+
+#if MI_TRACK_HEAP_DESTROY
+static bool mi_cdecl mi_theap_track_block_free(const mi_theap_t* theap, const mi_theap_area_t* area, void* block, size_t block_size, void* arg) {
+  MI_UNUSED(theap); MI_UNUSED(area);  MI_UNUSED(arg); MI_UNUSED(block_size);
+  mi_track_free_size(block,mi_usable_size(block));
+  return true;
+}
+#endif
+
+void mi_theap_destroy(mi_theap_t* theap) {
+  mi_assert(theap != NULL);
+  mi_assert(mi_theap_is_initialized(theap));
+  mi_assert(!theap->allow_page_reclaim);
+  mi_assert(!theap->allow_page_abandon);
+  mi_assert_expensive(mi_theap_is_valid(theap));
+  if (theap==NULL || !mi_theap_is_initialized(theap)) return;
+  #if MI_GUARDED
+  // _mi_warning_message("'mi_theap_destroy' called but MI_GUARDED is enabled -- using `mi_theap_delete` instead (theap at %p)\n", theap);
+  mi_theap_delete(theap);
+  return;
+  #else
+  if (theap->allow_page_reclaim) {
+    _mi_warning_message("'mi_theap_destroy' called but ignored as the theap was not created with 'allow_destroy' (theap at %p)\n", theap);
+    // don't free in case it may contain reclaimed pages,
+    mi_theap_delete(theap);
+  }
+  else {
+    // track all blocks as freed
+    #if MI_TRACK_HEAP_DESTROY
+    mi_theap_visit_blocks(theap, true, mi_theap_track_block_free, NULL);
+    #endif
+    // free all pages
+    _mi_theap_destroy_pages(theap);
+    mi_theap_free(theap,true);
+  }
+  #endif
+}
+
+// forcefully destroy all theaps in the current thread
+void _mi_theap_unsafe_destroy_all(mi_theap_t* theap) {
+  mi_assert_internal(theap != NULL);
+  if (theap == NULL) return;
+  mi_theap_t* curr = theap->tld->theaps;
+  while (curr != NULL) {
+    mi_theap_t* next = curr->next;
+    if (!curr->allow_page_reclaim) {
+      mi_theap_destroy(curr);
+    }
+    else {
+      _mi_theap_destroy_pages(curr);
+    }
+    curr = next;
+  }
+}
+*/
+
+/* -----------------------------------------------------------
+  Safe Heap delete
+----------------------------------------------------------- */
+
+// Safely delete a theap without freeing any still allocated blocks in that theap.
+void _mi_theap_delete(mi_theap_t* theap)
+{
+  mi_assert(theap != NULL);
+  mi_assert(mi_theap_is_initialized(theap));
+  mi_assert_expensive(mi_theap_is_valid(theap));
+  if (theap==NULL || !mi_theap_is_initialized(theap)) return;  // nothing to delete
+
+  // abandon all pages (pages without used blocks are freed by the same collection pass)
+  _mi_theap_collect_abandon(theap);
+
+  mi_assert_internal(theap->page_count==0);  // every page must be gone before the theap itself is released
+  _mi_theap_free(theap);
+}
+
+
+
+/* -----------------------------------------------------------
+  Load/unload theaps
+----------------------------------------------------------- */
+/*
+void mi_theap_unload(mi_theap_t* theap) {
+  mi_assert(mi_theap_is_initialized(theap));
+  mi_assert_expensive(mi_theap_is_valid(theap));
+  if (theap==NULL || !mi_theap_is_initialized(theap)) return;
+  if (theap->heap->exclusive_arena == NULL) {
+    _mi_warning_message("cannot unload theaps that are not associated with an exclusive arena\n");
+    return;
+  }
+
+  // abandon all pages so all thread'id in the pages are cleared
+  _mi_theap_collect_abandon(theap);
+  mi_assert_internal(theap->page_count==0);
+
+  // remove from theap list
+  mi_theap_free(theap, false); // but don't actually free the memory
+
+  // disassociate from the current thread-local and static state
+  theap->tld = NULL;
+  return;
+}
+
+bool mi_theap_reload(mi_theap_t* theap, mi_arena_id_t arena_id) {
+  mi_assert(mi_theap_is_initialized(theap));
+  if (theap==NULL || !mi_theap_is_initialized(theap)) return false;
+  if (theap->heap->exclusive_arena == NULL) {
+    _mi_warning_message("cannot reload theaps that were not associated with an exclusive arena\n");
+    return false;
+  }
+  if (theap->tld != NULL) {
+    _mi_warning_message("cannot reload theaps that were not unloaded first\n");
+    return false;
+  }
+  mi_arena_t* arena = _mi_arena_from_id(arena_id);
+  if (theap->heap->exclusive_arena != arena) {
+    _mi_warning_message("trying to reload a theap at a different arena address: %p vs %p\n", theap->heap->exclusive_arena, arena);
+    return false;
+  }
+
+  mi_assert_internal(theap->page_count==0);
+
+  // re-associate with the current thread-local and static state
+  theap->tld = mi_theap_get_default()->tld;
+
+  // reinit direct pages (as we may be in a different process)
+  mi_assert_internal(theap->page_count == 0);
+  for (size_t i = 0; i < MI_PAGES_DIRECT; i++) {
+    theap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty;
+  }
+
+  // push on the thread local theaps list
+  theap->tnext = theap->tld->theaps;
+  theap->tld->theaps = theap;
+  return true;
+}
+*/
+
+
+/* -----------------------------------------------------------
+  Visit all theap blocks and areas
+  Todo: enable visiting abandoned pages, and
+        enable visiting all blocks of all theaps across threads
+----------------------------------------------------------- */
+
+// Fill the `area` descriptor from `page`; `reserved`/`committed` are block counts multiplied by the full block size.
+void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) {
+  const size_t full_size = mi_page_block_size(page);
+  area->reserved1       = page;
+  area->full_block_size = full_size;
+  area->block_size      = mi_page_usable_block_size(page);
+  area->used            = page->used;   // number of blocks in use (#553)
+  area->blocks          = mi_page_start(page);
+  area->committed       = page->capacity * full_size;
+  area->reserved        = page->reserved * full_size;
+}
+
+static void mi_get_fast_divisor(size_t divisor, uint64_t* magic, size_t* shift) {  // precompute the (magic, shift) pair consumed by `mi_fast_divide`
+  mi_assert_internal(divisor > 0 && divisor <= UINT32_MAX);
+  *shift = MI_SIZE_BITS - mi_clz(divisor - 1);  // presumably ceil(log2(divisor)), assuming `mi_clz` counts leading zero bits of a size_t -- TODO confirm
+  *magic = ((((uint64_t)1 << 32) * (((uint64_t)1 << *shift) - divisor)) / divisor + 1);  // floor(2^32 * (2^shift - divisor) / divisor) + 1
+}
+
+static size_t mi_fast_divide(size_t n, uint64_t magic, size_t shift) {  // divide `n` using a pair produced by `mi_get_fast_divisor`
+  mi_assert_internal(n <= UINT32_MAX);
+  const uint64_t hi = ((uint64_t)n * magic) >> 32;  // upper part of the 64-bit product n*magic
+  return (size_t)((hi + n) >> shift);               // the caller asserts this equals `offset / bsize`
+}
+
+bool _mi_theap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) {  // call `visitor` for each in-use block of `page`; false as soon as the visitor returns false
+  mi_assert(area != NULL);
+  if (area==NULL) return true;
+  mi_assert(page != NULL);
+  if (page == NULL) return true;
+
+  _mi_page_free_collect(page,true);              // collect both thread_delayed and local_free
+  mi_assert_internal(page->local_free == NULL);
+  if (page->used == 0) return true;              // nothing in use, nothing to visit
+
+  size_t psize;
+  uint8_t* const pstart = mi_page_area(page, &psize);
+  mi_heap_t* const heap = mi_page_heap(page);
+  const size_t bsize    = mi_page_block_size(page);
+  const size_t ubsize   = mi_page_usable_block_size(page); // without padding
+
+  // optimize page with one block
+  if (page->capacity == 1) {
+    mi_assert_internal(page->used == 1 && page->free == NULL);
+    return visitor(heap, area, pstart, ubsize, arg);
+  }
+  mi_assert(bsize <= UINT32_MAX);
+
+  // optimize full pages: every block is in use, so no free-block bitmap is needed
+  if (page->used == page->capacity) {
+    uint8_t* block = pstart;
+    for (size_t i = 0; i < page->capacity; i++) {
+      if (!visitor(heap, area, block, ubsize, arg)) return false;
+      block += bsize;
+    }
+    return true;
+  }
+
+  // create a bitmap of free blocks (one bit per block: 1 = free, 0 = in use).
+  #define MI_MAX_BLOCKS   (MI_SMALL_PAGE_SIZE / sizeof(void*))
+  uintptr_t free_map[MI_MAX_BLOCKS / MI_INTPTR_BITS];
+  const uintptr_t bmapsize = _mi_divide_up(page->capacity, MI_INTPTR_BITS);
+  memset(free_map, 0, bmapsize * sizeof(intptr_t));  // NOTE(review): `bmapsize` is not checked against the size of `free_map`; presumably capacity <= MI_MAX_BLOCKS -- confirm
+  if (page->capacity % MI_INTPTR_BITS != 0) {
+    // mark left-over bits at the end as free
+    size_t shift   = (page->capacity % MI_INTPTR_BITS);
+    uintptr_t mask = (UINTPTR_MAX << shift);
+    free_map[bmapsize - 1] = mask;
+  }
+
+  // fast repeated division by the block size
+  uint64_t magic;
+  size_t   shift;
+  mi_get_fast_divisor(bsize, &magic, &shift);
+
+  #if MI_DEBUG>1
+  size_t free_count = 0;
+  #endif
+  for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) {  // set the bit of every block on the free list
+    #if MI_DEBUG>1
+    free_count++;
+    #endif
+    mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize));
+    size_t offset = (uint8_t*)block - pstart;
+    mi_assert_internal(offset % bsize == 0);
+    mi_assert_internal(offset <= UINT32_MAX);
+    size_t blockidx = mi_fast_divide(offset, magic, shift);
+    mi_assert_internal(blockidx == offset / bsize);
+    mi_assert_internal(blockidx < MI_MAX_BLOCKS);
+    size_t bitidx = (blockidx / MI_INTPTR_BITS);          // index of the word in `free_map`
+    size_t bit = blockidx - (bitidx * MI_INTPTR_BITS);    // bit position inside that word
+    free_map[bitidx] |= ((uintptr_t)1 << bit);
+  }
+  mi_assert_internal(page->capacity == (free_count + page->used));
+
+  // walk through all blocks skipping the free ones
+  #if MI_DEBUG>1
+  size_t used_count = 0;
+  #endif
+  uint8_t* block = pstart;  // address of the first block covered by the current bitmap word
+  for (size_t i = 0; i < bmapsize; i++) {
+    if (free_map[i] == 0) {
+      // every block is in use
+      for (size_t j = 0; j < MI_INTPTR_BITS; j++) {
+        #if MI_DEBUG>1
+        used_count++;
+        #endif
+        if (!visitor(heap, area, block, ubsize, arg)) return false;
+        block += bsize;
+      }
+    }
+    else {
+      // visit the used blocks in the mask (the complement of the free bits)
+      uintptr_t m = ~free_map[i];
+      while (m != 0) {
+        #if MI_DEBUG>1
+        used_count++;
+        #endif
+        size_t bitidx = mi_ctz(m);
+        if (!visitor(heap, area, block + (bitidx * bsize), ubsize, arg)) return false;
+        m &= m - 1;  // clear the lowest set bit
+      }
+      block += bsize * MI_INTPTR_BITS;
+    }
+  }
+  mi_assert_internal(page->used == used_count);
+  return true;
+}
+
+
+
+// Separate struct to keep `mi_page_t` out of the public interface
+typedef struct mi_theap_area_ex_s {
+  mi_heap_area_t area;  // the public area description (filled by `_mi_heap_area_init`)
+  mi_page_t* page;      // the page that the area describes
+} mi_theap_area_ex_t;
+
+typedef bool (mi_theap_area_visit_fun)(const mi_theap_t* theap, const mi_theap_area_ex_t* area, void* arg);
+
+static bool mi_theap_visit_areas_page(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) {
+  MI_UNUSED(pq);
+  MI_UNUSED(theap);
+  mi_theap_area_ex_t ex;                       // describes `page` as an area, plus the page itself
+  _mi_heap_area_init(&ex.area, page);
+  ex.page = page;
+  mi_theap_area_visit_fun* const callback = (mi_theap_area_visit_fun*)vfun;  // the area visitor arrives as a void*
+  return callback(theap, &ex, arg);
+}
+
+// Present each page of `theap` to `visitor` as an area; returns false without visiting anything when `visitor` is NULL.
+static bool mi_theap_visit_areas(const mi_theap_t* theap, mi_theap_area_visit_fun* visitor, void* arg) {
+  void* const erased = (void*)(visitor);  // note: function pointer to void* :-{
+  return (visitor != NULL) && mi_theap_visit_pages((mi_theap_t*)theap, &mi_theap_visit_areas_page, erased, arg);
+}
+
+// Just to pass arguments from `mi_theap_visit_blocks` to `mi_theap_area_visitor`
+typedef struct mi_visit_blocks_args_s {
+  bool  visit_blocks;           // also visit the individual blocks, not just the areas?
+  mi_block_visit_fun* visitor;  // the caller's callback
+  void* arg;                    // opaque argument handed to `visitor`
+} mi_visit_blocks_args_t;
+
+// Area visitor: first reports the area itself (with a NULL block pointer), then optionally every block in it.
+static bool mi_theap_area_visitor(const mi_theap_t* theap, const mi_theap_area_ex_t* xarea, void* arg) {
+  const mi_visit_blocks_args_t* const opts = (mi_visit_blocks_args_t*)arg;
+  const mi_heap_area_t* const area = &xarea->area;
+  const bool proceed = opts->visitor(theap->heap, area, NULL, area->block_size, opts->arg);
+  if (!proceed) return false;
+  if (!opts->visit_blocks) return true;   // only the areas were requested
+  // continue with the individual blocks of this area's page
+  return _mi_theap_area_visit_blocks(area, xarea->page, opts->visitor, opts->arg);
+}
+
+// Visit every area of `theap` and, when `visit_blocks` is true, every in-use block of each area as well.
+bool mi_theap_visit_blocks(const mi_theap_t* theap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
+  mi_visit_blocks_args_t bundle = { visit_blocks, visitor, arg };   // packed so it fits through a single void*
+  return mi_theap_visit_areas(theap, &mi_theap_area_visitor, &bundle);
+}
+
diff --git a/ext/src/mimalloc/src/threadlocal.c b/ext/src/mimalloc/src/threadlocal.c
new file mode 100644
index 0000000000..8e7c92b9f1
--- /dev/null
+++ b/ext/src/mimalloc/src/threadlocal.c
@@ -0,0 +1,175 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2025, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Implement dynamic thread local variables (for heap's).
+Unlike most OS native implementations there is no limit on the number
+that can be allocated.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"
+
+/* -----------------------------------------------------------
+Each thread can have (a dynamically expanding) array of
+thread-local values.
+----------------------------------------------------------- */
+
+typedef struct mi_thread_locals_s {
+  size_t count;      // number of usable entries in `slots` (0 for the static empty instance)
+  void*  slots[1];   // first slot; `mi_thread_locals_expand` allocates room for `count` of them
+} mi_thread_locals_t;
+
+static mi_thread_locals_t mi_thread_locals_empty = { 0, {NULL} };
+
+mi_decl_thread mi_thread_locals_t* mi_thread_locals = &mi_thread_locals_empty;  // always point to a valid `mi_thread_locals_t`
+
+// Grow the calling thread's slot array so that slot index `atleast` becomes valid.
+// Returns the new array, or NULL if the reallocation failed (the old array then stays installed).
+static mi_thread_locals_t* mi_thread_locals_expand(mi_thread_local_t atleast) {
+  mi_thread_locals_t* const current = mi_thread_locals;
+  const size_t have = current->count;
+
+  // growth policy: 16 slots at first, doubling while below 1024 slots, +1024 per step after that
+  size_t want;
+  if (have >= 1024)  { want = have + 1024; }
+  else if (have > 0) { want = 2*have; }
+  else               { want = 16; }
+  if (atleast >= want) {
+    want = atleast + 1;   // make sure the requested index itself fits
+  }
+
+  // a count of 0 means the static empty instance is still in use: allocate fresh instead of reallocating it
+  mi_thread_locals_t* const previous = (have == 0 ? NULL : current);
+  const size_t bytes = sizeof(mi_thread_locals_t) + want*sizeof(void*);
+  mi_thread_locals_t* const grown = (mi_thread_locals_t*)mi_rezalloc(previous, bytes);
+  if mi_unlikely(grown == NULL) return NULL;
+  grown->count = want;
+  mi_thread_locals = grown;   // install the new array for this thread
+  return grown;
+}
+
+static mi_decl_noinline bool mi_thread_local_set_expand( mi_thread_local_t key, void* val ) {
+  if (val != NULL) {   // storing NULL needs no growth: `_mi_thread_local_get` already yields NULL for out-of-range keys
+    mi_thread_locals_t* const grown = mi_thread_locals_expand(key);
+    if (grown == NULL) return false;   // could not reallocate the slot array
+    mi_assert_internal(key < grown->count); mi_assert_internal(grown == mi_thread_locals);
+    grown->slots[key] = val;
+  }
+  return true;
+}
+
+// Store `val` in slot `key` of the calling thread.
+// Returns `true` on success and `false` when the slot array had to grow but could not be reallocated.
+bool _mi_thread_local_set( mi_thread_local_t key, void* val ) {
+  mi_thread_locals_t* const locals = mi_thread_locals;
+  mi_assert_internal(locals != NULL);
+  if mi_unlikely(key >= locals->count) {
+    // the slot does not exist yet: take the (non-inlined) growing path
+    return mi_thread_local_set_expand( key, val );  // tailcall
+  }
+  // fast path: the slot is already there
+  locals->slots[key] = val;
+  return true;
+}
+
+// Read slot `key` of the calling thread.
+// Keys at or beyond the current slot count read as NULL.
+void* _mi_thread_local_get( mi_thread_local_t key ) {
+  const mi_thread_locals_t* const locals = mi_thread_locals;
+  mi_assert_internal(locals != NULL);
+  void* value = NULL;
+  if mi_likely(key < locals->count) {
+    value = locals->slots[key];
+  }
+  return value;
+}
+
+// Release the calling thread's slot array (if one was allocated) and point back at the static empty instance.
+void _mi_thread_locals_thread_done(void) {
+  mi_thread_locals_t* const locals = mi_thread_locals;
+  if (locals == NULL || locals->count == 0) return;   // a count of 0 is the static empty instance: nothing to free
+  mi_free(locals);
+  mi_thread_locals = &mi_thread_locals_empty;
+}
+
+/* -----------------------------------------------------------
+Create and free fresh TLS key's
+----------------------------------------------------------- */
+#include "bitmap.h"
+
+static mi_lock_t    mi_thread_locals_lock;    // we need a lock in order to re-allocate the slot bits
+static mi_bitmap_t* mi_thread_locals_free;    // reuse an arena bitmap to track which slots were assigned (1=free, 0=in-use)
+
+void _mi_thread_locals_init(void) {
+  mi_lock_init(&mi_thread_locals_lock);  // the lock that guards the bitmap of free keys
+}
+
+// Teardown of the key bookkeeping: releases the free-key bitmap and then destroys its lock.
+void _mi_thread_locals_done(void) {
+  mi_lock(&mi_thread_locals_lock) {
+    mi_free(mi_thread_locals_free); mi_thread_locals_free = NULL;   // reset, so later code sees "no bitmap" rather than freed memory
+  }
+  mi_lock_done(&mi_thread_locals_lock);
+}
+
+// strange signature but allows us to reuse the arena code for claiming free pages
+static bool mi_thread_local_claim(size_t _slice_index, mi_arena_t* _arena, bool* keep_set) {
+  MI_UNUSED(_slice_index); MI_UNUSED(_arena);
+  *keep_set = false;  // presumably tells the bitmap not to leave the bit set, i.e. the key becomes in-use -- TODO confirm
+  return true;        // always accept the candidate
+}
+
+// Slow path of `_mi_thread_local_create` (which calls this while holding `mi_thread_locals_lock`): grow the
+// free-key bitmap by 1024 bits, mark the new bits as free and claim one. Returns the new key, or 0 on failure.
+static mi_thread_local_t mi_thread_local_create_expand(void) {
+  size_t key = 0;
+  mi_bitmap_t* slots = mi_thread_locals_free;
+  const size_t oldcount = (slots==NULL ? 0 : mi_bitmap_max_bits(slots));
+  const size_t newcount = 1024 + oldcount;   // 1024 bits at a time
+  const size_t newsize = mi_bitmap_size( newcount, NULL );
+  // use the zeroing realloc: `mi_bitmap_init` is told below that the memory is already zero
+  slots = (mi_bitmap_t*)mi_rezalloc_aligned(slots, newsize, MI_BCHUNK_SIZE);
+  if (slots != NULL) {
+    mi_bitmap_init(slots, newcount, true /* or otherwise we would zero all old entries */);
+    mi_bitmap_unsafe_setN(slots, oldcount, newcount - oldcount);
+    mi_thread_locals_free = slots;
+    size_t idx = 0;
+    if mi_likely(mi_bitmap_try_find_and_claim(slots,0,&idx,&mi_thread_local_claim,NULL)) { key = idx+1; }
+  }
+  return key;
+}
+
+// Allocate a new thread-local key while holding the key lock.
+// A key is the claimed bitmap index plus one, so 0 is never a valid key and signals failure.
+mi_thread_local_t _mi_thread_local_create(void) {
+  mi_thread_local_t fresh = 0;
+  mi_lock(&mi_thread_locals_lock) {
+    mi_bitmap_t* const free_keys = mi_thread_locals_free;
+    size_t claimed = 0;
+    const bool found = (free_keys != NULL && mi_bitmap_try_find_and_claim(free_keys,0,&claimed,&mi_thread_local_claim,NULL));
+    if mi_unlikely(!found) {
+      fresh = mi_thread_local_create_expand();   // no bitmap yet, or no free bit could be claimed: grow it
+    }
+    else { fresh = claimed+1; }
+  }
+  return fresh;
+}
+
+// Return `key` to the pool of free keys; key 0 and keys outside the bitmap are ignored.
+void _mi_thread_local_free(mi_thread_local_t key) {
+  if (key != 0) {
+    const size_t bit = key-1;   // keys are bitmap index + 1
+    mi_lock(&mi_thread_locals_lock) {
+      mi_bitmap_t* const free_keys = mi_thread_locals_free;
+      const bool in_range = (free_keys != NULL && bit < mi_bitmap_max_bits(free_keys));
+      if (in_range) { mi_bitmap_set(free_keys, bit); }   // a set bit marks the key as free again
+    }
+  }
+}
+
diff --git a/ext/src/mimalloc/test/CMakeLists.txt b/ext/src/mimalloc/test/CMakeLists.txt
new file mode 100644
index 0000000000..199f48094c
--- /dev/null
+++ b/ext/src/mimalloc/test/CMakeLists.txt
@@ -0,0 +1,56 @@
+cmake_minimum_required(VERSION 3.18)
+project(mimalloc-test C CXX)
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
+
+# Default the build type when none was given: Debug if the build directory name ends in "Debug"/"debug", Release otherwise
+if (NOT CMAKE_BUILD_TYPE)
+  if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$")
+    message(STATUS "No build type selected, default to *** Debug ***")
+    set(CMAKE_BUILD_TYPE "Debug")
+  else()
+    message(STATUS "No build type selected, default to *** Release ***")
+    set(CMAKE_BUILD_TYPE "Release")
+  endif()
+endif()
+
+# Import an installed mimalloc (REQUIRED: configuration stops here when no package config is found)
+find_package(mimalloc CONFIG REQUIRED)
+message(STATUS "Found mimalloc installed at: ${MIMALLOC_LIBRARY_DIR} (${MIMALLOC_VERSION_DIR})")
+
+
+# link with a dynamic shared library
+# use `LD_PRELOAD` to actually override malloc/free at runtime with mimalloc
+add_executable(dynamic-override  main-override.c)
+target_link_libraries(dynamic-override PUBLIC mimalloc)
+
+add_executable(dynamic-override-cxx  main-override.cpp)
+target_link_libraries(dynamic-override-cxx PUBLIC mimalloc)
+
+
+# overriding with a static object file works reliably as the symbols in the
+# object file have priority over those in library files
+add_executable(static-override-obj main-override.c ${MIMALLOC_OBJECT_DIR}/mimalloc${CMAKE_C_OUTPUT_EXTENSION})
+target_include_directories(static-override-obj PUBLIC ${MIMALLOC_INCLUDE_DIR})
+target_link_libraries(static-override-obj PUBLIC mimalloc-static)
+
+
+# overriding with a static library works too if using the `mimalloc-override.h`
+# header to redefine malloc/free. (the library already overrides new/delete)
+add_executable(static-override-static main-override-static.c)
+target_link_libraries(static-override-static PUBLIC mimalloc-static)
+
+
+# overriding with a static library: this may not work if the library is linked too late
+# on the command line after the C runtime library; but we cannot control that well in CMake
+add_executable(static-override main-override.c)
+target_link_libraries(static-override PUBLIC mimalloc-static)
+
+add_executable(static-override-cxx  main-override.cpp)
+target_link_libraries(static-override-cxx PUBLIC mimalloc-static)
+
+
+## test memory errors (built from test-wrong.c, linked against the shared library)
+add_executable(test-wrong  test-wrong.c)
+target_link_libraries(test-wrong PUBLIC mimalloc)
diff --git a/ext/src/mimalloc/test/main-override-dep.cpp b/ext/src/mimalloc/test/main-override-dep.cpp
new file mode 100644
index 0000000000..772e3702bf
--- /dev/null
+++ b/ext/src/mimalloc/test/main-override-dep.cpp
@@ -0,0 +1,60 @@
+// Issue #981: test overriding allocation in a DLL that is compiled independent of mimalloc.
+// This is imported by the `mimalloc-test-override` project.
+#include <string>
+#include <iostream>
+#include "main-override-dep.h"
+
+std::string TestAllocInDll::GetString()
+{
+	char* test = new char[128];  // buffer obtained through operator new[]
+	memset(test, 0, 128);        // zero-fill so the copied text ends up NUL-terminated
+	const char* t = "test";
+	memcpy(test, t, 4);          // copies the 4 characters of "test" without its terminator
+	std::string r = test;
+  std::cout << "override-dep: GetString: " << r << "\n";
+	delete[] test;               // released with the matching array delete
+	return r;
+}
+
+#include <windows.h>
+
+void TestAllocInDll::TestHeapAlloc()
+{
+	HANDLE theap = GetProcessHeap();  // handle of the calling process's heap
+	int* p = (int*)HeapAlloc(theap, 0, sizeof(int));  // NOTE(review): result is not checked for NULL before use
+	*p = 42;
+	HeapFree(theap, 0, p);  // returned to the same heap it was taken from
+}
+
+class Static {  // owns one 64-byte malloc block for the lifetime of the object
+private:
+  void* p;
+public:
+  Static() {
+    printf("override-dep: static constructor\n");
+    p = malloc(64);
+    return;
+  }
+  ~Static() {
+    free(p);
+    printf("override-dep: static destructor\n");
+    return;
+  }
+};
+
+static Static s = Static();  // file-scope instance: constructor/destructor run at static init/teardown
+
+
+#include <windows.h>
+
+BOOL WINAPI DllMain(HINSTANCE module, DWORD reason, LPVOID reserved) {
+  // logs process attach/detach notifications and always reports success
+  (void)(module);
+  (void)(reserved);
+  switch (reason) {
+    case DLL_PROCESS_ATTACH: printf("override-dep: dll attach\n"); break;
+    case DLL_PROCESS_DETACH: printf("override-dep: dll detach\n"); break;
+    default: break;
+  }
+  return TRUE;
+}
diff --git a/ext/src/mimalloc/test/main-override-dep.h b/ext/src/mimalloc/test/main-override-dep.h
new file mode 100644
index 0000000000..9d4aabfdb6
--- /dev/null
+++ b/ext/src/mimalloc/test/main-override-dep.h
@@ -0,0 +1,12 @@
+#pragma once
+// Issue #981: test overriding allocation in a DLL that is compiled independent of mimalloc. 
+// This is imported by the `mimalloc-test-override` project.
+
+#include <string>
+
+class TestAllocInDll  // helper whose two methods are exported from the DLL
+{
+public:
+	__declspec(dllexport) std::string GetString();     // returns a std::string by value
+	__declspec(dllexport) void TestHeapAlloc();        // no parameters, no result
+};
diff --git a/ext/src/mimalloc/test/main-override-static.c b/ext/src/mimalloc/test/main-override-static.c
new file mode 100644
index 0000000000..bf5e5449f2
--- /dev/null
+++ b/ext/src/mimalloc/test/main-override-static.c
@@ -0,0 +1,524 @@
+#if _WIN32
+#include <windows.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <mimalloc.h>
+#include <mimalloc-override.h>  // redefines malloc etc.
+
+static void mi_bins(void);
+
+static void double_free1();
+static void double_free2();
+static void corrupt_free();
+static void block_overflow1();
+static void block_overflow2();
+static void invalid_free();
+static void test_aslr(void);
+static void test_process_info(void);
+static void test_reserved(void);
+static void negative_stat(void);
+static void alloc_huge(void);
+static void test_heap_walk(void);
+static void test_canary_leak(void);
+static void test_manage_os_memory(void);
+// static void test_large_pages(void);
+
+
+int main() {  // runs the enabled experiment, then a short malloc/realloc/free sequence
+  mi_version();
+  mi_stats_reset();
+
+  // mi_bins();
+
+  // test_manage_os_memory();
+  // test_large_pages();
+  // detect double frees and theap corruption
+  // double_free1();
+  // double_free2();
+  // corrupt_free();
+  // block_overflow1();
+  // block_overflow2();
+  test_canary_leak();  // the only experiment currently enabled
+  // test_aslr();
+  // invalid_free();
+  // test_reserved();
+  // negative_stat();
+  // test_heap_walk();
+  // alloc_huge();
+
+
+  void* p1 = malloc(78);
+  void* p2 = malloc(24);
+  free(p1);
+  p1 = mi_malloc(8);  // allocated through the mi_ API, later passed to realloc/free
+  char* s = strdup("hello\n");
+  free(p2);
+
+  // mi_theap_t* h = mi_theap_new();
+  // mi_theap_set_default(h);
+
+  p2 = malloc(16);
+  p1 = realloc(p1, 32);
+  free(p1);
+  free(p2);
+  free(s);
+
+  /* now test if override worked by allocating/freeing across the api's*/
+  //p1 = mi_malloc(32);
+  //free(p1);
+  //p2 = malloc(32);
+  //mi_free(p2);
+
+  //mi_collect(true);
+  //mi_stats_print(NULL);
+
+  // test_process_info();
+
+  return 0;
+}
+
+static void invalid_free() {  // passes a pointer that was never allocated to free and realloc
+  free((void*)0xBADBEEF);
+  realloc((void*)0xBADBEEF, 10);  // result intentionally ignored
+}
+
+static void block_overflow1() {  // writes past the end of a 17-byte request, then frees it
+  uint8_t* p = (uint8_t*)mi_malloc(17);
+  p[18] = 0;  // valid indices are 0..16; this is two bytes past the requested size
+  free(p);
+}
+
+static void block_overflow2() {  // writes past the end of a 16-byte request, then frees it
+  uint8_t* p = (uint8_t*)mi_malloc(16);
+  p[17] = 0;  // valid indices are 0..15; this is two bytes past the requested size
+  free(p);
+}
+
+// The double free samples come from ArcHeap [1] by Insu Yun (issue #161)
+// [1]: https://arxiv.org/pdf/1903.00503.pdf
+
+static void double_free1() {  // frees p[2] twice, allocates again and prints the resulting ranges
+  void* p[256];
+  //uintptr_t buf[256];
+
+  p[0] = mi_malloc(622616);
+  p[1] = mi_malloc(655362);
+  p[2] = mi_malloc(786432);
+  mi_free(p[2]);
+  // [VULN] Double free
+  mi_free(p[2]);
+  p[3] = mi_malloc(786456);
+  // [BUG] Found overlap
+  // p[3]=0x429b2ea2000 (size=917504), p[1]=0x429b2e42000 (size=786432)
+  fprintf(stderr, "p3: %p-%p, p1: %p-%p, p2: %p\n", p[3], (uint8_t*)(p[3]) + 786456, p[1], (uint8_t*)(p[1]) + 655362, p[2]);
+}
+
+static void double_free2() {  // frees p[0] twice via malloc/free, then prints p[4] and p[1] ranges
+  void* p[256];
+  //uintptr_t buf[256];
+  // [INFO] Command buffer: 0x327b2000
+  // [INFO] Input size: 182
+  p[0] = malloc(712352);
+  p[1] = malloc(786432);
+  free(p[0]);
+  // [VULN] Double free
+  free(p[0]);
+  p[2] = malloc(786440);
+  p[3] = malloc(917504);
+  p[4] = malloc(786440);
+  // [BUG] Found overlap
+  // p[4]=0x433f1402000 (size=917504), p[1]=0x433f14c2000 (size=786432)
+  fprintf(stderr, "p1: %p-%p, p2: %p-%p\n", p[4], (uint8_t*)(p[4]) + 917504, p[1], (uint8_t*)(p[1]) + 786432);  // labels say p1/p2 but the values are p[4]/p[1]
+}
+
+
+// Try to corrupt the theap through buffer overflow
+#define N   256  // number of blocks allocated by corrupt_free
+#define SZ  64   // requested size of each block in bytes
+
+static void corrupt_free() {
+  void* p[N];
+  // allocate
+  for (int i = 0; i < N; i++) {
+    p[i] = malloc(SZ);
+  }
+  // free some
+  for (int i = 0; i < N; i += (N/10)) {  // step is 25 with N == 256
+    free(p[i]);
+    p[i] = NULL;
+  }
+  // try to corrupt the free list
+  for (int i = 0; i < N; i++) {
+    if (p[i] != NULL) {
+      memset(p[i], 0, SZ+8);  // deliberately writes 8 bytes beyond the requested size
+    }
+  }
+  // allocate more.. trying to trigger an allocation from a corrupted entry
+  // this may need many allocations to get there (if at all)
+  for (int i = 0; i < 4096; i++) {
+    malloc(SZ);  // results are dropped; these blocks are never freed
+  }
+}
+
+static void test_aslr(void) {  // prints the addresses of two large allocations
+  void* p[256];
+  p[0] = malloc(378200);
+  p[1] = malloc(1134626);
+  printf("p1: %p, p2: %p\n", p[0], p[1]);  // neither block is freed afterwards
+}
+
+static void test_process_info(void) {  // churns 100000 small callocs, then prints mi_process_info results
+  size_t elapsed = 0;
+  size_t user_msecs = 0;
+  size_t system_msecs = 0;
+  size_t current_rss = 0;
+  size_t peak_rss = 0;
+  size_t current_commit = 0;
+  size_t peak_commit = 0;
+  size_t page_faults = 0;
+  for (int i = 0; i < 100000; i++) {
+    void* p = calloc(100, 10);
+    free(p);
+  }
+  mi_process_info(&elapsed, &user_msecs, &system_msecs, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
+  printf("\n\n*** process info: elapsed %3zu.%03zu s, user: %3zu.%03zu s, rss: %zu b, commit: %zu b\n\n", elapsed/1000, elapsed%1000, user_msecs/1000, user_msecs%1000, peak_rss, peak_commit);  // %zu: all values are size_t
+}
+
+static void test_reserved(void) {  // reserves 3 GiB up front, then mixes small and GiB-sized allocations
+#define KiB 1024ULL
+#define MiB (KiB*KiB)
+#define GiB (MiB*KiB)
+  mi_reserve_os_memory(3*GiB, false, true);
+  void* p1 = malloc(100);
+  void* p2 = malloc(100000);
+  void* p3 = malloc(2*GiB);
+  void* p4 = malloc(1*GiB + 100000);
+  free(p1);
+  free(p2);
+  free(p3);
+  p3 = malloc(1*GiB);  // this last 1 GiB block is not freed before returning
+  free(p4);
+}
+
+
+
+static void negative_stat(void) {  // prints statistics while a block is live and again after freeing it
+  int* p = mi_malloc(60000);
+  mi_stats_print_out(NULL, NULL);
+  *p = 100;
+  mi_free(p);
+  mi_stats_print_out(NULL, NULL);
+}
+
+static void alloc_huge(void) {  // allocates and immediately frees one 67108872-byte block (64 MiB + 8)
+  void* p = mi_malloc(67108872);
+  mi_free(p);
+}
+
+static bool test_visit(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) {  // visitor passed to mi_heap_visit_blocks; heap and arg are unused
+  if (block == NULL) {
+    printf("visiting an area with blocks of size %zu (including padding)\n", area->full_block_size);
+  }
+  else {
+    printf("  block of size %zu (allocated size is %zu)\n", block_size, mi_usable_size(block));
+  }
+  return true;  // always true — presumably tells the walker to continue; confirm against the mimalloc API
+}
+
+static void test_heap_walk(void) {  // allocates four blocks in a fresh heap and visits them with test_visit
+  mi_heap_t* heap = mi_heap_new();
+  mi_heap_malloc(heap, 16*2097152);
+  mi_heap_malloc(heap, 2067152);
+  mi_heap_malloc(heap, 2097160);
+  mi_heap_malloc(heap, 24576);
+  mi_heap_visit_blocks(heap, true, &test_visit, NULL);  // NOTE(review): heap and blocks are not released here
+}
+
+static void test_canary_leak(void) {  // prints a 22-char buffer that is filled without a terminating NUL
+  char* p = mi_mallocn_tp(char, 22);
+  for (int i = 0; i < 22; i++) {
+    p[i] = '0'+i;  // all 22 requested bytes are written, leaving no room for '\0'
+  }
+  puts(p);  // reads until it meets a zero byte — presumably into allocator padding; intentional for this test
+  free(p);
+}
+
+#if _WIN32
+static void test_manage_os_memory(void) {  // hands a VirtualAlloc'd 256 MiB region to mimalloc and allocates from it
+  size_t size = 256 * 1024 * 1024;
+  void* ptr = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);  // NOTE(review): result is not checked
+  mi_arena_id_t arena_id;
+  mi_manage_os_memory_ex(ptr, size, true /* committed */, true /* pinned */, false /* is zero */, -1 /* numa node */, true /* exclusive */, &arena_id);
+  mi_heap_t* cuda_theap = mi_heap_new_in_arena(arena_id);    // you can do this in any thread
+
+  // now allocate only in the cuda arena
+  void* p1 = mi_heap_malloc(cuda_theap, 8);
+  int* p2  = mi_heap_malloc_tp(int,cuda_theap);
+  *p2 = 42;
+
+  // and maybe set the cuda theap as the default theap? (but careful as now `malloc` will allocate in the cuda theap as well)
+  {
+    mi_theap_t* prev_default_theap = mi_theap_set_default(mi_heap_theap(cuda_theap));  // NOTE(review): previous default is saved but never restored
+    void* p3 = mi_malloc(8);  // allocate in the cuda theap
+    mi_free(p3);
+  }
+  mi_free(p1);
+  mi_free(p2);
+}
+#else
+static void test_manage_os_memory(void) {
+  // empty
+}
+#endif
+
+// Experiment with huge OS pages
+#if 0
+
+#include <mimalloc/types.h>
+#include <mimalloc/internal.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+static void test_large_pages(void) {
+  mi_memid_t memid;
+
+#if 0
+  size_t pages_reserved;
+  size_t page_size;
+  uint8_t* p = (uint8_t*)_mi_os_alloc_huge_os_pages(1, -1, 30000, &pages_reserved, &page_size, &memid);
+  const size_t req_size = pages_reserved * page_size;
+#else
+  const size_t req_size = 64*MI_MiB;
+  uint8_t* p = (uint8_t*)_mi_os_alloc(req_size, &memid, NULL);
+#endif
+
+  p[0] = 1;
+
+  //_mi_os_protect(p, _mi_os_page_size());
+  //_mi_os_unprotect(p, _mi_os_page_size());
+  //_mi_os_decommit(p, _mi_os_page_size(), NULL);
+  if (madvise(p, req_size, MADV_HUGEPAGE) == 0) {
+    printf("advised huge pages\n");
+    _mi_os_decommit(p, _mi_os_page_size(), NULL);
+  };
+  _mi_os_free(p, req_size, memid, NULL);
+}
+
+#endif
+
+// ----------------------------
+// bin size experiments
+// ------------------------------
+
+#if 0
+#include <stdint.h>
+#include <stdbool.h>
+#include <mimalloc/bits.h>
+
+#define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE)
+
+#define MI_BIN_HUGE 100
+//#define MI_ALIGN2W
+
+// Bit scan reverse: return the index of the highest bit.
+static inline uint8_t mi_bsr32(uint32_t x);
+
+#if defined(_MSC_VER)
+//#include <Windows.h>
+#include <intrin.h>
+static inline uint8_t mi_bsr32(uint32_t x) {
+  uint32_t idx;
+  _BitScanReverse(&idx, x);
+  return idx;
+}
+#elif defined(__GNUC__) || defined(__clang__)
+static inline uint8_t mi_bsr32(uint32_t x) {
+  return (31 - __builtin_clz(x));
+}
+#else
+static inline uint8_t mi_bsr32(uint32_t x) {
+  // de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
+  static const uint8_t debruijn[32] = {
+     31,  0, 22,  1, 28, 23, 18,  2, 29, 26, 24, 10, 19,  7,  3, 12,
+     30, 21, 27, 17, 25,  9,  6, 11, 20, 16,  8,  5, 15,  4, 14, 13,
+  };
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  x++;
+  return debruijn[(x*0x076be629) >> 27];
+}
+#endif
+
+
+// Bit scan reverse: return the index of the highest bit.
+uint8_t _mi_bsr(uintptr_t x) {
+  if (x == 0) return 0;
+  #if MI_INTPTR_SIZE==8
+  uint32_t hi = (x >> 32);
+  return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi));
+  #elif MI_INTPTR_SIZE==4
+  return mi_bsr32(x);
+  #else
+  # error "define bsr for non-32 or 64-bit platforms"
+  #endif
+}
+
+static inline size_t _mi_wsize_from_size(size_t size) {
+  return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
+}
+
+// #define MI_ALIGN2W
+
+// Return the bin for a given field size.
+// Returns MI_BIN_HUGE if the size is too large.
+// We use `wsize` for the size in "machine word sizes",
+// i.e. byte size == `wsize*sizeof(void*)`.
+static inline size_t mi_bin(size_t wsize) {
+  // size_t wsize = _mi_wsize_from_size(size);
+  // size_t bin;
+  /*if (wsize <= 1) {
+    bin = 1;
+  }
+  */
+#if defined(MI_ALIGN4W)
+  if (wsize <= 4) {
+    return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes
+  }
+#elif defined(MI_ALIGN2W)
+  if (wsize <= 8) {
+    return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes
+  }
+#else
+  if (wsize <= 8) {
+    return (wsize == 0 ? 1 : wsize);
+  }
+#endif
+  else if (wsize > MI_LARGE_WSIZE_MAX) {
+    return MI_BIN_HUGE;
+  }
+  else {
+#if defined(MI_ALIGN4W)
+    if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes
+#endif
+    wsize--;
+    // find the highest bit
+    size_t idx;
+    mi_bsr(wsize, &idx);
+    uint8_t b = (uint8_t)idx;
+    // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
+    // - adjust with 3 because we use do not round the first 8 sizes
+    //   which each get an exact bin
+    const size_t bin = ((b << 2) + ((wsize >> (b - 2)) & 0x03)) - 3;
+    assert(bin > 0 && bin < MI_BIN_HUGE);
+    return bin;
+  }
+}
+
+
+static inline uint8_t _mi_bin4(size_t size) {
+  size_t wsize = _mi_wsize_from_size(size);
+  uint8_t bin;
+  if (wsize <= 1) {
+    bin = 1;
+  }
+#if defined(MI_ALIGN4W)
+  else if (wsize <= 4) {
+    bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
+  }
+#elif defined(MI_ALIGN2W)
+  else if (wsize <= 8) {
+    bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
+  }
+#else
+  else if (wsize <= 8) {
+    bin = (uint8_t)wsize;
+  }
+#endif
+  else if (wsize > MI_LARGE_WSIZE_MAX) {
+    bin = MI_BIN_HUGE;
+  }
+  else {
+    size_t idx;
+    mi_bsr(wsize, &idx);
+    uint8_t b = (uint8_t)idx;
+    bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3;
+  }
+  return bin;
+}
+
+static size_t _mi_binx4(size_t wsize) {
+  size_t bin;
+  if (wsize <= 1) {
+    bin = 1;
+  }
+  else if (wsize <= 8) {
+    // bin = (wsize+1)&~1; // round to double word sizes
+    bin = (uint8_t)wsize;
+  }
+  else {
+    size_t idx;
+    mi_bsr(wsize, &idx);
+    uint8_t b = (uint8_t)idx;
+    if (b <= 1) return wsize;
+    bin = ((b << 1) | (wsize >> (b - 1))&0x01) + 3;
+  }
+  return bin;
+}
+
+static size_t _mi_binx8(size_t bsize) {
+  if (bsize<=1) return bsize;
+  size_t idx;
+  mi_bsr(bsize, &idx);
+  uint8_t b = (uint8_t)idx;
+  if (b <= 2) return bsize;
+  size_t bin = ((b << 2) | (bsize >> (b - 2))&0x03) - 5;
+  return bin;
+}
+
+
+static inline size_t mi_binx(size_t wsize) {
+  uint8_t bin;
+  if (wsize <= 1) {
+    bin = 1;
+  }
+  else if (wsize <= 8) {
+    // bin = (wsize+1)&~1; // round to double word sizes
+    bin = (uint8_t)wsize;
+  }
+  else {
+    wsize--;
+    assert(wsize>0);
+    // find the highest bit
+    uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize));
+
+    // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
+    // - adjust with 3 because we use do not round the first 8 sizes
+    //   which each get an exact bin
+    bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
+  }
+  return bin;
+}
+
+
+static void mi_bins(void) {
+  //printf("  QNULL(1), /* 0 */ \\\n  ");
+  size_t last_bin = 0;
+  for (size_t wsize = 1; wsize <= (4*1024*1024) / 8 + 1024; wsize++) {
+    size_t bin = mi_bin(wsize);
+    if (bin != last_bin) {
+      //printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_wsize, last_wsize, last_bin);
+      printf("QNULL(%6zd), ", wsize-1);
+      if (last_bin%8 == 0) printf("/* %zu */ \\\n  ", last_bin);
+      last_bin = bin;
+    }
+  }
+}
+#endif
diff --git a/ext/src/mimalloc/test/main-override.c b/ext/src/mimalloc/test/main-override.c
new file mode 100644
index 0000000000..284fdd2040
--- /dev/null
+++ b/ext/src/mimalloc/test/main-override.c
@@ -0,0 +1,36 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#include <mimalloc-override.h>
+
+int main() {  // exercises the overridden malloc family and prints statistics
+  mi_version();       // ensure mimalloc library is linked
+  void* p1 = malloc(78);
+  void* p2 = malloc(24);
+  free(p1);
+  p1 = malloc(8);
+  //char* s = strdup("hello\n");
+  free(p2);
+  p2 = malloc(16);
+  p1 = realloc(p1, 32);
+  free(p1);
+  free(p2);
+  //free(s);
+  //mi_collect(true);
+
+  /* now test if override worked by allocating/freeing across the api's*/
+  //p1 = mi_malloc(32);
+  //free(p1);
+  //p2 = malloc(32);
+  //mi_free(p2);
+  p1 = malloc(24);
+  p2 = reallocarray(p1, 16, 16);
+  free(p2);
+  p1 = malloc(24);
+  if (reallocarr(&p1, 16, 16) != 0) { assert(!"reallocarr failed"); }  // call kept outside assert() so it also runs with NDEBUG
+  free(p1);
+  mi_stats_print(NULL);
+  return 0;
+}
diff --git a/ext/src/mimalloc/test/main-override.cpp b/ext/src/mimalloc/test/main-override.cpp
new file mode 100644
index 0000000000..c8b59ea324
--- /dev/null
+++ b/ext/src/mimalloc/test/main-override.cpp
@@ -0,0 +1,635 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <mimalloc.h>
+#include <new>
+#include <vector>
+#include <future>
+#include <iostream>
+#include <thread>
+#include <random>
+#include <chrono>
+#include <assert.h>
+
+#ifdef _WIN32
+#include <mimalloc-new-delete.h>
+#include <windows.h>
+static void msleep(unsigned long msecs) { Sleep(msecs); }  // sleep for msecs milliseconds
+#else
+#include <unistd.h>
+static void msleep(unsigned long msecs) { std::this_thread::sleep_for(std::chrono::milliseconds(msecs)); }  // usleep() may reject intervals >= 1s
+#endif
+
+static void theap_thread_free_large(); // issue #221
+static void theap_no_delete();         // issue #202
+static void theap_late_free();         // issue #204
+static void padding_shrink();         // issue #209
+static void various_tests();
+static void test_mt_shutdown();
+static void fail_aslr();              // issue #372
+static void tsan_numa_test();         // issue #414
+static void strdup_test();            // issue #445
+static void theap_thread_free_huge();
+static void test_std_string();        // issue #697
+static void test_thread_local();      // issue #944
+// static void test_mixed0();             // issue #942
+static void test_mixed1();             // issue #942
+static void test_stl_allocators();
+static void test_join();              // issue #1177
+static void test_thread_leak(void);   // issue #1104
+static void test_perf(void);          // issue #1104
+static void test_perf2(void);         // issue #1104
+static void test_perf3(void);         // issue #1104
+static void test_perf4(void);         // issue #1104
+static void test_perf5(void);         // issue #1104
+
+#if _WIN32
+#include "main-override-dep.h"
+static void test_dep();               // issue #981: test overriding in another DLL
+#else
+static void test_dep() { };
+#endif
+
+int main() {  // resets statistics, runs the enabled test, then prints statistics
+  mi_stats_reset();  // ignore earlier allocations
+  //various_tests();
+  //test_mixed1();
+
+  // test_dep();
+  // test_join();
+
+  // test_thread_leak();
+  // test_perf();
+  // test_perf2();
+  // test_perf3();
+  // test_perf4();
+  test_perf5();  // the only test currently enabled
+
+  //test_std_string();
+  //test_thread_local();
+  // theap_thread_free_huge();
+  /*
+  theap_thread_free_large();
+  theap_no_delete();
+  theap_late_free();
+  padding_shrink();
+
+  tsan_numa_test();
+  */
+  /*
+  strdup_test();
+  test_stl_allocators();
+  test_mt_shutdown();
+  */
+  //fail_aslr();
+  mi_stats_print(NULL);
+  return 0;
+}
+
+static void* p = malloc(8);  // allocated during static initialization, i.e. before main runs
+
+void free_p() {  // registered with atexit by various_tests
+  free(p);
+  return;
+}
+
+class Test {  // minimal class used by various_tests to exercise new/delete forms
+private:
+  int i;  // stored by the constructor, never read
+public:
+  Test(int x) { i = x; }
+  ~Test() { }
+};
+
+
+static void various_tests() {  // mixes malloc/free, mi_* calls and several new/delete forms
+  atexit(free_p);
+  void* p1 = malloc(78);
+  void* p2 = mi_malloc_aligned(24, 16);
+  free(p1);
+  p1 = malloc(8);
+  char* s = mi_strdup("hello\n");
+
+  mi_free(p2);
+  p2 = malloc(16);
+  p1 = realloc(p1, 32);
+  free(p1);
+  free(p2);
+  mi_free(s);
+
+  Test* t = new Test(42);
+  delete t;
+  t = new (std::nothrow) Test(42);
+  delete t;
+  auto tbuf = new unsigned char[sizeof(Test)];
+  t = new (tbuf) Test(42);  // placement new into tbuf: destroyed explicitly, storage freed via delete[]
+  t->~Test();
+  delete[] tbuf;
+
+  #if _WIN32
+  const char* ptr = ::_Getdays();  // test _base override
+  free((void*)ptr);
+  #endif
+}
+
+class Static {  // owns one 64-byte malloc block for the lifetime of the object
+private:
+  void* p;
+public:
+  Static() {
+    p = malloc(64);
+    return;
+  }
+  ~Static() {
+    free(p);
+    return;
+  }
+};
+
+static Static s = Static();  // file-scope instance: allocates before main and frees at exit
+
+
+static bool test_stl_allocator1() {  // push then pop one int through mi_stl_allocator
+  std::vector<int, mi_stl_allocator<int> > ints;
+  ints.push_back(1);
+  ints.pop_back();
+  return ints.empty();
+}
+
+struct some_struct { int i; int j; double z; };
+
+
+#if _WIN32
+static void test_dep()  // calls both TestAllocInDll methods and prints the returned string
+{
+  TestAllocInDll t;
+  std::string s = t.GetString();
+  std::cout << "test_dep GetString: " << s << "\n";
+  t.TestHeapAlloc();
+}
+#endif
+
+
+static bool test_stl_allocator2() {  // push then pop one some_struct through mi_stl_allocator
+  std::vector<some_struct, mi_stl_allocator<some_struct> > items;
+  items.push_back(some_struct());
+  items.pop_back();
+  return items.empty();
+}
+
+#if MI_HAS_HEAP_STL_ALLOCATOR
+static bool test_stl_allocator3() {  // int elements, mi_heap_stl_allocator
+  std::vector<int, mi_heap_stl_allocator<int> > ints;
+  ints.push_back(1);
+  ints.pop_back();
+  return ints.empty();
+}
+
+static bool test_stl_allocator4() {  // some_struct elements, mi_heap_stl_allocator
+  std::vector<some_struct, mi_heap_stl_allocator<some_struct> > items;
+  items.push_back(some_struct());
+  items.pop_back();
+  return items.empty();
+}
+
+static bool test_stl_allocator5() {  // int elements, mi_heap_destroy_stl_allocator
+  std::vector<int, mi_heap_destroy_stl_allocator<int> > ints;
+  ints.push_back(1);
+  ints.pop_back();
+  return ints.empty();
+}
+
+static bool test_stl_allocator6() {  // some_struct elements, mi_heap_destroy_stl_allocator
+  std::vector<some_struct, mi_heap_destroy_stl_allocator<some_struct> > items;
+  items.push_back(some_struct());
+  items.pop_back();
+  return items.empty();
+}
+#endif
+
+static void test_stl_allocators() {  // runs every allocator test; the bool results are discarded
+  test_stl_allocator1();
+  test_stl_allocator2();
+#if MI_HAS_HEAP_STL_ALLOCATOR
+  test_stl_allocator3();
+  test_stl_allocator4();
+  test_stl_allocator5();
+  test_stl_allocator6();
+#endif
+}
+
+#if 0
+#include <algorithm>
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+static void test_mixed0() {
+    std::vector<std::unique_ptr<std::size_t>> numbers(1024 * 1024 * 100);
+    std::vector<std::thread> threads(1);
+
+    std::atomic<std::size_t> index{};
+
+    auto start = std::chrono::system_clock::now();
+
+    for (auto& thread : threads) {
+        thread = std::thread{[&index, &numbers]() {
+            while (true) {
+                auto i = index.fetch_add(1, std::memory_order_relaxed);
+                if (i >= numbers.size()) return;
+
+                numbers[i] = std::make_unique<std::size_t>(i);
+            }
+        }};
+    }
+
+    for (auto& thread : threads) thread.join();
+
+    auto end = std::chrono::system_clock::now();
+
+    auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    std::cout << "Running on " << threads.size() << " threads took " << duration
+              << std::endl;
+}
+#endif
+
+void asd() {  // thread body: one 128-byte malloc/free pair
+  void* p = malloc(128);
+  free(p);
+}
+static void test_mixed1() {  // runs asd on a separate thread and waits for it
+    std::thread thread(asd);
+    thread.join();
+}
+
+#if 0
+// issue #691
+static char* cptr;
+
+static void* thread1_allocate()
+{
+  cptr = mi_calloc_tp(char,22085632);
+  return NULL;
+}
+
+static void* thread2_free()
+{
+  assert(cptr);
+  mi_free(cptr);
+  cptr = NULL;
+  return NULL;
+}
+
+static void test_large_migrate(void) {
+  auto t1 = std::thread(thread1_allocate);
+  t1.join();
+  auto t2 = std::thread(thread2_free);
+  t2.join();
+  /*
+  pthread_t thread1, thread2;
+
+  pthread_create(&thread1, NULL, &thread1_allocate, NULL);
+  pthread_join(thread1, NULL);
+
+  pthread_create(&thread2, NULL, &thread2_free, NULL);
+  pthread_join(thread2, NULL);
+  */
+  return;
+}
+#endif
+
+// issue 445
+static void strdup_test() {  // MSVC only: frees CRT-allocated strings with mi_free; empty elsewhere
+#ifdef _MSC_VER
+  char* s = _strdup("hello\n");
+  char* buf = NULL;
+  size_t len;
+  _dupenv_s(&buf, &len, "MIMALLOC_VERBOSE");
+  mi_free(buf);
+  mi_free(s);
+#endif
+}
+
+// Issue #202
+static void heap_no_delete_worker() {  // thread body: creates a heap, allocates from it, never deletes it
+  mi_heap_t* heap = mi_heap_new();
+  void* q = mi_heap_malloc(heap, 1024); (void)(q);
+  // mi_heap_delete(heap); // uncomment to prevent assertion
+}
+
+static void heap_no_delete() {  // NOTE(review): the forward declaration at the top of this file is spelled `theap_no_delete`
+  auto t1 = std::thread(heap_no_delete_worker);
+  t1.join();
+}
+
+
+// Issue #697
+static void test_std_string() {  // concatenates two long path strings and streams the result to std::cout
+  std::string path = "/Users/xxxx/Library/Developer/Xcode/DerivedData/xxxxxxxxxx/Build/Intermediates.noindex/xxxxxxxxxxx/arm64/XX_lto.o/0.arm64.lto.o";
+  std::string path1 = "/Users/xxxx/Library/Developer/Xcode/DerivedData/xxxxxxxxxx/Build/Intermediates.noindex/xxxxxxxxxxx/arm64/XX_lto.o/1.arm64.lto.o";
+  std::cout << path + "\n>>>            " + path1 + "\n>>>            " << std::endl;
+}
+
+// Issue #204
+static volatile void* global_p;  // written by t1main, read by theap_late_free
+
+static void t1main() {  // thread body: allocates from a new heap, publishes the pointer, deletes the heap
+  mi_heap_t* heap = mi_heap_new();
+  global_p = mi_heap_malloc(heap, 1024);
+  mi_heap_delete(heap);
+}
+
+static void theap_late_free() {  // frees the worker's block from the main thread before joining
+  auto t1 = std::thread(t1main);
+
+  msleep(2000);  // NOTE(review): the sleep is the only ordering between the write and the read of global_p
+  assert(global_p);
+  mi_free((void*)global_p);
+
+  t1.join();
+}
+
+// issue  #209
+static void* shared_p;  // hand-off slot between an allocating thread and a freeing thread
+static void alloc0(/* void* arg */)
+{
+  shared_p = mi_malloc(8);
+}
+
+static void padding_shrink(void)  // allocates on a worker thread, frees on the caller after join
+{
+  auto t1 = std::thread(alloc0);
+  t1.join();
+  mi_free(shared_p);
+}
+
+
+// Issue #221
+static void theap_thread_free_large_worker() {  // thread body: frees the block published in shared_p
+  mi_free(shared_p);
+}
+
+static void theap_thread_free_large() {  // 100 rounds: allocate 2 MiB + 1 here, free on a fresh thread
+  for (int i = 0; i < 100; i++) {
+    shared_p = mi_malloc_aligned(2*1024*1024 + 1, 8);
+    auto t1 = std::thread(theap_thread_free_large_worker);
+    t1.join();
+  }
+}
+
+static void theap_thread_free_huge_worker() {  // thread body: frees the block published in shared_p
+  mi_free(shared_p);
+}
+
+static void theap_thread_free_huge() {  // 10 rounds: allocate 1 GiB here, free on a fresh thread
+  for (int i = 0; i < 10; i++) {
+    shared_p = mi_malloc(1024 * 1024 * 1024);
+    auto t1 = std::thread(theap_thread_free_huge_worker);
+    t1.join();
+  }
+}
+
+static std::atomic<long> xgsum;  // receives each thread's partial sum from local_alloc
+
+static void local_alloc() {  // thread body: one million random-size callocs, some left unfreed
+  long sum = 0;
+  for(int i = 0; i < 1000000; i++) {
+    const int n = 1 + std::rand() % 1000;  // size in 1..1000
+    uint8_t* p = (uint8_t*)calloc(n, 1);
+    p[0] = 1;
+    sum += p[std::rand() % n];
+    if ((std::rand() % 100) > 24) {  // frees when the draw is 25..99; the remaining blocks are kept
+      free(p);
+    }
+  }
+  xgsum += sum;
+}
+
+static void test_thread_leak() {  // runs local_alloc on 100 threads and joins them all
+  std::vector<std::thread> threads;
+  for (int i=1; i<=100; ++i) {
+    threads.emplace_back(std::thread(&local_alloc));
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+static void test_mt_shutdown()  // allocates on async tasks, frees everything on the calling thread
+{
+  const int threads = 5;
+  std::vector< std::future< std::vector< char* > > > ts;
+
+  auto fn = [&]()  // each task returns 1000 one-byte arrays
+  {
+    std::vector< char* > ps;
+    ps.reserve(1000);
+    for (int i = 0; i < 1000; i++)
+      ps.emplace_back(new char[1]);
+    return ps;
+  };
+
+  for (int i = 0; i < threads; i++)
+    ts.emplace_back(std::async(std::launch::async, fn));
+
+  for (auto& f : ts)
+    for (auto& p : f.get())  // get() waits for the task's result
+      delete[] p;
+
+  std::cout << "done" << std::endl;
+}
+
+// issue #372
+static void fail_aslr() {  // requests 4 TiB, prints the range, then writes to a fixed address
+  size_t sz = (size_t)(4ULL << 40); // 4TiB
+  void* p = malloc(sz);
+  printf("pointer p: %p: area up to %p\n", p, (uint8_t*)p + sz);
+  *(int*)0x5FFFFFFF000 = 0;  // should segfault
+}
+
+// issues #414
+static void dummy_worker() {  // allocates zero bytes and frees the result
+  void* p = mi_malloc(0);
+  mi_free(p);
+}
+
+static void tsan_numa_test() {  // runs dummy_worker on a new thread and on the caller concurrently
+  auto t1 = std::thread(dummy_worker);
+  dummy_worker();
+  t1.join();
+}
+
+
+class MTest  // owns a 1024-byte malloc block for the object's lifetime
+{
+    char *data;
+public:
+    MTest() { data = (char*)malloc(1024); }
+    ~MTest() { free(data); };
+};
+
+thread_local MTest tlVariable;  // one MTest per thread
+
+void threadFun( int i )  // thread body: prints its index and sleeps 100 ms
+{
+    printf( "Thread %d\n", i );
+    std::this_thread::sleep_for( std::chrono::milliseconds(100) );
+}
+
+void test_thread_local()  // 99 sequential threads (i = 1..99), printing statistics after each join
+{
+    for( int i=1; i < 100; ++i )
+    {
+        std::thread t( threadFun, i );
+        t.join();
+        mi_stats_print(NULL);
+    }
+    return;
+}
+
+// issue #1177
+thread_local void* s_ptr = mi_malloc(1);  // each thread gets its own 1-byte allocation
+
+void test_join() {  // frees s_ptr on a worker thread and then on the calling thread
+  std::thread thread([]() { mi_free(s_ptr); });
+  thread.join();
+  mi_free(s_ptr);
+}
+
+
+static std::atomic<long> gsum;  // receives each thread's partial sum
+
+const int LEN[] = { 1000, 5000, 10000, 50000 };  // element counts chosen at random per allocation
+
+// adapted from example in
+// https://github.com/microsoft/mimalloc/issues/1104
+
+static void test_perf_local_alloc()  // thread body: one million zeroed int-array allocations
+{
+  // thread-local random number generator
+  std::minstd_rand rng(std::random_device{}());
+
+  long sum = 0;
+  for (int i = 0; i < 1000000; i++)
+  {
+    int len = LEN[rng() % 4];
+    int* p = (int*)mi_zalloc_aligned(len * sizeof(int), alignof(int));
+    p[0] = 1;
+    sum += p[rng() % len];
+    free(p);  // allocated with the mi_ API, released with plain free
+  }
+  std::cout << ".";
+  gsum += sum;
+}
+
+static void test_perf_run()  // runs test_perf_local_alloc on 24 threads and joins them
+{
+  std::vector<std::thread> threads;
+  for (int i = 0; i < 24; ++i)
+  {
+    threads.emplace_back(std::thread(&test_perf_local_alloc));
+  }
+  for (auto& th : threads)
+  {
+    th.join();
+  }
+  std::cout << "\n";
+}
+
+void test_perf(void)  // one test_perf_run pass, then prints the accumulated gsum
+{
+  test_perf_run();
+  std::cout << "gsum: " << gsum.load() << "\n";
+}
+
+
+static std::atomic<int> sum2;  // atomic: escape() is also called concurrently from worker threads
+
+static void escape(uint8_t* p, size_t n) {  // writes one random byte of p and folds one random byte into sum2
+  if (n==0) return;
+  p[std::rand() % n] = 42;
+  sum2.fetch_add(p[std::rand() % n], std::memory_order_relaxed);
+}
+
+void test_perf2(void) {  // 100 million rounds of a 1000-byte calloc/escape/free
+  for (size_t i = 0; i < 100000000; i++) {
+    const size_t n = 1000;
+    uint8_t* p = (uint8_t*)calloc(1, n);
+    escape(p,n);
+    free(p);
+  }
+}
+
+void test_perf3(void) {  // 5 rounds of a 1 GiB calloc/escape/free
+  for (size_t i = 0; i < 5; i++) {
+    const size_t n = (size_t)1*1024*1024*1024;
+    uint8_t* p = (uint8_t*)calloc(1, n);
+    escape(p, n);
+    free(p);
+  }
+}
+
+
+static void local_alloc4() {  // thread body: one million callocs of 0..999 bytes
+  for (int i = 0; i < 1000000; i++) {
+    const size_t n = i%1000;
+    uint8_t* p = (uint8_t*)calloc(1,n);
+    escape(p,n);
+    if (i % 4 > 0) {  // every fourth block (i % 4 == 0) is never freed
+      free(p);
+    }
+  }
+}
+
+static void test_perf4(void) {  // runs local_alloc4 on 100 threads and joins them
+  std::vector<std::thread> threads;
+  for (int i = 1; i <= 100; ++i) {
+    threads.emplace_back(std::thread(&local_alloc4));
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+
+void escape5(uint8_t* p, size_t n) {  // fills p[0..n) with the low byte of each index, then overwrites one random byte
+  if (n==0) return;
+  for (size_t i = 0; i < n; i++) {
+    p[i] = (uint8_t)(i & 0xFF);
+  }
+  p[rand() % n] = (uint8_t)(n&0xFF);
+  // asm volatile("" : : "g"(p) : "memory");   
+}
+
+static std::atomic<long> gsum5;  // atomic: every worker thread adds its partial sum here
+
+static void local_alloc5() {  // thread body: 500000 mi_malloc calls of 0..999 bytes
+  long sum = 0;
+  for (int i = 0; i < 500000; i++) {
+    const size_t n = i % 1000;
+    uint8_t* p = (uint8_t*)mi_malloc(n);
+    escape5(p, n);
+    if (i % 4 > 0) {  // every fourth block (i % 4 == 0) is never freed
+      if (n>0) { sum += p[n-1]; }
+      mi_free(p);
+    }
+  }
+  gsum5 += sum;  // single atomic add per thread
+}
+
+static void test_perf5(void) {  // runs local_alloc5 on 100 threads, joins them and prints the total
+  std::vector<std::thread> threads;
+  for (int i = 1; i <= 100; ++i) {
+    threads.emplace_back(std::thread(&local_alloc5));
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+  printf("gsum5: %li\n", gsum5.load());
+}
diff --git a/ext/src/mimalloc/test/main.c b/ext/src/mimalloc/test/main.c
new file mode 100644
index 0000000000..6d94c8eca5
--- /dev/null
+++ b/ext/src/mimalloc/test/main.c
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <assert.h>
+#include <mimalloc.h>
+
+void test_heap(void* p_out) {  // frees the caller's block, then destroys a heap holding two live blocks
+  mi_heap_t* heap = mi_heap_new();
+  void* p1 = mi_heap_malloc(heap,32);  // p1 and p2 are not freed individually
+  void* p2 = mi_heap_malloc(heap,48);
+  mi_free(p_out);
+  mi_heap_destroy(heap);
+  //mi_theap_delete(theap); mi_free(p1); mi_free(p2);
+}
+
+void test_large() {  // 1000 rounds: allocate 2 MiB, fill every byte with 'x', free
+  const size_t N = 1000;
+
+  for (size_t i = 0; i < N; ++i) {
+    size_t sz = 1ull << 21;
+    char* a = mi_mallocn_tp(char,sz);
+    for (size_t k = 0; k < sz; k++) { a[k] = 'x'; }
+    mi_free(a);
+  }
+}
+
+int main() {  // basic mi_malloc/mi_free smoke test followed by a statistics print
+  void* p1 = mi_malloc(16);
+  void* p2 = mi_malloc(1000000);
+  mi_free(p1);
+  mi_free(p2);
+  p1 = mi_malloc(16);
+  p2 = mi_malloc(16);
+  mi_free(p1);
+  mi_free(p2);
+
+  test_heap(mi_malloc(32));  // the 32-byte block is freed inside test_heap
+
+  p1 = mi_malloc_aligned(64, 16);
+  p2 = mi_malloc_aligned(160,24);  // NOTE(review): 24 is not a power of two — presumably exercises the invalid-alignment path; confirm
+  mi_free(p2);
+  mi_free(p1);
+  //test_large();
+
+  mi_collect(true);
+  mi_stats_print(NULL);
+  return 0;
+}
diff --git a/ext/src/mimalloc/test/readme.md b/ext/src/mimalloc/test/readme.md
new file mode 100644
index 0000000000..db3524cd4f
--- /dev/null
+++ b/ext/src/mimalloc/test/readme.md
@@ -0,0 +1,16 @@
+Testing allocators is difficult as bugs may only surface after particular
+allocation patterns. The main approach to testing _mimalloc_ is therefore
+to have extensive internal invariant checking (see `page_is_valid` in `page.c`
+for example), which is enabled in debug mode with `-DMI_DEBUG_FULL=ON`.
+The main testing strategy is then to run [`mimalloc-bench`][bench] using full
+invariant checking to catch any potential problems over a wide range of intensive
+allocation benchmarks and programs.
+
+However, this approach does not cover the entire API surface well, so the API is
+also tested with `test-api.c` when running `make test` (from `out/debug` etc.).
+(This is not complete yet, please add to it.)
+
+The `main.c` and `main-override.c` files are there to test whether building and overriding
+from a local install works; they are therefore built with a separate `test/CMakeLists.txt`.
+
+[bench]: https://github.com/daanx/mimalloc-bench
diff --git a/ext/src/mimalloc/test/test-api-fill.c b/ext/src/mimalloc/test/test-api-fill.c
new file mode 100644
index 0000000000..eebbd394ef
--- /dev/null
+++ b/ext/src/mimalloc/test/test-api-fill.c
@@ -0,0 +1,343 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc/types.h"
+
+#include "testhelper.h"
+
+// ---------------------------------------------------------------------------
+// Helper functions
+// ---------------------------------------------------------------------------
+bool check_zero_init(uint8_t* p, size_t size);
+#if MI_DEBUG >= 2
+bool check_debug_fill_uninit(uint8_t* p, size_t size);
+bool check_debug_fill_freed(uint8_t* p, size_t size);
+#endif
+
+// ---------------------------------------------------------------------------
+// Main testing
+// ---------------------------------------------------------------------------
+int main(void) {  // runs every fill check below and returns print_test_summary()
+  mi_option_disable(mi_option_verbose);
+
+  // ---------------------------------------------------
+  // Zeroing allocation: each block is verified with check_zero_init ("small" = MI_SMALL_SIZE_MAX / 2 bytes, "large" = MI_SMALL_SIZE_MAX * 2 bytes)
+  // ---------------------------------------------------
+  CHECK_BODY("zeroinit-zalloc-small") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_zalloc(zalloc_size);
+    result = check_zero_init(p, zalloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("zeroinit-zalloc-large") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_zalloc(zalloc_size);
+    result = check_zero_init(p, zalloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("zeroinit-zalloc_small") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_zalloc_small(zalloc_size);
+    result = check_zero_init(p, zalloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("zeroinit-calloc-small") {
+    size_t calloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_calloc(calloc_size, 1);
+    result = check_zero_init(p, calloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("zeroinit-calloc-large") {
+    size_t calloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_calloc(calloc_size, 1);
+    result = check_zero_init(p, calloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("zeroinit-rezalloc-small") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_zalloc(zalloc_size);
+    result = check_zero_init(p, zalloc_size);
+    zalloc_size *= 3;  // grow to three times the size; the whole grown block is re-checked below
+    p = (uint8_t*)mi_rezalloc(p, zalloc_size);
+    result &= check_zero_init(p, zalloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("zeroinit-rezalloc-large") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_zalloc(zalloc_size);
+    result = check_zero_init(p, zalloc_size);
+    zalloc_size *= 3;
+    p = (uint8_t*)mi_rezalloc(p, zalloc_size);
+    result &= check_zero_init(p, zalloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("zeroinit-recalloc-small") {
+    size_t calloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_calloc(calloc_size, 1);
+    result = check_zero_init(p, calloc_size);
+    calloc_size *= 3;
+    p = (uint8_t*)mi_recalloc(p, calloc_size, 1);
+    result &= check_zero_init(p, calloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("zeroinit-recalloc-large") {
+    size_t calloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_calloc(calloc_size, 1);
+    result = check_zero_init(p, calloc_size);
+    calloc_size *= 3;
+    p = (uint8_t*)mi_recalloc(p, calloc_size, 1);
+    result &= check_zero_init(p, calloc_size);
+    mi_free(p);
+  };
+
+  // ---------------------------------------------------
+  // Zeroing in aligned API: same checks as above, with every request aligned to MI_MAX_ALIGN_SIZE * 2
+  // ---------------------------------------------------
+  CHECK_BODY("zeroinit-zalloc_aligned-small") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = check_zero_init(p, zalloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("zeroinit-zalloc_aligned-large") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = check_zero_init(p, zalloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("zeroinit-calloc_aligned-small") {
+    size_t calloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_calloc_aligned(calloc_size, 1, MI_MAX_ALIGN_SIZE * 2);
+    result = check_zero_init(p, calloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("zeroinit-calloc_aligned-large") {
+    size_t calloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_calloc_aligned(calloc_size, 1, MI_MAX_ALIGN_SIZE * 2);
+    result = check_zero_init(p, calloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("zeroinit-rezalloc_aligned-small") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = check_zero_init(p, zalloc_size);
+    zalloc_size *= 3;
+    p = (uint8_t*)mi_rezalloc_aligned(p, zalloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result &= check_zero_init(p, zalloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("zeroinit-rezalloc_aligned-large") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = check_zero_init(p, zalloc_size);
+    zalloc_size *= 3;
+    p = (uint8_t*)mi_rezalloc_aligned(p, zalloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result &= check_zero_init(p, zalloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("zeroinit-recalloc_aligned-small") {
+    size_t calloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_calloc_aligned(calloc_size, 1, MI_MAX_ALIGN_SIZE * 2);
+    result = check_zero_init(p, calloc_size);
+    calloc_size *= 3;
+    p = (uint8_t*)mi_recalloc_aligned(p, calloc_size, 1, MI_MAX_ALIGN_SIZE * 2);
+    result &= check_zero_init(p, calloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("zeroinit-recalloc_aligned-large") {
+    size_t calloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_calloc_aligned(calloc_size, 1, MI_MAX_ALIGN_SIZE * 2);
+    result = check_zero_init(p, calloc_size);
+    calloc_size *= 3;
+    p = (uint8_t*)mi_recalloc_aligned(p, calloc_size, 1, MI_MAX_ALIGN_SIZE * 2);
+    result &= check_zero_init(p, calloc_size);
+    mi_free(p);
+  };
+
+#if (MI_DEBUG >= 2) && !MI_TSAN
+  // ---------------------------------------------------
+  // Debug filling: compiled only when MI_DEBUG >= 2 and MI_TSAN is off; fresh blocks are verified with check_debug_fill_uninit
+  // ---------------------------------------------------
+  CHECK_BODY("uninit-malloc-small") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_malloc(malloc_size);
+    result = check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("uninit-malloc-large") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_malloc(malloc_size);
+    result = check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("uninit-malloc_small") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_malloc_small(malloc_size);
+    result = check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("uninit-realloc-small") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_malloc(malloc_size);
+    result = check_debug_fill_uninit(p, malloc_size);
+    malloc_size *= 3;
+    p = (uint8_t*)mi_realloc(p, malloc_size);
+    result &= check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("uninit-realloc-large") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_malloc(malloc_size);
+    result = check_debug_fill_uninit(p, malloc_size);
+    malloc_size *= 3;
+    p = (uint8_t*)mi_realloc(p, malloc_size);
+    result &= check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("uninit-mallocn-small") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_mallocn(malloc_size, 1);
+    result = check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("uninit-mallocn-large") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_mallocn(malloc_size, 1);
+    result = check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("uninit-reallocn-small") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_mallocn(malloc_size, 1);
+    result = check_debug_fill_uninit(p, malloc_size);
+    malloc_size *= 3;
+    p = (uint8_t*)mi_reallocn(p, malloc_size, 1);
+    result &= check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("uninit-reallocn-large") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_mallocn(malloc_size, 1);
+    result = check_debug_fill_uninit(p, malloc_size);
+    malloc_size *= 3;
+    p = (uint8_t*)mi_reallocn(p, malloc_size, 1);
+    result &= check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("uninit-malloc_aligned-small") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_malloc_aligned(malloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("uninit-malloc_aligned-large") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_malloc_aligned(malloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+
+  CHECK_BODY("uninit-realloc_aligned-small") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_malloc_aligned(malloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = check_debug_fill_uninit(p, malloc_size);
+    malloc_size *= 3;
+    p = (uint8_t*)mi_realloc_aligned(p, malloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result &= check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("uninit-realloc_aligned-large") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_malloc_aligned(malloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = check_debug_fill_uninit(p, malloc_size);
+    malloc_size *= 3;
+    p = (uint8_t*)mi_realloc_aligned(p, malloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result &= check_debug_fill_uninit(p, malloc_size);
+    mi_free(p);
+  };
+
+  #if !(MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_GUARDED)
+  CHECK_BODY("fill-freed-small") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_malloc(malloc_size);
+    mi_free(p);  // freed before inspection: the check below reads the already-released block
+    // First sizeof(void*) bytes will contain housekeeping data, skip these
+    result = check_debug_fill_freed(p + sizeof(void*), malloc_size - sizeof(void*));
+  };
+  CHECK_BODY("fill-freed-large") {
+    size_t malloc_size = MI_SMALL_SIZE_MAX * 2;
+    uint8_t* p = (uint8_t*)mi_malloc(malloc_size);
+    mi_free(p);  // freed before inspection: the check below reads the already-released block
+    // First sizeof(void*) bytes will contain housekeeping data, skip these
+    result = check_debug_fill_freed(p + sizeof(void*), malloc_size - sizeof(void*));
+  };
+  #endif
+#endif
+
+  // ---------------------------------------------------
+  // Done
+  // ---------------------------------------------------
+  return print_test_summary();
+}
+
+// ---------------------------------------------------------------------------
+// Helper functions
+// ---------------------------------------------------------------------------
+bool check_zero_init(uint8_t* p, size_t size) {
+  // A NULL block counts as a failure; otherwise all `size` bytes must be zero.
+  if (p == NULL) { return false; }
+  const uint8_t* const end = p + size;
+  for (const uint8_t* cur = p; cur != end; ++cur) {
+    if (*cur != 0) { return false; }
+  }
+  return true;
+}
+
+#if MI_DEBUG >= 2
+bool check_debug_fill_uninit(uint8_t* p, size_t size) {
+#if MI_TRACK_VALGRIND || MI_TRACK_ASAN
+  // Valgrind/ASAN tracking builds: nothing is inspected, the check passes unconditionally.
+  (void)size; (void)p;
+  return true;
+#else
+  // NULL fails; otherwise every one of the `size` bytes must equal MI_DEBUG_UNINIT.
+  if (p == NULL) { return false; }
+  size_t idx = 0;
+  while (idx < size && p[idx] == MI_DEBUG_UNINIT) {
+    idx++;
+  }
+  return (idx == size);
+#endif
+}
+
+bool check_debug_fill_freed(uint8_t* p, size_t size) {
+#if MI_TRACK_VALGRIND
+  // Valgrind tracking builds: the bytes are not inspected and the check always passes.
+  (void)size; (void)p;
+  return true;
+#else
+  // NULL fails; otherwise all `size` bytes must carry the MI_DEBUG_FREED pattern.
+  if (p == NULL) { return false; }
+  const uint8_t* const stop = p + size;
+  for (const uint8_t* cur = p; cur != stop; cur++) {
+    if (*cur != MI_DEBUG_FREED) { return false; }
+  }
+  return true;
+#endif
+}
+#endif
diff --git a/ext/src/mimalloc/test/test-api.c b/ext/src/mimalloc/test/test-api.c
new file mode 100644
index 0000000000..7ddfef12a2
--- /dev/null
+++ b/ext/src/mimalloc/test/test-api.c
@@ -0,0 +1,526 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic ignored "-Walloc-size-larger-than="
+#endif
+
+/*
+Testing allocators is difficult as bugs may only surface after particular
+allocation patterns. The main approach to testing _mimalloc_ is therefore
+to have extensive internal invariant checking (see `page_is_valid` in `page.c`
+for example), which is enabled in debug mode with `-DMI_DEBUG_FULL=ON`.
+The main testing strategy is then to run `mimalloc-bench` [1] using full invariant
+checking to catch any potential problems over a wide range of intensive allocation
+benchmarks.
+
+However, this does not test well for the entire API surface. In this test file
+we therefore test the API over various inputs. Please add more tests :-)
+
+[1] https://github.com/daanx/mimalloc-bench
+*/
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <errno.h>
+
+#ifdef __cplusplus
+#include <vector>
+#endif
+
+#include "mimalloc.h"
+// #include "mimalloc/internal.h"
+#include "mimalloc/types.h" // for MI_DEBUG and MI_PAGE_MAX_OVERALLOC_ALIGN
+
+#include "testhelper.h"
+
+// ---------------------------------------------------------------------------
+// Test functions
+// ---------------------------------------------------------------------------
+bool test_theap1(void);               // theap tests: definitions and CHECK calls are commented out in this file
+bool test_theap2(void);
+bool test_theap_arena_destroy(void);
+bool test_theap_arena_delete(void);
+bool test_stl_allocator1(void);       // defined after main; trivially true when compiled as C
+bool test_stl_allocator2(void);
+
+bool test_stl_theap_allocator1(void); // theap STL allocator tests: definitions and CHECK calls are commented out in this file
+bool test_stl_theap_allocator2(void);
+bool test_stl_theap_allocator3(void);
+bool test_stl_theap_allocator4(void);
+
+bool mem_is_zero(uint8_t* p, size_t size) {  // true iff p is non-NULL and its first `size` bytes are all 0
+  bool all_zero = (p != NULL);
+  for (size_t pos = 0; all_zero && pos < size; pos++) {
+    all_zero = (p[pos] == 0);
+  }
+  return all_zero;
+}
+
+// ---------------------------------------------------------------------------
+// Main testing
+// ---------------------------------------------------------------------------
+int main(void) {  // runs every API check below and returns print_test_summary()
+  mi_option_disable(mi_option_verbose);
+
+  CHECK_BODY("malloc-aligned9a") { // zero-initialized 1 MiB blocks with a small (2-byte) alignment, allocated and freed twice
+    void* p = mi_zalloc_aligned(1024 * 1024, 2);
+    mi_free(p);
+    p = mi_zalloc_aligned(1024 * 1024, 2);
+    mi_free(p);
+    result = true;
+  };
+
+
+  // ---------------------------------------------------
+  // Malloc
+  // ---------------------------------------------------
+
+  CHECK_BODY("malloc-zero") {
+    void* p = mi_malloc(0);
+    result = (p != NULL);
+    mi_free(p);
+  };
+  CHECK_BODY("malloc-nomem1") {
+    result = (mi_malloc((size_t)PTRDIFF_MAX + (size_t)1) == NULL);
+  };
+  CHECK_BODY("malloc-free-null") {
+    mi_free(NULL);
+  };
+  #if MI_INTPTR_BITS > 32
+  CHECK_BODY("malloc-free-invalid-low") {
+    mi_free((void*)(MI_ZU(0x0000000003990080))); // issue #1087
+  };
+  #endif
+  CHECK_BODY("calloc-overflow") {
+    // use (size_t)&mi_calloc to get some number without triggering compiler warnings
+    result = (mi_calloc((size_t)&mi_calloc,SIZE_MAX/1000) == NULL);
+  };
+  CHECK_BODY("calloc0") {
+    void* p = mi_calloc(0,1000);
+    result = (mi_usable_size(p) <= 16);
+    mi_free(p);
+  };
+  CHECK_BODY("malloc-large") {   // see PR #544.
+    void* p = mi_malloc(67108872);
+    mi_free(p);
+  };
+
+  // ---------------------------------------------------
+  // Extended (in these tests p starts out pointing at itself, so an untouched p can be detected as p==&p)
+  // ---------------------------------------------------
+  CHECK_BODY("posix_memalign1") {
+    void* p = &p;
+    int err = mi_posix_memalign(&p, sizeof(void*), 32);
+    result = ((err==0 && (uintptr_t)p % sizeof(void*) == 0) || p==&p);
+    if (p != &p) { mi_free(p); }  // an untouched p is the address of a local variable: never hand that to mi_free
+  };
+  CHECK_BODY("posix_memalign_no_align") {
+    void* p = &p;
+    int err = mi_posix_memalign(&p, 3, 32);
+    result = (err==EINVAL && p==&p);
+  };
+  CHECK_BODY("posix_memalign_zero") {
+    void* p = &p;
+    int err = mi_posix_memalign(&p, sizeof(void*), 0);
+    if (p != &p) { mi_free(p); }  // only free when the call actually stored a block in p
+    result = (err==0);
+  };
+  CHECK_BODY("posix_memalign_nopow2") {
+    void* p = &p;
+    int err = mi_posix_memalign(&p, 3*sizeof(void*), 32);
+    result = (err==EINVAL && p==&p);
+  };
+  CHECK_BODY("posix_memalign_nomem") {
+    void* p = &p;
+    int err = mi_posix_memalign(&p, sizeof(void*), SIZE_MAX);
+    result = (err==ENOMEM && p==&p);
+  };
+
+  // ---------------------------------------------------
+  // Aligned API
+  // ---------------------------------------------------
+  CHECK_BODY("malloc-aligned1") {
+    void* p = mi_malloc_aligned(32,32); result = (p != NULL && (uintptr_t)(p) % 32 == 0); mi_free(p);
+  };
+  CHECK_BODY("malloc-aligned2") {
+    void* p = mi_malloc_aligned(48,32); result = (p != NULL && (uintptr_t)(p) % 32 == 0); mi_free(p);
+  };
+  CHECK_BODY("malloc-aligned3") {
+    void* p1 = mi_malloc_aligned(48,32); bool result1 = (p1 != NULL && (uintptr_t)(p1) % 32 == 0);
+    void* p2 = mi_malloc_aligned(48,32); bool result2 = (p2 != NULL && (uintptr_t)(p2) % 32 == 0);
+    mi_free(p2);
+    mi_free(p1);
+    result = (result1&&result2);
+  };
+  CHECK_BODY("malloc-aligned4") {
+    void* p;
+    bool ok = true;
+    for (int i = 0; i < 8 && ok; i++) {
+      p = mi_malloc_aligned(8, 16);
+      ok = (p != NULL && (uintptr_t)(p) % 16 == 0); mi_free(p);
+    }
+    result = ok;
+  };
+  CHECK_BODY("malloc-aligned5") {
+    void* p = mi_malloc_aligned(4097,4096);
+    size_t usable = mi_usable_size(p);
+    result = (usable >= 4097 && usable < 16000);
+    fprintf(stderr, "malloc_aligned5: usable size: %zu.  ", usable);  // %zu: `usable` is an (unsigned) size_t
+    mi_free(p);
+  };
+  /*
+  CHECK_BODY("malloc-aligned6") {
+    bool ok = true;
+    for (size_t align = 1; align <= MI_PAGE_MAX_OVERALLOC_ALIGN && ok; align *= 2) {
+      void* ps[8];
+      for (int i = 0; i < 8 && ok; i++) {
+        ps[i] = mi_malloc_aligned(align*13  // size
+                                 , align);
+        if (ps[i] == NULL || (uintptr_t)(ps[i]) % align != 0) {
+          ok = false;
+        }
+      }
+      for (int i = 0; i < 8 && ok; i++) {
+        mi_free(ps[i]);
+      }
+    }
+    result = ok;
+  };
+  */
+  CHECK_BODY("malloc-aligned7") {
+    void* p = mi_malloc_aligned(1024,MI_PAGE_MAX_OVERALLOC_ALIGN);
+    result = ((uintptr_t)p % MI_PAGE_MAX_OVERALLOC_ALIGN) == 0;  // inspect the address before the block is released
+    mi_free(p);
+  };
+  CHECK_BODY("malloc-aligned8") {
+    bool ok = true;
+    for (int i = 0; i < 5 && ok; i++) {
+      int n = (1 << i);
+      void* p = mi_malloc_aligned(1024, n * MI_PAGE_MAX_OVERALLOC_ALIGN);
+      ok = ((uintptr_t)p % (n*MI_PAGE_MAX_OVERALLOC_ALIGN)) == 0;
+      mi_free(p);
+    }
+    result = ok;
+  };
+  CHECK_BODY("malloc-aligned9") { // test large alignments
+    bool ok = true;
+    void* p[8] = { NULL };  // all slots start as NULL so the free loop below is safe after an early exit
+    const int max_align_shift =
+      #if SIZE_MAX > UINT32_MAX
+      28
+      #else
+      20
+      #endif
+      ;
+    size_t sizes[8] = { 8, 512, 1024 * 1024, MI_PAGE_MAX_OVERALLOC_ALIGN, MI_PAGE_MAX_OVERALLOC_ALIGN + 1, 2 * MI_PAGE_MAX_OVERALLOC_ALIGN, 8 * MI_PAGE_MAX_OVERALLOC_ALIGN, 0 };
+    for (int i = 0; i < max_align_shift && ok; i++) {
+      int align = (1 << i);
+      for (int j = 0; j < 8 && ok; j++) {
+        p[j] = mi_zalloc_aligned(sizes[j], align);
+        ok = ((uintptr_t)p[j] % align) == 0;
+      }
+      for (int j = 0; j < 8; j++) {
+        mi_free(p[j]); p[j] = NULL;  // reset: a slot skipped by an early exit must not be freed a second time
+      }
+    }
+    result = ok;
+  };
+  CHECK_BODY("malloc-aligned10") {
+    bool ok = true;
+    void* p[10+1];
+    int align;
+    int j;
+    for(j = 0, align = 1; j <= 10 && ok; align *= 2, j++ ) {
+      p[j] = mi_malloc_aligned(43 + align, align);
+      ok = ((uintptr_t)p[j] % align) == 0;
+    }
+    for ( ; j > 0; j--) {
+      mi_free(p[j-1]);
+    }
+    result = ok;
+  }
+  //CHECK_BODY("malloc_aligned11") {
+  //  mi_theap_t* theap = mi_theap_new();
+  //  void* p = mi_theap_malloc_aligned(theap, 33554426, 8);
+  //  result = mi_theap_contains_block(theap, p);
+  //  mi_theap_destroy(theap);
+  //}
+  CHECK_BODY("mimalloc-aligned12") {
+    void* p = mi_malloc_aligned(0x100, 0x100);
+    result = (((uintptr_t)p % 0x100) == 0); // #602
+    mi_free(p);
+  }
+  CHECK_BODY("mimalloc-aligned13") {
+    bool ok = true;
+    for( size_t size = 1; size <= (MI_SMALL_SIZE_MAX * 2) && ok; size++ ) {
+      for(size_t align = 1; align <= size && ok; align *= 2 ) {
+        void* p[10];
+        for(int i = 0; i < 10 && ok; i++) {
+          p[i] = mi_malloc_aligned(size,align);
+          ok = (p[i] != NULL && ((uintptr_t)(p[i]) % align) == 0);
+        }
+        for(int i = 0; i < 10 && ok; i++) {
+          mi_free(p[i]);
+        }
+        /*
+        if (ok && align <= size && ((size + MI_PADDING_SIZE) & (align-1)) == 0) {
+          size_t bsize = mi_good_size(size);
+          ok = (align <= bsize && (bsize & (align-1)) == 0);
+        }
+        */
+      }
+    }
+    result = ok;
+  }
+  CHECK_BODY("malloc-aligned-at1") {
+    void* p = mi_malloc_aligned_at(48,32,0); result = (p != NULL && ((uintptr_t)(p) + 0) % 32 == 0); mi_free(p);
+  };
+  CHECK_BODY("malloc-aligned-at2") {
+    void* p = mi_malloc_aligned_at(50,32,8); result = (p != NULL && ((uintptr_t)(p) + 8) % 32 == 0); mi_free(p);
+  };
+  CHECK_BODY("memalign1") {
+    void* p;
+    bool ok = true;
+    for (int i = 0; i < 8 && ok; i++) {
+      p = mi_memalign(16,8);
+      ok = (p != NULL && (uintptr_t)(p) % 16 == 0); mi_free(p);
+    }
+    result = ok;
+  };
+  CHECK_BODY("zalloc-aligned-small1") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = mem_is_zero(p, zalloc_size);
+    mi_free(p);
+  };
+  CHECK_BODY("rezalloc_aligned-small1") {
+    size_t zalloc_size = MI_SMALL_SIZE_MAX / 2;
+    uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = mem_is_zero(p, zalloc_size);
+    zalloc_size *= 3;
+    p = (uint8_t*)mi_rezalloc_aligned(p, zalloc_size, MI_MAX_ALIGN_SIZE * 2);
+    result = result && mem_is_zero(p, zalloc_size);
+    mi_free(p);
+  };
+
+  // ---------------------------------------------------
+  // Reallocation
+  // ---------------------------------------------------
+  CHECK_BODY("realloc-null") {
+    void* p = mi_realloc(NULL,4);
+    result = (p != NULL);
+    mi_free(p);
+  };
+
+  CHECK_BODY("realloc-null-sizezero") {
+    void* p = mi_realloc(NULL,0);  // <https://en.cppreference.com/w/c/memory/realloc> "If ptr is NULL, the behavior is the same as calling malloc(new_size)."
+    result = (p != NULL);
+    mi_free(p);
+  };
+
+  CHECK_BODY("realloc-sizezero") {
+    void* p = mi_malloc(4);
+    void* q = mi_realloc(p, 0);
+    result = (q != NULL);
+    mi_free(q);
+  };
+
+  CHECK_BODY("reallocarray-null-sizezero") {
+    void* p = mi_reallocarray(NULL,0,16);  // issue #574
+    result = (p != NULL && errno == 0);
+    mi_free(p);
+  };
+
+  // ---------------------------------------------------
+  // Returned block sizes
+  // ---------------------------------------------------
+  CHECK_BODY("umalloc1") {  // NOTE(review): verified only through assert(), so a build with NDEBUG checks nothing here
+    for(size_t size = 1; size <= 32*MI_MiB; size *= 2 ) {
+      size_t bsize;
+      void* p = mi_umalloc(size,&bsize);
+      assert(bsize >= size);
+      size_t pre_size;
+      size_t post_size;
+      p = mi_urealloc(p, size + 1024, &pre_size, &post_size);
+      assert(pre_size == bsize);
+      assert(post_size >= size + 1024);
+      size_t fsize;
+      mi_ufree(p,&fsize);
+      assert(fsize == post_size);
+    }
+  }
+
+  // ---------------------------------------------------
+  // Heaps
+  // ---------------------------------------------------
+  //CHECK("theap_destroy", test_theap1());
+  //CHECK("theap_delete", test_theap2());
+  //CHECK("theap_arena_destroy", test_theap_arena_destroy());
+  //CHECK("theap_arena_delete", test_theap_arena_delete());
+
+  //mi_stats_print(NULL);
+
+  // ---------------------------------------------------
+  // various
+  // ---------------------------------------------------
+  #if !defined(MI_TRACK_ASAN)   // realpath may leak with ASAN enabled (as the ASAN allocator intercepts it)
+  CHECK_BODY("realpath") {
+    char* s = mi_realpath( ".", NULL );
+    // printf("realpath: %s\n",s);
+    mi_free(s);
+  };
+  #endif
+
+  CHECK("stl_allocator1", test_stl_allocator1());
+  CHECK("stl_allocator2", test_stl_allocator2());
+
+  //CHECK("stl_theap_allocator1", test_stl_theap_allocator1());
+  //CHECK("stl_theap_allocator2", test_stl_theap_allocator2());
+  //CHECK("stl_theap_allocator3", test_stl_theap_allocator3());
+  //CHECK("stl_theap_allocator4", test_stl_theap_allocator4());
+
+  // ---------------------------------------------------
+  // Done
+  // ---------------------------------------------------
+  return print_test_summary();
+}
+
+// ---------------------------------------------------
+// Larger test functions
+// ---------------------------------------------------
+
+/*
+bool test_theap1(void) {
+  mi_theap_t* theap = mi_theap_new();
+  int* p1 = mi_theap_malloc_tp(theap,int);
+  int* p2 = mi_theap_malloc_tp(theap,int);
+  *p1 = *p2 = 43;
+  mi_theap_destroy(theap);
+  return true;
+}
+
+bool test_theap2(void) {
+  mi_theap_t* theap = mi_theap_new();
+  int* p1 = mi_theap_malloc_tp(theap,int);
+  int* p2 = mi_theap_malloc_tp(theap,int);
+  mi_theap_delete(theap);
+  *p1 = 42;
+  mi_free(p1);
+  mi_free(p2);
+  return true;
+}
+
+bool test_theap_arena_destroy(void) {
+  mi_arena_id_t arena_id = NULL;
+  if (mi_reserve_os_memory_ex(64 * 1024 * 1024, true, false, true, &arena_id) != 0) {
+    return false;
+  }
+  mi_theap_t* theap = mi_theap_new_ex(0, true, arena_id);
+  if (theap == NULL) {
+    return false;
+  }
+  mi_theap_destroy(theap);
+  return true;
+}
+
+bool test_theap_arena_delete(void) {
+  mi_arena_id_t arena_id = NULL;
+  if (mi_reserve_os_memory_ex(64 * 1024 * 1024, true, false, true, &arena_id) != 0) {
+    return false;
+  }
+  mi_theap_t* theap = mi_theap_new_ex(0, true, arena_id);
+  if (theap == NULL) {
+    return false;
+  }
+  mi_theap_delete(theap);
+  return true;
+}
+*/
+bool test_stl_allocator1(void) {  // C++ only: push and pop one int through mi_stl_allocator; trivially true in C
+#ifndef __cplusplus
+  return true;
+#else
+  std::vector<int, mi_stl_allocator<int> > numbers;
+  numbers.push_back(1);
+  numbers.pop_back();
+  return numbers.empty();
+#endif
+}
+
+struct some_struct  { int i; int j; double z; };  // plain aggregate used as the vector element type in test_stl_allocator2
+
+bool test_stl_allocator2(void) {  // C++ only: the same push/pop round trip with a struct element type
+#if !defined(__cplusplus)
+  return true;
+#else
+  std::vector<some_struct, mi_stl_allocator<some_struct> > items;
+  items.push_back(some_struct());
+  items.pop_back();
+  return items.empty();
+#endif
+}
+
+/*
+bool test_stl_theap_allocator1(void) {
+#ifdef __cplusplus
+  std::vector<some_struct, mi_theap_stl_allocator<some_struct> > vec;
+  vec.push_back(some_struct());
+  vec.pop_back();
+  return vec.size() == 0;
+#else
+  return true;
+#endif
+}
+
+bool test_stl_theap_allocator2(void) {
+#ifdef __cplusplus
+  std::vector<some_struct, mi_theap_destroy_stl_allocator<some_struct> > vec;
+  vec.push_back(some_struct());
+  vec.pop_back();
+  return vec.size() == 0;
+#else
+  return true;
+#endif
+}
+
+bool test_stl_theap_allocator3(void) {
+#ifdef __cplusplus
+	mi_theap_t* theap = mi_theap_new();
+	bool good = false;
+	{
+		mi_theap_stl_allocator<some_struct> myAlloc(theap);
+		std::vector<some_struct, mi_theap_stl_allocator<some_struct> > vec(myAlloc);
+		vec.push_back(some_struct());
+		vec.pop_back();
+		good = vec.size() == 0;
+	}
+	mi_theap_delete(theap);
+  return good;
+#else
+  return true;
+#endif
+}
+
+bool test_stl_theap_allocator4(void) {
+#ifdef __cplusplus
+	mi_theap_t* theap = mi_theap_new();
+	bool good = false;
+	{
+		mi_theap_destroy_stl_allocator<some_struct> myAlloc(theap);
+		std::vector<some_struct, mi_theap_destroy_stl_allocator<some_struct> > vec(myAlloc);
+		vec.push_back(some_struct());
+		vec.pop_back();
+		good = vec.size() == 0;
+	}
+	mi_theap_destroy(theap);
+  return good;
+#else
+  return true;
+#endif
+}
+*/
diff --git a/ext/src/mimalloc/test/test-stress.c b/ext/src/mimalloc/test/test-stress.c
new file mode 100644
index 0000000000..bbe0ec267b
--- /dev/null
+++ b/ext/src/mimalloc/test/test-stress.c
@@ -0,0 +1,509 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2025 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license.
+-----------------------------------------------------------------------------*/
+
+/* This is a stress test for the allocator, using multiple threads and
+   transferring objects between threads. It tries to reflect real-world workloads:
+   - allocation size is distributed linearly in powers of two
+   - with some fraction extra large (and some very large)
+   - the allocations are initialized and read again at free
+   - pointers transfer between threads
+   - threads are terminated and recreated with some objects surviving in between
+   - uses deterministic "randomness", but execution can still depend on
+     (random) thread scheduling. Do not use this test as a benchmark!
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+// #define MI_GUARDED         1
+// #define USE_STD_MALLOC     1
+
+#ifndef USE_STD_MALLOC
+#define MI_USE_HEAPS       4
+#endif
+
+// > mimalloc-test-stress [THREADS] [SCALE] [ITER]
+//
+// argument defaults
+#if defined(MI_TSAN)          // with thread-sanitizer reduce the threads to test within the azure pipeline limits
+static int THREADS = 8;
+static int SCALE   = 25;
+static int ITER    = 400;
+#elif defined(MI_UBSAN)       // with the undefined-behaviour sanitizer reduce parameters to stay within the azure pipeline limits
+static int THREADS = 8;
+static int SCALE   = 25;
+static int ITER    = 20;
+#elif defined(MI_GUARDED)     // with debug guard pages reduce parameters to stay within the azure pipeline limits
+static int THREADS = 8;
+static int SCALE   = 10;
+static int ITER    = 10;
+#elif  0                      // disabled alternative preset
+static int THREADS = 4;
+static int SCALE   = 10;
+static int ITER    = 20;
+#elif 0                       // disabled alternative preset
+static int THREADS = 32;
+static int SCALE   = 50;
+static int ITER    = 50;
+#elif 0                       // disabled alternative preset (the only one that turns on ALLOW_LARGE)
+static int THREADS = 32;
+static int SCALE   = 25;
+static int ITER    = 50;
+#define ALLOW_LARGE true
+#else                         // defaults; each value can be overridden on the command line (see main)
+static int THREADS = 32;      // more repeatable if THREADS <= #processors
+static int SCALE   = 50;      // scaling factor
+static int ITER    = 50;      // N full iterations destructing and re-creating all threads
+#endif
+
+
+
+#define STRESS                // when defined main() runs test_stress(); undefine to run the leak test instead
+
+#ifndef ALLOW_LARGE
+#define ALLOW_LARGE  false
+#endif
+
+static bool   allow_large_objects = ALLOW_LARGE;    // allow very large objects? (main() forces this to `true` when SCALE > 100)
+
+static size_t use_one_size = 0;               // if > 0: every object gets `use_one_size / sizeof(uintptr_t)` words, i.e. a single size given in bytes
+
+static bool   main_participates = false;       // main thread participates as a worker too
+
+#ifdef USE_STD_MALLOC             // route the test's allocations to the C library allocator
+
+#define custom_calloc(n,s)    calloc(n,s)
+#define custom_realloc(p,s)   realloc(p,s)
+#define custom_free(p)        free(p)
+
+#else                             // route the test's allocations to mimalloc
+
+#include <mimalloc.h>
+#include <mimalloc-stats.h>
+
+#ifdef MI_USE_HEAPS               // allocate from an explicit heap that test_stress() replaces every iteration
+static mi_heap_t* current_heap;
+#define custom_calloc(n,s)    mi_heap_calloc(current_heap,n,s)
+#define custom_realloc(p,s)   mi_heap_realloc(current_heap,p,s)
+#define custom_free(p)        mi_free(p)
+#else                             // allocate through the plain mimalloc entry points
+#define custom_calloc(n,s)    mi_calloc(n,s)
+#define custom_realloc(p,s)   mi_realloc(p,s)
+#define custom_free(p)        mi_free(p)
+#endif
+
+#ifndef NDEBUG
+#define xMI_HEAP_WALK             // heap walking is switched off: this defines `xMI_HEAP_WALK`, while the code below tests `MI_HEAP_WALK`
+#endif
+
+#endif
+
+// Shared slots through which worker threads hand pointers to each other (accessed via atomic_exchange_ptr).
+#define TRANSFERS     (1000)
+static volatile void* transfer[TRANSFERS];
+
+// Value XOR-ed into every word written by alloc_items() and verified again by free_items().
+#if (UINTPTR_MAX != UINT32_MAX)
+const uintptr_t cookie = 0xbf58476d1ce4e5b9UL;
+#else
+const uintptr_t cookie = 0x1ce4e5b9UL;
+#endif
+
+static void* atomic_exchange_ptr(volatile void** p, void* newval);
+
+typedef uintptr_t* random_t;   // handle to a caller-owned, one-word generator state
+// Scramble the state behind `r` in place and return the new state as the next pseudo-random value.
+static uintptr_t pick(random_t r) {
+  uintptr_t state = *r;
+#if (UINTPTR_MAX > UINT32_MAX)
+  // 64-bit mixing steps by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
+  state = state ^ (state >> 30);
+  state = state * 0xbf58476d1ce4e5b9UL;
+  state = state ^ (state >> 27);
+  state = state * 0x94d049bb133111ebUL;
+  state = state ^ (state >> 31);
+#else
+  // 32-bit mixing steps by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
+  state = state ^ (state >> 16);
+  state = state * 0x7feb352dUL;
+  state = state ^ (state >> 15);
+  state = state * 0x846ca68bUL;
+  state = state ^ (state >> 16);
+#endif
+  *r = state;
+  return state;
+}
+
+static bool chance(size_t perc, random_t r) {  // true with a probability of `perc` percent; draws one value from `r`
+  return (pick(r) % 100 < perc);               // strict `<`: with `<=` the hit rate was (perc+1)% and chance(0) could still succeed
+}
+
+static void* alloc_items(size_t items, random_t r) {
+  // Allocate `items` words (sometimes scaled up) and stamp word k with `(items - k) ^ cookie`; returns NULL if allocation fails.
+  if (chance(1, r)) {
+    if (chance(1, r) && allow_large_objects) { items *= 10000; }       // 0.01% giant
+    else if (chance(10, r) && allow_large_objects) { items *= 1000; }  // 0.1% huge
+    else { items *= 100; }                                             // 1% large objects;
+  }
+  if (32 <= items && items <= 40) { items *= 2; }    // pthreads uses 320b allocations (this shows that more clearly in the stats)
+  if (use_one_size != 0) { items = use_one_size / sizeof(uintptr_t); }
+  if (items < 1) { items = 1; }
+  uintptr_t* words = (uintptr_t*)custom_calloc(items,sizeof(uintptr_t));
+  if (words == NULL) return NULL;
+  for (size_t k = 0; k < items; k++) {
+    assert(words[k] == 0);
+    words[k] = (items - k) ^ cookie;
+  }
+  return words;
+}
+
+static void free_items(void* p) {  // verify the pattern written by alloc_items() and release the block; NULL is accepted
+  if (p != NULL) {
+    uintptr_t* q = (uintptr_t*)p;
+    uintptr_t items = (q[0] ^ cookie);  // word 0 holds `items ^ cookie`, which recovers the block length in words
+    for (uintptr_t i = 0; i < items; i++) {
+      if ((q[i] ^ cookie) != items - i) {
+        fprintf(stderr, "memory corruption at block %p at %zu\n", p, (size_t)i);  // `%zu` requires a size_t argument
+        abort();
+      }
+    }
+  }
+  custom_free(p);
+}
+
+#ifdef MI_HEAP_WALK
+static bool visit_blocks(const mi_theap_t* theap, const mi_theap_area_t* area, void* block, size_t block_size, void* arg) {
+  (void)(theap); (void)(area);    // unused
+  size_t* total = (size_t*)arg;   // `arg` points at the caller's running byte total
+  if (block != NULL) {
+    *total += block_size;
+  }
+  return true;                    // presumably tells the visitor to keep going -- confirm against the mimalloc visitor contract
+}
+#endif
+
+static void stress(intptr_t tid) {  // per-thread workload: a random mix of allocations, frees and pointer exchanges with other threads
+  //bench_start_thread();
+  uintptr_t r = ((tid + 1) * 43); // deterministic per-thread seed (instead of rand())
+  const size_t max_item_shift = 5; // short-lived objects get 1 << (0..4) words; the original note "128" presumably means bytes on 64-bit
+  const size_t max_item_retained_shift = max_item_shift + 2;  // retained objects get 1 << (0..6) words
+  size_t allocs = 100 * ((size_t)SCALE) * (tid % 8 + 1); // some threads do more
+  size_t retain = allocs / 2;
+  void** data = NULL;
+  size_t data_size = 0;
+  size_t data_top = 0;
+  void** retained = (void**)custom_calloc(retain,sizeof(void*));  // NOTE(review): the result is not checked for NULL
+  size_t retain_top = 0;
+
+  while (allocs > 0 || retain > 0) {
+    if (retain == 0 || (chance(50, &r) && allocs > 0)) {
+      // short-lived allocation (always taken once nothing is left to retain)
+      allocs--;
+      if (data_top >= data_size) {
+        data_size += 100000;
+        data = (void**)custom_realloc(data, data_size * sizeof(void*));  // NOTE(review): the result is not checked for NULL
+      }
+      data[data_top++] = alloc_items(1ULL << (pick(&r) % max_item_shift), &r);
+    }
+    else {
+      // allocation that is kept until the end of this function
+      retained[retain_top++] = alloc_items( 1ULL << (pick(&r) % max_item_retained_shift), &r);
+      retain--;
+    }
+    if (chance(66, &r) && data_top > 0) {
+      // 66% free previous alloc
+      size_t idx = pick(&r) % data_top;
+      free_items(data[idx]);
+      data[idx] = NULL;
+    }
+    if (chance(25, &r) && data_top > 0) {
+      // 25% exchange a local pointer with the (shared) transfer buffer.
+      size_t data_idx = pick(&r) % data_top;
+      size_t transfer_idx = pick(&r) % TRANSFERS;
+      void* p = data[data_idx];
+      void* q = atomic_exchange_ptr(&transfer[transfer_idx], p);
+      data[data_idx] = q;  // whatever was in the slot (possibly NULL, possibly another thread's block) is now owned here
+    }
+  }
+
+  #ifdef MI_HEAP_WALK
+  // sum the block sizes reported for the default theap into `total`
+  size_t total = 0;
+  mi_theap_visit_blocks(mi_theap_get_default(), true, visit_blocks, &total);
+  #endif
+
+  // free everything that is left
+  for (size_t i = 0; i < retain_top; i++) {
+    free_items(retained[i]);
+  }
+  for (size_t i = 0; i < data_top; i++) {
+    free_items(data[i]);
+  }
+  custom_free(retained);
+  custom_free(data);
+  //bench_end_thread();
+}
+
+static void run_os_threads(size_t nthreads, void (*entry)(intptr_t tid));
+
+static void test_stress(void) {  // run ITER rounds of THREADS stress() workers; part of the transfer buffer survives each round
+  #ifdef MI_USE_HEAPS
+  mi_heap_t* prev_heaps[MI_USE_HEAPS] = { NULL };  // heaps of the most recent iterations, newest first
+  #endif
+  uintptr_t r = rand();
+  for (int n = 0; n < ITER; n++) {
+
+    #ifdef MI_USE_HEAPS
+    // new heap for each iteration
+    if (prev_heaps[MI_USE_HEAPS-1] != NULL) {
+      mi_heap_delete(prev_heaps[MI_USE_HEAPS-1]);   // delete from N iterations ago
+    }
+    for(int i = MI_USE_HEAPS-1; i > 0; i--) {
+      prev_heaps[i] = prev_heaps[i-1];
+    }
+    prev_heaps[0] = current_heap;                   // NULL on the very first iteration
+    current_heap = mi_heap_new();
+    #endif
+
+    run_os_threads(THREADS, &stress);
+
+    #if !defined(NDEBUG) && !defined(USE_STD_MALLOC)
+    // switch between arena and OS allocation for testing
+    // mi_option_set_enabled(mi_option_disallow_arena_alloc, (n%2)==1);
+    #endif
+    #if defined(MI_HEAP_WALK) && defined(MI_USE_HEAPS)
+    size_t total = 0;
+    // mi_abandoned_visit_blocks(mi_subproc_main(), -1, true, visit_blocks, &total);
+    mi_heap_visit_blocks(current_heap, true, visit_blocks, &total);  // was `heap`, which is not declared in this function
+    #endif
+
+    for (int i = 0; i < TRANSFERS; i++) {
+      if (chance(50, &r) || n + 1 == ITER) { // free all on last run, otherwise free half of the transfers
+        void* p = atomic_exchange_ptr(&transfer[i], NULL);
+        free_items(p);
+      }
+    }
+
+    #if !defined(NDEBUG) || defined(MI_TSAN)
+    if ((n + 1) % 10 == 0) {
+      printf("- iterations left: %3d\n", ITER - (n + 1));
+      #ifndef USE_STD_MALLOC
+      mi_debug_show_arenas();
+      #endif
+      //mi_collect(true);
+      //mi_debug_show_arenas();
+    }
+    #endif
+  }
+
+  #ifndef USE_STD_MALLOC
+  mi_stats_print(NULL);
+  #endif
+
+  // clean up  (a bit too early to test the final free_items still works correctly)
+  #ifdef MI_USE_HEAPS
+  for (int i = 0; i < MI_USE_HEAPS; i++) {
+    if (prev_heaps[i] != NULL) { mi_heap_delete(prev_heaps[i]); prev_heaps[i] = NULL; }  // slots stay NULL when ITER < MI_USE_HEAPS + 1
+  }
+  mi_heap_delete(current_heap); current_heap = NULL;
+  #endif
+
+  for (int i = 0; i < TRANSFERS; i++) {
+    void* p = atomic_exchange_ptr(&transfer[i], NULL);
+    if (p != NULL) {
+      free_items(p);
+    }
+  }
+}
+
+#ifndef STRESS
+static void leak(intptr_t tid) {  // per-thread body of the leak test; `tid` is not used
+  uintptr_t r = rand();
+  void* p = alloc_items(1 /*pick(&r)%128*/, &r);
+  if (chance(50, &r)) {
+    intptr_t i = (pick(&r) % TRANSFERS);
+    void* q = atomic_exchange_ptr(&transfer[i], p);  // park `p` in a shared slot and free whatever was there before
+    free_items(q);
+  }                                                  // otherwise `p` is never freed here (presumably the intended leak)
+}
+
+static void test_leak(void) {  // run ITER rounds of THREADS leak() workers, calling mi_collect(false) after each round
+  for (int n = 0; n < ITER; n++) {
+    run_os_threads(THREADS, &leak);
+    mi_collect(false);
+#ifndef NDEBUG
+    if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); }
+#endif
+  }
+}
+#endif
+
+#if defined(USE_STD_MALLOC) && defined(MI_LINK_VERSION)
+#ifdef __cplusplus
+extern "C"
+#endif
+int mi_version(void);
+#endif
+
+int main(int argc, char** argv) {  // parse [THREADS] [SCALE] [ITER], run the stress (or leak) test, then print statistics
+  #ifdef MI_LINK_VERSION
+    mi_version();
+  #endif
+  #ifdef MI_HEAP_WALK
+    mi_option_enable(mi_option_visit_abandoned);
+  #endif
+  #if !defined(NDEBUG) && !defined(USE_STD_MALLOC)   // debug builds with mimalloc
+    mi_option_set(mi_option_arena_reserve, mi_arena_min_size()/1024 /* in KiB ! */);
+    mi_option_set(mi_option_purge_delay,1);
+  #endif
+  #if defined(NDEBUG) && !defined(USE_STD_MALLOC)    // release builds with mimalloc
+    // mi_option_set(mi_option_purge_delay,-1);
+    mi_option_set(mi_option_page_reclaim_on_free, 0);
+  #endif
+
+  // > mimalloc-test-stress [THREADS] [SCALE] [ITER]
+  if (argc >= 2) {
+    char* end;                       // required by strtol but not inspected afterwards
+    long n = strtol(argv[1], &end, 10);
+    if (n > 0) THREADS = n;          // values that are not positive keep the default
+  }
+  if (argc >= 3) {
+    char* end;
+    long n = (strtol(argv[2], &end, 10));
+    if (n > 0) SCALE = n;
+  }
+  if (argc >= 4) {
+    char* end;
+    long n = (strtol(argv[3], &end, 10));
+    if (n > 0) ITER = n;
+  }
+  if (SCALE > 100) {
+    allow_large_objects = true;
+  }
+  printf("Using %d threads with a %d%% load-per-thread and %d iterations%s", THREADS, SCALE, ITER, (allow_large_objects ? " (allow large objects)" : ""));
+  #if MI_USE_HEAPS                   // an undefined MI_USE_HEAPS evaluates to 0 here
+  printf(" (using %d rolling heaps)", MI_USE_HEAPS);
+  #endif
+  printf("\n");
+
+  #if !defined(NDEBUG) && !defined(USE_STD_MALLOC)
+  mi_stats_reset();
+  #endif
+
+  //mi_reserve_os_memory(1024*1024*1024ULL, false, true);
+  //int res = mi_reserve_huge_os_pages(4,1);
+  //printf("(reserve huge: %i\n)", res);
+
+  //bench_start_program();
+
+  // Run ITER full iterations where half the objects in the transfer buffer survive to the next round.
+  srand(0x7feb352d);                 // fixed seed for the rand() calls made by the tests
+  // mi_stats_reset();
+#ifdef STRESS
+    test_stress();
+#else
+    test_leak();
+#endif
+
+#ifndef USE_STD_MALLOC
+  #ifndef NDEBUG
+  mi_collect(true);
+  mi_debug_show_arenas();
+  //mi_collect(true);
+  //char* json = mi_stats_get_json(0, NULL);
+  //if (json != NULL) {
+  //  fputs(json,stderr);
+  //  mi_free(json);
+  //}
+  #endif
+  mi_collect(true);
+  mi_stats_print(NULL);
+#endif
+  //bench_end_program();
+  return 0;
+}
+
+
+static void (*thread_entry_fun)(intptr_t) = &stress;  // function the thread_entry trampolines call; run_os_threads() sets it before starting threads
+
+#ifdef _WIN32
+
+#include <windows.h>
+
+static DWORD WINAPI thread_entry(LPVOID param) {  // start routine handed to CreateThread; `param` carries the thread index
+  thread_entry_fun((intptr_t)param);
+  return 0;
+}
+
+static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) {  // run `fun(i)` on one thread per index and wait until all have finished
+  thread_entry_fun = fun;
+  DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD));          // NOTE(review): allocation results are not checked for NULL
+  HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE));
+  thandles[0] = GetCurrentThread(); // avoid lint warning
+  const size_t start = (main_participates ? 1 : 0);  // index 0 is run by the calling thread when main participates
+  for (size_t i = start; i < nthreads; i++) {
+    thandles[i] = CreateThread(0, 8*1024L, &thread_entry, (void*)(i), 0, &tids[i]);  // the index travels in the thread parameter
+  }
+  if (main_participates) fun(0); // run the main thread as well
+  for (size_t i = start; i < nthreads; i++) {
+    WaitForSingleObject(thandles[i], INFINITE);
+  }
+  for (size_t i = start; i < nthreads; i++) {
+    CloseHandle(thandles[i]);
+  }
+  custom_free(tids);
+  custom_free(thandles);
+}
+
+static void* atomic_exchange_ptr(volatile void** p, void* newval) {  // store `newval` in `*p` and return the previous pointer
+#if (INTPTR_MAX == INT32_MAX)    // 32-bit pointers
+  return (void*)InterlockedExchange((volatile LONG*)p, (LONG)newval);
+#else                            // wider pointers use the 64-bit exchange
+  return (void*)InterlockedExchange64((volatile LONG64*)p, (LONG64)newval);
+#endif
+}
+#else
+
+#include <pthread.h>
+
+static void* thread_entry(void* param) {   // start routine handed to pthread_create; `param` carries the thread index
+  thread_entry_fun((intptr_t)param);        // cast to `intptr_t`, the callee's parameter type (as the Windows variant does)
+  return NULL;
+}
+
+static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) {  // run `fun(i)` on one thread per index and wait until all have finished
+  thread_entry_fun = fun;
+  pthread_t* threads = (pthread_t*)custom_calloc(nthreads,sizeof(pthread_t));  // calloc zero-fills, so no separate memset is needed
+  if (threads == NULL && nthreads > 0) { fprintf(stderr, "unable to allocate %zu thread handles\n", nthreads); abort(); }
+  const size_t start = (main_participates ? 1 : 0);  // index 0 is run by the calling thread when main participates
+  //pthread_setconcurrency(nthreads);
+  for (size_t i = start; i < nthreads; i++) {
+    if (pthread_create(&threads[i], NULL, &thread_entry, (void*)i) != 0) { fprintf(stderr, "unable to create thread %zu\n", i); abort(); }  // never join a thread that was not created
+  }
+  if (main_participates) fun(0); // run the main thread as well
+  for (size_t i = start; i < nthreads; i++) {
+    pthread_join(threads[i], NULL);
+  }
+  custom_free(threads);
+}
+
+#ifdef __cplusplus               // compiled as C++: use <atomic>
+#include <atomic>
+static void* atomic_exchange_ptr(volatile void** p, void* newval) {  // store `newval` in `*p` and return the previous pointer
+  return std::atomic_exchange((volatile std::atomic<void*>*)p, newval);  // the plain pointer slot is reinterpreted as an atomic object
+}
+#else                            // compiled as C: use <stdatomic.h>
+#include <stdatomic.h>
+static void* atomic_exchange_ptr(volatile void** p, void* newval) {  // store `newval` in `*p` and return the previous pointer
+  return atomic_exchange((volatile _Atomic(void*)*)p, newval);           // the plain pointer slot is reinterpreted as an atomic object
+}
+#endif
+
+#endif
diff --git a/ext/src/mimalloc/test/test-wrong.c b/ext/src/mimalloc/test/test-wrong.c
new file mode 100644
index 0000000000..56a2339a75
--- /dev/null
+++ b/ext/src/mimalloc/test/test-wrong.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* test file for valgrind/asan support.
+
+   VALGRIND:
+   ----------
+   Compile in an "out/debug" folder:
+
+   > cd out/debug
+   > cmake ../.. -DMI_TRACK_VALGRIND=1
+   > make -j8
+
+   and then compile this file as:
+
+   > gcc -g -o test-wrong -I../../include ../../test/test-wrong.c libmimalloc-valgrind-debug.a -lpthread
+
+   and test as:
+
+   > valgrind ./test-wrong
+
+   
+   ASAN
+   ----------
+   Compile in an "out/debug" folder:
+
+   > cd out/debug
+   > cmake ../.. -DMI_TRACK_ASAN=1
+   > make -j8
+
+   and then compile this file as:
+
+   > clang -g -o test-wrong -I../../include ../../test/test-wrong.c libmimalloc-asan-debug.a -lpthread -fsanitize=address -fsanitize-recover=address
+
+   and test as:
+
+   > ASAN_OPTIONS=verbosity=1:halt_on_error=0 ./test-wrong
+
+
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include "mimalloc.h"
+
+#ifdef USE_STD_MALLOC
+# define mi(x) x
+#else
+# define mi(x) mi_##x
+#endif
+
+int main(int argc, char** argv) {  // deliberately performs invalid memory operations for valgrind/ASAN to report; argc/argv are unused
+  int* p = (int*)mi(malloc)(3*sizeof(int));  // never freed (see "leak p" at the end)
+
+  int* r = (int*)mi_malloc_aligned(8,16);    // always goes through mimalloc, even when USE_STD_MALLOC is defined
+  mi_free(r);
+
+  // illegal byte wise read
+  char* c = (char*)mi(malloc)(3);
+  printf("invalid byte: over: %d, under: %d\n", c[4], c[-1]);
+  mi(free)(c);
+
+  // undefined access
+  int* q = (int*)mi(malloc)(sizeof(int));
+  printf("undefined: %d\n", *q);             // reads the block before anything was written to it
+
+  // illegal int read
+  printf("invalid: over: %d, under: %d\n", q[1], q[-1]);
+
+  *q = 42;
+
+  // buffer overflow
+  q[1] = 43;
+
+  // buffer underflow
+  q[-1] = 44;
+
+  mi(free)(q);
+
+  // double free
+  mi(free)(q);
+
+  // use after free
+  printf("use-after-free: %d\n", *q);
+
+  // leak p
+  // mi_free(p)
+  return 0;
+}
\ No newline at end of file
diff --git a/ext/src/mimalloc/test/testhelper.h b/ext/src/mimalloc/test/testhelper.h
new file mode 100644
index 0000000000..a972758411
--- /dev/null
+++ b/ext/src/mimalloc/test/testhelper.h
@@ -0,0 +1,49 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef TESTHELPER_H_
+#define TESTHELPER_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <errno.h>
+
+// ---------------------------------------------------------------------------
+// Test macros: CHECK(name,predicate) and CHECK_BODY(name) { body }
+// ---------------------------------------------------------------------------
+static int ok = 0;        // number of checks that passed so far
+static int failed = 0;    // number of checks that failed so far
+// Record one check outcome and report it on stderr; always returns true (CHECK_BODY uses that to end its one-shot loop).
+static bool check_result(bool result, const char* testname, const char* fname, long lineno) {
+  if (result) {
+    ok += 1;
+    fprintf(stderr, "ok.\n");
+  }
+  else {
+    failed += 1;
+    fprintf(stderr,"\n  FAILED: %s: %s:%ld\n", testname, fname, lineno);
+    /* exit(1); */
+  }
+  return true;
+}
+
+#define CHECK_BODY(name) \
+  fprintf(stderr,"test: %s...  ", name ); \
+  errno = 0; \
+  for(bool done = false, result = true; !done; done = check_result(result,name,__FILE__,__LINE__))  /* runs the following block once; the block sets `result` */
+
+#define CHECK(name,expr)      CHECK_BODY(name){ result = (expr); }  /* single-expression form of CHECK_BODY */
+
+// Print a summary of the test run. The return value (the number of failed checks) can be used directly as the return value of main().
+static inline int print_test_summary(void) {
+  const int nfailed = failed;
+  fprintf(stderr,"\n\n---------------------------------------------\n"
+                 "succeeded: %i\n"
+                 "failed   : %i\n\n", ok, nfailed);
+  return nfailed;
+}
+
+#endif // TESTHELPER_H_
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 05b03e5953..6e771146c3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,7 +8,7 @@
 ############################################################################
 
 # Hard prerequisites
-cmake_minimum_required(VERSION 3.16)
+cmake_minimum_required(VERSION 3.18)
 
 project(SpadesToolkit
         VERSION 3.16.0
diff --git a/src/cmake/options.cmake b/src/cmake/options.cmake
index 424d459deb..c83c64acbd 100644
--- a/src/cmake/options.cmake
+++ b/src/cmake/options.cmake
@@ -98,10 +98,9 @@ if (SPADES_USE_MIMALLOC)
   set(MI_BUILD_SHARED OFF CACHE INTERNAL "" FORCE)
   set(MI_BUILD_OBJECT ON  CACHE INTERNAL "" FORCE)
   set(MI_USE_CXX      ON  CACHE INTERNAL "" FORCE)
-  if (SPADES_STATIC_BUILD AND NOT APPLE)
-    set(MI_OVERRIDE_WRAP ON CACHE INTERNAL "" FORCE)
-    set(MI_OVERRIDE_GLIBC OFF CACHE INTERNAL "" FORCE)
-  endif()
+  set(MI_SKIP_COLLECT_ON_EXIT ON CACHE INTERNAL "" FORCE)
+  set(MI_OPT_ARCH     ON  CACHE INTERNAL "" FORCE)
+  set(MI_OPT_SIMD     ON  CACHE INTERNAL "" FORCE)
 endif()
 
 option(SPADES_FORCE_COLORED_OUTPUT "Always produce ANSI-colored output (GNU/Clang only)." OFF)
diff --git a/src/common/utils/memory_limit.cpp b/src/common/utils/memory_limit.cpp
index ee3155d716..755725e513 100644
--- a/src/common/utils/memory_limit.cpp
+++ b/src/common/utils/memory_limit.cpp
@@ -29,6 +29,16 @@
 # include <jemalloc/jemalloc.h>
 #endif
 
+#if defined(SPADES_USE_MIMALLOC)   // local forward declarations of mimalloc functions
+extern "C" {                       // NOTE(review): these must stay in sync with the vendored mimalloc headers -- confirm after upgrades
+    void mi_collect(bool);
+    size_t mi_stats_total_mem();
+    int mi_reserve_os_memory(size_t	size, bool commit, bool	allow_large);
+    void mi_debug_show_arenas();
+
+};
+#endif
+
 namespace utils {
 
 void limit_memory(size_t limit) {
@@ -54,6 +64,15 @@ void limit_memory(size_t limit) {
     } else {
         INFO("Memory limit set to " << GB << " Gb");
     }
+
+    #if defined(SPADES_USE_MIMALLOC)
+    // Reserve half of the memory limit up front (commit = false, allow_large = true)
+    res = mi_reserve_os_memory(rl.rlim_cur / 2, false, true);
+    if (res != 0) {  // failure is reported through the return value (an error code), so print that rather than errno
+        WARN("Failed to reserve half of the " << GB << " Gb memory limit, mi_reserve_os_memory() call failed, error = "
+             << res << " (" << strerror(res) << "). Watch your memory consumption!");
+    }
+    #endif
 }
 
 size_t get_memory_limit() {
@@ -88,14 +107,6 @@ size_t get_max_rss() {
 
 #endif
 
-#if defined(SPADES_USE_MIMALLOC)
-extern "C" {
-    void mi_stats_merge(void);
-    void mi_collect(bool);
-    size_t mi_stats_total_mem();
-};
-#endif
-
 size_t get_used_memory() {
 #if defined(SPADES_USE_JEMALLOC)
     // Update statistics cached by mallctl
@@ -121,13 +132,12 @@ size_t get_used_memory() {
     // The statistics is also collected per pool. So we essentially need to propagate
     // the stats from per-thread pool into main one
     if (omp_get_thread_num() > 0) {
-        mi_stats_merge();
+        mi_collect(true); // FIXME: hack-hack-hack
     } else {
         unsigned nthreads = omp_get_max_threads();
 #       pragma omp parallel for
         for (unsigned i = 0; i < 2*nthreads; ++i) {
             mi_collect(true); // FIXME: hack-hack-hack
-            mi_stats_merge();
         }
     }
     return mi_stats_total_mem();