diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 446421e4..360cbf6d 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -5,6 +5,7 @@ on:
   push:
     branches:
       - 'master'
+      - 'NewFastSIMD'
     paths-ignore:
       - 'NoiseTool/**'
       - '.github/**'
@@ -36,13 +37,13 @@ jobs:
       uses: actions/checkout@v3            
     
     - name: 'CMake Configure'
-      run: cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/build -DCMAKE_BUILD_TYPE=Release -DFASTNOISE2_NOISETOOL=OFF -DFASTNOISE2_TESTS=ON ${{ matrix.cmake_options }}
+      run: cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/build -DCMAKE_BUILD_TYPE=Release -DFASTNOISE2_TOOLS=OFF -DFASTNOISE2_TESTS=ON ${{ matrix.cmake_options }}
    
     - name: 'CMake Build'
       run: cmake --build ${{ github.workspace }}/build --config Release --target FastNoiseBenchmark --parallel 4
     
     - name: 'Upload artifact'
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         name: ${{ matrix.name }}-benchmark-bin
         path: ${{ github.workspace }}/build/Release/bin/
@@ -73,9 +74,3 @@ jobs:
         repository: Auburn/FastNoise2Benchmarking
         event-type: benchmark
         client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "runid": "${{ github.run_id }}", "name": "${{ matrix.name }}", "msg": "${{ steps.message-format.outputs.value }}"}'
-
-  benchmarkbin-complete:
-    runs-on: ubuntu-latest
-    needs: benchmark-matrix
-    steps: 
-      - run: echo benchmarkbin-complete
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 5ede2767..3a490188 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -5,7 +5,7 @@ name: CI
 on:
   workflow_dispatch:
   push:
-    branches: [master,NewFastSIMD]
+    branches: [master]
   pull_request:
     branches: [master,NewFastSIMD]
   release:
@@ -36,13 +36,11 @@ jobs:
             name: Linux64-Clang
             cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
           - os: macos-13
-            target: x86_64-apple-darwin
-            name: MacOSx86_64-Clang
-            cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
-          - os: macos-14
-            target: aarch64-apple-darwin
-            name: MacOSaarch64-Clang
+            name: MacOS64-Clang
             cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
+          - os: macos-latest
+            name: MacOSARM64-Clang
+            cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64"
 
     steps:
     - name: 'Install OpenGL & xorg'
@@ -55,22 +53,22 @@ jobs:
       uses: actions/checkout@v3
 
     - name: 'CMake Build Debug'
-      run: cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/debug -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install/FastNoise2" -DFASTNOISE2_NOISETOOL=OFF -DFASTNOISE2_TESTS=OFF ${{ matrix.cmake_options }}
-
+      run: cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/debug -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install/FastNoise2" -DFASTNOISE2_TOOLS=OFF -DFASTNOISE2_TESTS=OFF ${{ matrix.cmake_options }}
+
     - name: 'CMake Install Debug'
       run: cmake --build ${{ github.workspace }}/debug --config Debug --target install --parallel 4
 
     - name: 'CMake Build Release'
-      run: cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/release -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install/FastNoise2" -DFASTNOISE2_NOISETOOL=ON -DFASTNOISE2_TESTS=ON ${{ matrix.cmake_options }}
-
+      run: cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/release -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install/FastNoise2" -DFASTNOISE2_TOOLS=ON -DFASTNOISE2_TESTS=ON ${{ matrix.cmake_options }}
+
     - name: 'CMake Install Release'
       run: cmake --build ${{ github.workspace }}/release --config Release --target install --parallel 4
 
     - if: runner.os != 'Windows'
-      run: chmod +x ${{ github.workspace }}/install/FastNoise2/bin/NoiseTool
-
+      run: chmod +x ${{ github.workspace }}/install/FastNoise2/bin/NodeEditor
+
     - name: 'Upload artifact'
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         name: ${{ matrix.name }}
         path: ${{ github.workspace }}/install/
@@ -91,57 +89,3 @@ jobs:
         file: ${{ github.workspace }}/${{ matrix.name }}.zip
         asset_name: FastNoise2-${{ github.event.release.tag_name }}-${{ matrix.name }}.zip
         tag: ${{ github.ref }}
-
-  macos-universal:
-    if: ${{ always() }}
-    needs: [ ci-matrix ]
-    name: macos Universal Build
-    runs-on: macos-latest
-    outputs:
-      matrix: ${{ steps.matrix.outputs.matrix }}
-    steps:
-    - name: 'Download artifact'
-      uses: actions/download-artifact@v3
-      with:
-        name: MacOSaarch64-Clang
-        path: MacOSaarch64-Clang
-    - name: 'Download artifact'
-      uses: actions/download-artifact@v3
-      with:
-        name: MacOSx86_64-Clang
-        path: MacOSx86_64-Clang
-    - name: 'Create Universal Binary'
-      run: |
-        mkdir -p universal/FastNoise2/lib universal/FastNoise2/bin
-        lipo -create \
-          -output universal/FastNoise2/lib/libFastNoise.dylib \
-            MacOSaarch64-Clang/FastNoise2/lib/libFastNoise.dylib \
-            MacOSx86_64-Clang/FastNoise2/lib/libFastNoise.dylib
-        lipo -create \
-          -output universal/FastNoise2/bin/NoiseTool \
-            MacOSaarch64-Clang/FastNoise2/bin/NoiseTool \
-            MacOSx86_64-Clang/FastNoise2/bin/NoiseTool
-        chmod +x universal/FastNoise2/bin/NoiseTool
-
-    - name: 'Upload artifact'
-      uses: actions/upload-artifact@v3
-      with:
-        name: MacOSUniversal-Clang
-        path: ${{ github.workspace }}/universal/
-
-    - name: 'Zip artifacts'
-      if: github.event_name == 'release'
-      uses: papeloto/action-zip@v1
-      with:
-        files: universal/
-        recursive: true
-        dest: MacOSUniversal-Clang.zip
-
-    - name: 'Upload release artifacts'
-      if: github.event_name == 'release'
-      uses: svenstaro/upload-release-action@v2
-      with:
-        repo_token: ${{ secrets.GITHUB_TOKEN }}
-        file: ${{ github.workspace }}/MacOSUniversal-Clang.zip
-        asset_name: FastNoise2-${{ github.event.release.tag_name }}-MacOSUniversal-Clang.zip
-        tag: ${{ github.ref }}
diff --git a/.gitignore b/.gitignore
index 79926da2..dc29054b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,7 +31,9 @@
 *.out
 *.app
 /.vs*
+/.idea
 /out
+/cmake-build*
 /build
 /enc_temp_folder
 /cpm-cache
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4d3fc7a4..51bffeba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,35 +1,9 @@
 # CMakeList.txt : CMake project for FastNoise2
 cmake_minimum_required(VERSION 3.7.1)
 
-project(FastNoise2 VERSION 0.10.0)
+project(FastNoise2 VERSION 0.9.4)
 set(CMAKE_CXX_STANDARD 17)
 
-message("FastNoise2 Arch: ${CMAKE_SYSTEM_PROCESSOR}") 
-
-if(CMAKE_SYSTEM_PROCESSOR MATCHES armv7)
-
-    set(FASTSIMD_COMPILE_ARMV7 true)
-    set(FASTSIMD_COMPILE_ARM true)
-    set(FASTSIMD_COMPILE_HAVE_NEON true)
-
-elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64)
-
-    set(FASTSIMD_COMPILE_AARCH64 true)
-    set(FASTSIMD_COMPILE_ARM true)
-    set(FASTSIMD_COMPILE_HAVE_NEON true)
-
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES arm64)
-
-    set(FASTSIMD_COMPILE_ARM true)
-    set(FASTSIMD_COMPILE_HAVE_NEON true)
-
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES arm)
-
-    set(FASTSIMD_COMPILE_ARM true)
-
-endif()
-
-
 # determine whether this is a standalone project or included by other projects
 if (NOT DEFINED FASTNOISE2_STANDALONE_PROJECT)
     if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
@@ -42,8 +16,11 @@ endif()
 # Build DLL
 #set(BUILD_SHARED_LIBS ON) 
 
-option(FASTNOISE2_NOISETOOL "Build NoiseTool application" ${FASTNOISE2_STANDALONE_PROJECT})
+option(FASTNOISE2_TOOLS "Build \"Node Editor\" executable" ${FASTNOISE2_STANDALONE_PROJECT})
 option(FASTNOISE2_TESTS "Build tests" OFF)
+option(FASTNOISE2_UTILITY "Build utility tools" OFF)
+
+option(FASTNOISE2_STRICT_FP "Enable strict floating point calculations to ensure output from different SIMD feature sets match EXACTLY" OFF)
 
 if(MSVC)
     #setup pdb target location
@@ -62,18 +39,21 @@ endif()
 include(GNUInstallDirs) 
 set(install_targets "")
 
+include(cmake/CPM.cmake)
 add_subdirectory(src)
 
-if(FASTNOISE2_NOISETOOL)
-    include(cmake/CPM.cmake)
-    add_subdirectory(NoiseTool)
+if(FASTNOISE2_TOOLS)
+    add_subdirectory(tools)
 endif()
 
 if(FASTNOISE2_TESTS)
-    include(cmake/CPM.cmake)  
     add_subdirectory(tests)
 endif()
 
+if(FASTNOISE2_UTILITY)
+    add_subdirectory(util)
+endif()
+
 
 #Install -----------------------------------------------------------
 
diff --git a/CMakePresets.json b/CMakePresets.json
index 92ea1c3e..39f8a0cf 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -2,8 +2,8 @@
   "version": 3,
   "configurePresets": [
     {
-      "name": "noisetool",
-      "displayName": "NoiseTool",
+      "name": "tools",
+      "displayName": "Tools",
       "generator": "Ninja Multi-Config",
       "binaryDir": "${sourceDir}/out/build/${presetName}",
       "installDir": "${sourceDir}/out/install/${presetName}",
@@ -21,9 +21,9 @@
     {
       "name": "minimal",
       "displayName": "Minimal",
-      "inherits": "noisetool",
+      "inherits": "tools",
       "cacheVariables": {
-        "FASTNOISE2_NOISETOOL": {
+        "FASTNOISE2_TOOLS": {
           "value": "False",
           "type": "BOOL"
         },
@@ -36,9 +36,9 @@
     {
       "name": "all",
       "displayName": "All",
-      "inherits": "noisetool",
+      "inherits": "tools",
       "cacheVariables": {
-        "FASTNOISE2_NOISETOOL": {
+        "FASTNOISE2_TOOLS": {
           "value": "True",
           "type": "BOOL"
         },
@@ -51,14 +51,14 @@
   ],
   "buildPresets": [
     {
-      "name": "noisetool-debug",
-      "displayName": "NoiseTool Debug",
-      "configurePreset": "noisetool"
+      "name": "tools-debug",
+      "displayName": "Tools Debug",
+      "configurePreset": "tools"
     },
     {
-      "name": "noisetool-release",
-      "displayName": "NoiseTool Release",
-      "configurePreset": "noisetool",
+      "name": "tools-release",
+      "displayName": "Tools Release",
+      "configurePreset": "tools",
       "configuration": "Release"
     },
     {
diff --git a/NoiseTool/DemoNodeTrees.inl b/NoiseTool/DemoNodeTrees.inl
deleted file mode 100644
index 85b6add5..00000000
--- a/NoiseTool/DemoNodeTrees.inl
+++ /dev/null
@@ -1,7 +0,0 @@
-#pragma once
-
-inline const char* gDemoNodeTrees[][2] =
-{
-    { "Simple Terrain", "EQACAAAAAAAgQBAAAAAAQBkAEwDD9Sg/DQAEAAAAAAAgQAkAAGZmJj8AAAAAPwEEAAAAAAAAAEBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAM3MTD4AMzMzPwAAAAA/" },
-    { "Cellular Caves", "EwCamZk+GgABEQACAAAAAADgQBAAAACIQR8AFgABAAAACwADAAAAAgAAAAMAAAAEAAAAAAAAAD8BFAD//wAAAAAAAD8AAAAAPwAAAAA/AAAAAD8BFwAAAIC/AACAPz0KF0BSuB5AEwAAAKBABgAAj8J1PACamZk+AAAAAAAA4XoUPw==" },
-};
diff --git a/NoiseTool/MeshNoisePreview.cpp b/NoiseTool/MeshNoisePreview.cpp
deleted file mode 100644
index 5f9a9d3b..00000000
--- a/NoiseTool/MeshNoisePreview.cpp
+++ /dev/null
@@ -1,735 +0,0 @@
-#include <algorithm>
-#include <thread>
-#include <cmath>
-
-#include <Corrade/Utility/Resource.h>
-#include <Magnum/Math/Color.h>
-#include <Magnum/Math/Frustum.h>
-#include <Magnum/Math/Intersection.h>
-#include <Magnum/Math/Matrix4.h>
-#include <Magnum/GL/Context.h>
-#include <Magnum/GL/Extensions.h>
-
-#include "ImGuiExtra.h"
-#include "MeshNoisePreview.h"
-
-using namespace Magnum;
-
-MeshNoisePreview::MeshNoisePreview()
-{
-    mBuildData.frequency = 0.005f;
-    mBuildData.seed = 1338;
-    mBuildData.isoSurface = 0.0f;
-    mBuildData.heightmapMultiplier = 100.0f;
-    mBuildData.color = Color3( 1.0f );
-    mBuildData.meshType = MeshType_Voxel3D;
-
-    uint32_t threadCount = std::max( 2u, std::thread::hardware_concurrency() );
-
-    threadCount -= threadCount / 4;
-
-    for( uint32_t i = 0; i < threadCount; i++ )
-    {
-        mThreads.emplace_back( GenerateLoopThread, std::ref( mGenerateQueue ), std::ref( mCompleteQueue ) );
-    }
-
-    SetupSettingsHandlers();
-}
-
-MeshNoisePreview::~MeshNoisePreview()
-{
-    mGenerateQueue.KillThreads();
-
-    for( auto& thread : mThreads )
-    {
-        thread.join();
-    }
-}
-
-void MeshNoisePreview::ReGenerate( FastNoise::SmartNodeArg<> generator )
-{
-    mLoadRange = 200.0f;
-    mBuildData.generator = generator;
-    mBuildData.pos = Vector3i( 0 );
-
-    mMinMax = {};
-    mMinAirY = INFINITY;
-    mMaxSolidY = -INFINITY;
-
-    mRegisteredChunkPositions.clear();
-    mChunks.clear();
-    mGenerateQueue.Clear();
-    mBuildData.genVersion = mCompleteQueue.IncVersion();
-
-    Chunk::MeshData meshData;
-    while( mCompleteQueue.Pop( meshData ) )
-    {
-        meshData.Free();
-    }
-}
-
-void MeshNoisePreview::Draw( const Matrix4& transformation, const Matrix4& projection, const Vector3& cameraPosition )
-{
-    if( ImGui::Checkbox( "Generate Mesh Preview", &mEnabled ) )
-    {
-        ReGenerate( mBuildData.generator );    
-        ImGuiExtra::MarkSettingsDirty();    
-    }
-
-    if( !mBuildData.generator || !mEnabled )
-    {
-        return;
-    }
-
-    UpdateChunkQueues( cameraPosition );
-
-    Matrix4 transformationProjection = projection * transformation;
-
-    Frustum camFrustum = Frustum::fromMatrix( transformationProjection );
-    mShader.SetTransformationProjectionMatrix( transformationProjection );
-
-    mTriCount = 0;
-    mMeshesCount = 0;
-    uint32_t drawnTriCount = 0;
-
-    for( Chunk& chunk : mChunks )
-    {
-        if( GL::Mesh* mesh = chunk.GetMesh() )
-        {
-            int32_t meshTriCount = mesh->count();
-
-            mTriCount += meshTriCount;
-            mMeshesCount++;
-
-            Vector3 posf( chunk.GetPos());
-            Range3D bbox( posf, posf + Vector3( Chunk::SIZE + 1 ) );
-
-            if( mBuildData.meshType == MeshType_Heightmap2D )
-            {
-                bbox.min().y() = mMinMax.min * mBuildData.heightmapMultiplier;
-                bbox.max().y() = mMinMax.max * mBuildData.heightmapMultiplier;
-            }
-
-            if( Math::Intersection::rangeFrustum( bbox, camFrustum ) )
-            {
-                drawnTriCount += meshTriCount;
-                mShader.draw( *mesh );
-            }
-        }
-    }
-    mTriCount /= 3;
-
-    bool edited = false;
-    edited |= ImGui::Combo( "Mesh Type", reinterpret_cast<int*>( &mBuildData.meshType ), MeshTypeStrings );
-    edited |= ImGuiExtra::ScrollCombo( reinterpret_cast<int*>( &mBuildData.meshType ), MeshType_Count );
-    
-    if( ImGui::ColorEdit3( "Mesh Colour", mBuildData.color.data() ) )
-    {        
-        mShader.SetColorTint( mBuildData.color );
-        ImGuiExtra::MarkSettingsDirty();
-    }
-
-    edited |= ImGui::DragInt( "Seed", &mBuildData.seed );
-    edited |= ImGui::DragFloat( "Frequency", &mBuildData.frequency, 0.0005f, 0, 0, "%.4f" );
-
-    if( mBuildData.meshType == MeshType_Heightmap2D )
-    {
-        edited |= ImGui::DragFloat( "Heightmap Multiplier", &mBuildData.heightmapMultiplier, 0.5f );        
-    }
-    else
-    {
-        edited |= ImGui::DragFloat( "Iso Surface", &mBuildData.isoSurface, 0.02f );
-    }
-
-    if( edited )
-    {
-        ReGenerate( mBuildData.generator );
-        ImGuiExtra::MarkSettingsDirty();
-    }
-
-    float triLimitMil = (float)mTriLimit / 1000000.0f;
-    if( ImGui::DragFloat( "Triangle Limit", &triLimitMil, 1, 10.0f, 300.0f, "%0.1fM" ) )
-    {
-        mTriLimit = (uint32_t)( triLimitMil * 1000000 );
-        ImGuiExtra::MarkSettingsDirty();
-    }
-
-    ImGui::Text( "Triangle Count: %0.1fM (%0.1fM)", mTriCount / 1000000.0f, drawnTriCount / 3000000.0f );
-    ImGui::Text( "Voxel Count: %0.1fM", ( mChunks.size() * Chunk::SIZE * Chunk::SIZE * Chunk::SIZE ) / 1000000.0 );
-    ImGui::Text( "Loaded Chunks: %zu (%d)", mChunks.size(), mMeshesCount );
-
-    size_t generateCount = mGenerateQueue.Count();
-    ImGui::Text( "Meshing Chunks: %zu (%zu)", mRegisteredChunkPositions.size() - mChunks.size() - generateCount, generateCount );
-    ImGui::Text( "Chunk Load Range: %0.1f", mLoadRange );
-    ImGui::Text( "Generated Min (%0.6f) : Max (%0.6f)", mMinMax.min, mMinMax.max );
-
-    if( mBuildData.meshType != MeshType_Heightmap2D )
-    {
-        ImGui::Text( "Min Air Y (%0.1f) : Max Solid Y (%0.1f)", mMinAirY, mMaxSolidY );
-    }
-
-    ImGui::Text( "Camera Pos: %0.1f, %0.1f, %0.1f", cameraPosition.x(), cameraPosition.y(), cameraPosition.z() );
-
-    UpdateChunksForPosition( cameraPosition );
-}
-
-float MeshNoisePreview::GetLoadRangeModifier()
-{
-    return std::min( 0.01f, (float)(1000 / std::pow( std::min( 1000.0f, mLoadRange ), 1.5 ) ) );
-}
-
-void MeshNoisePreview::UpdateChunkQueues( const Vector3& position )
-{
-    size_t queueCount = mCompleteQueue.Count();
-
-    if( mTriCount > mTriLimit ) // Reduce load range if over tri limit
-    {
-        mLoadRange = std::max( mLoadRange * (1 - GetLoadRangeModifier()), Chunk::SIZE * 1.5f );
-    }
-
-    StartTimer();
-    Vector3i chunkPos = Vector3i( position - Vector3( Chunk::SIZE / 2.0f ) );
-
-    size_t newChunks = 0;
-    if( queueCount )
-    {        
-        Chunk::MeshData meshData;
-
-        while( GetTimerDurationMs() < 14 && mCompleteQueue.Pop( meshData ) )
-        {
-            mMinMax << meshData.minMax;
-            mMinAirY = std::min( mMinAirY, meshData.minAirY );
-            mMaxSolidY = std::max( mMaxSolidY, meshData.maxSolidY );
-            
-            mChunks.emplace_back( meshData );
-            newChunks++;
-        }
-        mAvgNewChunks += (newChunks - mAvgNewChunks) * 0.01f;
-    }
-
-    std::sort( mChunks.begin(), mChunks.end(), 
-        [chunkPos]( const Chunk& a, const Chunk& b )
-        {
-            return (chunkPos - a.GetPos()).dot() < (chunkPos - b.GetPos()).dot();
-        } );
-
-    // Unload further chunk if out of load range
-    size_t deletedChunks = 0;
-    while( !mChunks.empty() )
-    {
-        Vector3i backChunkPos = mChunks.back().GetPos();
-        float unloadRange = mLoadRange * 1.1f;
-        if( GetTimerDurationMs() < 15 && (chunkPos - backChunkPos).dot() > unloadRange * unloadRange )
-        {
-            mRegisteredChunkPositions.erase( backChunkPos );
-            mChunks.pop_back();
-            deletedChunks++;
-        }
-        else
-        {
-            break;
-        }
-    }
-
-    //ImGui::Text( " Queued Chunks: %zu", queueCount );
-    //ImGui::Text( "    New Chunks: %zu (%0.1f)", newChunks, mAvgNewChunks );
-    //ImGui::Text( "Deleted Chunks: %zu", deletedChunks );
-
-    // Increase load range if queue is not full
-    if( (double)mTriCount < mTriLimit * 0.85 && (mRegisteredChunkPositions.size() - mChunks.size()) < mThreads.size() * mAvgNewChunks )
-    {
-        mLoadRange = std::min( mLoadRange * (1 + GetLoadRangeModifier()), 3000.0f );
-    }
-
-}
-
-void MeshNoisePreview::UpdateChunksForPosition( Vector3 position )
-{
-    //StartTimer();
-    int chunkRange = (int)ceilf( mLoadRange / Chunk::SIZE );
-
-    position -= Vector3( Chunk::SIZE * 0.5f );
-    Vector3i positionI = Vector3i( position );
-
-    Vector3i chunkCenter = (positionI / Chunk::SIZE) * Chunk::SIZE;
-
-    std::vector<Vector3i> chunkPositions;
-    Vector3i chunkPos;
-    int loadRangeSq = (int)(mLoadRange * mLoadRange);
-
-    int staggerShift = std::min( 5, (int)((loadRangeSq * (int64_t)mLoadRange) / 1000000000) );
-    int staggerCount = (1 << staggerShift) - 1;
-
-    for( int x = -chunkRange; x <= chunkRange; x++ )
-    {
-        if( (x & staggerCount) != (mStaggerCheck & staggerCount) )
-        {
-            continue;
-        }
-
-        chunkPos.x() = x * Chunk::SIZE + chunkCenter.x();
-
-        for( int y = -chunkRange; y <= chunkRange; y++ )
-        {
-            if( mBuildData.meshType == MeshType_Heightmap2D )
-            {
-                positionI.y() = 0;
-                chunkPos.y() = 0;
-                y = chunkRange;
-            }
-            else
-            {
-                chunkPos.y() = y * Chunk::SIZE + chunkCenter.y();
-            }
-
-            for( int z = -chunkRange; z <= chunkRange; z++ )
-            {
-                chunkPos.z() = z * Chunk::SIZE + chunkCenter.z();
-
-
-                if( ( positionI - chunkPos ).dot() <= loadRangeSq &&
-                    !mRegisteredChunkPositions.contains( chunkPos ) )
-                {
-                    chunkPositions.push_back( chunkPos );
-                }
-            }
-        }
-    }
-
-    mStaggerCheck++;
-
-    std::sort( chunkPositions.begin(), chunkPositions.end(), [positionI]( const Vector3i& a, const Vector3i& b )
-    {
-        return (positionI - a).dot() < (positionI - b).dot();
-    } );
-
-    for( const Vector3i& pos : chunkPositions )
-    {
-        mBuildData.pos = pos;
-        mRegisteredChunkPositions.insert( pos );
-
-        if( mGenerateQueue.Push( mBuildData ) >= mThreads.size() * 16 )
-        {
-            break;
-        }
-    }
-
-    //ImGui::Text( "UpdateChunksForPosition(%d) Ms: %.2f", staggerShift, GetTimerDurationMs() );
-}
-
-void MeshNoisePreview::GenerateLoopThread( GenerateQueue<Chunk::BuildData>& generateQueue, CompleteQueue<Chunk::MeshData>& completeQueue )
-{
-    while( true )
-    {
-        Chunk::BuildData buildData = generateQueue.Pop();
-
-        if( generateQueue.ShouldKillThread() )
-        {
-            return;
-        }
-
-        Chunk::MeshData meshData = Chunk::BuildMeshData( buildData );
-
-        if( !completeQueue.Push( meshData, buildData.genVersion ) )
-        {
-            meshData.Free();
-        }
-    }
-}
-
-MeshNoisePreview::Chunk::MeshData MeshNoisePreview::Chunk::BuildMeshData( const BuildData& buildData )
-{
-    thread_local static std::vector<float> densityValues( SIZE_GEN * SIZE_GEN * SIZE_GEN );
-    thread_local static std::vector<VertexData> vertexData;
-    thread_local static std::vector<uint32_t> indicies;
-    
-    vertexData.clear();
-    indicies.clear();
-    
-    switch( buildData.meshType )
-    {
-    case MeshType_Voxel3D:
-        return BuildVoxel3DMesh( buildData, densityValues.data(), vertexData, indicies );
-
-    case MeshType_Heightmap2D:
-        return BuildHeightMap2DMesh( buildData, densityValues.data(), vertexData, indicies );
-
-    case MeshType_Count:
-        break;
-    }           
-
-    return MeshData( buildData.pos, {}, vertexData, indicies );
-}
-
-MeshNoisePreview::Chunk::MeshData MeshNoisePreview::Chunk::BuildVoxel3DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies )
-{
-    FastNoise::OutputMinMax minMax = buildData.generator->GenUniformGrid3D( densityValues,
-                                                                            buildData.pos.x() - 1, buildData.pos.y() - 1, buildData.pos.z() - 1,
-                                                                            SIZE_GEN, SIZE_GEN, SIZE_GEN, buildData.frequency, buildData.seed );
-    float minAir = INFINITY;
-    float maxSolid = -INFINITY;
-
-#if FASTNOISE_CALC_MIN_MAX
-    if( minMax.min > buildData.isoSurface )
-    {
-        minAir = (float)buildData.pos.y();
-    }
-    else if( minMax.max < buildData.isoSurface )
-    {
-        maxSolid = (float)buildData.pos.y() - 1.0f + SIZE;
-    }
-    else
-#endif
-    {
-        Vector3 light = LIGHT_DIR.normalized() * ( 1.0f - AMBIENT_LIGHT ) + Vector3( AMBIENT_LIGHT );
-
-        float xLight = std::abs( light.x() );
-        float yLight = std::abs( light.y() );
-        float zLight = std::abs( light.z() );
-
-        constexpr int32_t STEP_X = 1;
-        constexpr int32_t STEP_Y = SIZE_GEN;
-        constexpr int32_t STEP_Z = SIZE_GEN * SIZE_GEN;
-
-        int32_t noiseIdx = STEP_X + STEP_Y + STEP_Z;
-
-        for( uint32_t z = 0; z < SIZE; z++ )
-        {
-            float zf = z + (float)buildData.pos.z();
-
-            for( uint32_t y = 0; y < SIZE; y++ )
-            {
-                float yf = y + (float)buildData.pos.y();
-
-                for( uint32_t x = 0; x < SIZE; x++ )
-                {
-                    float xf = x + (float)buildData.pos.x();
-
-                    if( densityValues[noiseIdx] <= buildData.isoSurface ) // Is Solid?
-                    {
-                        maxSolid = std::max( yf, maxSolid );
-
-                        if( densityValues[noiseIdx + STEP_X] > buildData.isoSurface ) // Right
-                        {
-                            AddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, STEP_X, STEP_Y, STEP_Z, xLight,
-                                       Vector3( xf + 1, yf, zf ), Vector3( xf + 1, yf + 1, zf ), Vector3( xf + 1, yf + 1, zf + 1 ), Vector3( xf + 1, yf, zf + 1 ) );
-                        }
-
-                        if( densityValues[noiseIdx - STEP_X] > buildData.isoSurface ) // Left
-                        {
-                            AddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, -STEP_X, -STEP_Y, STEP_Z, 1.0f - xLight,
-                                       Vector3( xf, yf + 1, zf ), Vector3( xf, yf, zf ), Vector3( xf, yf, zf + 1 ), Vector3( xf, yf + 1, zf + 1 ) );
-                        }
-
-                        if( densityValues[noiseIdx + STEP_Y] > buildData.isoSurface ) // Up
-                        {
-                            AddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, STEP_Y, STEP_Z, STEP_X, yLight,
-                                       Vector3( xf, yf + 1, zf ), Vector3( xf, yf + 1, zf + 1 ), Vector3( xf + 1, yf + 1, zf + 1 ), Vector3( xf + 1, yf + 1, zf ) );
-                        }
-
-                        if( densityValues[noiseIdx - STEP_Y] > buildData.isoSurface ) // Down
-                        {
-                            AddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, -STEP_Y, -STEP_Z, STEP_X, 1.0f - yLight,
-                                       Vector3( xf, yf, zf + 1 ), Vector3( xf, yf, zf ), Vector3( xf + 1, yf, zf ), Vector3( xf + 1, yf, zf + 1 ) );
-                        }
-
-                        if( densityValues[noiseIdx + STEP_Z] > buildData.isoSurface ) // Forward
-                        {
-                            AddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, STEP_Z, STEP_X, STEP_Y, zLight,
-                                       Vector3( xf, yf, zf + 1 ), Vector3( xf + 1, yf, zf + 1 ), Vector3( xf + 1, yf + 1, zf + 1 ), Vector3( xf, yf + 1, zf + 1 ) );
-                        }
-
-                        if( densityValues[noiseIdx - STEP_Z] > buildData.isoSurface ) // Back
-                        {
-                            AddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, -STEP_Z, -STEP_X, STEP_Y, 1.0f - zLight,
-                                       Vector3( xf + 1, yf, zf ), Vector3( xf, yf, zf ), Vector3( xf, yf + 1, zf ), Vector3( xf + 1, yf + 1, zf ) );
-                        }
-                    }
-                    else
-                    {
-                        minAir = std::min( yf, minAir );
-                    }
-                    noiseIdx++;
-                }
-
-                noiseIdx += STEP_X * 2;
-            }
-
-            noiseIdx += STEP_Y * 2;
-        }
-    }
-
-    return MeshData( buildData.pos, minMax, vertexData, indicies, minAir, maxSolid );
-}
-
-void MeshNoisePreview::Chunk::AddQuadAO( std::vector<VertexData>& verts, std::vector<uint32_t>& indicies, const float* density, float isoSurface,
-                                         int32_t idx, int32_t facingOffset, int32_t offsetA, int32_t offsetB, float light, Vector3 pos00, Vector3 pos01, Vector3 pos11, Vector3 pos10 )
-{
-    int32_t facingIdx = idx + facingOffset;
-
-    uint8_t sideA0 = density[facingIdx - offsetA] <= isoSurface;
-    uint8_t sideA1 = density[facingIdx + offsetA] <= isoSurface;
-    uint8_t sideB0 = density[facingIdx - offsetB] <= isoSurface;
-    uint8_t sideB1 = density[facingIdx + offsetB] <= isoSurface;
-    
-    uint8_t corner00 = (sideA0 & sideB0) || density[facingIdx - offsetA - offsetB] <= isoSurface;
-    uint8_t corner01 = (sideA0 & sideB1) || density[facingIdx - offsetA + offsetB] <= isoSurface;
-    uint8_t corner10 = (sideA1 & sideB0) || density[facingIdx + offsetA - offsetB] <= isoSurface;
-    uint8_t corner11 = (sideA1 & sideB1) || density[facingIdx + offsetA + offsetB] <= isoSurface;
-
-    constexpr float aoAdjust = AO_STRENGTH / 3.0f; 
-
-    float ao00 = (float)(sideA0 + sideB0 + corner00) * aoAdjust;
-    float ao01 = (float)(sideA1 + sideB0 + corner10) * aoAdjust;
-    float ao10 = (float)(sideA0 + sideB1 + corner01) * aoAdjust;
-    float ao11 = (float)(sideA1 + sideB1 + corner11) * aoAdjust;
-
-    float densityLightShift = 1 - (isoSurface - density[idx]) * 2;
-    light *= densityLightShift * densityLightShift;
-
-    uint32_t vertIdx = (uint32_t)verts.size();
-    verts.emplace_back( pos00, (1.0f - ao00) * light );
-    verts.emplace_back( pos01, (1.0f - ao01) * light );
-    verts.emplace_back( pos10, (1.0f - ao10) * light );
-    verts.emplace_back( pos11, (1.0f - ao11) * light );
-
-    // Rotate tris to give best visuals for AO lighting
-    uint32_t triRotation = ( ao00 + ao11 > ao01 + ao10 ) * 2;    
-    indicies.push_back( vertIdx );
-    indicies.push_back( vertIdx + 3 - triRotation );
-    indicies.push_back( vertIdx + 2 );
-    indicies.push_back( vertIdx + 3 );
-    indicies.push_back( vertIdx + triRotation );
-    indicies.push_back( vertIdx + 1 );
-}
-
-MeshNoisePreview::Chunk::MeshData MeshNoisePreview::Chunk::BuildHeightMap2DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies )
-{
-    constexpr uint32_t SIZE_GEN_HEIGHTMAP = SIZE + 1;
-
-    FastNoise::OutputMinMax minMax = buildData.generator->GenUniformGrid2D( densityValues,
-                                                                            buildData.pos.x(), buildData.pos.z(),
-                                                                            SIZE_GEN_HEIGHTMAP, SIZE_GEN_HEIGHTMAP, buildData.frequency, buildData.seed );
-    constexpr int32_t STEP_X = 1;
-    constexpr int32_t STEP_Y = SIZE_GEN_HEIGHTMAP;
-
-    Vector3 sunLight = LIGHT_DIR.normalized() * ( 1.0f - AMBIENT_LIGHT ) + Vector3( AMBIENT_LIGHT );
-
-    int32_t noiseIdx = 0;
-
-    for( uint32_t y = 0; y < SIZE; y++ )
-    {
-        float yf = y + (float)buildData.pos.z();
-
-        for( uint32_t x = 0; x < SIZE; x++ )
-        {
-            float xf = x + (float)buildData.pos.x();
-
-            Vector3 v00( xf, densityValues[noiseIdx] * buildData.heightmapMultiplier, yf );
-            Vector3 v01( xf, densityValues[noiseIdx + STEP_Y] * buildData.heightmapMultiplier, yf + 1 );
-            Vector3 v10( xf + 1, densityValues[noiseIdx + STEP_X] * buildData.heightmapMultiplier, yf );
-            Vector3 v11( xf + 1, densityValues[noiseIdx + STEP_X + STEP_Y] * buildData.heightmapMultiplier, yf + 1 );            
-
-            // Normal for quad
-            float light = ( sunLight * (
-                Math::cross( v10 - v11, v00 - v11 ).normalized() +
-                Math::cross( v01 - v00, v11 - v00 ).normalized() ).normalized() ).dot();
-
-            uint32_t vertIdx = (uint32_t)vertexData.size();
-            vertexData.emplace_back( v00, light );
-            vertexData.emplace_back( v01, light );
-            vertexData.emplace_back( v10, light );
-            vertexData.emplace_back( v11, light );
-
-            // Slice quad along longest split
-            uint32_t triRotation = 2 * ( (v00 + v11).dot() < (v01 + v10).dot() );            
-            indicies.push_back( vertIdx );
-            indicies.push_back( vertIdx + 3 - triRotation );
-            indicies.push_back( vertIdx + 2 );
-            indicies.push_back( vertIdx + 3 );
-            indicies.push_back( vertIdx + triRotation );
-            indicies.push_back( vertIdx + 1 );
-
-            noiseIdx++;
-        }
-
-        noiseIdx += STEP_X;
-    }
-
-    return MeshData( buildData.pos, minMax, vertexData, indicies );
-}
-
-MeshNoisePreview::Chunk::Chunk( MeshData& meshData )
-{
-    mPos = meshData.pos;
-
-    if( !meshData.vertexData.isEmpty() )
-    {
-        //https://doc.magnum.graphics/magnum/classMagnum_1_1GL_1_1Mesh.html
-
-        mMesh = std::make_unique<GL::Mesh>( GL::MeshPrimitive::Triangles );
-
-        mMesh->addVertexBuffer( GL::Buffer( GL::Buffer::TargetHint::Array, meshData.vertexData ), 0, VertexLightShader::PositionLight{} );
-
-        if( meshData.indicies.isEmpty() )
-        {
-            mMesh->setCount( (int)meshData.vertexData.size() );
-        }
-        else
-        {
-            mMesh->setCount( (Int)meshData.indicies.size() );
-            mMesh->setIndexBuffer( GL::Buffer( GL::Buffer::TargetHint::ElementArray, meshData.indicies ), 0, GL::MeshIndexType::UnsignedInt, 0, (UnsignedInt)meshData.vertexData.size() - 1 );
-        }
-    }
-
-    meshData.Free();
-}
-
-MeshNoisePreview::VertexLightShader::VertexLightShader()
-{
-    Utility::Resource noiseToolResources( "NoiseTool" );
-
-#ifndef MAGNUM_TARGET_GLES
-    const GL::Version version = GL::Context::current().supportedVersion( { GL::Version::GL320, GL::Version::GL310, GL::Version::GL300, GL::Version::GL210 } );
-#else
-    const GL::Version version = GL::Context::current().supportedVersion( { GL::Version::GLES300, GL::Version::GLES200 } );
-#endif
-    
-    GL::Shader vert = CreateShader( version, GL::Shader::Type::Vertex );
-    GL::Shader frag = CreateShader( version, GL::Shader::Type::Fragment );
-    
-    CORRADE_INTERNAL_ASSERT_OUTPUT(
-        vert.addSource( noiseToolResources.getString( "VertexLight.vert" ) ).compile() );
-    CORRADE_INTERNAL_ASSERT_OUTPUT( 
-        frag.addSource( noiseToolResources.getString( "VertexLight.frag" ) ).compile() );
-
-    attachShader( vert );
-    attachShader( frag );
-
-    /* ES3 has this done in the shader directly */
-#if !defined(MAGNUM_TARGET_GLES) || defined(MAGNUM_TARGET_GLES2)
-#ifndef MAGNUM_TARGET_GLES
-    if( !GL::Context::current().isExtensionSupported<GL::Extensions::ARB::explicit_attrib_location>( version ) )
-#endif
-    {
-        bindAttributeLocation( PositionLight::Location, "positionLight" );
-    }
-#endif
-
-    CORRADE_INTERNAL_ASSERT_OUTPUT( link() );
-
-#ifndef MAGNUM_TARGET_GLES
-    if( !GL::Context::current().isExtensionSupported<GL::Extensions::ARB::explicit_uniform_location>( version ) )
-#endif
-    {
-        mTransformationProjectionMatrixUniform = uniformLocation( "transformationProjectionMatrix" );
-        mColorTintUniform = uniformLocation( "colorTint" );
-    }
-
-    /* Set defaults in OpenGL ES (for desktop they are set in shader code itself) */
-#ifdef MAGNUM_TARGET_GLES
-    SetTransformationProjectionMatrix( Matrix4{} );
-    SetColorTint( Color3 { 1.0f } );
-#endif
-}
-
-GL::Shader MeshNoisePreview::VertexLightShader::CreateShader( GL::Version version, GL::Shader::Type type )
-{
-    GL::Shader shader( version, type );
-
-#ifndef MAGNUM_TARGET_GLES
-    if( GL::Context::current().isExtensionDisabled<GL::Extensions::ARB::explicit_attrib_location>( version ) )
-        shader.addSource( "#define DISABLE_GL_ARB_explicit_attrib_location\n" );
-    if( GL::Context::current().isExtensionDisabled<GL::Extensions::ARB::shading_language_420pack>( version ) )
-        shader.addSource( "#define DISABLE_GL_ARB_shading_language_420pack\n" );
-    if( GL::Context::current().isExtensionDisabled<GL::Extensions::ARB::explicit_uniform_location>( version ) )
-        shader.addSource( "#define DISABLE_GL_ARB_explicit_uniform_location\n" );
-#endif
-
-#ifndef MAGNUM_TARGET_GLES2
-    if( type == GL::Shader::Type::Vertex && GL::Context::current().isExtensionDisabled<GL::Extensions::MAGNUM::shader_vertex_id>( version ) )
-        shader.addSource( "#define DISABLE_GL_MAGNUM_shader_vertex_id\n" );
-#endif
-
-/* My Android emulator (running on NVidia) doesn't define GL_ES
-       preprocessor macro, thus *all* the stock shaders fail to compile */
-/** @todo remove this when Android emulator is sane */
-#ifdef CORRADE_TARGET_ANDROID
-    shader.addSource( "#ifndef GL_ES\n#define GL_ES 1\n#endif\n" );
-#endif
-
-    return shader;
-}
-
-MeshNoisePreview::VertexLightShader& MeshNoisePreview::VertexLightShader::SetTransformationProjectionMatrix( const Matrix4& matrix )
-{
-    setUniform( mTransformationProjectionMatrixUniform, matrix );
-    return *this;
-}
-
-MeshNoisePreview::VertexLightShader& MeshNoisePreview::VertexLightShader::SetColorTint( const Color3& color )
-{
-    setUniform( mColorTintUniform, Vector4( color, 1.0f ) );
-    return *this;
-}
-
-void MeshNoisePreview::StartTimer()
-{
-    mTimerStart = std::chrono::high_resolution_clock::now();
-}
-
-float MeshNoisePreview::GetTimerDurationMs()
-{
-    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - mTimerStart).count() / 1e3f;
-}
-
-void MeshNoisePreview::SetupSettingsHandlers()
-{
-    ImGuiSettingsHandler editorSettings;
-    editorSettings.TypeName = "NoiseToolMeshNoisePreview";
-    editorSettings.TypeHash = ImHashStr( editorSettings.TypeName );
-    editorSettings.UserData = this;
-    editorSettings.WriteAllFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, ImGuiTextBuffer* outBuf ) {
-        auto* meshNoisePreview = (MeshNoisePreview*)handler->UserData;
-        outBuf->appendf( "\n[%s][Settings]\n", handler->TypeName );
-
-        outBuf->appendf( "tri_limit=%d\n", (int)meshNoisePreview->mTriLimit );
-        outBuf->appendf( "frequency=%f\n", meshNoisePreview->mBuildData.frequency );
-        outBuf->appendf( "iso_surface=%f\n", meshNoisePreview->mBuildData.isoSurface );
-        outBuf->appendf( "heightmap_multiplier=%f\n", meshNoisePreview->mBuildData.heightmapMultiplier );
-        outBuf->appendf( "seed=%d\n", meshNoisePreview->mBuildData.seed );
-        outBuf->appendf( "color=%d\n", (int)meshNoisePreview->mBuildData.color.toSrgbInt() );
-        outBuf->appendf( "mesh_type=%d\n", (int)meshNoisePreview->mBuildData.meshType );
-        outBuf->appendf( "enabled=%d\n", (int)meshNoisePreview->mEnabled );
-    };
-    editorSettings.ReadOpenFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, const char* name ) -> void* {
-        if( strcmp( name, "Settings" ) == 0 )
-        {
-            return handler->UserData;
-        }
-
-        return nullptr;
-    };
-    editorSettings.ReadLineFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, void* entry, const char* line ) {
-        auto* meshNoisePreview = (MeshNoisePreview*)handler->UserData;
-
-        sscanf( line, "tri_limit=%d", &meshNoisePreview->mTriLimit );
-        sscanf( line, "frequency=%f", &meshNoisePreview->mBuildData.frequency );
-        sscanf( line, "iso_surface=%f", &meshNoisePreview->mBuildData.isoSurface );
-        sscanf( line, "heightmap_multiplier=%f", &meshNoisePreview->mBuildData.heightmapMultiplier );
-        sscanf( line, "seed=%d", &meshNoisePreview->mBuildData.seed );
-        sscanf( line, "mesh_type=%d", (int*)&meshNoisePreview->mBuildData.meshType );
-
-        int i;
-        if( sscanf( line, "color=%d", &i ) == 1 )
-        {
-            meshNoisePreview->mBuildData.color = Color3::fromSrgbInt( i );
-        }
-        else if( sscanf( line, "enabled=%d", &i ) == 1 )
-        {
-            meshNoisePreview->mEnabled = i;
-        }
-    };
-
-    ImGuiExtra::AddOrReplaceSettingsHandler( editorSettings );
-}
\ No newline at end of file
diff --git a/NoiseTool/NoiseTexture.cpp b/NoiseTool/NoiseTexture.cpp
deleted file mode 100644
index b0d5174d..00000000
--- a/NoiseTool/NoiseTexture.cpp
+++ /dev/null
@@ -1,416 +0,0 @@
-#include <cstdio>
-#include <fstream>
-#include <filesystem>
-
-#define IMGUI_DEFINE_MATH_OPERATORS
-#include <imgui.h>
-#include <imgui_internal.h>
-
-#include <Corrade/Containers/ArrayViewStl.h>
-#include <Magnum/PixelFormat.h>
-#include <Magnum/GL/TextureFormat.h>
-#include <Magnum/Math/Functions.h>
-#include <Magnum/ImGuiIntegration/Widgets.h>
-
-#include <FastNoise/Metadata.h>
-
-#include "NoiseTexture.h"
-#include "ImGuiExtra.h"
-
-
-using namespace Magnum;
-
-NoiseTexture::NoiseTexture()
-{
-    mBuildData.iteration = 0;
-    mBuildData.frequency = 0.02f;
-    mBuildData.seed = 1337;
-    mBuildData.size = { -1, -1 };
-    mBuildData.offset = {};
-    mBuildData.generationType = GenType_2D;
-
-    mExportBuildData.size = { 4096, 4096 };
-
-    for( size_t i = 0; i < 2; i++ )
-    {
-        mThreads.emplace_back( GenerateLoopThread, std::ref( mGenerateQueue ), std::ref( mCompleteQueue ) );
-    }
-
-    SetupSettingsHandlers();
-}
-
-NoiseTexture::~NoiseTexture()
-{
-    for( auto& thread : mThreads )
-    {
-        mGenerateQueue.KillThreads();
-        thread.join();
-    }
-    
-    if( mExportThread.joinable() )
-    {
-        mExportThread.join();
-    }
-}
-
-void NoiseTexture::Draw()
-{
-    TextureData texData;
-    if( mCompleteQueue.Pop( texData ) )
-    {
-        if( mCurrentIteration < texData.iteration )
-        {
-            mCurrentIteration = texData.iteration;
-            ImageView2D noiseImage( PixelFormat::RGBA8Srgb, texData.size, texData.textureData );
-            SetPreviewTexture( noiseImage );
-        }
-        texData.Free();
-    }
-
-    ImGui::SetNextWindowSize( ImVec2( 768, 768 ), ImGuiCond_FirstUseEver );
-    ImGui::SetNextWindowPos( ImVec2( 1143, 305 ), ImGuiCond_FirstUseEver );
-    if( ImGui::Begin( "Texture Preview", nullptr, ImGuiWindowFlags_NoScrollbar | ImGuiWindowFlags_NoScrollWithMouse ) )
-    {
-        //ImGui::Text( "Min: %0.6f Max: %0.6f", mMinMax.min, mMinMax.max );
-
-        ImGui::PushItemWidth( 82.0f );
-        bool edited = false;
-
-        edited |= ImGui::Combo( "Generation Type", reinterpret_cast<int*>( &mBuildData.generationType ), GenTypeStrings );
-        edited |= ImGuiExtra::ScrollCombo( reinterpret_cast<int*>( &mBuildData.generationType ), GenType_Count );
-        
-        ImVec2 contentSize = ImGui::GetContentRegionAvail();
-        ImGui::SameLine();
-
-        Vector2i texSize = { mBuildData.size.x(), mBuildData.size.y() };
-
-        if( ImGui::DragInt2( "Size", texSize.data(), 2, 4, 8192 ) )
-        {
-            ImVec2 delta( Vector2{ texSize - mBuildData.size } );
-
-            ImVec2 windowSize = ImGui::GetWindowSize();
-
-            windowSize += delta;
-            contentSize += delta;
-
-            ImGui::SetWindowSize( windowSize );
-        }
-        ImGui::SameLine();
-
-        edited |= ImGui::DragInt( "Seed", &mBuildData.seed );
-        ImGui::SameLine();
-
-        edited |= ImGui::DragFloat( "Frequency", &mBuildData.frequency, 0.001f );
-        ImGui::SameLine();
-
-        if( mBuildData.generator && ImGui::Button( "Export BMP" ) )
-        {
-            auto size = mExportBuildData.size;
-            mExportBuildData = mBuildData;
-            mExportBuildData.size = size;
-            ImGui::OpenPopup( "Export BMP" );
-        }
-
-        ImGui::PopItemWidth();
-
-        if( contentSize.x >= 1 && contentSize.y >= 1 &&
-            (edited || mBuildData.size.x() != (int)contentSize.x || mBuildData.size.y() != (int)contentSize.y) )
-        {
-            Vector2i newSize = { (int)contentSize.x, (int)contentSize.y };
-
-            mBuildData.offset.xy() -= Vector2( newSize - mBuildData.size ) / 2;
-            mBuildData.size = newSize;
-            ReGenerate( mBuildData.generator );
-        }
-
-        if( edited )
-        {
-            ImGuiExtra::MarkSettingsDirty();
-        }
-
-        ImGui::PushStyleColor( ImGuiCol_Button, 0 );
-        ImGui::PushStyleColor( ImGuiCol_ButtonActive, 0 );
-        ImGui::PushStyleColor( ImGuiCol_ButtonHovered, 0 );
-        ImGuiIntegration::imageButton( mNoiseTexture, Vector2( mNoiseTexture.imageSize( 0 ) ), {{},Vector2{1}}, 0 );
-        ImGui::PopStyleColor( 3 );
-
-        if( ImGui::IsItemHovered() )
-        {
-            Vector4 oldOffset = mBuildData.offset;
-
-            if( mBuildData.generationType != GenType_2DTiled && ImGui::IsMouseDragging( ImGuiMouseButton_Left ) )
-            {
-                Vector2 dragDelta( ImGui::GetMouseDragDelta( ImGuiMouseButton_Left ) );
-                ImGui::ResetMouseDragDelta( ImGuiMouseButton_Left );
-
-                mBuildData.offset.x() -= dragDelta.x();
-                mBuildData.offset.y() += dragDelta.y();
-            }
-            else if( (mBuildData.generationType == GenType_3D || mBuildData.generationType == GenType_4D)
-                && ImGui::IsMouseDragging( ImGuiMouseButton_Right ) )
-            {
-                Vector2 dragDelta( ImGui::GetMouseDragDelta( ImGuiMouseButton_Right ) );
-                ImGui::ResetMouseDragDelta( ImGuiMouseButton_Right );
-
-                mBuildData.offset.z() -= dragDelta.x();
-
-                if( mBuildData.generationType == GenType_4D )
-                {
-                    mBuildData.offset.w() -= dragDelta.y();
-                }
-            }
-
-            if( oldOffset != mBuildData.offset )
-            {
-                ReGenerate( mBuildData.generator );
-            }
-        }
-
-        DoExport();
-    }
-    ImGui::End();
-}
-
-void NoiseTexture::DoExport()
-{
-    if( ImGui::BeginPopupModal( "Export BMP", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoSavedSettings  ) )
-    {
-        ImGui::PushItemWidth( 82.0f );
-        if( ImGui::DragInt2( "Size", mExportBuildData.size.data(), 2, 4, 8192 * 4 ) )
-        {
-            ImGuiExtra::MarkSettingsDirty();
-        }
-
-        if( ImGui::Button( "Export (async)" ) )
-        {
-            ImGui::CloseCurrentPopup();
-
-            float relativeScale = (float)mExportBuildData.size.sum() / mBuildData.size.sum();
-            
-            mExportBuildData.frequency /= relativeScale;
-            mExportBuildData.offset *= relativeScale;
-
-            if( mExportThread.joinable() )
-            {
-                mExportThread.join();
-            }
-            mExportThread = std::thread([buildData = mExportBuildData]()
-            {
-                auto data = BuildTexture( buildData );
-
-                const char* nodeName = buildData.generator->GetMetadata().name;
-                std::string filename = nodeName;
-                filename += ".bmp";
-
-                // Iterate through file names if filename exists
-                for( int i = 1; i < 1024; i++ )
-                {
-                    if( !std::filesystem::exists( filename.c_str() ) )
-                    {
-                        break;
-                    }
-                    filename = nodeName;
-                    filename += '_' + std::to_string( i ) + ".bmp";
-                }   
-
-                std::ofstream file( filename.c_str(), std::ofstream::binary | std::ofstream::out | std::ofstream::trunc );
-
-                if( file.is_open() )
-                {
-                    struct BmpHeader
-                    {
-                        // File header (14)
-                        // char b = 'B';
-                        // char m = 'M';
-                        uint32_t fileSize;
-                        uint32_t reserved = 0;
-                        uint32_t dataOffset = 14u + 12u + (256u * 3u);
-                        // Bmp Info Header (12)
-                        uint32_t headerSize = 12u;
-                        uint16_t sizeX;
-                        uint16_t sizeY;
-                        uint16_t colorPlanes = 1u;
-                        uint16_t bitDepth = 8u;
-                    };
-
-                    int paddedSizeX = buildData.size.x();
-                    int padding = paddedSizeX % 4;
-                    if( padding )
-                    {
-                        padding = 4 - padding;
-                        paddedSizeX += padding;
-                    }
-
-                    BmpHeader header;
-                    header.fileSize = header.dataOffset + (uint32_t)(paddedSizeX * buildData.size.y());
-                    header.sizeX = (uint16_t)buildData.size.x();
-                    header.sizeY = (uint16_t)buildData.size.y();
-
-                    file << 'B' << 'M';
-                    file.write( reinterpret_cast<char*>( &header ), sizeof( BmpHeader ) );
-
-                    // Colour map
-                    for (int i = 0; i < 256; i++)
-                    {
-                        Vector3ub b3( (uint8_t)i );
-                        file.write( reinterpret_cast<char*>( b3.data() ), 3 );
-                    }
-
-                    int xIdx = padding ? buildData.size.x() : 0;
-
-                    for( uint32_t pix : data.textureData ) 
-                    {
-                        file.write( reinterpret_cast<char*>( &pix ), 1 );
-
-                        if( --xIdx == 0 )
-                        {
-                            xIdx = buildData.size.x();
-
-                            Vector3ub b3( 0 );
-                            file.write( reinterpret_cast<char*>( b3.data() ), padding );                        
-                        }
-                    }
-
-                    file.close();
-                }
-            } );
-        }
-
-        ImGui::PopItemWidth();
-        ImGui::EndPopup();
-    }
-}
-
-void NoiseTexture::SetPreviewTexture( ImageView2D& imageView )
-{
-    mNoiseTexture = GL::Texture2D();
-    mNoiseTexture.setStorage( 1, GL::TextureFormat::RGBA8, imageView.size() )
-        .setSubImage( 0, {}, imageView );
-}
-
-void NoiseTexture::ReGenerate( FastNoise::SmartNodeArg<> generator )
-{
-    mBuildData.generator = generator;
-    mBuildData.iteration++;
-
-    mGenerateQueue.Clear();
-
-    if( mBuildData.size.x() <= 0 || mBuildData.size.y() <= 0 )
-    {
-        return;
-    }
-
-    if( generator )
-    {
-        mGenerateQueue.Push( mBuildData );
-        return;
-    }
-
-    std::array<uint32_t, 16 * 16> blankTex = {};
-
-    ImageView2D noiseImage( PixelFormat::RGBA8Unorm, {16,16}, blankTex );
-    mCurrentIteration = mBuildData.iteration;
-
-    SetPreviewTexture( noiseImage );
-}
-
-
-NoiseTexture::TextureData NoiseTexture::BuildTexture( const BuildData& buildData )
-{
-    static thread_local std::vector<float> noiseData;
-    noiseData.resize( (size_t)buildData.size.x() * buildData.size.y() );
-
-    auto gen = FastNoise::New<FastNoise::ConvertRGBA8>( buildData.generator->GetSIMDLevel() );
-    gen->SetSource( buildData.generator );
-
-    FastNoise::OutputMinMax minMax;
-
-    switch( buildData.generationType )
-    {
-    case GenType_2D:
-        minMax = gen->GenUniformGrid2D( noiseData.data(), 
-            (int)buildData.offset.x(), (int)buildData.offset.y(),
-            buildData.size.x(), buildData.size.y(),
-            buildData.frequency, buildData.seed );
-        break;
-
-    case GenType_2DTiled:
-        minMax = gen->GenTileable2D( noiseData.data(),
-            buildData.size.x(), buildData.size.y(),
-            buildData.frequency, buildData.seed );
-        break;
-
-    case GenType_3D:
-        minMax = gen->GenUniformGrid3D( noiseData.data(),
-            (int)buildData.offset.x(), (int)buildData.offset.y(), (int)buildData.offset.z(),
-            buildData.size.x(), buildData.size.y(), 1,
-            buildData.frequency, buildData.seed );
-        break;
-
-    case GenType_4D:
-        minMax = gen->GenUniformGrid4D( noiseData.data(),
-            (int)buildData.offset.x(), (int)buildData.offset.y(), (int)buildData.offset.z(), (int)buildData.offset.w(),
-            buildData.size.x(), buildData.size.y(), 1, 1,
-            buildData.frequency, buildData.seed );
-        break;
-    case GenType_Count:
-        break;
-    }
-
-    return TextureData( buildData.iteration, buildData.size, minMax, noiseData );
-}
-
-void NoiseTexture::GenerateLoopThread( GenerateQueue<BuildData>& generateQueue, CompleteQueue<TextureData>& completeQueue )
-{
-    while( true )
-    {
-        BuildData buildData = generateQueue.Pop();
-
-        if( generateQueue.ShouldKillThread() )
-        {
-            return;
-        }
-
-        TextureData texData = BuildTexture( buildData );
-
-        if( !completeQueue.Push( texData ) )
-        {
-            texData.Free();
-        }
-    }
-}
-
-void NoiseTexture::SetupSettingsHandlers()
-{
-    ImGuiSettingsHandler editorSettings;
-    editorSettings.TypeName = "NoiseToolNoiseTexture";
-    editorSettings.TypeHash = ImHashStr( editorSettings.TypeName );
-    editorSettings.UserData = this;
-    editorSettings.WriteAllFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, ImGuiTextBuffer* outBuf ) {
-        auto* noiseTexture = (NoiseTexture*)handler->UserData;
-        outBuf->appendf( "\n[%s][Settings]\n", handler->TypeName );        
-
-        outBuf->appendf( "frequency=%f\n", noiseTexture->mBuildData.frequency );
-        outBuf->appendf( "seed=%d\n", noiseTexture->mBuildData.seed );
-        outBuf->appendf( "gen_type=%d\n", (int)noiseTexture->mBuildData.generationType );
-        outBuf->appendf( "export_size=%d:%d\n", noiseTexture->mExportBuildData.size.x(), noiseTexture->mExportBuildData.size.y() );
-    };
-    editorSettings.ReadOpenFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, const char* name ) -> void* {
-        if( strcmp( name, "Settings" ) == 0 )
-        {
-            return handler->UserData;
-        }
-
-        return nullptr;
-    };
-    editorSettings.ReadLineFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, void* entry, const char* line ) {
-        auto* noiseTexture = (NoiseTexture*)handler->UserData;
-        
-        sscanf( line, "frequency=%f", &noiseTexture->mBuildData.frequency );
-        sscanf( line, "seed=%d", &noiseTexture->mBuildData.seed );
-        sscanf( line, "gen_type=%d", (int*)&noiseTexture->mBuildData.generationType );
-        sscanf( line, "export_size=%d:%d", &noiseTexture->mExportBuildData.size.x() , &noiseTexture->mExportBuildData.size.y() );
-    };
-
-    ImGuiExtra::AddOrReplaceSettingsHandler( editorSettings );
-}
diff --git a/README.md b/README.md
index 4282a2f0..ee4f0065 100644
--- a/README.md
+++ b/README.md
@@ -37,13 +37,13 @@ Bindings:
 Roadmap:
 - [Vague collection of ideas](https://github.com/users/Auburn/projects/1)
 
-## Noise Tool
+## Node Editor
 
-The FastNoise2 noise tool provides a node graph editor to create trees of FastNoise2 nodes. Node trees can be exported as serialised strings and loaded into the FastNoise2 library in your own code. The noise tool has 2D and 3D previews for the node graph output, see screenshots below for examples.
+The FastNoise2 Node Editor is a node graph editor for creating trees of FastNoise2 nodes. Node trees can be exported as serialised strings and loaded into the FastNoise2 library in your own code. The Node Editor has 2D and 3D previews of the node graph output; see the screenshots below for examples.
 
-Check the [Releases](https://github.com/Auburn/FastNoise2/releases/latest) for compiled NoiseTool binaries
+Check the [Releases](https://github.com/Auburn/FastNoise2/releases/latest) page for compiled Node Editor binaries.
 
-![NoiseTool](https://user-images.githubusercontent.com/1349548/90967950-4e8da600-e4de-11ea-902a-94e72cb86481.png)
+![Node Editor](https://user-images.githubusercontent.com/1349548/90967950-4e8da600-e4de-11ea-902a-94e72cb86481.png)
 
 ## Performance
 
diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake
index 9ae66399..8269a8bf 100644
--- a/cmake/CPM.cmake
+++ b/cmake/CPM.cmake
@@ -5,7 +5,7 @@
 # MIT License
 # -----------
 #[[
-  Copyright (c) 2021 Lars Melchior and additional contributors
+  Copyright (c) 2019-2023 Lars Melchior and contributors
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -28,10 +28,25 @@
 
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
 
-set(CURRENT_CPM_VERSION 0.32.2)
+# Initialize logging prefix
+if(NOT CPM_INDENT)
+  set(CPM_INDENT
+      "CPM:"
+      CACHE INTERNAL ""
+  )
+endif()
+
+if(NOT COMMAND cpm_message)
+  function(cpm_message)
+    message(${ARGV})
+  endfunction()
+endif()
 
+set(CURRENT_CPM_VERSION 0.40.2)
+
+get_filename_component(CPM_CURRENT_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
 if(CPM_DIRECTORY)
-  if(NOT CPM_DIRECTORY STREQUAL CMAKE_CURRENT_LIST_DIR)
+  if(NOT CPM_DIRECTORY STREQUAL CPM_CURRENT_DIRECTORY)
     if(CPM_VERSION VERSION_LESS CURRENT_CPM_VERSION)
       message(
         AUTHOR_WARNING
@@ -57,8 +72,42 @@ See https://github.com/cpm-cmake/CPM.cmake for more information."
   endif()
 endif()
 
+if(CURRENT_CPM_VERSION MATCHES "development-version")
+  message(
+    WARNING "${CPM_INDENT} Your project is using an unstable development version of CPM.cmake. \
+Please update to a recent release if possible. \
+See https://github.com/cpm-cmake/CPM.cmake for details."
+  )
+endif()
+
 set_property(GLOBAL PROPERTY CPM_INITIALIZED true)
 
+macro(cpm_set_policies)
+  # the policy allows us to change options without caching
+  cmake_policy(SET CMP0077 NEW)
+  set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+  # the policy allows us to change set(CACHE) without caching
+  if(POLICY CMP0126)
+    cmake_policy(SET CMP0126 NEW)
+    set(CMAKE_POLICY_DEFAULT_CMP0126 NEW)
+  endif()
+
+  # The policy uses the download time for timestamp, instead of the timestamp in the archive. This
+  # allows for proper rebuilds when a projects url changes
+  if(POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+    set(CMAKE_POLICY_DEFAULT_CMP0135 NEW)
+  endif()
+
+  # treat relative git repository paths as being relative to the parent project's remote
+  if(POLICY CMP0150)
+    cmake_policy(SET CMP0150 NEW)
+    set(CMAKE_POLICY_DEFAULT_CMP0150 NEW)
+  endif()
+endmacro()
+cpm_set_policies()
+
 option(CPM_USE_LOCAL_PACKAGES "Always try to use `find_package` to get dependencies"
        $ENV{CPM_USE_LOCAL_PACKAGES}
 )
@@ -76,13 +125,17 @@ option(CPM_INCLUDE_ALL_IN_PACKAGE_LOCK
        "Add all packages added through CPM.cmake to the package lock"
        $ENV{CPM_INCLUDE_ALL_IN_PACKAGE_LOCK}
 )
+option(CPM_USE_NAMED_CACHE_DIRECTORIES
+       "Use additional directory of package name in cache on the most nested level."
+       $ENV{CPM_USE_NAMED_CACHE_DIRECTORIES}
+)
 
 set(CPM_VERSION
     ${CURRENT_CPM_VERSION}
     CACHE INTERNAL ""
 )
 set(CPM_DIRECTORY
-    ${CMAKE_CURRENT_LIST_DIR}
+    ${CPM_CURRENT_DIRECTORY}
     CACHE INTERNAL ""
 )
 set(CPM_FILE
@@ -191,19 +244,14 @@ function(cpm_package_name_and_ver_from_url url outName outVer)
   endif()
 endfunction()
 
-# Initialize logging prefix
-if(NOT CPM_INDENT)
-  set(CPM_INDENT
-      "CPM:"
-      CACHE INTERNAL ""
-  )
-endif()
-
 function(cpm_find_package NAME VERSION)
   string(REPLACE " " ";" EXTRA_ARGS "${ARGN}")
   find_package(${NAME} ${VERSION} ${EXTRA_ARGS} QUIET)
   if(${CPM_ARGS_NAME}_FOUND)
-    message(STATUS "${CPM_INDENT} using local package ${CPM_ARGS_NAME}@${VERSION}")
+    if(DEFINED ${CPM_ARGS_NAME}_VERSION)
+      set(VERSION ${${CPM_ARGS_NAME}_VERSION})
+    endif()
+    cpm_message(STATUS "${CPM_INDENT} Using local package ${CPM_ARGS_NAME}@${VERSION}")
     CPMRegisterPackage(${CPM_ARGS_NAME} "${VERSION}")
     set(CPM_PACKAGE_FOUND
         YES
@@ -223,7 +271,7 @@ function(cpm_create_module_file Name)
   if(NOT CPM_DONT_UPDATE_MODULE_PATH)
     # erase any previous modules
     file(WRITE ${CPM_MODULE_PATH}/Find${Name}.cmake
-         "include(${CPM_FILE})\n${ARGN}\nset(${Name}_FOUND TRUE)"
+         "include(\"${CPM_FILE}\")\n${ARGN}\nset(${Name}_FOUND TRUE)"
     )
   endif()
 endfunction()
@@ -240,14 +288,14 @@ function(CPMFindPackage)
     endif()
   endif()
 
-  if(CPM_DOWNLOAD_ALL)
-    CPMAddPackage(${ARGN})
-    cpm_export_variables(${CPM_ARGS_NAME})
-    return()
+  set(downloadPackage ${CPM_DOWNLOAD_ALL})
+  if(DEFINED CPM_DOWNLOAD_${CPM_ARGS_NAME})
+    set(downloadPackage ${CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+  elseif(DEFINED ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+    set(downloadPackage $ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}})
   endif()
-
-  cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}")
-  if(CPM_PACKAGE_ALREADY_ADDED)
+  if(downloadPackage)
+    CPMAddPackage(${ARGN})
     cpm_export_variables(${CPM_ARGS_NAME})
     return()
   endif()
@@ -268,7 +316,7 @@ function(cpm_check_if_package_already_added CPM_ARGS_NAME CPM_ARGS_VERSION)
     if("${CPM_PACKAGE_VERSION}" VERSION_LESS "${CPM_ARGS_VERSION}")
       message(
         WARNING
-          "${CPM_INDENT} requires a newer version of ${CPM_ARGS_NAME} (${CPM_ARGS_VERSION}) than currently included (${CPM_PACKAGE_VERSION})."
+          "${CPM_INDENT} Requires a newer version of ${CPM_ARGS_NAME} (${CPM_ARGS_VERSION}) than currently included (${CPM_PACKAGE_VERSION})."
       )
     endif()
     cpm_get_fetch_properties(${CPM_ARGS_NAME})
@@ -325,11 +373,11 @@ function(cpm_parse_add_package_single_arg arg outArgs)
       set(packageType "git")
     else()
       # Give up
-      message(FATAL_ERROR "CPM: Can't determine package type of '${arg}'")
+      message(FATAL_ERROR "${CPM_INDENT} Can't determine package type of '${arg}'")
     endif()
   endif()
 
-  # For all packages we interpret @... as version. Only replace the last occurence. Thus URIs
+  # For all packages we interpret @... as version. Only replace the last occurrence. Thus URIs
   # containing '@' can be used
   string(REGEX REPLACE "@([^@]+)$" ";VERSION;\\1" out "${out}")
 
@@ -343,9 +391,9 @@ function(cpm_parse_add_package_single_arg arg outArgs)
     # We don't try to parse the version if it's not provided explicitly. cpm_get_version_from_url
     # should do this at a later point
   else()
-    # We should never get here. This is an assertion and hitting it means there's a bug in the code
-    # above. A packageType was set, but not handled by this if-else.
-    message(FATAL_ERROR "CPM: Unsupported package type '${packageType}' of '${arg}'")
+    # We should never get here. This is an assertion and hitting it means there's a problem with the
+    # code above. A packageType was set, but not handled by this if-else.
+    message(FATAL_ERROR "${CPM_INDENT} Unsupported package type '${packageType}' of '${arg}'")
   endif()
 
   set(${outArgs}
@@ -354,14 +402,185 @@ function(cpm_parse_add_package_single_arg arg outArgs)
   )
 endfunction()
 
+# Check that the working directory of the git repo at `repoPath` is clean with respect to `gitTag`
+function(cpm_check_git_working_dir_is_clean repoPath gitTag isClean)
+  # The verdict is written to `${isClean}` in the parent scope (TRUE = clean, FALSE = dirty).
+  find_package(Git QUIET)
+  # QUIET (not REQUIRED) so that a missing git reaches the fallback below instead of aborting.
+  if(NOT GIT_EXECUTABLE)
+    # No git executable, assume directory is clean
+    set(${isClean}
+        TRUE
+        PARENT_SCOPE
+    )
+    return()
+  endif()
+
+  # check for uncommitted changes (any `git status --porcelain` output means dirty)
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} status --porcelain
+    RESULT_VARIABLE resultGitStatus
+    OUTPUT_VARIABLE repoStatus
+    OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET
+    WORKING_DIRECTORY ${repoPath}
+  )
+  if(resultGitStatus)
+    # not supposed to happen, assume clean anyway
+    message(WARNING "${CPM_INDENT} Calling git status on folder ${repoPath} failed")
+    set(${isClean}
+        TRUE
+        PARENT_SCOPE
+    )
+    return()
+  endif()
+
+  if(NOT "${repoStatus}" STREQUAL "")
+    set(${isClean}
+        FALSE
+        PARENT_SCOPE
+    )
+    return()
+  endif()
+
+  # check for committed changes: a non-zero exit code means the tree differs from `gitTag`
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} diff -s --exit-code ${gitTag}
+    RESULT_VARIABLE resultGitDiff
+    OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_QUIET
+    WORKING_DIRECTORY ${repoPath}
+  )
+
+  if(${resultGitDiff} EQUAL 0)
+    set(${isClean}
+        TRUE
+        PARENT_SCOPE
+    )
+  else()
+    set(${isClean}
+        FALSE
+        PARENT_SCOPE
+    )
+  endif()
+
+endfunction()
+
+# Add PATCH_COMMAND to CPM_ARGS_UNPARSED_ARGUMENTS. This function consumes a list of files in ARGN
+# then generates a `PATCH_COMMAND` appropriate for `ExternalProject_Add()`. This command is appended
+# to the parent scope's `CPM_ARGS_UNPARSED_ARGUMENTS`.
+function(cpm_add_patches)
+  # Return if no patch files are supplied.
+  if(NOT ARGN)
+    return()
+  endif()
+
+  # Find the patch program.
+  find_program(PATCH_EXECUTABLE patch)
+  if(WIN32 AND NOT PATCH_EXECUTABLE)
+    # The Windows git executable is distributed with patch.exe. Find the path to the executable, if
+    # it exists, then search `../usr/bin` and `../../usr/bin` for patch.exe.
+    find_package(Git QUIET)
+    if(GIT_EXECUTABLE)
+      get_filename_component(extra_search_path ${GIT_EXECUTABLE} DIRECTORY)
+      get_filename_component(extra_search_path_1up ${extra_search_path} DIRECTORY)
+      get_filename_component(extra_search_path_2up ${extra_search_path_1up} DIRECTORY)
+      find_program(
+        PATCH_EXECUTABLE patch HINTS "${extra_search_path_1up}/usr/bin"
+                                     "${extra_search_path_2up}/usr/bin"
+      )
+    endif()
+  endif()
+  if(NOT PATCH_EXECUTABLE)
+    message(FATAL_ERROR "Couldn't find `patch` executable to use with PATCHES keyword.")
+  endif()
+
+  # Work on a local copy of the caller's CPM_ARGS_UNPARSED_ARGUMENTS; it is exported at the end.
+  set(temp_list ${CPM_ARGS_UNPARSED_ARGUMENTS})
+
+  # Ensure each file exists (or error out) and add it to the list.
+  set(first_item True)
+  foreach(PATCH_FILE ${ARGN})
+    # Make sure the patch file exists; if not, try again relative to CMAKE_CURRENT_LIST_DIR.
+    if(NOT EXISTS "${PATCH_FILE}")
+      if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}")
+        message(FATAL_ERROR "Couldn't find patch file: '${PATCH_FILE}'")
+      endif()
+      set(PATCH_FILE "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}")
+    endif()
+
+    # Convert to absolute path for use with patch file command.
+    get_filename_component(PATCH_FILE "${PATCH_FILE}" ABSOLUTE)
+
+    # The first patch entry must be preceded by "PATCH_COMMAND" while the following items are
+    # preceded by "&&".
+    if(first_item)
+      set(first_item False)
+      list(APPEND temp_list "PATCH_COMMAND")
+    else()
+      list(APPEND temp_list "&&")
+    endif()
+    # Add the patch invocation (`<patch> -p1 < <file>`) to the list
+    list(APPEND temp_list "${PATCH_EXECUTABLE}" "-p1" "<" "${PATCH_FILE}")
+  endforeach()
+
+  # Export the extended list as CPM_ARGS_UNPARSED_ARGUMENTS in the parent scope.
+  set(CPM_ARGS_UNPARSED_ARGUMENTS
+      ${temp_list}
+      PARENT_SCOPE
+  )
+
+endfunction()
+
+# Overwrites internal FetchContent properties, to allow using CPM.cmake to overload FetchContent
+# calls. As these are internal CMake properties, this function should be used carefully and may
+# need modification in future CMake versions. Source:
+# https://github.com/Kitware/CMake/blob/dc3d0b5a0a7d26d43d6cfeb511e224533b5d188f/Modules/FetchContent.cmake#L1152
+function(cpm_override_fetchcontent contentName)
+  cmake_parse_arguments(PARSE_ARGV 1 arg "" "SOURCE_DIR;BINARY_DIR" "")
+  if(NOT "${arg_UNPARSED_ARGUMENTS}" STREQUAL "")
+    message(FATAL_ERROR "${CPM_INDENT} Unsupported arguments: ${arg_UNPARSED_ARGUMENTS}")
+  endif()
+
+  string(TOLOWER ${contentName} contentNameLower)
+  set(prefix "_FetchContent_${contentNameLower}")
+  # Define and set three global properties: <prefix>_sourceDir, <prefix>_binaryDir, <prefix>_populated
+  set(propertyName "${prefix}_sourceDir")
+  define_property(
+    GLOBAL
+    PROPERTY ${propertyName}
+    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+  )
+  set_property(GLOBAL PROPERTY ${propertyName} "${arg_SOURCE_DIR}")
+
+  set(propertyName "${prefix}_binaryDir")
+  define_property(
+    GLOBAL
+    PROPERTY ${propertyName}
+    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+  )
+  set_property(GLOBAL PROPERTY ${propertyName} "${arg_BINARY_DIR}")
+
+  set(propertyName "${prefix}_populated")
+  define_property(
+    GLOBAL
+    PROPERTY ${propertyName}
+    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+  )
+  set_property(GLOBAL PROPERTY ${propertyName} TRUE)
+endfunction()
+
 # Download and add a package from source
 function(CPMAddPackage)
+  cpm_set_policies()
+
   list(LENGTH ARGN argnLength)
   if(argnLength EQUAL 1)
     cpm_parse_add_package_single_arg("${ARGN}" ARGN)
 
-    # The shorthand syntax implies EXCLUDE_FROM_ALL
-    set(ARGN "${ARGN};EXCLUDE_FROM_ALL;YES")
+    # The shorthand syntax implies EXCLUDE_FROM_ALL and SYSTEM
+    set(ARGN "${ARGN};EXCLUDE_FROM_ALL;YES;SYSTEM;YES;")
   endif()
 
   set(oneValueArgs
@@ -375,15 +594,16 @@ function(CPMAddPackage)
       BITBUCKET_REPOSITORY
       GIT_REPOSITORY
       SOURCE_DIR
-      DOWNLOAD_COMMAND
       FIND_PACKAGE_ARGUMENTS
       NO_CACHE
+      SYSTEM
       GIT_SHALLOW
       EXCLUDE_FROM_ALL
       SOURCE_SUBDIR
+      CUSTOM_CACHE_KEY
   )
 
-  set(multiValueArgs URL OPTIONS)
+  set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND PATCHES)
 
   cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
 
@@ -454,7 +674,7 @@ function(CPMAddPackage)
   if(NOT DEFINED CPM_ARGS_NAME)
     message(
       FATAL_ERROR
-        "CPM: 'NAME' was not provided and couldn't be automatically inferred for package added with arguments: '${ARGN}'"
+        "${CPM_INDENT} 'NAME' was not provided and couldn't be automatically inferred for package added with arguments: '${ARGN}'"
     )
   endif()
 
@@ -473,8 +693,11 @@ function(CPMAddPackage)
       NAME "${CPM_ARGS_NAME}"
       SOURCE_DIR "${PACKAGE_SOURCE}"
       EXCLUDE_FROM_ALL "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+      SYSTEM "${CPM_ARGS_SYSTEM}"
+      PATCHES "${CPM_ARGS_PATCHES}"
       OPTIONS "${CPM_ARGS_OPTIONS}"
       SOURCE_SUBDIR "${CPM_ARGS_SOURCE_SUBDIR}"
+      DOWNLOAD_ONLY "${DOWNLOAD_ONLY}"
       FORCE True
     )
     cpm_export_variables(${CPM_ARGS_NAME})
@@ -492,19 +715,21 @@ function(CPMAddPackage)
     return()
   endif()
 
-  if(CPM_USE_LOCAL_PACKAGES OR CPM_LOCAL_PACKAGES_ONLY)
-    cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS})
+  if(NOT CPM_ARGS_FORCE)
+    if(CPM_USE_LOCAL_PACKAGES OR CPM_LOCAL_PACKAGES_ONLY)
+      cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS})
 
-    if(CPM_PACKAGE_FOUND)
-      cpm_export_variables(${CPM_ARGS_NAME})
-      return()
-    endif()
+      if(CPM_PACKAGE_FOUND)
+        cpm_export_variables(${CPM_ARGS_NAME})
+        return()
+      endif()
 
-    if(CPM_LOCAL_PACKAGES_ONLY)
-      message(
-        SEND_ERROR
-          "CPM: ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})"
-      )
+      if(CPM_LOCAL_PACKAGES_ONLY)
+        message(
+          SEND_ERROR
+            "${CPM_INDENT} ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})"
+        )
+      endif()
     endif()
   endif()
 
@@ -525,32 +750,88 @@ function(CPMAddPackage)
     set(CPM_FETCHCONTENT_BASE_DIR ${CMAKE_BINARY_DIR}/_deps)
   endif()
 
+  cpm_add_patches(${CPM_ARGS_PATCHES})
+
   if(DEFINED CPM_ARGS_DOWNLOAD_COMMAND)
     list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS DOWNLOAD_COMMAND ${CPM_ARGS_DOWNLOAD_COMMAND})
   elseif(DEFINED CPM_ARGS_SOURCE_DIR)
     list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${CPM_ARGS_SOURCE_DIR})
+    if(NOT IS_ABSOLUTE ${CPM_ARGS_SOURCE_DIR})
+      # Expand `CPM_ARGS_SOURCE_DIR` relative path. This is important because EXISTS doesn't work
+      # for relative paths.
+      get_filename_component(
+        source_directory ${CPM_ARGS_SOURCE_DIR} REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}
+      )
+    else()
+      set(source_directory ${CPM_ARGS_SOURCE_DIR})
+    endif()
+    if(NOT EXISTS ${source_directory})
+      string(TOLOWER ${CPM_ARGS_NAME} lower_case_name)
+      # remove timestamps so CMake will re-download the dependency
+      file(REMOVE_RECURSE "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild")
+    endif()
   elseif(CPM_SOURCE_CACHE AND NOT CPM_ARGS_NO_CACHE)
     string(TOLOWER ${CPM_ARGS_NAME} lower_case_name)
     set(origin_parameters ${CPM_ARGS_UNPARSED_ARGUMENTS})
     list(SORT origin_parameters)
-    string(SHA1 origin_hash "${origin_parameters}")
-    set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash})
+    if(CPM_ARGS_CUSTOM_CACHE_KEY)
+      # Application set a custom unique directory name
+      set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${CPM_ARGS_CUSTOM_CACHE_KEY})
+    elseif(CPM_USE_NAMED_CACHE_DIRECTORIES)
+      string(SHA1 origin_hash "${origin_parameters};NEW_CACHE_STRUCTURE_TAG")
+      set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}/${CPM_ARGS_NAME})
+    else()
+      string(SHA1 origin_hash "${origin_parameters}")
+      set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash})
+    endif()
     # Expand `download_directory` relative path. This is important because EXISTS doesn't work for
     # relative paths.
     get_filename_component(download_directory ${download_directory} ABSOLUTE)
     list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${download_directory})
+
+    if(CPM_SOURCE_CACHE)
+      file(LOCK ${download_directory}/../cmake.lock)
+    endif()
+
     if(EXISTS ${download_directory})
-      # avoid FetchContent modules to improve performance
-      set(${CPM_ARGS_NAME}_BINARY_DIR ${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build)
-      set(${CPM_ARGS_NAME}_ADDED YES)
-      set(${CPM_ARGS_NAME}_SOURCE_DIR ${download_directory})
+      if(CPM_SOURCE_CACHE)
+        file(LOCK ${download_directory}/../cmake.lock RELEASE)
+      endif()
+
+      cpm_store_fetch_properties(
+        ${CPM_ARGS_NAME} "${download_directory}"
+        "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build"
+      )
+      cpm_get_fetch_properties("${CPM_ARGS_NAME}")
+
+      if(DEFINED CPM_ARGS_GIT_TAG AND NOT (PATCH_COMMAND IN_LIST CPM_ARGS_UNPARSED_ARGUMENTS))
+        # warn if cache has been changed since checkout
+        cpm_check_git_working_dir_is_clean(${download_directory} ${CPM_ARGS_GIT_TAG} IS_CLEAN)
+        if(NOT ${IS_CLEAN})
+          message(
+            WARNING "${CPM_INDENT} Cache for ${CPM_ARGS_NAME} (${download_directory}) is dirty"
+          )
+        endif()
+      endif()
+
       cpm_add_subdirectory(
-        "${CPM_ARGS_NAME}" "${DOWNLOAD_ONLY}"
-        "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" "${${CPM_ARGS_NAME}_BINARY_DIR}"
-        "${CPM_ARGS_EXCLUDE_FROM_ALL}" "${CPM_ARGS_OPTIONS}"
+        "${CPM_ARGS_NAME}"
+        "${DOWNLOAD_ONLY}"
+        "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+        "${${CPM_ARGS_NAME}_BINARY_DIR}"
+        "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+        "${CPM_ARGS_SYSTEM}"
+        "${CPM_ARGS_OPTIONS}"
       )
-      set(CPM_SKIP_FETCH TRUE)
       set(PACKAGE_INFO "${PACKAGE_INFO} at ${download_directory}")
+
+      # As the source dir is already cached/populated, we override the call to FetchContent.
+      set(CPM_SKIP_FETCH TRUE)
+      cpm_override_fetchcontent(
+        "${lower_case_name}" SOURCE_DIR "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+        BINARY_DIR "${${CPM_ARGS_NAME}_BINARY_DIR}"
+      )
+
     else()
       # Enable shallow clone when GIT_TAG is not a commit hash. Our guess may not be accurate, but
       # it should guarantee no commit hash get mis-detected.
@@ -567,7 +848,7 @@ function(CPMAddPackage)
     endif()
   endif()
 
-  cpm_create_module_file(${CPM_ARGS_NAME} "CPMAddPackage(${ARGN})")
+  cpm_create_module_file(${CPM_ARGS_NAME} "CPMAddPackage(\"${ARGN}\")")
 
   if(CPM_PACKAGE_LOCK_ENABLED)
     if((CPM_ARGS_VERSION AND NOT CPM_ARGS_SOURCE_DIR) OR CPM_INCLUDE_ALL_IN_PACKAGE_LOCK)
@@ -579,20 +860,53 @@ function(CPMAddPackage)
     endif()
   endif()
 
-  message(
-    STATUS "${CPM_INDENT} adding package ${CPM_ARGS_NAME}@${CPM_ARGS_VERSION} (${PACKAGE_INFO})"
+  cpm_message(
+    STATUS "${CPM_INDENT} Adding package ${CPM_ARGS_NAME}@${CPM_ARGS_VERSION} (${PACKAGE_INFO})"
   )
 
   if(NOT CPM_SKIP_FETCH)
+    # CMake 3.28 added EXCLUDE_FROM_ALL, SYSTEM (3.25), and SOURCE_SUBDIR (3.18) to
+    # FetchContent_Declare. Calling FetchContent_MakeAvailable will then internally forward these
+    # options to add_subdirectory. Up until these changes, we had to call FetchContent_Populate and
+    # add_subdirectory separately, which is no longer necessary and has been deprecated as of 3.30.
+    set(fetchContentDeclareExtraArgs "")
+    if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0")
+      if(${CPM_ARGS_EXCLUDE_FROM_ALL})
+        list(APPEND fetchContentDeclareExtraArgs EXCLUDE_FROM_ALL)
+      endif()
+      if(${CPM_ARGS_SYSTEM})
+        list(APPEND fetchContentDeclareExtraArgs SYSTEM)
+      endif()
+      if(DEFINED CPM_ARGS_SOURCE_SUBDIR)
+        list(APPEND fetchContentDeclareExtraArgs SOURCE_SUBDIR ${CPM_ARGS_SOURCE_SUBDIR})
+      endif()
+      # For CMake version <3.28 OPTIONS are parsed in cpm_add_subdirectory
+      if(CPM_ARGS_OPTIONS AND NOT DOWNLOAD_ONLY)
+        foreach(OPTION ${CPM_ARGS_OPTIONS})
+          cpm_parse_option("${OPTION}")
+          set(${OPTION_KEY} "${OPTION_VALUE}")
+        endforeach()
+      endif()
+    endif()
     cpm_declare_fetch(
-      "${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}" "${PACKAGE_INFO}" "${CPM_ARGS_UNPARSED_ARGUMENTS}"
-    )
-    cpm_fetch_package("${CPM_ARGS_NAME}")
-    cpm_add_subdirectory(
-      "${CPM_ARGS_NAME}" "${DOWNLOAD_ONLY}"
-      "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" "${${CPM_ARGS_NAME}_BINARY_DIR}"
-      "${CPM_ARGS_EXCLUDE_FROM_ALL}" "${CPM_ARGS_OPTIONS}"
+      "${CPM_ARGS_NAME}" ${fetchContentDeclareExtraArgs} "${CPM_ARGS_UNPARSED_ARGUMENTS}"
     )
+
+    cpm_fetch_package("${CPM_ARGS_NAME}" ${DOWNLOAD_ONLY} populated ${CPM_ARGS_UNPARSED_ARGUMENTS})
+    if(CPM_SOURCE_CACHE AND download_directory)
+      file(LOCK ${download_directory}/../cmake.lock RELEASE)
+    endif()
+    if(${populated} AND ${CMAKE_VERSION} VERSION_LESS "3.28.0")
+      cpm_add_subdirectory(
+        "${CPM_ARGS_NAME}"
+        "${DOWNLOAD_ONLY}"
+        "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+        "${${CPM_ARGS_NAME}_BINARY_DIR}"
+        "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+        "${CPM_ARGS_SYSTEM}"
+        "${CPM_ARGS_OPTIONS}"
+      )
+    endif()
     cpm_get_fetch_properties("${CPM_ARGS_NAME}")
   endif()
 
@@ -605,7 +919,7 @@ macro(CPMGetPackage Name)
   if(DEFINED "CPM_DECLARATION_${Name}")
     CPMAddPackage(NAME ${Name})
   else()
-    message(SEND_ERROR "Cannot retrieve package ${Name}: no declaration available")
+    message(SEND_ERROR "${CPM_INDENT} Cannot retrieve package ${Name}: no declaration available")
   endif()
 endmacro()
 
@@ -623,10 +937,14 @@ macro(cpm_export_variables name)
       "${${name}_ADDED}"
       PARENT_SCOPE
   )
+  set(CPM_LAST_PACKAGE_NAME
+      "${name}"
+      PARENT_SCOPE
+  )
 endmacro()
 
 # declares a package, so that any call to CPMAddPackage for the package name will use these
-# arguments instead. Previous declarations will not be overriden.
+# arguments instead. Previous declarations will not be overridden.
 macro(CPMDeclarePackage Name)
   if(NOT DEFINED "CPM_DECLARATION_${Name}")
     set("CPM_DECLARATION_${Name}" "${ARGN}")
@@ -649,7 +967,7 @@ function(cpm_add_comment_to_package_lock Name)
   endif()
 endfunction()
 
-# includes the package lock file if it exists and creates a target `cpm-write-package-lock` to
+# includes the package lock file if it exists and creates a target `cpm-update-package-lock` to
 # update it
 macro(CPMUsePackageLock file)
   if(NOT CPM_DONT_CREATE_PACKAGE_LOCK)
@@ -689,9 +1007,9 @@ function(CPMGetPackageVersion PACKAGE OUTPUT)
 endfunction()
 
 # declares a package in FetchContent_Declare
-function(cpm_declare_fetch PACKAGE VERSION INFO)
+function(cpm_declare_fetch PACKAGE)
   if(${CPM_DRY_RUN})
-    message(STATUS "${CPM_INDENT} package not declared (dry run)")
+    cpm_message(STATUS "${CPM_INDENT} Package not declared (dry run)")
     return()
   endif()
 
@@ -703,18 +1021,32 @@ function(cpm_get_fetch_properties PACKAGE)
   if(${CPM_DRY_RUN})
     return()
   endif()
-  FetchContent_GetProperties(${PACKAGE})
-  string(TOLOWER ${PACKAGE} lpackage)
+
   set(${PACKAGE}_SOURCE_DIR
-      "${${lpackage}_SOURCE_DIR}"
+      "${CPM_PACKAGE_${PACKAGE}_SOURCE_DIR}"
       PARENT_SCOPE
   )
   set(${PACKAGE}_BINARY_DIR
-      "${${lpackage}_BINARY_DIR}"
+      "${CPM_PACKAGE_${PACKAGE}_BINARY_DIR}"
       PARENT_SCOPE
   )
 endfunction()
 
+function(cpm_store_fetch_properties PACKAGE source_dir binary_dir)
+  if(${CPM_DRY_RUN})
+    return()
+  endif()
+  # Store both directories as INTERNAL cache entries named CPM_PACKAGE_${PACKAGE}_*_DIR.
+  set(CPM_PACKAGE_${PACKAGE}_SOURCE_DIR
+      "${source_dir}"
+      CACHE INTERNAL ""
+  )
+  set(CPM_PACKAGE_${PACKAGE}_BINARY_DIR
+      "${binary_dir}"
+      CACHE INTERNAL ""
+  )
+endfunction()
+
 # adds a package as a subdirectory if viable, according to provided options
 function(
   cpm_add_subdirectory
@@ -723,22 +1055,23 @@ function(
   SOURCE_DIR
   BINARY_DIR
   EXCLUDE
+  SYSTEM
   OPTIONS
 )
+
   if(NOT DOWNLOAD_ONLY AND EXISTS ${SOURCE_DIR}/CMakeLists.txt)
+    set(addSubdirectoryExtraArgs "")
     if(EXCLUDE)
-      set(addSubdirectoryExtraArgs EXCLUDE_FROM_ALL)
-    else()
-      set(addSubdirectoryExtraArgs "")
+      list(APPEND addSubdirectoryExtraArgs EXCLUDE_FROM_ALL)
+    endif()
+    if("${SYSTEM}" AND "${CMAKE_VERSION}" VERSION_GREATER_EQUAL "3.25")
+      # https://cmake.org/cmake/help/latest/prop_dir/SYSTEM.html#prop_dir:SYSTEM
+      list(APPEND addSubdirectoryExtraArgs SYSTEM)
     endif()
     if(OPTIONS)
-      # the policy allows us to change options without caching
-      cmake_policy(SET CMP0077 NEW)
-      set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
-
       foreach(OPTION ${OPTIONS})
-        cpm_parse_option(${OPTION})
-        set(${OPTION_KEY} ${OPTION_VALUE})
+        cpm_parse_option("${OPTION}")
+        set(${OPTION_KEY} "${OPTION_VALUE}")
       endforeach()
     endif()
     set(CPM_OLD_INDENT "${CPM_INDENT}")
@@ -750,19 +1083,49 @@ endfunction()
 
 # downloads a previously declared package via FetchContent and exports the variables
 # `${PACKAGE}_SOURCE_DIR` and `${PACKAGE}_BINARY_DIR` to the parent scope
-function(cpm_fetch_package PACKAGE)
+function(cpm_fetch_package PACKAGE DOWNLOAD_ONLY populated)
+  set(${populated}
+      FALSE
+      PARENT_SCOPE
+  )
   if(${CPM_DRY_RUN})
-    message(STATUS "${CPM_INDENT} package ${PACKAGE} not fetched (dry run)")
+    cpm_message(STATUS "${CPM_INDENT} Package ${PACKAGE} not fetched (dry run)")
     return()
   endif()
 
   FetchContent_GetProperties(${PACKAGE})
 
+  string(TOLOWER "${PACKAGE}" lower_case_name)
+
   if(NOT ${lower_case_name}_POPULATED)
-    FetchContent_Populate(${PACKAGE})
+    if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0")
+      if(DOWNLOAD_ONLY)
+        # MakeAvailable will call add_subdirectory internally which is not what we want when
+        # DOWNLOAD_ONLY is set. Populate will only download the dependency without adding it to the
+        # build
+        FetchContent_Populate(
+          ${PACKAGE}
+          SOURCE_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-src"
+          BINARY_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build"
+          SUBBUILD_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild"
+          ${ARGN}
+        )
+      else()
+        FetchContent_MakeAvailable(${PACKAGE})
+      endif()
+    else()
+      FetchContent_Populate(${PACKAGE})
+    endif()
+    set(${populated}
+        TRUE
+        PARENT_SCOPE
+    )
   endif()
 
-  string(TOLOWER "${PACKAGE}" lower_case_name)
+  cpm_store_fetch_properties(
+    ${CPM_ARGS_NAME} ${${lower_case_name}_SOURCE_DIR} ${${lower_case_name}_BINARY_DIR}
+  )
+
   set(${PACKAGE}_SOURCE_DIR
       ${${lower_case_name}_SOURCE_DIR}
       PARENT_SCOPE
@@ -775,15 +1138,15 @@ endfunction()
 
 # splits a package option
 function(cpm_parse_option OPTION)
-  string(REGEX MATCH "^[^ ]+" OPTION_KEY ${OPTION})
-  string(LENGTH ${OPTION} OPTION_LENGTH)
-  string(LENGTH ${OPTION_KEY} OPTION_KEY_LENGTH)
+  string(REGEX MATCH "^[^ ]+" OPTION_KEY "${OPTION}")
+  string(LENGTH "${OPTION}" OPTION_LENGTH)
+  string(LENGTH "${OPTION_KEY}" OPTION_KEY_LENGTH)
   if(OPTION_KEY_LENGTH STREQUAL OPTION_LENGTH)
     # no value for key provided, assume user wants to set option to "ON"
     set(OPTION_VALUE "ON")
   else()
     math(EXPR OPTION_KEY_LENGTH "${OPTION_KEY_LENGTH}+1")
-    string(SUBSTRING ${OPTION} "${OPTION_KEY_LENGTH}" "-1" OPTION_VALUE)
+    string(SUBSTRING "${OPTION}" "${OPTION_KEY_LENGTH}" "-1" OPTION_VALUE)
   endif()
   set(OPTION_KEY
       "${OPTION_KEY}"
@@ -813,7 +1176,7 @@ function(cpm_get_version_from_git_tag GIT_TAG RESULT)
   endif()
 endfunction()
 
-# guesses if the git tag is a commit hash or an actual tag or a branch nane.
+# guesses if the git tag is a commit hash or an actual tag or a branch name.
 function(cpm_is_git_tag_commit_hash GIT_TAG RESULT)
   string(LENGTH "${GIT_TAG}" length)
   # full hash has 40 characters, and short hash has at least 7 characters.
@@ -846,14 +1209,17 @@ function(cpm_prettify_package_arguments OUT_VAR IS_IN_COMMENT)
       DOWNLOAD_ONLY
       GITHUB_REPOSITORY
       GITLAB_REPOSITORY
+      BITBUCKET_REPOSITORY
       GIT_REPOSITORY
       SOURCE_DIR
-      DOWNLOAD_COMMAND
       FIND_PACKAGE_ARGUMENTS
       NO_CACHE
+      SYSTEM
       GIT_SHALLOW
+      EXCLUDE_FROM_ALL
+      SOURCE_SUBDIR
   )
-  set(multiValueArgs OPTIONS)
+  set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND)
   cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
   foreach(oneArgName ${oneValueArgs})
diff --git a/include/FastNoise/FastNoise.h b/include/FastNoise/FastNoise.h
index 332cff00..2cd60451 100644
--- a/include/FastNoise/FastNoise.h
+++ b/include/FastNoise/FastNoise.h
@@ -1,5 +1,5 @@
 #pragma once
-#include "FastNoise_Config.h"
+#include "Utility/Config.h"
 
 // Node class definitions
 #include "Generators/BasicGenerators.h"
@@ -22,19 +22,15 @@ namespace FastNoise
     /// auto node = FastNoise::New<FastNoise::Simplex>();
     /// </example>
     /// <typeparam name="T">Node class to create</typeparam>
-    /// <param name="maxSimdLevel">Max SIMD level, Null = Auto</param>
+    /// <param name="maxFeatureSet">Max SIMD feature set, Max = Auto</param>
     /// <returns>SmartNode<T> is guaranteed not nullptr</returns>
     template<typename T>
-    SmartNode<T> New( FastSIMD::eLevel maxSimdLevel /*= FastSIMD::Level_Null*/ )
+    SmartNode<T> New( FastSIMD::FeatureSet maxFeatureSet /*= FastSIMD::FeatureSet::Max*/ )
     {
         static_assert( std::is_base_of<Generator, T>::value, "This function should only be used for FastNoise node classes, for example FastNoise::Simplex" );
         static_assert( std::is_member_function_pointer<decltype(&T::GetMetadata)>::value, "Cannot create abstract node class, use a derived class, for example: Fractal -> FractalFBm" );
 
-#if FASTNOISE_USE_SHARED_PTR
-        return SmartNode<T>( FastSIMD::New<T>( maxSimdLevel ) );
-#else
-        return SmartNode<T>( FastSIMD::New<T>( maxSimdLevel, &SmartNodeManager::Allocate ) );
-#endif
+        return SmartNode<T>( FastSIMD::NewDispatchClass<T>( maxFeatureSet, &SmartNodeManager::Allocate ) );
     }
 
     /// <summary>
@@ -43,8 +39,8 @@ namespace FastNoise
     /// <example>
     /// FastNoise::SmartNode<> rootNode = FastNoise::NewFromEncodedNodeTree( "DQAFAAAAAAAAQAgAAAAAAD8AAAAAAA==" );
     /// </example>
-    /// <param name="encodedNodeTreeString">Can be generated using the NoiseTool</param>
-    /// <param name="maxSimdLevel">Max SIMD level, Null = Auto</param>
+    /// <param name="encodedNodeTreeString">Can be generated using the Node Editor tool</param>
+    /// <param name="maxFeatureSet">Max SIMD feature set, Max = Auto</param>
     /// <returns>Root node of the tree, nullptr for invalid strings</returns>
-    FASTNOISE_API SmartNode<> NewFromEncodedNodeTree( const char* encodedNodeTreeString, FastSIMD::eLevel maxSimdLevel = FastSIMD::Level_Null );
+    FASTNOISE_API SmartNode<> NewFromEncodedNodeTree( const char* encodedNodeTreeString, FastSIMD::FeatureSet maxFeatureSet = FastSIMD::FeatureSet::Max );
 }
diff --git a/include/FastNoise/FastNoise_BuildList.inl b/include/FastNoise/FastNoise_BuildList.inl
deleted file mode 100644
index 674204f1..00000000
--- a/include/FastNoise/FastNoise_BuildList.inl
+++ /dev/null
@@ -1,131 +0,0 @@
-#pragma once
-
-#ifndef FASTSIMD_BUILD_CLASS
-#error Do not include this file
-#endif
-
-#ifndef FASTNOISE_CLASS
-#define FASTNOISE_CLASS( CLASS ) FastNoise::CLASS
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/Generator.h"
-#else
-#include "Generators/Generator.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/BasicGenerators.h"
-#else
-#include "Generators/BasicGenerators.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/Value.h"
-#else
-#include "Generators/Value.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/Perlin.h"
-#else
-#include "Generators/Perlin.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/Simplex.h"
-#else
-#include "Generators/Simplex.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/Cellular.h"
-#else
-#include "Generators/Cellular.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/Fractal.h"
-#else
-#include "Generators/Fractal.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/DomainWarp.h"
-#else
-#include "Generators/DomainWarp.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/DomainWarpFractal.h"
-#else
-#include "Generators/DomainWarpFractal.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/Modifiers.h"
-#else
-#include "Generators/Modifiers.inl"
-#endif
-
-#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
-#include "Generators/Blends.h"
-#else
-#include "Generators/Blends.inl"
-#endif
-
-// Nodes
-// Order is important!
-// Always add to bottom of list,
-// inserting will break existing encoded node trees
-
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Constant ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( White ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Checkerboard ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( SineWave ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( PositionOutput ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( DistanceToPoint ) )
-                       
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Value ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Perlin ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Simplex ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( OpenSimplex2 ) )
-                       
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( CellularValue ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( CellularDistance ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( CellularLookup ) )
-                       
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( FractalFBm ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( FractalPingPong ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( FractalRidged ) )
-                       
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( DomainWarpGradient ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( DomainWarpFractalProgressive ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( DomainWarpFractalIndependant ) )
-                       
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( DomainScale ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( DomainOffset ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( DomainRotate ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( SeedOffset ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Remap ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( ConvertRGBA8 ) )
-                       
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Add ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Subtract ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Multiply ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Divide ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Min ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Max ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( MinSmooth ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( MaxSmooth ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Fade ) )
-                       
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( Terrace ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( PowFloat ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( PowInt ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( DomainAxisScale ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( AddDimension ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( RemoveDimension ) )
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( GeneratorCache ) )
-
-FASTSIMD_BUILD_CLASS( FASTNOISE_CLASS( OpenSimplex2S ) )
diff --git a/include/FastNoise/FastNoise_C.h b/include/FastNoise/FastNoise_C.h
index 4a923ded..ce0f4a3c 100644
--- a/include/FastNoise/FastNoise_C.h
+++ b/include/FastNoise/FastNoise_C.h
@@ -1,13 +1,13 @@
 #ifndef FASTNOISE_C_H
 #define FASTNOISE_C_H
 
-#include "FastNoise_Export.h"
+#include "Utility/Export.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-FASTNOISE_API void* fnNewFromEncodedNodeTree( const char* encodedString, unsigned /*FastSIMD::eLevel*/ simdLevel /*0 = Auto*/ );
+FASTNOISE_API void* fnNewFromEncodedNodeTree( const char* encodedString, unsigned /*FastSIMD::FeatureSet*/ simdLevel /*~0u = Auto*/ );
 
 FASTNOISE_API void fnDeleteNodeRef( void* node );
 
@@ -17,17 +17,17 @@ FASTNOISE_API int fnGetMetadataID( const void* node );
 FASTNOISE_API void fnGenUniformGrid2D( const void* node, float* noiseOut,
                                        int xStart, int yStart,
                                        int xSize, int ySize,
-                                       float frequency, int seed, float* outputMinMax /*nullptr or float[2]*/ );
+                                       int seed, float* outputMinMax /*nullptr or float[2]*/ );
 
 FASTNOISE_API void fnGenUniformGrid3D( const void* node, float* noiseOut,
                                        int xStart, int yStart, int zStart,
                                        int xSize, int ySize, int zSize,
-                                       float frequency, int seed, float* outputMinMax /*nullptr or float[2]*/ );
+                                       int seed, float* outputMinMax /*nullptr or float[2]*/ );
 
 FASTNOISE_API void fnGenUniformGrid4D( const void* node, float* noiseOut,
                                        int xStart, int yStart, int zStart, int wStart,
                                        int xSize, int ySize, int zSize, int wSize,
-                                       float frequency, int seed, float* outputMinMax /*nullptr or float[2]*/ );
+                                       int seed, float* outputMinMax /*nullptr or float[2]*/ );
 
 FASTNOISE_API void fnGenPositionArray2D( const void* node, float* noiseOut, int count,
                                          const float* xPosArray, const float* yPosArray,
@@ -46,7 +46,7 @@ FASTNOISE_API void fnGenPositionArray4D( const void* node, float* noiseOut, int
 
 FASTNOISE_API void fnGenTileable2D( const void* node, float* noiseOut,
                                     int xSize, int ySize,
-                                    float frequency, int seed, float* outputMinMax /*nullptr or float[2]*/ );
+                                    int seed, float* outputMinMax /*nullptr or float[2]*/ );
 
 FASTNOISE_API float fnGenSingle2D( const void* node, float x, float y, int seed );
 FASTNOISE_API float fnGenSingle3D( const void* node, float x, float y, float z, int seed );
@@ -54,7 +54,7 @@ FASTNOISE_API float fnGenSingle4D( const void* node, float x, float y, float z,
 
 FASTNOISE_API int fnGetMetadataCount();
 FASTNOISE_API const char* fnGetMetadataName( int id ); // valid IDs up to `fnGetMetadataCount() - 1`
-FASTNOISE_API void* fnNewFromMetadata( int id, unsigned /*FastSIMD::eLevel*/ simdLevel /*0 = Auto*/ );
+FASTNOISE_API void* fnNewFromMetadata( int id, unsigned /*FastSIMD::FeatureSet*/ simdLevel /*~0u = Auto*/ );
 
 FASTNOISE_API int fnGetMetadataVariableCount( int id );
 FASTNOISE_API const char* fnGetMetadataVariableName( int id, int variableIndex );
diff --git a/include/FastNoise/FastNoise_Config.h b/include/FastNoise/FastNoise_Config.h
deleted file mode 100644
index a0d11f2c..00000000
--- a/include/FastNoise/FastNoise_Config.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-#include <FastSIMD/FastSIMD.h>
-#include "FastNoise_Export.h"
-
-#define FASTNOISE_CALC_MIN_MAX true
-#define FASTNOISE_USE_SHARED_PTR false
-
-#if FASTNOISE_USE_SHARED_PTR
-#include <memory>
-#endif
-
-namespace FastNoise
-{
-    const FastSIMD::Level_BitFlags SUPPORTED_SIMD_LEVELS =
-        FastSIMD::Level_Scalar |
-        FastSIMD::Level_SSE2   |
-        FastSIMD::Level_SSE41  |
-        FastSIMD::Level_AVX2   |
-        FastSIMD::Level_AVX512 |
-        FastSIMD::Level_NEON   ;
-    
-    class Generator;
-    struct Metadata;
-
-    template<typename T>
-    struct MetadataT;
-
-#if FASTNOISE_USE_SHARED_PTR
-    template<typename T = Generator>
-    using SmartNode = std::shared_ptr<T>;
-#else
-    template<typename T = Generator>
-    class SmartNode;
-#endif
-
-    template<typename T = Generator>
-    using SmartNodeArg = const SmartNode<const T>&;
-
-    template<typename T>
-    SmartNode<T> New( FastSIMD::eLevel maxSimdLevel = FastSIMD::Level_Null );
-} // namespace FastNoise
-
-#if !FASTNOISE_USE_SHARED_PTR
-#include "SmartNode.h"
-#endif
diff --git a/include/FastNoise/FastNoise_Export.h b/include/FastNoise/FastNoise_Export.h
deleted file mode 100644
index 416cc91a..00000000
--- a/include/FastNoise/FastNoise_Export.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef FASTNOISE_EXPORT_H
-#define FASTNOISE_EXPORT_H
-
-#if !defined( FASTNOISE_STATIC_LIB ) && ( defined( _WIN32 ) || defined( __CYGWIN__ ) )
-#ifdef FASTNOISE_EXPORT
-#define FASTNOISE_API __declspec( dllexport )
-#else
-#define FASTNOISE_API __declspec( dllimport )
-#endif
-#else
-#define FASTNOISE_API
-#endif
-
-#endif
\ No newline at end of file
diff --git a/include/FastNoise/Generators/BasicGenerators.h b/include/FastNoise/Generators/BasicGenerators.h
index 2684f100..bb3001d5 100644
--- a/include/FastNoise/Generators/BasicGenerators.h
+++ b/include/FastNoise/Generators/BasicGenerators.h
@@ -3,10 +3,66 @@
 
 namespace FastNoise
 {
+    class ScalableGenerator : public virtual Generator // Base for generators exposing a "Feature Scale"; stores the scale and its reciprocal
+    {
+    public:
+        void SetScale( float value ) // Sets the scale and refreshes the cached reciprocal (mFrequency)
+        {
+            mScale = value;
+            mFrequency = 1.0f / value; // NOTE(review): no guard for value == 0, which is a division by zero here
+        }
+
+    protected:
+        float mScale = 100; // Same value as the 100.0f default registered in MetadataT<ScalableGenerator>
+        float mFrequency = 1.0f / 100; // Kept equal to 1 / mScale by SetScale
+    };
+
+#ifdef FASTNOISE_METADATA // Metadata specialisations are only compiled when FASTNOISE_METADATA is defined
+    template<>
+    struct MetadataT<ScalableGenerator> : MetadataT<Generator> // Adds the "Feature Scale" variable on top of the Generator metadata
+    {
+        MetadataT()
+        {
+            this->AddVariable( { "Feature Scale", "Effectively `1.0 / frequency`" }, 100.0f, &ScalableGenerator::SetScale, 0.f, 0.f, 0.25f ); // Default 100.0f, applied via SetScale; trailing 0.f, 0.f, 0.25f are presumably min / max / UI step - TODO confirm against AddVariable
+        }
+    };
+#endif // FASTNOISE_METADATA
+
+    template<typename PARENT>
+    class VariableRange : public virtual PARENT // Mixin adding a configurable output range [mRangeMin, mRangeMin + mRangeScale] to PARENT
+    {
+    public:
+        void SetOutputMin( float value ) // Moves the lower bound while keeping the upper bound where it was
+        {
+            mRangeScale += mRangeMin - value; // Adjust the span by how far the minimum moved, so mRangeMin + mRangeScale is unchanged
+            mRangeMin = value;
+        }
+
+        void SetOutputMax( float value ) // Moves the upper bound; the lower bound is untouched
+        {
+            mRangeScale = ( value - mRangeMin ); // The maximum is stored implicitly as a span above mRangeMin
+        }
+
+    protected:
+        float mRangeMin = -1; // Lower bound of the output range
+        float mRangeScale = 2; // Width of the output range; with mRangeMin = -1 the default range is [-1, 1]
+    };
+
+#ifdef FASTNOISE_METADATA // Metadata specialisations are only compiled when FASTNOISE_METADATA is defined
+    template<typename PARENT>
+    struct MetadataT<VariableRange<PARENT>> : MetadataT<PARENT> // Extends PARENT's metadata with the two output-range variables
+    {
+        MetadataT()
+        {
+            this->AddVariable( { "Output Min", "Minimum bound of output range" }, -1.0f, &VariableRange<PARENT>::SetOutputMin ); // Default -1.0f equals the mRangeMin member default
+            this->AddVariable( { "Output Max", "Maximum bound of output range" }, 1.0f, &VariableRange<PARENT>::SetOutputMax ); // Default 1.0f equals the default mRangeMin + mRangeScale (-1 + 2)
+        }
+    };
+#endif // FASTNOISE_METADATA
+
     class Constant : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetValue( float value ) { mValue = value; }
@@ -19,84 +75,81 @@ namespace FastNoise
     template<>
     struct MetadataT<Constant> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Basic Generators" );
-            this->AddVariable( "Value", 1.0f, &Constant::SetValue );
+            this->AddVariable( { "Value", "Constant output" }, 1.0f, &Constant::SetValue );
+
+            description =
+                "Outputs a constant value";
         }
     };
 #endif
 
-    class White : public virtual Generator
+    class White : public virtual VariableRange<Generator>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
 #ifdef FASTNOISE_METADATA
     template<>
-    struct MetadataT<White> : MetadataT<Generator>
+    struct MetadataT<White> : MetadataT<VariableRange<Generator>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Basic Generators" );
+            
+            description = 
+                "White noise generator";
         }
     };
 #endif
 
-    class Checkerboard : public virtual Generator
+    class Checkerboard : public virtual VariableRange<ScalableGenerator>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
-
-        void SetSize( float value ) { mSize = value; }
-
-    protected:
-        float mSize = 1.0f;
     };
 
 #ifdef FASTNOISE_METADATA
     template<>
-    struct MetadataT<Checkerboard> : MetadataT<Generator>
+    struct MetadataT<Checkerboard> : MetadataT<VariableRange<ScalableGenerator>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Basic Generators" );
-            this->AddVariable( "Size", 1.0f, &Checkerboard::SetSize );
+            description =
+                "Outputs a checkerboard pattern\n"
+                "Each checkerboard cell is \"Feature Scale\" sized in each dimension";
         }
     };
 #endif
 
-    class SineWave : public virtual Generator
+    class SineWave : public virtual VariableRange<ScalableGenerator>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
-
-        void SetScale( float value ) { mScale = value; }
-
-    protected:
-        float mScale = 1.0f;
     };
 
 #ifdef FASTNOISE_METADATA
     template<>
-    struct MetadataT<SineWave> : MetadataT<Generator>
+    struct MetadataT<SineWave> : MetadataT<VariableRange<ScalableGenerator>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Basic Generators" );
-            this->AddVariable( "Scale", 1.0f, &SineWave::SetScale );
+
+            description =
+                "Outputs sine wave";
         }
     };
 #endif
@@ -104,15 +157,18 @@ namespace FastNoise
     class PositionOutput : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         template<Dim D>
-        void Set( float multiplier, float offset = 0.0f ) { mMultiplier[(int)D] = multiplier; mOffset[(int)D] = offset; }
+        void SetMultiplier( float multiplier ) { mMultiplier[(int)D] = multiplier; } // Sets the multiplier for dimension D only
+        template<Dim D>
+        void SetOffset( float offset ) { mOffset[(int)D] = offset; } // Constant offset for dimension D only
+        template<Dim D>
+        void SetOffset( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mOffset[(int)D], gen ); } // Generator-driven offset for dimension D; mOffset holds one HybridSource per dimension
 
     protected:
         PerDimensionVariable<float> mMultiplier = 0.0f;
-        PerDimensionVariable<float> mOffset = 0.0f;
+        PerDimensionVariable<HybridSource> mOffset = 0.0f;
 
         template<typename T>
         friend struct MetadataT;
@@ -122,13 +178,18 @@ namespace FastNoise
     template<>
     struct MetadataT<PositionOutput> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Basic Generators" );
-            this->AddPerDimensionVariable( "Multiplier", 0.0f, []( PositionOutput* p ) { return std::ref( p->mMultiplier ); } );
-            this->AddPerDimensionVariable( "Offset", 0.0f, []( PositionOutput* p ) { return std::ref( p->mOffset ); } );
+            this->AddPerDimensionVariable( { "Multiplier", "Read node description" }, 0.0f, []( PositionOutput* p ) { return std::ref( p->mMultiplier ); }, 0.f, 0.f, 0.001f );
+            this->AddPerDimensionHybridSource( { "Offset", "Read node description" }, 0.0f, []( PositionOutput* p ) { return std::ref( p->mOffset ); }, 0.25f );
+
+            description =
+                "Takes the input position and does the following per dimension\n"
+                "`(input + offset) * multiplier`\n"
+                "The output is the sum of all results";
         }
     };
 #endif
@@ -136,19 +197,33 @@ namespace FastNoise
     class DistanceToPoint : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
         void SetDistanceFunction( DistanceFunction value ) { mDistanceFunction = value; }
 
+        void SetMinkowskiP( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mMinkowskiP, gen ); } // Drives Minkowski P from a generator
+        void SetMinkowskiP( float value ) { mMinkowskiP = value; } // Constant Minkowski P
+
+        void SetPoint( float x, float y, float z = 0, float w = 0 ) // Sets all four point coordinates at once; z and w default to 0 when omitted
+        {
+            mPoint[0] = x;
+            mPoint[1] = y;
+            mPoint[2] = z;
+            mPoint[3] = w;
+        }
+
+        template<Dim D>
+        void SetPoint( float value ) { mPoint[(int)D] = value; } // Constant coordinate for dimension D only
+
         template<Dim D>
-        void SetScale( float value ) { mPoint[(int)D] = value; }
+        void SetPoint( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mPoint[(int)D], gen ); }
 
     protected:
         GeneratorSource mSource;
+        HybridSource mMinkowskiP = 1.5f;
         DistanceFunction mDistanceFunction = DistanceFunction::EuclideanSquared;
-        PerDimensionVariable<float> mPoint = 0.0f;
+        PerDimensionVariable<HybridSource> mPoint = 0.0f;
 
         template<typename T>
         friend struct MetadataT;
@@ -158,13 +233,20 @@ namespace FastNoise
     template<>
     struct MetadataT<DistanceToPoint> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Basic Generators" );
             this->AddVariableEnum( "Distance Function", DistanceFunction::Euclidean, &DistanceToPoint::SetDistanceFunction, kDistanceFunction_Strings );
-            this->AddPerDimensionVariable( "Point", 0.0f, []( DistanceToPoint* p ) { return std::ref( p->mPoint ); } );
+            this->AddPerDimensionHybridSource( { "Point", "Point in current domain space" }, 0.0f, []( DistanceToPoint* p ) { return std::ref( p->mPoint ); } );
+
+            this->AddHybridSource( { "Minkowski P", "Only affects Minkowski distance function\n1 = Manhattan\n2 = Euclidean" }, 1.5f, &DistanceToPoint::SetMinkowskiP, &DistanceToPoint::SetMinkowskiP );
+
+            description =
+                "Outputs distance between point and input position\n"
+                "Distance is calculated in current domain space,\n"
+                "ie affected by Domain Modifiers/Warping";
         }
     };
 #endif
diff --git a/include/FastNoise/Generators/BasicGenerators.inl b/include/FastNoise/Generators/BasicGenerators.inl
index fcd8c471..bca77a63 100644
--- a/include/FastNoise/Generators/BasicGenerators.inl
+++ b/include/FastNoise/Generators/BasicGenerators.inl
@@ -1,98 +1,117 @@
-#include "FastSIMD/InlInclude.h"
-
 #include "BasicGenerators.h"
 #include "Utils.inl"
 
-template<typename FS>
-class FS_T<FastNoise::Constant, FS> : public virtual FastNoise::Constant, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::ScalableGenerator, SIMD> : public virtual FastNoise::ScalableGenerator, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD> // SIMD-side base for scalable generators; not `final` because other dispatch classes derive from it
+{
+protected:
+    template<typename... P>
+    FS_FORCEINLINE void ScalePositions( P&... pos ) const // Multiplies every position argument in place by mFrequency
+    {
+        float32v vFrequency( mFrequency ); // Built once from the scalar member (presumably broadcast to all lanes) and reused for every dimension
+        ( (pos *= vFrequency), ... ); // Fold expression: applies *= to each pack element, which is taken by reference
+    }
+};
+
+template<FastSIMD::FeatureSet SIMD, typename PARENT>
+class FastSIMD::DispatchClass<FastNoise::VariableRange<PARENT>, SIMD> : public virtual FastNoise::VariableRange<PARENT>, public FastSIMD::DispatchClass<PARENT, SIMD> // SIMD-side base that layers output-range remapping over PARENT's dispatch class
+{
+protected:
+    FS_FORCEINLINE float32v ScaleOutput( float32v value, float nativeMin, float nativeMax ) const // Remaps value from [nativeMin, nativeMax] to [mRangeMin, mRangeMin + mRangeScale]
+    {
+        return FS::FMulAdd( float32v( 1.0f / ( nativeMax - nativeMin ) ) * float32v( this->mRangeScale ), value - float32v( nativeMin ), float32v( this->mRangeMin ) ); // ( mRangeScale / ( nativeMax - nativeMin ) ) * ( value - nativeMin ) + mRangeMin, assuming FMulAdd( a, b, c ) computes a * b + c
+    }
+};
+
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::Constant, SIMD> final : public virtual FastNoise::Constant, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         return float32v( mValue );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::White, FS> : public virtual FastNoise::White, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::White, SIMD> final : public virtual FastNoise::White, public FastSIMD::DispatchClass<FastNoise::VariableRange<Generator>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         size_t idx = 0;
-        ((pos = FS_Casti32_f32( (FS_Castf32_i32( pos ) ^ (FS_Castf32_i32( pos ) >> 16)) * int32v( FnPrimes::Lookup[idx++] ) )), ...);
+        ((pos = FS::Cast<float>( (FS::Cast<int32_t>( pos ) ^ (FS::Cast<int32_t>( pos ) >> 16)) * int32v( Primes::Lookup[idx++] ) )), ...);
 
-        return FnUtils::GetValueCoord( seed, FS_Castf32_i32( pos )... );
+        return this->ScaleOutput( GetValueCoord( seed, FS::Cast<int32_t>( pos )... ), -kValueBounds, kValueBounds );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::Checkerboard, FS> : public virtual FastNoise::Checkerboard, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::Checkerboard, SIMD> final : public virtual FastNoise::Checkerboard, public FastSIMD::DispatchClass<FastNoise::VariableRange<ScalableGenerator>, SIMD> // Inherits ScalePositions and ScaleOutput through the VariableRange<ScalableGenerator> dispatch base
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // seed is not used by this generator
     {
-        float32v multiplier = FS_Reciprocal_f32( float32v( mSize ) );
+        this->ScalePositions( pos... ); // pos are local copies (taken by value), scaled in place by the frequency
 
-        int32v value = (FS_Convertf32_i32( pos * multiplier ) ^ ...);
+        int32v value = (FS::Convert<int32_t>( pos ) ^ ...); // XOR of the integer cell coordinates; bit 0 is the parity that alternates between neighbouring cells
 
-        return float32v( 1.0f ) ^ FS_Casti32_f32( value << 31 );
+        return this->ScaleOutput( FS::Cast<float>( (value & int32v( 1 )) << 30 ), 0, 2 ); // Parity bit moved to bit 30 gives 0x40000000, the IEEE-754 pattern of 2.0f, so (assuming Cast reinterprets bits) the raw value is 0 or 2, matching the native range passed here
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::SineWave, FS> : public virtual FastNoise::SineWave, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::SineWave, SIMD> final : public virtual FastNoise::SineWave, public FastSIMD::DispatchClass<FastNoise::VariableRange<ScalableGenerator>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
-        float32v multiplier = FS_Reciprocal_f32( float32v( mScale ) );
+        this->ScalePositions( pos... );
 
-        return (FS_Sin_f32( pos * multiplier ) * ...);
+        return this->ScaleOutput( (FS::Sin( pos ) * ...), -1, 1 );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::PositionOutput, FS> : public virtual FastNoise::PositionOutput, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::PositionOutput, SIMD> final : public virtual FastNoise::PositionOutput, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         size_t offsetIdx = 0;
         size_t multiplierIdx = 0;
+        float32v r( 0 );
 
-        (((pos += float32v( mOffset[offsetIdx++] )) *= float32v( mMultiplier[multiplierIdx++] )), ...);
-        return (pos + ...);
+        ((r = FS::FMulAdd( pos + this->GetSourceValue( mOffset[offsetIdx++], seed, pos... ), float32v( mMultiplier[multiplierIdx++]), r )), ...);
+        return r;
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::DistanceToPoint, FS> : public virtual FastNoise::DistanceToPoint, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DistanceToPoint, SIMD> final : public virtual FastNoise::DistanceToPoint, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD> // SIMD implementation of DistanceToPoint
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // Subtracts the point from the position per dimension, then returns the distance of the result
     {
-        size_t pointIdx = 0;
+        [this, seed] ( P&... out, std::remove_reference_t<P>... pos ) // Receives the positions twice: `out` by reference (modified) and `pos` by value (untouched originals)
+        {
+            size_t pointIdx = 0;
+            ((out -= this->GetSourceValue( mPoint[pointIdx++], seed, pos... )), ...); // Each point source is evaluated at the original position, even after earlier dimensions of `out` were shifted
+
+        }( pos..., pos... ); // Same pack passed for both parameter packs; the by-value pack is the snapshot
 
-        ((pos -= float32v( mPoint[pointIdx++] )), ...);
-        return FnUtils::CalcDistance( mDistanceFunction, pos... );
+        return CalcDistance( mDistanceFunction, mMinkowskiP, seed, pos... ); // pos now holds position minus point; mMinkowskiP and seed are forwarded to CalcDistance
     }
 };
diff --git a/include/FastNoise/Generators/Blends.h b/include/FastNoise/Generators/Blends.h
index 1ad8f447..a2a026c0 100644
--- a/include/FastNoise/Generators/Blends.h
+++ b/include/FastNoise/Generators/Blends.h
@@ -21,9 +21,9 @@ namespace FastNoise
     template<>
     struct MetadataT<OperatorSourceLHS> : MetadataT<Generator>
     {
-        MetadataT()
+        MetadataT( const char* group = "Blends" )
         {
-            groups.push_back( "Blends" );
+            groups.push_back( group );
             this->AddGeneratorSource( "LHS", &OperatorSourceLHS::SetLHS );
             this->AddHybridSource( "RHS", 0.0f, &OperatorSourceLHS::SetRHS, &OperatorSourceLHS::SetRHS );
         }
@@ -47,9 +47,9 @@ namespace FastNoise
     template<>
     struct MetadataT<OperatorHybridLHS> : MetadataT<Generator>
     {
-        MetadataT()
+        MetadataT( const char* group = "Blends" )
         {
-            groups.push_back( "Blends" );
+            groups.push_back( group );
             this->AddHybridSource( "LHS", 0.0f, &OperatorHybridLHS::SetLHS, &OperatorHybridLHS::SetLHS );
             this->AddHybridSource( "RHS", 0.0f, &OperatorHybridLHS::SetRHS, &OperatorHybridLHS::SetRHS );
         }
@@ -59,7 +59,6 @@ namespace FastNoise
     class Add : public virtual OperatorSourceLHS
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -67,14 +66,15 @@ namespace FastNoise
     template<>
     struct MetadataT<Add> : MetadataT<OperatorSourceLHS>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
+
+        MetadataT() : MetadataT<OperatorSourceLHS>( "Operators" ) {}
     };
 #endif
 
     class Subtract : public virtual OperatorHybridLHS
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -82,14 +82,15 @@ namespace FastNoise
     template<>
     struct MetadataT<Subtract> : MetadataT<OperatorHybridLHS>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
+
+        MetadataT() : MetadataT<OperatorHybridLHS>( "Operators" ) {}
     };
 #endif
 
     class Multiply : public virtual OperatorSourceLHS
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -97,14 +98,15 @@ namespace FastNoise
     template<>
     struct MetadataT<Multiply> : MetadataT<OperatorSourceLHS>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
+
+        MetadataT() : MetadataT<OperatorSourceLHS>( "Operators" ) {}
     };
 #endif
 
     class Divide : public virtual OperatorHybridLHS
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -112,14 +114,31 @@ namespace FastNoise
     template<>
     struct MetadataT<Divide> : MetadataT<OperatorHybridLHS>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
+
+        MetadataT() : MetadataT<OperatorHybridLHS>( "Operators" ) {}
+    };
+#endif
+
+    class Modulus : public virtual OperatorHybridLHS // Operator node whose LHS and RHS sources come from OperatorHybridLHS; the computation itself is not in this header
+    {
+    public:
+        const Metadata& GetMetadata() const override; // Declared here, defined elsewhere
+    };
+
+#ifdef FASTNOISE_METADATA // Metadata specialisations are only compiled when FASTNOISE_METADATA is defined
+    template<>
+    struct MetadataT<Modulus> : MetadataT<OperatorHybridLHS> // Reuses the LHS / RHS hybrid sources registered by the base metadata
+    {
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; // Declared here, defined elsewhere
+
+        MetadataT() : MetadataT<OperatorHybridLHS>( "Operators" ) {} // Passes "Operators" as the group, replacing the base constructor's default "Blends"
+    };
+#endif // FASTNOISE_METADATA
 
     class Min : public virtual OperatorSourceLHS
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -127,14 +146,13 @@ namespace FastNoise
     template<>
     struct MetadataT<Min> : MetadataT<OperatorSourceLHS>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
     };
 #endif
 
     class Max : public virtual OperatorSourceLHS
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -142,14 +160,13 @@ namespace FastNoise
     template<>
     struct MetadataT<Max> : MetadataT<OperatorSourceLHS>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
     };
 #endif
 
     class PowFloat : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetValue( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mValue, gen ); }
@@ -166,13 +183,15 @@ namespace FastNoise
     template<>
     struct MetadataT<PowFloat> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Blends" );
             this->AddHybridSource( "Value", 2.0f, &PowFloat::SetValue, &PowFloat::SetValue );
             this->AddHybridSource( "Pow", 2.0f, &PowFloat::SetPow, &PowFloat::SetPow );
+
+            description = "Equivalent to std::powf( value, pow )";
         }
     };
 #endif
@@ -180,7 +199,6 @@ namespace FastNoise
     class PowInt : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetValue( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mValue, gen ); }
@@ -195,13 +213,15 @@ namespace FastNoise
     template<>
     struct MetadataT<PowInt> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Blends" );
             this->AddGeneratorSource( "Value", &PowInt::SetValue );
-            this->AddVariable( "Pow", 2, &PowInt::SetPow, 2, INT_MAX );
+            this->AddVariable( "Pow", 2, &PowInt::SetPow, 2 );
+
+            description = "Faster than PowFloat node but only for int powers";
         }
     };
 #endif
@@ -209,7 +229,6 @@ namespace FastNoise
     class MinSmooth : public virtual OperatorSourceLHS
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSmoothness( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSmoothness, gen ); }
@@ -223,11 +242,17 @@ namespace FastNoise
     template<>
     struct MetadataT<MinSmooth> : MetadataT<OperatorSourceLHS>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             this->AddHybridSource( "Smoothness", 0.1f, &MinSmooth::SetSmoothness, &MinSmooth::SetSmoothness );
+
+            description = 
+                "Quadratic Smooth Minimum\n"
+                "Smoothes the transition between the 2 inputs\n"
+                "For explanation see:\n"
+                "https://iquilezles.org/articles/smin/";
         }
     };
 #endif
@@ -235,7 +260,6 @@ namespace FastNoise
     class MaxSmooth : public virtual OperatorSourceLHS
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSmoothness( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSmoothness, gen ); }
@@ -249,11 +273,17 @@ namespace FastNoise
     template<>
     struct MetadataT<MaxSmooth> : MetadataT<OperatorSourceLHS>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             this->AddHybridSource( "Smoothness", 0.1f, &MaxSmooth::SetSmoothness, &MaxSmooth::SetSmoothness );
+
+            description =
+                "Quadratic Smooth Maximum\n"
+                "Smoothes the transition between the 2 inputs\n"
+                "For explanation see:\n"
+                "https://iquilezles.org/articles/smin/";
         }
     };
 #endif
@@ -261,7 +291,13 @@ namespace FastNoise
     class Fade : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
+        enum class Interpolation // Easing function for the fade; enumerator order matches the "Linear", "Hermite", "Quintic" names given to AddVariableEnum
+        {
+            Linear, // Default value of mInterpolation
+            Hermite,
+            Quintic,
+        };
+
         const Metadata& GetMetadata() const override;
         void SetA( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mA, gen ); }
         void SetB( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mB, gen ); }
@@ -269,24 +305,43 @@ namespace FastNoise
         void SetFade( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mFade, gen ); }
         void SetFade( float value ) { mFade = value; }
 
+        void SetFadeMin( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mFadeMin, gen ); } // Generator-driven Fade Min
+        void SetFadeMin( float value ) { mFadeMin = value; } // Constant Fade Min (member default -1.f); per the node description, Fade Min = 100% A
+
+        void SetFadeMax( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mFadeMax, gen ); } // Generator-driven Fade Max
+        void SetFadeMax( float value ) { mFadeMax = value; } // Constant Fade Max (member default 1.f); per the node description, Fade Max = 100% B
+
+        void SetInterpolation( Interpolation interpolation ) { mInterpolation = interpolation; } // Selects the easing function; member default is Interpolation::Linear
+
     protected:
         GeneratorSource mA;
         GeneratorSource mB;
-        HybridSource mFade = 0.5f;
+        HybridSource mFade = 0;
+        HybridSource mFadeMin = -1.f;
+        HybridSource mFadeMax = 1.f;
+        Interpolation mInterpolation = Interpolation::Linear;
     };
 
 #ifdef FASTNOISE_METADATA
     template<>
     struct MetadataT<Fade> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Blends" );
-            this->AddGeneratorSource( "A", &Fade::SetA );
-            this->AddGeneratorSource( "B", &Fade::SetB );
-            this->AddHybridSource( "Fade", 0.5f, &Fade::SetFade, &Fade::SetFade );
+            this->AddGeneratorSource( { "A", "From" }, &Fade::SetA );
+            this->AddGeneratorSource( { "B", "To" }, &Fade::SetB );
+            this->AddHybridSource( "Fade", 0, &Fade::SetFade, &Fade::SetFade );
+            this->AddHybridSource( "Fade Min", -1.f, &Fade::SetFadeMin, &Fade::SetFadeMin );
+            this->AddHybridSource( "Fade Max", 1.f, &Fade::SetFadeMax, &Fade::SetFadeMax );
+            this->AddVariableEnum( { "Interpolation", "Easing function" }, Fade::Interpolation::Linear, &Fade::SetInterpolation, "Linear", "Hermite", "Quintic" );            
+
+            description =
+                "Output fades between inputs A and B\n"
+                "Fade Min = 100% A\n"
+                "Fade Max = 100% B";
         }
     };
 #endif
diff --git a/include/FastNoise/Generators/Blends.inl b/include/FastNoise/Generators/Blends.inl
index 7f631d36..821e13a8 100644
--- a/include/FastNoise/Generators/Blends.inl
+++ b/include/FastNoise/Generators/Blends.inl
@@ -1,80 +1,89 @@
-#include "FastSIMD/InlInclude.h"
-
 #include "Blends.h"
 
-template<typename FS>
-class FS_T<FastNoise::Add, FS> : public virtual FastNoise::Add, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // Add node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::Add, SIMD> final : public virtual FastNoise::Add, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // yields the LHS source value plus the RHS source value
     {
         return this->GetSourceValue( mLHS, seed, pos... ) + this->GetSourceValue( mRHS, seed, pos... );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::Subtract, FS> : public virtual FastNoise::Subtract, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // Subtract node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::Subtract, SIMD> final : public virtual FastNoise::Subtract, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // yields the LHS source value minus the RHS source value
     {
         return this->GetSourceValue( mLHS, seed, pos... ) - this->GetSourceValue( mRHS, seed, pos... );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::Multiply, FS> : public virtual FastNoise::Multiply, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // Multiply node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::Multiply, SIMD> final : public virtual FastNoise::Multiply, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // yields the product of the LHS and RHS source values
     {
         return this->GetSourceValue( mLHS, seed, pos... ) * this->GetSourceValue( mRHS, seed, pos... );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::Divide, FS> : public virtual FastNoise::Divide, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // Divide node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::Divide, SIMD> final : public virtual FastNoise::Divide, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // yields LHS / RHS; no zero-divisor guard is applied here
     {
         return this->GetSourceValue( mLHS, seed, pos... ) / this->GetSourceValue( mRHS, seed, pos... );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::PowFloat, FS> : public virtual FastNoise::PowFloat, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // Modulus node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::Modulus, SIMD> final : public virtual FastNoise::Modulus, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
+{
+    FASTNOISE_IMPL_GEN_T;
+
+    template<typename... P>
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
+    {
+        float32v a = this->GetSourceValue( mLHS, seed, pos... ); // dividend: LHS source value
+        float32v b = this->GetSourceValue( mRHS, seed, pos... ); // divisor: RHS source value
+
+        return FS::Modulus( a, b ); // NOTE(review): sign/zero-divisor behaviour is defined by FS::Modulus, not visible here
+    }
+};
+
+template<FastSIMD::FeatureSet SIMD> // PowFloat node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::PowFloat, SIMD> final : public virtual FastNoise::PowFloat, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
-        return FS_Pow_f32( this->GetSourceValue( mValue, seed, pos... ), this->GetSourceValue( mPow, seed, pos... ) );
+        float32v value = FS::Max( FS::Abs( this->GetSourceValue( mValue, seed, pos... ) ), float32v( FLT_MIN ) ); // base is the magnitude of the value source, floored at FLT_MIN; its sign is discarded
+
+        return Pow( value, this->GetSourceValue( mPow, seed, pos... ) ); // exponent comes from the mPow source
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::PowInt, FS> : public virtual FastNoise::PowInt, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::PowInt, SIMD> final : public virtual FastNoise::PowInt, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         float32v value = this->GetSourceValue( mValue, seed, pos... );
         float32v pow = value * value;
@@ -88,86 +97,104 @@ class FS_T<FastNoise::PowInt, FS> : public virtual FastNoise::PowInt, public FS_
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::Min, FS> : public virtual FastNoise::Min, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // Min node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::Min, SIMD> final : public virtual FastNoise::Min, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
-        return FS_Min_f32( this->GetSourceValue( mLHS, seed, pos... ), this->GetSourceValue( mRHS, seed, pos... ) );
+        return FS::Min( this->GetSourceValue( mLHS, seed, pos... ), this->GetSourceValue( mRHS, seed, pos... ) ); // FS::Min applied to the LHS and RHS source values
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::Max, FS> : public virtual FastNoise::Max, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // Max node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::Max, SIMD> final : public virtual FastNoise::Max, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
-        return FS_Max_f32( this->GetSourceValue( mLHS, seed, pos... ), this->GetSourceValue( mRHS, seed, pos... ) );
+        return FS::Max( this->GetSourceValue( mLHS, seed, pos... ), this->GetSourceValue( mRHS, seed, pos... ) ); // FS::Max applied to the LHS and RHS source values
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::MinSmooth, FS> : public virtual FastNoise::MinSmooth, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // MinSmooth node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::MinSmooth, SIMD> final : public virtual FastNoise::MinSmooth, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         float32v a = this->GetSourceValue( mLHS, seed, pos... );
         float32v b = this->GetSourceValue( mRHS, seed, pos... );
-        float32v smoothness = FS_Max_f32( float32v( 1.175494351e-38f ), FS_Abs_f32( this->GetSourceValue( mSmoothness, seed, pos... ) ) );
+        float32v smoothness = FS::Max( float32v( FLT_MIN ), FS::Abs( this->GetSourceValue( mSmoothness, seed, pos... ) ) ); // magnitude of the smoothness source, floored at FLT_MIN so the reciprocal below never sees zero
 
-        float32v h = FS_Max_f32( smoothness - FS_Abs_f32( a - b ), float32v( 0.0f ) );
+        float32v h = FS::Max( smoothness - FS::Abs( a - b ), float32v( 0.0f ) ); // how far |a - b| falls inside the smoothing window, never negative
 
-        h *= FS_Reciprocal_f32( smoothness );
+        h *= FS::Reciprocal( smoothness ); // rescale h relative to smoothness
 
-        return FS_FNMulAdd_f32( float32v( 1.0f / 6.0f ), h * h * h * smoothness, FS_Min_f32( a, b ) );
+        return FS::FNMulAdd( float32v( 1.0f / 6.0f ), h * h * h * smoothness, FS::Min( a, b ) ); // presumably Min( a, b ) - ( 1/6 ) * h^3 * smoothness, assuming FNMulAdd( x, y, z ) is -( x * y ) + z - confirm
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::MaxSmooth, FS> : public virtual FastNoise::MaxSmooth, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // MaxSmooth node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::MaxSmooth, SIMD> final : public virtual FastNoise::MaxSmooth, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // same arithmetic as MinSmooth, run on negated inputs and negated again on return
     {
         float32v a = -this->GetSourceValue( mLHS, seed, pos... );
         float32v b = -this->GetSourceValue( mRHS, seed, pos... );
-        float32v smoothness = FS_Max_f32( float32v( 1.175494351e-38f ), FS_Abs_f32( this->GetSourceValue( mSmoothness, seed, pos... ) ) );
+        float32v smoothness = FS::Max( float32v( FLT_MIN ), FS::Abs( this->GetSourceValue( mSmoothness, seed, pos... ) ) ); // magnitude of the smoothness source, floored at FLT_MIN so the reciprocal below never sees zero
 
-        float32v h = FS_Max_f32( smoothness - FS_Abs_f32( a - b ), float32v( 0.0f ) );
+        float32v h = FS::Max( smoothness - FS::Abs( a - b ), float32v( 0.0f ) ); // how far |a - b| falls inside the smoothing window, never negative
 
-        h *= FS_Reciprocal_f32( smoothness );
+        h *= FS::Reciprocal( smoothness ); // rescale h relative to smoothness
 
-        return -FS_FNMulAdd_f32( float32v( 1.0f / 6.0f ), h * h * h * smoothness, FS_Min_f32( a, b ) );
+        return -FS::FNMulAdd( float32v( 1.0f / 6.0f ), h * h * h * smoothness, FS::Min( a, b ) ); // leading minus undoes the negation applied to a and b above
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::Fade, FS> : public virtual FastNoise::Fade, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // Fade node implementation, specialised per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::Fade, SIMD> final : public virtual FastNoise::Fade, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
-        float32v fade = FS_Abs_f32( this->GetSourceValue( mFade, seed, pos... ) );
+        float32v fade = this->GetSourceValue( mFade, seed, pos... );
+        float32v fadeMin = this->GetSourceValue( mFadeMin, seed, pos... );
+        float32v fadeMax = this->GetSourceValue( mFadeMax, seed, pos... );
+
+        float32v fadeRange = fadeMax - fadeMin; // span between the two fade endpoints; may be zero, handled after the switch
+
+        fade = ( fade - fadeMin ) / fadeRange; // remap so fadeMin maps to 0 and fadeMax maps to 1
+
+        fade = FS::Max( float32v( 0 ), FS::Min( float32v( 1 ), fade ) ); // clamp the remapped fade to [0, 1]
+
+        switch( mInterpolation ) // optional easing applied to the clamped fade
+        {
+        case Interpolation::Linear:
+            break; // linear: fade is used as-is
+        case Interpolation::Hermite:
+            fade = InterpHermite( fade );
+            break;
+        case Interpolation::Quintic:
+            fade = InterpQuintic( fade );
+            break;
+        }
 
-        return FS_FMulAdd_f32( this->GetSourceValue( mA, seed, pos... ), float32v( 1 ) - fade, this->GetSourceValue( mB, seed, pos... ) * fade );
+        // Protect against nan from 0 range div
+        fade = FS::Select( fadeRange == float32v( 0 ), float32v( 0.5f ), fade ); // where the range is exactly zero, use 0.5 in place of the computed fade
+        
+        return Lerp( this->GetSourceValue( mA, seed, pos... ), this->GetSourceValue( mB, seed, pos... ), fade ); // presumably fade 0 yields A and fade 1 yields B - confirm against Lerp's definition
     }
 };
 
diff --git a/include/FastNoise/Generators/Cellular.h b/include/FastNoise/Generators/Cellular.h
index 6bfa84f5..fa1ad6a3 100644
--- a/include/FastNoise/Generators/Cellular.h
+++ b/include/FastNoise/Generators/Cellular.h
@@ -5,36 +5,43 @@
 
 namespace FastNoise
 {
-    class Cellular : public virtual Generator
+    template<typename PARENT = VariableRange<ScalableGenerator>> // the base class is chosen by the template argument
+    class Cellular : public virtual PARENT
     {
     public:
+        void SetDistanceFunction( DistanceFunction value ) { mDistanceFunction = value; }
+
+        void SetMinkowskiP( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mMinkowskiP, gen ); } // drive Minkowski P from another generator node
+        void SetMinkowskiP( float value ) { mMinkowskiP = value; } // or set Minkowski P to a constant
+
         void SetJitterModifier( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mJitterModifier, gen ); }
         void SetJitterModifier( float value ) { mJitterModifier = value; }
-        void SetDistanceFunction( DistanceFunction value ) { mDistanceFunction = value; }
 
     protected:
+        HybridSource mMinkowskiP = 1.5f; // hybrid: either a constant or a generator source, see the two setters above
         HybridSource mJitterModifier = 1.0f;
         DistanceFunction mDistanceFunction = DistanceFunction::EuclideanSquared;
     };
 
 #ifdef FASTNOISE_METADATA
-    template<>
-    struct MetadataT<Cellular> : MetadataT<Generator>
+    template<typename PARENT> // partial specialisation covering every Cellular<PARENT> instantiation
+    struct MetadataT<Cellular<PARENT>> : MetadataT<PARENT>
     {
         MetadataT()
         {
-            groups.push_back( "Coherent Noise" );
-            this->AddHybridSource( { "Jitter Modifier", "Above 1.0 will cause grid artifacts" }, 1.0f, &Cellular::SetJitterModifier, &Cellular::SetJitterModifier );
+            this->groups.push_back( "Coherent Noise" ); // this-> is required because the base class now depends on PARENT
             this->AddVariableEnum( { "Distance Function", "How distance to closest cells is calculated\nHybrid is EuclideanSquared + Manhattan" },
-                DistanceFunction::EuclideanSquared, &Cellular::SetDistanceFunction, kDistanceFunction_Strings );
+                DistanceFunction::EuclideanSquared, &Cellular<PARENT>::SetDistanceFunction, kDistanceFunction_Strings );
+            this->AddHybridSource( { "Minkowski P", "Only affects Minkowski distance function\n1 = Manhattan\n2 = Euclidean" }, 1.5f, &Cellular<PARENT>::SetMinkowskiP, &Cellular<PARENT>::SetMinkowskiP ); // both SetMinkowskiP overloads are passed
+
+            this->AddHybridSource( { "Jitter Modifier", "Above 1.0 will cause grid artifacts\n0.0 will output a uniform grid" }, 1.0f, &Cellular<PARENT>::SetJitterModifier, &Cellular<PARENT>::SetJitterModifier ); // both SetJitterModifier overloads are passed
         }
     };
 #endif
 
-    class CellularValue : public virtual Cellular
+    class CellularValue : public virtual Cellular<>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         static const int kMaxDistanceCount = 4;
@@ -47,9 +54,9 @@ namespace FastNoise
 
 #ifdef FASTNOISE_METADATA
     template<>
-    struct MetadataT<CellularValue> : MetadataT<Cellular>
+    struct MetadataT<CellularValue> : MetadataT<Cellular<>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
@@ -57,16 +64,14 @@ namespace FastNoise
 
             description = 
                 "Returns value of Nth closest cell\n"
-                "Value is generated using white noise\n"
-                "Output is bounded -1 : 1";
+                "Value is generated using white noise";
         }
     };
 #endif
 
-    class CellularDistance : public virtual Cellular
+    class CellularDistance : public virtual Cellular<>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         enum class ReturnType
@@ -92,9 +97,9 @@ namespace FastNoise
 
 #ifdef FASTNOISE_METADATA
     template<>
-    struct MetadataT<CellularDistance> : MetadataT<Cellular>
+    struct MetadataT<CellularDistance> : MetadataT<Cellular<>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
@@ -110,30 +115,26 @@ namespace FastNoise
     };
 #endif
 
-    class CellularLookup : public virtual Cellular
+    class CellularLookup : public virtual Cellular<ScalableGenerator> // names ScalableGenerator explicitly as the Cellular parent type
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetLookup( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mLookup, gen ); }
-        void SetLookupFrequency( float freq ) { mLookupFreq = freq; }
 
     protected:
         GeneratorSource mLookup;
-        float mLookupFreq = 0.1f;
     };
 
 #ifdef FASTNOISE_METADATA
     template<>
-    struct MetadataT<CellularLookup> : MetadataT<Cellular>
+    struct MetadataT<CellularLookup> : MetadataT<Cellular<ScalableGenerator>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             this->AddGeneratorSource( { "Lookup", "Used to generate cell values" }, &CellularLookup::SetLookup );
-            this->AddVariable( { "Lookup Frequency", "Relative to the cellular frequency" }, 0.1f, &CellularLookup::SetLookupFrequency );
             
             description = 
                 "Returns value of closest cell\n"
diff --git a/include/FastNoise/Generators/Cellular.inl b/include/FastNoise/Generators/Cellular.inl
index 472b17e0..077898d3 100644
--- a/include/FastNoise/Generators/Cellular.inl
+++ b/include/FastNoise/Generators/Cellular.inl
@@ -1,43 +1,40 @@
-#include "FastSIMD/InlInclude.h"
-
 #include <cfloat>
 #include <array>
 
 #include "Cellular.h"
 #include "Utils.inl"
 
-template<typename FS>
-class FS_T<FastNoise::Cellular, FS> : public virtual FastNoise::Cellular, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD, typename PARENT> // base for the cellular dispatch classes; the visible body only holds jitter constants
+class FastSIMD::DispatchClass<FastNoise::Cellular<PARENT>, SIMD> : public virtual FastNoise::Cellular<PARENT>, public FastSIMD::DispatchClass<PARENT, SIMD>
 {
 protected:
-    const float kJitter2D = 0.437016f;
-    const float kJitter3D = 0.396144f;
-    const float kJitter4D = 0.366025f;
-    const float kJitterIdx23 = 0.190983f;
+    static constexpr float kJitter2D = 0.437016f; // multiplied by the jitter modifier source in the 2D Gen()
+    static constexpr float kJitter3D = 0.396144f; // multiplied by the jitter modifier source in the 3D Gen()
+    static constexpr float kJitter4D = 0.366025f; // multiplied by the jitter modifier source in the 4D Gen()
+    static constexpr float kJitterIdx23 = 0.190983f; // NOTE(review): usage is not visible in this part of the diff
 };
 
-template<typename FS>
-class FS_T<FastNoise::CellularValue, FS> : public virtual FastNoise::CellularValue, public FS_T<FastNoise::Cellular, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::CellularValue, SIMD> final : public virtual FastNoise::CellularValue, public FastSIMD::DispatchClass<FastNoise::Cellular<>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const
     {
         float32v jitter = float32v( this->kJitter2D ) * this->GetSourceValue( mJitterModifier, seed, x, y );
-        std::array<float32v, kMaxDistanceCount> value;
+        std::array<int32v, kMaxDistanceCount> valueHash;
         std::array<float32v, kMaxDistanceCount> distance;
         
-        value.fill( float32v( INFINITY ) );
         distance.fill( float32v( INFINITY ) );
 
-        int32v xc = FS_Convertf32_i32( x ) + int32v( -1 );
-        int32v ycBase = FS_Convertf32_i32( y ) + int32v( -1 );
+        this->ScalePositions( x, y );
+
+        int32v xc = FS::Convert<int32_t>( x ) + int32v( -1 );
+        int32v ycBase = FS::Convert<int32_t>( y ) + int32v( -1 );
 
-        float32v xcf = FS_Converti32_f32( xc ) - x;
-        float32v ycfBase = FS_Converti32_f32( ycBase ) - y;
+        float32v xcf = FS::Convert<float>( xc ) - x;
+        float32v ycfBase = FS::Convert<float>( ycBase ) - y;
 
-        xc *= int32v( FnPrimes::X );
-        ycBase *= int32v( FnPrimes::Y );
+        xc *= int32v( Primes::X );
+        ycBase *= int32v( Primes::Y );
 
         for( int xi = 0; xi < 3; xi++ )
         {
@@ -45,66 +42,67 @@ class FS_T<FastNoise::CellularValue, FS> : public virtual FastNoise::CellularVal
             int32v yc = ycBase;
             for( int yi = 0; yi < 3; yi++ )
             {
-                int32v hash = FnUtils::HashPrimesHB( seed, xc, yc );
-                float32v xd = FS_Converti32_f32( hash & int32v( 0xffff ) ) - float32v( 0xffff / 2.0f );
-                float32v yd = FS_Converti32_f32( (hash >> 16) & int32v( 0xffff ) ) - float32v( 0xffff / 2.0f );
+                int32v hash = HashPrimesHB( seed, xc, yc );
+                float32v xd = FS::Convert<float>( hash & int32v( 0xffff ) ) - float32v( 0xffff / 2.0f );
+                float32v yd = FS::Convert<float>( FS::BitShiftRightZeroExtend( hash, 16 ) ) - float32v( 0xffff / 2.0f );
 
-                float32v invMag = jitter * FS_InvSqrt_f32( FS_FMulAdd_f32( xd, xd, yd * yd ) );
-                xd = FS_FMulAdd_f32( xd, invMag, xcf );
-                yd = FS_FMulAdd_f32( yd, invMag, ycf );
+                float32v invMag = jitter * FS::InvSqrt( FS::FMulAdd( xd, xd, yd * yd ) );
+                xd = FS::FMulAdd( xd, invMag, xcf );
+                yd = FS::FMulAdd( yd, invMag, ycf );
 
-                float32v newCellValue = float32v( (float)(1.0 / INT_MAX) ) * FS_Converti32_f32( hash );
-                float32v newDistance = FnUtils::CalcDistance( mDistanceFunction, xd, yd );
+                int32v newCellValueHash = hash;
+                float32v newDistance = CalcDistance<false>( mDistanceFunction, mMinkowskiP, seed, xd, yd );
 
                 for( int i = 0; ; i++ )
                 {
                     mask32v closer = newDistance < distance[i];
 
                     float32v localDistance = distance[i];
-                    float32v localCellValue = value[i];
+                    int32v localCellValueHash = valueHash[i];
 
-                    distance[i] = FS_Select_f32( closer, newDistance, distance[i] );
-                    value[i] = FS_Select_f32( closer, newCellValue, value[i] );
+                    distance[i] = FS::Select( closer, newDistance, distance[i] );
+                    valueHash[i] = FS::Select( closer, newCellValueHash, valueHash[i] );
 
                     if( i > mValueIndex )
                     {
                         break;
                     }
 
-                    newDistance = FS_Select_f32( closer, localDistance, newDistance );
-                    newCellValue = FS_Select_f32( closer, localCellValue, newCellValue );
+                    newDistance = FS::Select( closer, localDistance, newDistance );
+                    newCellValueHash = FS::Select( closer, localCellValueHash, newCellValueHash );
                 }
 
                 ycf += float32v( 1 );
-                yc += int32v( FnPrimes::Y );
+                yc += int32v( Primes::Y );
             }
             xcf += float32v( 1 );
-            xc += int32v( FnPrimes::X );
+            xc += int32v( Primes::X );
         }
 
-        return value[mValueIndex];
+        return this->ScaleOutput( FS::Convert<float>( valueHash[mValueIndex] ), -kValueBounds, kValueBounds );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const
     {
         float32v jitter = float32v( this->kJitter3D ) * this->GetSourceValue( mJitterModifier, seed, x, y, z );
-        std::array<float32v, kMaxDistanceCount> value;
+        std::array<int32v, kMaxDistanceCount> valueHash;
         std::array<float32v, kMaxDistanceCount> distance;
         
-        value.fill( float32v( INFINITY ) );
         distance.fill( float32v( INFINITY ) );
+
+        this->ScalePositions( x, y, z );
         
-        int32v xc = FS_Convertf32_i32( x ) + int32v( -1 );
-        int32v ycBase = FS_Convertf32_i32( y ) + int32v( -1 );
-        int32v zcBase = FS_Convertf32_i32( z ) + int32v( -1 );
+        int32v xc = FS::Convert<int32_t>( x ) + int32v( -1 );
+        int32v ycBase = FS::Convert<int32_t>( y ) + int32v( -1 );
+        int32v zcBase = FS::Convert<int32_t>( z ) + int32v( -1 );
         
-        float32v xcf = FS_Converti32_f32( xc ) - x;
-        float32v ycfBase = FS_Converti32_f32( ycBase ) - y;
-        float32v zcfBase = FS_Converti32_f32( zcBase ) - z;
+        float32v xcf = FS::Convert<float>( xc ) - x;
+        float32v ycfBase = FS::Convert<float>( ycBase ) - y;
+        float32v zcfBase = FS::Convert<float>( zcBase ) - z;
     
-        xc *= int32v( FnPrimes::X );
-        ycBase *= int32v( FnPrimes::Y );
-        zcBase *= int32v( FnPrimes::Z );
+        xc *= int32v( Primes::X );
+        ycBase *= int32v( Primes::Y );
+        zcBase *= int32v( Primes::Z );
     
         for( int xi = 0; xi < 3; xi++ )
         {
@@ -116,74 +114,75 @@ class FS_T<FastNoise::CellularValue, FS> : public virtual FastNoise::CellularVal
                 int32v zc = zcBase;
                 for( int zi = 0; zi < 3; zi++ )
                 {
-                    int32v hash = FnUtils::HashPrimesHB( seed, xc, yc, zc );
-                    float32v xd = FS_Converti32_f32( hash & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
-                    float32v yd = FS_Converti32_f32( ( hash >> 10 ) & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
-                    float32v zd = FS_Converti32_f32( ( hash >> 20 ) & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
-                
-                    float32v invMag = jitter * FS_InvSqrt_f32( FS_FMulAdd_f32( xd, xd, FS_FMulAdd_f32( yd, yd, zd * zd ) ) );
-                    xd = FS_FMulAdd_f32( xd, invMag, xcf );
-                    yd = FS_FMulAdd_f32( yd, invMag, ycf );
-                    zd = FS_FMulAdd_f32( zd, invMag, zcf );
+                    int32v hash = HashPrimesHB( seed, xc, yc, zc );
+                    float32v xd = FS::Convert<float>( hash & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
+                    float32v yd = FS::Convert<float>( ( hash >> 11 ) & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
+                    float32v zd = FS::Convert<float>( FS::BitShiftRightZeroExtend( hash, 22 ) ) - float32v( 0x3ff / 2.0f );
                 
-                    float32v newCellValue = float32v( (float)(1.0 / INT_MAX) ) * FS_Converti32_f32( hash );
-                    float32v newDistance = FnUtils::CalcDistance( mDistanceFunction, xd, yd, zd );
+                    float32v invMag = jitter * FS::InvSqrt( FS::FMulAdd( xd, xd, FS::FMulAdd( yd, yd, zd * zd ) ) );
+                    xd = FS::FMulAdd( xd, invMag, xcf );
+                    yd = FS::FMulAdd( yd, invMag, ycf );
+                    zd = FS::FMulAdd( zd, invMag, zcf );
+
+                    int32v newCellValueHash = hash;
+                    float32v newDistance = CalcDistance<false>( mDistanceFunction, mMinkowskiP, seed, xd, yd, zd );
                 
                     for( int i = 0; ; i++ )
                     {
                         mask32v closer = newDistance < distance[i];
 
                         float32v localDistance = distance[i];
-                        float32v localCellValue = value[i];
+                        int32v localCellValueHash = valueHash[i];
 
-                        distance[i] = FS_Select_f32( closer, newDistance, distance[i] );
-                        value[i] = FS_Select_f32( closer, newCellValue, value[i] );
+                        distance[i] = FS::Select( closer, newDistance, distance[i] );
+                        valueHash[i] = FS::Select( closer, newCellValueHash, valueHash[i] );
 
                         if( i > mValueIndex )
                         {
                             break;
                         }
 
-                        newDistance = FS_Select_f32( closer, localDistance, newDistance );
-                        newCellValue = FS_Select_f32( closer, localCellValue, newCellValue );
+                        newDistance = FS::Select( closer, localDistance, newDistance );
+                        newCellValueHash = FS::Select( closer, localCellValueHash, newCellValueHash );
                     }
             
                     zcf += float32v( 1 );
-                    zc += int32v( FnPrimes::Z );
+                    zc += int32v( Primes::Z );
                 }
                 ycf += float32v( 1 );
-                yc += int32v( FnPrimes::Y );
+                yc += int32v( Primes::Y );
             }
             xcf += float32v( 1 );
-            xc += int32v( FnPrimes::X );
+            xc += int32v( Primes::X );
         }
-    
-        return value[mValueIndex];
+
+        return this->ScaleOutput( FS::Convert<float>( valueHash[mValueIndex] ), -kValueBounds, kValueBounds );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z , float32v w ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z , float32v w ) const
     {
         float32v jitter = float32v( this->kJitter4D ) * this->GetSourceValue( mJitterModifier, seed, x, y, z, w );
-        std::array<float32v, kMaxDistanceCount> value;
+        std::array<int32v, kMaxDistanceCount> valueHash;
         std::array<float32v, kMaxDistanceCount> distance;
         
-        value.fill( float32v( INFINITY ) );
         distance.fill( float32v( INFINITY ) );
+
+        this->ScalePositions( x, y, z, w );
         
-        int32v xc = FS_Convertf32_i32( x ) + int32v( -1 );
-        int32v ycBase = FS_Convertf32_i32( y ) + int32v( -1 );
-        int32v zcBase = FS_Convertf32_i32( z ) + int32v( -1 );
-        int32v wcBase = FS_Convertf32_i32( w ) + int32v( -1 );
+        int32v xc = FS::Convert<int32_t>( x ) + int32v( -1 );
+        int32v ycBase = FS::Convert<int32_t>( y ) + int32v( -1 );
+        int32v zcBase = FS::Convert<int32_t>( z ) + int32v( -1 );
+        int32v wcBase = FS::Convert<int32_t>( w ) + int32v( -1 );
         
-        float32v xcf = FS_Converti32_f32( xc ) - x;
-        float32v ycfBase = FS_Converti32_f32( ycBase ) - y;
-        float32v zcfBase = FS_Converti32_f32( zcBase ) - z;
-        float32v wcfBase = FS_Converti32_f32( wcBase ) - w;
+        float32v xcf = FS::Convert<float>( xc ) - x;
+        float32v ycfBase = FS::Convert<float>( ycBase ) - y;
+        float32v zcfBase = FS::Convert<float>( zcBase ) - z;
+        float32v wcfBase = FS::Convert<float>( wcBase ) - w;
     
-        xc *= int32v( FnPrimes::X );
-        ycBase *= int32v( FnPrimes::Y );
-        zcBase *= int32v( FnPrimes::Z );
-        wcBase *= int32v( FnPrimes::W );
+        xc *= int32v( Primes::X );
+        ycBase *= int32v( Primes::Y );
+        zcBase *= int32v( Primes::Z );
+        wcBase *= int32v( Primes::W );
     
         for( int xi = 0; xi < 3; xi++ )
         {
@@ -199,299 +198,311 @@ class FS_T<FastNoise::CellularValue, FS> : public virtual FastNoise::CellularVal
                     int32v wc = wcBase;
                     for( int wi = 0; wi < 3; wi++ )
                     {
-                        int32v hash = FnUtils::HashPrimesHB( seed, xc, yc, zc, wc );
-                        float32v xd = FS_Converti32_f32( hash & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-                        float32v yd = FS_Converti32_f32( (hash >> 8) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-                        float32v zd = FS_Converti32_f32( (hash >> 16) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-                        float32v wd = FS_Converti32_f32( (hash >> 24) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        int32v hash = HashPrimesHB( seed, xc, yc, zc, wc );
+                        float32v xd = FS::Convert<float>( hash & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        float32v yd = FS::Convert<float>( (hash >> 8) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        float32v zd = FS::Convert<float>( (hash >> 16) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        float32v wd = FS::Convert<float>( FS::BitShiftRightZeroExtend( hash, 24 ) ) - float32v( 0xff / 2.0f );
 
-                        float32v invMag = jitter * FS_InvSqrt_f32( FS_FMulAdd_f32( xd, xd, FS_FMulAdd_f32( yd, yd, FS_FMulAdd_f32( zd, zd, wd * wd ) ) ) );
-                        xd = FS_FMulAdd_f32( xd, invMag, xcf );
-                        yd = FS_FMulAdd_f32( yd, invMag, ycf );
-                        zd = FS_FMulAdd_f32( zd, invMag, zcf );
-                        wd = FS_FMulAdd_f32( wd, invMag, wcf );
+                        float32v invMag = jitter * FS::InvSqrt( FS::FMulAdd( xd, xd, FS::FMulAdd( yd, yd, FS::FMulAdd( zd, zd, wd * wd ) ) ) );
+                        xd = FS::FMulAdd( xd, invMag, xcf );
+                        yd = FS::FMulAdd( yd, invMag, ycf );
+                        zd = FS::FMulAdd( zd, invMag, zcf );
+                        wd = FS::FMulAdd( wd, invMag, wcf );
 
-                        float32v newCellValue = float32v( (float)(1.0 / INT_MAX) ) * FS_Converti32_f32( hash );
-                        float32v newDistance = FnUtils::CalcDistance( mDistanceFunction, xd, yd, zd, wd );
+                        int32v newCellValueHash = hash;
+                        float32v newDistance = CalcDistance<false>( mDistanceFunction, mMinkowskiP, seed, xd, yd, zd, wd );
 
                         for( int i = 0; ; i++ )
                         {
                             mask32v closer = newDistance < distance[i];
 
                             float32v localDistance = distance[i];
-                            float32v localCellValue = value[i];
+                            int32v localCellValueHash = valueHash[i];
 
-                            distance[i] = FS_Select_f32( closer, newDistance, distance[i] );
-                            value[i] = FS_Select_f32( closer, newCellValue, value[i] );
+                            distance[i] = FS::Select( closer, newDistance, distance[i] );
+                            valueHash[i] = FS::Select( closer, newCellValueHash, valueHash[i] );
 
                             if( i > mValueIndex )
                             {
                                 break;
                             }
 
-                            newDistance = FS_Select_f32( closer, localDistance, newDistance );
-                            newCellValue = FS_Select_f32( closer, localCellValue, newCellValue );
+                            newDistance = FS::Select( closer, localDistance, newDistance );
+                            newCellValueHash = FS::Select( closer, localCellValueHash, newCellValueHash );
                         }
 
                         wcf += float32v( 1 );
-                        wc += int32v( FnPrimes::W );
+                        wc += int32v( Primes::W );
                     }
                     zcf += float32v( 1 );
-                    zc += int32v( FnPrimes::Z );
+                    zc += int32v( Primes::Z );
                 }
                 ycf += float32v( 1 );
-                yc += int32v( FnPrimes::Y );
+                yc += int32v( Primes::Y );
             }
             xcf += float32v( 1 );
-            xc += int32v( FnPrimes::X );
+            xc += int32v( Primes::X );
         }
-    
-        return value[mValueIndex];
+
+        return this->ScaleOutput( FS::Convert<float>( valueHash[mValueIndex] ), -kValueBounds, kValueBounds );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::CellularDistance, FS> : public virtual FastNoise::CellularDistance, public FS_T<FastNoise::Cellular, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::CellularDistance, SIMD> final : public virtual FastNoise::CellularDistance, public FastSIMD::DispatchClass<FastNoise::Cellular<>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const
     {
         float32v jitter = float32v( this->kJitter2D ) * this->GetSourceValue( mJitterModifier, seed, x, y );
 
         std::array<float32v, kMaxDistanceCount> distance;
         distance.fill( float32v( INFINITY ) );
 
-        int32v xc = FS_Convertf32_i32( x ) + int32v( -1 );
-        int32v ycBase = FS_Convertf32_i32( y ) + int32v( -1 );
+        this->ScalePositions( x, y );
+
+        int32v xc = FS::Convert<int32_t>( x ) + int32v( -1 );
+        int32v ycBase = FS::Convert<int32_t>( y ) + int32v( -1 );
 
-        float32v xcf = FS_Converti32_f32( xc ) - x;
-        float32v ycfBase = FS_Converti32_f32( ycBase ) - y;
+        float32v xcf = FS::Convert<float>( xc );
+        float32v ycfBase = FS::Convert<float>( ycBase );
 
-        xc *= int32v( FnPrimes::X );
-        ycBase *= int32v( FnPrimes::Y );
+        xc *= int32v( Primes::X );
+        ycBase *= int32v( Primes::Y );
 
         for( int xi = 0; xi < 3; xi++ )
         {
+            float32v xcfOffset = xcf - x;
             float32v ycf = ycfBase;
             int32v yc = ycBase;
             for ( int yi = 0; yi < 3; yi++ )
             {
-                int32v hash = FnUtils::HashPrimesHB( seed, xc, yc );
-                float32v xd = FS_Converti32_f32( hash & int32v( 0xffff ) ) - float32v( 0xffff / 2.0f );
-                float32v yd = FS_Converti32_f32( (hash >> 16) & int32v( 0xffff ) ) - float32v( 0xffff / 2.0f );
+                int32v hash = HashPrimesHB( seed, xc, yc );
+                float32v xd = FS::Convert<float>( hash & int32v( 0xffff ) ) - float32v( 0xffff / 2.0f );
+                float32v yd = FS::Convert<float>( FS::BitShiftRightZeroExtend( hash, 16 ) ) - float32v( 0xffff / 2.0f );
 
-                float32v invMag = jitter * FS_InvSqrt_f32( FS_FMulAdd_f32( xd, xd, yd * yd ) );
-                xd = FS_FMulAdd_f32( xd, invMag, xcf );
-                yd = FS_FMulAdd_f32( yd, invMag, ycf );
+                float32v invMag = jitter * FS::InvSqrt( FS::FMulAdd( xd, xd, yd * yd ) );
+                xd = FS::FMulAdd( xd, invMag, xcfOffset );
+                yd = FS::FMulAdd( yd, invMag, ycf - y );
 
-                float32v newDistance = FnUtils::CalcDistance( mDistanceFunction, xd, yd );
+                float32v newDistance = CalcDistance<false>( mDistanceFunction, mMinkowskiP, seed, xd, yd );
 
                 for( int i = kMaxDistanceCount - 1; i > 0; i-- )
                 {
-                    distance[i] = FS_Max_f32( FS_Min_f32( distance[i], newDistance ), distance[i - 1] );
+                    distance[i] = FS::Max( FS::Min( distance[i], newDistance ), distance[i - 1] );
                 }
 
-                distance[0] = FS_Min_f32( distance[0], newDistance );
+                distance[0] = FS::Min( distance[0], newDistance );
 
                 ycf += float32v( 1 );
-                yc += int32v( FnPrimes::Y );
+                yc += int32v( Primes::Y );
             }
             xcf += float32v( 1 );
-            xc += int32v( FnPrimes::X );
+            xc += int32v( Primes::X );
         }
 
-        return GetReturn( distance );
+        return GetReturn( distance, 1 + this->kJitter2D );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const
     {
         float32v jitter = float32v( this->kJitter3D ) * this->GetSourceValue( mJitterModifier, seed, x, y, z );
 
         std::array<float32v, kMaxDistanceCount> distance;
         distance.fill( float32v( INFINITY ) );
 
-        int32v xc = FS_Convertf32_i32( x ) + int32v( -1 );
-        int32v ycBase = FS_Convertf32_i32( y ) + int32v( -1 );
-        int32v zcBase = FS_Convertf32_i32( z ) + int32v( -1 );
+        this->ScalePositions( x, y, z );
+
+        int32v xc = FS::Convert<int32_t>( x ) + int32v( -1 );
+        int32v ycBase = FS::Convert<int32_t>( y ) + int32v( -1 );
+        int32v zcBase = FS::Convert<int32_t>( z ) + int32v( -1 );
 
-        float32v xcf = FS_Converti32_f32( xc ) - x;
-        float32v ycfBase = FS_Converti32_f32( ycBase ) - y;
-        float32v zcfBase = FS_Converti32_f32( zcBase ) - z;
+        float32v xcf = FS::Convert<float>( xc );
+        float32v ycfBase = FS::Convert<float>( ycBase );
+        float32v zcfBase = FS::Convert<float>( zcBase );
 
-        xc *= int32v( FnPrimes::X );
-        ycBase *= int32v( FnPrimes::Y );
-        zcBase *= int32v( FnPrimes::Z );
+        xc *= int32v( Primes::X );
+        ycBase *= int32v( Primes::Y );
+        zcBase *= int32v( Primes::Z );
 
         for( int xi = 0; xi < 3; xi++ )
         {
+            float32v xcfOffset = xcf - x;
             float32v ycf = ycfBase;
             int32v yc = ycBase;
             for( int yi = 0; yi < 3; yi++ )
             {
+                float32v ycfOffset = ycf - y;
                 float32v zcf = zcfBase;
                 int32v zc = zcBase;
                 for( int zi = 0; zi < 3; zi++ )
                 {
-                    int32v hash = FnUtils::HashPrimesHB( seed, xc, yc, zc );
-                    float32v xd = FS_Converti32_f32( hash & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
-                    float32v yd = FS_Converti32_f32( (hash >> 10) & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
-                    float32v zd = FS_Converti32_f32( (hash >> 20) & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
+                    int32v hash = HashPrimesHB( seed, xc, yc, zc );
+                    float32v xd = FS::Convert<float>( hash & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
+                    float32v yd = FS::Convert<float>( (hash >> 11) & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
+                    float32v zd = FS::Convert<float>( FS::BitShiftRightZeroExtend( hash, 22 ) ) - float32v( 0x3ff / 2.0f );
 
-                    float32v invMag = jitter * FS_InvSqrt_f32( FS_FMulAdd_f32( xd, xd, FS_FMulAdd_f32( yd, yd, zd * zd ) ) );
-                    xd = FS_FMulAdd_f32( xd, invMag, xcf );
-                    yd = FS_FMulAdd_f32( yd, invMag, ycf );
-                    zd = FS_FMulAdd_f32( zd, invMag, zcf );
+                    float32v invMag = jitter * FS::InvSqrt( FS::FMulAdd( xd, xd, FS::FMulAdd( yd, yd, zd * zd ) ) );
+                    xd = FS::FMulAdd( xd, invMag, xcfOffset );
+                    yd = FS::FMulAdd( yd, invMag, ycfOffset );
+                    zd = FS::FMulAdd( zd, invMag, zcf - z );
 
-                    float32v newDistance = FnUtils::CalcDistance( mDistanceFunction, xd, yd, zd );
+                    float32v newDistance = CalcDistance<false>( mDistanceFunction, mMinkowskiP, seed, xd, yd, zd );
 
                     for( int i = kMaxDistanceCount - 1; i > 0; i-- )
                     {
-                        distance[i] = FS_Max_f32( FS_Min_f32( distance[i], newDistance ), distance[i - 1] );
+                        distance[i] = FS::Max( FS::Min( distance[i], newDistance ), distance[i - 1] );
                     }
 
-                    distance[0] = FS_Min_f32( distance[0], newDistance );
+                    distance[0] = FS::Min( distance[0], newDistance );
 
                     zcf += float32v( 1 );
-                    zc += int32v( FnPrimes::Z );
+                    zc += int32v( Primes::Z );
                 }
                 ycf += float32v( 1 );
-                yc += int32v( FnPrimes::Y );
+                yc += int32v( Primes::Y );
             }
             xcf += float32v( 1 );
-            xc += int32v( FnPrimes::X );
+            xc += int32v( Primes::X );
         }
 
-        return GetReturn( distance );
+        return GetReturn( distance, 1 + this->kJitter3D );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const
     {
         float32v jitter = float32v( this->kJitter4D ) * this->GetSourceValue( mJitterModifier, seed, x, y, z, w );
 
         std::array<float32v, kMaxDistanceCount> distance;
         distance.fill( float32v( INFINITY ) );
 
-        int32v xc = FS_Convertf32_i32( x ) + int32v( -1 );
-        int32v ycBase = FS_Convertf32_i32( y ) + int32v( -1 );
-        int32v zcBase = FS_Convertf32_i32( z ) + int32v( -1 );
-        int32v wcBase = FS_Convertf32_i32( w ) + int32v( -1 );
+        this->ScalePositions( x, y, z, w );
 
-        float32v xcf = FS_Converti32_f32( xc ) - x;
-        float32v ycfBase = FS_Converti32_f32( ycBase ) - y;
-        float32v zcfBase = FS_Converti32_f32( zcBase ) - z;
-        float32v wcfBase = FS_Converti32_f32( wcBase ) - w;
+        int32v xc = FS::Convert<int32_t>( x ) + int32v( -1 );
+        int32v ycBase = FS::Convert<int32_t>( y ) + int32v( -1 );
+        int32v zcBase = FS::Convert<int32_t>( z ) + int32v( -1 );
+        int32v wcBase = FS::Convert<int32_t>( w ) + int32v( -1 );
 
-        xc *= int32v( FnPrimes::X );
-        ycBase *= int32v( FnPrimes::Y );
-        zcBase *= int32v( FnPrimes::Z );
-        wcBase *= int32v( FnPrimes::W );
+        float32v xcf = FS::Convert<float>( xc );
+        float32v ycfBase = FS::Convert<float>( ycBase );
+        float32v zcfBase = FS::Convert<float>( zcBase );
+        float32v wcfBase = FS::Convert<float>( wcBase );
+
+        xc *= int32v( Primes::X );
+        ycBase *= int32v( Primes::Y );
+        zcBase *= int32v( Primes::Z );
+        wcBase *= int32v( Primes::W );
 
         for( int xi = 0; xi < 3; xi++ )
         {
+            float32v xcfOffset = xcf - x;
             float32v ycf = ycfBase;
             int32v yc = ycBase;
             for( int yi = 0; yi < 3; yi++ )
             {
+                float32v ycfOffset = ycf - y;
                 float32v zcf = zcfBase;
                 int32v zc = zcBase;
                 for( int zi = 0; zi < 3; zi++ )
                 {
+                    float32v zcfOffset = zcf - z;
                     float32v wcf = wcfBase;
                     int32v wc = wcBase;
                     for( int wi = 0; wi < 3; wi++ )
                     {
-                        int32v hash = FnUtils::HashPrimesHB( seed, xc, yc, zc, wc );
-                        float32v xd = FS_Converti32_f32( hash & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-                        float32v yd = FS_Converti32_f32( (hash >> 8) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-                        float32v zd = FS_Converti32_f32( (hash >> 16) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-                        float32v wd = FS_Converti32_f32( (hash >> 24) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        int32v hash = HashPrimesHB( seed, xc, yc, zc, wc );
+                        float32v xd = FS::Convert<float>( hash & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        float32v yd = FS::Convert<float>( (hash >> 8) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        float32v zd = FS::Convert<float>( (hash >> 16) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        float32v wd = FS::Convert<float>( FS::BitShiftRightZeroExtend( hash, 24 ) ) - float32v( 0xff / 2.0f );
 
-                        float32v invMag = jitter * FS_InvSqrt_f32( FS_FMulAdd_f32( xd, xd, FS_FMulAdd_f32( yd, yd, FS_FMulAdd_f32( zd, zd, wd * wd ) ) ) );
-                        xd = FS_FMulAdd_f32( xd, invMag, xcf );
-                        yd = FS_FMulAdd_f32( yd, invMag, ycf );
-                        zd = FS_FMulAdd_f32( zd, invMag, zcf );
-                        wd = FS_FMulAdd_f32( wd, invMag, wcf );
+                        float32v invMag = jitter * FS::InvSqrt( FS::FMulAdd( xd, xd, FS::FMulAdd( yd, yd, FS::FMulAdd( zd, zd, wd * wd ) ) ) );
+                        xd = FS::FMulAdd( xd, invMag, xcfOffset );
+                        yd = FS::FMulAdd( yd, invMag, ycfOffset );
+                        zd = FS::FMulAdd( zd, invMag, zcfOffset );
+                        wd = FS::FMulAdd( wd, invMag, wcf - w );
 
-                        float32v newDistance = FnUtils::CalcDistance( mDistanceFunction, xd, yd, zd, wd );
+                        float32v newDistance = CalcDistance<false>( mDistanceFunction, mMinkowskiP, seed, xd, yd, zd, wd );
 
                         for( int i = kMaxDistanceCount - 1; i > 0; i-- )
                         {
-                            distance[i] = FS_Max_f32( FS_Min_f32( distance[i], newDistance ), distance[i - 1] );
+                            distance[i] = FS::Max( FS::Min( distance[i], newDistance ), distance[i - 1] );
                         }
 
-                        distance[0] = FS_Min_f32( distance[0], newDistance );
+                        distance[0] = FS::Min( distance[0], newDistance );
 
                         wcf += float32v( 1 );
-                        wc += int32v( FnPrimes::W );
+                        wc += int32v( Primes::W );
                     }
                     zcf += float32v( 1 );
-                    zc += int32v( FnPrimes::Z );
+                    zc += int32v( Primes::Z );
                 }
                 ycf += float32v( 1 );
-                yc += int32v( FnPrimes::Y );
+                yc += int32v( Primes::Y );
             }
             xcf += float32v( 1 );
-            xc += int32v( FnPrimes::X );
+            xc += int32v( Primes::X );
         }
 
-        return GetReturn( distance );
+        return GetReturn( distance, 1 + this->kJitter4D );
     }
 
-    FS_INLINE float32v GetReturn( std::array<float32v, kMaxDistanceCount>& distance ) const
+    FS_FORCEINLINE float32v GetReturn( std::array<float32v, kMaxDistanceCount>& distance, float maxDist ) const
     {
         if( mDistanceFunction == FastNoise::DistanceFunction::Euclidean )
         {
-            distance[mDistanceIndex0] *= FS_InvSqrt_f32( distance[mDistanceIndex0] );
-            distance[mDistanceIndex1] *= FS_InvSqrt_f32( distance[mDistanceIndex1] );
+            distance[mDistanceIndex0] *= FS::InvSqrt( distance[mDistanceIndex0] );
+            distance[mDistanceIndex1] *= FS::InvSqrt( distance[mDistanceIndex1] );
         }
 
+        maxDist *= maxDist;
+
         switch( mReturnType )
         {
         default:
         case ReturnType::Index0:
         {
-            return distance[mDistanceIndex0];
+            return this->ScaleOutput( distance[mDistanceIndex0], 0, maxDist );
         }
         case ReturnType::Index0Add1:
         {
-            return distance[mDistanceIndex0] + distance[mDistanceIndex1];
+            return this->ScaleOutput( distance[mDistanceIndex0] + distance[mDistanceIndex1], 0, maxDist * 2 );
         }
         case ReturnType::Index0Sub1:
         {
-            return distance[mDistanceIndex0] - distance[mDistanceIndex1];
+            return this->ScaleOutput( FS::Abs( distance[mDistanceIndex0] - distance[mDistanceIndex1] ), 0, maxDist );
         }
         case ReturnType::Index0Mul1:
         {
-            return distance[mDistanceIndex0] * distance[mDistanceIndex1];
+            return this->ScaleOutput( distance[mDistanceIndex0] * distance[mDistanceIndex1], 0, maxDist * maxDist );
         }
         case ReturnType::Index0Div1:
         {
-            return distance[mDistanceIndex0] * FS_Reciprocal_f32( distance[mDistanceIndex1] );
+            return this->ScaleOutput( distance[mDistanceIndex0] * FS::Reciprocal( distance[mDistanceIndex1] ), 0, maxDist );
         }
         }
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::CellularLookup, FS> : public virtual FastNoise::CellularLookup, public FS_T<FastNoise::Cellular, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::CellularLookup, SIMD> final : public virtual FastNoise::CellularLookup, public FastSIMD::DispatchClass<FastNoise::Cellular<ScalableGenerator>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const
     {
         float32v jitter = float32v( this->kJitter2D ) * this->GetSourceValue( mJitterModifier, seed, x, y );
         float32v distance( FLT_MAX );
         float32v cellX, cellY;
 
-        int32v xc = FS_Convertf32_i32( x ) + int32v( -1 );
-        int32v ycBase = FS_Convertf32_i32( y ) + int32v( -1 );
+        this->ScalePositions( x, y );
 
-        float32v xcf = FS_Converti32_f32( xc ) - x;
-        float32v ycfBase = FS_Converti32_f32( ycBase ) - y;
+        int32v xc = FS::Convert<int32_t>( x ) + int32v( -1 );
+        int32v ycBase = FS::Convert<int32_t>( y ) + int32v( -1 );
 
-        xc *= int32v( FnPrimes::X );
-        ycBase *= int32v( FnPrimes::Y );
+        float32v xcf = FS::Convert<float>( xc );
+        float32v ycfBase = FS::Convert<float>( ycBase );
+
+        xc *= int32v( Primes::X );
+        ycBase *= int32v( Primes::Y );
 
         for( int xi = 0; xi < 3; xi++ )
         {
@@ -499,49 +510,53 @@ class FS_T<FastNoise::CellularLookup, FS> : public virtual FastNoise::CellularLo
             int32v yc = ycBase;
             for( int yi = 0; yi < 3; yi++ )
             {
-                int32v hash = FnUtils::HashPrimesHB( seed, xc, yc );
-                float32v xd = FS_Converti32_f32( hash & int32v( 0xffff ) ) - float32v( 0xffff / 2.0f );
-                float32v yd = FS_Converti32_f32( (hash >> 16) & int32v( 0xffff ) ) - float32v( 0xffff / 2.0f );
+                int32v hash = HashPrimesHB( seed, xc, yc );
+                float32v xd = FS::Convert<float>( hash & int32v( 0xffff ) ) - float32v( 0xffff / 2.0f );
+                float32v yd = FS::Convert<float>( FS::BitShiftRightZeroExtend( hash, 16 ) ) - float32v( 0xffff / 2.0f );
 
-                float32v invMag = jitter * FS_InvSqrt_f32( FS_FMulAdd_f32( xd, xd, yd * yd ) );
-                xd = FS_FMulAdd_f32( xd, invMag, xcf );
-                yd = FS_FMulAdd_f32( yd, invMag, ycf );
+                float32v invMag = jitter * FS::InvSqrt( FS::FMulAdd( xd, xd, yd * yd ) );
+                float32v localCellX = FS::FMulAdd( xd, invMag, xcf );
+                float32v localCellY = FS::FMulAdd( yd, invMag, ycf );
+                xd = localCellX - x;
+                yd = localCellY - y;
 
-                float32v newDistance = FnUtils::CalcDistance( mDistanceFunction, xd, yd );
+                float32v newDistance = CalcDistance<false>( mDistanceFunction, mMinkowskiP, seed, xd, yd );
 
                 mask32v closer = newDistance < distance;
-                distance = FS_Min_f32( newDistance, distance );
+                distance = FS::Min( newDistance, distance );
 
-                cellX = FS_Select_f32( closer, xd + x, cellX );
-                cellY = FS_Select_f32( closer, yd + y, cellY );
+                cellX = FS::Select( closer, localCellX, cellX );
+                cellY = FS::Select( closer, localCellY, cellY );
 
                 ycf += float32v( 1 );
-                yc += int32v( FnPrimes::Y );
+                yc += int32v( Primes::Y );
             }
             xcf += float32v( 1 );
-            xc += int32v( FnPrimes::X );
+            xc += int32v( Primes::X );
         }
 
-        return this->GetSourceValue( mLookup, seed - int32v( -1 ), cellX * float32v( mLookupFreq ), cellY * float32v( mLookupFreq ) );
+        return this->GetSourceValue( mLookup, seed - int32v( -1 ), cellX * float32v( mScale ), cellY * float32v( mScale ) );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const
     {
         float32v jitter = float32v( this->kJitter3D ) * this->GetSourceValue( mJitterModifier, seed, x, y, z );
         float32v distance( FLT_MAX );
         float32v cellX, cellY, cellZ;
 
-        int32v xc = FS_Convertf32_i32( x ) + int32v( -1 );
-        int32v ycBase = FS_Convertf32_i32( y ) + int32v( -1 );
-        int32v zcBase = FS_Convertf32_i32( z ) + int32v( -1 );
+        this->ScalePositions( x, y, z );
+
+        int32v xc = FS::Convert<int32_t>( x ) + int32v( -1 );
+        int32v ycBase = FS::Convert<int32_t>( y ) + int32v( -1 );
+        int32v zcBase = FS::Convert<int32_t>( z ) + int32v( -1 );
 
-        float32v xcf = FS_Converti32_f32( xc ) - x;
-        float32v ycfBase = FS_Converti32_f32( ycBase ) - y;
-        float32v zcfBase = FS_Converti32_f32( zcBase ) - z;
+        float32v xcf = FS::Convert<float>( xc );
+        float32v ycfBase = FS::Convert<float>( ycBase );
+        float32v zcfBase = FS::Convert<float>( zcBase );
 
-        xc *= int32v( FnPrimes::X );
-        ycBase *= int32v( FnPrimes::Y );
-        zcBase *= int32v( FnPrimes::Z );
+        xc *= int32v( Primes::X );
+        ycBase *= int32v( Primes::Y );
+        zcBase *= int32v( Primes::Z );
 
         for( int xi = 0; xi < 3; xi++ )
         {
@@ -553,58 +568,63 @@ class FS_T<FastNoise::CellularLookup, FS> : public virtual FastNoise::CellularLo
                 int32v zc = zcBase;
                 for( int zi = 0; zi < 3; zi++ )
                 {
-                    int32v hash = FnUtils::HashPrimesHB( seed, xc, yc, zc );
-                    float32v xd = FS_Converti32_f32( hash & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
-                    float32v yd = FS_Converti32_f32( (hash >> 10) & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
-                    float32v zd = FS_Converti32_f32( (hash >> 20) & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
+                    int32v hash = HashPrimesHB( seed, xc, yc, zc );
+                    float32v xd = FS::Convert<float>( hash & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
+                    float32v yd = FS::Convert<float>( (hash >> 11) & int32v( 0x3ff ) ) - float32v( 0x3ff / 2.0f );
+                    float32v zd = FS::Convert<float>( FS::BitShiftRightZeroExtend( hash, 22 ) ) - float32v( 0x3ff / 2.0f );
 
-                    float32v invMag = jitter * FS_InvSqrt_f32( FS_FMulAdd_f32( xd, xd, FS_FMulAdd_f32( yd, yd, zd * zd ) ) );
-                    xd = FS_FMulAdd_f32( xd, invMag, xcf );
-                    yd = FS_FMulAdd_f32( yd, invMag, ycf );
-                    zd = FS_FMulAdd_f32( zd, invMag, zcf );
+                    float32v invMag = jitter * FS::InvSqrt( FS::FMulAdd( xd, xd, FS::FMulAdd( yd, yd, zd * zd ) ) );
+                    float32v localCellX = FS::FMulAdd( xd, invMag, xcf );
+                    float32v localCellY = FS::FMulAdd( yd, invMag, ycf );
+                    float32v localCellZ = FS::FMulAdd( zd, invMag, zcf );
+                    xd = localCellX - x;
+                    yd = localCellY - y;
+                    zd = localCellZ - z;
 
-                    float32v newDistance = FnUtils::CalcDistance( mDistanceFunction, xd, yd, zd );
+                    float32v newDistance = CalcDistance<false>( mDistanceFunction, mMinkowskiP, seed, xd, yd, zd );
 
                     mask32v closer = newDistance < distance;
-                    distance = FS_Min_f32( newDistance, distance );
+                    distance = FS::Min( newDistance, distance );
 
-                    cellX = FS_Select_f32( closer, xd + x, cellX );
-                    cellY = FS_Select_f32( closer, yd + y, cellY );
-                    cellZ = FS_Select_f32( closer, zd + z, cellZ );
+                    cellX = FS::Select( closer, localCellX, cellX );
+                    cellY = FS::Select( closer, localCellY, cellY );
+                    cellZ = FS::Select( closer, localCellZ, cellZ );
 
                     zcf += float32v( 1 );
-                    zc += int32v( FnPrimes::Z );
+                    zc += int32v( Primes::Z );
                 }
                 ycf += float32v( 1 );
-                yc += int32v( FnPrimes::Y );
+                yc += int32v( Primes::Y );
             }
             xcf += float32v( 1 );
-            xc += int32v( FnPrimes::X );
+            xc += int32v( Primes::X );
         }
 
-        return this->GetSourceValue( mLookup, seed - int32v( -1 ), cellX * float32v( mLookupFreq ), cellY * float32v( mLookupFreq ), cellZ * float32v( mLookupFreq ) );
+        return this->GetSourceValue( mLookup, seed - int32v( -1 ), cellX * float32v( mScale ), cellY * float32v( mScale ), cellZ * float32v( mScale ) );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const
     {
         float32v jitter = float32v( this->kJitter4D ) * this->GetSourceValue( mJitterModifier, seed, x, y, z, w );
         float32v distance( FLT_MAX );
         float32v cellX, cellY, cellZ, cellW;
 
-        int32v xc = FS_Convertf32_i32( x ) + int32v( -1 );
-        int32v ycBase = FS_Convertf32_i32( y ) + int32v( -1 );
-        int32v zcBase = FS_Convertf32_i32( z ) + int32v( -1 );
-        int32v wcBase = FS_Convertf32_i32( w ) + int32v( -1 );
+        this->ScalePositions( x, y, z, w );
+
+        int32v xc = FS::Convert<int32_t>( x ) + int32v( -1 );
+        int32v ycBase = FS::Convert<int32_t>( y ) + int32v( -1 );
+        int32v zcBase = FS::Convert<int32_t>( z ) + int32v( -1 );
+        int32v wcBase = FS::Convert<int32_t>( w ) + int32v( -1 );
 
-        float32v xcf = FS_Converti32_f32( xc ) - x;
-        float32v ycfBase = FS_Converti32_f32( ycBase ) - y;
-        float32v zcfBase = FS_Converti32_f32( zcBase ) - z;
-        float32v wcfBase = FS_Converti32_f32( wcBase ) - w;
+        float32v xcf = FS::Convert<float>( xc );
+        float32v ycfBase = FS::Convert<float>( ycBase );
+        float32v zcfBase = FS::Convert<float>( zcBase );
+        float32v wcfBase = FS::Convert<float>( wcBase );
 
-        xc *= int32v( FnPrimes::X );
-        ycBase *= int32v( FnPrimes::Y );
-        zcBase *= int32v( FnPrimes::Z );
-        wcBase *= int32v( FnPrimes::W );
+        xc *= int32v( Primes::X );
+        ycBase *= int32v( Primes::Y );
+        zcBase *= int32v( Primes::Z );
+        wcBase *= int32v( Primes::W );
 
         for( int xi = 0; xi < 3; xi++ )
         {
@@ -620,41 +640,45 @@ class FS_T<FastNoise::CellularLookup, FS> : public virtual FastNoise::CellularLo
                     int32v wc = wcBase;
                     for( int wi = 0; wi < 3; wi++ )
                     {
-                        int32v hash = FnUtils::HashPrimesHB( seed, xc, yc, zc, wc );
-                        float32v xd = FS_Converti32_f32( hash & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-                        float32v yd = FS_Converti32_f32( (hash >> 8) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-                        float32v zd = FS_Converti32_f32( (hash >> 16) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-                        float32v wd = FS_Converti32_f32( (hash >> 24) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
-
-                        float32v invMag = jitter * FS_InvSqrt_f32( FS_FMulAdd_f32( xd, xd, FS_FMulAdd_f32( yd, yd, FS_FMulAdd_f32( zd, zd, wd * wd ) ) ) );
-                        xd = FS_FMulAdd_f32( xd, invMag, xcf );
-                        yd = FS_FMulAdd_f32( yd, invMag, ycf );
-                        zd = FS_FMulAdd_f32( zd, invMag, zcf );
-                        wd = FS_FMulAdd_f32( wd, invMag, wcf );
-
-                        float32v newDistance = FnUtils::CalcDistance( mDistanceFunction, xd, yd, zd, wd );
+                        int32v hash = HashPrimesHB( seed, xc, yc, zc, wc );
+                        float32v xd = FS::Convert<float>( hash & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        float32v yd = FS::Convert<float>( (hash >> 8) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        float32v zd = FS::Convert<float>( (hash >> 16) & int32v( 0xff ) ) - float32v( 0xff / 2.0f );
+                        float32v wd = FS::Convert<float>( FS::BitShiftRightZeroExtend( hash, 24 ) ) - float32v( 0xff / 2.0f );
+
+                        float32v invMag = jitter * FS::InvSqrt( FS::FMulAdd( xd, xd, FS::FMulAdd( yd, yd, FS::FMulAdd( zd, zd, wd * wd ) ) ) );
+                        float32v localCellX = FS::FMulAdd( xd, invMag, xcf );
+                        float32v localCellY = FS::FMulAdd( yd, invMag, ycf );
+                        float32v localCellZ = FS::FMulAdd( zd, invMag, zcf );
+                        float32v localCellW = FS::FMulAdd( wd, invMag, wcf );
+                        xd = localCellX - x;
+                        yd = localCellY - y;
+                        zd = localCellZ - z;
+                        wd = localCellW - w;
+
+                        float32v newDistance = CalcDistance<false>( mDistanceFunction, mMinkowskiP, seed, xd, yd, zd, wd );
 
                         mask32v closer = newDistance < distance;
-                        distance = FS_Min_f32( newDistance, distance );
+                        distance = FS::Min( newDistance, distance );
 
-                        cellX = FS_Select_f32( closer, xd + x, cellX );
-                        cellY = FS_Select_f32( closer, yd + y, cellY );
-                        cellZ = FS_Select_f32( closer, zd + z, cellZ );
-                        cellW = FS_Select_f32( closer, wd + w, cellW );
+                        cellX = FS::Select( closer, localCellX, cellX );
+                        cellY = FS::Select( closer, localCellY, cellY );
+                        cellZ = FS::Select( closer, localCellZ, cellZ );
+                        cellW = FS::Select( closer, localCellW, cellW );
 
                         wcf += float32v( 1 );
-                        wc += int32v( FnPrimes::W );
+                        wc += int32v( Primes::W );
                     }
                     zcf += float32v( 1 );
-                    zc += int32v( FnPrimes::Z );
+                    zc += int32v( Primes::Z );
                 }
                 ycf += float32v( 1 );
-                yc += int32v( FnPrimes::Y );
+                yc += int32v( Primes::Y );
             }
             xcf += float32v( 1 );
-            xc += int32v( FnPrimes::X );
+            xc += int32v( Primes::X );
         }
 
-        return this->GetSourceValue( mLookup, seed - int32v( -1 ), cellX * float32v( mLookupFreq ), cellY * float32v( mLookupFreq ), cellZ * float32v( mLookupFreq ), cellW * float32v( mLookupFreq ) );
+        return this->GetSourceValue( mLookup, seed - int32v( -1 ), cellX * float32v( mScale ), cellY * float32v( mScale ), cellZ * float32v( mScale ), cellW * float32v( mScale ) );
     }
 };
diff --git a/include/FastNoise/Generators/DomainWarp.h b/include/FastNoise/Generators/DomainWarp.h
index 42b529c3..ea22f28e 100644
--- a/include/FastNoise/Generators/DomainWarp.h
+++ b/include/FastNoise/Generators/DomainWarp.h
@@ -3,30 +3,27 @@
 
 namespace FastNoise
 {
-    class DomainWarp : public virtual Generator
+    class DomainWarp : public virtual ScalableGenerator // Base for nodes that warp the input position before sampling mSource
     {
     public:
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
         void SetWarpAmplitude( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mWarpAmplitude, gen ); }
         void SetWarpAmplitude( float value ) { mWarpAmplitude = value; } 
-        void SetWarpFrequency( float value ) { mWarpFrequency = value; }
 
     protected:
         GeneratorSource mSource;
-        HybridSource mWarpAmplitude = 1.0f;
-        float mWarpFrequency = 0.5f;
+        HybridSource mWarpAmplitude = 50.0f; // Keep in sync with the 50.0f default registered in MetadataT<DomainWarp>
     };
 
 #ifdef FASTNOISE_METADATA
     template<>
-    struct MetadataT<DomainWarp> : MetadataT<Generator>
+    struct MetadataT<DomainWarp> : MetadataT<ScalableGenerator> // Metadata common to every DomainWarp-derived node: registers the source and warp amplitude inputs
     {
         MetadataT()
         {
             groups.push_back( "Domain Warp" );
             this->AddGeneratorSource( "Source", &DomainWarp::SetSource );
-            this->AddHybridSource( "Warp Amplitude", 1.0f, &DomainWarp::SetWarpAmplitude, &DomainWarp::SetWarpAmplitude );
-            this->AddVariable( "Warp Frequency", 0.5f, &DomainWarp::SetWarpFrequency );
+            this->AddHybridSource( { "Warp Amplitude", "Maximum (euclidean) distance the position can be moved from its original location" }, 50.0f, &DomainWarp::SetWarpAmplitude, &DomainWarp::SetWarpAmplitude, 0.1f );
         }
     };
 #endif
@@ -34,7 +31,6 @@ namespace FastNoise
     class DomainWarpGradient : public virtual DomainWarp
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -42,7 +38,15 @@ namespace FastNoise
     template<>
     struct MetadataT<DomainWarpGradient> : MetadataT<DomainWarp>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
+
+        MetadataT ()
+        {
+            description =
+                "Warps the input position using a simple uniform grid gradient, similar to perlin noise gradients.\n"
+                "The warped position is used when generating the attached source node\n"
+                "This node does not change the output value of the source node";
+        }
     };
 #endif
 }
diff --git a/include/FastNoise/Generators/DomainWarp.inl b/include/FastNoise/Generators/DomainWarp.inl
index cb006bd0..2de616c8 100644
--- a/include/FastNoise/Generators/DomainWarp.inl
+++ b/include/FastNoise/Generators/DomainWarp.inl
@@ -1,24 +1,21 @@
-#include "FastSIMD/InlInclude.h"
-
 #include "DomainWarp.h"
 #include "Utils.inl"
 
-template<typename FS>
-class FS_T<FastNoise::DomainWarp, FS> : public virtual FastNoise::DomainWarp, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DomainWarp, SIMD> : public virtual FastNoise::DomainWarp, public FastSIMD::DispatchClass<FastNoise::ScalableGenerator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
-        Warp( seed, this->GetSourceValue( mWarpAmplitude, seed, pos... ), (pos * float32v( mWarpFrequency ))..., pos... );
+        Warp( seed, this->GetSourceValue( mWarpAmplitude, seed, pos... ), ( pos * float32v( this->mFrequency ) )..., pos... );
 
         return this->GetSourceValue( mSource, seed, pos...);
     }
 
 public:
-    float GetWarpFrequency() const { return mWarpFrequency; }
+    float GetWarpFrequency() const { return this->mFrequency; }
     const FastNoise::HybridSource& GetWarpAmplitude() const { return mWarpAmplitude; }
     const FastNoise::GeneratorSource& GetWarpSource() const { return mSource; }
 
@@ -27,74 +24,90 @@ public:
     virtual float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const = 0;
 };
 
-template<typename FS>
-class FS_T<FastNoise::DomainWarpGradient, FS> : public virtual FastNoise::DomainWarpGradient, public FS_T<FastNoise::DomainWarp, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DomainWarpGradient, SIMD> final : public virtual FastNoise::DomainWarpGradient, public FastSIMD::DispatchClass<FastNoise::DomainWarp, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
-
 public:
-    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const final
+    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const // 2D warp: adds warpAmp * (xWarp, yWarp) to xOut/yOut and returns warpLengthSq * InvSqrt( warpLengthSq ), i.e. the length of (xWarp, yWarp)
     {
-        float32v xs = FS_Floor_f32( x );
-        float32v ys = FS_Floor_f32( y );
+        float32v xs = FS::Floor( x ); // Lower corner of the grid cell containing the position
+        float32v ys = FS::Floor( y );
+
+        int32v x0 = FS::Convert<int32_t>( xs ) * int32v( Primes::X ); // Hash input for the lower corner
+        int32v y0 = FS::Convert<int32_t>( ys ) * int32v( Primes::Y );
+        int32v x1 = x0 + int32v( Primes::X ); // Hash input for the neighbouring corner: (floor + 1) * prime
+        int32v y1 = y0 + int32v( Primes::Y );
 
-        int32v x0 = FS_Convertf32_i32( xs ) * int32v( FnPrimes::X );
-        int32v y0 = FS_Convertf32_i32( ys ) * int32v( FnPrimes::Y );
-        int32v x1 = x0 + int32v( FnPrimes::X );
-        int32v y1 = y0 + int32v( FnPrimes::Y );
+        float32v xs1 = InterpHermite( x - xs ); // Interpolated fraction within the cell: weight applied to the x1 corners
+        float32v ys1 = InterpHermite( y - ys );
+        float32v xs0 = float32v( 1 ) - xs1; // Complementary weight applied to the x0 corners, so the four corner weights sum to 1
+        float32v ys0 = float32v( 1 ) - ys1;
 
-        xs = FnUtils::InterpHermite( x - xs );
-        ys = FnUtils::InterpHermite( y - ys );
+        float32v normalise( 1.0f / (0xffff / 2.0f) ); // 2 / 0xffff: scales a 16 bit value to the range 0..2
 
     #define GRADIENT_COORD( _x, _y )\
-        int32v hash##_x##_y = FnUtils::HashPrimesHB(seed, x##_x, y##_y );\
-        float32v x##_x##_y = FS_Converti32_f32( hash##_x##_y & int32v( 0xffff ) );\
-        float32v y##_x##_y = FS_Converti32_f32( (hash##_x##_y >> 16) & int32v( 0xffff ) );
+        int32v hash##_x##_y = HashPrimesHB(seed, x##_x, y##_y );\
+        float32v contrib##_x##_y = xs##_x * ys##_y;\
+        xWarp = FS::FMulAdd( contrib##_x##_y, FS::Convert<float>( hash##_x##_y & int32v( 0xffff ) ), xWarp );\
+        yWarp = FS::FMulAdd( contrib##_x##_y, FS::Convert<float>( FS::BitShiftRightZeroExtend( hash##_x##_y, 16) ), yWarp )
+
+        int32v hash00 = HashPrimesHB(seed, x0, y0 ); // Corner (0,0) is expanded by hand so that it initialises xWarp/yWarp; GRADIENT_COORD accumulates the other three corners
+        float32v contrib00 = xs0 * ys0;
+        float32v xWarp = contrib00 * FS::Convert<float>( hash00 & int32v( 0xffff ) ); // Low 16 bits of the hash drive the x component
+        float32v yWarp = contrib00 * FS::Convert<float>( FS::BitShiftRightZeroExtend( hash00, 16) ); // High 16 bits drive the y component
 
-        GRADIENT_COORD( 0, 0 );
         GRADIENT_COORD( 1, 0 );
         GRADIENT_COORD( 0, 1 );
         GRADIENT_COORD( 1, 1 );
 
     #undef GRADIENT_COORD
 
-        float32v normalise = float32v( 1.0f / (0xffff / 2.0f) );
-
-        float32v xWarp = (FnUtils::Lerp( FnUtils::Lerp( x00, x10, xs ), FnUtils::Lerp( x01, x11, xs ), ys ) - float32v( 0xffff / 2.0f )) * normalise;
-        float32v yWarp = (FnUtils::Lerp( FnUtils::Lerp( y00, y10, xs ), FnUtils::Lerp( y01, y11, xs ), ys ) - float32v( 0xffff / 2.0f )) * normalise;
+        xWarp = FS::FMulSub( xWarp, normalise, float32v( 1 ) ); // Recentres the weighted sum from 0..2 to -1..1 (assumes FMulSub( a, b, c ) is a * b - c - TODO confirm)
+        yWarp = FS::FMulSub( yWarp, normalise, float32v( 1 ) );
 
-        xOut = FS_FMulAdd_f32( xWarp, warpAmp, xOut );
-        yOut = FS_FMulAdd_f32( yWarp, warpAmp, yOut );
+        xOut = FS::FMulAdd( xWarp, warpAmp, xOut ); // Offsets the caller's position in place by the amplitude-scaled warp
+        yOut = FS::FMulAdd( yWarp, warpAmp, yOut );
 
-        float32v warpLengthSq = FS_FMulAdd_f32( xWarp, xWarp, yWarp * yWarp );
+        float32v warpLengthSq = FS::FMulAdd( xWarp, xWarp, yWarp * yWarp ); // Squared length of the warp vector before warpAmp is applied
 
-        return warpLengthSq * FS_InvSqrt_f32( warpLengthSq );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq ); // NOTE(review): x * InvSqrt( x ) - verify the result when warpLengthSq is exactly 0
     }
             
-    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& zOut ) const final
+    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& zOut ) const
     {
-        float32v xs = FS_Floor_f32( x );
-        float32v ys = FS_Floor_f32( y );
-        float32v zs = FS_Floor_f32( z );
-
-        int32v x0 = FS_Convertf32_i32( xs ) * int32v( FnPrimes::X );
-        int32v y0 = FS_Convertf32_i32( ys ) * int32v( FnPrimes::Y );
-        int32v z0 = FS_Convertf32_i32( zs ) * int32v( FnPrimes::Z );
-        int32v x1 = x0 + int32v( FnPrimes::X );
-        int32v y1 = y0 + int32v( FnPrimes::Y );
-        int32v z1 = z0 + int32v( FnPrimes::Z );
-
-        xs = FnUtils::InterpHermite( x - xs );
-        ys = FnUtils::InterpHermite( y - ys );
-        zs = FnUtils::InterpHermite( z - zs );
+        float32v xs = FS::Floor( x );
+        float32v ys = FS::Floor( y );
+        float32v zs = FS::Floor( z );
+
+        int32v x0 = FS::Convert<int32_t>( xs ) * int32v( Primes::X );
+        int32v y0 = FS::Convert<int32_t>( ys ) * int32v( Primes::Y );
+        int32v z0 = FS::Convert<int32_t>( zs ) * int32v( Primes::Z );
+        int32v x1 = x0 + int32v( Primes::X );
+        int32v y1 = y0 + int32v( Primes::Y );
+        int32v z1 = z0 + int32v( Primes::Z );
+
+        float32v xs1 = InterpHermite( x - xs );
+        float32v ys1 = InterpHermite( y - ys );
+        float32v zs1 = InterpHermite( z - zs );
+        float32v xs0 = float32v( 1 ) - xs1;
+        float32v ys0 = float32v( 1 ) - ys1;
+        float32v zs0 = float32v( 1 ) - zs1;
+
+        float32v normalise( 1.0f / (0x3ff / 2.0f) );
 
     #define GRADIENT_COORD( _x, _y, _z )\
-        int32v hash##_x##_y##_z = FnUtils::HashPrimesHB( seed, x##_x, y##_y, z##_z );\
-        float32v x##_x##_y##_z = FS_Converti32_f32( hash##_x##_y##_z & int32v( 0x3ff ) );\
-        float32v y##_x##_y##_z = FS_Converti32_f32( (hash##_x##_y##_z >> 10) & int32v( 0x3ff ) );\
-        float32v z##_x##_y##_z = FS_Converti32_f32( (hash##_x##_y##_z >> 20) & int32v( 0x3ff ) );
+        int32v hash##_x##_y##_z = HashPrimesHB( seed, x##_x, y##_y, z##_z );\
+        float32v contrib##_x##_y##_z = xs##_x * ys##_y * zs##_z;\
+        xWarp = FS::FMulAdd( contrib##_x##_y##_z, FS::Convert<float>( hash##_x##_y##_z & int32v( 0x3ff ) ), xWarp );\
+        yWarp = FS::FMulAdd( contrib##_x##_y##_z, FS::Convert<float>( (hash##_x##_y##_z >> 11) & int32v( 0x3ff ) ), yWarp );\
+        zWarp = FS::FMulAdd( contrib##_x##_y##_z, FS::Convert<float>( FS::BitShiftRightZeroExtend( hash##_x##_y##_z, 22 ) ), zWarp )
+
+        int32v hash000 = HashPrimesHB( seed, x0, y0, z0 );
+        float32v contrib000 = xs0 * ys0 * zs0;
+        float32v xWarp = contrib000 * FS::Convert<float>( hash000 & int32v( 0x3ff ) );
+        float32v yWarp = contrib000 * FS::Convert<float>( (hash000 >> 11) & int32v( 0x3ff ) );
+        float32v zWarp = contrib000 * FS::Convert<float>( FS::BitShiftRightZeroExtend( hash000, 22 ) );
 
-        GRADIENT_COORD( 0, 0, 0 );
         GRADIENT_COORD( 1, 0, 0 );
         GRADIENT_COORD( 0, 1, 0 );
         GRADIENT_COORD( 1, 1, 0 );
@@ -105,58 +118,61 @@ public:
 
     #undef GRADIENT_COORD
 
-        float32v x0z = FnUtils::Lerp( FnUtils::Lerp( x000, x100, xs ), FnUtils::Lerp( x010, x110, xs ), ys );
-        float32v y0z = FnUtils::Lerp( FnUtils::Lerp( y000, y100, xs ), FnUtils::Lerp( y010, y110, xs ), ys );
-        float32v z0z = FnUtils::Lerp( FnUtils::Lerp( z000, z100, xs ), FnUtils::Lerp( z010, z110, xs ), ys );
-                   
-        float32v x1z = FnUtils::Lerp( FnUtils::Lerp( x001, x101, xs ), FnUtils::Lerp( x011, x111, xs ), ys );
-        float32v y1z = FnUtils::Lerp( FnUtils::Lerp( y001, y101, xs ), FnUtils::Lerp( y011, y111, xs ), ys );
-        float32v z1z = FnUtils::Lerp( FnUtils::Lerp( z001, z101, xs ), FnUtils::Lerp( z011, z111, xs ), ys );
-
-        float32v normalise = float32v( 1.0f / (0x3ff / 2.0f) );
-
-        float32v xWarp = (FnUtils::Lerp( x0z, x1z, zs ) - float32v( 0x3ff / 2.0f )) * normalise;
-        float32v yWarp = (FnUtils::Lerp( y0z, y1z, zs ) - float32v( 0x3ff / 2.0f )) * normalise;
-        float32v zWarp = (FnUtils::Lerp( z0z, z1z, zs ) - float32v( 0x3ff / 2.0f )) * normalise;
+        xWarp = FS::FMulSub( xWarp, normalise, float32v( 1 ) );
+        yWarp = FS::FMulSub( yWarp, normalise, float32v( 1 ) );
+        zWarp = FS::FMulSub( zWarp, normalise, float32v( 1 ) );
 
-        xOut = FS_FMulAdd_f32( xWarp, warpAmp, xOut );
-        yOut = FS_FMulAdd_f32( yWarp, warpAmp, yOut );
-        zOut = FS_FMulAdd_f32( zWarp, warpAmp, zOut );
+        xOut = FS::FMulAdd( xWarp, warpAmp, xOut );
+        yOut = FS::FMulAdd( yWarp, warpAmp, yOut );
+        zOut = FS::FMulAdd( zWarp, warpAmp, zOut );
 
-        float32v warpLengthSq = FS_FMulAdd_f32( xWarp, xWarp, FS_FMulAdd_f32( yWarp, yWarp, zWarp * zWarp ) );
+        float32v warpLengthSq = FS::FMulAdd( xWarp, xWarp, FS::FMulAdd( yWarp, yWarp, zWarp * zWarp ) );
 
-        return warpLengthSq * FS_InvSqrt_f32( warpLengthSq );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq );
     }
             
-    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const final
+    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const
     {
-        float32v xs = FS_Floor_f32( x );
-        float32v ys = FS_Floor_f32( y );
-        float32v zs = FS_Floor_f32( z );
-        float32v ws = FS_Floor_f32( w );
-
-        int32v x0 = FS_Convertf32_i32( xs ) * int32v( FnPrimes::X );
-        int32v y0 = FS_Convertf32_i32( ys ) * int32v( FnPrimes::Y );
-        int32v z0 = FS_Convertf32_i32( zs ) * int32v( FnPrimes::Z );
-        int32v w0 = FS_Convertf32_i32( ws ) * int32v( FnPrimes::W );
-        int32v x1 = x0 + int32v( FnPrimes::X );
-        int32v y1 = y0 + int32v( FnPrimes::Y );
-        int32v z1 = z0 + int32v( FnPrimes::Z );
-        int32v w1 = w0 + int32v( FnPrimes::W );
-
-        xs = FnUtils::InterpHermite( x - xs );
-        ys = FnUtils::InterpHermite( y - ys );
-        zs = FnUtils::InterpHermite( z - zs );
-        ws = FnUtils::InterpHermite( w - ws );
+        float32v xs = FS::Floor( x );
+        float32v ys = FS::Floor( y );
+        float32v zs = FS::Floor( z );
+        float32v ws = FS::Floor( w );
+
+        int32v x0 = FS::Convert<int32_t>( xs ) * int32v( Primes::X );
+        int32v y0 = FS::Convert<int32_t>( ys ) * int32v( Primes::Y );
+        int32v z0 = FS::Convert<int32_t>( zs ) * int32v( Primes::Z );
+        int32v w0 = FS::Convert<int32_t>( ws ) * int32v( Primes::W );
+        int32v x1 = x0 + int32v( Primes::X );
+        int32v y1 = y0 + int32v( Primes::Y );
+        int32v z1 = z0 + int32v( Primes::Z );
+        int32v w1 = w0 + int32v( Primes::W );
+
+        float32v xs1 = InterpHermite( x - xs );
+        float32v ys1 = InterpHermite( y - ys );
+        float32v zs1 = InterpHermite( z - zs );
+        float32v ws1 = InterpHermite( w - ws );
+        float32v xs0 = float32v( 1 ) - xs1;
+        float32v ys0 = float32v( 1 ) - ys1;
+        float32v zs0 = float32v( 1 ) - zs1;
+        float32v ws0 = float32v( 1 ) - ws1;
+
+        float32v normalise( 1.0f / (0xff / 2.0f) );
 
     #define GRADIENT_COORD( _x, _y, _z, _w )\
-        int32v hash##_x##_y##_z##_w = FnUtils::HashPrimesHB( seed, x##_x, y##_y, z##_z, w##_w );\
-        float32v x##_x##_y##_z##_w = FS_Converti32_f32( hash##_x##_y##_z##_w & int32v( 0xff ) );\
-        float32v y##_x##_y##_z##_w = FS_Converti32_f32( (hash##_x##_y##_z##_w >> 8) & int32v( 0xff ) );\
-        float32v z##_x##_y##_z##_w = FS_Converti32_f32( (hash##_x##_y##_z##_w >> 16) & int32v( 0xff ) );\
-        float32v w##_x##_y##_z##_w = FS_Converti32_f32( (hash##_x##_y##_z##_w >> 24) & int32v( 0xff ) );
+        int32v hash##_x##_y##_z##_w = HashPrimesHB( seed, x##_x, y##_y, z##_z, w##_w );\
+        float32v contrib##_x##_y##_z##_w = xs##_x * ys##_y * zs##_z * ws##_w;\
+        xWarp = FS::FMulAdd( contrib##_x##_y##_z##_w, FS::Convert<float>( hash##_x##_y##_z##_w & int32v( 0xff ) ), xWarp );\
+        yWarp = FS::FMulAdd( contrib##_x##_y##_z##_w, FS::Convert<float>( (hash##_x##_y##_z##_w >> 8) & int32v( 0xff ) ), yWarp );\
+        zWarp = FS::FMulAdd( contrib##_x##_y##_z##_w, FS::Convert<float>( (hash##_x##_y##_z##_w >> 16) & int32v( 0xff ) ), zWarp );\
+        wWarp = FS::FMulAdd( contrib##_x##_y##_z##_w, FS::Convert<float>( FS::BitShiftRightZeroExtend( hash##_x##_y##_z##_w, 24 ) ), wWarp )
+
+        int32v hash0000 = HashPrimesHB( seed, x0, y0, z0, w0 );
+        float32v contrib0000 = xs0 * ys0 * zs0 * ws0;
+        float32v xWarp = contrib0000 * FS::Convert<float>( hash0000 & int32v( 0xff ) );
+        float32v yWarp = contrib0000 * FS::Convert<float>( (hash0000 >> 8) & int32v( 0xff ) );
+        float32v zWarp = contrib0000 * FS::Convert<float>( (hash0000 >> 16) & int32v( 0xff ) );
+        float32v wWarp = contrib0000 * FS::Convert<float>( FS::BitShiftRightZeroExtend( hash0000, 24 ) );
 
-        GRADIENT_COORD( 0, 0, 0, 0 );
         GRADIENT_COORD( 1, 0, 0, 0 );
         GRADIENT_COORD( 0, 1, 0, 0 );
         GRADIENT_COORD( 1, 1, 0, 0 );
@@ -175,31 +191,19 @@ public:
 
     #undef GRADIENT_COORD
 
-        float32v x0w = FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp( x0000, x1000, xs ), FnUtils::Lerp( x0100, x1100, xs ), ys ), FnUtils::Lerp( FnUtils::Lerp( x0010, x1010, xs ), FnUtils::Lerp( x0110, x1110, xs ), ys ), zs );
-        float32v y0w = FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp( y0000, y1000, xs ), FnUtils::Lerp( y0100, y1100, xs ), ys ), FnUtils::Lerp( FnUtils::Lerp( y0010, y1010, xs ), FnUtils::Lerp( y0110, y1110, xs ), ys ), zs );
-        float32v z0w = FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp( z0000, z1000, xs ), FnUtils::Lerp( z0100, z1100, xs ), ys ), FnUtils::Lerp( FnUtils::Lerp( z0010, z1010, xs ), FnUtils::Lerp( z0110, z1110, xs ), ys ), zs );
-        float32v w0w = FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp( w0000, w1000, xs ), FnUtils::Lerp( w0100, w1100, xs ), ys ), FnUtils::Lerp( FnUtils::Lerp( w0010, w1010, xs ), FnUtils::Lerp( w0110, w1110, xs ), ys ), zs );
-
-        float32v x1w = FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp( x0001, x1001, xs ), FnUtils::Lerp( x0101, x1101, xs ), ys ), FnUtils::Lerp( FnUtils::Lerp( x0011, x1011, xs ), FnUtils::Lerp( x0111, x1111, xs ), ys ), zs );
-        float32v y1w = FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp( y0001, y1001, xs ), FnUtils::Lerp( y0101, y1101, xs ), ys ), FnUtils::Lerp( FnUtils::Lerp( y0011, y1011, xs ), FnUtils::Lerp( y0111, y1111, xs ), ys ), zs );
-        float32v z1w = FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp( z0001, z1001, xs ), FnUtils::Lerp( z0101, z1101, xs ), ys ), FnUtils::Lerp( FnUtils::Lerp( z0011, z1011, xs ), FnUtils::Lerp( z0111, z1111, xs ), ys ), zs );
-        float32v w1w = FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp( w0001, w1001, xs ), FnUtils::Lerp( w0101, w1101, xs ), ys ), FnUtils::Lerp( FnUtils::Lerp( w0011, w1011, xs ), FnUtils::Lerp( w0111, w1111, xs ), ys ), zs );                        
-
-        float32v normalise = float32v( 1.0f / (0xff / 2.0f) );
-
-        float32v xWarp = (FnUtils::Lerp( x0w, x1w, ws ) - float32v( 0xff / 2.0f )) * normalise;
-        float32v yWarp = (FnUtils::Lerp( y0w, y1w, ws ) - float32v( 0xff / 2.0f )) * normalise;
-        float32v zWarp = (FnUtils::Lerp( z0w, z1w, ws ) - float32v( 0xff / 2.0f )) * normalise;
-        float32v wWarp = (FnUtils::Lerp( w0w, w1w, ws ) - float32v( 0xff / 2.0f )) * normalise;
+        xWarp = FS::FMulSub( xWarp, normalise, float32v( 1 ) );
+        yWarp = FS::FMulSub( yWarp, normalise, float32v( 1 ) );
+        zWarp = FS::FMulSub( zWarp, normalise, float32v( 1 ) );
+        wWarp = FS::FMulSub( wWarp, normalise, float32v( 1 ) );
 
-        xOut = FS_FMulAdd_f32( xWarp, warpAmp, xOut );
-        yOut = FS_FMulAdd_f32( yWarp, warpAmp, yOut );
-        zOut = FS_FMulAdd_f32( zWarp, warpAmp, zOut );
-        wOut = FS_FMulAdd_f32( wWarp, warpAmp, wOut );
+        xOut = FS::FMulAdd( xWarp, warpAmp, xOut );
+        yOut = FS::FMulAdd( yWarp, warpAmp, yOut );
+        zOut = FS::FMulAdd( zWarp, warpAmp, zOut );
+        wOut = FS::FMulAdd( wWarp, warpAmp, wOut );
 
-        float32v warpLengthSq = FS_FMulAdd_f32( xWarp, xWarp, FS_FMulAdd_f32( yWarp, yWarp, FS_FMulAdd_f32( zWarp, zWarp, wWarp * wWarp ) ) );
+        float32v warpLengthSq = FS::FMulAdd( xWarp, xWarp, FS::FMulAdd( yWarp, yWarp, FS::FMulAdd( zWarp, zWarp, wWarp * wWarp ) ) );
 
-        return warpLengthSq * FS_InvSqrt_f32( warpLengthSq );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq );
     }
 };
 
diff --git a/include/FastNoise/Generators/DomainWarpFractal.h b/include/FastNoise/Generators/DomainWarpFractal.h
index 73dc3c47..d9399db5 100644
--- a/include/FastNoise/Generators/DomainWarpFractal.h
+++ b/include/FastNoise/Generators/DomainWarpFractal.h
@@ -7,7 +7,6 @@ namespace FastNoise
     class DomainWarpFractalProgressive : public virtual Fractal<DomainWarp>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -15,11 +14,18 @@ namespace FastNoise
     template<>
     struct MetadataT<DomainWarpFractalProgressive> : MetadataT<Fractal<DomainWarp>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
-        MetadataT() : MetadataT<Fractal<DomainWarp>>( "Domain Warp Source"  )
+        MetadataT() : MetadataT<Fractal<DomainWarp>>( { "Domain Warp Source", "Uses the algorithm from this domain warp node for each octave of the fractal" }, false )
         {
             groups.push_back( "Domain Warp" );
+            groups.push_back( "Fractal" );
+
+            description =
+                "The original input position is passed into the first domain warp octave\n"
+                "The warped output position from the previous octave is passed into\n"
+                "the next octave's input position and so on for each octave\n"
+                "The final position is used to generate the source node on the attached domain warp node";
         }
     };
 #endif
@@ -27,19 +33,22 @@ namespace FastNoise
     class DomainWarpFractalIndependant : public virtual Fractal<DomainWarp>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
 #ifdef FASTNOISE_METADATA
     template<>
-    struct MetadataT<DomainWarpFractalIndependant> : MetadataT<Fractal<DomainWarp>>
+    struct MetadataT<DomainWarpFractalIndependant> : MetadataT<DomainWarpFractalProgressive> // Inherits from DomainWarpFractalProgressive just to avoid duplicate code
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
-        MetadataT() : MetadataT<Fractal<DomainWarp>>( "Domain Warp Source"  )
+        MetadataT()
         {
-            groups.push_back( "Domain Warp" );
+            description =
+                "The original input position is passed into all domain warp octaves\n"
+                "The warped offset from all octaves is accumulated\n"
+                "and added to the original input position\n"
+                "This position is used to generate the source node on the attached domain warp node";
         }
     };
 #endif
diff --git a/include/FastNoise/Generators/DomainWarpFractal.inl b/include/FastNoise/Generators/DomainWarpFractal.inl
index ad7057e8..56186148 100644
--- a/include/FastNoise/Generators/DomainWarpFractal.inl
+++ b/include/FastNoise/Generators/DomainWarpFractal.inl
@@ -1,15 +1,12 @@
-#include "FastSIMD/InlInclude.h"
-
 #include "DomainWarpFractal.h"
 
-template<typename FS>
-class FS_T<FastNoise::DomainWarpFractalProgressive, FS> : public virtual FastNoise::DomainWarpFractalProgressive, public FS_T<FastNoise::Fractal<FastNoise::DomainWarp>, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DomainWarpFractalProgressive, SIMD> final : public virtual FastNoise::DomainWarpFractalProgressive, public FastSIMD::DispatchClass<FastNoise::Fractal<FastNoise::DomainWarp>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         auto* warp = this->GetSourceSIMD( mSource );
 
@@ -27,7 +24,7 @@ class FS_T<FastNoise::DomainWarpFractalProgressive, FS> : public virtual FastNoi
         {
             seedInc -= int32v( -1 );
             freq *= lacunarity;
-            amp *= FnUtils::Lerp( float32v( 1 ), float32v( 1 ) - strength, weightedStrength );
+            amp *= Lerp( float32v( 1 ), float32v( 1 ) - strength, weightedStrength );
             amp *= gain;
             strength = warp->Warp( seedInc, amp, (pos * freq)..., pos... );
         }
@@ -36,14 +33,13 @@ class FS_T<FastNoise::DomainWarpFractalProgressive, FS> : public virtual FastNoi
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::DomainWarpFractalIndependant, FS> : public virtual FastNoise::DomainWarpFractalIndependant, public FS_T<FastNoise::Fractal<FastNoise::DomainWarp>, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DomainWarpFractalIndependant, SIMD> final : public virtual FastNoise::DomainWarpFractalIndependant, public FastSIMD::DispatchClass<FastNoise::Fractal<FastNoise::DomainWarp>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         return [this, seed] ( std::remove_reference_t<P>... noisePos, std::remove_reference_t<P>... warpPos )
         {
@@ -63,7 +59,7 @@ class FS_T<FastNoise::DomainWarpFractalIndependant, FS> : public virtual FastNoi
             {
                 seedInc -= int32v( -1 );
                 freq *= lacunarity;
-                amp *= FnUtils::Lerp( float32v( 1 ), float32v( 1 ) - strength, weightedStrength );
+                amp *= Lerp( float32v( 1 ), float32v( 1 ) - strength, weightedStrength );
                 amp *= gain;
                 strength = warp->Warp( seedInc, amp, (noisePos * freq)..., warpPos... );
             }
diff --git a/include/FastNoise/Generators/DomainWarpSimplex.h b/include/FastNoise/Generators/DomainWarpSimplex.h
new file mode 100644
index 00000000..669056fa
--- /dev/null
+++ b/include/FastNoise/Generators/DomainWarpSimplex.h
@@ -0,0 +1,52 @@
+#pragma once
+#include "Generator.h"
+#include "DomainWarp.h"
+
+namespace FastNoise
+{
+    class DomainWarpSimplex : public virtual DomainWarp // DomainWarp node declaration carrying a selectable VectorizationScheme
+    {
+    public:
+        const Metadata& GetMetadata() const override;
+        // Stores the scheme; the SIMD DispatchClass switches on it to choose the Warp_2D/3D/4D instantiation.
+        void SetVectorizationScheme( VectorizationScheme value ) { mVectorizationScheme = value; }
+
+    protected:
+        VectorizationScheme mVectorizationScheme = VectorizationScheme::OrthogonalGradientMatrix; // value used until the setter is called
+    };
+
+#ifdef FASTNOISE_METADATA
+    template<>
+    struct MetadataT<DomainWarpSimplex> : MetadataT<DomainWarp> // extends the DomainWarp metadata with this node's extra variable
+    {
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
+
+        MetadataT()
+        {
+            this->AddVariableEnum( // registers the scheme as an enum variable bound to SetVectorizationScheme
+                { "Vectorization Scheme", "Construction used by the noise to produce a vector output" },
+                VectorizationScheme::OrthogonalGradientMatrix, &DomainWarpSimplex::SetVectorizationScheme,
+                kVectorizationScheme_Strings
+            );
+        }
+    };
+#endif
+
+    class DomainWarpSuperSimplex : public virtual DomainWarpSimplex // reuses DomainWarpSimplex's scheme setter and member
+    {
+    public:
+        const Metadata& GetMetadata() const override; // declared only; no definition appears in this header
+    };
+
+#ifdef FASTNOISE_METADATA
+    template<>
+    struct MetadataT<DomainWarpSuperSimplex> : MetadataT<DomainWarpSimplex> // inherits the "Vectorization Scheme" variable registered by the base constructor
+    {
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
+
+        MetadataT()
+        { // empty: no variables are added beyond what MetadataT<DomainWarpSimplex> registers
+        }
+    };
+#endif
+}
diff --git a/include/FastNoise/Generators/DomainWarpSimplex.inl b/include/FastNoise/Generators/DomainWarpSimplex.inl
new file mode 100644
index 00000000..5eaa2774
--- /dev/null
+++ b/include/FastNoise/Generators/DomainWarpSimplex.inl
@@ -0,0 +1,1102 @@
+#include "DomainWarpSimplex.h"
+#include "Utils.inl"
+
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DomainWarpSimplex, SIMD> final : public virtual FastNoise::DomainWarpSimplex, public FastSIMD::DispatchClass<FastNoise::DomainWarp, SIMD>
+{
+public:
+    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const final
+    {
+        // 2D entry point: forwards to the Warp_2D instantiation matching the configured scheme.
+        if( mVectorizationScheme == VectorizationScheme::GradientOuterProduct )
+        {
+            return Warp_2D<VectorizationScheme::GradientOuterProduct>( seed, warpAmp, x, y, xOut, yOut );
+        }
+
+        // OrthogonalGradientMatrix, and any other value, take the orthogonal-matrix path.
+        return Warp_2D<VectorizationScheme::OrthogonalGradientMatrix>( seed, warpAmp, x, y, xOut, yOut );
+    }
+
+    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& zOut ) const final
+    {
+        // 3D entry point: forwards to the Warp_3D instantiation matching the configured scheme.
+        if( mVectorizationScheme == VectorizationScheme::GradientOuterProduct )
+        {
+            return Warp_3D<VectorizationScheme::GradientOuterProduct>( seed, warpAmp, x, y, z, xOut, yOut, zOut );
+        }
+
+        // OrthogonalGradientMatrix, and any other value, take the orthogonal-matrix path.
+        return Warp_3D<VectorizationScheme::OrthogonalGradientMatrix>( seed, warpAmp, x, y, z, xOut, yOut, zOut );
+    }
+
+    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const final
+    {
+        // 4D entry point: forwards to the Warp_4D instantiation matching the configured scheme.
+        if( mVectorizationScheme == VectorizationScheme::GradientOuterProduct )
+        {
+            return Warp_4D<VectorizationScheme::GradientOuterProduct>( seed, warpAmp, x, y, z, w, xOut, yOut, zOut, wOut );
+        }
+
+        // OrthogonalGradientMatrix, and any other value, take the orthogonal-matrix path.
+        return Warp_4D<VectorizationScheme::OrthogonalGradientMatrix>( seed, warpAmp, x, y, z, w, xOut, yOut, zOut, wOut );
+    }
+
+protected:
+    template<VectorizationScheme Scheme> // 2D simplex warp; Scheme picks the contribution variant and the output scale constant
+    float32v FS_VECTORCALL Warp_2D( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const
+    {
+        constexpr double kRoot3 = 1.7320508075688772935274463415059;
+        constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 );
+        constexpr double kUnskew2 = -1.0 / ( kRoot3 + 3.0 );
+        constexpr double kFalloffRadiusSquared = 0.5;
+        // Skew the input, then split each skewed coordinate into its floor (cell base) and fractional part.
+        float32v skewDelta = float32v( kSkew2 ) * ( x + y );
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+
+        float32v xSkewedBase = FS::Floor( xSkewed );
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase;
+        float32v dySkewed = ySkewed - ySkewedBase;
+        // Hash inputs: integer cell coordinate multiplied by the per-axis prime.
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X );
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        // Compares the fractional parts; this mask selects which neighbour becomes the second vertex below.
+        mask32v xGreaterEqualY = dxSkewed >= dySkewed;
+        // Offset from the base vertex after applying the unskew term.
+        float32v unskewDelta = float32v( kUnskew2 ) * ( dxSkewed + dySkewed );
+        float32v dx0 = dxSkewed + unskewDelta;
+        float32v dy0 = dySkewed + unskewDelta;
+        // Offsets from the mask-selected second vertex (dx1/dy1) and from the +X+Y vertex (dx2/dy2).
+        float32v dx1 = FS::MaskedIncrement( ~xGreaterEqualY, dx0 ) - float32v( kUnskew2 + 1 );
+        float32v dy1 = FS::MaskedIncrement( xGreaterEqualY, dy0 ) - float32v( kUnskew2 + 1 );
+        float32v dx2 = dx0 - float32v( kUnskew2 * 2 + 1 );
+        float32v dy2 = dy0 - float32v( kUnskew2 * 2 + 1 );
+        // Falloff per vertex: radius^2 minus squared offset (assuming FNMulAdd( a, b, c ) is c - a * b); falloff2 is derived from falloff0.
+        float32v falloff0 = FS::FNMulAdd( dx0, dx0, FS::FNMulAdd( dy0, dy0, float32v( kFalloffRadiusSquared ) ) );
+        float32v falloff1 = FS::FNMulAdd( dx1, dx1, FS::FNMulAdd( dy1, dy1, float32v( kFalloffRadiusSquared ) ) );
+        float32v falloff2 = falloff0 + FS::FMulAdd( unskewDelta,
+            float32v( -4.0 * ( kRoot3 + 2.0 ) / ( kRoot3 + 3.0 ) ),
+            float32v( -2.0 / 3.0 ) );
+        // Clamp negatives to zero, then raise to the 4th power by squaring twice.
+        falloff0 = FS::Max( falloff0, float32v( 0 ) );
+        falloff1 = FS::Max( falloff1, float32v( 0 ) );
+        falloff2 = FS::Max( falloff2, float32v( 0 ) );
+
+        falloff0 *= falloff0; falloff0 *= falloff0;
+        falloff1 *= falloff1; falloff1 *= falloff1;
+        falloff2 *= falloff2; falloff2 *= falloff2;
+        // Accumulators passed to each vertex contribution call (presumably updated by reference — confirm in Utils.inl).
+        float32v valueX( 0 );
+        float32v valueY( 0 );
+
+        ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimedBase ), dx0, dy0, falloff0, valueX, valueY );
+        ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, FS::MaskedAdd( xGreaterEqualY, xPrimedBase, int32v( Primes::X ) ), FS::InvMaskedAdd( xGreaterEqualY, yPrimedBase, int32v( Primes::Y ) ) ), dx1, dy1, falloff1, valueX, valueY );
+        ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ) ), dx2, dy2, falloff2, valueX, valueY );
+        // Scheme-specific constant folded into warpAmp before the outputs are offset.
+        constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ?
+            49.918426513671875 / 2.0 :
+            70.1480577066486 );
+
+        warpAmp *= float32v( kBounding );
+        xOut = FS::FMulAdd( valueX, warpAmp, xOut );
+        yOut = FS::FMulAdd( valueY, warpAmp, yOut );
+        // Returns lenSq * InvSqrt( lenSq ) * warpAmp. NOTE(review): for lenSq == 0 this is 0 * InvSqrt( 0 ) — confirm FS::InvSqrt's result there.
+        float32v warpLengthSq = FS::FMulAdd( valueY, valueY, valueX * valueX );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp;
+    }
+
+    template<VectorizationScheme Scheme> // 3D simplex warp; Scheme picks the contribution variant, the optional reflection and the scale constant
+    float32v FS_VECTORCALL Warp_3D( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& zOut ) const
+    {
+        constexpr double kSkew3 = 1.0 / 3.0;
+        constexpr double kReflectUnskew3 = -1.0 / 2.0;
+        constexpr double kFalloffRadiusSquared = 0.6;
+        // Skew the input, then split each skewed coordinate into its floor (cell base) and fractional part.
+        float32v skewDelta = float32v( kSkew3 ) * ( x + y + z );
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v zSkewed = z + skewDelta;
+
+        float32v xSkewedBase = FS::Floor( xSkewed );
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v zSkewedBase = FS::Floor( zSkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase;
+        float32v dySkewed = ySkewed - ySkewedBase;
+        float32v dzSkewed = zSkewed - zSkewedBase;
+        // Hash inputs: integer cell coordinate multiplied by the per-axis prime.
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X );
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        int32v zPrimedBase = FS::Convert<int32_t>( zSkewedBase ) * int32v( Primes::Z );
+        // Pairwise ordering of the fractional parts; combined below to pick the second and third vertices.
+        mask32v xGreaterEqualY = dxSkewed >= dySkewed;
+        mask32v yGreaterEqualZ = dySkewed >= dzSkewed;
+        mask32v xGreaterEqualZ = dxSkewed >= dzSkewed;
+
+        float32v unskewDelta = float32v( kReflectUnskew3 ) * ( dxSkewed + dySkewed + dzSkewed );
+        float32v dx0 = dxSkewed + unskewDelta;
+        float32v dy0 = dySkewed + unskewDelta;
+        float32v dz0 = dzSkewed + unskewDelta;
+        // Second-vertex masks; maskZ1 is consumed inverted (InvMaskedSub / InvMaskedAdd below).
+        mask32v maskX1 = xGreaterEqualY & xGreaterEqualZ;
+        mask32v maskY1 = FS::BitwiseAndNot( yGreaterEqualZ, xGreaterEqualY );
+        mask32v maskZ1 = xGreaterEqualZ | yGreaterEqualZ; // Inv masked
+
+        mask32v nMaskX2 = xGreaterEqualY | xGreaterEqualZ; // Inv masked
+        mask32v nMaskY2 = FS::BitwiseAndNot( xGreaterEqualY, yGreaterEqualZ );
+        mask32v nMaskZ2 = xGreaterEqualZ & yGreaterEqualZ;
+
+        float32v dx3 = dx0 - float32v( kReflectUnskew3 * 3 + 1 );
+        float32v dy3 = dy0 - float32v( kReflectUnskew3 * 3 + 1 );
+        float32v dz3 = dz0 - float32v( kReflectUnskew3 * 3 + 1 );
+        float32v dx1 = FS::MaskedSub( maskX1, dx3, float32v( 1 ) ); // kReflectUnskew3 * 3 + 1 = kReflectUnskew3, so dx0 - kReflectUnskew3 = dx3
+        float32v dy1 = FS::MaskedSub( maskY1, dy3, float32v( 1 ) );
+        float32v dz1 = FS::InvMaskedSub( maskZ1, dz3, float32v( 1 ) );
+        float32v dx2 = FS::MaskedIncrement( ~nMaskX2, dx0 ); // kReflectUnskew3 * 2 + 1 = 0, so dx0 - ( kReflectUnskew3 * 2 + 1 ) = dx0
+        float32v dy2 = FS::MaskedIncrement( nMaskY2, dy0 );
+        float32v dz2 = FS::MaskedIncrement( nMaskZ2, dz0 );
+        // Falloff per vertex: radius^2 minus squared offset (assuming FNMulAdd( a, b, c ) is c - a * b); falloff3 is derived from falloff0.
+        float32v falloff0 = FS::FNMulAdd( dz0, dz0, FS::FNMulAdd( dy0, dy0, FS::FNMulAdd( dx0, dx0, float32v( kFalloffRadiusSquared ) ) ) );
+        float32v falloff1 = FS::FNMulAdd( dz1, dz1, FS::FNMulAdd( dy1, dy1, FS::FNMulAdd( dx1, dx1, float32v( kFalloffRadiusSquared ) ) ) );
+        float32v falloff2 = FS::FNMulAdd( dz2, dz2, FS::FNMulAdd( dy2, dy2, FS::FNMulAdd( dx2, dx2, float32v( kFalloffRadiusSquared ) ) ) );
+        float32v falloff3 = falloff0 - ( unskewDelta + float32v( 3.0 / 4.0 ) );
+
+        falloff0 = FS::Max( falloff0, float32v( 0 ) );
+        falloff1 = FS::Max( falloff1, float32v( 0 ) );
+        falloff2 = FS::Max( falloff2, float32v( 0 ) );
+        falloff3 = FS::Max( falloff3, float32v( 0 ) );
+
+        falloff0 *= falloff0; falloff0 *= falloff0;
+        falloff1 *= falloff1; falloff1 *= falloff1;
+        falloff2 *= falloff2; falloff2 *= falloff2;
+        falloff3 *= falloff3; falloff3 *= falloff3;
+
+        float32v valueX( 0 );
+        float32v valueY( 0 );
+        float32v valueZ( 0 );
+
+        ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase ), dx0, dy0, dz0, falloff0, valueX, valueY, valueZ );
+        ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, FS::MaskedAdd( maskX1, xPrimedBase, int32v( Primes::X ) ), FS::MaskedAdd( maskY1, yPrimedBase, int32v( Primes::Y ) ), FS::InvMaskedAdd( maskZ1, zPrimedBase, int32v( Primes::Z ) ) ), dx1, dy1, dz1, falloff1, valueX, valueY, valueZ );
+        ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, FS::MaskedAdd( nMaskX2, xPrimedBase, int32v( Primes::X ) ), FS::InvMaskedAdd( nMaskY2, yPrimedBase, int32v( Primes::Y ) ), FS::InvMaskedAdd( nMaskZ2, zPrimedBase, int32v( Primes::Z ) ) ), dx2, dy2, dz2, falloff2, valueX, valueY, valueZ );
+        ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ), zPrimedBase + int32v( Primes::Z ) ), dx3, dy3, dz3, falloff3, valueX, valueY, valueZ );
+
+        if constexpr( Scheme != VectorizationScheme::OrthogonalGradientMatrix )
+        {
+            // Match gradient orientation: reflect through the plane perpendicular to (1,1,1), i.e. v - (2/3) * sum( v ).
+            constexpr double kReflect3D = -2.0 / 3.0;
+            float32v valueTransformDelta = float32v( kReflect3D ) * ( valueX + valueY + valueZ );
+            valueX += valueTransformDelta;
+            valueY += valueTransformDelta;
+            valueZ += valueTransformDelta;
+        }
+
+        constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ?
+            32.69428253173828125 / 1.4142135623730951 :
+            16.281631889139874 );
+
+        warpAmp *= float32v( kBounding );
+        xOut = FS::FMulAdd( valueX, warpAmp, xOut );
+        yOut = FS::FMulAdd( valueY, warpAmp, yOut );
+        zOut = FS::FMulAdd( valueZ, warpAmp, zOut );
+        // Returns lenSq * InvSqrt( lenSq ) * warpAmp. NOTE(review): for lenSq == 0 this is 0 * InvSqrt( 0 ) — confirm FS::InvSqrt's result there.
+        float32v warpLengthSq = FS::FMulAdd( valueZ, valueZ, FS::FMulAdd( valueY, valueY, valueX * valueX ) );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp;
+    }
+
+    template<VectorizationScheme Scheme> // 4D simplex warp; Scheme picks the contribution variant and the output scale constant
+    float32v FS_VECTORCALL Warp_4D( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const
+    {
+        constexpr double kRoot5 = 2.2360679774997896964091736687313;
+        constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 );
+        constexpr double kUnskew4 = -1.0 / ( kRoot5 + 5.0 );
+        constexpr double kFalloffRadiusSquared = 0.6;
+        // Skew the input, then split each skewed coordinate into its floor (cell base) and fractional part.
+        float32v skewDelta = float32v( kSkew4 ) * ( x + y + z + w );
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v zSkewed = z + skewDelta;
+        float32v wSkewed = w + skewDelta;
+
+        float32v xSkewedBase = FS::Floor( xSkewed );
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v zSkewedBase = FS::Floor( zSkewed );
+        float32v wSkewedBase = FS::Floor( wSkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase;
+        float32v dySkewed = ySkewed - ySkewedBase;
+        float32v dzSkewed = zSkewed - zSkewedBase;
+        float32v dwSkewed = wSkewed - wSkewedBase;
+        // Hash inputs: integer cell coordinate multiplied by the per-axis prime.
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X );
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        int32v zPrimedBase = FS::Convert<int32_t>( zSkewedBase ) * int32v( Primes::Z );
+        int32v wPrimedBase = FS::Convert<int32_t>( wSkewedBase ) * int32v( Primes::W );
+
+        float32v unskewDelta = float32v( kUnskew4 ) * ( dxSkewed + dySkewed + dzSkewed + dwSkewed );
+        float32v dx0 = dxSkewed + unskewDelta;
+        float32v dy0 = dySkewed + unskewDelta;
+        float32v dz0 = dzSkewed + unskewDelta;
+        float32v dw0 = dwSkewed + unskewDelta;
+        // Rank each axis by how many pairwise comparisons it wins (0..3); on ties the earlier axis gets the increment.
+        int32v rankX( 0 );
+        int32v rankY( 0 );
+        int32v rankZ( 0 );
+        int32v rankW( 0 );
+
+        mask32v xGreaterEqualY = dx0 >= dy0;
+        rankX = FS::MaskedIncrement( xGreaterEqualY, rankX );
+        rankY = FS::MaskedIncrement( ~xGreaterEqualY, rankY );
+
+        mask32v xGreaterEqualZ = dx0 >= dz0;
+        rankX = FS::MaskedIncrement( xGreaterEqualZ, rankX );
+        rankZ = FS::MaskedIncrement( ~xGreaterEqualZ, rankZ );
+
+        mask32v xGreaterEqualW = dx0 >= dw0;
+        rankX = FS::MaskedIncrement( xGreaterEqualW, rankX );
+        rankW = FS::MaskedIncrement( ~xGreaterEqualW, rankW );
+
+        mask32v yGreaterEqualZ = dy0 >= dz0;
+        rankY = FS::MaskedIncrement( yGreaterEqualZ, rankY );
+        rankZ = FS::MaskedIncrement( ~yGreaterEqualZ, rankZ );
+
+        mask32v yGreaterEqualW = dy0 >= dw0;
+        rankY = FS::MaskedIncrement( yGreaterEqualW, rankY );
+        rankW = FS::MaskedIncrement( ~yGreaterEqualW, rankW );
+
+        mask32v zGreaterEqualW = dz0 >= dw0;
+        rankZ = FS::MaskedIncrement( zGreaterEqualW, rankZ );
+        rankW = FS::MaskedIncrement( ~zGreaterEqualW, rankW );
+        // Vertex N (1..3) steps along every axis whose rank exceeds 3 - N.
+        mask32v maskX1 = rankX > int32v( 2 );
+        mask32v maskY1 = rankY > int32v( 2 );
+        mask32v maskZ1 = rankZ > int32v( 2 );
+        mask32v maskW1 = rankW > int32v( 2 );
+
+        mask32v maskX2 = rankX > int32v( 1 );
+        mask32v maskY2 = rankY > int32v( 1 );
+        mask32v maskZ2 = rankZ > int32v( 1 );
+        mask32v maskW2 = rankW > int32v( 1 );
+
+        mask32v maskX3 = rankX > int32v( 0 );
+        mask32v maskY3 = rankY > int32v( 0 );
+        mask32v maskZ3 = rankZ > int32v( 0 );
+        mask32v maskW3 = rankW > int32v( 0 );
+        // Offsets from vertices 1..4: subtract 1 on the stepped axes plus N * kUnskew4 on every axis.
+        float32v dx1 = FS::MaskedSub( maskX1, dx0, float32v( 1 ) ) - float32v( kUnskew4 );
+        float32v dy1 = FS::MaskedSub( maskY1, dy0, float32v( 1 ) ) - float32v( kUnskew4 );
+        float32v dz1 = FS::MaskedSub( maskZ1, dz0, float32v( 1 ) ) - float32v( kUnskew4 );
+        float32v dw1 = FS::MaskedSub( maskW1, dw0, float32v( 1 ) ) - float32v( kUnskew4 );
+        float32v dx2 = FS::MaskedSub( maskX2, dx0, float32v( 1 ) ) - float32v( kUnskew4 * 2 );
+        float32v dy2 = FS::MaskedSub( maskY2, dy0, float32v( 1 ) ) - float32v( kUnskew4 * 2 );
+        float32v dz2 = FS::MaskedSub( maskZ2, dz0, float32v( 1 ) ) - float32v( kUnskew4 * 2 );
+        float32v dw2 = FS::MaskedSub( maskW2, dw0, float32v( 1 ) ) - float32v( kUnskew4 * 2 );
+        float32v dx3 = FS::MaskedSub( maskX3, dx0, float32v( 1 ) ) - float32v( kUnskew4 * 3 );
+        float32v dy3 = FS::MaskedSub( maskY3, dy0, float32v( 1 ) ) - float32v( kUnskew4 * 3 );
+        float32v dz3 = FS::MaskedSub( maskZ3, dz0, float32v( 1 ) ) - float32v( kUnskew4 * 3 );
+        float32v dw3 = FS::MaskedSub( maskW3, dw0, float32v( 1 ) ) - float32v( kUnskew4 * 3 );
+        float32v dx4 = dx0 - float32v( kUnskew4 * 4 + 1 );
+        float32v dy4 = dy0 - float32v( kUnskew4 * 4 + 1 );
+        float32v dz4 = dz0 - float32v( kUnskew4 * 4 + 1 );
+        float32v dw4 = dw0 - float32v( kUnskew4 * 4 + 1 );
+        // Falloff per vertex: radius^2 minus squared offset (assuming FNMulAdd( a, b, c ) is c - a * b); falloff4 is derived from falloff0.
+        float32v falloff0 = FS::FNMulAdd( dw0, dw0, FS::FNMulAdd( dz0, dz0, FS::FNMulAdd( dy0, dy0, FS::FNMulAdd( dx0, dx0, float32v( kFalloffRadiusSquared ) ) ) ) );
+        float32v falloff1 = FS::FNMulAdd( dw1, dw1, FS::FNMulAdd( dz1, dz1, FS::FNMulAdd( dy1, dy1, FS::FNMulAdd( dx1, dx1, float32v( kFalloffRadiusSquared ) ) ) ) );
+        float32v falloff2 = FS::FNMulAdd( dw2, dw2, FS::FNMulAdd( dz2, dz2, FS::FNMulAdd( dy2, dy2, FS::FNMulAdd( dx2, dx2, float32v( kFalloffRadiusSquared ) ) ) ) );
+        float32v falloff3 = FS::FNMulAdd( dw3, dw3, FS::FNMulAdd( dz3, dz3, FS::FNMulAdd( dy3, dy3, FS::FNMulAdd( dx3, dx3, float32v( kFalloffRadiusSquared ) ) ) ) );
+        float32v falloff4 = falloff0 + FS::FMulAdd( unskewDelta,
+            float32v( -4.0 * ( kRoot5 + 3.0 ) / ( kRoot5 + 5.0 ) ),
+            float32v( -4.0 / 5.0 ) );
+
+        falloff0 = FS::Max( falloff0, float32v( 0 ) );
+        falloff1 = FS::Max( falloff1, float32v( 0 ) );
+        falloff2 = FS::Max( falloff2, float32v( 0 ) );
+        falloff3 = FS::Max( falloff3, float32v( 0 ) );
+        falloff4 = FS::Max( falloff4, float32v( 0 ) );
+
+        falloff0 *= falloff0; falloff0 *= falloff0;
+        falloff1 *= falloff1; falloff1 *= falloff1;
+        falloff2 *= falloff2; falloff2 *= falloff2;
+        falloff3 *= falloff3; falloff3 *= falloff3;
+        falloff4 *= falloff4; falloff4 *= falloff4;
+
+        float32v valueX( 0 );
+        float32v valueY( 0 );
+        float32v valueZ( 0 );
+        float32v valueW( 0 );
+
+        ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimedBase ), dx0, dy0, dz0, dw0, falloff0, valueX, valueY, valueZ, valueW );
+        ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed,
+            FS::MaskedAdd( maskX1, xPrimedBase, int32v( Primes::X ) ),
+            FS::MaskedAdd( maskY1, yPrimedBase, int32v( Primes::Y ) ),
+            FS::MaskedAdd( maskZ1, zPrimedBase, int32v( Primes::Z ) ),
+            FS::MaskedAdd( maskW1, wPrimedBase, int32v( Primes::W ) ) ), dx1, dy1, dz1, dw1, falloff1, valueX, valueY, valueZ, valueW );
+        ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed,
+            FS::MaskedAdd( maskX2, xPrimedBase, int32v( Primes::X ) ),
+            FS::MaskedAdd( maskY2, yPrimedBase, int32v( Primes::Y ) ),
+            FS::MaskedAdd( maskZ2, zPrimedBase, int32v( Primes::Z ) ),
+            FS::MaskedAdd( maskW2, wPrimedBase, int32v( Primes::W ) ) ), dx2, dy2, dz2, dw2, falloff2, valueX, valueY, valueZ, valueW );
+        ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed,
+            FS::MaskedAdd( maskX3, xPrimedBase, int32v( Primes::X ) ),
+            FS::MaskedAdd( maskY3, yPrimedBase, int32v( Primes::Y ) ),
+            FS::MaskedAdd( maskZ3, zPrimedBase, int32v( Primes::Z ) ),
+            FS::MaskedAdd( maskW3, wPrimedBase, int32v( Primes::W ) ) ), dx3, dy3, dz3, dw3, falloff3, valueX, valueY, valueZ, valueW );
+        ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed,
+            xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ), zPrimedBase + int32v( Primes::Z ), wPrimedBase + int32v( Primes::W ) ),
+            dx4, dy4, dz4, dw4, falloff4, valueX, valueY, valueZ, valueW );
+        // Scheme-specific constant folded into warpAmp before the outputs are offset.
+        constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ?
+            33.653125584827855 / 1.4142135623730951 :
+            30.88161777516092 );
+        warpAmp *= float32v( kBounding );
+        xOut = FS::FMulAdd( valueX, warpAmp, xOut );
+        yOut = FS::FMulAdd( valueY, warpAmp, yOut );
+        zOut = FS::FMulAdd( valueZ, warpAmp, zOut );
+        wOut = FS::FMulAdd( valueW, warpAmp, wOut ); // w must be displaced too: valueW is part of the returned length below
+
+        float32v warpLengthSq = FS::FMulAdd( valueW, valueW, FS::FMulAdd( valueZ, valueZ, FS::FMulAdd( valueY, valueY, valueX * valueX ) ) );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp;
+    }
+};
+
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DomainWarpSuperSimplex, SIMD> final : public virtual FastNoise::DomainWarpSuperSimplex, public FastSIMD::DispatchClass<FastNoise::DomainWarp, SIMD>
+{
+public:
+    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const final
+    {
+        // 2D entry point: forwards to the Warp_2D instantiation matching the configured scheme.
+        if( mVectorizationScheme == VectorizationScheme::GradientOuterProduct )
+        {
+            return Warp_2D<VectorizationScheme::GradientOuterProduct>( seed, warpAmp, x, y, xOut, yOut );
+        }
+
+        // OrthogonalGradientMatrix, and any other value, take the orthogonal-matrix path.
+        return Warp_2D<VectorizationScheme::OrthogonalGradientMatrix>( seed, warpAmp, x, y, xOut, yOut );
+    }
+
+    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& zOut ) const final
+    {
+        // 3D entry point: forwards to the Warp_3D instantiation matching the configured scheme.
+        if( mVectorizationScheme == VectorizationScheme::GradientOuterProduct )
+        {
+            return Warp_3D<VectorizationScheme::GradientOuterProduct>( seed, warpAmp, x, y, z, xOut, yOut, zOut );
+        }
+
+        // OrthogonalGradientMatrix, and any other value, take the orthogonal-matrix path.
+        return Warp_3D<VectorizationScheme::OrthogonalGradientMatrix>( seed, warpAmp, x, y, z, xOut, yOut, zOut );
+    }
+
+    float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const final
+    {
+        // 4D entry point: forwards to the Warp_4D instantiation matching the configured scheme.
+        if( mVectorizationScheme == VectorizationScheme::GradientOuterProduct )
+        {
+            return Warp_4D<VectorizationScheme::GradientOuterProduct>( seed, warpAmp, x, y, z, w, xOut, yOut, zOut, wOut );
+        }
+
+        // OrthogonalGradientMatrix, and any other value, take the orthogonal-matrix path.
+        return Warp_4D<VectorizationScheme::OrthogonalGradientMatrix>( seed, warpAmp, x, y, z, w, xOut, yOut, zOut, wOut );
+    }
+
+protected:
+    template<VectorizationScheme Scheme> // 2D super-simplex warp: four vertex contributions; Scheme picks the variant and scale constant
+    float32v FS_VECTORCALL Warp_2D( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const
+    {
+        constexpr double kRoot3 = 1.7320508075688772935274463415059;
+        constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 );
+        constexpr double kUnskew2 = -1.0 / ( kRoot3 + 3.0 );
+        constexpr double kFalloffRadiusSquared = 2.0 / 3.0;
+        // Skew the input, split into floor (cell base) and fractional part, and build the per-axis hash inputs.
+        float32v skewDelta = float32v( kSkew2 ) * ( x + y );
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v xSkewedBase = FS::Floor( xSkewed );
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase;
+        float32v dySkewed = ySkewed - ySkewedBase;
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X );
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        // forwardXY: fractional parts sum past 1. forwardX / forwardY choose between the alternative third and fourth vertices.
+        mask32v forwardXY = dxSkewed + dySkewed > float32v( 1.0f );
+        float32v boundaryXY = FS::Masked( forwardXY, float32v( -1.0f ) );
+        mask32v forwardX = FS::FMulAdd( dxSkewed, float32v( -2.0f ), dySkewed ) < boundaryXY;
+        mask32v forwardY = FS::FMulAdd( dySkewed, float32v( -2.0f ), dxSkewed ) < boundaryXY;
+
+        float32v unskewDelta = float32v( kUnskew2 ) * ( dxSkewed + dySkewed );
+        float32v dxBase = dxSkewed + unskewDelta;
+        float32v dyBase = dySkewed + unskewDelta;
+
+        float32v falloffBase0;
+        float32v valueX( 0 );
+        float32v valueY( 0 );
+
+        // Vertex <0, 0>
+        {
+            int32v hash = HashPrimes( seed, xPrimedBase, yPrimedBase );
+            falloffBase0 = FS::FNMulAdd( dxBase, dxBase, FS::FNMulAdd( dyBase, dyBase, float32v( kFalloffRadiusSquared ) ) );
+            float32v falloff = falloffBase0; falloff *= falloff; falloff *= falloff; // no FS::Max clamp here, unlike the last two vertices
+            ApplyVectorContributionSimplex<Scheme>( hash, dxBase, dyBase, falloff, valueX, valueY );
+        }
+
+        // Vertex <1, 1>
+        {
+            int32v hash = HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ) );
+            float32v falloff = FS::FMulAdd( unskewDelta,
+                float32v( -4.0 * ( kRoot3 + 2.0 ) / ( kRoot3 + 3.0 ) ),
+                falloffBase0 - float32v( kFalloffRadiusSquared ) );
+            falloff *= falloff; falloff *= falloff; // no FS::Max clamp here either
+            ApplyVectorContributionSimplex<Scheme>( hash, dxBase - float32v( 2 * kUnskew2 + 1 ), dyBase - float32v( 2 * kUnskew2 + 1 ), falloff, valueX, valueY );
+        }
+
+        float32v xyDelta = FS::Select( forwardXY, float32v( kUnskew2 + 1 ), float32v( -kUnskew2 ) );
+        dxBase -= xyDelta;
+        dyBase -= xyDelta;
+
+        // Vertex <1, 0> or <-1, 0> (forwardXY unset), <2, 1> or <0, 1> (forwardXY set), per the masked adds below
+        {
+            int32v hash = HashPrimes( seed,
+                FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardX, xPrimedBase, int32v( Primes::X * 2 ) ), int32v( Primes::X ) ),
+                FS::MaskedAdd( forwardXY, yPrimedBase, int32v( Primes::Y ) ) );
+            float32v dx = dxBase - FS::Select( forwardX, float32v( 1 + 2 * kUnskew2 ), float32v( -1 ) );
+            float32v dy = FS::MaskedSub( forwardX, dyBase, float32v( 2 * kUnskew2 ) );
+            float32v falloff = FS::Max( FS::FNMulAdd( dx, dx, FS::FNMulAdd( dy, dy, float32v( kFalloffRadiusSquared ) ) ), float32v( 0 ) );
+            falloff *= falloff; falloff *= falloff;
+            ApplyVectorContributionSimplex<Scheme>( hash, dx, dy, falloff, valueX, valueY );
+        }
+
+        // Vertex <0, 1> or <0, -1> (forwardXY unset), <1, 2> or <1, 0> (forwardXY set), per the masked adds below
+        {
+            int32v hash = HashPrimes( seed,
+                FS::MaskedAdd( forwardXY, xPrimedBase, int32v( Primes::X ) ),
+                FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardY, yPrimedBase, int32v( (int32_t)( Primes::Y * 2LL ) ) ), int32v( Primes::Y ) ) );
+            float32v dx = FS::MaskedSub( forwardY, dxBase, float32v( 2 * kUnskew2 ) );
+            float32v dy = dyBase - FS::Select( forwardY, float32v( 1 + 2 * kUnskew2 ), float32v( -1 ) );
+            float32v falloff = FS::Max( FS::FNMulAdd( dx, dx, FS::FNMulAdd( dy, dy, float32v( kFalloffRadiusSquared ) ) ), float32v( 0 ) );
+            falloff *= falloff; falloff *= falloff;
+            ApplyVectorContributionSimplex<Scheme>( hash, dx, dy, falloff, valueX, valueY );
+        }
+        // Scheme-specific constant folded into warpAmp before the outputs are offset.
+        constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ?
+            9.28993664146183 / 2.0 :
+            12.814453124999995 );
+
+        warpAmp *= float32v( kBounding );
+        xOut = FS::FMulAdd( valueX, warpAmp, xOut );
+        yOut = FS::FMulAdd( valueY, warpAmp, yOut );
+        // Returns lenSq * InvSqrt( lenSq ) * warpAmp. NOTE(review): for lenSq == 0 this is 0 * InvSqrt( 0 ) — confirm FS::InvSqrt's result there.
+        float32v warpLengthSq = FS::FMulAdd( valueY, valueY, valueX * valueX );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp;
+    }
+
+    template<VectorizationScheme Scheme>
+    float32v FS_VECTORCALL Warp_3D( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& zOut ) const
+    {
+        constexpr double kSkew3 = 1.0 / 3.0;
+        constexpr double kReflectUnskew3 = -1.0 / 2.0;
+        constexpr double kTwiceUnskew3 = -1.0 / 4.0;
+
+        constexpr double kDistanceSquaredA = 3.0 / 4.0;
+        constexpr double kDistanceSquaredB = 1.0;
+        constexpr double kFalloffRadiusSquared = kDistanceSquaredA;
+
+        float32v skewDelta = float32v( kSkew3 ) * ( x + y + z );
+
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v zSkewed = z + skewDelta;
+        float32v xSkewedBase = FS::Floor( xSkewed );
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v zSkewedBase = FS::Floor( zSkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase;
+        float32v dySkewed = ySkewed - ySkewedBase;
+        float32v dzSkewed = zSkewed - zSkewedBase;
+
+        // From unit cell base, find closest vertex
+        {
+            // Perform a double unskew to get the vector whose dot product with skewed vectors produces the unskewed result.
+            float32v twiceUnskewDelta = float32v( kTwiceUnskew3 ) * ( dxSkewed + dySkewed + dzSkewed );
+            float32v xNormal = dxSkewed + twiceUnskewDelta;
+            float32v yNormal = dySkewed + twiceUnskewDelta;
+            float32v zNormal = dzSkewed + twiceUnskewDelta;
+            float32v xyzNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal
+
+            // Using those, compare scores to determine which vertex is closest.
+            constexpr auto considerVertex = [] ( float32v& maxScore, int32v& moveMaskBits, float32v score, int32v bits ) constexpr
+                {
+                    moveMaskBits = FS::Select( score > maxScore, bits, moveMaskBits );
+                    maxScore = FS::Max( maxScore, score );
+                };
+            float32v maxScore = float32v( 0.375f );
+            int32v moveMaskBits = FS::Masked( xyzNormal > maxScore, int32v( -1 ) );
+            maxScore = FS::Max( maxScore, xyzNormal );
+            considerVertex( maxScore, moveMaskBits, xNormal, 0b001 );
+            considerVertex( maxScore, moveMaskBits, yNormal, 0b010 );
+            considerVertex( maxScore, moveMaskBits, zNormal, 0b100 );
+            maxScore += float32v( 0.125f ) - xyzNormal;
+            considerVertex( maxScore, moveMaskBits, -zNormal, 0b011 );
+            considerVertex( maxScore, moveMaskBits, -yNormal, 0b101 );
+            considerVertex( maxScore, moveMaskBits, -xNormal, 0b110 );
+
+            mask32v moveX = ( moveMaskBits & int32v( 0b001 ) ) != int32v( 0 );
+            mask32v moveY = ( moveMaskBits & int32v( 0b010 ) ) != int32v( 0 );
+            mask32v moveZ = ( moveMaskBits & int32v( 0b100 ) ) != int32v( 0 );
+
+            xSkewedBase = FS::MaskedIncrement( moveX, xSkewedBase );
+            ySkewedBase = FS::MaskedIncrement( moveY, ySkewedBase );
+            zSkewedBase = FS::MaskedIncrement( moveZ, zSkewedBase );
+
+            dxSkewed = FS::MaskedDecrement( moveX, dxSkewed );
+            dySkewed = FS::MaskedDecrement( moveY, dySkewed );
+            dzSkewed = FS::MaskedDecrement( moveZ, dzSkewed );
+        }
+
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X );
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        int32v zPrimedBase = FS::Convert<int32_t>( zSkewedBase ) * int32v( Primes::Z );
+
+        float32v skewedCoordinateSum = dxSkewed + dySkewed + dzSkewed;
+        float32v twiceUnskewDelta = float32v( kTwiceUnskew3 ) * skewedCoordinateSum;
+        float32v xNormal = dxSkewed + twiceUnskewDelta;
+        float32v yNormal = dySkewed + twiceUnskewDelta;
+        float32v zNormal = dzSkewed + twiceUnskewDelta;
+        float32v xyzNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal
+
+        float32v unskewDelta = float32v( kReflectUnskew3 ) * skewedCoordinateSum;
+        float32v dxBase = dxSkewed + unskewDelta;
+        float32v dyBase = dySkewed + unskewDelta;
+        float32v dzBase = dzSkewed + unskewDelta;
+
+        float32v coordinateSum = float32v( 1 + 3 * kReflectUnskew3 ) * skewedCoordinateSum; // dxBase + dyBase + dzBase
+
+        float32v valueX( 0 );
+        float32v valueY( 0 );
+        float32v valueZ( 0 );
+        float32v falloffBaseStemA, falloffBaseStemB;
+
+        // Vertex <0, 0, 0>
+        {
+            float32v falloffBase = FS::FNMulAdd( dzBase, dzBase, FS::FNMulAdd( dyBase, dyBase, FS::FNMulAdd( dxBase, dxBase, float32v( kFalloffRadiusSquared ) ) ) ) * float32v( 0.5f );
+            falloffBaseStemA = falloffBase - float32v( kDistanceSquaredA * 0.5 );
+            falloffBaseStemB = falloffBase - float32v( kDistanceSquaredB * 0.5 );
+            ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase ), dxBase, dyBase, dzBase,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ );
+        }
+
+        // Vertex <1, 1, 1> or <-1, -1, -1>
+        {
+            mask32v signMask = xyzNormal < float32v( 0 );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+            float32v offset = float32v( 3 * kReflectUnskew3 + 1 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset, coordinateSum, falloffBaseStemA ), float32v( 0.0f ) );
+
+            ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimed, yPrimed, zPrimed ), dxBase - offset, dyBase - offset, dzBase - offset,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ );
+        }
+
+        // Vertex <1, 1, 0> or <-1, -1, 0>
+        {
+            mask32v signMask = xyzNormal < zNormal;
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+            float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign;
+
+            float32v falloffBase = FS::Min( ( sign ^ dzBase ) - falloffBaseStemB, float32v( 0.0f ) );
+
+            ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase ), dxBase, dyBase, dzBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ );
+        }
+
+        // Vertex <1, 0, 1> or <-1, 0, -1>
+        {
+            mask32v signMask = xyzNormal < yNormal;
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+            float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign;
+
+            float32v falloffBase = FS::Min( ( sign ^ dyBase ) - falloffBaseStemB, float32v( 0.0f ) );
+
+            ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed ), dxBase, dyBase - offset0, dzBase,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ );
+        }
+
+        // Vertex <0, 1, 1> or <0, -1, -1>
+        {
+            mask32v signMask = xyzNormal < xNormal;
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+            float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign;
+
+            float32v falloffBase = FS::Min( ( sign ^ dxBase ) - falloffBaseStemB, float32v( 0.0f ) );
+
+            ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed ), dxBase - offset0, dyBase, dzBase,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ );
+        }
+
+        // Vertex <1, 0, 0> or <-1, 0, 0>
+        {
+            mask32v signMask = xNormal < float32v( 0 );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+            float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dxBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase ), dxBase + offset0, dyBase - offset0, dzBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ );
+        }
+
+        // Vertex <0, 1, 0> or <0, -1, 0>
+        {
+            mask32v signMask = yNormal < float32v( 0 );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+            float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dyBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase ), dxBase - offset0, dyBase + offset0, dzBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ );
+        }
+
+        // Vertex <0, 0, 1> or <0, 0, -1>
+        {
+            mask32v signMask = zNormal < float32v( 0 );
+
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+            float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dzBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionCommon<Scheme>( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed ), dxBase - offset0, dyBase - offset0, dzBase + offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ );
+        }
+
+        if constexpr( Scheme != VectorizationScheme::OrthogonalGradientMatrix )
+        {
+            // Match gradient orientation.
+            constexpr double kReflect3D = -2.0 / 3.0;
+            float32v valueTransformDelta = float32v( kReflect3D ) * ( valueX + valueY + valueZ );
+            valueX += valueTransformDelta;
+            valueY += valueTransformDelta;
+            valueZ += valueTransformDelta;
+        }
+
+        constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ?
+            144.736422163332608 / 1.4142135623730951 :
+            37.63698669623629 );
+
+        warpAmp *= float32v( kBounding );
+        xOut = FS::FMulAdd( valueX, warpAmp, xOut );
+        yOut = FS::FMulAdd( valueY, warpAmp, yOut );
+        zOut = FS::FMulAdd( valueZ, warpAmp, zOut );
+
+        float32v warpLengthSq = FS::FMulAdd( valueZ, valueZ, FS::FMulAdd( valueY, valueY, valueX * valueX ) );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp;
+    }
+
+    // 4D simplex-style domain warp.
+    //
+    // Skews the input point, snaps it to the closest vertex of its unit cell, then accumulates
+    // vector contributions from 16 lattice vertices: the closest vertex itself plus every non-zero
+    // 0/1 combination of axis steps, each taken in the positive or negative direction per lane.
+    //
+    // seed              - hash seed combined with the primed lattice coordinates.
+    // warpAmp           - warp amplitude; scaled by kBounding before it is applied.
+    // x, y, z, w        - input position.
+    // xOut .. wOut      - in/out: the scaled warp vector is added to each of these.
+    // Returns the length of the accumulated warp vector multiplied by the scaled amplitude.
+    template<VectorizationScheme Scheme>
+    float32v FS_VECTORCALL Warp_4D( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const
+    {
+        constexpr double kRoot5 = 2.2360679774997896964091736687313;
+        constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 );
+        constexpr double kUnskew4 = -1.0 / ( kRoot5 + 5.0 );
+        constexpr double kTwiceUnskew4 = -1.0 / 5.0;
+
+        // Squared unskewed distance from the base vertex to its neighbours:
+        // A for neighbours with one or four moved axes, B for two or three moved axes.
+        constexpr double kDistanceSquaredA = 4.0 / 5.0;
+        constexpr double kDistanceSquaredB = 6.0 / 5.0;
+        constexpr double kFalloffRadiusSquared = kDistanceSquaredA;
+
+        float32v skewDelta = float32v( kSkew4 ) * ( x + y + z + w );
+
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v zSkewed = z + skewDelta;
+        float32v wSkewed = w + skewDelta;
+        float32v xSkewedBase = FS::Floor( xSkewed );
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v zSkewedBase = FS::Floor( zSkewed );
+        float32v wSkewedBase = FS::Floor( wSkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase;
+        float32v dySkewed = ySkewed - ySkewedBase;
+        float32v dzSkewed = zSkewed - zSkewedBase;
+        float32v dwSkewed = wSkewed - wSkewedBase;
+
+        // From unit cell base, find closest vertex
+        {
+            // Perform a double unskew to get the vector whose dot product with skewed vectors produces the unskewed result.
+            float32v twiceUnskewDelta = float32v( kTwiceUnskew4 ) * ( dxSkewed + dySkewed + dzSkewed + dwSkewed );
+            float32v xNormal = dxSkewed + twiceUnskewDelta;
+            float32v yNormal = dySkewed + twiceUnskewDelta;
+            float32v zNormal = dzSkewed + twiceUnskewDelta;
+            float32v wNormal = dwSkewed + twiceUnskewDelta;
+            float32v xyzwNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + wNormal
+
+            // Using those, compare scores to determine which vertex is closest.
+            // Each bit of moveMaskBits marks one axis (x = bit 0 .. w = bit 3) to step along.
+            // maxScore is re-based between groups so that each group only has to pass the
+            // part of its score that differs from the previous group's.
+            constexpr auto considerVertex = [] ( float32v& maxScore, int32v& moveMaskBits, float32v score, int32v bits ) constexpr
+                {
+                    moveMaskBits = FS::Select( score > maxScore, bits, moveMaskBits );
+                    maxScore = FS::Max( maxScore, score );
+                };
+            float32v maxScore = float32v( 0.6f ) - xyzwNormal;
+            int32v moveMaskBits = FS::Masked( float32v( 0.2f ) > maxScore, int32v( -1 ) );
+            maxScore = FS::Max( maxScore, float32v( 0.2f ) );
+            considerVertex( maxScore, moveMaskBits, -wNormal, 0b0111 );
+            considerVertex( maxScore, moveMaskBits, -zNormal, 0b1011 );
+            considerVertex( maxScore, moveMaskBits, -yNormal, 0b1101 );
+            considerVertex( maxScore, moveMaskBits, -xNormal, 0b1110 );
+            maxScore += xyzwNormal - float32v( 0.2f );
+            considerVertex( maxScore, moveMaskBits, xNormal, 0b0001 );
+            considerVertex( maxScore, moveMaskBits, yNormal, 0b0010 );
+            considerVertex( maxScore, moveMaskBits, zNormal, 0b0100 );
+            considerVertex( maxScore, moveMaskBits, wNormal, 0b1000 );
+            maxScore += float32v( 0.2f ) - xNormal;
+            considerVertex( maxScore, moveMaskBits, yNormal, 0b0011 );
+            considerVertex( maxScore, moveMaskBits, zNormal, 0b0101 );
+            considerVertex( maxScore, moveMaskBits, wNormal, 0b1001 );
+            maxScore += xNormal;
+            considerVertex( maxScore, moveMaskBits, yNormal + zNormal, 0b0110 );
+            maxScore -= wNormal;
+            considerVertex( maxScore, moveMaskBits, yNormal, 0b1010 );
+            considerVertex( maxScore, moveMaskBits, zNormal, 0b1100 );
+
+            mask32v moveX = ( moveMaskBits & int32v( 0b0001 ) ) != int32v( 0 );
+            mask32v moveY = ( moveMaskBits & int32v( 0b0010 ) ) != int32v( 0 );
+            mask32v moveZ = ( moveMaskBits & int32v( 0b0100 ) ) != int32v( 0 );
+            mask32v moveW = ( moveMaskBits & int32v( 0b1000 ) ) != int32v( 0 );
+
+            xSkewedBase = FS::MaskedIncrement( moveX, xSkewedBase );
+            ySkewedBase = FS::MaskedIncrement( moveY, ySkewedBase );
+            zSkewedBase = FS::MaskedIncrement( moveZ, zSkewedBase );
+            wSkewedBase = FS::MaskedIncrement( moveW, wSkewedBase );
+
+            dxSkewed = FS::MaskedDecrement( moveX, dxSkewed );
+            dySkewed = FS::MaskedDecrement( moveY, dySkewed );
+            dzSkewed = FS::MaskedDecrement( moveZ, dzSkewed );
+            dwSkewed = FS::MaskedDecrement( moveW, dwSkewed );
+        }
+
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X );
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        int32v zPrimedBase = FS::Convert<int32_t>( zSkewedBase ) * int32v( Primes::Z );
+        int32v wPrimedBase = FS::Convert<int32_t>( wSkewedBase ) * int32v( Primes::W );
+
+        // Recompute the normals relative to the closest vertex chosen above.
+        float32v skewedCoordinateSum = dxSkewed + dySkewed + dzSkewed + dwSkewed;
+        float32v twiceUnskewDelta = float32v( kTwiceUnskew4 ) * skewedCoordinateSum;
+        float32v xNormal = dxSkewed + twiceUnskewDelta;
+        float32v yNormal = dySkewed + twiceUnskewDelta;
+        float32v zNormal = dzSkewed + twiceUnskewDelta;
+        float32v wNormal = dwSkewed + twiceUnskewDelta;
+        float32v xyzwNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + wNormal
+
+        float32v unskewDelta = float32v( kUnskew4 ) * skewedCoordinateSum;
+        float32v dxBase = dxSkewed + unskewDelta;
+        float32v dyBase = dySkewed + unskewDelta;
+        float32v dzBase = dzSkewed + unskewDelta;
+        float32v dwBase = dwSkewed + unskewDelta;
+
+        float32v coordinateSum = float32v( 1 + 4 * kUnskew4 ) * skewedCoordinateSum; // dxBase + dyBase + dzBase + dwBase
+
+        float32v valueX( 0 );
+        float32v valueY( 0 );
+        float32v valueZ( 0 );
+        float32v valueW( 0 );
+        float32v falloffBaseStemA, falloffBaseStemB;
+
+        // Vertex <0, 0, 0, 0>
+        {
+            // falloffBase = ( kFalloffRadiusSquared - |d|^2 ) / 2. The stems pre-subtract half the
+            // neighbour's squared distance, so a neighbour's falloff is stem + dot( d, neighbourOffset ).
+            float32v falloffBase = FS::FNMulAdd( dwBase, dwBase, FS::FNMulAdd( dzBase, dzBase, FS::FNMulAdd( dyBase, dyBase, FS::FNMulAdd( dxBase, dxBase, float32v( kFalloffRadiusSquared ) ) ) ) ) * float32v( 0.5f );
+            falloffBaseStemA = falloffBase - float32v( kDistanceSquaredA * 0.5 );
+            falloffBaseStemB = falloffBase - float32v( kDistanceSquaredB * 0.5 );
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimedBase ), dxBase, dyBase, dzBase, dwBase,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <1, 1, 1, 1> or <-1, -1, -1, -1>
+        {
+            mask32v signMask = xyzwNormal < float32v( 0 );
+            // The constant is only the float sign bit (1 << 31); XOR-ing `sign` into a value
+            // negates it in the lanes selected by signMask (presumably zero in the other lanes).
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset = float32v( 4 * kUnskew4 + 1 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset, coordinateSum, falloffBaseStemA ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimed, yPrimed, zPrimed, wPrimed ), dxBase - offset, dyBase - offset, dzBase - offset, dwBase - offset,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <1, 1, 1, 0> or <-1, -1, -1, 0>
+        {
+            mask32v signMask = xyzwNormal < wNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dwBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimed, yPrimed, zPrimed, wPrimedBase ), dxBase - offset1, dyBase - offset1, dzBase - offset1, dwBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <1, 1, 0, 1> or <-1, -1, 0, -1>
+        {
+            mask32v signMask = xyzwNormal < zNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dzBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase, wPrimed ), dxBase - offset1, dyBase - offset1, dzBase - offset0, dwBase - offset1,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <1, 0, 1, 1> or <-1, 0, -1, -1>
+        {
+            mask32v signMask = xyzwNormal < yNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dyBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed, wPrimed ), dxBase - offset1, dyBase - offset0, dzBase - offset1, dwBase - offset1,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <0, 1, 1, 1> or <0, -1, -1, -1>
+        {
+            mask32v signMask = xyzwNormal < xNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dxBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed, wPrimed ), dxBase - offset0, dyBase - offset1, dzBase - offset1, dwBase - offset1,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <1, 0, 0, 0> or <-1, 0, 0, 0>
+        {
+            mask32v signMask = xNormal < float32v( 0 );
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+
+            float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dxBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase, wPrimedBase ), dxBase - offset1, dyBase - offset0, dzBase - offset0, dwBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <1, 1, 0, 0> or <-1, -1, 0, 0>
+        {
+            mask32v signMask = xNormal < -yNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dyBase ) ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase, wPrimedBase ), dxBase - offset1, dyBase - offset1, dzBase - offset0, dwBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <1, 0, 1, 0> or <-1, 0, -1, 0>
+        {
+            mask32v signMask = xNormal < -zNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dzBase ) ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed, wPrimedBase ), dxBase - offset1, dyBase - offset0, dzBase - offset1, dwBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <1, 0, 0, 1> or <-1, 0, 0, -1>
+        {
+            mask32v signMask = xNormal < -wNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dwBase ) ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase, wPrimed ), dxBase - offset1, dyBase - offset0, dzBase - offset0, dwBase - offset1,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <0, 1, 0, 0> or <0, -1, 0, 0>
+        {
+            mask32v signMask = yNormal < float32v( 0 );
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+
+            float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dyBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase, wPrimedBase ), dxBase - offset0, dyBase - offset1, dzBase - offset0, dwBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <0, 1, 1, 0> or <0, -1, -1, 0>
+        {
+            mask32v signMask = yNormal < -zNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dyBase + dzBase ) ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed, wPrimedBase ), dxBase - offset0, dyBase - offset1, dzBase - offset1, dwBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <0, 1, 0, 1> or <0, -1, 0, -1>
+        {
+            mask32v signMask = yNormal < -wNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dyBase + dwBase ) ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase, wPrimed ), dxBase - offset0, dyBase - offset1, dzBase - offset0, dwBase - offset1,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <0, 0, 1, 0> or <0, 0, -1, 0>
+        {
+            mask32v signMask = zNormal < float32v( 0 );
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dzBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed, wPrimedBase ), dxBase - offset0, dyBase - offset0, dzBase - offset1, dwBase - offset0,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <0, 0, 1, 1> or <0, 0, -1, -1>
+        {
+            mask32v signMask = zNormal < -wNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dzBase + dwBase ) ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed, wPrimed ), dxBase - offset0, dyBase - offset0, dzBase - offset1, dwBase - offset1,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        // Vertex <0, 0, 0, 1> or <0, 0, 0, -1>
+        {
+            mask32v signMask = wNormal < float32v( 0 );
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( kUnskew4 ) ^ sign;
+
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dwBase ), float32v( 0.0f ) );
+
+            ApplyVectorContributionSimplex<Scheme>( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimed ), dxBase - offset0, dyBase - offset0, dzBase - offset0, dwBase - offset1,
+                ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW );
+        }
+
+        constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ?
+            115.21625311930542 / 1.4142135623730951 :
+            48.80058117543753 );
+
+        warpAmp *= float32v( kBounding );
+        xOut = FS::FMulAdd( valueX, warpAmp, xOut );
+        yOut = FS::FMulAdd( valueY, warpAmp, yOut );
+        zOut = FS::FMulAdd( valueZ, warpAmp, zOut );
+        // The W axis must be displaced too: valueW is accumulated above and counted in the
+        // returned length, so leaving wOut untouched would drop one component of the warp.
+        wOut = FS::FMulAdd( valueW, warpAmp, wOut );
+
+        // length = lengthSq * ( 1 / sqrt( lengthSq ) ), scaled by the bounded amplitude.
+        float32v warpLengthSq = FS::FMulAdd( valueW, valueW, FS::FMulAdd( valueZ, valueZ, FS::FMulAdd( valueY, valueY, valueX * valueX ) ) );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp;
+    }
+};
diff --git a/include/FastNoise/Generators/Fractal.h b/include/FastNoise/Generators/Fractal.h
index 5896b916..8e6e8534 100644
--- a/include/FastNoise/Generators/Fractal.h
+++ b/include/FastNoise/Generators/Fractal.h
@@ -42,10 +42,12 @@ namespace FastNoise
     template<typename T>
     struct MetadataT<Fractal<T>> : MetadataT<Generator>
     {
-        MetadataT( const char* sourceName = "Source" )
+        MetadataT( NameDesc sourceName = "Source", bool addGroup = true )
         {
-            groups.push_back( "Fractal" );
-
+            if( addGroup )
+            {
+                groups.push_back( "Fractal" );
+            }
             this->AddGeneratorSource( sourceName, &Fractal<T>::SetSource );
             this->AddHybridSource( "Gain", 0.5f, &Fractal<T>::SetGain, &Fractal<T>::SetGain );
             this->AddHybridSource( "Weighted Strength", 0.0f, &Fractal<T>::SetWeightedStrength, &Fractal<T>::SetWeightedStrength );
@@ -58,7 +60,6 @@ namespace FastNoise
     class FractalFBm : public virtual Fractal<>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -66,14 +67,13 @@ namespace FastNoise
     template<>
     struct MetadataT<FractalFBm> : MetadataT<Fractal<>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
     };
 #endif
 
     class FractalRidged : public virtual Fractal<>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
@@ -81,28 +81,27 @@ namespace FastNoise
     template<>
     struct MetadataT<FractalRidged> : MetadataT<Fractal<>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
     };
 #endif
 
     class FractalPingPong : public virtual Fractal<>
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetPingPongStrength( float value ) { mPingPongStrength = value; }
         void SetPingPongStrength( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mPingPongStrength, gen ); }
 
     protected:
-        HybridSource mPingPongStrength = 0.0f;
+        HybridSource mPingPongStrength = 2.0f;
     };
 
 #ifdef FASTNOISE_METADATA
     template<>
     struct MetadataT<FractalPingPong> : MetadataT<Fractal<>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
diff --git a/include/FastNoise/Generators/Fractal.inl b/include/FastNoise/Generators/Fractal.inl
index 243d87ec..3dc10a68 100644
--- a/include/FastNoise/Generators/Fractal.inl
+++ b/include/FastNoise/Generators/Fractal.inl
@@ -1,21 +1,18 @@
-#include "FastSIMD/InlInclude.h"
-
 #include "Fractal.h"
 
-template<typename FS, typename T>
-class FS_T<FastNoise::Fractal<T>, FS> : public virtual FastNoise::Fractal<T>, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD, typename T>
+class FastSIMD::DispatchClass<FastNoise::Fractal<T>, SIMD> : public virtual FastNoise::Fractal<T>, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
 
 };
 
-template<typename FS>
-class FS_T<FastNoise::FractalFBm, FS> : public virtual FastNoise::FractalFBm, public FS_T<FastNoise::Fractal<>, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::FractalFBm, SIMD> final : public virtual FastNoise::FractalFBm, public FastSIMD::DispatchClass<FastNoise::Fractal<>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         float32v gain = this->GetSourceValue( mGain  , seed, pos... );
         float32v weightedStrength = this->GetSourceValue( mWeightedStrength, seed, pos... );
@@ -28,7 +25,7 @@ class FS_T<FastNoise::FractalFBm, FS> : public virtual FastNoise::FractalFBm, pu
         for( int i = 1; i < mOctaves; i++ )
         {
             seed -= int32v( -1 );
-            amp *= FnUtils::Lerp( float32v( 1 ), (noise + float32v( 1 )) * float32v( 0.5f ), weightedStrength );
+            amp *= Lerp( float32v( 1 ), (noise + float32v( 1 )) * float32v( 0.5f ), weightedStrength );
             amp *= gain;
 
             noise = this->GetSourceValue( mSource, seed, (pos *= lacunarity)... );
@@ -39,30 +36,29 @@ class FS_T<FastNoise::FractalFBm, FS> : public virtual FastNoise::FractalFBm, pu
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::FractalRidged, FS> : public virtual FastNoise::FractalRidged, public FS_T<FastNoise::Fractal<>, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::FractalRidged, SIMD> final : public virtual FastNoise::FractalRidged, public FastSIMD::DispatchClass<FastNoise::Fractal<>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT(int32v seed, P... pos) const
+    FS_FORCEINLINE float32v GenT(int32v seed, P... pos) const
     {
         float32v gain = this->GetSourceValue( mGain, seed, pos... );
         float32v weightedStrength = this->GetSourceValue( mWeightedStrength, seed, pos... );
         float32v lacunarity( mLacunarity );
         float32v amp( mFractalBounding );
-        float32v noise = FS_Abs_f32( this->GetSourceValue( mSource, seed, pos... ) );
+        float32v noise = FS::Abs( this->GetSourceValue( mSource, seed, pos... ) );
 
         float32v sum = (noise * float32v( -2 ) + float32v( 1 )) * amp;
 
         for( int i = 1; i < mOctaves; i++ )
         {
             seed -= int32v( -1 );
-            amp *= FnUtils::Lerp( float32v( 1 ), float32v( 1 ) - noise, weightedStrength );
+            amp *= Lerp( float32v( 1 ), float32v( 1 ) - noise, weightedStrength );
             amp *= gain;
 
-            noise = FS_Abs_f32( this->GetSourceValue( mSource, seed, (pos *= lacunarity)... ) );
+            noise = FS::Abs( this->GetSourceValue( mSource, seed, (pos *= lacunarity)... ) );
             sum += (noise * float32v( -2 ) + float32v( 1 )) * amp;
         }
 
@@ -70,38 +66,37 @@ class FS_T<FastNoise::FractalRidged, FS> : public virtual FastNoise::FractalRidg
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::FractalPingPong, FS> : public virtual FastNoise::FractalPingPong, public FS_T<FastNoise::Fractal<>, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::FractalPingPong, SIMD> final : public virtual FastNoise::FractalPingPong, public FastSIMD::DispatchClass<FastNoise::Fractal<>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     static float32v PingPong( float32v t )
     {
-        t -= FS_Round_f32( t * float32v( 0.5f ) ) * float32v( 2 );
-        return FS_Select_f32( t < float32v( 1 ), t, float32v( 2 ) - t );
+        t -= FS::Floor( t * float32v( 0.5f ) ) * float32v( 2 ); // subtract 2 * floor( t / 2 ), leaving t in [0, 2)
+        return FS::Select( t < float32v( 1 ), t, float32v( 2 ) - t ); // mirror values at or above 1 back down: triangle wave in [0, 1]
     }
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
-        float32v gain = this->GetSourceValue( mGain  , seed, pos... );
+        float32v gain = this->GetSourceValue( mGain, seed, pos... );
         float32v weightedStrength = this->GetSourceValue( mWeightedStrength, seed, pos... );
         float32v pingPongStrength = this->GetSourceValue( mPingPongStrength, seed, pos... );
         float32v lacunarity( mLacunarity );
-        float32v amp( mFractalBounding );
-        float32v noise = PingPong( (this->GetSourceValue( mSource, seed, pos... ) + float32v( 1 )) * pingPongStrength );
+        float32v amp( mFractalBounding * 2 );
+        float32v noise = PingPong( this->GetSourceValue( mSource, seed, pos... ) * pingPongStrength );
 
-        float32v sum = noise * amp;
+        float32v sum = (noise - float32v( 0.5f )) * amp;
 
         for( int i = 1; i < mOctaves; i++ )
         {
             seed -= int32v( -1 );
-            amp *= FnUtils::Lerp( float32v( 1 ), (noise + float32v( 1 )) * float32v( 0.5f ), weightedStrength );
+            amp *= Lerp( float32v( 1 ), noise, weightedStrength );
             amp *= gain;
 
-            noise = PingPong( (this->GetSourceValue( mSource, seed, (pos *= lacunarity)... ) + float32v( 1 )) * pingPongStrength );
-            sum += noise * amp;
+            noise = PingPong( (this->GetSourceValue( mSource, seed, (pos *= lacunarity)... )) * pingPongStrength );
+            sum += (noise - float32v( 0.5f )) * amp;
         }
 
         return sum;
diff --git a/include/FastNoise/Generators/Generator.h b/include/FastNoise/Generators/Generator.h
index 80d65f35..648b72e5 100644
--- a/include/FastNoise/Generators/Generator.h
+++ b/include/FastNoise/Generators/Generator.h
@@ -2,15 +2,21 @@
 #include <cassert>
 #include <cmath>
 #include <algorithm>
+#include <atomic>
 
-#include "FastNoise/FastNoise_Config.h"
+#ifdef FASTNOISE_METADATA
+#include <tuple>
+#endif
 
-#if !defined( FASTNOISE_METADATA ) && defined( __INTELLISENSE__ )
-//#define FASTNOISE_METADATA
+#include "FastNoise/Utility/Config.h"
+
+#if !defined( FASTNOISE_METADATA ) && ( defined( __INTELLISENSE__ ) || defined( __CLION_IDE__ ) )
+#define FASTNOISE_METADATA
 #endif
 
 namespace FastNoise
 {
+    // Dimension
     enum class Dim
     {
         X, Y, Z, W,
@@ -29,6 +35,7 @@ namespace FastNoise
         Manhattan,
         Hybrid,
         MaxAxis,
+        Minkowski,
     };
 
     constexpr static const char* kDistanceFunction_Strings[] =
@@ -38,6 +45,31 @@ namespace FastNoise
         "Manhattan",
         "Hybrid",
         "Max Axis",
+        "Minkowski",
+    };
+
+    enum class SimplexType
+    {
+        Standard,
+        Super // NOTE(review): the second entry of kSimplexType_Strings is "Smooth", not "Super" - confirm which name is intended
+    };
+
+    constexpr static const char* kSimplexType_Strings[] = // display names, presumably indexed by SimplexType value
+    {
+        "Standard",
+        "Smooth", // NOTE(review): the matching enumerator is SimplexType::Super - confirm the label is intended
+    };
+
+    enum class VectorizationScheme
+    {
+        OrthogonalGradientMatrix,
+        GradientOuterProduct
+    };
+
+    constexpr static const char* kVectorizationScheme_Strings[] =
+    {
+        "Orthogonal Gradient Matrix",
+        "Gradient Outer Product",
     };
 
     struct OutputMinMax
@@ -81,40 +113,46 @@ namespace FastNoise
     {
         float constant;
 
-        HybridSourceT( float f = 0.0f )
-        {
-            constant = f;
-        }
+        constexpr HybridSourceT( float f = 0.0f ) : constant( f ) { }
     };
 
+    namespace Internal
+    {
+        void BumpNodeRefences( const Generator*, bool ); // NOTE(review): "Refences" looks like a typo for "References"; Generator befriends this exact spelling, so any rename must cover the friend declaration and the definition together
+    }
+
     class FASTNOISE_API Generator
     {
     public:
         template<typename T>
         friend struct MetadataT;
 
+        Generator() = default;
+        Generator( const Generator& ) = delete;
+        Generator( Generator&& ) = delete;
+
         virtual ~Generator() = default;
 
-        virtual FastSIMD::eLevel GetSIMDLevel() const = 0;
+        virtual FastSIMD::FeatureSet GetActiveFeatureSet() const = 0;
         virtual const Metadata& GetMetadata() const = 0;
 
         virtual OutputMinMax GenUniformGrid2D( float* out,
             int xStart, int yStart,
             int xSize,  int ySize,
-            float frequency, int seed ) const = 0;
+            int seed ) const = 0;
 
         virtual OutputMinMax GenUniformGrid3D( float* out,
             int xStart, int yStart, int zStart, 
             int xSize,  int ySize,  int zSize, 
-            float frequency, int seed ) const = 0;
+            int seed ) const = 0;
 
         virtual OutputMinMax GenUniformGrid4D( float* out,
             int xStart, int yStart, int zStart, int wStart,
             int xSize,  int ySize,  int zSize,  int wSize,
-            float frequency, int seed ) const = 0;
+            int seed ) const = 0;
 
         virtual OutputMinMax GenTileable2D( float* out,
-            int xSize, int ySize, float frequency, int seed ) const = 0; 
+            int xSize, int ySize, int seed ) const = 0; 
 
         virtual OutputMinMax GenPositionArray2D( float* out, int count,
             const float* xPosArray, const float* yPosArray,
@@ -138,7 +176,7 @@ namespace FastNoise
         {
             static_assert( std::is_base_of<Generator, T>::value, "T must be child of FastNoise::Generator class" );
 
-            assert( !gen.get() || GetSIMDLevel() == gen->GetSIMDLevel() ); // Ensure that all SIMD levels match
+            assert( !gen.get() || GetActiveFeatureSet() == gen->GetActiveFeatureSet() ); // Ensure that all SIMD levels match
 
             SetSourceSIMDPtr( static_cast<const Generator*>( gen.get() ), &memberVariable.simdGeneratorPtr );
             memberVariable.base = gen;
@@ -146,6 +184,11 @@ namespace FastNoise
 
     private:
         virtual void SetSourceSIMDPtr( const Generator* base, const void** simdPtr ) = 0;
+        virtual int32_t ReferencesFetchAdd( int32_t add = 0 ) const noexcept = 0;
+
+        template<typename>
+        friend class SmartNode;
+        friend void Internal::BumpNodeRefences( const Generator*, bool );
     };
 
     using GeneratorSource = GeneratorSourceT<Generator>;
@@ -159,6 +202,9 @@ namespace FastNoise
         T varArray[(int)Dim::Count];
 
         template<typename U = T>
+#if __cplusplus >= 201402L
+        constexpr
+#endif
         PerDimensionVariable( U value = 0 )
         {
             for( T& element : varArray )
@@ -184,12 +230,13 @@ namespace FastNoise
     {
     protected:
         template<typename T, typename U, typename = std::enable_if_t<!std::is_enum_v<T>>>
-        void AddVariable( NameDesc nameDesc, T defaultV, U&& func, T minV = 0, T maxV = 0 )
+        void AddVariable( NameDesc nameDesc, T defaultV, U&& func, T minV = 0, T maxV = 0, float uiDragSpeed = std::is_same_v<T, float> ? Metadata::kDefaultUiDragSpeedFloat : Metadata::kDefaultUiDragSpeedInt )
         {
             MemberVariable member;
             member.name = nameDesc.name;
             member.description = nameDesc.desc;
             member.valueDefault = defaultV;
+            member.valueUiDragSpeed = uiDragSpeed;
             member.valueMin = minV;
             member.valueMax = maxV;
 
@@ -208,13 +255,14 @@ namespace FastNoise
             memberVariables.push_back( member );
         }
 
-        template<typename T, typename U, typename = std::enable_if_t<!std::is_enum_v<T>>>
-        void AddVariable( NameDesc nameDesc, T defaultV, void(U::* func)(T), T minV = 0, T maxV = 0 )
+        template<typename T, typename U, typename V, typename = std::enable_if_t<!std::is_enum_v<T>>>
+        void AddVariable( NameDesc nameDesc, T defaultV, V ( U::*func )( T ), T minV = 0, T maxV = 0, float uiDragSpeed = std::is_same_v<T, float> ? Metadata::kDefaultUiDragSpeedFloat : Metadata::kDefaultUiDragSpeedInt )
         {
             MemberVariable member;
             member.name = nameDesc.name;
             member.description = nameDesc.desc;
             member.valueDefault = defaultV;
+            member.valueUiDragSpeed = uiDragSpeed;
             member.valueMin = minV;
             member.valueMax = maxV;
 
@@ -241,7 +289,7 @@ namespace FastNoise
             member.description = nameDesc.desc;
             member.type = MemberVariable::EEnum;
             member.valueDefault = (int)defaultV;
-            member.enumNames = { enumNames... };
+            ( member.enumNames.push_back( enumNames ), ... );
 
             member.setFunc = [func]( Generator* g, MemberVariable::ValueUnion v )
             {
@@ -264,7 +312,10 @@ namespace FastNoise
             member.description = nameDesc.desc;
             member.type = MemberVariable::EEnum;
             member.valueDefault = (int)defaultV;
-            member.enumNames = { enumNames, enumNames + ENUM_NAMES };
+            for( const char* enumName : enumNames )
+            {
+                member.enumNames.push_back( enumName );
+            }
 
             member.setFunc = [func]( Generator* g, MemberVariable::ValueUnion v )
             {
@@ -280,14 +331,15 @@ namespace FastNoise
         }
 
         template<typename T, typename U, typename = std::enable_if_t<!std::is_enum_v<T>>>
-        void AddPerDimensionVariable( NameDesc nameDesc, T defaultV, U&& func, T minV = 0, T maxV = 0 )
+        void AddPerDimensionVariable( NameDesc nameDesc, T defaultV, U&& func, T minV = 0, T maxV = 0, float uiDragSpeed = std::is_same_v<T, float> ? Metadata::kDefaultUiDragSpeedFloat : Metadata::kDefaultUiDragSpeedInt )
         {
-            for( int idx = 0; (size_t)idx < sizeof( PerDimensionVariable<T>::varArray ) / sizeof( *PerDimensionVariable<T>::varArray ); idx++ )
+            for( int idx = 0; (size_t)idx < (size_t)Dim::Count; idx++ )
             {
                 MemberVariable member;
                 member.name = nameDesc.name;
                 member.description = nameDesc.desc;
                 member.valueDefault = defaultV;
+                member.valueUiDragSpeed = uiDragSpeed;
                 member.valueMin = minV;
                 member.valueMax = maxV;
 
@@ -338,7 +390,7 @@ namespace FastNoise
             using GeneratorSourceT = typename std::invoke_result_t<U, GetArg<U, 0>>::type::Type;
             using T = typename GeneratorSourceT::Type;
 
-            for( int idx = 0; (size_t)idx < sizeof( PerDimensionVariable<GeneratorSourceT>::varArray ) / sizeof( *PerDimensionVariable<GeneratorSourceT>::varArray ); idx++ )
+            for( int idx = 0; (size_t)idx < (size_t)Dim::Count; idx++ )
             {
                 MemberNodeLookup member;
                 member.name = nameDesc.name;
@@ -365,12 +417,13 @@ namespace FastNoise
 
 
         template<typename T, typename U>
-        void AddHybridSource( NameDesc nameDesc, float defaultValue, void(U::* funcNode)(SmartNodeArg<T>), void(U::* funcValue)(float) )
+        void AddHybridSource( NameDesc nameDesc, float defaultValue, void ( U::*funcNode )( SmartNodeArg<T> ), void ( U::*funcValue )( float ), float uiDragSpeed = Metadata::kDefaultUiDragSpeedFloat )
         {
             MemberHybrid member;
             member.name = nameDesc.name;
             member.description = nameDesc.desc;
             member.valueDefault = defaultValue;
+            member.valueUiDragSpeed = uiDragSpeed;
 
             member.setNodeFunc = [funcNode]( Generator* g, SmartNodeArg<> s )
             {
@@ -400,17 +453,18 @@ namespace FastNoise
         }
 
         template<typename U>
-        void AddPerDimensionHybridSource( NameDesc nameDesc, float defaultV, U&& func )
+        void AddPerDimensionHybridSource( NameDesc nameDesc, float defaultV, U&& func, float uiDragSpeed = Metadata::kDefaultUiDragSpeedFloat )
         {
             using HybridSourceT = typename std::invoke_result_t<U, GetArg<U, 0>>::type::Type;
             using T = typename HybridSourceT::Type;
 
-            for( int idx = 0; (size_t)idx < sizeof( PerDimensionVariable<HybridSourceT>::varArray ) / sizeof( *PerDimensionVariable<HybridSourceT>::varArray ); idx++ )
+            for( int idx = 0; (size_t)idx < (size_t)Dim::Count; idx++ )
             {
                 MemberHybrid member;
                 member.name = nameDesc.name;
                 member.description = nameDesc.desc;
                 member.valueDefault = defaultV;
+                member.valueUiDragSpeed = uiDragSpeed;
                 member.dimensionIdx = idx;
 
                 member.setNodeFunc = [func, idx]( Generator* g, SmartNodeArg<> s )
diff --git a/include/FastNoise/Generators/Generator.inl b/include/FastNoise/Generators/Generator.inl
index 2165abdb..86f28a9a 100644
--- a/include/FastNoise/Generators/Generator.inl
+++ b/include/FastNoise/Generators/Generator.inl
@@ -1,34 +1,36 @@
 #include <cassert>
 #include <cstring>
-#include "FastSIMD/InlInclude.h"
 
 #include "Generator.h"
 
-#ifdef FS_SIMD_CLASS
 #pragma warning( disable:4250 )
-#endif
 
-template<typename FS>
-class FS_T<FastNoise::Generator, FS> : public virtual FastNoise::Generator
-{
-    FASTSIMD_DECLARE_FS_TYPES;
+using namespace FastNoise;
+
+static constexpr size_t kRegisterSize = std::max<size_t>( 4, FS::NativeRegisterCount<float>() * 2 );
+using float32v = FS::Register<float, kRegisterSize>;
+using int32v = FS::Register<std::int32_t, kRegisterSize>;
+using mask32v = typename float32v::MaskType;
 
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::Generator, SIMD> : public virtual FastNoise::Generator
+{
 public:
     virtual float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const = 0;
     virtual float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const = 0;
-    virtual float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const { return Gen( seed, x, y, z ); };
+    virtual float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const { return Gen( seed, x, y, z ); }
 
 #define FASTNOISE_IMPL_GEN_T\
     float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const override { return GenT( seed, x, y ); }\
     float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const override { return GenT( seed, x, y, z ); }\
     float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const override { return GenT( seed, x, y, z, w ); }
 
-    FastSIMD::eLevel GetSIMDLevel() const final
+    FastSIMD::FeatureSet GetActiveFeatureSet() const final
     {
-        return FS::SIMD_Level;
+        return SIMD; // report the feature set this DispatchClass specialisation was instantiated for, not the translation unit's default
     }
 
-    using VoidPtrStorageType = const FS_T<Generator, FS>*;
+    using VoidPtrStorageType = const DispatchClass<Generator, SIMD>*;
 
     void SetSourceSIMDPtr( const Generator* base, const void** simdPtr ) final
     {
@@ -44,7 +46,7 @@ public:
     }
 
     template<typename T, typename... POS>
-    FS_INLINE float32v FS_VECTORCALL GetSourceValue( const FastNoise::HybridSourceT<T>& memberVariable, int32v seed, POS... pos ) const
+    static FS_FORCEINLINE float32v FS_VECTORCALL GetSourceValue( const FastNoise::HybridSourceT<T>& memberVariable, int32v seed, POS... pos )
     {
         if( memberVariable.simdGeneratorPtr )
         {
@@ -56,7 +58,7 @@ public:
     }
 
     template<typename T, typename... POS>
-    FS_INLINE float32v FS_VECTORCALL GetSourceValue( const FastNoise::GeneratorSourceT<T>& memberVariable, int32v seed, POS... pos ) const
+    static FS_FORCEINLINE float32v FS_VECTORCALL GetSourceValue( const FastNoise::GeneratorSourceT<T>& memberVariable, int32v seed, POS... pos )
     {
         assert( memberVariable.simdGeneratorPtr );
         auto simdGen = reinterpret_cast<VoidPtrStorageType>( memberVariable.simdGeneratorPtr );
@@ -65,64 +67,64 @@ public:
     }
 
     template<typename T>
-    FS_INLINE const FS_T<T, FS>* GetSourceSIMD( const FastNoise::GeneratorSourceT<T>& memberVariable ) const
+    static FS_FORCEINLINE const DispatchClass<T, SIMD>* GetSourceSIMD( const FastNoise::GeneratorSourceT<T>& memberVariable )
     {
         assert( memberVariable.simdGeneratorPtr );
         auto simdGen = reinterpret_cast<VoidPtrStorageType>( memberVariable.simdGeneratorPtr );
 
-        auto simdT = static_cast<const FS_T<T, FS>*>( simdGen );
+        auto simdT = static_cast<const FastSIMD::DispatchClass<T, SIMD>*>( simdGen );
         return simdT;
     }
 
-    FastNoise::OutputMinMax GenUniformGrid2D( float* noiseOut, int xStart, int yStart, int xSize, int ySize, float frequency, int seed ) const final
+    FastNoise::OutputMinMax GenUniformGrid2D( float* noiseOut, int xStart, int yStart, int xSize, int ySize, int seed ) const final
     {
+        ScopeExitx86ZeroUpper zeroUpper; // presumably clears the upper SIMD register halves on scope exit (x86) - confirm against its definition
         float32v min( INFINITY );
         float32v max( -INFINITY );
 
         int32v xIdx( xStart );
         int32v yIdx( yStart );
 
-        float32v freqV( frequency );
-
         int32v xSizeV( xSize );
         int32v xMax = xSizeV + xIdx + int32v( -1 );
 
         intptr_t totalValues = xSize * ySize;
         intptr_t index = 0;
 
-        xIdx += int32v::FS_Incremented();
+        xIdx += FS::LoadIncremented<int32v>(); // presumably adds the per-lane offsets 0, 1, 2, ... so each lane gets its own x index - confirm
 
         AxisReset<true>( xIdx, yIdx, xMax, xSizeV, xSize );
 
-        while( index < totalValues - (intptr_t)FS_Size_32() )
+        while( index < totalValues - (intptr_t)int32v::ElementCount ) // full-width stores while more than one vector of outputs remains
         {
-            float32v xPos = FS_Converti32_f32( xIdx ) * freqV;
-            float32v yPos = FS_Converti32_f32( yIdx ) * freqV;
+            float32v xPos = FS::Convert<float>( xIdx ); // positions are the grid indices converted to float; no frequency scaling is applied here
+            float32v yPos = FS::Convert<float>( yIdx );
 
             float32v gen = Gen( int32v( seed ), xPos, yPos );
-            FS_Store_f32( &noiseOut[index], gen );
+            FS::Store( &noiseOut[index], gen );
 
 #if FASTNOISE_CALC_MIN_MAX
-            min = FS_Min_f32( min, gen );
-            max = FS_Max_f32( max, gen );
+            min = FS::Min( min, gen );
+            max = FS::Max( max, gen );
 #endif
 
-            index += FS_Size_32();
-            xIdx += int32v( FS_Size_32() );
+            index += int32v::ElementCount;
+            xIdx += int32v( int32v::ElementCount ); // step every lane forward by one vector width along x
 
             AxisReset<false>( xIdx, yIdx, xMax, xSizeV, xSize );
         }
 
-        float32v xPos = FS_Converti32_f32( xIdx ) * freqV;
-        float32v yPos = FS_Converti32_f32( yIdx ) * freqV;
+        float32v xPos = FS::Convert<float>( xIdx );
+        float32v yPos = FS::Convert<float>( yIdx );
 
         float32v gen = Gen( int32v( seed ), xPos, yPos );
 
-        return DoRemaining( noiseOut, totalValues, index, min, max, gen );
+        return StoreRemaining( noiseOut, totalValues, index, min, max, gen ); // hand the final vector, the output position and the running min/max to StoreRemaining
     }
 
-    FastNoise::OutputMinMax GenUniformGrid3D( float* noiseOut, int xStart, int yStart, int zStart, int xSize, int ySize, int zSize, float frequency, int seed ) const final
+    FastNoise::OutputMinMax GenUniformGrid3D( float* noiseOut, int xStart, int yStart, int zStart, int xSize, int ySize, int zSize, int seed ) const final
     {
+        ScopeExitx86ZeroUpper zeroUpper;
         float32v min( INFINITY );
         float32v max( -INFINITY );
 
@@ -130,8 +132,6 @@ public:
         int32v yIdx( yStart );
         int32v zIdx( zStart );
 
-        float32v freqV( frequency );
-
         int32v xSizeV( xSize );
         int32v xMax = xSizeV + xIdx + int32v( -1 );
         int32v ySizeV( ySize );
@@ -140,43 +140,44 @@ public:
         intptr_t totalValues = xSize * ySize * zSize;
         intptr_t index = 0;
 
-        xIdx += int32v::FS_Incremented();
+        xIdx += FS::LoadIncremented<int32v>();
 
         AxisReset<true>( xIdx, yIdx, xMax, xSizeV, xSize );
         AxisReset<true>( yIdx, zIdx, yMax, ySizeV, xSize * ySize );
 
-        while( index < totalValues - (intptr_t)FS_Size_32() )
+        while( index < totalValues - (intptr_t)int32v::ElementCount )
         {
-            float32v xPos = FS_Converti32_f32( xIdx ) * freqV;
-            float32v yPos = FS_Converti32_f32( yIdx ) * freqV;
-            float32v zPos = FS_Converti32_f32( zIdx ) * freqV;
+            float32v xPos = FS::Convert<float>( xIdx );
+            float32v yPos = FS::Convert<float>( yIdx );
+            float32v zPos = FS::Convert<float>( zIdx );
 
             float32v gen = Gen( int32v( seed ), xPos, yPos, zPos );
-            FS_Store_f32( &noiseOut[index], gen );
+            FS::Store( &noiseOut[index], gen );
 
 #if FASTNOISE_CALC_MIN_MAX
-            min = FS_Min_f32( min, gen );
-            max = FS_Max_f32( max, gen );
+            min = FS::Min( min, gen );
+            max = FS::Max( max, gen );
 #endif
 
-            index += FS_Size_32();
-            xIdx += int32v( FS_Size_32() );
-            
+            index += int32v::ElementCount;
+            xIdx += int32v( int32v::ElementCount );
+
             AxisReset<false>( xIdx, yIdx, xMax, xSizeV, xSize );
             AxisReset<false>( yIdx, zIdx, yMax, ySizeV, xSize * ySize );
         }
 
-        float32v xPos = FS_Converti32_f32( xIdx ) * freqV;
-        float32v yPos = FS_Converti32_f32( yIdx ) * freqV;
-        float32v zPos = FS_Converti32_f32( zIdx ) * freqV;
+        float32v xPos = FS::Convert<float>( xIdx );
+        float32v yPos = FS::Convert<float>( yIdx );
+        float32v zPos = FS::Convert<float>( zIdx );
 
         float32v gen = Gen( int32v( seed ), xPos, yPos, zPos );
 
-        return DoRemaining( noiseOut, totalValues, index, min, max, gen );
+        return StoreRemaining( noiseOut, totalValues, index, min, max, gen );
     }
 
-    FastNoise::OutputMinMax GenUniformGrid4D( float* noiseOut, int xStart, int yStart, int zStart, int wStart, int xSize, int ySize, int zSize, int wSize, float frequency, int seed ) const final
+    FastNoise::OutputMinMax GenUniformGrid4D( float* noiseOut, int xStart, int yStart, int zStart, int wStart, int xSize, int ySize, int zSize, int wSize, int seed ) const final
     {
+        ScopeExitx86ZeroUpper zeroUpper;
         float32v min( INFINITY );
         float32v max( -INFINITY );
 
@@ -185,8 +186,6 @@ public:
         int32v zIdx( zStart );
         int32v wIdx( wStart );
 
-        float32v freqV( frequency );
-
         int32v xSizeV( xSize );
         int32v xMax = xSizeV + xIdx + int32v( -1 );
         int32v ySizeV( ySize );
@@ -197,155 +196,162 @@ public:
         intptr_t totalValues = xSize * ySize * zSize * wSize;
         intptr_t index = 0;
 
-        xIdx += int32v::FS_Incremented();
+        xIdx += FS::LoadIncremented<int32v>();
 
         AxisReset<true>( xIdx, yIdx, xMax, xSizeV, xSize );
         AxisReset<true>( yIdx, zIdx, yMax, ySizeV, xSize * ySize );
         AxisReset<true>( zIdx, wIdx, zMax, zSizeV, xSize * ySize * zSize );
 
-        while( index < totalValues - (intptr_t)FS_Size_32() )
+        while( index < totalValues - (intptr_t)int32v::ElementCount )
         {
-            float32v xPos = FS_Converti32_f32( xIdx ) * freqV;
-            float32v yPos = FS_Converti32_f32( yIdx ) * freqV;
-            float32v zPos = FS_Converti32_f32( zIdx ) * freqV;
-            float32v wPos = FS_Converti32_f32( wIdx ) * freqV;
+            float32v xPos = FS::Convert<float>( xIdx );
+            float32v yPos = FS::Convert<float>( yIdx );
+            float32v zPos = FS::Convert<float>( zIdx );
+            float32v wPos = FS::Convert<float>( wIdx );
 
             float32v gen = Gen( int32v( seed ), xPos, yPos, zPos, wPos );
-            FS_Store_f32( &noiseOut[index], gen );
+            FS::Store( &noiseOut[index], gen );
 
 #if FASTNOISE_CALC_MIN_MAX
-            min = FS_Min_f32( min, gen );
-            max = FS_Max_f32( max, gen );
+            min = FS::Min( min, gen );
+            max = FS::Max( max, gen );
 #endif
 
-            index += FS_Size_32();
-            xIdx += int32v( FS_Size_32() );
+            index += int32v::ElementCount;
+            xIdx += int32v( int32v::ElementCount );
 
             AxisReset<false>( xIdx, yIdx, xMax, xSizeV, xSize );
             AxisReset<false>( yIdx, zIdx, yMax, ySizeV, xSize * ySize );
             AxisReset<false>( zIdx, wIdx, zMax, zSizeV, xSize * ySize * zSize );
         }
 
-        float32v xPos = FS_Converti32_f32( xIdx ) * freqV;
-        float32v yPos = FS_Converti32_f32( yIdx ) * freqV;
-        float32v zPos = FS_Converti32_f32( zIdx ) * freqV;
-        float32v wPos = FS_Converti32_f32( wIdx ) * freqV;
+        float32v xPos = FS::Convert<float>( xIdx );
+        float32v yPos = FS::Convert<float>( yIdx );
+        float32v zPos = FS::Convert<float>( zIdx );
+        float32v wPos = FS::Convert<float>( wIdx );
 
         float32v gen = Gen( int32v( seed ), xPos, yPos, zPos, wPos );
 
-        return DoRemaining( noiseOut, totalValues, index, min, max, gen );
+        return StoreRemaining( noiseOut, totalValues, index, min, max, gen );
     }
 
     FastNoise::OutputMinMax GenPositionArray2D( float* noiseOut, int count, const float* xPosArray, const float* yPosArray, float xOffset, float yOffset, int seed ) const final
     {
+        ScopeExitx86ZeroUpper zeroUpper;
         float32v min( INFINITY );
         float32v max( -INFINITY );
 
         intptr_t index = 0;
-        while( index < count - (intptr_t)FS_Size_32() )
+        while( index < count - (intptr_t)int32v::ElementCount )
         {
-            float32v xPos = float32v( xOffset ) + FS_Load_f32( &xPosArray[index] );
-            float32v yPos = float32v( yOffset ) + FS_Load_f32( &yPosArray[index] );
+            float32v xPos = float32v( xOffset ) + FS::Load<float32v>( &xPosArray[index] );
+            float32v yPos = float32v( yOffset ) + FS::Load<float32v>( &yPosArray[index] );
 
             float32v gen = Gen( int32v( seed ), xPos, yPos );
-            FS_Store_f32( &noiseOut[index], gen );
+            FS::Store( &noiseOut[index], gen );
 
 #if FASTNOISE_CALC_MIN_MAX
-            min = FS_Min_f32( min, gen );
-            max = FS_Max_f32( max, gen );
+            min = FS::Min( min, gen );
+            max = FS::Max( max, gen );
 #endif
-            index += FS_Size_32();
+            index += int32v::ElementCount;
         }
 
-        float32v xPos = float32v( xOffset ) + FS_Load_f32( &xPosArray[index] );
-        float32v yPos = float32v( yOffset ) + FS_Load_f32( &yPosArray[index] );
+        float32v xPos = float32v( xOffset ) + LoadRemaining( xPosArray, count, index );
+        float32v yPos = float32v( yOffset ) + LoadRemaining( yPosArray, count, index );
 
         float32v gen = Gen( int32v( seed ), xPos, yPos );
 
-        return DoRemaining( noiseOut, count, index, min, max, gen );
+        return StoreRemaining<true>( noiseOut, count, index, min, max, gen );
     }
 
     FastNoise::OutputMinMax GenPositionArray3D( float* noiseOut, int count, const float* xPosArray, const float* yPosArray, const float* zPosArray, float xOffset, float yOffset, float zOffset, int seed ) const final
     {
+        ScopeExitx86ZeroUpper zeroUpper;
         float32v min( INFINITY );
         float32v max( -INFINITY );
 
         intptr_t index = 0;
-        while( index < count - (intptr_t)FS_Size_32() )
+        while( index < count - (intptr_t)int32v::ElementCount )
         {
-            float32v xPos = float32v( xOffset ) + FS_Load_f32( &xPosArray[index] );
-            float32v yPos = float32v( yOffset ) + FS_Load_f32( &yPosArray[index] );
-            float32v zPos = float32v( zOffset ) + FS_Load_f32( &zPosArray[index] );
+            float32v xPos = float32v( xOffset ) + FS::Load<float32v>( &xPosArray[index] );
+            float32v yPos = float32v( yOffset ) + FS::Load<float32v>( &yPosArray[index] );
+            float32v zPos = float32v( zOffset ) + FS::Load<float32v>( &zPosArray[index] );
 
             float32v gen = Gen( int32v( seed ), xPos, yPos, zPos );
-            FS_Store_f32( &noiseOut[index], gen );
+            FS::Store( &noiseOut[index], gen );
 
 #if FASTNOISE_CALC_MIN_MAX
-            min = FS_Min_f32( min, gen );
-            max = FS_Max_f32( max, gen );
+            min = FS::Min( min, gen );
+            max = FS::Max( max, gen );
 #endif
-            index += FS_Size_32();
+            index += int32v::ElementCount;
         }
 
-        float32v xPos = float32v( xOffset ) + FS_Load_f32( &xPosArray[index] );
-        float32v yPos = float32v( yOffset ) + FS_Load_f32( &yPosArray[index] );
-        float32v zPos = float32v( zOffset ) + FS_Load_f32( &zPosArray[index] );
+        float32v xPos = float32v( xOffset ) + LoadRemaining( xPosArray, count, index );
+        float32v yPos = float32v( yOffset ) + LoadRemaining( yPosArray, count, index );
+        float32v zPos = float32v( zOffset ) + LoadRemaining( zPosArray, count, index );
 
         float32v gen = Gen( int32v( seed ), xPos, yPos, zPos );
 
-        return DoRemaining( noiseOut, count, index, min, max, gen );
+        return StoreRemaining<true>( noiseOut, count, index, min, max, gen );
     }
 
     FastNoise::OutputMinMax GenPositionArray4D( float* noiseOut, int count, const float* xPosArray, const float* yPosArray, const float* zPosArray, const float* wPosArray, float xOffset, float yOffset, float zOffset, float wOffset, int seed ) const final
     {
+        ScopeExitx86ZeroUpper zeroUpper;
         float32v min( INFINITY );
         float32v max( -INFINITY );
 
         intptr_t index = 0;
-        while( index < count - (intptr_t)FS_Size_32() )
+        while( index < count - (intptr_t)int32v::ElementCount )
         {
-            float32v xPos = float32v( xOffset ) + FS_Load_f32( &xPosArray[index] );
-            float32v yPos = float32v( yOffset ) + FS_Load_f32( &yPosArray[index] );
-            float32v zPos = float32v( zOffset ) + FS_Load_f32( &zPosArray[index] );
-            float32v wPos = float32v( wOffset ) + FS_Load_f32( &wPosArray[index] );
+            float32v xPos = float32v( xOffset ) + FS::Load<float32v>( &xPosArray[index] );
+            float32v yPos = float32v( yOffset ) + FS::Load<float32v>( &yPosArray[index] );
+            float32v zPos = float32v( zOffset ) + FS::Load<float32v>( &zPosArray[index] );
+            float32v wPos = float32v( wOffset ) + FS::Load<float32v>( &wPosArray[index] );
 
             float32v gen = Gen( int32v( seed ), xPos, yPos, zPos, wPos );
-            FS_Store_f32( &noiseOut[index], gen );
+            FS::Store( &noiseOut[index], gen );
 
 #if FASTNOISE_CALC_MIN_MAX
-            min = FS_Min_f32( min, gen );
-            max = FS_Max_f32( max, gen );
+            min = FS::Min( min, gen );
+            max = FS::Max( max, gen );
 #endif
-            index += FS_Size_32();
+            index += int32v::ElementCount;
         }
 
-        float32v xPos = float32v( xOffset ) + FS_Load_f32( &xPosArray[index] );
-        float32v yPos = float32v( yOffset ) + FS_Load_f32( &yPosArray[index] );
-        float32v zPos = float32v( zOffset ) + FS_Load_f32( &zPosArray[index] );
-        float32v wPos = float32v( wOffset ) + FS_Load_f32( &wPosArray[index] );
+        float32v xPos = float32v( xOffset ) + LoadRemaining( xPosArray, count, index );
+        float32v yPos = float32v( yOffset ) + LoadRemaining( yPosArray, count, index );
+        float32v zPos = float32v( zOffset ) + LoadRemaining( zPosArray, count, index );
+        float32v wPos = float32v( wOffset ) + LoadRemaining( wPosArray, count, index );
 
         float32v gen = Gen( int32v( seed ), xPos, yPos, zPos, wPos );
 
-        return DoRemaining( noiseOut, count, index, min, max, gen );
+        return StoreRemaining<true>( noiseOut, count, index, min, max, gen );
     }
 
     float GenSingle2D( float x, float y, int seed ) const final
     {
-        return FS_Extract0_f32( Gen( int32v( seed ), float32v( x ), float32v( y ) ) );
+        ScopeExitx86ZeroUpper zeroUpper;
+        return FS::Extract0( Gen( int32v( seed ), float32v( x ), float32v( y ) ) );
     }
 
     float GenSingle3D( float x, float y, float z, int seed ) const final
     {
-        return FS_Extract0_f32( Gen( int32v( seed ), float32v( x ), float32v( y ), float32v( z ) ) );
+        ScopeExitx86ZeroUpper zeroUpper;
+        return FS::Extract0( Gen( int32v( seed ), float32v( x ), float32v( y ), float32v( z ) ) );
     }
 
     float GenSingle4D( float x, float y, float z, float w, int seed ) const final
     {
-        return FS_Extract0_f32( Gen( int32v( seed ), float32v( x ), float32v( y ), float32v( z ), float32v( w ) ) );
+        ScopeExitx86ZeroUpper zeroUpper;
+        return FS::Extract0( Gen( int32v( seed ), float32v( x ), float32v( y ), float32v( z ), float32v( w ) ) );
     }
 
-    FastNoise::OutputMinMax GenTileable2D( float* noiseOut, int xSize, int ySize, float frequency, int seed ) const final
+    FastNoise::OutputMinMax GenTileable2D( float* noiseOut, int xSize, int ySize, int seed ) const final
     {
+        ScopeExitx86ZeroUpper zeroUpper;
         float32v min( INFINITY );
         float32v max( -INFINITY );
 
@@ -362,79 +368,96 @@ public:
         float pi2Recip( 0.15915493667f );
         float xSizePi = (float)xSize * pi2Recip;
         float ySizePi = (float)ySize * pi2Recip;
-        float32v xFreq = float32v( frequency * xSizePi );
-        float32v yFreq = float32v( frequency * ySizePi );
+        float32v xFreq = float32v( xSizePi );
+        float32v yFreq = float32v( ySizePi );
         float32v xMul = float32v( 1 / xSizePi );
         float32v yMul = float32v( 1 / ySizePi );
 
-        xIdx += int32v::FS_Incremented();
+        xIdx += FS::LoadIncremented<int32v>();
 
         AxisReset<true>( xIdx, yIdx, xMax, xSizeV, xSize );
 
-        while( index < totalValues - (intptr_t)FS_Size_32() )
+        while( index < totalValues - (intptr_t)int32v::ElementCount )
         {
-            float32v xF = FS_Converti32_f32( xIdx ) * xMul;
-            float32v yF = FS_Converti32_f32( yIdx ) * yMul;
+            float32v xF = FS::Convert<float>( xIdx ) * xMul;
+            float32v yF = FS::Convert<float>( yIdx ) * yMul;
 
-            float32v xPos = FS_Cos_f32( xF ) * xFreq;
-            float32v yPos = FS_Cos_f32( yF ) * yFreq;
-            float32v zPos = FS_Sin_f32( xF ) * xFreq;
-            float32v wPos = FS_Sin_f32( yF ) * yFreq;
+            float32v xPos = FS::Cos( xF ) * xFreq;
+            float32v yPos = FS::Cos( yF ) * yFreq;
+            float32v zPos = FS::Sin( xF ) * xFreq;
+            float32v wPos = FS::Sin( yF ) * yFreq;
 
             float32v gen = Gen( int32v( seed ), xPos, yPos, zPos, wPos );
-            FS_Store_f32( &noiseOut[index], gen );
+            FS::Store( &noiseOut[index], gen );
 
 #if FASTNOISE_CALC_MIN_MAX
-            min = FS_Min_f32( min, gen );
-            max = FS_Max_f32( max, gen );
+            min = FS::Min( min, gen );
+            max = FS::Max( max, gen );
 #endif
 
-            index += FS_Size_32();
-            xIdx += int32v( FS_Size_32() );
+            index += int32v::ElementCount;
+            xIdx += int32v( int32v::ElementCount );
 
             AxisReset<false>( xIdx, yIdx, xMax, xSizeV, xSize );
         }
 
-        float32v xF = FS_Converti32_f32( xIdx ) * xMul;
-        float32v yF = FS_Converti32_f32( yIdx ) * yMul;
+        float32v xF = FS::Convert<float>( xIdx ) * xMul;
+        float32v yF = FS::Convert<float>( yIdx ) * yMul;
 
-        float32v xPos = FS_Cos_f32( xF ) * xFreq;
-        float32v yPos = FS_Cos_f32( yF ) * yFreq;
-        float32v zPos = FS_Sin_f32( xF ) * xFreq;
-        float32v wPos = FS_Sin_f32( yF ) * yFreq;
+        float32v xPos = FS::Cos( xF ) * xFreq;
+        float32v yPos = FS::Cos( yF ) * yFreq;
+        float32v zPos = FS::Sin( xF ) * xFreq;
+        float32v wPos = FS::Sin( yF ) * yFreq;
 
         float32v gen = Gen( int32v( seed ), xPos, yPos, zPos, wPos );
 
-        return DoRemaining( noiseOut, totalValues, index, min, max, gen );
+        return StoreRemaining( noiseOut, totalValues, index, min, max, gen );
     }
 
 private:
+    struct ScopeExitx86ZeroUpper
+    {
+        FS_FORCEINLINE ~ScopeExitx86ZeroUpper()
+        {
+            if constexpr( SIMD & FeatureFlag::AVX )
+            {
+                FS_BIND_INTRINSIC( _mm256_zeroupper )();
+            }
+        }
+    };
+
     template<bool INITIAL>
-    static FS_INLINE void AxisReset( int32v& aIdx, int32v& bIdx, int32v aMax, int32v aSize, size_t aStep )
+    static FS_FORCEINLINE void AxisReset( int32v& aIdx, int32v& bIdx, int32v aMax, int32v aSize, size_t aStep )
     {
-        for( size_t resetLoop = INITIAL ? aStep : 0; resetLoop < FS_Size_32(); resetLoop += aStep )
+        for( size_t resetLoop = INITIAL ? aStep : 0; resetLoop < int32v::ElementCount; resetLoop += aStep )
         {
             mask32v aReset = aIdx > aMax;
-            bIdx = FS_MaskedIncrement_i32( bIdx, aReset );
-            aIdx = FS_MaskedSub_i32( aIdx, aSize, aReset );
+            bIdx = FS::MaskedIncrement( aReset, bIdx );
+            aIdx = FS::MaskedSub( aReset, aIdx, aSize );
         }
     }
 
-    static FS_INLINE FastNoise::OutputMinMax DoRemaining( float* noiseOut, intptr_t totalValues, intptr_t index, float32v min, float32v max, float32v finalGen )
+    static FS_FORCEINLINE float32v LoadRemaining( const float* loadPtr, intptr_t totalValues, intptr_t index )        
     {
-        FastNoise::OutputMinMax minMax;
-        intptr_t remaining = totalValues - index;
-
-        if( remaining == (intptr_t)FS_Size_32() )
+        if( index == 0 )
         {
-            FS_Store_f32( &noiseOut[index], finalGen );
+            intptr_t remaining = totalValues - index;
 
-#if FASTNOISE_CALC_MIN_MAX
-            min = FS_Min_f32( min, finalGen );
-            max = FS_Max_f32( max, finalGen );
-#endif
+            float32v load;
+            std::memcpy( &load, loadPtr, remaining * sizeof( float ) );
+            return load;
         }
-        else
+
+        return FS::Load<float32v>( &loadPtr[totalValues - float32v::ElementCount] );
+    }
+
+    template<bool LOADREMAINING = false>
+    static FS_FORCEINLINE FastNoise::OutputMinMax StoreRemaining( float* noiseOut, intptr_t totalValues, intptr_t index, float32v min, float32v max, float32v finalGen )
+    {
+        FastNoise::OutputMinMax minMax;
+        intptr_t remaining = totalValues - index;
+
+        if( LOADREMAINING ? index == 0 : remaining != (intptr_t)int32v::ElementCount )
         {
             std::memcpy( &noiseOut[index], &finalGen, remaining * sizeof( float ) );
 
@@ -444,13 +467,22 @@ private:
                 minMax << noiseOut[index];
             }
             while( ++index < totalValues );
+#endif
+        }
+        else
+        {
+            FS::Store( &noiseOut[totalValues - float32v::ElementCount], finalGen );
+
+#if FASTNOISE_CALC_MIN_MAX
+            min = FS::Min( min, finalGen );
+            max = FS::Max( max, finalGen );
 #endif
         }
 
 #if FASTNOISE_CALC_MIN_MAX
         float* minP = reinterpret_cast<float*>(&min);
         float* maxP = reinterpret_cast<float*>(&max);
-        for( size_t i = 0; i < FS_Size_32(); i++ )
+        for( size_t i = 0; i < int32v::ElementCount; i++ )
         {
             minMax << FastNoise::OutputMinMax{ minP[i], maxP[i] };
         }
@@ -458,4 +490,16 @@ private:
 
         return minMax;
     }
+
+    int32_t ReferencesFetchAdd( int32_t add ) const noexcept final
+    {
+        if( add )
+        {
+            return mReferences.fetch_add( add, std::memory_order_relaxed );
+        }
+
+        return mReferences.load( std::memory_order_relaxed );
+    }
+    
+    mutable std::atomic<uint32_t> mReferences = 0;
 };
diff --git a/include/FastNoise/Generators/Modifiers.h b/include/FastNoise/Generators/Modifiers.h
index 2657e11a..e3a3fc60 100644
--- a/include/FastNoise/Generators/Modifiers.h
+++ b/include/FastNoise/Generators/Modifiers.h
@@ -6,11 +6,10 @@ namespace FastNoise
     class DomainScale : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
-        void SetScale( float value ) { mScale = value; }
+        void SetScaling( float value ) { mScale = value; }
 
     protected:
         GeneratorSource mSource;
@@ -21,13 +20,16 @@ namespace FastNoise
     template<>
     struct MetadataT<DomainScale> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
-            groups.push_back( "Modifiers" );
+            groups.push_back( "Domain Modifiers" );
             this->AddGeneratorSource( "Source", &DomainScale::SetSource );
-            this->AddVariable( "Scale", 1.0f, &DomainScale::SetScale );
+            this->AddVariable( "Scaling", 1.0f, &DomainScale::SetScaling );
+
+            description =
+                "Scales the input coordinates uniformly before passing them to the source generator.";
         }
     };
 #endif
@@ -35,7 +37,6 @@ namespace FastNoise
     class DomainOffset : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
@@ -58,13 +59,16 @@ namespace FastNoise
     template<>
     struct MetadataT<DomainOffset> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
-            groups.push_back( "Modifiers" );
+            groups.push_back( "Domain Modifiers" );
             this->AddGeneratorSource( "Source", &DomainOffset::SetSource );
-            this->AddPerDimensionHybridSource( "Offset", 0.0f, []( DomainOffset* p ) { return std::ref( p->mOffset ); } );
+            this->AddPerDimensionHybridSource( "Offset", 0.0f, []( DomainOffset* p ) { return std::ref( p->mOffset ); }, 0.25f );
+
+            description =
+                "Adds an offset to the input coordinates before passing them to the source generator";
         }
     };
 #endif
@@ -72,7 +76,6 @@ namespace FastNoise
     class DomainRotate : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
@@ -120,15 +123,21 @@ namespace FastNoise
     template<>
     struct MetadataT<DomainRotate> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
-            groups.push_back( "Modifiers" );
+            groups.push_back( "Domain Modifiers" );
             this->AddGeneratorSource( "Source", &DomainRotate::SetSource );
             this->AddVariable( "Yaw", 0.0f, &DomainRotate::SetYaw );
             this->AddVariable( "Pitch", 0.0f, &DomainRotate::SetPitch );
             this->AddVariable( "Roll", 0.0f, &DomainRotate::SetRoll ); 
+
+            description =
+                "Rotates the input coordinates around the origin before passing them to the source generator\n"
+                "For 2D input coordinates a 2D rotation with Yaw is performed if Pitch and Roll are 0, otherwise a 3D rotation is performed\n"
+                "For 3D input coordinates a 3D rotation is performed\n"
+                "For 4D input coordinates no rotation is applied";
         }
     };
 #endif
@@ -136,7 +145,6 @@ namespace FastNoise
     class SeedOffset : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
@@ -151,13 +159,16 @@ namespace FastNoise
     template<>
     struct MetadataT<SeedOffset> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Modifiers" );
             this->AddGeneratorSource( "Source", &SeedOffset::SetSource );
             this->AddVariable( "Seed Offset", 1, &SeedOffset::SetOffset );
+
+            description =
+                "Offsets the input seed before passing it to the source generator.";
         }
     };
 #endif
@@ -165,18 +176,28 @@ namespace FastNoise
     class Remap : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
-        void SetRemap( float fromMin, float fromMax, float toMin, float toMax ) { mFromMin = fromMin; mFromMax = fromMax; mToMin = toMin; mToMax = toMax; }
+        
+        void SetFromMin( float value ) { mFromMin = value; }
+        void SetFromMin( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mFromMin, gen ); }
+        
+        void SetFromMax( float value ) { mFromMax = value; }
+        void SetFromMax( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mFromMax, gen ); }
+        
+        void SetToMin( float value ) { mToMin = value; }
+        void SetToMin( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mToMin, gen ); }
+        
+        void SetToMax( float value ) { mToMax = value; }
+        void SetToMax( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mToMax, gen ); }
 
     protected:
         GeneratorSource mSource;
-        float mFromMin = -1.0f;
-        float mFromMax = 1.0f;
-        float mToMin = 0.0f;
-        float mToMax = 1.0f;
+        HybridSource mFromMin = -1.0f;
+        HybridSource mFromMax = 1.0f;
+        HybridSource mToMin = 0.0f;
+        HybridSource mToMax = 1.0f;
 
         template<typename T>
         friend struct MetadataT;
@@ -186,36 +207,21 @@ namespace FastNoise
     template<>
     struct MetadataT<Remap> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Modifiers" );
             this->AddGeneratorSource( "Source", &Remap::SetSource );
-
-            this->AddVariable( "From Min", -1.0f,
-                []( Remap* p, float f )
-                {
-                    p->mFromMin = f;
-                } );
-
-            this->AddVariable( "From Max", 1.0f,
-                []( Remap* p, float f )
-                {
-                    p->mFromMax = f;
-                } );
-
-            this->AddVariable( "To Min", 0.0f,
-                []( Remap* p, float f )
-                {
-                    p->mToMin = f;
-                } );
-
-            this->AddVariable( "To Max", 1.0f,
-                []( Remap* p, float f )
-                {
-                    p->mToMax = f;
-                } );
+            
+            this->AddHybridSource( "From Min", -1.0f, &Remap::SetFromMin, &Remap::SetFromMin );
+            this->AddHybridSource( "From Max", 1.0f, &Remap::SetFromMax, &Remap::SetFromMax );
+            this->AddHybridSource( "To Min", 0.0f, &Remap::SetToMin, &Remap::SetToMin );
+            this->AddHybridSource( "To Max", 1.0f, &Remap::SetToMax, &Remap::SetToMax );            
+
+            description =
+                "Remaps the output value of the source generator from one range to another\n"
+                "Does not clamp values";
         }
     };
 #endif
@@ -223,7 +229,6 @@ namespace FastNoise
     class ConvertRGBA8 : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
@@ -242,7 +247,7 @@ namespace FastNoise
     template<>
     struct MetadataT<ConvertRGBA8> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
@@ -260,6 +265,11 @@ namespace FastNoise
                 {
                     p->mMax = f;
                 } );
+
+            description =
+                "Used for converting a float into a greyscale RGBA8 texture format output\n"
+                "Clamps the source output between Min/Max, scales it to 0-255, and packs the result\n"
+                "into an RGBA8 color stored in a float. RGB will be the same value, Alpha is always 255";
         }
     };
 #endif
@@ -267,7 +277,6 @@ namespace FastNoise
     class Terrace : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
@@ -286,14 +295,17 @@ namespace FastNoise
     template<>
     struct MetadataT<Terrace> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
             groups.push_back( "Modifiers" );
             this->AddGeneratorSource( "Source", &Terrace::SetSource );
-            this->AddVariable( "Multiplier", 1.0f, &Terrace::SetMultiplier );
-            this->AddVariable( "Smoothness", 0.0f, &Terrace::SetSmoothness );
+            this->AddVariable( { "Multiplier", "The size of the steps" }, 1.0f, &Terrace::SetMultiplier );
+            this->AddVariable( { "Smoothness", "How smooth the transitions between levels are" }, 0.0f, &Terrace::SetSmoothness );
+
+            description =
+                "Maps the source output onto specified terrace levels (steps).\n";
         }
     };
 #endif
@@ -301,13 +313,12 @@ namespace FastNoise
     class DomainAxisScale : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
 
         template<Dim D>
-        void SetScale( float value ) { mScale[(int)D] = value; }
+        void SetScaling( float value ) { mScale[(int)D] = value; }
 
     protected:
         GeneratorSource mSource;
@@ -321,13 +332,16 @@ namespace FastNoise
     template<>
     struct MetadataT<DomainAxisScale> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
-            groups.push_back( "Modifiers" );
+            groups.push_back( "Domain Modifiers" );
             this->AddGeneratorSource( "Source", &DomainAxisScale::SetSource );
-            this->AddPerDimensionVariable( "Scale", 1.0f, []( DomainAxisScale* p ) { return std::ref( p->mScale ); } );
+            this->AddPerDimensionVariable( "Scaling", 1.0f, []( DomainAxisScale* p ) { return std::ref( p->mScale ); } );
+
+            description =
+                "Scales each axis of the input coordinates independently before passing them to the source generator.";
         }
     };
 #endif
@@ -335,7 +349,6 @@ namespace FastNoise
     class AddDimension : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
@@ -351,13 +364,17 @@ namespace FastNoise
     template<>
     struct MetadataT<AddDimension> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
-            groups.push_back( "Modifiers" );
+            groups.push_back( "Domain Modifiers" );
             this->AddGeneratorSource( "Source", &AddDimension::SetSource );
-            this->AddHybridSource( "New Dimension Position", 0.0f, &AddDimension::SetNewDimensionPosition, &AddDimension::SetNewDimensionPosition );
+            this->AddHybridSource( { "New Dimension Position", "The position of the new dimension" }, 0.0f, &AddDimension::SetNewDimensionPosition, &AddDimension::SetNewDimensionPosition );
+
+            description =
+                "Adds a dimension to the input coordinates, new dimension is always the last dimension\n"
+                "The coordinates with the new dimension are passed to the source generator";
         }
     };
 #endif
@@ -365,7 +382,6 @@ namespace FastNoise
     class RemoveDimension : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
@@ -380,13 +396,16 @@ namespace FastNoise
     template<>
     struct MetadataT<RemoveDimension> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
-            groups.push_back( "Modifiers" );
+            groups.push_back( "Domain Modifiers" );
             this->AddGeneratorSource( "Source", &RemoveDimension::SetSource );
             this->AddVariableEnum( "Remove Dimension", Dim::Y, &RemoveDimension::SetRemoveDimension, kDim_Strings );
+
+            description =
+                "Removes the specified dimension from the input coordinates before passing them to the source generator";
         }
     };
 #endif
@@ -394,7 +413,6 @@ namespace FastNoise
     class GeneratorCache : public virtual Generator
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
 
         void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
@@ -407,12 +425,114 @@ namespace FastNoise
     template<>
     struct MetadataT<GeneratorCache> : MetadataT<Generator>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; // returns a SmartNode<> for the requested FastSIMD feature set
 
         MetadataT()
         {
             groups.push_back( "Modifiers" );
             this->AddGeneratorSource( "Source", &GeneratorCache::SetSource );
+            // Summary text stored in the metadata's description field.
+            description =
+                "Caches the output of the source generator. If the same input coordinates and seed are\n"
+                "requested again, the cached value is returned, improving performance for complex source generators";
+        }
+    };
+#endif
+
+    class SquareRoot : public virtual Generator // single-source node; its metadata describes it as a signed square root
+    {
+    public:
+        const Metadata& GetMetadata() const override;
+        // Stores gen in mSource by delegating to SetSourceMemberVariable.
+        void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
+
+    protected:
+        GeneratorSource mSource; // the one generator input of this node
+    };
+
+#ifdef FASTNOISE_METADATA // metadata specialisation is compiled only when this macro is defined
+    template<>
+    struct MetadataT<SquareRoot> : MetadataT<Generator>
+    {
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; // returns a SmartNode<> for the requested FastSIMD feature set
+
+        MetadataT()
+        {
+            groups.push_back( "Modifiers" ); // group label recorded for this node
+            this->AddGeneratorSource( "Source", &SquareRoot::SetSource ); // "Source" input, bound to SquareRoot::SetSource
+            // Summary text stored in the metadata's description field.
+            description =
+                "Returns the square root of the absolute value of the source output,\n"
+                "preserving the original sign (signed square root).";
+        }
+    };
+#endif
+
+    class Abs : public virtual Generator // single-source node; its metadata describes it as an absolute-value modifier
+    {
+    public:
+        const Metadata& GetMetadata() const override;
+        // Stores gen in mSource by delegating to SetSourceMemberVariable.
+        void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
+
+    protected:
+        GeneratorSource mSource; // the one generator input of this node
+    };
+
+#ifdef FASTNOISE_METADATA // metadata specialisation is compiled only when this macro is defined
+    template<>
+    struct MetadataT<Abs> : MetadataT<Generator>
+    {
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; // returns a SmartNode<> for the requested FastSIMD feature set
+
+        MetadataT()
+        {
+            groups.push_back( "Modifiers" ); // group label recorded for this node
+            this->AddGeneratorSource( "Source", &Abs::SetSource ); // "Source" input, bound to Abs::SetSource
+            // Summary text stored in the metadata's description field.
+            description =
+                "Returns the absolute value of the source output.";
+        }
+    };
+#endif
+
+    enum class PlaneRotationType // preset selector accepted by DomainRotatePlane::SetRotationType
+    {
+        ImproveXYPlanes, // initial value of DomainRotatePlane::mRotationType; labelled "Improve XY Planes" in the metadata
+        ImproveXZPlanes // labelled "Improve XZ Planes" in the metadata
+    };
+
+    class DomainRotatePlane : public virtual Generator // node with one generator source and a PlaneRotationType setting
+    {
+    public:
+        const Metadata& GetMetadata() const override;
+        // Stores gen in mSource by delegating to SetSourceMemberVariable.
+        void SetSource( SmartNodeArg<> gen ) { this->SetSourceMemberVariable( mSource, gen ); }
+        void SetRotationType( PlaneRotationType type ) { mRotationType = type; } // plain assignment, no validation
+
+    protected:
+        GeneratorSource mSource; // the one generator input of this node
+        PlaneRotationType mRotationType = PlaneRotationType::ImproveXYPlanes; // starts on the XY preset until SetRotationType is called
+    };
+
+#ifdef FASTNOISE_METADATA // metadata specialisation is compiled only when this macro is defined
+    template<>
+    struct MetadataT<DomainRotatePlane> : MetadataT<Generator>
+    {
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; // returns a SmartNode<> for the requested FastSIMD feature set
+
+        MetadataT()
+        {
+            groups.push_back( "Domain Modifiers" ); // group label recorded for this node
+            this->AddGeneratorSource( "Source", &DomainRotatePlane::SetSource );
+            // Enum setting bound to SetRotationType; ImproveXYPlanes is presumably its default and the two trailing strings its option labels.
+            this->AddVariableEnum( "Rotation Type", PlaneRotationType::ImproveXYPlanes, &DomainRotatePlane::SetRotationType, "Improve XY Planes", "Improve XZ Planes" );
+
+            description =
+                "Applies preset rotation to improve noise in specific 3D planes. Faster than DomainRotate.\n"
+                "This helps reduce axis aligned artifacts in 3D noise for the specified plane.\n"
+                "For 2D input coordinates a 3rd dimension is added and the noise is optimized for the XY plane\n"
+                "For 4D input coordinates only the first 3 dimensions are rotated, the 4th dimension is passed through unchanged";
         }
     };
 #endif
diff --git a/include/FastNoise/Generators/Modifiers.inl b/include/FastNoise/Generators/Modifiers.inl
index c5e0f331..570f4b82 100644
--- a/include/FastNoise/Generators/Modifiers.inl
+++ b/include/FastNoise/Generators/Modifiers.inl
@@ -1,28 +1,24 @@
-#include "FastSIMD/InlInclude.h"
-
 #include "Modifiers.h"
 
-template<typename FS>
-class FS_T<FastNoise::DomainScale, FS> : public virtual FastNoise::DomainScale, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // one specialisation per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::DomainScale, SIMD> final : public virtual FastNoise::DomainScale, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // multiplies every coordinate in the pack by mScale, then samples mSource
     {
         return this->GetSourceValue( mSource, seed, (pos * float32v( mScale ))... );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::DomainOffset, FS> : public virtual FastNoise::DomainOffset, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DomainOffset, SIMD> final : public virtual FastNoise::DomainOffset, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
     
     template<typename... P> 
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         return [this, seed]( std::remove_reference_t<P>... sourcePos, std::remove_reference_t<P>... offset )
         {
@@ -34,119 +30,118 @@ class FS_T<FastNoise::DomainOffset, FS> : public virtual FastNoise::DomainOffset
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::DomainRotate, FS> : public virtual FastNoise::DomainRotate, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // one specialisation per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::DomainRotate, SIMD> final : public virtual FastNoise::DomainRotate, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const // 2D: yaw-only path when pitch and roll sines are both zero, otherwise forwards to the 3D overload with z = 0
     {
         if( mPitchSin == 0.0f && mRollSin == 0.0f )
         {
             return this->GetSourceValue( mSource, seed,
-                FS_FNMulAdd_f32( y, float32v( mYawSin ), x * float32v( mYawCos ) ),
-                FS_FMulAdd_f32( x, float32v( mYawSin ), y * float32v( mYawCos ) ) );
+                FS::FNMulAdd( y, float32v( mYawSin ), x * float32v( mYawCos ) ), // presumably x*cos - y*sin; confirm FS::FNMulAdd semantics
+                FS::FMulAdd( x, float32v( mYawSin ), y * float32v( mYawCos ) ) ); // presumably x*sin + y*cos; confirm FS::FMulAdd semantics
         }
 
         return Gen( seed, x, y, float32v( 0 ) );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const // 3D: each output coordinate combines x, y, z with one row of the mXa..mZc coefficients
     {
         return this->GetSourceValue( mSource, seed,
-            FS_FMulAdd_f32( x, float32v( mXa ), FS_FMulAdd_f32( y, float32v( mXb ), z * float32v( mXc ) ) ),
-            FS_FMulAdd_f32( x, float32v( mYa ), FS_FMulAdd_f32( y, float32v( mYb ), z * float32v( mYc ) ) ),
-            FS_FMulAdd_f32( x, float32v( mZa ), FS_FMulAdd_f32( y, float32v( mZb ), z * float32v( mZc ) ) ) );
+            FS::FMulAdd( x, float32v( mXa ), FS::FMulAdd( y, float32v( mXb ), z * float32v( mXc ) ) ), // new x from the mXa/mXb/mXc row
+            FS::FMulAdd( x, float32v( mYa ), FS::FMulAdd( y, float32v( mYb ), z * float32v( mYc ) ) ), // new y from the mYa/mYb/mYc row
+            FS::FMulAdd( x, float32v( mZa ), FS::FMulAdd( y, float32v( mZb ), z * float32v( mZc ) ) ) ); // new z from the mZa/mZb/mZc row
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const // 4D: coordinates are forwarded to the source unrotated
     {
         // No rotation for 4D yet
         return this->GetSourceValue( mSource, seed, x, y, z, w );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::SeedOffset, FS> : public virtual FastNoise::SeedOffset, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // one specialisation per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::SeedOffset, SIMD> final : public virtual FastNoise::SeedOffset, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // samples mSource with mOffset added to the seed; positions pass through unchanged
     {
         return this->GetSourceValue( mSource, seed + int32v( mOffset ), pos... );
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::Remap, FS> : public virtual FastNoise::Remap, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // one specialisation per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::Remap, SIMD> final : public virtual FastNoise::Remap, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // linearly maps the source output from [fromMin, fromMax] onto [toMin, toMax]
     {
         float32v source = this->GetSourceValue( mSource, seed, pos... );
+        // All four range bounds are fetched through GetSourceValue with the same seed and position as the source.
+        float32v fromMin = this->GetSourceValue( mFromMin, seed, pos... );
+        float32v fromMax = this->GetSourceValue( mFromMax, seed, pos... );
+        float32v toMin = this->GetSourceValue( mToMin, seed, pos... );
+        float32v toMax = this->GetSourceValue( mToMax, seed, pos... );
             
-        return float32v( mToMin ) + (( source - float32v( mFromMin ) ) / float32v( mFromMax - mFromMin ) * float32v( mToMax - mToMin ));
+        return toMin + ( ( source - fromMin ) / ( fromMax - fromMin ) * ( toMax - toMin ) ); // no guard for fromMax == fromMin (division by zero in that lane)
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::ConvertRGBA8, FS> : public virtual FastNoise::ConvertRGBA8, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // one specialisation per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::ConvertRGBA8, SIMD> final : public virtual FastNoise::ConvertRGBA8, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // packs the clamped source value into three byte channels plus a 255 top byte
     {
         float32v source = this->GetSourceValue( mSource, seed, pos... );
         
-        source = FS_Min_f32( source, float32v( mMax ));
-        source = FS_Max_f32( source, float32v( mMin ));
+        source = FS::Min( source, float32v( mMax )); // upper clamp at mMax
+        source = FS::Max( source, float32v( mMin )); // lower clamp at mMin
         source -= float32v( mMin );
 
         source *= float32v( 255.0f / (mMax - mMin) );
 
-        int32v byteVal = FS_Convertf32_i32( source );
+        int32v byteVal = FS::Convert<std::int32_t>( source ); // numeric float -> int32 conversion of the 0..255 scaled value
 
         int32v output = int32v( 255 << 24 );
         output |= byteVal;
         output |= byteVal << 8;
         output |= byteVal << 16;
 
-        return FS_Casti32_f32( output );
+        return FS::Cast<float>( output ); // presumably a bit reinterpretation (unlike FS::Convert above), so the packed integer travels in a float register; confirm
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::Terrace, FS> : public virtual FastNoise::Terrace, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::Terrace, SIMD> final : public virtual FastNoise::Terrace, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         float32v source = this->GetSourceValue( mSource, seed, pos... );
 
         source *= float32v( mMultiplier );
-        float32v rounded = FS_Round_f32( source );
+        float32v rounded = FS::Round( source );
 
         if( mSmoothness != 0.0f )
         {
             float32v diff = rounded - source;
             mask32v diffSign = diff < float32v( 0 );
 
-            diff = FS_Abs_f32( diff );
+            diff = FS::Abs( diff );
             diff = float32v( 0.5f ) - diff;
 
             diff *= float32v( mSmoothnessRecip );
-            diff = FS_Min_f32( diff, float32v( 0.5f ) );
-            diff = FS_Select_f32( diffSign, float32v( 0.5f ) - diff, diff - float32v( 0.5f ) );
+            diff = FS::Min( diff, float32v( 0.5f ) );
+            diff = FS::Select( diffSign, float32v( 0.5f ) - diff, diff - float32v( 0.5f ) );
 
             rounded += diff;
         }
@@ -155,14 +150,13 @@ class FS_T<FastNoise::Terrace, FS> : public virtual FastNoise::Terrace, public F
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::DomainAxisScale, FS> : public virtual FastNoise::DomainAxisScale, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DomainAxisScale, SIMD> final : public virtual FastNoise::DomainAxisScale, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         size_t idx = 0;
         ((pos *= float32v( mScale[idx++] )), ...);
@@ -171,14 +165,13 @@ class FS_T<FastNoise::DomainAxisScale, FS> : public virtual FastNoise::DomainAxi
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::AddDimension, FS> : public virtual FastNoise::AddDimension, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::AddDimension, SIMD> final : public virtual FastNoise::AddDimension, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
     {
         if constexpr( sizeof...(P) == (size_t)FastNoise::Dim::Count )
         {
@@ -191,17 +184,15 @@ class FS_T<FastNoise::AddDimension, FS> : public virtual FastNoise::AddDimension
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::RemoveDimension, FS> : public virtual FastNoise::RemoveDimension, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::RemoveDimension, SIMD> final : public virtual FastNoise::RemoveDimension, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const
     {
         return this->GetSourceValue( mSource, seed, x, y );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const
     {
         switch( mRemoveDimension )
         {
@@ -216,7 +207,7 @@ class FS_T<FastNoise::RemoveDimension, FS> : public virtual FastNoise::RemoveDim
         }
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const
     {
         switch( mRemoveDimension )
         {
@@ -234,27 +225,28 @@ class FS_T<FastNoise::RemoveDimension, FS> : public virtual FastNoise::RemoveDim
     }
 };
 
-template<typename FS>
-class FS_T<FastNoise::GeneratorCache, FS> : public virtual FastNoise::GeneratorCache, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD> // one specialisation per FastSIMD feature set
+class FastSIMD::DispatchClass<FastNoise::GeneratorCache, SIMD> final : public virtual FastNoise::GeneratorCache, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
     FASTNOISE_IMPL_GEN_T;
 
     template<typename... P>
-    FS_INLINE float32v GenT( int32v seed, P... pos ) const
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const // returns the stored value when generator pointer, seed and every coordinate match the previous call on this thread
     {
         thread_local static const void* CachedGenerator = nullptr;
-        thread_local static float CachedValue[FS_Size_32()];
-        thread_local static float CachedPos[FS_Size_32()][sizeof...( P )];
+        thread_local static std::int32_t CachedSeed[int32v::ElementCount]; // seed lanes of the last stored evaluation
+        thread_local static float CachedPos[sizeof...(P)][int32v::ElementCount]; // one row of lanes per coordinate
+        thread_local static float CachedValue[int32v::ElementCount]; // output lanes of the last stored evaluation
         // TLS is not always aligned (compiler bug), need to avoid using SIMD types
-
-        float32v arrayPos[] = { pos... };
+        
+        const float32v arrayPos[] = { pos... }; // pack expanded into an array so it can be indexed in the loops below
 
         bool isSame = (CachedGenerator == mSource.simdGeneratorPtr);
+        isSame &= !FS::AnyMask( seed != FS::Load<int32v>( CachedSeed ) ); // any differing seed lane counts as a miss
 
         for( size_t i = 0; i < sizeof...( P ); i++ )
         {
-            isSame &= !FS_AnyMask_bool( arrayPos[i] != FS_Load_f32( &CachedPos[i] ) );
+            isSame &= !FS::AnyMask( arrayPos[i] != FS::Load<float32v>( CachedPos[i] ) ); // any differing lane of coordinate i counts as a miss
         }
 
         if( !isSame )
@@ -262,16 +254,108 @@ class FS_T<FastNoise::GeneratorCache, FS> : public virtual FastNoise::GeneratorC
             CachedGenerator = mSource.simdGeneratorPtr;
 
             float32v value = this->GetSourceValue( mSource, seed, pos... );
-            FS_Store_f32( &CachedValue, value );
+            // Miss: record the fresh result together with the seed and coordinates that produced it.
+            FS::Store( CachedValue, value );
+            FS::Store( CachedSeed, seed );
 
             for( size_t i = 0; i < sizeof...(P); i++ )
             {
-                FS_Store_f32( &CachedPos[i], arrayPos[i] );
+                FS::Store( CachedPos[i], arrayPos[i] ); // row i holds the lanes of coordinate i
             }
 
             return value;
         }
 
-        return FS_Load_f32( &CachedValue );
+        return FS::Load<float32v>( CachedValue ); // hit: reload the stored output lanes
+    }
+};
+
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::SquareRoot, SIMD> final : public virtual FastNoise::SquareRoot, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
+{
+    FASTNOISE_IMPL_GEN_T;
+
+    // Signed square root for any coordinate count: value * 1/sqrt(|value|) keeps the sign of the source output.
+    template<typename... P>
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
+    {
+        float32v sourceOutput = this->GetSourceValue( mSource, seed, pos... );
+        float32v reciprocalRoot = FS::InvSqrt( FS::Abs( sourceOutput ) );
+        // Lanes whose reciprocal root equals INFINITY are excluded by the mask (presumably zeroed by FS::Masked; confirm).
+        return FS::Masked( reciprocalRoot != float32v( INFINITY ), sourceOutput * reciprocalRoot );
+    }
+};
+
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::Abs, SIMD> final : public virtual FastNoise::Abs, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
+{
+    FASTNOISE_IMPL_GEN_T;
+
+    // Samples the source for any coordinate count and returns FS::Abs of the result.
+    template<typename... P>
+    FS_FORCEINLINE float32v GenT( int32v seed, P... pos ) const
+    {
+        float32v sourceOutput = this->GetSourceValue( mSource, seed, pos... );
+        return FS::Abs( sourceOutput );
+    }
+};
+
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::DomainRotatePlane, SIMD> final : public virtual FastNoise::DomainRotatePlane, public FastSIMD::DispatchClass<FastNoise::Generator, SIMD>
+{
+    // Skews x/y/z in place using the preset chosen by rotationType; any other value leaves them untouched.
+    FS_FORCEINLINE void FS_VECTORCALL RotateCoords( PlaneRotationType rotationType, float32v& x, float32v& y, float32v& z ) const
+    {
+        float32v skewFactor( -0.211324865405187f );
+        float32v axisFactor( 0.577350269189626f );
+
+        switch( rotationType )
+        {
+        case PlaneRotationType::ImproveXYPlanes:
+        {
+            float32v sumXY = x + y;
+            float32v skewXY = sumXY * skewFactor;
+            float32v scaledZ = z * axisFactor;
+
+            x = x + skewXY - scaledZ;
+            y = y + skewXY - scaledZ;
+            z = scaledZ + sumXY * axisFactor;
+            break;
+        }
+
+        case PlaneRotationType::ImproveXZPlanes:
+        {
+            float32v sumXZ = x + z;
+            float32v skewXZ = sumXZ * skewFactor;
+            float32v scaledY = y * axisFactor;
+
+            x = x + skewXZ - scaledY;
+            z = z + skewXZ - scaledY;
+            y = scaledY + sumXZ * axisFactor;
+            break;
+        }
+        }
+    }
+
+    // 2D input: a zero third coordinate is appended and the XY preset is always used.
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const
+    {
+        float32v z = 0;
+        RotateCoords( PlaneRotationType::ImproveXYPlanes, x, y, z );
+        return this->GetSourceValue( mSource, seed, x, y, z );
+    }
+
+    // 3D input: rotated with the configured preset before sampling the source.
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const
+    {
+        RotateCoords( mRotationType, x, y, z );
+        return this->GetSourceValue( mSource, seed, x, y, z );
+    }
+
+    // 4D input: only x/y/z are rotated, w is forwarded unchanged.
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const
+    {
+        RotateCoords( mRotationType, x, y, z );
+        return this->GetSourceValue( mSource, seed, x, y, z, w );
     }
 };
diff --git a/include/FastNoise/Generators/Perlin.h b/include/FastNoise/Generators/Perlin.h
index 88565d59..cf4832e5 100644
--- a/include/FastNoise/Generators/Perlin.h
+++ b/include/FastNoise/Generators/Perlin.h
@@ -3,18 +3,17 @@
 
 namespace FastNoise
 {
-    class Perlin : public virtual Generator
+    class Perlin : public virtual VariableRange<ScalableGenerator> // virtual base is VariableRange<ScalableGenerator>; the class declares nothing beyond the GetMetadata override
     {
     public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
         const Metadata& GetMetadata() const override;
     };
 
 #ifdef FASTNOISE_METADATA
     template<>
-    struct MetadataT<Perlin> : MetadataT<Generator>
+    struct MetadataT<Perlin> : MetadataT<VariableRange<ScalableGenerator>>
     {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override;
 
         MetadataT()
         {
diff --git a/include/FastNoise/Generators/Perlin.inl b/include/FastNoise/Generators/Perlin.inl
index 7351be42..edaa6f44 100644
--- a/include/FastNoise/Generators/Perlin.inl
+++ b/include/FastNoise/Generators/Perlin.inl
@@ -1,109 +1,119 @@
-#include "FastSIMD/InlInclude.h"
-
-#include "Perlin.h"
-#include "Utils.inl"
-
-template<typename FS>
-class FS_T<FastNoise::Perlin, FS> : public virtual FastNoise::Perlin, public FS_T<FastNoise::Generator, FS>
-{
-    FASTSIMD_DECLARE_FS_TYPES;
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
-    {
-        float32v xs = FS_Floor_f32( x );
-        float32v ys = FS_Floor_f32( y );
-
-        int32v x0 = FS_Convertf32_i32( xs ) * int32v( FnPrimes::X );
-        int32v y0 = FS_Convertf32_i32( ys ) * int32v( FnPrimes::Y );
-        int32v x1 = x0 + int32v( FnPrimes::X );
-        int32v y1 = y0 + int32v( FnPrimes::Y );
-
-        float32v xf0 = xs = x - xs;
-        float32v yf0 = ys = y - ys;
-        float32v xf1 = xf0 - float32v( 1 );
-        float32v yf1 = yf0 - float32v( 1 );
-
-        xs = FnUtils::InterpQuintic( xs );
-        ys = FnUtils::InterpQuintic( ys );
-
-        return float32v( 0.579106986522674560546875f ) * FnUtils::Lerp(
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y0 ), xf0, yf0 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y0 ), xf1, yf0 ), xs ),
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y1 ), xf0, yf1 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y1 ), xf1, yf1 ), xs ), ys );
-    }
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
-    {
-        float32v xs = FS_Floor_f32( x );
-        float32v ys = FS_Floor_f32( y );
-        float32v zs = FS_Floor_f32( z );
-
-        int32v x0 = FS_Convertf32_i32( xs ) * int32v( FnPrimes::X );
-        int32v y0 = FS_Convertf32_i32( ys ) * int32v( FnPrimes::Y );
-        int32v z0 = FS_Convertf32_i32( zs ) * int32v( FnPrimes::Z );
-        int32v x1 = x0 + int32v( FnPrimes::X );
-        int32v y1 = y0 + int32v( FnPrimes::Y );
-        int32v z1 = z0 + int32v( FnPrimes::Z );
-
-        float32v xf0 = xs = x - xs;
-        float32v yf0 = ys = y - ys;
-        float32v zf0 = zs = z - zs;
-        float32v xf1 = xf0 - float32v( 1 );
-        float32v yf1 = yf0 - float32v( 1 );
-        float32v zf1 = zf0 - float32v( 1 );
-
-        xs = FnUtils::InterpQuintic( xs );
-        ys = FnUtils::InterpQuintic( ys );
-        zs = FnUtils::InterpQuintic( zs );
-
-        return float32v( 0.964921414852142333984375f ) * FnUtils::Lerp( FnUtils::Lerp(
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y0, z0 ), xf0, yf0, zf0 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y0, z0 ), xf1, yf0, zf0 ), xs ),
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y1, z0 ), xf0, yf1, zf0 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y1, z0 ), xf1, yf1, zf0 ), xs ), ys ),
-            FnUtils::Lerp( 
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y0, z1 ), xf0, yf0, zf1 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y0, z1 ), xf1, yf0, zf1 ), xs ),    
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y1, z1 ), xf0, yf1, zf1 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y1, z1 ), xf1, yf1, zf1 ), xs ), ys ), zs );
-    }
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const final
-    {
-        float32v xs = FS_Floor_f32( x );
-        float32v ys = FS_Floor_f32( y );
-        float32v zs = FS_Floor_f32( z );
-        float32v ws = FS_Floor_f32( w );
-
-        int32v x0 = FS_Convertf32_i32( xs ) * int32v( FnPrimes::X );
-        int32v y0 = FS_Convertf32_i32( ys ) * int32v( FnPrimes::Y );
-        int32v z0 = FS_Convertf32_i32( zs ) * int32v( FnPrimes::Z );
-        int32v w0 = FS_Convertf32_i32( ws ) * int32v( FnPrimes::W );
-        int32v x1 = x0 + int32v( FnPrimes::X );
-        int32v y1 = y0 + int32v( FnPrimes::Y );
-        int32v z1 = z0 + int32v( FnPrimes::Z );
-        int32v w1 = w0 + int32v( FnPrimes::W );
-
-        float32v xf0 = xs = x - xs;
-        float32v yf0 = ys = y - ys;
-        float32v zf0 = zs = z - zs;
-        float32v wf0 = ws = w - ws;
-        float32v xf1 = xf0 - float32v( 1 );
-        float32v yf1 = yf0 - float32v( 1 );
-        float32v zf1 = zf0 - float32v( 1 );
-        float32v wf1 = wf0 - float32v( 1 );
-
-        xs = FnUtils::InterpQuintic( xs );
-        ys = FnUtils::InterpQuintic( ys );
-        zs = FnUtils::InterpQuintic( zs );
-        ws = FnUtils::InterpQuintic( ws );
-
-        return float32v( 0.964921414852142333984375f ) * FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp(
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y0, z0, w0 ), xf0, yf0, zf0, wf0 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y0, z0, w0 ), xf1, yf0, zf0, wf0 ), xs ),
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y1, z0, w0 ), xf0, yf1, zf0, wf0 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y1, z0, w0 ), xf1, yf1, zf0, wf0 ), xs ), ys ),
-            FnUtils::Lerp(                                                                                                                                                     
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y0, z1, w0 ), xf0, yf0, zf1, wf0 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y0, z1, w0 ), xf1, yf0, zf1, wf0 ), xs ),    
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y1, z1, w0 ), xf0, yf1, zf1, wf0 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y1, z1, w0 ), xf1, yf1, zf1, wf0 ), xs ), ys ), zs ),
-            FnUtils::Lerp( FnUtils::Lerp(
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y0, z0, w1 ), xf0, yf0, zf0, wf1 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y0, z0, w1 ), xf1, yf0, zf0, wf1 ), xs ),
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y1, z0, w1 ), xf0, yf1, zf0, wf1 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y1, z0, w1 ), xf1, yf1, zf0, wf1 ), xs ), ys ),
-            FnUtils::Lerp(                                                                                                                                                     
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y0, z1, w1 ), xf0, yf0, zf1, wf1 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y0, z1, w1 ), xf1, yf0, zf1, wf1 ), xs ),    
-            FnUtils::Lerp( FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x0, y1, z1, w1 ), xf0, yf1, zf1, wf1 ), FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, x1, y1, z1, w1 ), xf1, yf1, zf1, wf1 ), xs ), ys ), zs ), ws );
-    }
-};
+#include "Perlin.h"
+#include "Utils.inl"
+
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::Perlin, SIMD> final : public virtual FastNoise::Perlin, public FastSIMD::DispatchClass<FastNoise::VariableRange<ScalableGenerator>, SIMD>
+{    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const
+    {
+        this->ScalePositions( x, y );
+
+        float32v xs = FS::Floor( x );
+        float32v ys = FS::Floor( y );
+
+        int32v x0 = FS::Convert<int32_t>( xs ) * int32v( Primes::X );
+        int32v y0 = FS::Convert<int32_t>( ys ) * int32v( Primes::Y );
+        int32v x1 = x0 + int32v( Primes::X );
+        int32v y1 = y0 + int32v( Primes::Y );
+
+        float32v xf0 = xs = x - xs;
+        float32v yf0 = ys = y - ys;
+        float32v xf1 = xf0 - float32v( 1 );
+        float32v yf1 = yf0 - float32v( 1 );
+
+        xs = InterpQuintic( xs );
+        ys = InterpQuintic( ys );
+
+        constexpr float kBounding = 0.579106986522674560546875f;
+
+        return this->ScaleOutput( Lerp(
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0 ), xf0, yf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0 ), xf1, yf0 ), xs ),
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y1 ), xf0, yf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1 ), xf1, yf1 ), xs ), ys ),
+            -1 / kBounding, 1 / kBounding );
+    }
+
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const // 3D sample: blends gradient dot products taken at the 8 corners of the lattice cell around ( x, y, z )
+    {
+        this->ScalePositions( x, y, z ); // helper defined outside this chunk; presumably rescales the coordinates in place - TODO confirm
+
+        float32v xs = FS::Floor( x ); // lower lattice coordinate per axis, still held as float
+        float32v ys = FS::Floor( y );
+        float32v zs = FS::Floor( z );
+
+        int32v x0 = FS::Convert<int32_t>( xs ) * int32v( Primes::X ); // lower corner, multiplied by the per-axis prime that is fed to HashPrimes
+        int32v y0 = FS::Convert<int32_t>( ys ) * int32v( Primes::Y );
+        int32v z0 = FS::Convert<int32_t>( zs ) * int32v( Primes::Z );
+        int32v x1 = x0 + int32v( Primes::X ); // upper corner: one lattice step further along the axis
+        int32v y1 = y0 + int32v( Primes::Y );
+        int32v z1 = z0 + int32v( Primes::Z );
+
+        float32v xf0 = xs = x - xs; // xs is reused: from here on it holds the fractional offset; xf0 keeps the offset from the lower corner
+        float32v yf0 = ys = y - ys;
+        float32v zf0 = zs = z - zs;
+        float32v xf1 = xf0 - float32v( 1 ); // offset from the upper corner
+        float32v yf1 = yf0 - float32v( 1 );
+        float32v zf1 = zf0 - float32v( 1 );
+
+        xs = InterpQuintic( xs ); // fractional offset replaced by its InterpQuintic value, used as the Lerp weight below
+        ys = InterpQuintic( ys );
+        zs = InterpQuintic( zs );
+
+        constexpr float kBounding = 0.964921414852142333984375f; // only used to form the -1 / kBounding and 1 / kBounding arguments of ScaleOutput
+
+        return this->ScaleOutput( Lerp( Lerp( // innermost Lerps blend along x, the next level along y, the outermost along z
+            Lerp( GetGradientDotCommon( HashPrimes( seed, x0, y0, z0 ), xf0, yf0, zf0 ), GetGradientDotCommon( HashPrimes( seed, x1, y0, z0 ), xf1, yf0, zf0 ), xs ),
+            Lerp( GetGradientDotCommon( HashPrimes( seed, x0, y1, z0 ), xf0, yf1, zf0 ), GetGradientDotCommon( HashPrimes( seed, x1, y1, z0 ), xf1, yf1, zf0 ), xs ), ys ),
+            Lerp( // same pair of x/y blends, taken on the z1 corners
+            Lerp( GetGradientDotCommon( HashPrimes( seed, x0, y0, z1 ), xf0, yf0, zf1 ), GetGradientDotCommon( HashPrimes( seed, x1, y0, z1 ), xf1, yf0, zf1 ), xs ),
+            Lerp( GetGradientDotCommon( HashPrimes( seed, x0, y1, z1 ), xf0, yf1, zf1 ), GetGradientDotCommon( HashPrimes( seed, x1, y1, z1 ), xf1, yf1, zf1 ), xs ), ys ), zs ),
+            -1 / kBounding, 1 / kBounding );
+    }
+
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const // 4D sample: blends gradient dot products taken at the 16 corners of the lattice cell around ( x, y, z, w )
+    {
+        this->ScalePositions( x, y, z, w ); // helper defined outside this chunk; presumably rescales the coordinates in place - TODO confirm
+
+        float32v xs = FS::Floor( x ); // lower lattice coordinate per axis, still held as float
+        float32v ys = FS::Floor( y );
+        float32v zs = FS::Floor( z );
+        float32v ws = FS::Floor( w );
+
+        int32v x0 = FS::Convert<int32_t>( xs ) * int32v( Primes::X ); // lower corner, multiplied by the per-axis prime that is fed to HashPrimes
+        int32v y0 = FS::Convert<int32_t>( ys ) * int32v( Primes::Y );
+        int32v z0 = FS::Convert<int32_t>( zs ) * int32v( Primes::Z );
+        int32v w0 = FS::Convert<int32_t>( ws ) * int32v( Primes::W );
+        int32v x1 = x0 + int32v( Primes::X ); // upper corner: one lattice step further along the axis
+        int32v y1 = y0 + int32v( Primes::Y );
+        int32v z1 = z0 + int32v( Primes::Z );
+        int32v w1 = w0 + int32v( Primes::W );
+
+        float32v xf0 = xs = x - xs; // xs is reused: from here on it holds the fractional offset; xf0 keeps the offset from the lower corner
+        float32v yf0 = ys = y - ys;
+        float32v zf0 = zs = z - zs;
+        float32v wf0 = ws = w - ws;
+        float32v xf1 = xf0 - float32v( 1 ); // offset from the upper corner
+        float32v yf1 = yf0 - float32v( 1 );
+        float32v zf1 = zf0 - float32v( 1 );
+        float32v wf1 = wf0 - float32v( 1 );
+
+        xs = InterpQuintic( xs ); // fractional offset replaced by its InterpQuintic value, used as the Lerp weight below
+        ys = InterpQuintic( ys );
+        zs = InterpQuintic( zs );
+        ws = InterpQuintic( ws );
+
+        constexpr float kBounding = 0.964921414852142333984375f; // same literal as the 3D overload; only used to form the ScaleOutput bounds
+
+        return this->ScaleOutput( Lerp( Lerp( Lerp( // Lerp nesting from the inside out: x, then y, then z, then w
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0, z0, w0 ), xf0, yf0, zf0, wf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0, z0, w0 ), xf1, yf0, zf0, wf0 ), xs ),
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y1, z0, w0 ), xf0, yf1, zf0, wf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1, z0, w0 ), xf1, yf1, zf0, wf0 ), xs ), ys ),
+            Lerp(                                                                                                                                                     
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0, z1, w0 ), xf0, yf0, zf1, wf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0, z1, w0 ), xf1, yf0, zf1, wf0 ), xs ),    
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y1, z1, w0 ), xf0, yf1, zf1, wf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1, z1, w0 ), xf1, yf1, zf1, wf0 ), xs ), ys ), zs ),
+            Lerp( Lerp( // same z-level blend repeated on the w1 corners
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0, z0, w1 ), xf0, yf0, zf0, wf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0, z0, w1 ), xf1, yf0, zf0, wf1 ), xs ),
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y1, z0, w1 ), xf0, yf1, zf0, wf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1, z0, w1 ), xf1, yf1, zf0, wf1 ), xs ), ys ),
+            Lerp(                                                                                                                                                     
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0, z1, w1 ), xf0, yf0, zf1, wf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0, z1, w1 ), xf1, yf0, zf1, wf1 ), xs ),    
+            Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y1, z1, w1 ), xf0, yf1, zf1, wf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1, z1, w1 ), xf1, yf1, zf1, wf1 ), xs ), ys ), zs ), ws ),
+            -1 / kBounding, 1 / kBounding );
+    }
+};
diff --git a/include/FastNoise/Generators/Simplex.h b/include/FastNoise/Generators/Simplex.h
index 51310f85..240c0212 100644
--- a/include/FastNoise/Generators/Simplex.h
+++ b/include/FastNoise/Generators/Simplex.h
@@ -1,77 +1,52 @@
-#pragma once
-#include "Generator.h"
-
-namespace FastNoise
-{
-    class Simplex : public virtual Generator
-    {
-    public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
-        const Metadata& GetMetadata() const override;
-    };
-
-#ifdef FASTNOISE_METADATA
-    template<>
-    struct MetadataT<Simplex> : MetadataT<Generator>
-    {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
-
-        MetadataT()
-        {
-            groups.push_back( "Coherent Noise" );
-
-            description = 
-                "Smooth gradient noise from an N dimensional simplex grid\n"
-                "Developed by Ken Perlin in 2001";
-        }
-    };
-#endif
-
-    class OpenSimplex2 : public virtual Generator
-    {
-    public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
-        const Metadata& GetMetadata() const override;
-    };
-
-#ifdef FASTNOISE_METADATA
-    template<>
-    struct MetadataT<OpenSimplex2> : MetadataT<Generator>
-    {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
-
-        MetadataT()
-        {
-            groups.push_back( "Coherent Noise" );
-
-            description = 
-                "Smooth gradient noise from an N dimensional simplex grid, alternate implementation\n"
-                "Developed by K.jpg in 2019";
-        }
-    };
-#endif
-
-    class OpenSimplex2S : public virtual Generator
-    {
-    public:
-        FASTSIMD_LEVEL_SUPPORT(FastNoise::SUPPORTED_SIMD_LEVELS);
-        const Metadata& GetMetadata() const override;
-    };
-
-#ifdef FASTNOISE_METADATA
-    template<>
-    struct MetadataT<OpenSimplex2S> : MetadataT<Generator>
-    {
-        SmartNode<> CreateNode(FastSIMD::eLevel) const override;
-
-        MetadataT()
-        {
-            groups.push_back("Coherent Noise");
-
-            description =
-                "Smoother gradient noise from an N dimensional simplex grid\n"
-                "Developed by K.jpg in 2017";
-        }
-    };
-#endif
-}
+#pragma once
+#include "Generator.h"
+
+namespace FastNoise
+{
+    class Simplex : public virtual VariableRange<ScalableGenerator> // generator node type for Simplex noise; this declaration only adds the metadata accessor
+    {
+    public:
+        const Metadata& GetMetadata() const override; // declared only; the definition is not in this header
+    };
+
+#ifdef FASTNOISE_METADATA
+    template<>
+    struct MetadataT<Simplex> : MetadataT<VariableRange<ScalableGenerator>> // metadata specialisation for Simplex, derived from the metadata of its base generator type
+    {
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; // declared only; presumably builds the node for the given SIMD feature set - TODO confirm
+
+        MetadataT()
+        {
+            groups.push_back( "Coherent Noise" ); // group label appended for this node
+
+            description = 
+                "Smooth gradient noise from an N dimensional simplex grid\n"
+                "Developed by Ken Perlin in 2001";
+        }
+    };
+#endif
+
+    class SuperSimplex : public virtual VariableRange<ScalableGenerator> // generator node type for SuperSimplex noise; this declaration only adds the metadata accessor
+    {
+    public:
+        const Metadata& GetMetadata() const override; // declared only; the definition is not in this header
+    };
+
+#ifdef FASTNOISE_METADATA
+    template<>
+    struct MetadataT<SuperSimplex> : MetadataT<VariableRange<ScalableGenerator>> // metadata specialisation for SuperSimplex, derived from the metadata of its base generator type
+    {
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; // declared only; presumably builds the node for the given SIMD feature set - TODO confirm
+
+        MetadataT()
+        {
+            groups.push_back( "Coherent Noise" ); // group label appended for this node
+
+            description =
+                "Extra smooth gradient noise from an N dimensional simplex grid\n"
+                "Slower to generate than Simplex noise\n"
+                "Developed by K.jpg";
+        }
+    };
+#endif
+}
diff --git a/include/FastNoise/Generators/Simplex.inl b/include/FastNoise/Generators/Simplex.inl
index 09e33ca8..7af0d194 100644
--- a/include/FastNoise/Generators/Simplex.inl
+++ b/include/FastNoise/Generators/Simplex.inl
@@ -1,521 +1,929 @@
-#include "FastSIMD/InlInclude.h"
-
 #include "Simplex.h"
 #include "Utils.inl"
 
-template<typename FS>
-class FS_T<FastNoise::Simplex, FS> : public virtual FastNoise::Simplex, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::Simplex, SIMD> final : public virtual FastNoise::Simplex, public FastSIMD::DispatchClass<FastNoise::VariableRange<ScalableGenerator>, SIMD>
 {
-    FASTSIMD_DECLARE_FS_TYPES;
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const // 2D simplex sample: sums three vertex terms, each a gradient dot product times a falloff weight
     {
-        const float SQRT3 = 1.7320508075688772935274463415059f;
-        const float F2 = 0.5f * (SQRT3 - 1.0f);
-        const float G2 = (3.0f - SQRT3) / 6.0f;
-
-        float32v f = float32v( F2 ) * (x + y);
-        float32v x0 = FS_Floor_f32( x + f );
-        float32v y0 = FS_Floor_f32( y + f );
+        this->ScalePositions( x, y ); // helper defined outside this chunk; presumably rescales the coordinates in place - TODO confirm
+
+        constexpr double kRoot3 = 1.7320508075688772935274463415059;
+        constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 ); // algebraically ( sqrt(3) - 1 ) / 2
+        constexpr double kUnskew2 = -1.0 / ( kRoot3 + 3.0 ); // algebraically -( 3 - sqrt(3) ) / 6; note the negative sign
+        constexpr double kFalloffRadiusSquared = 0.5; // value each falloff starts from before the squared offsets are subtracted
+
+        float32v skewDelta = float32v( kSkew2 ) * ( x + y ); // one shared term, added to both axes before flooring
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+
+        float32v xSkewedBase = FS::Floor( xSkewed ); // base corner of the skewed cell, still held as float
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase; // position inside the skewed cell
+        float32v dySkewed = ySkewed - ySkewedBase;
+
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X ); // base corner multiplied by the per-axis prime that is fed to HashPrimes
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+
+        mask32v xGreaterEqualY = dxSkewed >= dySkewed; // decides which axis vertex 1 steps along
+        
+        float32v unskewDelta = float32v( kUnskew2 ) * ( dxSkewed + dySkewed ); // one shared term that takes the in-cell offsets back out of skewed space
+        float32v dx0 = dxSkewed + unskewDelta; // offset from vertex 0 (the base corner)
+        float32v dy0 = dySkewed + unskewDelta;
+        
+        float32v dx1 = FS::MaskedIncrement( ~xGreaterEqualY, dx0 ) - float32v( kUnskew2 + 1 ); // offset from vertex 1; assuming MaskedIncrement adds 1 where the mask is set, the net step is +1 in x when dx >= dy, else +1 in y
+        float32v dy1 = FS::MaskedIncrement( xGreaterEqualY, dy0 ) - float32v( kUnskew2 + 1 );
+        float32v dx2 = dx0 - float32v( kUnskew2 * 2 + 1 ); // offset from vertex 2, the corner one step along both axes
+        float32v dy2 = dy0 - float32v( kUnskew2 * 2 + 1 );
+
+        float32v falloff0 = FS::FNMulAdd( dx0, dx0, FS::FNMulAdd( dy0, dy0, float32v( kFalloffRadiusSquared ) ) );
+        float32v falloff1 = FS::FNMulAdd( dx1, dx1, FS::FNMulAdd( dy1, dy1, float32v( kFalloffRadiusSquared ) ) );
+        float32v falloff2 = falloff0 + FS::FMulAdd( unskewDelta, // algebraically kFalloffRadiusSquared - dx2*dx2 - dy2*dy2, expanded in terms of falloff0 and unskewDelta
+            float32v( -4.0 * ( kRoot3 + 2.0 ) / ( kRoot3 + 3.0 ) ),
+            float32v( -2.0 / 3.0 ) );
+
+        falloff0 = FS::Max( falloff0, float32v( 0 ) ); // negative falloff is clamped so that vertex contributes nothing
+        falloff1 = FS::Max( falloff1, float32v( 0 ) );
+        falloff2 = FS::Max( falloff2, float32v( 0 ) );
+
+        falloff0 *= falloff0; falloff0 *= falloff0; // two squarings: falloff to the 4th power
+        falloff1 *= falloff1; falloff1 *= falloff1;
+        falloff2 *= falloff2; falloff2 *= falloff2;
+
+        float32v gradientRampValue0 = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimedBase ), dx0, dy0 );
+        float32v gradientRampValue1 = GetGradientDotPerlin( HashPrimes( seed, FS::MaskedAdd( xGreaterEqualY, xPrimedBase, int32v( Primes::X ) ), FS::InvMaskedAdd( xGreaterEqualY, yPrimedBase, int32v( Primes::Y ) ) ), dx1, dy1 );
+        float32v gradientRampValue2 = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ) ), dx2, dy2 );
+
+        constexpr double kBounding = 38.283687591552734375; // same literal the replaced code multiplied its result by; here it forms the ScaleOutput bounds
+
+        return this->ScaleOutput( FS::FMulAdd( gradientRampValue0, falloff0, FS::FMulAdd( gradientRampValue1, falloff1, gradientRampValue2 * falloff2 ) ),
+            -1 / kBounding, 1 / kBounding );
+    }
 
-        int32v i = FS_Convertf32_i32( x0 ) * int32v( FnPrimes::X );
-        int32v j = FS_Convertf32_i32( y0 ) * int32v( FnPrimes::Y );
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const // 3D simplex sample: sums four vertex terms, each a gradient dot product times a falloff weight
+    {
+        this->ScalePositions( x, y, z ); // helper defined outside this chunk; presumably rescales the coordinates in place - TODO confirm
+
+        constexpr double kSkew3 = 1.0 / 3.0;
+        constexpr double kReflectUnskew3 = -1.0 / 2.0; // the identities quoted in the comments below rely on this exact value
+        constexpr double kFalloffRadiusSquared = 0.6; // value each falloff starts from before the squared offsets are subtracted
+
+        float32v skewDelta = float32v( kSkew3 ) * ( x + y + z ); // one shared term, added to every axis before flooring
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v zSkewed = z + skewDelta;
+
+        float32v xSkewedBase = FS::Floor( xSkewed ); // base corner of the skewed cell, still held as float
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v zSkewedBase = FS::Floor( zSkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase; // position inside the skewed cell
+        float32v dySkewed = ySkewed - ySkewedBase;
+        float32v dzSkewed = zSkewed - zSkewedBase;
+
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X ); // base corner multiplied by the per-axis prime that is fed to HashPrimes
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        int32v zPrimedBase = FS::Convert<int32_t>( zSkewedBase ) * int32v( Primes::Z );
+
+        mask32v xGreaterEqualY = dxSkewed >= dySkewed; // pairwise ordering of the in-cell offsets, combined into the per-vertex step masks below
+        mask32v yGreaterEqualZ = dySkewed >= dzSkewed;
+        mask32v xGreaterEqualZ = dxSkewed >= dzSkewed;
+
+        float32v unskewDelta = float32v( kReflectUnskew3 ) * ( dxSkewed + dySkewed + dzSkewed );
+        float32v dx0 = dxSkewed + unskewDelta; // offset from vertex 0 (the base corner)
+        float32v dy0 = dySkewed + unskewDelta;
+        float32v dz0 = dzSkewed + unskewDelta;
+
+        mask32v maskX1 = xGreaterEqualY & xGreaterEqualZ; // vertex 1 steps along x where x is the largest offset
+        mask32v maskY1 = FS::BitwiseAndNot( yGreaterEqualZ, xGreaterEqualY );
+        mask32v maskZ1 = xGreaterEqualZ | yGreaterEqualZ; // Inv masked
+
+        mask32v nMaskX2 = xGreaterEqualY | xGreaterEqualZ; // Inv masked
+        mask32v nMaskY2 = FS::BitwiseAndNot( xGreaterEqualY, yGreaterEqualZ );
+        mask32v nMaskZ2 = xGreaterEqualZ & yGreaterEqualZ;
+
+        float32v dx3 = dx0 - float32v( kReflectUnskew3 * 3 + 1 ); // offset used with vertex 3, the corner one step along every axis
+        float32v dy3 = dy0 - float32v( kReflectUnskew3 * 3 + 1 );
+        float32v dz3 = dz0 - float32v( kReflectUnskew3 * 3 + 1 );
+        float32v dx1 = FS::MaskedSub( maskX1, dx3, float32v( 1 ) ); // kReflectUnskew3 * 3 + 1 = kReflectUnskew3, so dx0 - kReflectUnskew3 = dx3
+        float32v dy1 = FS::MaskedSub( maskY1, dy3, float32v( 1 ) );
+        float32v dz1 = FS::InvMaskedSub( maskZ1, dz3, float32v( 1 ) );
+        float32v dx2 = FS::MaskedIncrement( ~nMaskX2, dx0 ); // kReflectUnskew3 * 2 + 1 = 0, so dx0 - ( kReflectUnskew3 * 2 + 1 ) = dx0
+        float32v dy2 = FS::MaskedIncrement( nMaskY2, dy0 );
+        float32v dz2 = FS::MaskedIncrement( nMaskZ2, dz0 );
+
+        float32v falloff0 = FS::FNMulAdd( dz0, dz0, FS::FNMulAdd( dy0, dy0, FS::FNMulAdd( dx0, dx0, float32v( kFalloffRadiusSquared ) ) ) );
+        float32v falloff1 = FS::FNMulAdd( dz1, dz1, FS::FNMulAdd( dy1, dy1, FS::FNMulAdd( dx1, dx1, float32v( kFalloffRadiusSquared ) ) ) );
+        float32v falloff2 = FS::FNMulAdd( dz2, dz2, FS::FNMulAdd( dy2, dy2, FS::FNMulAdd( dx2, dx2, float32v( kFalloffRadiusSquared ) ) ) );
+        float32v falloff3 = falloff0 - ( unskewDelta + float32v( 3.0 / 4.0 ) ); // algebraically kFalloffRadiusSquared - dx3*dx3 - dy3*dy3 - dz3*dz3, expanded in terms of falloff0 and unskewDelta
+
+        falloff0 = FS::Max( falloff0, float32v( 0 ) ); // negative falloff is clamped so that vertex contributes nothing
+        falloff1 = FS::Max( falloff1, float32v( 0 ) );
+        falloff2 = FS::Max( falloff2, float32v( 0 ) );
+        falloff3 = FS::Max( falloff3, float32v( 0 ) );
+
+        falloff0 *= falloff0; falloff0 *= falloff0; // two squarings: falloff to the 4th power
+        falloff1 *= falloff1; falloff1 *= falloff1;
+        falloff2 *= falloff2; falloff2 *= falloff2;
+        falloff3 *= falloff3; falloff3 *= falloff3;
+
+        float32v gradientRampValue0 = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase ), dx0, dy0, dz0 );
+        float32v gradientRampValue1 = GetGradientDotCommon( HashPrimes( seed, FS::MaskedAdd( maskX1, xPrimedBase, int32v( Primes::X ) ), FS::MaskedAdd( maskY1, yPrimedBase, int32v( Primes::Y ) ), FS::InvMaskedAdd( maskZ1, zPrimedBase, int32v( Primes::Z ) ) ), dx1, dy1, dz1 );
+        float32v gradientRampValue2 = GetGradientDotCommon( HashPrimes( seed, FS::MaskedAdd( nMaskX2, xPrimedBase, int32v( Primes::X ) ), FS::InvMaskedAdd( nMaskY2, yPrimedBase, int32v( Primes::Y ) ), FS::InvMaskedAdd( nMaskZ2, zPrimedBase, int32v( Primes::Z ) ) ), dx2, dy2, dz2 );
+        float32v gradientRampValue3 = GetGradientDotCommon( HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ), zPrimedBase + int32v( Primes::Z ) ), dx3, dy3, dz3 );
+
+        constexpr double kBounding = 32.69428253173828125; // only used to form the -1 / kBounding and 1 / kBounding arguments of ScaleOutput
+
+        return this->ScaleOutput( FS::FMulAdd( gradientRampValue3, falloff3, FS::FMulAdd( gradientRampValue2, falloff2, FS::FMulAdd( gradientRampValue1, falloff1, gradientRampValue0 * falloff0 ) ) ),
+            -1 / kBounding, 1 / kBounding );
+    }
 
-        float32v g = float32v( G2 ) * (x0 + y0);
-        x0 = x - (x0 - g);
-        y0 = y - (y0 - g);
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const // 4D simplex sample: sums five vertex terms, each a gradient dot product times a falloff weight
+    {
+        this->ScalePositions( x, y, z, w ); // helper defined outside this chunk; presumably rescales the coordinates in place - TODO confirm
+
+        constexpr double kRoot5 = 2.2360679774997896964091736687313;
+        constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 );
+        constexpr double kUnskew4 = -1.0 / ( kRoot5 + 5.0 ); // note the negative sign
+        constexpr double kFalloffRadiusSquared = 0.6; // value each falloff starts from before the squared offsets are subtracted
+
+        float32v skewDelta = float32v( kSkew4 ) * ( x + y + z + w ); // one shared term, added to every axis before flooring
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v zSkewed = z + skewDelta;
+        float32v wSkewed = w + skewDelta;
+
+        float32v xSkewedBase = FS::Floor( xSkewed ); // base corner of the skewed cell, still held as float
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v zSkewedBase = FS::Floor( zSkewed );
+        float32v wSkewedBase = FS::Floor( wSkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase; // position inside the skewed cell
+        float32v dySkewed = ySkewed - ySkewedBase;
+        float32v dzSkewed = zSkewed - zSkewedBase;
+        float32v dwSkewed = wSkewed - wSkewedBase;
+
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X ); // base corner multiplied by the per-axis prime that is fed to HashPrimes
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        int32v zPrimedBase = FS::Convert<int32_t>( zSkewedBase ) * int32v( Primes::Z );
+        int32v wPrimedBase = FS::Convert<int32_t>( wSkewedBase ) * int32v( Primes::W );
+
+        float32v unskewDelta = float32v( kUnskew4 ) * ( dxSkewed + dySkewed + dzSkewed + dwSkewed );
+        float32v dx0 = dxSkewed + unskewDelta; // offset from vertex 0 (the base corner)
+        float32v dy0 = dySkewed + unskewDelta;
+        float32v dz0 = dzSkewed + unskewDelta;
+        float32v dw0 = dwSkewed + unskewDelta;
+
+        int32v rankX( 0 ); // per-axis counters filled in by the six pairwise comparisons below
+        int32v rankY( 0 );
+        int32v rankZ( 0 );
+        int32v rankW( 0 );
+
+        mask32v xGreaterEqualY = dx0 >= dy0; // each comparison bumps the counter of exactly one of the two axes (mask vs. inverted mask)
+        rankX = FS::MaskedIncrement(  xGreaterEqualY, rankX );
+        rankY = FS::MaskedIncrement( ~xGreaterEqualY, rankY );
+
+        mask32v xGreaterEqualZ = dx0 >= dz0;
+        rankX = FS::MaskedIncrement(  xGreaterEqualZ, rankX );
+        rankZ = FS::MaskedIncrement( ~xGreaterEqualZ, rankZ );
+
+        mask32v xGreaterEqualW = dx0 >= dw0;
+        rankX = FS::MaskedIncrement(  xGreaterEqualW, rankX );
+        rankW = FS::MaskedIncrement( ~xGreaterEqualW, rankW );
+
+        mask32v yGreaterEqualZ = dy0 >= dz0;
+        rankY = FS::MaskedIncrement(  yGreaterEqualZ, rankY );
+        rankZ = FS::MaskedIncrement( ~yGreaterEqualZ, rankZ );
+
+        mask32v yGreaterEqualW = dy0 >= dw0;
+        rankY = FS::MaskedIncrement(  yGreaterEqualW, rankY );
+        rankW = FS::MaskedIncrement( ~yGreaterEqualW, rankW );
+
+        mask32v zGreaterEqualW = dz0 >= dw0;
+        rankZ = FS::MaskedIncrement(  zGreaterEqualW, rankZ );
+        rankW = FS::MaskedIncrement( ~zGreaterEqualW, rankW );
+
+        mask32v maskX1 = rankX > int32v( 2 ); // vertex 1 steps only along axes whose rank exceeds 2
+        mask32v maskY1 = rankY > int32v( 2 );
+        mask32v maskZ1 = rankZ > int32v( 2 );
+        mask32v maskW1 = rankW > int32v( 2 );
+
+        mask32v maskX2 = rankX > int32v( 1 ); // vertex 2 steps along axes whose rank exceeds 1
+        mask32v maskY2 = rankY > int32v( 1 );
+        mask32v maskZ2 = rankZ > int32v( 1 );
+        mask32v maskW2 = rankW > int32v( 1 );
+
+        mask32v maskX3 = rankX > int32v( 0 ); // vertex 3 steps along axes whose rank exceeds 0
+        mask32v maskY3 = rankY > int32v( 0 );
+        mask32v maskZ3 = rankZ > int32v( 0 );
+        mask32v maskW3 = rankW > int32v( 0 );
+
+        float32v dx1 = FS::MaskedSub( maskX1, dx0, float32v( 1 ) ) - float32v( kUnskew4 ); // offsets from vertex 1: masked unit step plus one unskew term
+        float32v dy1 = FS::MaskedSub( maskY1, dy0, float32v( 1 ) ) - float32v( kUnskew4 );
+        float32v dz1 = FS::MaskedSub( maskZ1, dz0, float32v( 1 ) ) - float32v( kUnskew4 );
+        float32v dw1 = FS::MaskedSub( maskW1, dw0, float32v( 1 ) ) - float32v( kUnskew4 );
+        float32v dx2 = FS::MaskedSub( maskX2, dx0, float32v( 1 ) ) - float32v( kUnskew4 * 2 ); // offsets from vertex 2: two unskew terms
+        float32v dy2 = FS::MaskedSub( maskY2, dy0, float32v( 1 ) ) - float32v( kUnskew4 * 2 );
+        float32v dz2 = FS::MaskedSub( maskZ2, dz0, float32v( 1 ) ) - float32v( kUnskew4 * 2 );
+        float32v dw2 = FS::MaskedSub( maskW2, dw0, float32v( 1 ) ) - float32v( kUnskew4 * 2 );
+        float32v dx3 = FS::MaskedSub( maskX3, dx0, float32v( 1 ) ) - float32v( kUnskew4 * 3 ); // offsets from vertex 3: three unskew terms
+        float32v dy3 = FS::MaskedSub( maskY3, dy0, float32v( 1 ) ) - float32v( kUnskew4 * 3 );
+        float32v dz3 = FS::MaskedSub( maskZ3, dz0, float32v( 1 ) ) - float32v( kUnskew4 * 3 );
+        float32v dw3 = FS::MaskedSub( maskW3, dw0, float32v( 1 ) ) - float32v( kUnskew4 * 3 );
+        float32v dx4 = dx0 - float32v( kUnskew4 * 4 + 1 ); // offsets from vertex 4, the corner one step along every axis
+        float32v dy4 = dy0 - float32v( kUnskew4 * 4 + 1 );
+        float32v dz4 = dz0 - float32v( kUnskew4 * 4 + 1 );
+        float32v dw4 = dw0 - float32v( kUnskew4 * 4 + 1 );
+
+        float32v falloff0 = FS::FNMulAdd( dw0, dw0, FS::FNMulAdd( dz0, dz0, FS::FNMulAdd( dy0, dy0, FS::FNMulAdd( dx0, dx0, float32v( kFalloffRadiusSquared ) ) ) ) );
+        float32v falloff1 = FS::FNMulAdd( dw1, dw1, FS::FNMulAdd( dz1, dz1, FS::FNMulAdd( dy1, dy1, FS::FNMulAdd( dx1, dx1, float32v( kFalloffRadiusSquared ) ) ) ) );
+        float32v falloff2 = FS::FNMulAdd( dw2, dw2, FS::FNMulAdd( dz2, dz2, FS::FNMulAdd( dy2, dy2, FS::FNMulAdd( dx2, dx2, float32v( kFalloffRadiusSquared ) ) ) ) );
+        float32v falloff3 = FS::FNMulAdd( dw3, dw3, FS::FNMulAdd( dz3, dz3, FS::FNMulAdd( dy3, dy3, FS::FNMulAdd( dx3, dx3, float32v( kFalloffRadiusSquared ) ) ) ) );
+        float32v falloff4 = falloff0 + FS::FMulAdd( unskewDelta, // algebraically kFalloffRadiusSquared minus the squared dx4..dw4, expanded in terms of falloff0 and unskewDelta
+            float32v( -4.0 * ( kRoot5 + 3.0 ) / ( kRoot5 + 5.0 ) ),
+            float32v( -4.0 / 5.0 ) );
+
+        falloff0 = FS::Max( falloff0, float32v( 0 ) ); // negative falloff is clamped so that vertex contributes nothing
+        falloff1 = FS::Max( falloff1, float32v( 0 ) );
+        falloff2 = FS::Max( falloff2, float32v( 0 ) );
+        falloff3 = FS::Max( falloff3, float32v( 0 ) );
+        falloff4 = FS::Max( falloff4, float32v( 0 ) );
+
+        falloff0 *= falloff0; falloff0 *= falloff0; // two squarings: falloff to the 4th power
+        falloff1 *= falloff1; falloff1 *= falloff1;
+        falloff2 *= falloff2; falloff2 *= falloff2;
+        falloff3 *= falloff3; falloff3 *= falloff3;
+        falloff4 *= falloff4; falloff4 *= falloff4;
+
+        float32v gradientRampValue0 = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimedBase ), dx0, dy0, dz0, dw0 );
+        float32v gradientRampValue1 = GetGradientDotPerlin( HashPrimes( seed, // hash inputs step by one prime on the same axes the matching dx1..dw1 stepped on
+            FS::MaskedAdd( maskX1, xPrimedBase, int32v( Primes::X ) ),
+            FS::MaskedAdd( maskY1, yPrimedBase, int32v( Primes::Y ) ),
+            FS::MaskedAdd( maskZ1, zPrimedBase, int32v( Primes::Z ) ),
+            FS::MaskedAdd( maskW1, wPrimedBase, int32v( Primes::W ) ) ), dx1, dy1, dz1, dw1 );
+        float32v gradientRampValue2 = GetGradientDotPerlin( HashPrimes( seed,
+            FS::MaskedAdd( maskX2, xPrimedBase, int32v( Primes::X ) ),
+            FS::MaskedAdd( maskY2, yPrimedBase, int32v( Primes::Y ) ),
+            FS::MaskedAdd( maskZ2, zPrimedBase, int32v( Primes::Z ) ),
+            FS::MaskedAdd( maskW2, wPrimedBase, int32v( Primes::W ) ) ), dx2, dy2, dz2, dw2 );
+        float32v gradientRampValue3 = GetGradientDotPerlin( HashPrimes( seed,
+            FS::MaskedAdd( maskX3, xPrimedBase, int32v( Primes::X ) ),
+            FS::MaskedAdd( maskY3, yPrimedBase, int32v( Primes::Y ) ),
+            FS::MaskedAdd( maskZ3, zPrimedBase, int32v( Primes::Z ) ),
+            FS::MaskedAdd( maskW3, wPrimedBase, int32v( Primes::W ) ) ), dx3, dy3, dz3, dw3 );
+        float32v gradientRampValue4 = GetGradientDotPerlin( HashPrimes( seed,
+            xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ), zPrimedBase + int32v( Primes::Z ), wPrimedBase + int32v( Primes::W ) ),
+            dx4, dy4, dz4, dw4 );
+
+        constexpr double kBounding = 33.653125584827855; // only used to form the -1 / kBounding and 1 / kBounding arguments of ScaleOutput
+
+        return this->ScaleOutput( FS::FMulAdd( gradientRampValue0, falloff0, FS::FMulAdd( gradientRampValue1, falloff1, FS::FMulAdd( gradientRampValue2, falloff2, FS::FMulAdd( gradientRampValue3, falloff3, gradientRampValue4 * falloff4 ) ) ) ),
+            -1 / kBounding, 1 / kBounding );
+    }
 
-        mask32v i1 = x0 > y0;
-        //mask32v j1 = ~i1; //NMasked funcs
+};
 
-        float32v x1 = FS_MaskedSub_f32( x0, float32v( 1.f ), i1 ) + float32v( G2 );
-        float32v y1 = FS_NMaskedSub_f32( y0, float32v( 1.f ), i1 ) + float32v( G2 );
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::SuperSimplex, SIMD> final : public virtual FastNoise::SuperSimplex, public FastSIMD::DispatchClass<FastNoise::VariableRange<ScalableGenerator>, SIMD>
+{
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const // 2D SuperSimplex sample: accumulates four vertex terms (two fixed corners plus two selected neighbours) into value
+    {
+        this->ScalePositions( x, y ); // helper defined outside this chunk; presumably rescales the coordinates in place - TODO confirm
+
+        constexpr double kRoot3 = 1.7320508075688772935274463415059;
+        constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 ); // algebraically ( sqrt(3) - 1 ) / 2
+        constexpr double kUnskew2 = -1.0 / ( kRoot3 + 3.0 ); // algebraically -( 3 - sqrt(3) ) / 6; note the negative sign
+        constexpr double kFalloffRadiusSquared = 2.0 / 3.0; // value each falloff starts from before the squared offsets are subtracted
+
+        float32v skewDelta = float32v( kSkew2 ) * ( x + y ); // one shared term, added to both axes before flooring
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v xSkewedBase = FS::Floor( xSkewed ); // base corner of the skewed cell, still held as float
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase; // position inside the skewed cell
+        float32v dySkewed = ySkewed - ySkewedBase;
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X ); // base corner multiplied by the per-axis prime that is fed to HashPrimes
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+
+        mask32v forwardXY = dxSkewed + dySkewed > float32v( 1.0f ); // set when the point lies past the dx + dy = 1 diagonal of the skewed cell
+        float32v boundaryXY = FS::Masked( forwardXY, float32v( -1.0f ) ); // presumably -1 where forwardXY is set and 0 elsewhere - TODO confirm FS::Masked semantics
+        mask32v forwardX = FS::FMulAdd( dxSkewed, float32v( -2.0f ), dySkewed ) < boundaryXY; // tests dy - 2 * dx against the boundary
+        mask32v forwardY = FS::FMulAdd( dySkewed, float32v( -2.0f ), dxSkewed ) < boundaryXY; // tests dx - 2 * dy against the boundary
+
+        float32v unskewDelta = float32v( kUnskew2 ) * ( dxSkewed + dySkewed );
+        float32v dxBase = dxSkewed + unskewDelta; // offset from the base corner; shifted by xyDelta further down before the last two vertices
+        float32v dyBase = dySkewed + unskewDelta;
+
+        float32v falloffBase0, value; // falloffBase0 is reused by the <1, 1> vertex; value accumulates every vertex term
+
+        // Vertex <0, 0>
+        {
+            int32v hash = HashPrimes( seed, xPrimedBase, yPrimedBase );
+            float32v gradientRampValue = GetGradientDotPerlin( hash, dxBase, dyBase );
+            falloffBase0 = FS::FNMulAdd( dxBase, dxBase, FS::FNMulAdd( dyBase, dyBase, float32v( kFalloffRadiusSquared ) ) );
+            float32v falloff = falloffBase0; falloff *= falloff; falloff *= falloff; // 4th power; no clamp to zero is applied for this vertex
+            value = falloff * gradientRampValue;
+        }
 
-        float32v x2 = x0 + float32v( G2 * 2 - 1 );
-        float32v y2 = y0 + float32v( G2 * 2 - 1 );
+        // Vertex <1, 1>
+        {
+            int32v hash = HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ) );
+            float32v gradientRampValue = GetGradientDotPerlin( hash, dxBase - float32v( 2 * kUnskew2 + 1 ), dyBase - float32v( 2 * kUnskew2 + 1 ) );
+            float32v falloff = FS::FMulAdd( unskewDelta, // falloff derived from falloffBase0 and unskewDelta rather than from squared offsets
+                float32v( -4.0 * ( kRoot3 + 2.0 ) / ( kRoot3 + 3.0 ) ),
+                falloffBase0 - float32v( kFalloffRadiusSquared ) );
+            falloff *= falloff; falloff *= falloff; // 4th power; no clamp to zero is applied for this vertex
+            value = FS::FMulAdd( falloff, gradientRampValue, value );
+        }
 
-        float32v t0 = FS_FNMulAdd_f32( x0, x0, FS_FNMulAdd_f32( y0, y0, float32v( 0.5f ) ) );
-        float32v t1 = FS_FNMulAdd_f32( x1, x1, FS_FNMulAdd_f32( y1, y1, float32v( 0.5f ) ) );
-        float32v t2 = FS_FNMulAdd_f32( x2, x2, FS_FNMulAdd_f32( y2, y2, float32v( 0.5f ) ) );
+        float32v xyDelta = FS::Select( forwardXY, float32v( kUnskew2 + 1 ), float32v( -kUnskew2 ) ); // common shift applied to both base offsets for the two selected vertices below
+        dxBase -= xyDelta;
+        dyBase -= xyDelta;
 
-        t0 = FS_Max_f32( t0, float32v( 0 ) );
-        t1 = FS_Max_f32( t1, float32v( 0 ) );
-        t2 = FS_Max_f32( t2, float32v( 0 ) );
+        // Vertex <1, 0> or <-1, 0> when forwardXY is clear, <2, 1> or <0, 1> when it is set
+        {
+            int32v hash = HashPrimes( seed,
+                FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardX, xPrimedBase, int32v( Primes::X * 2 ) ), int32v( Primes::X ) ),
+                FS::MaskedAdd( forwardXY, yPrimedBase, int32v( Primes::Y ) ) );
+            float32v dx = dxBase - FS::Select( forwardX, float32v( 1 + 2 * kUnskew2 ), float32v( -1 ) );
+            float32v dy = FS::MaskedSub( forwardX, dyBase, float32v( 2 * kUnskew2 ) );
+            float32v gradientRampValue = GetGradientDotPerlin( hash, dx, dy );
+            float32v falloff = FS::Max( FS::FNMulAdd( dx, dx, FS::FNMulAdd( dy, dy, float32v( kFalloffRadiusSquared ) ) ), float32v( 0 ) ); // clamped: this vertex can lie outside the falloff radius
+            falloff *= falloff; falloff *= falloff;
+            value = FS::FMulAdd( falloff, gradientRampValue, value );
+        }
 
-        t0 *= t0; t0 *= t0;
-        t1 *= t1; t1 *= t1;
-        t2 *= t2; t2 *= t2;
+        // Vertex <0, 1> or <0, -1> when forwardXY is clear, <1, 2> or <1, 0> when it is set
+        {
+            int32v hash = HashPrimes( seed,
+                FS::MaskedAdd( forwardXY, xPrimedBase, int32v( Primes::X ) ),
+                FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardY, yPrimedBase, int32v( (int32_t)( Primes::Y * 2LL ) ) ), int32v( Primes::Y ) ) ); // 2LL widens before the cast, presumably because Primes::Y * 2 would not fit in int32 - TODO confirm; the X twin above multiplies without widening
+            float32v dx = FS::MaskedSub( forwardY, dxBase, float32v( 2 * kUnskew2 ) );
+            float32v dy = dyBase - FS::Select( forwardY, float32v( 1 + 2 * kUnskew2 ), float32v( -1 ) );
+            float32v gradientRampValue = GetGradientDotPerlin( hash, dx, dy );
+            float32v falloff = FS::Max( FS::FNMulAdd( dx, dx, FS::FNMulAdd( dy, dy, float32v( kFalloffRadiusSquared ) ) ), float32v( 0 ) ); // clamped: this vertex can lie outside the falloff radius
+            falloff *= falloff; falloff *= falloff;
+            value = FS::FMulAdd( falloff, gradientRampValue, value );
+        }
 
-        float32v n0 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, i, j ), x0, y0 );
-        float32v n1 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, FS_MaskedAdd_i32( i, int32v( FnPrimes::X ), i1 ), FS_NMaskedAdd_i32( j, int32v( FnPrimes::Y ), i1 ) ), x1, y1 );
-        float32v n2 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, i + int32v( FnPrimes::X ), j + int32v( FnPrimes::Y ) ), x2, y2 );
+        constexpr double kBounding = 9.28993664146183; // only used to form the -1 / kBounding and 1 / kBounding arguments of ScaleOutput
 
-        return float32v( 38.283687591552734375f ) * FS_FMulAdd_f32( n0, t0, FS_FMulAdd_f32( n1, t1, n2 * t2 ) );
+        return this->ScaleOutput( value, -1 / kBounding, 1 / kBounding );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const
     {
-        const float F3 = 1.0f / 3.0f;
-        const float G3 = 1.0f / 2.0f;
-
-        float32v s = float32v( F3 ) * (x + y + z);
-        x += s;
-        y += s;
-        z += s;
-
-        float32v x0 = FS_Floor_f32( x );
-        float32v y0 = FS_Floor_f32( y );
-        float32v z0 = FS_Floor_f32( z );
-        float32v xi = x - x0;
-        float32v yi = y - y0;
-        float32v zi = z - z0;
-
-        int32v i = FS_Convertf32_i32( x0 ) * int32v( FnPrimes::X );
-        int32v j = FS_Convertf32_i32( y0 ) * int32v( FnPrimes::Y );
-        int32v k = FS_Convertf32_i32( z0 ) * int32v( FnPrimes::Z );
-
-        mask32v x_ge_y = xi >= yi;
-        mask32v y_ge_z = yi >= zi;
-        mask32v x_ge_z = xi >= zi;
-
-        float32v g = float32v( G3 ) * (xi + yi + zi);
-        x0 = xi - g;
-        y0 = yi - g;
-        z0 = zi - g;
-
-        mask32v i1 = x_ge_y & x_ge_z;
-        mask32v j1 = FS_BitwiseAndNot_m32( y_ge_z, x_ge_y );
-        mask32v k1 = FS_BitwiseAndNot_m32( ~x_ge_z, y_ge_z );
-
-        mask32v i2 = x_ge_y | x_ge_z;
-        mask32v j2 = ~x_ge_y | y_ge_z;
-        mask32v k2 = x_ge_z & y_ge_z; //NMasked
-
-        float32v x1 = FS_MaskedSub_f32( x0, float32v( 1 ), i1 ) + float32v( G3 );
-        float32v y1 = FS_MaskedSub_f32( y0, float32v( 1 ), j1 ) + float32v( G3 );
-        float32v z1 = FS_MaskedSub_f32( z0, float32v( 1 ), k1 ) + float32v( G3 );
-        float32v x2 = FS_MaskedSub_f32( x0, float32v( 1 ), i2 ) + float32v( G3 * 2 );
-        float32v y2 = FS_MaskedSub_f32( y0, float32v( 1 ), j2 ) + float32v( G3 * 2 );
-        float32v z2 = FS_NMaskedSub_f32( z0, float32v( 1 ), k2 ) + float32v( G3 * 2 );
-        float32v x3 = x0 + float32v( G3 * 3 - 1 );
-        float32v y3 = y0 + float32v( G3 * 3 - 1 );
-        float32v z3 = z0 + float32v( G3 * 3 - 1 );
-
-        float32v t0 = FS_FNMulAdd_f32( x0, x0, FS_FNMulAdd_f32( y0, y0, FS_FNMulAdd_f32( z0, z0, float32v( 0.6f ) ) ) );
-        float32v t1 = FS_FNMulAdd_f32( x1, x1, FS_FNMulAdd_f32( y1, y1, FS_FNMulAdd_f32( z1, z1, float32v( 0.6f ) ) ) );
-        float32v t2 = FS_FNMulAdd_f32( x2, x2, FS_FNMulAdd_f32( y2, y2, FS_FNMulAdd_f32( z2, z2, float32v( 0.6f ) ) ) );
-        float32v t3 = FS_FNMulAdd_f32( x3, x3, FS_FNMulAdd_f32( y3, y3, FS_FNMulAdd_f32( z3, z3, float32v( 0.6f ) ) ) );
-
-        t0 = FS_Max_f32( t0, float32v( 0 ) );
-        t1 = FS_Max_f32( t1, float32v( 0 ) );
-        t2 = FS_Max_f32( t2, float32v( 0 ) );
-        t3 = FS_Max_f32( t3, float32v( 0 ) );
-
-        t0 *= t0; t0 *= t0;
-        t1 *= t1; t1 *= t1;
-        t2 *= t2; t2 *= t2;
-        t3 *= t3; t3 *= t3;             
-
-        float32v n0 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, i, j, k ), x0, y0, z0 );
-        float32v n1 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, FS_MaskedAdd_i32( i, int32v( FnPrimes::X ), i1 ), FS_MaskedAdd_i32( j, int32v( FnPrimes::Y ), j1 ), FS_MaskedAdd_i32( k, int32v( FnPrimes::Z ), k1 ) ), x1, y1, z1 );
-        float32v n2 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, FS_MaskedAdd_i32( i, int32v( FnPrimes::X ), i2 ), FS_MaskedAdd_i32( j, int32v( FnPrimes::Y ), j2 ), FS_NMaskedAdd_i32( k, int32v( FnPrimes::Z ), k2 ) ), x2, y2, z2 );
-        float32v n3 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, i + int32v( FnPrimes::X ), j + int32v( FnPrimes::Y ), k + int32v( FnPrimes::Z ) ), x3, y3, z3 );
-
-        return float32v( 32.69428253173828125f ) * FS_FMulAdd_f32( n0, t0, FS_FMulAdd_f32( n1, t1, FS_FMulAdd_f32( n2, t2, n3 * t3 ) ) );
-    }
+        this->ScalePositions( x, y, z );
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const final
-    {
-        const float SQRT5 = 2.236067977499f;
-        const float F4 = (SQRT5 - 1.0f) / 4.0f;
-        const float G4 = (5.0f - SQRT5) / 20.0f;
-
-        float32v s = float32v( F4 ) * (x + y + z + w);
-        x += s;
-        y += s;
-        z += s;
-        w += s;
-
-        float32v x0 = FS_Floor_f32( x );
-        float32v y0 = FS_Floor_f32( y );
-        float32v z0 = FS_Floor_f32( z );
-        float32v w0 = FS_Floor_f32( w );
-        float32v xi = x - x0;
-        float32v yi = y - y0;
-        float32v zi = z - z0;
-        float32v wi = w - w0;
-
-        int32v i = FS_Convertf32_i32( x0 ) * int32v( FnPrimes::X );
-        int32v j = FS_Convertf32_i32( y0 ) * int32v( FnPrimes::Y );
-        int32v k = FS_Convertf32_i32( z0 ) * int32v( FnPrimes::Z );
-        int32v l = FS_Convertf32_i32( w0 ) * int32v( FnPrimes::W );
-
-        float32v g = float32v( G4 ) * (xi + yi + zi + wi);
-        x0 = xi - g;
-        y0 = yi - g;
-        z0 = zi - g;
-        w0 = wi - g;
-
-        int32v rankx( 0 );
-        int32v ranky( 0 );
-        int32v rankz( 0 );
-        int32v rankw( 0 );
-
-        mask32v x_ge_y = x0 >= y0;
-        rankx = FS_MaskedIncrement_i32( rankx, x_ge_y );
-        ranky = FS_MaskedIncrement_i32( ranky, ~x_ge_y );
-
-        mask32v x_ge_z = x0 >= z0;
-        rankx = FS_MaskedIncrement_i32( rankx, x_ge_z );
-        rankz = FS_MaskedIncrement_i32( rankz, ~x_ge_z );
-
-        mask32v x_ge_w = x0 >= w0;
-        rankx = FS_MaskedIncrement_i32( rankx, x_ge_w );
-        rankw = FS_MaskedIncrement_i32( rankw, ~x_ge_w );
-
-        mask32v y_ge_z = y0 >= z0;
-        ranky = FS_MaskedIncrement_i32( ranky, y_ge_z );
-        rankz = FS_MaskedIncrement_i32( rankz, ~y_ge_z );
-
-        mask32v y_ge_w = y0 >= w0;
-        ranky = FS_MaskedIncrement_i32( ranky, y_ge_w );
-        rankw = FS_MaskedIncrement_i32( rankw, ~y_ge_w );
-
-        mask32v z_ge_w = z0 >= w0;
-        rankz = FS_MaskedIncrement_i32( rankz, z_ge_w );
-        rankw = FS_MaskedIncrement_i32( rankw, ~z_ge_w );
-
-        mask32v i1 = rankx > int32v( 2 );
-        mask32v j1 = ranky > int32v( 2 );
-        mask32v k1 = rankz > int32v( 2 );
-        mask32v l1 = rankw > int32v( 2 );
-
-        mask32v i2 = rankx > int32v( 1 );
-        mask32v j2 = ranky > int32v( 1 );
-        mask32v k2 = rankz > int32v( 1 );
-        mask32v l2 = rankw > int32v( 1 );
-
-        mask32v i3 = rankx > int32v( 0 );
-        mask32v j3 = ranky > int32v( 0 );
-        mask32v k3 = rankz > int32v( 0 );
-        mask32v l3 = rankw > int32v( 0 );
-
-        float32v x1 = FS_MaskedSub_f32( x0, float32v( 1 ), i1 ) + float32v( G4 );
-        float32v y1 = FS_MaskedSub_f32( y0, float32v( 1 ), j1 ) + float32v( G4 );
-        float32v z1 = FS_MaskedSub_f32( z0, float32v( 1 ), k1 ) + float32v( G4 );
-        float32v w1 = FS_MaskedSub_f32( w0, float32v( 1 ), l1 ) + float32v( G4 );
-        float32v x2 = FS_MaskedSub_f32( x0, float32v( 1 ), i2 ) + float32v( G4 * 2 );
-        float32v y2 = FS_MaskedSub_f32( y0, float32v( 1 ), j2 ) + float32v( G4 * 2 );
-        float32v z2 = FS_MaskedSub_f32( z0, float32v( 1 ), k2 ) + float32v( G4 * 2 );
-        float32v w2 = FS_MaskedSub_f32( w0, float32v( 1 ), l2 ) + float32v( G4 * 2 );
-        float32v x3 = FS_MaskedSub_f32( x0, float32v( 1 ), i3 ) + float32v( G4 * 3 );
-        float32v y3 = FS_MaskedSub_f32( y0, float32v( 1 ), j3 ) + float32v( G4 * 3 );
-        float32v z3 = FS_MaskedSub_f32( z0, float32v( 1 ), k3 ) + float32v( G4 * 3 );
-        float32v w3 = FS_MaskedSub_f32( w0, float32v( 1 ), l3 ) + float32v( G4 * 3 );
-        float32v x4 = x0 + float32v( G4 * 4 - 1 );
-        float32v y4 = y0 + float32v( G4 * 4 - 1 );
-        float32v z4 = z0 + float32v( G4 * 4 - 1 );
-        float32v w4 = w0 + float32v( G4 * 4 - 1 );
-
-        float32v t0 = FS_FNMulAdd_f32( x0, x0, FS_FNMulAdd_f32( y0, y0, FS_FNMulAdd_f32( z0, z0, FS_FNMulAdd_f32( w0, w0, float32v( 0.6f ) ) ) ) );
-        float32v t1 = FS_FNMulAdd_f32( x1, x1, FS_FNMulAdd_f32( y1, y1, FS_FNMulAdd_f32( z1, z1, FS_FNMulAdd_f32( w1, w1, float32v( 0.6f ) ) ) ) );
-        float32v t2 = FS_FNMulAdd_f32( x2, x2, FS_FNMulAdd_f32( y2, y2, FS_FNMulAdd_f32( z2, z2, FS_FNMulAdd_f32( w2, w2, float32v( 0.6f ) ) ) ) );
-        float32v t3 = FS_FNMulAdd_f32( x3, x3, FS_FNMulAdd_f32( y3, y3, FS_FNMulAdd_f32( z3, z3, FS_FNMulAdd_f32( w3, w3, float32v( 0.6f ) ) ) ) );
-        float32v t4 = FS_FNMulAdd_f32( x4, x4, FS_FNMulAdd_f32( y4, y4, FS_FNMulAdd_f32( z4, z4, FS_FNMulAdd_f32( w4, w4, float32v( 0.6f ) ) ) ) );
-
-        t0 = FS_Max_f32( t0, float32v( 0 ) );
-        t1 = FS_Max_f32( t1, float32v( 0 ) );
-        t2 = FS_Max_f32( t2, float32v( 0 ) );
-        t3 = FS_Max_f32( t3, float32v( 0 ) );
-        t4 = FS_Max_f32( t4, float32v( 0 ) );
-
-        t0 *= t0; t0 *= t0;
-        t1 *= t1; t1 *= t1;
-        t2 *= t2; t2 *= t2;
-        t3 *= t3; t3 *= t3;
-        t4 *= t4; t4 *= t4;
-
-        float32v n0 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, i, j, k, l ), x0, y0, z0, w0 );
-        float32v n1 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, 
-            FS_MaskedAdd_i32( i, int32v( FnPrimes::X ), i1 ),
-            FS_MaskedAdd_i32( j, int32v( FnPrimes::Y ), j1 ),
-            FS_MaskedAdd_i32( k, int32v( FnPrimes::Z ), k1 ),
-            FS_MaskedAdd_i32( l, int32v( FnPrimes::W ), l1 ) ), x1, y1, z1, w1 );
-        float32v n2 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, 
-            FS_MaskedAdd_i32( i, int32v( FnPrimes::X ), i2 ),
-            FS_MaskedAdd_i32( j, int32v( FnPrimes::Y ), j2 ),
-            FS_MaskedAdd_i32( k, int32v( FnPrimes::Z ), k2 ),
-            FS_MaskedAdd_i32( l, int32v( FnPrimes::W ), l2 ) ), x2, y2, z2, w2 );
-        float32v n3 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed,
-            FS_MaskedAdd_i32( i, int32v( FnPrimes::X ), i3 ),
-            FS_MaskedAdd_i32( j, int32v( FnPrimes::Y ), j3 ),
-            FS_MaskedAdd_i32( k, int32v( FnPrimes::Z ), k3 ),
-            FS_MaskedAdd_i32( l, int32v( FnPrimes::W ), l3 ) ), x3, y3, z3, w3 );
-        float32v n4 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, i + int32v( FnPrimes::X ), j + int32v( FnPrimes::Y ), k + int32v( FnPrimes::Z ), l + int32v( FnPrimes::W ) ), x4, y4, z4, w4 );
-
-        return float32v( 27.f ) * FS_FMulAdd_f32( n0, t0, FS_FMulAdd_f32( n1, t1, FS_FMulAdd_f32( n2, t2, FS_FMulAdd_f32( n3, t3, n4 * t4 ) ) ) );
-    }
-};
+        constexpr double kSkew3 = 1.0 / 3.0; // scales ( x + y + z ) into the single delta added to every input axis
+        constexpr double kReflectUnskew3 = -1.0 / 2.0; // scales the skewed-offset sum into the delta that produces dxBase / dyBase / dzBase
+        constexpr double kTwiceUnskew3 = -1.0 / 4.0; // scales the skewed-offset sum for the "double unskew" that produces xNormal / yNormal / zNormal
 
-template<typename FS>
-class FS_T<FastNoise::OpenSimplex2, FS> : public virtual FastNoise::OpenSimplex2, public FS_T<FastNoise::Generator, FS>
-{
-    FASTSIMD_DECLARE_FS_TYPES;
+        constexpr double kDistanceSquaredA = 3.0 / 4.0; // half of this is subtracted from the base falloff to form falloffBaseStemA
+        constexpr double kDistanceSquaredB = 1.0; // half of this is subtracted from the base falloff to form falloffBaseStemB
+        constexpr double kFalloffRadiusSquared = kDistanceSquaredA; // starting value the squared base offsets are subtracted from in the <0, 0, 0> falloff
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
-    {
-        const float SQRT3 = 1.7320508075f;
-        const float F2 = 0.5f * (SQRT3 - 1.0f);
-        const float G2 = (3.0f - SQRT3) / 6.0f;
+        float32v skewDelta = float32v( kSkew3 ) * ( x + y + z ); // one shared delta, added to all three axes below
 
-        float32v f = float32v( F2 ) * (x + y);
-        float32v x0 = FS_Floor_f32( x + f );
-        float32v y0 = FS_Floor_f32( y + f );
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v zSkewed = z + skewDelta;
+        float32v xSkewedBase = FS::Floor( xSkewed ); // floor of each skewed coordinate: base corner of the containing unit cell
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v zSkewedBase = FS::Floor( zSkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase; // offset of the point from that base corner, still in skewed coordinates
+        float32v dySkewed = ySkewed - ySkewedBase;
+        float32v dzSkewed = zSkewed - zSkewedBase;
 
-        int32v i = FS_Convertf32_i32( x0 ) * int32v( FnPrimes::X );
-        int32v j = FS_Convertf32_i32( y0 ) * int32v( FnPrimes::Y );
+        // From unit cell base, find closest vertex, then re-centre the base and the skewed offsets on it
+        {
+            // Perform a double unskew to get the vector whose dot product with skewed vectors produces the unskewed result.
+            float32v twiceUnskewDelta = float32v( kTwiceUnskew3 ) * ( dxSkewed + dySkewed + dzSkewed );
+            float32v xNormal = dxSkewed + twiceUnskewDelta;
+            float32v yNormal = dySkewed + twiceUnskewDelta;
+            float32v zNormal = dzSkewed + twiceUnskewDelta;
+            float32v xyzNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal
+
+            // Using those, compare scores to determine which vertex is closest. considerVertex replaces moveMaskBits with bits in lanes where score > maxScore, then raises maxScore to the larger value.
+            constexpr auto considerVertex = [] ( float32v& maxScore, int32v& moveMaskBits, float32v score, int32v bits ) constexpr
+            {
+                moveMaskBits = FS::Select( score > maxScore, bits, moveMaskBits );
+                maxScore = FS::Max( maxScore, score );
+            };
+            float32v maxScore = float32v( 0.375f ); // a candidate must score above this before any move is selected
+            int32v moveMaskBits = FS::Masked( xyzNormal > maxScore, int32v( -1 ) ); // -1 has all three axis bits set; presumably zero (no move) in the other lanes - confirm FS::Masked
+            maxScore = FS::Max( maxScore, xyzNormal );
+            considerVertex( maxScore, moveMaskBits, xNormal, 0b001 ); // bit 0 = X, bit 1 = Y, bit 2 = Z, as decoded into moveX / moveY / moveZ below
+            considerVertex( maxScore, moveMaskBits, yNormal, 0b010 );
+            considerVertex( maxScore, moveMaskBits, zNormal, 0b100 );
+            maxScore += float32v( 0.125f ) - xyzNormal; // rebase the running best before the two-axis candidates are scored with the negated normals
+            considerVertex( maxScore, moveMaskBits, -zNormal, 0b011 );
+            considerVertex( maxScore, moveMaskBits, -yNormal, 0b101 );
+            considerVertex( maxScore, moveMaskBits, -xNormal, 0b110 );
+
+            mask32v moveX = ( moveMaskBits & int32v( 0b001 ) ) != int32v( 0 );
+            mask32v moveY = ( moveMaskBits & int32v( 0b010 ) ) != int32v( 0 );
+            mask32v moveZ = ( moveMaskBits & int32v( 0b100 ) ) != int32v( 0 );
+
+            xSkewedBase = FS::MaskedIncrement( moveX, xSkewedBase ); // presumably +1 in masked lanes: steps the base onto the chosen vertex
+            ySkewedBase = FS::MaskedIncrement( moveY, ySkewedBase );
+            zSkewedBase = FS::MaskedIncrement( moveZ, zSkewedBase );
+
+            dxSkewed = FS::MaskedDecrement( moveX, dxSkewed ); // presumably -1 in the same lanes, keeping the offsets relative to the moved base
+            dySkewed = FS::MaskedDecrement( moveY, dySkewed );
+            dzSkewed = FS::MaskedDecrement( moveZ, dzSkewed );
+        }
 
-        float32v g = float32v( G2 ) * (x0 + y0);
-        x0 = x - (x0 - g);
-        y0 = y - (y0 - g);
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X ); // integer base coordinate pre-multiplied by its per-axis prime, as passed to HashPrimes
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        int32v zPrimedBase = FS::Convert<int32_t>( zSkewedBase ) * int32v( Primes::Z );
 
-        mask32v i1 = x0 > y0;
-        //mask32v j1 = ~i1; //NMasked funcs
+        float32v skewedCoordinateSum = dxSkewed + dySkewed + dzSkewed;
+        float32v twiceUnskewDelta = float32v( kTwiceUnskew3 ) * skewedCoordinateSum; // normals are rebuilt here from the offsets after they were re-centred on the closest vertex
+        float32v xNormal = dxSkewed + twiceUnskewDelta;
+        float32v yNormal = dySkewed + twiceUnskewDelta;
+        float32v zNormal = dzSkewed + twiceUnskewDelta;
+        float32v xyzNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal
 
-        float32v x1 = FS_MaskedSub_f32( x0, float32v( 1.f ), i1 ) + float32v( G2 );
-        float32v y1 = FS_NMaskedSub_f32( y0, float32v( 1.f ), i1 ) + float32v( G2 );
-        float32v x2 = x0 + float32v( (G2 * 2) - 1 );
-        float32v y2 = y0 + float32v( (G2 * 2) - 1 );
+        float32v unskewDelta = float32v( kReflectUnskew3 ) * skewedCoordinateSum;
+        float32v dxBase = dxSkewed + unskewDelta; // offset from the chosen vertex after unskewing; input to every gradient and falloff term below
+        float32v dyBase = dySkewed + unskewDelta;
+        float32v dzBase = dzSkewed + unskewDelta;
 
-        float32v t0 = float32v( 0.5f ) - (x0 * x0) - (y0 * y0);
-        float32v t1 = float32v( 0.5f ) - (x1 * x1) - (y1 * y1);
-        float32v t2 = float32v( 0.5f ) - (x2 * x2) - (y2 * y2);
+        float32v coordinateSum = float32v( 1 + 3 * kReflectUnskew3 ) * skewedCoordinateSum; // dxBase + dyBase + dzBase
 
-        t0 = FS_Max_f32( t0, float32v( 0 ) );
-        t1 = FS_Max_f32( t1, float32v( 0 ) );
-        t2 = FS_Max_f32( t2, float32v( 0 ) );
+        // Vertex <0, 0, 0>: initialises value and the two falloff stems that every later vertex term reuses
+        float32v value, falloffBaseStemA, falloffBaseStemB;
+        {
+            float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase ), dxBase, dyBase, dzBase );
+            float32v falloffBase = FS::FNMulAdd( dzBase, dzBase, FS::FNMulAdd( dyBase, dyBase, FS::FNMulAdd( dxBase, dxBase, float32v( kFalloffRadiusSquared ) ) ) ) * float32v( 0.5f ); // presumably ( kFalloffRadiusSquared - |d|^2 ) / 2, assuming FNMulAdd( a, b, c ) = c - a * b - confirm
+            falloffBaseStemA = falloffBase - float32v( kDistanceSquaredA * 0.5 );
+            falloffBaseStemB = falloffBase - float32v( kDistanceSquaredB * 0.5 );
+            value = ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ) * gradientRampValue; // falloffBase^4 * gradient; unlike the later terms, no clamp is applied here
+        }
 
-        t0 *= t0; t0 *= t0;
-        t1 *= t1; t1 *= t1;
-        t2 *= t2; t2 *= t2;
+        // Vertex <1, 1, 1> or <-1, -1, -1>: all three lattice coordinates step together by +/- one prime
+        {
+            mask32v signMask = xyzNormal < float32v( 0 ); // lanes that take the negative vertex
 
-        float32v n0 = FnUtils::GetGradientDotFancy( FnUtils::HashPrimes( seed, i, j ), x0, y0 );
-        float32v n1 = FnUtils::GetGradientDotFancy( FnUtils::HashPrimes( seed, FS_MaskedAdd_i32( i, int32v( FnPrimes::X ), i1 ), FS_NMaskedAdd_i32( j, int32v( FnPrimes::Y ), i1 ) ), x1, y1 );
-        float32v n2 = FnUtils::GetGradientDotFancy( FnUtils::HashPrimes( seed, i + int32v( FnPrimes::X ), j + int32v( FnPrimes::Y ) ), x2, y2 );
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
 
-        return float32v( 49.918426513671875f ) * FS_FMulAdd_f32( n0, t0, FS_FMulAdd_f32( n1, t1, n2 * t2 ) );
-    }
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) ); // bit 31 only, in signMask lanes; presumably the float sign bit so "^ sign" negates there - confirm FS::Cast is a bit cast
+            float32v offset = float32v( 3 * kReflectUnskew3 + 1 ) ^ sign; // the same offset is subtracted from all three base offsets below
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
-    {
-        float32v f = float32v( 2.0f / 3.0f ) * (x + y + z);
-        float32v xr = f - x;
-        float32v yr = f - y;
-        float32v zr = f - z;
+            float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimed, yPrimed, zPrimed ), dxBase - offset, dyBase - offset, dzBase - offset );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset, coordinateSum, falloffBaseStemA ), float32v( 0.0f ) ); // clamped at zero so the term below adds nothing when the falloff is non-positive
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
 
-        float32v val( 0 );
-        for( size_t i = 0; ; i++ )
+        // Vertex <1, 1, 0> or <-1, -1, 0>
         {
-            float32v v0xr = FS_Round_f32( xr );
-            float32v v0yr = FS_Round_f32( yr );
-            float32v v0zr = FS_Round_f32( zr );
-            float32v d0xr = xr - v0xr;
-            float32v d0yr = yr - v0yr;
-            float32v d0zr = zr - v0zr;
-
-            float32v score0xr = FS_Abs_f32( d0xr );
-            float32v score0yr = FS_Abs_f32( d0yr );
-            float32v score0zr = FS_Abs_f32( d0zr );
-            mask32v dir0xr = FS_Max_f32( score0yr, score0zr ) <= score0xr;
-            mask32v dir0yr = FS_BitwiseAndNot_m32( FS_Max_f32( score0zr, score0xr ) <= score0yr, dir0xr );
-            mask32v dir0zr = ~(dir0xr | dir0yr);
-            float32v v1xr = FS_MaskedAdd_f32( v0xr, float32v( 1.0f ) | ( float32v( -1.0f ) & d0xr ), dir0xr );
-            float32v v1yr = FS_MaskedAdd_f32( v0yr, float32v( 1.0f ) | ( float32v( -1.0f ) & d0yr ), dir0yr );
-            float32v v1zr = FS_MaskedAdd_f32( v0zr, float32v( 1.0f ) | ( float32v( -1.0f ) & d0zr ), dir0zr );
-            float32v d1xr = xr - v1xr;
-            float32v d1yr = yr - v1yr;
-            float32v d1zr = zr - v1zr;
-
-            int32v hv0xr = FS_Convertf32_i32( v0xr ) * int32v( FnPrimes::X );
-            int32v hv0yr = FS_Convertf32_i32( v0yr ) * int32v( FnPrimes::Y );
-            int32v hv0zr = FS_Convertf32_i32( v0zr ) * int32v( FnPrimes::Z );
-
-            int32v hv1xr = FS_Convertf32_i32( v1xr ) * int32v( FnPrimes::X );
-            int32v hv1yr = FS_Convertf32_i32( v1yr ) * int32v( FnPrimes::Y );
-            int32v hv1zr = FS_Convertf32_i32( v1zr ) * int32v( FnPrimes::Z );
-
-            float32v t0 = FS_FNMulAdd_f32( d0zr, d0zr, FS_FNMulAdd_f32( d0yr, d0yr, FS_FNMulAdd_f32( d0xr, d0xr, float32v( 0.6f ) ) ) );
-            float32v t1 = FS_FNMulAdd_f32( d1zr, d1zr, FS_FNMulAdd_f32( d1yr, d1yr, FS_FNMulAdd_f32( d1xr, d1xr, float32v( 0.6f ) ) ) );
-            t0 = FS_Max_f32( t0, float32v( 0 ) );
-            t1 = FS_Max_f32( t1, float32v( 0 ) );
-            t0 *= t0; t0 *= t0;
-            t1 *= t1; t1 *= t1;
-
-            float32v v0 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, hv0xr, hv0yr, hv0zr ), d0xr, d0yr, d0zr );
-            float32v v1 = FnUtils::GetGradientDot( FnUtils::HashPrimes( seed, hv1xr, hv1yr, hv1zr ), d1xr, d1yr, d1zr );
-
-            val = FS_FMulAdd_f32( v0, t0, FS_FMulAdd_f32( v1, t1, val ) );
-
-            if( i == 1 )
-            {
-                break;
-            }
+            mask32v signMask = xyzNormal < zNormal; // lanes that take <-1, -1, 0> instead of <1, 1, 0>
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
 
-            xr += float32v( 0.5f );
-            yr += float32v( 0.5f );
-            zr += float32v( 0.5f );
-            seed = ~seed;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) ); // bit 31 only, in signMask lanes; presumably the float sign bit - confirm FS::Cast is a bit cast
+            float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase ), dxBase, dyBase, dzBase - offset0 ); // Z keeps its base lattice coordinate; only the Z offset is shifted
+            float32v falloffBase = FS::Min( ( sign ^ dzBase ) - falloffBaseStemB, float32v( 0.0f ) ); // clamped from above at zero; the sign is irrelevant because the value is raised to the fourth power
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
         }
 
-        return float32v( 32.69428253173828125f ) * val;
-    }
-};
+        // Vertex <1, 0, 1> or <-1, 0, -1>: X and Z step by +/- one prime, Y keeps its base lattice coordinate
+        {
+            mask32v signMask = xyzNormal < yNormal; // lanes that take the negative vertex
 
-template<typename FS>
-class FS_T<FastNoise::OpenSimplex2S, FS> : public virtual FastNoise::OpenSimplex2S, public FS_T<FastNoise::Generator, FS>
-{
-    FASTSIMD_DECLARE_FS_TYPES;
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
-    {
-        const float SQRT3 = 1.7320508075688772935274463415059f;
-        const float F2 = 0.5f * ( SQRT3 - 1.0f );
-        const float G2 = ( SQRT3 - 3.0f ) / 6.0f;
-
-        float32v s = float32v( F2 ) * ( x + y );
-        float32v xs = x + s;
-        float32v ys = y + s;
-        float32v xsb = FS_Floor_f32( xs );
-        float32v ysb = FS_Floor_f32( ys );
-        float32v xsi = xs - xsb;
-        float32v ysi = ys - ysb;
-        int32v xsbp = FS_Convertf32_i32( xsb ) * int32v( FnPrimes::X );
-        int32v ysbp = FS_Convertf32_i32( ysb ) * int32v( FnPrimes::Y );
-
-        mask32v forwardXY = xsi + ysi > float32v( 1.0f );
-        float32v boundaryXY = FS_Mask_f32( float32v( -1.0f ), forwardXY );
-        mask32v forwardX = FS_FMulAdd_f32( xsi, float32v( -2.0f ), ysi ) < boundaryXY;
-        mask32v forwardY = FS_FMulAdd_f32( ysi, float32v( -2.0f ), xsi ) < boundaryXY;
-
-        float32v t = float32v( G2 ) * ( xsi + ysi );
-        float32v xi = xsi + t;
-        float32v yi = ysi + t;
-
-        int32v h0 = FnUtils::HashPrimes( seed, xsbp, ysbp );
-        float32v v0 = FnUtils::GetGradientDotFancy( h0, xi, yi );
-        float32v a = FS_FNMulAdd_f32( xi, xi, FS_FNMulAdd_f32( yi, yi, float32v( 2.0f / 3.0f ) ) );
-        float32v a0 = a; a0 *= a0; a0 *= a0;
-        float32v value = a0 * v0;
-
-        int32v h1 = FnUtils::HashPrimes( seed, xsbp + int32v( FnPrimes::X ), ysbp + int32v( FnPrimes::Y ) );
-        float32v v1 = FnUtils::GetGradientDotFancy( h1, xi - float32v( 2 * G2 + 1 ), yi - float32v( 2 * G2 + 1 ) );
-        float32v a1 = FS_FMulAdd_f32( float32v( 2 * ( 1 + 2 * G2 ) * ( 1 / G2 + 2 ) ), t, a + float32v( -2 * ( 1 + 2 * G2 ) * ( 1 + 2 * G2 ) ) );
-        a1 *= a1; a1 *= a1;
-        value = FS_FMulAdd_f32( a1, v1, value );
-
-        float32v xyDelta = FS_Select_f32( forwardXY, float32v( G2 + 1 ), float32v( -G2 ) );
-        xi -= xyDelta;
-        yi -= xyDelta;
-
-        int32v h2 = FnUtils::HashPrimes( seed,
-            FS_NMaskedSub_i32( FS_MaskedAdd_i32( xsbp, int32v( FnPrimes::X * 2 ), forwardX ), int32v( FnPrimes::X ), forwardXY ),
-            FS_MaskedAdd_i32( ysbp, int32v( FnPrimes::Y ), forwardXY ) );
-        float32v xi2 = xi - FS_Select_f32( forwardX, float32v( 1 + 2 * G2 ), float32v( -1 ) );
-        float32v yi2 = FS_MaskedSub_f32( yi, float32v( 2 * G2 ), forwardX );
-        float32v v2 = FnUtils::GetGradientDotFancy( h2, xi2, yi2 );
-        float32v a2 = FS_Max_f32( FS_FNMulAdd_f32( xi2, xi2, FS_FNMulAdd_f32( yi2, yi2, float32v( 2.0f / 3.0f ) ) ), float32v( 0 ) );
-        a2 *= a2; a2 *= a2;
-        value = FS_FMulAdd_f32( a2, v2, value );
-
-        int32v h3 = FnUtils::HashPrimes( seed,
-            FS_MaskedAdd_i32( xsbp, int32v( FnPrimes::X ), forwardXY ),
-            FS_NMaskedSub_i32( FS_MaskedAdd_i32( ysbp, int32v( (int32_t)( FnPrimes::Y * 2LL ) ), forwardY ), int32v( FnPrimes::Y ), forwardXY ) );
-        float32v xi3 = FS_MaskedSub_f32( xi, float32v( 2 * G2 ), forwardY );
-        float32v yi3 = yi - FS_Select_f32( forwardY, float32v( 1 + 2 * G2 ), float32v( -1 ) );
-        float32v v3 = FnUtils::GetGradientDotFancy( h3, xi3, yi3 );
-        float32v a3 = FS_Max_f32( FS_FNMulAdd_f32( xi3, xi3, FS_FNMulAdd_f32( yi3, yi3, float32v( 2.0f / 3.0f ) ) ), float32v( 0 ) );
-        a3 *= a3; a3 *= a3;
-        value = FS_FMulAdd_f32( a3, v3, value );
-
-        return float32v( 9.28993664146183f ) * value;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) ); // bit 31 only, in signMask lanes; presumably the float sign bit - confirm FS::Cast is a bit cast
+            float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed ), dxBase, dyBase - offset0, dzBase ); // only the Y offset is shifted
+            float32v falloffBase = FS::Min( ( sign ^ dyBase ) - falloffBaseStemB, float32v( 0.0f ) ); // clamped from above at zero; the sign is irrelevant because the value is raised to the fourth power
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 1, 1> or <0, -1, -1>: Y and Z step by +/- one prime, X keeps its base lattice coordinate
+        {
+            mask32v signMask = xyzNormal < xNormal; // lanes that take the negative vertex
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) ); // bit 31 only, in signMask lanes; presumably the float sign bit - confirm FS::Cast is a bit cast
+            float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed ), dxBase - offset0, dyBase, dzBase ); // only the X offset is shifted
+            float32v falloffBase = FS::Min( ( sign ^ dxBase ) - falloffBaseStemB, float32v( 0.0f ) ); // clamped from above at zero; the sign is irrelevant because the value is raised to the fourth power
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <1, 0, 0> or <-1, 0, 0>: only the X lattice coordinate steps, by +/- one prime
+        {
+            mask32v signMask = xNormal < float32v( 0 ); // lanes that take the negative vertex
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) ); // bit 31 only, in signMask lanes; presumably the float sign bit - confirm FS::Cast is a bit cast
+            float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3
+
+            float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase ), dxBase + offset0, dyBase - offset0, dzBase - offset0 ); // offset0 is added on the stepped axis and subtracted on the other two
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dxBase ), float32v( 0.0f ) ); // clamped at zero so the term below adds nothing when the falloff is non-positive
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 1, 0> or <0, -1, 0>: same pattern as the X case, with Y as the stepped axis
+        {
+            mask32v signMask = yNormal < float32v( 0 );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+            float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3
+
+            float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase ), dxBase - offset0, dyBase + offset0, dzBase - offset0 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dyBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 0, 1> or <0, 0, -1>: same pattern as the X case, with Z as the stepped axis
+        {
+            mask32v signMask = zNormal < float32v( 0 );
+
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+            float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3
+
+            float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed ), dxBase - offset0, dyBase - offset0, dzBase + offset0 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dzBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        constexpr double kBounding = 144.736422163332608; // its reciprocal, with both signs, is handed to ScaleOutput; presumably the raw min / max of value - confirm against ScaleOutput
+
+        return this->ScaleOutput( value, -1 / kBounding, 1 / kBounding );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const // 4D evaluation: skew onto the lattice, pick the nearest vertex, then sum 16 falloff-weighted gradient terms
     {
-        float32v f = float32v( 2.0f / 3.0f ) * ( x + y + z );
-        float32v xr = f - x;
-        float32v yr = f - y;
-        float32v zr = f - z;
-
-        float32v xrb = FS_Floor_f32( xr );
-        float32v yrb = FS_Floor_f32( yr );
-        float32v zrb = FS_Floor_f32( zr );
-        float32v xri = xr - xrb;
-        float32v yri = yr - yrb;
-        float32v zri = zr - zrb;
-        int32v xrbp = FS_Convertf32_i32( xrb ) * int32v( FnPrimes::X );
-        int32v yrbp = FS_Convertf32_i32( yrb ) * int32v( FnPrimes::Y );
-        int32v zrbp = FS_Convertf32_i32( zrb ) * int32v( FnPrimes::Z );
-
-        float32v value( 0 );
-        for( size_t i = 0; ; i++ )
+        this->ScalePositions( x, y, z, w );
+
+        constexpr double kRoot5 = 2.2360679774997896964091736687313;
+        constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 ); // scales the coordinate sum that is added to every axis when skewing
+        constexpr double kUnskew4 = -1.0 / ( kRoot5 + 5.0 ); // same role for the reverse (unskew) step
+        constexpr double kTwiceUnskew4 = -1.0 / 5.0; // combined factor for applying the unskew twice
+
+        constexpr double kDistanceSquaredA = 4.0 / 5.0; // halved to form falloffBaseStemA, used by the one- and four-step vertices
+        constexpr double kDistanceSquaredB = 6.0 / 5.0; // halved to form falloffBaseStemB, used by the two- and three-step vertices
+        constexpr double kFalloffRadiusSquared = kDistanceSquaredA;
+
+        float32v skewDelta = float32v( kSkew4 ) * ( x + y + z + w );
+
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v zSkewed = z + skewDelta;
+        float32v wSkewed = w + skewDelta;
+        float32v xSkewedBase = FS::Floor( xSkewed );
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v zSkewedBase = FS::Floor( zSkewed );
+        float32v wSkewedBase = FS::Floor( wSkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase;
+        float32v dySkewed = ySkewed - ySkewedBase;
+        float32v dzSkewed = zSkewed - zSkewedBase;
+        float32v dwSkewed = wSkewed - wSkewedBase;
+
+        // Starting from the unit cell's base corner, find the closest cell vertex
         {
-            float32v a = FS_FNMulAdd_f32( xri, xri, FS_FNMulAdd_f32( yri, yri, FS_FNMulAdd_f32( zri, zri, float32v( 0.75f ) ) ) ) * float32v( 0.5f );
-
-            float32v p0 = zri + yri + xri - float32v( 1.5f );
-            mask32v flip0 = p0 >= float32v( 0.0f );
-            float32v a0 = FS_Max_f32( FS_MaskedAdd_f32( a, p0, flip0 ), float32v( 0 ) );
-            a0 *= a0; a0 *= a0;
-            int32v h0 = FnUtils::HashPrimes( seed, FS_MaskedAdd_i32( xrbp, int32v( FnPrimes::X ), flip0 ), FS_MaskedAdd_i32( yrbp, int32v( FnPrimes::Y ), flip0 ), FS_MaskedAdd_i32( zrbp, int32v( FnPrimes::Z ), flip0 ) );
-            float32v v0 = FnUtils::GetGradientDot( h0, FS_MaskedSub_f32( xri, float32v( 1.0f ), flip0 ), FS_MaskedSub_f32( yri, float32v( 1.0f ), flip0 ), FS_MaskedSub_f32( zri, float32v( 1.0f ), flip0 ) );
-            value = FS_FMulAdd_f32( a0, v0, value );
-            a -= float32v( 0.5f );
-
-            float32v p1 = zri + yri - xri + float32v( -0.5f );
-            mask32v flip1 = p1 >= float32v( 0.0f );
-            float32v a1 = FS_Max_f32( FS_MaskedAdd_f32( a + xri, p1, flip1 ), float32v( 0 ) );
-            a1 *= a1; a1 *= a1;
-            int32v h1 = FnUtils::HashPrimes( seed, FS_NMaskedAdd_i32( xrbp, int32v( FnPrimes::X ), flip1 ), FS_MaskedAdd_i32( yrbp, int32v( FnPrimes::Y ), flip1 ), FS_MaskedAdd_i32( zrbp, int32v( FnPrimes::Z ), flip1 ) );
-            float32v v1 = FnUtils::GetGradientDot( h1, FS_NMaskedSub_f32( xri, float32v( 1.0f ), flip1 ), FS_MaskedSub_f32( yri, float32v( 1.0f ), flip1 ), FS_MaskedSub_f32( zri, float32v( 1.0f ), flip1 ) );
-            value = FS_FMulAdd_f32( a1, v1, value );
-
-            float32v p2 = xri + float32v( -0.5f ) + ( zri - yri );
-            mask32v flip2 = p2 >= float32v( 0.0f );
-            float32v a2 = FS_Max_f32( FS_MaskedAdd_f32( a + yri, p2, flip2 ), float32v( 0 ) );
-            a2 *= a2; a2 *= a2;
-            int32v h2 = FnUtils::HashPrimes( seed, FS_MaskedAdd_i32( xrbp, int32v( FnPrimes::X ), flip2 ), FS_NMaskedAdd_i32( yrbp, int32v( FnPrimes::Y ), flip2 ), FS_MaskedAdd_i32( zrbp, int32v( FnPrimes::Z ), flip2 ) );
-            float32v v2 = FnUtils::GetGradientDot( h2, FS_MaskedSub_f32( xri, float32v( 1.0f ), flip2 ), FS_NMaskedSub_f32( yri, float32v( 1.0f ), flip2 ), FS_MaskedSub_f32( zri, float32v( 1.0f ), flip2 ) );
-            value = FS_FMulAdd_f32( a2, v2, value );
-
-            float32v p3 = xri + float32v( -0.5f ) - ( zri - yri );
-            mask32v flip3 = p3 >= float32v( 0.0f );
-            float32v a3 = FS_Max_f32( FS_MaskedAdd_f32( a + zri, p3, flip3 ), float32v( 0 ) );
-            a3 *= a3; a3 *= a3;
-            int32v h3 = FnUtils::HashPrimes( seed, FS_MaskedAdd_i32( xrbp, int32v( FnPrimes::X ), flip3 ), FS_MaskedAdd_i32( yrbp, int32v( FnPrimes::Y ), flip3 ), FS_NMaskedAdd_i32( zrbp, int32v( FnPrimes::Z ), flip3 ) );
-            float32v v3 = FnUtils::GetGradientDot( h3, FS_MaskedSub_f32( xri, float32v( 1.0f ), flip3 ), FS_MaskedSub_f32( yri, float32v( 1.0f ), flip3 ), FS_NMaskedSub_f32( zri, float32v( 1.0f ), flip3 ) );
-            value = FS_FMulAdd_f32( a3, v3, value );
-
-            if( i == 1 )
+            // Perform a double unskew to get the vector whose dot product with skewed vectors produces the unskewed result.
+            float32v twiceUnskewDelta = float32v( kTwiceUnskew4 ) * ( dxSkewed + dySkewed + dzSkewed + dwSkewed );
+            float32v xNormal = dxSkewed + twiceUnskewDelta;
+            float32v yNormal = dySkewed + twiceUnskewDelta;
+            float32v zNormal = dzSkewed + twiceUnskewDelta;
+            float32v wNormal = dwSkewed + twiceUnskewDelta;
+            float32v xyzwNormal = -twiceUnskewDelta; // equals xNormal + yNormal + zNormal + wNormal
+
+            // Using those, compare scores to determine which vertex is closest.
+            constexpr auto considerVertex = [] ( float32v& maxScore, int32v& moveMaskBits, float32v score, int32v bits ) constexpr // keeps the running best score and records the bits of the vertex that achieved it
             {
-                break;
-            }
+                moveMaskBits = FS::Select( score > maxScore, bits, moveMaskBits );
+                maxScore = FS::Max( maxScore, score );
+            };
+            float32v maxScore = float32v( 0.6f ) - xyzwNormal;
+            int32v moveMaskBits = FS::Masked( float32v( 0.2f ) > maxScore, int32v( -1 ) );
+            maxScore = FS::Max( maxScore, float32v( 0.2f ) );
+            considerVertex( maxScore, moveMaskBits, -wNormal, 0b0111 );
+            considerVertex( maxScore, moveMaskBits, -zNormal, 0b1011 );
+            considerVertex( maxScore, moveMaskBits, -yNormal, 0b1101 );
+            considerVertex( maxScore, moveMaskBits, -xNormal, 0b1110 );
+            maxScore += xyzwNormal - float32v( 0.2f );
+            considerVertex( maxScore, moveMaskBits, xNormal, 0b0001 );
+            considerVertex( maxScore, moveMaskBits, yNormal, 0b0010 );
+            considerVertex( maxScore, moveMaskBits, zNormal, 0b0100 );
+            considerVertex( maxScore, moveMaskBits, wNormal, 0b1000 );
+            maxScore += float32v( 0.2f ) - xNormal;
+            considerVertex( maxScore, moveMaskBits, yNormal, 0b0011 );
+            considerVertex( maxScore, moveMaskBits, zNormal, 0b0101 );
+            considerVertex( maxScore, moveMaskBits, wNormal, 0b1001 );
+            maxScore += xNormal;
+            considerVertex( maxScore, moveMaskBits, yNormal + zNormal, 0b0110 );
+            maxScore -= wNormal;
+            considerVertex( maxScore, moveMaskBits, yNormal, 0b1010 );
+            considerVertex( maxScore, moveMaskBits, zNormal, 0b1100 );
+
+            mask32v moveX = ( moveMaskBits & int32v( 0b0001 ) ) != int32v( 0 ); // one bit per axis: a set bit moves the base one cell along that axis below
+            mask32v moveY = ( moveMaskBits & int32v( 0b0010 ) ) != int32v( 0 );
+            mask32v moveZ = ( moveMaskBits & int32v( 0b0100 ) ) != int32v( 0 );
+            mask32v moveW = ( moveMaskBits & int32v( 0b1000 ) ) != int32v( 0 );
+
+            xSkewedBase = FS::MaskedIncrement( moveX, xSkewedBase );
+            ySkewedBase = FS::MaskedIncrement( moveY, ySkewedBase );
+            zSkewedBase = FS::MaskedIncrement( moveZ, zSkewedBase );
+            wSkewedBase = FS::MaskedIncrement( moveW, wSkewedBase );
+
+            dxSkewed = FS::MaskedDecrement( moveX, dxSkewed );
+            dySkewed = FS::MaskedDecrement( moveY, dySkewed );
+            dzSkewed = FS::MaskedDecrement( moveZ, dzSkewed );
+            dwSkewed = FS::MaskedDecrement( moveW, dwSkewed );
+        }
 
-            mask32v sideX = xri >= float32v( 0.5f );
-            mask32v sideY = yri >= float32v( 0.5f );
-            mask32v sideZ = zri >= float32v( 0.5f );
+        int32v xPrimedBase = FS::Convert<int32_t>( xSkewedBase ) * int32v( Primes::X );
+        int32v yPrimedBase = FS::Convert<int32_t>( ySkewedBase ) * int32v( Primes::Y );
+        int32v zPrimedBase = FS::Convert<int32_t>( zSkewedBase ) * int32v( Primes::Z );
+        int32v wPrimedBase = FS::Convert<int32_t>( wSkewedBase ) * int32v( Primes::W );
+
+        float32v skewedCoordinateSum = dxSkewed + dySkewed + dzSkewed + dwSkewed;
+        float32v twiceUnskewDelta = float32v( kTwiceUnskew4 ) * skewedCoordinateSum;
+        float32v xNormal = dxSkewed + twiceUnskewDelta;
+        float32v yNormal = dySkewed + twiceUnskewDelta;
+        float32v zNormal = dzSkewed + twiceUnskewDelta;
+        float32v wNormal = dwSkewed + twiceUnskewDelta;
+        float32v xyzwNormal = -twiceUnskewDelta; // equals xNormal + yNormal + zNormal + wNormal
+
+        float32v unskewDelta = float32v( kUnskew4 ) * skewedCoordinateSum;
+        float32v dxBase = dxSkewed + unskewDelta;
+        float32v dyBase = dySkewed + unskewDelta;
+        float32v dzBase = dzSkewed + unskewDelta;
+        float32v dwBase = dwSkewed + unskewDelta;
+
+        float32v coordinateSum = float32v( 1 + 4 * kUnskew4 ) * skewedCoordinateSum; // equals dxBase + dyBase + dzBase + dwBase
+
+        // Vertex <0, 0, 0, 0>
+        float32v value, falloffBaseStemA, falloffBaseStemB;
+        {
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimedBase ), dxBase, dyBase, dzBase, dwBase );
+            float32v falloffBase = FS::FNMulAdd( dwBase, dwBase, FS::FNMulAdd( dzBase, dzBase, FS::FNMulAdd( dyBase, dyBase, FS::FNMulAdd( dxBase, dxBase, float32v( kFalloffRadiusSquared ) ) ) ) ) * float32v( 0.5f );
+            falloffBaseStemA = falloffBase - float32v( kDistanceSquaredA * 0.5 );
+            falloffBaseStemB = falloffBase - float32v( kDistanceSquaredB * 0.5 );
+            value = ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ) * gradientRampValue;
+        }
+
+        // Vertex <1, 1, 1, 1> or <-1, -1, -1, -1>
+        {
+            mask32v signMask = xyzwNormal < float32v( 0 );
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) ); // NOTE(review): presumably a bit-cast sign-bit pattern, so the `^ sign` uses below negate in masked lanes; confirm FS::Cast semantics
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset = float32v( 4 * kUnskew4 + 1 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimed, yPrimed, zPrimed, wPrimed ), dxBase - offset, dyBase - offset, dzBase - offset, dwBase - offset );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset, coordinateSum, falloffBaseStemA ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <1, 1, 1, 0> or <-1, -1, -1, 0>
+        {
+            mask32v signMask = xyzwNormal < wNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
 
-            xrbp = FS_MaskedAdd_i32( xrbp, int32v( FnPrimes::X ), sideX );
-            yrbp = FS_MaskedAdd_i32( yrbp, int32v( FnPrimes::Y ), sideY );
-            zrbp = FS_MaskedAdd_i32( zrbp, int32v( FnPrimes::Z ), sideZ );
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
 
-            xri += FS_Select_f32( sideX, float32v( -0.5f ), float32v( 0.5f ) );
-            yri += FS_Select_f32( sideY, float32v( -0.5f ), float32v( 0.5f ) );
-            zri += FS_Select_f32( sideZ, float32v( -0.5f ), float32v( 0.5f ) );
+            float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign;
 
-            seed = ~seed;
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimed, yPrimed, zPrimed, wPrimedBase ), dxBase - offset1, dyBase - offset1, dzBase - offset1, dwBase - offset0 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dwBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
         }
 
-        return float32v( 144.736422163332608f ) * value;
+        // Vertex <1, 1, 0, 1> or <-1, -1, 0, -1>
+        {
+            mask32v signMask = xyzwNormal < zNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase, wPrimed ), dxBase - offset1, dyBase - offset1, dzBase - offset0, dwBase - offset1 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dzBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <1, 0, 1, 1> or <-1, 0, -1, -1>
+        {
+            mask32v signMask = xyzwNormal < yNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed, wPrimed ), dxBase - offset1, dyBase - offset0, dzBase - offset1, dwBase - offset1 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dyBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 1, 1, 1> or <0, -1, -1, -1>
+        {
+            mask32v signMask = xyzwNormal < xNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed, wPrimed ), dxBase - offset0, dyBase - offset1, dzBase - offset1, dwBase - offset1 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dxBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <1, 0, 0, 0> or <-1, 0, 0, 0>
+        {
+            mask32v signMask = xNormal < float32v( 0 );
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+
+            float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase, wPrimedBase ), dxBase - offset1, dyBase - offset0, dzBase - offset0, dwBase - offset0 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dxBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <1, 1, 0, 0> or <-1, -1, 0, 0>
+        {
+            mask32v signMask = xNormal < -yNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase, wPrimedBase ), dxBase - offset1, dyBase - offset1, dzBase - offset0, dwBase - offset0 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dyBase ) ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <1, 0, 1, 0> or <-1, 0, -1, 0>
+        {
+            mask32v signMask = xNormal < -zNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed, wPrimedBase ), dxBase - offset1, dyBase - offset0, dzBase - offset1, dwBase - offset0 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dzBase ) ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <1, 0, 0, 1> or <-1, 0, 0, -1>
+        {
+            mask32v signMask = xNormal < -wNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase, wPrimed ), dxBase - offset1, dyBase - offset0, dzBase - offset0, dwBase - offset1 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dwBase ) ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 1, 0, 0> or <0, -1, 0, 0>
+        {
+            mask32v signMask = yNormal < float32v( 0 );
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+
+            float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase, wPrimedBase ), dxBase - offset0, dyBase - offset1, dzBase - offset0, dwBase - offset0 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dyBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 1, 1, 0> or <0, -1, -1, 0>
+        {
+            mask32v signMask = yNormal < -zNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed, wPrimedBase ), dxBase - offset0, dyBase - offset1, dzBase - offset1, dwBase - offset0 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dyBase + dzBase ) ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 1, 0, 1> or <0, -1, 0, -1>
+        {
+            mask32v signMask = yNormal < -wNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase, wPrimed ), dxBase - offset0, dyBase - offset1, dzBase - offset0, dwBase - offset1 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dyBase + dwBase ) ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 0, 1, 0> or <0, 0, -1, 0>
+        {
+            mask32v signMask = zNormal < float32v( 0 );
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+
+            float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed, wPrimedBase ), dxBase - offset0, dyBase - offset0, dzBase - offset1, dwBase - offset0 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dzBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 0, 1, 1> or <0, 0, -1, -1>
+        {
+            mask32v signMask = zNormal < -wNormal;
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) );
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed, wPrimed ), dxBase - offset0, dyBase - offset0, dzBase - offset1, dwBase - offset1 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dzBase + dwBase ) ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        // Vertex <0, 0, 0, 1> or <0, 0, 0, -1>
+        {
+            mask32v signMask = wNormal < float32v( 0 );
+            float32v sign = FS::Masked( signMask, float32v( FS::Cast<float>( int32v( 1 << 31 ) ) ) );
+
+            int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) );
+
+            float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign;
+            float32v offset0 = float32v( kUnskew4 ) ^ sign;
+
+            float32v gradientRampValue = GetGradientDotPerlin( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimed ), dxBase - offset0, dyBase - offset0, dzBase - offset0, dwBase - offset1 );
+            float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dwBase ), float32v( 0.0f ) );
+            value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value );
+        }
+
+        constexpr double kBounding = 115.21625311930542; // divisor defining the -1/kBounding .. 1/kBounding range handed to ScaleOutput
+
+        return this->ScaleOutput( value, -1 / kBounding, 1 / kBounding );
     }
 };
-
diff --git a/include/FastNoise/Generators/Utils.inl b/include/FastNoise/Generators/Utils.inl
index 5e4fff89..46b742ed 100644
--- a/include/FastNoise/Generators/Utils.inl
+++ b/include/FastNoise/Generators/Utils.inl
@@ -1,393 +1,1044 @@
 #pragma once
-#include "FastSIMD/InlInclude.h"
 #include <climits>
 
 namespace FastNoise
-{    
+{
     namespace Primes
     {
-        static constexpr int X = 501125321;
-        static constexpr int Y = 1136930381;
-        static constexpr int Z = 1720413743;
-        static constexpr int W = 1066037191;
+        static constexpr int X = (int)0xF797C5C7; // one constant per axis, written as a 32-bit hex pattern and cast to int
+        static constexpr int Y = (int)0x6C060C89;
+        static constexpr int Z = (int)0x465FD04F;
+        static constexpr int W = (int)0xF7A62279; // X and W exceed INT_MAX as literals, hence the explicit cast
 
         static constexpr int Lookup[] = { X,Y,Z,W };
     }
 
-    template<typename FS>
-    struct Utils
+    namespace HashMultiplier // multiplier constant(s) for hashing; not referenced in the visible part of this file
     {
-        using float32v = typename FS::float32v;
-        using int32v = typename FS::int32v;
-        using mask32v = typename FS::mask32v;
+        static constexpr int A = (int)0xB7E0A5F5; // 32-bit hex pattern cast to int, same convention as Primes
+    } // no ';' after a namespace: it would be an empty declaration (-Wextra-semi)
+
+    static constexpr double kRoot2 = 1.4142135623730950488016887242097; // sqrt( 2 )
+    static constexpr double kRoot3 = 1.7320508075688772935274463415059; // sqrt( 3 )
+    static constexpr double kRoot5 = 2.2360679774997896964091736687313; // sqrt( 5 )
+    static constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 ); // 1 / ( sqrt( 3 ) + 1 )
+    static constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 ); // 1 / ( sqrt( 5 ) + 1 )
+
+    static constexpr float kValueBounds = 2147483648.f; // 2^31
+    static constexpr float kRoot2f = kRoot2; // float copies of the double constants above
+    static constexpr float kRoot3f = kRoot3;
+    static constexpr float kSkew2f = kSkew2;
+    static constexpr float kSkew4f = kSkew4;
+
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()>
+    FS_FORCEINLINE static float32v GetGradientDotSimplex( int32v hash31, float32v fX, float32v fY ) // 2D: returns dot( g, ( fX, fY ) ) for a gradient g chosen by hash31
+    {
+        int32v index = FS::BitShiftRightZeroExtend( hash31, 1 ) * int32v( 12 >> 2 ); // [0,12) in the upper four bits
+
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // 16-entry permute: both gradient components are read straight from tables
+        {
+            index >>= 28; // the four index bits now sit in the low four bits
+
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant<float>( kRoot3f, -kRoot3f, 1, -1, kRoot3f, -kRoot3f, -1, 1, 2, -2, 0, 0, 0, 0, 0, 0 ) );
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant<float>( 1, -1, kRoot3f, -kRoot3f, -1, 1, kRoot3f, -kRoot3f, 0, 0, 2, -2, 0, 0, 0, 0 ) );
 
-        static constexpr float ROOT2 = 1.4142135623730950488f;
-        static constexpr float ROOT3 = 1.7320508075688772935f;
+            return FS::FMulAdd( gX, fX, fY * gY );
+        }
+        else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 ) // 8-entry permute on the upper three index bits; the lowest index bit is applied as a sign
+        {
+            float32v finalSign = FS::Cast<float>( ( index >> 28 ) << 31 ); // bit 28 of index moved into the float sign-bit position
+            index >>= 29; // upper three index bits become the table index
 
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level < FastSIMD::Level_AVX2>* = nullptr>
-        FS_INLINE static float32v GetGradientDotFancy( int32v hash, float32v fX, float32v fY )
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( kRoot3f, 1, kRoot3f, -1, 2, 0, 0, 0 ), index );
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( 1, kRoot3f, -1, kRoot3f, 0, 2, 0, 0 ), index );
+
+            return FS::FMulAdd( gX, fX, fY * gY ) ^ finalSign;
+        }
+        else // generic path: no table lookup, the dot product is assembled from selects and sign flips
         {
-            int32v index = FS_Convertf32_i32( FS_Converti32_f32( hash & int32v( 0x3FFFFF ) ) * float32v( 1.3333333333333333f ) );
+            float32v u = FS::SelectHighBit( index << 2, fY, fX ); // bit 29 of index picks which coordinate gets scaled
+            float32v v = FS::SelectHighBit( index << 2, fX, fY ); // the other coordinate is the unscaled term
 
-            // Bit-4 = Choose X Y ordering
-            mask32v xy;
+            float32v a = u * FS::SelectHighBit( index, float32v( 2 ), float32v( kRoot3f ) ); // scale factor 2 or sqrt( 3 ), chosen by bit 31 of index
+            float32v b = v ^ FS::Cast<float>( ( index >> 30 ) << 31 ); // bit 30 of index flips the sign of v
 
-            if constexpr( FS::SIMD_Level == FastSIMD::Level_Scalar )
+            if constexpr( SIMD & FastSIMD::FeatureFlag::x86 ) // x86 variant builds the mask from the sign bit instead of an integer compare
             {
-                xy = int32_t( index & int32v( 1 << 2 ) ) != 0;
+                auto indexNegativeMask = FS::Cast<FS::Mask<32, false>>( index >> 31 ); // index >> 31 reinterpreted as a mask (presumably all-ones when index is negative)
+
+                return FS::InvMaskedAdd( indexNegativeMask, a, b ) ^ FS::Cast<float>( ( index >> 28 ) << 31 ); // presumably a + b only where the mask is clear; bit 28 gives the final sign
             }
             else
             {
-                xy = index << 29;
-
-                if constexpr( FS::SIMD_Level < FastSIMD::Level_SSE41 )
-                {
-                    xy >>= 31;
-                }
+                return FS::MaskedAdd( index >= int32v( 0 ), a, b ) ^ FS::Cast<float>( ( index >> 28 ) << 31 ); // presumably a + b only where index >= 0; bit 28 gives the final sign
             }
+        }
+    }
+
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()>
+    FS_FORCEINLINE static float32v GetGradientDotSimplex( int32v hash31, float32v fX, float32v fY, float32v fZ, float32v fW ) // 4D: returns dot( g, ( fX, fY, fZ, fW ) ) for a gradient g chosen by hash31
+    {
+        int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 2 );
+        int32v index = hashShifted * int32v( 20 >> 2 ); // [0,20) in the upper five bits
+
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // two-source permute over each table and its negation
+        {
+            index = FS::BitShiftRightZeroExtend( index, 27 ); // the five index bits now sit in the low five bits
+
+            const auto tableX = FS::Constant<float>( kSkew4f + 1, kSkew4f, kSkew4f, kSkew4f, -1, 1, 0, 0, -1, 0, 1, 0, -1, 0, 0, 1 );
+            const auto tableY = FS::Constant<float>( kSkew4f, kSkew4f + 1, kSkew4f, kSkew4f, 1, -1, 0, 0, 0, -1, 0, 1, 0, -1, 1, 0 );
+            const auto tableZ = FS::Constant<float>( kSkew4f, kSkew4f, kSkew4f + 1, kSkew4f, 0, 0, -1, 1, 1, 0, -1, 0, 0, 1, -1, 0 );
+            const auto tableW = FS::Constant<float>( kSkew4f, kSkew4f, kSkew4f, kSkew4f + 1, 0, 0, 1, -1, 0, 1, 0, -1, 1, 0, 0, -1 );
+
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableX, index, -tableX ); // indices 16 and up read from the negated table
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableY, index, -tableY );
+            float32v gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableZ, index, -tableZ );
+            float32v gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableW, index, -tableW );
 
-            float32v a = FS_Select_f32( xy, fY, fX );
-            float32v b = FS_Select_f32( xy, fX, fY );
+            return FS::FMulAdd( gW, fW, FS::FMulAdd( gZ, fZ, FS::FMulAdd( gY, fY, gX * fX ) ) );
+        }
+        else // generic path: build each gradient component from masks instead of a table
+        {
+            int32v indexA = index & int32v( 0x03 << 27 ); // low two bits of the five-bit index, kept at bits 27-28
+            int32v indexB = ( index >> 2 ) & int32v( 0x07 << 27 ); // upper three bits of the five-bit index, moved down to bits 27-29
+            indexB ^= indexA; // Simplifies the AVX512_F case.
+
+            mask32v extra = indexB >= int32v( 0x04 << 27 ); // set when the five-bit index is 16 or more
+            mask32v equal = ( indexA == indexB ); // set when the upper three index bits are zero, i.e. index below 4
+            indexA |= FS::Cast<int32_t>( equal ); // Forces decrement conditions to fail.
+
+            float32v neutral = FS::Masked( equal | extra, FS::MaskedMul( extra, float32v( kSkew4f ), float32v( -1.0f ) ) ); // base value shared by all four components before the per-axis adjustments below
+
+            float32v gX = FS::MaskedIncrement( indexB == int32v( 0 << 27 ), FS::MaskedDecrement( indexA == int32v( 0 << 27 ), neutral ) ); // per axis: presumably +1 where indexB names the axis, -1 where indexA does
+            float32v gY = FS::MaskedIncrement( indexB == int32v( 1 << 27 ), FS::MaskedDecrement( indexA == int32v( 1 << 27 ), neutral ) );
+            float32v gZ = FS::MaskedIncrement( indexB == int32v( 2 << 27 ), FS::MaskedDecrement( indexA == int32v( 2 << 27 ), neutral ) );
+            float32v gW = FS::MaskedIncrement( indexB == int32v( 3 << 27 ), FS::MaskedDecrement( indexA == int32v( 3 << 27 ), neutral ) );
+
+            return FS::FMulAdd( gW, fW, FS::FMulAdd( gZ, fZ, FS::FMulAdd( gY, fY, gX * fX ) ) );
+        }
+    }
 
-            // Bit-1 = b flip sign
-            b ^= FS_Casti32_f32( index << 31 );
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()>
+    FS_FORCEINLINE static float32v GetGradientDotCommon( int32v hash, float32v fX, float32v fY, float32v fZ ) // 3D: returns dot( g, ( fX, fY, fZ ) ) for a gradient g chosen by the low four bits of hash
+    {
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // 16-entry permute: one table per gradient component
+        {
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant<float>( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 1, 0, -1, 0 ) );
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant<float>( 1, 1, -1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, -1 ) );
+            float32v gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant<float>( 0, 0, 0, 0, 1, 1, -1, -1, 1, 1, -1, -1, 0, 1, 0, -1 ) );
 
-            // Bit-2 = Mul a by 2 or Root3
-            mask32v aMul2;
+            return FS::FMulAdd( gX, fX, FS::FMulAdd( fY, gY, fZ * gZ ));
+        }
+        else // generic path: pick two coordinates, flip their signs by hash bits 0 and 1, and add
+        {
+            int32v hasha13 = hash & int32v( 13 ); // keeps hash bits 0, 2 and 3 ( 13 == 0b1101 )
 
-            if constexpr( FS::SIMD_Level == FastSIMD::Level_Scalar )
+            // if h > 7 then y, else x
+            mask32v gt7;
+            if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) // SSE4.1 and up: use hash bit 3 directly as the mask
             {
-                aMul2 = int32_t( index & int32v( 1 << 1 ) ) != 0;
+                gt7 = FS::Cast<FS::Mask<32>>( hash << 28 ); // bit 3 of hash shifted into the sign bit and reinterpreted as a mask
             }
             else
             {
-                aMul2 = (index << 30) >> 31;
+                gt7 = hasha13 > int32v( 7 ); // same condition as an integer compare: true exactly when bit 3 is set
             }
+            float32v u = FS::Select( gt7, fY, fX );
 
-            a *= FS_Select_f32( aMul2, float32v( 2 ), float32v( ROOT3 ) );
-            // b zero value if a mul 2
-            b = FS_NMask_f32( b, aMul2 );
+            // if h < 4 then y else if h is 12 or 14 then x else z
+            float32v v = FS::Select( hasha13 == int32v( 12 ), fX, fZ ); // bit 1 is masked out, so 12 covers h == 12 and h == 14
+            v = FS::Select( hasha13 < int32v( 2 ), fY, v ); // masked value below 2 means bits 2 and 3 are clear, i.e. h < 4
 
-            // Bit-8 = Flip sign of a + b
-            return ( a + b ) ^ FS_Casti32_f32( (index >> 3) << 31 );
+            // if h1 then -u else u
+            // if h2 then -v else v
+            float32v h1 = FS::Cast<float>( hash << 31 ); // bit 0 of hash in the float sign-bit position
+            float32v h2 = FS::Cast<float>( ( hash >> 1 ) << 31 ); // bit 1 of hash in the float sign-bit position
+            // then add them
+            return ( u ^ h1 ) + ( v ^ h2 );
         }
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level == FastSIMD::Level_NEON>* = nullptr>
-        FS_INLINE static float32v GetGradientDotFancy( int32v hash, float32v fX, float32v fY )
+    }
+
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()>
+    FS_FORCEINLINE static float32v GetGradientDotPerlin( int32v hash, float32v fX, float32v fY ) // 2D: returns dot( g, ( fX, fY ) ) for one of the eight gradients listed below, chosen by the low hash bits
+    {
+        // ( 1+R2, 1 ) ( -1-R2, 1 ) ( 1+R2, -1 ) ( -1-R2, -1 )
+        // ( 1, 1+R2 ) ( 1, -1-R2 ) ( -1, 1+R2 ) ( -1, -1-R2 )
+
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // 16-entry permute: the eight gradients appear twice in each table
         {
-            int32v index = FS_Convertf32_i32( FS_Converti32_f32( hash & int32v( 0x3FFFFF ) ) * float32v( 1.3333333333333333f ) );
-
-            // Bit-4 = Choose X Y ordering
-            mask32v xy;
-
-//             if constexpr( FS::SIMD_Level == FastSIMD::Level_Scalar )
-//             {
-//                 xy = int32_t( index & int32v( 1 << 2 ) ) != 0;
-//             }
-//             else
-//             {
-                xy = index << 29;
-
-//                 if constexpr( FS::SIMD_Level < FastSIMD::Level_SSE41 )
-//                 {
-                    xy >>= 31;
-//                 }
-//             }
-
-            float32v a = FS_Select_f32( xy, fY, fX );
-            float32v b = FS_Select_f32( xy, fX, fY );
-
-            // Bit-1 = b flip sign
-            b ^= FS_Casti32_f32( index << 31 );
-
-            // Bit-2 = Mul a by 2 or Root3
-            mask32v aMul2;
-
-//             if constexpr( FS::SIMD_Level == FastSIMD::Level_Scalar )
-//             {
-//                 aMul2 = int32_t( index & int32v( 1 << 1 ) ) != 0;
-//             }
-//             else
-//             {
-                aMul2 = (index << 30) >> 31;
-//             }
-
-            a *= FS_Select_f32( aMul2, float32v( 2 ), float32v( ROOT3 ) );
-            // b zero value if a mul 2
-            b = FS_NMask_f32( b, aMul2 );
-
-            // Bit-8 = Flip sign of a + b
-            return ( a + b ) ^ FS_Casti32_f32( (index >> 3) << 31 );
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant<float>( 1 + kRoot2f, -1 - kRoot2f, 1 + kRoot2f, -1 - kRoot2f, 1, -1, 1, -1, 1 + kRoot2f, -1 - kRoot2f, 1 + kRoot2f, -1 - kRoot2f, 1, -1, 1, -1 ) );
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant<float>( 1, 1, -1, -1, 1 + kRoot2f, 1 + kRoot2f, -1 - kRoot2f, -1 - kRoot2f, 1, 1, -1, -1, 1 + kRoot2f, 1 + kRoot2f, -1 - kRoot2f, -1 - kRoot2f ) );
+
+            return FS::FMulAdd( gX, fX, fY * gY );
         }
+        else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 ) // 8-entry permute: one table entry per gradient
+        {
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( 1 + kRoot2f, -1 - kRoot2f, 1 + kRoot2f, -1 - kRoot2f, 1, -1, 1, -1 ), hash );
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( 1, 1, -1, -1, 1 + kRoot2f, 1 + kRoot2f, -1 - kRoot2f, -1 - kRoot2f ), hash );
 
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level == FastSIMD::Level_AVX2>* = nullptr>
-        FS_INLINE static float32v GetGradientDotFancy( int32v hash, float32v fX, float32v fY )
+            return FS::FMulAdd( gX, fX, fY * gY );
+        }
+        else // generic path: sign flips from hash bits 0 and 1, coordinate swap from hash bit 2
         {
-            int32v index = FS_Convertf32_i32( FS_Converti32_f32( hash & int32v( 0x3FFFFF ) ) * float32v( 1.3333333333333333f ) );
+            fX ^= FS::Cast<float>( hash << 31 ); // bit 0 of hash flips the sign of fX
+            fY ^= FS::Cast<float>( ( hash >> 1 ) << 31 ); // bit 1 of hash flips the sign of fY
 
-            float32v gX = _mm256_permutevar8x32_ps( float32v( ROOT3, ROOT3, 2, 2, 1, -1, 0, 0 ), index );
-            float32v gY = _mm256_permutevar8x32_ps( float32v( 1, -1, 0, 0, ROOT3, ROOT3, 2, 2 ), index );
+            float32v u = FS::SelectHighBit( hash << 29, fY, fX ); // bit 2 of hash picks which coordinate gets the ( 1 + sqrt( 2 ) ) weight
+            float32v v = FS::SelectHighBit( hash << 29, fX, fY ); // the other coordinate keeps weight 1
 
-            // Bit-8 = Flip sign of a + b
-            return FS_FMulAdd_f32( gX, fX, fY * gY ) ^ FS_Casti32_f32( (index >> 3) << 31 );
+            return FS::FMulAdd( float32v( 1.0f + kRoot2f ), u, v );
         }
+    }
+    
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()>
+    FS_FORCEINLINE static float32v GetGradientDotPerlin( int32v hash, float32v fX, float32v fY, float32v fZ, float32v fW ) // 4D: returns dot( g, ( fX, fY, fZ, fW ) ) for a gradient g chosen by the low five bits of hash
+    {
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // two-source permute: each component has a table for indices 0-15 and one for 16-31
+        {
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant<float>( 0, 0, 0, 0, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, -1 ), hash, FS::Constant<float>( 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 ) );
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant<float>( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ), hash, FS::Constant<float>( 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1 ) );
+            float32v gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant<float>( 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1 ), hash, FS::Constant<float>( 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 ) );
+            float32v gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant<float>( 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1 ), hash, FS::Constant<float>( 1, 1, 1, 1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ) );
 
-        template<typename SIMD = FS, std::enable_if_t<(SIMD::SIMD_Level == FastSIMD::Level_AVX512)>* = nullptr>
-        FS_INLINE static float32v GetGradientDotFancy( int32v hash, float32v fX, float32v fY )
+            return FS::FMulAdd( gX, fX, FS::FMulAdd( fY, gY, FS::FMulAdd( fZ, gZ, fW * gW ) ));
+        }
+        else // generic path: hash bits 3-4 pick three of the four coordinates, hash bits 0-2 give their signs
         {
-            int32v index = FS_Convertf32_i32( FS_Converti32_f32( hash & int32v( 0x3FFFFF ) ) * float32v( 1.3333333333333333f ) );
+            int32v p = hash & int32v( 3 << 3 ); // hash bits 3 and 4 only
+
+            float32v a = FS::Select( p > int32v( 0 ), fX, fY ); // fY only when bits 3 and 4 are both clear
+            float32v b = FS::SelectHighBit( hash << 27, fY, fZ ); // chosen by bit 4 of hash
+            float32v c = FS::Select( p > int32v( 2 << 3 ), fZ, fW ); // fZ only when bits 3 and 4 are both set
 
-            float32v gX = _mm512_permutexvar_ps( index, float32v( ROOT3, ROOT3, 2, 2, 1, -1, 0, 0, -ROOT3, -ROOT3, -2, -2, -1, 1, 0, 0 ) );
-            float32v gY = _mm512_permutexvar_ps( index, float32v( 1, -1, 0, 0, ROOT3, ROOT3, 2, 2, -1, 1, 0, 0, -ROOT3, -ROOT3, -2, -2 ) );
+            float32v aSign = FS::Cast<float>( hash << 31 ); // bit 0 of hash in the float sign-bit position
+            float32v bSign = FS::Cast<float>( ( hash >> 1 ) << 31 ); // bit 1 of hash
+            float32v cSign = FS::Cast<float>( ( hash >> 2 ) << 31 ); // bit 2 of hash
 
-            return FS_FMulAdd_f32( gX, fX, fY * gY );
+            return ( a ^ aSign ) + ( b ^ bSign ) + ( c ^ cSign );
         }
+    }
 
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()>
+    FS_FORCEINLINE static void ApplyGradientOuterProductVectorProductSimplex( int32v hash31, float32v fX, float32v fY, float32v multiplier, float32v& valueX, float32v& valueY ) // 2D: adds multiplier * dot( g, ( fX, fY ) ) * o to ( valueX, valueY ); g and o are chosen from the same 12 gradients by two indices derived from hash31
+    {
+        int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 1 );
+        int32v indexGradient = hashShifted * int32v( 12 >> 2 ); // [0,12) in the upper four bits
+        int32v indexOuterVector = ( hashShifted * int32v( 0xAAAAAAAB ) ) & int32v( 0xC0000003 ); // [0,12) in bits 0,1,30,31 // ( -4LL << 30 ) / 3 )
 
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level < FastSIMD::Level_AVX2>* = nullptr>
-        FS_INLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY )
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // 16-entry permute: gradient and outer vector both come from the same pair of tables
         {
-            // ( 1+R2, 1 ) ( -1-R2, 1 ) ( 1+R2, -1 ) ( -1-R2, -1 )
-            // ( 1, 1+R2 ) ( 1, -1-R2 ) ( -1, 1+R2 ) ( -1, -1-R2 )
+            indexGradient >>= 28; // the four gradient index bits now sit in the low four bits
+            indexOuterVector |= indexOuterVector >> 28; // bits 30-31 copied down to bits 2-3, next to bits 0-1
+
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant<float>( kRoot3f, -kRoot3f, 1, -1, kRoot3f, -kRoot3f, -1, 1, 2, -2, 0, 0, 0, 0, 0, 0 ) );
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant<float>( 1, -1, kRoot3f, -kRoot3f, -1, 1, kRoot3f, -kRoot3f, 0, 0, 2, -2, 0, 0, 0, 0 ) );
 
-            int32v  bit1 = (hash << 31);
-            int32v  bit2 = (hash >> 1) << 31;
-            mask32v bit4;
+            multiplier *= FS::FMulAdd( fY, gY, fX * gX ); // fold dot( g, ( fX, fY ) ) into the multiplier
 
-            if constexpr( FS::SIMD_Level == FastSIMD::Level_Scalar )
+            valueX = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexOuterVector, FS::Constant<float>( kRoot3f, -kRoot3f, 1, -1, kRoot3f, -kRoot3f, -1, 1, 2, -2, 0, 0, 0, 0, 0, 0 ) ), valueX );
+            valueY = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexOuterVector, FS::Constant<float>( 1, -1, kRoot3f, -kRoot3f, -1, 1, kRoot3f, -kRoot3f, 0, 0, 2, -2, 0, 0, 0, 0 ) ), valueY );
+        }
+        else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 ) // gate on AVX2, not AVX: _mm256_permutevar8x32_ps is an AVX2 intrinsic
+        {
+            float32v finalSign = FS::Cast<float>( ( ( indexGradient >> 28 ) ^ indexOuterVector ) << 31 ); // lowest bit of both indices combined into a single sign
+            indexGradient >>= 29; // upper three gradient index bits become the table index
+            indexOuterVector = ( indexOuterVector >> 1 ) | ( indexOuterVector >> 29 ); // bits 1, 30 and 31 packed into the low three bits
+
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( kRoot3f, 1, kRoot3f, -1, 2, 0, 0, 0 ), indexGradient );
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( 1, kRoot3f, -1, kRoot3f, 0, 2, 0, 0 ), indexGradient );
+
+            multiplier *= FS::FMulAdd( fY, gY, fX * gX ) ^ finalSign;
+
+            valueX = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( kRoot3f, 1, kRoot3f, -1, 2, 0, 0, 0 ), indexOuterVector ), valueX );
+            valueY = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( 1, kRoot3f, -1, kRoot3f, 0, 2, 0, 0 ), indexOuterVector ), valueY );
+        }
+        else // generic path: first scope folds the gradient dot product into multiplier, second scope applies the outer vector
+        {
             {
-                bit4 = int32_t( hash & int32v( 1 << 2 ) ) != 0;
+                float32v u = FS::SelectHighBit( indexGradient << 2, fY, fX ); // bit 29 picks which coordinate gets scaled
+                float32v v = FS::SelectHighBit( indexGradient << 2, fX, fY ); // the other coordinate is the unscaled term
+
+                float32v a = u * FS::SelectHighBit( indexGradient, float32v( 2 ), float32v( kRoot3f ) ); // scale factor 2 or sqrt( 3 ), chosen by bit 31
+                float32v b = v ^ FS::Cast<float>( ( indexGradient >> 30 ) << 31 ); // bit 30 flips the sign of v
+
+                multiplier *= FS::MaskedAdd( indexGradient >= int32v( 0 ), a, b ) ^ FS::Cast<float>( ( ( indexGradient >> 28 ) ^ indexOuterVector ) << 31 ); // the sign also carries bit 0 of indexOuterVector
             }
-            else
+
             {
-                bit4 = hash << 29;
+                float32v a = multiplier * FS::SelectHighBit( indexOuterVector, float32v( 2 ), float32v( kRoot3f ) ); // scaled component of multiplier * o
+                float32v b = FS::Masked( indexOuterVector >= int32v( 0 ), multiplier ) ^ FS::Cast<float>( ( indexOuterVector >> 30 ) << 31 ); // unscaled component, sign from bit 30
+
+                valueX += FS::SelectHighBit( indexOuterVector << 30, b, a ); // bit 1 decides which of a and b goes to X
+                valueY += FS::SelectHighBit( indexOuterVector << 30, a, b ); // and Y receives the other one
+            }
+        }
+    }
 
-                if constexpr( FS::SIMD_Level < FastSIMD::Level_SSE41 )
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()>
+    FS_FORCEINLINE static void ApplyGradientOuterProductVectorProductCommon( int32v hash31, float32v fX, float32v fY, float32v fZ, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ ) // 3D: adds multiplier * dot( g, ( fX, fY, fZ ) ) * o to ( valueX, valueY, valueZ ); g and o are chosen from the same 12 gradients by two indices derived from hash31
+    {
+        int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 1 );
+        int32v indexGradient = FS::BitShiftRightZeroExtend( hashShifted * int32v( 12 >> 2 ), 28 ); // [0,12)
+        int32v indexOuterVector = ( hashShifted * int32v( 0xAAAAAAAB ) ) & int32v( 0xC0000003 ); // [0,12) in bits 0,1,30,31 // ( -4LL << 30 ) / 3 )
+        indexOuterVector |= indexOuterVector >> 28; // bits 30-31 copied down to bits 2-3, next to bits 0-1
+
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // 16-entry permute: gradient and outer vector both come from the same three tables
+        {
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant<float>( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ) );
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant<float>( 1, 1, -1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 0, 0, 0, 0 ) );
+            float32v gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant<float>( 0, 0, 0, 0, 1, 1, -1, -1, 1, 1, -1, -1, 0, 0, 0, 0 ) );
+
+            multiplier *= FS::FMulAdd( gZ, fZ, FS::FMulAdd( fY, gY, fX * gX ) ); // fold dot( g, ( fX, fY, fZ ) ) into the multiplier
+
+            valueX = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexOuterVector, FS::Constant<float>( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ) ), valueX );
+            valueY = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexOuterVector, FS::Constant<float>( 1, 1, -1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 0, 0, 0, 0 ) ), valueY );
+            valueZ = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexOuterVector, FS::Constant<float>( 0, 0, 0, 0, 1, 1, -1, -1, 1, 1, -1, -1, 0, 0, 0, 0 ) ), valueZ );
+        }
+        else // generic path: first scope folds the gradient dot product into multiplier, second scope applies the outer vector
+        {
+            {
+                float32v sign0 = FS::Cast<float>( indexGradient << 31 ); // bit 0 of indexGradient in the float sign-bit position
+                float32v sign1 = FS::Cast<float>( ( indexGradient >> 1 ) << 31 ); // bit 1 of indexGradient
+
+                float32v u;
+                if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) // SSE4.1 and up: select on bit 3 directly
+                {
+                    u = FS::SelectHighBit( indexGradient << ( 31 - 3 ), fY, fX ); // bit 3 set means index >= 8, since index < 12
+                }
+                else
                 {
-                    bit4 >>= 31;
+                    u = FS::Select( indexGradient >= int32v( 8 ), fY, fX ); // same choice made with an integer compare
                 }
+                float32v v = FS::Select( indexGradient >= int32v( 4 ), fZ, fY );
+
+                multiplier *= ( u ^ sign0 ) + ( v ^ sign1 ); // dot( g, ( fX, fY, fZ ) ): two signed coordinates, the third has weight 0
             }
 
-            fX ^= FS_Casti32_f32( bit1 );
-            fY ^= FS_Casti32_f32( bit2 );
-            
-            float32v a = FS_Select_f32( bit4, fY, fX );
-            float32v b = FS_Select_f32( bit4, fX, fY );
-            
-            return FS_FMulAdd_f32( float32v( 1.0f + ROOT2 ), a, b );
+            {
+                indexOuterVector &= int32v( 0xF ); // keep only the four-bit index assembled above
+
+                float32v signed0 = multiplier ^ FS::Cast<float>( indexOuterVector << 31 ); // multiplier with its sign flipped by index bit 0
+                float32v signed1 = multiplier ^ FS::Cast<float>( ( indexOuterVector >> 1 ) << 31 ); // multiplier with its sign flipped by index bit 1
+
+                mask32v notYZ = indexOuterVector < int32v( 8 ); // indices 0-7 have an X component
+                mask32v notXY = indexOuterVector >= int32v( 4 ); // indices 4-11 have a Z component
+
+                valueX = FS::MaskedAdd( notYZ, valueX, signed0 );
+                valueZ = FS::MaskedAdd( notXY, valueZ, signed1 );
+                valueY = FS::InvMaskedAdd( notYZ & notXY, valueY, FS::Select( notXY, signed0, signed1 ) ); // Y is skipped for indices 4-7, where both masks are set
+            }
         }
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level == FastSIMD::Level_NEON> * = nullptr>
-         FS_INLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY )
+    }
+
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()>
+    FS_FORCEINLINE static void ApplyGradientOuterProductVectorProductSimplex( int32v hash, float32v fX, float32v fY, float32v fZ, float32v fW, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ, float32v& valueW ) // 4D: adds multiplier * dot( g, ( fX, fY, fZ, fW ) ) * o to the four value accumulators; g and o are chosen from the same 20 gradients by two indices derived from hash
+    {
+        int32v hashShifted = FS::BitShiftRightZeroExtend( hash, 2 );
+        int32v indexGradient = hashShifted * int32v( 20 >> 2 ); // [0,20) in the upper five bits
+        int32v indexOuterVector = hashShifted * int32v( 0xCCCCCCCD ); // ( -8LL << 29 ) / 5
+        indexOuterVector = ( indexOuterVector & int32v( 0xE0000003 ) ) * int32v( 3 | ( 1 << 27 ) ); // [0,20) in the upper five bits, independently of the above
+
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // two-source permute over each table and its negation
         {
-            // ( 1+R2, 1 ) ( -1-R2, 1 ) ( 1+R2, -1 ) ( -1-R2, -1 )
-            // ( 1, 1+R2 ) ( 1, -1-R2 ) ( -1, 1+R2 ) ( -1, -1-R2 )
-
-            int32v  bit1 = (hash << 31);
-            int32v  bit2 = (hash >> 1) << 31;
-            mask32v bit4;
-
-//             if constexpr( FS::SIMD_Level == FastSIMD::Level_Scalar )
-//             {
-//                 bit4 = int32_t( hash & int32v( 1 << 2 ) ) != 0;
-//             }
-//             else
-//             {
-                bit4 = hash << 29;
-// 
-//                 if constexpr( FS::SIMD_Level < FastSIMD::Level_SSE41 )
-//                 {
-                    bit4 >>= 31;
-//                 }
-//             }
-
-            fX ^= FS_Casti32_f32( bit1 );
-            fY ^= FS_Casti32_f32( bit2 );
-            
-            float32v a = FS_Select_f32( bit4, fY, fX );
-            float32v b = FS_Select_f32( bit4, fX, fY );
-            
-            return FS_FMulAdd_f32( float32v( 1.0f + ROOT2 ), a, b );
-        }
+            indexGradient = FS::BitShiftRightZeroExtend( indexGradient, 27 ); // the five index bits now sit in the low five bits
+            indexOuterVector = FS::BitShiftRightZeroExtend( indexOuterVector, 27 );
+
+            const auto tableX = FS::Constant<float>( kSkew4f + 1, kSkew4f, kSkew4f, kSkew4f, -1, 1, 0, 0, -1, 0, 1, 0, -1, 0, 0, 1 );
+            const auto tableY = FS::Constant<float>( kSkew4f, kSkew4f + 1, kSkew4f, kSkew4f, 1, -1, 0, 0, 0, -1, 0, 1, 0, -1, 1, 0 );
+            const auto tableZ = FS::Constant<float>( kSkew4f, kSkew4f, kSkew4f + 1, kSkew4f, 0, 0, -1, 1, 1, 0, -1, 0, 0, 1, -1, 0 );
+            const auto tableW = FS::Constant<float>( kSkew4f, kSkew4f, kSkew4f, kSkew4f + 1, 0, 0, 1, -1, 0, 1, 0, -1, 1, 0, 0, -1 );
 
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level == FastSIMD::Level_AVX2>* = nullptr>
-        FS_INLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY )
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableX, indexGradient, -tableX ); // indices 16 and up read from the negated table
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableY, indexGradient, -tableY );
+            float32v gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableZ, indexGradient, -tableZ );
+            float32v gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableW, indexGradient, -tableW );
+
+            multiplier *= FS::FMulAdd( gW, fW, FS::FMulAdd( gZ, fZ, FS::FMulAdd( gY, fY, gX * fX ) ) ); // fold dot( g, ( fX, fY, fZ, fW ) ) into the multiplier
+
+            valueX = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableX, indexOuterVector, -tableX ), valueX );
+            valueY = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableY, indexOuterVector, -tableY ), valueY );
+            valueZ = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableZ, indexOuterVector, -tableZ ), valueZ );
+            valueW = FS::FMulAdd( multiplier, FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableW, indexOuterVector, -tableW ), valueW );
+        }
+        else // generic path: the same mask-based gradient construction is run twice, once per index
         {
-            float32v gX = _mm256_permutevar8x32_ps( float32v( 1 + ROOT2, -1 - ROOT2, 1 + ROOT2, -1 - ROOT2, 1, -1, 1, -1 ), hash );
-            float32v gY = _mm256_permutevar8x32_ps( float32v( 1, 1, -1, -1, 1 + ROOT2, 1 + ROOT2, -1 - ROOT2, -1 - ROOT2 ), hash );
+            { // first scope: gradient from indexGradient, dotted with the input and folded into multiplier
+                int32v indexA = indexGradient & int32v( 0x03 << 27 ); // low two bits of the five-bit index, kept at bits 27-28
+                int32v indexB = ( indexGradient >> 2 ) & int32v( 0x07 << 27 ); // upper three bits of the five-bit index, moved down to bits 27-29
+                indexB ^= indexA; // Simplifies the AVX512_F case.
 
-            return FS_FMulAdd_f32( gX, fX, fY * gY );
+                mask32v extra = indexB >= int32v( 0x04 << 27 ); // set when the five-bit index is 16 or more
+                mask32v equal = ( indexA == indexB ); // set when the upper three index bits are zero, i.e. index below 4
+                indexA |= FS::Cast<int32_t>( equal ); // Forces decrement conditions to fail.
+
+                float32v neutral = FS::Masked( equal | extra, FS::MaskedMul( extra, float32v( kSkew4f ), float32v( -1.0f ) ) ); // base value shared by all four components before the per-axis adjustments below
+
+                float32v gX = FS::MaskedIncrement( indexB == int32v( 0 << 27 ), FS::MaskedDecrement( indexA == int32v( 0 << 27 ), neutral ) ); // per axis: presumably +1 where indexB names the axis, -1 where indexA does
+                float32v gY = FS::MaskedIncrement( indexB == int32v( 1 << 27 ), FS::MaskedDecrement( indexA == int32v( 1 << 27 ), neutral ) );
+                float32v gZ = FS::MaskedIncrement( indexB == int32v( 2 << 27 ), FS::MaskedDecrement( indexA == int32v( 2 << 27 ), neutral ) );
+                float32v gW = FS::MaskedIncrement( indexB == int32v( 3 << 27 ), FS::MaskedDecrement( indexA == int32v( 3 << 27 ), neutral ) );
+
+                multiplier *= FS::FMulAdd( gW, fW, FS::FMulAdd( gZ, fZ, FS::FMulAdd( gY, fY, gX * fX ) ) );
+            }
+
+            { // second scope: gradient from indexOuterVector, scaled by multiplier and accumulated
+                int32v indexA = indexOuterVector & int32v( 0x03 << 27 ); // same decomposition as the first scope, applied to the outer-vector index
+                int32v indexB = ( indexOuterVector >> 2 ) & int32v( 0x07 << 27 );
+                indexB ^= indexA; // Simplifies the AVX512_F case.
+
+                mask32v extra = indexB >= int32v( 0x04 << 27 );
+                mask32v equal = ( indexA == indexB );
+                indexA |= FS::Cast<int32_t>( equal ); // Forces decrement conditions to fail.
+
+                float32v neutral = FS::Masked( equal | extra, FS::MaskedMul( extra, float32v( kSkew4f ), float32v( -1.0f ) ) );
+
+                float32v gX = FS::MaskedIncrement( indexB == int32v( 0 << 27 ), FS::MaskedDecrement( indexA == int32v( 0 << 27 ), neutral ) );
+                float32v gY = FS::MaskedIncrement( indexB == int32v( 1 << 27 ), FS::MaskedDecrement( indexA == int32v( 1 << 27 ), neutral ) );
+                float32v gZ = FS::MaskedIncrement( indexB == int32v( 2 << 27 ), FS::MaskedDecrement( indexA == int32v( 2 << 27 ), neutral ) );
+                float32v gW = FS::MaskedIncrement( indexB == int32v( 3 << 27 ), FS::MaskedDecrement( indexA == int32v( 3 << 27 ), neutral ) );
+
+                valueX = FS::FMulAdd( multiplier, gX, valueX ); // value += multiplier * o, one component at a time
+                valueY = FS::FMulAdd( multiplier, gY, valueY );
+                valueZ = FS::FMulAdd( multiplier, gZ, valueZ );
+                valueW = FS::FMulAdd( multiplier, gW, valueW );
+            }
         }
+    }
+
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()> // 2D overload: picks a gradient (gX, gY) from hash31 and accumulates two dot-product rows into valueX / valueY, scaled by multiplier
+    FS_FORCEINLINE static void ApplyOrthogonalGradientMatrixVectorProductSimplex( int32v hash31, float32v fX, float32v fY, float32v multiplier, float32v& valueX, float32v& valueY ) // valueX / valueY are in-out accumulators; multiplier is taken by value, so the sign flip below stays local
+    {
+        int32v index = FS::BitShiftRightZeroExtend( hash31, 1 ) * int32v( 12 >> 2 ); // [0,12) in the upper four bits (holds if hash31 < 2^31 -- TODO confirm at call sites)
 
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level == FastSIMD::Level_AVX512> * = nullptr>
-         FS_INLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY )
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // AVX-512 path: direct lookup in 16-entry tables
         {
-            float32v gX = _mm512_permutexvar_ps( hash, float32v( 1 + ROOT2, -1 - ROOT2, 1 + ROOT2, -1 - ROOT2, 1, -1, 1, -1, 1 + ROOT2, -1 - ROOT2, 1 + ROOT2, -1 - ROOT2, 1, -1, 1, -1 ) );
-            float32v gY = _mm512_permutexvar_ps( hash, float32v( 1, 1, -1, -1, 1 + ROOT2, 1 + ROOT2, -1 - ROOT2, -1 - ROOT2, 1, 1, -1, -1, 1 + ROOT2, 1 + ROOT2, -1 - ROOT2, -1 - ROOT2 ) );
+            index = FS::BitShiftRightZeroExtend( index, 28 ); // bring the four index bits down to bits 0-3 for use as the permute selector
 
-            return FS_FMulAdd_f32( gX, fX, fY * gY );
-        }
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant<float>( kSkew2f, -kSkew2f, kSkew2f, -kSkew2f, kSkew2f + 1, -kSkew2f - 1, kSkew2f + 1, -kSkew2f - 1, 1, -1, 1, -1, 0, 0, 0, 0 ) ); // last four entries are zero padding (not selected while index < 12)
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant<float>( kSkew2f + 1, kSkew2f + 1, -kSkew2f - 1, -kSkew2f - 1, kSkew2f, kSkew2f, -kSkew2f, -kSkew2f, 1, 1, -1, -1, 0, 0, 0, 0 ) ); // same 12 used + 4 padding layout as gX
 
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level != FastSIMD::Level_AVX512 > * = nullptr >
-        FS_INLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY, float32v fZ )
+            valueX = FS::FMulAdd( multiplier, FS::FMulAdd( fY, gY, fX * gX ), valueX ); // first row: dot of (gX, gY) with (fX, fY)
+            multiplier ^= FS::Cast<float>( hash31 << 31 ); // lowest hash bit moved to the float sign bit and XORed into multiplier
+            valueY = FS::FMulAdd( multiplier, FS::FMulSub( fY, gX, fX * gY ), valueY ); // second row reuses gX / gY swapped; FMulSub presumably yields fY*gX - fX*gY -- confirm against FastSIMD
+        }
+        else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 ) // AVX2 path: 8-entry tables, so the X sign bit is split off before the lookup
         {
-            int32v hasha13 = hash & int32v( 13 );
+            float32v signX = FS::Cast<float>( ( index >> 28 ) << 31 ); // bit 28 of index moved to the float sign bit
+            index = FS::BitShiftRightZeroExtend( index, 29 ); // remaining three index bits, [0,6), select the table entry
 
-            //if h < 8 then x, else y
-            float32v u = FS_Select_f32( hasha13 < int32v( 8 ), fX, fY );
+            float32v gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( kSkew2f, kSkew2f, kSkew2f + 1, kSkew2f + 1, 1, 1, 0, 0 ), index ) ^ signX; // table holds magnitudes only; the sign is applied by the XOR
+            float32v gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( kSkew2f + 1, -kSkew2f - 1, kSkew2f, -kSkew2f, 1, -1, 0, 0 ), index ); // gY signs are stored in the table itself
 
-            //if h < 4 then y else if h is 12 or 14 then x else z
-            float32v v = FS_Select_f32( hasha13 == int32v( 12 ), fX, fZ );
-            v = FS_Select_f32( hasha13 < int32v( 2 ), fY, v );
+            valueX = FS::FMulAdd( multiplier, FS::FMulAdd( fY, gY, fX * gX ), valueX ); // same accumulation as the AVX-512 path
+            multiplier ^= FS::Cast<float>( hash31 << 31 ); // lowest hash bit XORed into multiplier's sign
+            valueY = FS::FMulAdd( multiplier, FS::FMulSub( fY, gX, fX * gY ), valueY ); // second row, gX / gY swapped
+        }
+        else // generic path: builds gX / gY with masks instead of table lookups
+        {
+            int32v ofThree = FS::BitShiftRightZeroExtend( index, 30 ); // top two bits of index: [0,3)
+            float32v signX = FS::Cast<float>( ( index >> 28 ) << 31 ); // bit 28 -> float sign bit
+            float32v signY = FS::Cast<float>( ( index >> 29 ) << 31 ); // bit 29 -> float sign bit
 
-            //if h1 then -u else u
-            //if h2 then -v else v
-            float32v h1 = FS_Casti32_f32( hash << 31 );
-            float32v h2 = FS_Casti32_f32( (hash & int32v( 2 )) << 30 );
-            //then add them
-            return ( u ^ h1 ) + ( v ^ h2 );
+            float32v masked = FS::Masked( index >= int32v( 0 ), float32v( kSkew2f ) ); // kSkew2f where the top bit of index is clear (ofThree is 0 or 1); assumes FS::Masked yields 0 in the other lanes
+            float32v gX = FS::MaskedIncrement( ofThree != int32v( 0 ), masked ) ^ signX; // +1 in lanes where ofThree != 0, then apply the X sign
+            float32v gY = FS::MaskedIncrement( ofThree != int32v( 1 ), masked ) ^ signY; // +1 in lanes where ofThree != 1, then apply the Y sign
+
+            valueX = FS::FMulAdd( multiplier, FS::FMulAdd( fY, gY, fX * gX ), valueX ); // same accumulation as the table-based paths
+            multiplier ^= FS::Cast<float>( hash31 << 31 ); // lowest hash bit XORed into multiplier's sign
+            valueY = FS::FMulAdd( multiplier, FS::FMulSub( fY, gX, fX * gY ), valueY ); // second row, gX / gY swapped
         }
+    }
 
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level == FastSIMD::Level_AVX512>* = nullptr>
-        FS_INLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY, float32v fZ )
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()> // 3D: forms three hash-selected dot products with (fX, fY, fZ) and accumulates a permutation of them into valueX / valueY / valueZ
+    FS_FORCEINLINE static void ApplyOrthogonalGradientMatrixVectorProductCommon( int32v hash31, float32v fX, float32v fY, float32v fZ, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ ) // valueX / Y / Z are in-out accumulators; multiplier is by value, so its sign flip below stays local
+    {
+        constexpr float kComponentA = 2.224744871391589f; // numerically 1 + sqrt(1.5)
+        constexpr float kComponentB = -0.224744871391589f; // numerically 1 - sqrt(1.5)
+        constexpr float kComponentC = -1.0f;
+        constexpr float kComponentsDE = 1.0f;
+        constexpr float kComponentF = 2.0f;
+        
+        int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 1 ); // drops the lowest hash bit, which is consumed separately by finalSign
+        int32v indexFacetBasisWithPermute2 = hashShifted * int32v( 0xAAAAAAAB ); // [0,3) in the highest two bits, [0,8) in the lowest three bits // ( -4LL << 30 ) / 3
+        int32v indexPermutation2HighBit = ( indexFacetBasisWithPermute2 << 29 ); // & int32v( 1 << 31 ); // [0,1) in the most significant bit
+        int32v indexPermutation3 = FS::BitShiftRightZeroExtend( hashShifted * int32v( 3 ), 30 ); // [0,3)
+        float32v finalSign = FS::Cast<float>( hash31 << 31 ); // lowest hash bit moved to the float sign bit
+
+        float32v valueAB, valueBA, valueC; // three candidate dot products; assigned to output rows at the end
+
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) // AVX-512 path: every gradient component comes from a table lookup
         {
-            float32v gX = _mm512_permutexvar_ps( hash, float32v( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 1, 0, -1, 0 ) );
-            float32v gY = _mm512_permutexvar_ps( hash, float32v( 1, 1, -1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, -1 ) );
-            float32v gZ = _mm512_permutexvar_ps( hash, float32v( 0, 0, 0, 0, 1, 1, -1, -1, 1, 1, -1, -1, 0, 1, 0, -1 ) );
+            indexFacetBasisWithPermute2 = FS::NativeExec<int32v>( []( auto a ){ return _mm512_rol_epi32( a, 2 ); }, indexFacetBasisWithPermute2 ); // rotate left by 2: the top two bits land in bits 0-1 and the low three bits move up to bits 2-4
+
+            const auto tableA_gX = FS::Constant<float>( kComponentA, kComponentA, kComponentC, kComponentC, -kComponentA, -kComponentA, kComponentC, kComponentC, kComponentA, kComponentA, kComponentC, kComponentC, -kComponentA, -kComponentA, kComponentC, kComponentC );
+            const auto tableA_gY = FS::Constant<float>( kComponentC, kComponentB, kComponentA, kComponentA, kComponentC, kComponentB, -kComponentA, -kComponentA, kComponentC, -kComponentB, kComponentA, kComponentA, kComponentC, -kComponentB, -kComponentA, -kComponentA );
+            const auto tableA_gZ = FS::Constant<float>( kComponentB, kComponentC, kComponentB, kComponentB, kComponentB, kComponentC, kComponentB, kComponentB, -kComponentB, kComponentC, -kComponentB, -kComponentB, -kComponentB, kComponentC, -kComponentB, -kComponentB );
+
+            const auto tableB_gX = FS::Constant<float>( kComponentB, kComponentB, kComponentC, kComponentC, -kComponentB, -kComponentB, kComponentC, kComponentC, kComponentB, kComponentB, kComponentC, kComponentC, -kComponentB, -kComponentB, kComponentC, kComponentC );
+            const auto tableB_gY = FS::Constant<float>( kComponentC, kComponentA, kComponentB, kComponentB, kComponentC, kComponentA, -kComponentB, -kComponentB, kComponentC, -kComponentA, kComponentB, kComponentB, kComponentC, -kComponentA, -kComponentB, -kComponentB );
+            const auto tableB_gZ = FS::Constant<float>( kComponentA, kComponentC, kComponentA, kComponentA, kComponentA, kComponentC, kComponentA, kComponentA, -kComponentA, kComponentC, -kComponentA, -kComponentA, -kComponentA, kComponentC, -kComponentA, -kComponentA );
+
+            const auto tableC_gX = FS::Constant<float>( kComponentsDE, kComponentsDE, kComponentF, kComponentF, kComponentC, kComponentC, kComponentF, kComponentF, kComponentsDE, kComponentsDE, kComponentF, kComponentF, kComponentC, kComponentC, kComponentF, kComponentF );
+            const auto tableC_gY = FS::Constant<float>( kComponentF, kComponentsDE, kComponentsDE, kComponentsDE, kComponentF, kComponentsDE, kComponentC, kComponentC, kComponentF, kComponentC, kComponentsDE, kComponentsDE, kComponentF, kComponentC, kComponentC, kComponentC );
+            const auto tableC_gZ = FS::Constant<float>( kComponentsDE, kComponentF, kComponentsDE, kComponentsDE, kComponentsDE, kComponentF, kComponentsDE, kComponentsDE, kComponentC, kComponentF, kComponentC, kComponentC, kComponentC, kComponentF, kComponentC, kComponentC );
+
+            float32v valueAB_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableA_gX, indexFacetBasisWithPermute2, tableB_gX ); // two-table permute: index bit 4 picks table A (clear) or table B (set), bits 0-3 pick the entry
+            float32v valueAB_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableA_gY, indexFacetBasisWithPermute2, tableB_gY );
+            float32v valueAB_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableA_gZ, indexFacetBasisWithPermute2, tableB_gZ );
+            valueAB = FS::FMulAdd( valueAB_gZ, fZ, FS::FMulAdd( fY, valueAB_gY, fX * valueAB_gX ) ); // dot product of the looked-up gradient with (fX, fY, fZ)
+
+            float32v valueBA_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableB_gX, indexFacetBasisWithPermute2, tableA_gX ); // same lookup with the two tables swapped
+            float32v valueBA_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableB_gY, indexFacetBasisWithPermute2, tableA_gY );
+            float32v valueBA_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableB_gZ, indexFacetBasisWithPermute2, tableA_gZ );
+            valueBA = FS::FMulAdd( valueBA_gZ, fZ, FS::FMulAdd( fY, valueBA_gY, fX * valueBA_gX ) );
+
+            float32v valueC_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexFacetBasisWithPermute2, tableC_gX ); // single-table permute: only the low four index bits are used
+            float32v valueC_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexFacetBasisWithPermute2, tableC_gY );
+            float32v valueC_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexFacetBasisWithPermute2, tableC_gZ );
+            valueC = FS::FMulAdd( valueC_gZ, fZ, FS::FMulAdd( fY, valueC_gY, fX * valueC_gX ) );
+        }
+        else // generic path: rebuilds the gradient components with sign XORs and high-bit selects
+        {
+            float32v sign0 = FS::Cast<float>( indexFacetBasisWithPermute2 << 31 ); // index bit 0 -> float sign bit
+            float32v sign1 = FS::Cast<float>( ( indexFacetBasisWithPermute2 << 30 ) & int32v( 1 << 31 ) ); // index bit 1 -> float sign bit, other bits cleared
+
+            auto notYZ = indexFacetBasisWithPermute2; // used as a SelectHighBit mask: bit 31 of the index (presumably only the MSB is tested -- confirm against FastSIMD)
+            auto notXY = indexFacetBasisWithPermute2 << 1; // shifts index bit 30 into the MSB for the same kind of test
 
-            return FS_FMulAdd_f32( gX, fX, FS_FMulAdd_f32( fY, gY, fZ * gZ ));
+            float32v valueA_gX = FS::SelectHighBit( notYZ, float32v( kComponentC ), float32v( kComponentA ) ^ sign0 );
+            float32v valueA_gY = FS::SelectHighBit( notYZ | notXY, FS::SelectHighBit( notXY, float32v( kComponentB ) ^ sign1, float32v( kComponentA ) ^ sign0 ), float32v( kComponentC ) );
+            float32v valueA_gZ = FS::SelectHighBit( notXY, float32v( kComponentC ), float32v( kComponentB ) ^ sign1 );
+            float32v valueA = FS::FMulAdd( valueA_gZ, fZ, FS::FMulAdd( fY, valueA_gY, fX * valueA_gX ) ); // dot product with (fX, fY, fZ)
+
+            float32v valueB_gX = FS::SelectHighBit( notYZ, float32v( kComponentC ), float32v( kComponentB ) ^ sign0 ); // same structure as valueA with kComponentA and kComponentB exchanged
+            float32v valueB_gY = FS::SelectHighBit( notYZ | notXY, FS::SelectHighBit( notXY, float32v( kComponentA ) ^ sign1, float32v( kComponentB ) ^ sign0 ), float32v( kComponentC ) );
+            float32v valueB_gZ = FS::SelectHighBit( notXY, float32v( kComponentC ), float32v( kComponentA ) ^ sign1 );
+            float32v valueB = FS::FMulAdd( valueB_gZ, fZ, FS::FMulAdd( fY, valueB_gY, fX * valueB_gX ) );
+
+            float32v valueC_gX = FS::SelectHighBit( notYZ, float32v( kComponentF ), float32v( kComponentsDE ) ^ sign0 ); // same structure again, using kComponentsDE and kComponentF
+            float32v valueC_gY = FS::SelectHighBit( notYZ | notXY, FS::SelectHighBit( notXY, float32v( kComponentsDE ) ^ sign1, float32v( kComponentsDE ) ^ sign0 ), float32v( kComponentF ) );
+            float32v valueC_gZ = FS::SelectHighBit( notXY, float32v( kComponentF ), float32v( kComponentsDE ) ^ sign1 );
+            valueC = FS::FMulAdd( valueC_gZ, fZ, FS::FMulAdd( fY, valueC_gY, fX * valueC_gX ) );
+
+            valueAB = FS::SelectHighBit( indexPermutation2HighBit, valueB, valueA ); // index bit 2 (shifted to the MSB above) decides whether A and B trade places
+            valueBA = FS::SelectHighBit( indexPermutation2HighBit, valueA, valueB ); // always the opposite choice to valueAB
         }
+        
+        multiplier ^= finalSign; // XOR the lowest hash bit into multiplier's sign once, affecting all three outputs
+        valueX = FS::FMulAdd( multiplier, FS::Select( indexPermutation3 == int32v( 0 ), valueC, valueAB ), valueX ); // X row: valueC when indexPermutation3 == 0, otherwise valueAB
+        valueY = FS::FMulAdd( multiplier, FS::Select( indexPermutation3 == int32v( 1 ), valueC, FS::Select( indexPermutation3 == int32v( 2 ), valueBA, valueAB ) ), valueY ); // Y row: valueC when == 1, valueBA when == 2, otherwise valueAB
+        valueZ = FS::FMulAdd( multiplier, FS::Select( indexPermutation3 == int32v( 2 ), valueC, valueBA ), valueZ ); // Z row: valueC when indexPermutation3 == 2, otherwise valueBA
+    }
 
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level != FastSIMD::Level_AVX512>* = nullptr >
-        FS_INLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY, float32v fZ, float32v fW )
+    template<FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault()>
+    static void FS_VECTORCALL ApplyOrthogonalGradientMatrixVectorProductSimplex( int32v hash31, float32v fX, float32v fY, float32v fZ, float32v fW, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ, float32v& valueW )
+    {
+        constexpr float kComponentPairwiseIndexedNegativeAB = -0.375999676691291f;
+        constexpr float kComponentPairwiseUnindexedFillerAB = 0.222726847849776f;
+        constexpr float kComponentPairwiseIndexedPositiveD = -kSkew4f;
+        constexpr float kComponentPairwiseUnindexedD = kSkew4f;
+
+        constexpr float kDeltaPairwiseToSingleAB = -0.124000323308709f;
+        constexpr float kDeltaPairwiseToSingleD = 0.190983005625053f;
+        constexpr float kDeltaSingleToExtra = kSkew4f;
+        constexpr float kDeltaPairwiseABToC = 0.437016024448821f;
+        constexpr float kDeltaUnindexedFillerToDiagonal = -kRoot2f;
+
+        constexpr float kDeltaPairwiseToSingleExtraAB = kDeltaPairwiseToSingleAB + kDeltaSingleToExtra;
+        constexpr float kDeltaPairwiseToSingleExtraD = kDeltaPairwiseToSingleD + kDeltaSingleToExtra;
+
+        constexpr float sIdxABC = kComponentPairwiseIndexedNegativeAB + kDeltaPairwiseToSingleAB;
+        constexpr float sDiagABC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseToSingleAB + kDeltaUnindexedFillerToDiagonal;
+        constexpr float sFillABC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseToSingleAB;
+        constexpr float sIdxD = kComponentPairwiseIndexedPositiveD + kDeltaPairwiseToSingleD - 1;
+        constexpr float sFillD = kComponentPairwiseUnindexedD + kDeltaPairwiseToSingleD;
+
+        constexpr float pIdxPosAB = kComponentPairwiseIndexedNegativeAB + 1;
+        constexpr float pIdxNegAB = kComponentPairwiseIndexedNegativeAB;
+        constexpr float pFillAB = kComponentPairwiseUnindexedFillerAB;
+        constexpr float pDiagAB = kComponentPairwiseUnindexedFillerAB + kDeltaUnindexedFillerToDiagonal;
+        constexpr float pIdxPosC = kComponentPairwiseIndexedNegativeAB + kDeltaPairwiseABToC + 1;
+        constexpr float pIdxNegC = kComponentPairwiseIndexedNegativeAB + kDeltaPairwiseABToC;
+        constexpr float pFillC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseABToC;
+        constexpr float pIdxPosD = kComponentPairwiseIndexedPositiveD;
+        constexpr float pIdxNegD = kComponentPairwiseIndexedPositiveD - 1;
+        constexpr float pFillD = kComponentPairwiseUnindexedD;
+
+        constexpr float eIdxABC = kComponentPairwiseIndexedNegativeAB + kDeltaPairwiseToSingleExtraAB + 1;
+        constexpr float eDiagABC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseToSingleExtraAB + kDeltaUnindexedFillerToDiagonal;
+        constexpr float eFillABC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseToSingleExtraAB;
+        constexpr float eIdxD = kComponentPairwiseIndexedPositiveD + kDeltaPairwiseToSingleExtraD;
+        constexpr float eFillD = kComponentPairwiseUnindexedD + kDeltaPairwiseToSingleExtraD;
+
+        int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 2 );
+        int32v indexBasis = hashShifted * int32v( 20 >> 2 ); // [0,20) << 27
+        int32v indexPermutation3 = ( hashShifted * int32v( 0xD5555556 ) ) >> 29; // [0,3] // ( -4LL << 29 ) / 3
+        int32v indexPermutation8 = indexBasis >> 24; // & int32v( 0x07 );
+        float32v finalSign = FS::Cast<float>( hash31 << 31 );
+
+        float32v valueA, valueB, valueC, valueD;
+        float32v valueA_gX, valueB_gX, valueC_gX, valueD_gX;
+        float32v valueA_gY, valueB_gY, valueC_gY, valueD_gY;
+        float32v valueA_gZ, valueB_gZ, valueC_gZ, valueD_gZ;
+        float32v valueA_gW, valueB_gW, valueC_gW, valueD_gW;
+
+        if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F )
         {
-            int32v p = hash & int32v( 3 << 3 );
+            indexBasis >>= 27;
+
+            valueA_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sIdxABC, sDiagABC, sDiagABC, sDiagABC, pIdxPosAB, pIdxNegAB, pDiagAB, pDiagAB, pIdxPosAB, pDiagAB, pIdxNegAB, pDiagAB, pIdxPosAB, pDiagAB, pDiagAB, pIdxNegAB ), indexBasis,
+                FS::Constant<float>( eIdxABC, eDiagABC, eDiagABC, eDiagABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueB_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sIdxABC, sFillABC, sFillABC, sFillABC, pIdxPosAB, pIdxNegAB, pFillAB, pFillAB, pIdxPosAB, pFillAB, pIdxNegAB, pFillAB, pIdxPosAB, pFillAB, pFillAB, pIdxNegAB ), indexBasis,
+                FS::Constant<float>( eIdxABC, eFillABC, eFillABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueC_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sIdxABC, sFillABC, sFillABC, sFillABC, pIdxPosC, pIdxNegC, pFillC, pFillC, pIdxPosC, pFillC, pIdxNegC, pFillC, pIdxPosC, pFillC, pFillC, pIdxNegC ), indexBasis,
+                FS::Constant<float>( eIdxABC, eFillABC, eFillABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueD_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sIdxD, sFillD, sFillD, sFillD, pIdxPosD, pIdxNegD, pFillD, pFillD, pIdxPosD, pFillD, pIdxNegD, pFillD, pIdxPosD, pFillD, pFillD, pIdxNegD ), indexBasis,
+                FS::Constant<float>( eIdxD, eFillD, eFillD, eFillD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+
+            valueA = valueA_gX * fX;
+            valueB = valueB_gX * fX;
+            valueC = valueC_gX * fX;
+            valueD = valueD_gX * fX;
+
+            valueA_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sDiagABC, sIdxABC, sFillABC, sFillABC, pIdxNegAB, pIdxPosAB, pFillAB, pFillAB, pDiagAB, pIdxPosAB, pDiagAB, pIdxNegAB, pDiagAB, pIdxPosAB, pIdxNegAB, pDiagAB ), indexBasis,
+                FS::Constant<float>( eDiagABC, eIdxABC, eFillABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueB_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sFillABC, sIdxABC, sDiagABC, sDiagABC, pIdxNegAB, pIdxPosAB, pDiagAB, pDiagAB, pFillAB, pIdxPosAB, pFillAB, pIdxNegAB, pFillAB, pIdxPosAB, pIdxNegAB, pFillAB ), indexBasis,
+                FS::Constant<float>( eFillABC, eIdxABC, eDiagABC, eDiagABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueC_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sFillABC, sIdxABC, sFillABC, sFillABC, pIdxNegC, pIdxPosC, pFillC, pFillC, pFillC, pIdxPosC, pFillC, pIdxNegC, pFillC, pIdxPosC, pIdxNegC, pFillC ), indexBasis,
+                FS::Constant<float>( eFillABC, eIdxABC, eFillABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueD_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sFillD, sIdxD, sFillD, sFillD, pIdxNegD, pIdxPosD, pFillD, pFillD, pFillD, pIdxPosD, pFillD, pIdxNegD, pFillD, pIdxPosD, pIdxNegD, pFillD ), indexBasis,
+                FS::Constant<float>( eFillD, eIdxD, eFillD, eFillD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+
+            valueA = FS::FMulAdd( valueA_gY, fY, valueA );
+            valueB = FS::FMulAdd( valueB_gY, fY, valueB );
+            valueC = FS::FMulAdd( valueC_gY, fY, valueC );
+            valueD = FS::FMulAdd( valueD_gY, fY, valueD );
+
+            valueA_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sFillABC, sFillABC, sIdxABC, sFillABC, pDiagAB, pDiagAB, pIdxPosAB, pIdxNegAB, pIdxNegAB, pFillAB, pIdxPosAB, pFillAB, pFillAB, pIdxNegAB, pIdxPosAB, pFillAB ), indexBasis,
+                FS::Constant<float>( eFillABC, eFillABC, eIdxABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueB_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sDiagABC, sDiagABC, sIdxABC, sFillABC, pFillAB, pFillAB, pIdxPosAB, pIdxNegAB, pIdxNegAB, pDiagAB, pIdxPosAB, pDiagAB, pDiagAB, pIdxNegAB, pIdxPosAB, pDiagAB ), indexBasis,
+                FS::Constant<float>( eDiagABC, eDiagABC, eIdxABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueC_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sFillABC, sFillABC, sIdxABC, sDiagABC, pFillC, pFillC, pIdxPosC, pIdxNegC, pIdxNegC, pFillC, pIdxPosC, pFillC, pFillC, pIdxNegC, pIdxPosC, pFillC ), indexBasis,
+                FS::Constant<float>( eFillABC, eFillABC, eIdxABC, eDiagABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueD_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sFillD, sFillD, sIdxD, sFillD, pFillD, pFillD, pIdxPosD, pIdxNegD, pIdxNegD, pFillD, pIdxPosD, pFillD, pFillD, pIdxNegD, pIdxPosD, pFillD ), indexBasis,
+                FS::Constant<float>( eFillD, eFillD, eIdxD, eFillD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+
+            valueA = FS::FMulAdd( valueA_gZ, fZ, valueA );
+            valueB = FS::FMulAdd( valueB_gZ, fZ, valueB );
+            valueC = FS::FMulAdd( valueC_gZ, fZ, valueC );
+            valueD = FS::FMulAdd( valueD_gZ, fZ, valueD );
+
+            valueA_gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sFillABC, sFillABC, sFillABC, sIdxABC, pFillAB, pFillAB, pIdxNegAB, pIdxPosAB, pFillAB, pIdxNegAB, pFillAB, pIdxPosAB, pIdxNegAB, pFillAB, pFillAB, pIdxPosAB ), indexBasis,
+                FS::Constant<float>( eFillABC, eFillABC, eFillABC, eIdxABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueB_gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sFillABC, sFillABC, sFillABC, sIdxABC, pDiagAB, pDiagAB, pIdxNegAB, pIdxPosAB, pDiagAB, pIdxNegAB, pDiagAB, pIdxPosAB, pIdxNegAB, pDiagAB, pDiagAB, pIdxPosAB ), indexBasis,
+                FS::Constant<float>( eFillABC, eFillABC, eFillABC, eIdxABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueC_gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sDiagABC, sDiagABC, sDiagABC, sIdxABC, pFillC, pFillC, pIdxNegC, pIdxPosC, pFillC, pIdxNegC, pFillC, pIdxPosC, pIdxNegC, pFillC, pFillC, pIdxPosC ), indexBasis,
+                FS::Constant<float>( eDiagABC, eDiagABC, eDiagABC, eIdxABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+            valueD_gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ),
+                FS::Constant<float>( sFillD, sFillD, sFillD, sIdxD, pFillD, pFillD, pIdxNegD, pIdxPosD, pFillD, pIdxNegD, pFillD, pIdxPosD, pIdxNegD, pFillD, pFillD, pIdxPosD ), indexBasis,
+                FS::Constant<float>( eFillD, eFillD, eFillD, eIdxD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+            );
+
+            valueA = FS::FMulAdd( valueA_gW, fW, valueA );
+            valueB = FS::FMulAdd( valueB_gW, fW, valueB );
+            valueC = FS::FMulAdd( valueC_gW, fW, valueC );
+            valueD = FS::FMulAdd( valueD_gW, fW, valueD );
+        }
+        else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 )
+        {
+            const auto tableAB = FS::Constant<float>( pFillAB, pIdxNegAB, pFillAB, pIdxPosAB, sFillABC, sIdxABC, eFillABC, eIdxABC );
+            const auto tableC  = FS::Constant<float>( pFillC,  pIdxNegC,  pFillC,  pIdxPosC,  sFillABC, sIdxABC, eFillABC, eIdxABC );
+            const auto tableD  = FS::Constant<float>( pFillD,  pIdxNegD,  pFillD,  pIdxPosD,  sFillD,   sIdxD,   eFillD,   eIdxD   );
+
+            int32v indexPositive = indexBasis & int32v( 0x03 << 27 );
+            int32v indexNegative = ( indexBasis >> 2 ) & int32v( 0x03 << 27 );
+            indexNegative ^= indexPositive;
+
+            auto extraCase = ( indexBasis >= int32v( 0x10 << 27 ) );
+            auto singleCase = ( indexPositive == indexNegative );
+            indexPositive |= FS::Cast<int32_t>( singleCase ); // Force indexPositive checks to fail
+
+            int32v indexSelectBase = FS::Masked( singleCase, int32v( 4 ) ) | FS::Masked( extraCase, int32v( 2 ) );
+
+            int32v indexedCounter( -1 );
 
-            float32v a = FS_Select_f32( p > int32v( 0 ), fX, fY );
-            float32v b;
-            if constexpr( FS::SIMD_Level <= FastSIMD::Level_SSE2 )
             {
-                b = FS_Select_f32( p > int32v( 1 << 3 ), fY, fZ );        
+                auto indexedPositive = ( indexPositive == int32v( 0 << 27 ) );
+                auto indexed = indexedPositive | ( indexNegative == int32v( 0 << 27 ) );
+                int32v indexSelect = FS::MaskedIncrement( indexed, indexSelectBase | FS::Masked( indexedPositive, int32v( 2 ) ) );
+
+                valueA_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant<float>( pDiagAB, pIdxNegAB, pDiagAB, pIdxPosAB, sDiagABC, sIdxABC, eDiagABC, eIdxABC ), indexSelect );
+                valueB_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableAB, indexSelect );
+                valueC_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableC,  indexSelect );
+                valueD_gX = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableD,  indexSelect );
+
+                indexedCounter = FS::MaskedDecrement( indexed, indexedCounter );
             }
-            else
+
+            valueA = valueA_gX * fX;
+            valueB = valueB_gX * fX;
+            valueC = valueC_gX * fX;
+            valueD = valueD_gX * fX;
+
             {
-                b = FS_Select_f32( hash << 27, fY, fZ );
+                auto indexedPositive = ( indexPositive == int32v( 1 << 27 ) );
+                auto indexed = indexedPositive | ( indexNegative == int32v( 1 << 27 ) );
+                int32v indexSelect = FS::MaskedIncrement( indexed, indexSelectBase | FS::Masked( indexedPositive, int32v( 2 ) ) );
+
+                valueA_gY = valueB_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableAB, indexSelect );
+                valueC_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableC, indexSelect );
+                valueD_gY = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableD, indexSelect );
+
+                int32v maskedIndexedCounter = FS::InvMasked( indexed, indexedCounter );
+                valueA_gY = FS::MaskedAdd( maskedIndexedCounter == int32v( -2 ), valueA_gY, float32v( kDeltaUnindexedFillerToDiagonal ) );
+                valueB_gY = FS::MaskedAdd( maskedIndexedCounter == int32v( -1 ), valueB_gY, float32v( kDeltaUnindexedFillerToDiagonal ) );
+
+                indexedCounter = FS::MaskedDecrement( indexed, indexedCounter );
             }
-            float32v c = FS_Select_f32( p > int32v( 2 << 3 ), fZ, fW );
 
-            float32v aSign = FS_Casti32_f32( hash << 31 );
-            float32v bSign = FS_Casti32_f32( (hash << 30) & int32v( 0x80000000 ) );
-            float32v cSign = FS_Casti32_f32( (hash << 29) & int32v( 0x80000000 ) );
+            valueA = FS::FMulAdd( valueA_gY, fY, valueA );
+            valueB = FS::FMulAdd( valueB_gY, fY, valueB );
+            valueC = FS::FMulAdd( valueC_gY, fY, valueC );
+            valueD = FS::FMulAdd( valueD_gY, fY, valueD );
 
-            return ( a ^ aSign ) + ( b ^ bSign ) + ( c ^ cSign );
-        }
+            {
+                auto indexedPositive = ( indexPositive == int32v( 2 << 27 ) );
+                auto indexed = indexedPositive | ( indexNegative == int32v( 2 << 27 ) );
+                int32v indexSelect = FS::MaskedIncrement( indexed, indexSelectBase | FS::Masked( indexedPositive, int32v( 2 ) ) );
 
-        template<typename SIMD = FS, std::enable_if_t<SIMD::SIMD_Level == FastSIMD::Level_AVX512>* = nullptr>
-        FS_INLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY, float32v fZ, float32v fW )
-        {
-            float32v gX = _mm512_permutex2var_ps( float32v( 0, 0, 0, 0, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, -1 ), hash, float32v( 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 ) );
-            float32v gY = _mm512_permutex2var_ps( float32v( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ), hash, float32v( 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1 ) );
-            float32v gZ = _mm512_permutex2var_ps( float32v( 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1 ), hash, float32v( 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 ) );
-            float32v gW = _mm512_permutex2var_ps( float32v( 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1 ), hash, float32v( 1, 1, 1, 1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ) );
+                valueA_gZ = valueB_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableAB, indexSelect );
+                valueC_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableC, indexSelect );
+                valueD_gZ = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableD, indexSelect );
 
-            return FS_FMulAdd_f32( gX, fX, FS_FMulAdd_f32( fY, gY, FS_FMulAdd_f32( fZ, gZ, fW * gW ) ));
-        }
+                int32v maskedIndexedCounter = FS::InvMasked( indexed, indexedCounter );
+                valueA_gZ = FS::MaskedAdd( maskedIndexedCounter == int32v( -3 ), valueA_gZ, float32v( kDeltaUnindexedFillerToDiagonal ) );
+                valueB_gZ = FS::MaskedAdd( maskedIndexedCounter == int32v( -2 ), valueB_gZ, float32v( kDeltaUnindexedFillerToDiagonal ) );
+                valueC_gZ = FS::MaskedAdd( maskedIndexedCounter == int32v( -1 ), valueC_gZ, float32v( kDeltaUnindexedFillerToDiagonal ) );
 
-        template<typename SIMD = FS, typename... P>
-        FS_INLINE static int32v HashPrimes( int32v seed, P... primedPos )
-        {
-            int32v hash = seed;
-            hash ^= (primedPos ^ ...);
+                indexedCounter = FS::MaskedDecrement( indexed, indexedCounter );
+            }
 
-            hash *= int32v( 0x27d4eb2d );
-            return (hash >> 15) ^ hash;
-        }
+            valueA = FS::FMulAdd( valueA_gZ, fZ, valueA );
+            valueB = FS::FMulAdd( valueB_gZ, fZ, valueB );
+            valueC = FS::FMulAdd( valueC_gZ, fZ, valueC );
+            valueD = FS::FMulAdd( valueD_gZ, fZ, valueD );
 
-        template<typename SIMD = FS, typename... P>
-        FS_INLINE static int32v HashPrimesHB( int32v seed, P... primedPos )
-        {
-            int32v hash = seed;
-            hash ^= (primedPos ^ ...);
-            
-            hash *= int32v( 0x27d4eb2d );
-            return hash;
-        }  
-
-        template<typename SIMD = FS, typename... P>
-         FS_INLINE static float32v GetValueCoord( int32v seed, P... primedPos )
+            {
+                auto indexedPositive = ( indexPositive == int32v( 3 << 27 ) );
+                auto indexed = indexedPositive | ( indexNegative == int32v( 3 << 27 ) );
+                int32v indexSelect = FS::MaskedIncrement( indexed, indexSelectBase | FS::Masked( indexedPositive, int32v( 2 ) ) );
+
+                valueA_gW = valueB_gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableAB, indexSelect );
+                valueC_gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableC, indexSelect );
+                valueD_gW = FS::NativeExec<float32v>( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableD, indexSelect );
+
+                int32v maskedIndexedCounter = FS::InvMasked( indexed, indexedCounter );
+                valueB_gW = FS::MaskedAdd( maskedIndexedCounter == int32v( -3 ), valueB_gW, float32v( kDeltaUnindexedFillerToDiagonal ) );
+                valueC_gW = FS::MaskedAdd( maskedIndexedCounter == int32v( -2 ), valueC_gW, float32v( kDeltaUnindexedFillerToDiagonal ) );
+            }
+
+            valueA = FS::FMulAdd( valueA_gW, fW, valueA );
+            valueB = FS::FMulAdd( valueB_gW, fW, valueB );
+            valueC = FS::FMulAdd( valueC_gW, fW, valueC );
+            valueD = FS::FMulAdd( valueD_gW, fW, valueD );
+        }
+        else
         {
-            int32v hash = seed;
-            hash ^= (primedPos ^ ...);
-            
-            hash *= hash * int32v( 0x27d4eb2d );
-            return FS_Converti32_f32( hash ) * float32v( 1.0f / (float)INT_MAX );
+            int32v indexPositive = indexBasis & int32v( 0x03 << 27 );
+            int32v indexNegative = ( indexBasis >> 2 ) & int32v( 0x03 << 27 );
+            indexNegative ^= indexPositive;
+
+            auto extraCase = ( indexBasis >= int32v( 0x10 << 27 ) );
+            auto singleCase = ( indexPositive == indexNegative );
+            auto singleNonExtraCase = indexBasis < int32v( 0x04 << 27 );
+            indexPositive |= FS::Cast<int32_t>( singleNonExtraCase ); // Force indexPositive checks to fail
+
+            float32v singleOffsetAB = FS::MaskedAdd( extraCase, float32v( kDeltaPairwiseToSingleAB ), float32v( kDeltaSingleToExtra ) );
+            float32v componentIndexedNegativeAB = FS::MaskedAdd( singleCase, float32v( kComponentPairwiseIndexedNegativeAB ), singleOffsetAB );
+            float32v componentUnindexedFillerAB = FS::MaskedAdd( singleCase, float32v( kComponentPairwiseUnindexedFillerAB ), singleOffsetAB );
+
+            float32v componentIndexedNegativeC = FS::InvMaskedAdd( singleCase, componentIndexedNegativeAB, float32v( kDeltaPairwiseABToC ) );
+            float32v componentUnindexedFillerC = FS::InvMaskedAdd( singleCase, componentUnindexedFillerAB, float32v( kDeltaPairwiseABToC ) );
+
+            float32v singleOffsetD = FS::MaskedAdd( extraCase, float32v( kDeltaPairwiseToSingleD ), float32v( kDeltaSingleToExtra ) );
+            float32v componentIndexedPositiveD = FS::MaskedAdd( singleCase, float32v( kComponentPairwiseIndexedPositiveD ), singleOffsetD );
+            float32v componentUnindexedD = FS::MaskedAdd( singleCase, float32v( kComponentPairwiseUnindexedD ), singleOffsetD );
+
+            int32v indexedCounter( -1 );
+
+            {
+                auto indexedPositive = ( indexPositive == int32v( 0 << 27 ) );
+                auto indexed = indexedPositive | ( indexNegative == int32v( 0 << 27 ) );
+
+                float32v indexedComponentAB = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeAB );
+                float32v indexedComponentC = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeC );
+                float32v indexedComponentD = FS::MaskedDecrement( ~indexedPositive, componentIndexedPositiveD );
+
+                float32v unindexedComponentA = componentUnindexedFillerAB + float32v( kDeltaUnindexedFillerToDiagonal );
+                float32v unindexedComponentB = componentUnindexedFillerAB;
+                float32v unindexedComponentC = componentUnindexedFillerC;
+
+                valueA_gX = FS::Select( indexed, indexedComponentAB, unindexedComponentA );
+                valueB_gX = FS::Select( indexed, indexedComponentAB, unindexedComponentB );
+                valueC_gX = FS::Select( indexed, indexedComponentC,  unindexedComponentC );
+                valueD_gX = FS::Select( indexed, indexedComponentD,  componentUnindexedD );
+
+                indexedCounter = FS::MaskedDecrement( indexed, indexedCounter );
+            }
+
+            valueA = valueA_gX * fX;
+            valueB = valueB_gX * fX;
+            valueC = valueC_gX * fX;
+            valueD = valueD_gX * fX;
+
+            {
+                auto indexedPositive = ( indexPositive == int32v( 1 << 27 ) );
+                auto indexed = indexedPositive | ( indexNegative == int32v( 1 << 27 ) );
+
+                float32v indexedComponentAB = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeAB );
+                float32v indexedComponentC = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeC );
+                float32v indexedComponentD = FS::MaskedDecrement( ~indexedPositive, componentIndexedPositiveD );
+
+                float32v unindexedComponentA = FS::MaskedAdd( indexedCounter == int32v( -2 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) );
+                float32v unindexedComponentB = FS::MaskedAdd( indexedCounter == int32v( -1 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) );
+                float32v unindexedComponentC = componentUnindexedFillerC;
+
+                valueA_gY = FS::Select( indexed, indexedComponentAB, unindexedComponentA );
+                valueB_gY = FS::Select( indexed, indexedComponentAB, unindexedComponentB );
+                valueC_gY = FS::Select( indexed, indexedComponentC,  unindexedComponentC );
+                valueD_gY = FS::Select( indexed, indexedComponentD,  componentUnindexedD );
+
+                indexedCounter = FS::MaskedDecrement( indexed, indexedCounter );
+            }
+
+            valueA = FS::FMulAdd( valueA_gY, fY, valueA );
+            valueB = FS::FMulAdd( valueB_gY, fY, valueB );
+            valueC = FS::FMulAdd( valueC_gY, fY, valueC );
+            valueD = FS::FMulAdd( valueD_gY, fY, valueD );
+
+            {
+                auto indexedPositive = ( indexPositive == int32v( 2 << 27 ) );
+                auto indexed = indexedPositive | ( indexNegative == int32v( 2 << 27 ) );
+
+                float32v indexedComponentAB = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeAB );
+                float32v indexedComponentC = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeC );
+                float32v indexedComponentD = FS::MaskedDecrement( ~indexedPositive, componentIndexedPositiveD );
+
+                float32v unindexedComponentA = FS::MaskedAdd( indexedCounter == int32v( -3 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) );
+                float32v unindexedComponentB = FS::MaskedAdd( indexedCounter == int32v( -2 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) );
+                float32v unindexedComponentC = FS::MaskedAdd( indexedCounter == int32v( -1 ), componentUnindexedFillerC,  float32v( kDeltaUnindexedFillerToDiagonal ) );
+
+                valueA_gZ = FS::Select( indexed, indexedComponentAB, unindexedComponentA );
+                valueB_gZ = FS::Select( indexed, indexedComponentAB, unindexedComponentB );
+                valueC_gZ = FS::Select( indexed, indexedComponentC,  unindexedComponentC );
+                valueD_gZ = FS::Select( indexed, indexedComponentD,  componentUnindexedD );
+
+                indexedCounter = FS::MaskedDecrement( indexed, indexedCounter );
+            }
+
+            valueA = FS::FMulAdd( valueA_gZ, fZ, valueA );
+            valueB = FS::FMulAdd( valueB_gZ, fZ, valueB );
+            valueC = FS::FMulAdd( valueC_gZ, fZ, valueC );
+            valueD = FS::FMulAdd( valueD_gZ, fZ, valueD );
+
+            {
+                auto indexedPositive = ( indexPositive == int32v( 3 << 27 ) );
+                auto indexed = indexedPositive | ( indexNegative == int32v( 3 << 27 ) );
+
+                float32v indexedComponentAB = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeAB );
+                float32v indexedComponentC = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeC );
+                float32v indexedComponentD = FS::MaskedDecrement( ~indexedPositive, componentIndexedPositiveD );
+
+                float32v unindexedComponentA = componentUnindexedFillerAB;
+                float32v unindexedComponentB = FS::MaskedAdd( indexedCounter == int32v( -3 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) );
+                float32v unindexedComponentC = FS::MaskedAdd( indexedCounter == int32v( -2 ), componentUnindexedFillerC,  float32v( kDeltaUnindexedFillerToDiagonal ) );
+
+                valueA_gW = FS::Select( indexed, indexedComponentAB, unindexedComponentA );
+                valueB_gW = FS::Select( indexed, indexedComponentAB, unindexedComponentB );
+                valueC_gW = FS::Select( indexed, indexedComponentC,  unindexedComponentC );
+                valueD_gW = FS::Select( indexed, indexedComponentD,  componentUnindexedD );
+            }
+
+            valueA = FS::FMulAdd( valueA_gW, fW, valueA );
+            valueB = FS::FMulAdd( valueB_gW, fW, valueB );
+            valueC = FS::FMulAdd( valueC_gW, fW, valueC );
+            valueD = FS::FMulAdd( valueD_gW, fW, valueD );
         }
 
-        template<typename SIMD = FS>
-        FS_INLINE static float32v Lerp( float32v a, float32v b, float32v t )
-        {
-            return FS_FMulAdd_f32( t, b - a, a );
+        int32v valueIndexX = ( indexPermutation8 >> 1 ); // & int32v( 0x3 );
+        int32v valueIndexY = ( FS::Increment( valueIndexX ) + indexPermutation3 ); // & int32v( 0x3 );
+        int32v valueIndexZ = indexPermutation8 & int32v( 0x1 );
+        valueIndexZ = ( FS::Increment( valueIndexX ) + FS::MaskedIncrement( valueIndexZ >= indexPermutation3, valueIndexZ ) ); // & int32v( 0x3 );
+        int32v valueIndexSumXYZ = valueIndexX + valueIndexY + valueIndexZ;
+
+        multiplier ^= finalSign;
+        valueX = FS::FMulAdd( multiplier, FS::SelectHighBit( valueIndexX << 31, FS::SelectHighBit( valueIndexX << 30, valueD, valueB ), FS::SelectHighBit( valueIndexX << 30, valueC, valueA ) ), valueX );
+        valueY = FS::FMulAdd( multiplier, FS::SelectHighBit( valueIndexY << 31, FS::SelectHighBit( valueIndexY << 30, valueD, valueB ), FS::SelectHighBit( valueIndexY << 30, valueC, valueA ) ), valueY );
+        valueZ = FS::FMulAdd( multiplier, FS::SelectHighBit( valueIndexZ << 31, FS::SelectHighBit( valueIndexZ << 30, valueD, valueB ), FS::SelectHighBit( valueIndexZ << 30, valueC, valueA ) ), valueZ );
+        valueW = FS::FMulAdd( multiplier, FS::SelectHighBit( valueIndexSumXYZ << 31, FS::SelectHighBit( valueIndexSumXYZ << 30, valueD, valueB ), FS::SelectHighBit( valueIndexSumXYZ << 30, valueA, valueC ) ), valueW );
+    }
+
+    // Compile-time dispatch for the two-component (fX, fY) case: forwards every argument unchanged to the routine matching Scheme; any other Scheme value does nothing.
+    template<VectorizationScheme Scheme>
+    FS_FORCEINLINE static void ApplyVectorContributionSimplex( int32v hash, float32v fX, float32v fY, float32v multiplier, float32v& valueX, float32v& valueY ) {
+        if constexpr( Scheme == VectorizationScheme::OrthogonalGradientMatrix ) {
+            ApplyOrthogonalGradientMatrixVectorProductSimplex( hash, fX, fY, multiplier, valueX, valueY );
+        } else if constexpr( Scheme == VectorizationScheme::GradientOuterProduct ) {
+            ApplyGradientOuterProductVectorProductSimplex( hash, fX, fY, multiplier, valueX, valueY );
         }
+    }
 
-        template<typename SIMD = FS>
-         FS_INLINE static float32v InterpHermite( float32v t )
-        {
-            return t * t * FS_FNMulAdd_f32( t, float32v( 2 ), float32v( 3 ));
+    // Compile-time dispatch for the three-component (fX, fY, fZ) case: forwards every argument unchanged to the routine matching Scheme; any other Scheme value does nothing.
+    template<VectorizationScheme Scheme>
+    FS_FORCEINLINE static void ApplyVectorContributionCommon( int32v hash, float32v fX, float32v fY, float32v fZ, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ ) {
+        if constexpr( Scheme == VectorizationScheme::OrthogonalGradientMatrix ) {
+            ApplyOrthogonalGradientMatrixVectorProductCommon( hash, fX, fY, fZ, multiplier, valueX, valueY, valueZ );
+        } else if constexpr( Scheme == VectorizationScheme::GradientOuterProduct ) {
+            ApplyGradientOuterProductVectorProductCommon( hash, fX, fY, fZ, multiplier, valueX, valueY, valueZ );
         }
+    }
 
-        template<typename SIMD = FS>
-         FS_INLINE static float32v InterpQuintic( float32v t )
-        {
-            return t * t * t * FS_FMulAdd_f32( t, FS_FMulAdd_f32( t, float32v( 6 ), float32v( -15 )), float32v( 10 ) );
+    // Compile-time dispatch for the four-component (fX, fY, fZ, fW) case: forwards every argument unchanged to the routine matching Scheme; any other Scheme value does nothing.
+    template<VectorizationScheme Scheme>
+    FS_FORCEINLINE static void ApplyVectorContributionSimplex( int32v hash, float32v fX, float32v fY, float32v fZ, float32v fW, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ, float32v& valueW ) {
+        if constexpr( Scheme == VectorizationScheme::OrthogonalGradientMatrix ) {
+            ApplyOrthogonalGradientMatrixVectorProductSimplex( hash, fX, fY, fZ, fW, multiplier, valueX, valueY, valueZ, valueW );
+        } else if constexpr( Scheme == VectorizationScheme::GradientOuterProduct ) {
+            ApplyGradientOuterProductVectorProductSimplex( hash, fX, fY, fZ, fW, multiplier, valueX, valueY, valueZ, valueW );
         }
+    }
+
+    // XOR-folds the seed with every primed coordinate, multiplies by HashMultiplier::A, then xors the product with itself shifted right by 15.
+    template<typename... P>
+    FS_FORCEINLINE static int32v HashPrimes( int32v seed, P... primedPos )
+    {
+        int32v mixed = seed ^ ( primedPos ^ ... );
+        // Second stage: scale by the hash multiplier ahead of the final shift-xor.
+        mixed = mixed * int32v( HashMultiplier::A );
 
-        template<typename SIMD = FS, typename... P>
-        FS_INLINE static float32v CalcDistance( DistanceFunction distFunc, float32v dX, P... d )
+        return mixed ^ ( mixed >> 15 );
+    }
+
+    // Same XOR-fold and multiply as HashPrimes, but returns the product directly without the trailing shift-xor step.
+    template<typename... P>
+    FS_FORCEINLINE static int32v HashPrimesHB( int32v seed, P... primedPos )
+    {
+        // NOTE(review): the "HB" suffix presumably means callers only rely on the high bits of the result — confirm.
+        int32v folded = seed ^ ( primedPos ^ ... );
+
+        return folded * int32v( HashMultiplier::A );
+    }
+
+    // Hashes the seed with the primed coordinates and returns the integer hash converted to float; no range normalisation is applied here.
+    template<typename... P>
+    FS_FORCEINLINE static float32v GetValueCoord( int32v seed, P... primedPos )
+    {
+        int32v folded = seed ^ ( primedPos ^ ... );
+        // Mix step: folded * ( folded * HashMultiplier::A )
+        int32v mixed = folded * ( folded * int32v( HashMultiplier::A ) );
+        return FS::Convert<float>( mixed );
+    }
+     
+    FS_FORCEINLINE static float32v Lerp( float32v a, float32v b, float32v t ) {
+        float32v span = b - a; // blend from a towards b; FMulAdd( t, span, a ) presumably evaluates t * span + a
+        return FS::FMulAdd( t, span, a );
+    }
+    
+    FS_FORCEINLINE static float32v InterpHermite( float32v t ) {
+        float32v tSquared = t * t; // NOTE(review): if FNMulAdd( a, b, c ) is c - a * b, the result is t^2 * ( 3 - 2t ) — confirm
+        return tSquared * FS::FNMulAdd( t, float32v( 2 ), float32v( 3 ) );
+    }
+     
+     FS_FORCEINLINE static float32v InterpQuintic( float32v t ) {
+        float32v poly = FS::FMulAdd( t, FS::FMulAdd( t, float32v( 6 ), float32v( -15 ) ), float32v( 10 ) ); // presumably t * ( 6t - 15 ) + 10
+        return t * t * t * poly; // t^3 * ( 6t^2 - 15t + 10 ) under that reading of FMulAdd
+    }
+
+    template<bool DO_SQRT = true, FastSIMD::FeatureSet SIMD = FastSIMD::FeatureSetDefault(), typename... P> // DO_SQRT = false discards the Euclidean body at compile time
+    FS_FORCEINLINE static float32v CalcDistance( DistanceFunction distFunc, const HybridSource& minkowskiP, int32v seed, float32v pX, P... pos ) // distance measure of the components ( pX, pos... ) selected by distFunc
+    {
+        switch( distFunc )
         {
-            switch( distFunc )
+            default: // values without their own case label are treated as Euclidean
+            case DistanceFunction::Euclidean:
+            if constexpr( DO_SQRT ) // when DO_SQRT is false nothing is emitted here and control falls into EuclideanSquared
             {
-                default:
-                case DistanceFunction::Euclidean:
-                {
-                    float32v distSqr = dX * dX;
-                    ((distSqr = FS_FMulAdd_f32( d, d, distSqr )), ...);
+                float32v distSqr = pX * pX;
+                ((distSqr = FS::FMulAdd( pos, pos, distSqr )), ...); // add the square of each remaining component
 
-                    return FS_InvSqrt_f32( distSqr ) * distSqr;
-                }
+                float32v invSqrt = FS::InvSqrt( distSqr );
 
-                case DistanceFunction::EuclideanSquared:
-                {
-                    float32v distSqr = dX * dX;
-                    ((distSqr = FS_FMulAdd_f32( d, d, distSqr )), ...);
+                return FS::Masked( invSqrt != float32v( INFINITY ), distSqr * invSqrt ); // distSqr * invSqrt gives the length; NOTE(review): Masked presumably zeroes lanes where invSqrt is INFINITY (zero distance) — confirm
+            }
 
-                    return distSqr;
-                }
+            case DistanceFunction::EuclideanSquared:
+            {
+                float32v distSqr = pX * pX;
+                ((distSqr = FS::FMulAdd( pos, pos, distSqr )), ...); // sum of squared components, no square root
 
-                case DistanceFunction::Manhattan:
-                {
-                    float32v dist = FS_Abs_f32( dX );
-                    dist += (FS_Abs_f32( d ) + ...);
+                return distSqr;
+            }
 
-                    return dist;
-                }
+            case DistanceFunction::Manhattan:
+            {
+                float32v dist = FS::Abs( pX );
+                dist += (FS::Abs( pos ) + ...); // sum of absolute components
 
-                case DistanceFunction::Hybrid:
-                {
-                    float32v both = FS_FMulAdd_f32( dX, dX, FS_Abs_f32( dX ) );
-                    ((both += FS_FMulAdd_f32( d, d, FS_Abs_f32( d ) )), ...);
+                return dist;
+            }
 
-                    return both;
-                }
+            case DistanceFunction::Hybrid:
+            {
+                float32v both = FS::FMulAdd( pX, pX, FS::Abs( pX ) ); // each component contributes its square plus its absolute value
+                ((both += FS::FMulAdd( pos, pos, FS::Abs( pos ) )), ...);
 
-                case DistanceFunction::MaxAxis:
-                {
-                    float32v max = FS_Abs_f32( dX );
-                    ((max = FS_Max_f32( FS_Abs_f32(d), max )), ...);
+                return both;
+            }
 
-                    return max;
-                }
+            case DistanceFunction::MaxAxis:
+            {
+                float32v max = FS::Abs( pX );
+                ((max = FS::Max( FS::Abs( pos ), max )), ...); // largest absolute component
+
+                return max;
+            }
+
+            case DistanceFunction::Minkowski:
+            {
+                float32v minkowski = FastSIMD::DispatchClass<Generator, SIMD>::GetSourceValue( minkowskiP, seed, pX, pos... ); // exponent is read from the minkowskiP source for these inputs
+
+                return FS::Pow( FS::Pow( FS::Abs( pX ), minkowski) + (FS::Pow( FS::Abs( pos ), minkowski) + ...), FS::Reciprocal( minkowski ) ); // ( sum of |component|^minkowski ) raised to 1 / minkowski
             }
         }
-    };
+    }    
 }
-
-using FnUtils = FastNoise::Utils<FS_SIMD_CLASS>;
-namespace FnPrimes = FastNoise::Primes;
diff --git a/include/FastNoise/Generators/Value.h b/include/FastNoise/Generators/Value.h
index bc2a6bce..2392f2f3 100644
--- a/include/FastNoise/Generators/Value.h
+++ b/include/FastNoise/Generators/Value.h
@@ -1,28 +1,26 @@
-#pragma once
-#include "Generator.h"
-
-namespace FastNoise
-{
-    class Value : public virtual Generator
-    {
-    public:
-        FASTSIMD_LEVEL_SUPPORT( FastNoise::SUPPORTED_SIMD_LEVELS );
-        const Metadata& GetMetadata() const override;
-    };
-
-#ifdef FASTNOISE_METADATA
-    template<>
-    struct MetadataT<Value> : MetadataT<Generator>
-    {
-        SmartNode<> CreateNode( FastSIMD::eLevel ) const override;
-
-        MetadataT()
-        {
-            groups.push_back( "Coherent Noise" );
-
-            description = 
-                "Smooth gradient noise from N dimensional grid";
-        }
-    };
-#endif
-}
+#pragma once
+#include "Generator.h"
+
+namespace FastNoise
+{
+    class Value : public virtual VariableRange<ScalableGenerator> // generator node type; this header only declares its metadata accessor
+    {
+    public:        const Metadata& GetMetadata() const override; // declaration only; overrides a base-class virtual
+    };
+
+#ifdef FASTNOISE_METADATA
+    template<>
+    struct MetadataT<Value> : MetadataT<VariableRange<ScalableGenerator>>
+    {
+        SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; // declaration only; the parameter is a FastSIMD::FeatureSet
+        // The constructor fills in the metadata fields specific to the Value node.
+        MetadataT()
+        {
+            groups.push_back( "Coherent Noise" ); // appends "Coherent Noise" to this node's groups
+            // Text description stored for the node.
+            description = 
+                "Smooth gradient noise from N dimensional grid";
+        }
+    };
+#endif
+}
diff --git a/include/FastNoise/Generators/Value.inl b/include/FastNoise/Generators/Value.inl
index 8c3565ee..3792186b 100644
--- a/include/FastNoise/Generators/Value.inl
+++ b/include/FastNoise/Generators/Value.inl
@@ -1,88 +1,93 @@
-#include "FastSIMD/InlInclude.h"
-
 #include "Value.h"
 #include "Utils.inl"
 
-template<typename FS>
-class FS_T<FastNoise::Value, FS> : public virtual FastNoise::Value, public FS_T<FastNoise::Generator, FS>
+template<FastSIMD::FeatureSet SIMD>
+class FastSIMD::DispatchClass<FastNoise::Value, SIMD> final : public virtual FastNoise::Value, public FastSIMD::DispatchClass<FastNoise::VariableRange<ScalableGenerator>, SIMD> // Gen() implementations of the Value node for one SIMD feature set
 {
-    FASTSIMD_DECLARE_FS_TYPES;
-
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const // two-coordinate sample
     {
-        float32v xs = FS_Floor_f32( x );
-        float32v ys = FS_Floor_f32( y );
+        this->ScalePositions( x, y ); // NOTE(review): presumably applies the node's configured scaling to x, y in place — ScalePositions is defined elsewhere, confirm
+
+        float32v xs = FS::Floor( x ); // floor gives the lower corner of the lattice cell containing the point
+        float32v ys = FS::Floor( y );
 
-        int32v x0 = FS_Convertf32_i32( xs ) * int32v( FnPrimes::X );
-        int32v y0 = FS_Convertf32_i32( ys ) * int32v( FnPrimes::Y );
-        int32v x1 = x0 + int32v( FnPrimes::X );
-        int32v y1 = y0 + int32v( FnPrimes::Y );
+        int32v x0 = FS::Convert<int32_t>( xs ) * int32v( Primes::X ); // integer cell coordinate multiplied by the per-axis prime before hashing
+        int32v y0 = FS::Convert<int32_t>( ys ) * int32v( Primes::Y );
+        int32v x1 = x0 + int32v( Primes::X ); // neighbouring corner: one prime step further along the axis
+        int32v y1 = y0 + int32v( Primes::Y );
 
-        xs = FnUtils::InterpHermite( x - xs );
-        ys = FnUtils::InterpHermite( y - ys );
+        xs = InterpHermite( x - xs ); // fractional position inside the cell, passed through the Hermite curve
+        ys = InterpHermite( y - ys );
 
-        return FnUtils::Lerp(
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y0 ), FnUtils::GetValueCoord( seed, x1, y0 ), xs ),
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y1 ), FnUtils::GetValueCoord( seed, x1, y1 ), xs ), ys );
+        return this->ScaleOutput( Lerp( // blend the four corner values along x, then along y
+            Lerp( GetValueCoord( seed, x0, y0 ), GetValueCoord( seed, x1, y0 ), xs ),
+            Lerp( GetValueCoord( seed, x0, y1 ), GetValueCoord( seed, x1, y1 ), xs ), ys ),
+            -kValueBounds, kValueBounds ); // NOTE(review): presumably the raw value range ScaleOutput remaps from — confirm
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const // three-coordinate sample; same steps as the two-coordinate overload with a z axis added
     {
-        float32v xs = FS_Floor_f32( x );
-        float32v ys = FS_Floor_f32( y );
-        float32v zs = FS_Floor_f32( z );
+        this->ScalePositions( x, y, z );
 
-        int32v x0 = FS_Convertf32_i32( xs ) * int32v( FnPrimes::X );
-        int32v y0 = FS_Convertf32_i32( ys ) * int32v( FnPrimes::Y );
-        int32v z0 = FS_Convertf32_i32( zs ) * int32v( FnPrimes::Z );
-        int32v x1 = x0 + int32v( FnPrimes::X );
-        int32v y1 = y0 + int32v( FnPrimes::Y );
-        int32v z1 = z0 + int32v( FnPrimes::Z );
+        float32v xs = FS::Floor( x ); // lower corner of the containing cell
+        float32v ys = FS::Floor( y );
+        float32v zs = FS::Floor( z );
 
-        xs = FnUtils::InterpHermite( x - xs );
-        ys = FnUtils::InterpHermite( y - ys );
-        zs = FnUtils::InterpHermite( z - zs );
+        int32v x0 = FS::Convert<int32_t>( xs ) * int32v( Primes::X ); // primed integer coordinates of the lower corner
+        int32v y0 = FS::Convert<int32_t>( ys ) * int32v( Primes::Y );
+        int32v z0 = FS::Convert<int32_t>( zs ) * int32v( Primes::Z );
+        int32v x1 = x0 + int32v( Primes::X ); // primed coordinates of the opposite corner
+        int32v y1 = y0 + int32v( Primes::Y );
+        int32v z1 = z0 + int32v( Primes::Z );
 
-        return FnUtils::Lerp( FnUtils::Lerp(
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y0, z0 ), FnUtils::GetValueCoord( seed, x1, y0, z0 ), xs ),
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y1, z0 ), FnUtils::GetValueCoord( seed, x1, y1, z0 ), xs ), ys ),
-            FnUtils::Lerp(                                                                                
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y0, z1 ), FnUtils::GetValueCoord( seed, x1, y0, z1 ), xs ),    
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y1, z1 ), FnUtils::GetValueCoord( seed, x1, y1, z1 ), xs ), ys ), zs );
+        xs = InterpHermite( x - xs ); // smoothed fractional offsets used as blend weights
+        ys = InterpHermite( y - ys );
+        zs = InterpHermite( z - zs );
+
+        return this->ScaleOutput( Lerp( Lerp( // blend the eight corner values along x, then y, then z
+            Lerp( GetValueCoord( seed, x0, y0, z0 ), GetValueCoord( seed, x1, y0, z0 ), xs ),
+            Lerp( GetValueCoord( seed, x0, y1, z0 ), GetValueCoord( seed, x1, y1, z0 ), xs ), ys ),
+            Lerp(                                                                                
+            Lerp( GetValueCoord( seed, x0, y0, z1 ), GetValueCoord( seed, x1, y0, z1 ), xs ),    
+            Lerp( GetValueCoord( seed, x0, y1, z1 ), GetValueCoord( seed, x1, y1, z1 ), xs ), ys ), zs ),
+            -kValueBounds, kValueBounds );
     }
 
-    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const final
+    float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const // four-coordinate sample; same steps again with a w axis added
     {
-        float32v xs = FS_Floor_f32( x );
-        float32v ys = FS_Floor_f32( y );
-        float32v zs = FS_Floor_f32( z );
-        float32v ws = FS_Floor_f32( w );
+        this->ScalePositions( x, y, z, w );
+
+        float32v xs = FS::Floor( x ); // lower corner of the containing cell
+        float32v ys = FS::Floor( y );
+        float32v zs = FS::Floor( z );
+        float32v ws = FS::Floor( w );
 
-        int32v x0 = FS_Convertf32_i32( xs ) * int32v( FnPrimes::X );
-        int32v y0 = FS_Convertf32_i32( ys ) * int32v( FnPrimes::Y );
-        int32v z0 = FS_Convertf32_i32( zs ) * int32v( FnPrimes::Z );
-        int32v w0 = FS_Convertf32_i32( ws ) * int32v( FnPrimes::W );
-        int32v x1 = x0 + int32v( FnPrimes::X );
-        int32v y1 = y0 + int32v( FnPrimes::Y );
-        int32v z1 = z0 + int32v( FnPrimes::Z );
-        int32v w1 = w0 + int32v( FnPrimes::W );
+        int32v x0 = FS::Convert<int32_t>( xs ) * int32v( Primes::X ); // primed integer coordinates of the lower corner
+        int32v y0 = FS::Convert<int32_t>( ys ) * int32v( Primes::Y );
+        int32v z0 = FS::Convert<int32_t>( zs ) * int32v( Primes::Z );
+        int32v w0 = FS::Convert<int32_t>( ws ) * int32v( Primes::W );
+        int32v x1 = x0 + int32v( Primes::X ); // primed coordinates of the opposite corner
+        int32v y1 = y0 + int32v( Primes::Y );
+        int32v z1 = z0 + int32v( Primes::Z );
+        int32v w1 = w0 + int32v( Primes::W );
 
-        xs = FnUtils::InterpHermite( x - xs );
-        ys = FnUtils::InterpHermite( y - ys );
-        zs = FnUtils::InterpHermite( z - zs );
-        ws = FnUtils::InterpHermite( w - ws );
+        xs = InterpHermite( x - xs ); // smoothed fractional offsets used as blend weights
+        ys = InterpHermite( y - ys );
+        zs = InterpHermite( z - zs );
+        ws = InterpHermite( w - ws );
 
-        return FnUtils::Lerp( FnUtils::Lerp( FnUtils::Lerp(
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y0, z0, w0 ), FnUtils::GetValueCoord( seed, x1, y0, z0, w0 ), xs ),
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y1, z0, w0 ), FnUtils::GetValueCoord( seed, x1, y1, z0, w0 ), xs ), ys ),
-            FnUtils::Lerp( 
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y0, z1, w0 ), FnUtils::GetValueCoord( seed, x1, y0, z1, w0 ), xs ),    
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y1, z1, w0 ), FnUtils::GetValueCoord( seed, x1, y1, z1, w0 ), xs ), ys ), zs ),
-            FnUtils::Lerp( FnUtils::Lerp(
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y0, z0, w1 ), FnUtils::GetValueCoord( seed, x1, y0, z0, w1 ), xs ),
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y1, z0, w1 ), FnUtils::GetValueCoord( seed, x1, y1, z0, w1 ), xs ), ys ),
-            FnUtils::Lerp( 
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y0, z1, w1 ), FnUtils::GetValueCoord( seed, x1, y0, z1, w1 ), xs ),    
-            FnUtils::Lerp( FnUtils::GetValueCoord( seed, x0, y1, z1, w1 ), FnUtils::GetValueCoord( seed, x1, y1, z1, w1 ), xs ), ys ), zs ), ws );
+        return this->ScaleOutput( Lerp( Lerp( Lerp( // blend the sixteen corner values along x, y, z, then w
+            Lerp( GetValueCoord( seed, x0, y0, z0, w0 ), GetValueCoord( seed, x1, y0, z0, w0 ), xs ),
+            Lerp( GetValueCoord( seed, x0, y1, z0, w0 ), GetValueCoord( seed, x1, y1, z0, w0 ), xs ), ys ),
+            Lerp( 
+            Lerp( GetValueCoord( seed, x0, y0, z1, w0 ), GetValueCoord( seed, x1, y0, z1, w0 ), xs ),    
+            Lerp( GetValueCoord( seed, x0, y1, z1, w0 ), GetValueCoord( seed, x1, y1, z1, w0 ), xs ), ys ), zs ),
+            Lerp( Lerp(
+            Lerp( GetValueCoord( seed, x0, y0, z0, w1 ), GetValueCoord( seed, x1, y0, z0, w1 ), xs ),
+            Lerp( GetValueCoord( seed, x0, y1, z0, w1 ), GetValueCoord( seed, x1, y1, z0, w1 ), xs ), ys ),
+            Lerp( 
+            Lerp( GetValueCoord( seed, x0, y0, z1, w1 ), GetValueCoord( seed, x1, y0, z1, w1 ), xs ),    
+            Lerp( GetValueCoord( seed, x0, y1, z1, w1 ), GetValueCoord( seed, x1, y1, z1, w1 ), xs ), ys ), zs ), ws ),
+            -kValueBounds, kValueBounds );
     }
 };
diff --git a/include/FastNoise/Metadata.h b/include/FastNoise/Metadata.h
index 3ca5a422..7b992251 100644
--- a/include/FastNoise/Metadata.h
+++ b/include/FastNoise/Metadata.h
@@ -5,10 +5,7 @@
 #include <cstdint>
 #include <memory>
 
-#include "FastNoise_Config.h"
-
-#pragma warning( push )
-#pragma warning( disable : 4251 )
+#include "Utility/Config.h"
 
 namespace FastNoise
 {
@@ -28,55 +25,30 @@ namespace FastNoise
     // Node name, member name+types, functions to set members
     struct FASTNOISE_API Metadata
     {
-        virtual ~Metadata() = default;
-
-        /// <returns>Array containing metadata for every FastNoise node type</returns>
-        static const std::vector<const Metadata*>& GetAll()
-        {
-            return sAllMetadata;
-        }
-
-        /// <returns>Metadata for given Metadata::id</returns>
-        static const Metadata* GetFromId( uint16_t nodeId )
-        {
-            // Metadata not loaded yet
-            // Don't try to create nodes from metadata during static initialisation
-            // Metadata is loaded using static variable and static variable init is done in a random order
-            assert( sAllMetadata.size() );
-
-            if( nodeId < sAllMetadata.size() )
-            {
-                return sAllMetadata[nodeId];
-            }
-
-            return nullptr;
-        }
-
-        /// <returns>Metadata for given node class</returns>
         template<typename T>
-        static const Metadata& Get()
+        class FASTNOISE_API Vector
         {
-            static_assert( std::is_base_of<Generator, T>::value, "This function should only be used for FastNoise node classes, for example FastNoise::Simplex" );
-            static_assert( std::is_member_function_pointer<decltype(&T::GetMetadata)>::value, "Cannot get Metadata for abstract node class, use a derived class, for example: Fractal -> FractalFBm" );
-
-            return Impl::GetMetadata<T>();
-        }
-
-        /// <summary>
-        /// Serialise node data and any source node datas (recursive)
-        /// </summary>
-        /// <param name="nodeData">Root node data</param>
-        /// <param name="fixUp">Remove dependency loops and invalid node types</param>
-        /// <returns>Empty string on error</returns>
-        static std::string SerialiseNodeData( NodeData* nodeData, bool fixUp = false );
-
-        /// <summary>
-        /// Deserialise a string created from SerialiseNodeData to a node data tree
-        /// </summary>
-        /// <param name="serialisedBase64NodeData">Encoded string to deserialise</param>
-        /// <param name="nodeDataOut">Storage for new node data</param>
-        /// <returns>Root node</returns>
-        static NodeData* DeserialiseNodeData( const char* serialisedBase64NodeData, std::vector<std::unique_ptr<NodeData>>& nodeDataOut );
+        public: // Read-only view of the elements [mStart, mEnd) of the array returned by data()
+            using const_iterator = const T*; // iteration uses plain const pointers
+
+            const_iterator begin() const { return data() + mStart; } // first element of this view
+            const_iterator end() const { return data() + mEnd; } // one past the last element of this view
+            size_t size() const { return mEnd - mStart; } // element count; 0 while mStart == mEnd
+            const T& operator []( size_t i ) const { return begin()[i]; } // index relative to begin(); no bounds check
+
+        private: // mutation and raw storage access are limited to the friends listed below
+            template<typename>
+            friend struct MetadataT;
+            friend struct Metadata;
+            friend class Generator;
+            using index_type = uint8_t; // 8-bit offsets: mStart/mEnd cannot exceed 255
+
+            T* data() const; // declared only; defined outside this class body
+            void push_back( const T& value ); // declared only; defined outside this class body
+
+            index_type mStart = (index_type)-1; // defaults to the max index_type value (255)
+            index_type mEnd = (index_type)-1; // same default as mStart, so a fresh Vector has size() == 0
+        };
 
         struct NameDesc
         {
@@ -91,25 +63,9 @@ namespace FastNoise
         {
             const char* name = "";
             const char* description = "";
-            int dimensionIdx = -1;            
+            int dimensionIdx = -1;
         };
 
-        /// <summary>
-        /// Add spaces to node names: DomainScale -> Domain Scale
-        /// </summary>
-        /// <param name="metadata">FastNoise node metadata</param>
-        /// <param name="removeGroups">Removes metadata groups from name: FractalFBm -> FBm</param>
-        /// <returns>string with formatted name</returns>
-        static std::string FormatMetadataNodeName( const Metadata* metadata, bool removeGroups = false );
-
-        /// <summary>
-        /// Adds dimension prefix to member varibles that per-dimension:
-        /// DomainAxisScale::Scale -> X Scale
-        /// </summary>
-        /// <param name="member">FastNoise node metadata member</param>
-        /// <returns>string with formatted name</returns>
-        static std::string FormatMetadataMemberName( const Member& member );
-
         // float, int or enum value
         struct MemberVariable : Member
         {
@@ -153,7 +109,8 @@ namespace FastNoise
 
             eType type;
             ValueUnion valueDefault, valueMin, valueMax;
-            std::vector<const char*> enumNames;
+            float valueUiDragSpeed = 0;
+            Vector<const char*> enumNames;
 
             // Function to set value for given generator
             // Returns true if Generator is correct node class
@@ -171,7 +128,7 @@ namespace FastNoise
         // Either a constant float or node lookup
         struct MemberHybrid : Member
         {
-            float valueDefault = 0.0f;
+            float valueDefault = 0.0f, valueUiDragSpeed = 0.0f; // zero-initialised so a default-constructed MemberHybrid never holds indeterminate values
 
             // Function to set value for given generator
             // Returns true if Generator is correct node class
@@ -183,14 +140,75 @@ namespace FastNoise
             std::function<bool( Generator*, SmartNodeArg<> )> setNodeFunc;
         };
 
-        uint16_t id;
-        const char* name = "";
-        const char* description = "";
-        std::vector<const char*> groups;
+        using node_id = uint8_t;
+
+        static std::pair<int32_t, const char*> DebugCheckVectorStorageSize( int i );
 
-        std::vector<MemberVariable>   memberVariables;
-        std::vector<MemberNodeLookup> memberNodeLookups;
-        std::vector<MemberHybrid>     memberHybrids;
+        virtual ~Metadata() = default;
+
+        /// <returns>Array containing metadata for every FastNoise node type</returns>
+        static const Vector<const Metadata*>& GetAll()
+        {
+            return sAllMetadata;
+        }
+
+        /// <returns>Metadata for given Metadata::id</returns>
+        static const Metadata* GetFromId( node_id nodeId )
+        {
+            // Metadata not loaded yet
+            // Don't try to create nodes from metadata during static initialisation
+            // Metadata is loaded using static variable and static variable init is done in a random order
+            assert( sAllMetadata.size() );
+
+            if( nodeId < sAllMetadata.size() )
+            {
+                return sAllMetadata[nodeId];
+            }
+
+            return nullptr;
+        }
+
+        /// <returns>Metadata for given node class</returns>
+        template<typename T>
+        static const Metadata& Get()
+        {
+            static_assert( std::is_base_of<Generator, T>::value, "This function should only be used for FastNoise node classes, for example FastNoise::Simplex" );
+            static_assert( std::is_member_function_pointer<decltype(&T::GetMetadata)>::value, "Cannot get Metadata for abstract node class, use a derived class, for example: Fractal -> FractalFBm" );
+
+            return Impl::GetMetadata<T>();
+        }
+
+        /// <summary>
+        /// Serialise node data and any source node datas (recursive)
+        /// </summary>
+        /// <param name="nodeData">Root node data</param>
+        /// <param name="fixUp">Remove dependency loops and invalid node types</param>
+        /// <returns>Empty string on error</returns>
+        static std::string SerialiseNodeData( NodeData* nodeData, bool fixUp = false );
+
+        /// <summary>
+        /// Deserialise a string created from SerialiseNodeData to a node data tree
+        /// </summary>
+        /// <param name="serialisedBase64NodeData">Encoded string to deserialise</param>
+        /// <param name="nodeDataOut">Storage for new node data</param>
+        /// <returns>Root node</returns>
+        static NodeData* DeserialiseNodeData( const char* serialisedBase64NodeData, std::vector<std::unique_ptr<NodeData>>& nodeDataOut );
+
+        /// <summary>
+        /// Add spaces to node names: DomainScale -> Domain Scale
+        /// </summary>
+        /// <param name="metadata">FastNoise node metadata</param>
+        /// <param name="removeGroups">Removes metadata groups from name: FractalFBm -> FBm</param>
+        /// <returns>string with formatted name</returns>
+        static std::string FormatMetadataNodeName( const Metadata* metadata, bool removeGroups = false );
+
+        /// <summary>
+        /// Adds dimension prefix to member variables that are per-dimension:
+        /// DomainAxisScale::Scale -> X Scale
+        /// </summary>
+        /// <param name="member">FastNoise node metadata member</param>
+        /// <returns>string with formatted name</returns>
+        static std::string FormatMetadataMemberName( const Member& member );
 
         /// <summary>
         /// Create new instance of a FastNoise node from metadata
@@ -201,7 +219,17 @@ namespace FastNoise
         /// </example>
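-        /// <param name="maxSimdLevel">Max SIMD level, Null = Auto</param>
+        /// <param name="maxFeatureSet">Max SIMD feature set, defaults to FastSIMD::FeatureSet::Max</param>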
         /// <returns>SmartNode<T> is guaranteed not nullptr</returns>
-        virtual SmartNode<> CreateNode( FastSIMD::eLevel maxSimdLevel = FastSIMD::Level_Null ) const = 0;
+        virtual SmartNode<> CreateNode( FastSIMD::FeatureSet maxFeatureSet = FastSIMD::FeatureSet::Max ) const = 0;
+
+        node_id id;
+        Vector<MemberVariable>   memberVariables;
+        Vector<MemberNodeLookup> memberNodeLookups;
+        Vector<MemberHybrid>     memberHybrids;
+        Vector<const char*>      groups;
+
+        const char* name = "";
+        const char* description = "";
+        const char* formattedName = nullptr;
 
     protected:
         Metadata()
@@ -209,22 +237,41 @@ namespace FastNoise
             id = AddMetadata( this );
         }
 
+        static constexpr float kDefaultUiDragSpeedFloat = 0.02f;
+        static constexpr float kDefaultUiDragSpeedInt = 0.2f;
+
     private:
-        static uint16_t AddMetadata( const Metadata* newMetadata )
+        static node_id AddMetadata( const Metadata* newMetadata )
         {
-            sAllMetadata.emplace_back( newMetadata );
+            sAllMetadata.push_back( newMetadata );
 
-            return (uint16_t)sAllMetadata.size() - 1;
+            return (node_id)sAllMetadata.size() - 1;
         }
 
-        static std::vector<const Metadata*> sAllMetadata;
+        static Vector<const Metadata*> sAllMetadata;
     };
 
     // Stores data to create an instance of a FastNoise node
     // Node type, member values
-    struct FASTNOISE_API NodeData
+    struct NodeData
     {
-        NodeData( const Metadata* metadata );
+        NodeData( const Metadata* data ) // Builds node data for the given metadata, seeding every member with its default
+        {
+            if( ( metadata = data ) ) // stores the pointer (even when null); defaults are only filled in when it is non-null
+            {
+                for( const Metadata::MemberVariable& value: metadata->memberVariables )
+                {
+                    variables.push_back( value.valueDefault ); // one entry per member variable, in metadata order
+                }
+
+                nodeLookups.assign( metadata->memberNodeLookups.size(), nullptr ); // one nullptr entry per node lookup member
+
+                for( const Metadata::MemberHybrid& value: metadata->memberHybrids )
+                {
+                    hybrids.emplace_back( nullptr, value.valueDefault ); // entry built from (nullptr, default float) per hybrid member
+                }
+            }
+        }
 
         const Metadata* metadata;
         std::vector<Metadata::MemberVariable::ValueUnion> variables;
@@ -240,5 +287,3 @@ namespace FastNoise
         }
     };
 }
-
-#pragma warning( pop )
diff --git a/include/FastNoise/Utility/Config.h b/include/FastNoise/Utility/Config.h
new file mode 100644
index 00000000..c77f9c01
--- /dev/null
+++ b/include/FastNoise/Utility/Config.h
@@ -0,0 +1,25 @@
+#pragma once
+#include "Export.h" // defines the FASTNOISE_API import/export macro
+#include <FastSIMD/DispatchClass.h> // NOTE(review): presumably declares FastSIMD::FeatureSet used by New() below - confirm
+
+#define FASTNOISE_CALC_MIN_MAX true
+
+namespace FastNoise
+{    
+    class Generator; // forward declarations only; the full definitions live in other headers
+    struct Metadata;
+
+    template<typename T>
+    struct MetadataT;
+
+    template<typename T = Generator> // SmartNode<> is shorthand for SmartNode<Generator>
+    class SmartNode;
+
+    template<typename T = Generator>
+    using SmartNodeArg = const SmartNode<const T>&; // const reference to a SmartNode of const T
+
+    template<typename T>
+    SmartNode<T> New( FastSIMD::FeatureSet maxFeatureSet = FastSIMD::FeatureSet::Max ); // declaration only; maxFeatureSet defaults to FeatureSet::Max
+} // namespace FastNoise
+
+#include "SmartNode.h" // included last, after the forward declarations above; SmartNode.h includes this header back
\ No newline at end of file
diff --git a/include/FastNoise/Utility/Export.h b/include/FastNoise/Utility/Export.h
new file mode 100644
index 00000000..9d983f71
--- /dev/null
+++ b/include/FastNoise/Utility/Export.h
@@ -0,0 +1,18 @@
+#ifndef FASTNOISE_EXPORT_H // Defines FASTNOISE_API, the symbol import/export decoration used on library declarations
+#define FASTNOISE_EXPORT_H
+
+#if ( !defined( FASTNOISE_STATIC_LIB ) && !defined( FASTSIMD_STATIC_LIB ) ) && ( defined( _WIN32 ) || defined( __CYGWIN__ ) ) // neither static-lib macro set, on Windows or Cygwin
+#if defined( FASTNOISE_EXPORT ) || defined( FASTSIMD_EXPORT ) // either export macro is defined: symbols are exported
+#define FASTNOISE_API __declspec( dllexport )
+#else // no export macro defined: symbols are imported
+#define FASTNOISE_API __declspec( dllimport )
+#endif
+#else // static library build or non-Windows target: FASTNOISE_API expands to nothing
+#define FASTNOISE_API
+#endif
+
+#if defined( FASTNOISE_STATIC_LIB ) && !defined( FASTSIMD_STATIC_LIB ) // a static FastNoise build also marks FastSIMD as static
+#define FASTSIMD_STATIC_LIB
+#endif
+
+#endif // FASTNOISE_EXPORT_H
\ No newline at end of file
diff --git a/include/FastNoise/SmartNode.h b/include/FastNoise/Utility/SmartNode.h
similarity index 53%
rename from include/FastNoise/SmartNode.h
rename to include/FastNoise/Utility/SmartNode.h
index 9125323b..58f785a8 100644
--- a/include/FastNoise/SmartNode.h
+++ b/include/FastNoise/Utility/SmartNode.h
@@ -6,14 +6,14 @@
 #include <type_traits>
 #include <functional>
 
-#include "FastNoise_Config.h"
+#include "Config.h"
 
 namespace FastNoise
 {
     class FASTNOISE_API SmartNodeManager
     {
     public:
-        static constexpr uint64_t kInvalidReferenceId = (uint64_t)-1;
+        static constexpr uint64_t kInvalidReferenceId = (uint64_t)0;
 
         SmartNodeManager() = delete;
 
@@ -27,17 +27,11 @@ namespace FastNoise
         friend class SmartNode;
 
         template<typename T>
-        friend SmartNode<T> New( FastSIMD::eLevel );
-
-        static uint64_t GetReference( const void* ptr );
-
-        static void IncReference( uint64_t id );
-
-        static void DecReference( uint64_t id, void* ptr, void ( *destructorFunc )( void* ) );
-
-        static uint32_t ReferenceCount( uint64_t id );
+        friend SmartNode<T> New( FastSIMD::FeatureSet );
 
         static void* Allocate( size_t size, size_t align );
+
+        static void Free( const void* ptr );
     };
 
     template<typename T>
@@ -47,62 +41,50 @@ namespace FastNoise
         static_assert( std::is_base_of<Generator, T>::value, "SmartNode should only be used for FastNoise node classes" );
 
         template<typename U>
-        static SmartNode DynamicCast( SmartNode<U> node )
+        static SmartNode DynamicCast( const SmartNode<U>& node )
         {
-            if( T* dynamicCast = dynamic_cast<T*>( node.get() ) )
+            if( T* dynamicCast = dynamic_cast<T*>( node.mPtr ) )
             {
-                return FastNoise::SmartNode<T>( node, dynamicCast );
+                return FastNoise::SmartNode<T>( dynamicCast );
             }
 
             return nullptr;
         }
 
         constexpr SmartNode( std::nullptr_t = nullptr ) noexcept :
-            mReferenceId( SmartNodeManager::kInvalidReferenceId ),
             mPtr( nullptr )
         {}
         
-        SmartNode( const SmartNode& node )
+        SmartNode( const SmartNode& node ) noexcept :
+            mPtr( node.mPtr )
         {
-            TryInc( node.mReferenceId );
-            mReferenceId = node.mReferenceId;
-            mPtr = node.mPtr;
+            TryInc( mPtr );
         }
 
         template<typename U>
-        SmartNode( const SmartNode<U>& node )
+        SmartNode( const SmartNode<U>& node ) noexcept :
+            mPtr( node.mPtr )
         {
-            TryInc( node.mReferenceId );
-            mReferenceId = node.mReferenceId;
-            mPtr = node.mPtr;
+            TryInc( mPtr );
         }
 
         template<typename U>
-        SmartNode( const SmartNode<U>& node, T* ptr )
+        SmartNode( const SmartNode<U>&, T* ptr ) noexcept :
+            mPtr( ptr )
         {
-            assert( ptr );
-
-            TryInc( node.mReferenceId );
-            mReferenceId = node.mReferenceId;
-            mPtr = ptr;
+            TryInc( mPtr );
         }
 
-        SmartNode( SmartNode&& node ) noexcept
+        SmartNode( SmartNode&& node ) noexcept :
+            mPtr( node.mPtr )
         {
-            mReferenceId = node.mReferenceId;
-            mPtr = node.mPtr;
-
-            node.mReferenceId = SmartNodeManager::kInvalidReferenceId;
             node.mPtr = nullptr;
         }
 
         template<typename U>
-        SmartNode( SmartNode<U>&& node ) noexcept
+        SmartNode( SmartNode<U>&& node ) noexcept :
+            mPtr( node.mPtr )
         {
-            mReferenceId = node.mReferenceId;
-            mPtr = node.mPtr;
-
-            node.mReferenceId = SmartNodeManager::kInvalidReferenceId;
             node.mPtr = nullptr;
         }
 
@@ -120,17 +102,11 @@ namespace FastNoise
         template<typename U>
         SmartNode& operator=( SmartNode<U>&& node ) noexcept
         {
-            if( mReferenceId == node.mReferenceId )
-            {
-                mPtr = node.mPtr;                
-            }
-            else
+            if( mPtr != node.mPtr )            
             {
                 Release();
-                mReferenceId = node.mReferenceId;
                 mPtr = node.mPtr;
 
-                node.mReferenceId = SmartNodeManager::kInvalidReferenceId;
                 node.mPtr = nullptr;
             }
 
@@ -139,13 +115,12 @@ namespace FastNoise
 
         SmartNode& operator=( const SmartNode& node ) noexcept
         {
-            if( mReferenceId != node.mReferenceId )
+            if( mPtr != node.mPtr )
             {
-                TryInc( node.mReferenceId );
+                TryInc( node.mPtr );
                 Release();
-                mReferenceId = node.mReferenceId;
+                mPtr = node.mPtr;
             }
-            mPtr = node.mPtr;
 
             return *this;
         }
@@ -153,13 +128,12 @@ namespace FastNoise
         template<typename U>
         SmartNode& operator=( const SmartNode<U>& node ) noexcept
         {
-            if( mReferenceId != node.mReferenceId )
+            if( mPtr != node.mPtr )
             {
-                TryInc( node.mReferenceId );
+                TryInc( node.mPtr );
                 Release();
-                mReferenceId = node.mReferenceId;
+                mPtr = node.mPtr;
             }
-            mPtr = node.mPtr;
 
             return *this;
         }
@@ -176,13 +150,27 @@ namespace FastNoise
             return lhs.get() != rhs.get();
         }
 
-        T& operator*() const noexcept
+        const T& operator*() const noexcept
+        {
+            assert( mPtr->ReferencesFetchAdd() );
+            return *mPtr;
+        }
+
+        T& operator*() noexcept
         {
+            assert( mPtr->ReferencesFetchAdd() );
             return *mPtr;
         }
 
-        T* operator->() const noexcept
+        const T* operator->() const noexcept
         {
+            assert( mPtr->ReferencesFetchAdd() );
+            return mPtr;
+        }
+
+        T* operator->() noexcept
+        {
+            assert( mPtr->ReferencesFetchAdd() );
             return mPtr;
         }
 
@@ -191,7 +179,12 @@ namespace FastNoise
             return mPtr != nullptr;
         }
 
-        T* get() const noexcept
+        const T* get() const noexcept
+        {
+            return mPtr;
+        }
+
+        T* get() noexcept
         {
             return mPtr;
         }
@@ -203,18 +196,17 @@ namespace FastNoise
 
         void swap( SmartNode& node ) noexcept
         {
-            std::swap( mReferenceId, node.mReferenceId );
             std::swap( mPtr, node.mPtr );
         }
 
         long use_count() const noexcept
         {
-            if( mReferenceId == SmartNodeManager::kInvalidReferenceId )
+            if( mPtr )
             {
-                return 0;
+                return mPtr->ReferencesFetchAdd();
             }
 
-            return (long)SmartNodeManager::ReferenceCount( mReferenceId );
+            return 0;
         }
 
         bool unique() const noexcept
@@ -224,7 +216,7 @@ namespace FastNoise
 
     private:
         template<typename U>
-        friend SmartNode<U> New( FastSIMD::eLevel );
+        friend SmartNode<U> New( FastSIMD::FeatureSet );
 
         template<typename U>
         friend struct MetadataT;
@@ -232,38 +224,44 @@ namespace FastNoise
         template<typename U>
         friend class SmartNode;
 
+        friend T;
+
         explicit SmartNode( T* ptr ) :
-            mReferenceId( ptr ? SmartNodeManager::GetReference( ptr ) : SmartNodeManager::kInvalidReferenceId ),
             mPtr( ptr )
         {
-            if( mReferenceId != SmartNodeManager::kInvalidReferenceId )
-            {
-                SmartNodeManager::IncReference( mReferenceId );
-            }
+            TryInc( ptr );
         }
 
         void Release()
         {
             using U = typename std::remove_const<T>::type;
 
-            if( mReferenceId != SmartNodeManager::kInvalidReferenceId )
+            if( mPtr )
             {
-                SmartNodeManager::DecReference( mReferenceId, const_cast<U*>( mPtr ), []( void* ptr ) { ( (U*)ptr )->~T(); } );
+                int32_t previousRefCount = mPtr->ReferencesFetchAdd( -1 ); // NOTE(review): assumes the returned value is the count before the -1 is applied, as the name suggests - confirm
+
+                assert( previousRefCount ); // a previous count of 0 is treated as a logic error
+
+                if( previousRefCount == 1 ) // this SmartNode held the last reference
+                {
+                    const_cast<U*>( mPtr )->~U(); // explicit destructor call; the memory itself is handed back separately below
+
+                    SmartNodeManager::Free( mPtr ); // releases the node's storage through SmartNodeManager
+                }
             }
 
-            mReferenceId = SmartNodeManager::kInvalidReferenceId;
-            mPtr = nullptr;            
+            mPtr = nullptr;
         }
 
-        static void TryInc( uint64_t id )
+        template<typename U>
+        static void TryInc( U* ptr ) noexcept
         {
-            if( id != SmartNodeManager::kInvalidReferenceId )
+            if( ptr )
             {
-                SmartNodeManager::IncReference( id );
+                ptr->ReferencesFetchAdd( 1 );
             }
         }
-
-        uint64_t mReferenceId;
+        
         T* mPtr;
     };
 } // namespace FastNoise
diff --git a/include/FastSIMD/FastSIMD.h b/include/FastSIMD/FastSIMD.h
deleted file mode 100644
index 83d1c5b4..00000000
--- a/include/FastSIMD/FastSIMD.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#pragma once
-#include "FastSIMD_Config.h"
-
-namespace FastSIMD
-{
-    typedef uint32_t Level_BitFlags;
-
-    enum eLevel : Level_BitFlags
-    {
-        Level_Null   = 0,       // Uninitilised
-        Level_Scalar = 1 <<  0, // 80386 instruction set (Not SIMD)
-        Level_SSE    = 1 <<  1, // SSE (XMM) supported by CPU (not testing for O.S. support)
-        Level_SSE2   = 1 <<  2, // SSE2
-        Level_SSE3   = 1 <<  3, // SSE3
-        Level_SSSE3  = 1 <<  4, // Supplementary SSE3 (SSSE3)
-        Level_SSE41  = 1 <<  5, // SSE4.1
-        Level_SSE42  = 1 <<  6, // SSE4.2
-        Level_AVX    = 1 <<  7, // AVX supported by CPU and operating system
-        Level_AVX2   = 1 <<  8, // AVX2
-        Level_AVX512 = 1 <<  9, // AVX512, AVX512DQ supported by CPU and operating system
-
-        Level_NEON   = 1 << 16, // ARM NEON
-    };
-
-    const Level_BitFlags COMPILED_SIMD_LEVELS =
-        (FASTSIMD_COMPILE_SCALAR     ? Level_Scalar : Level_Null) |
-        (FASTSIMD_COMPILE_SSE        ? Level_SSE    : Level_Null) |
-        (FASTSIMD_COMPILE_SSE2       ? Level_SSE2   : Level_Null) |
-        (FASTSIMD_COMPILE_SSE3       ? Level_SSE3   : Level_Null) |
-        (FASTSIMD_COMPILE_SSSE3      ? Level_SSSE3  : Level_Null) |
-        (FASTSIMD_COMPILE_SSE41      ? Level_SSE41  : Level_Null) |
-        (FASTSIMD_COMPILE_SSE42      ? Level_SSE42  : Level_Null) |
-        (FASTSIMD_COMPILE_AVX        ? Level_AVX    : Level_Null) |
-        (FASTSIMD_COMPILE_AVX2       ? Level_AVX2   : Level_Null) |
-        (FASTSIMD_COMPILE_AVX512     ? Level_AVX512 : Level_Null) |
-        (FASTSIMD_COMPILE_NEON       ? Level_NEON   : Level_Null) ;
-    
-    typedef void* (*MemoryAllocator)( size_t size, size_t align );
-
-    FASTSIMD_API eLevel CPUMaxSIMDLevel();
-
-    template<typename T>
-    T* New( eLevel maxSIMDLevel = Level_Null, MemoryAllocator allocator = nullptr );
-
-    template<typename T, eLevel SIMD_LEVEL>
-    T* ClassFactory( MemoryAllocator allocator = nullptr );
-
-#define FASTSIMD_LEVEL_SUPPORT( ... ) \
-    static const FastSIMD::Level_BitFlags Supported_SIMD_Levels = __VA_ARGS__
-
-}
diff --git a/include/FastSIMD/FastSIMD_Config.h b/include/FastSIMD/FastSIMD_Config.h
deleted file mode 100644
index b823d67e..00000000
--- a/include/FastSIMD/FastSIMD_Config.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#pragma once
-#include <cstdint>
-#include <cstddef>
-
-#include "FastSIMD_Export.h"
-
-#if defined(__arm__) || defined(__aarch64__)
-#define FASTSIMD_x86 false
-#define FASTSIMD_ARM true
-#else
-#define FASTSIMD_x86 true
-#define FASTSIMD_ARM false
-#endif
-
-#define FASTSIMD_64BIT (INTPTR_MAX == INT64_MAX)
-
-#define FASTSIMD_COMPILE_SCALAR (!(FASTSIMD_x86 && FASTSIMD_64BIT)) // Don't compile for x86 64bit since CPU is guaranteed SSE2 support 
-
-#define FASTSIMD_COMPILE_SSE    (FASTSIMD_x86 & false) // Not supported
-#define FASTSIMD_COMPILE_SSE2   (FASTSIMD_x86 & true )
-#define FASTSIMD_COMPILE_SSE3   (FASTSIMD_x86 & true )
-#define FASTSIMD_COMPILE_SSSE3  (FASTSIMD_x86 & true )
-#define FASTSIMD_COMPILE_SSE41  (FASTSIMD_x86 & true )
-#define FASTSIMD_COMPILE_SSE42  (FASTSIMD_x86 & true )
-#define FASTSIMD_COMPILE_AVX    (FASTSIMD_x86 & false) // Not supported
-#define FASTSIMD_COMPILE_AVX2   (FASTSIMD_x86 & true )
-#define FASTSIMD_COMPILE_AVX512 (FASTSIMD_x86 & true )
-
-#define FASTSIMD_COMPILE_NEON   (FASTSIMD_ARM & true )
-
-#define FASTSIMD_USE_FMA                   true
-#define FASTSIMD_CONFIG_GENERATE_CONSTANTS false
-
diff --git a/include/FastSIMD/FastSIMD_Export.h b/include/FastSIMD/FastSIMD_Export.h
deleted file mode 100644
index d81950aa..00000000
--- a/include/FastSIMD/FastSIMD_Export.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#pragma once
-
-#if !defined( FASTNOISE_STATIC_LIB ) && ( defined( _WIN32 ) || defined( __CYGWIN__ ) )
-#ifdef FASTNOISE_EXPORT // CHANGE ME
-#define FASTSIMD_API __declspec( dllexport )
-#else
-#define FASTSIMD_API __declspec( dllimport )
-#endif
-#else
-#define FASTSIMD_API
-#endif
\ No newline at end of file
diff --git a/include/FastSIMD/FunctionList.h b/include/FastSIMD/FunctionList.h
deleted file mode 100644
index 5135f461..00000000
--- a/include/FastSIMD/FunctionList.h
+++ /dev/null
@@ -1,856 +0,0 @@
-#pragma once
-#include <cinttypes>
-#include <type_traits>
-#include <memory>
-
-#include "FastSIMD/FastSIMD.h"
-
-#ifdef _MSC_VER
-#if defined( _M_IX86_FP ) && _M_IX86_FP < 2
-#define FS_VECTORCALL
-#else
-#define FS_VECTORCALL __vectorcall
-#endif
-#define FS_INLINE __forceinline
-#else
-#define FS_VECTORCALL 
-#define FS_INLINE __attribute__((always_inline)) inline
-#endif
-
-#ifndef NDEBUG
-#undef FS_INLINE
-#define FS_INLINE inline
-#endif
-
-/// <summary>
-/// Number of 32 width elements that will fit into a vector
-/// </summary>
-/// <remarks>
-/// Compile time constant
-/// </remarks>
-/// <code>
-/// size_t FS_Size_32()
-/// </code>
-#define FS_Size_32() FS::template VectorSize<sizeof( int32_t )>
-
-
-// Vector builders
-
-/// <summary>
-/// Vector with values incrementing from 0 based on element index {0, 1, 2, 3...}
-/// </summary>
-/// <code>
-/// example: int32v::FS_Incremented()
-/// </code>
-#define FS_Incremented() Incremented()
-
-
-// Load
-
-/// <summary>
-/// Copies sizeof(float32v) bytes from given memory location into float32v
-/// </summary>
-/// <remarks>
-/// Memory does not need to be aligned
-/// </remarks>
-/// <code>
-/// float32v FS_Load_f32( void const* ptr )
-/// </code>
-#define FS_Load_f32( ... ) FS::Load_f32( __VA_ARGS__ )
-
-
-/// <summary>
-/// Copies sizeof(int32v) bytes from given memory location into int32v
-/// </summary>
-/// <remarks>
-/// Memory does not need to be aligned
-/// </remarks>
-/// <code>
-/// int32v FS_Load_i32( void const* ptr )
-/// </code>
-#define FS_Load_i32( ... ) FS::Load_i32( __VA_ARGS__ )
-
-
-// Store
-
-/// <summary>
-/// Copies all elements of float32v to given memory location
-/// </summary>
-/// <code>
-/// void FS_Store_f32( void* ptr, float32v f )
-/// </code>
-#define FS_Store_f32( ... ) FS::Store_f32( __VA_ARGS__ )
-
-/// <summary>
-/// Copies all elements of int32v to given memory location
-/// </summary>
-/// <code>
-/// void FS_Store_i32( void* ptr, int32v i )
-/// </code>
-#define FS_Store_i32( ... ) FS::Store_i32( __VA_ARGS__ )
-
-
-// Extract
-
-/// <summary>
-/// Retreive element 0 from vector
-/// </summary>
-/// <code>
-/// float FS_Extract0_f32( float32v f )
-/// </code>
-#define FS_Extract0_f32( ... ) FS::Extract0_f32( __VA_ARGS__ )
-
-/// <summary>
-/// Retreive element 0 from vector
-/// </summary>
-/// <code>
-/// int32_t FS_Extract0_i32( int32v i )
-/// </code>
-#define FS_Extract0_i32( ... ) FS::Extract0_i32( __VA_ARGS__ )
-
-/// <summary>
-/// Retreive element from vector at position
-/// </summary>
-/// <code>
-/// float FS_Extract_f32( float32v f, size_t idx )
-/// </code>
-#define FS_Extract_f32( ... ) FS::Extract_f32( __VA_ARGS__ )
-
-/// <summary>
-/// Retreive element from vector at position
-/// </summary>
-/// <code>
-/// int32_t FS_Extract_i32( int32v i, size_t idx )
-/// </code>
-#define FS_Extract_i32( ... ) FS::Extract_i32( __VA_ARGS__ )
-
-
-// Cast
-
-/// <summary>
-/// Bitwise cast int to float
-/// </summary>
-/// <code>
-/// float32v FS_Casti32_f32( int32v i )
-/// </code>
-#define FS_Casti32_f32( ... ) FS::Casti32_f32( __VA_ARGS__ )
-
-/// <summary>
-/// Bitwise cast float to int
-/// </summary>
-/// <code>
-/// int32v FS_Castf32_i32( float32v f )
-/// </code>
-#define FS_Castf32_i32( ... ) FS::Castf32_i32( __VA_ARGS__ )
-
-
-// Convert
-
-/// <summary>
-/// Convert int to float 
-/// </summary>
-/// <remarks>
-/// Rounding: truncate
-/// </remarks>
-/// <code>
-/// float32v FS_Converti32_f32( int32v i )
-/// </code>
-#define FS_Converti32_f32( ... ) FS::Converti32_f32( __VA_ARGS__ )
-
-/// <summary>
-/// Convert float to int
-/// </summary>
-/// <code>
-/// int32v FS_Convertf32_i32( float32v f )
-/// </code>
-#define FS_Convertf32_i32( ... ) FS::Convertf32_i32( __VA_ARGS__ )
-
-
-// Select
-
-/// <summary>
-/// return ( m ? a : b )
-/// </summary>
-/// <code>
-/// float32v FS_Select_f32( mask32v m, float32v a, float32v b )
-/// </code>
-#define FS_Select_f32( ... ) FS::Select_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? a : b )
-/// </summary>
-/// <code>
-/// int32v FS_Select_i32( mask32v m, int32v a, int32v b )
-/// </code>
-#define FS_Select_i32( ... ) FS::Select_i32( __VA_ARGS__ )
-
-
-// Min, Max
-
-/// <summary>
-/// return ( a < b ? a : b )
-/// </summary>
-/// <code>
-/// float32v FS_Min_f32( float32v a, float32v b )
-/// </code>
-#define FS_Min_f32( ... ) FS::Min_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( a > b ? a : b )
-/// </summary>
-/// <code>
-/// float32v FS_Max_f32( float32v a, float32v b )
-/// </code>
-#define FS_Max_f32( ... ) FS::Max_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( a < b ? a : b )
-/// </summary>
-/// <code>
-/// int32v FS_Min_i32( int32v a, int32v b )
-/// </code>
-#define FS_Min_i32( ... ) FS::Min_i32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( a > b ? a : b )
-/// </summary>
-/// <code>
-/// int32v FS_Max_i32( int32v a, int32v b )
-/// </code>
-#define FS_Max_i32( ... ) FS::Max_i32( __VA_ARGS__ )
-
-
-// Bitwise
-
-/// <summary>
-/// return ( a & ~b )
-/// </summary>
-/// <code>
-/// float32v FS_BitwiseAndNot_f32( float32v a, float32v b )
-/// </code>
-#define FS_BitwiseAndNot_f32( ... ) FS::BitwiseAndNot_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( a & ~b )
-/// </summary>
-/// <code>
-/// int32v FS_BitwiseAndNot_i32( int32v a, int32v b )
-/// </code>
-#define FS_BitwiseAndNot_i32( ... ) FS::BitwiseAndNot_i32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( a & ~b )
-/// </summary>
-/// <code>
-/// mask32v FS_BitwiseAndNot_m32( mask32v a, mask32v b )
-/// </code>
-#define FS_BitwiseAndNot_m32( ... ) FastSIMD::BitwiseAndNot_m32<FS>( __VA_ARGS__ )
-
-
-/// <summary>
-/// return ZeroExtend( a >> b )
-/// </summary>
-/// <code>
-/// float32v FS_BitwiseShiftRightZX_f32( float32v a, int32_t b )
-/// </code>
-#define FS_BitwiseShiftRightZX_f32( ... ) FS::BitwiseShiftRightZX_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return ZeroExtend( a >> b )
-/// </summary>
-/// <code>
-/// float32v FS_BitwiseShiftRightZX_i32( int32v a, int32_t b )
-/// </code>
-#define FS_BitwiseShiftRightZX_i32( ... ) FS::BitwiseShiftRightZX_i32( __VA_ARGS__ )
-
-// Abs
-
-/// <summary>
-/// return ( a < 0 ? -a : a )
-/// </summary>
-/// <code>
-/// float32v FS_Abs_f32( float32v a )
-/// </code>
-#define FS_Abs_f32( ... ) FS::Abs_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( a < 0 ? -a : a )
-/// </summary>
-/// <code>
-/// int32v FS_Abs_i32( int32v a )
-/// </code>
-#define FS_Abs_i32( ... ) FS::Abs_i32( __VA_ARGS__ )
-
-
-// Float math
-
-/// <summary>
-/// return sqrt( a )
-/// </summary>
-/// <code>
-/// float32v FS_Sqrt_f32( float32v a )
-/// </code>
-#define FS_Sqrt_f32( ... ) FS::Sqrt_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return APPROXIMATE( 1.0 / sqrt( a ) )
-/// </summary>
-/// <code>
-/// float32v FS_InvSqrt_f32( float32v a )
-/// </code>
-#define FS_InvSqrt_f32( ... ) FS::InvSqrt_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return APPROXIMATE( 1.0 / a )
-/// </summary>
-/// <code>
-/// float32v FS_Reciprocal_f32( float32v a )
-/// </code>
-#define FS_Reciprocal_f32( ... ) FS::Reciprocal_f32( __VA_ARGS__ )
-
-// Floor, Ceil, Round
-
-/// <summary>
-/// return floor( a )
-/// </summary>
-/// <remarks>
-/// Rounding: Towards negative infinity
-/// </remarks>
-/// <code>
-/// float32v FS_Floor_f32( float32v a )
-/// </code>
-#define FS_Floor_f32( ... ) FS::Floor_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return ceil( a )
-/// </summary>
-/// <remarks>
-/// Rounding: Towards positive infinity
-/// </remarks>
-/// <code>
-/// float32v FS_Ceil_f32( float32v a )
-/// </code>
-#define FS_Ceil_f32( ... ) FS::Ceil_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return round( a )
-/// </summary>
-/// <remarks>
-/// Rounding: Banker's rounding
-/// </remarks>
-/// <code>
-/// float32v FS_Round_f32( float32v a )
-/// </code>
-#define FS_Round_f32( ... ) FS::Round_f32( __VA_ARGS__ )
-
-// Trig
-
-/// <summary>
-/// return APPROXIMATE( cos( a ) )
-/// </summary>
-/// <code>
-/// float32v FS_Cos_f32( float32v a )
-/// </code>
-#define FS_Cos_f32( ... ) FastSIMD::Cos_f32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return APPROXIMATE( sin( a ) )
-/// </summary>
-/// <code>
-/// float32v FS_Sin_f32( float32v a )
-/// </code>
-#define FS_Sin_f32( ... ) FastSIMD::Sin_f32<FS>( __VA_ARGS__ )
-
-// Math
-
-/// <summary>
-/// return pow( v, pow )
-/// </summary>
-/// <code>
-/// float32v FS_Pow_f32( float32v v, float32v pow )
-/// </code>
-#define FS_Pow_f32( ... ) FastSIMD::Pow_f32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return log( a )
-/// </summary>
-/// <remarks>
-/// a <= 0 returns 0
-/// </remarks>
-/// <code>
-/// float32v FS_Log_f32( float32v a )
-/// </code>
-#define FS_Log_f32( ... ) FastSIMD::Log_f32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return exp( a )
-/// </summary>
-/// <remarks>
-/// a will be clamped to -88.376, 88.376
-/// </remarks>
-/// <code>
-/// float32v FS_Exp_f32( float32v a )
-/// </code>
-#define FS_Exp_f32( ... ) FastSIMD::Exp_f32<FS>( __VA_ARGS__ )
-
-
-// Mask
-
-/// <summary>
-/// return ( m ? a : 0 )
-/// </summary>
-/// <code>
-/// int32v FS_Mask_i32( int32v a, mask32v m )
-/// </code>
-#define FS_Mask_i32( ... ) FS::Mask_i32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? a : 0 )
-/// </summary>
-/// <code>
-/// float32v FS_Mask_f32( float32v a, mask32v m )
-/// </code>
-#define FS_Mask_f32( ... ) FS::Mask_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? 0 : a )
-/// </summary>
-/// <code>
-/// int32v FS_NMask_i32( int32v a, mask32v m )
-/// </code>
-#define FS_NMask_i32( ... ) FS::NMask_i32( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? 0 : a )
-/// </summary>
-/// <code>
-/// float32v FS_NMask_f32( float32v a, mask32v m )
-/// </code>
-#define FS_NMask_f32( ... ) FS::NMask_f32( __VA_ARGS__ )
-
-/// <summary>
-/// return m.contains( true )
-/// </summary>
-/// <code>
-/// bool FS_AnyMask_bool( mask32v m )
-/// </code>
-#define FS_AnyMask_bool( ... ) FS::AnyMask_bool( __VA_ARGS__ )
-
-
-// FMA
-
-/// <summary>
-/// return ( (a * b) + c )
-/// </summary>
-/// <code>
-/// float32v FS_FMulAdd_f32( float32v a, float32v b, float32v c )
-/// </code>
-#define FS_FMulAdd_f32( ... ) FastSIMD::FMulAdd_f32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( -(a * b) + c )
-/// </summary>
-/// <code>
-/// float32v FS_FNMulAdd_f32( float32v a, float32v b, float32v c )
-/// </code>
-#define FS_FNMulAdd_f32( ... ) FastSIMD::FNMulAdd_f32<FS>( __VA_ARGS__ )
-
-
-// Masked float
-
-/// <summary>
-/// return ( m ? (a + b) : a )
-/// </summary>
-/// <code>
-/// float32v FS_MaskedAdd_f32( float32v a, float32v b, mask32v m )
-/// </code>
-#define FS_MaskedAdd_f32( ... ) FastSIMD::MaskedAdd_f32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? (a - b) : a )
-/// </summary>
-/// <code>
-/// float32v FS_MaskedSub_f32( float32v a, float32v b, mask32v m )
-/// </code>
-#define FS_MaskedSub_f32( ... ) FastSIMD::MaskedSub_f32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? (a * b) : a )
-/// </summary>
-/// <code>
-/// float32v FS_MaskedMul_f32( float32v a, float32v b, mask32v m )
-/// </code>
-#define FS_MaskedMul_f32( ... ) FastSIMD::MaskedMul_f32<FS>( __VA_ARGS__ )
-
-
-// Masked int32
-
-/// <summary>
-/// return ( m ? (a + b) : a )
-/// </summary>
-/// <code>
-/// int32v FS_MaskedAdd_i32( int32v a, int32v b, mask32v m )
-/// </code>
-#define FS_MaskedAdd_i32( ... ) FastSIMD::MaskedAdd_i32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? (a - b) : a )
-/// </summary>
-/// <code>
-/// int32v FS_MaskedSub_i32( int32v a, int32v b, mask32v m )
-/// </code>
-#define FS_MaskedSub_i32( ... ) FastSIMD::MaskedSub_i32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? (a * b) : a )
-/// </summary>
-/// <code>
-/// int32v FS_MaskedMul_i32( int32v a, int32v b, mask32v m )
-/// </code>
-#define FS_MaskedMul_i32( ... ) FastSIMD::MaskedMul_i32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? (a + 1) : a )
-/// </summary>
-/// <code>
-/// int32v FS_MaskedIncrement_i32( int32v a, mask32v m )
-/// </code>
-#define FS_MaskedIncrement_i32( ... ) FastSIMD::MaskedIncrement_i32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? (a - 1) : a )
-/// </summary>
-/// <code>
-/// int32v FS_MaskedDecrement_i32( int32v a, mask32v m )
-/// </code>
-#define FS_MaskedDecrement_i32( ... ) FastSIMD::MaskedDecrement_i32<FS>( __VA_ARGS__ )
-
-
-// NMasked float
-
-/// <summary>
-/// return ( m ? a : (a + b) )
-/// </summary>
-/// <code>
-/// float32v FS_NMaskedAdd_f32( float32v a, float32v b, mask32v m )
-/// </code>
-#define FS_NMaskedAdd_f32( ... ) FastSIMD::NMaskedAdd_f32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? a : (a - b) )
-/// </summary>
-/// <code>
-/// float32v FS_NMaskedSub_f32( float32v a, float32v b, mask32v m )
-/// </code>
-#define FS_NMaskedSub_f32( ... ) FastSIMD::NMaskedSub_f32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? a : (a * b) )
-/// </summary>
-/// <code>
-/// float32v FS_NMaskedMul_f32( float32v a, float32v b, mask32v m )
-/// </code>
-#define FS_NMaskedMul_f32( ... ) FastSIMD::NMaskedMul_f32<FS>( __VA_ARGS__ )
-
-
-// NMasked int32
-
-/// <summary>
-/// return ( m ? a : (a + b) )
-/// </summary>
-/// <code>
-/// int32v FS_NMaskedAdd_i32( int32v a, int32v b, mask32v m )
-/// </code>
-#define FS_NMaskedAdd_i32( ... ) FastSIMD::NMaskedAdd_i32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? a : (a - b) )
-/// </summary>
-/// <code>
-/// int32v FS_NMaskedSub_i32( int32v a, int32v b, mask32v m )
-/// </code>
-#define FS_NMaskedSub_i32( ... ) FastSIMD::NMaskedSub_i32<FS>( __VA_ARGS__ )
-
-/// <summary>
-/// return ( m ? a : (a * b) )
-/// </summary>
-/// <code>
-/// int32v FS_NMaskedMul_i32( int32v a, int32v b, mask32v m )
-/// </code>
-#define FS_NMaskedMul_i32( ... ) FastSIMD::NMaskedMul_i32<FS>( __VA_ARGS__ )
-
-
-namespace FastSIMD
-{
-    //FMA
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v FMulAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::float32v c )
-    {
-        return (a * b) + c;
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v FNMulAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::float32v c )
-    {
-        return -(a * b) + c;
-    }
-
-    // Masked float
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v MaskedAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
-    {
-        return a + FS::Mask_f32( b, m );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v MaskedSub_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
-    {
-        return a - FS::Mask_f32( b, m );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v MaskedMul_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
-    {
-        return a * FS::Mask_f32( b, m );
-    }
-
-    // Masked int32
-
-    template<typename FS>
-    FS_INLINE typename FS::int32v MaskedAdd_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
-    {
-        return a + FS::Mask_i32( b, m );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::int32v MaskedSub_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
-    {
-        return a - FS::Mask_i32( b, m );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::int32v MaskedMul_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
-    {
-        return a * FS::Mask_i32( b, m );
-    }
-
-    // NMasked float
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v NMaskedAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
-    {
-        return a + FS::NMask_f32( b, m );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v NMaskedSub_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
-    {
-        return a - FS::NMask_f32( b, m );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v NMaskedMul_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
-    {
-        return a * FS::NMask_f32( b, m );
-    }
-
-    // NMasked int32
-
-    template<typename FS>
-    FS_INLINE typename FS::int32v NMaskedAdd_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
-    {
-        return a + FS::NMask_i32( b, m );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::int32v NMaskedSub_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
-    {
-        return a - FS::NMask_i32( b, m );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::int32v NMaskedMul_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
-    {
-        return a * FS::NMask_i32( b, m );
-    }
-
-    template<typename FS, std::enable_if_t<std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
-    FS_INLINE typename FS::int32v MaskedIncrement_i32( typename FS::int32v a, typename FS::mask32v m )
-    {
-        return a - m;
-    }
-
-    template<typename FS, std::enable_if_t<!std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
-    FS_INLINE typename FS::int32v MaskedIncrement_i32( typename FS::int32v a, typename FS::mask32v m )
-    {
-        return MaskedSub_i32<FS>( a, typename FS::int32v( -1 ), m );
-    }
-    template<typename FS, std::enable_if_t<std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
-    FS_INLINE typename FS::int32v MaskedDecrement_i32( typename FS::int32v a, typename FS::mask32v m )
-    {
-        return a + m;
-    }
-
-    template<typename FS, std::enable_if_t<!std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
-    FS_INLINE typename FS::int32v MaskedDecrement_i32( typename FS::int32v a, typename FS::mask32v m )
-    {
-        return MaskedAdd_i32<FS>( a, typename FS::int32v( -1 ), m );
-    }
-
-    // Bitwise
-
-    template<typename FS, std::enable_if_t<std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
-    FS_INLINE  typename FS::mask32v BitwiseAndNot_m32( typename FS::mask32v a, typename FS::mask32v b )
-    {
-        return FS::BitwiseAndNot_i32( a, b );
-    }
-
-    template<typename FS, std::enable_if_t<!std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
-    FS_INLINE typename FS::mask32v BitwiseAndNot_m32( typename FS::mask32v a, typename FS::mask32v b )
-    {
-        return a & (~b);
-    }
-
-    // Trig
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v Cos_f32( typename FS::float32v value )
-    {
-        typedef typename FS::int32v int32v;
-        typedef typename FS::float32v float32v;
-        typedef typename FS::mask32v mask32v;
-
-        value = FS_Abs_f32( value );
-        value -= FS_Floor_f32( value * float32v( 0.1591549f ) ) * float32v( 6.283185f );
-
-        mask32v geHalfPi  = value >= float32v( 1.570796f );
-        mask32v geHalfPi2 = value >= float32v( 3.141593f );
-        mask32v geHalfPi3 = value >= float32v( 4.7123889f );
-
-        float32v cosAngle = value ^ FS_Mask_f32( ( value ^ float32v( 3.141593f ) - value ), geHalfPi );
-        cosAngle = cosAngle ^ FS_Mask_f32( FS_Casti32_f32( int32v( 0x80000000 ) ), geHalfPi2 );
-        cosAngle = cosAngle ^ FS_Mask_f32( cosAngle ^ ( float32v( 6.283185f ) - value ), geHalfPi3 );
-
-        cosAngle *= cosAngle;
-
-        cosAngle = FS_FMulAdd_f32( cosAngle, FS_FMulAdd_f32( cosAngle, float32v( 0.03679168f ), float32v( -0.49558072f ) ), float32v( 0.99940307f ) );
-
-        return cosAngle ^ FS_Mask_f32( FS_Casti32_f32( int32v( 0x80000000 ) ), FS_BitwiseAndNot_m32( geHalfPi, geHalfPi3 ) );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v Sin_f32( typename FS::float32v value )
-    {
-        return Cos_f32<FS>( typename FS::float32v( 1.570796f ) - value );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v Exp_f32( typename FS::float32v x )
-    {
-        typedef typename FS::int32v int32v;
-        typedef typename FS::float32v float32v;
-
-        x = FS_Min_f32( x, float32v( 88.3762626647949f ) );
-        x = FS_Max_f32( x, float32v( -88.3762626647949f ) );
-
-        /* express exp(x) as exp(g + n*log(2)) */
-        float32v fx = x * float32v( 1.44269504088896341f );
-        fx += float32v( 0.5f );
-
-        float32v flr = FS_Floor_f32( fx );  
-        fx = FS_MaskedSub_f32( flr, float32v( 1 ), flr > fx );
-
-        x -= fx * float32v( 0.693359375f );
-        x -= fx * float32v( -2.12194440e-4f );
-
-        float32v y( 1.9875691500E-4f );
-        y *= x;
-        y += float32v( 1.3981999507E-3f );
-        y *= x;
-        y += float32v( 8.3334519073E-3f );
-        y *= x;
-        y += float32v( 4.1665795894E-2f );
-        y *= x;
-        y += float32v( 1.6666665459E-1f );
-        y *= x;
-        y += float32v( 5.0000001201E-1f );
-        y *= x * x;
-        y += x + float32v( 1 );        
-
-        /* build 2^n */
-        int32v i = FS_Convertf32_i32( fx );
-        // another two AVX2 instructions
-        i += int32v( 0x7f );
-        i <<= 23;
-        float32v pow2n = FS_Casti32_f32( i );
-        
-        return y * pow2n;        
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v Log_f32( typename FS::float32v x )
-    {
-        typedef typename FS::int32v int32v;
-        typedef typename FS::float32v float32v;
-        typedef typename FS::mask32v mask32v;
-                
-        mask32v validMask = x > float32v( 0 );
-
-        x = FS_Max_f32( x, FS_Casti32_f32( int32v( 0x00800000 ) ) );  /* cut off denormalized stuff */
-
-        // can be done with AVX2
-        int32v i = FS_BitwiseShiftRightZX_i32( FS_Castf32_i32( x ), 23 );
-
-        /* keep only the fractional part */
-        x &= FS_Casti32_f32( int32v( ~0x7f800000 ) );
-        x |= float32v( 0.5f );
-
-        // this is again another AVX2 instruction
-        i -= int32v( 0x7f );
-        float32v e = FS_Converti32_f32( i );
-
-        e += float32v( 1 );
-
-        mask32v mask = x < float32v( 0.707106781186547524f );
-        x = FS_MaskedAdd_f32( x, x, mask );
-        x -= float32v( 1 );
-        e = FS_MaskedSub_f32( e, float32v( 1 ), mask );
-
-        float32v y = float32v( 7.0376836292E-2f );
-        y *= x;
-        y += float32v( -1.1514610310E-1f );
-        y *= x;
-        y += float32v( 1.1676998740E-1f );
-        y *= x;
-        y += float32v( -1.2420140846E-1f );
-        y *= x;
-        y += float32v( 1.4249322787E-1f );
-        y *= x;
-        y += float32v( -1.6668057665E-1f );
-        y *= x;
-        y += float32v( 2.0000714765E-1f );
-        y *= x;
-        y += float32v( -2.4999993993E-1f );
-        y *= x;
-        y += float32v( 3.3333331174E-1f );
-        y *= x;
-
-        float32v xx = x * x;
-        y *= xx;
-        y *= e * float32v( -2.12194440e-4f );
-        y -= xx * float32v( 0.5f );
-
-        x += y;
-        x += e * float32v( 0.693359375f );
-
-        return FS_Mask_f32( x, validMask );
-    }
-
-    template<typename FS>
-    FS_INLINE typename FS::float32v Pow_f32( typename FS::float32v value, typename FS::float32v pow )
-    {
-        return Exp_f32<FS>( pow * Log_f32<FS>( value ) );
-    }
-}
diff --git a/include/FastSIMD/InlInclude.h b/include/FastSIMD/InlInclude.h
deleted file mode 100644
index b4f4ae16..00000000
--- a/include/FastSIMD/InlInclude.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#pragma once
-#include "FunctionList.h"
-
-template<typename CLASS, typename FS>
-class FS_T;
-
-#define FASTSIMD_DECLARE_FS_TYPES \
-using float32v = typename FS::float32v;\
-using int32v   = typename FS::int32v;\
-using mask32v  = typename FS::mask32v
diff --git a/include/FastSIMD/SIMDTypeList.h b/include/FastSIMD/SIMDTypeList.h
deleted file mode 100644
index bb624b2d..00000000
--- a/include/FastSIMD/SIMDTypeList.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#pragma once
-
-#include "FastSIMD.h"
-
-namespace FastSIMD
-{
-    template<eLevel... T>
-    struct SIMDTypeContainer
-    {
-        static constexpr eLevel MinimumCompiled = Level_Null;
-
-        template<eLevel L>
-        static constexpr eLevel GetNextCompiledAfter = Level_Null;
-    };
-
-    template<eLevel HEAD, eLevel... TAIL>
-    struct SIMDTypeContainer<HEAD, TAIL...>
-    {
-        static constexpr eLevel MinimumCompiled = (HEAD & COMPILED_SIMD_LEVELS) != 0 ? HEAD : SIMDTypeContainer<TAIL...>::MinimumCompiled;
-
-        template<eLevel L>
-        static constexpr eLevel GetNextCompiledAfter = (L == HEAD) ? SIMDTypeContainer<TAIL...>::MinimumCompiled : SIMDTypeContainer<TAIL...>::template GetNextCompiledAfter<L>;
-    };
-
-    using SIMDTypeList = SIMDTypeContainer<
-        Level_Scalar,
-        Level_SSE,
-        Level_SSE2,
-        Level_SSE3,
-        Level_SSSE3,
-        Level_SSE41,
-        Level_SSE42,
-        Level_AVX,
-        Level_AVX2,
-        Level_AVX512,
-        Level_NEON>;
-}
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index aabf8487..f4db5094 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,52 +1,15 @@
-set(CMAKE_CXX_STANDARD 17)
 
-set(install_targets ${install_targets} FastNoise PARENT_SCOPE)
-
-file(GLOB_RECURSE FastSIMD_headers "../include/FastSIMD/*.h")
-file(GLOB_RECURSE FastSIMD_include_inl "../include/FastSIMD/*.inl")
-file(GLOB FastSIMD_inline "FastSIMD/*.inl")
-file(GLOB_RECURSE FastSIMD_internal_headers "FastSIMD/Internal/*.h")
-file(GLOB_RECURSE FastSIMD_internal_inl "FastSIMD/Internal/*.inl")
-
-set(install_fastsimd_headers ${FastSIMD_headers} PARENT_SCOPE)
-
-list(APPEND FastSIMD_headers ${FastSIMD_inline})
-list(APPEND FastSIMD_headers ${FastSIMD_include_inl})
-list(APPEND FastSIMD_internal_headers ${FastSIMD_internal_inl})
-
-
-
-if(FASTSIMD_COMPILE_HAVE_NEON)
-
-    set(FastSIMD_sources
-        FastSIMD/FastSIMD.cpp
-        FastSIMD/FastSIMD_Level_NEON.cpp
-        FastSIMD/FastSIMD_Level_Scalar.cpp
-    )
-
-elseif(FASTSIMD_COMPILE_ARM)
-
-    set(FastSIMD_sources
-        FastSIMD/FastSIMD.cpp
-        FastSIMD/FastSIMD_Level_Scalar.cpp
-    )
-    
-else()
-
-    set(FastSIMD_sources
-        FastSIMD/FastSIMD.cpp
-        FastSIMD/FastSIMD_Level_AVX2.cpp
-        FastSIMD/FastSIMD_Level_AVX512.cpp
-        FastSIMD/FastSIMD_Level_Scalar.cpp
-        FastSIMD/FastSIMD_Level_SSE2.cpp
-        FastSIMD/FastSIMD_Level_SSE3.cpp
-        FastSIMD/FastSIMD_Level_SSE41.cpp
-        FastSIMD/FastSIMD_Level_SSE42.cpp
-        FastSIMD/FastSIMD_Level_SSSE3.cpp
-    )
-
-endif()
+CPMAddPackage(
+    NAME FastSIMD
+    GITHUB_REPOSITORY Auburn/FastSIMD
+    GIT_TAG 2417e5b938d7e0aa4f4293d11682db0582a83ce8
+)
 
+set(install_targets ${install_targets}
+    FastNoise
+    FastSIMD
+    FastSIMD_FastNoise  
+    PARENT_SCOPE)
 
 
 file(GLOB FastNoise_headers "../include/FastNoise/*.h")
@@ -65,13 +28,9 @@ set(FastNoise_source
     FastNoise/SmartNode.cpp
     FastNoise/FastNoise_C.cpp)
 
-source_group("FastSIMD" FILES ${FastSIMD_headers})
-source_group("FastSIMD" FILES ${FastSIMD_sources})
-source_group("FastSIMD\\internals" FILES ${FastSIMD_internal_headers})
-
 source_group("FastNoise" FILES ${FastNoise_headers})
 source_group("FastNoise" FILES ${FastNoise_source})
-source_group("FastNoise\\Generators" FILES ${FastNoise_generators_headers})
+source_group("FastNoise/Generators" FILES ${FastNoise_generators_headers})
 
 add_library(FastNoise
     ${FastNoise_headers}
@@ -88,60 +47,48 @@ target_include_directories(FastNoise PUBLIC
     $<BUILD_INTERFACE:${FastNoise2_SOURCE_DIR}/include>
     $<INSTALL_INTERFACE:include>
 )
+   
+target_compile_definitions(FastNoise PRIVATE FASTNOISE_EXPORT)
+
+set_target_properties(FastNoise PROPERTIES
+    DEBUG_POSTFIX D
+    COMPILE_PDB_NAME_DEBUG FastNoiseD)
+
+if(NOT FASTNOISE2_STRICT_FP)
+    set(FASTSIMD_RELAXED RELAXED)
+endif()
+
+fastsimd_create_dispatch_library(FastSIMD_FastNoise ${FASTSIMD_RELAXED} SOURCES "FastNoise/FastSIMD_Build.inl")
+
+target_include_directories(FastSIMD_FastNoise PRIVATE "../include/")
 
 if(NOT BUILD_SHARED_LIBS)
     target_compile_definitions(FastNoise PUBLIC FASTNOISE_STATIC_LIB)
 endif()
 
-set_target_properties(FastNoise PROPERTIES
-    DEFINE_SYMBOL FASTNOISE_EXPORT
-    DEBUG_POSTFIX D
-    COMPILE_PDB_NAME_DEBUG FastNoiseD)
+target_link_libraries(FastNoise PUBLIC FastSIMD FastSIMD_FastNoise)
 
 if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-    target_compile_options(FastNoise PRIVATE /GL- /GS- /fp:fast /wd4251)
+    target_compile_options(FastSIMD_FastNoise PRIVATE /GL- /GS- /wd4251 /permissive- /d2vzeroupper-)
     
-    if(NOT FASTSIMD_COMPILE_ARM)
-    
-        if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-            set_source_files_properties(FastSIMD/FastSIMD_Level_Scalar.cpp PROPERTIES COMPILE_FLAGS "/arch:SSE")
-            set_source_files_properties(FastSIMD/FastSIMD_Level_SSE2.cpp PROPERTIES COMPILE_FLAGS "/arch:SSE2")
-            set_source_files_properties(FastSIMD/FastSIMD_Level_SSE3.cpp PROPERTIES COMPILE_FLAGS "/arch:SSE2")
-            set_source_files_properties(FastSIMD/FastSIMD_Level_SSSE3.cpp PROPERTIES COMPILE_FLAGS "/arch:SSE2")
-            set_source_files_properties(FastSIMD/FastSIMD_Level_SSE41.cpp PROPERTIES COMPILE_FLAGS "/arch:SSE2")
-            set_source_files_properties(FastSIMD/FastSIMD_Level_SSE42.cpp PROPERTIES COMPILE_FLAGS "/arch:SSE2")
-        endif()
-        set_source_files_properties(FastSIMD/FastSIMD_Level_AVX2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
-        set_source_files_properties(FastSIMD/FastSIMD_Level_AVX512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512")
-    
-    elseif(FASTSIMD_COMPILE_ARMV7)
-        set_source_files_properties(FastSIMD/FastSIMD_Level_NEON.cpp PROPERTIES COMPILE_FLAGS "/arch:NEON")
-    endif()
-
 elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
     if(MSVC)
-        target_compile_options(FastNoise PRIVATE /GL- /GS- /fp:fast)
+        target_compile_options(FastSIMD_FastNoise PRIVATE /GS-)
     else()
-        target_compile_options(FastNoise PRIVATE -ffast-math -fno-stack-protector)        
+        target_compile_options(FastSIMD_FastNoise PRIVATE -fno-stack-protector -Wno-nan-infinity-disabled)
     endif()
-    
-    if(NOT FASTSIMD_COMPILE_ARM)
-
-        if(CMAKE_SIZEOF_VOID_P EQUAL 4 OR "${CMAKE_CXX_FLAGS}" MATCHES "-m32")
-            set_source_files_properties(FastSIMD/FastSIMD_Level_Scalar.cpp PROPERTIES COMPILE_FLAGS "-msse")
-            set_source_files_properties(FastSIMD/FastSIMD_Level_SSE2.cpp PROPERTIES COMPILE_FLAGS "-msse2")
-        endif()
-        set_source_files_properties(FastSIMD/FastSIMD_Level_SSE3.cpp PROPERTIES COMPILE_FLAGS "-msse3")
-        set_source_files_properties(FastSIMD/FastSIMD_Level_SSSE3.cpp PROPERTIES COMPILE_FLAGS "-mssse3")
-        set_source_files_properties(FastSIMD/FastSIMD_Level_SSE41.cpp PROPERTIES COMPILE_FLAGS "-msse4.1")
-        set_source_files_properties(FastSIMD/FastSIMD_Level_SSE42.cpp PROPERTIES COMPILE_FLAGS "-msse4.2")
-        set_source_files_properties(FastSIMD/FastSIMD_Level_AVX2.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
-        set_source_files_properties(FastSIMD/FastSIMD_Level_AVX512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512dq -mfma")
-    
-    elseif(FASTSIMD_COMPILE_ARMV7)
-        set_source_files_properties(FastSIMD/FastSIMD_Level_NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv7-a -mfpu=neon")
+
+    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+        target_compile_options(FastSIMD_FastNoise PRIVATE -mno-vzeroupper)
+    else()
+        target_compile_options(FastSIMD_FastNoise PRIVATE -mllvm -x86-use-vzeroupper=0)        
     endif()
-    
-    
 endif()
 
+if(NOT FASTNOISE2_STRICT_FP)
+    if(MSVC)
+        target_compile_options(FastSIMD_FastNoise PRIVATE /fp:fast)
+    else()
+        target_compile_options(FastSIMD_FastNoise PRIVATE -ffast-math)   
+    endif()
+endif()
diff --git a/src/FastNoise/Base64.h b/src/FastNoise/Base64.h
index 9d7449fa..a2f52322 100644
--- a/src/FastNoise/Base64.h
+++ b/src/FastNoise/Base64.h
@@ -1,40 +1,17 @@
 #pragma once
 
+#include <cstdint>
 #include <cstring>
 #include <string>
 #include <vector>
-#include <cstdint>
 
 namespace FastNoise
 {
-    /** https://gist.github.com/tomykaira/f0fd86b6c73063283afe550bc5d77594
-     * The MIT License (MIT)
-     * Copyright (c) 2016 tomykaira
-     *
-     * Permission is hereby granted, free of charge, to any person obtaining
-     * a copy of this software and associated documentation files (the
-     * "Software"), to deal in the Software without restriction, including
-     * without limitation the rights to use, copy, modify, merge, publish,
-     * distribute, sublicense, and/or sell copies of the Software, and to
-     * permit persons to whom the Software is furnished to do so, subject to
-     * the following conditions:
-     *
-     * The above copyright notice and this permission notice shall be
-     * included in all copies or substantial portions of the Software.
-     *
-     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-     * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-     * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-     * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-     * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-     * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-     * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-     */
     namespace Base64
     {
         static std::string Encode( const std::vector<uint8_t>& data )
         {
-            static constexpr char sEncodingTable[] = {
+            static constexpr char kEncodingTable[] = {
                 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
                 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
                 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
@@ -45,33 +22,70 @@ namespace FastNoise
                 '4', '5', '6', '7', '8', '9', '+', '/'
             };
 
-            size_t in_len = data.size();
-            size_t out_len = 4 * ((in_len + 2) / 3);
-            std::string ret( out_len, '\0' );
+            size_t inLen = data.size();
+            std::string ret;
+            size_t consecutiveAs = 0;
+
+            auto appendChar = [&]( char c ) {
+                if( c == 'A' ) // Compress "A"s into @ with count in following char
+                {
+                    if( consecutiveAs++ <= 1 )
+                    {
+                        ret += 'A';
+                    }
+                    else if( consecutiveAs >= std::size( kEncodingTable ) + 2 )
+                    {
+                        ret[ret.length() - 2] = '@';
+                        ret[ret.length() - 1] = kEncodingTable[consecutiveAs - 3];
+
+                        ret += 'A';
+                        consecutiveAs = 1;
+                    }
+                }
+                else
+                {
+                    if( consecutiveAs >= 3 )
+                    {
+                        ret[ret.length() - 2] = '@';
+                        ret[ret.length() - 1] = kEncodingTable[consecutiveAs - 3];
+                    }
+                    if( c != '\0' )
+                    {
+                        ret += c;
+                    }
+
+                    consecutiveAs = 0;
+                }
+            };
+
             size_t i;
-            char* p = const_cast<char*>(ret.c_str());
 
-            for( i = 0; i < in_len - 2; i += 3 )
+            for( i = 0; i < inLen - 2; i += 3 )
             {
-                *p++ = sEncodingTable[(data[i] >> 2) & 0x3F];
-                *p++ = sEncodingTable[((data[i] & 0x3) << 4) | ((int)(data[i + 1] & 0xF0) >> 4)];
-                *p++ = sEncodingTable[((data[i + 1] & 0xF) << 2) | ((int)(data[i + 2] & 0xC0) >> 6)];
-                *p++ = sEncodingTable[data[i + 2] & 0x3F];
+                appendChar( kEncodingTable[( data[i] >> 2 ) & 0x3F] );
+                appendChar( kEncodingTable[( ( data[i] & 0x3 ) << 4 ) | ( ( data[i + 1] & 0xF0 ) >> 4 )] );
+                appendChar( kEncodingTable[( ( data[i + 1] & 0xF ) << 2 ) | ( ( data[i + 2] & 0xC0 ) >> 6 )] );
+                appendChar( kEncodingTable[data[i + 2] & 0x3F] );
             }
-            if( i < in_len )
+            if( i < inLen )
             {
-                *p++ = sEncodingTable[(data[i] >> 2) & 0x3F];
-                if( i == (in_len - 1) )
+                appendChar( kEncodingTable[( data[i] >> 2 ) & 0x3F] );
+                if( i == ( inLen - 1 ) )
                 {
-                    *p++ = sEncodingTable[((data[i] & 0x3) << 4)];
-                    *p++ = '=';
+                    appendChar( kEncodingTable[( ( data[i] & 0x3 ) << 4 )] );
+                    appendChar( '=' );
                 }
                 else
                 {
-                    *p++ = sEncodingTable[((data[i] & 0x3) << 4) | ((int)(data[i + 1] & 0xF0) >> 4)];
-                    *p++ = sEncodingTable[((data[i + 1] & 0xF) << 2)];
+                    appendChar( kEncodingTable[( ( data[i] & 0x3 ) << 4 ) | ( ( data[i + 1] & 0xF0 ) >> 4 )] );
+                    appendChar( kEncodingTable[( ( data[i + 1] & 0xF ) << 2 )] );
                 }
-                *p++ = '=';
+                appendChar( '=' );
+            }
+            else
+            {
+                // Handle any trailing As
+                appendChar( '\0' );
             }
 
             return ret;
@@ -83,7 +97,7 @@ namespace FastNoise
                 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
                 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
                 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 62, 64, 64, 64, 63,
-                52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 64, 64, 64, 64, 64, 64,
+                52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 64, 64, 64, 0, 64, 64,
                 64, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 64, 64, 64, 64, 64,
                 64, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
@@ -98,31 +112,83 @@ namespace FastNoise
                 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
             };
 
-            size_t in_len = std::strlen( input );
-            size_t out_len = in_len / 4 * 3;
+            size_t rawLen = 0, decompLen = 0;
+
+            // Measure the raw input length and the decompressed length ('@' + count char expands to a run of 'A's)
+            while( input[rawLen] )
+            {
+                if( input[rawLen] == '@' )
+                {
+                    unsigned char aExtra = kDecodingTable[static_cast<unsigned char>( input[++rawLen] )];
 
-            if( out_len == 0 || in_len % 4 != 0 ) return {};
+                    if( aExtra == 64 ) // Error
+                    {
+                        return {};
+                    }
 
-            if( input[in_len - 1] == '=' ) out_len--;
-            if( input[in_len - 2] == '=' ) out_len--;
+                    decompLen += aExtra + 2;
+                }
+                else
+                {
+                    decompLen++;
+                    rawLen++;
+                }
+            }
 
-            std::vector<uint8_t> out( out_len );
+            size_t outLen = decompLen / 4 * 3;
 
-            for( size_t i = 0, j = 0; i < in_len; )
+            if( outLen == 0 || decompLen % 4 != 0 )
+                return {};
+                        
+            if( input[rawLen - 1] == '=' )
             {
-                uint32_t a = input[i] == '=' ? 0 & i++ : kDecodingTable[static_cast<int>(input[i++])];
-                uint32_t b = input[i] == '=' ? 0 & i++ : kDecodingTable[static_cast<int>(input[i++])];
-                uint32_t c = input[i] == '=' ? 0 & i++ : kDecodingTable[static_cast<int>(input[i++])];
-                uint32_t d = input[i] == '=' ? 0 & i++ : kDecodingTable[static_cast<int>(input[i++])];
+                outLen--;
+                if( input[rawLen - 2] == '=' )
+                    outLen--;
+            }
+
+            std::vector<uint8_t> out( outLen );
+            size_t i = 0, j = 0, consecutiveAs = 0;
+
+            while( i < rawLen || consecutiveAs > 0 )
+            {
+                char currentBlock[4] = { 0 };
+
+                for( int k = 0; k < 4; k++ )
+                {
+                    if( consecutiveAs > 0 )
+                    {
+                        currentBlock[k] = 'A';
+                        consecutiveAs--;
+                    }
+                    else if( input[i] == '@' )
+                    {
+                        currentBlock[k] = 'A';
+                        i++;
+                        consecutiveAs = kDecodingTable[static_cast<unsigned char>( input[i++] )] + 2;
+                    }
+                    else
+                    {
+                        currentBlock[k] = input[i++];
+                    }
+                }
+
+                uint32_t a = kDecodingTable[static_cast<unsigned char>( currentBlock[0] )];
+                uint32_t b = kDecodingTable[static_cast<unsigned char>( currentBlock[1] )];
+                uint32_t c = kDecodingTable[static_cast<unsigned char>( currentBlock[2] )];
+                uint32_t d = kDecodingTable[static_cast<unsigned char>( currentBlock[3] )];
 
-                uint32_t triple = (a << 3 * 6) + (b << 2 * 6) + (c << 1 * 6) + (d << 0 * 6);
+                uint32_t triple = ( a << 3 * 6 ) + ( b << 2 * 6 ) + ( c << 1 * 6 ) + ( d << 0 * 6 );
 
-                if( j < out_len ) out[j++] = (triple >> 2 * 8) & 0xFF;
-                if( j < out_len ) out[j++] = (triple >> 1 * 8) & 0xFF;
-                if( j < out_len ) out[j++] = (triple >> 0 * 8) & 0xFF;
+                if( j < outLen )
+                    out[j++] = ( triple >> 2 * 8 ) & 0xFF;
+                if( j < outLen )
+                    out[j++] = ( triple >> 1 * 8 ) & 0xFF;
+                if( j < outLen )
+                    out[j++] = ( triple >> 0 * 8 ) & 0xFF;
             }
 
             return out;
         }
-    };
-}
+    }; // namespace Base64
+} // namespace FastNoise
diff --git a/src/FastNoise/FastNoise_C.cpp b/src/FastNoise/FastNoise_C.cpp
index 3fbc249a..527ea793 100644
--- a/src/FastNoise/FastNoise_C.cpp
+++ b/src/FastNoise/FastNoise_C.cpp
@@ -2,14 +2,22 @@
 #include <FastNoise/FastNoise.h>
 #include <FastNoise/Metadata.h>
 
+namespace FastNoise::Internal
+{
+    void BumpNodeRefences( const Generator* ptr, bool up ) // Used by the C API below to adjust a generator's reference count through a raw pointer
+    {
+        ptr->ReferencesFetchAdd( up ? 1 : -1 ); // Adds +1 when `up` is true, -1 otherwise; ptr is dereferenced unconditionally
+    }
+}
+
 FastNoise::Generator* ToGen( void* p )
 {
-    return static_cast<FastNoise::SmartNode<>*>( p )->get();
+    return static_cast<FastNoise::Generator*>( p ); // The opaque C handle is the Generator pointer itself
 }
 
 const FastNoise::Generator* ToGen( const void* p )
 {
-    return static_cast<const FastNoise::SmartNode<>*>( p )->get();
+    return static_cast<const FastNoise::Generator*>( p ); // Const overload: the opaque C handle is the Generator pointer itself
 }
 
 void StoreMinMax( float* floatArray2, FastNoise::OutputMinMax minMax )
@@ -23,21 +31,23 @@ void StoreMinMax( float* floatArray2, FastNoise::OutputMinMax minMax )
 
 void* fnNewFromEncodedNodeTree( const char* encodedString, unsigned simdLevel )
 {
-    if( FastNoise::SmartNode<> node = FastNoise::NewFromEncodedNodeTree( encodedString, (FastSIMD::eLevel)simdLevel ) )
+    if( FastNoise::SmartNode<> node = FastNoise::NewFromEncodedNodeTree( encodedString, (FastSIMD::FeatureSet)simdLevel ) ) // Body runs only when decoding produced a node
     {
-        return new FastNoise::SmartNode<>( std::move( node ) );
+        FastNoise::Internal::BumpNodeRefences( node.get(), true ); // Take an extra reference for the caller before the local SmartNode goes out of scope
+
+        return node.get(); // Raw Generator pointer is the C handle; fnDeleteNodeRef drops the reference taken above
     }
     return nullptr;
 }
 
 void fnDeleteNodeRef( void* node )
 {
-    delete static_cast<FastNoise::SmartNode<>*>( node );
+    FastNoise::Internal::BumpNodeRefences( ToGen( node ), false ); // Drops one reference (up == false). NOTE(review): presumably the node is released once the count reaches zero - confirm in ReferencesFetchAdd
 }
 
 unsigned fnGetSIMDLevel( const void* node )
 {
-    return (unsigned)ToGen( node )->GetSIMDLevel();
+    return (unsigned)ToGen( node )->GetActiveFeatureSet(); // The node's active feature set, cast to unsigned for the C interface
 }
 
 int fnGetMetadataID( const void* node )
@@ -45,19 +55,19 @@ int fnGetMetadataID( const void* node )
     return ToGen( node )->GetMetadata().id;
 }
 
-void fnGenUniformGrid2D( const void* node, float* noiseOut, int xStart, int yStart, int xSize, int ySize, float frequency, int seed, float* outputMinMax )
+void fnGenUniformGrid2D( const void* node, float* noiseOut, int xStart, int yStart, int xSize, int ySize, int seed, float* outputMinMax )
 {
-    StoreMinMax( outputMinMax, ToGen( node )->GenUniformGrid2D( noiseOut, xStart, yStart, xSize, ySize, frequency, seed ) );    
+    StoreMinMax( outputMinMax, ToGen( node )->GenUniformGrid2D( noiseOut, xStart, yStart, xSize, ySize, seed ) );    // Forwards to the generator; the returned OutputMinMax is handed to StoreMinMax with outputMinMax
 }
 
-void fnGenUniformGrid3D( const void* node, float* noiseOut, int xStart, int yStart, int zStart, int xSize, int ySize, int zSize, float frequency, int seed, float* outputMinMax )
+void fnGenUniformGrid3D( const void* node, float* noiseOut, int xStart, int yStart, int zStart, int xSize, int ySize, int zSize, int seed, float* outputMinMax )
 {
-    StoreMinMax( outputMinMax, ToGen( node )->GenUniformGrid3D( noiseOut, xStart, yStart, zStart, xSize, ySize, zSize, frequency, seed ) );    
+    StoreMinMax( outputMinMax, ToGen( node )->GenUniformGrid3D( noiseOut, xStart, yStart, zStart, xSize, ySize, zSize, seed ) );    // Forwards to the generator; the returned OutputMinMax is handed to StoreMinMax with outputMinMax
 }
 
-void fnGenUniformGrid4D( const void* node, float* noiseOut, int xStart, int yStart, int zStart, int wStart, int xSize, int ySize, int zSize, int wSize, float frequency, int seed, float* outputMinMax )
+void fnGenUniformGrid4D( const void* node, float* noiseOut, int xStart, int yStart, int zStart, int wStart, int xSize, int ySize, int zSize, int wSize, int seed, float* outputMinMax )
 {
-    StoreMinMax( outputMinMax, ToGen( node )->GenUniformGrid4D( noiseOut, xStart, yStart, zStart, wStart, xSize, ySize, zSize, wSize, frequency, seed ) );    
+    StoreMinMax( outputMinMax, ToGen( node )->GenUniformGrid4D( noiseOut, xStart, yStart, zStart, wStart, xSize, ySize, zSize, wSize, seed ) );    // Forwards to the generator; the returned OutputMinMax is handed to StoreMinMax with outputMinMax
 }
 
 void fnGenPositionArray2D( const void* node, float* noiseOut, int count, const float* xPosArray, const float* yPosArray, float xOffset, float yOffset, int seed, float* outputMinMax )
@@ -90,9 +100,9 @@ float fnGenSingle4D( const void* node, float x, float y, float z, float w, int s
     return ToGen( node )->GenSingle4D( x, y, z, w, seed );
 }
 
-void fnGenTileable2D( const void* node, float* noiseOut, int xSize, int ySize, float frequency, int seed, float* outputMinMax )
+void fnGenTileable2D( const void* node, float* noiseOut, int xSize, int ySize, int seed, float* outputMinMax )
 {
-    StoreMinMax( outputMinMax, ToGen( node )->GenTileable2D( noiseOut, xSize, ySize, frequency, seed ) );
+    StoreMinMax( outputMinMax, ToGen( node )->GenTileable2D( noiseOut, xSize, ySize, seed ) ); // Forwards to the generator; the returned OutputMinMax is handed to StoreMinMax with outputMinMax
 }
 
 int fnGetMetadataCount()
@@ -102,7 +112,7 @@ int fnGetMetadataCount()
 
 const char* fnGetMetadataName( int id )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         return metadata->name;
     }
@@ -111,16 +121,19 @@ const char* fnGetMetadataName( int id )
 
 void* fnNewFromMetadata( int id, unsigned simdLevel )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
-        return new FastNoise::SmartNode<>( metadata->CreateNode( (FastSIMD::eLevel)simdLevel ) );
+        FastNoise::SmartNode<> node = metadata->CreateNode( (FastSIMD::FeatureSet)simdLevel );
+        if( node ) // BumpNodeRefences dereferences its pointer, so only take the caller's reference when a node was actually created
+            FastNoise::Internal::BumpNodeRefences( node.get(), true );
+        return node.get(); // Raw Generator pointer is the C handle (null if creation failed); fnDeleteNodeRef drops the reference taken above
     }
     return nullptr;
 }
 
 int fnGetMetadataVariableCount( int id )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         return (int)metadata->memberVariables.size();
     }
@@ -129,7 +142,7 @@ int fnGetMetadataVariableCount( int id )
 
 const char* fnGetMetadataVariableName( int id, int variableIndex )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         if( (size_t)variableIndex < metadata->memberVariables.size() )
         {
@@ -142,7 +155,7 @@ const char* fnGetMetadataVariableName( int id, int variableIndex )
 
 int fnGetMetadataVariableType( int id, int variableIndex )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         if( (size_t)variableIndex < metadata->memberVariables.size() )
         {
@@ -155,7 +168,7 @@ int fnGetMetadataVariableType( int id, int variableIndex )
 
 int fnGetMetadataVariableDimensionIdx( int id, int variableIndex )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         if( (size_t)variableIndex < metadata->memberVariables.size() )
         {
@@ -168,7 +181,7 @@ int fnGetMetadataVariableDimensionIdx( int id, int variableIndex )
 
 int fnGetMetadataEnumCount( int id, int variableIndex )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         if( (size_t)variableIndex < metadata->memberVariables.size() )
         {
@@ -181,7 +194,7 @@ int fnGetMetadataEnumCount( int id, int variableIndex )
 
 const char* fnGetMetadataEnumName( int id, int variableIndex, int enumIndex )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         if( (size_t)variableIndex < metadata->memberVariables.size() )
         {
@@ -218,7 +231,7 @@ bool fnSetVariableIntEnum( void* node, int variableIndex, int value )
 
 int fnGetMetadataNodeLookupCount( int id )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         return (int)metadata->memberNodeLookups.size();
     }
@@ -227,7 +240,7 @@ int fnGetMetadataNodeLookupCount( int id )
 
 const char* fnGetMetadataNodeLookupName( int id, int nodeLookupIndex )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         if( (size_t)nodeLookupIndex < metadata->memberNodeLookups.size() )
         {
@@ -240,7 +253,7 @@ const char* fnGetMetadataNodeLookupName( int id, int nodeLookupIndex )
 
 int fnGetMetadataNodeLookupDimensionIdx( int id, int nodeLookupIndex )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         if( (size_t)nodeLookupIndex < metadata->memberNodeLookups.size() )
         {
@@ -263,7 +276,7 @@ bool fnSetNodeLookup( void* node, int nodeLookupIndex, const void* nodeLookup )
 
 int fnGetMetadataHybridCount( int id )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         return (int)metadata->memberHybrids.size();
     }
@@ -272,7 +285,7 @@ int fnGetMetadataHybridCount( int id )
 
 const char* fnGetMetadataHybridName( int id, int hybridIndex )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         if( (size_t)hybridIndex < metadata->memberHybrids.size() )
         {
@@ -285,7 +298,7 @@ const char* fnGetMetadataHybridName( int id, int hybridIndex )
 
 int fnGetMetadataHybridDimensionIdx( int id, int hybridIndex )
 {
-    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (uint16_t)id ) )
+    if( const FastNoise::Metadata* metadata = FastNoise::Metadata::GetFromId( (FastNoise::Metadata::node_id)id ) )
     {
         if( (size_t)hybridIndex < metadata->memberHybrids.size() )
         {
diff --git a/src/FastNoise/FastSIMD_Build.inl b/src/FastNoise/FastSIMD_Build.inl
new file mode 100644
index 00000000..44b04bf7
--- /dev/null
+++ b/src/FastNoise/FastSIMD_Build.inl
@@ -0,0 +1,143 @@
+#pragma once
+
+#ifndef FASTNOISE_REGISTER_NODE // Default definition only; a FASTNOISE_REGISTER_NODE defined before this file is included is left in place
+#define FASTNOISE_REGISTER_NODE( CLASS ) \
+template class FastSIMD::RegisterDispatchClass<FastNoise::CLASS>;\
+static_assert( std::is_final_v<FastSIMD::DispatchClass<CLASS, FastSIMD::FeatureSetDefault()>> ) // Explicitly instantiates the registration template for the node, then checks its default-feature-set dispatch class is final
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/Generator.h>
+#else
+#include <FastNoise/Generators/Generator.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/BasicGenerators.h>
+#else
+#include <FastNoise/Generators/BasicGenerators.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/Value.h>
+#else
+#include <FastNoise/Generators/Value.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/Perlin.h>
+#else
+#include <FastNoise/Generators/Perlin.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/Simplex.h>
+#else
+#include <FastNoise/Generators/Simplex.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/Cellular.h>
+#else
+#include <FastNoise/Generators/Cellular.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/Fractal.h>
+#else
+#include <FastNoise/Generators/Fractal.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/DomainWarp.h>
+#else
+#include <FastNoise/Generators/DomainWarp.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/DomainWarpSimplex.h>
+#else
+#include <FastNoise/Generators/DomainWarpSimplex.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/DomainWarpFractal.h>
+#else
+#include <FastNoise/Generators/DomainWarpFractal.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/Modifiers.h>
+#else
+#include <FastNoise/Generators/Modifiers.inl>
+#endif
+
+#ifdef FASTSIMD_INCLUDE_HEADER_ONLY
+#include <FastNoise/Generators/Blends.h>
+#else
+#include <FastNoise/Generators/Blends.inl>
+#endif
+
+// Nodes
+// Order is important!
+// Always add to bottom of list,
+// inserting will break existing encoded node trees
+
+FASTNOISE_REGISTER_NODE( Constant );
+FASTNOISE_REGISTER_NODE( White );
+FASTNOISE_REGISTER_NODE( Checkerboard );
+FASTNOISE_REGISTER_NODE( SineWave );
+FASTNOISE_REGISTER_NODE( PositionOutput );
+FASTNOISE_REGISTER_NODE( DistanceToPoint );
+
+FASTNOISE_REGISTER_NODE( Simplex );
+FASTNOISE_REGISTER_NODE( SuperSimplex );
+FASTNOISE_REGISTER_NODE( Perlin );
+FASTNOISE_REGISTER_NODE( Value );
+                       
+FASTNOISE_REGISTER_NODE( CellularValue );
+FASTNOISE_REGISTER_NODE( CellularDistance );
+FASTNOISE_REGISTER_NODE( CellularLookup );
+                       
+FASTNOISE_REGISTER_NODE( FractalFBm );
+FASTNOISE_REGISTER_NODE( FractalPingPong );
+FASTNOISE_REGISTER_NODE( FractalRidged );
+
+FASTNOISE_REGISTER_NODE( DomainWarpSimplex );
+FASTNOISE_REGISTER_NODE( DomainWarpSuperSimplex );
+FASTNOISE_REGISTER_NODE( DomainWarpGradient );
+
+FASTNOISE_REGISTER_NODE( DomainWarpFractalProgressive );
+FASTNOISE_REGISTER_NODE( DomainWarpFractalIndependant );
+                       
+FASTNOISE_REGISTER_NODE( Add );
+FASTNOISE_REGISTER_NODE( Subtract );
+FASTNOISE_REGISTER_NODE( Multiply );
+FASTNOISE_REGISTER_NODE( Divide );
+
+FASTNOISE_REGISTER_NODE( Abs );
+FASTNOISE_REGISTER_NODE( Min );
+FASTNOISE_REGISTER_NODE( Max );
+FASTNOISE_REGISTER_NODE( MinSmooth );
+FASTNOISE_REGISTER_NODE( MaxSmooth );
+FASTNOISE_REGISTER_NODE( SquareRoot );
+FASTNOISE_REGISTER_NODE( PowFloat );
+FASTNOISE_REGISTER_NODE( PowInt );
+
+FASTNOISE_REGISTER_NODE( DomainScale );
+FASTNOISE_REGISTER_NODE( DomainOffset );
+FASTNOISE_REGISTER_NODE( DomainRotate );
+FASTNOISE_REGISTER_NODE( DomainAxisScale );
+
+FASTNOISE_REGISTER_NODE( SeedOffset );
+FASTNOISE_REGISTER_NODE( ConvertRGBA8 );
+FASTNOISE_REGISTER_NODE( GeneratorCache );
+
+FASTNOISE_REGISTER_NODE( Fade );
+FASTNOISE_REGISTER_NODE( Remap );
+FASTNOISE_REGISTER_NODE( Terrace );
+FASTNOISE_REGISTER_NODE( AddDimension );
+FASTNOISE_REGISTER_NODE( RemoveDimension );
+
+FASTNOISE_REGISTER_NODE( Modulus );
+FASTNOISE_REGISTER_NODE( DomainRotatePlane );
diff --git a/src/FastNoise/Metadata.cpp b/src/FastNoise/Metadata.cpp
index f1b4bcb1..d49c9a94 100644
--- a/src/FastNoise/Metadata.cpp
+++ b/src/FastNoise/Metadata.cpp
@@ -13,34 +13,91 @@
 
 using namespace FastNoise;
 
-std::vector<const Metadata*> Metadata::sAllMetadata;
+Metadata::Vector<const Metadata*> Metadata::sAllMetadata;
 
-NodeData::NodeData( const Metadata* data )
-{
-    metadata = data;
+template<typename T>
+constexpr static std::nullptr_t gMetadataVectorSize = nullptr; // Invalid: the primary template carries no usable size, only the specialisations below do
+
+// Expected final element counts per stored type; reserving them up front avoids needless vector resizing and oversizing on startup
+// They cannot be derived automatically because the vectors fill up during static init, so they are maintained by hand (DebugCheckType compares them with the real sizes)
+template<>
+constexpr size_t gMetadataVectorSize<const Metadata*> = 47;
+template<>
+constexpr size_t gMetadataVectorSize<const char*> = 91;
+template<>
+constexpr size_t gMetadataVectorSize<Metadata::MemberVariable> = 72;
+template<>
+constexpr size_t gMetadataVectorSize<Metadata::MemberNodeLookup> = 32;
+template<>
+constexpr size_t gMetadataVectorSize<Metadata::MemberHybrid> = 57;
 
-    if( metadata )
+template<typename T>
+static std::vector<T>& GetVectorStorage() // Returns the single backing vector used for all elements of type T
+{
+    static std::vector<T> v = []() // Function-local static, built by this lambda on the first call
     {
-        for( const auto& value : metadata->memberVariables )
-        {
-            variables.push_back( value.valueDefault );
-        }
+        std::vector<T> vec;
+        vec.reserve( gMetadataVectorSize<T> ); // Pre-allocate the expected element count for this T
+        return vec;
+    }();
+    return v;
+}
 
-        for( const auto& value : metadata->memberNodeLookups )
-        {
-            (void)value;
-            nodeLookups.push_back( nullptr );
-        }
+template<typename T>
+static int32_t DebugCheckType() // Returns the current storage size for T: negated when it equals gMetadataVectorSize<T>, positive otherwise
+{
+    return ( GetVectorStorage<T>().size() == gMetadataVectorSize<T> ? -1 : 1 ) * (int32_t)GetVectorStorage<T>().size();
+}
 
-        for( const auto& value : metadata->memberHybrids )
-        {
-            hybrids.emplace_back( nullptr, value.valueDefault );
-        }
+std::pair<int32_t, const char*> Metadata::DebugCheckVectorStorageSize( int i ) // Pairs the DebugCheckType result for storage vector i with a label naming its element type
+{
+    switch( i )
+    {
+    case 0: return { DebugCheckType<const Metadata*>(),  "const Metadata*" };
+    case 1: return { DebugCheckType<const char*>(),      "const char*" };
+    case 2: return { DebugCheckType<MemberVariable>(),   "MemberVariable" };
+    case 3: return { DebugCheckType<MemberNodeLookup>(), "MemberNodeLookup" };
+    case 4: return { DebugCheckType<MemberHybrid>(),     "MemberHybrid" };
     }
+    return { 0, nullptr }; // i outside 0..4: no such storage vector
+}
+
+template<typename T>
+T* Metadata::Vector<T>::data() const
+{
+    return GetVectorStorage<T>().data(); // Base pointer of the per-type backing vector (the whole storage, not offset to this instance's range)
 }
 
 template<typename T>
-void AddToDataStream( std::vector<uint8_t>& dataStream, T value )
+void Metadata::Vector<T>::push_back( const T& value ) // Appends value to the per-type backing vector and extends this instance's mStart/mEnd range over it
+{
+    std::vector<T>& vec = GetVectorStorage<T>();
+    vec.push_back( value );
+    assert( vec.size() <= (index_type)-1 ); // Element count must still fit in index_type (assumes index_type is unsigned, so -1 casts to its maximum)
+
+    mEnd = (index_type)vec.size() - 1; // Index of the element just appended
+    mStart = std::min( mStart, mEnd++ ); // min() sees the new element's index; the post-increment then leaves mEnd one past it
+}
+
+template class Metadata::Vector<const Metadata*>;
+template class Metadata::Vector<const char*>;
+template class Metadata::Vector<Metadata::MemberVariable>;
+template class Metadata::Vector<Metadata::MemberNodeLookup>;
+template class Metadata::Vector<Metadata::MemberHybrid>;
+
+union MemberLookup // One-byte tag written before each serialised member
+{
+    struct
+    {
+        uint8_t type : 2; // Member kind; the serialiser writes 0 = variable, 1 = node lookups, 2 = hybrid value, 3 = hybrid node
+        uint8_t index : 6; // Member index within that kind (for kind 1 the serialiser stores the node lookup count instead)
+    } member;
+
+    uint8_t data; // The same byte viewed whole, as it goes to / comes from the data stream; the value 255 marks end of node
+};
+
+template<typename T>
+static void AddToDataStream( std::vector<uint8_t>& dataStream, T value )
 {
     for( size_t i = 0; i < sizeof( T ); i++ )
     {
@@ -48,16 +105,25 @@ void AddToDataStream( std::vector<uint8_t>& dataStream, T value )
     }
 }
 
-bool SerialiseNodeDataInternal( NodeData* nodeData, bool fixUp, std::vector<uint8_t>& dataStream, std::unordered_map<const NodeData*, uint16_t>& referenceIds, std::unordered_set<const NodeData*> dependencies = {} )
+static void AddMemberLookupToDataStream( std::vector<uint8_t>& dataStream, uint8_t type, uint8_t index ) // Packs type and index into one MemberLookup byte and appends it to dataStream
+{
+    MemberLookup memberLookup;
+    memberLookup.member.type = type; // 2-bit field: only values 0..3 survive
+    memberLookup.member.index = index; // 6-bit field: only values 0..63 survive, larger indices are truncated
+    AddToDataStream( dataStream, memberLookup.data ); // NOTE(review): type 3 with index 63 presumably packs to 255, the end-of-node marker - confirm the bit-field layout
+}
+
+
+static bool SerialiseNodeDataInternal( NodeData* nodeData, bool fixUp, std::vector<uint8_t>& dataStream, std::unordered_map<const NodeData*, uint16_t>& referenceIds, std::unordered_set<const NodeData*> dependencies = {} )
 {
     // dependencies passed by value to avoid false positives from other branches in the node tree
 
     const Metadata* metadata = nodeData->metadata;
 
     if( !metadata ||
-        nodeData->variables.size() != metadata->memberVariables.size()   ||
-        nodeData->nodeLookups.size()     != metadata->memberNodeLookups.size() ||
-        nodeData->hybrids.size()   != metadata->memberHybrids.size()     )
+        nodeData->variables.size() != metadata->memberVariables.size() ||
+        nodeData->nodeLookups.size() != metadata->memberNodeLookups.size() ||
+        nodeData->hybrids.size() != metadata->memberHybrids.size() )
     {
         assert( 0 ); // Member size mismatch with metadata
         return false;
@@ -90,9 +156,9 @@ bool SerialiseNodeDataInternal( NodeData* nodeData, bool fixUp, std::vector<uint
 
     if( reference != referenceIds.end() )
     {
-        // UINT16_MAX where node ID should be
+        // UINT8_MAX where node ID should be
         // Referenced by index in reference array, array ordering will match on decode
-        AddToDataStream( dataStream, std::numeric_limits<uint16_t>::max() );
+        AddToDataStream( dataStream, std::numeric_limits<Metadata::node_id>::max() );
         AddToDataStream( dataStream, reference->second );
         return true;
     }
@@ -103,7 +169,17 @@ bool SerialiseNodeDataInternal( NodeData* nodeData, bool fixUp, std::vector<uint
     // Member variables
     for( size_t i = 0; i < metadata->memberVariables.size(); i++ )
     {
-        AddToDataStream( dataStream, nodeData->variables[i].i );
+        if( nodeData->variables[i].i != metadata->memberVariables[i].valueDefault.i )
+        {
+            AddMemberLookupToDataStream( dataStream, 0, (uint8_t)i );
+
+            AddToDataStream( dataStream, nodeData->variables[i].i );
+        }
+    }
+
+    if( metadata->memberNodeLookups.size() )
+    {
+        AddMemberLookupToDataStream( dataStream, 1, (uint8_t)metadata->memberNodeLookups.size() );
     }
 
     // Member nodes
@@ -131,17 +207,16 @@ bool SerialiseNodeDataInternal( NodeData* nodeData, bool fixUp, std::vector<uint
     // Member hybrids
     for( size_t i = 0; i < metadata->memberHybrids.size(); i++ )
     {
-        // 1 byte to indicate:
-        // 0 = constant float value
-        // 1 = node lookup
-
         if( !nodeData->hybrids[i].first )
         {
-            AddToDataStream( dataStream, (uint8_t)0 );
+            if( nodeData->hybrids[i].second != metadata->memberHybrids[i].valueDefault )
+            {
+                AddMemberLookupToDataStream( dataStream, 2, (uint8_t)i );
 
-            Metadata::MemberVariable::ValueUnion v = nodeData->hybrids[i].second;
+                Metadata::MemberVariable::ValueUnion v = nodeData->hybrids[i].second;
 
-            AddToDataStream( dataStream, v.i );
+                AddToDataStream( dataStream, v.i );
+            }
         }
         else
         {
@@ -158,7 +233,8 @@ bool SerialiseNodeDataInternal( NodeData* nodeData, bool fixUp, std::vector<uint
                 }
             }
 
-            AddToDataStream( dataStream, (uint8_t)1 );
+            AddMemberLookupToDataStream( dataStream, 3, (uint8_t)i );
+
             if( !SerialiseNodeDataInternal( nodeData->hybrids[i].first, fixUp, dataStream, referenceIds, dependencies ) )
             {
                 return false;
@@ -166,9 +242,12 @@ bool SerialiseNodeDataInternal( NodeData* nodeData, bool fixUp, std::vector<uint
         }
     }
 
+    // Mark end of node
+    AddToDataStream( dataStream, (uint8_t)255 );
+
     referenceIds.emplace( nodeData, (uint16_t)referenceIds.size() );
 
-    return true; 
+    return true;
 }
 
 std::string Metadata::SerialiseNodeData( NodeData* nodeData, bool fixUp )
@@ -184,7 +263,7 @@ std::string Metadata::SerialiseNodeData( NodeData* nodeData, bool fixUp )
 }
 
 template<typename T>
-bool GetFromDataStream( const std::vector<uint8_t>& dataStream, size_t& idx, T& value )
+static bool GetFromDataStream( const std::vector<uint8_t>& dataStream, size_t& idx, T& value )
 {
     if( dataStream.size() < idx + sizeof( T ) )
     {
@@ -197,16 +276,16 @@ bool GetFromDataStream( const std::vector<uint8_t>& dataStream, size_t& idx, T&
     return true;
 }
 
-SmartNode<> DeserialiseSmartNodeInternal( const std::vector<uint8_t>& serialisedNodeData, size_t& serialIdx, std::vector<SmartNode<>>& referenceNodes, FastSIMD::eLevel level = FastSIMD::Level_Null )
+static SmartNode<> DeserialiseSmartNodeInternal( const std::vector<uint8_t>& serialisedNodeData, size_t& serialIdx, std::vector<SmartNode<>>& referenceNodes, FastSIMD::FeatureSet level = FastSIMD::FeatureSet::Max )
 {
-    uint16_t nodeId;
+    Metadata::node_id nodeId;
     if( !GetFromDataStream( serialisedNodeData, serialIdx, nodeId ) )
     {
         return nullptr;
     }
 
-    // UINT16_MAX indicates a reference node
-    if( nodeId == std::numeric_limits<uint16_t>::max() )
+    // UINT8_MAX indicates a reference node
+    if( nodeId == std::numeric_limits<Metadata::node_id>::max() )
     {
         uint16_t referenceId;
         if( !GetFromDataStream( serialisedNodeData, serialIdx, referenceId ) )
@@ -237,8 +316,14 @@ SmartNode<> DeserialiseSmartNodeInternal( const std::vector<uint8_t>& serialised
         return nullptr;
     }
 
+    MemberLookup memberLookup;
+    if( !GetFromDataStream( serialisedNodeData, serialIdx, memberLookup ) )
+    {
+        return nullptr;
+    }
+
     // Member variables
-    for( const auto& var : metadata->memberVariables )
+    while( memberLookup.member.type == 0 )
     {
         Metadata::MemberVariable::ValueUnion v;
 
@@ -247,40 +332,66 @@ SmartNode<> DeserialiseSmartNodeInternal( const std::vector<uint8_t>& serialised
             return nullptr;
         }
 
-        var.setFunc( generator.get(), v );
-    }
-
-    // Member nodes
-    for( const auto& node : metadata->memberNodeLookups )
-    {
-        SmartNode<> nodeGen = DeserialiseSmartNodeInternal( serialisedNodeData, serialIdx, referenceNodes, level );
+        if( memberLookup.member.index < metadata->memberVariables.size() )
+        {
+            metadata->memberVariables[memberLookup.member.index].setFunc( generator.get(), v );
+        }
 
-        if( !nodeGen || !node.setFunc( generator.get(), nodeGen ) )
+        if( !GetFromDataStream( serialisedNodeData, serialIdx, memberLookup ) )
         {
             return nullptr;
         }
     }
 
-    // Member variables
-    for( const auto& hybrid : metadata->memberHybrids )
+    // Member nodes
+    if( memberLookup.member.type == 1 )
     {
-        uint8_t isGenerator;
-        // 1 byte to indicate:
-        // 0 = constant float value
-        // 1 = node lookup
+        size_t i = 0;
+        for( ; i < std::min<size_t>( memberLookup.member.index, metadata->memberNodeLookups.size() ); i++ )
+        {
+            SmartNode<> nodeGen = DeserialiseSmartNodeInternal( serialisedNodeData, serialIdx, referenceNodes, level );
 
-        if( !GetFromDataStream( serialisedNodeData, serialIdx, isGenerator ) || isGenerator > 1 )
+            if( !nodeGen || !metadata->memberNodeLookups[i].setFunc( generator.get(), nodeGen ) )
+            {
+                return nullptr;
+            }
+        }
+        for( ; i < memberLookup.member.index; i++ )
+        {
+            // Still need to deserialise this even if there is nowhere to put it
+            if( !DeserialiseSmartNodeInternal( serialisedNodeData, serialIdx, referenceNodes, level ) )
+            {
+                return nullptr;
+            }
+        }
+        for( ; i < metadata->memberNodeLookups.size(); i++ )
+        {
+            // Attempt to use a dummy node to fill the new node lookup
+            if( !metadata->memberNodeLookups[i].setFunc( generator.get(), FastNoise::New<FastNoise::Constant>( level ) ) )
+            {
+                return nullptr;
+            }
+        }
+
+        if( !GetFromDataStream( serialisedNodeData, serialIdx, memberLookup ) )
         {
             return nullptr;
         }
+    }
 
-        if( isGenerator )
+    // Member hybrids
+    while( memberLookup.data != 255 )
+    {
+        if( memberLookup.member.type == 3 )
         {
             SmartNode<> nodeGen = DeserialiseSmartNodeInternal( serialisedNodeData, serialIdx, referenceNodes, level );
 
-            if( !nodeGen || !hybrid.setNodeFunc( generator.get(), nodeGen ) )
+            if( memberLookup.member.index < metadata->memberHybrids.size() )
             {
-                return nullptr;
+                if( !nodeGen || !metadata->memberHybrids[memberLookup.member.index].setNodeFunc( generator.get(), nodeGen ) )
+                {
+                    return nullptr;
+                }
             }
         }
         else
@@ -292,7 +403,15 @@ SmartNode<> DeserialiseSmartNodeInternal( const std::vector<uint8_t>& serialised
                 return nullptr;
             }
 
-            hybrid.setValueFunc( generator.get(), v );
+            if( memberLookup.member.index < metadata->memberHybrids.size() )
+            {
+                metadata->memberHybrids[memberLookup.member.index].setValueFunc( generator.get(), v );
+            }
+        }
+
+        if( !GetFromDataStream( serialisedNodeData, serialIdx, memberLookup ) )
+        {
+            return nullptr;
         }
     }
 
@@ -301,7 +420,7 @@ SmartNode<> DeserialiseSmartNodeInternal( const std::vector<uint8_t>& serialised
     return generator;
 }
 
-SmartNode<> FastNoise::NewFromEncodedNodeTree( const char* serialisedBase64NodeData, FastSIMD::eLevel level )
+SmartNode<> FastNoise::NewFromEncodedNodeTree( const char* serialisedBase64NodeData, FastSIMD::FeatureSet level )
 {
     std::vector<uint8_t> dataStream = Base64::Decode( serialisedBase64NodeData );
     size_t startIdx = 0;
@@ -311,16 +430,16 @@ SmartNode<> FastNoise::NewFromEncodedNodeTree( const char* serialisedBase64NodeD
     return DeserialiseSmartNodeInternal( dataStream, startIdx, referenceNodes, level );
 }
 
-NodeData* DeserialiseNodeDataInternal( const std::vector<uint8_t>& serialisedNodeData, std::vector<std::unique_ptr<NodeData>>& nodeDataOut, size_t& serialIdx )
+static NodeData* DeserialiseNodeDataInternal( const std::vector<uint8_t>& serialisedNodeData, std::vector<std::unique_ptr<NodeData>>& nodeDataOut, size_t& serialIdx )
 {
-    uint16_t nodeId;
+    Metadata::node_id nodeId;
     if( !GetFromDataStream( serialisedNodeData, serialIdx, nodeId ) )
     {
         return nullptr;
     }
 
-    // UINT16_MAX indicates a reference node
-    if( nodeId == std::numeric_limits<uint16_t>::max() )
+    // UINT8_MAX indicates a reference node
+    if( nodeId == std::numeric_limits<Metadata::node_id>::max() )
     {
         uint16_t referenceId;
         if( !GetFromDataStream( serialisedNodeData, serialIdx, referenceId ) )
@@ -346,54 +465,84 @@ NodeData* DeserialiseNodeDataInternal( const std::vector<uint8_t>& serialisedNod
 
     std::unique_ptr<NodeData> nodeData( new NodeData( metadata ) );
 
+
+    MemberLookup memberLookup;
+    if( !GetFromDataStream( serialisedNodeData, serialIdx, memberLookup ) )
+    {
+        return nullptr;
+    }
+
     // Member variables
-    for( auto& var : nodeData->variables )
+    while( memberLookup.member.type == 0 )
     {
-        if( !GetFromDataStream( serialisedNodeData, serialIdx, var ) )
+        Metadata::MemberVariable::ValueUnion v;
+
+        if( !GetFromDataStream( serialisedNodeData, serialIdx, v.i ) )
         {
             return nullptr;
         }
-    }
 
-    // Member nodes
-    for( auto& node : nodeData->nodeLookups )
-    {
-        node = DeserialiseNodeDataInternal( serialisedNodeData, nodeDataOut, serialIdx );
+        if( memberLookup.member.index < metadata->memberVariables.size() )
+        {
+            nodeData->variables[memberLookup.member.index] = v;
+        }
 
-        if( !node )
+        if( !GetFromDataStream( serialisedNodeData, serialIdx, memberLookup ) )
         {
             return nullptr;
         }
     }
 
-    // Member hybrids
-    for( auto& hybrid : nodeData->hybrids )
+    // Member nodes
+    if( memberLookup.member.type == 1 )
     {
-        uint8_t isGenerator;
-        // 1 byte to indicate:
-        // 0 = constant float value
-        // 1 = node lookup
+        size_t i = 0;
+        for( ; i < std::min<size_t>( memberLookup.member.index, metadata->memberNodeLookups.size() ); i++ )
+        {
+            nodeData->nodeLookups[i] = DeserialiseNodeDataInternal( serialisedNodeData, nodeDataOut, serialIdx );
+        }
+        for( ; i < memberLookup.member.index; i++ )
+        {
+            // Still need to deserialise this even if there is no where to put it
+            DeserialiseNodeDataInternal( serialisedNodeData, nodeDataOut, serialIdx );
+        }
 
-        if( !GetFromDataStream( serialisedNodeData, serialIdx, isGenerator ) || isGenerator > 1 )
+        if( !GetFromDataStream( serialisedNodeData, serialIdx, memberLookup ) )
         {
             return nullptr;
         }
+    }
 
-        if( isGenerator )
+    // Member hybrids
+    while( memberLookup.data != 255 )
+    {
+        if( memberLookup.member.type == 3 )
         {
-            hybrid.first = DeserialiseNodeDataInternal( serialisedNodeData, nodeDataOut, serialIdx );
+            NodeData* node = DeserialiseNodeDataInternal( serialisedNodeData, nodeDataOut, serialIdx );
 
-            if( !hybrid.first )
+            if( memberLookup.member.index < metadata->memberHybrids.size() )
             {
-                return nullptr;
+                nodeData->hybrids[memberLookup.member.index].first = node;
             }
         }
         else
         {
-            if( !GetFromDataStream( serialisedNodeData, serialIdx, hybrid.second ) )
+            float v;
+
+            if( !GetFromDataStream( serialisedNodeData, serialIdx, v ) )
             {
                 return nullptr;
             }
+
+            if( memberLookup.member.index < metadata->memberHybrids.size() )
+            {
+                nodeData->hybrids[memberLookup.member.index].second = v;
+            }
+        }
+
+        if( !GetFromDataStream( serialisedNodeData, serialIdx, memberLookup ) )
+        {
+            return nullptr;
         }
     }
 
@@ -410,12 +559,21 @@ NodeData* Metadata::DeserialiseNodeData( const char* serialisedBase64NodeData, s
 
 std::string Metadata::FormatMetadataNodeName( const Metadata* metadata, bool removeGroups )
 {
-    std::string string = metadata->name;
-    for( size_t i = 1; i < string.size(); i++ )
+    std::string string;
+
+    if( metadata->formattedName )
     {
-        if( ( isdigit( string[i] ) || isupper( string[i] ) ) && islower( string[i - 1] ) )
+        string = metadata->formattedName;
+    }
+    else
+    {
+        string = metadata->name;
+        for( size_t i = 1; i < string.size(); i++ )
         {
-            string.insert( i++, 1, ' ' );
+            if( ( isdigit( string[i] ) || isupper( string[i] ) ) && islower( string[i - 1] ) )
+            {
+                string.insert( i++, 1, ' ' );
+            }
         }
     }
 
@@ -430,6 +588,13 @@ std::string Metadata::FormatMetadataNodeName( const Metadata* metadata, bool rem
             }
         }
     }
+
+    // Fallback since empty strings cause imgui errors
+    if( string.empty() )
+    {
+        return metadata->name;
+    }
+
     return string;
 }
 
@@ -451,21 +616,18 @@ namespace FastNoise
 }
 
 template<typename T>
-std::unique_ptr<const MetadataT<T>> CreateMetadataInstance( const char* className )
+static std::unique_ptr<const MetadataT<T>> CreateMetadataInstance( const char* className ) // Heap-allocates a MetadataT<T>, names it className and hands ownership to the returned unique_ptr
 {
     auto* newMetadata = new MetadataT<T>;
     newMetadata->name = className;
+
+    // Node must be in a group or it is not selectable in the UI (debug-only check: assert compiles out under NDEBUG)
+    assert( newMetadata->groups.size() );
     return std::unique_ptr<const MetadataT<T>>( newMetadata );
 }
 
-#if FASTNOISE_USE_SHARED_PTR
-#define FASTNOISE_GET_MEMORY_ALLOCATOR()
-#else
-#define FASTNOISE_GET_MEMORY_ALLOCATOR() , &SmartNodeManager::Allocate
-#endif
-
-#define FASTSIMD_BUILD_CLASS2( CLASS ) \
-const std::unique_ptr<const FastNoise::MetadataT<CLASS>> g ## CLASS ## Metadata = CreateMetadataInstance<CLASS>( #CLASS );\
+#define FASTNOISE_REGISTER_NODE( CLASS ) \
+static const std::unique_ptr<const FastNoise::MetadataT<CLASS>> g ## CLASS ## Metadata = CreateMetadataInstance<CLASS>( #CLASS );\
 template<> FASTNOISE_API const FastNoise::Metadata& FastNoise::Impl::GetMetadata<CLASS>()\
 {\
     return *g ## CLASS ## Metadata;\
@@ -474,14 +636,10 @@ const FastNoise::Metadata& CLASS::GetMetadata() const\
 {\
     return FastNoise::Impl::GetMetadata<CLASS>();\
 }\
-SmartNode<> FastNoise::MetadataT<CLASS>::CreateNode( FastSIMD::eLevel l ) const\
+SmartNode<> FastNoise::MetadataT<CLASS>::CreateNode( FastSIMD::FeatureSet l ) const\
 {\
-    return SmartNode<>( FastSIMD::New<CLASS>( l FASTNOISE_GET_MEMORY_ALLOCATOR() ) );\
+    return SmartNode<>( FastSIMD::NewDispatchClass<CLASS>( l, &SmartNodeManager::Allocate ) );\
 }
 
-#define FASTSIMD_BUILD_CLASS( CLASS ) FASTSIMD_BUILD_CLASS2( CLASS )
-
-#define FASTNOISE_CLASS( CLASS ) CLASS
-
 #define FASTSIMD_INCLUDE_HEADER_ONLY
-#include "FastNoise/FastNoise_BuildList.inl"
\ No newline at end of file
+#include "FastSIMD_Build.inl"
\ No newline at end of file
diff --git a/src/FastNoise/SmartNode.cpp b/src/FastNoise/SmartNode.cpp
index 9f39ba17..5d4cc01d 100644
--- a/src/FastNoise/SmartNode.cpp
+++ b/src/FastNoise/SmartNode.cpp
@@ -1,8 +1,5 @@
-#include <FastNoise/FastNoise_Config.h>
-
-#if !FASTNOISE_USE_SHARED_PTR
-
-#include <FastNoise/SmartNode.h>
+#include <FastNoise/Utility/Config.h>
+#include <FastNoise/Utility/SmartNode.h>
 
 #include <mutex>
 #include <atomic>
@@ -14,333 +11,222 @@
 
 namespace FastNoise
 {
-    union SmartNodeReference
-    {
-        uint64_t u64;
-        struct
-        {
-            uint32_t pool;
-            uint32_t id;
-        } u32;
-    };
-    
-    struct SmartNodeManagerPool
+    class SmartNodeManagerPool
     {
-        static constexpr uint32_t kInvalidSlot = (uint32_t)-1;
+    public:
+        SmartNodeManagerPool( uint32_t size ) : // size = bytes of storage that follow this header object (see GetPool)
+            mAllocState( 0 ), mNextPool( nullptr ), mPoolSize( std::min<uint32_t>( size, INT32_MAX ) ) // State 0 = no live allocs, next free offset 0; size is clamped to INT32_MAX
+        { }
 
-        struct SlotHeader
+        SmartNodeManagerPool( const SmartNodeManagerPool& ) = delete;
+        SmartNodeManagerPool( SmartNodeManagerPool&& ) = delete;
+        
+        bool Contains( const void* ptr ) const // True when ptr lies inside the used part of this pool: [pool, pool + nextFreeIndex)
         {
-            std::atomic<uint32_t> references;
-        };
+            uint8_t* pool = GetPool();
+            uint32_t nextFreeIndex = (uint32_t)( mAllocState.load( std::memory_order_relaxed ) >> 32 ); // High 32 bits = offset of the first unused byte
 
-        struct Slot
-        {
-            uint32_t pos;
-            uint32_t size;            
-        };
+            return ptr >= pool && ptr < pool + nextFreeIndex; // No "- 1": it wrapped to UINT32_MAX when nextFreeIndex == 0, so a never-used pool matched foreign pointers
+        }
 
-        SmartNodeManagerPool( uint32_t size )
+        void* TryAlloc( size_t size, size_t align ) // Carves size bytes aligned to align from the unused tail of the pool; nullptr when they do not fit
         {
-            size = std::min<uint32_t>( size, INT32_MAX );
+            uint8_t* pool = GetPool();
+            uint64_t allocState = mAllocState.load( std::memory_order_relaxed );
+            uint64_t newAllocState;
+            void* startSlot;
 
-            uint32_t alignOffset = size % alignof( SlotHeader );
-            if( alignOffset )
+            do
             {
-                // pool size needs to be multiple of `alignof( SlotHeader )` (likely 4)
-                size += alignof( SlotHeader ) - alignOffset;
-            }
+                uint32_t activeAllocs = (uint32_t)allocState; // Low 32 bits: number of live allocations
+                uint32_t nextFreeIndex = (uint32_t)(allocState >> 32); // High 32 bits: offset of the first unused byte
 
-            poolSize = size;
-            pool = (uint8_t*)new SlotHeader[size / sizeof( SlotHeader )];
+                // Reset pool counter if there are no allocs: an empty pool is reused from offset 0
+                startSlot    = activeAllocs ? pool      + nextFreeIndex : pool;
+                size_t space = activeAllocs ? mPoolSize - nextFreeIndex : mPoolSize;
 
-            freeSlots = { { 0, poolSize } };
-        }
+                if( !std::align( align, size, startSlot, space ) ) // std::align moves startSlot up to the alignment, or returns nullptr if size no longer fits in space
+                {
+                    return nullptr;
+                }
 
-        SmartNodeManagerPool( const SmartNodeManagerPool& ) = delete;
-        SmartNodeManagerPool( SmartNodeManagerPool&& ) = delete;
+                nextFreeIndex = static_cast<uint32_t>( ( (uint8_t*)startSlot + size ) - pool );
+                activeAllocs++;
 
-        ~SmartNodeManagerPool()
-        {
-            assert( usedSlots.empty() );
+                newAllocState = (uint64_t)activeAllocs | ( (uint64_t)nextFreeIndex << 32 );
+                // If the exchange below fails, allocState is refreshed with the current value and the loop recomputes from it
+            } while( !mAllocState.compare_exchange_weak( allocState, newAllocState, std::memory_order_relaxed ) );
 
-            delete[] pool;
+            return startSlot;
         }
 
-        auto GetUsedSlotItr( const void* ptr ) const
+        int32_t Free( const void* ptr ) // Returns the pool's remaining live alloc count after the release, or -1 if ptr is not in this pool
         {
-            if( ptr > pool && ptr < pool + poolSize )
+            if( Contains( ptr ) )
             {
-                for( auto itr = usedSlots.begin(); itr != usedSlots.end(); ++itr )
-                {
-                    const uint8_t* start = pool + itr->pos;
+                uint64_t allocState = mAllocState.fetch_sub( 1, std::memory_order_relaxed ); // Pre-decrement state. NOTE(review): relaxed ordering - confirm the freed object's writes are ordered before TryAlloc reuses the memory
 
-                    if( start < ptr && start + itr->size > ptr )
-                    {
-                        return itr;
-                    }
-                }
+                assert( (uint32_t)allocState != 0 ); // Low 32 bits are the live alloc count; zero here means a free without a matching alloc
+                return (int32_t)allocState - 1;
             }
 
-            return usedSlots.end();
+            return -1;
         }
 
-        auto GetUsedSlotItr( uint32_t pos ) const
+        int32_t AllocCount() const // Number of live allocations in this pool
         {
-            return std::find_if( usedSlots.begin(), usedSlots.end(), [pos]( const Slot& slot ) 
-            {
-                return slot.pos == pos;    
-            } );
+            return (int32_t)mAllocState.load( std::memory_order_relaxed ); // The cast keeps only the low 32 bits, which hold the count
         }
-        
-        bool ValidatePtr( uint32_t pos, const void* ptr ) const
-        {            
-            if( pos >= poolSize )
-            {
-                assert( 0 );
-                return false;
-            }
 
-            auto slot = GetUsedSlotItr( ptr );
+        bool MarkForRemoval() // Flags an empty pool as full; returns false if it has live allocs or its state changed before the exchange
+        {
+            uint64_t allocState = mAllocState.load( std::memory_order_relaxed );
 
-            // Check pos pointing at garbage data
-            if( slot == usedSlots.end() )
+            if( (uint32_t)allocState != 0 ) // Low 32 bits = live alloc count
             {
-                assert( 0 );
                 return false;
             }
 
-            // Check pos is correct
-            if( slot->pos != pos )
-            {
-                assert( 0 );
-                return false;
-            }
-            return true;
+            uint64_t newAllocState = ( (uint64_t)mPoolSize << 32 ) + 1; // Set as full: next free offset = pool size, alloc count = 1, so TryAlloc sees zero space left
+
+            return mAllocState.compare_exchange_strong( allocState, newAllocState, std::memory_order_relaxed ); // Succeeds only if the state is still the one read above
         }
 
-        std::atomic<uint32_t>& GetReferenceCount( uint32_t pos ) const
+        uint8_t* GetPool() const // Start of the pool's storage
         {
-            SlotHeader* slot = (SlotHeader*)( pool + pos );
-
-            assert( pos < poolSize );
-
-            return slot->references;
+            return (uint8_t*)this + sizeof( SmartNodeManagerPool ); // Storage begins directly after this header object
         }
 
-        uint32_t GetReferenceId( const void* ptr ) const
-        {
-            auto slot = GetUsedSlotItr( ptr );
+        std::atomic<uint64_t> mAllocState;
+        std::atomic<SmartNodeManagerPool*> mNextPool;
+        uint32_t mPoolSize;
+    };
+    
+    class SmartNodeMemoryAllocator
+    {
+    public:
+        static inline uint32_t sNewPoolSize = 64 * 1024;
 
-            if( slot == usedSlots.end() )
+        void* Alloc( size_t size, size_t align ) // Allocates from an existing pool, creating a new pool when none has room; nullptr on failure
+        {
+            if( void* ptr = AllocFromPools( size, align ) ) // Fast path: no lock taken
             {
-                return UINT32_MAX;
+                return ptr;
             }
 
-            return slot->pos;
-        }
-
-        void* TryAlloc( size_t size, size_t align )
-        {
-            align = std::max( align, alignof( SlotHeader ) );
+            std::lock_guard lock( mMutex ); // Slow path: pool creation is serialised by mMutex
 
-            for( uint32_t idx = 0; idx < freeSlots.size(); idx++ )
+            if( void* ptr = AllocFromPools( size, align ) ) // Retry under the lock in case another thread added a pool in the meantime
             {
-                if( freeSlots[idx].size < size + sizeof( SlotHeader ) )
-                {
-                    continue;
-                }
-
-                uint8_t* startSlot = pool + freeSlots[idx].pos;
-                void* ptr = startSlot + sizeof( SlotHeader );
-                size_t space = freeSlots[idx].size - sizeof( SlotHeader );
-
-                if( std::align( align, size, ptr, space ) )
-                {                   
-                    uint8_t* endSlot = (uint8_t*)ptr + size;
-
-                    // Align next slot correctly for SlotHeader
-                    size_t alignmentOffset = (size_t)endSlot % alignof( SlotHeader );
-
-                    if( alignmentOffset )
-                    {
-                        endSlot += alignof( SlotHeader ) - alignmentOffset;
-                    }
+                return ptr;
+            }
+            const uint32_t allocSize = std::max( (uint32_t)sizeof( SmartNodeManagerPool ), sNewPoolSize ); // Read once: the malloc size and the pool's usable size must agree and never underflow
+            if( void* poolAlloc = std::malloc( allocSize ) ) // One block holds the pool header followed by its storage
+            {
+                SmartNodeManagerPool* newPool = new( poolAlloc ) SmartNodeManagerPool( allocSize - (uint32_t)sizeof( SmartNodeManagerPool ) );
 
-                    uint32_t slotSize = (uint32_t)( endSlot - startSlot );
+                void* alloc = newPool->TryAlloc( size, align );
+                assert( alloc ); // Alloc too large to fit in empty pool, increase pool size
 
-                    assert( freeSlots[idx].size >= slotSize );
-                    
-                    new( startSlot ) SlotHeader { 0u };
-                    usedSlots.emplace_back( Slot{ freeSlots[idx].pos, slotSize } );
+                if( mPools )
+                {
+                    SmartNodeManagerPool* pool = mPools;
 
-                    // Check if remaining free slot is empty
-                    if( freeSlots[idx].size <= slotSize )
+                    while( SmartNodeManagerPool* nextPool = pool->mNextPool.load( std::memory_order_relaxed ) ) // Walk to the tail of the pool list
                     {
-                        assert( freeSlots[idx].size == slotSize );
-                        freeSlots.erase( freeSlots.cbegin() + idx );
-                        return ptr;
-                    }
+                        pool = nextPool;
+                    }
 
-                    freeSlots[idx].pos += slotSize;
-                    freeSlots[idx].size -= slotSize;
-
-                    return ptr;
+                    pool->mNextPool.store( newPool, std::memory_order_release ); // Publish the constructed pool at the tail of the list
+                }
+                else
+                {
+                    mPools = newPool;
                 }
-            }
 
-            assert( freeSlots.empty() || freeSlots[0].size != poolSize ); // Empty pool not large enough to fit alloc, increase the pool size
+                return alloc;
+            }
+
             return nullptr;
         }
 
-        void DeAlloc( uint32_t pos )
+        void Free( const void* ptr ) // Releases ptr in whichever pool contains it; asserts if no pool does
         {
-            SlotHeader* slotHeader = (SlotHeader*)( pool + pos );
-            auto slot = GetUsedSlotItr( pos );
-
-            assert( slot != usedSlots.end() );            
-            assert( slotHeader->references == 0 );
-            assert( slot->size < poolSize );
+            SmartNodeManagerPool* pool = mPools;
 
-            // Merge free slots as necessary
-            Slot* expandedBefore = nullptr;
-            uint32_t idx = 0;
-
-            for( ; idx < freeSlots.size(); idx++ )
+            while( pool )
             {
-                if( freeSlots[idx].pos > pos )
-                {
-                    break;
-                }
-
-                // Found slot before, expand
-                if( freeSlots[idx].pos + freeSlots[idx].size == pos )
-                {
-                    freeSlots[idx].size += slot->size;
-                    expandedBefore = &freeSlots[idx];
-                    idx++;
-                    break;
-                }
-            }
+                int32_t allocCount = pool->Free( ptr ); // -1 when ptr is not in this pool, otherwise the pool's remaining live alloc count
 
-            if( idx < freeSlots.size() && freeSlots[idx].pos == pos + slot->size )
-            {
-                // Found slot before and after, expand before again, delete after
-                if( expandedBefore )
-                {
-                    expandedBefore->size += freeSlots[idx].size;
-                    freeSlots.erase( freeSlots.begin() + idx );
-                }
-                else // Found slot after, expand
+                if( allocCount >= 0 )
                 {
-                    freeSlots[idx].pos = pos;
-                    freeSlots[idx].size += slot->size;
+                    if( allocCount == 0 ) // The owning pool just became empty
+                    {
+                        RemoveEmptyPool();
+                    }
+                    return;
                 }
-            }
-            else if( !expandedBefore ) // No slots before or after, create new
-            {
-                freeSlots.emplace( freeSlots.begin() + idx, Slot { pos, slot->size } );
-            }
-            
-            slotHeader->~SlotHeader();
-            assert( memset( slotHeader, 255, slot->size ) );
 
-            usedSlots.erase( slot );
-        }
-
-        uint32_t poolSize;
-        uint8_t* pool;
-        std::vector<Slot> freeSlots;
-        std::vector<Slot> usedSlots;
-    };
-    
-    class SmartNodeMemoryAllocator
-    {
-    public:
-        static inline uint32_t sNewPoolSize = 256 * 1024;
-
-        bool ValidatePtr( SmartNodeReference ref, const void* ptr )
-        {
-            std::lock_guard lock( mMutex );
-
-            if( ref.u32.pool >= mPools.size() )
-            {
-                assert( 0 );
-                return false;
+                pool = pool->mNextPool; // Not in this pool, try the next one in the list
             }
 
-            return std::next( mPools.begin(), ref.u32.pool )->ValidatePtr( ref.u32.id, ptr );
+            assert( 0 ); // Pointer not in any of the pools
         }
-
-        std::atomic<uint32_t>& GetReferenceCount( SmartNodeReference ref ) const
-        {
-            std::lock_guard lock( mMutex );
-
-            return std::next( mPools.begin(), ref.u32.pool )->GetReferenceCount( ref.u32.id );
-        }
-
-        SmartNodeReference GetReference( const void* ptr )
+        
+    private:
+        void* AllocFromPools( size_t size, size_t align ) // Tries each pool in list order; returns the first successful allocation, or nullptr if none has room
         {
-            std::lock_guard lock( mMutex );
-
-            SmartNodeReference ref = { 0 };
+            SmartNodeManagerPool* pool = mPools; // Does not lock mMutex itself; Alloc calls this both with and without the lock held
 
-            for( auto& poolItr : mPools )
+            while( pool )
             {
-                ref.u32.id = poolItr.GetReferenceId( ptr );
-                if( ref.u32.id != UINT32_MAX )
+                if( void* ptr = pool->TryAlloc( size, align ) )
                 {
-                    return ref;
+                    return ptr;
                 }
 
-                ref.u32.pool++;
+                pool = pool->mNextPool;
             }
-
-            // Could not find ptr in pools, probably not allocated using this class
-            assert( 0 );
-            return { SmartNodeManager::kInvalidReferenceId };
+            return nullptr;
         }
 
-        void* Alloc( size_t size, size_t align ) 
+        void RemoveEmptyPool() // Frees at most one empty pool, and only when an earlier pool in the list is also empty; the head pool is never the one removed
         {
-            std::lock_guard lock( mMutex );
+            SmartNodeManagerPool* pool = mPools;
+            SmartNodeManagerPool* emptyPool = mPools->AllocCount() > 0 ? nullptr : mPools; // An empty head pool counts as the first empty pool
 
-            if( void* ptr = AllocFromPools( size, align ) )
+            while( SmartNodeManagerPool* nextPool = pool->mNextPool.load( std::memory_order_relaxed ) ) // NOTE(review): the list is walked without mMutex - confirm a pool cannot be freed under a concurrent traversal
             {
-                return ptr;
-            }
+                int32_t allocCount = nextPool->AllocCount();
 
-            mPools.emplace_back( sNewPoolSize );
+                if( allocCount == 0 )
+                {
+                    if( emptyPool ) // Only remove a pool if we have 2 empty pools
+                    {
+                        std::lock_guard lock( mMutex ); // Unlinking and freeing happen under mMutex
 
-            return AllocFromPools( size, align );
-        }
+                        SmartNodeManagerPool* toRemove = nextPool;
 
-        void Dealloc( SmartNodeReference ref )
-        {
-            std::lock_guard lock( mMutex );
+                        if( toRemove->MarkForRemoval() ) // Fails if the pool gained an alloc after AllocCount() was read
+                        {
+                            pool->mNextPool.store( toRemove->mNextPool.load( std::memory_order_relaxed ) ); // Unlink: the predecessor now points past toRemove
 
-            std::next( mPools.begin(), ref.u32.pool )->DeAlloc( ref.u32.id );
-        }
-        
-    private:
-        void* AllocFromPools( size_t size, size_t align )
-        {
-            uint32_t idx = 0;            
+                            toRemove->~SmartNodeManagerPool(); // Explicit destructor call, then std::free of the raw block below
 
-            for( auto& poolItr : mPools )
-            {
-                if( void* ptr = poolItr.TryAlloc( size, align ) )
-                {
-                    return ptr;
+                            std::free( toRemove );
+                        }
+
+                        return;
+                    }
+
+                    emptyPool = nextPool; // Remember the first empty pool; a later empty one can then be removed
                 }
 
-                idx++;
+                pool = nextPool;
             }
-            return nullptr;
         }
-
-        // std::list is used to allow lock free reads to pools
-        // In most use cases there should only be 1 pool so performance is not a concern
-        std::list<SmartNodeManagerPool> mPools;
+        
+        SmartNodeManagerPool* mPools = nullptr;
         mutable std::mutex mMutex;
     };
 
@@ -351,51 +237,13 @@ namespace FastNoise
         SmartNodeMemoryAllocator::sNewPoolSize = size;
     }
 
-    uint64_t SmartNodeManager::GetReference( const void* ptr )
-    {
-        assert( ptr );
-
-        return gMemoryAllocator.GetReference( ptr ).u64;
-    }
-
-    void SmartNodeManager::IncReference( uint64_t id )
-    {
-        assert( id != kInvalidReferenceId );
-
-        std::atomic<uint32_t>& refCount = gMemoryAllocator.GetReferenceCount( { id } );
-
-        ++refCount;
-    }
-
-    void SmartNodeManager::DecReference( uint64_t id, void* ptr, void ( *destructorFunc )( void* ) )
-    {
-        assert( gMemoryAllocator.ValidatePtr( { id }, ptr ) );
-
-        std::atomic<uint32_t>& refCount = gMemoryAllocator.GetReferenceCount( { id } );    
-
-        uint32_t previousRefCount = refCount.fetch_sub( 1 );
-
-        assert( previousRefCount );
-
-        if( previousRefCount == 1 )
-        {
-            destructorFunc( ptr );
-
-            gMemoryAllocator.Dealloc( { id } );
-        }
-    }
-
-    uint32_t SmartNodeManager::ReferenceCount( uint64_t id )
+    void* SmartNodeManager::Allocate( size_t size, size_t align ) // Public entry point: forwards to the global pool allocator
     {
-        assert( id != kInvalidReferenceId );
-        
-        return gMemoryAllocator.GetReferenceCount( { id } );
+        return gMemoryAllocator.Alloc( size, align ); // May be nullptr when no pool could satisfy the request
     }
 
-    void* SmartNodeManager::Allocate( size_t size, size_t align )
+    void SmartNodeManager::Free( const void* ptr ) // Public entry point: returns ptr to the global pool allocator
     {
-        return gMemoryAllocator.Alloc( size, align );
+        gMemoryAllocator.Free( ptr ); // Asserts in debug builds when ptr is not inside any pool
     }
 } // namespace FastNoise
-
-#endif
\ No newline at end of file
diff --git a/src/FastSIMD/Example/Example.h b/src/FastSIMD/Example/Example.h
deleted file mode 100644
index f64ed155..00000000
--- a/src/FastSIMD/Example/Example.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "FS_Class.inl"
-#ifdef FASTSIMD_INCLUDE_CHECK
-#include __FILE__
-#endif
-#include "FS_Class.inl"
-#pragma once
-
-FASTSIMD_CLASS_DECLARATION( Example )
-{
-    FASTSIMD_CLASS_SETUP( FastSIMD::Level_AVX2 | FastSIMD::Level_SSE41 | FastSIMD::Level_SSE2 | FastSIMD::Level_Scalar );
-
-public:
-
-    FS_EXTERNAL_FUNC( void DoStuff( int* data ) );
-
-    FS_EXTERNAL_FUNC( void DoArray( int* data0, int* data1, int size ) );
-};
diff --git a/src/FastSIMD/Example/Example.inl b/src/FastSIMD/Example/Example.inl
deleted file mode 100644
index c640a018..00000000
--- a/src/FastSIMD/Example/Example.inl
+++ /dev/null
@@ -1,125 +0,0 @@
-#define FASTSIMD_INTELLISENSE
-#include "Example.h"
-
-//template<typename T>// Generic function, used if no specialised function found
-//FS_CLASS( Example ) < T, FS_SIMD_CLASS::SIMD_Level >::FS_CLASS( Example )()
-//{
-//    int test = 1;
-//
-//    test += test;
-//}
-
-template<typename F, FastSIMD::ELevel S> // Generic function, used if no specialised function found
-void FS_CLASS( Example )<F, S>::DoStuff( int* data )
-{
-    int32v a = int32v( 1 );
-
-    FS_Store_i32( data, a );
-}
-
-//template<typename CLASS_T, typename SIMD_T> // Different function for level SSE2 or AVX2
-//void FS_CLASS( Example )::DoStuff( int* data )
-//{
-//    int32v a = _mm_loadu_si128( reinterpret_cast<__m128i const*>(data) );
-//
-//    a += _mm_set_epi32( 2, 3, 4, 5 );
-//
-//    a -= _mm_castps_si128( FS_VecZero_f32( ) );
-//
-//    FS_Store_i32( data, a );
-//}
-//
-//
-//template<typename CLASS_T, FastSIMD::Level LEVEL_T>
-//void FS_CLASS( Example )::DoArray( int* data0, int* data1, int size )
-//{
-//    for ( int i = 0; i < size; i += FS_VectorSize_i32() )
-//    {
-//        int32v a = FS_Load_i32( &data0[i] );
-//        int32v b = FS_Load_i32( &data1[i] );
-//        
-//        a *= b;
-//
-//        a <<= 1;
-//
-//        a -= FS_VecZero_i32();
-//
-//        (~a);
-//
-//        FS_Store_i32( &data0[i], a );
-//    }
-//}
-
-template<typename F, FastSIMD::ELevel S>
-void FS_CLASS( Example )<F, S>::DoArray( int* data0, int* data1, int size )
-{
-    for ( size_t i = 0; i < size; i += int32v::FS_Size() )
-    {
-        int32v a = FS_Load_i32( &data0[i] );
-        int32v b = FS_Load_i32( &data1[i] );
-
-        a += b;
-
-        a <<= 1;
-
-        a *= b;
-
-        a -= int32v::FS_Zero();
-
-        (~a);
-
-        FS_Store_i32( &data0[i], a );
-    }
-}
-
-template<typename T_FS>
-class FS_CLASS( Example )<T_FS, FastSIMD::Level_AVX2> : public FS_CLASS( Example )<T_FS, FastSIMD::Level_Null>
-{
-    //typedef FastSIMD_AVX2 T_FS;
-    FASTSIMD_CLASS_SETUP( FastSIMD::COMPILED_SIMD_LEVELS );
-
-public:
-    void DoArray( int* data0, int* data1, int size )
-    {
-        for ( size_t i = 0; i < size; i += int32v::FS_Size() )
-        {
-            int32v a = FS_Load_i32( &data0[i] );
-            int32v b = FS_Load_i32( &data1[i] );
-
-            //a += gfhfdghdfgh();
-
-            a += b;
-
-            a <<= 2;
-
-            a *= b;
-
-            a -= int32v::FS_Zero();
-
-            (~a);
-
-            FS_Store_i32( &data0[i], a );
-        }
-    }
-};
-
-//
-//template<typename T>
-//typename std::enable_if<(T::SIMD_Level <= 1)>::type FS_CLASS( Example )<T, FS_SIMD_CLASS::SIMD_Level>::DoArray( int* data0, int* data1, int size )
-//{
-//    for ( int i = 0; i < size; i += FS_VectorSize_i32() )
-//    {
-//        int32v a = FS_Load_i32( &data0[i] );
-//        int32v b = FS_Load_i32( &data1[i] );
-//
-//        a += b;
-//
-//        a <<= 1;
-//
-//        a -= FS_VecZero_i32();
-//
-//        (~a);
-//
-//        FS_Store_i32( &data0[i], a );
-//    }
-//}
diff --git a/src/FastSIMD/FastSIMD.cpp b/src/FastSIMD/FastSIMD.cpp
deleted file mode 100644
index eee3821e..00000000
--- a/src/FastSIMD/FastSIMD.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#include <algorithm>
-#include <cstdint>
-
-#if FASTSIMD_x86
-
-#ifdef __GNUG__
-#include <x86intrin.h>
-#else
-#include <intrin.h>
-#endif
-
-#endif
-
-
-#include "FastSIMD/SIMDTypeList.h"
-
-static_assert(FastSIMD::SIMDTypeList::MinimumCompiled & FastSIMD::COMPILED_SIMD_LEVELS, "FASTSIMD_FALLBACK_SIMD_LEVEL is not a compiled SIMD level, check FastSIMD_Config.h");
-
-#if FASTSIMD_x86
-// Define interface to cpuid instruction.
-// input:  eax = functionnumber, ecx = 0
-// output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3]
-static void cpuid( int output[4], int functionnumber )
-{
-#if defined( __GNUC__ ) || defined( __clang__ )              // use inline assembly, Gnu/AT&T syntax
-
-    int a, b, c, d;
-    __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(0) : );
-    output[0] = a;
-    output[1] = b;
-    output[2] = c;
-    output[3] = d;
-
-#elif defined( _MSC_VER ) || defined ( __INTEL_COMPILER )     // Microsoft or Intel compiler, intrin.h included
-
-    __cpuidex( output, functionnumber, 0 ); // intrinsic function for CPUID
-
-#else                                                      // unknown platform. try inline assembly with masm/intel syntax
-
-    __asm
-    {
-        mov eax, functionnumber
-        xor ecx, ecx
-        cpuid;
-        mov esi, output
-            mov[esi], eax
-            mov[esi + 4], ebx
-            mov[esi + 8], ecx
-            mov[esi + 12], edx
-    }
-
-#endif
-}
-
-// Define interface to xgetbv instruction
-static int64_t xgetbv( int ctr )
-{
-#if (defined( _MSC_FULL_VER ) && _MSC_FULL_VER >= 160040000) || (defined( __INTEL_COMPILER ) && __INTEL_COMPILER >= 1200) // Microsoft or Intel compiler supporting _xgetbv intrinsic
-
-    return _xgetbv( ctr ); // intrinsic function for XGETBV
-
-#elif defined( __GNUC__ )                                    // use inline assembly, Gnu/AT&T syntax
-
-    uint32_t a, d;
-    __asm("xgetbv" : "=a"(a), "=d"(d) : "c"(ctr) : );
-    return a | (uint64_t( d ) << 32);
-
-#else  // #elif defined (_WIN32)                           // other compiler. try inline assembly with masm/intel/MS syntax
-
-    uint32_t a, d;
-    __asm {
-        mov ecx, ctr
-        _emit 0x0f
-        _emit 0x01
-        _emit 0xd0; // xgetbv
-        mov a, eax
-            mov d, edx
-    }
-    return a | (uint64_t( d ) << 32);
-
-#endif
-}
-#endif
-
-FASTSIMD_API FastSIMD::eLevel FastSIMD::CPUMaxSIMDLevel()
-{
-    static eLevel simdLevel = Level_Null;
-
-    if ( simdLevel > Level_Null )
-    {
-        return simdLevel;
-    }
-
-#if FASTSIMD_x86
-    int abcd[4] = { 0,0,0,0 }; // cpuid results
-
-#if !FASTSIMD_64BIT
-    simdLevel = Level_Scalar; // default value
-
-    cpuid( abcd, 0 ); // call cpuid function 0
-    if ( abcd[0] == 0 )
-        return simdLevel; // no further cpuid function supported
-
-    cpuid( abcd, 1 ); // call cpuid function 1 for feature flags
-    if ( (abcd[3] & (1 << 0)) == 0 )
-        return simdLevel; // no floating point
-    if ( (abcd[3] & (1 << 23)) == 0 )
-        return simdLevel; // no MMX
-    if ( (abcd[3] & (1 << 15)) == 0 )
-        return simdLevel; // no conditional move
-    if ( (abcd[3] & (1 << 24)) == 0 )
-        return simdLevel; // no FXSAVE
-    if ( (abcd[3] & (1 << 25)) == 0 )
-        return simdLevel; // no SSE
-    simdLevel = Level_SSE;
-    // 1: SSE supported
-
-    if ( (abcd[3] & (1 << 26)) == 0 )
-        return simdLevel; // no SSE2
-#else
-    cpuid( abcd, 1 ); // call cpuid function 1 for feature flags
-#endif
-
-    simdLevel = Level_SSE2; // default value for 64bit
-    // 2: SSE2 supported
-
-    if ( (abcd[2] & (1 << 0)) == 0 )
-        return simdLevel; // no SSE3
-    simdLevel = Level_SSE3;
-    // 3: SSE3 supported
-
-    if ( (abcd[2] & (1 << 9)) == 0 )
-        return simdLevel; // no SSSE3
-    simdLevel = Level_SSSE3;
-    // 4: SSSE3 supported
-
-    if ( (abcd[2] & (1 << 19)) == 0 )
-        return simdLevel; // no SSE4.1
-    simdLevel = Level_SSE41;
-    // 5: SSE4.1 supported
-
-    if ( (abcd[2] & (1 << 23)) == 0 )
-        return simdLevel; // no POPCNT
-    if ( (abcd[2] & (1 << 20)) == 0 )
-        return simdLevel; // no SSE4.2
-    simdLevel = Level_SSE42;
-    // 6: SSE4.2 supported
-
-    if ( (abcd[2] & (1 << 26)) == 0 )
-        return simdLevel; // no XSAVE
-    if ( (abcd[2] & (1 << 27)) == 0 )
-        return simdLevel; // no OSXSAVE
-    if ( (abcd[2] & (1 << 28)) == 0 )
-        return simdLevel; // no AVX
-
-    uint64_t osbv = xgetbv( 0 );
-    if ( (osbv & 6) != 6 )
-        return simdLevel; // AVX not enabled in O.S.
-    simdLevel = Level_AVX;
-    // 7: AVX supported
-
-    cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags
-    if ( (abcd[1] & (1 << 5)) == 0 )
-        return simdLevel; // no AVX2
-    simdLevel = Level_AVX2;
-    // 8: AVX2 supported
-
-    if( (osbv & (0xE0)) != 0xE0 )
-        return simdLevel; // AVX512 not enabled in O.S.
-    if ( (abcd[1] & (1 << 16)) == 0 )
-        return simdLevel; // no AVX512
-    cpuid( abcd, 0xD ); // call cpuid leaf 0xD for feature flags
-    if ( (abcd[0] & 0x60) != 0x60 )
-        return simdLevel; // no AVX512
-    // 9: AVX512 supported
-
-    cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags
-    if ( (abcd[1] & (1 << 31)) == 0 )
-        return simdLevel; // no AVX512VL
-    // 10: AVX512VL supported
-
-    if ( (abcd[1] & 0x40020000) != 0x40020000 )
-        return simdLevel; // no AVX512BW, AVX512DQ
-    simdLevel = Level_AVX512;
-    // 11: AVX512BW & AVX512DQ supported
-#endif
-
-#if FASTSIMD_ARM
-    simdLevel = Level_NEON;
-#endif
-
-    return simdLevel;
-}
-
-template<typename CLASS_T, FastSIMD::eLevel SIMD_LEVEL>
-CLASS_T* SIMDLevelSelector( FastSIMD::eLevel maxSIMDLevel, FastSIMD::MemoryAllocator allocator )
-{
-    if constexpr( ( CLASS_T::Supported_SIMD_Levels & SIMD_LEVEL ) != 0 )
-    {
-        CLASS_T* newClass = SIMDLevelSelector<CLASS_T, FastSIMD::SIMDTypeList::GetNextCompiledAfter<SIMD_LEVEL>>( maxSIMDLevel, allocator );
-
-        if( !newClass && SIMD_LEVEL <= maxSIMDLevel )
-        {
-            return FastSIMD::ClassFactory<CLASS_T, SIMD_LEVEL>( allocator );
-        }
-
-        return newClass;
-    }
-    else
-    {
-        if constexpr( SIMD_LEVEL == FastSIMD::Level_Null )
-        {
-            return nullptr;
-        }
-
-        return SIMDLevelSelector<CLASS_T, FastSIMD::SIMDTypeList::GetNextCompiledAfter<SIMD_LEVEL>>( maxSIMDLevel, allocator );        
-    }
-}
-
-template<typename CLASS_T>
-CLASS_T* FastSIMD::New( eLevel maxSIMDLevel, FastSIMD::MemoryAllocator allocator )
-{
-    if( maxSIMDLevel == Level_Null )
-    {
-        maxSIMDLevel = CPUMaxSIMDLevel();
-    }
-    else
-    {
-        maxSIMDLevel = std::min( maxSIMDLevel, CPUMaxSIMDLevel() );        
-    }
-
-    static_assert(( CLASS_T::Supported_SIMD_Levels & FastSIMD::SIMDTypeList::MinimumCompiled ), "MinimumCompiled SIMD Level must be supported by this class" );
-    return SIMDLevelSelector<CLASS_T, SIMDTypeList::MinimumCompiled>( maxSIMDLevel, allocator );
-}
-
-#define FASTSIMD_BUILD_CLASS( CLASS ) \
-template FASTSIMD_API CLASS* FastSIMD::New( FastSIMD::eLevel, FastSIMD::MemoryAllocator );
-
-#define FASTSIMD_INCLUDE_HEADER_ONLY
-#include "FastSIMD_BuildList.inl"
diff --git a/src/FastSIMD/FastSIMD_BuildList.inl b/src/FastSIMD/FastSIMD_BuildList.inl
deleted file mode 100644
index 8e65ff25..00000000
--- a/src/FastSIMD/FastSIMD_BuildList.inl
+++ /dev/null
@@ -1,10 +0,0 @@
-#pragma once
-
-#ifndef FASTSIMD_BUILD_CLASS
-#error Do not include this file
-#endif
-
-//#include "Example/Example.inl"
-//FASTSIMD_BUILD_CLASS( Example )
-
-#include "FastNoise/FastNoise_BuildList.inl"
\ No newline at end of file
diff --git a/src/FastSIMD/FastSIMD_Level_AVX2.cpp b/src/FastSIMD/FastSIMD_Level_AVX2.cpp
deleted file mode 100644
index c8ae3ed8..00000000
--- a/src/FastSIMD/FastSIMD_Level_AVX2.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#if FASTSIMD_COMPILE_AVX2
-
-// To compile AVX2 support enable AVX(2) code generation compiler flags for this file
-#ifndef __AVX__
-#ifdef _MSC_VER
-#error To compile AVX set C++ code generation to use /arch:AVX on FastSIMD_Level_AVX2.cpp, or change "#define FASTSIMD_COMPILE_AVX2" in FastSIMD_Config.h
-#else
-#error To compile AVX add build command "-march=core-avx" on FastSIMD_Level_AVX2.cpp, or change "#define FASTSIMD_COMPILE_AVX2" in FastSIMD_Config.h
-#endif
-#endif
-
-#include "Internal/AVX.h"
-#define FS_SIMD_CLASS FastSIMD::AVX2
-#include "Internal/SourceBuilder.inl"
-#endif
\ No newline at end of file
diff --git a/src/FastSIMD/FastSIMD_Level_AVX512.cpp b/src/FastSIMD/FastSIMD_Level_AVX512.cpp
deleted file mode 100644
index 1472d656..00000000
--- a/src/FastSIMD/FastSIMD_Level_AVX512.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#if FASTSIMD_COMPILE_AVX512 
-
-// To compile AVX512 support enable AVX512 code generation compiler flags for this file
-#ifndef __AVX512DQ__ 
-#ifdef _MSC_VER
-#error To compile AVX512 set C++ code generation to use /arch:AVX512 on FastSIMD_Level_AVX512.cpp, or change "#define FASTSIMD_COMPILE_AVX512" in FastSIMD_Config.h
-#else
-#error To compile AVX512 add build command "-mavx512f -mavx512dq" on FastSIMD_Level_AVX512.cpp, or change "#define FASTSIMD_COMPILE_AVX512" in FastSIMD_Config.h
-#endif
-#endif
-
-#include "Internal/AVX512.h"
-#define FS_SIMD_CLASS FastSIMD::AVX512
-#include "Internal/SourceBuilder.inl"
-#endif
\ No newline at end of file
diff --git a/src/FastSIMD/FastSIMD_Level_NEON.cpp b/src/FastSIMD/FastSIMD_Level_NEON.cpp
deleted file mode 100644
index e804ace1..00000000
--- a/src/FastSIMD/FastSIMD_Level_NEON.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#if FASTSIMD_COMPILE_NEON
-#include "Internal/NEON.h"
-#define FS_SIMD_CLASS FastSIMD::NEON
-#include "Internal/SourceBuilder.inl"
-#endif
\ No newline at end of file
diff --git a/src/FastSIMD/FastSIMD_Level_SSE2.cpp b/src/FastSIMD/FastSIMD_Level_SSE2.cpp
deleted file mode 100644
index a36c4f66..00000000
--- a/src/FastSIMD/FastSIMD_Level_SSE2.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#if FASTSIMD_COMPILE_SSE2
-#include "Internal/SSE.h"
-#define FS_SIMD_CLASS FastSIMD::SSE2
-#include "Internal/SourceBuilder.inl"
-#endif
diff --git a/src/FastSIMD/FastSIMD_Level_SSE3.cpp b/src/FastSIMD/FastSIMD_Level_SSE3.cpp
deleted file mode 100644
index a633767d..00000000
--- a/src/FastSIMD/FastSIMD_Level_SSE3.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#if FASTSIMD_COMPILE_SSE3
-#include "Internal/SSE.h"
-#define FS_SIMD_CLASS FastSIMD::SSE3
-#include "Internal/SourceBuilder.inl"
-#endif
diff --git a/src/FastSIMD/FastSIMD_Level_SSE41.cpp b/src/FastSIMD/FastSIMD_Level_SSE41.cpp
deleted file mode 100644
index b33ba482..00000000
--- a/src/FastSIMD/FastSIMD_Level_SSE41.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#if FASTSIMD_COMPILE_SSE41
-#include "Internal/SSE.h"
-#define FS_SIMD_CLASS FastSIMD::SSE41
-#include "Internal/SourceBuilder.inl"
-#endif
diff --git a/src/FastSIMD/FastSIMD_Level_SSE42.cpp b/src/FastSIMD/FastSIMD_Level_SSE42.cpp
deleted file mode 100644
index 140065e0..00000000
--- a/src/FastSIMD/FastSIMD_Level_SSE42.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#if FASTSIMD_COMPILE_SSE42
-#include "Internal/SSE.h"
-#define FS_SIMD_CLASS FastSIMD::SSE42
-#include "Internal/SourceBuilder.inl"
-#endif
diff --git a/src/FastSIMD/FastSIMD_Level_SSSE3.cpp b/src/FastSIMD/FastSIMD_Level_SSSE3.cpp
deleted file mode 100644
index f91de069..00000000
--- a/src/FastSIMD/FastSIMD_Level_SSSE3.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#if FASTSIMD_COMPILE_SSSE3
-#include "Internal/SSE.h"
-#define FS_SIMD_CLASS FastSIMD::SSSE3
-#include "Internal/SourceBuilder.inl"
-#endif
diff --git a/src/FastSIMD/FastSIMD_Level_Scalar.cpp b/src/FastSIMD/FastSIMD_Level_Scalar.cpp
deleted file mode 100644
index 87aff72a..00000000
--- a/src/FastSIMD/FastSIMD_Level_Scalar.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "FastSIMD/FastSIMD.h"
-
-#if FASTSIMD_COMPILE_SCALAR
-#include "Internal/Scalar.h"
-#define FS_SIMD_CLASS FastSIMD::Scalar
-#include "Internal/SourceBuilder.inl"
-#endif
\ No newline at end of file
diff --git a/src/FastSIMD/Internal/AVX.h b/src/FastSIMD/Internal/AVX.h
deleted file mode 100644
index b46375cd..00000000
--- a/src/FastSIMD/Internal/AVX.h
+++ /dev/null
@@ -1,474 +0,0 @@
-#pragma once
-
-#ifdef __GNUG__
-#include <x86intrin.h>
-#else
-#include <intrin.h>
-#endif
-
-#include "VecTools.h"
-
-namespace FastSIMD
-{
-    struct AVX_f32x8
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( AVX_f32x8, __m256 );
-
-        FS_INLINE static AVX_f32x8 Incremented()
-        {
-            return _mm256_set_ps( 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f );
-        }
-
-        FS_INLINE explicit AVX_f32x8( float f )
-        {
-            *this = _mm256_set1_ps( f );
-        }
-
-        FS_INLINE explicit AVX_f32x8( float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7 )
-        {
-            *this = _mm256_set_ps( f7, f6, f5, f4, f3, f2, f1, f0 );
-        }
-
-        FS_INLINE AVX_f32x8& operator+=( const AVX_f32x8& rhs )
-        {
-            *this = _mm256_add_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX_f32x8& operator-=( const AVX_f32x8& rhs )
-        {
-            *this = _mm256_sub_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX_f32x8& operator*=( const AVX_f32x8& rhs )
-        {
-            *this = _mm256_mul_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX_f32x8& operator/=( const AVX_f32x8& rhs )
-        {
-            *this = _mm256_div_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX_f32x8& operator&=( const AVX_f32x8& rhs )
-        {
-            *this = _mm256_and_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX_f32x8& operator|=( const AVX_f32x8& rhs )
-        {
-            *this = _mm256_or_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX_f32x8& operator^=( const AVX_f32x8& rhs )
-        {
-            *this = _mm256_xor_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX_f32x8 operator~() const
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m256i neg1 = _mm256_cmpeq_epi32( _mm256_setzero_si256(), _mm256_setzero_si256() );
-#else
-            const __m256i neg1 = _mm256_set1_epi32( -1 );
-#endif
-            return _mm256_xor_ps( *this, _mm256_castsi256_ps( neg1 ) );
-        }
-
-        FS_INLINE AVX_f32x8 operator-() const
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m256i minInt = _mm256_slli_epi32( _mm256_cmpeq_epi32( _mm256_setzero_si256(), _mm256_setzero_si256() ), 31 );
-#else
-            const __m256i minInt = _mm256_set1_epi32( 0x80000000 );
-#endif
-            return _mm256_xor_ps( *this, _mm256_castsi256_ps( minInt ) );
-        }
-
-        FS_INLINE __m256i operator==( const AVX_f32x8& rhs )
-        {
-            return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_EQ_OS ) );
-        }
-
-        FS_INLINE __m256i operator!=( const AVX_f32x8& rhs )
-        {
-            return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_NEQ_OS ) );
-        }
-
-        FS_INLINE __m256i operator>( const AVX_f32x8& rhs )
-        {
-            return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_GT_OS ) );
-        }
-
-        FS_INLINE __m256i operator<( const AVX_f32x8& rhs )
-        {
-            return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_LT_OS ) );
-        }
-
-        FS_INLINE __m256i operator>=( const AVX_f32x8& rhs )
-        {
-            return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_GE_OS ) );
-        }
-
-        FS_INLINE __m256i operator<=( const AVX_f32x8& rhs )
-        {
-            return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_LE_OS ) );
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_FLOAT( AVX_f32x8 )
-
-
-    struct AVX2_i32x8
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( AVX2_i32x8, __m256i );
-
-        FS_INLINE static AVX2_i32x8 Incremented()
-        {
-            return _mm256_set_epi32( 7, 6, 5, 4, 3, 2, 1, 0 );
-        }
-
-        FS_INLINE explicit AVX2_i32x8( int32_t f )
-        {
-            *this = _mm256_set1_epi32( f );
-        }
-
-        FS_INLINE explicit AVX2_i32x8( int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7 )
-        {
-            *this = _mm256_set_epi32( i7, i6, i5, i4, i3, i2, i1, i0 );
-        }
-
-        FS_INLINE AVX2_i32x8& operator+=( const AVX2_i32x8& rhs )
-        {
-            *this = _mm256_add_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX2_i32x8& operator-=( const AVX2_i32x8& rhs )
-        {
-            *this = _mm256_sub_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX2_i32x8& operator*=( const AVX2_i32x8& rhs )
-        {
-            *this = _mm256_mullo_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX2_i32x8& operator&=( const AVX2_i32x8& rhs )
-        {
-            *this = _mm256_and_si256( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX2_i32x8& operator|=( const AVX2_i32x8& rhs )
-        {
-            *this = _mm256_or_si256( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX2_i32x8& operator^=( const AVX2_i32x8& rhs )
-        {
-            *this = _mm256_xor_si256( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX2_i32x8& operator>>=( int32_t rhs )
-        {
-            *this = _mm256_srai_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX2_i32x8& operator<<=( int32_t rhs )
-        {
-            *this = _mm256_slli_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX2_i32x8 operator~() const
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m256i neg1 = _mm256_cmpeq_epi32( _mm256_setzero_si256(), _mm256_setzero_si256() );
-#else
-            const __m256i neg1 = _mm256_set1_epi32( -1 );
-#endif
-            return _mm256_xor_si256( *this, neg1 );
-        }
-
-        FS_INLINE AVX2_i32x8 operator-() const
-        {
-            return _mm256_sub_epi32( _mm256_setzero_si256(), *this );
-        }
-
-        FS_INLINE AVX2_i32x8 operator==( const AVX2_i32x8& rhs )
-        {
-            return _mm256_cmpeq_epi32( *this, rhs );
-        }
-
-        FS_INLINE AVX2_i32x8 operator>( const AVX2_i32x8& rhs )
-        {
-            return _mm256_cmpgt_epi32( *this, rhs );
-        }
-
-        FS_INLINE AVX2_i32x8 operator<( const AVX2_i32x8& rhs )
-        {
-            return _mm256_cmpgt_epi32( rhs, *this );
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_INT( AVX2_i32x8, int32_t )
-
-    template<eLevel LEVEL_T>
-    class AVX_T
-    {
-    public:
-        static_assert( LEVEL_T >= Level_AVX && LEVEL_T <= Level_AVX2, "Cannot create template with unsupported SIMD level" );
-
-        static constexpr eLevel SIMD_Level = LEVEL_T;
-
-        template<size_t ElementSize>
-        static constexpr size_t VectorSize = (256 / 8) / ElementSize;
-
-        typedef AVX_f32x8  float32v;
-        typedef AVX2_i32x8 int32v;
-        typedef AVX2_i32x8 mask32v;
-
-        // Load
-
-        FS_INLINE static float32v Load_f32( void const* p )
-        {
-            return _mm256_loadu_ps( reinterpret_cast<float const*>(p) );
-        }
-
-        FS_INLINE static int32v Load_i32( void const* p )
-        {
-            return _mm256_loadu_si256( reinterpret_cast<__m256i const*>(p) );
-        }
-
-        // Store
-
-        FS_INLINE static void Store_f32( void* p, float32v a )
-        {
-            _mm256_storeu_ps( reinterpret_cast<float*>(p), a );
-        }
-
-        FS_INLINE static void Store_i32( void* p, int32v a )
-        {
-            _mm256_storeu_si256( reinterpret_cast<__m256i*>(p), a );
-        }
-
-        // Extract
-
-        FS_INLINE static float Extract0_f32( float32v a )
-        {
-            return _mm256_cvtss_f32( a );
-        }
-
-        FS_INLINE static int32_t Extract0_i32( int32v a )
-        {
-            return _mm_cvtsi128_si32(_mm256_castsi256_si128( a ));
-        }
-
-        FS_INLINE static float Extract_f32( float32v a, size_t idx )
-        {
-            float f[8];
-            Store_f32( &f, a );
-            return f[idx & 7];
-        }
-
-        FS_INLINE static int32_t Extract_i32( int32v a, size_t idx )
-        {
-            int32_t i[8];
-            Store_i32( &i, a );
-            return i[idx & 7];
-        }
-
-        // Cast
-
-        FS_INLINE static float32v Casti32_f32( int32v a )
-        {
-            return _mm256_castsi256_ps( a );
-        }
-
-        FS_INLINE static int32v Castf32_i32( float32v a )
-        {
-            return _mm256_castps_si256( a );
-        }
-
-        // Convert
-
-        FS_INLINE static float32v Converti32_f32( int32v a )
-        {
-            return _mm256_cvtepi32_ps( a );
-        }
-
-        FS_INLINE static int32v Convertf32_i32( float32v a )
-        {
-            return _mm256_cvtps_epi32( a );
-        }
-
-        // Select
-
-        FS_INLINE static float32v Select_f32( mask32v m, float32v a, float32v b )
-        {
-            return  _mm256_blendv_ps( b, a, _mm256_castsi256_ps( m ) );
-        }
-
-        FS_INLINE static int32v Select_i32( mask32v m, int32v a, int32v b )
-        {
-            return _mm256_castps_si256( _mm256_blendv_ps( _mm256_castsi256_ps( b ), _mm256_castsi256_ps( a ), _mm256_castsi256_ps( m ) ) );
-        }
-
-        // Min, Max
-
-        FS_INLINE static float32v Min_f32( float32v a, float32v b )
-        {
-            return _mm256_min_ps( a, b );
-        }
-
-        FS_INLINE static float32v Max_f32( float32v a, float32v b )
-        {
-            return _mm256_max_ps( a, b );
-        }
-
-        FS_INLINE static int32v Min_i32( int32v a, int32v b )
-        {
-            return _mm256_min_epi32( a, b );
-        }
-
-        FS_INLINE static int32v Max_i32( int32v a, int32v b )
-        {
-            return _mm256_max_epi32( a, b );
-        }
-
-        // Bitwise
-
-        FS_INLINE static float32v BitwiseAndNot_f32( float32v a, float32v b )
-        {
-            return _mm256_andnot_ps( b, a );
-        }
-
-        FS_INLINE static int32v BitwiseAndNot_i32( int32v a, int32v b )
-        {
-            return _mm256_andnot_si256( b, a );
-        }
-
-        FS_INLINE static float32v BitwiseShiftRightZX_f32( float32v a, int32_t b )
-        {
-            return Casti32_f32( _mm256_srli_epi32( Castf32_i32( a ), b ) );
-        }
-
-        FS_INLINE static int32v BitwiseShiftRightZX_i32( int32v a, int32_t b )
-        {
-            return _mm256_srli_epi32( a, b );
-        }
-
-        // Abs
-
-        FS_INLINE static float32v Abs_f32( float32v a )
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m256i intMax = _mm256_srli_epi32( _mm256_cmpeq_epi32( _mm256_setzero_si256(), _mm256_setzero_si256() ), 1 );
-#else
-            const __m256i intMax = _mm256_set1_epi32( 0x7FFFFFFF );
-#endif
-            return _mm256_and_ps( a, _mm256_castsi256_ps( intMax ) );
-        }
-
-        FS_INLINE static int32v Abs_i32( int32v a )
-        {
-            return _mm256_abs_epi32( a );
-        }
-
-        // Float math
-
-        FS_INLINE static float32v Sqrt_f32( float32v a )
-        {
-            return _mm256_sqrt_ps( a );
-        }
-
-        FS_INLINE static float32v InvSqrt_f32( float32v a )
-        {
-            return _mm256_rsqrt_ps( a );
-        }
-
-        FS_INLINE static float32v Reciprocal_f32( float32v a )
-        {
-            return _mm256_rcp_ps( a );
-        }
-
-        // Floor, Ceil, Round
-
-        FS_INLINE static float32v Floor_f32( float32v a )
-        {
-            return _mm256_round_ps( a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC );
-        }
-
-        FS_INLINE static float32v Ceil_f32( float32v a )
-        {
-            return _mm256_round_ps( a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC );
-        }
-
-        FS_INLINE static float32v Round_f32( float32v a )
-        {
-            return _mm256_round_ps( a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
-        }
-
-        //Mask
-
-        FS_INLINE static int32v Mask_i32( int32v a, mask32v m )
-        {
-            return a & m;
-        }
-
-        FS_INLINE static float32v Mask_f32( float32v a, mask32v m )
-        {
-            return _mm256_and_ps( a, _mm256_castsi256_ps( m ) );
-        }
-
-        FS_INLINE static int32v NMask_i32( int32v a, mask32v m )
-        {
-            return _mm256_andnot_si256( m, a );
-        }
-
-        FS_INLINE static float32v NMask_f32( float32v a, mask32v m )
-        {
-            return _mm256_andnot_ps( _mm256_castsi256_ps( m ), a );
-        }
-
-        FS_INLINE static bool AnyMask_bool( mask32v m )
-        {
-            return !_mm256_testz_si256( m, m );
-        }
-    };
-
-#if FASTSIMD_COMPILE_AVX
-    typedef AVX_T<Level_AVX>  AVX;
-#endif
-
-#if FASTSIMD_COMPILE_AVX2
-    typedef AVX_T<Level_AVX2> AVX2;
-
-#if FASTSIMD_USE_FMA
-    template<>
-    FS_INLINE AVX2::float32v FMulAdd_f32<AVX2>( AVX2::float32v a, AVX2::float32v b, AVX2::float32v c )
-    {
-        return _mm256_fmadd_ps( a, b, c );
-    }
-
-    template<>
-    FS_INLINE AVX2::float32v FNMulAdd_f32<AVX2>( AVX2::float32v a, AVX2::float32v b, AVX2::float32v c )
-    {
-        return _mm256_fnmadd_ps( a, b, c );
-    }
-#endif
-#endif
-    
-}
diff --git a/src/FastSIMD/Internal/AVX512.h b/src/FastSIMD/Internal/AVX512.h
deleted file mode 100644
index fd378eda..00000000
--- a/src/FastSIMD/Internal/AVX512.h
+++ /dev/null
@@ -1,540 +0,0 @@
-#pragma once
-
-#include <immintrin.h>
-
-#include "VecTools.h"
-
-namespace FastSIMD
-{
-
-    struct AVX512_f32x16
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( AVX512_f32x16, __m512 );
-
-        FS_INLINE static AVX512_f32x16 Incremented()
-        {
-            return _mm512_set_ps( 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f );
-        }
-
-        FS_INLINE explicit AVX512_f32x16( float f )
-        {
-            *this = _mm512_set1_ps( f );
-        }
-
-        FS_INLINE explicit AVX512_f32x16( float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7, float f8, float f9, float f10, float f11, float f12, float f13, float f14, float f15 )
-        {
-            *this = _mm512_set_ps( f15, f14, f13, f12, f11, f10, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0 );
-        }
-
-        FS_INLINE AVX512_f32x16& operator+=( const AVX512_f32x16& rhs )
-        {
-            *this = _mm512_add_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_f32x16& operator-=( const AVX512_f32x16& rhs )
-        {
-            *this = _mm512_sub_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_f32x16& operator*=( const AVX512_f32x16& rhs )
-        {
-            *this = _mm512_mul_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_f32x16& operator/=( const AVX512_f32x16& rhs )
-        {
-            *this = _mm512_div_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_f32x16& operator&=( const AVX512_f32x16& rhs )
-        {
-            *this = _mm512_and_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_f32x16& operator|=( const AVX512_f32x16& rhs )
-        {
-            *this = _mm512_or_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_f32x16& operator^=( const AVX512_f32x16& rhs )
-        {
-            *this = _mm512_xor_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_f32x16 operator~() const
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m512i neg1 = _mm512_cmpeq_epi32( _mm512_setzero_si512(), _mm512_setzero_si512() );
-#else
-            const __m512i neg1 = _mm512_set1_epi32( -1 );
-#endif
-            return _mm512_xor_ps( *this, _mm512_castsi512_ps( neg1 ) );
-        }
-
-        FS_INLINE AVX512_f32x16 operator-() const
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m512i minInt = _mm512_slli_epi32( _mm512_cmpeq_epi32( _mm512_setzero_si512(), _mm512_setzero_si512() ), 31 );
-#else
-            const __m512i minInt = _mm512_set1_epi32( 0x80000000 );
-#endif
-            return _mm512_xor_ps( *this, _mm512_castsi512_ps( minInt ) );
-        }
-
-        FS_INLINE __mmask16 operator==( const AVX512_f32x16& rhs )
-        {
-            return _mm512_cmp_ps_mask( *this, rhs, _CMP_EQ_OS );
-        }
-
-        FS_INLINE __mmask16 operator!=( const AVX512_f32x16& rhs )
-        {
-            return _mm512_cmp_ps_mask( *this, rhs, _CMP_NEQ_OS );
-        }
-
-        FS_INLINE __mmask16 operator>( const AVX512_f32x16& rhs )
-        {
-            return _mm512_cmp_ps_mask( *this, rhs, _CMP_GT_OS );
-        }
-
-        FS_INLINE __mmask16 operator<( const AVX512_f32x16& rhs )
-        {
-            return _mm512_cmp_ps_mask( *this, rhs, _CMP_LT_OS );
-        }
-
-        FS_INLINE __mmask16 operator>=( const AVX512_f32x16& rhs )
-        {
-            return _mm512_cmp_ps_mask( *this, rhs, _CMP_GE_OS );
-        }
-
-        FS_INLINE __mmask16 operator<=( const AVX512_f32x16& rhs )
-        {
-            return _mm512_cmp_ps_mask( *this, rhs, _CMP_LE_OS );
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_FLOAT( AVX512_f32x16 )
-
-
-    struct AVX512_i32x16
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( AVX512_i32x16, __m512i );
-
-        FS_INLINE static AVX512_i32x16 Incremented()
-        {
-            return _mm512_set_epi32( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
-        }
-
-        FS_INLINE explicit AVX512_i32x16( int32_t i )
-        {
-            *this = _mm512_set1_epi32( i );
-        }
-
-        FS_INLINE explicit AVX512_i32x16( int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7, int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15 )
-        {
-            *this = _mm512_set_epi32( i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0 );
-        }
-
-        FS_INLINE AVX512_i32x16& operator+=( const AVX512_i32x16& rhs )
-        {
-            *this = _mm512_add_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_i32x16& operator-=( const AVX512_i32x16& rhs )
-        {
-            *this = _mm512_sub_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_i32x16& operator*=( const AVX512_i32x16& rhs )
-        {
-            *this = _mm512_mullo_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_i32x16& operator&=( const AVX512_i32x16& rhs )
-        {
-            *this = _mm512_and_si512( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_i32x16& operator|=( const AVX512_i32x16& rhs )
-        {
-            *this = _mm512_or_si512( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_i32x16& operator^=( const AVX512_i32x16& rhs )
-        {
-            *this = _mm512_xor_si512( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_i32x16& operator>>=( int32_t rhs )
-        {
-            *this = _mm512_srai_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_i32x16& operator<<=( int32_t rhs )
-        {
-            *this = _mm512_slli_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE AVX512_i32x16 operator~() const
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m512i neg1 = _mm512_cmpeq_epi32( _mm512_setzero_si512(), _mm512_setzero_si512() );
-#else
-            const __m512i neg1 = _mm512_set1_epi32( -1 );
-#endif
-            return _mm512_xor_si512( *this, neg1 );
-        }
-
-        FS_INLINE AVX512_i32x16 operator-() const
-        {
-            return _mm512_sub_epi32( _mm512_setzero_si512(), *this );
-        }
-
-        FS_INLINE __mmask16 operator==( const AVX512_i32x16& rhs )
-        {
-            return _mm512_cmpeq_epi32_mask( *this, rhs );
-        }
-
-        FS_INLINE __mmask16 operator>( const AVX512_i32x16& rhs )
-        {
-            return _mm512_cmpgt_epi32_mask( *this, rhs );
-        }
-
-        FS_INLINE __mmask16 operator<( const AVX512_i32x16& rhs )
-        {
-            return _mm512_cmplt_epi32_mask( *this, rhs );
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_INT( AVX512_i32x16, int32_t )
-
-    template<eLevel LEVEL_T>
-    class AVX512_T
-    {
-    public:
-        static_assert( LEVEL_T == Level_AVX512, "Cannot create template with unsupported SIMD level" );
-
-        static constexpr eLevel SIMD_Level = LEVEL_T;
-
-        template<size_t ElementSize>
-        static constexpr size_t VectorSize = (512 / 8) / ElementSize;
-
-        typedef AVX512_f32x16  float32v;
-        typedef AVX512_i32x16  int32v;
-        typedef __mmask16      mask32v;
-
-        // Load
-
-        FS_INLINE static float32v Load_f32( void const* p )
-        {
-            return _mm512_loadu_ps( p );
-        }
-
-        FS_INLINE static int32v Load_i32( void const* p )
-        {
-            return _mm512_loadu_si512( p );
-        }
-
-        // Store
-
-        FS_INLINE static void Store_f32( void* p, float32v a )
-        {
-            _mm512_storeu_ps( p, a );
-        }
-
-        FS_INLINE static void Store_i32( void* p, int32v a )
-        {
-            _mm512_storeu_si512( p, a );
-        }
-
-        // Cast
-
-        FS_INLINE static float32v Casti32_f32( int32v a )
-        {
-            return _mm512_castsi512_ps( a );
-        }
-
-        FS_INLINE static int32v Castf32_i32( float32v a )
-        {
-            return _mm512_castps_si512( a );
-        }
-
-        // Extract
-
-        FS_INLINE static float Extract0_f32( float32v a )
-        {
-            return _mm512_cvtss_f32( a );
-        }
-
-        FS_INLINE static int32_t Extract0_i32( int32v a )
-        {
-            return _mm_cvtsi128_si32( _mm512_castsi512_si128( a ) );
-        }
-
-        FS_INLINE static float Extract_f32( float32v a, size_t idx )
-        {
-            float32v x = _mm512_maskz_compress_ps( mask32v( 1u << (idx & 15) ), a );
-            return _mm512_cvtss_f32( x );
-        }
-
-        FS_INLINE static int32_t Extract_i32( int32v a, size_t idx )
-        {
-            int32v x = _mm512_maskz_compress_epi32( mask32v( 1u << (idx & 15) ), a );
-            return _mm_cvtsi128_si32( _mm512_castsi512_si128( x ) );
-        }
-
-        // Convert
-
-        FS_INLINE static float32v Converti32_f32( int32v a )
-        {
-            return _mm512_cvtepi32_ps( a );
-        }
-
-        FS_INLINE static int32v Convertf32_i32( float32v a )
-        {
-            return _mm512_cvtps_epi32( a );
-        }
-
-        // Select
-
-        FS_INLINE static float32v Select_f32( mask32v m, float32v a, float32v b )
-        {
-            return _mm512_mask_blend_ps( m, b, a );
-        }
-
-        FS_INLINE static int32v Select_i32( mask32v m, int32v a, int32v b )
-        {
-            return _mm512_mask_blend_epi32( m, b, a );
-        }
-
-        // Min, Max
-
-        FS_INLINE static float32v Min_f32( float32v a, float32v b )
-        {
-            return _mm512_min_ps( a, b );
-        }
-
-        FS_INLINE static float32v Max_f32( float32v a, float32v b )
-        {
-            return _mm512_max_ps( a, b );
-        }
-
-        FS_INLINE static int32v Min_i32( int32v a, int32v b )
-        {
-            return _mm512_min_epi32( a, b );
-        }
-
-        FS_INLINE static int32v Max_i32( int32v a, int32v b )
-        {
-            return _mm512_max_epi32( a, b );
-        }
-
-        // Bitwise
-
-        FS_INLINE static float32v BitwiseAndNot_f32( float32v a, float32v b )
-        {
-            return _mm512_andnot_ps( b, a );
-        }
-
-        FS_INLINE static int32v BitwiseAndNot_i32( int32v a, int32v b )
-        {
-            return _mm512_andnot_si512( b, a );
-        }
-
-        FS_INLINE static float32v BitwiseShiftRightZX_f32( float32v a, int32_t b )
-        {
-            return Casti32_f32( _mm512_srli_epi32( Castf32_i32( a ), b ) );
-        }
-
-        FS_INLINE static int32v BitwiseShiftRightZX_i32( int32v a, int32_t b )
-        {
-            return _mm512_srli_epi32( a, b );
-        }
-
-        // Abs
-
-        FS_INLINE static float32v Abs_f32( float32v a )
-        {
-            return _mm512_abs_ps( a );
-        }
-
-        FS_INLINE static int32v Abs_i32( int32v a )
-        {
-            return _mm512_abs_epi32( a );
-        }
-
-        // Float math
-
-        FS_INLINE static float32v Sqrt_f32( float32v a )
-        {
-            return _mm512_sqrt_ps( a );
-        }
-
-        FS_INLINE static float32v InvSqrt_f32( float32v a )
-        {
-            return _mm512_rsqrt14_ps( a );
-        }
-
-        FS_INLINE static float32v Reciprocal_f32( float32v a )
-        {
-            return _mm512_rcp14_ps( a );
-        }
-
-        // Floor, Ceil, Round
-
-        FS_INLINE static float32v Floor_f32( float32v a )
-        {
-            return _mm512_roundscale_ps( a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC );
-        }
-
-        FS_INLINE static float32v Ceil_f32( float32v a )
-        {
-            return _mm512_roundscale_ps( a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC );
-        }
-
-        FS_INLINE static float32v Round_f32( float32v a )
-        {
-            return _mm512_roundscale_ps( a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
-        }
-
-        //Mask
-
-        FS_INLINE static int32v Mask_i32( int32v a, mask32v m )
-        {
-            return _mm512_maskz_mov_epi32( m, a );
-        }
-
-        FS_INLINE static float32v Mask_f32( float32v a, mask32v m )
-        {
-            return _mm512_maskz_mov_ps( m, a );
-        }
-
-        FS_INLINE static int32v NMask_i32( int32v a, mask32v m )
-        {
-            return _mm512_maskz_mov_epi32( ~m, a );
-        }
-
-        FS_INLINE static float32v NMask_f32( float32v a, mask32v m )
-        {
-            return _mm512_maskz_mov_ps( ~m, a );
-        }
-
-        FS_INLINE static bool AnyMask_bool( mask32v m )
-        {
-            return m;
-        }
-    };
-
-#if FASTSIMD_COMPILE_AVX512
-    typedef AVX512_T<Level_AVX512> AVX512;
-
-#if FASTSIMD_USE_FMA
-    template<>
-    FS_INLINE AVX512::float32v FMulAdd_f32<AVX512>( AVX512::float32v a, AVX512::float32v b, AVX512::float32v c )
-    {
-        return _mm512_fmadd_ps( a, b, c );
-    }
-
-    template<>
-    FS_INLINE AVX512::float32v FNMulAdd_f32<AVX512>( AVX512::float32v a, AVX512::float32v b, AVX512::float32v c )
-    {
-        return _mm512_fnmadd_ps( a, b, c );
-    }
-#endif
-
-    // Masked float
-
-    template<>
-    FS_INLINE AVX512::float32v MaskedAdd_f32<AVX512>( AVX512::float32v a, AVX512::float32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_add_ps( a, m, a, b );
-    }
-
-    template<>
-    FS_INLINE AVX512::float32v MaskedSub_f32<AVX512>( AVX512::float32v a, AVX512::float32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_sub_ps( a, m, a, b );
-    }
-
-    template<>
-    FS_INLINE AVX512::float32v MaskedMul_f32<AVX512>( AVX512::float32v a, AVX512::float32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_mul_ps( a, m, a, b );
-    }
-
-    // Masked int32
-
-    template<>
-    FS_INLINE AVX512::int32v MaskedAdd_i32<AVX512>( AVX512::int32v a, AVX512::int32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_add_epi32( a, m, a, b );
-    }
-
-    template<>
-    FS_INLINE AVX512::int32v MaskedSub_i32<AVX512>( AVX512::int32v a, AVX512::int32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_sub_epi32( a, m, a, b );
-    }
-
-    template<>
-    FS_INLINE AVX512::int32v MaskedMul_i32<AVX512>( AVX512::int32v a, AVX512::int32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_mullo_epi32( a, m, a, b );
-    }
-
-    // NMasked float
-
-    template<>
-    FS_INLINE AVX512::float32v NMaskedAdd_f32<AVX512>( AVX512::float32v a, AVX512::float32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_add_ps( a, ~m, a, b );
-    }
-
-    template<>
-    FS_INLINE AVX512::float32v NMaskedSub_f32<AVX512>( AVX512::float32v a, AVX512::float32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_sub_ps( a, ~m, a, b );
-    }
-
-    template<>
-    FS_INLINE AVX512::float32v NMaskedMul_f32<AVX512>( AVX512::float32v a, AVX512::float32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_mul_ps( a, ~m, a, b );
-    }
-
-    // NMasked int32
-
-    template<>
-    FS_INLINE AVX512::int32v NMaskedAdd_i32<AVX512>( AVX512::int32v a, AVX512::int32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_add_epi32( a, ~m, a, b );
-    }
-
-    template<>
-    FS_INLINE AVX512::int32v NMaskedSub_i32<AVX512>( AVX512::int32v a, AVX512::int32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_sub_epi32( a, ~m, a, b );
-    }
-
-    template<>
-    FS_INLINE AVX512::int32v NMaskedMul_i32<AVX512>( AVX512::int32v a, AVX512::int32v b, AVX512::mask32v m )
-    {
-        return _mm512_mask_mul_epi32( a, ~m, a, b );
-    }
-#endif
-    
-}
diff --git a/src/FastSIMD/Internal/NEON.h b/src/FastSIMD/Internal/NEON.h
deleted file mode 100644
index 532f0614..00000000
--- a/src/FastSIMD/Internal/NEON.h
+++ /dev/null
@@ -1,611 +0,0 @@
-#pragma once
-
-
-#include <arm_neon.h>
-
-#include "VecTools.h"
-
-#if defined(__arm__)
-#define FASTSIMD_USE_ARMV7
-#endif
-
-
-namespace FastSIMD
-{
-    
-    struct NEON_i32x4
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( NEON_i32x4, int32x4_t );
-        
-
-        FS_INLINE static NEON_i32x4 Zero()
-        {
-            return vdupq_n_s32( 0 );
-        }
-
-        FS_INLINE static NEON_i32x4 Incremented()
-        {
-            alignas(16) const int32_t f[4]{ 0, 1, 2, 3 };
-            return vld1q_s32( f );
-        }
-
-        FS_INLINE explicit NEON_i32x4( int32_t i )
-        {
-            *this = vdupq_n_s32( i );
-        }
-
-        FS_INLINE explicit NEON_i32x4( int32_t i0, int32_t i1, int32_t i2, int32_t i3 )
-        {
-            alignas(16) const int32_t f[4]{ i0, i1, i2, i3 };
-            *this = vld1q_s32( f );
-        }
-
-        FS_INLINE NEON_i32x4& operator+=( const NEON_i32x4& rhs )
-        {
-            *this = vaddq_s32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE NEON_i32x4& operator-=( const NEON_i32x4& rhs )
-        {
-            *this = vsubq_s32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE NEON_i32x4& operator*=( const NEON_i32x4& rhs )
-        {
-            *this = vmulq_s32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE NEON_i32x4& operator&=( const NEON_i32x4& rhs )
-        {
-            *this = vandq_s32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE NEON_i32x4& operator|=( const NEON_i32x4& rhs )
-        {
-            *this = vorrq_s32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE NEON_i32x4& operator^=( const NEON_i32x4& rhs )
-        {
-            *this = veorq_s32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE NEON_i32x4& operator>>=( const int32_t rhs )
-        {
-            int32x4_t rhs2 = vdupq_n_s32( -rhs );
-            *this = vshlq_s32(*this, rhs2);//use shift right by constant for faster execution
-            return *this;
-        }
-
-        FS_INLINE NEON_i32x4& operator<<=( const int32_t rhs )
-        {
-            int32x4_t rhs2 = vdupq_n_s32( rhs );
-            *this = vshlq_s32(*this, rhs2);//use shift left by constant for faster execution
-            return *this;
-        }
-
-        FS_INLINE NEON_i32x4 operator~() const
-        {
-            return vmvnq_s32( *this );
-        }
-
-        FS_INLINE NEON_i32x4 operator-() const
-        {
-            return vnegq_s32( *this );
-        }
-        
-        FS_INLINE NEON_i32x4 operator<( const NEON_i32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vcltq_s32( *this, b ) );
-        }
-        FS_INLINE NEON_i32x4 operator>( const NEON_i32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vcgtq_s32( *this, b ) );
-        }
-        FS_INLINE NEON_i32x4 operator<=( const NEON_i32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vcleq_s32( *this, b ) );
-        }
-        FS_INLINE NEON_i32x4 operator>=( const NEON_i32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vcgeq_s32( *this, b ) );
-        }
-        FS_INLINE NEON_i32x4 operator!=( const NEON_i32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vmvnq_u32 (vceqq_s32( *this, b ) ) );
-        }
-        FS_INLINE NEON_i32x4 operator==( const NEON_i32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vceqq_s32( *this, b ) );
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_INT( NEON_i32x4, int32_t )
-    
-
-    struct NEON_f32x4
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( NEON_f32x4, float32x4_t );
-
-
-        FS_INLINE static NEON_f32x4 Zero()
-        {
-            return vdupq_n_f32( 0 );
-        }
-
-        FS_INLINE static NEON_f32x4 Incremented()
-        {
-            alignas(16) const float f[4]{ 0.0f, 1.0f, 2.0f, 3.0f };
-            return vld1q_f32( f );
-        }
-
-        FS_INLINE explicit NEON_f32x4( float f )
-        {
-            *this = vdupq_n_f32( f );
-        }
-
-        FS_INLINE explicit NEON_f32x4( float f0, float f1, float f2, float f3 )
-        {
-            alignas(16) const float f[4]{ f0, f1, f2, f3 };
-            *this = vld1q_f32( f );
-        }
-
-        FS_INLINE NEON_f32x4& operator+=( const NEON_f32x4& rhs )
-        {
-            *this = vaddq_f32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE NEON_f32x4& operator-=( const NEON_f32x4& rhs )
-        {
-            *this = vsubq_f32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE NEON_f32x4& operator*=( const NEON_f32x4& rhs )
-        {
-            *this = vmulq_f32( *this, rhs );
-            return *this;
-        }
-        
-        #ifdef FASTSIMD_USE_ARMV7
-            FS_INLINE NEON_f32x4& operator/=( const NEON_f32x4& rhs )
-            {
-                
-                float32x4_t reciprocal = vrecpeq_f32( rhs );
-                // use a couple Newton-Raphson steps to refine the estimate.  Depending on your
-                // application's accuracy requirements, you may be able to get away with only
-                // one refinement (instead of the two used here).  Be sure to test!
-                reciprocal = vmulq_f32( vrecpsq_f32( rhs, reciprocal ), reciprocal );
-                reciprocal = vmulq_f32( vrecpsq_f32( rhs, reciprocal ), reciprocal );
-
-                // and finally, compute a/b = a*(1/b)
-                *this = vmulq_f32( *this, reciprocal );
-                
-                return *this;
-            }
-        #else
-            FS_INLINE NEON_f32x4& operator/=( const NEON_f32x4& rhs )
-            {
-                /*
-                float32x4_t reciprocal = vrecpeq_f32( rhs );
-                reciprocal = vmulq_f32( vrecpsq_f32( rhs, reciprocal ), reciprocal );
-                reciprocal = vmulq_f32( vrecpsq_f32( rhs, reciprocal ), reciprocal );
-                *this = vmulq_f32( *this, reciprocal );
-                */
-                *this = vdivq_f32( *this, rhs );
-                
-                return *this;
-            }
-        #endif
-
-        
-        
-        
-        FS_INLINE NEON_f32x4& operator&=( const NEON_f32x4& rhs )
-        {
-            *this = vreinterpretq_f32_s32( vandq_s32( vreinterpretq_s32_f32( *this ), vreinterpretq_s32_f32( rhs ) ) );
-            return *this;
-        }
-        FS_INLINE NEON_f32x4& operator|=( const NEON_f32x4& rhs )
-        {
-            *this = vreinterpretq_f32_s32( vorrq_s32( vreinterpretq_s32_f32( *this ), vreinterpretq_s32_f32( rhs ) ) );
-            return *this;
-        }
-        FS_INLINE NEON_f32x4& operator^=( const NEON_f32x4& rhs )
-        {
-            *this = vreinterpretq_f32_s32( veorq_s32( vreinterpretq_s32_f32( *this ), vreinterpretq_s32_f32( rhs ) ) );
-            return *this;
-        }
-
-        FS_INLINE NEON_f32x4 operator-() const
-        {
-            return vnegq_f32( *this );
-        }
-        FS_INLINE NEON_f32x4 operator~() const
-        {
-            return vreinterpretq_f32_u32( vmvnq_u32( vreinterpretq_u32_f32(*this) ) );
-        }
-        
-        
-        FS_INLINE NEON_i32x4 operator<( const NEON_f32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vcltq_f32( *this, b ) );
-        }
-        FS_INLINE NEON_i32x4 operator>( const NEON_f32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vcgtq_f32( *this, b ) );
-        }
-        FS_INLINE NEON_i32x4 operator<=( const NEON_f32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vcleq_f32( *this, b ) );
-        }
-        FS_INLINE NEON_i32x4 operator>=( const NEON_f32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vcgeq_f32( *this, b ) );
-        }
-        FS_INLINE NEON_i32x4 operator!=( const NEON_f32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vmvnq_u32 (vceqq_f32( *this, b ) ) );
-        }
-        FS_INLINE NEON_i32x4 operator==( const NEON_f32x4 &b ) const
-        {
-            return vreinterpretq_s32_u32( vceqq_f32( *this, b ) );
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_FLOAT( NEON_f32x4 )
-
-
-    
-    
-
-    template<eLevel LEVEL_T>
-    class NEON_T
-    {
-    public:
-        static constexpr eLevel SIMD_Level = FastSIMD::Level_NEON;
-
-        
-        template<size_t ElementSize>
-        static constexpr size_t VectorSize = (128 / 8) / ElementSize;
-
-        typedef NEON_f32x4 float32v;
-        typedef NEON_i32x4   int32v;
-        typedef NEON_i32x4  mask32v;
-
-        FS_INLINE static float32v Load_f32( void const* p )
-        {
-            return vld1q_f32( reinterpret_cast<float const*>(p) );
-        }
-
-        FS_INLINE static int32v Load_i32( void const* p )
-        {
-            return vld1q_s32( reinterpret_cast<int32_t const*>(p) );
-        }
-
-        // Store
-
-        FS_INLINE static void Store_f32( void* p, float32v a )
-        {
-            vst1q_f32( reinterpret_cast<float*>(p), a );
-        }
-
-        FS_INLINE static void Store_i32( void* p, int32v a )
-        {
-            vst1q_s32( reinterpret_cast<int32_t*>(p), a );
-        }
-
-        // Cast
-
-        FS_INLINE static float32v Casti32_f32( int32v a )
-        {
-            return vreinterpretq_f32_s32( a );
-        }
-
-        FS_INLINE static int32v Castf32_i32( float32v a )
-        {
-            return vreinterpretq_s32_f32( a );
-        }
-
-        // Convert
-
-        FS_INLINE static float32v Converti32_f32( int32v a )
-        {
-            return vcvtq_f32_s32( a );
-        }
-        
-        FS_INLINE static int32v Convertf32_i32( float32v a )
-        {
-            return vcvtq_s32_f32( Round_f32(a) );
-        }        
-
-        // Comparisons
-
-        FS_INLINE static mask32v Equal_f32( float32v a, float32v b )
-        {
-            return vreinterpretq_s32_u32( vceqq_f32( a, b ) );
-        }
-
-        FS_INLINE static mask32v GreaterThan_f32( float32v a, float32v b )
-        {
-            return vreinterpretq_s32_u32( vcgtq_f32( a, b ) );
-        }
-
-        FS_INLINE static mask32v LessThan_f32( float32v a, float32v b )
-        {
-            return vreinterpretq_s32_u32( vcltq_f32( a, b ) );
-        }
-
-        FS_INLINE static mask32v GreaterEqualThan_f32( float32v a, float32v b )
-        {
-            return vreinterpretq_s32_u32( vcgeq_f32( a, b ) );
-        }
-
-        FS_INLINE static mask32v LessEqualThan_f32( float32v a, float32v b )
-        {
-            return vreinterpretq_s32_u32( vcleq_f32( a, b ) );
-        }
-
-        FS_INLINE static mask32v Equal_i32( int32v a, int32v b )
-        {
-            return vreinterpretq_s32_u32( vceqq_s32( a, b ) );
-        }
-
-        FS_INLINE static mask32v GreaterThan_i32( int32v a, int32v b )
-        {
-            return vreinterpretq_s32_u32( vcgtq_s32( a, b ) );
-        }
-
-        FS_INLINE static mask32v LessThan_i32( int32v a, int32v b )
-        {
-            return vreinterpretq_s32_u32( vcltq_s32( a, b ) );
-        }
-
-        // Select
-
-        FS_INLINE static float32v Select_f32( mask32v m, float32v a, float32v b )
-        {
-            return vbslq_f32( vreinterpretq_u32_s32( m ), a, b );
-        }
-        FS_INLINE static int32v Select_i32( mask32v m, int32v a, int32v b )
-        {
-            return vbslq_s32( vreinterpretq_u32_s32( m ), a, b );
-        }
-
-        // Min, Max
-
-        FS_INLINE static float32v Min_f32( float32v a, float32v b )
-        {
-            return vminq_f32( a, b );
-        }
-
-        FS_INLINE static float32v Max_f32( float32v a, float32v b )
-        {
-            return vmaxq_f32( a, b );
-        }
-
-        FS_INLINE static int32v Min_i32( int32v a, int32v b )
-        {
-            return vminq_s32( a, b );
-        }
-
-        FS_INLINE static int32v Max_i32( int32v a, int32v b )
-        {
-            return vmaxq_s32( a, b );
-        }
-        
-        // Bitwise
-
-        FS_INLINE static float32v BitwiseAnd_f32( float32v a, float32v b )
-        {
-            return vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32( a ), vreinterpretq_u32_f32( b ) ) );
-        }
-/*
-        FS_INLINE static float32v BitwiseOr_f32( float32v a, float32v b )
-        {
-            return vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32( a ), vreinterpretq_u32_f32( b ) ) );
-        }
-
-        FS_INLINE static float32v BitwiseXor_f32( float32v a, float32v b )
-        {
-            return vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32( a ), vreinterpretq_u32_f32( b ) ) );
-        }
-
-        FS_INLINE static float32v BitwiseNot_f32( float32v a )
-        {
-            return vreinterpretq_f32_u32( vmvnq_u32( vreinterpretq_u32_f32( a ) ) );
-        }
-*/
-        FS_INLINE static float32v BitwiseAndNot_f32( float32v a, float32v b )
-        {
-            return vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32( a ), vmvnq_u32( vreinterpretq_u32_f32( b ) ) ) );
-        }
-
-        FS_INLINE static int32v BitwiseAndNot_i32( int32v a, int32v b )
-        {
-            return vandq_s32( a , vmvnq_s32( b ) );
-        }
-
-        // Abs
-
-        FS_INLINE static float32v Abs_f32( float32v a )
-        {
-            return vabsq_f32( a );
-        }
-
-        FS_INLINE static int32v Abs_i32( int32v a )
-        {
-            return vabsq_s32( a );
-        }        
-
-        FS_INLINE static float32v InvSqrt_f32( float32v a )
-        {
-            return vrsqrteq_f32( a );
-        }        
-        
-        // Floor, Ceil, Round:
-
-#ifdef FASTSIMD_USE_ARMV7    
-        FS_INLINE static float32v IntFloor_f32(float32v a)
-        {
-            static const float32x4_t cmpval = vcvtq_f32_s32( vdupq_n_s32( 0x7FFFFFFF ) );            
-
-            uint32x4_t cmp1 = vcagtq_f32( a, cmpval );
-            uint32x4_t cmp2 = vcaleq_f32( a, cmpval );
-
-            float32x4_t tr = vcvtq_f32_s32( vcvtq_s32_f32( a ) );
-
-            uint32x4_t xcmp1 = vandq_u32(cmp1, vreinterpretq_u32_f32( a ) );
-            uint32x4_t xcmp2 = vandq_u32(cmp2, vreinterpretq_u32_f32( tr ) );
-
-            uint32x4_t res0 = vorrq_u32( xcmp1, xcmp2 );
-
-            float32x4_t res1 = vreinterpretq_f32_u32( res0 );
-            
-            return res1;
-        }
-    
-        FS_INLINE static float32v Floor_f32(float32v a)
-        {
-            static const float32x4_t zerox = vdupq_n_f32( 0 );
-
-            float32x4_t ifl = IntFloor_f32(a);
-
-            uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl));
-            uint32x4_t cond2 = vcltq_f32(a, zerox);
-
-            uint32x4_t cmpmask = vandq_u32(cond1, cond2);
-            float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) );
-
-            float32x4_t ret0 = vaddq_f32(ifl, addx);
-
-            return ret0;
-        }
-
-        FS_INLINE static float32v Ceil_f32(float32v a)
-        {
-            static const float32x4_t zerox = vdupq_n_f32( 0 );
-
-            float32x4_t ifl = IntFloor_f32(a);
-            
-            uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl));
-            uint32x4_t cond2 = vcgeq_f32(a, zerox);
-
-            uint32x4_t cmpmask = vandq_u32(cond1, cond2);
-            float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) );
-            
-
-            float32x4_t ret0 = vsubq_f32(ifl, addx);
-
-            return ret0;
-        }
-
-        FS_INLINE static float32v Round_f32(float32v a)
-        {
-            static const float32x4_t zerox = vdupq_n_f32( 0 );
-            static const float32x4_t halfx = vdupq_n_f32( 0.5f );
-            static const float32x4_t onex = vdupq_n_f32( 1.0f );
-        
-            float32x4_t a2 = vaddq_f32(vabsq_f32(a), halfx);
-            float32x4_t ifl = IntFloor_f32(a2);          
-            
-            uint32x4_t cmpmask = vcltq_f32(a, zerox);
-            float32x4_t rhs = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) );
-            float32x4_t rhs2 = vaddq_f32(vmulq_n_f32(rhs, 2.0f), onex);            
-
-            return vmulq_f32(ifl, rhs2);
-        }
-        
-        FS_INLINE static float32v Sqrt_f32( float32v a )
-        {
-            return Reciprocal_f32(InvSqrt_f32(a));
-        }
-
-    #else
-        FS_INLINE static float32v Floor_f32( float32v a )
-        {
-            return vrndmq_f32( a );
-        }
-
-        FS_INLINE static float32v Ceil_f32( float32v a )
-        {
-            return vrndpq_f32( a );
-        }
-
-        FS_INLINE static float32v Round_f32( float32v a )
-        {
-            return vrndnq_f32( a );
-        }
-
-        FS_INLINE static float32v Sqrt_f32( float32v a )
-        {
-            return vsqrtq_f32( a );
-        }
-    #endif      
-        
-        // Mask
-
-        FS_INLINE static int32v Mask_i32( int32v a, mask32v m )
-        {
-            return a & m;
-        }
-
-        FS_INLINE static int32v NMask_i32( int32v a, mask32v m )
-        {
-            return BitwiseAndNot_i32(a, m);
-        }
-
-        FS_INLINE static float32v Mask_f32( float32v a, mask32v m )
-        {
-            return BitwiseAnd_f32( a, vreinterpretq_f32_s32( m ) );
-        }
-
-        FS_INLINE static float32v NMask_f32( float32v a, mask32v m )
-        {
-            return BitwiseAndNot_f32( a, vreinterpretq_f32_s32( m ) );
-        }
-        
-        FS_INLINE static float Extract0_f32( float32v a )
-        {
-            return vgetq_lane_f32(a, 0);
-        }
-
-        FS_INLINE static int32_t Extract0_i32( int32v a )
-        {
-            return vgetq_lane_s32(a, 0);
-        }
-
-        FS_INLINE static float32v Reciprocal_f32( float32v a )
-        {            
-            return vrecpeq_f32( a );
-        }
-
-        FS_INLINE static float32v BitwiseShiftRightZX_f32( float32v a, int32_t b )
-        {
-            int32x4_t rhs2 = vdupq_n_s32( -b );
-            return vreinterpretq_f32_u32 ( vshlq_u32( vreinterpretq_u32_f32(a), rhs2) );
-        }
-        
-        FS_INLINE static int32v BitwiseShiftRightZX_i32( int32v a, int32_t b )
-        {
-            int32x4_t rhs2 = vdupq_n_s32( -b );
-            return vreinterpretq_s32_u32 (vshlq_u32( vreinterpretq_u32_s32(a), rhs2));
-        }
-        FS_INLINE static bool AnyMask_bool( mask32v m )
-        {
-            uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_s32(m)), vget_high_u32(vreinterpretq_u32_s32(m)));
-            return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
-        }
-    };
-    
-#if FASTSIMD_COMPILE_NEON
-    typedef NEON_T<Level_NEON> NEON;
-#endif
-}
diff --git a/src/FastSIMD/Internal/SSE.h b/src/FastSIMD/Internal/SSE.h
deleted file mode 100644
index 664a005b..00000000
--- a/src/FastSIMD/Internal/SSE.h
+++ /dev/null
@@ -1,574 +0,0 @@
-#pragma once
-
-#ifdef __GNUG__
-#include <x86intrin.h>
-#else
-#include <intrin.h>
-#endif
-
-#include "VecTools.h"
-
-namespace FastSIMD
-{
-    struct SSE_f32x4
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( SSE_f32x4, __m128 );
-
-        FS_INLINE static SSE_f32x4 Incremented()
-        {
-            return _mm_set_ps( 3.0f, 2.0f, 1.0f, 0.0f );
-        }
-
-        FS_INLINE explicit SSE_f32x4( float f )
-        {
-            *this = _mm_set1_ps( f );
-        }
-
-        FS_INLINE explicit SSE_f32x4( float f0, float f1, float f2, float f3 )
-        {
-            *this = _mm_set_ps( f3, f2, f1, f0 );
-        }
-
-        FS_INLINE SSE_f32x4& operator+=( const SSE_f32x4& rhs )
-        {
-            *this = _mm_add_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_f32x4& operator-=( const SSE_f32x4& rhs )
-        {
-            *this = _mm_sub_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_f32x4& operator*=( const SSE_f32x4& rhs )
-        {
-            *this = _mm_mul_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_f32x4& operator/=( const SSE_f32x4& rhs )
-        {
-            *this = _mm_div_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_f32x4& operator&=( const SSE_f32x4& rhs )
-        {
-            *this = _mm_and_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_f32x4& operator|=( const SSE_f32x4& rhs )
-        {
-            *this = _mm_or_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_f32x4& operator^=( const SSE_f32x4& rhs )
-        {
-            *this = _mm_xor_ps( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_f32x4 operator~() const
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m128i neg1 = _mm_cmpeq_epi32( _mm_setzero_si128(), _mm_setzero_si128() );
-#else
-            const __m128i neg1 = _mm_set1_epi32( -1 );
-#endif
-            return _mm_xor_ps( *this, _mm_castsi128_ps( neg1 ) );
-        }
-
-        FS_INLINE SSE_f32x4 operator-() const
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m128i minInt = _mm_slli_epi32( _mm_cmpeq_epi32( _mm_undefined_si128(), _mm_setzero_si128() ), 31 );
-#else
-            const __m128i minInt = _mm_set1_epi32( 0x80000000 );
-#endif
-            return _mm_xor_ps( *this, _mm_castsi128_ps( minInt ) );
-        }
-
-        FS_INLINE __m128i operator==( const SSE_f32x4& rhs )
-        {
-            return _mm_castps_si128( _mm_cmpeq_ps( *this, rhs ) );
-        }
-
-        FS_INLINE __m128i operator!=( const SSE_f32x4& rhs )
-        {
-            return _mm_castps_si128( _mm_cmpneq_ps( *this, rhs ) );
-        }
-
-        FS_INLINE __m128i operator>( const SSE_f32x4& rhs )
-        {
-            return _mm_castps_si128( _mm_cmpgt_ps( *this, rhs ) );
-        }
-
-        FS_INLINE __m128i operator<( const SSE_f32x4& rhs )
-        {
-            return _mm_castps_si128( _mm_cmplt_ps( *this, rhs ) );
-        }
-
-        FS_INLINE __m128i operator>=( const SSE_f32x4& rhs )
-        {
-            return _mm_castps_si128( _mm_cmpge_ps( *this, rhs ) );
-        }
-
-        FS_INLINE __m128i operator<=( const SSE_f32x4& rhs )
-        {
-            return _mm_castps_si128( _mm_cmple_ps( *this, rhs ) );
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_FLOAT( SSE_f32x4 )
-
-
-    template<eLevel LEVEL_T>
-    struct SSE_i32x4
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( SSE_i32x4, __m128i );
-
-        FS_INLINE static SSE_i32x4 Incremented()
-        {
-            return _mm_set_epi32( 3, 2, 1, 0 );
-        }
-
-        FS_INLINE explicit SSE_i32x4( int32_t i )
-        {
-            *this = _mm_set1_epi32( i );
-        }
-
-        FS_INLINE explicit SSE_i32x4( int32_t i0, int32_t i1, int32_t i2, int32_t i3 )
-        {
-            *this = _mm_set_epi32( i3, i2, i1, i0 );
-        }
-
-        FS_INLINE SSE_i32x4& operator+=( const SSE_i32x4& rhs )
-        {
-            *this = _mm_add_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_i32x4& operator-=( const SSE_i32x4& rhs )
-        {
-            *this = _mm_sub_epi32( *this, rhs );
-            return *this;
-        }
-
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L < Level_SSE41)>* = nullptr>
-        FS_INLINE SSE_i32x4& operator*=( const SSE_i32x4& rhs )
-        {
-            __m128i tmp1 = _mm_mul_epu32( *this, rhs ); /* mul 2,0*/
-            __m128i tmp2 = _mm_mul_epu32( _mm_srli_si128( *this, 4 ), _mm_srli_si128( rhs, 4 ) ); /* mul 3,1 */
-            *this = _mm_unpacklo_epi32( _mm_shuffle_epi32( tmp1, _MM_SHUFFLE( 0, 0, 2, 0 ) ), _mm_shuffle_epi32( tmp2, _MM_SHUFFLE( 0, 0, 2, 0 ) ) ); /* shuffle results to [63..0] and pack */
-            return *this;
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L >= Level_SSE41)>* = nullptr>
-        FS_INLINE SSE_i32x4& operator*=( const SSE_i32x4& rhs )
-        {
-            *this = _mm_mullo_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_i32x4& operator&=( const SSE_i32x4& rhs )
-        {
-            *this = _mm_and_si128( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_i32x4& operator|=( const SSE_i32x4& rhs )
-        {
-            *this = _mm_or_si128( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_i32x4& operator^=( const SSE_i32x4& rhs )
-        {
-            *this = _mm_xor_si128( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_i32x4& operator>>=( int32_t rhs )
-        {
-            *this = _mm_srai_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_i32x4& operator<<=( int32_t rhs )
-        {
-            *this = _mm_slli_epi32( *this, rhs );
-            return *this;
-        }
-
-        FS_INLINE SSE_i32x4 operator~() const
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m128i neg1 = _mm_cmpeq_epi32( _mm_setzero_si128(), _mm_setzero_si128() );
-#else
-            const __m128i neg1 = _mm_set1_epi32( -1 );
-#endif
-            return _mm_xor_si128( *this, neg1 );
-        }
-
-        FS_INLINE SSE_i32x4 operator-() const
-        {
-            return _mm_sub_epi32( _mm_setzero_si128(), *this );
-        }
-
-        FS_INLINE SSE_i32x4 operator==( const SSE_i32x4& rhs )
-        {
-            return _mm_cmpeq_epi32( *this, rhs );
-        }
-
-        FS_INLINE SSE_i32x4 operator>( const SSE_i32x4& rhs )
-        {
-            return _mm_cmpgt_epi32( *this, rhs );
-        }
-
-        FS_INLINE SSE_i32x4 operator<( const SSE_i32x4& rhs )
-        {
-            return _mm_cmplt_epi32( *this, rhs );
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_INT_TEMPLATED( SSE_i32x4, int32_t )
-
-    template<eLevel LEVEL_T>
-    class SSE_T
-    {
-    public:
-        static_assert( LEVEL_T >= Level_SSE && LEVEL_T <= Level_SSE42, "Cannot create template with unsupported SIMD level" );
-
-        static constexpr eLevel SIMD_Level = LEVEL_T;
-
-        template<size_t ElementSize>
-        static constexpr size_t VectorSize = (128 / 8) / ElementSize;
-
-        typedef SSE_f32x4          float32v;
-        typedef SSE_i32x4<LEVEL_T> int32v;
-        typedef SSE_i32x4<LEVEL_T> mask32v;
-
-        // Load
-
-        FS_INLINE static float32v Load_f32( void const* p )
-        {
-            return _mm_loadu_ps( reinterpret_cast<float const*>(p) );
-        }
-
-        FS_INLINE static int32v Load_i32( void const* p )
-        {
-            return _mm_loadu_si128( reinterpret_cast<__m128i const*>(p) );
-        }
-
-        // Store
-
-        FS_INLINE static void Store_f32( void* p, float32v a )
-        {
-            _mm_storeu_ps( reinterpret_cast<float*>(p), a );
-        }
-
-        FS_INLINE static void Store_i32( void* p, int32v a )
-        {
-            _mm_storeu_si128( reinterpret_cast<__m128i*>(p), a );
-        }
-
-        // Extract
-
-        FS_INLINE static float Extract0_f32( float32v a )
-        {
-            return _mm_cvtss_f32( a );
-        }
-
-        FS_INLINE static int32_t Extract0_i32( int32v a )
-        {
-            return _mm_cvtsi128_si32( a );
-        }
-
-        FS_INLINE static float Extract_f32( float32v a, size_t idx )
-        {
-            float f[4];
-            Store_f32( &f, a );
-            return f[idx & 3];
-        }
-
-        FS_INLINE static int32_t Extract_i32( int32v a, size_t idx )
-        {
-            int32_t i[4];
-            Store_i32( &i, a );
-            return i[idx & 3];
-        }
-
-        // Cast
-
-        FS_INLINE static float32v Casti32_f32( int32v a )
-        {
-            return _mm_castsi128_ps( a );
-        }
-
-        FS_INLINE static int32v Castf32_i32( float32v a )
-        {
-            return _mm_castps_si128( a );
-        }
-
-        // Convert
-
-        FS_INLINE static float32v Converti32_f32( int32v a )
-        {
-            return _mm_cvtepi32_ps( a );
-        }
-
-        FS_INLINE static int32v Convertf32_i32( float32v a )
-        {
-            return _mm_cvtps_epi32( a );
-        }
-
-        // Select
-
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L < Level_SSE41)>* = nullptr>
-        FS_INLINE static float32v Select_f32( mask32v m, float32v a, float32v b )
-        {
-            __m128 mf = _mm_castsi128_ps( m );
-
-            return _mm_xor_ps( b, _mm_and_ps( mf, _mm_xor_ps( a, b ) ) );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L >= Level_SSE41)>* = nullptr>
-        FS_INLINE static float32v Select_f32( mask32v m, float32v a, float32v b )
-        {
-            return  _mm_blendv_ps( b, a, _mm_castsi128_ps( m ) );
-        }
-
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L < Level_SSE41)>* = nullptr>
-        FS_INLINE static int32v Select_i32( mask32v m, int32v a, int32v b )
-        {
-            return _mm_xor_si128( b, _mm_and_si128( m, _mm_xor_si128( a, b ) ) );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L >= Level_SSE41)>* = nullptr>
-        FS_INLINE static int32v Select_i32( mask32v m, int32v a, int32v b )
-        {
-            return _mm_castps_si128( _mm_blendv_ps( _mm_castsi128_ps( b ), _mm_castsi128_ps( a ), _mm_castsi128_ps( m ) ) );
-        }
-
-        // Min, Max
-
-        FS_INLINE static float32v Min_f32( float32v a, float32v b )
-        {
-            return _mm_min_ps( a, b );
-        }
-
-        FS_INLINE static float32v Max_f32( float32v a, float32v b )
-        {
-            return _mm_max_ps( a, b );
-        }
-
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L < Level_SSE41)>* = nullptr>
-        FS_INLINE static int32v Min_i32( int32v a, int32v b )
-        {
-            return Select_i32( a < b, a, b );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L >= Level_SSE41)>* = nullptr>
-        FS_INLINE static int32v Min_i32( int32v a, int32v b )
-        {
-            return _mm_min_epi32( a, b );
-        }
-
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L < Level_SSE41)>* = nullptr>
-        FS_INLINE static int32v Max_i32( int32v a, int32v b )
-        {
-            return Select_i32( a > b, a, b );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L >= Level_SSE41)>* = nullptr>
-        FS_INLINE static int32v Max_i32( int32v a, int32v b )
-        {
-            return _mm_max_epi32( a, b );
-        }
-
-        // Bitwise
-
-        FS_INLINE static float32v BitwiseAndNot_f32( float32v a, float32v b )
-        {
-            return _mm_andnot_ps( b, a );
-        }
-
-        FS_INLINE static int32v BitwiseAndNot_i32( int32v a, int32v b )
-        {
-            return _mm_andnot_si128( b, a );
-        }
-
-        FS_INLINE static float32v BitwiseShiftRightZX_f32( float32v a, int32_t b )
-        {
-            return Casti32_f32( _mm_srli_epi32( Castf32_i32( a ), b ) );
-        }
-
-        FS_INLINE static int32v BitwiseShiftRightZX_i32( int32v a, int32_t b )
-        {
-            return _mm_srli_epi32( a, b );
-        }
-
-        // Abs
-
-        FS_INLINE static float32v Abs_f32( float32v a )
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m128i intMax = _mm_srli_epi32( _mm_cmpeq_epi32( _mm_setzero_si128(), _mm_setzero_si128() ), 1 );
-#else
-            const __m128i intMax = _mm_set1_epi32( 0x7FFFFFFF );
-#endif
-            return _mm_and_ps( a, _mm_castsi128_ps( intMax ) );
-        }
-
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L < Level_SSSE3)>* = nullptr>
-        FS_INLINE static int32v Abs_i32( int32v a )
-        {
-            __m128i signMask = _mm_srai_epi32( a, 31 );
-            return _mm_sub_epi32( _mm_xor_si128( a, signMask ), signMask );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L >= Level_SSSE3)>* = nullptr>
-        FS_INLINE static int32v Abs_i32( int32v a )
-        {
-            return _mm_abs_epi32( a );
-        }
-
-        // Float math
-
-        FS_INLINE static float32v Sqrt_f32( float32v a )
-        {
-            return _mm_sqrt_ps( a );
-        }
-
-        FS_INLINE static float32v InvSqrt_f32( float32v a )
-        {
-            return _mm_rsqrt_ps( a );
-        }
-
-        FS_INLINE static float32v Reciprocal_f32( float32v a )
-        {
-            return _mm_rcp_ps( a );
-        }
-
-        // Floor, Ceil, Round: http://dss.stephanierct.com/DevBlog/?p=8
-
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L < Level_SSE41)>* = nullptr>
-        FS_INLINE static float32v Floor_f32( float32v a )
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m128 f1 = _mm_castsi128_ps( _mm_slli_epi32( _mm_srli_epi32( _mm_cmpeq_epi32( _mm_setzero_si128(), _mm_setzero_si128() ), 25 ), 23 ) );
-#else
-            const __m128 f1 = _mm_set1_ps( 1.0f );
-#endif
-            __m128 fval = _mm_cvtepi32_ps( _mm_cvttps_epi32( a ) );
-
-            return _mm_sub_ps( fval, _mm_and_ps( _mm_cmplt_ps( a, fval ), f1 ) );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L >= Level_SSE41)>* = nullptr>
-        FS_INLINE static float32v Floor_f32( float32v a )
-        {
-            return _mm_round_ps( a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L < Level_SSE41)>* = nullptr>
-        FS_INLINE static float32v Ceil_f32( float32v a )
-        {
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m128 f1 = _mm_castsi128_ps( _mm_slli_epi32( _mm_srli_epi32( _mm_cmpeq_epi32( _mm_setzero_si128(), _mm_setzero_si128() ), 25 ), 23 ) );
-#else
-            const __m128 f1 = _mm_set1_ps( 1.0f );
-#endif
-            __m128 fval = _mm_cvtepi32_ps( _mm_cvttps_epi32( a ) );
-            __m128 cmp = _mm_cmplt_ps( fval, a );
-            return _mm_add_ps( fval, _mm_and_ps( cmp, f1 ) );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L >= Level_SSE41)>* = nullptr>
-        FS_INLINE static float32v Ceil_f32( float32v a )
-        {
-            return _mm_round_ps( a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L < Level_SSE41)>* = nullptr>
-        FS_INLINE static float32v Round_f32( float32v a )
-        {
-            __m128 aSign = _mm_and_ps( a, _mm_castsi128_ps( int32v( 0x80000000 ) ) );
-
-            return _mm_cvtepi32_ps( _mm_cvttps_epi32( a + float32v(_mm_or_ps( aSign, float32v( 0.5f ) ) ) ) );
-
-#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
-            const __m128 nearest2 = _mm_castsi128_ps( _mm_srli_epi32( _mm_cmpeq_epi32( _mm_setzero_si128(), _mm_setzero_si128() ), 2 ) );
-#else
-            const __m128 nearest2 = _mm_set1_ps( 1.99999988079071044921875f );
-#endif
-            __m128 aTrunc = _mm_cvtepi32_ps( _mm_cvttps_epi32( a ) );       // truncate a
-            __m128 rmd = _mm_sub_ps( a, aTrunc );                           // get remainder
-            __m128 rmd2 = _mm_mul_ps( rmd, nearest2 );                   // mul remainder by near 2 will yield the needed offset
-            __m128 rmd2Trunc = _mm_cvtepi32_ps( _mm_cvttps_epi32( rmd2 ) ); // after being truncated of course
-            return _mm_add_ps( aTrunc, rmd2Trunc );
-        }
-        
-        template<eLevel L = LEVEL_T, std::enable_if_t<(L >= Level_SSE41)>* = nullptr>
-        FS_INLINE static float32v Round_f32( float32v a )
-        {
-            return _mm_round_ps( a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
-        }
-
-        // Mask
-
-        FS_INLINE static int32v Mask_i32( int32v a, mask32v m )
-        {
-            return a & m;
-        }
-
-        FS_INLINE static float32v Mask_f32( float32v a, mask32v m )
-        {
-            return _mm_and_ps( a, _mm_castsi128_ps( m ) );
-        }
-
-        FS_INLINE static int32v NMask_i32( int32v a, mask32v m )
-        {
-            return _mm_andnot_si128( m, a );
-        }
-
-        FS_INLINE static float32v NMask_f32( float32v a, mask32v m )
-        {
-            return _mm_andnot_ps( _mm_castsi128_ps( m ), a );
-        }
-
-        template<eLevel L = LEVEL_T, std::enable_if_t<( L < Level_SSE41 )>* = nullptr>
-        FS_INLINE static bool AnyMask_bool( mask32v m )
-        {
-            return _mm_movemask_ps( _mm_castsi128_ps( m ) );
-        }
-
-        template<eLevel L = LEVEL_T, std::enable_if_t<( L >= Level_SSE41 )>* = nullptr>
-        FS_INLINE static bool AnyMask_bool( mask32v m )
-        {
-            return !_mm_testz_si128( m, m );
-        }
-    };
-
-#if FASTSIMD_COMPILE_SSE
-    typedef SSE_T<Level_SSE>   SSE;
-#endif
-#if FASTSIMD_COMPILE_SSE2
-    typedef SSE_T<Level_SSE2>  SSE2;
-#endif
-#if FASTSIMD_COMPILE_SSE3
-    typedef SSE_T<Level_SSE3>  SSE3;
-#endif
-#if FASTSIMD_COMPILE_SSSE3
-    typedef SSE_T<Level_SSSE3> SSSE3;
-#endif
-#if FASTSIMD_COMPILE_SSE41
-    typedef SSE_T<Level_SSE41> SSE41;
-#endif
-#if FASTSIMD_COMPILE_SSE42
-    typedef SSE_T<Level_SSE42> SSE42;
-#endif
-}
diff --git a/src/FastSIMD/Internal/Scalar.h b/src/FastSIMD/Internal/Scalar.h
deleted file mode 100644
index 23c44529..00000000
--- a/src/FastSIMD/Internal/Scalar.h
+++ /dev/null
@@ -1,451 +0,0 @@
-#pragma once
-
-#include "VecTools.h"
-#include <algorithm>
-#include <cmath>
-
-namespace FastSIMD
-{
-    template<typename OUT, typename IN>
-    OUT ScalarCast( IN a )
-    {
-        union
-        {
-            OUT o;
-            IN  i;
-        } u;
-
-        u.i = a;
-        return u.o;
-    }
-
-    struct Scalar_Float
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( Scalar_Float, float );
-
-        FS_INLINE static Scalar_Float Incremented()
-        {
-            return 0.0f;
-        }
-
-        FS_INLINE Scalar_Float& operator+=( const Scalar_Float& rhs )
-        {
-            vector += rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Float& operator-=( const Scalar_Float& rhs )
-        {
-            vector -= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Float& operator*=( const Scalar_Float& rhs )
-        {
-            vector *= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Float& operator/=( const Scalar_Float& rhs )
-        {
-            vector /= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Float& operator&=( const Scalar_Float& rhs )
-        {
-            *this = ScalarCast<float>( ScalarCast<int32_t, float>( *this ) & ScalarCast<int32_t, float>( rhs ) );
-            return *this;
-        }
-
-        FS_INLINE Scalar_Float& operator|=( const Scalar_Float& rhs )
-        {
-            *this = ScalarCast<float>( ScalarCast<int32_t, float>( *this ) | ScalarCast<int32_t, float>( rhs ) );
-            return *this;
-        }
-
-        FS_INLINE Scalar_Float& operator^=( const Scalar_Float& rhs )
-        {
-            *this = ScalarCast<float>( ScalarCast<int32_t, float>( *this ) ^ ScalarCast<int32_t, float>( rhs ) );
-            return *this;
-        }
-
-        FS_INLINE Scalar_Float operator~() const
-        {
-            return ScalarCast<float>( ~ScalarCast<int32_t, float>( *this ) );
-        }
-
-        FS_INLINE Scalar_Float operator-() const
-        {
-            return -vector;
-        }
-
-        FS_INLINE bool operator==( const Scalar_Float& rhs )
-        {
-            return vector == rhs;
-        }
-
-        FS_INLINE bool operator!=( const Scalar_Float& rhs )
-        {
-            return vector != rhs;
-        }
-
-        FS_INLINE bool operator>( const Scalar_Float& rhs )
-        {
-            return vector > rhs;
-        }
-
-        FS_INLINE bool operator<( const Scalar_Float& rhs )
-        {
-            return vector < rhs;
-        }
-
-        FS_INLINE bool operator>=( const Scalar_Float& rhs )
-        {
-            return vector >= rhs;
-        }
-
-        FS_INLINE bool operator<=( const Scalar_Float& rhs )
-        {
-            return vector <= rhs;
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_FLOAT( Scalar_Float )
-
-
-    struct Scalar_Int
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( Scalar_Int, int32_t );
-
-        FS_INLINE static Scalar_Int Incremented()
-        {
-            return 0;
-        }
-
-        FS_INLINE Scalar_Int& operator+=( const Scalar_Int& rhs )
-        {
-            vector += rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Int& operator-=( const Scalar_Int& rhs )
-        {
-            vector -= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Int& operator*=( const Scalar_Int& rhs )
-        {
-            vector *= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Int& operator&=( const Scalar_Int& rhs )
-        {
-            vector &= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Int& operator|=( const Scalar_Int& rhs )
-        {
-            vector |= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Int& operator^=( const Scalar_Int& rhs )
-        {
-            vector ^= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Int& operator>>=( int32_t rhs )
-        {
-            vector >>= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Int& operator<<=( int32_t rhs )
-        {
-            vector <<= rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Int operator~() const
-        {
-            return ~vector;
-        }
-
-        FS_INLINE Scalar_Int operator-() const
-        {
-            return -vector;
-        }
-
-        FS_INLINE bool operator==( const Scalar_Int& rhs )
-        {
-            return vector == rhs;
-        }
-
-        FS_INLINE bool operator>( const Scalar_Int& rhs )
-        {
-            return vector > rhs;
-        }
-
-        FS_INLINE bool operator<( const Scalar_Int& rhs )
-        {
-            return vector < rhs;
-        }
-    };
-
-    FASTSIMD_INTERNAL_OPERATORS_INT( Scalar_Int, int32_t )
-
-
-    struct Scalar_Mask
-    {
-        FASTSIMD_INTERNAL_TYPE_SET( Scalar_Mask, bool );
-
-        FS_INLINE Scalar_Mask operator~() const
-        {
-            return !vector;
-        }
-
-        FS_INLINE Scalar_Mask& operator&=( const Scalar_Mask& rhs )
-        {
-            vector = vector && rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Mask& operator|=( const Scalar_Mask& rhs )
-        {
-            vector = vector || rhs;
-            return *this;
-        }
-
-        FS_INLINE Scalar_Mask operator&( const Scalar_Mask& rhs )
-        {
-            return vector && rhs;
-        }
-
-        FS_INLINE Scalar_Mask operator|( const Scalar_Mask& rhs )
-        {
-            return vector || rhs;
-        }
-    };
-
-    class Scalar
-    {
-    public:
-        static constexpr eLevel SIMD_Level = FastSIMD::Level_Scalar;
-
-        template<size_t ElementSize = 8>
-        static constexpr size_t VectorSize = sizeof(int32_t) / ElementSize;
-
-        typedef Scalar_Float float32v;
-        typedef Scalar_Int   int32v;
-        typedef Scalar_Mask  mask32v;
-
-        // Load
-
-        FS_INLINE static float32v Load_f32( void const* p )
-        {
-            return *reinterpret_cast<float32v const*>(p);
-        }
-
-        FS_INLINE static int32v Load_i32( void const* p )
-        {
-            return *reinterpret_cast<int32v const*>(p);
-        }
-
-        // Store
-
-        FS_INLINE static void Store_f32( void* p, float32v a )
-        {
-            *reinterpret_cast<float32v*>(p) = a;
-        }
-
-        FS_INLINE static void Store_i32( void* p, int32v a )
-        {
-            *reinterpret_cast<int32v*>(p) = a;
-        }
-
-        // Extract
-
-        FS_INLINE static float Extract0_f32( float32v a )
-        {
-            return a;
-        }
-
-        FS_INLINE static int32_t Extract0_i32( int32v a )
-        {
-            return a;
-        }
-
-        FS_INLINE static float Extract_f32( float32v a, size_t idx )
-        {
-            return a;
-        }
-
-        FS_INLINE static int32_t Extract_i32( int32v a, size_t idx )
-        {
-            return a;
-        }
-
-        // Cast
-
-        FS_INLINE static float32v Casti32_f32( int32v a )
-        {
-            return ScalarCast<float, int32_t>( a );
-        }
-
-        FS_INLINE static int32v Castf32_i32( float32v a )
-        {
-            return ScalarCast<int32_t, float>( a );
-        }
-
-        // Convert
-
-        FS_INLINE static float32v Converti32_f32( int32v a )
-        {
-            return static_cast<float>(a);
-        }
-
-        FS_INLINE static int32v Convertf32_i32( float32v a )
-        {
-            return static_cast<int32_t>(rintf( a ));
-        }
-
-        // Select
-
-        FS_INLINE static float32v Select_f32( mask32v m, float32v a, float32v b )
-        {
-            return m ? a : b;
-        }
-
-        FS_INLINE static int32v Select_i32( mask32v m, int32v a, int32v b )
-        {
-            return m ? a : b;
-        }
-
-        // Min, Max
-
-        FS_INLINE static float32v Min_f32( float32v a, float32v b )
-        {
-            return fminf( a, b );
-        }
-
-        FS_INLINE static float32v Max_f32( float32v a, float32v b )
-        {
-            return fmaxf( a, b );
-        }
-
-        FS_INLINE static int32v Min_i32( int32v a, int32v b )
-        {
-            return std::min( a, b );
-        }
-
-        FS_INLINE static int32v Max_i32( int32v a, int32v b )
-        {
-            return std::max( a, b );
-        }
-
-        // Bitwise       
-
-        FS_INLINE static float32v BitwiseAndNot_f32( float32v a, float32v b )
-        {
-            return Casti32_f32( Castf32_i32( a ) & ~Castf32_i32( b ) );
-        }
-
-        FS_INLINE static int32v BitwiseAndNot_i32( int32v a, int32v b )
-        {
-            return a & ~b;
-        }
-
-        FS_INLINE static float32v BitwiseShiftRightZX_f32( float32v a, int32_t b )
-        {
-            return Casti32_f32( int32_t( uint32_t( Castf32_i32( a ) ) >> b ) );
-        }
-
-        FS_INLINE static int32v BitwiseShiftRightZX_i32( int32v a, int32_t b )
-        {
-            return int32_t( uint32_t( a ) >> b );
-        }
-
-        // Abs
-
-        FS_INLINE static float32v Abs_f32( float32v a )
-        {
-            return fabsf( a );
-        }
-
-        FS_INLINE static int32v Abs_i32( int32v a )
-        {
-            return abs( a );
-        }
-
-        // Float math
-
-        FS_INLINE static float32v Sqrt_f32( float32v a )
-        {
-            return sqrtf( a );
-        }
-
-        FS_INLINE static float32v InvSqrt_f32( float32v a )
-        {
-            float xhalf = 0.5f * (float)a;
-            a = Casti32_f32( 0x5f3759df - ((int32_t)Castf32_i32( a ) >> 1) );
-            a *= (1.5f - xhalf * (float)a * (float)a);
-            return a;
-        }
-
-        FS_INLINE static float32v Reciprocal_f32( float32v a )
-        {
-            // pow( pow(x,-0.5), 2 ) = pow( x, -1 ) = 1.0 / x
-            a = Casti32_f32( (0xbe6eb3beU - (int32_t)Castf32_i32( a )) >> 1 );
-            return a * a;
-        }
-
-        // Floor, Ceil, Round
-
-        FS_INLINE static float32v Floor_f32( float32v a )
-        {
-            return floorf( a );
-        }
-
-        FS_INLINE static float32v Ceil_f32( float32v a )
-        {
-            return ceilf( a );
-        }
-
-        FS_INLINE static float32v Round_f32( float32v a )
-        {
-            return nearbyintf( a );
-        }
-
-        // Mask
-
-        FS_INLINE static int32v Mask_i32( int32v a, mask32v m )
-        {
-            return m ? a : int32v(0);
-        }
-
-        FS_INLINE static float32v Mask_f32( float32v a, mask32v m )
-        {
-            return m ? a : float32v(0);
-        }
-
-        FS_INLINE static int32v NMask_i32( int32v a, mask32v m )
-        {
-            return m ? int32v(0) : a;
-        }
-
-        FS_INLINE static float32v NMask_f32( float32v a, mask32v m )
-        {
-            return m ? float32v(0) : a;
-        }
-
-        FS_INLINE static bool AnyMask_bool( mask32v m )
-        {
-            return m;
-        }
-    };
-}
diff --git a/src/FastSIMD/Internal/SourceBuilder.inl b/src/FastSIMD/Internal/SourceBuilder.inl
deleted file mode 100644
index 34adda5e..00000000
--- a/src/FastSIMD/Internal/SourceBuilder.inl
+++ /dev/null
@@ -1,29 +0,0 @@
-#pragma once
-#include "FastSIMD/FastSIMD.h"
-
-template<typename CLASS, typename FS>
-class FS_T;
-
-template<typename CLASS, FastSIMD::eLevel LEVEL>
-CLASS* FastSIMD::ClassFactory( FastSIMD::MemoryAllocator allocator ) 
-{
-    if constexpr( ( CLASS::Supported_SIMD_Levels & LEVEL & FastSIMD::COMPILED_SIMD_LEVELS ) != 0 )
-    {
-        static_assert( std::is_base_of_v<CLASS, FS_T<CLASS, FS_SIMD_CLASS>> );
-
-        if( allocator )
-        {
-            void* alloc = allocator( sizeof( FS_T<CLASS, FS_SIMD_CLASS> ), alignof( FS_T<CLASS, FS_SIMD_CLASS> ) );
-            
-            return new( alloc ) FS_T<CLASS, FS_SIMD_CLASS>;
-        }
-
-        return new FS_T<CLASS, FS_SIMD_CLASS>;        
-    }
-    return nullptr; 
-}
-
-#define FASTSIMD_BUILD_CLASS( CLASS ) \
-template FASTSIMD_API CLASS* FastSIMD::ClassFactory<CLASS, FS_SIMD_CLASS::SIMD_Level>( FastSIMD::MemoryAllocator );
-
-#include "../FastSIMD_BuildList.inl"
diff --git a/src/FastSIMD/Internal/VecTools.h b/src/FastSIMD/Internal/VecTools.h
deleted file mode 100644
index d3c42581..00000000
--- a/src/FastSIMD/Internal/VecTools.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#pragma once
-
-#include <cinttypes>
-
-#include "FastSIMD/FastSIMD.h"
-#include "FastSIMD/FunctionList.h"
-
-#define FASTSIMD_INTERNAL_TYPE_SET( CLASS, TYPE )                           \
-TYPE vector;									                            \
-FS_INLINE CLASS() { }                                                       \
-FS_INLINE CLASS( const TYPE& v ) : vector(v) {};	                        \
-FS_INLINE CLASS& operator = ( const TYPE& v ) { vector = v; return *this; } \
-FS_INLINE operator TYPE() const { return vector; }
-
-#define FASTSIMD_INTERNAL_OPERATOR( TYPE, TYPE2, OPERATOR, OPERATOREQUALS )	\
-FS_INLINE static TYPE operator OPERATOR ( TYPE lhs, TYPE2 rhs )             \
-{											                                \
-    lhs OPERATOREQUALS rhs;								                    \
-    return lhs;								                                \
-}
-
-#define FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE, TYPE2, OPERATOR, OPERATOREQUALS ) \
-template<FastSIMD::eLevel L>                                                          \
-FS_INLINE static TYPE operator OPERATOR ( TYPE lhs, TYPE2 rhs )                       \
-{											                                          \
-    lhs OPERATOREQUALS rhs;								                              \
-    return lhs;								                                          \
-}
-
-#define FASTSIMD_INTERNAL_OPERATORS_FLOAT( TYPE )      \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, +, += ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, -, -= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, *, *= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, /, /= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, &, &= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, |, |= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, ^, ^= ) 
-
-#define FASTSIMD_INTERNAL_OPERATORS_FLOAT_TEMPLATED( TYPE )            \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, +, += ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, -, -= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, *, *= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, /, /= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, &, &= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, |, |= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, ^, ^= ) 
-
-#define FASTSIMD_INTERNAL_OPERATORS_INT( TYPE, TYPE2 ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, +, += ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, -, -= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, *, *= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, &, &= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, |, |= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, const TYPE&, ^, ^= ) \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, TYPE2, >>, >>= )     \
-FASTSIMD_INTERNAL_OPERATOR( TYPE, TYPE2, <<, <<= )
-
-#define FASTSIMD_INTERNAL_OPERATORS_INT_TEMPLATED( TYPE, TYPE2 )       \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, +, += ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, -, -= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, *, *= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, &, &= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, |, |= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, const TYPE<L>&, ^, ^= ) \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, TYPE2, >>, >>= )        \
-FASTSIMD_INTERNAL_OPERATOR_TEMPLATED( TYPE<L>, TYPE2, <<, <<= )
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d33881e2..56c75bf7 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -40,36 +40,3 @@ set_target_properties(FastNoiseCpp11Test PROPERTIES CXX_STANDARD 11)
 target_link_libraries(FastNoiseCpp11Test
     FastNoise
 )
-
-#add_executable(FastSIMDTest
-#    "SIMDUnitTest.cpp"
-#)
-#
-#if(NOT FASTSIMD_COMPILE_ARM)
-#
-#    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-#    set_source_files_properties("SIMDUnitTest.cpp" PROPERTIES COMPILE_FLAGS "/arch:AVX512")
-#
-#    elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-#        set_source_files_properties("SIMDUnitTest.cpp" PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512dq -mavx2 -mfma -msse4.2")
-#    endif()
-#
-#elseif(FASTSIMD_COMPILE_ARMV7)
-#
-#    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-#    set_source_files_properties("SIMDUnitTest.cpp" PROPERTIES COMPILE_FLAGS "/arch:NEON")
-#
-#    elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-#        set_source_files_properties("SIMDUnitTest.cpp" PROPERTIES COMPILE_FLAGS "-march=armv7-a -mfpu=neon")
-#    endif()
-#endif()
-#
-#
-#
-#target_link_libraries(FastSIMDTest
-#    FastNoise
-#)
-# 
-#add_dependencies(FastSIMDTest 
-#    FastNoise
-#)
diff --git a/tests/FastNoiseBenchmark.cpp b/tests/FastNoiseBenchmark.cpp
index 45484dab..cefce3ea 100644
--- a/tests/FastNoiseBenchmark.cpp
+++ b/tests/FastNoiseBenchmark.cpp
@@ -1,15 +1,17 @@
+#include <iostream>
+#include <ostream>
+
 #include <benchmark/benchmark.h>
 #include "FastNoise/FastNoise.h"
 #include "FastNoise/Metadata.h"
+#include "FastSIMD/FastSIMD_FastNoise_config.h"
 
-#include "../NoiseTool/DemoNodeTrees.inl"
-
-#include "magic_enum.h"
+#include "../tools/NodeEditor/util/DemoNodeTrees.inl"
 
 static const size_t gPositionCount = 8192;
 static float gPositionFloats[gPositionCount]; 
 
-FastNoise::SmartNode<> BuildGenerator( benchmark::State& state, const FastNoise::Metadata* metadata, FastSIMD::eLevel level )
+FastNoise::SmartNode<> BuildGenerator( benchmark::State& state, const FastNoise::Metadata* metadata, FastSIMD::FeatureSet level )
 {    
     FastNoise::SmartNode<> generator = metadata->CreateNode( level );
 
@@ -101,26 +103,10 @@ void BenchFastNoiseGenerator4D( benchmark::State& state, const FastNoise::SmartN
 }
 
 template<typename T>
-void RegisterBenchmarks( FastSIMD::eLevel level, const char* groupName, const char* name, T generatorFunc )
+void RegisterBenchmarks( FastSIMD::FeatureSet level, const char* groupName, const char* name, T generatorFunc )
 {
     std::string benchName = "0D/";
-
-#ifdef MAGIC_ENUM_SUPPORTED
-    auto enumName = magic_enum::flags::enum_name( level );
-    auto find = enumName.find( '_' );
-    if( find != std::string::npos )
-    {
-        benchName += enumName.data() + find + 1;
-    }
-    else
-    {
-        benchName += enumName;
-    }
-#else
-    benchName += std::to_string( (int)level );
-#endif
-
-
+    benchName += FastSIMD::GetFeatureSetString( level );  
     benchName += '/';
     benchName += groupName;
     benchName += '/';
@@ -138,6 +124,8 @@ void RegisterBenchmarks( FastSIMD::eLevel level, const char* groupName, const ch
 
 int main( int argc, char** argv )
 {
+    std::cout << "FastSIMD Max Supported Feature Set: " << FastSIMD::GetFeatureSetString( FastSIMD::DetectCpuMaxFeatureSet() ) << std::endl;
+
     benchmark::Initialize( &argc, argv );
 
     for( size_t idx = 0; idx < gPositionCount; idx++ )
@@ -145,18 +133,13 @@ int main( int argc, char** argv )
         gPositionFloats[idx] = (float)idx * 0.6f;
     }
     
-    for( FastSIMD::eLevel level = FastSIMD::CPUMaxSIMDLevel(); level != FastSIMD::Level_Null; level = (FastSIMD::eLevel)(level >> 1) )
+    for( auto level : FastSIMD::FastSIMD_FastNoise::CompiledFeatureSets::AsArray )
     {
-        if( !(level & FastSIMD::COMPILED_SIMD_LEVELS & FastNoise::SUPPORTED_SIMD_LEVELS) )
-        {
-            continue;
-        }
-
         for( const FastNoise::Metadata* metadata : FastNoise::Metadata::GetAll() )
         {
             const char* groupName = "Misc";
 
-            if( !metadata->groups.empty() )
+            if( metadata->groups.size() )
             {
                 groupName = metadata->groups[metadata->groups.size() - 1];
             }
diff --git a/tests/FastNoiseCpp11Include.cpp b/tests/FastNoiseCpp11Include.cpp
index aa01c9a7..556d1473 100644
--- a/tests/FastNoiseCpp11Include.cpp
+++ b/tests/FastNoiseCpp11Include.cpp
@@ -7,7 +7,7 @@ int main()
 {
     auto node = FastNoise::New<FastNoise::FractalFBm>();
 
-    std::cout << node->GetSIMDLevel() << std::endl;
+    std::cout << (unsigned)node->GetActiveFeatureSet() << std::endl;
 
     node->SetSource( FastNoise::New<FastNoise::Simplex>() );
     node->SetGain( FastNoise::New<FastNoise::Value>() );
@@ -16,7 +16,7 @@ int main()
 
     float noise[size * size];
 
-    node->GenUniformGrid2D( noise, 0, 0, size, size, 0.02f, 1337 );
+    node->GenUniformGrid2D( noise, 0, 0, size, size, 1337 );
 
     for( int i = 0; i < sizeof(noise) / sizeof(float); i++ )
     {
@@ -26,7 +26,6 @@ int main()
     std::cout << std::endl;
 
     // SmartNode down cast example
-#if !FASTNOISE_USE_SHARED_PTR
     {
         // New Checkerboard node stored in base SmartNode type
         FastNoise::SmartNode<> base = FastNoise::New<FastNoise::Checkerboard>();
@@ -38,12 +37,11 @@ int main()
         auto checkerboard = FastNoise::SmartNode<FastNoise::Checkerboard>::DynamicCast( base );
 
         // Ok
-        checkerboard->SetSize( 8.0f );
+        checkerboard->SetScale( 8.0f );
 
         // Down cast to wrong type will return nullptr
         auto simplex = FastNoise::SmartNode<FastNoise::Simplex>::DynamicCast( base );
 
         std::cout << ( simplex ? "valid" : "nullptr" ) << std::endl;
     }
-#endif
 }
\ No newline at end of file
diff --git a/tests/SIMDUnitTest.cpp b/tests/SIMDUnitTest.cpp
deleted file mode 100644
index b6f897b6..00000000
--- a/tests/SIMDUnitTest.cpp
+++ /dev/null
@@ -1,293 +0,0 @@
-#include <cfloat>
-#include <climits>
-#include <random>
-#include <iostream>
-#include <cmath>
-
-#include "FastSIMD/FunctionList.h"
-#include "../src/FastSIMD/Internal/Scalar.h"
-
-#if FASTSIMD_x86
-#include "../src/FastSIMD/Internal/SSE.h"
-#include "../src/FastSIMD/Internal/AVX.h"
-#include "../src/FastSIMD/Internal/AVX512.h"
-#endif
-
-#if FASTSIMD_ARM
-#include "../src/FastSIMD/Internal/NEON.h"
-#endif
-
-#include <vector>
-#include <functional>
-#include <type_traits>
-
-template<typename... T>
-struct SIMDClassContainer
-{
-    using Top = void;
-
-    template<typename L>
-    using GetNext = void;
-};
-
-template<typename HEAD, typename... TAIL>
-struct SIMDClassContainer<HEAD, TAIL...>
-{
-    using Top = HEAD;
-
-    template<typename L>
-    using GetNext = std::conditional_t<std::is_same_v<L, HEAD>, typename SIMDClassContainer<TAIL...>::Top, typename SIMDClassContainer<TAIL...>::template GetNext<L>>;
-};
-
-typedef SIMDClassContainer<
-    FastSIMD::Scalar
-#if FASTSIMD_x86
-    ,
-    FastSIMD::SSE2,
-    FastSIMD::SSE41,
-    FastSIMD::AVX2,
-    FastSIMD::AVX512
-#endif
-#if FASTSIMD_ARM
-    ,
-    FastSIMD::NEON
-#endif
->
-SIMDClassList;
-
-class SIMDUnitTest
-{
-public:
-
-    static void RunAll();
-
-    SIMDUnitTest( std::function<void( void* )> func )
-    {
-        tests.emplace_back( func );
-    }
-
-private:
-    inline static std::vector<std::function<void( void* )> > tests;
-
-};
-
-const std::size_t TestCount = 1073741824 / 16;
-const std::size_t NonVecMask = ~15;
-
-int  * rndInts0;
-int  * rndInts1;
-float* rndFloats0;
-float* rndFloats1;
-
-float GenNormalFloat( std::mt19937& gen )
-{
-    union
-    {
-        float f;
-        int32_t i;
-    } u;
-
-    do
-    {
-        u.i = gen();
-
-    } while ( !std::isnormal( u.f ) );
-
-    return u.f;
-}
-
-void SIMDUnitTest::RunAll()
-{
-    rndInts0 = new int[TestCount];
-    rndInts1 = new int[TestCount];
-    rndFloats0 = new float[TestCount];
-    rndFloats1 = new float[TestCount];
-
-    std::random_device rd;  //Will be used to obtain a seed for the random number engine
-    std::mt19937 gen( rd() ); //Standard mersenne_twister_engine seeded with rd()
-
-    for ( std::size_t i = 0; i < TestCount; i++ )
-    {
-        rndInts0[i] = gen();
-        rndInts1[i] = gen();
-        rndFloats0[i] = GenNormalFloat( gen );
-        rndFloats1[i] = GenNormalFloat( gen );
-    }
-
-    for ( const auto& test : tests )
-    {
-        test( nullptr );
-    }
-
-    delete[] rndInts0;
-    delete[] rndInts1;
-    delete[] rndFloats0;
-    delete[] rndFloats1;
-}
-
-#define SIMD_FUNCTION_TEST( NAME, RETURN_TYPE, FUNC ) SIMD_FUNCTION_TEST_BASE( NAME, RETURN_TYPE, SIMDClassList::Top, FUNC )
-
-#define SIMD_FUNCTION_TEST_BASE( NAME, RETURN_TYPE, LEVEL, FUNC )                                          \
-template<typename T, typename FS>                                                                          \
-std::enable_if_t<std::is_same<void, FS>::value> TestFunction_##NAME( void* baseData = nullptr )      \
-{                                                                                                          \
-    std::cout << "\n";                                                                                     \
-    delete[] (T*)baseData;                                                                                     \
-}                                                                                                          \
-                                                                                                           \
-template<typename T, typename FS>                                                                          \
-std::enable_if_t<!std::is_same<void, FS>::value> TestFunction_##NAME( void* baseData = nullptr )     \
-{                                                                                                          \
-    bool isBase = baseData == nullptr;                                                                     \
-                                                                                                           \
-    if ( isBase )                                                                                          \
-    {                                                                                                      \
-        std::cout << #NAME " - Base: " << FS::SIMD_Level;                                                  \
-        baseData = new T[TestCount];                                                                       \
-    }                                                                                                      \
-    else { std::cout << " Testing: " << FS::SIMD_Level; }                                                  \
-                                                                                                           \
-    if ( FS::SIMD_Level > FastSIMD::CPUMaxSIMDLevel() )                                                    \
-    {                                                                                                      \
-        std::cout << " CPU N//A: " << FS::SIMD_Level;                                                      \
-    }                                                                                                      \
-    else                                                                                                   \
-    {                                                                                                      \
-        T result[FS_Size_32()];                                                    \
-        int failCount = 0;                                                                                    \
-                                                                                                           \
-        for ( std::size_t i = 0; i < TestCount; i += FS_Size_32() )                \
-        {                                                                                                  \
-            FUNC;                                                                                          \
-                                                                                                           \
-            for ( std::size_t ir = 0; ir < FS_Size_32(); ir++ )                    \
-            {                                                                                              \
-                if ( isBase )                                                                              \
-                {                                                                                          \
-                    ((T*)baseData)[i + ir] = result[ir];                                                   \
-                }                                                                                          \
-                else if ( result[ir] != ((T*)baseData)[i + ir] &&                                          \
-                    (result[ir] == result[ir] ||                                                           \
-                    ((T*)baseData)[i + ir] == ((T*)baseData)[i + ir]) )                                    \
-                {                                                                                           \
-                    failCount++;                                                                                           \
-                    std::cout << "\n" << FS::SIMD_Level << " Failed: expected: " << ((T*)baseData)[i + ir];                         \
-                    std::cout << " actual: " << result[ir] << " index: " << i+ir;                          \
-                    if(std::is_integral_v<T>) std::cout << " ints: " << rndInts0[i + ir] << " : " << rndInts1[i + ir];               \
-                    else std::cout << " floats: " << rndFloats0[i + ir] << " : " << rndFloats1[i + ir] << "\n"; \
-                }                                                                                          \
-            }                                                                                              \
-            if( failCount >= 32 ) break;                                                                    \
-        }                                                                                                  \
-    }                                                                                                      \
-                                                                                                           \
-    TestFunction_##NAME<T, SIMDClassList::GetNext<FS>>( baseData );                \
-}                                                                                                          \
-SIMDUnitTest test_##NAME( TestFunction_##NAME<RETURN_TYPE, LEVEL> );
-
-SIMD_FUNCTION_TEST( LoadStore_f32, float, FS_Store_f32( &result, FS_Load_f32( &rndFloats0[i] ) ) )
-
-SIMD_FUNCTION_TEST( LoadStore_i32, int32_t, FS_Store_i32( &result, FS_Load_i32( &rndInts0[i] ) ) )
-
-
-SIMD_FUNCTION_TEST( Casti32_f32, float, FS_Store_f32( &result, FS_Casti32_f32( FS_Load_i32( &rndInts0[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Castf32_i32, int32_t, FS_Store_i32( &result, FS_Castf32_i32( FS_Load_f32( &rndFloats0[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Converti32_f32, float, FS_Store_f32( &result, FS_Converti32_f32( FS_Load_i32( &rndInts0[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Convertf32_i32, int32_t, FS_Store_i32( &result, FS_Convertf32_i32( FS_Load_f32( &rndFloats0[i] ) ) ) )
-
-
-SIMD_FUNCTION_TEST( Equal_f32, float, FS_Store_f32( &result, FS_Mask_f32( typename FS::float32v( 1 ), ( FS_Load_f32( &rndFloats0[i] ) == FS_Load_f32( &rndFloats1[i] ) ) ) ) )
-
-SIMD_FUNCTION_TEST( GreaterThan_f32, float, FS_Store_f32( &result, FS_Mask_f32( typename FS::float32v( 1 ), ( FS_Load_f32( &rndFloats0[i] ) > FS_Load_f32( &rndFloats1[i] ) ) ) ) )
-
-SIMD_FUNCTION_TEST( LessThan_f32, float, FS_Store_f32( &result, FS_Mask_f32( typename FS::float32v( 1 ), ( FS_Load_f32( &rndFloats0[i] ) < FS_Load_f32( &rndFloats1[i] ) ) ) ) )
-
-SIMD_FUNCTION_TEST( GreaterEqualThan_f32, float, FS_Store_f32( &result, FS_Mask_f32( typename FS::float32v( 1 ), ( FS_Load_f32( &rndFloats0[i] ) >= FS_Load_f32( &rndFloats1[i] ) ) ) ) )
-
-SIMD_FUNCTION_TEST( LessEqualThan_f32, float, FS_Store_f32( &result, FS_Mask_f32( typename FS::float32v( 1 ), ( FS_Load_f32( &rndFloats0[i] ) <= FS_Load_f32( &rndFloats1[i] ) ) ) ) )
-
-SIMD_FUNCTION_TEST( Equal_i32, int32_t, FS_Store_i32( &result, FS_Mask_i32( typename FS::int32v( 1 ), ( FS_Load_i32( &rndInts0[i] ) == FS_Load_i32( &rndInts1[i] ) ) ) ) )
-
-SIMD_FUNCTION_TEST( GreaterThan_i32, int32_t, FS_Store_i32( &result, FS_Mask_i32( typename FS::int32v( 1 ), ( FS_Load_i32( &rndInts0[i] ) > FS_Load_i32( &rndInts1[i] ) ) ) ) )
-
-SIMD_FUNCTION_TEST( LessThan_i32, int32_t, FS_Store_i32( &result, FS_Mask_i32( typename FS::int32v( 1 ), ( FS_Load_i32( &rndInts0[i] ) < FS_Load_i32( &rndInts1[i] ) ) ) ) )
-
-
-SIMD_FUNCTION_TEST( Select_f32, float, FS_Store_f32( &result, FS_Select_f32( ( FS_Load_f32( &rndFloats0[i] ) > FS_Load_f32( &rndFloats1[i] ) ), FS_Load_f32( &rndFloats0[i] ), FS_Load_f32( &rndFloats1[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Select_i32, int32_t, FS_Store_i32( &result, FS_Select_i32( ( FS_Load_i32( &rndInts0[i] ) > FS_Load_i32( &rndInts1[i] ) ), FS_Load_i32( &rndInts0[i] ), FS_Load_i32( &rndInts1[i] ) ) ) )
-
-
-SIMD_FUNCTION_TEST( Min_f32, float, FS_Store_f32( &result, FS_Min_f32( FS_Load_f32( &rndFloats0[i] ), FS_Load_f32( &rndFloats1[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Max_f32, float, FS_Store_f32( &result, FS_Max_f32( FS_Load_f32( &rndFloats0[i] ), FS_Load_f32( &rndFloats1[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Min_i32, int32_t, FS_Store_i32( &result, FS_Min_i32( FS_Load_i32( &rndInts0[i] ), FS_Load_i32( &rndInts1[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Max_i32, int32_t, FS_Store_i32( &result, FS_Max_i32( FS_Load_i32( &rndInts0[i] ), FS_Load_i32( &rndInts1[i] ) ) ) )
-
-
-SIMD_FUNCTION_TEST( BitwiseAndNot_f32, float, FS_Store_f32( &result, FS_BitwiseAndNot_f32( FS_Load_f32( &rndFloats0[i] ), FS_Load_f32( &rndFloats1[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( BitwiseAndNot_i32, int32_t, FS_Store_i32( &result, FS_BitwiseAndNot_i32( FS_Load_i32( &rndInts0[i] ), FS_Load_i32( &rndInts1[i] ) ) ) )
-
-
-SIMD_FUNCTION_TEST( BitwiseShiftRightZX_f32, float, FS_Store_f32( &result, FS_BitwiseShiftRightZX_f32( FS_Load_f32( &rndFloats0[i] ), (rndInts1[i & NonVecMask] & 31) ) ) )
-
-SIMD_FUNCTION_TEST( BitwiseShiftRightZX_i32, int32_t, FS_Store_i32( &result, FS_BitwiseShiftRightZX_i32( FS_Load_i32( &rndInts0[i] ), (rndInts1[i & NonVecMask] & 31) ) ) )
-
-
-SIMD_FUNCTION_TEST( Abs_f32, float, FS_Store_f32( &result, FS_Abs_f32( FS_Load_f32( &rndFloats0[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Abs_i32, int32_t, FS_Store_i32( &result, FS_Abs_i32( FS_Load_i32( &rndInts0[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Sqrt_f32, float, FS_Store_f32( &result, FS_Sqrt_f32( FS_Load_f32( &rndFloats0[i] ) ) ) )
-
-//SIMD_FUNCTION_TEST( InvSqrt_f32, float, FS_Store_f32( &result, FS_InvSqrt_f32( FS_Load_f32( &rndFloats0[i] ) ) ) )
-
-
-const float MAX_ROUNDING = (float)INT_MAX / 2.0f;
-
-SIMD_FUNCTION_TEST( Floor_f32, float, FS_Store_f32( &result, FS_Floor_f32( typename FS::float32v( MAX_ROUNDING / FLT_MAX ) * FS_Load_f32( &rndFloats0[i] ) ) ) )
-
-SIMD_FUNCTION_TEST( Ceil_f32, float, FS_Store_f32( &result, FS_Ceil_f32( typename FS::float32v( MAX_ROUNDING / FLT_MAX ) * FS_Load_f32( &rndFloats0[i] ) ) ) )
-
-//SIMD_FUNCTION_TEST( Round_f32, float, FS_Store_f32( &result, FS_Round_f32( FS_Min_f32( FS::float32v( MAX_ROUNDING ), FS_Max_f32( FS::float32v( -MAX_ROUNDING ), FS_Load_f32( &rndFloats0[i] ) ) ) ) ) )
-
-SIMD_FUNCTION_TEST( Add_f32, float, FS_Store_f32( &result, FS_Load_f32( &rndFloats0[i] ) + FS_Load_f32( &rndFloats1[i] ) ) )
-SIMD_FUNCTION_TEST( Sub_f32, float, FS_Store_f32( &result, FS_Load_f32( &rndFloats0[i] ) - FS_Load_f32( &rndFloats1[i] ) ) )
-SIMD_FUNCTION_TEST( Mul_f32, float, FS_Store_f32( &result, FS_Load_f32( &rndFloats0[i] ) * FS_Load_f32( &rndFloats1[i] ) ) )
-SIMD_FUNCTION_TEST( Div_f32, float, FS_Store_f32( &result, FS_Load_f32( &rndFloats0[i] ) / FS_Load_f32( &rndFloats1[i] ) ) )
-SIMD_FUNCTION_TEST( And_f32, float, FS_Store_f32( &result, FS_Load_f32( &rndFloats0[i] ) & FS_Load_f32( &rndFloats1[i] ) ) )
-SIMD_FUNCTION_TEST( Xor_f32, float, FS_Store_f32( &result, FS_Load_f32( &rndFloats0[i] ) ^ FS_Load_f32( &rndFloats1[i] ) ) )
-SIMD_FUNCTION_TEST( Or_f32, float, FS_Store_f32( &result, FS_Load_f32( &rndFloats0[i] ) | FS_Load_f32( &rndFloats1[i] ) ) )
-SIMD_FUNCTION_TEST( Not_f32, float, FS_Store_f32( &result, ~FS_Load_f32( &rndFloats1[i] ) ) )
-SIMD_FUNCTION_TEST( Negate_f32, float, FS_Store_f32( &result, -FS_Load_f32( &rndFloats1[i] ) ) )
-
-SIMD_FUNCTION_TEST( Add_i32, int32_t, FS_Store_i32( &result, FS_Load_i32( &rndInts0[i] ) + FS_Load_i32( &rndInts1[i] ) ) )
-SIMD_FUNCTION_TEST( Sub_i32, int32_t, FS_Store_i32( &result, FS_Load_i32( &rndInts0[i] ) - FS_Load_i32( &rndInts1[i] ) ) )
-SIMD_FUNCTION_TEST( Mul_i32, int32_t, FS_Store_i32( &result, FS_Load_i32( &rndInts0[i] ) * FS_Load_i32( &rndInts1[i] ) ) )
-SIMD_FUNCTION_TEST( And_i32, int32_t, FS_Store_i32( &result, FS_Load_i32( &rndInts0[i] ) & FS_Load_i32( &rndInts1[i] ) ) )
-SIMD_FUNCTION_TEST( Xor_i32, int32_t, FS_Store_i32( &result, FS_Load_i32( &rndInts0[i] ) ^ FS_Load_i32( &rndInts1[i] ) ) )
-SIMD_FUNCTION_TEST( Or_i32, int32_t, FS_Store_i32( &result, FS_Load_i32( &rndInts0[i] ) | FS_Load_i32( &rndInts1[i] ) ) )
-SIMD_FUNCTION_TEST( Not_i32, int32_t, FS_Store_i32( &result, ~FS_Load_i32( &rndInts1[i] ) ) )
-SIMD_FUNCTION_TEST( Negate_i32, int32_t, FS_Store_i32( &result, -FS_Load_i32( &rndInts1[i] ) ) )
-
-SIMD_FUNCTION_TEST( ShiftL_i32, int32_t, FS_Store_i32( &result, FS_Load_i32( &rndInts0[i] ) << (rndInts1[i & NonVecMask] & 31) ) )
-SIMD_FUNCTION_TEST( ShiftR_i32, int32_t, FS_Store_i32( &result, FS_Load_i32( &rndInts0[i] ) >> (rndInts1[i & NonVecMask] & 31) ) )
-
-
-int main( int argc, char** argv )
-{
-    std::cout << std::fixed;
-
-    SIMDUnitTest::RunAll();
-
-    std::cout << "Tests Complete!\n";
-
-    getchar();
-    return 0;
-}
diff --git a/tests/magic_enum.h b/tests/magic_enum.h
deleted file mode 100644
index 4f227472..00000000
--- a/tests/magic_enum.h
+++ /dev/null
@@ -1,1103 +0,0 @@
-//  __  __             _        ______                          _____
-// |  \/  |           (_)      |  ____|                        / ____|_     _
-// | \  / | __ _  __ _ _  ___  | |__   _ __  _   _ _ __ ___   | |   _| |_ _| |_
-// | |\/| |/ _` |/ _` | |/ __| |  __| | '_ \| | | | '_ ` _ \  | |  |_   _|_   _|
-// | |  | | (_| | (_| | | (__  | |____| | | | |_| | | | | | | | |____|_|   |_|
-// |_|  |_|\__,_|\__, |_|\___| |______|_| |_|\__,_|_| |_| |_|  \_____|
-//                __/ | https://github.com/Neargye/magic_enum
-//               |___/  version 0.6.6
-//
-// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2019 - 2020 Daniil Goncharov <neargye@gmail.com>.
-//
-// Permission is hereby  granted, free of charge, to any  person obtaining a copy
-// of this software and associated  documentation files (the "Software"), to deal
-// in the Software  without restriction, including without  limitation the rights
-// to  use, copy,  modify, merge,  publish, distribute,  sublicense, and/or  sell
-// copies  of  the Software,  and  to  permit persons  to  whom  the Software  is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-//
-// THE SOFTWARE  IS PROVIDED "AS  IS", WITHOUT WARRANTY  OF ANY KIND,  EXPRESS OR
-// IMPLIED,  INCLUDING BUT  NOT  LIMITED TO  THE  WARRANTIES OF  MERCHANTABILITY,
-// FITNESS FOR  A PARTICULAR PURPOSE AND  NONINFRINGEMENT. IN NO EVENT  SHALL THE
-// AUTHORS  OR COPYRIGHT  HOLDERS  BE  LIABLE FOR  ANY  CLAIM,  DAMAGES OR  OTHER
-// LIABILITY, WHETHER IN AN ACTION OF  CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE  OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#ifndef NEARGYE_MAGIC_ENUM_HPP
-#define NEARGYE_MAGIC_ENUM_HPP
-
-#define MAGIC_ENUM_VERSION_MAJOR 0
-#define MAGIC_ENUM_VERSION_MINOR 6
-#define MAGIC_ENUM_VERSION_PATCH 6
-
-#include <array>
-#include <cassert>
-#include <cstdint>
-#include <cstddef>
-#include <iosfwd>
-#include <limits>
-#include <type_traits>
-#include <utility>
-
-#if !defined(MAGIC_ENUM_USING_ALIAS_OPTIONAL)
-#include <optional>
-#endif
-#if !defined(MAGIC_ENUM_USING_ALIAS_STRING)
-#include <string>
-#endif
-#if !defined(MAGIC_ENUM_USING_ALIAS_STRING_VIEW)
-#include <string_view>
-#endif
-
-#if defined(__clang__)
-#  pragma clang diagnostic push
-#elif defined(__GNUC__)
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized" // May be used uninitialized 'return {};'.
-#elif defined(_MSC_VER)
-#  pragma warning(push)
-#  pragma warning(disable : 26495) // Variable 'static_string<N>::chars' is uninitialized.
-#endif
-
-// Checks magic_enum compiler compatibility.
-#if defined(__clang__) && __clang_major__ >= 5 || defined(__GNUC__) && __GNUC__ >= 9 || defined(_MSC_VER) && _MSC_VER >= 1910
-#  undef  MAGIC_ENUM_SUPPORTED
-#  define MAGIC_ENUM_SUPPORTED 1
-#endif
-
-// Checks magic_enum compiler aliases compatibility.
-#if defined(__clang__) && __clang_major__ >= 5 || defined(__GNUC__) && __GNUC__ >= 9 || defined(_MSC_VER) && _MSC_VER >= 1920
-#  undef  MAGIC_ENUM_SUPPORTED_ALIASES
-#  define MAGIC_ENUM_SUPPORTED_ALIASES 1
-#endif
-
-// Enum value must be greater or equals than MAGIC_ENUM_RANGE_MIN. By default MAGIC_ENUM_RANGE_MIN = -128.
-// If need another min range for all enum types by default, redefine the macro MAGIC_ENUM_RANGE_MIN.
-#if !defined(MAGIC_ENUM_RANGE_MIN)
-#  define MAGIC_ENUM_RANGE_MIN -128
-#endif
-
-// Enum value must be less or equals than MAGIC_ENUM_RANGE_MAX. By default MAGIC_ENUM_RANGE_MAX = 128.
-// If need another max range for all enum types by default, redefine the macro MAGIC_ENUM_RANGE_MAX.
-#if !defined(MAGIC_ENUM_RANGE_MAX)
-#  define MAGIC_ENUM_RANGE_MAX 128
-#endif
-
-namespace magic_enum {
-
-// If need another optional type, define the macro MAGIC_ENUM_USING_ALIAS_OPTIONAL.
-#if defined(MAGIC_ENUM_USING_ALIAS_OPTIONAL)
-MAGIC_ENUM_USING_ALIAS_OPTIONAL
-#else
-template <typename T>
-using optional = std::optional<T>;
-#endif
-
-// If need another optional type, define the macro MAGIC_ENUM_USING_ALIAS_STRING_VIEW.
-#if defined(MAGIC_ENUM_USING_ALIAS_STRING_VIEW)
-MAGIC_ENUM_USING_ALIAS_STRING_VIEW
-#else
-using string_view = std::string_view;
-#endif
-
-// If need another optional type, define the macro MAGIC_ENUM_USING_ALIAS_STRING.
-#if defined(MAGIC_ENUM_USING_ALIAS_STRING)
-MAGIC_ENUM_USING_ALIAS_STRING
-#else
-using string = std::string;
-#endif
-
-namespace customize {
-
-// Enum value must be in range [MAGIC_ENUM_RANGE_MIN, MAGIC_ENUM_RANGE_MAX]. By default MAGIC_ENUM_RANGE_MIN = -128, MAGIC_ENUM_RANGE_MAX = 128.
-// If need another range for all enum types by default, redefine the macro MAGIC_ENUM_RANGE_MIN and MAGIC_ENUM_RANGE_MAX.
-// If need another range for specific enum type, add specialization enum_range for necessary enum type.
-template <typename E>
-struct enum_range {
-  static_assert(std::is_enum_v<E>, "magic_enum::customize::enum_range requires enum type.");
-  inline static constexpr int min = MAGIC_ENUM_RANGE_MIN;
-  inline static constexpr int max = MAGIC_ENUM_RANGE_MAX;
-  static_assert(max > min, "magic_enum::customize::enum_range requires max > min.");
-};
-
-static_assert(MAGIC_ENUM_RANGE_MIN <= 0, "MAGIC_ENUM_RANGE_MIN must be less or equals than 0.");
-static_assert(MAGIC_ENUM_RANGE_MIN > (std::numeric_limits<std::int16_t>::min)(), "MAGIC_ENUM_RANGE_MIN must be greater than INT16_MIN.");
-
-static_assert(MAGIC_ENUM_RANGE_MAX > 0, "MAGIC_ENUM_RANGE_MAX must be greater than 0.");
-static_assert(MAGIC_ENUM_RANGE_MAX < (std::numeric_limits<std::int16_t>::max)(), "MAGIC_ENUM_RANGE_MAX must be less than INT16_MAX.");
-
-static_assert(MAGIC_ENUM_RANGE_MAX > MAGIC_ENUM_RANGE_MIN, "MAGIC_ENUM_RANGE_MAX must be greater than MAGIC_ENUM_RANGE_MIN.");
-
-// If need cunstom names for enum type, add specialization enum_name for necessary enum type.
-template <typename E>
-constexpr string_view enum_name(E) noexcept {
-  static_assert(std::is_enum_v<E>, "magic_enum::customize::enum_name requires enum type.");
-
-  return {};
-}
-
-} // namespace magic_enum::customize
-
-namespace detail {
-
-template <typename T>
-struct supported
-#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED || defined(MAGIC_ENUM_NO_CHECK_SUPPORT)
-    : std::true_type {};
-#else
-    : std::false_type {};
-#endif
-
-template <std::size_t N>
-struct static_string {
-  constexpr explicit static_string(string_view str) noexcept : static_string{str, std::make_index_sequence<N>{}} {
-    assert(str.size() == N);
-  }
-
-  constexpr const char* data() const noexcept { return chars.data(); }
-
-  constexpr std::size_t size() const noexcept { return N; }
-
-  constexpr operator string_view() const noexcept { return {data(), size()}; }
-
- private:
-  template <std::size_t... I>
-  constexpr static_string(string_view str, std::index_sequence<I...>) noexcept : chars{{str[I]..., '\0'}} {}
-
-  const std::array<char, N + 1> chars;
-};
-
-template <>
-struct static_string<0> {
-  constexpr explicit static_string(string_view) noexcept {}
-
-  constexpr const char* data() const noexcept { return nullptr; }
-
-  constexpr std::size_t size() const noexcept { return 0; }
-
-  constexpr operator string_view() const noexcept { return {}; }
-};
-
-struct char_equal_to {
-  constexpr bool operator()(char lhs, char rhs) const noexcept {
-    return lhs == rhs;
-  }
-};
-
-constexpr string_view pretty_name(string_view name) noexcept {
-  for (std::size_t i = name.size(); i > 0; --i) {
-    if (!((name[i - 1] >= '0' && name[i - 1] <= '9') ||
-          (name[i - 1] >= 'a' && name[i - 1] <= 'z') ||
-          (name[i - 1] >= 'A' && name[i - 1] <= 'Z') ||
-          (name[i - 1] == '_'))) {
-      name.remove_prefix(i);
-      break;
-    }
-  }
-
-  if (name.size() > 0 && ((name.front() >= 'a' && name.front() <= 'z') ||
-                          (name.front() >= 'A' && name.front() <= 'Z') ||
-                          (name.front() == '_'))) {
-    return name;
-  }
-
-  return {}; // Invalid name.
-}
-
-constexpr std::size_t find(string_view str, char c) noexcept {
-#if defined(__clang__) && __clang_major__ < 9 && defined(__GLIBCXX__) || defined(_MSC_VER) && _MSC_VER < 1920
-// https://stackoverflow.com/questions/56484834/constexpr-stdstring-viewfind-last-of-doesnt-work-on-clang-8-with-libstdc
-// https://developercommunity.visualstudio.com/content/problem/360432/vs20178-regression-c-failed-in-test.html
-  constexpr auto workaroung = true;
-#else
-  constexpr auto workaroung = false;
-#endif
-  if constexpr (workaroung) {
-    for (std::size_t i = 0; i < str.size(); ++i) {
-      if (str[i] == c) {
-        return i;
-      }
-    }
-
-    return string_view::npos;
-  } else {
-    return str.find_first_of(c);
-  }
-}
-
-template <typename BinaryPredicate>
-constexpr bool cmp_equal(string_view lhs, string_view rhs, BinaryPredicate&& p) noexcept(std::is_nothrow_invocable_r_v<bool, BinaryPredicate, char, char>) {
-#if defined(_MSC_VER) && _MSC_VER < 1920
-  // https://developercommunity.visualstudio.com/content/problem/360432/vs20178-regression-c-failed-in-test.html
-  // https://developercommunity.visualstudio.com/content/problem/232218/c-constexpr-string-view.html
-  constexpr auto workaroung = true;
-#else
-  constexpr auto workaroung = false;
-#endif
-  if constexpr (std::is_same_v<std::decay_t<BinaryPredicate>, char_equal_to> && !workaroung) {
-    static_cast<void>(p);
-    return lhs == rhs;
-  } else {
-    if (lhs.size() != rhs.size()) {
-      return false;
-    }
-
-    const auto size = lhs.size();
-    for (std::size_t i = 0; i < size; ++i) {
-      if (!p(lhs[i], rhs[i])) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-}
-
-template <typename L, typename R>
-constexpr bool cmp_less(L lhs, R rhs) noexcept {
-  static_assert(std::is_integral_v<L> && std::is_integral_v<R>, "magic_enum::detail::cmp_less requires integral type.");
-
-  if constexpr (std::is_signed_v<L> == std::is_signed_v<R>) {
-    // If same signedness (both signed or both unsigned).
-    return lhs < rhs;
-  } else if constexpr (std::is_signed_v<R>) {
-    // If 'right' is negative, then result is 'false', otherwise cast & compare.
-    return rhs > 0 && lhs < static_cast<std::make_unsigned_t<R>>(rhs);
-  } else {
-    // If 'left' is negative, then result is 'true', otherwise cast & compare.
-    return lhs < 0 || static_cast<std::make_unsigned_t<L>>(lhs) < rhs;
-  }
-}
-
-template <typename I>
-constexpr I log2(I value) noexcept {
-  static_assert(std::is_integral_v<I>, "magic_enum::detail::log2 requires integral type.");
-
-  auto ret = I{0};
-  for (; value > I{1}; value >>= I{1}, ++ret) {};
-
-  return ret;
-}
-
-template <typename I>
-constexpr bool is_pow2(I x) noexcept {
-  static_assert(std::is_integral_v<I>, "magic_enum::detail::is_pow2 requires integral type.");
-
-  return x != 0 && (x & (x - 1)) == 0;
-}
-
-template <typename T>
-inline constexpr bool is_enum_v = std::is_enum_v<T> && std::is_same_v<T, std::decay_t<T>>;
-
-template <typename E>
-constexpr auto n() noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::n requires enum type.");
-#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED
-#  if defined(__clang__)
-  constexpr string_view name{__PRETTY_FUNCTION__ + 34, sizeof(__PRETTY_FUNCTION__) - 36};
-#  elif defined(__GNUC__)
-  constexpr string_view name{__PRETTY_FUNCTION__ + 49, sizeof(__PRETTY_FUNCTION__) - 51};
-#  elif defined(_MSC_VER)
-  constexpr string_view name{__FUNCSIG__ + 40, sizeof(__FUNCSIG__) - 57};
-#  endif
-  return static_string<name.size()>{name};
-#else
-  return string_view{}; // Unsupported compiler.
-#endif
-}
-
-template <typename E>
-inline constexpr auto type_name_v = n<E>();
-
-template <typename E, E V>
-constexpr auto n() noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::n requires enum type.");
-  constexpr auto custom_name = customize::enum_name<E>(V);
-
-  if constexpr (custom_name.empty()) {
-    static_cast<void>(custom_name);
-#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED
-#  if defined(__clang__) || defined(__GNUC__)
-    constexpr auto name = pretty_name({__PRETTY_FUNCTION__, sizeof(__PRETTY_FUNCTION__) - 2});
-#  elif defined(_MSC_VER)
-    constexpr auto name = pretty_name({__FUNCSIG__, sizeof(__FUNCSIG__) - 17});
-#  endif
-    return static_string<name.size()>{name};
-#else
-    return string_view{}; // Unsupported compiler.
-#endif
-  } else {
-    return static_string<custom_name.size()>{custom_name};
-  }
-}
-
-template <typename E, E V>
-inline constexpr auto enum_name_v = n<E, V>();
-
-template <typename E, auto V>
-constexpr bool is_valid() noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::is_valid requires enum type.");
-
-  return n<E, static_cast<E>(V)>().size() != 0;
-}
-
-template <typename E, bool IsFlags, typename U = std::underlying_type_t<E>>
-constexpr int reflected_min() noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::reflected_min requires enum type.");
-
-  if constexpr (IsFlags) {
-    return 0;
-  } else {
-    constexpr auto lhs = customize::enum_range<E>::min;
-    static_assert(lhs > (std::numeric_limits<std::int16_t>::min)(), "magic_enum::enum_range requires min must be greater than INT16_MIN.");
-    constexpr auto rhs = (std::numeric_limits<U>::min)();
-
-    if constexpr (cmp_less(lhs, rhs)) {
-      return rhs;
-    } else {
-      return lhs;
-    }
-  }
-}
-
-template <typename E, bool IsFlags, typename U = std::underlying_type_t<E>>
-constexpr int reflected_max() noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::reflected_max requires enum type.");
-
-  if constexpr (IsFlags) {
-    return std::numeric_limits<U>::digits - 1;
-  } else {
-    constexpr auto lhs = customize::enum_range<E>::max;
-    static_assert(lhs < (std::numeric_limits<std::int16_t>::max)(), "magic_enum::enum_range requires max must be less than INT16_MAX.");
-    constexpr auto rhs = (std::numeric_limits<U>::max)();
-
-    if constexpr (cmp_less(lhs, rhs)) {
-      return lhs;
-    } else {
-      return rhs;
-    }
-  }
-}
-
-template <typename E, bool IsFlags = false>
-inline constexpr auto reflected_min_v = reflected_min<E, IsFlags>();
-
-template <typename E, bool IsFlags = false>
-inline constexpr auto reflected_max_v = reflected_max<E, IsFlags>();
-
-template <typename E, int O, bool IsFlags = false, typename U = std::underlying_type_t<E>>
-constexpr E value(std::size_t i) noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::value requires enum type.");
-
-  if constexpr (IsFlags) {
-    return static_cast<E>(U{1} << static_cast<U>(static_cast<int>(i) + O));
-  } else {
-    return static_cast<E>(static_cast<int>(i) + O);
-  }
-}
-
-template <typename E, bool IsFlags, int Min, std::size_t... I>
-constexpr auto values(std::index_sequence<I...>) noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::values requires enum type.");
-  constexpr std::array<bool, sizeof...(I)> valid{{is_valid<E, value<E, Min, IsFlags>(I)>()...}};
-  constexpr std::size_t count = ((valid[I] ? std::size_t{1} : std::size_t{0}) + ...);
-
-  std::array<E, count> values{};
-  for (std::size_t i = 0, v = 0; v < count; ++i) {
-    if (valid[i]) {
-      values[v++] = value<E, Min, IsFlags>(i);
-    }
-  }
-
-  return values;
-}
-
-template <typename E, bool IsFlags, typename U = std::underlying_type_t<E>>
-constexpr auto values() noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::values requires enum type.");
-  constexpr auto range_size = reflected_max_v<E, IsFlags> - reflected_min_v<E, IsFlags> + 1;
-  static_assert(range_size > 0, "magic_enum::enum_range requires valid size.");
-  static_assert(range_size < (std::numeric_limits<std::uint16_t>::max)(), "magic_enum::enum_range requires valid size.");
-
-  return values<E, IsFlags, reflected_min_v<E, IsFlags>>(std::make_index_sequence<range_size>{});
-}
-
-template <typename E, bool IsFlags = false>
-inline constexpr auto values_v = values<E, IsFlags>();
-
-template <typename E, bool IsFlags = false, typename D = std::decay_t<E>>
-using values_t = decltype((values_v<D, IsFlags>));
-
-template <typename E, bool IsFlags = false>
-inline constexpr auto count_v = values_v<E, IsFlags>.size();
-
-template <typename E, bool IsFlags = false, typename U = std::underlying_type_t<E>>
-inline constexpr auto min_v = static_cast<U>(values_v<E, IsFlags>.front());
-
-template <typename E, bool IsFlags = false, typename U = std::underlying_type_t<E>>
-inline constexpr auto max_v = static_cast<U>(values_v<E, IsFlags>.back());
-
-template <typename E, bool IsFlags, typename U = std::underlying_type_t<E>>
-constexpr std::size_t range_size() noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::range_size requires enum type.");
-  constexpr auto max = IsFlags ? log2(max_v<E, IsFlags>) : max_v<E, IsFlags>;
-  constexpr auto min = IsFlags ? log2(min_v<E, IsFlags>) : min_v<E, IsFlags>;
-  constexpr auto range_size = max - min + U{1};
-  static_assert(range_size > 0, "magic_enum::enum_range requires valid size.");
-  static_assert(range_size < (std::numeric_limits<std::uint16_t>::max)(), "magic_enum::enum_range requires valid size.");
-
-  return static_cast<std::size_t>(range_size);
-}
-
-template <typename E, bool IsFlags = false>
-inline constexpr auto range_size_v = range_size<E, IsFlags>();
-
-template <typename E, bool IsFlags = false>
-using index_t = std::conditional_t<range_size_v<E, IsFlags> < (std::numeric_limits<std::uint8_t>::max)(), std::uint8_t, std::uint16_t>;
-
-template <typename E, bool IsFlags = false>
-inline constexpr auto invalid_index_v = (std::numeric_limits<index_t<E, IsFlags>>::max)();
-
-template <typename E, bool IsFlags, std::size_t... I>
-constexpr auto indexes(std::index_sequence<I...>) noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::indexes requires enum type.");
-  constexpr auto min = IsFlags ? log2(min_v<E, IsFlags>) : min_v<E, IsFlags>;
-  [[maybe_unused]] auto i = index_t<E, IsFlags>{0};
-
-  return std::array<decltype(i), sizeof...(I)>{{(is_valid<E, value<E, min, IsFlags>(I)>() ? i++ : invalid_index_v<E, IsFlags>)...}};
-}
-
-template <typename E, bool IsFlags = false>
-inline constexpr auto indexes_v = indexes<E, IsFlags>(std::make_index_sequence<range_size_v<E, IsFlags>>{});
-
-template <typename E, bool IsFlags, std::size_t... I>
-constexpr auto names(std::index_sequence<I...>) noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::names requires enum type.");
-
-  return std::array<string_view, sizeof...(I)>{{enum_name_v<E, values_v<E, IsFlags>[I]>...}};
-}
-
-template <typename E, bool IsFlags = false>
-inline constexpr auto names_v = names<E, IsFlags>(std::make_index_sequence<count_v<E, IsFlags>>{});
-
-template <typename E, bool IsFlags = false, typename D = std::decay_t<E>>
-using names_t = decltype((names_v<D, IsFlags>));
-
-template <typename E, bool IsFlags, std::size_t... I>
-constexpr auto entries(std::index_sequence<I...>) noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::entries requires enum type.");
-
-  return std::array<std::pair<E, string_view>, sizeof...(I)>{{{values_v<E, IsFlags>[I], enum_name_v<E, values_v<E, IsFlags>[I]>}...}};
-}
-
-template <typename E, bool IsFlags = false>
-inline constexpr auto entries_v = entries<E, IsFlags>(std::make_index_sequence<count_v<E, IsFlags>>{});
-
-template <typename E, bool IsFlags = false, typename D = std::decay_t<E>>
-using entries_t = decltype((entries_v<D, IsFlags>));
-
-template <typename E, bool IsFlags, typename U = std::underlying_type_t<E>>
-constexpr bool is_sparse() noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::is_sparse requires enum type.");
-
-  return range_size_v<E, IsFlags> != count_v<E, IsFlags>;
-}
-
-template <typename E, bool IsFlags = false>
-inline constexpr bool is_sparse_v = is_sparse<E, IsFlags>();
-
-template <typename E, typename U = std::underlying_type_t<E>>
-constexpr std::size_t undex(U value) noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::undex requires enum type.");
-
-  if (const auto i = static_cast<std::size_t>(value - min_v<E>); value >= min_v<E> && value <= max_v<E>) {
-    if constexpr (is_sparse_v<E>) {
-      if (const auto idx = indexes_v<E>[i]; idx != invalid_index_v<E>) {
-        return idx;
-      }
-    } else {
-      return i;
-    }
-  }
-
-  return invalid_index_v<E>; // Value out of range.
-}
-
-template <typename E, typename U = std::underlying_type_t<E>>
-constexpr std::size_t endex(E value) noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::endex requires enum type.");
-
-  return undex<E>(static_cast<U>(value));
-}
-
-template <typename E, typename U = std::underlying_type_t<E>>
-constexpr U value_ors() noexcept {
-  static_assert(is_enum_v<E>, "magic_enum::detail::endex requires enum type.");
-
-  auto value = U{0};
-  for (std::size_t i = 0; i < count_v<E, true>; ++i) {
-    value |= static_cast<U>(values_v<E, true>[i]);
-  }
-
-  return value;
-}
-
-template <bool, bool, typename T, typename R>
-struct enable_if_enum {};
-
-template <typename T, typename R>
-struct enable_if_enum<true, false, T, R> {
-  using type = R;
-  using D = std::decay_t<T>;
-  static_assert(supported<D>::value, "magic_enum unsupported compiler (https://github.com/Neargye/magic_enum#compiler-compatibility).");
-  static_assert(count_v<D, false> > 0, "magic_enum requires enum implementation and valid max and min.");
-};
-
-template <typename T, typename R>
-struct enable_if_enum<true, true, T, R> {
-  using type = R;
-  using D = std::decay_t<T>;
-  static_assert(supported<D>::value, "magic_enum unsupported compiler (https://github.com/Neargye/magic_enum#compiler-compatibility).");
-  static_assert(count_v<D, true> > 0, "magic_enum::flags requires enum-flags implementation.");
-};
-
-template <typename T, typename R = void>
-using enable_if_enum_t = typename enable_if_enum<std::is_enum_v<std::decay_t<T>>, false, T, R>::type;
-
-template <typename T, typename R = void>
-using enable_if_enum_flags_t = typename enable_if_enum<std::is_enum_v<std::decay_t<T>>, true, T, R>::type;
-
-template <typename T, typename Enable = std::enable_if_t<std::is_enum_v<std::decay_t<T>>>>
-using enum_concept = T;
-
-template <typename T, bool = std::is_enum_v<T>>
-struct is_scoped_enum : std::false_type {};
-
-template <typename T>
-struct is_scoped_enum<T, true> : std::bool_constant<!std::is_convertible_v<T, std::underlying_type_t<T>>> {};
-
-template <typename T, bool = std::is_enum_v<T>>
-struct is_unscoped_enum : std::false_type {};
-
-template <typename T>
-struct is_unscoped_enum<T, true> : std::bool_constant<std::is_convertible_v<T, std::underlying_type_t<T>>> {};
-
-template <typename T, bool = std::is_enum_v<std::decay_t<T>>>
-struct underlying_type {};
-
-template <typename T>
-struct underlying_type<T, true> : std::underlying_type<std::decay_t<T>> {};
-
-} // namespace magic_enum::detail
-
-// Checks is magic_enum supported compiler.
-inline constexpr bool is_magic_enum_supported = detail::supported<void>::value;
-
-template <typename T>
-using Enum = detail::enum_concept<T>;
-
-// Checks whether T is an Unscoped enumeration type.
-// Provides the member constant value which is equal to true, if T is an [Unscoped enumeration](https://en.cppreference.com/w/cpp/language/enum#Unscoped_enumeration) type. Otherwise, value is equal to false.
-template <typename T>
-struct is_unscoped_enum : detail::is_unscoped_enum<T> {};
-
-template <typename T>
-inline constexpr bool is_unscoped_enum_v = is_unscoped_enum<T>::value;
-
-// Checks whether T is an Scoped enumeration type.
-// Provides the member constant value which is equal to true, if T is an [Scoped enumeration](https://en.cppreference.com/w/cpp/language/enum#Scoped_enumerations) type. Otherwise, value is equal to false.
-template <typename T>
-struct is_scoped_enum : detail::is_scoped_enum<T> {};
-
-template <typename T>
-inline constexpr bool is_scoped_enum_v = is_scoped_enum<T>::value;
-
-// If T is a complete enumeration type, provides a member typedef type that names the underlying type of T.
-// Otherwise, if T is not an enumeration type, there is no member type. Otherwise (T is an incomplete enumeration type), the program is ill-formed.
-template <typename T>
-struct underlying_type : detail::underlying_type<T> {};
-
-template <typename T>
-using underlying_type_t = typename underlying_type<T>::type;
-
-// Returns type name of enum.
-template <typename E>
-[[nodiscard]] constexpr auto enum_type_name() noexcept -> std::enable_if_t<std::is_enum_v<std::decay_t<E>>, string_view> {
-  using D = std::decay_t<E>;
-  constexpr string_view name = detail::type_name_v<D>;
-  static_assert(name.size() > 0, "Enum type does not have a name.");
-
-  return name;
-}
-
-// Returns number of enum values.
-template <typename E>
-[[nodiscard]] constexpr auto enum_count() noexcept -> detail::enable_if_enum_t<E, std::size_t> {
-  using D = std::decay_t<E>;
-
-  return detail::count_v<D>;
-}
-
-// Returns enum value at specified index.
-// No bounds checking is performed: the behavior is undefined if index >= number of enum values.
-template <typename E>
-[[nodiscard]] constexpr auto enum_value(std::size_t index) noexcept -> detail::enable_if_enum_t<E, std::decay_t<E>> {
-  using D = std::decay_t<E>;
-
-  if constexpr (detail::is_sparse_v<D>) {
-    return assert((index < detail::count_v<D>)), detail::values_v<D>[index];
-  } else {
-    return assert((index < detail::count_v<D>)), detail::value<D, detail::min_v<D>>(index);
-  }
-}
-
-// Returns std::array with enum values, sorted by enum value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_values() noexcept -> detail::enable_if_enum_t<E, detail::values_t<E>> {
-  using D = std::decay_t<E>;
-
-  return detail::values_v<D>;
-}
-
-// Returns name from static storage enum variable.
-// This version is much lighter on the compile times and is not restricted to the enum_range limitation.
-template <auto V>
-[[nodiscard]] constexpr auto enum_name() noexcept -> std::enable_if_t<std::is_enum_v<std::decay_t<decltype(V)>>, string_view> {
-  using D = std::decay_t<decltype(V)>;
-  constexpr string_view name = detail::enum_name_v<D, V>;
-  static_assert(name.size() > 0, "Enum value does not have a name.");
-
-  return name;
-}
-
-// Returns name from enum value.
-// If enum value does not have name or value out of range, returns empty string.
-template <typename E>
-[[nodiscard]] constexpr auto enum_name(E value) noexcept -> detail::enable_if_enum_t<E, string_view> {
-  using D = std::decay_t<E>;
-
-  if (const auto i = detail::endex<D>(value); i != detail::invalid_index_v<D>) {
-    return detail::names_v<D>[i];
-  }
-
-  return {}; // Invalid value or out of range.
-}
-
-// Returns std::array with names, sorted by enum value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_names() noexcept -> detail::enable_if_enum_t<E, detail::names_t<E>> {
-  using D = std::decay_t<E>;
-
-  return detail::names_v<D>;
-}
-
-// Returns std::array with pairs (value, name), sorted by enum value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_entries() noexcept -> detail::enable_if_enum_t<E, detail::entries_t<E>> {
-  using D = std::decay_t<E>;
-
-  return detail::entries_v<D>;
-}
-
-// Obtains enum value from integer value.
-// Returns optional with enum value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_cast(underlying_type_t<E> value) noexcept -> detail::enable_if_enum_t<E, optional<std::decay_t<E>>> {
-  using D = std::decay_t<E>;
-
-  if (detail::undex<D>(value) != detail::invalid_index_v<D>) {
-    return static_cast<D>(value);
-  }
-
-  return {}; // Invalid value or out of range.
-}
-
-// Obtains enum value from name.
-// Returns optional with enum value.
-template <typename E, typename BinaryPredicate>
-[[nodiscard]] constexpr auto enum_cast(string_view value, BinaryPredicate p) noexcept(std::is_nothrow_invocable_r_v<bool, BinaryPredicate, char, char>) -> detail::enable_if_enum_t<E, optional<std::decay_t<E>>> {
-  static_assert(std::is_invocable_r_v<bool, BinaryPredicate, char, char>, "magic_enum::enum_cast requires bool(char, char) invocable predicate.");
-  using D = std::decay_t<E>;
-
-  for (std::size_t i = 0; i < detail::count_v<D>; ++i) {
-    if (detail::cmp_equal(value, detail::names_v<D>[i], p)) {
-      return enum_value<D>(i);
-    }
-  }
-
-  return {}; // Invalid value or out of range.
-}
-
-// Obtains enum value from name.
-// Returns optional with enum value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_cast(string_view value) noexcept -> detail::enable_if_enum_t<E, optional<std::decay_t<E>>> {
-  using D = std::decay_t<E>;
-
-  return enum_cast<D>(value, detail::char_equal_to{});
-}
-
-// Returns integer value from enum value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_integer(E value) noexcept -> std::enable_if_t<std::is_enum_v<std::decay_t<E>>, underlying_type_t<E>> {
-  return static_cast<underlying_type_t<E>>(value);
-}
-
-// Obtains index in enum values from enum value.
-// Returns optional with index.
-template <typename E>
-[[nodiscard]] constexpr auto enum_index(E value) noexcept -> detail::enable_if_enum_t<E, optional<std::size_t>> {
-  using D = std::decay_t<E>;
-
-  if (const auto i = detail::endex<D>(value); i != detail::invalid_index_v<D>) {
-    return i;
-  }
-
-  return {}; // Invalid value or out of range.
-}
-
-// Checks whether enum contains enumerator with such enum value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_contains(E value) noexcept -> detail::enable_if_enum_t<E, bool> {
-  using D = std::decay_t<E>;
-
-  return detail::endex<D>(value) != detail::invalid_index_v<D>;
-}
-
-// Checks whether enum contains enumerator with such integer value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_contains(underlying_type_t<E> value) noexcept -> detail::enable_if_enum_t<E, bool> {
-  using D = std::decay_t<E>;
-
-  return detail::undex<D>(value) != detail::invalid_index_v<D>;
-}
-
-// Checks whether enum contains enumerator with such name.
-template <typename E, typename BinaryPredicate>
-[[nodiscard]] constexpr auto enum_contains(string_view value, BinaryPredicate p) noexcept(std::is_nothrow_invocable_r_v<bool, BinaryPredicate, char, char>) -> detail::enable_if_enum_t<E, bool> {
-  using D = std::decay_t<E>;
-  static_assert(std::is_invocable_r_v<bool, BinaryPredicate, char, char>, "magic_enum::enum_contains requires bool(char, char) invocable predicate.");
-
-  return enum_cast<D>(value, std::move_if_noexcept(p)).has_value();
-}
-
-// Checks whether enum contains enumerator with such name.
-template <typename E>
-[[nodiscard]] constexpr auto enum_contains(string_view value) noexcept -> detail::enable_if_enum_t<E, bool> {
-  using D = std::decay_t<E>;
-
-  return enum_cast<D>(value).has_value();
-}
-
-namespace ostream_operators {
-
-template <typename Char, typename Traits, typename E, std::enable_if_t<std::is_enum_v<E>, int> = 0>
-std::basic_ostream<Char, Traits>& operator<<(std::basic_ostream<Char, Traits>& os, E value) {
-  using D = std::decay_t<E>;
-  using U = underlying_type_t<D>;
-#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED
-  if (const auto name = magic_enum::enum_name<D>(value); !name.empty()) {
-    for (const auto c : name) {
-      os.put(c);
-    }
-    return os;
-  }
-#endif
-  return (os << static_cast<U>(value));
-}
-
-template <typename Char, typename Traits, typename E, std::enable_if_t<std::is_enum_v<E>, int> = 0>
-std::basic_ostream<Char, Traits>& operator<<(std::basic_ostream<Char, Traits>& os, optional<E> value) {
-  return value.has_value() ? (os << value.value()) : os;
-}
-
-} // namespace magic_enum::ostream_operators
-
-namespace bitwise_operators {
-
-template <typename E, std::enable_if_t<std::is_enum_v<E>, int> = 0>
-constexpr E operator~(E rhs) noexcept {
-  return static_cast<E>(~static_cast<underlying_type_t<E>>(rhs));
-}
-
-template <typename E, std::enable_if_t<std::is_enum_v<E>, int> = 0>
-constexpr E operator|(E lhs, E rhs) noexcept {
-  return static_cast<E>(static_cast<underlying_type_t<E>>(lhs) | static_cast<underlying_type_t<E>>(rhs));
-}
-
-template <typename E, std::enable_if_t<std::is_enum_v<E>, int> = 0>
-constexpr E operator&(E lhs, E rhs) noexcept {
-  return static_cast<E>(static_cast<underlying_type_t<E>>(lhs) & static_cast<underlying_type_t<E>>(rhs));
-}
-
-template <typename E, std::enable_if_t<std::is_enum_v<E>, int> = 0>
-constexpr E operator^(E lhs, E rhs) noexcept {
-  return static_cast<E>(static_cast<underlying_type_t<E>>(lhs) ^ static_cast<underlying_type_t<E>>(rhs));
-}
-
-template <typename E, std::enable_if_t<std::is_enum_v<E>, int> = 0>
-constexpr E& operator|=(E& lhs, E rhs) noexcept {
-  return lhs = (lhs | rhs);
-}
-
-template <typename E, std::enable_if_t<std::is_enum_v<E>, int> = 0>
-constexpr E& operator&=(E& lhs, E rhs) noexcept {
-  return lhs = (lhs & rhs);
-}
-
-template <typename E, std::enable_if_t<std::is_enum_v<E>, int> = 0>
-constexpr E& operator^=(E& lhs, E rhs) noexcept {
-  return lhs = (lhs ^ rhs);
-}
-
-} // namespace magic_enum::bitwise_operators
-
-namespace flags {
-
-// Returns type name of enum.
-using magic_enum::enum_type_name;
-
-// Returns number of enum-flags values.
-template <typename E>
-[[nodiscard]] constexpr auto enum_count() noexcept -> detail::enable_if_enum_flags_t<E, std::size_t> {
-  using D = std::decay_t<E>;
-
-  return detail::count_v<D, true>;
-}
-
-// Returns enum-flags value at specified index.
-// No bounds checking is performed: the behavior is undefined if index >= number of enum-flags values.
-template <typename E>
-[[nodiscard]] constexpr auto enum_value(std::size_t index) noexcept -> detail::enable_if_enum_flags_t<E, std::decay_t<E>> {
-  using D = std::decay_t<E>;
-
-  if constexpr (detail::is_sparse_v<D, true>) {
-    return assert((index < detail::count_v<D, true>)), detail::values_v<D, true>[index];
-  } else {
-    constexpr auto min = detail::log2(detail::min_v<D, true>);
-
-    return assert((index < detail::count_v<D, true>)), detail::value<D, min, true>(index);
-  }
-}
-
-// Returns std::array with enum-flags values, sorted by enum-flags value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_values() noexcept -> detail::enable_if_enum_flags_t<E, detail::values_t<E, true>> {
-  using D = std::decay_t<E>;
-
-  return detail::values_v<D, true>;
-}
-
-// Returns name from enum-flags value.
-// If enum-flags value does not have name or value out of range, returns empty string.
-template <typename E>
-[[nodiscard]] auto enum_name(E value) -> detail::enable_if_enum_flags_t<E, string> {
-  using D = std::decay_t<E>;
-  using U = underlying_type_t<D>;
-
-  string name;
-  auto check_value = U{0};
-  for (std::size_t i = 0; i < detail::count_v<D, true>; ++i) {
-    if (const auto v = static_cast<U>(enum_value<D>(i)); (static_cast<U>(value) & v) != 0) {
-      check_value |= v;
-      const auto n = detail::names_v<D, true>[i];
-      if (!name.empty()) {
-        name.append(1, '|');
-      }
-      name.append(n.data(), n.size());
-    }
-  }
-
-  if (check_value != 0 && check_value == static_cast<U>(value)) {
-    return name;
-  }
-
-  return {}; // Invalid value or out of range.
-}
-
-// Returns std::array with string names, sorted by enum-flags value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_names() noexcept -> detail::enable_if_enum_flags_t<E, detail::names_t<E, true>> {
-  using D = std::decay_t<E>;
-
-  return detail::names_v<D, true>;
-}
-
-// Returns std::array with pairs (value, name), sorted by enum-flags value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_entries() noexcept -> detail::enable_if_enum_flags_t<E, detail::entries_t<E, true>> {
-  using D = std::decay_t<E>;
-
-  return detail::entries_v<D, true>;
-}
-
-// Obtains enum-flags value from integer value.
-// Returns optional with enum-flags value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_cast(underlying_type_t<E> value) noexcept -> detail::enable_if_enum_flags_t<E, optional<std::decay_t<E>>> {
-  using D = std::decay_t<E>;
-  using U = underlying_type_t<D>;
-
-  if constexpr (detail::is_sparse_v<D, true>) {
-    auto check_value = U{0};
-    for (std::size_t i = 0; i < detail::count_v<D, true>; ++i) {
-      if (const auto v = static_cast<U>(enum_value<D>(i)); (value & v) != 0) {
-        check_value |= v;
-      }
-    }
-
-    if (check_value != 0 && check_value == value) {
-      return static_cast<D>(value);
-    }
-  } else {
-    constexpr auto min = detail::min_v<D, true>;
-    constexpr auto max = detail::value_ors<D>();
-
-    if (value >= min && value <= max) {
-      return static_cast<D>(value);
-    }
-  }
-
-  return {}; // Invalid value or out of range.
-}
-
-// Obtains enum-flags value from name.
-// Returns optional with enum-flags value.
-template <typename E, typename BinaryPredicate>
-[[nodiscard]] constexpr auto enum_cast(string_view value, BinaryPredicate p) noexcept(std::is_nothrow_invocable_r_v<bool, BinaryPredicate, char, char>) -> detail::enable_if_enum_flags_t<E, optional<std::decay_t<E>>> {
-  static_assert(std::is_invocable_r_v<bool, BinaryPredicate, char, char>, "magic_enum::flags::enum_cast requires bool(char, char) invocable predicate.");
-  using D = std::decay_t<E>;
-  using U = underlying_type_t<D>;
-
-  auto result = U{0};
-  while (!value.empty()) {
-    const auto d = detail::find(value, '|');
-    const auto s = (d == string_view::npos) ? value : value.substr(0, d);
-    auto f = U{0};
-    for (std::size_t i = 0; i < detail::count_v<D, true>; ++i) {
-      if (detail::cmp_equal(s, detail::names_v<D, true>[i], p)) {
-        f = static_cast<U>(enum_value<D>(i));
-        result |= f;
-        break;
-      }
-    }
-    if (f == U{0}) {
-      return {}; // Invalid value or out of range.
-    }
-    value.remove_prefix((d == string_view::npos) ? value.size() : d + 1);
-  }
-
-  if (result == U{0}) {
-    return {}; // Invalid value or out of range.
-  } else {
-    return static_cast<D>(result);
-  }
-}
-
-// Obtains enum-flags value from name.
-// Returns optional with enum-flags value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_cast(string_view value) noexcept -> detail::enable_if_enum_flags_t<E, optional<std::decay_t<E>>> {
-  using D = std::decay_t<E>;
-
-  return enum_cast<D>(value, detail::char_equal_to{});
-}
-
-// Returns integer value from enum value.
-using magic_enum::enum_integer;
-
-// Obtains index in enum-flags values from enum-flags value.
-// Returns optional with index.
-template <typename E>
-[[nodiscard]] constexpr auto enum_index(E value) noexcept -> detail::enable_if_enum_flags_t<E, optional<std::size_t>> {
-  using D = std::decay_t<E>;
-  using U = underlying_type_t<D>;
-
-  if (detail::is_pow2(static_cast<U>(value))) {
-    for (std::size_t i = 0; i < detail::count_v<D, true>; ++i) {
-      if (enum_value<D>(i) == value) {
-        return i;
-      }
-    }
-  }
-
-  return {}; // Invalid value or out of range.
-}
-
-// Checks whether enum-flags contains enumerator with such enum-flags value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_contains(E value) noexcept -> detail::enable_if_enum_flags_t<E, bool> {
-  using D = std::decay_t<E>;
-  using U = underlying_type_t<D>;
-
-  return enum_cast<D>(static_cast<U>(value)).has_value();
-}
-
-// Checks whether enum-flags contains enumerator with such integer value.
-template <typename E>
-[[nodiscard]] constexpr auto enum_contains(underlying_type_t<E> value) noexcept -> detail::enable_if_enum_flags_t<E, bool> {
-  using D = std::decay_t<E>;
-
-  return enum_cast<D>(value).has_value();
-}
-
-// Checks whether enum-flags contains enumerator with such name.
-template <typename E, typename BinaryPredicate>
-[[nodiscard]] constexpr auto enum_contains(string_view value, BinaryPredicate p) noexcept(std::is_nothrow_invocable_r_v<bool, BinaryPredicate, char, char>) -> detail::enable_if_enum_flags_t<E, bool> {
-  static_assert(std::is_invocable_r_v<bool, BinaryPredicate, char, char>, "magic_enum::flags::enum_contains requires bool(char, char) invocable predicate.");
-  using D = std::decay_t<E>;
-
-  return enum_cast<D>(value, std::move_if_noexcept(p)).has_value();
-}
-
-// Checks whether enum-flags contains enumerator with such name.
-template <typename E>
-[[nodiscard]] constexpr auto enum_contains(string_view value) noexcept -> detail::enable_if_enum_flags_t<E, bool> {
-  using D = std::decay_t<E>;
-
-  return enum_cast<D>(value).has_value();
-}
-
-} // namespace magic_enum::flags
-
-namespace flags::ostream_operators {
-
-template <typename Char, typename Traits, typename E, detail::enable_if_enum_flags_t<E, int> = 0>
-std::basic_ostream<Char, Traits>& operator<<(std::basic_ostream<Char, Traits>& os, E value) {
-  using D = std::decay_t<E>;
-  using U = underlying_type_t<D>;
-#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED
-  if (const auto name = magic_enum::flags::enum_name<D>(value); !name.empty()) {
-    for (const auto c : name) {
-      os.put(c);
-    }
-    return os;
-  }
-#endif
-  return (os << static_cast<U>(value));
-}
-
-template <typename Char, typename Traits, typename E, detail::enable_if_enum_flags_t<E, int> = 0>
-std::basic_ostream<Char, Traits>& operator<<(std::basic_ostream<Char, Traits>& os, optional<E> value) {
-  return value.has_value() ? (os << value.value()) : os;
-}
-
-} // namespace magic_enum::flags::ostream_operators
-
-} // namespace magic_enum
-
-#if defined(__clang__)
-#  pragma clang diagnostic pop
-#elif defined(__GNUC__)
-#  pragma GCC diagnostic pop
-#elif defined(_MSC_VER)
-#  pragma warning(pop)
-#endif
-
-#endif // NEARGYE_MAGIC_ENUM_HPP
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
new file mode 100644
index 00000000..9b9c6b6e
--- /dev/null
+++ b/tools/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(NodeEditor)
+
+set(install_targets ${install_targets} PARENT_SCOPE)
\ No newline at end of file
diff --git a/NoiseTool/CMakeLists.txt b/tools/NodeEditor/CMakeLists.txt
similarity index 50%
rename from NoiseTool/CMakeLists.txt
rename to tools/NodeEditor/CMakeLists.txt
index 98135869..58df8f5c 100644
--- a/NoiseTool/CMakeLists.txt
+++ b/tools/NodeEditor/CMakeLists.txt
@@ -1,7 +1,7 @@
 CPMAddPackage(
     NAME corrade
     GITHUB_REPOSITORY mosra/corrade
-    GIT_TAG dfbeae5c4a2ee429ecad3a37121aba3e3d389036
+    GIT_TAG 295bbba1f49887da060465f88b8501965f6acd7d
     GIT_SUBMODULES "src"
     EXCLUDE_FROM_ALL YES
     OPTIONS
@@ -12,34 +12,43 @@ CPMAddPackage(
         "CORRADE_WITH_TESTSUITE OFF"
 )
 
-CPMAddPackage(
-    NAME GLFW
-    GITHUB_REPOSITORY glfw/glfw
-    GIT_TAG 3.3.9
-    EXCLUDE_FROM_ALL YES
-    OPTIONS
-        "BUILD_SHARED_LIBS OFF"
-        "GLFW_INSTALL OFF"
-        "GLFW_BUILD_TESTS OFF"
-        "GLFW_BUILD_EXAMPLES OFF"
-        "GLFW_BUILD_DOCS OFF"
-)
+if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
+    set(NODE_EDITOR_APP_TYPE_CAPS "EMSCRIPTEN")   
+    set(NODE_EDITOR_APP_TYPE "Emscripten")   
+else()
+    set(NODE_EDITOR_APP_TYPE_CAPS "GLFW")   
+    set(NODE_EDITOR_APP_TYPE "Glfw")   
+
+    CPMAddPackage(
+        NAME GLFW
+        GITHUB_REPOSITORY glfw/glfw
+        GIT_TAG 3.3.9
+        EXCLUDE_FROM_ALL YES
+        OPTIONS
+            "BUILD_SHARED_LIBS OFF"
+            "GLFW_INSTALL OFF"
+            "GLFW_BUILD_TESTS OFF"
+            "GLFW_BUILD_EXAMPLES OFF"
+            "GLFW_BUILD_DOCS OFF"
+    )
+endif()
 
 CPMAddPackage(
     NAME magnum
     GITHUB_REPOSITORY mosra/magnum
-    GIT_TAG b1ba1f076d3e8b4295b1afac94e95ff8a846e619
+    GIT_TAG c9a884938c606b7d4555da6d278d1f3e09588c3e
     GIT_SUBMODULES "src"
     EXCLUDE_FROM_ALL YES
     OPTIONS
         "MAGNUM_BUILD_STATIC ON"
         "MAGNUM_BUILD_PLUGINS_STATIC ON"
         "MAGNUM_BUILD_STATIC_UNIQUE_GLOBALS OFF"
-        "MAGNUM_WITH_GLFWAPPLICATION ON"
+        "MAGNUM_WITH_${NODE_EDITOR_APP_TYPE_CAPS}APPLICATION ON"
         "MAGNUM_WITH_MESHTOOLS OFF"
         "MAGNUM_WITH_TRADE OFF"
         "MAGNUM_WITH_TEXT OFF"
         "MAGNUM_WITH_TEXTURETOOLS OFF"
+        "MAGNUM_TARGET_GLES2 OFF"
 )
     
 CPMAddPackage(
@@ -55,7 +64,7 @@ set(IMGUI_DIR ${imgui_SOURCE_DIR})
 CPMAddPackage(
     NAME magnum-integration
     GITHUB_REPOSITORY mosra/magnum-integration
-    GIT_TAG 05cbe5f85593b7d4252048df98f0bc3bb48b540d
+    GIT_TAG f01593fc94556bff23a848ac71187c56e034b6d9
     GIT_SUBMODULES "src"
     EXCLUDE_FROM_ALL YES
     OPTIONS
@@ -66,14 +75,14 @@ CPMAddPackage(
 # Use modules from magnum-integration since it has everything we need
 set(CMAKE_MODULE_PATH "${magnum-integration_SOURCE_DIR}/modules" ${CMAKE_MODULE_PATH})
 
-find_package(Magnum REQUIRED GL GlfwApplication)
+find_package(Magnum REQUIRED GL ${NODE_EDITOR_APP_TYPE}Application)
 find_package(MagnumIntegration REQUIRED ImGui)
 find_package(ImGui REQUIRED SourcesMiscCpp)
   
 CPMAddPackage(
     NAME imnodes
     GITHUB_REPOSITORY Auburn/imnodes
-    GIT_TAG 1aa48f4af2a4f9f1b9a6ed53fe858ed76646b233
+    GIT_TAG db2ef1192a4ddff32a838094de7127142a731ef0
     GIT_SUBMODULES ".github"
     EXCLUDE_FROM_ALL YES
     OPTIONS
@@ -88,7 +97,7 @@ CPMAddPackage(
     EXCLUDE_FROM_ALL YES
 )
 
-# Ensure FastNoise.dll is built into the same dir as NoiseTool.exe
+# Ensure FastNoise.dll is built into the same dir as NodeEditor.exe
 set_target_properties(FastNoise
     PROPERTIES
     ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}
@@ -98,19 +107,19 @@ set_target_properties(FastNoise
 
 # Bundle a better font
 # Configure resource file for imgui source dir variable
-set(NoiseTool_RESOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-configure_file("resources.conf" "${CMAKE_CURRENT_BINARY_DIR}/resources.conf")
-corrade_add_resource(NoiseTool_RESOURCES "${CMAKE_CURRENT_BINARY_DIR}/resources.conf")
+set(NodeEditor_RESOURCES_DIR "${CMAKE_CURRENT_LIST_DIR}/resources")
+configure_file("resources/resources.conf" "${CMAKE_CURRENT_BINARY_DIR}/resources.conf")
+corrade_add_resource(NodeEditor_RESOURCES "${CMAKE_CURRENT_BINARY_DIR}/resources.conf")
 
-add_executable(NoiseTool
-    "NoiseToolApp.cpp"
+add_executable(NodeEditor
+    "NodeEditorApp.cpp"
     "FastNoiseNodeEditor.cpp"
     "MeshNoisePreview.cpp"
     "NoiseTexture.cpp"
-    ${NoiseTool_RESOURCES}
+    ${NodeEditor_RESOURCES}
 ) 
 
-target_link_libraries(NoiseTool PRIVATE
+target_link_libraries(NodeEditor PRIVATE
     FastNoise
     Magnum::Application
     Magnum::Shaders
@@ -121,28 +130,46 @@ target_link_libraries(NoiseTool PRIVATE
     robin_hood
 )
 
+target_compile_features(NodeEditor PRIVATE cxx_std_20)
+
 # Windows HiDPI support
 if(CORRADE_TARGET_WINDOWS)
-    target_sources(NoiseTool PRIVATE WindowsHiDPI.manifest)
+    target_sources(NodeEditor PRIVATE resources/WindowsHiDPI.manifest)
 endif()
 
-if (UNIX)
-    target_link_options(NoiseTool PRIVATE -pthread)
+if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
+    set(CMAKE_EXECUTABLE_SUFFIX ".html")
+    target_compile_options(NodeEditor PRIVATE -pthread -msimd128)
+    target_link_options(NodeEditor PRIVATE  # file arguments are absolute: the link step runs from the build tree
+        "-sPTHREAD_POOL_SIZE=Math.max(2,navigator.hardwareConcurrency)+3-navigator.hardwareConcurrency/4"
+        -pthread -sALLOW_MEMORY_GROWTH=1 -lidbfs.js -s FORCE_FILESYSTEM
+        --shell-file "${CMAKE_CURRENT_LIST_DIR}/resources/emscripten_shell.html"
+        --pre-js "${CMAKE_CURRENT_LIST_DIR}/resources/emscripten_pre.js"
+        -Wl,-u,_emscripten_run_callback_on_thread
+    )
+    add_custom_command(TARGET NodeEditor POST_BUILD  # copy the helper script next to the built NodeEditor output
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${CMAKE_CURRENT_LIST_DIR}/resources/emscripten_enable_shared_array_buffer.js"
+        $<TARGET_FILE_DIR:NodeEditor>
+    )
+
+elseif (UNIX)
+    target_link_options(NodeEditor PRIVATE -pthread)
 
     if(APPLE)
-        set_property(TARGET NoiseTool PROPERTY
+        set_property(TARGET NodeEditor PROPERTY
             INSTALL_RPATH "@loader_path/../lib")
     else()
-        set_property(TARGET NoiseTool PROPERTY
+        set_property(TARGET NodeEditor PROPERTY
             INSTALL_RPATH "\$ORIGIN/../lib")
     endif()
 endif()
 
 if (MSVC)
-    target_compile_definitions(NoiseTool PRIVATE _CRT_SECURE_NO_WARNINGS=1)
+    target_compile_definitions(NodeEditor PRIVATE _CRT_SECURE_NO_WARNINGS=1)
 endif()
 
-set(install_targets ${install_targets} NoiseTool PARENT_SCOPE)
+set(install_targets ${install_targets} NodeEditor PARENT_SCOPE)
 
 # Make the executable a default target to build & run in Visual Studio
-set_property(DIRECTORY ${PROJECT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT NoiseTool)
+set_property(DIRECTORY ${PROJECT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT NodeEditor) # value must be a target name
diff --git a/NoiseTool/FastNoiseNodeEditor.cpp b/tools/NodeEditor/FastNoiseNodeEditor.cpp
similarity index 69%
rename from NoiseTool/FastNoiseNodeEditor.cpp
rename to tools/NodeEditor/FastNoiseNodeEditor.cpp
index c0e33830..1abf47c2 100644
--- a/NoiseTool/FastNoiseNodeEditor.cpp
+++ b/tools/NodeEditor/FastNoiseNodeEditor.cpp
@@ -1,6 +1,7 @@
 #include <sstream>
 #include <random>
 #include <cstdio>
+#include <atomic>
 
 #define IMGUI_DEFINE_MATH_OPERATORS
 #include <imgui.h>
@@ -12,47 +13,97 @@
 #include <Magnum/ImGuiIntegration/Widgets.h>
 #include <Corrade/Containers/ArrayViewStl.h>
 
-#include "ImGuiExtra.h"
+#include "util/ImGuiExtra.h"
+#include "util/DemoNodeTrees.inl"
 #include "FastNoiseNodeEditor.h"
-#include "DemoNodeTrees.inl"
+#include "NodeEditorApp.h"
 
 using namespace Magnum;
 
-static bool MatchingGroup( const std::vector<const char*>& a, const std::vector<const char*>& b )
+#include "util/SharedMemoryIpc.inl"
+
+static constexpr const char* kNodeGraphSettingsFile = FILESYSTEM_ROOT "NodeGraph.ini";
+
+void FastNoiseNodeEditor::OpenStandaloneNodeGraph()
 {
-    std::string aString;
-    for( const char* c : a )
-    {
-        aString.append( c );
-        aString.push_back( '\t' );
+#ifdef WIN32
+    std::string startArgs = "\"";
+    startArgs += mNodeEditorApp.GetExecutablePath();
+    startArgs += "\" -detached";
+
+    STARTUPINFOA si;
+    PROCESS_INFORMATION pi;
+
+    ZeroMemory( &si, sizeof( si ) );
+    si.cb = sizeof( si );
+    ZeroMemory( &pi, sizeof( pi ) );
+
+    // Create a job object
+    HANDLE hJob = CreateJobObject( NULL, NULL );
+    JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli;
+    ZeroMemory( &jeli, sizeof( jeli ) );
+
+    // Configure the job object to terminate processes when the handle is closed
+    jeli.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE;
+    SetInformationJobObject( hJob, JobObjectExtendedLimitInformation, &jeli, sizeof( jeli ) );
+
+    // Start the child process.
+    if( CreateProcessA( NULL, // No module name (use command line)
+                         (LPSTR)startArgs.data(), // Command line
+                         NULL, // Process handle not inheritable
+                         NULL, // Thread handle not inheritable
+                         FALSE, // Set handle inheritance to FALSE
+                         0, // No creation flags
+                         NULL, // Use parent's environment block
+                         NULL, // Use parent's starting directory
+                         &si, // Pointer to STARTUPINFO structure
+                         &pi ) ) // Pointer to PROCESS_INFORMATION structure
+    {
+        // Assign the child process to the job object
+        AssignProcessToJobObject( hJob, pi.hProcess );
+
+        // Close handles to the child process and primary thread
+        CloseHandle( pi.hProcess );
+        CloseHandle( pi.hThread );
     }
+    else
+#elif !defined( __EMSCRIPTEN__ )
+    pid_t pid = fork(); // Duplicate current process
 
-    std::string bString;
-    for( const char* c : b )
+    if( pid == 0 )
     {
-        bString.append( c );
-        bString.push_back( '\t' );
+        // Child process: replace this forked copy with a fresh instance started with "-detached"
+        const char* executable = mNodeEditorApp.GetExecutablePath().data(); // NOTE(review): assumes the returned buffer is NUL-terminated and outlives this call - confirm
+        execl( executable, executable, "-detached", (char*)NULL );
+        // If execl returns, it means it has failed
+        _exit( EXIT_FAILURE ); // _exit, not exit: do not run the parent's atexit handlers/static destructors or flush its stdio buffers in the forked child
+    }
+    if( pid < 0 )
+#endif
+    {
+        Debug {} << "Failed to launch standalone node graph process"
+#ifdef WIN32
+            << GetLastError()
+#endif
+        ;
     }
-
-    return aString == bString;
 }
 
-template<typename T>
-static bool MatchingMembers( const std::vector<T>& a, const std::vector<T>& b )
+static bool MatchingGroup( const FastNoise::Metadata::Vector<const char*>& a, const FastNoise::Metadata::Vector<const char*>& b )
 {
-    if( a.size() != b.size() )
+    return std::ranges::equal( a, b, []( auto& x, auto& y )
     {
-        return false;
-    }
+        return std::strcmp( x, y ) == 0;
+    } );
+}
 
-    for( size_t i = 0; i < a.size(); i++ )
+template<typename T>
+static bool MatchingMembers( const FastNoise::Metadata::Vector<T>& a, const FastNoise::Metadata::Vector<T>& b )
+{
+    return std::ranges::equal( a, b, []( auto& x, auto& y )
     {
-        if( strcmp( a[i].name, b[i].name ) != 0 )
-        {
-            return false;
-        }
-    }
-    return true;
+        return std::strcmp( x.name, y.name ) == 0;
+    } );
 }
 
 static std::string TimeWithUnits( int64_t time, int significantDigits = 3 )
@@ -71,35 +122,40 @@ static std::string TimeWithUnits( int64_t time, int significantDigits = 3 )
     return ss.str();
 }
 
-template<typename... Args>
-std::string string_format( const char* format, Args... args )
+template<size_t N, typename... Args>
+std::string string_format( const char (&format)[N], const Args&... args )
 {
     int size_s = std::snprintf( nullptr, 0, format, args... );
     if( size_s <= 0 )
     {
         return "";
     }
     auto size = static_cast<size_t>( size_s );
     std::string buf( size, 0 );
-    std::snprintf( buf.data(), size, format, args... );
+    std::snprintf( buf.data(), size + 1, format, args... ); // size + 1: the terminator lands in the string's own NUL slot, so size() stays the formatted length
     return buf;
 }
 
-template<typename... T>
-static bool DoHoverPopup( const char* format, T... args )
+const char* string_format( const char* txt )
+{
+    return txt;
+}
+
+template<typename T, typename... Args>
+static bool DoHoverPopup( T&& format, const Args&... args )
 {
     if( ImGui::IsItemHovered() )
     {
-        std::string hoverTxt = string_format( format, args... );
+        auto hoverTxt = string_format( format, args... );
 
-        if( hoverTxt.empty() )
+        if( !hoverTxt[0] )
         {
             return false;
         }
 
         ImGui::PushStyleVar( ImGuiStyleVar_WindowPadding, ImVec2( 4.f, 4.f ) );
         ImGui::BeginTooltip();
-        ImGui::TextUnformatted( hoverTxt.c_str() );
+        ImGui::TextUnformatted( &hoverTxt[0] );
         ImGui::EndTooltip();
         ImGui::PopStyleVar();
         return true;
@@ -129,7 +185,7 @@ void FastNoiseNodeEditor::Node::GeneratePreview( bool nodeTreeChanged, bool benc
     static std::array<float, NoiseSize * NoiseSize> noiseData;
 
     serialised = FastNoise::Metadata::SerialiseNodeData( data.get(), true );
-    auto generator = FastNoise::NewFromEncodedNodeTree( serialised.c_str(), editor.mMaxSIMDLevel );
+    auto generator = FastNoise::NewFromEncodedNodeTree( serialised.c_str(), editor.mMaxFeatureSet );
 
     if( !benchmark && nodeTreeChanged )
     {
@@ -138,8 +194,11 @@ void FastNoiseNodeEditor::Node::GeneratePreview( bool nodeTreeChanged, bool benc
 
     if( generator )
     {
-        auto genRGB = FastNoise::New<FastNoise::ConvertRGBA8>( editor.mMaxSIMDLevel );
-        genRGB->SetSource( generator );
+        auto genRGB = FastNoise::New<FastNoise::ConvertRGBA8>( editor.mMaxFeatureSet );
+        auto scale = FastNoise::New<FastNoise::DomainScale>( editor.mMaxFeatureSet );
+        genRGB->SetSource( scale );
+        scale->SetSource( generator );
+        scale->SetScaling( editor.mNodeScale );
         
         auto startTime = std::chrono::high_resolution_clock::now();
 
@@ -186,7 +245,7 @@ void FastNoiseNodeEditor::Node::GeneratePreview( bool nodeTreeChanged, bool benc
         }
 
         // Save nodes to ini
-        ImGuiExtra::MarkSettingsDirty();
+        editor.mSettingsDirty = true;
     }
 }
 
@@ -274,7 +333,7 @@ bool FastNoiseNodeEditor::MetadataMenuItem::CanDraw( std::function<bool( const F
 
 const FastNoise::Metadata* FastNoiseNodeEditor::MetadataMenuItem::DrawUI( std::function<bool( const FastNoise::Metadata* )> isValid, bool drawGroups ) const
 {
-    std::string format = FastNoise::Metadata::FormatMetadataNodeName( metadata, true );
+    std::string format = FastNoise::Metadata::FormatMetadataNodeName( metadata, drawGroups );
     
     if( ImGui::MenuItem( format.c_str() ) )
     {
@@ -365,7 +424,7 @@ void FastNoiseNodeEditor::Node::SerialiseIncludingDependancies( ImGuiSettingsHan
 void FastNoiseNodeEditor::SetupSettingsHandlers()
 {
     ImGuiSettingsHandler nodeSettings;
-    nodeSettings.TypeName = "NoiseToolNodeData";
+    nodeSettings.TypeName = "NodeEditorNodeData";
     nodeSettings.TypeHash = ImHashStr( nodeSettings.TypeName );
     nodeSettings.UserData = this;
     nodeSettings.WriteAllFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, ImGuiTextBuffer* outBuf )
@@ -464,7 +523,7 @@ void FastNoiseNodeEditor::SetupSettingsHandlers()
 
 
     ImGuiSettingsHandler editorSettings;
-    editorSettings.TypeName = "NoiseToolNodeGraph";
+    editorSettings.TypeName = "NodeEditorNodeGraph";
     editorSettings.TypeHash = ImHashStr( editorSettings.TypeName );
     editorSettings.UserData = this;
     editorSettings.WriteAllFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, ImGuiTextBuffer* outBuf )
@@ -475,9 +534,16 @@ void FastNoiseNodeEditor::SetupSettingsHandlers()
         ImVec2 gridOffset = ImNodes::EditorContextGetPanning();
         outBuf->appendf( "grid_offset=%f:%f\n", gridOffset.x, gridOffset.y );
 
-        outBuf->appendf( "frequency=%f\n", nodeEditor->mNodeFrequency );
+        outBuf->appendf( "scale=%f\n", nodeEditor->mNodeScale );
         outBuf->appendf( "seed=%d\n", nodeEditor->mNodeSeed );
         outBuf->appendf( "gen_type=%d\n", (int)nodeEditor->mNodeGenType );
+        
+        auto find = nodeEditor->mNodes.find( nodeEditor->mSelectedNode );
+
+        if( find != nodeEditor->mNodes.end() )
+        {
+            outBuf->appendf( "selected_node=%d\n", find->second.nodeId );
+        }
     };
     editorSettings.ReadOpenFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, const char* name ) -> void*
     {
@@ -498,23 +564,49 @@ void FastNoiseNodeEditor::SetupSettingsHandlers()
             ImNodes::EditorContextResetPanning( imVec2 );
         }
 
-        sscanf( line, "frequency=%f", &nodeEditor->mNodeFrequency );
+        sscanf( line, "scale=%f", &nodeEditor->mNodeScale );
         sscanf( line, "seed=%d", &nodeEditor->mNodeSeed );
         sscanf( line, "gen_type=%d", (int*)&nodeEditor->mNodeGenType );
+
+        if( nodeEditor->mNodeEditorApp.IsDetachedNodeGraph() )
+        {
+            int i;
+            if( sscanf( line, "selected_node=%d", &i ) == 1 )
+            {
+                if( Node* selectedNode = nodeEditor->FindNodeFromId( i ) )
+                {
+                    nodeEditor->mSelectedNode = selectedNode->data.get();
+                }
+            }
+        }
+    };
+    editorSettings.ApplyAllFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler )
+    {
+        auto* nodeEditor = (FastNoiseNodeEditor*)handler->UserData;
+        for( auto& node : nodeEditor->mNodes )
+        {
+            node.second.GeneratePreview( false );
+        }
     };
 
-    ImGuiExtra::AddOrReplaceSettingsHandler( editorSettings );
+
     ImGuiExtra::AddOrReplaceSettingsHandler( nodeSettings );
+    ImGuiExtra::AddOrReplaceSettingsHandler( editorSettings );
 }
 
-FastNoiseNodeEditor::FastNoiseNodeEditor() :
+FastNoiseNodeEditor::FastNoiseNodeEditor( NodeEditorApp& nodeEditorApp ) :
+    mNodeEditorApp( nodeEditorApp ),
+    mMainContext( ImGui::GetCurrentContext() ),
     mOverheadNode( *this, new FastNoise::NodeData( &FastNoise::Metadata::Get<FastNoise::Constant>() ), false )
 {
+    if( !mNodeEditorApp.IsDetachedNodeGraph() )
+    {
 #ifdef IMGUI_HAS_DOCK
-    ImGui::GetIO().ConfigFlags |= ImGuiConfigFlags_DockingEnable;
-    ImGui::GetIO().ConfigFlags |= ImGuiConfigFlags_ViewportsEnable;
+        ImGui::GetIO().ConfigFlags |= ImGuiConfigFlags_DockingEnable;
+        ImGui::GetIO().ConfigFlags |= ImGuiConfigFlags_ViewportsEnable;
 #endif
-    ImGui::GetIO().ConfigWindowsResizeFromEdges = true;
+        ImGui::GetIO().ConfigWindowsResizeFromEdges = true;
+    }
     ImGui::GetIO().ConfigFlags |= ImGuiConfigFlags_NavEnableSetMousePos;
     ImGui::GetIO().ConfigFlags |= ImGuiConfigFlags_NavEnableKeyboard;
 
@@ -529,8 +621,6 @@ FastNoiseNodeEditor::FastNoiseNodeEditor() :
 #ifndef NDEBUG
     mNodeBenchmarkMax = 1;
 #endif
-    
-    SetupSettingsHandlers();
 
     // Create Metadata context menu tree
     std::unordered_map<std::string, MetadataMenuGroup*> groupMap;
@@ -565,7 +655,32 @@ FastNoiseNodeEditor::FastNoiseNodeEditor() :
 
         metaDataGroup->items.emplace_back( mContextMetadata.emplace_back( new MetadataMenuItem( metadata ) ).get() );
         std::sort( metaDataGroup->items.begin(), metaDataGroup->items.end(), menuSort );
-    }    
+    }
+
+    int debugMetadataVectorCheckIdx = 0;
+    std::pair<int32_t, const char*> state;
+    do
+    {
+        state = FastNoise::Metadata::DebugCheckVectorStorageSize( debugMetadataVectorCheckIdx++ );
+        if( state.first > 0 )
+        {
+            Error{} << "Non-optimal metadata vector, in FastNoise Metadata.cpp adjust gMetadataVectorSize<" << state.second << "> to: " << state.first;
+        }
+
+    } while( state.second );
+
+}
+
+FastNoiseNodeEditor::~FastNoiseNodeEditor()
+{
+    // Go into node graph context and trigger save
+    ImGuiContext* currentContext = ImGui::GetCurrentContext();
+    ImGui::SetCurrentContext( ImNodes::GetNodeEditorImGuiContext() );
+    ImGui::SaveIniSettingsToDisk( kNodeGraphSettingsFile );
+    NodeEditorApp::SyncFileSystem();
+    ImGui::SetCurrentContext( currentContext );
+
+    ImNodes::DestroyContext();
 }
 
 void FastNoiseNodeEditor::DoNodeBenchmarks()
@@ -600,78 +715,182 @@ void FastNoiseNodeEditor::DoNodeBenchmarks()
 
 void FastNoiseNodeEditor::Draw( const Matrix4& transformation, const Matrix4& projection, const Vector3& cameraPosition )
 {
+#if !defined( WIN32 ) && !defined( __EMSCRIPTEN__ )
+    static pid_t parentPid = getppid();
+
+    if( getppid() != parentPid ) 
+    {
+        mNodeEditorApp.exit();
+    }
+#endif
+
+    DoIpcPolling();
+
+    bool isDetachedNodeEditor = mNodeEditorApp.IsDetachedNodeGraph();
     const ImGuiViewport* viewport = ImGui::GetMainViewport();
-    ImGui::DockSpaceOverViewport( viewport, ImGuiDockNodeFlags_PassthruCentralNode ); 
 
-    std::string simdTxt = "Current SIMD Level: ";
-    simdTxt += GetSIMDLevelName( mActualSIMDLevel );
-    ImGui::TextUnformatted( simdTxt.c_str() );
+    ImGuiWindowFlags windowFlags = 0;
+    ImGuiWindow* nodeGraphWindow = ImGui::FindWindowByName( "Node Graph" );
 
-    ImGui::DragInt( "Node Benchmark Count", &mNodeBenchmarkMax, 8, 8, 64 * 1024 );
+    if( isDetachedNodeEditor )
+    {
+        ImGui::SetNextWindowSize( viewport->WorkSize );
+        ImGui::SetNextWindowPos( ImVec2( 0, 0 ) );
+        windowFlags = ImGuiWindowFlags_NoDecoration | ImGuiWindowFlags_NoSavedSettings | ImGuiWindowFlags_MenuBar;
+    }
+    else if( nodeGraphWindow && nodeGraphWindow->Collapsed )
+    {
+        // Avoid saving over the window position when it is minimised from detach
+        windowFlags = ImGuiWindowFlags_NoSavedSettings | ImGuiWindowFlags_MenuBar;
+    }
+    else
+    {
+        windowFlags = ImGuiWindowFlags_MenuBar;
+        ImGui::DockSpaceOverViewport( viewport, ImGuiDockNodeFlags_PassthruCentralNode );
+        
+        std::string simdTxt = "Current Feature Set: ";
+        simdTxt += FastSIMD::GetFeatureSetString( mActualFeatureSet );
+        ImGui::TextUnformatted( simdTxt.c_str() );
+
+        ImGui::DragInt( "Node Benchmark Count", &mNodeBenchmarkMax, 8, 8, 64 * 1024 );
+
+        ImGui::SetNextWindowSize( ImVec2( 963, 634 ), ImGuiCond_FirstUseEver );
+        ImGui::SetNextWindowPos( ImVec2( 8, 439 ), ImGuiCond_FirstUseEver );
+    }
 
-    ImGui::SetNextWindowSize( ImVec2( 963, 634 ), ImGuiCond_FirstUseEver );
-    ImGui::SetNextWindowPos( ImVec2( 8, 439 ), ImGuiCond_FirstUseEver );
-    if( ImGui::Begin( "Node Editor" ) )
+    if( ImGui::Begin( "Node Graph", nullptr, windowFlags ) )
     {
         UpdateSelected();
 
-        bool edited = false;
-        ImGui::PushItemWidth( 82.0f );
-        
-        edited |= ImGui::Combo( "Generation Type", reinterpret_cast<int*>( &mNodeGenType ), NoiseTexture::GenTypeStrings );
-        edited |= ImGuiExtra::ScrollCombo( reinterpret_cast<int*>( &mNodeGenType ), NoiseTexture::GenType_Count ); 
-        ImGui::SameLine();  
-
-        edited |= ImGui::DragInt( "Seed", &mNodeSeed );
-        ImGui::SameLine();
-        edited |= ImGui::DragFloat( "Frequency", &mNodeFrequency, 0.001f );    
-        ImGui::SameLine();    
+        // Declare variables used in menus
+        bool openStandalonenodeGraph = false;
 
-        if( ImGui::Button( "Retest Node Performance" ) )
+        // Menu bar for preview settings
+        if( ImGui::BeginMenuBar() )
         {
-            for( auto& node : mNodes )
+            if( ImGui::BeginMenu( "Preview Settings" ) )
             {
-                node.second.generateAverages.clear();
+                bool edited = false;
+                ImGui::PushItemWidth( 120.0f );
+                
+                edited |= ImGui::Combo( "Generation Type", reinterpret_cast<int*>( &mNodeGenType ), NoiseTexture::GenTypeStrings );
+                edited |= ImGuiExtra::ScrollCombo( reinterpret_cast<int*>( &mNodeGenType ), NoiseTexture::GenType_Count ); 
+                
+                edited |= ImGui::DragInt( "Seed", &mNodeSeed );
+                edited |= ImGui::DragFloat( "Scale", &mNodeScale, 0.01f );
+
+                ImGui::PopItemWidth();
+                
+                if( edited )
+                {
+                    for( auto& node : mNodes )
+                    {
+                        node.second.GeneratePreview( false );
+                    }
+
+                    mSettingsDirty = true;
+                }
+                
+                ImGui::EndMenu();
             }
+            
+            if( ImGui::BeginMenu( "Tools" ) )
+            {
+                if( ImGui::MenuItem( "Retest Node Performance" ) )
+                {
+                    for( auto& node : mNodes )
+                    {
+                        node.second.generateAverages.clear();
+                    }
+                }
+                if( ImGui::IsItemHovered() )
+                {
+                    ImGui::BeginTooltip();
+                    ImGui::TextUnformatted( "Disable \"Generate Mesh Preview\" for more accurate results" );
+                    ImGui::EndTooltip();
+                }
+
+                if( !isDetachedNodeEditor )
+                {
+                    if( ImGui::MenuItem( "Detach Node Graph" ) )
+                    {
+                        openStandalonenodeGraph = true;
+
+                        ImGui::SetWindowCollapsed( true );
+                        ImGui::GetCurrentWindow()->Pos = ImVec2( 0, 0 );
+                    }
+                    if( ImGui::IsItemHovered() )
+                    {
+                        ImGui::BeginTooltip();
+                        ImGui::TextUnformatted( "Opens the node graph in a new window which can be moved to another monitor" );
+                        ImGui::EndTooltip();
+                    }
+                }
+                
+                ImGui::EndMenu();
+            }
+            ImGui::EndMenuBar();
         }
-        if( ImGui::IsItemHovered() )
+
+        ImNodes::BeginNodeEditor();
+
+        // Setup setting handles in zoom context
+        if( ImGui::GetFrameCount() == 1 )
         {
-            ImGui::BeginTooltip();
-            ImGui::TextUnformatted( "Disable \"Generate Mesh Preview\" for more accurate results" );
-            ImGui::EndTooltip();
+            SetupSettingsHandlers();
+            ImGui::LoadIniSettingsFromDisk( kNodeGraphSettingsFile );
         }
-
-        ImGui::PopItemWidth();
-        
-        if( edited )
+        if( mSettingsDirty )
         {
-            for( auto& node : mNodes )
-            {
-                node.second.GeneratePreview( false );
-            }
+            ImGui::MarkIniSettingsDirty();
+            mSettingsDirty = false;
+        }
+        if( ImGui::GetIO().WantSaveIniSettings || openStandalonenodeGraph )
+        {
+            ImGui::SaveIniSettingsToDisk( kNodeGraphSettingsFile );
+            ImGui::GetIO().WantSaveIniSettings = false;
+            NodeEditorApp::SyncFileSystem();
+        }
 
-            ImGuiExtra::MarkSettingsDirty();
-        }                
+        // Open this after saving settings
+        if( openStandalonenodeGraph )
+        {
+            OpenStandaloneNodeGraph();
+        }
 
-        ImNodes::BeginNodeEditor();
-        
+        ImGui::SetCurrentContext( mMainContext );
         DoHelp();
-
         DoContextMenu();
+        ImGui::SetCurrentContext( ImNodes::GetNodeEditorImGuiContext() );
 
         DoNodes();
 
         ImNodes::MiniMap( 0.2f, ImNodesMiniMapLocation_BottomLeft );
 
-#if 0
-        if( ImGui::IsWindowHovered() )
+        // Capture in the editor imgui context
+        float editorMouseWheel = ImGui::GetIO().MouseWheel;
+
+        ImNodes::EndNodeEditor();
+
+        // Zoom
+        if( ImNodes::IsEditorHovered() && editorMouseWheel != 0 )
         {
-            auto zoom = ImNodes::EditorContextGetZoom() + ImGui::GetIO().MouseWheel * 0.1f;
+            float zoom = ImNodes::EditorContextGetZoom();
+            if( editorMouseWheel > 0 )
+            {
+                zoom *= 1.5f;
+                if( zoom > 0.9f )
+                {
+                    zoom = 1;
+                }
+            }
+            else
+            {
+                zoom /= 1.5f;
+                zoom = std::max( zoom, 0.2f );
+            }
             ImNodes::EditorContextSetZoom( zoom, ImGui::GetMousePos() );
         }
-#endif
-
-        ImNodes::EndNodeEditor();
 
         CheckLinks();
 
@@ -680,9 +899,12 @@ void FastNoiseNodeEditor::Draw( const Matrix4& transformation, const Matrix4& pr
 
     DoNodeBenchmarks();
 
-    mNoiseTexture.Draw();
+    if( !isDetachedNodeEditor )
+    {
+        mNoiseTexture.Draw();
 
-    mMeshNoisePreview.Draw( transformation, projection, cameraPosition );
+        mMeshNoisePreview.Draw( transformation, projection, cameraPosition );
+    }
 }
 
 void FastNoiseNodeEditor::CheckLinks()
@@ -752,9 +974,9 @@ void FastNoiseNodeEditor::UpdateSelected()
     std::vector<int> linksToDelete;
     int selectedLinkCount = ImNodes::NumSelectedLinks();
 
-    bool delKeyPressed =
+    bool delKeyPressed = !ImGui::GetIO().WantTextInput && (
         ImGui::IsKeyPressed( ImGui::GetKeyIndex( ImGuiKey_Delete ), false ) ||
-        ImGui::IsKeyPressed( ImGui::GetKeyIndex( ImGuiKey_Backspace ), false );
+        ImGui::IsKeyPressed( ImGui::GetKeyIndex( ImGuiKey_Backspace ), false ) );
 
     if( selectedLinkCount && delKeyPressed )
     {
@@ -811,9 +1033,9 @@ void FastNoiseNodeEditor::UpdateSelected()
     }
 }
 
-void FastNoiseNodeEditor::SetSIMDLevel( FastSIMD::eLevel lvl )
+void FastNoiseNodeEditor::SetSIMDLevel( FastSIMD::FeatureSet lvl )
 {
-    mMaxSIMDLevel = lvl;
+    mMaxFeatureSet = lvl;
 
     mOverheadNode.generateAverages.clear();
     DoNodeBenchmarks();
@@ -824,7 +1046,7 @@ void FastNoiseNodeEditor::SetSIMDLevel( FastSIMD::eLevel lvl )
         node.second.GeneratePreview( false );
     }
 
-    ChangeSelectedNode( mSelectedNode );
+    SetPreviewGenerator( mCachedActiveEnt );
 }
 
 void FastNoiseNodeEditor::DoNodes()
@@ -855,10 +1077,19 @@ void FastNoiseNodeEditor::DoNodes()
         }
 
         ImNodes::EndNodeTitleBar();
+        ImGuiID popupId = ImGui::GetItemID();
+
+        if( ImGui::IsMouseReleased( ImGuiMouseButton_Right ) && ImGui::IsItemHovered( ImGuiHoveredFlags_AllowWhenBlockedByPopup ) )
+        {
+            ImGui::SetCurrentContext( mMainContext );
+            ImGui::OpenPopup( popupId );
+        }
 
+        ImGui::SetCurrentContext( mMainContext );
         // Right click node title to change node type
         ImGui::PushStyleVar( ImGuiStyleVar_WindowPadding, ImVec2( 4, 4 ) );
-        if( ImGui::BeginPopupContextItem() )
+
+        if( ImGui::BeginPopupEx( popupId, ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoSavedSettings ) )
         {
             if( ImGui::MenuItem( "Copy Encoded Node Tree" ) )
             {
@@ -881,7 +1112,7 @@ void FastNoiseNodeEditor::DoNodes()
                     MatchingMembers( newMetadata->memberNodeLookups, nodeMetadata->memberNodeLookups ) &&
                     MatchingMembers( newMetadata->memberHybrids, nodeMetadata->memberHybrids ) )
                 {
-                    nodeMetadata = newMetadata;                    
+                    nodeMetadata = newMetadata;
                 }
                 else
                 {
@@ -911,7 +1142,7 @@ void FastNoiseNodeEditor::DoNodes()
                         links.pop();
                     }
 
-                    *node.second.data = std::move( newData );                  
+                    *node.second.data = std::move( newData );
                 }
 
                 node.second.GeneratePreview();
@@ -921,7 +1152,9 @@ void FastNoiseNodeEditor::DoNodes()
         }
         ImGui::PopStyleVar();
 
-        ImGui::PushItemWidth( 60.0f );
+        ImGui::SetCurrentContext( ImNodes::GetNodeEditorImGuiContext() );
+
+        ImGui::PushItemWidth( 90.0f );
 
         ImNodes::PushAttributeFlag( ImNodesAttributeFlags_EnableLinkCreationOnSnap );
         ImNodes::PushAttributeFlag( ImNodesAttributeFlags_EnableLinkDetachWithDragClick );
@@ -954,7 +1187,7 @@ void FastNoiseNodeEditor::DoNodes()
 
             formatName = FastNoise::Metadata::FormatMetadataMemberName( nodeMetadata->memberHybrids[i] );
 
-            if( ImGui::DragFloat( formatName.c_str(), &nodeData->hybrids[i].second, 0.02f, 0, 0, floatFormat ) )
+            if( ImGui::DragFloat( formatName.c_str(), &nodeData->hybrids[i].second, nodeMetadata->memberHybrids[i].valueUiDragSpeed, 0, 0, floatFormat ) )
             {
                 node.second.GeneratePreview();
             }
@@ -979,7 +1212,7 @@ void FastNoiseNodeEditor::DoNodes()
             {
             case FastNoise::Metadata::MemberVariable::EFloat:
             {
-                if( ImGui::DragFloat( formatName.c_str(), &nodeData->variables[i].f, 0.02f, nodeVar.valueMin.f, nodeVar.valueMax.f ) )
+                if( ImGui::DragFloat( formatName.c_str(), &nodeData->variables[i].f, nodeVar.valueUiDragSpeed, nodeVar.valueMin.f, nodeVar.valueMax.f ) )
                 {
                     node.second.GeneratePreview();
                 }
@@ -987,7 +1220,7 @@ void FastNoiseNodeEditor::DoNodes()
             break;
             case FastNoise::Metadata::MemberVariable::EInt:
             {
-                if( ImGui::DragInt( formatName.c_str(), &nodeData->variables[i].i, 0.2f, nodeVar.valueMin.i, nodeVar.valueMax.i ) )
+                if( ImGui::DragInt( formatName.c_str(), &nodeData->variables[i].i, nodeVar.valueUiDragSpeed, nodeVar.valueMin.i, nodeVar.valueMax.i ) )
                 {
                     node.second.GeneratePreview();
                 }
@@ -995,7 +1228,7 @@ void FastNoiseNodeEditor::DoNodes()
             break;
             case FastNoise::Metadata::MemberVariable::EEnum:
             {
-                if( ImGui::Combo( formatName.c_str(), &nodeData->variables[i].i, nodeVar.enumNames.data(), (int)nodeVar.enumNames.size() ) ||
+                if( ImGui::Combo( formatName.c_str(), &nodeData->variables[i].i, nodeVar.enumNames.begin(), (int)nodeVar.enumNames.size() ) ||
                     ImGuiExtra::ScrollCombo( &nodeData->variables[i].i, (int)nodeVar.enumNames.size() ) )
                 {
                     node.second.GeneratePreview();
@@ -1093,7 +1326,7 @@ void FastNoiseNodeEditor::DoHelp()
     ImGui::Text( " Help" );
     if( ImGui::IsItemHovered() )
     {
-        ImGui::PushStyleVar( ImGuiStyleVar_WindowPadding, ImVec2( 4.f, 4.f ) );
+        ImGui::PushStyleVar( ImGuiStyleVar_WindowPadding, ImVec2( 6.f, 6.f ) );
         ImGui::BeginTooltip();
         constexpr float alignPx = 110;
 
@@ -1105,6 +1338,10 @@ void FastNoiseNodeEditor::DoHelp()
         ImGui::SameLine( alignPx );
         ImGui::TextUnformatted( "Right mouse drag" );
 
+        ImGui::TextUnformatted( "Zoom graph" );
+        ImGui::SameLine( alignPx );
+        ImGui::TextUnformatted( "Mouse wheel" );
+
         ImGui::TextUnformatted( "Delete node/link" );
         ImGui::SameLine( alignPx );
         ImGui::TextUnformatted( "Backspace or Delete" );
@@ -1128,7 +1365,7 @@ void FastNoiseNodeEditor::DoContextMenu()
     ImGui::PushStyleVar( ImGuiStyleVar_WindowPadding, ImVec2( 4, 4 ) );
     if( distance < 5.0f && ImGui::BeginPopupContextWindow( "new_node", 1 ) )
     {
-        mContextStartPos = ImGui::GetMousePosOnOpeningCurrentPopup();
+        mContextStartPos = ImNodes::ConvertToEditorContextSpace( ImGui::GetMousePosOnOpeningCurrentPopup() );
 
         if( auto newMetadata = mContextMetadata.front()->DrawUI() )
         {
@@ -1198,11 +1435,11 @@ void FastNoiseNodeEditor::DoContextMenu()
     }
     if( ImGui::BeginPopup( "new_node_drop", ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoSavedSettings ) )
     {
-        ImVec2 startPos = ImGui::GetMousePosOnOpeningCurrentPopup();
+        ImVec2 startPos = ImNodes::ConvertToEditorContextSpace( ImGui::GetMousePosOnOpeningCurrentPopup() );
 
         auto newMetadata = mContextMetadata.front()->DrawUI( []( const FastNoise::Metadata* metadata )
         {
-            return !metadata->memberNodeLookups.empty() || !metadata->memberHybrids.empty();
+            return metadata->memberNodeLookups.size() || metadata->memberHybrids.size();
         } );
 
         if( newMetadata )
@@ -1218,25 +1455,16 @@ void FastNoiseNodeEditor::DoContextMenu()
     ImGui::PopStyleVar();
 }
 
-FastNoise::SmartNode<> FastNoiseNodeEditor::GenerateSelectedPreview()
+std::string_view FastNoiseNodeEditor::GetSelectedEncodedNodeTree()
 {
     auto find = mNodes.find( mSelectedNode );
 
-    FastNoise::SmartNode<> generator;
-
     if( find != mNodes.end() )
     {
-        generator = FastNoise::NewFromEncodedNodeTree( find->second.serialised.c_str(), mMaxSIMDLevel );
-
-        if( generator )
-        {
-            mActualSIMDLevel = generator->GetSIMDLevel();
-        }
+        return find->second.serialised;
     }
 
-    mNoiseTexture.ReGenerate( generator );
-
-    return generator;
+    return { "" };
 }
 
 FastNoise::OutputMinMax FastNoiseNodeEditor::GenerateNodePreviewNoise( FastNoise::Generator* gen, float* noise )
@@ -1246,25 +1474,22 @@ FastNoise::OutputMinMax FastNoiseNodeEditor::GenerateNodePreviewNoise( FastNoise
     case NoiseTexture::GenType_2D:
         return gen->GenUniformGrid2D( noise,
             Node::NoiseSize / -2, Node::NoiseSize / -2,
-            Node::NoiseSize, Node::NoiseSize,
-            mNodeFrequency, mNodeSeed );
+            Node::NoiseSize, Node::NoiseSize, mNodeSeed );
 
     case NoiseTexture::GenType_2DTiled:
         return gen->GenTileable2D( noise,
-            Node::NoiseSize, Node::NoiseSize,
-            mNodeFrequency, mNodeSeed );
+            Node::NoiseSize, Node::NoiseSize, mNodeSeed );
 
     case NoiseTexture::GenType_3D:
         return gen->GenUniformGrid3D( noise,
             Node::NoiseSize / -2, Node::NoiseSize / -2, 0,
-            Node::NoiseSize, Node::NoiseSize, 1,
-            mNodeFrequency, mNodeSeed );
+            Node::NoiseSize, Node::NoiseSize, 1, mNodeSeed );
 
     case NoiseTexture::GenType_4D:
         return gen->GenUniformGrid4D( noise,
             Node::NoiseSize / -2, Node::NoiseSize / -2, 0, 0,
-            Node::NoiseSize, Node::NoiseSize, 1, 1,
-            mNodeFrequency, mNodeSeed );
+            Node::NoiseSize, Node::NoiseSize, 1, 1, mNodeSeed );
+
     case NoiseTexture::GenType_Count:
         break;
     }
@@ -1304,30 +1529,70 @@ void FastNoiseNodeEditor::ChangeSelectedNode( FastNoise::NodeData* newId )
 {
     mSelectedNode = newId;
 
-    FastNoise::SmartNode<> generator = GenerateSelectedPreview();
+    std::string_view encodedNodeTree = GetSelectedEncodedNodeTree();
 
-    if( generator )
+    if( !encodedNodeTree.empty() )
     {
-        mMeshNoisePreview.ReGenerate( generator );
+        // Send updated node tree via IPC
+        unsigned char* sharedMemory = static_cast<unsigned char*>( mNodeEditorApp.GetIpcSharedMemory() );
+
+        if( encodedNodeTree.length() + 3 >= kSharedMemorySize )
+        {
+            Debug {} << "Encoded node tree too large to send via IPC " << encodedNodeTree.length();
+            sharedMemory = nullptr;
+        }
+
+        if( sharedMemory )
+        {
+            std::memcpy( sharedMemory + 2, encodedNodeTree.data(), encodedNodeTree.length() + 1 );
+            sharedMemory[1] = 0;
+
+            std::atomic_thread_fence( std::memory_order_acq_rel );
+            sharedMemory[0]++; // Increment counter to mark updated tree
+        }
+        else
+        {
+            SetPreviewGenerator( encodedNodeTree );
+        }
     }
 }
 
-const char* FastNoiseNodeEditor::GetSIMDLevelName( FastSIMD::eLevel lvl )
+void FastNoiseNodeEditor::SetPreviewGenerator( std::string_view encodedNodeTree )
 {
-    switch( lvl )
-    {
-    default:
-    case FastSIMD::Level_Null:   return "NULL";
-    case FastSIMD::Level_Scalar: return "Scalar";
-    case FastSIMD::Level_SSE:    return "SSE";
-    case FastSIMD::Level_SSE2:   return "SSE2";
-    case FastSIMD::Level_SSE3:   return "SSE3";
-    case FastSIMD::Level_SSSE3:  return "SSSE3";
-    case FastSIMD::Level_SSE41:  return "SSE4.1";
-    case FastSIMD::Level_SSE42:  return "SSE4.2";
-    case FastSIMD::Level_AVX:    return "AVX";
-    case FastSIMD::Level_AVX2:   return "AVX2";
-    case FastSIMD::Level_AVX512: return "AVX512";
-    case FastSIMD::Level_NEON:   return "NEON";
+    auto SetActiveEnt = [this]( std::string_view encodedNodeTree )
+    {
+        if( GetSelectedEncodedNodeTree() != encodedNodeTree )
+        {
+            mSelectedNode = nullptr;
+        }
+
+        mCachedActiveEnt = encodedNodeTree;
+    };
+
+    FastNoise::SmartNode<> generator = FastNoise::NewFromEncodedNodeTree( encodedNodeTree.data(), mMaxFeatureSet );
+
+    if( generator )
+    {
+        mActualFeatureSet = generator->GetActiveFeatureSet();
+
+        if( !mNodeEditorApp.IsDetachedNodeGraph() )
+        {
+            mNoiseTexture.ReGenerate( generator );
+            mMeshNoisePreview.ReGenerate( generator );
+        }
+
+        if( !encodedNodeTree.empty() )
+        {
+            SetActiveEnt( encodedNodeTree );
+        }
+    }
+    else if( encodedNodeTree.empty() )
+    {
+        SetActiveEnt( encodedNodeTree );
+    }
+    else
+    {
+        Debug {} << "Invalid encoded node tree";
     }
 }
+
diff --git a/NoiseTool/FastNoiseNodeEditor.h b/tools/NodeEditor/FastNoiseNodeEditor.h
similarity index 86%
rename from NoiseTool/FastNoiseNodeEditor.h
rename to tools/NodeEditor/FastNoiseNodeEditor.h
index 2ee3b82f..cbb4b0b7 100644
--- a/NoiseTool/FastNoiseNodeEditor.h
+++ b/tools/NodeEditor/FastNoiseNodeEditor.h
@@ -18,14 +18,20 @@
 
 namespace Magnum
 {
+    class NodeEditorApp;
+
     class FastNoiseNodeEditor
     {
     public:
-        FastNoiseNodeEditor();
+        FastNoiseNodeEditor( NodeEditorApp& nodeEditorApp );
+        ~FastNoiseNodeEditor();
+
         void Draw( const Matrix4& transformation, const Matrix4& projection, const Vector3& cameraPosition );
-        void SetSIMDLevel( FastSIMD::eLevel lvl );
+        void SetSIMDLevel( FastSIMD::FeatureSet lvl );
+        void DoIpcPolling();
 
-        static const char* GetSIMDLevelName( FastSIMD::eLevel lvl );
+        static void* SetupSharedMemoryIpc();
+        static void ReleaseSharedMemoryIpc();
 
     private:
         struct Node
@@ -102,14 +108,16 @@ namespace Magnum
 
         Node& AddNode( ImVec2 startPos, const FastNoise::Metadata* metadata, bool generatePreview = true );
         bool AddNodeFromEncodedString( const char* string, ImVec2 nodePos );
-        FastNoise::SmartNode<> GenerateSelectedPreview();
+        std::string_view GetSelectedEncodedNodeTree();
         FastNoise::OutputMinMax GenerateNodePreviewNoise( FastNoise::Generator* gen, float* noise );
         Node* FindNodeFromId( int id );
         int GetFreeNodeId();
+        void SetPreviewGenerator( std::string_view encodedNodeTree );
         void ChangeSelectedNode( FastNoise::NodeData* newId );
         void DeleteNode( FastNoise::NodeData* nodeData );
         void DoNodeBenchmarks();
         void SetupSettingsHandlers();
+        void OpenStandaloneNodeGraph();
 
         void CheckLinks();
         void DoHelp();
@@ -117,6 +125,9 @@ namespace Magnum
         void DoNodes();
         void UpdateSelected();
 
+        NodeEditorApp& mNodeEditorApp;
+        ImGuiContext* mMainContext;
+
         std::unordered_map<FastNoise::NodeData*, Node> mNodes;
         FastNoise::NodeData* mDroppedLinkNode = nullptr;
         bool mDroppedLink = false;
@@ -125,20 +136,22 @@ namespace Magnum
         std::vector<std::unique_ptr<MetadataMenu>> mContextMetadata;
         std::string mImportNodeString;
         bool mImportNodeModal = false;
+        bool mSettingsDirty = false;
 
         MeshNoisePreview mMeshNoisePreview;
         NoiseTexture mNoiseTexture;
 
+        std::string mCachedActiveEnt;
         FastNoise::NodeData* mSelectedNode = nullptr;
         Node mOverheadNode;
         int32_t mNodeBenchmarkIndex = 0;
         int32_t mNodeBenchmarkMax = 128;
 
-        float mNodeFrequency = 0.02f;
+        float mNodeScale = 2.5f;
         int mNodeSeed = 1337;
         NoiseTexture::GenType mNodeGenType = NoiseTexture::GenType_2D;
 
-        FastSIMD::eLevel mMaxSIMDLevel    = FastSIMD::Level_Null;
-        FastSIMD::eLevel mActualSIMDLevel = FastSIMD::Level_Null;
+        FastSIMD::FeatureSet mMaxFeatureSet    = FastSIMD::FeatureSet::Max;
+        FastSIMD::FeatureSet mActualFeatureSet = FastSIMD::FeatureSet::Invalid;
     };
-}
\ No newline at end of file
+}
diff --git a/tools/NodeEditor/MeshNoisePreview.cpp b/tools/NodeEditor/MeshNoisePreview.cpp
new file mode 100644
index 00000000..20d42231
--- /dev/null
+++ b/tools/NodeEditor/MeshNoisePreview.cpp
@@ -0,0 +1,1110 @@
+#include <algorithm>
+#include <cmath>
+#include <thread>
+#include <bit>
+
+#include <Corrade/Utility/Resource.h>
+#include <Magnum/Math/Color.h>
+#include <Magnum/Math/Frustum.h>
+#include <Magnum/Math/Intersection.h>
+#include <Magnum/Math/Matrix4.h>
+#include <Magnum/GL/Context.h>
+#include <Magnum/GL/Extensions.h>
+
+#include "util/ImGuiExtra.h"
+#include "util/DmcTable.inl"
+#include "MeshNoisePreview.h"
+
+
+using namespace Magnum;
+
+// Square root of `x` by Newton-Raphson iteration, usable in constant
+// expressions. `curr` is the current estimate and `prev` the one before it;
+// NormaliseConstExpr seeds them with two different values. Iteration stops as
+// soon as an estimate repeats exactly, and that estimate is returned.
+static constexpr float SqrtNewtonRaphson( float x, float curr, float prev )
+{
+    while( curr != prev )
+    {
+        prev = curr;
+        curr = 0.5f * ( curr + x / curr );
+    }
+    return curr;
+}
+
+// Returns `vec` divided by its length, usable in constant expressions. The
+// length is the Newton-Raphson root of the squared length, with the squared
+// length itself used as the first estimate.
+static constexpr Vector3 NormaliseConstExpr( const Vector3& vec )
+{
+    const float x = vec.x();
+    const float y = vec.y();
+    const float z = vec.z();
+    const float lengthSquared = x * x + y * y + z * z;
+    const float length = SqrtNewtonRaphson( lengthSquared, lengthSquared, 0 );
+    return vec / length;
+}
+
+// Fills in the default mesh build parameters, starts the mesh generation
+// worker threads and registers the settings handlers.
+MeshNoisePreview::MeshNoisePreview()
+{
+    // Default build parameters
+    mBuildData.meshType = MeshType_DualMarchingCubes3D;
+    mBuildData.color = Color3( 1.0f );
+    mBuildData.seed = 1337;
+    mBuildData.scale = 1.f;
+    mBuildData.isoSurface = 0.0f;
+    mBuildData.heightmapMultiplier = 100.0f;
+
+    // Treat the machine as having at least two hardware threads, then hold
+    // back a quarter of them (rounded down) from the worker pool
+    const uint32_t hardwareThreads = std::max( 2u, std::thread::hardware_concurrency() );
+    const uint32_t workerCount = hardwareThreads - hardwareThreads / 4;
+
+    // Every worker runs GenerateLoopThread on the shared generate/complete queues
+    for( uint32_t remaining = workerCount; remaining > 0; remaining-- )
+    {
+        mThreads.emplace_back( GenerateLoopThread, std::ref( mGenerateQueue ), std::ref( mCompleteQueue ) );
+    }
+
+    Debug{} << "Mesh generator thread count: " << mThreads.size();
+
+    SetupSettingsHandlers();
+}
+
+// Calls KillThreads() on the generate queue (presumably releasing the workers
+// so they can exit - confirm in the queue implementation), then joins every
+// worker thread.
+MeshNoisePreview::~MeshNoisePreview()
+{
+    mGenerateQueue.KillThreads();
+
+    std::for_each( mThreads.begin(), mThreads.end(), []( auto& worker ) { worker.join(); } );
+}
+
+// Restarts mesh generation for `generator`.
+//
+// Resets the load range and the recorded min/max statistics, discards every
+// loaded chunk and queued job, and stores a new generation version in the
+// build data.
+//
+// `generator` may be null: Draw() passes the stored generator back in when the
+// "Generate Mesh Preview" checkbox is toggled, which can happen before any
+// generator has been assigned. In that case no scaled generator is built.
+void MeshNoisePreview::ReGenerate( FastNoise::SmartNodeArg<> generator )
+{
+    mLoadRange = 200.0f;
+    mBuildData.generator = generator;
+    mBuildData.pos = Vector3i( 0 );
+
+    if( generator )
+    {
+        // Wrap the source in a DomainScale node carrying the preview scale
+        mBuildData.generatorScaled = FastNoise::New<FastNoise::DomainScale>( generator->GetActiveFeatureSet() );
+        mBuildData.generatorScaled->SetScaling( mBuildData.scale );
+        mBuildData.generatorScaled->SetSource( generator );
+    }
+    else
+    {
+        // Dereferencing a null generator would crash; drop any previous scaled
+        // generator instead so it does not outlive its source
+        mBuildData.generatorScaled = decltype( mBuildData.generatorScaled )();
+    }
+
+    mMinMax = {};
+    mMinAirY = INFINITY;
+    mMaxSolidY = -INFINITY;
+
+    mRegisteredChunkPositions.clear();
+    mChunks.clear();
+    mGenerateQueue.Clear();
+    mBuildData.genVersion = mCompleteQueue.IncVersion();
+
+    // Drain and free any results still waiting in the complete queue
+    Chunk::MeshData meshData;
+    while( mCompleteQueue.Pop( meshData ) )
+    {
+        meshData.Free();
+    }
+}
+
+// Draws the mesh preview UI and renders the loaded chunk meshes.
+//
+// Only the enable checkbox is shown while the preview is disabled or no
+// generator is set. Otherwise the chunk queues are serviced, every chunk mesh
+// whose bounding box intersects the camera frustum is drawn, the preview
+// settings and statistics widgets are emitted, and finally chunks around
+// `cameraPosition` are updated.
+void MeshNoisePreview::Draw( const Matrix4& transformation, const Matrix4& projection, const Vector3& cameraPosition )
+{
+    const bool toggled = ImGui::Checkbox( "Generate Mesh Preview", &mEnabled );
+    if( toggled )
+    {
+        ReGenerate( mBuildData.generator );
+        ImGuiExtra::MarkSettingsDirty();
+    }
+
+    if( !mEnabled || !mBuildData.generator )
+    {
+        return;
+    }
+
+    UpdateChunkQueues( cameraPosition );
+
+    const Matrix4 viewProjection = projection * transformation;
+    const Frustum frustum = Frustum::fromMatrix( viewProjection );
+    mShader.SetTransformationProjectionMatrix( viewProjection );
+
+    mTriCount = 0;
+    mMeshesCount = 0;
+    uint32_t drawnElementCount = 0;
+
+    // The mesh type is only edited by the widgets further down, so it is
+    // constant while the chunks are drawn
+    const bool drawingHeightmap = mBuildData.meshType == MeshType_Heightmap2D;
+
+    for( Chunk& chunk : mChunks )
+    {
+        GL::Mesh* mesh = chunk.GetMesh();
+        if( !mesh )
+        {
+            continue;
+        }
+
+        // Raw mesh counts are accumulated here and divided by three afterwards
+        const int32_t meshElementCount = mesh->count();
+        mTriCount += meshElementCount;
+        mMeshesCount++;
+
+        const Vector3 chunkOrigin( chunk.GetPos() );
+        Range3D bounds( chunkOrigin, chunkOrigin + Vector3( Chunk::SIZE + 1 ) );
+
+        if( drawingHeightmap )
+        {
+            // Heightmap bounds span the generated min/max scaled by the multiplier
+            bounds.min().y() = mMinMax.min * mBuildData.heightmapMultiplier;
+            bounds.max().y() = mMinMax.max * mBuildData.heightmapMultiplier;
+        }
+
+        if( Math::Intersection::rangeFrustum( bounds, frustum ) )
+        {
+            drawnElementCount += meshElementCount;
+            mShader.draw( *mesh );
+        }
+    }
+    mTriCount /= 3;
+
+    // Any of the widgets feeding `regenerate` restarts mesh generation
+    bool regenerate = false;
+    int* const meshTypeValue = reinterpret_cast<int*>( &mBuildData.meshType );
+    regenerate |= ImGui::Combo( "Mesh Type", meshTypeValue, MeshTypeStrings );
+    regenerate |= ImGuiExtra::ScrollCombo( meshTypeValue, MeshType_Count );
+
+    if( ImGui::ColorEdit3( "Mesh Colour", mBuildData.color.data() ) )
+    {
+        mShader.SetColorTint( mBuildData.color );
+        ImGuiExtra::MarkSettingsDirty();
+    }
+
+    regenerate |= ImGui::DragInt( "Seed", &mBuildData.seed );
+    regenerate |= ImGui::DragFloat( "Scale", &mBuildData.scale, 0.05f, 0, 0, "%.4f" );
+
+    // Re-read the mesh type: the combo above may have changed it this frame
+    const bool heightmapSelected = mBuildData.meshType == MeshType_Heightmap2D;
+
+    if( heightmapSelected )
+    {
+        regenerate |= ImGui::DragFloat( "Heightmap Multiplier", &mBuildData.heightmapMultiplier, 0.5f );
+    }
+    else
+    {
+        regenerate |= ImGui::DragFloat( "Iso Surface", &mBuildData.isoSurface, 0.02f );
+    }
+
+    if( regenerate )
+    {
+        ReGenerate( mBuildData.generator );
+        ImGuiExtra::MarkSettingsDirty();
+    }
+
+    // The triangle limit is edited in millions and stored as a plain count
+    float triLimitMillions = (float)mTriLimit / 1000000.0f;
+    if( ImGui::DragFloat( "Triangle Limit", &triLimitMillions, 1, 10.0f, 300.0f, "%0.1fM" ) )
+    {
+        mTriLimit = (uint32_t)( triLimitMillions * 1000000 );
+        ImGuiExtra::MarkSettingsDirty();
+    }
+
+    // Statistics read-out
+    ImGui::Text( "Triangle Count: %0.1fM (%0.1fM)", mTriCount / 1000000.0f, drawnElementCount / 3000000.0f );
+    if( !heightmapSelected )
+    {
+        ImGui::Text( "Voxel Count: %0.1fM", mChunks.size() * ( Chunk::SIZE * Chunk::SIZE * Chunk::SIZE / 1000000.0f ) );
+    }
+    ImGui::Text( "Loaded Chunks: %zu (%d)", mChunks.size(), mMeshesCount );
+
+    const size_t queuedForGenerate = mGenerateQueue.Count();
+    ImGui::Text( "Meshing Chunks: %zu (%zu)", mRegisteredChunkPositions.size() - mChunks.size() - queuedForGenerate, queuedForGenerate );
+    ImGui::Text( "Chunk Load Range: %0.1f", mLoadRange );
+    ImGui::Text( "Generated Min (%0.6f) : Max (%0.6f)", mMinMax.min, mMinMax.max );
+
+    if( !heightmapSelected )
+    {
+        ImGui::Text( "Min Air Y (%0.1f) : Max Solid Y (%0.1f)", mMinAirY, mMaxSolidY );
+    }
+
+    ImGui::Text( "Camera Pos: %0.1f, %0.1f, %0.1f", cameraPosition.x(), cameraPosition.y(), cameraPosition.z() );
+
+    UpdateChunksForPosition( cameraPosition );
+}
+
+// Returns the fraction by which UpdateChunkQueues() grows or shrinks the load
+// range per call: 1000 / min(1000, mLoadRange)^1.5, capped at 0.01.
+float MeshNoisePreview::GetLoadRangeModifier()
+{
+    const float clampedRange = std::min( 1000.0f, mLoadRange );
+    const double falloff = 1000 / std::pow( clampedRange, 1.5 );
+
+    return std::min( 0.01f, (float)falloff );
+}
+
+// Chunk housekeeping around `position`, called from Draw():
+//  - shrinks the load range while the triangle count exceeds the limit,
+//  - moves finished meshes from the complete queue into mChunks,
+//  - sorts the chunks nearest-first and unloads the farthest ones that lie
+//    beyond 1.1x the load range,
+//  - grows the load range while under 85% of the triangle limit and few
+//    chunks are waiting to be meshed.
+// The queue draining and unloading loops are cut short once
+// GetTimerDurationMs() reaches 14 and 15 respectively.
+void MeshNoisePreview::UpdateChunkQueues( const Vector3& position )
+{
+    const size_t queueCount = mCompleteQueue.Count();
+
+    // Over the triangle limit: pull the load range in, never below 1.5 chunks
+    if( mTriCount > mTriLimit )
+    {
+        mLoadRange = std::max( mLoadRange * ( 1 - GetLoadRangeModifier() ), Chunk::SIZE * 1.5f );
+    }
+
+    StartTimer();
+    const Vector3i centre = Vector3i( position - Vector3( Chunk::SIZE / 2.0f ) );
+
+    size_t newChunks = 0;
+    if( queueCount )
+    {
+        Chunk::MeshData meshData;
+
+        while( GetTimerDurationMs() < 14 && mCompleteQueue.Pop( meshData ) )
+        {
+            // Fold this chunk's statistics into the running totals
+            mMinMax << meshData.minMax;
+            mMinAirY = std::min( mMinAirY, meshData.minAirY );
+            mMaxSolidY = std::max( mMaxSolidY, meshData.maxSolidY );
+
+            mChunks.emplace_back( meshData );
+            newChunks++;
+        }
+
+        // Slow moving average of how many chunks get accepted per call
+        mAvgNewChunks += ( newChunks - mAvgNewChunks ) * 0.01f;
+    }
+
+    // Nearest chunk first, so the farthest chunk always sits at the back
+    const auto distanceSq = [centre]( const Chunk& chunk ) { return ( centre - chunk.GetPos() ).dot(); };
+    std::sort( mChunks.begin(), mChunks.end(),
+               [&distanceSq]( const Chunk& a, const Chunk& b ) { return distanceSq( a ) < distanceSq( b ); } );
+
+    // Unload the farthest chunks while they lie outside the unload range
+    //size_t deletedChunks = 0;
+    const float unloadRange = mLoadRange * 1.1f;
+    const float unloadRangeSq = unloadRange * unloadRange;
+    while( !mChunks.empty() )
+    {
+        const Vector3i farthestPos = mChunks.back().GetPos();
+        const bool canUnload = GetTimerDurationMs() < 15 && ( centre - farthestPos ).dot() > unloadRangeSq;
+        if( !canUnload )
+        {
+            break;
+        }
+
+        mRegisteredChunkPositions.erase( farthestPos );
+        mChunks.pop_back();
+        //deletedChunks++;
+    }
+
+    // ImGui::Text( " Queued Chunks: %zu", queueCount );
+    // ImGui::Text( "    New Chunks: %zu (%0.1f)", newChunks, mAvgNewChunks );
+    // ImGui::Text( "Deleted Chunks: %zu", deletedChunks );
+
+    // Grow the load range while there is triangle budget left and fewer chunks
+    // are awaiting meshing than the workers are expected to get through
+    const size_t awaitingMesh = mRegisteredChunkPositions.size() - mChunks.size();
+    const bool underTriBudget = (double)mTriCount < mTriLimit * 0.85;
+    const bool queueNotFull = awaitingMesh < mThreads.size() * mAvgNewChunks;
+    if( underTriBudget && queueNotFull )
+    {
+        mLoadRange = std::min( mLoadRange * ( 1 + GetLoadRangeModifier() ), 3000.0f );
+    }
+}
+
+// Finds every not-yet-registered chunk position within mLoadRange of the given position and
+// queues it for generation, nearest first, until the generate queue reports it is full.
+void MeshNoisePreview::UpdateChunksForPosition( Vector3 position )
+{
+    // StartTimer();
+    const int chunkRange = (int)ceilf( mLoadRange / Chunk::SIZE );
+
+    position -= Vector3( Chunk::SIZE * 0.5f );
+    Vector3i positionI = Vector3i( position );
+
+    const Vector3i chunkCenter = ( positionI / Chunk::SIZE ) * Chunk::SIZE;
+    const int loadRangeSq = (int)( mLoadRange * mLoadRange );
+
+    // At large load ranges only every 2^staggerShift-th X column is scanned per call,
+    // mStaggerCheck selects which columns get their turn
+    const int staggerShift = std::min( 5, (int)( ( loadRangeSq * (int64_t)mLoadRange ) / 1000000000 ) );
+    const int staggerCount = ( 1 << staggerShift ) - 1;
+    const auto staggerSlot = mStaggerCheck & staggerCount;
+
+    // Heightmaps use a single layer of chunks at Y = 0, distances are measured in that plane
+    const bool singleLayer = mBuildData.meshType == MeshType_Heightmap2D;
+    if( singleLayer )
+    {
+        positionI.y() = 0;
+    }
+    const int firstY = singleLayer ? chunkRange : -chunkRange;
+
+    std::vector<Vector3i> chunkPositions;
+    Vector3i chunkPos;
+
+    for( int x = -chunkRange; x <= chunkRange; x++ )
+    {
+        if( ( x & staggerCount ) != staggerSlot )
+        {
+            continue;
+        }
+
+        chunkPos.x() = x * Chunk::SIZE + chunkCenter.x();
+
+        for( int y = firstY; y <= chunkRange; y++ )
+        {
+            if( singleLayer )
+            {
+                chunkPos.y() = 0;
+            }
+            else
+            {
+                chunkPos.y() = y * Chunk::SIZE + chunkCenter.y();
+            }
+
+            for( int z = -chunkRange; z <= chunkRange; z++ )
+            {
+                chunkPos.z() = z * Chunk::SIZE + chunkCenter.z();
+
+                const bool inRange = ( positionI - chunkPos ).dot() <= loadRangeSq;
+                if( inRange && !mRegisteredChunkPositions.contains( chunkPos ) )
+                {
+                    chunkPositions.push_back( chunkPos );
+                }
+            }
+        }
+    }
+
+    mStaggerCheck++;
+
+    // Nearest chunks are queued first
+    const auto nearerFirst = [positionI]( const Vector3i& lhs, const Vector3i& rhs )
+    {
+        return ( positionI - lhs ).dot() < ( positionI - rhs ).dot();
+    };
+    std::sort( chunkPositions.begin(), chunkPositions.end(), nearerFirst );
+
+    // Stop once the generate queue holds 16 jobs per thread, remaining positions stay unregistered
+    const auto queueLimit = mThreads.size() * 16;
+    for( const Vector3i& pos : chunkPositions )
+    {
+        mBuildData.pos = pos;
+        mRegisteredChunkPositions.insert( pos );
+
+        if( mGenerateQueue.Push( mBuildData ) >= queueLimit )
+        {
+            break;
+        }
+    }
+
+    // ImGui::Text( "UpdateChunksForPosition(%d) Ms: %.2f", staggerShift, GetTimerDurationMs() );
+}
+
+// Worker loop: pops build jobs from generateQueue, builds their mesh data and pushes the result
+// to completeQueue. Returns once the generate queue signals that the thread should be killed.
+void MeshNoisePreview::GenerateLoopThread( GenerateQueue<Chunk::BuildData>& generateQueue, CompleteQueue<Chunk::MeshData>& completeQueue )
+{
+    for( ;; )
+    {
+        Chunk::BuildData job = generateQueue.Pop();
+
+        // The kill flag is checked after Pop() returns, before any work is done on the job
+        if( generateQueue.ShouldKillThread() )
+        {
+            break;
+        }
+
+        Chunk::MeshData built = Chunk::BuildMeshData( job );
+
+        // A rejected push leaves ownership here, so release the mesh data
+        const bool accepted = completeQueue.Push( built, job.genVersion );
+        if( !accepted )
+        {
+            built.Free();
+        }
+    }
+}
+
+// Dispatches to the mesh builder matching buildData.meshType.
+// Any other mesh type value yields mesh data built from the empty scratch buffers.
+MeshNoisePreview::Chunk::MeshData MeshNoisePreview::Chunk::BuildMeshData( const BuildData& buildData )
+{
+    // Per-thread scratch buffers that persist between calls on the same thread
+    thread_local static std::vector<float> scratchDensity( SIZE_DENSITY * SIZE_DENSITY * SIZE_DENSITY );
+    thread_local static std::vector<VertexData> scratchVerts;
+    thread_local static std::vector<uint32_t> scratchIndicies;
+
+    scratchVerts.clear();
+    scratchIndicies.clear();
+
+    float* const density = scratchDensity.data();
+
+    if( buildData.meshType == MeshType_Bloxel3D )
+    {
+        return BuildBloxel3DMesh( buildData, density, scratchVerts, scratchIndicies );
+    }
+    if( buildData.meshType == MeshType_DualMarchingCubes3D )
+    {
+        return BuildDmc3DMesh( buildData, density, scratchVerts, scratchIndicies );
+    }
+    if( buildData.meshType == MeshType_Heightmap2D )
+    {
+        return BuildHeightMap2DMesh( buildData, density, scratchVerts, scratchIndicies );
+    }
+
+    return MeshData( buildData.pos, {}, scratchVerts, scratchIndicies );
+}
+
+// Builds a blocky mesh for one chunk. A cell counts as solid when its density is <= isoSurface,
+// and a quad is emitted for every face of a solid cell whose neighbour in that direction is not solid.
+// Also tracks the lowest non-solid cell Y (minAir) and the highest solid cell Y (maxSolid).
+// densityValues is scratch storage that receives the generated grid, vertexData / indicies receive the mesh.
+MeshNoisePreview::Chunk::MeshData MeshNoisePreview::Chunk::BuildBloxel3DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies )
+{
+    // The grid is generated with a one cell border on every side (starting at pos - 1)
+    // so that neighbour and ambient occlusion lookups stay inside the generated values
+    static constexpr uint32_t SIZE_GEN = SIZE + 2;
+    FastNoise::OutputMinMax minMax = buildData.generatorScaled->GenUniformGrid3D( densityValues,
+                                                                                  buildData.pos.x() - 1, buildData.pos.y() - 1, buildData.pos.z() - 1,
+                                                                                  SIZE_GEN, SIZE_GEN, SIZE_GEN, buildData.seed );
+    float minAir = INFINITY;
+    float maxSolid = -INFINITY;
+
+    // When min/max tracking is compiled in, a grid lying entirely on one side of the
+    // iso surface skips the meshing loop and only reports minAir / maxSolid
+#if FASTNOISE_CALC_MIN_MAX
+    if( minMax.min > buildData.isoSurface )
+    {
+        minAir = (float)buildData.pos.y();
+    }
+    else if( minMax.max < buildData.isoSurface )
+    {
+        maxSolid = (float)buildData.pos.y() - 1.0f + SIZE;
+    }
+    else
+#endif
+    {
+        constexpr Vector3 SUN = LIGHT_DIR * ( 1.0f - AMBIENT_LIGHT );
+
+        // Strides through the flattened density grid, X is the fastest varying axis
+        constexpr int32_t STEP_X = 1;
+        constexpr int32_t STEP_Y = SIZE_GEN;
+        constexpr int32_t STEP_Z = SIZE_GEN * SIZE_GEN;
+
+        // Start at the first cell inside the border
+        int32_t noiseIdx = STEP_X + STEP_Y + STEP_Z;
+
+        for( uint32_t z = 0; z < SIZE; z++ )
+        {
+            float zf = z + (float)buildData.pos.z();
+
+            for( uint32_t y = 0; y < SIZE; y++ )
+            {
+                float yf = y + (float)buildData.pos.y();
+
+                for( uint32_t x = 0; x < SIZE; x++ )
+                {
+                    float xf = x + (float)buildData.pos.x();
+
+                    if( densityValues[noiseIdx] <= buildData.isoSurface ) // Is Solid?
+                    {
+                        maxSolid = std::max( yf, maxSolid );
+
+                        // Each face below is emitted only when the neighbouring cell is not solid.
+                        // Positive facing quads are lit with SUN + AMBIENT_LIGHT, negative facing ones with 1 - SUN
+                        if( densityValues[noiseIdx + STEP_X] > buildData.isoSurface ) // Right
+                        {
+                            BloxelAddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, STEP_X, STEP_Y, STEP_Z, SUN.x() + AMBIENT_LIGHT,
+                                       Vector3( xf + 1, yf, zf ), Vector3( xf + 1, yf + 1, zf ), Vector3( xf + 1, yf + 1, zf + 1 ), Vector3( xf + 1, yf, zf + 1 ) );
+                        }
+
+                        if( densityValues[noiseIdx - STEP_X] > buildData.isoSurface ) // Left
+                        {
+                            BloxelAddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, -STEP_X, -STEP_Y, STEP_Z, 1.0f - SUN.x(),
+                                       Vector3( xf, yf + 1, zf ), Vector3( xf, yf, zf ), Vector3( xf, yf, zf + 1 ), Vector3( xf, yf + 1, zf + 1 ) );
+                        }
+
+                        if( densityValues[noiseIdx + STEP_Y] > buildData.isoSurface ) // Up
+                        {
+                            BloxelAddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, STEP_Y, STEP_Z, STEP_X, SUN.y() + AMBIENT_LIGHT,
+                                       Vector3( xf, yf + 1, zf ), Vector3( xf, yf + 1, zf + 1 ), Vector3( xf + 1, yf + 1, zf + 1 ), Vector3( xf + 1, yf + 1, zf ) );
+                        }
+
+                        if( densityValues[noiseIdx - STEP_Y] > buildData.isoSurface ) // Down
+                        {
+                            BloxelAddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, -STEP_Y, -STEP_Z, STEP_X, 1.0f - SUN.y(),
+                                       Vector3( xf, yf, zf + 1 ), Vector3( xf, yf, zf ), Vector3( xf + 1, yf, zf ), Vector3( xf + 1, yf, zf + 1 ) );
+                        }
+
+                        if( densityValues[noiseIdx + STEP_Z] > buildData.isoSurface ) // Forward
+                        {
+                            BloxelAddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, STEP_Z, STEP_X, STEP_Y, SUN.z() + AMBIENT_LIGHT,
+                                       Vector3( xf, yf, zf + 1 ), Vector3( xf + 1, yf, zf + 1 ), Vector3( xf + 1, yf + 1, zf + 1 ), Vector3( xf, yf + 1, zf + 1 ) );
+                        }
+
+                        if( densityValues[noiseIdx - STEP_Z] > buildData.isoSurface ) // Back
+                        {
+                            BloxelAddQuadAO( vertexData, indicies, densityValues, buildData.isoSurface, noiseIdx, -STEP_Z, -STEP_X, STEP_Y, 1.0f - SUN.z(),
+                                       Vector3( xf + 1, yf, zf ), Vector3( xf, yf, zf ), Vector3( xf, yf + 1, zf ), Vector3( xf + 1, yf + 1, zf ) );
+                        }
+                    }
+                    else
+                    {
+                        minAir = std::min( yf, minAir );
+                    }
+                    noiseIdx++;
+                }
+
+                // Skip the border cell ending this row and the one starting the next row
+                noiseIdx += STEP_X * 2;
+            }
+
+            // Skip the border row ending this slice and the one starting the next slice
+            noiseIdx += STEP_Y * 2;
+        }
+    }
+
+    return MeshData( buildData.pos, minMax, vertexData, indicies, minAir, maxSolid );
+}
+
+// Appends one quad (4 vertices, 6 indices) for a cell face, darkening each corner by ambient occlusion.
+// idx is the solid cell, facingOffset steps to the cell in front of the face, offsetA / offsetB step
+// along the two axes of the face. Occlusion per corner counts solid cells among the two adjacent
+// sides and the diagonal, the diagonal is treated as solid whenever both of its sides are.
+void MeshNoisePreview::Chunk::BloxelAddQuadAO( std::vector<VertexData>& verts, std::vector<uint32_t>& indicies, const float* density, float isoSurface,
+                                         int32_t idx, int32_t facingOffset, int32_t offsetA, int32_t offsetB, float light, Vector3 pos00, Vector3 pos01, Vector3 pos11, Vector3 pos10 )
+{
+    const auto isSolid = [density, isoSurface]( int32_t cell ) -> bool
+    {
+        return density[cell] <= isoSurface;
+    };
+
+    const int32_t front = idx + facingOffset;
+
+    const bool solidA0 = isSolid( front - offsetA );
+    const bool solidA1 = isSolid( front + offsetA );
+    const bool solidB0 = isSolid( front - offsetB );
+    const bool solidB1 = isSolid( front + offsetB );
+
+    const bool diag00 = ( solidA0 && solidB0 ) || isSolid( front - offsetA - offsetB );
+    const bool diag01 = ( solidA0 && solidB1 ) || isSolid( front - offsetA + offsetB );
+    const bool diag10 = ( solidA1 && solidB0 ) || isSolid( front + offsetA - offsetB );
+    const bool diag11 = ( solidA1 && solidB1 ) || isSolid( front + offsetA + offsetB );
+
+    // Three occluders at full AO_STRENGTH is the darkest a corner can get
+    constexpr float occlusionPerCell = AO_STRENGTH / 3.0f;
+
+    const float ao00 = (float)( solidA0 + solidB0 + diag00 ) * occlusionPerCell;
+    const float ao01 = (float)( solidA1 + solidB0 + diag10 ) * occlusionPerCell;
+    const float ao10 = (float)( solidA0 + solidB1 + diag01 ) * occlusionPerCell;
+    const float ao11 = (float)( solidA1 + solidB1 + diag11 ) * occlusionPerCell;
+
+    // Shift the light by how far the cell density is below the iso surface, square it
+    // while keeping the sign, then clamp to the ambient level
+    const float shiftedLight = light - ( isoSurface - density[idx] ) * 4;
+    const float faceLight = std::max( AMBIENT_LIGHT, shiftedLight * std::abs( shiftedLight ) );
+
+    const uint32_t firstVert = (uint32_t)verts.size();
+    verts.emplace_back( pos00, ( 1.0f - ao00 ) * faceLight );
+    verts.emplace_back( pos01, ( 1.0f - ao01 ) * faceLight );
+    verts.emplace_back( pos10, ( 1.0f - ao10 ) * faceLight );
+    verts.emplace_back( pos11, ( 1.0f - ao11 ) * faceLight );
+
+    // Pick the quad diagonal based on which pair of opposite corners is more occluded
+    const uint32_t diagonalFlip = ( ao00 + ao11 > ao01 + ao10 ) * 2;
+    indicies.insert( indicies.end(), {
+        firstVert,
+        firstVert + 3 - diagonalFlip,
+        firstVert + 2,
+        firstVert + 3,
+        firstVert + diagonalFlip,
+        firstVert + 1,
+    } );
+}
+
+// Builds a dual marching cubes mesh for one chunk. For every cell, each of the three edges leaving it
+// in +X, +Y and +Z is tested; when the iso surface crosses an edge a quad is emitted that joins the
+// vertices of the four cells sharing that edge. Vertices come from DmcGetVertIndex.
+// densityValues is scratch storage that receives the generated grid, vertexData / indicies receive the mesh.
+MeshNoisePreview::Chunk::MeshData MeshNoisePreview::Chunk::BuildDmc3DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies )
+{
+    // The grid is generated with a two cell border on every side (starting at pos - 2)
+    static constexpr uint32_t SIZE_GEN = SIZE + 4;
+
+    FastNoise::OutputMinMax minMax = buildData.generatorScaled->GenUniformGrid3D( densityValues,
+                                                                                  buildData.pos.x() - 2, buildData.pos.y() - 2, buildData.pos.z() - 2,
+                                                                                  SIZE_GEN, SIZE_GEN, SIZE_GEN, buildData.seed );
+    float minAir = INFINITY;
+    float maxSolid = -INFINITY;
+
+    // When min/max tracking is compiled in, a grid lying entirely on one side of the
+    // iso surface skips the meshing loop and only reports minAir / maxSolid
+#if FASTNOISE_CALC_MIN_MAX
+    if( minMax.min > buildData.isoSurface )
+    {
+        minAir = (float)buildData.pos.y();
+    }
+    else if( minMax.max < buildData.isoSurface )
+    {
+        maxSolid = (float)buildData.pos.y() - 1.0f + SIZE;
+    }
+    else
+#endif
+    {
+        constexpr Vector3 VEC_X = Vector3( 1, 0, 0 );
+        constexpr Vector3 VEC_Y = Vector3( 0, 1, 0 );
+        constexpr Vector3 VEC_Z = Vector3( 0, 0, 1 );
+
+        // Strides through the flattened density grid, X is the fastest varying axis
+        constexpr uint32_t STEP_X = 1;
+        constexpr uint32_t STEP_Y = SIZE_GEN;
+        constexpr uint32_t STEP_Z = SIZE_GEN * SIZE_GEN;
+
+        // Passed to every DmcGetVertIndex call so quads can share already created vertices
+        robin_hood::unordered_flat_map<uint64_t, uint32_t> vertIndexMap;
+
+        Vector3 cellOffset( NoInit );
+        // Start at the first cell inside the two cell border
+        uint32_t cellIndex = (STEP_X + STEP_Y + STEP_Z) * 2;
+
+        for( uint32_t z = 0; z < SIZE; z++ )
+        {
+            cellOffset.z() = (float)( buildData.pos.z() + (int32_t)z );
+
+            for( uint32_t y = 0; y < SIZE; y++ )
+            {
+                cellOffset.y() = (float)( buildData.pos.y() + (int32_t)y );
+
+                for( uint32_t x = 0; x < SIZE; x++ )
+                {
+                    cellOffset.x() = (float)( buildData.pos.x() + (int32_t)x );
+
+                    const float density = densityValues[cellIndex];
+
+                    // construct quad for x edge
+                    {
+                        const float densityX = densityValues[cellIndex + STEP_X];
+
+                        // is edge intersected?
+                        if( ( density <= buildData.isoSurface ) ^ ( densityX <= buildData.isoSurface ) )
+                        {
+                            // generate quad
+                            // one vertex from each of the four cells around the X edge (this cell and its -Z, -Y, -Y-Z neighbours)
+                            const uint32_t quadVertIndicies[] = {
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex, DMC::EDGE0, cellOffset, densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex - STEP_Z, DMC::EDGE2, cellOffset - VEC_Z, densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex - STEP_Y, DMC::EDGE4, cellOffset - VEC_Y, densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex - STEP_Y - STEP_Z, DMC::EDGE6, cellOffset - (VEC_Y + VEC_Z), densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                            };
+
+                            // Slice quad for best vertex lighting
+                            // (compares the light difference, stored in posLight.w(), across the two diagonals)
+                            uint8_t triRotation = 2 * ( std::abs( vertexData[quadVertIndicies[0]].posLight.w() - vertexData[quadVertIndicies[3]].posLight.w() ) >
+                                std::abs( vertexData[quadVertIndicies[1]].posLight.w() - vertexData[quadVertIndicies[2]].posLight.w() ) );
+
+                            // Flip tris if backfacing
+                            uint8_t triFlip = 2 * ( density < densityX );
+
+                            indicies.emplace_back( quadVertIndicies[triFlip] );
+                            indicies.emplace_back( quadVertIndicies[3 - triRotation] );
+                            indicies.emplace_back( quadVertIndicies[2 - triFlip] );
+                            indicies.emplace_back( quadVertIndicies[3 - triFlip] );
+                            indicies.emplace_back( quadVertIndicies[triRotation] );
+                            indicies.emplace_back( quadVertIndicies[1 + triFlip] );
+                        }
+                    }
+
+                    // construct quad for y edge
+                    {
+                        const float densityY = densityValues[cellIndex + STEP_Y];
+
+                        // is edge intersected?
+                        if( ( density <= buildData.isoSurface ) ^ ( densityY <= buildData.isoSurface ) )
+                        {
+                            // generate quad
+                            // one vertex from each of the four cells around the Y edge (this cell and its -X, -Z, -X-Z neighbours)
+                            const uint32_t quadVertIndicies[] = {
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex, DMC::EDGE8, cellOffset, densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex - STEP_X, DMC::EDGE9, cellOffset - VEC_X, densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex - STEP_Z, DMC::EDGE11, cellOffset - VEC_Z, densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex - STEP_X - STEP_Z, DMC::EDGE10, cellOffset - (VEC_X + VEC_Z), densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                            };
+
+                            // Slice quad for best vertex lighting
+                            uint8_t triRotation = 2 * ( std::abs( vertexData[quadVertIndicies[0]].posLight.w() - vertexData[quadVertIndicies[3]].posLight.w() ) >
+                                std::abs( vertexData[quadVertIndicies[1]].posLight.w() - vertexData[quadVertIndicies[2]].posLight.w() ) );
+
+                            // Flip tris if backfacing
+                            uint8_t triFlip = 2 * (density < densityY);
+
+                            indicies.emplace_back( quadVertIndicies[triFlip] );
+                            indicies.emplace_back( quadVertIndicies[3 - triRotation] );
+                            indicies.emplace_back( quadVertIndicies[2 - triFlip] );
+                            indicies.emplace_back( quadVertIndicies[3 - triFlip] );
+                            indicies.emplace_back( quadVertIndicies[triRotation] );
+                            indicies.emplace_back( quadVertIndicies[1 + triFlip] );
+                            
+                            // Only Y edge crossings update the vertical extents: a solid cell below the crossing
+                            // raises maxSolid, a non-solid cell below it lowers minAir
+                            if( density <= buildData.isoSurface )
+                            {
+                                maxSolid = std::max( { maxSolid, 
+                                    vertexData[quadVertIndicies[0]].posLight.y(),
+                                    vertexData[quadVertIndicies[1]].posLight.y(),
+                                    vertexData[quadVertIndicies[2]].posLight.y(),
+                                    vertexData[quadVertIndicies[3]].posLight.y()
+                                } );
+                            }
+                            else
+                            {
+                                minAir = std::min( { minAir,
+                                    vertexData[quadVertIndicies[0]].posLight.y(),
+                                    vertexData[quadVertIndicies[1]].posLight.y(),
+                                    vertexData[quadVertIndicies[2]].posLight.y(),
+                                    vertexData[quadVertIndicies[3]].posLight.y()
+                                } );                                
+                            }
+                        }
+                    }
+
+                    // construct quad for z edge
+                    {
+                        const float densityZ = densityValues[cellIndex + STEP_Z];
+
+                        // is edge intersected?
+                        if( ( density <= buildData.isoSurface ) ^ ( densityZ <= buildData.isoSurface ) )
+                        {
+                            // generate quad
+                            // one vertex from each of the four cells around the Z edge (this cell and its -Y, -X, -X-Y neighbours)
+                            const uint32_t quadVertIndicies[] = {
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex, DMC::EDGE3, cellOffset, densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex - STEP_Y, DMC::EDGE7, cellOffset - VEC_Y, densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex - STEP_X, DMC::EDGE1, cellOffset - VEC_X, densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                                DmcGetVertIndex<STEP_X, STEP_Y, STEP_Z, SIZE_GEN>( cellIndex - STEP_X - STEP_Y, DMC::EDGE5, cellOffset - (VEC_X + VEC_Y), densityValues, buildData.isoSurface, vertexData, vertIndexMap ),
+                            };
+
+                            // Slice quad for best vertex lighting
+                            uint8_t triRotation = 2 * ( std::abs( vertexData[quadVertIndicies[0]].posLight.w() - vertexData[quadVertIndicies[3]].posLight.w() ) >
+                                std::abs( vertexData[quadVertIndicies[1]].posLight.w() - vertexData[quadVertIndicies[2]].posLight.w() ) );
+
+                            // Flip tris if backfacing
+                            uint8_t triFlip = 2 * ( density < densityZ );
+
+                            indicies.emplace_back( quadVertIndicies[triFlip] );
+                            indicies.emplace_back( quadVertIndicies[3 - triRotation] );
+                            indicies.emplace_back( quadVertIndicies[2 - triFlip] );
+                            indicies.emplace_back( quadVertIndicies[3 - triFlip] );
+                            indicies.emplace_back( quadVertIndicies[triRotation] );
+                            indicies.emplace_back( quadVertIndicies[1 + triFlip] );
+                        }
+                    }
+
+                    cellIndex += STEP_X;
+                }
+                // Skip the border cells between the end of this row and the start of the next
+                cellIndex += STEP_X * ( SIZE_GEN - SIZE );
+            }
+            // Skip the border rows between the end of this slice and the start of the next
+            cellIndex += STEP_Y * ( SIZE_GEN - SIZE );
+        }
+    }
+
+    return MeshData( buildData.pos, minMax, vertexData, indicies, minAir, maxSolid );
+}
+
+// Returns the index in vertexData of the dual vertex of cell cellIndex that is associated with the
+// given edge, creating and lighting the vertex on first request.
+//   STEP_X / STEP_Y / STEP_Z  strides through densityArray for each axis
+//   edge                      DMC::EDGE* flag of the crossed edge, relative to this cell
+//   vertOffset                added to the cell-local vertex position before it is stored
+//   vertIndexMap              cache from ( cellIndex, pointCode ) to vertex index
+template<uint32_t STEP_X, uint32_t STEP_Y, uint32_t STEP_Z, uint32_t SIZE_GEN>
+uint32_t MeshNoisePreview::Chunk::DmcGetVertIndex( uint32_t cellIndex, uint16_t edge, Vector3 vertOffset, const float* densityArray, float isoSurface,
+                                                   std::vector<VertexData>& vertexData, robin_hood::unordered_flat_map<uint64_t, uint32_t>& vertIndexMap )
+{
+    // One bit per cell corner, set when that corner's density is above the iso surface
+    uint32_t cellCode = 0;
+    if( densityArray[cellIndex] > isoSurface )
+        cellCode |= 1;
+    if( densityArray[cellIndex + STEP_X] > isoSurface )
+        cellCode |= 2;
+    if( densityArray[cellIndex + STEP_Y] > isoSurface )
+        cellCode |= 4;
+    if( densityArray[cellIndex + STEP_X + STEP_Y] > isoSurface )
+        cellCode |= 8;
+    if( densityArray[cellIndex + STEP_Z] > isoSurface )
+        cellCode |= 16;
+    if( densityArray[cellIndex + STEP_X + STEP_Z] > isoSurface )
+        cellCode |= 32;
+    if( densityArray[cellIndex + STEP_Y + STEP_Z] > isoSurface )
+        cellCode |= 64;
+    if( densityArray[cellIndex + STEP_X + STEP_Y + STEP_Z] > isoSurface )
+        cellCode |= 128;
+    
+    // Pick the first of the cell's (up to 4) dual point codes whose edge set contains the requested edge
+    uint16_t pointCode = 0;
+    for( int i = 0; i < 4; ++i )
+    {
+        if( DMC::kDualPointsList[cellCode][i] & edge )
+        {
+            pointCode = DMC::kDualPointsList[cellCode][i];
+            break;
+        }
+    }
+
+    // Cache key: cell index in the high bits, point code in the low 12 bits.
+    // try_emplace inserts the index the new vertex would get, or reports the existing entry
+    uint64_t lookup = (uint64_t)cellIndex << 12 | (uint64_t)pointCode;
+    uint32_t vertIndex = (uint32_t)vertexData.size();
+    auto find = vertIndexMap.try_emplace( lookup, vertIndex );
+
+    if( !find.second )
+    {
+        return find.first->second;
+    }
+
+    // compute the dual point as the mean of the face vertices belonging to the
+    // original marching cubes face
+    Vector3 vert( Math::ZeroInit );
+
+    // sum edge intersection vertices using the point code
+    // each term is the linearly interpolated iso crossing along that edge, in cell-local coordinates
+    if( pointCode & DMC::EDGE0 )
+    {
+        vert.x() += ( isoSurface - densityArray[cellIndex] ) / ( densityArray[cellIndex + STEP_X] - densityArray[cellIndex] );
+    }
+
+    if( pointCode & DMC::EDGE1 )
+    {
+        vert.x() += 1.0f;
+        vert.z() += ( isoSurface - densityArray[cellIndex + STEP_X] ) / ( densityArray[cellIndex + STEP_X + STEP_Z] - densityArray[cellIndex + STEP_X] );
+    }
+
+    if( pointCode & DMC::EDGE2 )
+    {
+        vert.x() += ( isoSurface - densityArray[cellIndex + STEP_Z] ) / ( densityArray[cellIndex + STEP_X + STEP_Z] - densityArray[cellIndex + STEP_Z] );
+        vert.z() += 1.0f;
+    }
+
+    if( pointCode & DMC::EDGE3 )
+    {
+        vert.z() += ( isoSurface - densityArray[cellIndex] ) / ( densityArray[cellIndex + STEP_Z] - densityArray[cellIndex] );
+    }
+
+    if( pointCode & DMC::EDGE4 )
+    {
+        vert.x() += ( isoSurface - densityArray[cellIndex + STEP_Y] ) / ( densityArray[cellIndex + STEP_X + STEP_Y] - densityArray[cellIndex + STEP_Y] );
+        vert.y() += 1.0f;
+    }
+
+    if( pointCode & DMC::EDGE5 )
+    {
+        vert.x() += 1.0f;
+        vert.y() += 1.0f;
+        vert.z() += ( isoSurface - densityArray[cellIndex + STEP_X + STEP_Y] ) / ( densityArray[cellIndex + STEP_X + STEP_Y + STEP_Z] - densityArray[cellIndex + STEP_X + STEP_Y] );
+    }
+
+    if( pointCode & DMC::EDGE6 )
+    {
+        vert.x() += ( isoSurface - densityArray[cellIndex + STEP_Y + STEP_Z] ) / ( densityArray[cellIndex + STEP_X + STEP_Y + STEP_Z] - densityArray[cellIndex + STEP_Y + STEP_Z] );
+        vert.y() += 1.0f;
+        vert.z() += 1.0f;
+    }
+
+    if( pointCode & DMC::EDGE7 )
+    {
+        vert.y() += 1.0f;
+        vert.z() += ( isoSurface - densityArray[cellIndex + STEP_Y] ) / ( densityArray[cellIndex + STEP_Y + STEP_Z] - densityArray[cellIndex + STEP_Y] );
+    }
+
+    if( pointCode & DMC::EDGE8 )
+    {
+        vert.y() += ( isoSurface - densityArray[cellIndex] ) / ( densityArray[cellIndex + STEP_Y] - densityArray[cellIndex] );
+    }
+
+    if( pointCode & DMC::EDGE9 )
+    {
+        vert.x() += 1.0f;
+        vert.y() += ( isoSurface - densityArray[cellIndex + STEP_X] ) / ( densityArray[cellIndex + STEP_X + STEP_Y] - densityArray[cellIndex + STEP_X] );
+    }
+
+    if( pointCode & DMC::EDGE10 )
+    {
+        vert.x() += 1.0f;
+        vert.y() += ( isoSurface - densityArray[cellIndex + STEP_X + STEP_Z] ) / ( densityArray[cellIndex + STEP_X + STEP_Y + STEP_Z] - densityArray[cellIndex + STEP_X + STEP_Z] );
+        vert.z() += 1.0f;
+    }
+
+    if( pointCode & DMC::EDGE11 )
+    {
+        vert.y() += ( isoSurface - densityArray[cellIndex + STEP_Z] ) / ( densityArray[cellIndex + STEP_Y + STEP_Z] - densityArray[cellIndex + STEP_Z] );
+        vert.z() += 1.0f;
+    }
+
+    // Average over the number of edges that contributed.
+    // NOTE(review): a pointCode of 0 would divide by zero here - presumably callers only ask for edges the cell actually crosses
+    vert /= (float)std::popcount( pointCode );
+
+    // Calculate analytical derivative
+
+    // lroundf( -v ) is 0 or -1 for v in [0, 1], so each offset is either 0 or a full step:
+    // the sample used for that axis moves one cell forward when the vertex lies in the upper half of the cell
+    uint32_t derivOffsetX = STEP_X & (int)std::lroundf( -vert.x() );
+    uint32_t derivOffsetY = STEP_Y & (int)std::lroundf( -vert.y() );
+    uint32_t derivOffsetZ = STEP_Z & (int)std::lroundf( -vert.z() );
+
+    // Fractional blend factor between the backward and forward density differences on each axis
+    Vector3 derivDelta = vert + Vector3( 0.5f );
+    derivDelta -= Math::floor( derivDelta );
+        
+    Vector3 derivative;
+
+    // Walk the 2x2x2 corners of the cell. cellIndex (a by-value parameter) is advanced as the walk proceeds.
+    // Each axis derivative is accumulated over the four corner combinations of the other two axes,
+    // weighted by contrib*, which is ( 1 - v ) for the lower corner and v for the upper corner
+    for( int32_t z = -1; z < 1; z++ )
+    {
+        float contribZ = std::abs( z + vert.z() );
+
+        for( int32_t y = -1; y < 1; y++ )
+        {
+            float contribY = std::abs( y + vert.y() );
+
+            for( int32_t x = -1; x < 1; x++ )
+            {
+                float contribX = std::abs( x + vert.x() );
+
+                // x / y / z are only non-zero on the first pass of their loop, so each axis is sampled once per pair of the other two
+                if( x )
+                {
+                    uint32_t derivIndex = cellIndex + derivOffsetX;
+                    derivative.x() += contribY * contribZ *
+                        ImLerp( densityArray[derivIndex - STEP_X] - densityArray[derivIndex],
+                                densityArray[derivIndex] - densityArray[derivIndex + STEP_X], derivDelta.x() );
+                }
+                if( y )
+                {
+                    uint32_t derivIndex = cellIndex + derivOffsetY;
+                    derivative.y() += contribX * contribZ *
+                        ImLerp( densityArray[derivIndex - STEP_Y] - densityArray[derivIndex],
+                                densityArray[derivIndex] - densityArray[derivIndex + STEP_Y], derivDelta.y() );
+                }
+                if( z )
+                {
+                    uint32_t derivIndex = cellIndex + derivOffsetZ;
+                    derivative.z() += contribX * contribY *
+                        ImLerp( densityArray[derivIndex - STEP_Z] - densityArray[derivIndex],
+                                densityArray[derivIndex] - densityArray[derivIndex + STEP_Z], derivDelta.z() );
+                }
+
+                cellIndex += STEP_X;
+            }
+
+            // Back to the start of the X pair, one row up
+            cellIndex += STEP_Y - STEP_X * 2;
+        }
+
+        // Back to the start of the Y pair, one slice up
+        cellIndex += STEP_Z - STEP_Y * 2;
+    }
+
+    // Component-wise product summed = dot product of the light direction and the normalised derivative,
+    // remapped so the result spans AMBIENT_LIGHT..1 before being squared
+    float light = ( NormaliseConstExpr( -LIGHT_DIR ) * derivative.normalized() ).sum() * ( 0.5f - AMBIENT_LIGHT * 0.5f ) + ( 0.5f + AMBIENT_LIGHT * 0.5f );
+    light *= light;
+
+    // Catch NaNs
+    // (std::min returns its first argument when the comparison with NaN is false)
+    light = std::min( 1.0f, light );
+
+    vertexData.emplace_back( vert + vertOffset, light );
+
+    return vertIndex;
+}
+
+// Builds a heightmap mesh for one chunk: a 2D grid is generated over the chunk's X/Z footprint and every
+// grid square becomes a flat-lit quad whose corner heights are the noise values times heightmapMultiplier.
+// densityValues is scratch storage that receives the generated grid, vertexData / indicies receive the mesh.
+MeshNoisePreview::Chunk::MeshData MeshNoisePreview::Chunk::BuildHeightMap2DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies )
+{
+    // One extra sample per axis so the last quad of each row / column has a far edge
+    static constexpr uint32_t SIZE_GEN = SIZE + 1;
+
+    FastNoise::OutputMinMax minMax = buildData.generatorScaled->GenUniformGrid2D( densityValues,
+                                                                                  buildData.pos.x(), buildData.pos.z(),
+                                                                                  SIZE_GEN, SIZE_GEN, buildData.seed );
+    constexpr int32_t STEP_X = 1;
+    constexpr int32_t STEP_Y = SIZE_GEN;
+
+    const auto heightAt = [densityValues, &buildData]( uint32_t sample )
+    {
+        return densityValues[sample] * buildData.heightmapMultiplier;
+    };
+
+    for( uint32_t row = 0; row < SIZE; row++ )
+    {
+        // Grid rows run along the world Z axis
+        const float worldZ = row + (float)buildData.pos.z();
+
+        for( uint32_t col = 0; col < SIZE; col++ )
+        {
+            const float worldX = col + (float)buildData.pos.x();
+            const uint32_t sample = row * STEP_Y + col;
+
+            const Vector3 v00( worldX, heightAt( sample ), worldZ );
+            const Vector3 v01( worldX, heightAt( sample + STEP_Y ), worldZ + 1 );
+            const Vector3 v10( worldX + 1, heightAt( sample + STEP_X ), worldZ );
+            const Vector3 v11( worldX + 1, heightAt( sample + STEP_X + STEP_Y ), worldZ + 1 );
+
+            // Quad normal: normalised sum of the normals of the two corner triangles
+            const Vector3 normalAt11 = Math::cross( v10 - v11, v00 - v11 ).normalized();
+            const Vector3 normalAt00 = Math::cross( v01 - v00, v11 - v00 ).normalized();
+            const Vector3 quadNormal = ( normalAt11 + normalAt00 ).normalized();
+
+            // NOTE(review): the DMC mesher reduces the same kind of component-wise product with .sum(),
+            // here .dot() is used instead - confirm that is intended
+            const float light = ( LIGHT_DIR * quadNormal ).dot();
+
+            const uint32_t firstVert = (uint32_t)vertexData.size();
+            vertexData.emplace_back( v00, light );
+            vertexData.emplace_back( v01, light );
+            vertexData.emplace_back( v10, light );
+            vertexData.emplace_back( v11, light );
+
+            // Choose which diagonal splits the quad into two triangles
+            const uint32_t diagonalFlip = 2 * ( ( v00 + v11 ).dot() < ( v01 + v10 ).dot() );
+            indicies.insert( indicies.end(), {
+                firstVert,
+                firstVert + 3 - diagonalFlip,
+                firstVert + 2,
+                firstVert + 3,
+                firstVert + diagonalFlip,
+                firstVert + 1,
+            } );
+        }
+    }
+
+    return MeshData( buildData.pos, minMax, vertexData, indicies );
+}
+
+// Creates a chunk from CPU-side mesh data: remembers the chunk position, uploads the geometry into a
+// GL mesh when there are vertices, and always frees the CPU-side data before returning.
+MeshNoisePreview::Chunk::Chunk( MeshData& meshData ) :
+    mPos( meshData.pos )
+{
+    if( meshData.vertexData.isEmpty() )
+    {
+        // No geometry: mMesh is left unset
+        meshData.Free();
+        return;
+    }
+
+    // https://doc.magnum.graphics/magnum/classMagnum_1_1GL_1_1Mesh.html
+    mMesh = std::make_unique<GL::Mesh>( GL::MeshPrimitive::Triangles );
+    mMesh->addVertexBuffer( GL::Buffer( GL::Buffer::TargetHint::Array, meshData.vertexData ), 0, VertexLightShader::PositionLight {} );
+
+    if( !meshData.indicies.isEmpty() )
+    {
+        // Indexed draw: count is the number of indices, index range covers every vertex
+        mMesh->setCount( (Int)meshData.indicies.size() );
+        mMesh->setIndexBuffer( GL::Buffer( GL::Buffer::TargetHint::ElementArray, meshData.indicies ), 0, GL::MeshIndexType::UnsignedInt, 0, (UnsignedInt)meshData.vertexData.size() - 1 );
+    }
+    else
+    {
+        // Non-indexed draw: count is the number of vertices
+        mMesh->setCount( (Int)meshData.vertexData.size() );
+    }
+
+    meshData.Free();
+}
+
+// Compiles and links the VertexLight vertex/fragment shaders from the "NodeEditor" resource group,
+// binding attribute and uniform locations manually where the GL context cannot take them from the
+// shader source.
+MeshNoisePreview::VertexLightShader::VertexLightShader()
+{
+    Utility::Resource resources( "NodeEditor" );
+
+    // Highest GL / GLES version from this list that the current context supports
+#ifndef MAGNUM_TARGET_GLES
+    const GL::Version version = GL::Context::current().supportedVersion( { GL::Version::GL320, GL::Version::GL310, GL::Version::GL300, GL::Version::GL210 } );
+#else
+    const GL::Version version = GL::Context::current().supportedVersion( { GL::Version::GLES300, GL::Version::GLES200 } );
+#endif
+
+    GL::Shader vertexShader = CreateShader( version, GL::Shader::Type::Vertex );
+    GL::Shader fragmentShader = CreateShader( version, GL::Shader::Type::Fragment );
+
+    CORRADE_INTERNAL_ASSERT_OUTPUT(
+        vertexShader.addSource( resources.getString( "VertexLight.vert" ) ).compile() );
+    CORRADE_INTERNAL_ASSERT_OUTPUT(
+        fragmentShader.addSource( resources.getString( "VertexLight.frag" ) ).compile() );
+
+    attachShader( vertexShader );
+    attachShader( fragmentShader );
+
+    /* ES3 declares the attribute location in the shader itself; desktop GL only needs the manual
+       binding when ARB_explicit_attrib_location is unavailable, ES2 always needs it */
+#if !defined( MAGNUM_TARGET_GLES ) || defined( MAGNUM_TARGET_GLES2 )
+#ifndef MAGNUM_TARGET_GLES
+    if( !GL::Context::current().isExtensionSupported<GL::Extensions::ARB::explicit_attrib_location>( version ) )
+#endif
+    {
+        bindAttributeLocation( PositionLight::Location, "positionLight" );
+    }
+#endif
+
+    CORRADE_INTERNAL_ASSERT_OUTPUT( link() );
+
+    /* Uniform locations are queried after linking; on desktop GL only when
+       ARB_explicit_uniform_location is unavailable, on GLES always */
+#ifndef MAGNUM_TARGET_GLES
+    if( !GL::Context::current().isExtensionSupported<GL::Extensions::ARB::explicit_uniform_location>( version ) )
+#endif
+    {
+        mTransformationProjectionMatrixUniform = uniformLocation( "transformationProjectionMatrix" );
+        mColorTintUniform = uniformLocation( "colorTint" );
+    }
+
+    /* OpenGL ES needs the uniform defaults set from code (on desktop they are set in shader code itself) */
+#ifdef MAGNUM_TARGET_GLES
+    SetTransformationProjectionMatrix( Matrix4{} );
+    SetColorTint( Color3 { 1.0f } );
+#endif
+}
+
+// Creates a shader of the given type for the given GL version and prepends the preprocessor
+// defines that tell the shader source which extensions are disabled in the current context.
+// The caller adds the actual shader source and compiles it.
+GL::Shader MeshNoisePreview::VertexLightShader::CreateShader( GL::Version version, GL::Shader::Type type )
+{
+    GL::Shader result( version, type );
+
+#ifndef MAGNUM_TARGET_GLES
+    // Desktop GL: one DISABLE_* define per disabled extension
+    if( GL::Context::current().isExtensionDisabled<GL::Extensions::ARB::explicit_attrib_location>( version ) )
+    {
+        result.addSource( "#define DISABLE_GL_ARB_explicit_attrib_location\n" );
+    }
+    if( GL::Context::current().isExtensionDisabled<GL::Extensions::ARB::shading_language_420pack>( version ) )
+    {
+        result.addSource( "#define DISABLE_GL_ARB_shading_language_420pack\n" );
+    }
+    if( GL::Context::current().isExtensionDisabled<GL::Extensions::ARB::explicit_uniform_location>( version ) )
+    {
+        result.addSource( "#define DISABLE_GL_ARB_explicit_uniform_location\n" );
+    }
+#endif
+
+#ifndef MAGNUM_TARGET_GLES2
+    // Only vertex shaders get the vertex-id define
+    if( type == GL::Shader::Type::Vertex && GL::Context::current().isExtensionDisabled<GL::Extensions::MAGNUM::shader_vertex_id>( version ) )
+    {
+        result.addSource( "#define DISABLE_GL_MAGNUM_shader_vertex_id\n" );
+    }
+#endif
+
+/* Android workaround: an emulator (running on NVidia) was seen not to define the GL_ES
+       preprocessor macro, which made the stock shaders fail to compile, so define it here */
+/** @todo remove this when Android emulator is sane */
+#ifdef CORRADE_TARGET_ANDROID
+    result.addSource( "#ifndef GL_ES\n#define GL_ES 1\n#endif\n" );
+#endif
+
+    return result;
+}
+
+// Writes the given matrix to the transformationProjectionMatrix uniform.
+// Returns this shader so calls can be chained.
+MeshNoisePreview::VertexLightShader& MeshNoisePreview::VertexLightShader::SetTransformationProjectionMatrix( const Matrix4& matrix )
+{
+    VertexLightShader& self = *this;
+    self.setUniform( mTransformationProjectionMatrixUniform, matrix );
+    return self;
+}
+
+// Writes the given colour to the colorTint uniform as a 4-component vector with w = 1.
+// Returns this shader so calls can be chained.
+MeshNoisePreview::VertexLightShader& MeshNoisePreview::VertexLightShader::SetColorTint( const Color3& color )
+{
+    const Vector4 tint( color, 1.0f );
+    setUniform( mColorTintUniform, tint );
+    return *this;
+}
+
+// Records the current time as the start point for GetTimerDurationMs().
+void MeshNoisePreview::StartTimer()
+{
+    using Clock = std::chrono::high_resolution_clock;
+    mTimerStart = Clock::now();
+}
+
+// Returns the time elapsed since the last StartTimer() call, in milliseconds.
+// The elapsed time is truncated to whole microseconds before being converted.
+float MeshNoisePreview::GetTimerDurationMs()
+{
+    using Clock = std::chrono::high_resolution_clock;
+
+    const auto elapsed = Clock::now() - mTimerStart;
+    const auto elapsedMicros = std::chrono::duration_cast<std::chrono::microseconds>( elapsed ).count();
+    return elapsedMicros / 1e3f;
+}
+
+// Registers an ImGui settings handler that persists the mesh preview options in the ini file
+// under the [NodeEditorMeshNoisePreview][Settings] section.
+// The handler stores `this` as UserData, so it assumes this object is still alive whenever
+// ImGui reads or writes settings.
+void MeshNoisePreview::SetupSettingsHandlers()
+{
+    ImGuiSettingsHandler editorSettings;
+    editorSettings.TypeName = "NodeEditorMeshNoisePreview";
+    editorSettings.TypeHash = ImHashStr( editorSettings.TypeName );
+    editorSettings.UserData = this;
+
+    // Writes every persisted option as one key=value line
+    editorSettings.WriteAllFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, ImGuiTextBuffer* outBuf ) {
+        auto* meshNoisePreview = (MeshNoisePreview*)handler->UserData;
+        outBuf->appendf( "\n[%s][Settings]\n", handler->TypeName );
+
+        outBuf->appendf( "tri_limit=%d\n", (int)meshNoisePreview->mTriLimit );
+        outBuf->appendf( "scale=%f\n", meshNoisePreview->mBuildData.scale );
+        outBuf->appendf( "iso_surface=%f\n", meshNoisePreview->mBuildData.isoSurface );
+        outBuf->appendf( "heightmap_multiplier=%f\n", meshNoisePreview->mBuildData.heightmapMultiplier );
+        outBuf->appendf( "seed=%d\n", meshNoisePreview->mBuildData.seed );
+        outBuf->appendf( "color=%d\n", (int)meshNoisePreview->mBuildData.color.toSrgbInt() );
+        outBuf->appendf( "mesh_type=%d\n", (int)meshNoisePreview->mBuildData.meshType );
+        outBuf->appendf( "enabled=%d\n", (int)meshNoisePreview->mEnabled );
+    };
+
+    // Only the "Settings" entry is recognised; any other entry name is ignored
+    editorSettings.ReadOpenFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, const char* name ) -> void* {
+        if( strcmp( name, "Settings" ) == 0 )
+        {
+            return handler->UserData;
+        }
+
+        return nullptr;
+    };
+
+    // Called once per ini line; each line matches at most one of the keys below
+    editorSettings.ReadLineFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, void* entry, const char* line ) {
+        auto* meshNoisePreview = (MeshNoisePreview*)handler->UserData;
+
+        sscanf( line, "tri_limit=%d", &meshNoisePreview->mTriLimit );
+        sscanf( line, "scale=%f", &meshNoisePreview->mBuildData.scale );
+        sscanf( line, "iso_surface=%f", &meshNoisePreview->mBuildData.isoSurface );
+        sscanf( line, "heightmap_multiplier=%f", &meshNoisePreview->mBuildData.heightmapMultiplier );
+        sscanf( line, "seed=%d", &meshNoisePreview->mBuildData.seed );
+
+        int i;
+        if( sscanf( line, "mesh_type=%d", &i ) == 1 )
+        {
+            // Read through an int and range check: a stale or hand-edited ini must not leave
+            // meshType holding a value that is not a valid MeshType
+            if( i >= 0 && i < (int)MeshType_Count )
+            {
+                meshNoisePreview->mBuildData.meshType = (MeshType)i;
+            }
+        }
+        else if( sscanf( line, "color=%d", &i ) == 1 )
+        {
+            meshNoisePreview->mBuildData.color = Color3::fromSrgbInt( i );
+        }
+        else if( sscanf( line, "enabled=%d", &i ) == 1 )
+        {
+            meshNoisePreview->mEnabled = i;
+        }
+    };
+
+    ImGuiExtra::AddOrReplaceSettingsHandler( editorSettings );
+}
\ No newline at end of file
diff --git a/NoiseTool/MeshNoisePreview.h b/tools/NodeEditor/MeshNoisePreview.h
similarity index 76%
rename from NoiseTool/MeshNoisePreview.h
rename to tools/NodeEditor/MeshNoisePreview.h
index 4cb6fab4..a9bc125f 100644
--- a/NoiseTool/MeshNoisePreview.h
+++ b/tools/NodeEditor/MeshNoisePreview.h
@@ -33,13 +33,15 @@ namespace Magnum
     private:
         enum MeshType
         {
-            MeshType_Voxel3D,
+            MeshType_Bloxel3D,
+            MeshType_DualMarchingCubes3D,
             MeshType_Heightmap2D,
             MeshType_Count
         };
 
         inline static const char* MeshTypeStrings =
-            "Voxel 3D\0"
+            "Bloxel 3D\0"
+            "Dual Marching Cubes 3D\0"
             "Heightmap 2D\0";
 
         class VertexLightShader : public GL::AbstractShaderProgram
@@ -121,33 +123,39 @@ namespace Magnum
             struct BuildData
             {
                 FastNoise::SmartNode<const FastNoise::Generator> generator;
+                FastNoise::SmartNode<FastNoise::DomainScale> generatorScaled;
                 Vector3i pos;
                 Color3 color;
-                float frequency, isoSurface, heightmapMultiplier;
+                float scale, isoSurface, heightmapMultiplier;
                 int32_t seed;
                 MeshType meshType;
                 uint32_t genVersion;
             };
 
             static MeshData BuildMeshData( const BuildData& buildData );
-            static MeshNoisePreview::Chunk::MeshData BuildVoxel3DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies );
-            static MeshNoisePreview::Chunk::MeshData BuildHeightMap2DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies );
+            static MeshData BuildBloxel3DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies );
+            static MeshData BuildDmc3DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies );
+            static MeshData BuildHeightMap2DMesh( const BuildData& buildData, float* densityValues, std::vector<VertexData>& vertexData, std::vector<uint32_t>& indicies );
 
             Chunk( MeshData& meshData );
 
             GL::Mesh* GetMesh() { return mMesh.get(); }
             Vector3i GetPos() const { return mPos; }
 
-            static constexpr uint32_t SIZE          = 128;
-            static constexpr Vector3  LIGHT_DIR     = { 3, 4, 2 };
-            static constexpr float    AMBIENT_LIGHT = 0.3f;
-            static constexpr float    AO_STRENGTH   = 0.6f;
+            static constexpr uint32_t SIZE           = 128;
+            static constexpr Vector3  LIGHT_DIR      = { 0.6f, 1.f, 0.4f };
+            static constexpr float    AMBIENT_LIGHT  = 0.3f;
+            static constexpr float    AO_STRENGTH    = 0.9f;
 
         private:
-            static void AddQuadAO( std::vector<VertexData>& verts, std::vector<uint32_t>& indicies, const float* density, float isoSurface,
-                                   int32_t idx, int32_t facingIdx, int32_t offsetA, int32_t offsetB, float light, Vector3 pos00, Vector3 pos01, Vector3 pos11, Vector3 pos10 );
+            static void BloxelAddQuadAO( std::vector<VertexData>& verts, std::vector<uint32_t>& indicies, const float* density, float isoSurface,
+                                         int32_t idx, int32_t facingIdx, int32_t offsetA, int32_t offsetB, float light, Vector3 pos00, Vector3 pos01, Vector3 pos11, Vector3 pos10 );
 
-            static constexpr uint32_t SIZE_GEN = SIZE + 2;
+            template<uint32_t STEP_X, uint32_t STEP_Y, uint32_t STEP_Z, uint32_t SIZE_GEN>
+            static uint32_t DmcGetVertIndex( uint32_t cellIndex, uint16_t edge, Vector3 vertOffset, const float* densityArray, float isoSurface,
+                std::vector<VertexData>& vertexData, robin_hood::unordered_flat_map<uint64_t, uint32_t>& vertIndexMap );
+
+            static constexpr uint32_t SIZE_DENSITY = SIZE + 4;
 
             Vector3i mPos;
             std::unique_ptr<GL::Mesh> mMesh;
diff --git a/NoiseTool/MultiThreadQueues.h b/tools/NodeEditor/MultiThreadQueues.h
similarity index 100%
rename from NoiseTool/MultiThreadQueues.h
rename to tools/NodeEditor/MultiThreadQueues.h
diff --git a/NoiseTool/NoiseToolApp.cpp b/tools/NodeEditor/NodeEditorApp.cpp
similarity index 53%
rename from NoiseTool/NoiseToolApp.cpp
rename to tools/NodeEditor/NodeEditorApp.cpp
index ff01a3fd..5f377336 100644
--- a/NoiseTool/NoiseToolApp.cpp
+++ b/tools/NodeEditor/NodeEditorApp.cpp
@@ -8,29 +8,49 @@
 #include <Magnum/GL/DefaultFramebuffer.h>
 #include <Magnum/GL/Renderer.h>
 
-#include "NoiseToolApp.h"
-#include "ImGuiExtra.h"
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#endif
+
+#include "NodeEditorApp.h"
+#include "util/ImGuiExtra.h"
+#include "FastSIMD/FastSIMD_FastNoise_config.h"
 
 using namespace Magnum;
 
+static constexpr const char* kAppSettingsFile = FILESYSTEM_ROOT "NodeEditor.ini";
+
 void InitResources()
 {
 #ifdef MAGNUM_BUILD_STATIC
-    CORRADE_RESOURCE_INITIALIZE( NoiseTool_RESOURCES )
+    CORRADE_RESOURCE_INITIALIZE( NodeEditor_RESOURCES )
 #endif
 }
 
-NoiseToolApp::NoiseToolApp( const Arguments& arguments ) :
+// Returns true when the first command line argument (after the executable name) is exactly "-detached".
+static bool IsDetached( const NodeEditorApp::Arguments& arguments )
+{
+    if( arguments.argc <= 1 )
+    {
+        return false;
+    }
+
+    const std::string_view firstArgument { arguments.argv[1] };
+    return firstArgument == "-detached";
+}
+
+NodeEditorApp::NodeEditorApp( const Arguments& arguments ) :
     Platform::Application{ arguments,
-    Configuration{}
-    .setTitle( "FastNoise2 NoiseTool" )
-    .setSize( Vector2i( 1280, 720 ) )
-    .setWindowFlags( Configuration::WindowFlag::Resizable | Configuration::WindowFlag::Maximized ),
-    GLConfiguration{}
-    .setSampleCount( 4 )
+        Configuration{}
+        .setTitle( IsDetached( arguments ) ? "FastNoise2 Node Graph" : "FastNoise2 Node Editor" )
+#ifdef __EMSCRIPTEN__
+        .setWindowFlags( Configuration::WindowFlag::Resizable )
+#else
+        .setSize( Vector2i( 1280, 720 ) )
+        .setWindowFlags( Configuration::WindowFlag::Resizable | ( IsDetached( arguments ) ? (Configuration::WindowFlag)0 : Configuration::WindowFlag::Maximized ) ),
+        GLConfiguration{}
+        .setSampleCount( 4 )
+#endif
     },
+    mIsDetachedNodeGraph( IsDetached( arguments ) ),
+    mExecutablePath( arguments.argv[0] ),
+    mIpcSharedMemory( FastNoiseNodeEditor::SetupSharedMemoryIpc() ),
     mImGuiIntegrationContext{ NoCreate },
-    mImGuiContext{ ImGui::CreateContext() }
+    mImGuiContext{ ImGui::CreateContext() },
+    mNodeEditor( *this )
 {
     InitResources();
 
@@ -40,16 +60,22 @@ NoiseToolApp::NoiseToolApp( const Arguments& arguments ) :
     {
         ImFontConfig fontConfig;
         fontConfig.FontDataOwnedByAtlas = false;
-        const auto font = Utility::Resource{ "NoiseTool" }.getRaw( "Font.ttf" );
+        const auto font = Utility::Resource{ "NodeEditor" }.getRaw( "Font.ttf" );
         ImGui::GetIO().Fonts->AddFontFromMemoryTTF( const_cast<char*>( font.data() ), (int)font.size(), 14.0f * framebufferSize().x() / size.x(), &fontConfig );
     }
 
-    ImGui::GetIO().IniFilename = "NoiseTool.ini";
+    // We manually save so we can sync the filesystem on emscripten
+    ImGui::GetIO().IniFilename = nullptr;
+    ImGui::LoadIniSettingsFromDisk( kAppSettingsFile );
+
+    ImGui::GetIO().ConfigDragClickToInputText = true;
     mImGuiIntegrationContext = ImGuiIntegration::Context( *mImGuiContext, size, windowSize(), framebufferSize() );
 
     GL::Renderer::enable( GL::Renderer::Feature::DepthTest );
 
+#ifndef __EMSCRIPTEN__
     setSwapInterval( 1 );
+#endif
 
     mFrameTime.start();
 
@@ -62,109 +88,132 @@ NoiseToolApp::NoiseToolApp( const Arguments& arguments ) :
     GL::Renderer::setBlendEquation( GL::Renderer::BlendEquation::Add, GL::Renderer::BlendEquation::Add );
     GL::Renderer::setBlendFunction( GL::Renderer::BlendFunction::SourceAlpha, GL::Renderer::BlendFunction::OneMinusSourceAlpha );
 
-    Debug{} << "FastSIMD detected max CPU SIMD Level:" << FastNoiseNodeEditor::GetSIMDLevelName( FastSIMD::CPUMaxSIMDLevel() );
+    Debug{} << "FastSIMD detected max CPU supported feature set:" << FastSIMD::GetFeatureSetString( FastSIMD::DetectCpuMaxFeatureSet() );
 
-    mLevelNames = { "Auto" };
-    mLevelEnums = { FastSIMD::Level_Null };
+    mFeatureSetSelection = { FastSIMD::FeatureSet::Max };
+    mFeatureSetSelection.insert( mFeatureSetSelection.end(),
+        std::rbegin( FastSIMD::FastSIMD_FastNoise::CompiledFeatureSets::AsArray ), 
+        std::rend( FastSIMD::FastSIMD_FastNoise::CompiledFeatureSets::AsArray ) );
 
-    for( int i = 1; i > 0; i <<= 1 )
+    for( FastSIMD::FeatureSet featureSet : mFeatureSetSelection )
     {
-        FastSIMD::eLevel lvl = (FastSIMD::eLevel)i;
-        if( lvl & FastNoise::SUPPORTED_SIMD_LEVELS & FastSIMD::COMPILED_SIMD_LEVELS )
-        {
-            mLevelNames.emplace_back( FastNoiseNodeEditor::GetSIMDLevelName( lvl ) );
-            mLevelEnums.emplace_back( lvl );
-        }
+        mFeatureSetNames.push_back( FastSIMD::GetFeatureSetString( featureSet ) );
     }
 }
 
-NoiseToolApp::~NoiseToolApp()
+NodeEditorApp::~NodeEditorApp()
 {
     // Avoid trying to save settings after node editor is already destroyed
-    ImGui::SaveIniSettingsToDisk( ImGui::GetIO().IniFilename );
+    // IniFilename is set to nullptr in the constructor (settings are saved manually), so the
+    // final save must name the settings file explicitly or it would be skipped
+    ImGui::SaveIniSettingsToDisk( kAppSettingsFile );
+    SyncFileSystem();
     ImGui::GetIO().IniFilename = nullptr;
+
+    FastNoiseNodeEditor::ReleaseSharedMemoryIpc();
+}
+
+void NodeEditorApp::SyncFileSystem()
+{
+#ifdef __EMSCRIPTEN__
+    // Emscripten only (empty function elsewhere): request an FS.syncfs so saved files are stored to IndexedDB; a failure is only logged as a console warning
+    EM_ASM(
+        FS.syncfs( false, function( err ) {
+            if (err) {
+                console.warn("Error saving:", err);
+            }
+        } );
+    );
+#endif
 }
 
-void NoiseToolApp::drawEvent()
+void NodeEditorApp::drawEvent()
 {
     GL::defaultFramebuffer.clear( GL::FramebufferClear::Color | GL::FramebufferClear::Depth );
 
     mImGuiIntegrationContext.newFrame();
 
+    if( ImGui::GetIO().WantSaveIniSettings )
+    {
+        ImGui::SaveIniSettingsToDisk( kAppSettingsFile );
+        ImGui::GetIO().WantSaveIniSettings = false;
+        SyncFileSystem();
+    }
+
     /* Enable text input, if needed */
     if( ImGui::GetIO().WantTextInput && !isTextInputActive() )
         startTextInput();
     else if( !ImGui::GetIO().WantTextInput && isTextInputActive() )
         stopTextInput();
 
+    if( !mIsDetachedNodeGraph )
     {
-        if( ImGui::Button( "Reset State" ) )
         {
-            ImGui::ClearIniSettings();
-            mNodeEditor.~FastNoiseNodeEditor();
-            new( &mNodeEditor ) FastNoiseNodeEditor();
-            ImGui::SaveIniSettingsToDisk( ImGui::GetIO().IniFilename );
+            if( ImGui::Button( "Reset State" ) )
+            {
+                ImGui::ClearIniSettings();
+                mNodeEditor.~FastNoiseNodeEditor();
+                new( &mNodeEditor ) FastNoiseNodeEditor( *this );
+                ImGui::SaveIniSettingsToDisk( ImGui::GetIO().IniFilename );
+            }
+
+            if( ImGui::ColorEdit3( "Clear Color", mClearColor.data() ) )
+                GL::Renderer::setClearColor( mClearColor );
+
+            ImGui::Checkbox( "Backface Culling", &mBackFaceCulling );
+
+            ImGui::Text( "Application average %.3f ms/frame (%.1f FPS)",
+                         1000.0 / Double( ImGui::GetIO().Framerate ), Double( ImGui::GetIO().Framerate ) );
+
+            if( ImGui::Combo( "Max Feature Set", &mMaxFeatureSet, mFeatureSetNames.data(), (int)mFeatureSetSelection.size() ) ||
+                ImGuiExtra::ScrollCombo( &mMaxFeatureSet, (int)mFeatureSetSelection.size() ) )
+            {
+                FastSIMD::FeatureSet newLevel = mFeatureSetSelection[mMaxFeatureSet];
+                mNodeEditor.SetSIMDLevel( newLevel );
+            }
         }
 
-        if( ImGui::ColorEdit3( "Clear Color", mClearColor.data() ) )
-            GL::Renderer::setClearColor( mClearColor );
-
-        ImGui::Checkbox( "Backface Culling", &mBackFaceCulling );
-
-        ImGui::Text( "Application average %.3f ms/frame (%.1f FPS)",
-            1000.0 / Double( ImGui::GetIO().Framerate ), Double( ImGui::GetIO().Framerate ) );
-
-        if( ImGui::Combo( "Max SIMD Level", &mMaxSIMDLevel, mLevelNames.data(), (int)mLevelEnums.size() ) ||
-            ImGuiExtra::ScrollCombo( &mMaxSIMDLevel, (int)mLevelEnums.size() ) )
-        {   
-            FastSIMD::eLevel newLevel = mLevelEnums[mMaxSIMDLevel];
-            mNodeEditor.SetSIMDLevel( newLevel );
+        // Update camera pos
+        Vector3 cameraVelocity( 0 );
+        if( mKeyDown[Key_W] || mKeyDown[Key_Up] )
+        {
+            cameraVelocity.z() -= 1.0f;
+        }
+        if( mKeyDown[Key_S] || mKeyDown[Key_Down] )
+        {
+            cameraVelocity.z() += 1.0f;
+        }
+        if( mKeyDown[Key_A] || mKeyDown[Key_Left] )
+        {
+            cameraVelocity.x() -= 1.0f;
+        }
+        if( mKeyDown[Key_D] || mKeyDown[Key_Right] )
+        {
+            cameraVelocity.x() += 1.0f;
+        }
+        if( mKeyDown[Key_Q] || mKeyDown[Key_PgDn] )
+        {
+            cameraVelocity.y() -= 1.0f;
+        }
+        if( mKeyDown[Key_E] || mKeyDown[Key_PgUp] )
+        {
+            cameraVelocity.y() += 1.0f;
+        }
+        if( mKeyDown[Key_RShift] || mKeyDown[Key_LShift] )
+        {
+            cameraVelocity *= 4.0f;
         }
-    }
-
-    // Update camera pos
-    Vector3 cameraVelocity( 0 );
-    if( mKeyDown[Key_W] || mKeyDown[Key_Up] )
-    {
-        cameraVelocity.z() -= 1.0f;
-    }
-    if( mKeyDown[Key_S] || mKeyDown[Key_Down] )
-    {
-        cameraVelocity.z() += 1.0f;
-    }
-    if( mKeyDown[Key_A] || mKeyDown[Key_Left] )
-    {
-        cameraVelocity.x() -= 1.0f;
-    }
-    if( mKeyDown[Key_D] || mKeyDown[Key_Right] )
-    {
-        cameraVelocity.x() += 1.0f;
-    }
-    if( mKeyDown[Key_Q] || mKeyDown[Key_PgDn] )
-    {
-        cameraVelocity.y() -= 1.0f;
-    }
-    if( mKeyDown[Key_E] || mKeyDown[Key_PgUp] )
-    {
-        cameraVelocity.y() += 1.0f;
-    }
-    if( mKeyDown[Key_RShift] || mKeyDown[Key_LShift] )
-    {
-        cameraVelocity *= 4.0f;
-    }
 
-    cameraVelocity *= mFrameTime.previousFrameDuration() * 80.0f;
+        cameraVelocity *= mFrameTime.previousFrameDuration() * 80.0f;
 
-    if( !cameraVelocity.isZero() ) 
-    {
-        Matrix4 transform = mCameraObject.transformation();
-        transform.translation() += transform.rotation() * cameraVelocity;
-        mCameraObject.setTransformation( transform );
-    }
+        if( !cameraVelocity.isZero() )
+        {
+            Matrix4 transform = mCameraObject.transformation();
+            transform.translation() += transform.rotation() * cameraVelocity;
+            mCameraObject.setTransformation( transform );
+        }
 
-    if( mBackFaceCulling )
-    {
-        GL::Renderer::enable( GL::Renderer::Feature::FaceCulling );
+        if( mBackFaceCulling )
+        {
+            GL::Renderer::enable( GL::Renderer::Feature::FaceCulling );
+        }
     }
 
     mNodeEditor.Draw( mCamera.cameraMatrix(), mCamera.projectionMatrix(), mCameraObject.transformation().translation() );
@@ -190,7 +239,7 @@ void NoiseToolApp::drawEvent()
     mFrameTime.nextFrame();
 }
 
-void NoiseToolApp::viewportEvent( ViewportEvent& event )
+void NodeEditorApp::viewportEvent( ViewportEvent& event )
 {
     GL::defaultFramebuffer.setViewport( { {}, event.framebufferSize() } );
 
@@ -199,7 +248,7 @@ void NoiseToolApp::viewportEvent( ViewportEvent& event )
     mImGuiIntegrationContext.relayout( Vector2 { event.windowSize() } / event.dpiScaling(), event.windowSize(), event.framebufferSize() );
 }
 
-void NoiseToolApp::keyPressEvent( KeyEvent& event )
+void NodeEditorApp::keyPressEvent( KeyEvent& event )
 {
     if( mImGuiIntegrationContext.handleKeyPressEvent( event ) )
         return;
@@ -207,7 +256,7 @@ void NoiseToolApp::keyPressEvent( KeyEvent& event )
     HandleKeyEvent( event.key(), true );
 }
 
-void NoiseToolApp::keyReleaseEvent( KeyEvent& event )
+void NodeEditorApp::keyReleaseEvent( KeyEvent& event )
 {
     if( mImGuiIntegrationContext.handleKeyReleaseEvent( event ) )
         return;
@@ -215,7 +264,7 @@ void NoiseToolApp::keyReleaseEvent( KeyEvent& event )
     HandleKeyEvent( event.key(), false );
 }
 
-void NoiseToolApp::HandleKeyEvent( KeyEvent::Key key, bool value )
+void NodeEditorApp::HandleKeyEvent( KeyEvent::Key key, bool value )
 {
     switch( key )
     {
@@ -266,7 +315,7 @@ void NoiseToolApp::HandleKeyEvent( KeyEvent::Key key, bool value )
     }
 }
 
-void NoiseToolApp::mousePressEvent( MouseEvent& event )
+void NodeEditorApp::mousePressEvent( MouseEvent& event )
 {
     if( mImGuiIntegrationContext.handleMousePressEvent( event ) )
         return;
@@ -276,7 +325,7 @@ void NoiseToolApp::mousePressEvent( MouseEvent& event )
     event.setAccepted();
 }
 
-void NoiseToolApp::mouseReleaseEvent( MouseEvent& event )
+void NodeEditorApp::mouseReleaseEvent( MouseEvent& event )
 {
     if( mImGuiIntegrationContext.handleMouseReleaseEvent( event ) )
         return;
@@ -284,7 +333,7 @@ void NoiseToolApp::mouseReleaseEvent( MouseEvent& event )
     event.setAccepted();
 }
 
-void NoiseToolApp::mouseScrollEvent( MouseScrollEvent& event ) {
+void NodeEditorApp::mouseScrollEvent( MouseScrollEvent& event ) {
     if( mImGuiIntegrationContext.handleMouseScrollEvent( event ) )
     {
         /* Prevent scrolling the page */
@@ -293,7 +342,7 @@ void NoiseToolApp::mouseScrollEvent( MouseScrollEvent& event ) {
     }
 }
 
-void NoiseToolApp::mouseMoveEvent( MouseMoveEvent& event )
+void NodeEditorApp::mouseMoveEvent( MouseMoveEvent& event )
 {
     if( mImGuiIntegrationContext.handleMouseMoveEvent( event ) )
         return;
@@ -317,16 +366,16 @@ void NoiseToolApp::mouseMoveEvent( MouseMoveEvent& event )
     event.setAccepted();
 }
 
-void NoiseToolApp::textInputEvent( TextInputEvent& event )
+void NodeEditorApp::textInputEvent( TextInputEvent& event )
 {
     if( mImGuiIntegrationContext.handleTextInputEvent( event ) )
         return;
 }
 
-void NoiseToolApp::UpdatePespectiveProjection()
+void NodeEditorApp::UpdatePespectiveProjection()
 {
     mCamera.setProjectionMatrix( Matrix4::perspectiveProjection( Deg( 70.0f ), Vector2{ windowSize() }.aspectRatio(), 2.0f, 3500.0f ) );
 }
 
 
-MAGNUM_APPLICATION_MAIN( NoiseToolApp )
+MAGNUM_APPLICATION_MAIN( NodeEditorApp )
diff --git a/NoiseTool/NoiseToolApp.h b/tools/NodeEditor/NodeEditorApp.h
similarity index 65%
rename from NoiseTool/NoiseToolApp.h
rename to tools/NodeEditor/NodeEditorApp.h
index 12610606..f5be0746 100644
--- a/NoiseTool/NoiseToolApp.h
+++ b/tools/NodeEditor/NodeEditorApp.h
@@ -1,8 +1,16 @@
 #pragma once
 
 #include <array>
-#include <Magnum/Math/Color.h>
+
+#ifdef __EMSCRIPTEN__
+#define FILESYSTEM_ROOT "/fastnoise2/"
+#include <Magnum/Platform/EmscriptenApplication.h>
+#else
+#define FILESYSTEM_ROOT
 #include <Magnum/Platform/GlfwApplication.h>
+#endif
+
+#include <Magnum/Math/Color.h>
 #include <Magnum/ImGuiIntegration/Context.h>
 #include <Magnum/SceneGraph/Object.h>
 #include <Magnum/SceneGraph/Camera.h>
@@ -12,11 +20,28 @@
 
 namespace Magnum
 {
-    class NoiseToolApp : public Platform::Application
+    class NodeEditorApp : public Platform::Application
     {
     public:
-        explicit NoiseToolApp( const Arguments& arguments );
-        ~NoiseToolApp();
+        explicit NodeEditorApp( const Arguments& arguments );
+        ~NodeEditorApp();
+
+        // Returns the detached-node-graph flag that was fixed at construction from the
+        // command line arguments. Read-only, so callable on a const app.
+        bool IsDetachedNodeGraph() const
+        {
+            return mIsDetachedNodeGraph;
+        }
+
+        // Returns the IPC shared memory pointer stored at construction.
+        // Only the pointer value is read, so this is callable on a const app.
+        void* GetIpcSharedMemory() const
+        {
+            return mIpcSharedMemory;
+        }
+
+        // Returns a view of the executable path string stored at construction.
+        // The view refers to a member string, so it is only valid while this app object exists.
+        std::string_view GetExecutablePath() const
+        {
+            return mExecutablePath;
+        }
+
+        static void SyncFileSystem();
 
     private:
         void drawEvent() override;
@@ -33,6 +58,10 @@ namespace Magnum
         void UpdatePespectiveProjection();
         void HandleKeyEvent( KeyEvent::Key key, bool value );
 
+        bool mIsDetachedNodeGraph;
+        std::string mExecutablePath;
+        void* mIpcSharedMemory;
+
         SceneGraph::Object<SceneGraph::MatrixTransformation3D> mCameraObject;
         SceneGraph::Camera3D mCamera{ mCameraObject };
         Vector2 mLookAngle{ 0 };
@@ -40,9 +69,9 @@ namespace Magnum
 
         Color3 mClearColor{ 0.122f };
         bool mBackFaceCulling = false;
-        int mMaxSIMDLevel = 0;
-        std::vector<const char*> mLevelNames;
-        std::vector<FastSIMD::eLevel> mLevelEnums;
+        int mMaxFeatureSet = 0;
+        std::vector<FastSIMD::FeatureSet> mFeatureSetSelection;
+        std::vector<const char*> mFeatureSetNames;
 
         ImGuiIntegration::Context mImGuiIntegrationContext;
         ImGuiContext* mImGuiContext;
diff --git a/tools/NodeEditor/NoiseTexture.cpp b/tools/NodeEditor/NoiseTexture.cpp
new file mode 100644
index 00000000..4913b914
--- /dev/null
+++ b/tools/NodeEditor/NoiseTexture.cpp
@@ -0,0 +1,591 @@
+#include <cstdio>
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#include <sstream>
+#else
+#include <filesystem>
+#include <fstream>
+#endif
+
+#define IMGUI_DEFINE_MATH_OPERATORS
+#include <imgui.h>
+#include <imgui_internal.h>
+
+#include <Corrade/Containers/ArrayViewStl.h>
+#include <Magnum/PixelFormat.h>
+#include <Magnum/GL/TextureFormat.h>
+#include <Magnum/Math/Functions.h>
+#include <Magnum/ImGuiIntegration/Widgets.h>
+
+#include <FastNoise/Metadata.h>
+
+#include "util/ImGuiExtra.h"
+#include "NoiseTexture.h"
+
+
+using namespace Magnum;
+
+NoiseTexture::NoiseTexture()
+{
+    // Preview defaults; size starts at { -1, -1 } until Draw() measures the window
+    mBuildData.generationType = GenType_2D;
+    mBuildData.offset = {};
+    mBuildData.size = { -1, -1 };
+    mBuildData.seed = 1337;
+    mBuildData.scale = 1.f;
+    mBuildData.iteration = 0;
+    mExportBuildData.size = { 4096, 4096 };
+
+    // Start the two background texture generator threads
+    for( int remaining = 2; remaining > 0; remaining-- )
+    {
+        mThreads.emplace_back( GenerateLoopThread, std::ref( mGenerateQueue ), std::ref( mCompleteQueue ) );
+    }
+    Debug{} << "Texture generator thread count: " << mThreads.size();
+
+    SetupSettingsHandlers();
+}
+
+NoiseTexture::~NoiseTexture()
+{
+    for( auto& thread : mThreads ) // shut down and join every generator thread
+    {
+        mGenerateQueue.KillThreads(); // NOTE(review): called once per thread; presumably idempotent - confirm
+        thread.join();
+    }
+    // Wait for an export thread that was started, before members are destroyed
+    if( mExportThread.joinable() )
+    {
+        mExportThread.join();
+    }
+}
+
+void NoiseTexture::Draw()
+{
+    // Draws the "Texture Preview" window: uploads a newly completed texture,
+    // shows the settings/export menus and requests a rebuild when anything changes
+    TextureData texData;
+    if( mCompleteQueue.Pop( texData ) )
+    {
+        if( mCurrentIteration < texData.iteration )
+        {
+            mCurrentIteration = texData.iteration;
+            ImageView2D noiseImage( PixelFormat::RGBA8Unorm, texData.size, texData.textureData );
+            SetPreviewTexture( noiseImage );
+        }
+        texData.Free();
+    }
+
+    ImGui::SetNextWindowSize( ImVec2( 768, 768 ), ImGuiCond_FirstUseEver );
+    ImGui::SetNextWindowPos( ImVec2( 1143, 305 ), ImGuiCond_FirstUseEver );
+    if( ImGui::Begin( "Texture Preview", nullptr, ImGuiWindowFlags_NoScrollbar | ImGuiWindowFlags_NoScrollWithMouse | ImGuiWindowFlags_MenuBar ) )
+    {
+        //ImGui::Text( "Min: %0.6f Max: %0.6f", mMinMax.min, mMinMax.max );
+        
+        bool edited = false;
+        
+        // Menu bar dropdown
+        if( ImGui::BeginMenuBar() )
+        {
+            if( ImGui::BeginMenu( "Settings" ) )
+            {
+                ImGui::PushItemWidth( 120.0f );
+                
+                edited |= ImGui::Combo( "Generation Type", reinterpret_cast<int*>( &mBuildData.generationType ), GenTypeStrings );
+                edited |= ImGuiExtra::ScrollCombo( reinterpret_cast<int*>( &mBuildData.generationType ), GenType_Count );
+                
+                edited |= ImGui::DragInt( "Seed", &mBuildData.seed );
+
+                Vector2i texSize = { mBuildData.size.x(), mBuildData.size.y() };
+                if( ImGui::DragInt2( "Size", texSize.data(), 2, 4, 8192 ) )
+                {
+                    ImVec2 delta( Vector2{ texSize - mBuildData.size } );
+                    ImVec2 windowSize = ImGui::GetWindowSize();
+                    windowSize += delta;
+                    ImGui::SetWindowSize( windowSize );
+                }
+
+                // Scale control with center-relative scaling
+                float previousScale = mBuildData.scale;
+                if( ImGui::DragFloat( "Scale", &mBuildData.scale, 0.01f ) )
+                {
+                    // Adjust offset to maintain visual center when scale changes
+                    if( mBuildData.scale != 0.0f )
+                    {
+                        mBuildData.offset *= previousScale / mBuildData.scale;
+                    }
+                    edited = true;
+                }
+
+                // Offset controls
+                if( mBuildData.generationType != GenType_2DTiled )
+                {
+                    Vector2 xyOffset = { mBuildData.offset.x(), mBuildData.offset.y() };
+                    if( ImGui::DragFloat2( "Center X Y", xyOffset.data(), 1.0f ) )
+                    {
+                        mBuildData.offset.x() = xyOffset.x();
+                        mBuildData.offset.y() = xyOffset.y();
+                        edited = true;
+                    }
+                }
+                
+                // Show Z, W offset for 3D and 4D generation types
+                if( mBuildData.generationType == GenType_3D )
+                {
+                    // 3D mode: just Z offset
+                    float zOffset = mBuildData.offset.z();
+                    if( ImGui::DragFloat( "Center Z", &zOffset, 1.0f ) )
+                    {
+                        mBuildData.offset.z() = zOffset;
+                        edited = true;
+                    }
+                }
+                else if( mBuildData.generationType == GenType_4D )
+                {
+                    // 4D mode: Z and W together
+                    Vector2 zwOffset = { mBuildData.offset.z(), mBuildData.offset.w() };
+                    if( ImGui::DragFloat2( "Center Z W", zwOffset.data(), 1.0f ) )
+                    {
+                        mBuildData.offset.z() = zwOffset.x();
+                        mBuildData.offset.w() = zwOffset.y();
+                        edited = true;
+                    }
+                }
+                
+                
+                ImGui::PopItemWidth();
+                ImGui::EndMenu();
+            }
+            
+            if( ImGui::BeginMenu( mIsExporting ? "Exporting..." : "Export" ) )
+            {
+                DrawExportMenu();
+                ImGui::EndMenu();
+            }
+            ImGui::EndMenuBar();
+        }
+        
+        if( edited )
+        {
+            ImGuiExtra::MarkSettingsDirty();
+        }
+        // Regenerate when a setting was edited or the available content area changed size
+        ImVec2 contentSize = ImGui::GetContentRegionAvail();
+        
+        if( contentSize.x >= 1 && contentSize.y >= 1 &&
+            (edited || mBuildData.size.x() != (int)contentSize.x || mBuildData.size.y() != (int)contentSize.y) )
+        {
+            Vector2i newSize = { (int)contentSize.x, (int)contentSize.y };
+
+            mBuildData.offset.xy() -= Vector2( newSize - mBuildData.size ) / 2;
+            mBuildData.size = newSize;
+            ReGenerate( mBuildData.generator );
+        }
+        // Draw the preview as an image button with all button colours set to 0
+        ImGui::PushStyleColor( ImGuiCol_Button, 0 );
+        ImGui::PushStyleColor( ImGuiCol_ButtonActive, 0 );
+        ImGui::PushStyleColor( ImGuiCol_ButtonHovered, 0 );
+        ImGuiIntegration::imageButton( mNoiseTexture, Vector2( mBuildData.size ), { {}, Vector2 { 1 } }, 0 );
+        ImGui::PopStyleColor( 3 );
+        // Dragging over the image pans the preview: left = X/Y, right = Z (and W in 4D)
+        if( ImGui::IsItemHovered() )
+        {
+            Vector4 oldOffset = mBuildData.offset;
+
+            if( mBuildData.generationType != GenType_2DTiled && ImGui::IsMouseDragging( ImGuiMouseButton_Left ) )
+            {
+                Vector2 dragDelta( ImGui::GetMouseDragDelta( ImGuiMouseButton_Left ) );
+                ImGui::ResetMouseDragDelta( ImGuiMouseButton_Left );
+
+                mBuildData.offset.x() -= dragDelta.x();
+                mBuildData.offset.y() += dragDelta.y();
+            }
+            else if( (mBuildData.generationType == GenType_3D || mBuildData.generationType == GenType_4D)
+                && ImGui::IsMouseDragging( ImGuiMouseButton_Right ) )
+            {
+                Vector2 dragDelta( ImGui::GetMouseDragDelta( ImGuiMouseButton_Right ) );
+                ImGui::ResetMouseDragDelta( ImGuiMouseButton_Right );
+
+                mBuildData.offset.z() -= dragDelta.x();
+
+                if( mBuildData.generationType == GenType_4D )
+                {
+                    mBuildData.offset.w() -= dragDelta.y();
+                }
+            }
+
+            if( oldOffset != mBuildData.offset )
+            {
+                ReGenerate( mBuildData.generator );
+            }
+        }
+    }
+    ImGui::End();
+}
+
+void NoiseTexture::DrawExportMenu()
+{
+    // Draws the contents of the "Export" menu: export size, filename and the
+    // relative scaling option. Clicking "Export as BMP" snapshots the current
+    // settings and writes an 8-bit greyscale BMP on mExportThread (on Emscripten
+    // the file is built in memory and offered as a browser download).
+    ImGui::PushItemWidth( 120.0f );
+
+    if( ImGui::DragInt2( "Export Size", mExportBuildData.size.data(), 2, 4, 8192 * 4 ) )
+    {
+        ImGuiExtra::MarkSettingsDirty();
+    }
+
+    // Filename input field
+    char filenameBuffer[256];
+    std::string displayFilename = mExportFilename;
+
+    // If no custom filename is set, show the node name as default
+    if( displayFilename.empty() && mBuildData.generator )
+    {
+        displayFilename = mBuildData.generator->GetMetadata().name;
+    }
+
+    strncpy( filenameBuffer, displayFilename.c_str(), sizeof(filenameBuffer) - 1 );
+    filenameBuffer[sizeof(filenameBuffer) - 1] = '\0';
+
+    if( ImGui::InputText( "Filename", filenameBuffer, sizeof(filenameBuffer) ) )
+    {
+        std::string newFilename = std::string( filenameBuffer );
+
+        // Only keep a custom filename when it differs from the node name
+        if( mBuildData.generator && newFilename == mBuildData.generator->GetMetadata().name )
+        {
+            mExportFilename = "";  // Clear custom filename if user set it back to node name
+        }
+        else
+        {
+            mExportFilename = newFilename;
+        }
+
+        ImGuiExtra::MarkSettingsDirty();
+    }
+
+    if( ImGui::Checkbox( "Scale export to match preview area", &mUseRelativeScaling ) )
+    {
+        ImGuiExtra::MarkSettingsDirty();
+    }
+
+    ImGui::Separator();
+
+    ImGui::BeginDisabled( mIsExporting || !mBuildData.generator );
+    if( ImGui::MenuItem( "Export as BMP" ) )
+    {
+        mExportBuildData.generationType = mBuildData.generationType;
+        mExportBuildData.seed = mBuildData.seed;
+        mExportBuildData.generator = mBuildData.generator;
+
+        // Relative scaling divides by the size sums, so it is only applied when
+        // both the preview and the export size are valid (positive)
+        if( mUseRelativeScaling && mBuildData.size.sum() > 0 && mExportBuildData.size.sum() > 0 )
+        {
+            float relativeScale = (float)mExportBuildData.size.sum() / mBuildData.size.sum();
+            mExportBuildData.scale = mBuildData.scale / relativeScale;
+            mExportBuildData.offset = mBuildData.offset * relativeScale;
+        }
+        else
+        {
+            mExportBuildData.scale = mBuildData.scale;
+            mExportBuildData.offset = mBuildData.offset;
+        }
+
+        // Only one export runs at a time: join a previous export thread first
+        if( mExportThread.joinable() )
+        {
+            mExportThread.join();
+        }
+        mIsExporting.store( true, std::memory_order::relaxed );
+
+        // The build data and filename are captured by value, so later UI edits
+        // do not affect an export that is already running
+        mExportThread = std::thread([buildData = mExportBuildData, customFilename = mExportFilename, this]()
+        {
+            Debug{} << "BMP Export Started";
+            auto data = BuildTexture( buildData );
+
+            std::string filename;
+            if( !customFilename.empty() )
+            {
+                filename = customFilename;
+                // Add .bmp extension if not already present
+                if( filename.length() < 4 || filename.substr( filename.length() - 4 ) != ".bmp" )
+                {
+                    filename += ".bmp";
+                }
+            }
+            else
+            {
+                const char* nodeName = buildData.generator->GetMetadata().name;
+                filename = nodeName;
+                filename += ".bmp";
+            }
+
+#ifdef __EMSCRIPTEN__
+            // The file is assembled in memory and handed to the browser below
+            std::stringstream file;
+#else
+            // If the file already exists try numbered variants. Each candidate is built
+            // from the original base name so suffixes don't pile up ("a_1_2.bmp")
+            const std::string baseName = filename.substr( 0, filename.length() - 4 );
+            for( int i = 1; i < 1024 && std::filesystem::exists( filename.c_str() ); i++ )
+            {
+                filename = baseName + '_' + std::to_string( i ) + ".bmp";
+            }
+
+            std::ofstream file( filename.c_str(), std::ofstream::binary | std::ofstream::out | std::ofstream::trunc );
+
+            if( !file.is_open() )
+            {
+                Debug{} << "BMP Export Failed, unable to open:" << filename.c_str();
+            }
+            else
+#endif
+            {
+                // The 'B','M' magic is written separately; this struct is the rest of the header
+                struct BmpHeader
+                {
+                    // File header (14)
+                    // char b = 'B';
+                    // char m = 'M';
+                    uint32_t fileSize;
+                    uint32_t reserved = 0;
+                    uint32_t dataOffset = 14u + 12u + (256u * 3u);
+                    // Bmp Info Header (12)
+                    uint32_t headerSize = 12u;
+                    uint16_t sizeX;
+                    uint16_t sizeY;
+                    uint16_t colorPlanes = 1u;
+                    uint16_t bitDepth = 8u;
+                };
+
+                // Each row of pixel data is padded to a multiple of 4 bytes
+                int paddedSizeX = buildData.size.x();
+                int padding = paddedSizeX % 4;
+                if( padding )
+                {
+                    padding = 4 - padding;
+                    paddedSizeX += padding;
+                }
+
+                // fileSize counts the padded rows on top of the headers and colour map
+                BmpHeader header;
+                header.fileSize = header.dataOffset + (uint32_t)(paddedSizeX * buildData.size.y());
+                header.sizeX = (uint16_t)buildData.size.x();
+                header.sizeY = (uint16_t)buildData.size.y();
+
+                file << 'B' << 'M';
+                file.write( reinterpret_cast<char*>( &header ), sizeof( BmpHeader ) );
+
+                // Colour map
+                for (int i = 0; i < 256; i++)
+                {
+                    Vector3ub b3( (uint8_t)i );
+                    file.write( reinterpret_cast<char*>( b3.data() ), 3 );
+                }
+
+                // xIdx counts down the pixels left in the current row so the padding can
+                // be written after each row; when no padding is needed it starts at 0
+                int xIdx = padding ? buildData.size.x() : 0;
+
+                for( uint32_t pix : data.textureData ) 
+                {
+                    // Only the first byte of each 32-bit pixel is written (8-bit palette index)
+                    file.write( reinterpret_cast<char*>( &pix ), 1 );
+
+                    if( --xIdx == 0 )
+                    {
+                        xIdx = buildData.size.x();
+
+                        Vector3ub b3( 0 );
+                        file.write( reinterpret_cast<char*>( b3.data() ), padding );                        
+                    }
+                }
+
+#ifdef __EMSCRIPTEN__
+                std::string_view fileString = file.view();
+
+                MAIN_THREAD_EM_ASM( (
+                    // Create a temporary ArrayBuffer and copy the contents of the shared buffer
+                    // into it.
+                    const tempBuffer = new ArrayBuffer( $2 );
+                    const tempView = new Uint8Array( tempBuffer );
+
+                    let sharedView = new Uint8Array( Module["HEAPU8"].buffer, $1, $2 );
+                    tempView.set( sharedView );
+
+                    /// Offer a buffer in memory as a file to download, specifying download filename and mime type
+                    var a = document.createElement( 'a' );
+                    a.download = UTF8ToString( $0 );
+                    a.href = URL.createObjectURL( new Blob( [tempView], {type: 'image/bmp'} ) );
+                    a.click();
+                    ), filename.c_str(), fileString.data(), fileString.length() );
+                Debug{} << "BMP Export Complete:" << filename.c_str();
+#else
+                file.close();
+                Debug{} << "BMP Export Complete:" << std::filesystem::absolute( filename ).string().c_str();
+#endif
+            }
+
+            // Release the pixel buffer (the same explicit Free() that Draw() and
+            // GenerateLoopThread() use) and always re-enable the export menu
+            data.Free();
+            mIsExporting = false;
+        } );
+    }
+    ImGui::EndDisabled();
+
+    ImGui::PopItemWidth();
+}
+
+void NoiseTexture::SetPreviewTexture( ImageView2D& imageView )
+{
+    mNoiseTexture = GL::Texture2D(); // start from a new texture object, then size and fill it
+    mNoiseTexture.setStorage( 1, GL::TextureFormat::RGBA8, imageView.size() );
+    mNoiseTexture.setSubImage( 0, {}, imageView );
+}
+
+void NoiseTexture::ReGenerate( FastNoise::SmartNodeArg<> generator )
+{
+    // Store the generator, bump the iteration (Draw() only shows results newer
+    // than mCurrentIteration) and drop requests still waiting in the queue
+    mBuildData.generator = generator;
+    ++mBuildData.iteration;
+    mGenerateQueue.Clear();
+
+    const bool hasValidSize = mBuildData.size.x() > 0 && mBuildData.size.y() > 0;
+    if( !hasValidSize )
+    {
+        return;
+    }
+
+    if( !generator )
+    {
+        // No generator: show a blank 16x16 texture straight away
+        std::array<uint32_t, 16 * 16> emptyPixels = {};
+        ImageView2D emptyImage( PixelFormat::RGBA8Unorm, {16,16}, emptyPixels );
+        mCurrentIteration = mBuildData.iteration;
+        SetPreviewTexture( emptyImage );
+        return;
+    }
+    mGenerateQueue.Push( mBuildData );
+}
+
+
+NoiseTexture::TextureData NoiseTexture::BuildTexture( const BuildData& buildData )
+{
+    // Generates the texture data for buildData; the float buffer is reused per thread
+    const int width = buildData.size.x();
+    const int height = buildData.size.y();
+    static thread_local std::vector<float> noiseData;
+    noiseData.resize( (size_t)width * height );
+
+    // Node chain: ConvertRGBA8 <- DomainScale <- user generator
+    auto gen = FastNoise::New<FastNoise::ConvertRGBA8>( buildData.generator->GetActiveFeatureSet() );
+    auto scale = FastNoise::New<FastNoise::DomainScale>( buildData.generator->GetActiveFeatureSet() );
+    gen->SetSource( scale );
+    scale->SetSource( buildData.generator );
+    scale->SetScaling( buildData.scale );
+
+    // offset is the centre of the image, so generation starts half a size before it
+    const int startX = (int)buildData.offset.x() - width / 2;
+    const int startY = (int)buildData.offset.y() - height / 2;
+    FastNoise::OutputMinMax minMax;
+
+    switch( buildData.generationType )
+    {
+    case GenType_2D:
+        minMax = gen->GenUniformGrid2D( noiseData.data(), startX, startY, width, height, buildData.seed );
+        break;
+    case GenType_2DTiled:
+        minMax = gen->GenTileable2D( noiseData.data(), width, height, buildData.seed );
+        break;
+    case GenType_3D:
+        minMax = gen->GenUniformGrid3D( noiseData.data(),
+            startX, startY, (int)buildData.offset.z(), width, height, 1, buildData.seed );
+        break;
+    case GenType_4D:
+        minMax = gen->GenUniformGrid4D( noiseData.data(),
+            startX, startY, (int)buildData.offset.z(), (int)buildData.offset.w(), width, height, 1, 1, buildData.seed );
+        break;
+    // GenType_Count is not a generation mode: nothing is generated for it
+    case GenType_Count:
+        break;
+    }
+
+    return TextureData( buildData.iteration, buildData.size, minMax, noiseData );
+}
+
+void NoiseTexture::GenerateLoopThread( GenerateQueue<BuildData>& generateQueue, CompleteQueue<TextureData>& completeQueue )
+{
+    // Worker loop: pop a request, build its texture, push the result to completeQueue
+    for( ;; )
+    {
+        BuildData request = generateQueue.Pop();
+        if( generateQueue.ShouldKillThread() )
+        {
+            break;
+        }
+
+        TextureData result = BuildTexture( request );
+        const bool queued = completeQueue.Push( result );
+        if( !queued )
+        {
+            result.Free(); // the queue did not take the result, so release it here
+        }
+    }
+}
+
+void NoiseTexture::SetupSettingsHandlers()
+{
+    // Registers an ImGui settings handler that writes and reads the preview/export options
+    ImGuiSettingsHandler editorSettings;
+    editorSettings.TypeName = "NodeEditorNoiseTexture";
+    editorSettings.TypeHash = ImHashStr( editorSettings.TypeName );
+    editorSettings.UserData = this;
+    editorSettings.WriteAllFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, ImGuiTextBuffer* outBuf ) {
+        auto* noiseTexture = (NoiseTexture*)handler->UserData;
+        outBuf->appendf( "\n[%s][Settings]\n", handler->TypeName );
+        outBuf->appendf( "scale=%f\n", noiseTexture->mBuildData.scale );
+        outBuf->appendf( "seed=%d\n", noiseTexture->mBuildData.seed );
+        outBuf->appendf( "gen_type=%d\n", (int)noiseTexture->mBuildData.generationType );
+        outBuf->appendf( "export_size=%d:%d\n", noiseTexture->mExportBuildData.size.x(), noiseTexture->mExportBuildData.size.y() );
+        outBuf->appendf( "use_relative_scaling=%d\n", noiseTexture->mUseRelativeScaling ? 1 : 0 );
+        outBuf->appendf( "export_filename=%s\n", noiseTexture->mExportFilename.c_str() );
+    };
+    editorSettings.ReadOpenFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, const char* name ) -> void* {
+        if( strcmp( name, "Settings" ) == 0 )
+        {
+            return handler->UserData;
+        }
+        return nullptr;
+    };
+    editorSettings.ReadLineFn = []( ImGuiContext* ctx, ImGuiSettingsHandler* handler, void* entry, const char* line ) {
+        auto* noiseTexture = (NoiseTexture*)handler->UserData;
+        int intValue = 0, sizeX = 0, sizeY = 0;
+        sscanf( line, "scale=%f", &noiseTexture->mBuildData.scale );
+        sscanf( line, "seed=%d", &noiseTexture->mBuildData.seed );
+        if( sscanf( line, "gen_type=%d", &intValue ) == 1 && intValue >= 0 && intValue < GenType_Count ) // ignore out-of-range values
+        {
+            noiseTexture->mBuildData.generationType = (GenType)intValue;
+        }
+        if( sscanf( line, "export_size=%d:%d", &sizeX, &sizeY ) == 2 && sizeX >= 4 && sizeY >= 4 && sizeX <= 8192 * 4 && sizeY <= 8192 * 4 )
+        {
+            noiseTexture->mExportBuildData.size = { sizeX, sizeY }; // same 4..8192*4 range as the "Export Size" widget
+        }
+        if( sscanf( line, "use_relative_scaling=%d", &intValue ) == 1 )
+        {
+            noiseTexture->mUseRelativeScaling = intValue != 0;
+        }
+        if( strncmp( line, "export_filename=", 16 ) == 0 ) // Read export filename (skip "export_filename=" prefix)
+        {
+            noiseTexture->mExportFilename = std::string( line + 16 );
+        }
+    };
+    ImGuiExtra::AddOrReplaceSettingsHandler( editorSettings );
+}
diff --git a/NoiseTool/NoiseTexture.h b/tools/NodeEditor/NoiseTexture.h
similarity index 93%
rename from NoiseTool/NoiseTexture.h
rename to tools/NodeEditor/NoiseTexture.h
index 2d173c4a..fce0f7a9 100644
--- a/NoiseTool/NoiseTexture.h
+++ b/tools/NodeEditor/NoiseTexture.h
@@ -3,6 +3,7 @@
 #include <memory>
 #include <thread>
 #include <cstring>
+#include <string>
 
 #include <Magnum/Magnum.h>
 #include <Magnum/GL/GL.h>
@@ -45,7 +46,7 @@ namespace Magnum
             FastNoise::SmartNode<const FastNoise::Generator> generator;
             Vector2i size;
             Vector4 offset;
-            float frequency;
+            float scale;
             int32_t seed;
             uint64_t iteration;
             GenType generationType;          
@@ -85,7 +86,7 @@ namespace Magnum
         static TextureData BuildTexture( const BuildData& buildData );
         static void GenerateLoopThread( GenerateQueue<BuildData>& generateQueue, CompleteQueue<TextureData>& completeQueue );
         
-        void DoExport();
+        void DrawExportMenu();
         void SetupSettingsHandlers();
         void SetPreviewTexture( ImageView2D& imageView );
 
@@ -96,6 +97,9 @@ namespace Magnum
         BuildData mExportBuildData;
         FastNoise::OutputMinMax mMinMax;
 
+        std::atomic_bool mIsExporting = false;
+        bool mUseRelativeScaling = true;
+        std::string mExportFilename;
         std::thread mExportThread;
         std::vector<std::thread> mThreads;
         GenerateQueue<BuildData> mGenerateQueue;
diff --git a/NoiseTool/VertexLight.frag b/tools/NodeEditor/resources/VertexLight.frag
similarity index 100%
rename from NoiseTool/VertexLight.frag
rename to tools/NodeEditor/resources/VertexLight.frag
diff --git a/NoiseTool/VertexLight.vert b/tools/NodeEditor/resources/VertexLight.vert
similarity index 100%
rename from NoiseTool/VertexLight.vert
rename to tools/NodeEditor/resources/VertexLight.vert
diff --git a/NoiseTool/WindowsHiDPI.manifest b/tools/NodeEditor/resources/WindowsHiDPI.manifest
similarity index 100%
rename from NoiseTool/WindowsHiDPI.manifest
rename to tools/NodeEditor/resources/WindowsHiDPI.manifest
diff --git a/tools/NodeEditor/resources/emscripten_enable_shared_array_buffer.js b/tools/NodeEditor/resources/emscripten_enable_shared_array_buffer.js
new file mode 100644
index 00000000..83bf5d94
--- /dev/null
+++ b/tools/NodeEditor/resources/emscripten_enable_shared_array_buffer.js
@@ -0,0 +1,75 @@
+// NOTE: This file creates a service worker that cross-origin-isolates the page (read more here: https://web.dev/coop-coep/) which allows us to use wasm threads.
+// Normally you would set the COOP and COEP headers on the server to do this, but Github Pages doesn't allow this, so this is a hack to do that.
+
+/* Edited version of: coi-serviceworker v0.1.6 - Guido Zuidhof, licensed under MIT */
+// From here: https://github.com/gzuidhof/coi-serviceworker
+if(typeof window === 'undefined') {
+  self.addEventListener("install", () => self.skipWaiting());
+  self.addEventListener("activate", e => e.waitUntil(self.clients.claim()));
+
+  async function handleFetch(request) {
+    if(request.cache === "only-if-cached" && request.mode !== "same-origin") {
+      return;
+    }
+    
+    if(request.mode === "no-cors") { // We need to set `credentials` to "omit" for no-cors requests, per this comment: https://bugs.chromium.org/p/chromium/issues/detail?id=1309901#c7
+      request = new Request(request.url, {
+        cache: request.cache,
+        credentials: "omit",
+        headers: request.headers,
+        integrity: request.integrity,
+        destination: request.destination,
+        keepalive: request.keepalive,
+        method: request.method,
+        mode: request.mode,
+        redirect: request.redirect,
+        referrer: request.referrer,
+        referrerPolicy: request.referrerPolicy,
+        signal: request.signal,
+      });
+    }
+    
+    let r = await fetch(request).catch(e => console.error(e));
+    
+    if(r.status === 0) {
+      return r;
+    }
+
+    const headers = new Headers(r.headers);
+    headers.set("Cross-Origin-Embedder-Policy", "require-corp"); // or: credentialless
+    headers.set("Cross-Origin-Opener-Policy", "same-origin");
+    
+    return new Response(r.body, { status: r.status, statusText: r.statusText, headers });
+  }
+
+  self.addEventListener("fetch", function(e) {
+    e.respondWith(handleFetch(e.request)); // respondWith must be executed synchonously (but can be passed a Promise)
+  });
+  
+} else {
+  (async function() {
+    if(window.crossOriginIsolated !== false) return; // only continue when the page reports it is not isolated
+    // Register this same script file as the service worker; a failed registration is logged and leaves `registration` undefined
+    let registration = await navigator.serviceWorker.register(window.document.currentScript.src).catch(e => console.error("COOP/COEP Service Worker failed to register:", e));
+    if(registration) {
+      console.log("COOP/COEP Service Worker registered", registration.scope);
+      // Reload when an updated worker is found, so the page makes use of it
+      registration.addEventListener("updatefound", () => {
+        console.log("Reloading page to make use of updated COOP/COEP Service Worker.");
+        window.location.reload();
+      });
+
+      // If the registration is active, but it's not controlling the page
+      if(registration.active && !navigator.serviceWorker.controller) {
+        console.log("Reloading page to make use of COOP/COEP Service Worker.");
+        window.location.reload();
+      }
+    }
+  })();
+}
+
+// Code to deregister:
+// let registrations = await navigator.serviceWorker.getRegistrations();
+// for(let registration of registrations) {
+//   await registration.unregister();
+// }
diff --git a/tools/NodeEditor/resources/emscripten_pre.js b/tools/NodeEditor/resources/emscripten_pre.js
new file mode 100644
index 00000000..f01af27c
--- /dev/null
+++ b/tools/NodeEditor/resources/emscripten_pre.js
@@ -0,0 +1,11 @@
+(Module["preRun"] = Module["preRun"] || []).push(function () {
+    addRunDependency('syncfs') // removed again in the syncfs callback below
+    // Mount IDBFS at /fastnoise2 and sync it before the run dependency is released
+    FS.mkdir('/fastnoise2')
+    FS.mount(IDBFS, {}, '/fastnoise2')
+    FS.syncfs(true, function (err) { // first argument `true`: populate from persistent storage (see Emscripten FS docs)
+        if (err) throw err
+        removeRunDependency('syncfs')
+        console.log("FS Synced")
+    })
+});
\ No newline at end of file
diff --git a/tools/NodeEditor/resources/emscripten_shell.html b/tools/NodeEditor/resources/emscripten_shell.html
new file mode 100644
index 00000000..14282b11
--- /dev/null
+++ b/tools/NodeEditor/resources/emscripten_shell.html
@@ -0,0 +1,76 @@
+<!doctype html>
+<html lang="en-us">
+
+  <!-- NOTE: THIS FILE BASED ON: imgui/examples/libs/emscripten/shell_minimal.html -->
+
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, minimum-scale=1, user-scalable=no"/>
+
+    <title>FastNoise 2 Node Editor</title>
+
+    <style>
+        body { margin: 0; background-color: black }
+        /* FIXME: with our GLFW example this block seems to break resizing and io.DisplaySize gets stuck */
+        .emscripten {
+            position: absolute;
+            top:  0px;
+            left: 0px;
+            margin: 0px;
+            border: 0;
+            width:  100%;
+            height: 100%;
+            overflow: hidden;
+            display: block;
+            image-rendering: optimizeSpeed;
+            image-rendering: -moz-crisp-edges;
+            image-rendering: -o-crisp-edges;
+            image-rendering: -webkit-optimize-contrast;
+            image-rendering: optimize-contrast;
+            image-rendering: crisp-edges;
+            image-rendering: pixelated;
+            -ms-interpolation-mode: nearest-neighbor;
+        }
+    </style>
+  </head>
+
+  <body>
+    <canvas class="emscripten" id="canvas" oncontextmenu="event.preventDefault()"></canvas>
+
+    <script type='text/javascript'>
+      var Module = {
+        preRun: [],
+        postRun: [],
+        print: (function() {
+            return function(text) {
+                text = Array.prototype.slice.call(arguments).join(' ');
+                console.log(text);
+            };
+        })(),
+        printErr: function(text) {
+            text = Array.prototype.slice.call(arguments).join(' ');
+            console.error(text);
+        },
+        canvas: (function() {
+            var canvas = document.getElementById('canvas');
+            //canvas.addEventListener("webglcontextlost", function(e) { alert('FIXME: WebGL context lost, please reload the page'); e.preventDefault(); }, false);
+            return canvas;
+        })(),
+        setStatus: function(text) {
+            console.log("status: " + text);
+        },
+        monitorRunDependencies: function(left) {
+            // no run dependencies to log
+        }
+      };
+      window.onerror = function() {
+        console.log("onerror: " + event);
+      };
+    </script>
+    <script src="emscripten_enable_shared_array_buffer.js"></script>
+
+    {{{ SCRIPT }}}
+
+  </body>
+
+</html>
diff --git a/NoiseTool/resources.conf b/tools/NodeEditor/resources/resources.conf
similarity index 52%
rename from NoiseTool/resources.conf
rename to tools/NodeEditor/resources/resources.conf
index 86773924..1aa73822 100644
--- a/NoiseTool/resources.conf
+++ b/tools/NodeEditor/resources/resources.conf
@@ -1,11 +1,11 @@
-group=NoiseTool
+group=NodeEditor
 
 [file]
-filename=${NoiseTool_RESOURCES_DIR}/VertexLight.frag
+filename=${NodeEditor_RESOURCES_DIR}/VertexLight.frag
 alias=VertexLight.frag
 
 [file]
-filename=${NoiseTool_RESOURCES_DIR}/VertexLight.vert
+filename=${NodeEditor_RESOURCES_DIR}/VertexLight.vert
 alias=VertexLight.vert
 
 [file]
diff --git a/tools/NodeEditor/util/DemoNodeTrees.inl b/tools/NodeEditor/util/DemoNodeTrees.inl
new file mode 100644
index 00000000..52e98253
--- /dev/null
+++ b/tools/NodeEditor/util/DemoNodeTrees.inl
@@ -0,0 +1,7 @@
+#pragma once
+
+inline const char* gDemoNodeTrees[][2] =
+{
+    { "Simple Terrain", "EwAC@BB@AIEAFEgUVBQ0AB@CQAACBABQY@ACWQ/8CZmYmPwY@B//wMEBI/CdTz//wIAACBC/wIzMzM/BpqZmT7/" },
+    { "Cellular Caves", "FAAC@BB@A4EAFEg@ACBCBRwFIwUlBQs@BlRATNzMw9C@AIMAMAw@ABAC@BFAM@BYAg@BcJ@BPkIEH4XrPgiF61E/////AgAApUMGAAClQwoAAKVD/wP/AQAG7FE4Pv8C@AgQf8CmpmZPgbNzEw//w==" },
+};
diff --git a/tools/NodeEditor/util/DmcTable.inl b/tools/NodeEditor/util/DmcTable.inl
new file mode 100644
index 00000000..c5c24a47
--- /dev/null
+++ b/tools/NodeEditor/util/DmcTable.inl
@@ -0,0 +1,332 @@
+namespace DMC
+{
+    enum EdgeCode : uint16_t // Bit flags, one bit per cell edge: EDGEn == 1 << n, so several edges can be OR-ed into one uint16_t
+    { // NOTE(review): the edge numbers presumably follow the "Cell Edges" diagram further down - confirm
+        EDGE0 = 1,
+        EDGE1 = 1 << 1,
+        EDGE2 = 1 << 2,
+        EDGE3 = 1 << 3,
+        EDGE4 = 1 << 4,
+        EDGE5 = 1 << 5,
+        EDGE6 = 1 << 6,
+        EDGE7 = 1 << 7,
+        EDGE8 = 1 << 8,
+        EDGE9 = 1 << 9,
+        EDGE10 = 1 << 10,
+        EDGE11 = 1 << 11,
+    };
+
+    //  Coordinate system
+    //
+    //       y
+    //       |
+    //       |
+    //       |
+    //       0-----x
+    //      /
+    //     /
+    //    z
+    //
+
+    // Cell Corners
+    // (Corners are voxels. Numbers correspond to Morton codes of corner coordinates)
+    //
+    //       2-------------------3
+    //      /|                  /|
+    //     / |                 / |
+    //    /  |                /  |
+    //   6-------------------7   |
+    //   |   |               |   |
+    //   |   |               |   |
+    //   |   |               |   |
+    //   |   |               |   |
+    //   |   0---------------|---1
+    //   |  /                |  /
+    //   | /                 | /
+    //   |/                  |/
+    //   4-------------------5
+    //
+
+
+    //         Cell Edges
+    //
+    //       o--------4----------o
+    //      /|                  /|
+    //     7 |                 5 |
+    //    /  |                /  |
+    //   o--------6----------o   |
+    //   |   8               |   9
+    //   |   |               |   |
+    //   |   |               |   |
+    //   11  |               10  |
+    //   |   o--------0------|---o
+    //   |  /                |  /
+    //   | 3                 | 1
+    //   |/                  |/
+    //   o--------2----------o
+    //
+
+    // Encodes the edge vertices for the 256 marching cubes cases.
+    // A marching cubes case produces up to four faces and, thus, up to four
+    // dual points.
+
+    const uint16_t kDualPointsList[256][4] = {
+        { 0, 0, 0, 0 }, // 0
+        { EDGE0 | EDGE3 | EDGE8, 0, 0, 0 }, // 1
+        { EDGE0 | EDGE1 | EDGE9, 0, 0, 0 }, // 2
+        { EDGE1 | EDGE3 | EDGE8 | EDGE9, 0, 0, 0 }, // 3
+        { EDGE4 | EDGE7 | EDGE8, 0, 0, 0 }, // 4
+        { EDGE0 | EDGE3 | EDGE4 | EDGE7, 0, 0, 0 }, // 5
+        { EDGE0 | EDGE1 | EDGE9, EDGE4 | EDGE7 | EDGE8, 0, 0 }, // 6
+        { EDGE1 | EDGE3 | EDGE4 | EDGE7 | EDGE9, 0, 0, 0 }, // 7
+        { EDGE4 | EDGE5 | EDGE9, 0, 0, 0 }, // 8
+        { EDGE0 | EDGE3 | EDGE8, EDGE4 | EDGE5 | EDGE9, 0, 0 }, // 9
+        { EDGE0 | EDGE1 | EDGE4 | EDGE5, 0, 0, 0 }, // 10
+        { EDGE1 | EDGE3 | EDGE4 | EDGE5 | EDGE8, 0, 0, 0 }, // 11
+        { EDGE5 | EDGE7 | EDGE8 | EDGE9, 0, 0, 0 }, // 12
+        { EDGE0 | EDGE3 | EDGE5 | EDGE7 | EDGE9, 0, 0, 0 }, // 13
+        { EDGE0 | EDGE1 | EDGE5 | EDGE7 | EDGE8, 0, 0, 0 }, // 14
+        { EDGE1 | EDGE3 | EDGE5 | EDGE7, 0, 0, 0 }, // 15
+        { EDGE2 | EDGE3 | EDGE11, 0, 0, 0 }, // 16
+        { EDGE0 | EDGE2 | EDGE8 | EDGE11, 0, 0, 0 }, // 17
+        { EDGE0 | EDGE1 | EDGE9, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 18
+        { EDGE1 | EDGE2 | EDGE8 | EDGE9 | EDGE11, 0, 0, 0 }, // 19
+        { EDGE4 | EDGE7 | EDGE8, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 20
+        { EDGE0 | EDGE2 | EDGE4 | EDGE7 | EDGE11, 0, 0, 0 }, // 21
+        { EDGE0 | EDGE1 | EDGE9, EDGE4 | EDGE7 | EDGE8, EDGE2 | EDGE3 | EDGE11, 0 }, // 22
+        { EDGE1 | EDGE2 | EDGE4 | EDGE7 | EDGE9 | EDGE11, 0, 0, 0 }, // 23
+        { EDGE4 | EDGE5 | EDGE9, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 24
+        { EDGE0 | EDGE2 | EDGE8 | EDGE11, EDGE4 | EDGE5 | EDGE9, 0, 0 }, // 25
+        { EDGE0 | EDGE1 | EDGE4 | EDGE5, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 26
+        { EDGE1 | EDGE2 | EDGE4 | EDGE5 | EDGE8 | EDGE11, 0, 0, 0 }, // 27
+        { EDGE5 | EDGE7 | EDGE8 | EDGE9, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 28
+        { EDGE0 | EDGE2 | EDGE5 | EDGE7 | EDGE9 | EDGE11, 0, 0, 0 }, // 29
+        { EDGE0 | EDGE1 | EDGE5 | EDGE7 | EDGE8, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 30
+        { EDGE1 | EDGE2 | EDGE5 | EDGE7 | EDGE11, 0, 0, 0 }, // 31
+        { EDGE1 | EDGE2 | EDGE10, 0, 0, 0 }, // 32
+        { EDGE0 | EDGE3 | EDGE8, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 33
+        { EDGE0 | EDGE2 | EDGE9 | EDGE10, 0, 0, 0 }, // 34
+        { EDGE2 | EDGE3 | EDGE8 | EDGE9 | EDGE10, 0, 0, 0 }, // 35
+        { EDGE4 | EDGE7 | EDGE8, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 36
+        { EDGE0 | EDGE3 | EDGE4 | EDGE7, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 37
+        { EDGE0 | EDGE2 | EDGE9 | EDGE10, EDGE4 | EDGE7 | EDGE8, 0, 0 }, // 38
+        { EDGE2 | EDGE3 | EDGE4 | EDGE7 | EDGE9 | EDGE10, 0, 0, 0 }, // 39
+        { EDGE4 | EDGE5 | EDGE9, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 40
+        { EDGE0 | EDGE3 | EDGE8, EDGE4 | EDGE5 | EDGE9, EDGE1 | EDGE2 | EDGE10, 0 }, // 41
+        { EDGE0 | EDGE2 | EDGE4 | EDGE5 | EDGE10, 0, 0, 0 }, // 42
+        { EDGE2 | EDGE3 | EDGE4 | EDGE5 | EDGE8 | EDGE10, 0, 0, 0 }, // 43
+        { EDGE5 | EDGE7 | EDGE8 | EDGE9, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 44
+        { EDGE0 | EDGE3 | EDGE5 | EDGE7 | EDGE9, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 45
+        { EDGE0 | EDGE2 | EDGE5 | EDGE7 | EDGE8 | EDGE10, 0, 0, 0 }, // 46
+        { EDGE2 | EDGE3 | EDGE5 | EDGE7 | EDGE10, 0, 0, 0 }, // 47
+        { EDGE1 | EDGE3 | EDGE10 | EDGE11, 0, 0, 0 }, // 48
+        { EDGE0 | EDGE1 | EDGE8 | EDGE10 | EDGE11, 0, 0, 0 }, // 49
+        { EDGE0 | EDGE3 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 50
+        { EDGE8 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 51
+        { EDGE4 | EDGE7 | EDGE8, EDGE1 | EDGE3 | EDGE10 | EDGE11, 0, 0 }, // 52
+        { EDGE0 | EDGE1 | EDGE4 | EDGE7 | EDGE10 | EDGE11, 0, 0, 0 }, // 53
+        { EDGE0 | EDGE3 | EDGE9 | EDGE10 | EDGE11, EDGE4 | EDGE7 | EDGE8, 0, 0 }, // 54
+        { EDGE4 | EDGE7 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 55
+        { EDGE4 | EDGE5 | EDGE9, EDGE1 | EDGE3 | EDGE10 | EDGE11, 0, 0 }, // 56
+        { EDGE0 | EDGE1 | EDGE8 | EDGE10 | EDGE11, EDGE4 | EDGE5 | EDGE9, 0, 0 }, // 57
+        { EDGE0 | EDGE3 | EDGE4 | EDGE5 | EDGE10 | EDGE11, 0, 0, 0 }, // 58
+        { EDGE4 | EDGE5 | EDGE8 | EDGE10 | EDGE11, 0, 0, 0 }, // 59
+        { EDGE5 | EDGE7 | EDGE8 | EDGE9, EDGE1 | EDGE3 | EDGE10 | EDGE11, 0, 0 }, // 60
+        { EDGE0 | EDGE1 | EDGE5 | EDGE7 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 61
+        { EDGE0 | EDGE3 | EDGE5 | EDGE7 | EDGE8 | EDGE10 | EDGE11, 0, 0, 0 }, // 62
+        { EDGE5 | EDGE7 | EDGE10 | EDGE11, 0, 0, 0 }, // 63
+        { EDGE6 | EDGE7 | EDGE11, 0, 0, 0 }, // 64
+        { EDGE0 | EDGE3 | EDGE8, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 65
+        { EDGE0 | EDGE1 | EDGE9, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 66
+        { EDGE1 | EDGE3 | EDGE8 | EDGE9, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 67
+        { EDGE4 | EDGE6 | EDGE8 | EDGE11, 0, 0, 0 }, // 68
+        { EDGE0 | EDGE3 | EDGE4 | EDGE6 | EDGE11, 0, 0, 0 }, // 69
+        { EDGE0 | EDGE1 | EDGE9, EDGE4 | EDGE6 | EDGE8 | EDGE11, 0, 0 }, // 70
+        { EDGE1 | EDGE3 | EDGE4 | EDGE6 | EDGE9 | EDGE11, 0, 0, 0 }, // 71
+        { EDGE4 | EDGE5 | EDGE9, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 72
+        { EDGE0 | EDGE3 | EDGE8, EDGE4 | EDGE5 | EDGE9, EDGE6 | EDGE7 | EDGE11, 0 }, // 73
+        { EDGE0 | EDGE1 | EDGE4 | EDGE5, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 74
+        { EDGE1 | EDGE3 | EDGE4 | EDGE5 | EDGE8, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 75
+        { EDGE5 | EDGE6 | EDGE8 | EDGE9 | EDGE11, 0, 0, 0 }, // 76
+        { EDGE0 | EDGE3 | EDGE5 | EDGE6 | EDGE9 | EDGE11, 0, 0, 0 }, // 77
+        { EDGE0 | EDGE1 | EDGE5 | EDGE6 | EDGE8 | EDGE11, 0, 0, 0 }, // 78
+        { EDGE1 | EDGE3 | EDGE5 | EDGE6 | EDGE11, 0, 0, 0 }, // 79
+        { EDGE2 | EDGE3 | EDGE6 | EDGE7, 0, 0, 0 }, // 80
+        { EDGE0 | EDGE2 | EDGE6 | EDGE7 | EDGE8, 0, 0, 0 }, // 81
+        { EDGE0 | EDGE1 | EDGE9, EDGE2 | EDGE3 | EDGE6 | EDGE7, 0, 0 }, // 82
+        { EDGE1 | EDGE2 | EDGE6 | EDGE7 | EDGE8 | EDGE9, 0, 0, 0 }, // 83
+        { EDGE2 | EDGE3 | EDGE4 | EDGE6 | EDGE8, 0, 0, 0 }, // 84
+        { EDGE0 | EDGE2 | EDGE4 | EDGE6, 0, 0, 0 }, // 85
+        { EDGE0 | EDGE1 | EDGE9, EDGE2 | EDGE3 | EDGE4 | EDGE6 | EDGE8, 0, 0 }, // 86
+        { EDGE1 | EDGE2 | EDGE4 | EDGE6 | EDGE9, 0, 0, 0 }, // 87
+        { EDGE4 | EDGE5 | EDGE9, EDGE2 | EDGE3 | EDGE6 | EDGE7, 0, 0 }, // 88
+        { EDGE0 | EDGE2 | EDGE6 | EDGE7 | EDGE8, EDGE4 | EDGE5 | EDGE9, 0, 0 }, // 89
+        { EDGE0 | EDGE1 | EDGE4 | EDGE5, EDGE2 | EDGE3 | EDGE6 | EDGE7, 0, 0 }, // 90
+        { EDGE1 | EDGE2 | EDGE4 | EDGE5 | EDGE6 | EDGE7 | EDGE8, 0, 0, 0 }, // 91
+        { EDGE2 | EDGE3 | EDGE5 | EDGE6 | EDGE8 | EDGE9, 0, 0, 0 }, // 92
+        { EDGE0 | EDGE2 | EDGE5 | EDGE6 | EDGE9, 0, 0, 0 }, // 93
+        { EDGE0 | EDGE1 | EDGE2 | EDGE3 | EDGE5 | EDGE6 | EDGE8, 0, 0, 0 }, // 94
+        { EDGE1 | EDGE2 | EDGE5 | EDGE6, 0, 0, 0 }, // 95
+        { EDGE1 | EDGE2 | EDGE10, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 96
+        { EDGE0 | EDGE3 | EDGE8, EDGE1 | EDGE2 | EDGE10, EDGE6 | EDGE7 | EDGE11, 0 }, // 97
+        { EDGE0 | EDGE2 | EDGE9 | EDGE10, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 98
+        { EDGE2 | EDGE3 | EDGE8 | EDGE9 | EDGE10, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 99
+        { EDGE4 | EDGE6 | EDGE8 | EDGE11, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 100
+        { EDGE0 | EDGE3 | EDGE4 | EDGE6 | EDGE11, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 101
+        { EDGE0 | EDGE2 | EDGE9 | EDGE10, EDGE4 | EDGE6 | EDGE8 | EDGE11, 0, 0 }, // 102
+        { EDGE2 | EDGE3 | EDGE4 | EDGE6 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 103
+        { EDGE4 | EDGE5 | EDGE9, EDGE1 | EDGE2 | EDGE10, EDGE6 | EDGE7 | EDGE11, 0 }, // 104
+        { EDGE0 | EDGE3 | EDGE8, EDGE4 | EDGE5 | EDGE9, EDGE1 | EDGE2 | EDGE10, EDGE6 | EDGE7 | EDGE11 }, // 105
+        { EDGE0 | EDGE2 | EDGE4 | EDGE5 | EDGE10, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 106
+        { EDGE2 | EDGE3 | EDGE4 | EDGE5 | EDGE8 | EDGE10, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 107
+        { EDGE5 | EDGE6 | EDGE8 | EDGE9 | EDGE11, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 108
+        { EDGE0 | EDGE3 | EDGE5 | EDGE6 | EDGE9 | EDGE11, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 109
+        { EDGE0 | EDGE2 | EDGE5 | EDGE6 | EDGE8 | EDGE10 | EDGE11, 0, 0, 0 }, // 110
+        { EDGE2 | EDGE3 | EDGE5 | EDGE6 | EDGE10 | EDGE11, 0, 0, 0 }, // 111
+        { EDGE1 | EDGE3 | EDGE6 | EDGE7 | EDGE10, 0, 0, 0 }, // 112
+        { EDGE0 | EDGE1 | EDGE6 | EDGE7 | EDGE8 | EDGE10, 0, 0, 0 }, // 113
+        { EDGE0 | EDGE3 | EDGE6 | EDGE7 | EDGE9 | EDGE10, 0, 0, 0 }, // 114
+        { EDGE6 | EDGE7 | EDGE8 | EDGE9 | EDGE10, 0, 0, 0 }, // 115
+        { EDGE1 | EDGE3 | EDGE4 | EDGE6 | EDGE8 | EDGE10, 0, 0, 0 }, // 116
+        { EDGE0 | EDGE1 | EDGE4 | EDGE6 | EDGE10, 0, 0, 0 }, // 117
+        { EDGE0 | EDGE3 | EDGE4 | EDGE6 | EDGE8 | EDGE9 | EDGE10, 0, 0, 0 }, // 118
+        { EDGE4 | EDGE6 | EDGE9 | EDGE10, 0, 0, 0 }, // 119
+        { EDGE4 | EDGE5 | EDGE9, EDGE1 | EDGE3 | EDGE6 | EDGE7 | EDGE10, 0, 0 }, // 120
+        { EDGE0 | EDGE1 | EDGE6 | EDGE7 | EDGE8 | EDGE10, EDGE4 | EDGE5 | EDGE9, 0, 0 }, // 121
+        { EDGE0 | EDGE3 | EDGE4 | EDGE5 | EDGE6 | EDGE7 | EDGE10, 0, 0, 0 }, // 122
+        { EDGE4 | EDGE5 | EDGE6 | EDGE7 | EDGE8 | EDGE10, 0, 0, 0 }, // 123
+        { EDGE1 | EDGE3 | EDGE5 | EDGE6 | EDGE8 | EDGE9 | EDGE10, 0, 0, 0 }, // 124
+        { EDGE0 | EDGE1 | EDGE5 | EDGE6 | EDGE9 | EDGE10, 0, 0, 0 }, // 125
+        { EDGE0 | EDGE3 | EDGE8, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 126
+        { EDGE5 | EDGE6 | EDGE10, 0, 0, 0 }, // 127
+        { EDGE5 | EDGE6 | EDGE10, 0, 0, 0 }, // 128
+        { EDGE0 | EDGE3 | EDGE8, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 129
+        { EDGE0 | EDGE1 | EDGE9, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 130
+        { EDGE1 | EDGE3 | EDGE8 | EDGE9, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 131
+        { EDGE4 | EDGE7 | EDGE8, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 132
+        { EDGE0 | EDGE3 | EDGE4 | EDGE7, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 133
+        { EDGE0 | EDGE1 | EDGE9, EDGE4 | EDGE7 | EDGE8, EDGE5 | EDGE6 | EDGE10, 0 }, // 134
+        { EDGE1 | EDGE3 | EDGE4 | EDGE7 | EDGE9, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 135
+        { EDGE4 | EDGE6 | EDGE9 | EDGE10, 0, 0, 0 }, // 136
+        { EDGE0 | EDGE3 | EDGE8, EDGE4 | EDGE6 | EDGE9 | EDGE10, 0, 0 }, // 137
+        { EDGE0 | EDGE1 | EDGE4 | EDGE6 | EDGE10, 0, 0, 0 }, // 138
+        { EDGE1 | EDGE3 | EDGE4 | EDGE6 | EDGE8 | EDGE10, 0, 0, 0 }, // 139
+        { EDGE6 | EDGE7 | EDGE8 | EDGE9 | EDGE10, 0, 0, 0 }, // 140
+        { EDGE0 | EDGE3 | EDGE6 | EDGE7 | EDGE9 | EDGE10, 0, 0, 0 }, // 141
+        { EDGE0 | EDGE1 | EDGE6 | EDGE7 | EDGE8 | EDGE10, 0, 0, 0 }, // 142
+        { EDGE1 | EDGE3 | EDGE6 | EDGE7 | EDGE10, 0, 0, 0 }, // 143
+        { EDGE2 | EDGE3 | EDGE11, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 144
+        { EDGE0 | EDGE2 | EDGE8 | EDGE11, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 145
+        { EDGE0 | EDGE1 | EDGE9, EDGE2 | EDGE3 | EDGE11, EDGE5 | EDGE6 | EDGE10, 0 }, // 146
+        { EDGE1 | EDGE2 | EDGE8 | EDGE9 | EDGE11, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 147
+        { EDGE4 | EDGE7 | EDGE8, EDGE2 | EDGE3 | EDGE11, EDGE5 | EDGE6 | EDGE10, 0 }, // 148
+        { EDGE0 | EDGE2 | EDGE4 | EDGE7 | EDGE11, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 149
+        { EDGE0 | EDGE1 | EDGE9, EDGE4 | EDGE7 | EDGE8, EDGE2 | EDGE3 | EDGE11, EDGE5 | EDGE6 | EDGE10 }, // 150
+        { EDGE1 | EDGE2 | EDGE4 | EDGE7 | EDGE9 | EDGE11, EDGE5 | EDGE6 | EDGE10, 0, 0 }, // 151
+        { EDGE4 | EDGE6 | EDGE9 | EDGE10, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 152
+        { EDGE0 | EDGE2 | EDGE8 | EDGE11, EDGE4 | EDGE6 | EDGE9 | EDGE10, 0, 0 }, // 153
+        { EDGE0 | EDGE1 | EDGE4 | EDGE6 | EDGE10, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 154
+        { EDGE1 | EDGE2 | EDGE4 | EDGE6 | EDGE8 | EDGE10 | EDGE11, 0, 0, 0 }, // 155
+        { EDGE6 | EDGE7 | EDGE8 | EDGE9 | EDGE10, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 156
+        { EDGE0 | EDGE2 | EDGE6 | EDGE7 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 157
+        { EDGE0 | EDGE1 | EDGE6 | EDGE7 | EDGE8 | EDGE10, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 158
+        { EDGE1 | EDGE2 | EDGE6 | EDGE7 | EDGE10 | EDGE11, 0, 0, 0 }, // 159
+        { EDGE1 | EDGE2 | EDGE5 | EDGE6, 0, 0, 0 }, // 160
+        { EDGE0 | EDGE3 | EDGE8, EDGE1 | EDGE2 | EDGE5 | EDGE6, 0, 0 }, // 161
+        { EDGE0 | EDGE2 | EDGE5 | EDGE6 | EDGE9, 0, 0, 0 }, // 162
+        { EDGE2 | EDGE3 | EDGE5 | EDGE6 | EDGE8 | EDGE9, 0, 0, 0 }, // 163
+        { EDGE4 | EDGE7 | EDGE8, EDGE1 | EDGE2 | EDGE5 | EDGE6, 0, 0 }, // 164
+        { EDGE0 | EDGE3 | EDGE4 | EDGE7, EDGE1 | EDGE2 | EDGE5 | EDGE6, 0, 0 }, // 165
+        { EDGE0 | EDGE2 | EDGE5 | EDGE6 | EDGE9, EDGE4 | EDGE7 | EDGE8, 0, 0 }, // 166
+        { EDGE2 | EDGE3 | EDGE4 | EDGE5 | EDGE6 | EDGE7 | EDGE9, 0, 0, 0 }, // 167
+        { EDGE1 | EDGE2 | EDGE4 | EDGE6 | EDGE9, 0, 0, 0 }, // 168
+        { EDGE0 | EDGE3 | EDGE8, EDGE1 | EDGE2 | EDGE4 | EDGE6 | EDGE9, 0, 0 }, // 169
+        { EDGE0 | EDGE2 | EDGE4 | EDGE6, 0, 0, 0 }, // 170
+        { EDGE2 | EDGE3 | EDGE4 | EDGE6 | EDGE8, 0, 0, 0 }, // 171
+        { EDGE1 | EDGE2 | EDGE6 | EDGE7 | EDGE8 | EDGE9, 0, 0, 0 }, // 172
+        { EDGE0 | EDGE1 | EDGE2 | EDGE3 | EDGE6 | EDGE7 | EDGE9, 0, 0, 0 }, // 173
+        { EDGE0 | EDGE2 | EDGE6 | EDGE7 | EDGE8, 0, 0, 0 }, // 174
+        { EDGE2 | EDGE3 | EDGE6 | EDGE7, 0, 0, 0 }, // 175
+        { EDGE1 | EDGE3 | EDGE5 | EDGE6 | EDGE11, 0, 0, 0 }, // 176
+        { EDGE0 | EDGE1 | EDGE5 | EDGE6 | EDGE8 | EDGE11, 0, 0, 0 }, // 177
+        { EDGE0 | EDGE3 | EDGE5 | EDGE6 | EDGE9 | EDGE11, 0, 0, 0 }, // 178
+        { EDGE5 | EDGE6 | EDGE8 | EDGE9 | EDGE11, 0, 0, 0 }, // 179
+        { EDGE4 | EDGE7 | EDGE8, EDGE1 | EDGE3 | EDGE5 | EDGE6 | EDGE11, 0, 0 }, // 180
+        { EDGE0 | EDGE1 | EDGE4 | EDGE5 | EDGE6 | EDGE7 | EDGE11, 0, 0, 0 }, // 181
+        { EDGE0 | EDGE3 | EDGE5 | EDGE6 | EDGE9 | EDGE11, EDGE4 | EDGE7 | EDGE8, 0, 0 }, // 182
+        { EDGE4 | EDGE5 | EDGE6 | EDGE7 | EDGE9 | EDGE11, 0, 0, 0 }, // 183
+        { EDGE1 | EDGE3 | EDGE4 | EDGE6 | EDGE9 | EDGE11, 0, 0, 0 }, // 184
+        { EDGE0 | EDGE1 | EDGE4 | EDGE6 | EDGE8 | EDGE9 | EDGE11, 0, 0, 0 }, // 185
+        { EDGE0 | EDGE3 | EDGE4 | EDGE6 | EDGE11, 0, 0, 0 }, // 186
+        { EDGE4 | EDGE6 | EDGE8 | EDGE11, 0, 0, 0 }, // 187
+        { EDGE1 | EDGE3 | EDGE6 | EDGE7 | EDGE8 | EDGE9 | EDGE11, 0, 0, 0 }, // 188
+        { EDGE0 | EDGE1 | EDGE9, EDGE6 | EDGE7 | EDGE11, 0, 0 }, // 189
+        { EDGE0 | EDGE3 | EDGE6 | EDGE7 | EDGE8 | EDGE11, 0, 0, 0 }, // 190
+        { EDGE6 | EDGE7 | EDGE11, 0, 0, 0 }, // 191
+        { EDGE5 | EDGE7 | EDGE10 | EDGE11, 0, 0, 0 }, // 192
+        { EDGE0 | EDGE3 | EDGE8, EDGE5 | EDGE7 | EDGE10 | EDGE11, 0, 0 }, // 193
+        { EDGE0 | EDGE1 | EDGE9, EDGE5 | EDGE7 | EDGE10 | EDGE11, 0, 0 }, // 194
+        { EDGE1 | EDGE3 | EDGE8 | EDGE9, EDGE5 | EDGE7 | EDGE10 | EDGE11, 0, 0 }, // 195
+        { EDGE4 | EDGE5 | EDGE8 | EDGE10 | EDGE11, 0, 0, 0 }, // 196
+        { EDGE0 | EDGE3 | EDGE4 | EDGE5 | EDGE10 | EDGE11, 0, 0, 0 }, // 197
+        { EDGE0 | EDGE1 | EDGE9, EDGE4 | EDGE5 | EDGE8 | EDGE10 | EDGE11, 0, 0 }, // 198
+        { EDGE1 | EDGE3 | EDGE4 | EDGE5 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 199
+        { EDGE4 | EDGE7 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 200
+        { EDGE0 | EDGE3 | EDGE8, EDGE4 | EDGE7 | EDGE9 | EDGE10 | EDGE11, 0, 0 }, // 201
+        { EDGE0 | EDGE1 | EDGE4 | EDGE7 | EDGE10 | EDGE11, 0, 0, 0 }, // 202
+        { EDGE1 | EDGE3 | EDGE4 | EDGE7 | EDGE8 | EDGE10 | EDGE11, 0, 0, 0 }, // 203
+        { EDGE8 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 204
+        { EDGE0 | EDGE3 | EDGE9 | EDGE10 | EDGE11, 0, 0, 0 }, // 205
+        { EDGE0 | EDGE1 | EDGE8 | EDGE10 | EDGE11, 0, 0, 0 }, // 206
+        { EDGE1 | EDGE3 | EDGE10 | EDGE11, 0, 0, 0 }, // 207
+        { EDGE2 | EDGE3 | EDGE5 | EDGE7 | EDGE10, 0, 0, 0 }, // 208
+        { EDGE0 | EDGE2 | EDGE5 | EDGE7 | EDGE8 | EDGE10, 0, 0, 0 }, // 209
+        { EDGE0 | EDGE1 | EDGE9, EDGE2 | EDGE3 | EDGE5 | EDGE7 | EDGE10, 0, 0 }, // 210
+        { EDGE1 | EDGE2 | EDGE5 | EDGE7 | EDGE8 | EDGE9 | EDGE10, 0, 0, 0 }, // 211
+        { EDGE2 | EDGE3 | EDGE4 | EDGE5 | EDGE8 | EDGE10, 0, 0, 0 }, // 212
+        { EDGE0 | EDGE2 | EDGE4 | EDGE5 | EDGE10, 0, 0, 0 }, // 213
+        { EDGE0 | EDGE1 | EDGE9, EDGE2 | EDGE3 | EDGE4 | EDGE5 | EDGE8 | EDGE10, 0, 0 }, // 214
+        { EDGE1 | EDGE2 | EDGE4 | EDGE5 | EDGE9 | EDGE10, 0, 0, 0 }, // 215
+        { EDGE2 | EDGE3 | EDGE4 | EDGE7 | EDGE9 | EDGE10, 0, 0, 0 }, // 216
+        { EDGE0 | EDGE2 | EDGE4 | EDGE7 | EDGE8 | EDGE9 | EDGE10, 0, 0, 0 }, // 217
+        { EDGE0 | EDGE1 | EDGE2 | EDGE3 | EDGE4 | EDGE7 | EDGE10, 0, 0, 0 }, // 218
+        { EDGE4 | EDGE7 | EDGE8, EDGE1 | EDGE2 | EDGE10, 0, 0 }, // 219
+        { EDGE2 | EDGE3 | EDGE8 | EDGE9 | EDGE10, 0, 0, 0 }, // 220
+        { EDGE0 | EDGE2 | EDGE9 | EDGE10, 0, 0, 0 }, // 221
+        { EDGE0 | EDGE1 | EDGE2 | EDGE3 | EDGE8 | EDGE10, 0, 0, 0 }, // 222
+        { EDGE1 | EDGE2 | EDGE10, 0, 0, 0 }, // 223
+        { EDGE1 | EDGE2 | EDGE5 | EDGE7 | EDGE11, 0, 0, 0 }, // 224
+        { EDGE0 | EDGE3 | EDGE8, EDGE1 | EDGE2 | EDGE5 | EDGE7 | EDGE11, 0, 0 }, // 225
+        { EDGE0 | EDGE2 | EDGE5 | EDGE7 | EDGE9 | EDGE11, 0, 0, 0 }, // 226
+        { EDGE2 | EDGE3 | EDGE5 | EDGE7 | EDGE8 | EDGE9 | EDGE11, 0, 0, 0 }, // 227
+        { EDGE1 | EDGE2 | EDGE4 | EDGE5 | EDGE8 | EDGE11, 0, 0, 0 }, // 228
+        { EDGE0 | EDGE1 | EDGE2 | EDGE3 | EDGE4 | EDGE5 | EDGE11, 0, 0, 0 }, // 229
+        { EDGE0 | EDGE2 | EDGE4 | EDGE5 | EDGE8 | EDGE9 | EDGE11, 0, 0, 0 }, // 230
+        { EDGE4 | EDGE5 | EDGE9, EDGE2 | EDGE3 | EDGE11, 0, 0 }, // 231
+        { EDGE1 | EDGE2 | EDGE4 | EDGE7 | EDGE9 | EDGE11, 0, 0, 0 }, // 232
+        { EDGE0 | EDGE3 | EDGE8, EDGE1 | EDGE2 | EDGE4 | EDGE7 | EDGE9 | EDGE11, 0, 0 }, // 233
+        { EDGE0 | EDGE2 | EDGE4 | EDGE7 | EDGE11, 0, 0, 0 }, // 234
+        { EDGE2 | EDGE3 | EDGE4 | EDGE7 | EDGE8 | EDGE11, 0, 0, 0 }, // 235
+        { EDGE1 | EDGE2 | EDGE8 | EDGE9 | EDGE11, 0, 0, 0 }, // 236
+        { EDGE0 | EDGE1 | EDGE2 | EDGE3 | EDGE9 | EDGE11, 0, 0, 0 }, // 237
+        { EDGE0 | EDGE2 | EDGE8 | EDGE11, 0, 0, 0 }, // 238
+        { EDGE2 | EDGE3 | EDGE11, 0, 0, 0 }, // 239
+        { EDGE1 | EDGE3 | EDGE5 | EDGE7, 0, 0, 0 }, // 240
+        { EDGE0 | EDGE1 | EDGE5 | EDGE7 | EDGE8, 0, 0, 0 }, // 241
+        { EDGE0 | EDGE3 | EDGE5 | EDGE7 | EDGE9, 0, 0, 0 }, // 242
+        { EDGE5 | EDGE7 | EDGE8 | EDGE9, 0, 0, 0 }, // 243
+        { EDGE1 | EDGE3 | EDGE4 | EDGE5 | EDGE8, 0, 0, 0 }, // 244
+        { EDGE0 | EDGE1 | EDGE4 | EDGE5, 0, 0, 0 }, // 245
+        { EDGE0 | EDGE3 | EDGE4 | EDGE5 | EDGE8 | EDGE9, 0, 0, 0 }, // 246
+        { EDGE4 | EDGE5 | EDGE9, 0, 0, 0 }, // 247
+        { EDGE1 | EDGE3 | EDGE4 | EDGE7 | EDGE9, 0, 0, 0 }, // 248
+        { EDGE0 | EDGE1 | EDGE4 | EDGE7 | EDGE8 | EDGE9, 0, 0, 0 }, // 249
+        { EDGE0 | EDGE3 | EDGE4 | EDGE7, 0, 0, 0 }, // 250
+        { EDGE4 | EDGE7 | EDGE8, 0, 0, 0 }, // 251
+        { EDGE1 | EDGE3 | EDGE8 | EDGE9, 0, 0, 0 }, // 252
+        { EDGE0 | EDGE1 | EDGE9, 0, 0, 0 }, // 253
+        { EDGE0 | EDGE3 | EDGE8, 0, 0, 0 }, // 254
+        { 0, 0, 0, 0 } // 255
+    };
+
+} // namespace DMC
diff --git a/NoiseTool/ImGuiExtra.h b/tools/NodeEditor/util/ImGuiExtra.h
similarity index 89%
rename from NoiseTool/ImGuiExtra.h
rename to tools/NodeEditor/util/ImGuiExtra.h
index da3e41af..3978ad46 100644
--- a/NoiseTool/ImGuiExtra.h
+++ b/tools/NodeEditor/util/ImGuiExtra.h
@@ -34,14 +34,18 @@ namespace ImGuiExtra
             if( ImGui::GetIO().MouseWheel < 0 && *comboIndex < comboCount - 1 )
             {
                 (*comboIndex)++;
+                ImGui::GetIO().MouseWheel = 0;
                 return true;
             }
 
             if( ImGui::GetIO().MouseWheel > 0 && *comboIndex > 0 )
             {
                 (*comboIndex)--;
+                ImGui::GetIO().MouseWheel = 0;
                 return true;
             }
+
+            ImGui::GetIO().MouseWheel = 0;
         }
         return false;
     }
diff --git a/tools/NodeEditor/util/SharedMemoryIpc.inl b/tools/NodeEditor/util/SharedMemoryIpc.inl
new file mode 100644
index 00000000..380c0d5b
--- /dev/null
+++ b/tools/NodeEditor/util/SharedMemoryIpc.inl
@@ -0,0 +1,121 @@
+#ifndef __EMSCRIPTEN__
+#ifdef _WIN32
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#else
+#include <fcntl.h> // For O_* constants
+#include <sys/mman.h> // For shared memory
+#include <sys/stat.h> // For mode constants
+#include <unistd.h>
+#endif
+#endif
+
+static constexpr const char* kSharedMemoryName = "/FastNoise2NodeEditor";
+static constexpr unsigned int kSharedMemorySize = 64 * 1024;
+
+// Setup shared memory for IPC selected node ENT updates; returns the kSharedMemorySize byte mapping, or nullptr on failure / Emscripten
+void* FastNoiseNodeEditor::SetupSharedMemoryIpc()
+{
+#ifdef __EMSCRIPTEN__
+    return nullptr;
+#elif defined( _WIN32 ) // Same compiler-defined macro that guards the <Windows.h> include
+    // Create a shared memory file mapping; the explicit ANSI entry point matches the narrow name even when UNICODE is defined
+    HANDLE hMapFile = CreateFileMappingA(
+        INVALID_HANDLE_VALUE, // Use paging file - shared memory
+        NULL, // Default security attributes
+        PAGE_READWRITE, // Read/write access
+        0, // Maximum object size (high-order DWORD)
+        kSharedMemorySize, // Maximum object size (low-order DWORD)
+        kSharedMemoryName ); // Name of mapping object
+
+    if( hMapFile == NULL )
+    {
+        Debug {} << "Failed to create IPC shared memory object" << GetLastError();
+        return nullptr;
+    }
+
+    // Map a view of the file mapping into the address space of the current process
+    void* ptr = MapViewOfFile( hMapFile, // Handle to map object
+        FILE_MAP_ALL_ACCESS, // Read/write permission
+        0, // File offset (high-order DWORD)
+        0, // File offset (low-order DWORD)
+        kSharedMemorySize );
+    if( !ptr )
+    {
+        Debug {} << "Failed to map IPC shared memory" << GetLastError();
+        CloseHandle( hMapFile ); // Nothing was mapped, so do not leak the mapping handle
+    }
+    return ptr;
+
+#else
+    // Create the shared memory object
+    int shmFd = shm_open( kSharedMemoryName, O_CREAT | O_RDWR, 0666 );
+    if( shmFd == -1 )
+    {
+        Debug {} << "Failed to create IPC shared memory object";
+        return nullptr;
+    }
+
+    // Configure the size of the shared memory object (EINVAL is tolerated: it can just mean it is already the right size)
+    if( ftruncate( shmFd, kSharedMemorySize ) == -1 && errno != EINVAL )
+    {
+        Debug {} << "Failed to config IPC shared memory object";
+        close( shmFd ); // Do not leak the descriptor on the error path
+        return nullptr;
+    }
+
+    // Memory map the shared memory object
+    void* ptr = mmap( 0, kSharedMemorySize, PROT_READ | PROT_WRITE, MAP_SHARED, shmFd, 0 );
+    close( shmFd ); // The mapping holds its own reference, the descriptor is no longer needed
+
+    if( ptr == MAP_FAILED )
+    {
+        Debug {} << "Failed to map IPC shared memory object";
+        return nullptr;
+    }
+    return ptr;
+#endif
+}
+
+void FastNoiseNodeEditor::ReleaseSharedMemoryIpc() // Removes the POSIX shared memory name; does nothing on Windows and Emscripten
+{
+#if !defined( _WIN32 ) && !defined( __EMSCRIPTEN__ ) // _WIN32 is the compiler-defined macro, matching the include guard at the top of this file
+    shm_unlink( kSharedMemoryName );
+#endif
+}
+
+// Poll for changes in the shared memory space (byte 0 = change counter, byte 1 = data type, payload text starts at byte 2)
+void FastNoiseNodeEditor::DoIpcPolling()
+{
+    const void* sharedMemory = mNodeEditorApp.GetIpcSharedMemory();
+
+    if( sharedMemory )
+    {
+        const unsigned char sharedCounter = *static_cast<const unsigned char*>( sharedMemory );
+        const unsigned char dataType = *( static_cast<const unsigned char*>( sharedMemory ) + 1 );
+
+        // Invalidate the counter to read initial stale data only if it's type 0
+        static int counter = ( dataType == 0 ) ? 0xFFFFFF : sharedCounter;
+
+        if( sharedCounter != counter )
+        {
+            counter = sharedCounter;
+            switch( dataType ) // Check type
+            {
+            default:
+                Debug {} << "Unknown IPC data type" << dataType;
+                break;
+            case 0: // Selected node ENT
+            {
+                const char* payload = static_cast<const char*>( sharedMemory ) + 2;
+                unsigned int length = 0; // Bounded scan: the region is shared, so never look for the terminator past the end of the mapping
+                while( length < kSharedMemorySize - 2 && payload[length] != '\0' ) { length++; }
+                std::string newEncodedNodeTree( payload, length );
+                SetPreviewGenerator( newEncodedNodeTree );
+            }
+            break;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt
new file mode 100644
index 00000000..87ceb847
--- /dev/null
+++ b/util/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(WikiGenerator) # Pulls in util/WikiGenerator/CMakeLists.txt
\ No newline at end of file
diff --git a/util/WikiGenerator/CMakeLists.txt b/util/WikiGenerator/CMakeLists.txt
new file mode 100644
index 00000000..242b4225
--- /dev/null
+++ b/util/WikiGenerator/CMakeLists.txt
@@ -0,0 +1,8 @@
+# WikiGenerator: executable built from main.cpp and linked against the FastNoise target
+add_executable(WikiGenerator
+    main.cpp
+)
+
+target_link_libraries(WikiGenerator PRIVATE
+    FastNoise
+)
\ No newline at end of file
diff --git a/util/WikiGenerator/main.cpp b/util/WikiGenerator/main.cpp
new file mode 100644
index 00000000..7711b7cd
--- /dev/null
+++ b/util/WikiGenerator/main.cpp
@@ -0,0 +1,238 @@
+#include <filesystem>
+#include <FastNoise/Metadata.h>
+#include <FastNoise/FastNoise.h>
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <regex>
+#include <unordered_map>
+
+static constexpr int imageSizeX = 256;
+static constexpr int imageSizeY = 256;
+
+FastNoise::SmartNode<> BuildGenerator( const FastNoise::Metadata* metadata )
+{
+    // Creates a node of the given type and fills every source slot: a constant 0.5 where accepted, otherwise the first other node type that fits
+    FastNoise::SmartNode<> generator = metadata->CreateNode();
+    auto constantSource = FastNoise::New<FastNoise::Constant>();
+    constantSource->SetValue( 0.5f );
+
+    for( const auto& sourceSlot : metadata->memberNodeLookups )
+    {
+        if( sourceSlot.setFunc( generator.get(), constantSource ) )
+        {
+            continue; // The constant was accepted, this slot is done
+        }
+
+        // The constant was rejected: try a generator of every registered node type, in order
+        for( const FastNoise::Metadata* candidateMetadata : FastNoise::Metadata::GetAll() )
+        {
+            // Candidates are built recursively because they may have source slots of their own
+            FastNoise::SmartNode<> candidate = BuildGenerator( candidateMetadata );
+            if( !candidate || !sourceSlot.setFunc( generator.get(), candidate ) )
+            {
+                continue;
+            }
+            for( const auto& candidateSlot : candidateMetadata->memberNodeLookups )
+            {
+                // Abandon the whole generator if the accepted candidate cannot take the constant itself
+                if( !candidateSlot.setFunc( candidate.get(), constantSource ) ) { return {}; }
+            }
+            break; // The first accepted candidate wins
+        }
+    }
+    return generator;
+}
+
+// Renders the node type as an imageSizeX x imageSizeY greyscale PNG at <outDir>/images/<nodeName>.png; returns false when no image was produced
+bool CreateImage( const FastNoise::Metadata* metadata, const std::string& outDir, const std::string& nodeName )
+{
+    auto node = FastNoise::New<FastNoise::DomainScale>();
+    node->SetSource( BuildGenerator( metadata ) );
+    node->SetScaling( 3.f );
+
+    std::vector<float> noiseData( imageSizeX * imageSizeY );
+    auto noiseMinMax = node->GenUniformGrid2D( noiseData.data(), imageSizeX / -2, imageSizeY / -2, imageSizeX, imageSizeY, 1337 );
+    // A flat or non-finite result cannot be normalised into the 0-255 greyscale range
+    if( noiseMinMax.min == noiseMinMax.max || !std::isfinite( noiseMinMax.min  ) || !std::isfinite( noiseMinMax.max ) )
+    {
+        return false;
+    }
+
+    std::filesystem::path tempFile = std::filesystem::temp_directory_path() / (nodeName + ".bmp");
+
+    std::ofstream file( tempFile, std::ofstream::binary | std::ofstream::out | std::ofstream::trunc );
+
+    if( file.is_open() )
+    {
+        float scale = 255 / (noiseMinMax.max - noiseMinMax.min);
+
+        struct BmpHeader
+        {
+            // File header (14)
+            // char b = 'B';
+            // char m = 'M';
+            uint32_t fileSize;
+            uint32_t reserved = 0;
+            uint32_t dataOffset = 14u + 12u + (256u * 3u);
+            // Bmp Info Header (12)
+            uint32_t headerSize = 12u;
+            uint16_t sizeX;
+            uint16_t sizeY;
+            uint16_t colorPlanes = 1u;
+            uint16_t bitDepth = 8u;
+        };
+
+        int paddedSizeX = imageSizeX; // Row length in bytes once rounded up to a multiple of 4
+        int padding = paddedSizeX % 4;
+        if( padding )
+        {
+            padding = 4 - padding;
+            paddedSizeX += padding;
+        }
+
+        BmpHeader header;
+        header.fileSize = header.dataOffset + (uint32_t)(paddedSizeX * imageSizeY);
+        header.sizeX = (uint16_t)imageSizeX;
+        header.sizeY = (uint16_t)imageSizeY;
+
+        file << 'B' << 'M'; // Signature bytes are written separately from the struct
+        file.write( reinterpret_cast<char*>( &header ), sizeof( BmpHeader ) );
+
+        // Colour map
+        for (int i = 0; i < 256; i++)
+        {
+            char colourB = i;
+            file.write( &colourB, 1 );
+            file.write( &colourB, 1 );
+            file.write( &colourB, 1 );
+        }
+
+        int xIdx = padding ? imageSizeX : 0; // Counts down the pixels left in the current row; unused when rows need no padding
+
+        for( float noise : noiseData )
+        {
+            unsigned char pix = (unsigned char)std::clamp( (noise - noiseMinMax.min) * scale, 0.0f, 255.0f );
+
+            file.write( reinterpret_cast<char*>( &pix ), 1 );
+
+            if( --xIdx == 0 )
+            {
+                xIdx = imageSizeX;
+
+                int zero( 0 );
+                file.write( reinterpret_cast<char*>( &zero ), padding );
+            }
+        }
+
+        file.close();
+
+        std::string convertCmd = "magick convert \"";
+        convertCmd += tempFile.string();
+        convertCmd += "\" \"" + outDir + "/images/" + nodeName + ".png\""; // Closing quote is required, an unterminated quote is a syntax error in POSIX shells
+        // Only report success when the conversion command itself succeeded
+        return std::system( convertCmd.c_str() ) == 0;
+    }
+    return false;
+}
+
+// Returns description with "<br/>" inserted before every newline; a null description yields an empty string
+std::string FormatDescription( const char* description )
+{
+    std::string formatted = description ? description : ""; // Constructing std::string from nullptr is undefined behaviour
+    size_t pos = 0;
+
+    while( (pos = formatted.find( '\n', pos )) != std::string::npos )
+    {
+        formatted.insert( pos, "<br/>" );
+        pos += 6; // Skip the inserted "<br/>" and the newline that now follows it
+    }
+    return formatted;
+}
+
+// Appends the wiki section for one node type: heading, description, preview image link (when generated), then every member
+void DoNode( std::stringstream& output, const FastNoise::Metadata* metadata, const std::string& outDir )
+{
+    const std::string nodeName = FastNoise::Metadata::FormatMetadataNodeName( metadata, false );
+
+    output << "## " << nodeName << '\n' << FormatDescription( metadata->description ) << "\n\n";
+
+    if( CreateImage( metadata, outDir, nodeName ) )
+    {
+        output << "[[images/" << nodeName << ".png]]\n";
+    }
+
+    for( auto& nodeLookup : metadata->memberNodeLookups )
+    {
+        output << "### " << nodeLookup.name << " _- Node Lookup_\n" << FormatDescription( nodeLookup.description ) << '\n';
+    }
+
+    for( auto& hybridLookup : metadata->memberHybrids )
+    {
+        output << "### " << hybridLookup.name << " `= " << hybridLookup.valueDefault << "f` _- Hybrid Lookup_\n" << FormatDescription( hybridLookup.description ) << '\n';
+    }
+
+    for( auto& variable : metadata->memberVariables )
+    {
+        using MemberVariable = FastNoise::Metadata::MemberVariable;
+        if( variable.type == MemberVariable::EFloat )
+        {
+            output << "### " << FastNoise::Metadata::FormatMetadataMemberName( variable ) << " `= " << variable.valueDefault.f << "f`\n" << FormatDescription( variable.description ) << '\n';
+        }
+        else if( variable.type == MemberVariable::EInt )
+        {
+            output << "### " << FastNoise::Metadata::FormatMetadataMemberName( variable ) << " `= " << variable.valueDefault.i << "`\n" << FormatDescription( variable.description ) << '\n';
+        }
+        else if( variable.type == MemberVariable::EEnum )
+        {
+            output << "### " << FastNoise::Metadata::FormatMetadataMemberName( variable ) << " `= " << variable.enumNames[variable.valueDefault.i] << "` _- Enum_\n" << FormatDescription( variable.description ) << '\n';
+            for( size_t enumIdx = 0; enumIdx < variable.enumNames.size(); enumIdx++ ) // One bullet per enum name, flagging the default
+            {
+                output << "* " << variable.enumNames[enumIdx] << (variable.valueDefault.i == enumIdx ? " (Default)\n" : "\n");
+            }
+        }
+    }
+}
+
+// Generates one markdown page per node group (plus images via DoNode) in the directory named by
+// argv[1], defaulting to ".". Returns 1 as soon as a page cannot be written, otherwise 0.
+int main( int argc, char* argv[] )
+{
+    std::string outputDir = ".";
+    if( argc > 1 )
+    {
+        outputDir = argv[1];
+        std::filesystem::create_directories( outputDir );
+    }
+    std::filesystem::create_directories( outputDir + "/images" );
+
+    std::unordered_map<std::string, std::stringstream> outputStreams;
+    for( const FastNoise::Metadata* metadata : FastNoise::Metadata::GetAll() )
+    {
+        const char* groupName = metadata->groups[0]; // Nodes are filed under their first group
+        auto [entry, isNewGroup] = outputStreams.try_emplace( groupName );
+        if( isNewGroup )
+        {
+            entry->second << "# " << groupName << '\n';
+            entry->second.setf( std::ios::fixed );
+            entry->second.precision( 1 );
+        }
+        DoNode( entry->second, metadata, outputDir );
+    }
+
+    for( auto& stream : outputStreams )
+    {
+        std::string fileName = stream.first;
+        std::replace( fileName.begin(), fileName.end(), ' ', '-' );
+        std::ofstream outFile( outputDir + "/Nodes#-" + fileName + ".md" );
+        outFile << stream.second.str();
+        outFile.close();
+        if( !outFile ) // Set when the open, the write or the close failed
+        {
+            std::cerr << "Failed to write " << fileName << ".md\n";
+            return 1;
+        }
+        std::cout << "Written " << fileName << ".md\n";
+    }
+}
\ No newline at end of file