From 2477bad06b77aa03bbc9a1a94f9a1cc9f0d512d2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 25 Jul 2024 11:18:35 +0200 Subject: [PATCH 001/368] start metal backend --- CMakeLists.txt | 7 +- src/Cafe/CMakeLists.txt | 10 + src/Cafe/GraphicPack/GraphicPack2.cpp | 42 ++-- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 198 ++++++++++++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.h | 142 +++++++++++++ src/Cafe/HW/Latte/Renderer/Renderer.h | 7 +- src/Cemu/Logging/CemuLogging.cpp | 3 +- src/Cemu/Logging/CemuLogging.h | 5 +- src/config/CemuConfig.h | 19 +- src/gui/CMakeLists.txt | 4 +- src/gui/GameProfileWindow.cpp | 24 ++- src/gui/GeneralSettings2.cpp | 86 ++++---- src/gui/LoggingWindow.cpp | 5 +- src/gui/MainWindow.cpp | 61 +++--- src/gui/PadViewFrame.cpp | 9 +- src/gui/canvas/MetalCanvas.cpp | 63 ++++++ src/gui/canvas/MetalCanvas.h | 19 ++ src/gui/guiWrapper.cpp | 9 +- src/imgui/CMakeLists.txt | 2 + 19 files changed, 586 insertions(+), 129 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h create mode 100644 src/gui/canvas/MetalCanvas.cpp create mode 100644 src/gui/canvas/MetalCanvas.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b5f38819..c3b940f9d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,6 +78,7 @@ endif() option(ENABLE_OPENGL "Enables the OpenGL backend" ON) option(ENABLE_VULKAN "Enables the Vulkan backend" ON) +option(ENABLE_METAL "Enables the Metal backend" ON) option(ENABLE_DISCORD_RPC "Enables the Discord Rich Presence feature" ON) @@ -190,9 +191,9 @@ if (ENABLE_WXWIDGETS) endif() if (ENABLE_CUBEB) - if (NOT ENABLE_VCPKG) - find_package(cubeb) - endif() + #if (NOT ENABLE_VCPKG) + #find_package(cubeb) + #endif() if (NOT cubeb_FOUND) option(BUILD_TESTS "" OFF) option(BUILD_TOOLS "" OFF) diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 0fb7a44bb..f7f256445 100644 --- a/src/Cafe/CMakeLists.txt +++ 
b/src/Cafe/CMakeLists.txt @@ -524,6 +524,16 @@ if(APPLE) target_sources(CemuCafe PRIVATE "HW/Latte/Renderer/Vulkan/CocoaSurface.mm") endif() +if(ENABLE_METAL) + if(APPLE) + target_sources(CemuCafe PRIVATE + HW/Latte/Renderer/Metal/MetalRenderer.cpp + ) + else() + message(FATAL_ERROR "Metal is only supported on macOS") + endif() +endif() + set_property(TARGET CemuCafe PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") target_include_directories(CemuCafe PUBLIC "../") diff --git a/src/Cafe/GraphicPack/GraphicPack2.cpp b/src/Cafe/GraphicPack/GraphicPack2.cpp index 27d423b9d..3fbfb518e 100644 --- a/src/Cafe/GraphicPack/GraphicPack2.cpp +++ b/src/Cafe/GraphicPack/GraphicPack2.cpp @@ -109,7 +109,7 @@ bool GraphicPack2::LoadGraphicPack(const fs::path& rulesPath, IniParser& rules) gp->SetActivePreset(kv.first, kv.second, false); } - + gp->SetEnabled(enabled); } @@ -141,7 +141,7 @@ bool GraphicPack2::DeactivateGraphicPack(const std::shared_ptr& gr if (!graphic_pack->IsActivated()) return false; - const auto it = std::find_if(s_active_graphic_packs.begin(), s_active_graphic_packs.end(), + const auto it = std::find_if(s_active_graphic_packs.begin(), s_active_graphic_packs.end(), [graphic_pack](const GraphicPackPtr& gp) { return gp->GetNormalizedPathString() == graphic_pack->GetNormalizedPathString(); @@ -269,6 +269,8 @@ GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) m_renderer_api = RendererAPI::Vulkan; else if (boost::iequals(*option_rendererFilter, "opengl")) m_renderer_api = RendererAPI::OpenGL; + else if (boost::iequals(*option_rendererFilter, "metal")) + m_renderer_api = RendererAPI::Metal; else cemuLog_log(LogType::Force, "Unknown value '{}' for rendererFilter option", *option_rendererFilter); } @@ -348,7 +350,7 @@ GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) cemuLog_log(LogType::Force, "Graphic pack \"{}\": Preset in line {} skipped because it has no name option defined", m_name, rules.GetCurrentSectionLineNumber()); 
continue; } - + const auto category = rules.FindOption("category"); const auto condition = rules.FindOption("condition"); const auto default_selected = rules.FindOption("default"); @@ -420,12 +422,12 @@ GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) { // store by category std::unordered_map> tmp_map; - + // all vars must be defined in the default preset vars before for (const auto& entry : m_presets) { tmp_map[entry->category].emplace_back(entry); - + for (auto& kv : entry->variables) { const auto it = m_preset_vars.find(kv.first); @@ -560,7 +562,7 @@ void GraphicPack2::ValidatePresetSelections() // // example: a preset category might be hidden entirely (e.g. due to a separate advanced options dropdown) // how to handle: leave the previously selected preset - // + // // the logic is therefore as follows: // if there is a preset category with at least 1 visible preset entry then make sure one of those is actually selected // for completely hidden preset categories we leave the selection as-is @@ -624,17 +626,17 @@ bool GraphicPack2::SetActivePreset(std::string_view category, std::string_view n // disable currently active preset std::for_each(m_presets.begin(), m_presets.end(), [category](PresetPtr& p) { - if(p->category == category) + if(p->category == category) p->active = false; }); - + if (name.empty()) return true; - + // enable new preset const auto it = std::find_if(m_presets.cbegin(), m_presets.cend(), [category, name](const PresetPtr& preset) { - return preset->category == category && preset->name == name; + return preset->category == category && preset->name == name; }); bool result; @@ -775,7 +777,7 @@ std::optional GraphicPack2::GetPresetVariable(const std return it->second; } } - + for (const auto& preset : presets) { if (!preset->visible) @@ -785,7 +787,7 @@ std::optional GraphicPack2::GetPresetVariable(const std return it->second; } } - + const auto it = std::find_if(m_preset_vars.cbegin(), m_preset_vars.cend(), [&var_name](auto p) { 
return p.first == var_name; }); if (it != m_preset_vars.cend()) { @@ -831,7 +833,7 @@ void GraphicPack2::_iterateReplacedFiles(const fs::path& currentPath, bool isAOC virtualMountPath = fs::path("vol/content/") / virtualMountPath; } fscDeviceRedirect_add(virtualMountPath.generic_string(), it.path().generic_string(), m_fs_priority); - } + } } } @@ -851,7 +853,7 @@ void GraphicPack2::LoadReplacedFiles() std::error_code ec; if (fs::exists(contentPath, ec)) { - // setup redirections + // setup redirections fscDeviceRedirect_map(); _iterateReplacedFiles(contentPath, false); } @@ -864,7 +866,7 @@ void GraphicPack2::LoadReplacedFiles() uint64 aocTitleId = CafeSystem::GetForegroundTitleId(); aocTitleId = aocTitleId & 0xFFFFFFFFULL; aocTitleId |= 0x0005000c00000000ULL; - // setup redirections + // setup redirections fscDeviceRedirect_map(); _iterateReplacedFiles(aocPath, true); } @@ -980,7 +982,7 @@ bool GraphicPack2::Activate() // enable patch groups EnablePatches(); - + // load replaced files LoadReplacedFiles(); @@ -1026,7 +1028,7 @@ bool GraphicPack2::Deactivate() m_output_shader_source.clear(); m_upscaling_shader_source.clear(); m_downscaling_shader_source.clear(); - + if (HasCustomVSyncFrequency()) { m_vsync_frequency = -1; @@ -1058,7 +1060,7 @@ const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, u std::unordered_map> GraphicPack2::GetCategorizedPresets(std::vector& order) const { order.clear(); - + std::unordered_map> result; for(const auto& entry : m_presets) { @@ -1067,13 +1069,13 @@ std::unordered_map> GraphicPac if (it == order.cend()) order.emplace_back(entry->category); } - + return result; } bool GraphicPack2::HasShaders() const { - return !GetCustomShaders().empty() + return !GetCustomShaders().empty() || !m_output_shader_source.empty() || !m_upscaling_shader_source.empty() || !m_downscaling_shader_source.empty(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp 
new file mode 100644 index 000000000..0e0bfb9a0 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -0,0 +1,198 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { + /* + const auto& windowInfo = gui_getWindowInfo().window_main; + + NSView* view = (NS::View*)handle; + + MetalView* childView = [[MetalView alloc] initWithFrame:view.bounds]; + childView.autoresizingMask = NSViewWidthSizable | NSViewHeightSizable; + childView.wantsLayer = YES; + + [view addSubview:childView]; + + VkMetalSurfaceCreateInfoEXT surface; + surface.sType = VK_STRUCTURE_TYPE_METAL_SURFACE_CREATE_INFO_EXT; + surface.pNext = NULL; + surface.flags = 0; + surface.pLayer = (CAMetalLayer*)childView.layer; + */ +} + +void MetalRenderer::Initialize() { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::Shutdown() { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +bool MetalRenderer::IsPadWindowActive() { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::ClearColorbuffer(bool padView) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::DrawEmptyFrame(bool mainWindow) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, + sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, + bool padView, bool clearBackground) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} +bool MetalRenderer::BeginFrame(bool mainWindow) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void 
MetalRenderer::Flush(bool waitIdle) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::NotifyLatteCommandProcessorIdle() { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::AppendOverlayDebugInfo() { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + 
+void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::bufferCache_init(const sint32 bufferSize) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) { + 
cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool compileAsync, bool isGfxPackSource) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::streamout_begin() { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::streamout_rendererFinishDrawcall() { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::draw_beginSequence() { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void MetalRenderer::draw_endSequence() { + cemuLog_logDebug(LogType::Force, "not implemented"); +} + +void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { + cemuLog_logDebug(LogType::Force, "not implemented"); 
+} + +void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) { + cemuLog_logDebug(LogType::Force, "not implemented"); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h new file mode 100644 index 000000000..017c32fda --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -0,0 +1,142 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Renderer.h" + +class MetalRenderer : public Renderer +{ +public: + ~MetalRenderer() = default; + + RendererAPI GetType() override + { + return RendererAPI::Metal; + } + + static MetalRenderer* GetInstance() { + return static_cast(g_renderer.get()); + } + + void InitializeLayer(const Vector2i& size, bool mainWindow); + + void Initialize() override; + void Shutdown() override; + bool IsPadWindowActive() override; + + bool GetVRAMInfo(int& usageInMB, int& totalInMB) const override; + + void ClearColorbuffer(bool padView) override; + void DrawEmptyFrame(bool mainWindow) override; + void SwapBuffers(bool swapTV, bool swapDRC) override; + + void HandleScreenshotRequest(LatteTextureView* texView, bool padView) override { + cemuLog_logDebug(LogType::Force, "Screenshots are not yet supported on Metal"); + } + + void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, + sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, + bool padView, bool clearBackground) override; + bool BeginFrame(bool mainWindow) override; + + // flush control + void Flush(bool waitIdle = false) override; // called when explicit flush is required (e.g. 
by imgui) + void NotifyLatteCommandProcessorIdle() override; // called when command processor has no more commands available or when stalled + + // imgui + bool ImguiBegin(bool mainWindow) override { + cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + }; + + void ImguiEnd() override { + cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + }; + + ImTextureID GenerateTexture(const std::vector& data, const Vector2i& size) override { + cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + }; + + void DeleteTexture(ImTextureID id) override { + cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + }; + + void DeleteFontTextures() override { + cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + }; + + void AppendOverlayDebugInfo() override; + + // rendertarget + void renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ = false) override; + void renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) override; + + LatteCachedFBO* rendertarget_createCachedFBO(uint64 key) override; + void rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) override; + void rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) override; + + // texture functions + void* texture_acquireTextureUploadBuffer(uint32 size) override; + void texture_releaseTextureUploadBuffer(uint8* mem) override; + + TextureDecoder* texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) override; + + void texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) override; + void texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) override; + void texture_clearColorSlice(LatteTexture* hostTexture, sint32 
sliceIndex, sint32 mipIndex, float r, float g, float b, float a) override; + void texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) override; + + LatteTexture* texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) override; + + void texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) override; + void texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) override; + + LatteTextureReadbackInfo* texture_createReadback(LatteTextureView* textureView) override; + + // surface copy + void surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) override; + + // buffer cache + void bufferCache_init(const sint32 bufferSize) override; + void bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) override; + void bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) override; + void bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) override; + + void buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) override; + void buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) override; + + // shader + RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool compileAsync, bool 
isGfxPackSource) override; + + // streamout + void streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) override; + void streamout_begin() override; + void streamout_rendererFinishDrawcall() override; + + // core drawing logic + void draw_beginSequence() override; + void draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) override; + void draw_endSequence() override; + + // index + void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; + void indexData_uploadIndexMemory(uint32 offset, uint32 size) override; + + // occlusion queries + LatteQueryObject* occlusionQuery_create() override { + cemuLog_logDebug(LogType::Force, "Occlusion queries are not yet supported on Metal"); + } + + void occlusionQuery_destroy(LatteQueryObject* queryObj) override { + cemuLog_logDebug(LogType::Force, "Occlusion queries are not yet supported on Metal"); + } + + void occlusionQuery_flush() override { + cemuLog_logDebug(LogType::Force, "Occlusion queries are not yet supported on Metal"); + } + + void occlusionQuery_updateState() override { + cemuLog_logDebug(LogType::Force, "Occlusion queries are not yet supported on Metal"); + } + + +protected: + //CA::MetalLayer* m_metalLayer; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Renderer.h b/src/Cafe/HW/Latte/Renderer/Renderer.h index 0b694bb95..a94ad1550 100644 --- a/src/Cafe/HW/Latte/Renderer/Renderer.h +++ b/src/Cafe/HW/Latte/Renderer/Renderer.h @@ -33,6 +33,7 @@ enum class RendererAPI { OpenGL, Vulkan, + Metal, MAX }; @@ -66,9 +67,9 @@ class Renderer virtual void SwapBuffers(bool swapTV, bool swapDRC) = 0; virtual void HandleScreenshotRequest(LatteTextureView* texView, bool padView){} - - virtual void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, - sint32 imageX, sint32 
imageY, sint32 imageWidth, sint32 imageHeight, + + virtual void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, + sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) = 0; virtual bool BeginFrame(bool mainWindow) = 0; diff --git a/src/Cemu/Logging/CemuLogging.cpp b/src/Cemu/Logging/CemuLogging.cpp index 5cde2a7fb..6b77b2265 100644 --- a/src/Cemu/Logging/CemuLogging.cpp +++ b/src/Cemu/Logging/CemuLogging.cpp @@ -59,6 +59,7 @@ const std::map g_logging_window_mapping {LogType::TextureReadback, "Texture readback"}, {LogType::OpenGLLogging, "OpenGL debug output"}, {LogType::VulkanValidation, "Vulkan validation layer"}, + {LogType::MetalLogging, "Metal debug output"}, }; bool cemuLog_advancedPPCLoggingEnabled() @@ -158,7 +159,7 @@ bool cemuLog_log(LogType type, std::string_view text) bool cemuLog_log(LogType type, std::u8string_view text) { - std::basic_string_view s((char*)text.data(), text.size()); + std::basic_string_view s((char*)text.data(), text.size()); return cemuLog_log(type, s); } diff --git a/src/Cemu/Logging/CemuLogging.h b/src/Cemu/Logging/CemuLogging.h index a671ce51b..edca3241e 100644 --- a/src/Cemu/Logging/CemuLogging.h +++ b/src/Cemu/Logging/CemuLogging.h @@ -20,6 +20,7 @@ enum class LogType : sint32 OpenGLLogging = 10, // OpenGL debug logging TextureCache = 11, // texture cache warnings and info VulkanValidation = 12, // Vulkan validation layer + MetalLogging = 13, // Metal debug logging Patches = 14, CoreinitMem = 8, // coreinit memory functions CoreinitMP = 15, @@ -52,7 +53,7 @@ enum class LogType : sint32 template <> struct fmt::formatter : formatter { template - auto format(std::u8string_view v, FormatContext& ctx) + auto format(std::u8string_view v, FormatContext& ctx) { string_view s((char*)v.data(), v.size()); return formatter::format(s, ctx); @@ -96,7 +97,7 @@ bool cemuLog_log(LogType type, std::basic_string formatStr, TArgs&&... 
args) } return true; } - + template bool cemuLog_log(LogType type, const T* format, TArgs&&... args) { diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 2a1d29cbf..0dc23ce13 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -32,7 +32,7 @@ struct GameEntry std::wstring save_folder; std::wstring update_folder; std::wstring dlc_folder; - + uint64 legacy_time_played = 0; uint64 legacy_last_played = 0; @@ -74,6 +74,7 @@ enum GraphicAPI { kOpenGL = 0, kVulkan, + kMetal, }; enum AudioChannels @@ -105,7 +106,7 @@ enum class ScreenPosition kTopRight, kBottomLeft, kBottomCenter, - kBottomRight, + kBottomRight, }; enum class PrecompiledShaderOption @@ -134,7 +135,7 @@ enum class CPUMode ENABLE_ENUM_ITERATORS(CPUMode, CPUMode::SinglecoreInterpreter, CPUMode::Auto); -enum class CPUModeLegacy +enum class CPUModeLegacy { SinglecoreInterpreter = 0, SinglecoreRecompiler = 1, @@ -270,7 +271,7 @@ struct fmt::formatter : formatter { case CafeConsoleRegion::TWN: name = wxTRANSLATE("Taiwan"); break; case CafeConsoleRegion::Auto: name = wxTRANSLATE("Auto"); break; default: name = wxTRANSLATE("many"); break; - + } return formatter::format(name, ctx); } @@ -312,7 +313,7 @@ struct fmt::formatter : formatter { case CrashDump::Lite: name = "Lite"; break; case CrashDump::Full: name = "Full"; break; default: name = "unknown"; break; - + } return formatter::format(name, ctx); } @@ -363,7 +364,7 @@ struct CemuConfig ConfigValue advanced_ppc_logging{ false }; ConfigValue permanent_storage{ true }; - + ConfigValue language{ wxLANGUAGE_DEFAULT }; ConfigValue use_discord_presence{ true }; ConfigValue mlc_path{}; @@ -387,7 +388,7 @@ struct CemuConfig // optimized access std::set game_cache_favorites; // per titleId - + struct _path_hash { std::size_t operator()(const fs::path& path) const { return fs::hash_value(path); @@ -514,7 +515,7 @@ struct CemuConfig NetworkService GetAccountNetworkService(uint32 persistentId); void SetAccountSelectedService(uint32 
persistentId, NetworkService serviceIndex); - + // emulated usb devices struct { @@ -530,5 +531,3 @@ struct CemuConfig typedef XMLDataConfig XMLCemuConfig_t; extern XMLCemuConfig_t g_config; inline CemuConfig& GetConfig() { return g_config.data(); } - - diff --git a/src/gui/CMakeLists.txt b/src/gui/CMakeLists.txt index 02f96a9c8..df98c1f1e 100644 --- a/src/gui/CMakeLists.txt +++ b/src/gui/CMakeLists.txt @@ -1,9 +1,11 @@ -add_library(CemuGui +add_library(CemuGui canvas/IRenderCanvas.h canvas/OpenGLCanvas.cpp canvas/OpenGLCanvas.h canvas/VulkanCanvas.cpp canvas/VulkanCanvas.h + canvas/MetalCanvas.cpp + canvas/MetalCanvas.h CemuApp.cpp CemuApp.h CemuUpdateWindow.cpp diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index f15395e42..fe3327025 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -61,7 +61,7 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) const sint32 m_cpu_modeNChoices = std::size(cpu_modes); m_cpu_mode = new wxChoice(box, wxID_ANY, wxDefaultPosition, wxDefaultSize, m_cpu_modeNChoices, cpu_modes, 0); m_cpu_mode->SetToolTip(_("Set the CPU emulation mode")); - first_row->Add(m_cpu_mode, 0, wxALL, 5); + first_row->Add(m_cpu_mode, 0, wxALL, 5); first_row->Add(new wxStaticText(box, wxID_ANY, _("Thread quantum")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); @@ -112,10 +112,14 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) first_row->Add(new wxStaticText(panel, wxID_ANY, _("Graphics API")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); - wxString gapi_values[] = { "", "OpenGL", "Vulkan" }; + wxString gapi_values[] = { "", "OpenGL", "Vulkan", +#ifdef __APPLE__ + "Metal" +#endif + }; m_graphic_api = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(gapi_values), gapi_values); first_row->Add(m_graphic_api, 0, wxALL, 5); - + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Shader multiplication accuracy")), 0, 
wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString mul_values[] = { _("false"), _("true")}; @@ -249,7 +253,7 @@ void GameProfileWindow::ApplyProfile() // general m_load_libs->SetValue(m_game_profile.m_loadSharedLibraries.value()); m_start_with_padview->SetValue(m_game_profile.m_startWithPadView); - + // cpu // wxString cpu_modes[] = { _("Singlecore-Interpreter"), _("Singlecore-Recompiler"), _("Triplecore-Recompiler"), _("Auto (recommended)") }; switch(m_game_profile.m_cpuMode.value()) @@ -258,9 +262,9 @@ void GameProfileWindow::ApplyProfile() case CPUMode::SinglecoreRecompiler: m_cpu_mode->SetSelection(1); break; case CPUMode::DualcoreRecompiler: m_cpu_mode->SetSelection(2); break; case CPUMode::MulticoreRecompiler: m_cpu_mode->SetSelection(2); break; - default: m_cpu_mode->SetSelection(3); + default: m_cpu_mode->SetSelection(3); } - + m_thread_quantum->SetStringSelection(fmt::format("{}", m_game_profile.m_threadQuantum)); // gpu @@ -275,7 +279,7 @@ void GameProfileWindow::ApplyProfile() // controller auto profiles = InputManager::get_profiles(); - + for (const auto& cb : m_controller_profile) { cb->Clear(); @@ -293,7 +297,7 @@ void GameProfileWindow::ApplyProfile() const auto& v = m_game_profile.m_controllerProfile[i].value(); m_controller_profile[i]->SetStringSelection(wxString::FromUTF8(v)); } - + else m_controller_profile[i]->SetSelection(wxNOT_FOUND); } @@ -317,7 +321,7 @@ void GameProfileWindow::SaveProfile() m_game_profile.m_cpuMode = CPUMode::Auto; } - + const wxString thread_quantum = m_thread_quantum->GetStringSelection(); if (!thread_quantum.empty()) { @@ -365,4 +369,4 @@ void GameProfileWindow::SetSliderValue(wxSlider* slider, sint32 new_value) const slider_event.SetEventObject(slider); slider_event.SetClientData((void*)IsFrozen()); wxPostEvent(slider->GetEventHandler(), slider_event); -} \ No newline at end of file +} diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 08395cd31..4f7a7fe7e 100644 --- a/src/gui/GeneralSettings2.cpp 
+++ b/src/gui/GeneralSettings2.cpp @@ -101,7 +101,7 @@ class wxAccountData : public wxClientData Account& GetAccount() { return m_account; } const Account& GetAccount() const { return m_account; } - + private: Account m_account; }; @@ -165,11 +165,11 @@ wxPanel* GeneralSettings2::AddGeneralPage(wxNotebook* notebook) m_auto_update = new wxCheckBox(box, wxID_ANY, _("Automatically check for updates")); m_auto_update->SetToolTip(_("Automatically checks for new cemu versions on startup")); second_row->Add(m_auto_update, 0, botflag, 5); -#if BOOST_OS_LINUX +#if BOOST_OS_LINUX if (!std::getenv("APPIMAGE")) { m_auto_update->Disable(); - } -#endif + } +#endif second_row->AddSpacer(10); m_save_screenshot = new wxCheckBox(box, wxID_ANY, _("Save screenshot")); m_save_screenshot->SetToolTip(_("Pressing the screenshot key (F12) will save a screenshot directly to the screenshots folder")); @@ -276,12 +276,14 @@ wxPanel* GeneralSettings2::AddGraphicsPage(wxNotebook* notebook) row->Add(new wxStaticText(box, wxID_ANY, _("Graphics API")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); sint32 api_size = 1; - wxString choices[2] = { "OpenGL" }; + wxString choices[3] = { "OpenGL" }; if (g_vulkan_available) { - choices[1] = "Vulkan"; - api_size = 2; + choices[api_size++] = "Vulkan"; } +#ifdef __APPLE__ + choices[api_size++] = "Metal"; +#endif m_graphic_api = new wxChoice(box, wxID_ANY, wxDefaultPosition, wxDefaultSize, api_size, choices); m_graphic_api->SetSelection(0); @@ -728,7 +730,7 @@ wxPanel* GeneralSettings2::AddAccountPage(wxNotebook* notebook) auto* row = new wxFlexGridSizer(0, 2, 0, 0); row->SetFlexibleDirection(wxBOTH); row->SetNonFlexibleGrowMode(wxFLEX_GROWMODE_SPECIFIED); - + const wxImage tmp = wxBITMAP_PNG_FROM_DATA(PNG_ERROR).ConvertToImage(); m_validate_online = new wxBitmapButton(box, wxID_ANY, tmp.Scale(16, 16)); m_validate_online->Bind(wxEVT_BUTTON, &GeneralSettings2::OnShowOnlineValidator, this); @@ -738,7 +740,7 @@ wxPanel* GeneralSettings2::AddAccountPage(wxNotebook* 
notebook) row->Add(m_online_status, 1, wxALL | wxALIGN_CENTRE_VERTICAL, 5); box_sizer->Add(row, 1, wxEXPAND, 5); - + auto* tutorial_link = new wxHyperlinkCtrl(box, wxID_ANY, _("Online play tutorial"), "https://cemu.info/online-guide"); box_sizer->Add(tutorial_link, 0, wxALL, 5); @@ -856,14 +858,14 @@ GeneralSettings2::GeneralSettings2(wxWindow* parent, bool game_launched) notebook->AddPage(AddGeneralPage(notebook), _("General")); notebook->AddPage(AddGraphicsPage(notebook), _("Graphics")); - notebook->AddPage(AddAudioPage(notebook), _("Audio")); + notebook->AddPage(AddAudioPage(notebook), _("Audio")); notebook->AddPage(AddOverlayPage(notebook), _("Overlay")); notebook->AddPage(AddAccountPage(notebook), _("Account")); notebook->AddPage(AddDebugPage(notebook), _("Debug")); Bind(wxEVT_CLOSE_WINDOW, &GeneralSettings2::OnClose, this); - // + // sizer->Add(notebook, 1, wxEXPAND | wxALL, 5); @@ -878,7 +880,7 @@ GeneralSettings2::GeneralSettings2(wxWindow* parent, bool game_launched) ApplyConfig(); HandleGraphicsApiSelection(); - + DisableSettings(game_launched); } @@ -890,7 +892,7 @@ uint32 GeneralSettings2::GetSelectedAccountPersistentId() return dynamic_cast(m_active_account->GetClientObject(active_account))->GetAccount().GetPersistentId(); } -void GeneralSettings2::StoreConfig() +void GeneralSettings2::StoreConfig() { auto* app = (CemuApp*)wxTheApp; auto& config = GetConfig(); @@ -908,7 +910,7 @@ void GeneralSettings2::StoreConfig() { ScreenSaver::SetInhibit(config.disable_screensaver); } - + // -1 is default wx widget value -> set to dummy 0 so mainwindow and padwindow will update it config.window_position = m_save_window_position_size->IsChecked() ? Vector2i{ 0,0 } : Vector2i{-1,-1}; config.window_size = m_save_window_position_size->IsChecked() ? 
Vector2i{ 0,0 } : Vector2i{-1,-1}; @@ -951,7 +953,7 @@ void GeneralSettings2::StoreConfig() config.pad_channels = kStereo; // (AudioChannels)m_pad_channels->GetSelection(); //config.input_channels = (AudioChannels)m_input_channels->GetSelection(); config.input_channels = kMono; // (AudioChannels)m_input_channels->GetSelection(); - + config.tv_volume = m_tv_volume->GetValue(); config.pad_volume = m_pad_volume->GetValue(); config.input_volume = m_input_volume->GetValue(); @@ -997,16 +999,16 @@ void GeneralSettings2::StoreConfig() } else config.graphic_device_uuid = {}; - + config.vsync = m_vsync->GetSelection(); config.gx2drawdone_sync = m_gx2drawdone_sync->IsChecked(); config.async_compile = m_async_compile->IsChecked(); - + config.upscale_filter = m_upscale_filter->GetSelection(); config.downscale_filter = m_downscale_filter->GetSelection(); config.fullscreen_scaling = m_fullscreen_scaling->GetSelection(); - + config.overlay.position = (ScreenPosition)m_overlay_position->GetSelection(); wxASSERT((int)config.overlay.position <= (int)ScreenPosition::kBottomRight); config.overlay.text_color = m_overlay_font_color->GetColour().GetRGBA(); config.overlay.text_scale = m_overlay_scale->GetSelection() * 25 + 50; @@ -1064,7 +1066,7 @@ void GeneralSettings2::ValidateConfig() void GeneralSettings2::DisableSettings(bool game_launched) { - + } void GeneralSettings2::OnAudioLatencyChanged(wxCommandEvent& event) @@ -1075,7 +1077,7 @@ void GeneralSettings2::OnAudioLatencyChanged(wxCommandEvent& event) void GeneralSettings2::OnVolumeChanged(wxCommandEvent& event) { - + if(event.GetEventObject() == m_input_volume) { std::shared_lock lock(g_audioInputMutex); @@ -1099,7 +1101,7 @@ void GeneralSettings2::OnVolumeChanged(wxCommandEvent& event) g_tvAudio->SetVolume(event.GetInt()); } } - + event.Skip(); } @@ -1112,7 +1114,7 @@ void GeneralSettings2::OnInputVolumeChanged(wxCommandEvent& event) g_padAudio->SetInputVolume(event.GetInt()); g_padVolume = event.GetInt(); } - + event.Skip(); } 
@@ -1190,7 +1192,7 @@ void GeneralSettings2::UpdateAudioDeviceList() // todo reset global instance of audio device } -void GeneralSettings2::ResetAccountInformation() +void GeneralSettings2::ResetAccountInformation() { m_account_grid->SetSplitterPosition(100); m_active_account->SetSelection(0); @@ -1218,7 +1220,7 @@ void GeneralSettings2::OnAccountCreate(wxCommandEvent& event) Account account(dialog.GetPersistentId(), dialog.GetMiiName().ToStdWstring()); account.Save(); Account::RefreshAccounts(); - + const int index = m_active_account->Append(account.ToString(), new wxAccountData(account)); // update ui @@ -1227,7 +1229,7 @@ void GeneralSettings2::OnAccountCreate(wxCommandEvent& event) m_create_account->Enable(m_active_account->GetCount() < 0xC); m_delete_account->Enable(m_active_account->GetCount() > 1); - + // send main window event wxASSERT(GetParent()); wxCommandEvent refresh_event(wxEVT_ACCOUNTLIST_REFRESH); @@ -1257,7 +1259,7 @@ void GeneralSettings2::OnAccountDelete(wxCommandEvent& event) return; // todo: ask if saves should be deleted too? 
- + const fs::path path = account.GetFileName(); try { @@ -1275,7 +1277,7 @@ void GeneralSettings2::OnAccountDelete(wxCommandEvent& event) SystemException sys(ex); cemuLog_log(LogType::Force, sys.what()); } - + } void GeneralSettings2::OnAccountSettingsChanged(wxPropertyGridEvent& event) @@ -1330,7 +1332,7 @@ void GeneralSettings2::OnAccountSettingsChanged(wxPropertyGridEvent& event) else if (property->GetName() == kPropertyEmail) { account.SetEmail(value.As().ToStdString()); - + } else if (property->GetName() == kPropertyCountry) { @@ -1338,7 +1340,7 @@ void GeneralSettings2::OnAccountSettingsChanged(wxPropertyGridEvent& event) } else cemu_assert_debug(false); - + account.Save(); Account::RefreshAccounts(); // refresh internal account list UpdateAccountInformation(); // refresh on invalid values @@ -1378,7 +1380,7 @@ void GeneralSettings2::UpdateAccountInformation() gender_property->SetChoiceSelection(std::min(gender_property->GetChoices().GetCount() - 1, (uint32)account.GetGender())); m_account_grid->GetProperty(kPropertyEmail)->SetValueFromString(std::string{ account.GetEmail() }); - + auto* country_property = dynamic_cast(m_account_grid->GetProperty(kPropertyCountry)); wxASSERT(country_property); int index = (country_property)->GetIndexForValue(account.GetCountry()); @@ -1462,7 +1464,7 @@ void GeneralSettings2::HandleGraphicsApiSelection() int selection = m_vsync->GetSelection(); if(selection == wxNOT_FOUND) selection = GetConfig().vsync; - + m_vsync->Clear(); if(m_graphic_api->GetSelection() == 0) { @@ -1494,7 +1496,7 @@ void GeneralSettings2::HandleGraphicsApiSelection() #endif m_vsync->Select(selection); - + m_graphic_device->Enable(); auto devices = VulkanRenderer::GetDevices(); m_graphic_device->Clear(); @@ -1618,7 +1620,7 @@ void GeneralSettings2::ApplyConfig() m_pad_channels->SetSelection(0); //m_input_channels->SetSelection(config.pad_channels); m_input_channels->SetSelection(0); - + SendSliderEvent(m_tv_volume, config.tv_volume); if 
(!config.tv_device.empty() && m_tv_device->HasClientObjectData()) @@ -1635,7 +1637,7 @@ void GeneralSettings2::ApplyConfig() } else m_tv_device->SetSelection(0); - + SendSliderEvent(m_pad_volume, config.pad_volume); if (!config.pad_device.empty() && m_pad_device->HasClientObjectData()) { @@ -1768,7 +1770,7 @@ void GeneralSettings2::UpdateAudioDevice() } } } - + // pad audio device { const auto selection = m_pad_device->GetSelection(); @@ -1884,14 +1886,14 @@ void GeneralSettings2::OnAudioChannelsSelected(wxCommandEvent& event) { if (config.tv_channels == (AudioChannels)obj->GetSelection()) return; - + config.tv_channels = (AudioChannels)obj->GetSelection(); } else if (obj == m_pad_channels) { if (config.pad_channels == (AudioChannels)obj->GetSelection()) return; - + config.pad_channels = (AudioChannels)obj->GetSelection(); } else @@ -2034,23 +2036,23 @@ void GeneralSettings2::OnShowOnlineValidator(wxCommandEvent& event) const auto selection = m_active_account->GetSelection(); if (selection == wxNOT_FOUND) return; - + const auto* obj = dynamic_cast(m_active_account->GetClientObject(selection)); wxASSERT(obj); const auto& account = obj->GetAccount(); - + const auto validator = account.ValidateOnlineFiles(); if (validator) // everything valid? 
shouldn't happen return; - + wxString err; err << _("The following error(s) have been found:") << '\n'; - + if (validator.otp == OnlineValidator::FileState::Missing) err << _("otp.bin missing in Cemu directory") << '\n'; else if(validator.otp == OnlineValidator::FileState::Corrupted) err << _("otp.bin is invalid") << '\n'; - + if (validator.seeprom == OnlineValidator::FileState::Missing) err << _("seeprom.bin missing in Cemu directory") << '\n'; else if(validator.seeprom == OnlineValidator::FileState::Corrupted) diff --git a/src/gui/LoggingWindow.cpp b/src/gui/LoggingWindow.cpp index 4026113e7..0d25e2796 100644 --- a/src/gui/LoggingWindow.cpp +++ b/src/gui/LoggingWindow.cpp @@ -21,7 +21,7 @@ LoggingWindow::LoggingWindow(wxFrame* parent) filter_row->Add(new wxStaticText( this, wxID_ANY, _("Filter")), 0, wxALIGN_CENTER_VERTICAL|wxALL, 5 ); - wxString choices[] = {"Unsupported APIs calls", "Coreinit Logging", "Coreinit File-Access", "Coreinit Thread-Synchronization", "Coreinit Memory", "Coreinit MP", "Coreinit Thread", "nn::nfp", "GX2", "Audio", "Input", "Socket", "Save", "H264", "Graphic pack patches", "Texture cache", "Texture readback", "OpenGL debug output", "Vulkan validation layer"}; + wxString choices[] = {"Unsupported APIs calls", "Coreinit Logging", "Coreinit File-Access", "Coreinit Thread-Synchronization", "Coreinit Memory", "Coreinit MP", "Coreinit Thread", "nn::nfp", "GX2", "Audio", "Input", "Socket", "Save", "H264", "Graphic pack patches", "Texture cache", "Texture readback", "OpenGL debug output", "Vulkan validation layer", "Metal debug output"}; m_filter = new wxComboBox( this, wxID_ANY, wxEmptyString, wxDefaultPosition, wxDefaultSize, std::size(choices), choices, 0 ); m_filter->Bind(wxEVT_COMBOBOX, &LoggingWindow::OnFilterChange, this); m_filter->Bind(wxEVT_TEXT, &LoggingWindow::OnFilterChange, this); @@ -83,7 +83,7 @@ void LoggingWindow::Log(std::string_view filter, std::wstring_view message) void LoggingWindow::OnLogMessage(wxLogEvent& event) { - 
m_log_list->PushEntry(event.GetFilter(), event.GetMessage()); + m_log_list->PushEntry(event.GetFilter(), event.GetMessage()); } void LoggingWindow::OnFilterChange(wxCommandEvent& event) @@ -97,4 +97,3 @@ void LoggingWindow::OnFilterMessageChange(wxCommandEvent& event) m_log_list->SetFilterMessage(m_filter_message->GetValue()); event.Skip(); } - diff --git a/src/gui/MainWindow.cpp b/src/gui/MainWindow.cpp index c83ab16b4..7f738c2ec 100644 --- a/src/gui/MainWindow.cpp +++ b/src/gui/MainWindow.cpp @@ -12,6 +12,7 @@ #include "audio/audioDebuggerWindow.h" #include "gui/canvas/OpenGLCanvas.h" #include "gui/canvas/VulkanCanvas.h" +#include "gui/canvas/MetalCanvas.h" #include "Cafe/OS/libs/nfc/nfc.h" #include "Cafe/OS/libs/swkbd/swkbd.h" #include "gui/debugger/DebuggerWindow2.h" @@ -93,7 +94,7 @@ enum // options -> account MAINFRAME_MENU_ID_OPTIONS_ACCOUNT_1 = 20350, MAINFRAME_MENU_ID_OPTIONS_ACCOUNT_12 = 20350 + 11, - + // options -> system language MAINFRAME_MENU_ID_OPTIONS_LANGUAGE_JAPANESE = 20500, MAINFRAME_MENU_ID_OPTIONS_LANGUAGE_ENGLISH, @@ -243,7 +244,7 @@ class wxGameDropTarget : public wxFileDropTarget { if(!m_window->IsGameLaunched() && filenames.GetCount() == 1) return m_window->FileLoad(_utf8ToPath(filenames[0].utf8_string()), wxLaunchGameEvent::INITIATED_BY::DRAG_AND_DROP); - + return false; } @@ -455,7 +456,7 @@ bool MainWindow::InstallUpdate(const fs::path& metaFilePath) { throw std::runtime_error(frame.GetExceptionMessage()); } - } + } } catch(const AbortException&) { @@ -639,13 +640,13 @@ void MainWindow::OnFileMenu(wxCommandEvent& event) _("Wii U executable (*.rpx, *.elf)"), _("All files (*.*)") ); - + wxFileDialog openFileDialog(this, _("Open file to launch"), wxEmptyString, wxEmptyString, wildcard, wxFD_OPEN | wxFD_FILE_MUST_EXIST); if (openFileDialog.ShowModal() == wxID_CANCEL || openFileDialog.GetPath().IsEmpty()) return; - const wxString wxStrFilePath = openFileDialog.GetPath(); + const wxString wxStrFilePath = openFileDialog.GetPath(); 
FileLoad(_utf8ToPath(wxStrFilePath.utf8_string()), wxLaunchGameEvent::INITIATED_BY::MENU); } else if (menuId >= MAINFRAME_MENU_ID_FILE_RECENT_0 && menuId <= MAINFRAME_MENU_ID_FILE_RECENT_LAST) @@ -784,7 +785,7 @@ void MainWindow::TogglePadView() { if (m_padView) return; - + m_padView = new PadViewFrame(this); m_padView->Bind(wxEVT_CLOSE_WINDOW, &MainWindow::OnPadClose, this); @@ -992,7 +993,7 @@ void MainWindow::OnConsoleLanguage(wxCommandEvent& event) // GetConfig().cpu_mode = CPUMode::TriplecoreRecompiler; // else // cemu_assert_debug(false); -// +// // g_config.Save(); //} @@ -1056,7 +1057,7 @@ void MainWindow::OnDebugSetting(wxCommandEvent& event) ActiveSettings::SetTimerShiftFactor(6); else cemu_assert_debug(false); - + g_config.Save(); } @@ -1132,7 +1133,7 @@ void MainWindow::OnLoggingWindow(wxCommandEvent& event) return; m_logging_window = new LoggingWindow(this); - m_logging_window->Bind(wxEVT_CLOSE_WINDOW, + m_logging_window->Bind(wxEVT_CLOSE_WINDOW, [this](wxCloseEvent& event) { m_logging_window = nullptr; event.Skip(); @@ -1307,7 +1308,7 @@ void MainWindow::SaveSettings() { auto lock = g_config.Lock(); auto& config = GetConfig(); - + if (config.window_position != Vector2i{ -1,-1 }) { config.window_position.x = m_restored_position.x; @@ -1344,7 +1345,7 @@ void MainWindow::SaveSettings() if(m_game_list) m_game_list->SaveConfig(); - + g_config.Save(); } @@ -1374,14 +1375,14 @@ void MainWindow::OnMouseMove(wxMouseEvent& event) void MainWindow::OnMouseLeft(wxMouseEvent& event) { auto& instance = InputManager::instance(); - + std::scoped_lock lock(instance.m_main_mouse.m_mutex); instance.m_main_mouse.left_down = event.ButtonDown(wxMOUSE_BTN_LEFT); auto physPos = ToPhys(event.GetPosition()); instance.m_main_mouse.position = { physPos.x, physPos.y }; if (event.ButtonDown(wxMOUSE_BTN_LEFT)) instance.m_main_mouse.left_down_toggle = true; - + event.Skip(); } @@ -1395,7 +1396,7 @@ void MainWindow::OnMouseRight(wxMouseEvent& event) instance.m_main_mouse.position = { 
physPos.x, physPos.y }; if(event.ButtonDown(wxMOUSE_BTN_RIGHT)) instance.m_main_mouse.right_down_toggle = true; - + event.Skip(); } @@ -1443,7 +1444,7 @@ void MainWindow::OnKeyUp(wxKeyEvent& event) void MainWindow::OnKeyDown(wxKeyEvent& event) { - if ((event.AltDown() && event.GetKeyCode() == WXK_F4) || + if ((event.AltDown() && event.GetKeyCode() == WXK_F4) || (event.CmdDown() && event.GetKeyCode() == 'Q')) { Close(true); @@ -1458,7 +1459,7 @@ void MainWindow::OnChar(wxKeyEvent& event) { if (swkbd_hasKeyboardInputHook()) swkbd_keyInput(event.GetUnicodeKey()); - + // event.Skip(); } @@ -1483,7 +1484,7 @@ void MainWindow::OnToolsInput(wxCommandEvent& event) case MAINFRAME_MENU_ID_TOOLS_DOWNLOAD_MANAGER: { const auto default_tab = id == MAINFRAME_MENU_ID_TOOLS_TITLE_MANAGER ? TitleManagerPage::TitleManager : TitleManagerPage::DownloadManager; - + if (m_title_manager) m_title_manager->SetFocusAndTab(default_tab); else @@ -1533,7 +1534,7 @@ void MainWindow::OnGesturePan(wxPanGestureEvent& event) instance.m_main_touch.left_down = event.IsGestureStart() || !event.IsGestureEnd(); if (event.IsGestureStart() || !event.IsGestureEnd()) instance.m_main_touch.left_down_toggle = true; - + event.Skip(); } @@ -1567,8 +1568,10 @@ void MainWindow::CreateCanvas() // create canvas if (ActiveSettings::GetGraphicsAPI() == kVulkan) m_render_canvas = new VulkanCanvas(m_game_panel, wxSize(1280, 720), true); - else + else if (ActiveSettings::GetGraphicsAPI() == kOpenGL) m_render_canvas = GLCanvas_Create(m_game_panel, wxSize(1280, 720), true); + else + m_render_canvas = new MetalCanvas(m_game_panel, wxSize(1280, 720), true); // mouse events m_render_canvas->Bind(wxEVT_MOTION, &MainWindow::OnMouseMove, this); @@ -1748,10 +1751,10 @@ void MainWindow::UpdateNFCMenu() const auto& entry = config.recent_nfc_files[i]; if (entry.empty()) continue; - + if (!fs::exists(_utf8ToPath(entry))) continue; - + if (recentFileIndex == 0) m_nfcMenuSeparator0 = m_nfcMenu->AppendSeparator(); @@ -1802,7 +1805,7 @@ 
void MainWindow::OnTimer(wxTimerEvent& event) { ShowCursor(false); } - + } #define BUILD_DATE __DATE__ " " __TIME__ @@ -2061,9 +2064,9 @@ void MainWindow::RecreateMenu() m_menuBar->Destroy(); m_menuBar = nullptr; } - + auto& config = GetConfig(); - + m_menuBar = new wxMenuBar(); // file submenu m_fileMenu = new wxMenu(); @@ -2115,7 +2118,7 @@ void MainWindow::RecreateMenu() item->Check(account_id == account.GetPersistentId()); if (m_game_launched || LaunchSettings::GetPersistentId().has_value()) item->Enable(false); - + ++index; } @@ -2145,8 +2148,8 @@ void MainWindow::RecreateMenu() // options submenu wxMenu* optionsMenu = new wxMenu(); m_fullscreenMenuItem = optionsMenu->AppendCheckItem(MAINFRAME_MENU_ID_OPTIONS_FULLSCREEN, _("&Fullscreen"), wxEmptyString); - m_fullscreenMenuItem->Check(ActiveSettings::FullscreenEnabled()); - + m_fullscreenMenuItem->Check(ActiveSettings::FullscreenEnabled()); + optionsMenu->Append(MAINFRAME_MENU_ID_OPTIONS_GRAPHIC_PACKS2, _("&Graphic packs")); m_padViewMenuItem = optionsMenu->AppendCheckItem(MAINFRAME_MENU_ID_OPTIONS_SECOND_WINDOW_PADVIEW, _("&Separate GamePad view"), wxEmptyString); m_padViewMenuItem->Check(GetConfig().pad_open); @@ -2227,6 +2230,7 @@ void MainWindow::RecreateMenu() debugLoggingMenu->AppendSeparator(); debugLoggingMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::OpenGLLogging), _("&OpenGL debug output"), wxEmptyString)->Check(cemuLog_isLoggingEnabled(LogType::OpenGLLogging)); debugLoggingMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::VulkanValidation), _("&Vulkan validation layer (slow)"), wxEmptyString)->Check(cemuLog_isLoggingEnabled(LogType::VulkanValidation)); + debugLoggingMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::MetalLogging), _("&Metal debug output"), wxEmptyString)->Check(cemuLog_isLoggingEnabled(LogType::MetalLogging)); 
debugLoggingMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_ADVANCED_PPC_INFO, _("&Log PPC context for API"), wxEmptyString)->Check(cemuLog_advancedPPCLoggingEnabled()); m_loggingSubmenu = debugLoggingMenu; // debug->dump submenu @@ -2240,7 +2244,7 @@ void MainWindow::RecreateMenu() debugMenu->AppendSubMenu(debugLoggingMenu, _("&Logging")); debugMenu->AppendSubMenu(debugDumpMenu, _("&Dump")); debugMenu->AppendSeparator(); - + auto upsidedownItem = debugMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_RENDER_UPSIDE_DOWN, _("&Render upside-down"), wxEmptyString); upsidedownItem->Check(ActiveSettings::RenderUpsideDownEnabled()); if(LaunchSettings::RenderUpsideDownEnabled().has_value()) @@ -2296,6 +2300,7 @@ void MainWindow::RecreateMenu() // these options cant be toggled after the renderer backend is initialized: m_loggingSubmenu->Enable(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::OpenGLLogging), false); m_loggingSubmenu->Enable(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::VulkanValidation), false); + m_loggingSubmenu->Enable(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::MetalLogging), false); UpdateNFCMenu(); } diff --git a/src/gui/PadViewFrame.cpp b/src/gui/PadViewFrame.cpp index e7cc5c185..6d1ec7d62 100644 --- a/src/gui/PadViewFrame.cpp +++ b/src/gui/PadViewFrame.cpp @@ -8,6 +8,7 @@ #include "Cafe/OS/libs/swkbd/swkbd.h" #include "gui/canvas/OpenGLCanvas.h" #include "gui/canvas/VulkanCanvas.h" +#include "gui/canvas/MetalCanvas.h" #include "config/CemuConfig.h" #include "gui/MainWindow.h" #include "gui/helpers/wxHelpers.h" @@ -74,8 +75,10 @@ void PadViewFrame::InitializeRenderCanvas() { if (ActiveSettings::GetGraphicsAPI() == kVulkan) m_render_canvas = new VulkanCanvas(this, wxSize(854, 480), false); - else + else if (ActiveSettings::GetGraphicsAPI() == kOpenGL) m_render_canvas = GLCanvas_Create(this, wxSize(854, 480), false); + else + m_render_canvas = new MetalCanvas(this, wxSize(854, 480), false); 
sizer->Add(m_render_canvas, 1, wxEXPAND, 0, nullptr); } SetSizer(sizer); @@ -173,7 +176,7 @@ void PadViewFrame::OnChar(wxKeyEvent& event) { if (swkbd_hasKeyboardInputHook()) swkbd_keyInput(event.GetUnicodeKey()); - + event.Skip(); } @@ -198,7 +201,7 @@ void PadViewFrame::OnMouseLeft(wxMouseEvent& event) instance.m_pad_mouse.position = { physPos.x, physPos.y }; if (event.ButtonDown(wxMOUSE_BTN_LEFT)) instance.m_pad_mouse.left_down_toggle = true; - + } void PadViewFrame::OnMouseRight(wxMouseEvent& event) diff --git a/src/gui/canvas/MetalCanvas.cpp b/src/gui/canvas/MetalCanvas.cpp new file mode 100644 index 000000000..fe8dc4eec --- /dev/null +++ b/src/gui/canvas/MetalCanvas.cpp @@ -0,0 +1,63 @@ +#include "gui/canvas/MetalCanvas.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "gui/guiWrapper.h" + +#include +#include + +MetalCanvas::MetalCanvas(wxWindow* parent, const wxSize& size, bool is_main_window) + : IRenderCanvas(is_main_window), wxWindow(parent, wxID_ANY, wxDefaultPosition, size, wxNO_FULL_REPAINT_ON_RESIZE | wxWANTS_CHARS) +{ + Bind(wxEVT_PAINT, &MetalCanvas::OnPaint, this); + Bind(wxEVT_SIZE, &MetalCanvas::OnResize, this); + + WindowHandleInfo& canvas = is_main_window ? 
gui_getWindowInfo().canvas_main : gui_getWindowInfo().canvas_pad; + gui_initHandleContextFromWxWidgetsWindow(canvas, this); + + try + { + if (is_main_window) + g_renderer = std::make_unique(); + + auto metal_renderer = MetalRenderer::GetInstance(); + metal_renderer->InitializeLayer({size.x, size.y}, is_main_window); + } + catch(const std::exception& ex) + { + cemuLog_log(LogType::Force, "Error when initializing Metal renderer: {}", ex.what()); + auto msg = formatWxString(_("Error when initializing Metal renderer:\n{}"), ex.what()); + wxMessageDialog dialog(this, msg, _("Error"), wxOK | wxCENTRE | wxICON_ERROR); + dialog.ShowModal(); + exit(0); + } + + wxWindow::EnableTouchEvents(wxTOUCH_PAN_GESTURES); +} + +MetalCanvas::~MetalCanvas() +{ + Unbind(wxEVT_PAINT, &MetalCanvas::OnPaint, this); + Unbind(wxEVT_SIZE, &MetalCanvas::OnResize, this); + + if(!m_is_main_window) + { + // TODO + //MetalRenderer* vkr = (MetalRenderer*)g_renderer.get(); + //if(vkr) + // vkr->StopUsingPadAndWait(); + } +} + +void MetalCanvas::OnPaint(wxPaintEvent& event) +{ +} + +void MetalCanvas::OnResize(wxSizeEvent& event) +{ + const wxSize size = GetSize(); + if (size.GetWidth() == 0 || size.GetHeight() == 0) + return; + + const wxRect refreshRect(size); + RefreshRect(refreshRect, false); +} diff --git a/src/gui/canvas/MetalCanvas.h b/src/gui/canvas/MetalCanvas.h new file mode 100644 index 000000000..4dc4d49f9 --- /dev/null +++ b/src/gui/canvas/MetalCanvas.h @@ -0,0 +1,19 @@ +#pragma once + +#include "gui/canvas/IRenderCanvas.h" + +#include + +#include + +class MetalCanvas : public IRenderCanvas, public wxWindow +{ +public: + MetalCanvas(wxWindow* parent, const wxSize& size, bool is_main_window); + ~MetalCanvas(); + +private: + + void OnPaint(wxPaintEvent& event); + void OnResize(wxSizeEvent& event); +}; diff --git a/src/gui/guiWrapper.cpp b/src/gui/guiWrapper.cpp index d887e89a1..8f004eddb 100644 --- a/src/gui/guiWrapper.cpp +++ b/src/gui/guiWrapper.cpp @@ -82,11 +82,14 @@ void 
gui_updateWindowTitles(bool isIdle, bool isLoading, double fps) case RendererAPI::OpenGL: renderer = "[OpenGL]"; break; - case RendererAPI::Vulkan: + case RendererAPI::Vulkan: renderer = "[Vulkan]"; break; + case RendererAPI::Metal: + renderer = "[Metal]"; + break; default: ; - } + } } // get GPU vendor/mode @@ -217,7 +220,7 @@ void gui_initHandleContextFromWxWidgetsWindow(WindowHandleInfo& handleInfoOut, c cemuLog_log(LogType::Force, "Unable to get xlib display"); } } - else + else #ifdef HAS_WAYLAND if(GDK_IS_WAYLAND_WINDOW(gdkWindow)) { diff --git a/src/imgui/CMakeLists.txt b/src/imgui/CMakeLists.txt index db7686bd8..c3fc4a0ea 100644 --- a/src/imgui/CMakeLists.txt +++ b/src/imgui/CMakeLists.txt @@ -7,6 +7,8 @@ add_library(imguiImpl imgui_extension.h ) +# TODO: add Metal + set_property(TARGET imguiImpl PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") target_include_directories(imguiImpl PUBLIC "../") From 98370260d3eafbabbca04f83bb9309904ebc74fb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 25 Jul 2024 12:53:24 +0200 Subject: [PATCH 002/368] initialize Metal --- .gitmodules | 3 + CMakeLists.txt | 5 + dependencies/metal-cpp | 1 + src/Cafe/CMakeLists.txt | 9 ++ .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 92 +++++++++++++++++++ .../HW/Latte/Renderer/Metal/LatteTextureMtl.h | 33 +++++++ .../HW/Latte/Renderer/Metal/MetalCppImpl.cpp | 6 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 28 +++++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 23 ++++- .../HW/Latte/Renderer/RendererOuputShader.cpp | 46 +++++----- 10 files changed, 220 insertions(+), 26 deletions(-) create mode 160000 dependencies/metal-cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp diff --git a/.gitmodules b/.gitmodules index f352d478f..e5fb2a503 100644 --- a/.gitmodules +++ b/.gitmodules @@ -16,3 +16,6 @@ [submodule 
"dependencies/imgui"] path = dependencies/imgui url = https://github.com/ocornut/imgui +[submodule "dependencies/metal-cpp"] + path = dependencies/metal-cpp + url = https://github.com/bkaradzic/metal-cpp.git diff --git a/CMakeLists.txt b/CMakeLists.txt index c3b940f9d..e5dadd98a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -166,6 +166,11 @@ if (ENABLE_OPENGL) find_package(OpenGL REQUIRED) endif() +# TODO: handle this differently? +if (ENABLE_METAL AND APPLE) + include_directories(${CMAKE_SOURCE_DIR}/dependencies/metal-cpp) +endif() + if (ENABLE_DISCORD_RPC) add_compile_definitions(ENABLE_DISCORD_RPC) add_subdirectory(dependencies/discord-rpc EXCLUDE_FROM_ALL) diff --git a/dependencies/metal-cpp b/dependencies/metal-cpp new file mode 160000 index 000000000..a63bd172d --- /dev/null +++ b/dependencies/metal-cpp @@ -0,0 +1 @@ +Subproject commit a63bd172ddcba73a3d87ca32032b66ad41ddb9a6 diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index f7f256445..9da5caa90 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -528,7 +528,16 @@ if(ENABLE_METAL) if(APPLE) target_sources(CemuCafe PRIVATE HW/Latte/Renderer/Metal/MetalRenderer.cpp + HW/Latte/Renderer/Metal/MetalRenderer.h + HW/Latte/Renderer/Metal/MetalCppImpl.cpp + HW/Latte/Renderer/Metal/LatteTextureMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureMtl.h ) + + #target_link_libraries(CemuCafe PRIVATE + # "-framework Metal" + # "-framework QuartzCore" + #) else() message(FATAL_ERROR "Metal is only supported on macOS") endif() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp new file mode 100644 index 000000000..d6a9ac4a1 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -0,0 +1,92 @@ +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +//#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + 
+LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, + Latte::E_HWTILEMODE tileMode, bool isDepth) + : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer) +{ + MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); + sint32 effectiveBaseWidth = width; + sint32 effectiveBaseHeight = height; + sint32 effectiveBaseDepth = depth; + if (overwriteInfo.hasResolutionOverwrite) + { + effectiveBaseWidth = overwriteInfo.width; + effectiveBaseHeight = overwriteInfo.height; + effectiveBaseDepth = overwriteInfo.depth; + } + effectiveBaseDepth = std::max(1, effectiveBaseDepth); + + desc->setWidth(effectiveBaseWidth); + desc->setHeight(effectiveBaseHeight); + desc->setMipmapLevelCount(mipLevels); + + if (dim == Latte::E_DIM::DIM_3D) + { + desc->setDepth(effectiveBaseDepth); + } + else + { + desc->setArrayLength(effectiveBaseDepth); + } + + // TODO: uncomment + //MetalRenderer::FormatInfoMTL texFormatInfo; + //mtlRenderer->GetTextureFormatInfoMTL(format, isDepth, dim, effectiveBaseWidth, effectiveBaseHeight, &texFormatInfo); + //cemu_assert_debug(hasStencil == ((texFormatInfo.vkImageAspect & VK_IMAGE_ASPECT_STENCIL_BIT) != 0)); + //imageInfo.format = texFormatInfo.mtlPixelFormat; + desc->setPixelFormat(MTL::PixelFormatRGBA8Unorm); + + // TODO: is write needed? 
+ MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsageShaderWrite; + // TODO: add more conditions + if (Latte::IsCompressedFormat(format) == false) + { + usage |= MTL::TextureUsageRenderTarget; + } + desc->setUsage(usage); + + if (dim == Latte::E_DIM::DIM_2D) + desc->setTextureType(MTL::TextureType2D); + else if (dim == Latte::E_DIM::DIM_1D) + desc->setTextureType(MTL::TextureType1D); + else if (dim == Latte::E_DIM::DIM_3D) + desc->setTextureType(MTL::TextureType3D); + else if (dim == Latte::E_DIM::DIM_2D_ARRAY) + desc->setTextureType(MTL::TextureType2DArray); + else if (dim == Latte::E_DIM::DIM_CUBEMAP) + desc->setTextureType(MTL::TextureTypeCube); // TODO: is this correct? + else if (dim == Latte::E_DIM::DIM_2D_MSAA) + desc->setTextureType(MTL::TextureType2D); + else + { + cemu_assert_unimplemented(); + } + + m_texture = mtlRenderer->GetDevice()->newTexture(desc); + desc->release(); +} + +LatteTextureMtl::~LatteTextureMtl() +{ + m_texture->release(); +} + +LatteTextureView* LatteTextureMtl::CreateView(Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) +{ + cemu_assert_debug(mipCount > 0); + cemu_assert_debug(sliceCount > 0); + cemu_assert_debug((firstMip + mipCount) <= this->mipLevels); + cemu_assert_debug((firstSlice + sliceCount) <= this->depth); + + //return new LatteTextureViewMtl(m_mtlr, this, dim, format, firstMip, mipCount, firstSlice, sliceCount); + cemuLog_logDebug(LogType::Force, "not implemented"); + + return nullptr; +} + +void LatteTextureMtl::AllocateOnHost() +{ + cemuLog_logDebug(LogType::Force, "not implemented"); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h new file mode 100644 index 000000000..266a69d1a --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h @@ -0,0 +1,33 @@ +#pragma once + +#include + +#include "Cafe/HW/Latte/Core/LatteTexture.h" +#include 
"util/ChunkedHeap/ChunkedHeap.h" + +class LatteTextureMtl : public LatteTexture +{ +public: + LatteTextureMtl(class MetalRenderer* vkRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, + uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth); + + ~LatteTextureMtl(); + + void AllocateOnHost() override; + +protected: + LatteTextureView* CreateView(Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) override; + +public: + uint64 m_vkFlushIndex{}; // used to track read-write dependencies within the same renderpass + + uint64 m_vkFlushIndex_read{}; + uint64 m_vkFlushIndex_write{}; + + uint32 m_collisionCheckIndex{}; // used to track if texture is being both sampled and output to during drawcall + +private: + class MetalRenderer* m_mtlr; + + MTL::Texture* m_texture; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp new file mode 100644 index 000000000..13cd9dd67 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp @@ -0,0 +1,6 @@ +#define NS_PRIVATE_IMPLEMENTATION +#define CA_PRIVATE_IMPLEMENTATION +#define MTL_PRIVATE_IMPLEMENTATION +#include +#include +#include diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 0e0bfb9a0..8e203c50c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1,5 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" + void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { /* const auto& windowInfo = gui_getWindowInfo().window_main; @@ -21,7 +23,8 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { } void 
MetalRenderer::Initialize() { - cemuLog_logDebug(LogType::Force, "not implemented"); + m_device = MTL::CreateSystemDefaultDevice(); + m_commandQueue = m_device->newCommandQueue(); } void MetalRenderer::Shutdown() { @@ -30,10 +33,17 @@ void MetalRenderer::Shutdown() { bool MetalRenderer::IsPadWindowActive() { cemuLog_logDebug(LogType::Force, "not implemented"); + + return false; } bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { cemuLog_logDebug(LogType::Force, "not implemented"); + + usageInMB = 1024; + totalInMB = 1024; + + return false; } void MetalRenderer::ClearColorbuffer(bool padView) { @@ -55,6 +65,8 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput } bool MetalRenderer::BeginFrame(bool mainWindow) { cemuLog_logDebug(LogType::Force, "not implemented"); + + return false; } void MetalRenderer::Flush(bool waitIdle) { @@ -79,6 +91,8 @@ void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, si LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) { cemuLog_logDebug(LogType::Force, "not implemented"); + + return nullptr; } void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) { @@ -91,6 +105,8 @@ void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) { void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) { cemuLog_logDebug(LogType::Force, "not implemented"); + + return nullptr; } void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) { @@ -99,6 +115,8 @@ void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) { TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) { cemuLog_logDebug(LogType::Force, "not implemented"); + + return nullptr; } void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) { @@ -118,7 +136,7 @@ void 
MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) { - cemuLog_logDebug(LogType::Force, "not implemented"); + return new LatteTextureMtl(this, dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth); } void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) { @@ -131,6 +149,8 @@ void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, s LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView) { cemuLog_logDebug(LogType::Force, "not implemented"); + + return nullptr; } void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { @@ -163,6 +183,8 @@ void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool compileAsync, bool isGfxPackSource) { cemuLog_logDebug(LogType::Force, "not implemented"); + + return nullptr; } void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) { @@ -191,6 +213,8 @@ void MetalRenderer::draw_endSequence() { void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { cemuLog_logDebug(LogType::Force, "not implemented"); + + return nullptr; } void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) { diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 017c32fda..dd214a9c5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include + #include "Cafe/HW/Latte/Renderer/Renderer.h" class MetalRenderer : public Renderer @@ -16,6 +20,11 @@ class MetalRenderer : public Renderer return static_cast(g_renderer.get()); } + // Helper functions + MTL::Device* GetDevice() const { + return m_device; + } + void InitializeLayer(const Vector2i& size, bool mainWindow); void Initialize() override; @@ -44,6 +53,8 @@ class MetalRenderer : public Renderer // imgui bool ImguiBegin(bool mainWindow) override { cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + + return false; }; void ImguiEnd() override { @@ -52,6 +63,8 @@ class MetalRenderer : public Renderer ImTextureID GenerateTexture(const std::vector& data, const Vector2i& size) override { cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + + return nullptr; }; void DeleteTexture(ImTextureID id) override { @@ -122,6 +135,8 @@ class MetalRenderer : public Renderer // occlusion queries LatteQueryObject* occlusionQuery_create() override { cemuLog_logDebug(LogType::Force, "Occlusion queries are not yet supported on Metal"); + + return nullptr; } void occlusionQuery_destroy(LatteQueryObject* queryObj) override { @@ -137,6 +152,10 @@ class MetalRenderer : public Renderer } -protected: - //CA::MetalLayer* m_metalLayer; +private: + CA::MetalLayer* m_metalLayer; + + // Metal objects + MTL::Device* m_device; + MTL::CommandQueue* m_commandQueue; }; diff --git a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp index cdbeb3f3a..ab468055c 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp @@ -83,7 +83,7 @@ 
void main(){ const std::string RendererOutputShader::s_hermite_shader_source = R"(#version 420 -in vec4 gl_FragCoord; +in vec4 gl_FragCoord; in vec2 passUV; layout(binding=0) uniform sampler2D textureSrc; uniform vec2 textureSrcResolution; @@ -100,7 +100,7 @@ vec3 CubicHermite (vec3 A, vec3 B, vec3 C, vec3 D, float t) vec3 b = A - (5.0*B)/2.0 + 2.0*C - D / 2.0; vec3 c = -A/2.0 + C/2.0; vec3 d = B; - + return a*t3 + b*t2 + c*t + d; } @@ -108,36 +108,36 @@ vec3 CubicHermite (vec3 A, vec3 B, vec3 C, vec3 D, float t) vec3 BicubicHermiteTexture(vec2 uv, vec4 texelSize) { vec2 pixel = uv*texelSize.zw + 0.5; - vec2 frac = fract(pixel); + vec2 frac = fract(pixel); pixel = floor(pixel) / texelSize.zw - vec2(texelSize.xy/2.0); - + vec4 doubleSize = texelSize*texelSize; vec3 C00 = texture(textureSrc, pixel + vec2(-texelSize.x ,-texelSize.y)).rgb; vec3 C10 = texture(textureSrc, pixel + vec2( 0.0 ,-texelSize.y)).rgb; vec3 C20 = texture(textureSrc, pixel + vec2( texelSize.x ,-texelSize.y)).rgb; vec3 C30 = texture(textureSrc, pixel + vec2( doubleSize.x,-texelSize.y)).rgb; - + vec3 C01 = texture(textureSrc, pixel + vec2(-texelSize.x , 0.0)).rgb; vec3 C11 = texture(textureSrc, pixel + vec2( 0.0 , 0.0)).rgb; vec3 C21 = texture(textureSrc, pixel + vec2( texelSize.x , 0.0)).rgb; - vec3 C31 = texture(textureSrc, pixel + vec2( doubleSize.x, 0.0)).rgb; - + vec3 C31 = texture(textureSrc, pixel + vec2( doubleSize.x, 0.0)).rgb; + vec3 C02 = texture(textureSrc, pixel + vec2(-texelSize.x , texelSize.y)).rgb; vec3 C12 = texture(textureSrc, pixel + vec2( 0.0 , texelSize.y)).rgb; vec3 C22 = texture(textureSrc, pixel + vec2( texelSize.x , texelSize.y)).rgb; - vec3 C32 = texture(textureSrc, pixel + vec2( doubleSize.x, texelSize.y)).rgb; - + vec3 C32 = texture(textureSrc, pixel + vec2( doubleSize.x, texelSize.y)).rgb; + vec3 C03 = texture(textureSrc, pixel + vec2(-texelSize.x , doubleSize.y)).rgb; vec3 C13 = texture(textureSrc, pixel + vec2( 0.0 , doubleSize.y)).rgb; vec3 C23 = texture(textureSrc, 
pixel + vec2( texelSize.x , doubleSize.y)).rgb; - vec3 C33 = texture(textureSrc, pixel + vec2( doubleSize.x, doubleSize.y)).rgb; - + vec3 C33 = texture(textureSrc, pixel + vec2( doubleSize.x, doubleSize.y)).rgb; + vec3 CP0X = CubicHermite(C00, C10, C20, C30, frac.x); vec3 CP1X = CubicHermite(C01, C11, C21, C31, frac.x); vec3 CP2X = CubicHermite(C02, C12, C22, C32, frac.x); vec3 CP3X = CubicHermite(C03, C13, C23, C33, frac.x); - + return CubicHermite(CP0X, CP1X, CP2X, CP3X, frac.y); } @@ -190,7 +190,7 @@ void RendererOutputShader::SetUniformParameters(const LatteTextureView& texture_ float res[2]; // vertex shader if (m_attributes[0].m_loc_texture_src_resolution != -1) - { + { res[0] = (float)texture_view.baseTexture->width; res[1] = (float)texture_view.baseTexture->height; m_vertex_shader->SetUniform2fv(m_attributes[0].m_loc_texture_src_resolution, res, 1); @@ -250,9 +250,9 @@ std::string RendererOutputShader::GetOpenGlVertexSource(bool render_upside_down) R"(#version 400 out vec2 passUV; -out gl_PerVertex -{ - vec4 gl_Position; +out gl_PerVertex +{ + vec4 gl_Position; }; void main(){ @@ -286,7 +286,7 @@ void main(){ vertex_source << R"( passUV = vUV; - gl_Position = vec4(vPos, 0.0, 1.0); + gl_Position = vec4(vPos, 0.0, 1.0); } )"; return vertex_source.str(); @@ -300,9 +300,9 @@ std::string RendererOutputShader::GetVulkanVertexSource(bool render_upside_down) R"(#version 450 layout(location = 0) out vec2 passUV; -out gl_PerVertex -{ - vec4 gl_Position; +out gl_PerVertex +{ + vec4 gl_Position; }; void main(){ @@ -336,7 +336,7 @@ void main(){ vertex_source << R"( passUV = vUV; - gl_Position = vec4(vPos, 0.0, 1.0); + gl_Position = vec4(vPos, 0.0, 1.0); } )"; return vertex_source.str(); @@ -359,7 +359,7 @@ void RendererOutputShader::InitializeStatic() s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source); s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source); } - else + else if (g_renderer->GetType() == 
RendererAPI::Vulkan) { vertex_source = GetVulkanVertexSource(false); vertex_source_ud = GetVulkanVertexSource(true); @@ -372,5 +372,7 @@ void RendererOutputShader::InitializeStatic() s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source); s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source);*/ + } else { + cemuLog_logDebug(LogType::Force, "Output shader not implemented for Metal"); } } From 6db893c44681c9290fef64fc1b62111e9b8a8b24 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 25 Jul 2024 14:05:56 +0200 Subject: [PATCH 003/368] implement texture view & fix: crashes --- src/Cafe/CMakeLists.txt | 2 + src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp | 15 ++--- src/Cafe/HW/Latte/Core/LatteShaderCache.cpp | 8 +-- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 7 +-- .../HW/Latte/Renderer/Metal/LatteTextureMtl.h | 5 +- .../Renderer/Metal/LatteTextureViewMtl.cpp | 56 +++++++++++++++++++ .../Renderer/Metal/LatteTextureViewMtl.h | 21 +++++++ 7 files changed, 97 insertions(+), 17 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 9da5caa90..ce6738d82 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -532,6 +532,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalCppImpl.cpp HW/Latte/Renderer/Metal/LatteTextureMtl.cpp HW/Latte/Renderer/Metal/LatteTextureMtl.h + HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureViewMtl.h ) #target_link_libraries(CemuCafe PRIVATE diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index 60124c02e..f8a53b6d4 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -724,8 +724,8 @@ void LatteRenderTarget_applyTextureColorClear(LatteTexture* 
texture, uint32 slic void LatteRenderTarget_applyTextureDepthClear(LatteTexture* texture, uint32 sliceIndex, uint32 mipIndex, bool hasDepthClear, bool hasStencilClear, float depthValue, uint8 stencilValue, uint64 eventCounter) { - if(texture->isDepth) - { + if(texture->isDepth) + { g_renderer->texture_clearDepthSlice(texture, sliceIndex, mipIndex, hasDepthClear, hasStencilClear, depthValue, stencilValue); } else @@ -884,7 +884,7 @@ void LatteRenderTarget_copyToBackbuffer(LatteTextureView* textureView, bool isPa textureView->baseTexture->GetEffectiveSize(effectiveWidth, effectiveHeight, 0); _currentOutputImageWidth = effectiveWidth; _currentOutputImageHeight = effectiveHeight; - + sint32 imageX, imageY; sint32 imageWidth, imageHeight; sint32 fullscreenWidth, fullscreenHeight; @@ -933,7 +933,7 @@ void LatteRenderTarget_copyToBackbuffer(LatteTextureView* textureView, bool isPa if (shader == nullptr) { sint32 scaling_filter = downscaling ? GetConfig().downscale_filter : GetConfig().upscale_filter; - + if (g_renderer->GetType() == RendererAPI::Vulkan) { // force linear or nearest neighbor filter @@ -978,7 +978,8 @@ void LatteRenderTarget_copyToBackbuffer(LatteTextureView* textureView, bool isPa filter = LatteTextureView::MagFilter::kNearestNeighbor; } } - cemu_assert(shader); + // HACK: comment out the assert + //cemu_assert(shader); g_renderer->DrawBackbufferQuad(textureView, shader, filter==LatteTextureView::MagFilter::kLinear, imageX, imageY, imageWidth, imageHeight, isPadView, clearBackground); g_renderer->HandleScreenshotRequest(textureView, isPadView); if (!g_renderer->ImguiBegin(!isPadView)) @@ -1029,7 +1030,7 @@ void LatteRenderTarget_itHLECopyColorBufferToScanBuffer(MPTR colorBufferPtr, uin { controller = InputManager::instance().get_vpad_controller(1); if (controller && controller->is_screen_active()) - showDRC = true; + showDRC = true; } } @@ -1054,7 +1055,7 @@ void LatteRenderTarget_updateViewport() float vpX = 
LatteGPUState.contextNew.PA_CL_VPORT_XOFFSET.get_OFFSET() - LatteGPUState.contextNew.PA_CL_VPORT_XSCALE.get_SCALE(); float vpHeight = LatteGPUState.contextNew.PA_CL_VPORT_YSCALE.get_SCALE() / -0.5f; float vpY = LatteGPUState.contextNew.PA_CL_VPORT_YOFFSET.get_OFFSET() + LatteGPUState.contextNew.PA_CL_VPORT_YSCALE.get_SCALE(); - + bool halfZ = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF(); // calculate near/far diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index 9576eb2e2..884186509 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -41,7 +41,7 @@ struct sint32 pixelShaderCount; }shaderCacheScreenStats; -struct +struct { ImTextureID textureTVId; ImTextureID textureDRCId; @@ -328,7 +328,7 @@ void LatteShaderCache_Load() }; LatteShaderCache_ShowProgress(LoadShadersUpdate, false); - + LatteShaderCache_updateCompileQueue(0); // write load time and RAM usage to log file (in dev build) #if BOOST_OS_WINDOWS @@ -371,7 +371,7 @@ void LatteShaderCache_ShowProgress(const std::function & loadUpdateF { const auto kPopupFlags = ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoDecoration | ImGuiWindowFlags_NoSavedSettings | ImGuiWindowFlags_NoFocusOnAppearing | ImGuiWindowFlags_NoNav | ImGuiWindowFlags_AlwaysAutoResize; const auto textColor = 0xFF888888; - + auto lastFrameUpdate = tick_cached(); while (true) @@ -793,7 +793,7 @@ void LatteShaderCache_handleDeprecatedCacheFiles(fs::path pathGeneric, fs::path { // ask user if they want to delete or keep the old cache file auto infoMsg = _("Cemu detected that the shader cache for this game is outdated.\nOnly shader caches generated with Cemu 1.25.0 or above are supported.\n\nWe recommend deleting the outdated cache file as it will no longer be used by Cemu."); - + wxMessageDialog dialog(nullptr, infoMsg, _("Outdated shader cache"), wxYES_NO | wxCENTRE | wxICON_EXCLAMATION); diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index d6a9ac4a1..d2c41b737 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -1,5 +1,5 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" -//#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, @@ -80,10 +80,7 @@ LatteTextureView* LatteTextureMtl::CreateView(Latte::E_DIM dim, Latte::E_GX2SURF cemu_assert_debug((firstMip + mipCount) <= this->mipLevels); cemu_assert_debug((firstSlice + sliceCount) <= this->depth); - //return new LatteTextureViewMtl(m_mtlr, this, dim, format, firstMip, mipCount, firstSlice, sliceCount); - cemuLog_logDebug(LogType::Force, "not implemented"); - - return nullptr; + return new LatteTextureViewMtl(m_mtlr, this, dim, format, firstMip, mipCount, firstSlice, sliceCount); } void LatteTextureMtl::AllocateOnHost() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h index 266a69d1a..c659e919f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h @@ -10,9 +10,12 @@ class LatteTextureMtl : public LatteTexture public: LatteTextureMtl(class MetalRenderer* vkRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth); - ~LatteTextureMtl(); + MTL::Texture* GetTexture() const { + return m_texture; + 
} + void AllocateOnHost() override; protected: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp new file mode 100644 index 000000000..3a93e76d8 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -0,0 +1,56 @@ +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) + : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer) +{ + // TODO: don't hardcode the format + MTL::PixelFormat pixelFormat = MTL::PixelFormatRGBA8Unorm; + MTL::TextureType textureType; + switch (dim) + { + case Latte::E_DIM::DIM_1D: + textureType = MTL::TextureType1D; + case Latte::E_DIM::DIM_2D: + case Latte::E_DIM::DIM_2D_MSAA: + textureType = MTL::TextureType2D; + case Latte::E_DIM::DIM_2D_ARRAY: + textureType = MTL::TextureType2DArray; + case Latte::E_DIM::DIM_3D: + textureType = MTL::TextureType3D; + case Latte::E_DIM::DIM_CUBEMAP: + textureType = MTL::TextureTypeCube; // TODO: check this + default: + cemu_assert_unimplemented(); + textureType = MTL::TextureType2D; + } + + uint32 baseLevel = firstMip; + uint32 levelCount = this->numMip; + uint32 baseLayer; + uint32 layerCount; + // TODO: check if base texture is 3D texture as well + if (textureType == MTL::TextureType3D) + { + cemu_assert_debug(firstMip == 0); + // TODO: uncomment + //cemu_assert_debug(this->numSlice == baseTexture->depth); + baseLayer = 0; + layerCount = 1; + } + else + { + baseLayer = firstSlice; + layerCount = this->numSlice; + } + + // TODO: swizzle + + m_texture = texture->GetTexture()->newTextureView(pixelFormat, textureType, 
NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount)); +} + +LatteTextureViewMtl::~LatteTextureViewMtl() +{ + m_texture->release(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h new file mode 100644 index 000000000..c5a21b126 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "Cafe/HW/Latte/Core/LatteTexture.h" + +class LatteTextureViewMtl : public LatteTextureView +{ +public: + LatteTextureViewMtl(class MetalRenderer* mtlRenderer, class LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount); + ~LatteTextureViewMtl(); + + MTL::Texture* GetTexture() const { + return m_texture; + } + +private: + class MetalRenderer* m_mtlr; + + MTL::Texture* m_texture; +}; From 9b127be38daca7262cde3af84dc9d4ed57bde404 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 25 Jul 2024 16:37:38 +0200 Subject: [PATCH 004/368] create and present drawable --- src/Cafe/CMakeLists.txt | 7 +- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 1 + src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h | 3 + .../HW/Latte/Renderer/Metal/MetalLayer.mm | 16 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 180 ++++++++++++------ .../HW/Latte/Renderer/Metal/MetalRenderer.h | 3 +- src/Cafe/HW/Latte/Renderer/MetalView.h | 7 + src/Cafe/HW/Latte/Renderer/MetalView.mm | 26 +++ .../HW/Latte/Renderer/Vulkan/CocoaSurface.mm | 31 +-- 9 files changed, 180 insertions(+), 94 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm create mode 100644 src/Cafe/HW/Latte/Renderer/MetalView.h create mode 100644 src/Cafe/HW/Latte/Renderer/MetalView.mm diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index ce6738d82..379b9857c 100644 --- 
a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -521,7 +521,10 @@ add_library(CemuCafe ) if(APPLE) - target_sources(CemuCafe PRIVATE "HW/Latte/Renderer/Vulkan/CocoaSurface.mm") + target_sources(CemuCafe PRIVATE + HW/Latte/Renderer/Vulkan/CocoaSurface.mm + HW/Latte/Renderer/MetalView.mm + ) endif() if(ENABLE_METAL) @@ -530,6 +533,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalRenderer.cpp HW/Latte/Renderer/Metal/MetalRenderer.h HW/Latte/Renderer/Metal/MetalCppImpl.cpp + HW/Latte/Renderer/Metal/MetalLayer.mm + HW/Latte/Renderer/Metal/MetalLayer.h HW/Latte/Renderer/Metal/LatteTextureMtl.cpp HW/Latte/Renderer/Metal/LatteTextureMtl.h HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index d2c41b737..4e053821e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -65,6 +65,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM } m_texture = mtlRenderer->GetDevice()->newTexture(desc); + desc->release(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h new file mode 100644 index 000000000..56a302246 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h @@ -0,0 +1,3 @@ +#pragma once + +void* CreateMetalLayer(void* handle); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm new file mode 100644 index 000000000..8ce3202ed --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm @@ -0,0 +1,16 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" + +#include "Cafe/HW/Latte/Renderer/MetalView.h" + +void* CreateMetalLayer(void* handle) +{ + NSView* view = (NSView*)handle; + + MetalView* childView = [[MetalView alloc] initWithFrame:view.bounds]; + childView.autoresizingMask = NSViewWidthSizable | 
NSViewHeightSizable; + childView.wantsLayer = YES; + + [view addSubview:childView]; + + return childView.layer; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 8e203c50c..cdd3b05cd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1,43 +1,48 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" - +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" -void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { - /* - const auto& windowInfo = gui_getWindowInfo().window_main; +#include "gui/guiWrapper.h" - NSView* view = (NS::View*)handle; +MetalRenderer::MetalRenderer() +{ + m_device = MTL::CreateSystemDefaultDevice(); + m_commandQueue = m_device->newCommandQueue(); +} - MetalView* childView = [[MetalView alloc] initWithFrame:view.bounds]; - childView.autoresizingMask = NSViewWidthSizable | NSViewHeightSizable; - childView.wantsLayer = YES; +MetalRenderer::~MetalRenderer() +{ + m_commandQueue->release(); + m_device->release(); +} - [view addSubview:childView]; +// TODO: don't ignore "mainWindow" argument +void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) +{ + const auto& windowInfo = gui_getWindowInfo().window_main; - VkMetalSurfaceCreateInfoEXT surface; - surface.sType = VK_STRUCTURE_TYPE_METAL_SURFACE_CREATE_INFO_EXT; - surface.pNext = NULL; - surface.flags = 0; - surface.pLayer = (CAMetalLayer*)childView.layer; - */ + m_metalLayer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle); + m_metalLayer->setDevice(m_device); } -void MetalRenderer::Initialize() { - m_device = MTL::CreateSystemDefaultDevice(); - m_commandQueue = m_device->newCommandQueue(); +void MetalRenderer::Initialize() +{ } -void MetalRenderer::Shutdown() { +void MetalRenderer::Shutdown() +{ cemuLog_logDebug(LogType::Force, "not implemented"); 
} -bool MetalRenderer::IsPadWindowActive() { +bool MetalRenderer::IsPadWindowActive() +{ cemuLog_logDebug(LogType::Force, "not implemented"); return false; } -bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { +bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const +{ cemuLog_logDebug(LogType::Force, "not implemented"); usageInMB = 1024; @@ -46,177 +51,228 @@ bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { return false; } -void MetalRenderer::ClearColorbuffer(bool padView) { +void MetalRenderer::ClearColorbuffer(bool padView) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::DrawEmptyFrame(bool mainWindow) { +void MetalRenderer::DrawEmptyFrame(bool mainWindow) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) { - cemuLog_logDebug(LogType::Force, "not implemented"); +void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) +{ + CA::MetalDrawable* drawable = m_metalLayer->nextDrawable(); + if (!drawable) + { + return; + } + + MTL::CommandBuffer* commandBuffer = m_commandQueue->commandBuffer(); + commandBuffer->presentDrawable(drawable); + commandBuffer->commit(); + + commandBuffer->release(); } void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, - bool padView, bool clearBackground) { + bool padView, bool clearBackground) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -bool MetalRenderer::BeginFrame(bool mainWindow) { + +bool MetalRenderer::BeginFrame(bool mainWindow) +{ cemuLog_logDebug(LogType::Force, "not implemented"); return false; } -void MetalRenderer::Flush(bool waitIdle) { +void MetalRenderer::Flush(bool waitIdle) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::NotifyLatteCommandProcessorIdle() { +void 
MetalRenderer::NotifyLatteCommandProcessorIdle() +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::AppendOverlayDebugInfo() { +void MetalRenderer::AppendOverlayDebugInfo() +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) { +void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) { +void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) { +LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) +{ cemuLog_logDebug(LogType::Force, "not implemented"); return nullptr; } -void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) { +void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) { +void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) { +void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) +{ cemuLog_logDebug(LogType::Force, "not implemented"); return nullptr; } -void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) { +void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -TextureDecoder* 
MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) { +TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) +{ cemuLog_logDebug(LogType::Force, "not implemented"); return nullptr; } -void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) { +void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { +void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { +void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) { +void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 
height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) { +LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) +{ return new LatteTextureMtl(this, dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth); } -void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) { +void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) { +void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView) { +LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView) +{ cemuLog_logDebug(LogType::Force, "not implemented"); return nullptr; } -void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { +void 
MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::bufferCache_init(const sint32 bufferSize) { +void MetalRenderer::bufferCache_init(const sint32 bufferSize) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) { +void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) { +void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { +void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { +void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { +void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool compileAsync, bool isGfxPackSource) { +RendererShader* 
MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool compileAsync, bool isGfxPackSource) +{ cemuLog_logDebug(LogType::Force, "not implemented"); return nullptr; } -void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) { +void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::streamout_begin() { +void MetalRenderer::streamout_begin() +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::streamout_rendererFinishDrawcall() { +void MetalRenderer::streamout_rendererFinishDrawcall() +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::draw_beginSequence() { +void MetalRenderer::draw_beginSequence() +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { +void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void MetalRenderer::draw_endSequence() { +void MetalRenderer::draw_endSequence() +{ cemuLog_logDebug(LogType::Force, "not implemented"); } -void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { +void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) +{ cemuLog_logDebug(LogType::Force, "not implemented"); return nullptr; } -void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) { +void 
MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) +{ cemuLog_logDebug(LogType::Force, "not implemented"); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index dd214a9c5..4cc06f571 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -9,7 +9,8 @@ class MetalRenderer : public Renderer { public: - ~MetalRenderer() = default; + MetalRenderer(); + ~MetalRenderer() override; RendererAPI GetType() override { diff --git a/src/Cafe/HW/Latte/Renderer/MetalView.h b/src/Cafe/HW/Latte/Renderer/MetalView.h new file mode 100644 index 000000000..43e5c7b3f --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/MetalView.h @@ -0,0 +1,7 @@ +#pragma once + +#import +#import + +@interface MetalView : NSView +@end diff --git a/src/Cafe/HW/Latte/Renderer/MetalView.mm b/src/Cafe/HW/Latte/Renderer/MetalView.mm new file mode 100644 index 000000000..5ca17b5ef --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/MetalView.mm @@ -0,0 +1,26 @@ +#include "Cafe/HW/Latte/Renderer/MetalView.h" + +@implementation MetalView + +-(BOOL) wantsUpdateLayer { return YES; } + ++(Class) layerClass { return [CAMetalLayer class]; } + +// copied from https://github.com/KhronosGroup/MoltenVK/blob/master/Demos/Cube/macOS/DemoViewController.m + +-(CALayer*) makeBackingLayer +{ + CALayer* layer = [self.class.layerClass layer]; + CGSize viewScale = [self convertSizeToBacking: CGSizeMake(1.0, 1.0)]; + layer.contentsScale = MIN(viewScale.width, viewScale.height); + return layer; +} + +-(BOOL) layer: (CALayer *)layer shouldInheritContentsScale: (CGFloat)newScale fromWindow: (NSWindow *)window +{ + if (newScale == layer.contentsScale) { return NO; } + + layer.contentsScale = newScale; + return YES; +} +@end diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm b/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm index 731a6a267..a68174c93 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm @@ -1,36 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" -#import -#import - -@interface MetalView : NSView -@end - -@implementation MetalView - --(BOOL) wantsUpdateLayer { return YES; } - -+(Class) layerClass { return [CAMetalLayer class]; } - -// copied from https://github.com/KhronosGroup/MoltenVK/blob/master/Demos/Cube/macOS/DemoViewController.m - --(CALayer*) makeBackingLayer -{ - CALayer* layer = [self.class.layerClass layer]; - CGSize viewScale = [self convertSizeToBacking: CGSizeMake(1.0, 1.0)]; - layer.contentsScale = MIN(viewScale.width, viewScale.height); - return layer; -} - --(BOOL) layer: (CALayer *)layer shouldInheritContentsScale: (CGFloat)newScale fromWindow: (NSWindow *)window -{ - if (newScale == layer.contentsScale) { return NO; } - - layer.contentsScale = newScale; - return YES; -} -@end +#include "Cafe/HW/Latte/Renderer/MetalView.h" VkSurfaceKHR CreateCocoaSurface(VkInstance instance, void* handle) { From 46981d7b0368d59a05f1b92fdc4293267c688b3b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 26 Jul 2024 08:51:27 +0200 Subject: [PATCH 005/368] implement pixel formats and texture copy --- src/Cafe/CMakeLists.txt | 3 + .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 15 ++- .../HW/Latte/Renderer/Metal/LatteTextureMtl.h | 7 ++ .../Renderer/Metal/LatteTextureViewMtl.cpp | 2 +- .../Renderer/Metal/LatteTextureViewMtl.h | 6 ++ .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 97 +++++++++++++++++++ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 22 +++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 18 ++-- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 2 + 9 files changed, 157 insertions(+), 15 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h diff --git a/src/Cafe/CMakeLists.txt 
b/src/Cafe/CMakeLists.txt index 379b9857c..d4446652a 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -524,6 +524,7 @@ if(APPLE) target_sources(CemuCafe PRIVATE HW/Latte/Renderer/Vulkan/CocoaSurface.mm HW/Latte/Renderer/MetalView.mm + HW/Latte/Renderer/MetalView.h ) endif() @@ -535,6 +536,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalCppImpl.cpp HW/Latte/Renderer/Metal/MetalLayer.mm HW/Latte/Renderer/Metal/MetalLayer.h + HW/Latte/Renderer/Metal/LatteToMtl.cpp + HW/Latte/Renderer/Metal/LatteToMtl.h HW/Latte/Renderer/Metal/LatteTextureMtl.cpp HW/Latte/Renderer/Metal/LatteTextureMtl.h HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 4e053821e..1b140590d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -1,12 +1,15 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) - : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer) + : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer), m_format(format) { MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); + desc->setStorageMode(MTL::StorageModeShared); // TODO: use private? 
+ sint32 effectiveBaseWidth = width; sint32 effectiveBaseHeight = height; sint32 effectiveBaseDepth = depth; @@ -31,17 +34,13 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM desc->setArrayLength(effectiveBaseDepth); } - // TODO: uncomment - //MetalRenderer::FormatInfoMTL texFormatInfo; - //mtlRenderer->GetTextureFormatInfoMTL(format, isDepth, dim, effectiveBaseWidth, effectiveBaseHeight, &texFormatInfo); - //cemu_assert_debug(hasStencil == ((texFormatInfo.vkImageAspect & VK_IMAGE_ASPECT_STENCIL_BIT) != 0)); - //imageInfo.format = texFormatInfo.mtlPixelFormat; - desc->setPixelFormat(MTL::PixelFormatRGBA8Unorm); + auto formatInfo = GetMtlPixelFormatInfo(format); + desc->setPixelFormat(formatInfo.pixelFormat); // TODO: is write needed? MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsageShaderWrite; // TODO: add more conditions - if (Latte::IsCompressedFormat(format) == false) + if (!Latte::IsCompressedFormat(format)) { usage |= MTL::TextureUsageRenderTarget; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h index c659e919f..cc08d4690 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h @@ -3,6 +3,7 @@ #include #include "Cafe/HW/Latte/Core/LatteTexture.h" +#include "HW/Latte/ISA/LatteReg.h" #include "util/ChunkedHeap/ChunkedHeap.h" class LatteTextureMtl : public LatteTexture @@ -16,6 +17,10 @@ class LatteTextureMtl : public LatteTexture return m_texture; } + Latte::E_GX2SURFFMT GetFormat() const { + return m_format; + } + void AllocateOnHost() override; protected: @@ -33,4 +38,6 @@ class LatteTextureMtl : public LatteTexture class MetalRenderer* m_mtlr; MTL::Texture* m_texture; + + Latte::E_GX2SURFFMT m_format; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 3a93e76d8..c8df8cf62 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -3,7 +3,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) - : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer) + : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_format(format) { // TODO: don't hardcode the format MTL::PixelFormat pixelFormat = MTL::PixelFormatRGBA8Unorm; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h index c5a21b126..7df74b4f0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h @@ -14,8 +14,14 @@ class LatteTextureViewMtl : public LatteTextureView return m_texture; } + Latte::E_GX2SURFFMT GetFormat() const { + return m_format; + } + private: class MetalRenderer* m_mtlr; MTL::Texture* m_texture; + + Latte::E_GX2SURFFMT m_format; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp new file mode 100644 index 000000000..a1d398bdc --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -0,0 +1,97 @@ +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Common/precompiled.h" + +std::map MTL_FORMAT_TABLE = {{ + {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, 2}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, 2}}, + {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, 1}}, + {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, 1}}, + {Latte::E_GX2SURFFMT::R8_UINT, {MTL::PixelFormatR8Uint, 1}}, + {Latte::E_GX2SURFFMT::R8_SINT, {MTL::PixelFormatR8Sint, 1}}, + {Latte::E_GX2SURFFMT::R8_G8_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SNORM, {MTL::PixelFormatRG8Snorm, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_UINT, {MTL::PixelFormatRG8Uint, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SINT, {MTL::PixelFormatRG8Sint, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM, {MTL::PixelFormatRGBA8Unorm, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM, {MTL::PixelFormatRGBA8Snorm, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT, {MTL::PixelFormatRGBA8Uint, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, 4}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, 2}}, + {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, 2}}, + {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, 2}}, + {Latte::E_GX2SURFFMT::R16_SINT, {MTL::PixelFormatR16Sint, 2}}, + {Latte::E_GX2SURFFMT::R16_FLOAT, {MTL::PixelFormatR16Float, 2}}, + {Latte::E_GX2SURFFMT::R16_G16_UNORM, {MTL::PixelFormatRG16Unorm, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SNORM, {MTL::PixelFormatRG16Snorm, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_UINT, {MTL::PixelFormatRG16Uint, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SINT, {MTL::PixelFormatRG16Sint, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_FLOAT, {MTL::PixelFormatRG16Float, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM, {MTL::PixelFormatRGBA16Unorm, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM, {MTL::PixelFormatRGBA16Snorm, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, 8}}, + {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, 4}}, + {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, 4}}, + {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, 4}}, + {Latte::E_GX2SURFFMT::R32_FLOAT, {MTL::PixelFormatR32Float, 4}}, + {Latte::E_GX2SURFFMT::R32_G32_UINT, {MTL::PixelFormatRG32Uint, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_SINT, {MTL::PixelFormatRG32Sint, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_FLOAT, {MTL::PixelFormatRG32Float, 8}}, + 
{Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, 16}}, + {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, 4}}, // TODO: not supported on Apple silicon, maybe find something else + {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 5}}, + {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, 2}}, + {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, 4}}, + {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_SRGB, {MTL::PixelFormatBC2_RGBA_sRGB, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_UNORM, {MTL::PixelFormatBC3_RGBA, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_SRGB, {MTL::PixelFormatBC3_RGBA_sRGB, 16, {4, 4}}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::BC4_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::BC4_SNORM, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::BC5_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatInvalid, 0}}, // TODO +}}; + +const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format) { + cemu_assert_debug(static_cast(format) < MTL_FORMAT_TABLE.size()); + + return MTL_FORMAT_TABLE[format]; +} + +inline uint32 CeilDivide(uint32 a, uint32 b) { + return (a + b - 1) / b; +} + +size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, uint32 width) { + const auto& formatInfo = GetMtlPixelFormatInfo(format); + + return CeilDivide(width, formatInfo.blockTexelSize.x) * formatInfo.bytesPerBlock; +} + +size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, uint32 height, size_t bytesPerRow) { + const auto& formatInfo = GetMtlPixelFormatInfo(format); + + return CeilDivide(height, formatInfo.blockTexelSize.y) * bytesPerRow; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h new file mode 100644 index 000000000..f8b1ee037 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "HW/Latte/ISA/LatteReg.h" + +struct Uvec2 { + uint32 x; + uint32 y; +}; + +struct MtlPixelFormatInfo { + MTL::PixelFormat pixelFormat; + size_t bytesPerBlock; + Uvec2 blockTexelSize = {1, 1}; +}; + +const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format); + +size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, uint32 width); + +size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, uint32 height, size_t bytesPerRow); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index cdd3b05cd..c49035dd6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "gui/guiWrapper.h" @@ -69,11 +70,11 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) return; } - MTL::CommandBuffer* commandBuffer = m_commandQueue->commandBuffer(); - commandBuffer->presentDrawable(drawable); - commandBuffer->commit(); + m_commandBuffer->presentDrawable(drawable); + m_commandBuffer->commit(); - commandBuffer->release(); + m_commandBuffer->release(); + m_commandBuffer = nullptr; } void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, @@ -85,8 +86,9 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput bool MetalRenderer::BeginFrame(bool mainWindow) { - cemuLog_logDebug(LogType::Force, "not implemented"); + m_commandBuffer = m_commandQueue->commandBuffer(); + // TODO return false; } @@ -158,7 +160,11 @@ void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIn void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { - cemuLog_logDebug(LogType::Force, "not implemented"); + auto mtlTexture = (LatteTextureMtl*)hostTexture; + + size_t bytesPerRow = GetMtlTextureBytesPerRow(mtlTexture->GetFormat(), width); + size_t bytesPerImage = GetMtlTextureBytesPerImage(mtlTexture->GetFormat(), height, bytesPerRow); + mtlTexture->GetTexture()->replaceRegion(MTL::Region(0, 0, width, height), mipIndex, sliceIndex, pixelData, bytesPerRow, bytesPerImage); } void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 4cc06f571..49e11be92 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -159,4 +159,6 @@ class MetalRenderer : public Renderer // Metal objects MTL::Device* m_device; MTL::CommandQueue* m_commandQueue; + + MTL::CommandBuffer* m_commandBuffer = nullptr; }; From 69597166f332579d88b76c4b452387051ac1de18 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 26 Jul 2024 11:42:18 +0200 Subject: [PATCH 006/368] start the shader decompiler --- src/Cafe/CMakeLists.txt | 3 + .../LatteDecompiler.cpp | 22 +- .../LatteDecompilerEmitMSL.cpp | 4127 +++++++++++++++++ .../LatteDecompilerEmitMSLAttrDecoder.cpp | 508 ++ .../LatteDecompilerEmitMSLHeader.hpp | 426 ++ .../LatteDecompilerInternal.h | 7 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 + 7 files changed, 5088 insertions(+), 8 deletions(-) create mode 100644 src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp create mode 100644 src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp create mode 100644 src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index d4446652a..fa3c6ff94 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -151,6 +151,9 @@ add_library(CemuCafe HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLAttrDecoder.cpp HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSL.cpp HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLHeader.hpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h 
HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp index c3f7c19e0..5f0d7fb25 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp @@ -12,6 +12,8 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "util/helpers/helpers.h" +// TODO: remove this include +#include "util/helpers/StringBuf.h" // parse instruction and if valid append it to instructionList bool LatteDecompiler_ParseCFInstruction(LatteDecompilerShaderContext* shaderContext, uint32 cfIndex, uint32 cfWord0, uint32 cfWord1, bool* endOfProgram, std::vector& instructionList) @@ -323,8 +325,8 @@ bool LatteDecompiler_IsALUTransInstruction(bool isOP3, uint32 opcode) } else if( opcode == ALU_OP2_INST_MOV || opcode == ALU_OP2_INST_ADD || - opcode == ALU_OP2_INST_NOP || - opcode == ALU_OP2_INST_MUL || + opcode == ALU_OP2_INST_NOP || + opcode == ALU_OP2_INST_MUL || opcode == ALU_OP2_INST_DOT4 || opcode == ALU_OP2_INST_DOT4_IEEE || opcode == ALU_OP2_INST_MAX || // Not sure if MIN/MAX are non-transcendental? 
@@ -927,7 +929,7 @@ void LatteDecompiler_ParseTEXClause(LatteDecompilerShader* shaderContext, LatteD texInstruction.memRead.format = dataFormat; texInstruction.memRead.nfa = nfa; texInstruction.memRead.isSigned = isSigned; - + cfInstruction->instructionsTEX.emplace_back(texInstruction); } else @@ -1066,9 +1068,19 @@ void _LatteDecompiler_Process(LatteDecompilerShaderContext* shaderContext, uint8 LatteDecompiler_analyzeDataTypes(shaderContext); // emit code if (shaderContext->shader->hasError == false) - LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); + { + if (g_renderer->GetType() == RendererAPI::Metal) + { + LatteDecompiler_emitMSLShader(shaderContext, shaderContext->shader); + // HACK + std::cout << shaderContext->shaderSource->c_str() << std::endl; + } else + { + LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); + } + } LatteDecompiler_cleanup(shaderContext); - // fast access + // fast access _LatteDecompiler_GenerateDataForFastAccess(shaderContext->shader); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp new file mode 100644 index 000000000..943f18401 --- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -0,0 +1,4127 @@ +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShaderAssembly.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency +#include "Cafe/HW/Latte/Core/Latte.h" +#include "Cafe/HW/Latte/Core/LatteDraw.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include 
"config/ActiveSettings.h" +#include "util/helpers/StringBuf.h" + +#include +#include + +#define _CRLF "\r\n" + +void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext, StringBuf* src, LatteParsedFetchShaderAttribute_t* attrib); + +/* + * Variable names: + * R0-R127 temp + * Most variables are multi-typed and the respective type is appended to the name + * Type suffixes are: f (float), i (32bit int), ui (unsigned 32bit int) + * Examples: R13ui.x, tempf.z + */ + +// local prototypes +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType); +void _emitTypeConversionSuffixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType); +void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, bool isSubroutine); + +static const char* _getElementStrByIndex(uint32 channel) +{ + switch (channel) + { + case 0: + return "x"; + case 1: + return "y"; + case 2: + return "z"; + case 3: + return "w"; + } + return "UNDEFINED"; +} + +static char _tempGenString[64][256]; +static uint32 _tempGenStringIndex = 0; + +static char* _getTempString() +{ + char* str = _tempGenString[_tempGenStringIndex]; + _tempGenStringIndex = (_tempGenStringIndex+1)%64; + return str; +} + +static char* _getActiveMaskVarName(LatteDecompilerShaderContext* shaderContext, sint32 index) +{ + char* varName = _getTempString(); + if (shaderContext->isSubroutine) + sprintf(varName, "activeMaskStackSub%04x[%d]", shaderContext->subroutineInfo->cfAddr, index); + else + sprintf(varName, "activeMaskStack[%d]", index); + return varName; +} + +static char* _getActiveMaskCVarName(LatteDecompilerShaderContext* shaderContext, sint32 index) +{ + char* varName = _getTempString(); + if (shaderContext->isSubroutine) + sprintf(varName, "activeMaskStackCSub%04x[%d]", shaderContext->subroutineInfo->cfAddr, index); + else + sprintf(varName, 
"activeMaskStackC[%d]", index); + return varName; +} + +static char* _getRegisterVarName(LatteDecompilerShaderContext* shaderContext, uint32 index, sint32 destRelIndexMode=-1) +{ + auto type = shaderContext->typeTracker.defaultDataType; + char* tempStr = _getTempString(); + if (shaderContext->typeTracker.useArrayGPRs == false) + { + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + sprintf(tempStr, "R%di", index); + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + sprintf(tempStr, "R%df", index); + } + else + { + char destRelOffset[32]; + if (destRelIndexMode >= 0) + { + if (destRelIndexMode == GPU7_INDEX_AR_X) + strcpy(destRelOffset, "ARi.x"); + else if (destRelIndexMode == GPU7_INDEX_AR_Y) + strcpy(destRelOffset, "ARi.y"); + else if (destRelIndexMode == GPU7_INDEX_AR_Z) + strcpy(destRelOffset, "ARi.z"); + else if (destRelIndexMode == GPU7_INDEX_AR_W) + strcpy(destRelOffset, "ARi.w"); + else + debugBreakpoint(); + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + sprintf(tempStr, "Ri[%d+%s]", index, destRelOffset); + } + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + { + sprintf(tempStr, "Rf[%d+%s]", index, destRelOffset); + } + } + else + { + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + sprintf(tempStr, "Ri[%d]", index); + } + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + { + sprintf(tempStr, "Rf[%d]", index); + } + } + } + return tempStr; +} + +static void _appendRegisterTypeSuffix(StringBuf* src, sint32 dataType) +{ + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("i"); + else if (dataType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("ui"); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add("f"); + else + cemu_assert_unimplemented(); +} + +// appends x/y/z/w +static void _appendChannel(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add("x"); + return; + case 1: + src->add("y"); + return; + case 2: + src->add("z"); + 
return; + case 3: + src->add("w"); + return; + } +} + +// appends .x/.y/.z/.w +static void _appendChannelAccess(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add(".x"); + return; + case 1: + src->add(".y"); + return; + case 2: + src->add(".z"); + return; + case 3: + src->add(".w"); + return; + } +} + +static void _appendPVPS(LatteDecompilerShaderContext* shaderContext, StringBuf* src, uint32 groupIndex, uint8 aluUnit) +{ + cemu_assert_debug(aluUnit < 5); + if (aluUnit == 4) + { + src->addFmt("PS{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + return; + } + src->addFmt("PV{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + _appendChannel(src, aluUnit); +} + +std::string _FormatFloatAsConstant(float f) +{ + char floatAsStr[64]; + size_t floatAsStrLen = fmt::format_to_n(floatAsStr, 64, "{:#}", f).size; + size_t floatAsStrLenOrg = floatAsStrLen; + if(floatAsStrLen > 0 && floatAsStr[floatAsStrLen-1] == '.') + { + floatAsStr[floatAsStrLen] = '0'; + floatAsStrLen++; + } + cemu_assert(floatAsStrLen < 50); // constant suspiciously long? 
+ floatAsStr[floatAsStrLen] = '\0'; + cemu_assert_debug(floatAsStrLen >= 3); // shortest possible form is "0.0" + return floatAsStr; +} + +// tracks PV/PS and register backups +struct ALUClauseTemporariesState +{ + struct PVPSAlias + { + enum class LOCATION_TYPE : uint8 + { + LOCATION_NONE, + LOCATION_GPR, + LOCATION_PVPS, + }; + + LOCATION_TYPE location{ LOCATION_TYPE::LOCATION_NONE }; + uint8 index; // GPR index or temporary index + uint8 aluUnit; // x,y,z,w (or 5 for PS) + + void SetLocationGPR(uint8 gprIndex, uint8 channel) + { + cemu_assert_debug(channel < 4); + this->location = LOCATION_TYPE::LOCATION_GPR; + this->index = gprIndex; + this->aluUnit = channel; + } + + void SetLocationPSPVTemporary(uint8 aluUnit, uint32 groupIndex) + { + cemu_assert_debug(aluUnit < 5); + this->location = LOCATION_TYPE::LOCATION_PVPS; + this->index = groupIndex & 1; + this->aluUnit = aluUnit; + } + }; + + struct GPRTemporary + { + GPRTemporary(uint8 gprIndex, uint8 channel, uint8 backupVarIndex) : gprIndex(gprIndex), channel(channel), backupVarIndex(backupVarIndex) {} + + uint8 gprIndex; + uint8 channel; + uint8 backupVarIndex; + }; + + void TrackGroupOutputPVPS(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstr, size_t numInstr) + { + // unset current + for (auto& it : m_pvps) + it.location = PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + for (size_t i = 0; i < numInstr; i++) + { + LatteDecompilerALUInstruction& inst = aluInstr[i]; + if (!inst.isOP3 && inst.opcode == ALU_OP2_INST_NOP) + continue; // skip NOP instruction + + if (inst.writeMask == 0) + { + // map to temporary + m_pvps[inst.aluUnit].SetLocationPSPVTemporary(inst.aluUnit, aluInstr->instructionGroupIndex); + } + else + { + // map to GPR + if(inst.destRel == 0) // is PV/PS set for indexed writes? 
+ m_pvps[inst.aluUnit].SetLocationGPR(inst.destGpr, inst.destElem); + } + } + } + + bool HasPVPS(uint8 aluUnitIndex) const + { + cemu_assert_debug(aluUnitIndex < 5); + return m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + } + + void EmitPVPSAccess(LatteDecompilerShaderContext* shaderContext, uint8 aluUnitIndex, uint32 currentGroupIndex) const + { + switch (m_pvps[aluUnitIndex].location) + { + case PVPSAlias::LOCATION_TYPE::LOCATION_GPR: + { + sint32 temporaryIndex = GetTemporaryForGPR(m_pvps[aluUnitIndex].index, m_pvps[aluUnitIndex].aluUnit); + if (temporaryIndex < 0) + { + shaderContext->shaderSource->add(_getRegisterVarName(shaderContext, m_pvps[aluUnitIndex].index, -1)); + _appendChannelAccess(shaderContext->shaderSource, m_pvps[aluUnitIndex].aluUnit); + } + else + { + // use temporary instead of GPR + shaderContext->shaderSource->addFmt("backupReg{}", temporaryIndex); + _appendRegisterTypeSuffix(shaderContext->shaderSource, shaderContext->typeTracker.defaultDataType); + } + break; + } + case PVPSAlias::LOCATION_TYPE::LOCATION_PVPS: + _appendPVPS(shaderContext, shaderContext->shaderSource, currentGroupIndex-1, m_pvps[aluUnitIndex].aluUnit); + break; + default: + cemuLog_log(LogType::Force, "Shader {:016x} accesses PV/PS without writing to it", shaderContext->shaderBaseHash); + cemu_assert_suspicious(); + break; + } + } + + /* + * Check for GPR channels which are modified before they are read within the same group + * These registers need to be copied to a temporary + */ + void CreateGPRTemporaries(LatteDecompilerShaderContext* shaderContext, std::span aluInstructions) + { + uint8 registerChannelWriteMask[(LATTE_NUM_GPR * 4 + 7) / 8] = { 0 }; + + m_gprTemporaries.clear(); + for (auto& aluInstruction : aluInstructions) + { + // ignore NOP instructions + if (aluInstruction.isOP3 == false && aluInstruction.opcode == ALU_OP2_INST_NOP) + continue; + cemu_assert_debug(aluInstruction.destElem <= 3); + // check if any previously written 
register is read + for (sint32 f = 0; f < 3; f++) + { + uint32 readGPRIndex; + uint32 readGPRChannel; + if (GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel)) + { + readGPRIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction.sourceOperand[f].sel); + cemu_assert_debug(aluInstruction.sourceOperand[f].chan <= 3); + readGPRChannel = aluInstruction.sourceOperand[f].chan; + } + else if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel) || GPU7_ALU_SRC_IS_PS(aluInstruction.sourceOperand[f].sel)) + { + uint8 aluUnitIndex = 0; + if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel)) + aluUnitIndex = aluInstruction.sourceOperand[f].chan; + else + aluUnitIndex = 4; + // if aliased to a GPR, then consider it a GPR read + if(m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_GPR) + continue; + readGPRIndex = m_pvps[aluUnitIndex].index; + readGPRChannel = m_pvps[aluUnitIndex].aluUnit; + } + else + continue; + // track GPR read + if ((registerChannelWriteMask[(readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) / 8] & (1 << ((readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) % 8))) != 0) + { + // register is overwritten by previous instruction, a temporary variable is required + if (GetTemporaryForGPR(readGPRIndex, readGPRChannel) < 0) + m_gprTemporaries.emplace_back(readGPRIndex, readGPRChannel, m_gprTemporaries.size()); + } + } + // track write + if (aluInstruction.writeMask != 0) + registerChannelWriteMask[(aluInstruction.destGpr * 4 + aluInstruction.destElem) / 8] |= (1 << ((aluInstruction.destGpr * 4 + aluInstruction.destElem) % 8)); + } + // output code to move GPRs into temporaries + StringBuf* src = shaderContext->shaderSource; + for (auto& it : m_gprTemporaries) + { + src->addFmt("backupReg{}", it.backupVarIndex); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + src->add(" = "); + src->add(_getRegisterVarName(shaderContext, it.gprIndex)); + _appendChannelAccess(src, it.channel); + src->add(";" 
_CRLF); + } + } + + // returns -1 if none present + sint32 GetTemporaryForGPR(uint8 gprIndex, uint8 channel) const + { + for (auto& it : m_gprTemporaries) + { + if (it.gprIndex == gprIndex && it.channel == channel) + return (sint32)it.backupVarIndex; + } + return -1; + } + +private: + PVPSAlias m_pvps[5]{}; + boost::container::small_vector m_gprTemporaries; +}; + +sint32 _getVertexShaderOutParamSemanticId(uint32* contextRegisters, sint32 index); +sint32 _getInputRegisterDataType(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex); +sint32 _getALUInstructionOutputDataType(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction); +bool _isReductionInstruction(LatteDecompilerALUInstruction* aluInstruction); + +/* + * Writes the name of the output variable and channel + * E.g. R5f.x or tempf.x if writeMask is 0 + */ +static void _emitInstructionOutputVariableName(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction) +{ + auto src = shaderContext->shaderSource; + sint32 outputDataType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + if( aluInstruction->writeMask == 0 ) + { + // does not output to GPR + if( !_isReductionInstruction(aluInstruction) ) + { + // output to PV/PS + _appendPVPS(shaderContext, src, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); + return; + } + else + { + // output to temp + src->add("temp"); + _appendRegisterTypeSuffix(src, outputDataType); + } + _appendChannelAccess(src, aluInstruction->aluUnit); + } + else + { + // output to GPR. 
Aliasing to PV/PS happens at the end of the group + src->add(_getRegisterVarName(shaderContext, aluInstruction->destGpr, aluInstruction->destRel==0?-1:aluInstruction->indexMode)); + _appendChannelAccess(src, aluInstruction->destElem); + } +} + +static void _emitInstructionPVPSOutputVariableName(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction) +{ + _appendPVPS(shaderContext, shaderContext->shaderSource, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); +} + +static void _emitRegisterAccessCode(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, sint32 channel0, sint32 channel1, sint32 channel2, sint32 channel3, sint32 dataType = -1) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 registerElementDataType = shaderContext->typeTracker.defaultDataType; + cemu_assert_debug(gprIndex >= 0 && gprIndex <= 127); + if (dataType >= 0) + { + _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType); + } + if (shaderContext->typeTracker.useArrayGPRs) + src->add("R"); + else + src->addFmt("R{}", gprIndex); + _appendRegisterTypeSuffix(src, registerElementDataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->addFmt("[{}]", gprIndex); + + src->add("."); + + sint32 channelArray[4]; + channelArray[0] = channel0; + channelArray[1] = channel1; + channelArray[2] = channel2; + channelArray[3] = channel3; + + for (sint32 i = 0; i < 4; i++) + { + if (channelArray[i] >= 0 && channelArray[i] <= 3) + src->add(_getElementStrByIndex(channelArray[i])); + else if (channelArray[i] == -1) + { + // channel not used + } + else + { + cemu_assert_unimplemented(); + } + } + if (dataType >= 0) + _emitTypeConversionSuffixMSL(shaderContext, registerElementDataType, dataType); +} + +// optimized variant of _emitRegisterAccessCode for raw one channel reads +static void _emitRegisterChannelAccessCode(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, sint32 channel, sint32 dataType) +{ + 
cemu_assert_debug(gprIndex >= 0 && gprIndex <= 127); + cemu_assert_debug(channel >= 0 && channel < 4); + StringBuf* src = shaderContext->shaderSource; + sint32 registerElementDataType = shaderContext->typeTracker.defaultDataType; + _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->add("R"); + else + src->addFmt("R{}", gprIndex); + _appendRegisterTypeSuffix(src, registerElementDataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->addFmt("[{}]", gprIndex); + src->add("."); + src->add(_getElementStrByIndex(channel)); + _emitTypeConversionSuffixMSL(shaderContext, registerElementDataType, dataType); +} + +static void _emitALURegisterInputAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + cemu_assert_debug(GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel)); + sint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + sint32 temporaryIndex = shaderContext->aluPVPSState->GetTemporaryForGPR(gprIndex, aluInstruction->sourceOperand[operandIndex].chan); + if(temporaryIndex >= 0) + { + // access via backup variable + src->addFmt("backupReg{}", temporaryIndex); + _appendRegisterTypeSuffix(src, currentRegisterElementType); + } + else + { + // access via register variable + _emitRegisterAccessCode(shaderContext, gprIndex, aluInstruction->sourceOperand[operandIndex].chan, -1, -1, -1); + } +} + +static void _emitPVPSAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, uint8 aluUnitIndex) +{ + cemu_assert_debug(aluInstruction->instructionGroupIndex > 0); // PV/PS is uninitialized for group 0 + // PV/PS vars are currently always using 
the default type (shaderContext->typeTracker.defaultDataType) + shaderContext->aluPVPSState->EmitPVPSAccess(shaderContext, aluUnitIndex, aluInstruction->instructionGroupIndex); +} + +/* + * Emits the expression used for calculating the index for uniform access + * For static access, this is a number + * For dynamic access, this is AR.* + base + */ +static void _emitUniformAccessIndexCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) +{ + StringBuf* src = shaderContext->shaderSource; + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + sint32 uniformOffset = 0; // index into array, for relative accesses this is the base offset + if( isUniformRegister ) + { + uniformOffset = GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + } + else + { + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank0AddrBase; + } + else + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank1AddrBase; + } + } + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + { + if (aluInstruction->indexMode == GPU7_INDEX_AR_X) + src->addFmt("ARi.x+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Y) + src->addFmt("ARi.y+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Z) + src->addFmt("ARi.z+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_W) + src->addFmt("ARi.w+{}", uniformOffset); + else + cemu_assert_unimplemented(); + } + else + { + src->addFmt("{}", uniformOffset); + } +} + +static void _emitUniformAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 
requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + if(shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED ) + { + // uniform registers or buffers are accessed statically with predictable offsets + // find entry in remapped uniform + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + debugBreakpoint(); + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + sint32 uniformOffset = 0; // index into array + sint32 uniformBufferIndex = 0; + if( isUniformRegister ) + { + uniformOffset = GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + uniformBufferIndex = 0; + } + else + { + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank0AddrBase; + uniformBufferIndex = aluInstruction->cfInstruction->cBank0Index; + } + else + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank1AddrBase; + uniformBufferIndex = aluInstruction->cfInstruction->cBank1Index; + } + } + LatteDecompilerRemappedUniformEntry_t* remappedUniformEntry = NULL; + for(size_t i=0; i< shaderContext->shader->list_remappedUniformEntries.size(); i++) + { + LatteDecompilerRemappedUniformEntry_t* remappedUniformEntryItr = shaderContext->shader->list_remappedUniformEntries.data() + i; + if( remappedUniformEntryItr->isRegister && isUniformRegister ) + { + if( remappedUniformEntryItr->index == uniformOffset ) + { + remappedUniformEntry = remappedUniformEntryItr; + break; + } + } + else + { + if( remappedUniformEntryItr->kcacheBankId == uniformBufferIndex && remappedUniformEntryItr->index == uniformOffset ) + { + remappedUniformEntry = remappedUniformEntryItr; + break; + } + } + } + cemu_assert_debug(remappedUniformEntry); + 
_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + if(shaderContext->shader->shaderType == LatteConst::ShaderType::Vertex ) + src->addFmt("uf_remappedVS[{}]", remappedUniformEntry->mappedIndex); + else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Pixel ) + src->addFmt("uf_remappedPS[{}]", remappedUniformEntry->mappedIndex); + else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Geometry ) + src->addFmt("uf_remappedGS[{}]", remappedUniformEntry->mappedIndex); + else + debugBreakpoint(); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + } + else if( shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE ) + { + // uniform registers are accessed with unpredictable (dynamic) offset + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + if(shaderContext->shader->shaderType == LatteConst::ShaderType::Vertex ) + src->add("uf_uniformRegisterVS["); + else if (shaderContext->shader->shaderType == LatteConst::ShaderType::Pixel) + src->add("uf_uniformRegisterPS["); + else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Geometry ) + src->add("uf_uniformRegisterGS["); + else + debugBreakpoint(); + _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); + src->add("]"); + + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + } + else if( shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK ) + { + // uniform buffers are available as a whole + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + if( isUniformRegister ) + debugBreakpoint(); + sint32 uniformBufferIndex = 0; + if( 
GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformBufferIndex = aluInstruction->cfInstruction->cBank0Index; + } + else + { + uniformBufferIndex = aluInstruction->cfInstruction->cBank1Index; + } + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->addFmt("ubuff{}[", uniformBufferIndex); + _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); + src->addFmt("]"); + + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else + debugBreakpoint(); +} + +// Generates (slow) code to read an indexed GPR +static void _emitCodeToReadRelativeGPR(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 gprBaseIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + cemu_assert_debug(aluInstruction->sourceOperand[operandIndex].rel != 0); + + if( shaderContext->typeTracker.useArrayGPRs ) + { + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, requiredType); + src->add(_getRegisterVarName(shaderContext, gprBaseIndex, aluInstruction->indexMode)); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, requiredType); + return; + } + + char indexAccessCode[64]; + if (aluInstruction->indexMode == GPU7_INDEX_AR_X) + sprintf(indexAccessCode, "ARi.x"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Y) + sprintf(indexAccessCode, "ARi.y"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Z) + sprintf(indexAccessCode, "ARi.z"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_W) + sprintf(indexAccessCode, "ARi.w"); + else + 
cemu_assert_unimplemented(); + + if( LATTE_DECOMPILER_DTYPE_SIGNED_INT != requiredType ) + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + + // generated code looks like this: + // result = ((lookupIndex==0)?GPR5:(lookupIndex==1)?GPR6:(lookupIndex==2)?GPR7:...:(lookupIndex==122)?GPR127:0) + src->add("("); + for(sint32 i=gprBaseIndex; ianalyzer.gprUseMask[i / 8] & (1 << (i % 8))) == 0 ) + continue; + src->addFmt("({}=={})?", indexAccessCode, i-gprBaseIndex); + // code to access gpr + uint32 gprIndex = i; + src->add(_getRegisterVarName(shaderContext, i)); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + src->add(":"); + } + src->add("0)"); + if( LATTE_DECOMPILER_DTYPE_SIGNED_INT != requiredType ) + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); +} + +static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + if( operandIndex < 0 || operandIndex >= 3 ) + debugBreakpoint(); + sint32 requiredTypeOut = requiredType; + if( requiredType != LATTE_DECOMPILER_DTYPE_FLOAT && (aluInstruction->sourceOperand[operandIndex].abs != 0 || aluInstruction->sourceOperand[operandIndex].neg != 0) ) + { + // we need to apply float operations on the input but it's not read as a float + // force internal required type to float and then cast it back to whatever type is actually required + requiredType = LATTE_DECOMPILER_DTYPE_FLOAT; + } + + if( requiredTypeOut != requiredType ) + _emitTypeConversionPrefixMSL(shaderContext, requiredType, requiredTypeOut); + + if( aluInstruction->sourceOperand[operandIndex].neg != 0 ) + src->add("-("); + if( aluInstruction->sourceOperand[operandIndex].abs != 0 ) + src->add("abs("); + + if( GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( 
aluInstruction->sourceOperand[operandIndex].rel != 0 ) + { + _emitCodeToReadRelativeGPR(shaderContext, aluInstruction, operandIndex, requiredType); + } + else + { + uint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // signed int 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + // write code for register input + _emitTypeConversionPrefixMSL(shaderContext, currentRegisterElementType, requiredType); + _emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionSuffixMSL(shaderContext, currentRegisterElementType, requiredType); + } + else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + { + // unsigned int 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // need to convert from int to uint + src->add("uint("); + } + else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + { + // no extra work necessary + } + else + debugBreakpoint(); + // write code for register input + _emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + src->add(")"); + } + } + else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + // float 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // need to convert (not cast) from int bits to float + src->add("intBitsToFloat("); + } + else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + // no extra work necessary + } + else + debugBreakpoint(); + // write code for register input + 
_emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + src->add(")"); + } + } + else + debugBreakpoint(); + } + } + else if( GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if(requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT || requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("0"); + else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add("0.0"); + } + else if( GPU7_ALU_SRC_IS_CONST_1F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("1.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else if( GPU7_ALU_SRC_IS_CONST_0_5F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("0.5"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else if( GPU7_ALU_SRC_IS_CONST_1I(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if (requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("int(1)"); + else if (requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("uint(1)"); + else + cemu_assert_suspicious(); + } + else if( GPU7_ALU_SRC_IS_CONST_M1I(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add("int(-1)"); + else + cemu_assert_suspicious(); + } + else if( GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->addFmt("0x{:x}", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + src->addFmt("uint(0x{:x})", 
aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + else if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + uint32 constVal = aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]; + sint32 exponent = (constVal >> 23) & 0xFF; + exponent -= 127; + if ((constVal & 0xFF) == 0 && exponent >= -10 && exponent <= 10) + { + src->add(_FormatFloatAsConstant(*(float*)&constVal)); + } + else + src->addFmt("intBitsToFloat(0x{:08x})", constVal); + } + } + else if( GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitUniformAccessCode(shaderContext, aluInstruction, operandIndex, requiredType); + } + else if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) || + GPU7_ALU_SRC_IS_CBANK1(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitUniformAccessCode(shaderContext, aluInstruction, operandIndex, requiredType); + } + else if( GPU7_ALU_SRC_IS_PV(aluInstruction->sourceOperand[operandIndex].sel) ) + { + sint32 currentPVDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionPrefixMSL(shaderContext, currentPVDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, currentPVDataType, requiredType); + } + else if( GPU7_ALU_SRC_IS_PS(aluInstruction->sourceOperand[operandIndex].sel) ) + { + sint32 currentPSDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionPrefixMSL(shaderContext, currentPSDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, 4); + _emitTypeConversionSuffixMSL(shaderContext, currentPSDataType, requiredType); + } + else + { + cemuLog_log(LogType::Force, "Unsupported shader ALU operand sel {:#x}\n", aluInstruction->sourceOperand[operandIndex].sel); + debugBreakpoint(); + } + + if( 
aluInstruction->sourceOperand[operandIndex].abs != 0 ) + src->add(")"); + if( aluInstruction->sourceOperand[operandIndex].neg != 0 ) + src->add(")"); + + if( requiredTypeOut != requiredType ) + _emitTypeConversionSuffixMSL(shaderContext, requiredType, requiredTypeOut); +} + +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType) +{ + if( sourceType == destinationType ) + return; + StringBuf* src = shaderContext->shaderSource; + if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("as_type("); + else if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("as_type("); + else if( sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add("as_type("); + else if( sourceType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add("int("); + else if( sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + src->add("uint("); + else + cemu_assert_debug(false); +} + +void _emitTypeConversionSuffixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType) +{ + if( sourceType == destinationType ) + return; + StringBuf* src = shaderContext->shaderSource; + src->add(")"); +} + +template +static void _emitALUOperationBinary(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, const char* operandStr) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, TDataType, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 0, TDataType); + 
src->add((char*)operandStr);
+	_emitOperandInputCode(shaderContext, aluInstruction, 1, TDataType);
+	_emitTypeConversionSuffixMSL(shaderContext, TDataType, outputType);
+	src->add(";" _CRLF);
+}
+
+// Returns true when source operands opIndexA and opIndexB of aluInstruction read the same GPR:
+// identical sel (which must be a GPR selector) and identical chan, abs, neg and rel fields.
+// Two identical non-GPR selectors (constants, literals, ...) yield false.
+static bool _isSameGPROperand(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndexA, sint32 opIndexB)
+{
+	if (aluInstruction->sourceOperand[opIndexA].sel != aluInstruction->sourceOperand[opIndexB].sel)
+		return false;
+	if (!GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[opIndexA].sel))
+		return false;
+	if (aluInstruction->sourceOperand[opIndexA].chan != aluInstruction->sourceOperand[opIndexB].chan)
+		return false;
+	if (aluInstruction->sourceOperand[opIndexA].abs != aluInstruction->sourceOperand[opIndexB].abs)
+		return false;
+	if (aluInstruction->sourceOperand[opIndexA].neg != aluInstruction->sourceOperand[opIndexB].neg)
+		return false;
+	if (aluInstruction->sourceOperand[opIndexA].rel != aluInstruction->sourceOperand[opIndexB].rel)
+		return false;
+	return true;
+}
+
+// Returns true when source operand opIndex has its abs or neg flag set.
+static bool _operandHasModifiers(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndex)
+{
+	return aluInstruction->sourceOperand[opIndex].abs != 0 || aluInstruction->sourceOperand[opIndex].neg != 0;
+}
+
+// Emits the shader source for one two-operand (OP2) ALU instruction into shaderContext->shaderSource.
+// Most handlers read the operands in the data type the opcode works on and wrap the result in
+// _emitTypeConversionPrefixMSL/_emitTypeConversionSuffixMSL so it matches the destination's data type.
+// cfInstruction is only read by the PRED_SET* handlers (clause type and active stack depth).
+// An unknown opcode emits "Unsupported instruction;" and sets shaderContext->shader->hasError.
+// NOTE(review): the explicit template argument of every _emitALUOperationBinary call was missing in the
+// reviewed text although the helper uses a compile-time TDataType; the arguments below were filled in
+// from each opcode's operand type - confirm against the repository.
+// NOTE(review): some emitted spellings are GLSL ones (floatBitsToInt, roundEven, discard). Presumably the
+// generated shader preamble provides them for MSL - confirm.
+static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, LatteDecompilerALUInstruction* aluInstruction)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); // data type of output
+	if( aluInstruction->opcode == ALU_OP2_INST_MOV )
+	{
+		const bool requiresFloatMove = _operandHasModifiers(aluInstruction, 0);
+		if( requiresFloatMove )
+		{
+			// abs/neg operations are applied to source operand, do float based move
+			_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+			src->add(" = ");
+			_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+			_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+			_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+			src->add(";" _CRLF);
+		}
+		else
+		{
+			_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+			src->add(" = ");
+			_emitOperandInputCode(shaderContext, aluInstruction, 0, outputType);
+			src->add(";" _CRLF);
+		}
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_FLOOR )
+	{
+		cemu_assert_debug(aluInstruction->writeMask == 0);
+		cemu_assert_debug(aluInstruction->omod == 0);
+		src->add("tempResultf = ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(";" _CRLF);
+		src->add("tempResultf = floor(tempResultf);" _CRLF);
+		src->add("tempResultf = clamp(tempResultf, -256.0, 255.0);" _CRLF);
+		// set AR
+		if( aluInstruction->destElem == 0 )
+			src->add("ARi.x = int(tempResultf);" _CRLF);
+		else if( aluInstruction->destElem == 1 )
+			src->add("ARi.y = int(tempResultf);" _CRLF);
+		else if( aluInstruction->destElem == 2 )
+			src->add("ARi.z = int(tempResultf);" _CRLF);
+		else
+			src->add("ARi.w = int(tempResultf);" _CRLF);
+		// set output
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT )
+			debugBreakpoint(); // todo
+		src->add("floatBitsToInt(tempResultf)");
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_INT )
+	{
+		cemu_assert_debug(aluInstruction->writeMask == 0);
+		cemu_assert_debug(aluInstruction->omod == 0);
+		src->add("tempResulti = ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add(";" _CRLF);
+		src->add("tempResulti = clamp(tempResulti, -256, 255);" _CRLF);
+		// set AR
+		if( aluInstruction->destElem == 0 )
+			src->add("ARi.x = tempResulti;" _CRLF);
+		else if( aluInstruction->destElem == 1 )
+			src->add("ARi.y = tempResulti;" _CRLF);
+		else if( aluInstruction->destElem == 2 )
+			src->add("ARi.z = tempResulti;" _CRLF);
+		else
+			src->add("ARi.w = tempResulti;" _CRLF);
+		// set output
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT )
+			debugBreakpoint(); // todo
+		src->add("tempResulti");
+		src->add(";" _CRLF);
+
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_ADD )
+	{
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_FLOAT>(shaderContext, aluInstruction, " + ");
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_MUL )
+	{
+		// 0*anything is always 0
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+
+		// if any operand is a non-zero literal or constant we can use standard multiplication
+		bool useDefaultMul = false;
+		if (GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[1].sel))
+		{
+			// result is always zero
+			src->add("0.0");
+		}
+		else
+		{
+			// multiply
+			if (GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[1].sel) ||
+				GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[1].sel))
+			{
+				useDefaultMul = true;
+			}
+			if (shaderContext->options->strictMul && useDefaultMul == false)
+			{
+				src->add("mul_nonIEEE(");
+				_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+				src->add(", ");
+				_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT);
+				src->add(")");
+			}
+			else
+			{
+				_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+				src->add(" * ");
+				_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT);
+			}
+		}
+
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_MUL_IEEE )
+	{
+		// 0*anything according to IEEE rules
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_FLOAT>(shaderContext, aluInstruction, " * ");
+	}
+	else if (aluInstruction->opcode == ALU_OP2_INST_RECIP_IEEE)
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("1.0");
+		src->add(" / ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if (aluInstruction->opcode == ALU_OP2_INST_RECIP_FF)
+	{
+		// untested (BotW bombs)
+		src->add("tempResultf = 1.0 / (");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(");" _CRLF);
+		// INF becomes 0.0
+		src->add("if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF);
+		// -INF becomes -0.0
+		src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF);
+		// assign result to output
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("tempResultf");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_IEEE ||
+		aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED ||
+		aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF )
+	{
+		// todo: This should be correct but testing is needed
+		src->add("tempResultf = 1.0 / sqrt(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(");" _CRLF);
+		if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED)
+		{
+			// note: if( -INF < 0.0 ) does not resolve to true
+			src->add("if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF);
+			src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF);
+		}
+		else if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF)
+		{
+			// untested (BotW bombs)
+			src->add("if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF);
+			src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF);
+		}
+		// assign result to output
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("tempResultf");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_MAX ||
+		aluInstruction->opcode == ALU_OP2_INST_MIN ||
+		aluInstruction->opcode == ALU_OP2_INST_MAX_DX10 ||
+		aluInstruction->opcode == ALU_OP2_INST_MIN_DX10 )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		if( aluInstruction->opcode == ALU_OP2_INST_MAX )
+			src->add("max");
+		else if( aluInstruction->opcode == ALU_OP2_INST_MIN )
+			src->add("min");
+		else if (aluInstruction->opcode == ALU_OP2_INST_MAX_DX10)
+			src->add("max");
+		else if (aluInstruction->opcode == ALU_OP2_INST_MIN_DX10)
+			src->add("min");
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(", ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_FLOOR ||
+		aluInstruction->opcode == ALU_OP2_INST_FRACT ||
+		aluInstruction->opcode == ALU_OP2_INST_TRUNC )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		if( aluInstruction->opcode == ALU_OP2_INST_FLOOR )
+			src->add("floor");
+		else if( aluInstruction->opcode == ALU_OP2_INST_FRACT )
+			src->add("fract");
+		else
+			src->add("trunc");
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_LOG_CLAMPED ||
+		aluInstruction->opcode == ALU_OP2_INST_LOG_IEEE )
+	{
+		src->add("tempResultf = max(0.0, ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(");" _CRLF);
+
+		src->add("tempResultf = log2(tempResultf);" _CRLF);
+		if( aluInstruction->opcode == ALU_OP2_INST_LOG_CLAMPED )
+		{
+			src->add("if( isinf(tempResultf) == true ) tempResultf = -3.40282347E+38F;" _CRLF);
+		}
+		// assign result to output
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("tempResultf");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_RNDNE )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("roundEven");
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_EXP_IEEE )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("exp2");
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_SQRT_IEEE )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("sqrt");
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_SIN ||
+		aluInstruction->opcode == ALU_OP2_INST_COS )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		if( aluInstruction->opcode == ALU_OP2_INST_SIN )
+			src->add("sin");
+		else
+			src->add("cos");
+		src->add("((");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")/0.1591549367)");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_FLT_TO_INT )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add("int");
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_FLT_TO_UINT )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT, outputType);
+		src->add("uint");
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_INT_TO_FLOAT )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("float(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_UINT_TO_FLOAT )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("float(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if (aluInstruction->opcode == ALU_OP2_INST_AND_INT)
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_SIGNED_INT>(shaderContext, aluInstruction, " & ");
+	else if (aluInstruction->opcode == ALU_OP2_INST_OR_INT)
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_SIGNED_INT>(shaderContext, aluInstruction, " | ");
+	else if (aluInstruction->opcode == ALU_OP2_INST_XOR_INT)
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_SIGNED_INT>(shaderContext, aluInstruction, " ^ ");
+	else if( aluInstruction->opcode == ALU_OP2_INST_NOT_INT )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add("~(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_ADD_INT )
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_SIGNED_INT>(shaderContext, aluInstruction, " + ");
+	else if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MIN_INT )
+	{
+		// not verified
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		// The conversion has to enclose the whole max()/min() call, like in every other handler of this
+		// function. Opening it after "max(" would place both arguments inside the conversion.
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT )
+			src->add("max(");
+		else
+			src->add("min(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add(", ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add(")");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_SUB_INT )
+	{
+		// note: The AMD doc says src1 is on the left side but tests indicate otherwise. It's src0 - src1.
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_SIGNED_INT>(shaderContext, aluInstruction, " - ");
+	}
+	else if (aluInstruction->opcode == ALU_OP2_INST_MULLO_INT)
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_SIGNED_INT>(shaderContext, aluInstruction, " * ");
+	else if (aluInstruction->opcode == ALU_OP2_INST_MULLO_UINT)
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_UNSIGNED_INT>(shaderContext, aluInstruction, " * ");
+	else if( aluInstruction->opcode == ALU_OP2_INST_LSHL_INT )
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_SIGNED_INT>(shaderContext, aluInstruction, " << ");
+	else if( aluInstruction->opcode == ALU_OP2_INST_LSHR_INT )
+		_emitALUOperationBinary<LATTE_DECOMPILER_DTYPE_UNSIGNED_INT>(shaderContext, aluInstruction, " >> "); // logical shift, so the operands are read as unsigned
+	else if( aluInstruction->opcode == ALU_OP2_INST_ASHR_INT )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add(" >> ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_SETGT ||
+		aluInstruction->opcode == ALU_OP2_INST_SETGE ||
+		aluInstruction->opcode == ALU_OP2_INST_SETNE ||
+		aluInstruction->opcode == ALU_OP2_INST_SETE )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		if( aluInstruction->opcode == ALU_OP2_INST_SETGT )
+			src->add(" > ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_SETGE )
+			src->add(" >= ");
+		else if (aluInstruction->opcode == ALU_OP2_INST_SETNE)
+			src->add(" != ");
+		else if (aluInstruction->opcode == ALU_OP2_INST_SETE)
+			src->add(" == ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")?1.0:0.0");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_DX10 ||
+		aluInstruction->opcode == ALU_OP2_INST_SETE_DX10 ||
+		aluInstruction->opcode == ALU_OP2_INST_SETNE_DX10 ||
+		aluInstruction->opcode == ALU_OP2_INST_SETGE_DX10 )
+	{
+		if( aluInstruction->omod != 0 )
+			debugBreakpoint();
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add("((");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		if( aluInstruction->opcode == ALU_OP2_INST_SETE_DX10 )
+			src->add(" == ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_SETNE_DX10 )
+			src->add(" != ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_DX10 )
+			src->add(" > ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_DX10 )
+			src->add(" >= ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")?-1:0)");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add(";");
+		src->add(_CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_SETE_INT ||
+		aluInstruction->opcode == ALU_OP2_INST_SETNE_INT ||
+		aluInstruction->opcode == ALU_OP2_INST_SETGT_INT ||
+		aluInstruction->opcode == ALU_OP2_INST_SETGE_INT )
+	{
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		if( aluInstruction->opcode == ALU_OP2_INST_SETE_INT )
+			src->add(" == ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_SETNE_INT )
+			src->add(" != ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_INT )
+			src->add(" > ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_INT )
+			src->add(" >= ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add(")?-1:0");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_UINT ||
+		aluInstruction->opcode == ALU_OP2_INST_SETGT_UINT )
+	{
+		// todo: Unsure if the result is unsigned or signed
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add("(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT);
+		if( aluInstruction->opcode == ALU_OP2_INST_SETGE_UINT )
+			src->add(" >= ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_UINT )
+			src->add(" > ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT);
+		src->add(")?int(0xFFFFFFFF):int(0x0)");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT ||
+		aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE ||
+		aluInstruction->opcode == ALU_OP2_INST_PRED_SETE ||
+		aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE ||
+		aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT ||
+		aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT ||
+		aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT ||
+		aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT )
+	{
+		cemu_assert_debug(aluInstruction->writeMask == 0);
+		bool isIntPred = (aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT);
+
+		src->add("predResult");
+		src->add(" = (");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, isIntPred?LATTE_DECOMPILER_DTYPE_SIGNED_INT:LATTE_DECOMPILER_DTYPE_FLOAT);
+
+		if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT)
+			src->add(" == ");
+		else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT || aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT)
+			src->add(" > ");
+		else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT)
+			src->add(" >= ");
+		else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT)
+			src->add(" != ");
+		else
+			cemu_assert_debug(false);
+
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, isIntPred?LATTE_DECOMPILER_DTYPE_SIGNED_INT:LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(");" _CRLF);
+		// handle result of predicate instruction based on current ALU clause type
+		if( cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE )
+		{
+			src->addFmt("{} = predResult;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth));
+			src->addFmt("{} = predResult == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth));
+		}
+		else if( cfInstruction->type == GPU7_CF_INST_ALU_BREAK )
+		{
+			// leave current loop
+			src->add("if( predResult == false ) break;" _CRLF);
+		}
+		else
+			cemu_assert_debug(false);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_KILLE_INT ||
+		aluInstruction->opcode == ALU_OP2_INST_KILLNE_INT ||
+		aluInstruction->opcode == ALU_OP2_INST_KILLGT_INT)
+	{
+		src->add("if( ");
+		src->add(" (");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		if( aluInstruction->opcode == ALU_OP2_INST_KILLE_INT )
+			src->add(" == ");
+		else if (aluInstruction->opcode == ALU_OP2_INST_KILLNE_INT)
+			src->add(" != ");
+		else if (aluInstruction->opcode == ALU_OP2_INST_KILLGT_INT)
+			src->add(" > ");
+		else
+			debugBreakpoint();
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add(")");
+		src->add(") discard;");
+		src->add(_CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP2_INST_KILLGT ||
+		aluInstruction->opcode == ALU_OP2_INST_KILLGE ||
+		aluInstruction->opcode == ALU_OP2_INST_KILLE )
+	{
+		src->add("if( ");
+		src->add(" (");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		if( aluInstruction->opcode == ALU_OP2_INST_KILLGT )
+			src->add(" > ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_KILLGE )
+			src->add(" >= ");
+		else if( aluInstruction->opcode == ALU_OP2_INST_KILLE )
+			src->add(" == ");
+		else
+			debugBreakpoint();
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(")");
+		src->add(") discard;");
+		src->add(_CRLF);
+	}
+	else
+	{
+		src->add("Unsupported instruction;" _CRLF);
+		debug_printf("Unsupported ALU op2 instruction 0x%x\n", aluInstruction->opcode);
+		shaderContext->shader->hasError = true;
+	}
+}
+
+static void _emitALUOP3InstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, LatteDecompilerALUInstruction* aluInstruction)
+{
+	StringBuf* src = shaderContext->shaderSource;
+	cemu_assert_debug(aluInstruction->destRel == 0); // todo
+	sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction);
+
+	/* check for common no-op or mov-like instructions */
+	if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE ||
+		aluInstruction->opcode == ALU_OP3_INST_CMOVE ||
+		aluInstruction->opcode == ALU_OP3_INST_CMOVGT ||
+		aluInstruction->opcode == ALU_OP3_INST_CNDE_INT ||
+		aluInstruction->opcode ==
ALU_OP3_INST_CNDGT_INT ||
+		aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT)
+	{
+		if (_isSameGPROperand(aluInstruction, 1, 2) && !_operandHasModifiers(aluInstruction, 1))
+		{
+			// the condition is irrelevant as both operands are the same
+			_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+			src->add(" = ");
+			_emitOperandInputCode(shaderContext, aluInstruction, 1, outputType);
+			src->add(";" _CRLF);
+			return;
+		}
+	}
+
+
+	/* generic handlers */
+	if( aluInstruction->opcode == ALU_OP3_INST_MULADD ||
+		aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 ||
+		aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 ||
+		aluInstruction->opcode == ALU_OP3_INST_MULADD_M4 ||
+		aluInstruction->opcode == ALU_OP3_INST_MULADD_IEEE )
+	{
+		// todo: The difference between MULADD and MULADD IEEE is that the former has 0*anything=0 rule similar to MUL/MUL_IEEE?
+		// only the scaled/IEEE variants get their product+sum wrapped in parenthesis; plain MULADD stays bare for readability
+		const bool needsParenthesis = aluInstruction->opcode != ALU_OP3_INST_MULADD;
+		// a literal/constant operand or the IEEE variant allows the plain '*' operator
+		const bool useDefaultMul =
+			aluInstruction->opcode == ALU_OP3_INST_MULADD_IEEE ||
+			GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[1].sel) ||
+			GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[1].sel);
+		// output modifier that follows the closing parenthesis (none for MULADD and MULADD_IEEE)
+		const char* scaleSuffix = nullptr;
+		if( aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 )
+			scaleSuffix = "/2.0";
+		else if( aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 )
+			scaleSuffix = "*2.0";
+		else if( aluInstruction->opcode == ALU_OP3_INST_MULADD_M4 )
+			scaleSuffix = "*4.0";
+
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		if (needsParenthesis)
+			src->add("(");
+
+		if (shaderContext->options->strictMul && !useDefaultMul)
+		{
+			src->add("mul_nonIEEE(");
+			_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+			src->add(",");
+			_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT);
+			src->add(")");
+		}
+		else
+		{
+			_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+			src->add(" * ");
+			_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT);
+		}
+
+		src->add(" + ");
+		_emitOperandInputCode(shaderContext, aluInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT);
+		if (needsParenthesis)
+			src->add(")");
+		if (scaleSuffix != nullptr)
+			src->add(scaleSuffix);
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if(aluInstruction->opcode == ALU_OP3_INST_CNDE_INT || aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT || aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT)
+	{
+		// a negated candidate operand makes the selection float based, otherwise the candidates are read as signed integers
+		const bool requiresFloatResult = (aluInstruction->sourceOperand[1].neg != 0) || (aluInstruction->sourceOperand[2].neg != 0);
+		const sint32 selectType = requiresFloatResult ? LATTE_DECOMPILER_DTYPE_FLOAT : LATTE_DECOMPILER_DTYPE_SIGNED_INT;
+		// comparison of operand 0 against zero; this branch is only entered for the three opcodes below
+		const char* compareOp = " >= "; // ALU_OP3_INST_CMOVGE_INT
+		if (aluInstruction->opcode == ALU_OP3_INST_CNDE_INT)
+			compareOp = " == ";
+		else if (aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT)
+			compareOp = " > ";
+
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, selectType, outputType);
+		src->add("((");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add(compareOp);
+		src->add("0)?(");
+
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, selectType);
+		src->add("):(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 2, selectType);
+		src->add("))");
+		_emitTypeConversionSuffixMSL(shaderContext, selectType, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( aluInstruction->opcode == ALU_OP3_INST_CMOVGE ||
+		aluInstruction->opcode == ALU_OP3_INST_CMOVE ||
+		aluInstruction->opcode == ALU_OP3_INST_CMOVGT )
+	{
+		// float comparison of operand 0 against 0.0; this branch is only entered for the three opcodes below
+		const char* compareOp = " > "; // ALU_OP3_INST_CMOVGT
+		if (aluInstruction->opcode == ALU_OP3_INST_CMOVE)
+			compareOp = " == ";
+		else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE)
+			compareOp = " >= ";
+
+		_emitInstructionOutputVariableName(shaderContext, aluInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add("((");
+		_emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT);
+		src->add(compareOp);
+		src->add("0.0)?(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add("):(");
+		_emitOperandInputCode(shaderContext, aluInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT);
+		src->add("))");
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType);
+		src->add(";" _CRLF);
+	}
+	else
+	{
+		src->add("Unsupported instruction;" _CRLF);
+		debug_printf("Unsupported ALU op3 instruction 0x%x\n", aluInstruction->opcode);
+		shaderContext->shader->hasError = true;
+	}
+}
+
+// Emits the shader source for a reduction instruction, which is given as a group of four ALU instructions.
+// DOT4/DOT4_IEEE write a single result through aluRedcInstruction[0]; CUBE writes one result per instruction.
+// Any other opcode ends in cemu_assert_unimplemented().
+static void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluRedcInstruction[4])
+{
+	StringBuf* src = shaderContext->shaderSource;
+	// writes "vec4(a,b,c,d)" where a..d are source operand opIndex of the four instructions, read as float
+	auto emitOperandVec4 = [&](sint32 opIndex)
+	{
+		src->add("vec4(");
+		for (sint32 unit = 0; unit < 4; unit++)
+		{
+			if (unit != 0)
+				src->add(",");
+			_emitOperandInputCode(shaderContext, aluRedcInstruction[unit], opIndex, LATTE_DECOMPILER_DTYPE_FLOAT);
+		}
+		src->add(")");
+	};
+
+	LatteDecompilerALUInstruction* firstInstruction = aluRedcInstruction[0];
+	const bool isOP2 = firstInstruction->isOP3 == false;
+	if( isOP2 && (firstInstruction->opcode == ALU_OP2_INST_DOT4 || firstInstruction->opcode == ALU_OP2_INST_DOT4_IEEE) )
+	{
+		// todo: Figure out and implement the difference between normal DOT4 and DOT4_IEEE
+		sint32 outputType = _getALUInstructionOutputDataType(shaderContext, firstInstruction);
+		_emitInstructionOutputVariableName(shaderContext, firstInstruction);
+		src->add(" = ");
+		_emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+
+		// dot(vec4(op0),vec4(op1))
+		src->add("dot(");
+		emitOperandVec4(0);
+		src->add(",");
+		emitOperandVec4(1);
+		src->add(")");
+
+		_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType);
+		src->add(";" _CRLF);
+	}
+	else if( isOP2 && (firstInstruction->opcode == ALU_OP2_INST_CUBE) )
+	{
+		/*
+		 * How the CUBE instruction works (guessed mostly, based on DirectX/OpenGL spec):
+		   Input: vec4, 3d direction vector (can be unnormalized) + w component (which can be ignored, since it only scales the vector but does not affect the direction)
+
+		   First we figure out the major axis (closest axis-aligned vector). There are six possible vectors:
+		   +rx 0
+		   -rx 1
+		   +ry 2
+		   -ry 3
+		   +rz 4
+		   -rz 5
+		   The major axis vector is calculated by looking at the largest (absolute) 3d vector component and then setting the other components to 0.0
+		   The value that remains in the axis vector is referred to as 'MajorAxis' by the AMD documentation.
+		   The S,T coordinates are taken from the other two components.
+		   Example: -0.5,0.2,0.4 -> -rx -> -0.5,0.0,0.0 MajorAxis: -0.5, S: 0.2 T: 0.4
+
+		   The CUBE reduction instruction requires a specific mapping for the input vector:
+		   src0 = Rn.zzxy
+		   src1 = Rn.yxzz
+		   It's probably related to the way the instruction works internally?
+		   If we look at the individual components per ALU unit:
+		   z y -> Compare y/z
+		   z x -> Compare x/z
+		   x z -> Compare x/z
+		   y z -> Compare y/z
+		*/
+
+		src->add("redcCUBE(");
+		emitOperandVec4(0);
+		src->add(",");
+		emitOperandVec4(1);
+		src->add(",");
+		src->add("cubeMapSTM,cubeMapFaceId);" _CRLF);
+
+		// dst.X (S), dst.Y (T), dst.Z (MajorAxis), dst.W (FaceId)
+		const char* const resultExpr[4] = { "cubeMapSTM.x", "cubeMapSTM.y", "cubeMapSTM.z", "cubeMapFaceId" };
+		for (sint32 unit = 0; unit < 4; unit++)
+		{
+			// S, T and MajorAxis are stored as float, the face id as signed integer
+			const sint32 resultType = (unit == 3) ? LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT;
+			const sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[unit]);
+			_emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[unit]);
+			src->add(" = ");
+			_emitTypeConversionPrefixMSL(shaderContext, resultType, outputType);
+			src->add(resultExpr[unit]);
+			_emitTypeConversionSuffixMSL(shaderContext, resultType, outputType);
+			src->add(";" _CRLF);
+		}
+	}
+	else
+		cemu_assert_unimplemented();
+}
+
+// Determines the ALU instruction group that begins at startIndex (all consecutive instructions sharing
+// the instructionGroupIndex of instructionsALU[startIndex]) and passes that range to
+// aluPVPSState->CreateGPRTemporaries.
+static void _emitALUClauseRegisterBackupCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex)
+{
+	auto& aluInstructions = cfInstruction->instructionsALU;
+	sint32 instructionGroupIndex = aluInstructions[startIndex].instructionGroupIndex;
+	// the group always contains the instruction at startIndex; extend it while the group index stays the same
+	size_t groupSize = 1;
+	for (; (startIndex + groupSize) < aluInstructions.size(); groupSize++)
+	{
+		if (instructionGroupIndex != aluInstructions[startIndex + groupSize].instructionGroupIndex)
+			break;
+	}
+	shaderContext->aluPVPSState->CreateGPRTemporaries(shaderContext, { aluInstructions.data() + startIndex, groupSize });
+}
+
+/*
+bool _isPVUsedInNextGroup(LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex, sint32 pvUnit)
+{
+	
sint32 currentGroupIndex = cfInstruction->instructionsALU[startIndex].instructionGroupIndex; + for (sint32 i = startIndex + 1; i < (sint32)cfInstruction->instructionsALU.size(); i++) + { + LatteDecompilerALUInstruction& aluInstructionItr = cfInstruction->instructionsALU[i]; + if(aluInstructionItr.instructionGroupIndex == currentGroupIndex ) + continue; + if ((sint32)aluInstructionItr.instructionGroupIndex > currentGroupIndex + 1) + return false; + // check OP code type + if (aluInstructionItr.isOP3) + { + // op0 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[0].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[0].chan; + if (pvUnit == chan) + return true; + } + // op1 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[1].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[1].chan; + if (pvUnit == chan) + return true; + } + // op2 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[2].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[2].chan; + if (pvUnit == chan) + return true; + } + } + else + { + // op0 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[0].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[0].chan; + if (pvUnit == chan) + return true; + } + // op1 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[1].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[1].chan; + if (pvUnit == chan) + return true; + } + // todo: Not all operations use both operands + } + } + return false; +} +*/ + +static void _emitVec3(LatteDecompilerShaderContext* shaderContext, uint32 dataType, LatteDecompilerALUInstruction* aluInst0, sint32 opIdx0, LatteDecompilerALUInstruction* aluInst1, sint32 opIdx1, LatteDecompilerALUInstruction* aluInst2, sint32 opIdx2) +{ + StringBuf* src = shaderContext->shaderSource; + if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + src->add("vec3("); + _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + 
_emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst2, opIdx2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + src->add("ivec3("); + _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst2, opIdx2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + } + else + cemu_assert_unimplemented(); +} + +static void _emitGPRVectorAssignment(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction** aluInstructions, sint32 count) +{ + StringBuf* src = shaderContext->shaderSource; + // output var name (GPR) + src->add(_getRegisterVarName(shaderContext, aluInstructions[0]->destGpr, -1)); + src->add("."); + for (sint32 f = 0; f < count; f++) + { + src->add(_getElementStrByIndex(aluInstructions[f]->destElem)); + } + src->add(" = "); +} + +static void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + ALUClauseTemporariesState pvpsState; + shaderContext->aluPVPSState = &pvpsState; + StringBuf* src = shaderContext->shaderSource; + LatteDecompilerALUInstruction* aluRedcInstruction[4]; + size_t groupStartIndex = 0; + for(size_t i=0; iinstructionsALU.size(); i++) + { + LatteDecompilerALUInstruction& aluInstruction = cfInstruction->instructionsALU[i]; + if( aluInstruction.indexInGroup == 0 ) + { + src->addFmt("// {}" _CRLF, aluInstruction.instructionGroupIndex); + // apply PV/PS updates for previous group + if (i > 0) + { + pvpsState.TrackGroupOutputPVPS(shaderContext, cfInstruction->instructionsALU.data() + groupStartIndex, i - groupStartIndex); + } + groupStartIndex = i; + // backup registers which are read after being written + 
_emitALUClauseRegisterBackupCode(shaderContext, cfInstruction, i); + } + // detect reduction instructions and use a special handler + bool isReductionOperation = _isReductionInstruction(&aluInstruction); + if( isReductionOperation ) + { + cemu_assert_debug((i + 4) <= cfInstruction->instructionsALU.size()); + aluRedcInstruction[0] = &aluInstruction; + aluRedcInstruction[1] = &cfInstruction->instructionsALU[i + 1]; + aluRedcInstruction[2] = &cfInstruction->instructionsALU[i + 2]; + aluRedcInstruction[3] = &cfInstruction->instructionsALU[i + 3]; + if( aluRedcInstruction[0]->isOP3 != aluRedcInstruction[1]->isOP3 || aluRedcInstruction[1]->isOP3 != aluRedcInstruction[2]->isOP3 || aluRedcInstruction[2]->isOP3 != aluRedcInstruction[3]->isOP3 ) + debugBreakpoint(); + if( aluRedcInstruction[0]->opcode != aluRedcInstruction[1]->opcode || aluRedcInstruction[1]->opcode != aluRedcInstruction[2]->opcode || aluRedcInstruction[2]->opcode != aluRedcInstruction[3]->opcode ) + debugBreakpoint(); + if( aluRedcInstruction[0]->omod != aluRedcInstruction[1]->omod || aluRedcInstruction[1]->omod != aluRedcInstruction[2]->omod || aluRedcInstruction[2]->omod != aluRedcInstruction[3]->omod ) + debugBreakpoint(); + if( aluRedcInstruction[0]->destClamp != aluRedcInstruction[1]->destClamp || aluRedcInstruction[1]->destClamp != aluRedcInstruction[2]->destClamp || aluRedcInstruction[2]->destClamp != aluRedcInstruction[3]->destClamp ) + debugBreakpoint(); + _emitALUReductionInstructionCode(shaderContext, aluRedcInstruction); + i += 3; // skip the instructions that are part of the reduction operation + } + else /* not a reduction operation */ + { + if( aluInstruction.isOP3 ) + { + // op3 + _emitALUOP3InstructionCode(shaderContext, cfInstruction, &aluInstruction); + } + else + { + // op2 + if( aluInstruction.opcode == ALU_OP2_INST_NOP ) + continue; // skip NOP instruction + _emitALUOP2InstructionCode(shaderContext, cfInstruction, &aluInstruction); + } + } + // handle omod + sint32 outputDataType = 
_getALUInstructionOutputDataType(shaderContext, &aluInstruction); + if( aluInstruction.omod != ALU_OMOD_NONE ) + { + if( outputDataType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + if( aluInstruction.omod == ALU_OMOD_MUL2 ) + src->add(" *= 2.0;" _CRLF); + else if( aluInstruction.omod == ALU_OMOD_MUL4 ) + src->add(" *= 4.0;" _CRLF); + else if( aluInstruction.omod == ALU_OMOD_DIV2 ) + src->add(" /= 2.0;" _CRLF); + } + else if( outputDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = "); + src->add("floatBitsToInt(intBitsToFloat("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(")"); + if( aluInstruction.omod == 1 ) + src->add(" * 2.0"); + else if( aluInstruction.omod == 2 ) + src->add(" * 4.0"); + else if( aluInstruction.omod == 3 ) + src->add(" / 2.0"); + src->add(");" _CRLF); + } + else + { + cemu_assert_unimplemented(); + } + } + // handle clamp + if( aluInstruction.destClamp != 0 ) + { + if( outputDataType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = clamp("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(", 0.0, 1.0);" _CRLF); + } + else if( outputDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = clampFI32("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(");" _CRLF); + } + else + { + cemu_assert_unimplemented(); + } + } + // handle result broadcasting for reduction instructions + if( isReductionOperation ) + { + // reduction operations set all four PV components (todo: Needs further research. According to AMD docs, dot4 only sets PV.x? update: Unlike DOT4, CUBE sets all PV elements accordingly to their GPR output?) 
+ if( aluRedcInstruction[0]->opcode == ALU_OP2_INST_CUBE ) + { + // CUBE + for (sint32 f = 0; f < 4; f++) + { + if (aluRedcInstruction[f]->writeMask != 0) + continue; + _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); + src->add(" = "); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(";" _CRLF); + } + } + else + { + // DOT4, DOT4_IEEE, etc. + // reduction operation result is only set for output in redc[0], we also need to update redc[1] to redc[3] + for(sint32 f=0; f<4; f++) + { + if( aluRedcInstruction[f]->writeMask == 0 ) + _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); + else + { + if (f == 0) + continue; + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[f]); + } + src->add(" = "); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(";" _CRLF); + } + } + } + } + shaderContext->aluPVPSState = nullptr; +} + +/* + * Emits code to access one component (xyzw) of the texture coordinate input vector + */ +static void _emitTEXSampleCoordInputComponent(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction, sint32 componentIndex, sint32 interpretSrcAsType) +{ + cemu_assert(componentIndex >= 0 && componentIndex < 4); + cemu_assert_debug(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT || interpretSrcAsType == LATTE_DECOMPILER_DTYPE_FLOAT); + StringBuf* src = shaderContext->shaderSource; + sint32 elementSel = texInstruction->textureFetch.srcSel[componentIndex]; + if (elementSel < 4) + { + _emitRegisterChannelAccessCode(shaderContext, texInstruction->srcGpr, elementSel, interpretSrcAsType); + return; + } + const char* resultElemTable[4] = {"x","y","z","w"}; + if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + if( elementSel == 4 ) + src->add("floatBitsToInt(0.0)"); + else if( elementSel == 5 ) + src->add("floatBitsToInt(1.0)"); + } + else if(interpretSrcAsType == 
LATTE_DECOMPILER_DTYPE_FLOAT ) + { + if( elementSel == 4 ) + src->add("0.0"); + else if( elementSel == 5 ) + src->add("1.0"); + } +} + +static const char* _texGprAccessElemTable[8] = {"x","y","z","w","_","_","_","_"}; + +static char* _getTexGPRAccess(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, uint32 dataType, sint8 selX, sint8 selY, sint8 selZ, sint8 selW, char* tempBuffer) +{ + // intBitsToFloat(R{}i.w) + *tempBuffer = '\0'; + uint8 elemCount = (selX > 0 ? 1 : 0) + (selY > 0 ? 1 : 0) + (selZ > 0 ? 1 : 0) + (selW > 0 ? 1 : 0); + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + ; // no conversion + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + strcat(tempBuffer, "intBitsToFloat("); + else + cemu_assert_unimplemented(); + strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); + // _texGprAccessElemTable + strcat(tempBuffer, "."); + if (selX >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selX]); + if (selY >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selY]); + if (selZ >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selZ]); + if (selW >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selW]); + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + ; // no conversion + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + strcat(tempBuffer, ")"); + else + cemu_assert_unimplemented(); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + cemu_assert_unimplemented(); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + ; // no conversion + else + cemu_assert_unimplemented(); + strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); + // _texGprAccessElemTable + strcat(tempBuffer, "."); + if (selX >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selX]); + if (selY >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selY]); + if 
(selZ >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selZ]); + if (selW >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selW]); + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + cemu_assert_unimplemented(); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + ; // no conversion + else + cemu_assert_unimplemented(); + } + else + cemu_assert_unimplemented(); + return tempBuffer; +} + +static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (texInstruction->textureFetch.textureIndex < 0 || texInstruction->textureFetch.textureIndex >= LATTE_NUM_MAX_TEX_UNITS) + { + // skip out of bounds texture unit access + return; + } + + auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; + + char tempBuffer0[32]; + char tempBuffer1[32]; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + // texture sampler opcode + uint32 texOpcode = texInstruction->opcode; + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + // vertex shader forces LOD to zero, but certain sampler types don't support textureLod(...) 
API + if (texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + texOpcode = GPU7_TEX_INST_SAMPLE_C; + } + // check if offset is used + bool hasOffset = false; + if( texInstruction->textureFetch.offsetX != 0 || texInstruction->textureFetch.offsetY != 0 || texInstruction->textureFetch.offsetZ != 0 ) + hasOffset = true; + // emit sample code + if (shaderContext->shader->textureIsIntegerFormat[texInstruction->textureFetch.textureIndex]) + { + // integer samplers + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) // uint to int + { + if(numWrittenElements == 1) + src->add(" = int("); + else + shaderContext->shaderSource->addFmt(" = ivec{}(", numWrittenElements); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add(" = uintBitsToFloat("); + } + else + { + // float samplers + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add(" = floatBitsToInt("); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add(" = ("); + } + + bool unnormalizationHandled = false; + bool useTexelCoordinates = false; + + // handle illegal combinations + if (texOpcode == GPU7_TEX_INST_FETCH4 && (texDim == Latte::E_DIM::DIM_1D || texDim == Latte::E_DIM::DIM_1D_ARRAY)) + { + // fetch4 is not allowed on 1D textures + // seen in YWW during boss fight of Level 1-4 + // todo - investigate what this returns on actual HW + if (numWrittenElements == 1) + shaderContext->shaderSource->add("0.0"); + else + shaderContext->shaderSource->addFmt("vec{}(0.0)", numWrittenElements); + shaderContext->shaderSource->add(");" _CRLF); + return; + } + + + if (texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3]) ) + { + // texture is likely a RECT + if (hasOffset) + cemu_assert_unimplemented(); 
+ src->add("texelFetch("); + unnormalizationHandled = true; + useTexelCoordinates = true; + } + else if( texOpcode == GPU7_TEX_INST_FETCH4 ) + { + if( hasOffset ) + cemu_assert_unimplemented(); + src->add("textureGather("); + } + else if( texOpcode == GPU7_TEX_INST_LD ) + { + if( hasOffset ) + cemu_assert_unimplemented(); + src->add("texelFetch("); + unnormalizationHandled = true; + useTexelCoordinates = true; + } + else if( texOpcode == GPU7_TEX_INST_SAMPLE_L ) + { + // sample with LOD value set in gpr.w (replaces computed LOD value) + if( hasOffset ) + src->add("textureLodOffset("); + else + src->add("textureLod("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ) + { + // sample with LOD set to 0.0 (replaces computed LOD value) + if (hasOffset) + src->add("textureLodOffset("); + else + src->add("textureLod("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) + { + // sample with LOD biased + // note: AMD doc says LOD bias is calculated from instruction LOD_BIAS field. But it appears that LOD bias is taken from input register. Might actually be both? 
+ if (hasOffset) + src->add("textureOffset("); + else + src->add("texture("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE) + { + if (hasOffset) + src->add("textureOffset("); + else + src->add("texture("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + { + // sample with LOD value set in gpr.w (replaces computed LOD value) + if (hasOffset) + src->add("textureLodOffset("); + else + src->add("textureLod("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + { + // sample with LOD set to 0.0 (replaces computed LOD value) + if (hasOffset) + src->add("textureLodOffset("); + else + src->add("textureLod("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_C) + { + if (hasOffset) + src->add("textureOffset("); + else + src->add("texture("); + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_G) + { + if (hasOffset) + cemu_assert_unimplemented(); + src->add("textureGrad("); + } + else + { + if( hasOffset ) + cemu_assert_unimplemented(); + cemu_assert_unimplemented(); + src->add("texture("); + } + src->addFmt("tex{}, ", texInstruction->textureFetch.textureIndex); + + // for textureGather() add shift (todo: depends on rounding mode set in sampler registers?) 
+ if (texOpcode == GPU7_TEX_INST_FETCH4) + { + if (texDim == Latte::E_DIM::DIM_2D) + { + //src->addFmt2("(vec2(-0.1) / vec2(textureSize(tex{},0).xy)) + ", texInstruction->textureIndex); + + // vec2(-0.00001) is minimum to break Nvidia + // vec2(0.0001) is minimum to fix shadows on Intel, also fixes it on AMD (Windows and Linux) + + // todo - emulating coordinate rounding mode correctly is tricky + // GX2 supports two modes: Truncate or rounding according to DX9 rules + // Vulkan uses truncate mode when point sampling (min and mag is both nearest) otherwise it uses rounding + + // adding a small fixed bias is enough to avoid vendor-specific cases where small inaccuracies cause the number to get rounded down due to truncation + src->addFmt("vec2(0.0001) + "); + } + } + + const sint32 texCoordDataType = (texOpcode == GPU7_TEX_INST_LD) ? LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT; + if(useTexelCoordinates) + { + // handle integer coordinates for texelFetch + if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) + { + src->add("ivec2("); + src->add("vec2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, texCoordDataType); + src->addFmt(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, texCoordDataType); + + src->addFmt(")*uf_tex{}Scale", texInstruction->textureFetch.textureIndex); // close vec2 and scale + + src->add("), 0"); // close ivec2 and lod param + // todo - lod + } + else if (texDim == Latte::E_DIM::DIM_1D) + { + // VC DS games forget to initialize textures and use texel fetch on an uninitialized texture (a dim of 0 maps to 1D) + src->add("int("); + src->add("float("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, (texOpcode == GPU7_TEX_INST_LD) ? 
LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT); + src->addFmt(")*uf_tex{}Scale.x", texInstruction->textureFetch.textureIndex); + src->add("), 0"); + // todo - lod + } + else + cemu_assert_debug(false); + } + else /* useTexelCoordinates == false */ + { + // float coordinates + if ( (texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_L || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) ) + { + // shadow sampler + if (texDim == Latte::E_DIM::DIM_2D_ARRAY) + { + // 3 coords + compare value (as vec4) + src->add("vec4("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + + src->addFmt(",{})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); + } + else if (texDim == Latte::E_DIM::DIM_CUBEMAP) + { + // 2 coords + faceId + if (texInstruction->textureFetch.srcSel[0] >= 4 || texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->add("vec4("); + src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->addFmt(")"); + src->addFmt(",cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + } + else if (texDim == Latte::E_DIM::DIM_1D) + { + // 1 coord + 1 unused coord (per spec) + compare value + if (texInstruction->textureFetch.srcSel[0] >= 4) + { + debugBreakpoint(); + } + src->addFmt("vec3({},0.0,{})", 
_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + } + else + { + // 2 coords + compare value (as vec3) + if (texInstruction->textureFetch.srcSel[0] >= 4 && texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->addFmt("vec3({}, {})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + } + } + else if( texDim == Latte::E_DIM::DIM_3D || texDim == Latte::E_DIM::DIM_2D_ARRAY ) + { + // 3 coords + src->add("vec3("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else if( texDim == Latte::E_DIM::DIM_CUBEMAP ) + { + // 2 coords + faceId + cemu_assert_debug(texInstruction->textureFetch.srcSel[0] < 4); + cemu_assert_debug(texInstruction->textureFetch.srcSel[1] < 4); + src->add("vec4("); + src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + src->addFmt(",cubeMapArrayIndex{})", 
texInstruction->textureFetch.textureIndex); // cubemap index + } + else if( texDim == Latte::E_DIM::DIM_1D ) + { + // 1 coord + src->add(_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0)); + } + else + { + // 2 coords + src->add("vec2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + // avoid truncate to effectively round downwards on texel edges + if (ActiveSettings::ForceSamplerRoundToPrecision()) + src->addFmt("+ vec2(1.0)/vec2(textureSize(tex{}, 0))/512.0", texInstruction->textureFetch.textureIndex); + } + // lod or lod bias parameter + if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + { + src->add(","); + if(texOpcode == GPU7_TEX_INST_SAMPLE_LB) + src->add(_FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + else + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); + } + else if( texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) + { + src->add(",0.0"); + } + } + // gradient parameters + if (texOpcode == GPU7_TEX_INST_SAMPLE_G) + { + if (texDim == Latte::E_DIM::DIM_2D || + texDim == Latte::E_DIM::DIM_1D ) + { + src->add(",gradH.xy,gradV.xy"); + } + else + { + cemu_assert_unimplemented(); + } + } + // offset + if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ || texOpcode == GPU7_TEX_INST_SAMPLE || texOpcode == GPU7_TEX_INST_SAMPLE_C ) + { + if( hasOffset ) + { + uint8 offsetComponentCount = 0; + if( texDim == Latte::E_DIM::DIM_1D ) + offsetComponentCount = 1; + else if( texDim == Latte::E_DIM::DIM_2D ) + offsetComponentCount = 
2; + else if( texDim == Latte::E_DIM::DIM_3D ) + offsetComponentCount = 3; + else if( texDim == Latte::E_DIM::DIM_2D_ARRAY ) + offsetComponentCount = 2; + else + cemu_assert_unimplemented(); + + if( (texInstruction->textureFetch.offsetX&1) ) + cemu_assert_unimplemented(); + if( (texInstruction->textureFetch.offsetY&1) ) + cemu_assert_unimplemented(); + if ((texInstruction->textureFetch.offsetZ & 1)) + cemu_assert_unimplemented(); + + if( offsetComponentCount == 1 ) + src->addFmt(",{}", texInstruction->textureFetch.offsetX/2); + else if( offsetComponentCount == 2 ) + src->addFmt(",ivec2({},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + else if( offsetComponentCount == 3 ) + src->addFmt(",ivec3({},{},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + } + } + // lod bias + if( texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) + { + src->add(")"); + + if (numWrittenElements > 1) + { + // result is copied into multiple channels + src->add("."); + for (sint32 f = 0; f < numWrittenElements; f++) + { + cemu_assert_debug(texInstruction->dstSel[f] == 0); // only x component is defined + src->add("x"); + } + } + } + else + { + src->add(")."); + for (sint32 f = 0; f < 4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + uint8 elemIndex = texInstruction->dstSel[f]; + if (texOpcode == GPU7_TEX_INST_FETCH4) + { + // 's textureGather() and GPU7's FETCH4 instruction have a different order of elements + // xyzw: top-left, top-right, bottom-right, bottom-left + // textureGather xyzw + // fetch4 yzxw + // translate index from fetch4 to textureGather order + static uint8 fetchToGather[4] = + { + 2, // x -> z + 0, // y -> x + 1, // z -> y + 3, // w -> w + }; + elemIndex = fetchToGather[elemIndex]; + } + src->add(resultElemTable[elemIndex]); + numWrittenElements++; + } + else if( 
texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + } + src->add(");"); + + // debug +#ifdef CEMU_DEBUG_ASSERT + if(texInstruction->opcode == GPU7_TEX_INST_LD ) + src->add(" // TEX_INST_LD"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE ) + src->add(" // TEX_INST_SAMPLE"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_L ) + src->add(" // TEX_INST_SAMPLE_L"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_LZ ) + src->add(" // TEX_INST_SAMPLE_LZ"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_C ) + src->add(" // TEX_INST_SAMPLE_C"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_G ) + src->add(" // TEX_INST_SAMPLE_G"); + else + src->addFmt(" // 0x{:02x}", texInstruction->opcode); + if (texInstruction->opcode != texOpcode) + src->addFmt(" (applied as 0x{:02x})", texOpcode); + src->addFmt(" OffsetXYZ {:02x} {:02x} {:02x}", (uint8)texInstruction->textureFetch.offsetX&0xFF, (uint8)texInstruction->textureFetch.offsetY&0xFF, (uint8)texInstruction->textureFetch.offsetZ&0xFF); +#endif + src->add("" _CRLF); +} + +static void _emitTEXGetTextureResInfoCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->addFmt("R{}", texInstruction->dstGpr); + src->add("i"); + src->add("."); + + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + + // todo - mip index parameter? 
+ + auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; + + if (texDim == Latte::E_DIM::DIM_1D) + src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1,1).", texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_1D_ARRAY) + src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) + src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_2D_ARRAY) + src->addFmt(" = ivec4(textureSize(tex{}, 0),1).", texInstruction->textureFetch.textureIndex); + else + { + cemu_assert_debug(false); + src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); + } + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + src->add(";" _CRLF); +} + +static void _emitTEXGetCompTexLodCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + + if( 
shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex] == Latte::E_DIM::DIM_CUBEMAP ) + { + // 3 coordinates + if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("vec4(textureQueryLod(tex{}, {}.{}{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + else + src->addFmt("vec4(textureQueryLod(tex{}, intBitsToFloat({}.{}{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + } + else + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("vec4(textureQueryLod(tex{}, {}.{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + else + src->addFmt("vec4(textureQueryLod(tex{}, intBitsToFloat({}.{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + debugBreakpoint(); + } + + + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + src->add("."); + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not 
written + } + else + { + debugBreakpoint(); + } + } + src->add(";" _CRLF); +} + +static void _emitTEXSetCubemapIndexCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->addFmt("cubeMapArrayIndex{}", texInstruction->textureFetch.textureIndex); + const char* resultElemTable[4] = {"x","y","z","w"}; + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt(" = intBitsToFloat(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt(" = R{}f.{};" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); + else + cemu_assert_unimplemented(); +} + +static void _emitTEXGetGradientsHV(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 componentCount = 0; + for (sint32 i = 0; i < 4; i++) + { + if(texInstruction->dstSel[i] == 7) + continue; + componentCount++; + } + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + const char* resultElemTable[4] = { "x","y","z","w" }; + sint32 numWrittenElements = 0; + for (sint32 f = 0; f < 4; f++) + { + if (texInstruction->dstSel[f] < 4) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if (texInstruction->dstSel[f] == 7) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + + const char* funcName; + if (texInstruction->opcode == GPU7_TEX_INST_GET_GRADIENTS_H) + funcName = "dFdx"; + else + funcName = "dFdy"; + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + + src->addFmt("{}(", funcName); + _emitRegisterAccessCode(shaderContext, 
texInstruction->srcGpr, (componentCount >= 1) ? texInstruction->textureFetch.srcSel[0] : -1, (componentCount >= 2) ? texInstruction->textureFetch.srcSel[1] : -1, (componentCount >= 3) ? texInstruction->textureFetch.srcSel[2] : -1, (componentCount >= 4)?texInstruction->textureFetch.srcSel[3]:-1, LATTE_DECOMPILER_DTYPE_FLOAT); + + src->add(")"); + + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + + src->add(";" _CRLF); + +} + +static void _emitTEXSetGradientsHV(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (texInstruction->opcode == GPU7_TEX_INST_SET_GRADIENTS_H) + src->add("gradH = "); + else + src->add("gradV = "); + + _emitRegisterAccessCode(shaderContext, texInstruction->srcGpr, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], texInstruction->textureFetch.srcSel[2], texInstruction->textureFetch.srcSel[3], LATTE_DECOMPILER_DTYPE_FLOAT); + + src->add(";" _CRLF); +} + +static void _emitGSReadInputVFetchCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + + src->add("."); + + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + src->add("(v2g["); + if (texInstruction->textureFetch.srcSel[0] >= 4) + cemu_assert_unimplemented(); + if 
(texInstruction->textureFetch.srcSel[1] >= 4) + cemu_assert_unimplemented(); + // todo: Index type + src->add("0"); + src->addFmt("].passV2GParameter{}.", texInstruction->textureFetch.offset/16); + + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + src->add(";" _CRLF); +} + +static sint32 _writeDestMaskXYZW(LatteDecompilerShaderContext* shaderContext, sint8* dstSel) +{ + StringBuf* src = shaderContext->shaderSource; + const char* resultElemTable[4] = { "x","y","z","w" }; + sint32 numWrittenElements = 0; + for (sint32 f = 0; f < 4; f++) + { + if (dstSel[f] < 4) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if (dstSel[f] == 7) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + return numWrittenElements; +} + +static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + // handle special case where geometry shader reads input attributes from vertex shader via ringbuffer + StringBuf* src = shaderContext->shaderSource; + if( texInstruction->textureFetch.textureIndex == 0x9F && shaderContext->shaderType == LatteConst::ShaderType::Geometry ) + { + _emitGSReadInputVFetchCode(shaderContext, texInstruction); + return; + } + + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + + _writeDestMaskXYZW(shaderContext, texInstruction->dstSel); + const char* resultElemTable[4] = {"x","y","z","w"}; + + src->add(" = "); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("floatBitsToInt("); + else 
+ src->add("("); + + src->addFmt("ubuff{}[", texInstruction->textureFetch.textureIndex - 0x80); + + if( shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->addFmt("{}.{}", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); + else + src->addFmt("floatBitsToInt({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); + src->add("]."); + + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + src->add(");" _CRLF); +} + +static void _emitTEXReadMemCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + sint32 count = _writeDestMaskXYZW(shaderContext, texInstruction->dstSel); + + src->add(" = "); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("floatBitsToInt("); + else + src->add("("); + + sint32 readCount; + + if (texInstruction->memRead.format == FMT_32_FLOAT) + { + readCount = 1; + // todo + src->add("0.0"); + } + else if (texInstruction->memRead.format == FMT_32_32_FLOAT) + { + readCount = 2; + // todo + src->add("vec2(0.0,0.0)"); + } + else if (texInstruction->memRead.format == FMT_32_32_32_FLOAT) + { + readCount = 3; + // todo + src->add("vec3(0.0,0.0,0.0)"); + } + else + { + cemu_assert_unimplemented(); + } + if (count < readCount) + { + if (count == 1) + src->add(".x"); + else if (count == 2) + src->add(".xy"); + else if (count == 3) + src->add(".xyz"); + } + src->add(");" _CRLF); +} + +static void _emitTEXClauseCode(LatteDecompilerShaderContext* shaderContext, 
LatteDecompilerCFInstruction* cfInstruction) +{ + cemu_assert_debug(cfInstruction->instructionsALU.empty()); + for(auto& texInstruction : cfInstruction->instructionsTEX) + { + if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ || texInstruction.opcode == GPU7_TEX_INST_FETCH4 || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || texInstruction.opcode == GPU7_TEX_INST_LD ) + _emitTEXSampleTextureCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_GET_TEXTURE_RESINFO ) + _emitTEXGetTextureResInfoCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_GET_COMP_TEX_LOD ) + _emitTEXGetCompTexLodCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_SET_CUBEMAP_INDEX ) + _emitTEXSetCubemapIndexCode(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_H || + texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_V) + _emitTEXGetGradientsHV(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_H || + texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_V) + _emitTEXSetGradientsHV(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_VFETCH) + _emitTEXVFetchCode(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_MEM) + _emitTEXReadMemCode(shaderContext, &texInstruction); + else + cemu_assert_unimplemented(); + } +} + +// generate the code for reading the source input GPR (or constants) for exports +static void _emitExportGPRReadCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, 
sint32 requiredType, uint32 burstIndex) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 numOutputs = 4; + if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + numOutputs = (cfInstruction->memWriteCompMask&1)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&2)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&4)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&8)?1:0; + } + if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if(numOutputs == 1) + src->add("float("); + else + src->addFmt("vec{}(", numOutputs); + } + else if (requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (numOutputs == 1) + src->add("int("); + else + src->addFmt("ivec{}(", numOutputs); + } + else + cemu_assert_unimplemented(); + sint32 actualOutputs = 0; + for(sint32 i=0; i<4; i++) + { + // todo: Use type of register element based on information from type tracker (currently we assume it's always a signed integer) + uint32 exportSel = 0; + if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + exportSel = i; + if( (cfInstruction->memWriteCompMask&(1<exportComponentSel[i]; + } + if( actualOutputs > 0 ) + src->add(", "); + actualOutputs++; + if( exportSel < 4 ) + { + _emitRegisterAccessCode(shaderContext, cfInstruction->exportSourceGPR+burstIndex, exportSel, -1, -1, -1, requiredType); + } + else if (exportSel == 4) + { + // constant zero + src->add("0"); + } + else if (exportSel == 5) + { + // constant one + src->add("1.0"); + } + else if( exportSel == 7 ) + { + // element masked (which means 0 is exported?) 
+ src->add("0"); + } + else + { + cemu_assert_debug(false); + src->add("0"); + } + } + if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add(")"); + else if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add(")"); + else + cemu_assert_unimplemented(); +} + +static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add("// export" _CRLF); + if(shaderContext->shaderType == LatteConst::ShaderType::Vertex ) + { + if( cfInstruction->exportBurstCount != 0 ) + debugBreakpoint(); + if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION) + { + // export position + // GX2 special state 0 disables rasterizer viewport offset and scaling (probably, exact mechanism is not known). Handle this here + bool hasAnyViewportScaleDisabled = + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); + + if (hasAnyViewportScaleDisabled) + { + src->add("vec4 finalPos = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(";" _CRLF); + src->add("finalPos.xy = finalPos.xy * uf_windowSpaceToClipSpaceTransform - vec2(1.0,1.0);"); + src->add("SET_POSITION(finalPos);"); + } + else + { + src->add("SET_POSITION("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(");" _CRLF); + } + } + else if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_POINT_SIZE ) + { + // export gl_PointSize + if (shaderContext->analyzer.outputPointSize) + { + cemu_assert_debug(shaderContext->analyzer.writesPointSize); + src->add("gl_PointSize = ("); + _emitExportGPRReadCode(shaderContext, cfInstruction, 
LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(").x"); + src->add(";" _CRLF); + } + } + else if( cfInstruction->exportType == 2 && cfInstruction->exportArrayBase < 32 ) + { + // export parameter + sint32 paramIndex = cfInstruction->exportArrayBase; + uint32 vsSemanticId = _getVertexShaderOutParamSemanticId(shaderContext->contextRegisters, paramIndex); + if (vsSemanticId != 0xFF) + { + src->addFmt("passParameterSem{} = ", vsSemanticId); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(";" _CRLF); + } + else + { + src->add("// skipped export to semanticId 255" _CRLF); + } + } + else + cemu_assert_unimplemented(); + } + else if(shaderContext->shaderType == LatteConst::ShaderType::Pixel ) + { + if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8 ) + { + for(uint32 i=0; i<(cfInstruction->exportBurstCount+1); i++) + { + sint32 pixelColorOutputIndex = LatteDecompiler_getColorOutputIndexFromExportIndex(shaderContext, cfInstruction->exportArrayBase+i); + // if color output is for target 0, then also handle alpha test + bool alphaTestEnable = shaderContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_TEST_ENABLE(); + auto alphaTestFunc = shaderContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_FUNC(); + if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc == Latte::E_COMPAREFUNC::NEVER ) + { + // never pass alpha test + src->add("discard;" _CRLF); + } + else if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc != Latte::E_COMPAREFUNC::ALWAYS) + { + src->add("if( (("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); + src->add(").a "); + + switch( alphaTestFunc ) + { + case Latte::E_COMPAREFUNC::LESS: + src->add("<"); + break; + case Latte::E_COMPAREFUNC::EQUAL: + src->add("=="); + break; + case Latte::E_COMPAREFUNC::LEQUAL: + src->add("<="); + break; + case Latte::E_COMPAREFUNC::GREATER: + src->add(">"); + break; + 
case Latte::E_COMPAREFUNC::NOTEQUAL: + src->add("!="); + break; + case Latte::E_COMPAREFUNC::GEQUAL: + src->add(">="); + break; + } + src->add(" uf_alphaTestRef"); + src->add(") == false) discard;" _CRLF); + } + // pixel color output + src->addFmt("passPixelColor{} = ", pixelColorOutputIndex); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); + src->add(";" _CRLF); + + if( cfInstruction->exportArrayBase+i >= 8 ) + cemu_assert_unimplemented(); + } + } + else if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase == 61 ) + { + // pixel depth or gl_FragStencilRefARB + if( cfInstruction->exportBurstCount > 0 ) + cemu_assert_unimplemented(); + + if (cfInstruction->exportComponentSel[0] == 7) + { + cemu_assert_unimplemented(); // gl_FragDepth ? + } + if (cfInstruction->exportComponentSel[1] != 7) + { + cemu_assert_unimplemented(); // exporting to gl_FragStencilRefARB + } + if (cfInstruction->exportComponentSel[2] != 7) + { + cemu_assert_unimplemented(); // ukn + } + if (cfInstruction->exportComponentSel[3] != 7) + { + cemu_assert_unimplemented(); // ukn + } + + src->add("gl_FragDepth = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(".x"); + src->add(";" _CRLF); + } + else + cemu_assert_unimplemented(); + } +} + +static void _emitXYZWByMask(StringBuf* src, uint32 mask) +{ + if( (mask&(1<<0)) != 0 ) + src->add("x"); + if( (mask&(1<<1)) != 0 ) + src->add("y"); + if( (mask&(1<<2)) != 0 ) + src->add("z"); + if( (mask&(1<<3)) != 0 ) + src->add("w"); +} + +static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + // calculate parameter output (based on ring buffer output offset relative to GS unit) + uint32 bytesPerVertex = shaderContext->contextRegisters[mmSQ_GS_VERT_ITEMSIZE] * 4; + bytesPerVertex = std::max(bytesPerVertex, (uint32)1); // avoid 
division by zero + uint32 parameterOffset = ((cfInstruction->exportArrayBase * 4) % bytesPerVertex); + // for geometry shaders with streamout, MEM_RING_WRITE is used to pass the data to the copy shader, which then uses STREAM*_WRITE + if (shaderContext->shaderType == LatteConst::ShaderType::Geometry && shaderContext->analyzer.hasStreamoutEnable) + { + // if streamout is enabled, we generate transform feedback output code instead of the normal gs output + for (uint32 burstIndex = 0; burstIndex < (cfInstruction->exportBurstCount + 1); burstIndex++) + { + parameterOffset = ((cfInstruction->exportArrayBase * 4 + burstIndex*0x10) % bytesPerVertex); + // find matching stream write in copy shader + LatteGSCopyShaderStreamWrite_t* streamWrite = nullptr; + for (auto& it : shaderContext->parsedGSCopyShader->list_streamWrites) + { + if (it.offset == parameterOffset) + { + streamWrite = ⁢ + break; + } + } + if (streamWrite == nullptr) + { + cemu_assert_suspicious(); + return; + } + + for (sint32 i = 0; i < 4; i++) + { + if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) + continue; + + if (shaderContext->options->useTFViaSSBO) + { + uint32 u32Offset = streamWrite->exportArrayBase + i; + src->addFmt("sb_buffer[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); + } + else + { + src->addFmt("sb{}[{}]", streamWrite->bufferIndex, streamWrite->exportArrayBase + i); + } + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->addFmt("{}.", _getRegisterVarName(shaderContext, cfInstruction->exportSourceGPR+burstIndex)); + if (i == 0) + src->add("x"); + else if (i == 1) + src->add("y"); + else if (i == 2) + src->add("z"); + else if (i == 3) + src->add("w"); + + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(";" _CRLF); + } + } + return; + } + + if (shaderContext->shaderType == 
LatteConst::ShaderType::Vertex) + { + if (cfInstruction->memWriteElemSize != 3) + cemu_assert_unimplemented(); + if ((cfInstruction->exportArrayBase & 3) != 0) + cemu_assert_unimplemented(); + for (sint32 burstIndex = 0; burstIndex < (sint32)(cfInstruction->exportBurstCount + 1); burstIndex++) + { + src->addFmt("v2g.passV2GParameter{}.", (cfInstruction->exportArrayBase) / 4 + burstIndex); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_SIGNED_INT, burstIndex); + src->add(";" _CRLF); + } + } + else if (shaderContext->shaderType == LatteConst::ShaderType::Geometry) + { + cemu_assert_debug(cfInstruction->memWriteElemSize == 3); + //if (cfInstruction->memWriteElemSize != 3) + // debugBreakpoint(); + cemu_assert_debug((cfInstruction->exportArrayBase & 3) == 0); + + for (uint32 burstIndex = 0; burstIndex < (cfInstruction->exportBurstCount + 1); burstIndex++) + { + uint32 parameterExportType = 0; + uint32 parameterExportBase = 0; + if (LatteGSCopyShaderParser_getExportTypeByOffset(shaderContext->parsedGSCopyShader, parameterOffset + burstIndex * (cfInstruction->memWriteElemSize+1)*4, ¶meterExportType, ¶meterExportBase) == false) + { + cemu_assert_debug(false); + shaderContext->hasError = true; + return; + } + + if (parameterExportType == 1 && parameterExportBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION) + { + src->add("{" _CRLF); + src->addFmt("vec4 pos = vec4(0.0,0.0,0.0,1.0);" _CRLF); + src->addFmt("pos."); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); + src->add(";" _CRLF); + src->add("SET_POSITION(pos);" _CRLF); + src->add("}" _CRLF); + } + else if (parameterExportType == 2 && parameterExportBase < 16) + { + src->addFmt("passG2PParameter{}.", parameterExportBase); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + 
src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); + src->add(";" _CRLF); + } + else + cemu_assert_debug(false); + } + } + else + debugBreakpoint(); // todo +} + +static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (shaderContext->analyzer.hasStreamoutEnable == false) + { +#ifdef CEMU_DEBUG_ASSERT + src->add("// omitted streamout write" _CRLF); +#endif + return; + } + uint32 streamoutBufferIndex; + if (cfInstruction->type == GPU7_CF_INST_MEM_STREAM0_WRITE) + streamoutBufferIndex = 0; + else if (cfInstruction->type == GPU7_CF_INST_MEM_STREAM1_WRITE) + streamoutBufferIndex = 1; + else + cemu_assert_unimplemented(); + + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + uint32 arraySize = cfInstruction->memWriteArraySize + 1; + + for (sint32 i = 0; i < (sint32)arraySize; i++) + { + if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) + continue; + + if (shaderContext->options->useTFViaSSBO) + { + uint32 u32Offset = cfInstruction->exportArrayBase + i; + src->addFmt("sb_buffer[sbBase{} + {}]", streamoutBufferIndex, u32Offset); + } + else + { + src->addFmt("sb{}[{}]", streamoutBufferIndex, cfInstruction->exportArrayBase + i); + } + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(_getRegisterVarName(shaderContext, cfInstruction->exportSourceGPR)); + _appendChannelAccess(src, i); + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(";" _CRLF); + } + } + else + cemu_assert_debug(false); +} + +static void _emitCFCall(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 
subroutineAddr = cfInstruction->addr; + LatteDecompilerSubroutineInfo* subroutineInfo = nullptr; + // find subroutine + for (auto& subroutineItr : shaderContext->list_subroutines) + { + if (subroutineItr.cfAddr == subroutineAddr) + { + subroutineInfo = &subroutineItr; + break; + } + } + if (subroutineInfo == nullptr) + { + cemu_assert_debug(false); + return; + } + // inline function + if (shaderContext->isSubroutine) + { + cemu_assert_debug(false); // inlining with cascaded function calls not supported + return; + } + // init CF stack variables + src->addFmt("activeMaskStackSub{:04x}[0] = true;" _CRLF, subroutineInfo->cfAddr); + src->addFmt("activeMaskStackCSub{:04x}[0] = true;" _CRLF, subroutineInfo->cfAddr); + src->addFmt("activeMaskStackCSub{:04x}[1] = true;" _CRLF, subroutineInfo->cfAddr); + + shaderContext->isSubroutine = true; + shaderContext->subroutineInfo = subroutineInfo; + for(auto& cfInstruction : subroutineInfo->instructions) + LatteDecompiler_emitClauseCodeMSL(shaderContext, &cfInstruction, true); + shaderContext->isSubroutine = false; + shaderContext->subroutineInfo = nullptr; +} + +void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, bool isSubroutine) +{ + StringBuf* src = shaderContext->shaderSource; + + if( cfInstruction->type == GPU7_CF_INST_ALU || cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE || cfInstruction->type == GPU7_CF_INST_ALU_POP_AFTER || cfInstruction->type == GPU7_CF_INST_ALU_POP2_AFTER || cfInstruction->type == GPU7_CF_INST_ALU_BREAK || cfInstruction->type == GPU7_CF_INST_ALU_ELSE_AFTER ) + { + // emit ALU code + if (shaderContext->analyzer.modifiesPixelActiveState) + { + if(cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE) + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 1)); + else + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, 
cfInstruction->activeStackDepth + 1)); + } + if (cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE) + { + src->addFmt("{} = {};" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth-1)); + src->addFmt("{} = {};" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + _emitALUClauseCode(shaderContext, cfInstruction); + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->add("}" _CRLF); + cemu_assert_debug(!(shaderContext->analyzer.modifiesPixelActiveState == false && cfInstruction->type != GPU7_CF_INST_ALU)); + // handle ELSE case of PUSH_BEFORE + if( cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE ) + { + src->add("else {" _CRLF); + src->addFmt("{} = false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = false;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + src->add("}" _CRLF); + } + // post clause handler + if( cfInstruction->type == GPU7_CF_INST_ALU_POP_AFTER ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - 1)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_POP2_AFTER ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 2), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - 2), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - 2)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_ELSE_AFTER ) + { + // no condition test + // pop stack + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); 
+ // else operation + src->addFmt("{} = {} == false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + } + else if( cfInstruction->type == GPU7_CF_INST_TEX ) + { + // emit TEX code + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth+1)); + } + _emitTEXClauseCode(shaderContext, cfInstruction); + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->add("}" _CRLF); + } + } + else if( cfInstruction->type == GPU7_CF_INST_EXPORT || cfInstruction->type == GPU7_CF_INST_EXPORT_DONE ) + { + // emit export code + _emitExportCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_ELSE ) + { + // todo: Condition test, popCount? 
+ src->addFmt("{} = {} == false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + else if( cfInstruction->type == GPU7_CF_INST_POP ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - cfInstruction->popCount), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - cfInstruction->popCount), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - cfInstruction->popCount)); + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_START_DX10 || + cfInstruction->type == GPU7_CF_INST_LOOP_START_NO_AL) + { + // start of loop + // if pixel is disabled, then skip loop + if (ActiveSettings::ShaderPreventInfiniteLoopsEnabled()) + { + // with iteration limit to prevent infinite loops + src->addFmt("int loopCounter{} = 0;" _CRLF, (sint32)cfInstruction->cfAddr); + src->addFmt("while( {} == true && loopCounter{} < 500 )" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), (sint32)cfInstruction->cfAddr); + src->add("{" _CRLF); + src->addFmt("loopCounter{}++;" _CRLF, (sint32)cfInstruction->cfAddr); + } + else + { + src->addFmt("while( {} == true )" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + src->add("{" _CRLF); + } + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_END ) + { + // this might not always work + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + src->add("}" _CRLF); + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_BREAK ) + { + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + if 
(shaderContext->analyzer.modifiesPixelActiveState) + { + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + } + // note: active stack level is set to the same level as the loop begin. popCount is ignored + src->add("break;" _CRLF); + + if (shaderContext->analyzer.modifiesPixelActiveState) + src->add("}" _CRLF); + + } + else if( cfInstruction->type == GPU7_CF_INST_MEM_STREAM0_WRITE || + cfInstruction->type == GPU7_CF_INST_MEM_STREAM1_WRITE ) + { + _emitStreamWriteCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + _emitCFRingWriteCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_EMIT_VERTEX ) + { + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + // write point size + if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) + src->add("gl_PointSize = uf_pointSize;" _CRLF); + // emit vertex + src->add("EmitVertex();" _CRLF); + // increment transform feedback pointer + if (shaderContext->analyzer.useSSBOForStreamout) + { + for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0); + src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4); + } + } + + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->add("}" _CRLF); + } + else if (cfInstruction->type == GPU7_CF_INST_CALL) + { + _emitCFCall(shaderContext, cfInstruction); + } + else if (cfInstruction->type == GPU7_CF_INST_RETURN) + { + // todo (handle properly) + } + else + { + cemu_assert_debug(false); + } +} + +void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderContext, 
StringBuf* fCStr_shaderSource) +{ + if( shaderContext->analyzer.hasRedcCUBE ) + { + fCStr_shaderSource->add("void redcCUBE(vec4 src0, vec4 src1, out vec3 stm, out int faceId)\r\n" + "{\r\n" + "// stm -> x .. s, y .. t, z .. MajorAxis*2.0\r\n" + + "vec3 inputCoord = normalize(vec3(src1.y, src1.x, src0.x));\r\n" + + "float rx = inputCoord.x;\r\n" + "float ry = inputCoord.y;\r\n" + "float rz = inputCoord.z;\r\n" + "if( abs(rx) > abs(ry) && abs(rx) > abs(rz) )\r\n" + "{\r\n" + "stm.z = rx*2.0;\r\n" + "stm.xy = vec2(ry,rz); \r\n" + "if( rx >= 0.0 )\r\n" + "{\r\n" + "faceId = 0;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 1;\r\n" + "}\r\n" + "}\r\n" + "else if( abs(ry) > abs(rx) && abs(ry) > abs(rz) )\r\n" + "{\r\n" + "stm.z = ry*2.0;\r\n" + "stm.xy = vec2(rx,rz); \r\n" + "if( ry >= 0.0 )\r\n" + "{\r\n" + "faceId = 2;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 3;\r\n" + "}\r\n" + "}\r\n" + "else //if( abs(rz) > abs(ry) && abs(rz) > abs(rx) )\r\n" + "{\r\n" + "stm.z = rz*2.0;\r\n" + "stm.xy = vec2(rx,ry); \r\n" + "if( rz >= 0.0 )\r\n" + "{\r\n" + "faceId = 4;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 5;\r\n" + "}\r\n" + "}\r\n" + "}\r\n"); + } + + if( shaderContext->analyzer.hasCubeMapTexture ) + { + fCStr_shaderSource->add("vec3 redcCUBEReverse(vec2 st, int faceId)\r\n" + "{\r\n" + "st.yx = st.xy;\r\n" + "vec3 v;\r\n" + "float majorAxis = 1.0;\r\n" + "if( faceId == 0 )\r\n" + "{\r\n" + "v.yz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.x = 1.0;\r\n" + "}\r\n" + "else if( faceId == 1 )\r\n" + "{\r\n" + "v.yz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.x = -1.0;\r\n" + "}\r\n" + "else if( faceId == 2 )\r\n" + "{\r\n" + "v.xz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.y = 1.0;\r\n" + "}\r\n" + "else if( faceId == 3 )\r\n" + "{\r\n" + "v.xz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.y = -1.0;\r\n" + "}\r\n" + "else if( faceId == 4 )\r\n" + "{\r\n" + "v.xy = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.z = 1.0;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "v.xy 
= (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.z = -1.0;\r\n" + "}\r\n" + + "return v;\r\n" + "}\r\n"); + } + + // clamp + fCStr_shaderSource->add("" + "int clampFI32(int v)\r\n" + "{\r\n" + "if( v == 0x7FFFFFFF )\r\n" + " return floatBitsToInt(1.0);\r\n" + "else if( v == 0xFFFFFFFF )\r\n" + " return floatBitsToInt(0.0);\r\n" + "return floatBitsToInt(clamp(intBitsToFloat(v), 0.0, 1.0));\r\n" + "}\r\n"); + // mul non-ieee way (0*NaN/INF => 0.0) + if (shaderContext->options->strictMul) + { + // things we tried: + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(a*b,0.0,a==0.0||b==0.0); }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(vec2(a*b,0.0),vec2(0.0,0.0),(equal(vec2(a),vec2(0.0,0.0))||equal(vec2(b),vec2(0.0,0.0)))).x; }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = intBitsToFloat(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works + + // for "min" it used to be: float mul_nonIEEE(float a, float b){ return min(a*b,min(abs(a)*3.40282347E+38F,abs(b)*3.40282347E+38F)); } + + if( LatteGPUState.glVendor == GLVENDOR_NVIDIA && !ActiveSettings::DumpShadersEnabled()) + fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){return mix(0.0, a*b, (a != 0.0) && (b != 0.0));}" _CRLF); // compiles faster on Nvidia and also results in lower RAM usage (OpenGL) + else + fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" _CRLF); + + // DXKV-like: fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return (b==0.0 ? 0.0 : a) * (a==0.0 ? 
0.0 : b); }" _CRLF); + } +} + +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp" + +static void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* shaderContext, LatteParsedFetchShaderAttribute_t& attrib) +{ + auto src = shaderContext->shaderSource; + + static const char* dsMappingTableFloat[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", /*"floatBitsToInt(0.0)"*/ "0", /*"floatBitsToInt(1.0)"*/ "0x3f800000" }; + static const char* dsMappingTableInt[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", "0", "1" }; + + // get register index based on vtx semantic table + uint32 attributeShaderLoc = 0xFFFFFFFF; + for (sint32 f = 0; f < 32; f++) + { + if (shaderContext->contextRegisters[mmSQ_VTX_SEMANTIC_0 + f] == attrib.semanticId) + { + attributeShaderLoc = f; + break; + } + } + if (attributeShaderLoc == 0xFFFFFFFF) + return; // attribute is not mapped to VS input + uint32 registerIndex = attributeShaderLoc + 1; // R0 is skipped + // is register used? 
+ if ((shaderContext->analyzer.gprUseMask[registerIndex / 8] & (1 << (registerIndex % 8))) == 0) + { + src->addFmt("// skipped unused attribute for r{}" _CRLF, registerIndex); + return; + } + + LatteDecompiler_emitAttributeDecodeMSL(shaderContext->shader, src, &attrib); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = int4(", _getRegisterVarName(shaderContext, registerIndex)); + else + src->addFmt("{} = float4(", _getRegisterVarName(shaderContext, registerIndex)); + for (sint32 f = 0; f < 4; f++) + { + uint8 ds = attrib.ds[f]; + if (f > 0) + src->add(", "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + if (ds >= 6) + { + cemu_assert_unimplemented(); + ds = 4; // read as 0.0 + } + if (attrib.nfa != 1) + { + src->add(dsMappingTableFloat[ds]); + } + else + { + src->add(dsMappingTableInt[ds]); + } + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + } + src->add(");" _CRLF); +} + +void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader) +{ + StringBuf* src = new StringBuf(1024*1024*12); // reserve 12MB for generated source (we resize-to-fit at the end) + shaderContext->shaderSource = src; + + // debug info + src->addFmt("// shader {:016x}" _CRLF, shaderContext->shaderBaseHash); +#ifdef CEMU_DEBUG_ASSERT + src->addFmt("// usesIntegerValues: {}" _CRLF, shaderContext->analyzer.usesIntegerValues?"true":"false"); + src->addFmt(_CRLF); +#endif + // header part (definitions for inputs and outputs) + LatteDecompiler::emitHeader(shaderContext); + // helper functions + LatteDecompiler_emitHelperFunctions(shaderContext, src); + // start of main + src->add("void main()" _CRLF); + src->add("{" _CRLF); + // variable definition + if (shaderContext->typeTracker.useArrayGPRs == false) + { + // each register is a separate 
variable + for (sint32 i = 0; i < 128; i++) + { + if (shaderContext->analyzer.usesRelativeGPRRead || (shaderContext->analyzer.gprUseMask[i / 8] & (1 << (i & 7))) != 0) + { + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int4 R{}i = int4(0);" _CRLF, i); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float4 R{}f = float4(0.0);" _CRLF, i); + } + } + } + else + { + // registers are represented using a single large array + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int4 Ri[128];" _CRLF); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float4 Rf[128];" _CRLF); + for (sint32 i = 0; i < 128; i++) + { + if (shaderContext->typeTracker.genIntReg) + src->addFmt("Ri[{}] = int4(0);" _CRLF, i); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("Rf[{}] = float4(0.0);" _CRLF, i); + } + } + + if( shader->shaderType == LatteConst::ShaderType::Vertex ) + src->addFmt("uint4 attrDecoder;" _CRLF); + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int backupReg0i, backupReg1i, backupReg2i, backupReg3i, backupReg4i;" _CRLF); + if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float backupReg0f, backupReg1f, backupReg2f, backupReg3f, backupReg4f;" _CRLF); + if (shaderContext->typeTracker.genIntReg) + { + src->addFmt("int PV0ix = 0, PV0iy = 0, PV0iz = 0, PV0iw = 0, PV1ix = 0, PV1iy = 0, PV1iz = 0, PV1iw = 0;" _CRLF); + src->addFmt("int PS0i = 0, PS1i = 0;" _CRLF); + src->addFmt("int4 tempi = int4(0);" _CRLF); + } + if (shaderContext->typeTracker.genFloatReg) + { + src->addFmt("float PV0fx = 0.0, PV0fy = 0.0, PV0fz = 0.0, PV0fw = 0.0, PV1fx = 0.0, PV1fy = 0.0, PV1fz = 0.0, PV1fw = 0.0;" _CRLF); + src->addFmt("float PS0f = 0.0, PS1f = 0.0;" _CRLF); + src->addFmt("float4 tempf = float4(0.0);" _CRLF); + } + if (shaderContext->analyzer.hasGradientLookup) + { + src->add("float4 gradH;" _CRLF); + src->add("float4 gradV;" _CRLF); + } + src->add("float tempResultf;" _CRLF); + src->add("int tempResulti;" 
_CRLF); + src->add("int4 ARi = int4(0);" _CRLF); + src->add("bool predResult = true;" _CRLF); + if(shaderContext->analyzer.modifiesPixelActiveState ) + { + src->addFmt("bool activeMaskStack[{}];" _CRLF, shaderContext->analyzer.activeStackMaxDepth+1); + src->addFmt("bool activeMaskStackC[{}];" _CRLF, shaderContext->analyzer.activeStackMaxDepth+2); + for (sint32 i = 0; i < shaderContext->analyzer.activeStackMaxDepth; i++) + { + src->addFmt("activeMaskStack[{}] = false;" _CRLF, i); + } + for (sint32 i = 0; i < shaderContext->analyzer.activeStackMaxDepth+1; i++) + { + src->addFmt("activeMaskStackC[{}] = false;" _CRLF, i); + } + src->addFmt("activeMaskStack[0] = true;" _CRLF); + src->addFmt("activeMaskStackC[0] = true;" _CRLF); + src->addFmt("activeMaskStackC[1] = true;" _CRLF); + // generate vars for each subroutine + for (auto& subroutineInfo : shaderContext->list_subroutines) + { + sint32 subroutineMaxStackDepth = 0; + src->addFmt("bool activeMaskStackSub{:04x}[{}];" _CRLF, subroutineInfo.cfAddr, subroutineMaxStackDepth + 1); + src->addFmt("bool activeMaskStackCSub{:04x}[{}];" _CRLF, subroutineInfo.cfAddr, subroutineMaxStackDepth + 2); + } + } + // helper variables for cube maps (todo: Only emit when used) + if (shaderContext->analyzer.hasRedcCUBE) + { + src->add("float3 cubeMapSTM;" _CRLF); + src->add("int cubeMapFaceId;" _CRLF); + } + for(sint32 i=0; ioutput->textureUnitMask[i]) + continue; + if( shader->textureUnitDim[i] != Latte::E_DIM::DIM_CUBEMAP ) + continue; + src->addFmt("float cubeMapArrayIndex{} = 0.0;" _CRLF, i); + } + // init base offset for streamout buffer writes + if (shaderContext->analyzer.useSSBOForStreamout && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + { + for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if(!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i]&3) == 0); + + if 
(shader->shaderType == LatteConst::ShaderType::Vertex) // vertex shader + src->addFmt("int sbBase{} = uf_streamoutBufferBase{}/4 + (gl_VertexID + uf_verticesPerInstance * gl_InstanceID)*{};" _CRLF, i, i, shaderContext->output->streamoutBufferStride[i] / 4); + else // geometry shader + { + uint32 gsOutPrimType = shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; + uint32 bytesPerVertex = shaderContext->contextRegisters[mmSQ_GS_VERT_ITEMSIZE] * 4; + uint32 maxVerticesInGS = ((shaderContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF) * 4) / bytesPerVertex; + + cemu_assert_debug(gsOutPrimType == 0); // currently we only properly handle GS output primitive points + + src->addFmt("int sbBase{} = uf_streamoutBufferBase{}/4 + (gl_PrimitiveIDIn * {})*{};" _CRLF, i, i, maxVerticesInGS, shaderContext->output->streamoutBufferStride[i] / 4); + } + } + + } + // code to load inputs from previous stage + if( shader->shaderType == LatteConst::ShaderType::Vertex ) + { + if( (shaderContext->analyzer.gprUseMask[0/8]&(1<<(0%8))) != 0 ) + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = int4(gl_VertexID, 0, 0, gl_InstanceID);" _CRLF, _getRegisterVarName(shaderContext, 0)); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = as_type(float4(gl_VertexID, 0, 0, gl_InstanceID));" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: is this correct? 
+ else + cemu_assert_unimplemented(); + } + + LatteFetchShader* parsedFetchShader = shaderContext->fetchShader; + for(auto& bufferGroup : parsedFetchShader->bufferGroups) + { + for(sint32 i=0; ibufferGroupsInvalid) + { + // these attributes point to non-existent buffers + // todo - figure out how the hardware actually handles this, currently we assume the input values are zero + for (sint32 i = 0; i < bufferGroup.attribCount; i++) + LatteDecompiler_emitAttributeImport(shaderContext, bufferGroup.attrib[i]); + } + } + else if (shader->shaderType == LatteConst::ShaderType::Pixel) + { + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + + uint32 psControl0 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_0]; + uint32 psControl1 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_1]; + + uint32 spiInterpControl = shaderContext->contextRegisters[mmSPI_INTERP_CONTROL_0]; + uint8 spriteEnable = (spiInterpControl >> 1) & 1; + cemu_assert_debug(spriteEnable == 0); + + uint8 frontFace_enabled = (psControl1 >> 8) & 1; + uint8 frontFace_chan = (psControl1 >> 9) & 3; + uint8 frontFace_allBits = (psControl1 >> 11) & 1; + uint8 frontFace_regIndex = (psControl1 >> 12) & 0x1F; + + // handle param_gen + if (psInputTable->paramGen != 0) + { + cemu_assert_debug((psInputTable->paramGen) == 1); // handle the other bits (the same set of coordinates with different perspective/projection settings?) 
+ uint32 paramGenGPRIndex = psInputTable->paramGenGPR; + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = in.position.xyxy;" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + else + src->addFmt("{} = as_type(gl_PointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + } + + for (sint32 i = 0; i < psInputTable->count; i++) + { + uint32 psControl0 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_0]; + uint32 spi0_paramGen = (psControl0 >> 15) & 0xF; + + sint32 gprIndex = i;// +spi0_paramGen + paramRegOffset; + if ((shaderContext->analyzer.gprUseMask[gprIndex / 8] & (1 << (gprIndex % 8))) == 0 && shaderContext->analyzer.usesRelativeGPRRead == false) + continue; + uint32 psInputSemanticId = psInputTable->import[i].semanticId; + if (psInputSemanticId == LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION) + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = GET_FRAGCOORD();" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + else + src->addFmt("{} = as_type(GET_FRAGCOORD());" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + continue; + } + + if (shaderContext->options->usesGeometryShader) + { + // import from geometry shader + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = as_type(passG2PParameter{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = passG2PParameter{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); + else + cemu_assert_unimplemented(); + } + else + { + // import from vertex shader + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = as_type(passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), 
psInputSemanticId); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + else + cemu_assert_unimplemented(); + } + } + // front facing attribute + if (frontFace_enabled) + { + if ((shaderContext->analyzer.gprUseMask[0 / 8] & (1 << (0 % 8))) != 0) + { + if (frontFace_allBits) + cemu_assert_debug(false); + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{}.{} = as_type(frontFacing?1.0:0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{}.{} = frontFacing ? 1.0 : 0.0;" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + else + cemu_assert_debug(false); + } + } + } + for(auto& cfInstruction : shaderContext->cfInstructions) + LatteDecompiler_emitClauseCodeMSL(shaderContext, &cfInstruction, false); + if( shader->shaderType == LatteConst::ShaderType::Geometry ) + src->add("EndPrimitive();" _CRLF); + // vertex shader should write renderstate point size at the end if required but not modified by shader + if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) + { + if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader == false) + src->add("out.pointSize = uf_pointSize;" _CRLF); + } + // end of shader main + src->add("}" _CRLF); + src->shrink_to_fit(); + shader->strBuf_shaderSource = src; +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp new file mode 100644 index 000000000..8219646aa --- /dev/null +++ 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp @@ -0,0 +1,508 @@ +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShaderAssembly.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/Latte.h" +#include "Cafe/HW/Latte/Core/LatteDraw.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "util/helpers/StringBuf.h" + +#define _CRLF "\r\n" + +static void _readLittleEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = attrDataSem{};" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(attrDataSem{}.xyz,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU16x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = attrDataSem{};" _CRLF, attributeInputIndex); +} + +static void _readBigEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + 
src->addFmt("attrDecoder = attrDataSem{};" _CRLF, attributeInputIndex); + src->add("attrDecoder = (attrDecoder>>24)|((attrDecoder>>8)&0xFF00)|((attrDecoder<<8)&0xFF0000)|((attrDecoder<<24));" _CRLF); +} + +static void _readBigEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xyz = attrDataSem{}.xyz;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xyz = (attrDecoder.xyz>>24)|((attrDecoder.xyz>>8)&0xFF00)|((attrDecoder.xyz<<8)&0xFF0000)|((attrDecoder.xyz<<24));" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xy = (attrDecoder.xy>>24)|((attrDecoder.xy>>8)&0xFF00)|((attrDecoder.xy<<8)&0xFF0000)|((attrDecoder.xy<<24));" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU32x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.x = attrDataSem{}.x;" _CRLF, attributeInputIndex); + src->add("attrDecoder.x = (attrDecoder.x>>24)|((attrDecoder.x>>8)&0xFF00)|((attrDecoder.x<<8)&0xFF0000)|((attrDecoder.x<<24));" _CRLF); + src->add("attrDecoder.y = 0;" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.x = ((attrDecoder.x>>8)&0xFF)|((attrDecoder.x<<8)&0xFF00);" _CRLF); + src->add("attrDecoder.y = 0;" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x2(LatteDecompilerShader* 
shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xy = ((attrDecoder.xy>>8)&0xFF)|((attrDecoder.xy<<8)&0xFF00);" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("attrDecoder = ((attrDecoder>>8)&0xFF)|((attrDecoder<<8)&0xFF00);" _CRLF); +} + +void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext, StringBuf* src, LatteParsedFetchShaderAttribute_t* attrib) +{ + if (attrib->attributeBufferIndex >= Latte::GPU_LIMITS::NUM_VERTEX_BUFFERS) + { + src->add("attrDecoder = int4(0);" _CRLF); + return; + } + + uint32 attributeInputIndex = attrib->semanticId; + if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_U32 ) + { + if( attrib->format == FMT_32_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_2_10_10_10 && attrib->nfa == 0 ) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + // Bayonetta 2 uses this format to store normals + src->add("attrDecoder.xyzw = uint4((attrDecoder.x>>0)&0x3FF,(attrDecoder.x>>10)&0x3FF,(attrDecoder.x>>20)&0x3FF,(attrDecoder.x>>30)&0x3);" _CRLF); + if (attrib->isSigned 
!= 0) + { + src->add("if( (attrDecoder.x&0x200) != 0 ) attrDecoder.x |= 0xFFFFFC00;" _CRLF); + src->add("if( (attrDecoder.y&0x200) != 0 ) attrDecoder.y |= 0xFFFFFC00;" _CRLF); + src->add("if( (attrDecoder.z&0x200) != 0 ) attrDecoder.z |= 0xFFFFFC00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/511.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/511.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/511.0,-1.0));" _CRLF); + } + else + { + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/1023.0,-1.0));" _CRLF); + } + src->add("attrDecoder.w = as_type(float(attrDecoder.w));" _CRLF); // unsure? + + } + else if( attrib->format == FMT_32_32_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 0) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 1) + { + // we can just read the signed s32 as a u32 since no sign-extension is necessary + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = 
as_type(vec4(attrDataSem{}.wzyx)/255.0);" _CRLF, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/127.0,-1.0));" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) + { + // seen in Ben 10 Omniverse + src->addFmt("attrDecoder.xyzw = as_type(vec4(attrDataSem{}.wzyx));" _CRLF, attributeInputIndex); + } + else + { + cemuLog_log(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}\n", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + cemu_assert_unimplemented(); + } + } + else if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_NONE ) + { + if( attrib->format == FMT_32_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readLittleEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32_32_32_FLOAT && attrib->nfa == 2) + { + 
_readLittleEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32_32_FLOAT && attrib->nfa == 2) + { + // seen in Cities of Gold + _readLittleEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 0) + { + // seen in Nano Assault Neo + _readLittleEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_2_10_10_10 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in Fast Racing Neo + _readLittleEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xyzw = uint4((attrDecoder.x>>0)&0x3FF,(attrDecoder.x>>10)&0x3FF,(attrDecoder.x>>20)&0x3FF,(attrDecoder.x>>30)&0x3);" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(float(attrDecoder.w));" _CRLF); // todo - is this correct? 
+ } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + // seen in CoD ghosts + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/32767.0,-1.0));" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 2 && attrib->isSigned == 1 ) + { + // seen in Rabbids Land + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.xyzw = as_type(float4(int4(attrDecoder)));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16_FLOAT && attrib->nfa == 2) + { + // seen in Giana Sisters: Twisted Dreams + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); + } + else if (attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + // seen in Nano Assault Neo + 
_readLittleEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_16_16_FLOAT && attrib->nfa == 2) + { + // seen in Giana Sisters: Twisted Dreams + _readLittleEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + src->addFmt("attrDecoder.xyzw = as_type(float4(attrDataSem{}.xyzw)/255.0);" _CRLF, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/127.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0) + { + src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, 
attributeInputIndex); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned != 0) + { + // seen in Sonic Lost World + src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0 ) + { + // seen in One Piece + src->addFmt("attrDecoder.xyzw = as_type(float4(attrDataSem{}.xyzw));" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned == 0) + { + if( (attrib->offset&3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL ) + { + // AMD workaround + src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.zw)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.xy)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) + { + // seen in BotW + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.zw));" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.xy));" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned != 0) + { + if 
((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xy = attrDataSem{}.zw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 1 && attrib->isSigned == 0) + { + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xyzw = uint4(attrDataSem{}.zw,0,0);" _CRLF, attributeInputIndex); + } + else + { + src->addFmt("attrDecoder.xyzw = uint4(attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); + } + } + else if( attrib->format == FMT_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + // seen in Pikmin 3 + src->addFmt("attrDecoder.x = as_type(float(attrDataSem{}.x)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.yzw = uint3(0);" _CRLF); + } + else if( attrib->format == FMT_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + src->addFmt("attrDecoder.xyzw = uint4(attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); + } + else 
+ { + cemuLog_log(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}\n", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + cemu_assert_debug(false); + } + } + else if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_U16 ) + { + if( attrib->format == FMT_16_16_16_16_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type(max(float(int(attrDecoder.z))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type(max(float(int(attrDecoder.w))/32767.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in BotW + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.x = as_type(float(int(attrDecoder.x))/65535.0);" _CRLF); + src->add("attrDecoder.y = as_type(float(int(attrDecoder.y))/65535.0);" _CRLF); + src->add("attrDecoder.z = as_type(float(int(attrDecoder.z))/65535.0);" _CRLF); + src->add("attrDecoder.w = as_type(float(int(attrDecoder.w))/65535.0);" _CRLF); + } + else 
if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 2 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(float(int(attrDecoder.x)));" _CRLF); + src->add("attrDecoder.y = as_type(float(int(attrDecoder.y)));" _CRLF); + src->add("attrDecoder.z = as_type(float(int(attrDecoder.z)));" _CRLF); + src->add("attrDecoder.w = as_type(float(int(attrDecoder.w)));" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 1 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_16_16_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, 
attributeInputIndex); + src->add("attrDecoder.xy = as_type(float2(float(attrDecoder.x), float(attrDecoder.y))/65535.0);" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 1 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 2 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type(float2(float(attrDecoder.x), float(attrDecoder.y)));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 2 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.xy = 
as_type(uint2(float(int(attrDecoder.x)), float(int(attrDecoder.y))));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if (attrib->format == FMT_16 && attrib->nfa == 1 && attrib->isSigned == 0) + { + _readBigEndianAttributeU16x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_16 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in CoD ghosts + _readBigEndianAttributeU16x1(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.x = as_type(float(int(attrDecoder.x))/65535.0);" _CRLF); + } + else + { + cemuLog_logDebug(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + } + } + else + { + cemu_assert_debug(false); + } +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp new file mode 100644 index 000000000..fade47759 --- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -0,0 +1,426 @@ +#pragma once + +namespace LatteDecompiler +{ + static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext, LatteDecompilerOutputUniformOffsets& uniformOffsets) + { + LatteDecompilerShaderResourceMapping& resourceMapping = decompilerContext->output->resourceMappingVK; + + sint32 uniformCurrentOffset = 0; + auto shader = decompilerContext->shader; + auto shaderType = decompilerContext->shader->shaderType; + auto shaderSrc = decompilerContext->shaderSource; + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) + { + // uniform registers or buffers are accessed statically with predictable offsets + // this allows us to remap the used entries into a more compact array + if (shaderType == LatteConst::ShaderType::Vertex) + shaderSrc->addFmt("uniform ivec4 uf_remappedVS[{}];" _CRLF, 
(sint32)shader->list_remappedUniformEntries.size()); + else if (shaderType == LatteConst::ShaderType::Pixel) + shaderSrc->addFmt("uniform ivec4 uf_remappedPS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + else if (shaderType == LatteConst::ShaderType::Geometry) + shaderSrc->addFmt("uniform ivec4 uf_remappedGS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + else + debugBreakpoint(); + uniformOffsets.offset_remapped = uniformCurrentOffset; + uniformCurrentOffset += 16 * shader->list_remappedUniformEntries.size(); + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) + { + uint32 cfileSize = decompilerContext->analyzer.uniformRegisterAccessTracker.DetermineSize(decompilerContext->shaderBaseHash, 256); + // full or partial uniform register file has to be present + if (shaderType == LatteConst::ShaderType::Vertex) + shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterVS[{}];" _CRLF, cfileSize); + else if (shaderType == LatteConst::ShaderType::Pixel) + shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterPS[{}];" _CRLF, cfileSize); + else if (shaderType == LatteConst::ShaderType::Geometry) + shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterGS[{}];" _CRLF, cfileSize); + uniformOffsets.offset_uniformRegister = uniformCurrentOffset; + uniformOffsets.count_uniformRegister = cfileSize; + uniformCurrentOffset += 16 * cfileSize; + } + // special uniforms + bool hasAnyViewportScaleDisabled = + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && hasAnyViewportScaleDisabled) + { + // aka GX2 special state 0 + uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; + shaderSrc->add("uniform vec2 uf_windowSpaceToClipSpaceTransform;" 
_CRLF); + uniformOffsets.offset_windowSpaceToClipSpaceTransform = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + bool alphaTestEnable = decompilerContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_TEST_ENABLE(); + if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel && alphaTestEnable) + { + uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; + shaderSrc->add("uniform float uf_alphaTestRef;" _CRLF); + uniformOffsets.offset_alphaTestRef = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + if (decompilerContext->analyzer.outputPointSize && decompilerContext->analyzer.writesPointSize == false) + { + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || + decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; + shaderSrc->add("uniform float uf_pointSize;" _CRLF); + uniformOffsets.offset_pointSize = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + } + // define uf_fragCoordScale which holds the xy scale for render target resolution vs effective resolution + if (shader->shaderType == LatteConst::ShaderType::Pixel) + { + uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; + shaderSrc->add("uniform vec2 uf_fragCoordScale;" _CRLF); + uniformOffsets.offset_fragCoordScale = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + // provide scale factor for every texture that is accessed via texel coordinates (texelFetch) + for (sint32 t = 0; t < LATTE_NUM_MAX_TEX_UNITS; t++) + { + if (decompilerContext->analyzer.texUnitUsesTexelCoordinates.test(t) == false) + continue; + uniformCurrentOffset = (uniformCurrentOffset + 7) & ~7; + shaderSrc->addFmt("uniform vec2 uf_tex{}Scale;" _CRLF, t); + uniformOffsets.offset_texScale[t] = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + // define uf_verticesPerInstance + uf_streamoutBufferBaseX + if 
(decompilerContext->analyzer.useSSBOForStreamout && + (shader->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) || + (shader->shaderType == LatteConst::ShaderType::Geometry) ) + { + shaderSrc->add("uniform int uf_verticesPerInstance;" _CRLF); + uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; + uniformCurrentOffset += 4; + for (uint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (decompilerContext->output->streamoutBufferWriteMask[i]) + { + shaderSrc->addFmt("uniform int uf_streamoutBufferBase{};" _CRLF, i); + uniformOffsets.offset_streamoutBufferBase[i] = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + } + } + + uniformOffsets.offset_endOfBlock = uniformCurrentOffset; + } + + static void _emitUniformBuffers(LatteDecompilerShaderContext* decompilerContext) + { + auto shaderSrc = decompilerContext->shaderSource; + // uniform buffer definition + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) + { + if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) + continue; + + cemu_assert_debug(decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] >= 0); + cemu_assert_debug(decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] >= 0); + + shaderSrc->addFmt("UNIFORM_BUFFER_LAYOUT({}, {}, {}) ", (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i], (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i]); + + shaderSrc->addFmt("uniform ubuff{}" _CRLF, i); + shaderSrc->add("{" _CRLF); + shaderSrc->addFmt("float4 ubuff{}[{}];" _CRLF, i, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); + shaderSrc->add("};" _CRLF 
_CRLF); + shaderSrc->add(_CRLF); + } + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) + { + // already generated in _emitUniformVariables + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) + { + // already generated in _emitUniformVariables + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_NONE) + { + // no uniforms used + } + else + { + cemu_assert_debug(false); + } + } + + static void _emitTextureDefinitions(LatteDecompilerShaderContext* shaderContext) + { + auto src = shaderContext->shaderSource; + // texture sampler definition + for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) + { + if (!shaderContext->output->textureUnitMask[i]) + continue; + + if (shaderContext->shader->textureIsIntegerFormat[i]) + { + // integer samplers + if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("texture1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("texture2d"); + else + cemu_assert_unimplemented(); + } + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("texture2d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("texture1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_ARRAY) + src->add("texture2d_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_CUBEMAP) + src->add("texturecube_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_3D) + src->add("texture3d"); + else + { + cemu_assert_unimplemented(); + } + + src->addFmt(" tex{} [[texture({})]], ", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); + src->addFmt("sampler samplr{} 
[[sampler({})]], ", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); + } + } + + static void _emitAttributes(LatteDecompilerShaderContext* decompilerContext) + { + auto shaderSrc = decompilerContext->shaderSource; + if (decompilerContext->shader->shaderType == LatteConst::ShaderType::Vertex) + { + // attribute inputs + for (uint32 i = 0; i < LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS; i++) + { + if (decompilerContext->analyzer.inputAttributSemanticMask[i]) + { + cemu_assert_debug(decompilerContext->output->resourceMappingGL.attributeMapping[i] >= 0); + cemu_assert_debug(decompilerContext->output->resourceMappingVK.attributeMapping[i] >= 0); + cemu_assert_debug(decompilerContext->output->resourceMappingGL.attributeMapping[i] == decompilerContext->output->resourceMappingVK.attributeMapping[i]); + + shaderSrc->addFmt("ATTR_LAYOUT({}, {}) in uvec4 attrDataSem{};" _CRLF, (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i], i); + } + } + } + } + + static void _emitVSExports(LatteDecompilerShaderContext* shaderContext) + { + auto* src = shaderContext->shaderSource; + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + auto parameterMask = shaderContext->shader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask&(1 << i)) == 0) + continue; + uint32 vsSemanticId = _getVertexShaderOutParamSemanticId(shaderContext->contextRegisters, i); + if (vsSemanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + // get import based on semanticId + sint32 psInputIndex = -1; + for (sint32 f = 0; f < psInputTable->count; f++) + { + if (psInputTable->import[f].semanticId == vsSemanticId) + { + psInputIndex = f; + break; + } + } + if (psInputIndex == -1) + continue; // no ps input + + src->addFmt("layout(location = {}) ", psInputIndex); + if (psInputTable->import[psInputIndex].isFlat) + src->add("flat "); + if 
(psInputTable->import[psInputIndex].isNoPerspective) + src->add("noperspective "); + src->add("out"); + src->addFmt(" vec4 passParameterSem{};" _CRLF, psInputTable->import[psInputIndex].semanticId); + } + } + + static void _emitPSImports(LatteDecompilerShaderContext* shaderContext) + { + auto* src = shaderContext->shaderSource; + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + for (sint32 i = 0; i < psInputTable->count; i++) + { + if (psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + src->addFmt("layout(location = {}) ", i); + if (psInputTable->import[i].isFlat) + src->add("flat "); + if (psInputTable->import[i].isNoPerspective) + src->add("noperspective "); + src->add("in"); + src->addFmt(" vec4 passParameterSem{};" _CRLF, psInputTable->import[i].semanticId); + } + } + + static void _emitMisc(LatteDecompilerShaderContext* decompilerContext) + { + auto src = decompilerContext->shaderSource; + // per-vertex output (VS or GS) + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || + (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) + { + src->add("out gl_PerVertex" _CRLF); + src->add("{" _CRLF); + src->add(" vec4 gl_Position;" _CRLF); + if (decompilerContext->analyzer.outputPointSize) + src->add(" float gl_PointSize;" _CRLF); + src->add("};" _CRLF); + } + // varyings (variables passed from vertex to pixel shader, only if geometry stage is disabled + if (decompilerContext->options->usesGeometryShader == false) + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + { + _emitVSExports(decompilerContext); + } + else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + { + _emitPSImports(decompilerContext); + } + } + else + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + // parameters 
shared between vertex shader and geometry shader + src->add("V2G_LAYOUT "); + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + src->add("out Vertex" _CRLF); + else if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + src->add("in Vertex" _CRLF); + src->add("{" _CRLF); + uint32 ringParameterCountVS2GS = 0; + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCount; + } + else + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCountFromPrevStage; + } + for (uint32 f = 0; f < ringParameterCountVS2GS; f++) + src->addFmt(" ivec4 passV2GParameter{};" _CRLF, f); + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + src->add("}v2g;" _CRLF); + else if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + src->add("}v2g[];" _CRLF); + } + if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + // parameters shared between geometry and pixel shader + uint32 ringItemSize = decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF; + if ((ringItemSize & 0xF) != 0) + debugBreakpoint(); + if (((decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF) & 0xF) != 0) + debugBreakpoint(); + + for (sint32 p = 0; p < decompilerContext->parsedGSCopyShader->numParam; p++) + { + if (decompilerContext->parsedGSCopyShader->paramMapping[p].exportType != 2) + continue; + src->addFmt("layout(location = {}) out vec4 passG2PParameter{};" _CRLF, decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam & 0x7F, (sint32)decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam); + } + } + else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + { + // pixel shader with geometry shader + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + for (sint32 i = 0; i < psInputTable->count; i++) + { + if 
(psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + uint32 location = psInputTable->import[i].semanticId & 0x7F; // todo - the range above 128 has special meaning? + + src->addFmt("layout(location = {}) ", location); + if (psInputTable->import[i].isFlat) + src->add("flat "); + if (psInputTable->import[i].isNoPerspective) + src->add("noperspective "); + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + src->add("out"); + else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + src->add("in"); + else + debugBreakpoint(); + + src->addFmt(" vec4 passG2PParameter{};" _CRLF, (sint32)location); + } + } + } + // output defines + if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + { + // generate pixel outputs for pixel shader + for (uint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + if ((decompilerContext->shader->pixelColorOutputMask&(1 << i)) != 0) + { + src->addFmt("layout(location = {}) out vec4 passPixelColor{};" _CRLF, i, i); + } + } + } + // streamout buffer (transform feedback) + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) && decompilerContext->analyzer.hasStreamoutEnable) + { + if (decompilerContext->options->useTFViaSSBO) + { + if (decompilerContext->analyzer.useSSBOForStreamout && decompilerContext->analyzer.hasStreamoutWrite) + { + src->addFmt("layout(set = {}, binding = {}) buffer StreamoutBuffer" _CRLF, decompilerContext->output->resourceMappingVK.setIndex, decompilerContext->output->resourceMappingVK.getTFStorageBufferBindingPoint()); + src->add("{" _CRLF); + src->add("int sb_buffer[];" _CRLF); + src->add("};" _CRLF); + } + } + else + { + sint32 locationOffset = 0; // glslang wants a location for xfb outputs + for (uint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (!decompilerContext->output->streamoutBufferWriteMask[i]) + continue; + uint32 bufferStride = 
decompilerContext->output->streamoutBufferStride[i]; + src->addFmt("XFB_BLOCK_LAYOUT({}, {}, {}) out XfbBlock{} " _CRLF, i, bufferStride, locationOffset, i); + src->add("{" _CRLF); + src->addFmt("layout(xfb_buffer = {}, xfb_offset = 0) int sb{}[{}];" _CRLF, i, i, decompilerContext->output->streamoutBufferStride[i] / 4); + src->add("};" _CRLF); + locationOffset += (decompilerContext->output->streamoutBufferStride[i] / 4); + } + } + } + } + + static void emitHeader(LatteDecompilerShaderContext* decompilerContext) + { + const bool dump_shaders_enabled = ActiveSettings::DumpShadersEnabled(); + if(dump_shaders_enabled) + decompilerContext->shaderSource->add("// start of shader inputs/outputs, predetermined by Cemu. Do not touch" _CRLF); + // uniform variables + _emitUniformVariables(decompilerContext, decompilerContext->output->uniformOffsetsVK); + // uniform buffers + _emitUniformBuffers(decompilerContext); + // textures + _emitTextureDefinitions(decompilerContext); + // attributes + _emitAttributes(decompilerContext); + // misc stuff + _emitMisc(decompilerContext); + + if (dump_shaders_enabled) + decompilerContext->shaderSource->add("// end of shader inputs/outputs" _CRLF); + } +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h index ed1858bae..4b85d4586 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h @@ -47,7 +47,7 @@ struct LatteDecompilerTEXInstruction sint32 dstGpr; sint8 dstSel[4]; // texture fetch - struct + struct { sint32 textureIndex{}; sint32 samplerIndex{}; @@ -216,7 +216,7 @@ struct LatteDecompilerShaderContext bool genIntReg; // if set, generate R*i register variables bool useArrayGPRs; // if set, an array is used to represent GPRs instead of individual variables }typeTracker; - // analyzer + // analyzer struct { // general @@ -268,9 +268,10 @@ struct 
LatteDecompilerShaderContext void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); void LatteDecompiler_analyzeDataTypes(LatteDecompilerShaderContext* shaderContext); void LatteDecompiler_emitGLSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); +void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); void LatteDecompiler_cleanup(LatteDecompilerShaderContext* shaderContext); // helper functions -sint32 LatteDecompiler_getColorOutputIndexFromExportIndex(LatteDecompilerShaderContext* shaderContext, sint32 exportIndex); \ No newline at end of file +sint32 LatteDecompiler_getColorOutputIndexFromExportIndex(LatteDecompilerShaderContext* shaderContext, sint32 exportIndex); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index c49035dd6..b1710e8a0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -3,6 +3,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "HW/Latte/Core/LatteShader.h" #include "gui/guiWrapper.h" MetalRenderer::MetalRenderer() @@ -259,6 +260,8 @@ void MetalRenderer::streamout_rendererFinishDrawcall() void MetalRenderer::draw_beginSequence() { cemuLog_logDebug(LogType::Force, "not implemented"); + + LatteSHRC_UpdateActiveShaders(); } void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) From aac9b123a508cfa54ba9b1755b80a91e169626cf Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 26 Jul 2024 15:43:15 +0200 Subject: [PATCH 007/368] improve shadergen & create shaders --- src/Cafe/CMakeLists.txt | 2 + src/Cafe/HW/Latte/Core/LatteShaderCache.cpp | 7 + 
.../LatteDecompiler.cpp | 9 +- .../LatteDecompilerEmitMSL.cpp | 20 +- .../LatteDecompilerEmitMSLHeader.hpp | 366 ++++++++---------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 +- .../Renderer/Metal/RendererShaderMtl.cpp | 38 ++ .../Latte/Renderer/Metal/RendererShaderMtl.h | 44 +++ 8 files changed, 268 insertions(+), 224 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index fa3c6ff94..2a3dda471 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -545,6 +545,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/LatteTextureMtl.h HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp HW/Latte/Renderer/Metal/LatteTextureViewMtl.h + HW/Latte/Renderer/Metal/RendererShaderMtl.cpp + HW/Latte/Renderer/Metal/RendererShaderMtl.h ) #target_link_libraries(CemuCafe PRIVATE diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index 884186509..98d970f6d 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -11,6 +11,7 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/OpenGL/RendererShaderGL.h" #include "Cafe/HW/Latte/Renderer/Vulkan/RendererShaderVk.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanPipelineStableCache.h" #include @@ -161,6 +162,8 @@ void LatteShaderCache_finish() RendererShaderVk::ShaderCacheLoading_end(); else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_end(); + else if (g_renderer->GetType() == RendererAPI::Metal) + RendererShaderMtl::ShaderCacheLoading_end(); } uint32 LatteShaderCache_getShaderCacheExtraVersion(uint64 titleId) @@ -243,6 +246,8 @@ void LatteShaderCache_Load() RendererShaderVk::ShaderCacheLoading_begin(cacheTitleId); else if 
(g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_begin(cacheTitleId); + else if (g_renderer->GetType() == RendererAPI::Metal) + RendererShaderMtl::ShaderCacheLoading_begin(cacheTitleId); // get cache file name const auto pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); const auto pathGenericPre1_25_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}.bin", cacheTitleId); // before 1.25.0 @@ -774,6 +779,8 @@ void LatteShaderCache_Close() RendererShaderVk::ShaderCacheLoading_Close(); else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_Close(); + else if (g_renderer->GetType() == RendererAPI::Metal) + RendererShaderMtl::ShaderCacheLoading_Close(); // if Vulkan then also close pipeline cache if (g_renderer->GetType() == RendererAPI::Vulkan) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp index 5f0d7fb25..c2051090b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp @@ -12,8 +12,6 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "util/helpers/helpers.h" -// TODO: remove this include -#include "util/helpers/StringBuf.h" // parse instruction and if valid append it to instructionList bool LatteDecompiler_ParseCFInstruction(LatteDecompilerShaderContext* shaderContext, uint32 cfIndex, uint32 cfWord0, uint32 cfWord1, bool* endOfProgram, std::vector& instructionList) @@ -1070,14 +1068,9 @@ void _LatteDecompiler_Process(LatteDecompilerShaderContext* shaderContext, uint8 if (shaderContext->shader->hasError == false) { if (g_renderer->GetType() == RendererAPI::Metal) - { LatteDecompiler_emitMSLShader(shaderContext, shaderContext->shader); - // HACK - std::cout << 
shaderContext->shaderSource->c_str() << std::endl; - } else - { + else LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); - } } LatteDecompiler_cleanup(shaderContext); // fast access diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 943f18401..2ffcfa0fe 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3878,9 +3878,19 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompiler::emitHeader(shaderContext); // helper functions LatteDecompiler_emitHelperFunctions(shaderContext, src); + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + src->add("VertexOut"); + break; + case LatteConst::ShaderType::Pixel: + src->add("FragmentOut"); + break; + } // start of main - src->add("void main()" _CRLF); - src->add("{" _CRLF); + src->add(" main0("); + LatteDecompiler::emitInputs(shaderContext); + src->add(") {" _CRLF); // variable definition if (shaderContext->typeTracker.useArrayGPRs == false) { @@ -3987,7 +3997,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, cemu_assert_debug((shaderContext->output->streamoutBufferStride[i]&3) == 0); if (shader->shaderType == LatteConst::ShaderType::Vertex) // vertex shader - src->addFmt("int sbBase{} = uf_streamoutBufferBase{}/4 + (gl_VertexID + uf_verticesPerInstance * gl_InstanceID)*{};" _CRLF, i, i, shaderContext->output->streamoutBufferStride[i] / 4); + src->addFmt("int sbBase{} = uf_streamoutBufferBase{}/4 + (vid + uf_verticesPerInstance * iid)*{};" _CRLF, i, i, shaderContext->output->streamoutBufferStride[i] / 4); else // geometry shader { uint32 gsOutPrimType = shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; @@ -4007,9 +4017,9 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* 
shaderContext, if( (shaderContext->analyzer.gprUseMask[0/8]&(1<<(0%8))) != 0 ) { if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = int4(gl_VertexID, 0, 0, gl_InstanceID);" _CRLF, _getRegisterVarName(shaderContext, 0)); + src->addFmt("{} = int4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = as_type(float4(gl_VertexID, 0, 0, gl_InstanceID));" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: is this correct? + src->addFmt("{} = as_type(float4(vid, 0, 0, iid));" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: is this correct? else cemu_assert_unimplemented(); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index fade47759..39798dc5f 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -2,24 +2,28 @@ namespace LatteDecompiler { - static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext, LatteDecompilerOutputUniformOffsets& uniformOffsets) + static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext) { - LatteDecompilerShaderResourceMapping& resourceMapping = decompilerContext->output->resourceMappingVK; + auto src = decompilerContext->shaderSource; + + LatteDecompilerShaderResourceMapping& resourceMapping = decompilerContext->output->resourceMappingGL; + auto& uniformOffsets = decompilerContext->output->uniformOffsetsVK; + + src->add("struct DefualtUniforms {" _CRLF); sint32 uniformCurrentOffset = 0; auto shader = decompilerContext->shader; auto shaderType = decompilerContext->shader->shaderType; - auto shaderSrc = decompilerContext->shaderSource; if (decompilerContext->shader->uniformMode == 
LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) { // uniform registers or buffers are accessed statically with predictable offsets // this allows us to remap the used entries into a more compact array if (shaderType == LatteConst::ShaderType::Vertex) - shaderSrc->addFmt("uniform ivec4 uf_remappedVS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + src->addFmt("ivec4 uf_remappedVS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); else if (shaderType == LatteConst::ShaderType::Pixel) - shaderSrc->addFmt("uniform ivec4 uf_remappedPS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + src->addFmt("ivec4 uf_remappedPS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); else if (shaderType == LatteConst::ShaderType::Geometry) - shaderSrc->addFmt("uniform ivec4 uf_remappedGS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + src->addFmt("ivec4 uf_remappedGS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); else debugBreakpoint(); uniformOffsets.offset_remapped = uniformCurrentOffset; @@ -30,11 +34,11 @@ namespace LatteDecompiler uint32 cfileSize = decompilerContext->analyzer.uniformRegisterAccessTracker.DetermineSize(decompilerContext->shaderBaseHash, 256); // full or partial uniform register file has to be present if (shaderType == LatteConst::ShaderType::Vertex) - shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterVS[{}];" _CRLF, cfileSize); + src->addFmt("ivec4 uf_uniformRegisterVS[{}];" _CRLF, cfileSize); else if (shaderType == LatteConst::ShaderType::Pixel) - shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterPS[{}];" _CRLF, cfileSize); + src->addFmt("ivec4 uf_uniformRegisterPS[{}];" _CRLF, cfileSize); else if (shaderType == LatteConst::ShaderType::Geometry) - shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterGS[{}];" _CRLF, cfileSize); + src->addFmt("ivec4 uf_uniformRegisterGS[{}];" _CRLF, cfileSize); uniformOffsets.offset_uniformRegister = uniformCurrentOffset; 
uniformOffsets.count_uniformRegister = cfileSize; uniformCurrentOffset += 16 * cfileSize; @@ -49,7 +53,7 @@ namespace LatteDecompiler { // aka GX2 special state 0 uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; - shaderSrc->add("uniform vec2 uf_windowSpaceToClipSpaceTransform;" _CRLF); + src->add("float2 uf_windowSpaceToClipSpaceTransform;" _CRLF); uniformOffsets.offset_windowSpaceToClipSpaceTransform = uniformCurrentOffset; uniformCurrentOffset += 8; } @@ -57,7 +61,7 @@ namespace LatteDecompiler if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel && alphaTestEnable) { uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; - shaderSrc->add("uniform float uf_alphaTestRef;" _CRLF); + src->add("float uf_alphaTestRef;" _CRLF); uniformOffsets.offset_alphaTestRef = uniformCurrentOffset; uniformCurrentOffset += 4; } @@ -67,7 +71,7 @@ namespace LatteDecompiler decompilerContext->shaderType == LatteConst::ShaderType::Geometry) { uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; - shaderSrc->add("uniform float uf_pointSize;" _CRLF); + src->add("float uf_pointSize;" _CRLF); uniformOffsets.offset_pointSize = uniformCurrentOffset; uniformCurrentOffset += 4; } @@ -76,7 +80,7 @@ namespace LatteDecompiler if (shader->shaderType == LatteConst::ShaderType::Pixel) { uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; - shaderSrc->add("uniform vec2 uf_fragCoordScale;" _CRLF); + src->add("vec2 uf_fragCoordScale;" _CRLF); uniformOffsets.offset_fragCoordScale = uniformCurrentOffset; uniformCurrentOffset += 8; } @@ -86,7 +90,7 @@ namespace LatteDecompiler if (decompilerContext->analyzer.texUnitUsesTexelCoordinates.test(t) == false) continue; uniformCurrentOffset = (uniformCurrentOffset + 7) & ~7; - shaderSrc->addFmt("uniform vec2 uf_tex{}Scale;" _CRLF, t); + src->addFmt("vec2 uf_tex{}Scale;" _CRLF, t); uniformOffsets.offset_texScale[t] = uniformCurrentOffset; uniformCurrentOffset += 8; } @@ -95,20 +99,22 @@ namespace LatteDecompiler (shader->shaderType == 
LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) || (shader->shaderType == LatteConst::ShaderType::Geometry) ) { - shaderSrc->add("uniform int uf_verticesPerInstance;" _CRLF); + src->add("int uf_verticesPerInstance;" _CRLF); uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; uniformCurrentOffset += 4; for (uint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) { if (decompilerContext->output->streamoutBufferWriteMask[i]) { - shaderSrc->addFmt("uniform int uf_streamoutBufferBase{};" _CRLF, i); + src->addFmt("int uf_streamoutBufferBase{};" _CRLF, i); uniformOffsets.offset_streamoutBufferBase[i] = uniformCurrentOffset; uniformCurrentOffset += 4; } } } + src->add("}" _CRLF _CRLF); + uniformOffsets.offset_endOfBlock = uniformCurrentOffset; } @@ -126,13 +132,11 @@ namespace LatteDecompiler cemu_assert_debug(decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] >= 0); cemu_assert_debug(decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] >= 0); - shaderSrc->addFmt("UNIFORM_BUFFER_LAYOUT({}, {}, {}) ", (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i], (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i]); + //shaderSrc->addFmt("UNIFORM_BUFFER_LAYOUT({}, {}, {}) ", (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i], (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i]); - shaderSrc->addFmt("uniform ubuff{}" _CRLF, i); - shaderSrc->add("{" _CRLF); - shaderSrc->addFmt("float4 ubuff{}[{}];" _CRLF, i, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); + shaderSrc->addFmt("struct UBuff{} {" _CRLF, i); + shaderSrc->addFmt("float4 d{}[{}];" 
_CRLF, i, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); shaderSrc->add("};" _CRLF _CRLF); - shaderSrc->add(_CRLF); } } else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) @@ -153,50 +157,13 @@ namespace LatteDecompiler } } - static void _emitTextureDefinitions(LatteDecompilerShaderContext* shaderContext) - { - auto src = shaderContext->shaderSource; - // texture sampler definition - for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) - { - if (!shaderContext->output->textureUnitMask[i]) - continue; - - if (shaderContext->shader->textureIsIntegerFormat[i]) - { - // integer samplers - if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) - src->add("texture1d"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) - src->add("texture2d"); - else - cemu_assert_unimplemented(); - } - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) - src->add("texture2d"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) - src->add("texture1d"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_ARRAY) - src->add("texture2d_array"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_CUBEMAP) - src->add("texturecube_array"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_3D) - src->add("texture3d"); - else - { - cemu_assert_unimplemented(); - } - - src->addFmt(" tex{} [[texture({})]], ", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); - src->addFmt("sampler samplr{} [[sampler({})]], ", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); - } - } - static void 
_emitAttributes(LatteDecompilerShaderContext* decompilerContext) { - auto shaderSrc = decompilerContext->shaderSource; + auto src = decompilerContext->shaderSource; + if (decompilerContext->shader->shaderType == LatteConst::ShaderType::Vertex) { + src->add("struct VertexIn {" _CRLF); // attribute inputs for (uint32 i = 0; i < LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS; i++) { @@ -206,15 +173,23 @@ namespace LatteDecompiler cemu_assert_debug(decompilerContext->output->resourceMappingVK.attributeMapping[i] >= 0); cemu_assert_debug(decompilerContext->output->resourceMappingGL.attributeMapping[i] == decompilerContext->output->resourceMappingVK.attributeMapping[i]); - shaderSrc->addFmt("ATTR_LAYOUT({}, {}) in uvec4 attrDataSem{};" _CRLF, (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i], i); + src->addFmt("ATTR_LAYOUT({}, {}) in uvec4 attrDataSem{};" _CRLF, (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i], i); } } + src->add("};" _CRLF _CRLF); } } - static void _emitVSExports(LatteDecompilerShaderContext* shaderContext) + static void _emitVSOutputs(LatteDecompilerShaderContext* shaderContext) { auto* src = shaderContext->shaderSource; + + src->add("struct VertexOut {" _CRLF); + + src->add("float4 position [[position]];" _CRLF); + if (shaderContext->analyzer.outputPointSize) + src->add("float pointSize[[point_size]];" _CRLF); + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); auto parameterMask = shaderContext->shader->outputParameterMask; for (uint32 i = 0; i < 32; i++) @@ -237,190 +212,165 @@ namespace LatteDecompiler if (psInputIndex == -1) continue; // no ps input - src->addFmt("layout(location = {}) ", psInputIndex); + src->addFmt("float4 passParameterSem{}", psInputTable->import[psInputIndex].semanticId); + src->addFmt(" [[user(locn{})]]", psInputIndex); if 
(psInputTable->import[psInputIndex].isFlat) - src->add("flat "); + src->add(" [[flat]]"); if (psInputTable->import[psInputIndex].isNoPerspective) - src->add("noperspective "); - src->add("out"); - src->addFmt(" vec4 passParameterSem{};" _CRLF, psInputTable->import[psInputIndex].semanticId); + src->add(" [[center_no_perspective]]"); + src->addFmt(";" _CRLF); } + + src->add("};" _CRLF _CRLF); } - static void _emitPSImports(LatteDecompilerShaderContext* shaderContext) + static void _emitPSInputs(LatteDecompilerShaderContext* shaderContext) { auto* src = shaderContext->shaderSource; + + src->add("struct FragmentIn {" _CRLF); + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); for (sint32 i = 0; i < psInputTable->count; i++) { if (psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) continue; - src->addFmt("layout(location = {}) ", i); + src->addFmt("float4 passParameterSem{}", psInputTable->import[i].semanticId); + src->addFmt(" [[user(locn{})]]", i); if (psInputTable->import[i].isFlat) - src->add("flat "); + src->add(" [[flat]]"); if (psInputTable->import[i].isNoPerspective) - src->add("noperspective "); - src->add("in"); - src->addFmt(" vec4 passParameterSem{};" _CRLF, psInputTable->import[i].semanticId); + src->add(" [[center_no_perspective]]"); + src->add(";" _CRLF); } + + src->add("};" _CRLF _CRLF); } - static void _emitMisc(LatteDecompilerShaderContext* decompilerContext) + static void _emitInputsAndOutputs(LatteDecompilerShaderContext* decompilerContext) { auto src = decompilerContext->shaderSource; - // per-vertex output (VS or GS) - if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || - (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) - { - src->add("out gl_PerVertex" _CRLF); - src->add("{" _CRLF); - src->add(" vec4 gl_Position;" _CRLF); - if (decompilerContext->analyzer.outputPointSize) - src->add(" float gl_PointSize;" _CRLF); 
- src->add("};" _CRLF); - } - // varyings (variables passed from vertex to pixel shader, only if geometry stage is disabled - if (decompilerContext->options->usesGeometryShader == false) + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) { - if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) - { - _emitVSExports(decompilerContext); - } - else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) - { - _emitPSImports(decompilerContext); - } + _emitAttributes(decompilerContext); + _emitVSOutputs(decompilerContext); + + // TODO: transform feedback } - else + else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) { - if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) - { - // parameters shared between vertex shader and geometry shader - src->add("V2G_LAYOUT "); - - if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) - src->add("out Vertex" _CRLF); - else if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) - src->add("in Vertex" _CRLF); - src->add("{" _CRLF); - uint32 ringParameterCountVS2GS = 0; - if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) - { - ringParameterCountVS2GS = decompilerContext->shader->ringParameterCount; - } - else - { - ringParameterCountVS2GS = decompilerContext->shader->ringParameterCountFromPrevStage; - } - for (uint32 f = 0; f < ringParameterCountVS2GS; f++) - src->addFmt(" ivec4 passV2GParameter{};" _CRLF, f); - if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) - src->add("}v2g;" _CRLF); - else if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) - src->add("}v2g[];" _CRLF); - } - if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) - { - // parameters shared between geometry and pixel shader - uint32 ringItemSize = decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 
0x7FFF; - if ((ringItemSize & 0xF) != 0) - debugBreakpoint(); - if (((decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF) & 0xF) != 0) - debugBreakpoint(); - - for (sint32 p = 0; p < decompilerContext->parsedGSCopyShader->numParam; p++) - { - if (decompilerContext->parsedGSCopyShader->paramMapping[p].exportType != 2) - continue; - src->addFmt("layout(location = {}) out vec4 passG2PParameter{};" _CRLF, decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam & 0x7F, (sint32)decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam); - } - } - else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) - { - // pixel shader with geometry shader - LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); - for (sint32 i = 0; i < psInputTable->count; i++) - { - if (psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) - continue; - uint32 location = psInputTable->import[i].semanticId & 0x7F; // todo - the range above 128 has special meaning? 
- - src->addFmt("layout(location = {}) ", location); - if (psInputTable->import[i].isFlat) - src->add("flat "); - if (psInputTable->import[i].isNoPerspective) - src->add("noperspective "); - if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) - src->add("out"); - else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) - src->add("in"); - else - debugBreakpoint(); - - src->addFmt(" vec4 passG2PParameter{};" _CRLF, (sint32)location); - } - } + _emitPSInputs(decompilerContext); + + src->add("struct FragmentOut {" _CRLF); + + // generate pixel outputs for pixel shader + for (uint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + if ((decompilerContext->shader->pixelColorOutputMask&(1 << i)) != 0) + { + src->addFmt("float4 passPixelColor{} [[color({})]];" _CRLF, i, i); + } + } + + src->add("};" _CRLF _CRLF); } - // output defines - if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + } + + static void emitHeader(LatteDecompilerShaderContext* decompilerContext) + { + const bool dump_shaders_enabled = ActiveSettings::DumpShadersEnabled(); + if(dump_shaders_enabled) + decompilerContext->shaderSource->add("// start of shader inputs/outputs, predetermined by Cemu. 
Do not touch" _CRLF); + // uniform variables + _emitUniformVariables(decompilerContext); + // uniform buffers + _emitUniformBuffers(decompilerContext); + // inputs and outputs + _emitInputsAndOutputs(decompilerContext); + + if (dump_shaders_enabled) + decompilerContext->shaderSource->add("// end of shader inputs/outputs" _CRLF); + } + + static void _emitUniformBufferDefinitions(LatteDecompilerShaderContext* decompilerContext) + { + auto src = decompilerContext->shaderSource; + // uniform buffer definition + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) { - // generate pixel outputs for pixel shader - for (uint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) { - if ((decompilerContext->shader->pixelColorOutputMask&(1 << i)) != 0) - { - src->addFmt("layout(location = {}) out vec4 passPixelColor{};" _CRLF, i, i); - } + if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) + continue; + + cemu_assert_debug(decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] >= 0); + cemu_assert_debug(decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] >= 0); + + src->addFmt("constant UBuff{}& ubuff{} [[buffer({})]]" _CRLF, i, i, (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i]); } } - // streamout buffer (transform feedback) - if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) && decompilerContext->analyzer.hasStreamoutEnable) + } + + static void _emitTextureDefinitions(LatteDecompilerShaderContext* shaderContext) + { + auto src = shaderContext->shaderSource; + // texture sampler definition + for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) { - if (decompilerContext->options->useTFViaSSBO) + if (!shaderContext->output->textureUnitMask[i]) + continue; + + src->add(", "); + + if 
(shaderContext->shader->textureIsIntegerFormat[i]) { - if (decompilerContext->analyzer.useSSBOForStreamout && decompilerContext->analyzer.hasStreamoutWrite) - { - src->addFmt("layout(set = {}, binding = {}) buffer StreamoutBuffer" _CRLF, decompilerContext->output->resourceMappingVK.setIndex, decompilerContext->output->resourceMappingVK.getTFStorageBufferBindingPoint()); - src->add("{" _CRLF); - src->add("int sb_buffer[];" _CRLF); - src->add("};" _CRLF); - } + // integer samplers + if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("texture1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("texture2d"); + else + cemu_assert_unimplemented(); } + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("texture2d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("texture1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_ARRAY) + src->add("texture2d_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_CUBEMAP) + src->add("texturecube_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_3D) + src->add("texture3d"); else { - sint32 locationOffset = 0; // glslang wants a location for xfb outputs - for (uint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) - { - if (!decompilerContext->output->streamoutBufferWriteMask[i]) - continue; - uint32 bufferStride = decompilerContext->output->streamoutBufferStride[i]; - src->addFmt("XFB_BLOCK_LAYOUT({}, {}, {}) out XfbBlock{} " _CRLF, i, bufferStride, locationOffset, i); - src->add("{" _CRLF); - src->addFmt("layout(xfb_buffer = {}, xfb_offset = 0) int sb{}[{}];" _CRLF, i, i, decompilerContext->output->streamoutBufferStride[i] / 4); - src->add("};" _CRLF); - 
locationOffset += (decompilerContext->output->streamoutBufferStride[i] / 4); - } + cemu_assert_unimplemented(); } + + src->addFmt(" tex{} [[texture({})]]", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); + src->addFmt(", sampler samplr{} [[sampler({})]]", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); } } - static void emitHeader(LatteDecompilerShaderContext* decompilerContext) + static void emitInputs(LatteDecompilerShaderContext* decompilerContext) { - const bool dump_shaders_enabled = ActiveSettings::DumpShadersEnabled(); - if(dump_shaders_enabled) - decompilerContext->shaderSource->add("// start of shader inputs/outputs, predetermined by Cemu. Do not touch" _CRLF); - // uniform variables - _emitUniformVariables(decompilerContext, decompilerContext->output->uniformOffsetsVK); + auto src = decompilerContext->shaderSource; + + switch (decompilerContext->shaderType) + { + case LatteConst::ShaderType::Vertex: + src->add("VertexIn"); + break; + case LatteConst::ShaderType::Pixel: + src->add("FragmentIn"); + break; + } + + src->add(" in [[stage_in]], DefaultVariables defaultVars [[buffer(29)]]"); // uniform buffers - _emitUniformBuffers(decompilerContext); + _emitUniformBufferDefinitions(decompilerContext); // textures _emitTextureDefinitions(decompilerContext); - // attributes - _emitAttributes(decompilerContext); - // misc stuff - _emitMisc(decompilerContext); - - if (dump_shaders_enabled) - decompilerContext->shaderSource->add("// end of shader inputs/outputs" _CRLF); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b1710e8a0..fa3b03f20 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" 
+#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "HW/Latte/Core/LatteShader.h" @@ -235,10 +236,9 @@ void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, cemuLog_logDebug(LogType::Force, "not implemented"); } -RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool compileAsync, bool isGfxPackSource) +RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) { - cemuLog_logDebug(LogType::Force, "not implemented"); - + //return new RendererShaderMtl(this, type, baseHash, auxHash, isGameShader, isGfxPackShader, source); return nullptr; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp new file mode 100644 index 000000000..50033d5c9 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -0,0 +1,38 @@ +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cemu/Logging/CemuLogging.h" + +RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) + : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader) +{ + NS::Error* error = nullptr; + MTL::Library* library = mtlRenderer->GetDevice()->newLibrary(NS::String::string(mslCode.c_str(), NS::ASCIIStringEncoding), nullptr, &error); + if (error) + { + cemuLog_log(LogType::MetalLogging, "Failed to create library (error: {}) -> source:\n{}", error->localizedDescription()->utf8String(), mslCode.c_str()); + error->release(); + return; + } + m_function = library->newFunction(NS::String::string("main0", 
NS::ASCIIStringEncoding)); +} + +RendererShaderMtl::~RendererShaderMtl() +{ + if (m_function) + m_function->release(); +} + +void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) +{ + cemuLog_log(LogType::MetalLogging, "RendererShaderMtl::ShaderCacheLoading_begin not implemented!"); +} + +void RendererShaderMtl::ShaderCacheLoading_end() +{ + cemuLog_log(LogType::MetalLogging, "RendererShaderMtl::ShaderCacheLoading_end not implemented!"); +} + +void RendererShaderMtl::ShaderCacheLoading_Close() +{ + cemuLog_log(LogType::MetalLogging, "RendererShaderMtl::ShaderCacheLoading_Close not implemented!"); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h new file mode 100644 index 000000000..030bbff0b --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -0,0 +1,44 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/RendererShader.h" +#include "HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "util/helpers/ConcurrentQueue.h" + +#include + +class RendererShaderMtl : public RendererShader +{ + //enum class COMPILATION_STATE : uint32 + //{ + // NONE, + // QUEUED, + // COMPILING, + // DONE + //}; + +public: + static void ShaderCacheLoading_begin(uint64 cacheTitleId); + static void ShaderCacheLoading_end(); + static void ShaderCacheLoading_Close(); + + RendererShaderMtl(class MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode); + virtual ~RendererShaderMtl(); + + MTL::Function* GetFunction() const + { + return m_function; + } + + // TODO: implement these + sint32 GetUniformLocation(const char* name) override { return 0; } + void SetUniform2fv(sint32 location, void* data, sint32 count) override {} + void SetUniform4iv(sint32 location, void* data, sint32 count) override {} + + // TODO: implement this + void PreponeCompilation(bool isRenderThread) override {} + bool 
IsCompiled() override { return true; } + bool WaitForCompiled() override { return true; } + +private: + MTL::Function* m_function = nullptr; +}; From af3ce80b7c41206c26772cef080415c37d6886fb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 26 Jul 2024 18:56:30 +0200 Subject: [PATCH 008/368] fix: shader errors and shader crashes --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 8 +- src/Cafe/HW/Latte/Core/LatteShaderGL.cpp | 2 +- .../LatteDecompilerEmitMSL.cpp | 309 +++++++----------- .../LatteDecompilerEmitMSLAttrDecoder.cpp | 64 ++-- .../LatteDecompilerEmitMSLHeader.hpp | 48 +-- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 79 +++-- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 20 +- .../Renderer/Metal/RendererShaderMtl.cpp | 3 +- .../Latte/Renderer/Metal/RendererShaderMtl.h | 19 +- 10 files changed, 252 insertions(+), 302 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index b59702cd1..6561e6420 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -77,7 +77,7 @@ inline ska::flat_hash_map& LatteSHRC_GetCacheByT if (shaderType == LatteConst::ShaderType::Vertex) return sVertexShaders; else if (shaderType == LatteConst::ShaderType::Geometry) - return sGeometryShaders; + return sGeometryShaders; cemu_assert_debug(shaderType == LatteConst::ShaderType::Pixel); return sPixelShaders; } @@ -320,7 +320,7 @@ void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compil { shaderType = RendererShader::ShaderType::kGeometry; gpShaderType = GraphicPack2::GP_SHADER_TYPE::GEOMETRY; - } + } else if (shader->shaderType == LatteConst::ShaderType::Pixel) { shaderType = RendererShader::ShaderType::kFragment; @@ -443,7 +443,7 @@ void LatteShader_DumpShader(uint64 baseHash, uint64 auxHash, LatteDecompilerShad { if (!ActiveSettings::DumpShadersEnabled()) return; - + const char* suffix = ""; if (shader->shaderType == 
LatteConst::ShaderType::Vertex) suffix = "vs"; @@ -1011,4 +1011,4 @@ void LatteSHRC_UnloadAll() while(!sPixelShaders.empty()) LatteShader_free(sPixelShaders.begin()->second); cemu_assert_debug(sPixelShaders.empty()); -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp b/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp index b8cb0ce1b..09c484e68 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp @@ -26,7 +26,7 @@ bool gxShader_checkIfSuccessfullyLinked(GLuint glProgram) void LatteShader_prepareSeparableUniforms(LatteDecompilerShader* shader) { - if (g_renderer->GetType() == RendererAPI::Vulkan) + if (g_renderer->GetType() != RendererAPI::OpenGL) return; auto shaderGL = (RendererShaderGL*)shader->shader; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 2ffcfa0fe..9981d5ea4 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -628,11 +628,11 @@ static void _emitUniformAccessCode(LatteDecompilerShaderContext* shaderContext, cemu_assert_debug(remappedUniformEntry); _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); if(shaderContext->shader->shaderType == LatteConst::ShaderType::Vertex ) - src->addFmt("uf_remappedVS[{}]", remappedUniformEntry->mappedIndex); + src->addFmt("supportBuffer.remappedVS[{}]", remappedUniformEntry->mappedIndex); else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Pixel ) - src->addFmt("uf_remappedPS[{}]", remappedUniformEntry->mappedIndex); + src->addFmt("supportBuffer.remappedPS[{}]", remappedUniformEntry->mappedIndex); else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Geometry ) - src->addFmt("uf_remappedGS[{}]", remappedUniformEntry->mappedIndex); + 
src->addFmt("supportBuffer.remappedGS[{}]", remappedUniformEntry->mappedIndex); else debugBreakpoint(); _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); @@ -643,11 +643,11 @@ static void _emitUniformAccessCode(LatteDecompilerShaderContext* shaderContext, // uniform registers are accessed with unpredictable (dynamic) offset _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); if(shaderContext->shader->shaderType == LatteConst::ShaderType::Vertex ) - src->add("uf_uniformRegisterVS["); + src->add("supportBuffer.uniformRegisterVS["); else if (shaderContext->shader->shaderType == LatteConst::ShaderType::Pixel) - src->add("uf_uniformRegisterPS["); + src->add("supportBuffer.uniformRegisterPS["); else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Geometry ) - src->add("uf_uniformRegisterGS["); + src->add("supportBuffer.uniformRegisterGS["); else debugBreakpoint(); _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); @@ -802,7 +802,7 @@ static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, L if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) { // need to convert (not cast) from int bits to float - src->add("intBitsToFloat("); + src->add("as_type("); } else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_FLOAT ) { @@ -872,7 +872,7 @@ static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, L src->add(_FormatFloatAsConstant(*(float*)&constVal)); } else - src->addFmt("intBitsToFloat(0x{:08x})", constVal); + src->addFmt("as_type(0x{:08x})", constVal); } } else if( GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel) ) @@ -1026,7 +1026,7 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte src->add(" = "); if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT ) debugBreakpoint(); // todo - src->add("floatBitsToInt(tempResultf)"); + 
src->add("as_type(tempResultf)"); src->add(";" _CRLF); } else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_INT ) @@ -1123,9 +1123,9 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(");" _CRLF); // INF becomes 0.0 - src->add("if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); // -INF becomes -0.0 - src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); // assign result to output _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); @@ -1145,14 +1145,14 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED) { // note: if( -INF < 0.0 ) does not resolve to true - src->add("if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF); - src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF); + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF); } else if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF) { // untested (BotW bombs) - src->add("if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); - src->add("else if( isinf(tempResultf) == true && 
(floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); } // assign result to output _emitInstructionOutputVariableName(shaderContext, aluInstruction); @@ -1704,8 +1704,8 @@ static void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shade src->add(" = "); _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); - // dot(vec4(op0),vec4(op1)) - src->add("dot(vec4("); + // dot(float4(op0),float4(op1)) + src->add("dot(float4("); _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(","); _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 0, LATTE_DECOMPILER_DTYPE_FLOAT); @@ -1713,7 +1713,7 @@ static void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shade _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(","); _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 0, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add("),vec4("); + src->add("),float4("); _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 1, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(","); _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 1, LATTE_DECOMPILER_DTYPE_FLOAT); @@ -1730,7 +1730,7 @@ static void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shade { /* * How the CUBE instruction works (guessed mostly, based on DirectX/OpenGL spec): - Input: vec4, 3d direction vector (can be unnormalized) + w component (which can be ignored, since it only scales the vector but does not affect the direction) + Input: float4, 3d direction vector (can be unnormalized) + w component (which can be ignored, since it only scales the vector but does not affect the 
direction) First we figure out the major axis (closest axis-aligned vector). There are six possible vectors: +rx 0 @@ -1758,7 +1758,7 @@ static void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shade sint32 outputType; src->add("redcCUBE("); - src->add("vec4("); + src->add("float4("); _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(","); _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 0, LATTE_DECOMPILER_DTYPE_FLOAT); @@ -1767,7 +1767,7 @@ static void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shade src->add(","); _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add("),"); - src->add("vec4("); + src->add("float4("); _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 1, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(","); _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 1, LATTE_DECOMPILER_DTYPE_FLOAT); @@ -1887,12 +1887,12 @@ bool _isPVUsedInNextGroup(LatteDecompilerCFInstruction* cfInstruction, sint32 st } */ -static void _emitVec3(LatteDecompilerShaderContext* shaderContext, uint32 dataType, LatteDecompilerALUInstruction* aluInst0, sint32 opIdx0, LatteDecompilerALUInstruction* aluInst1, sint32 opIdx1, LatteDecompilerALUInstruction* aluInst2, sint32 opIdx2) +static void _emitFloat3(LatteDecompilerShaderContext* shaderContext, uint32 dataType, LatteDecompilerALUInstruction* aluInst0, sint32 opIdx0, LatteDecompilerALUInstruction* aluInst1, sint32 opIdx1, LatteDecompilerALUInstruction* aluInst2, sint32 opIdx2) { StringBuf* src = shaderContext->shaderSource; if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) { - src->add("vec3("); + src->add("float3("); _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(","); _emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_FLOAT); @@ -1902,7 +1902,7 @@ static void 
_emitVec3(LatteDecompilerShaderContext* shaderContext, uint32 dataTy } else if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) { - src->add("ivec3("); + src->add("int3("); _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); src->add(","); _emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); @@ -2002,7 +2002,7 @@ static void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, Latt { _emitInstructionOutputVariableName(shaderContext, &aluInstruction); src->add(" = "); - src->add("floatBitsToInt(intBitsToFloat("); + src->add("as_type("); _emitInstructionOutputVariableName(shaderContext, &aluInstruction); src->add(")"); if( aluInstruction.omod == 1 ) @@ -2099,9 +2099,9 @@ static void _emitTEXSampleCoordInputComponent(LatteDecompilerShaderContext* shad if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) { if( elementSel == 4 ) - src->add("floatBitsToInt(0.0)"); + src->add("as_type(0.0)"); else if( elementSel == 5 ) - src->add("floatBitsToInt(1.0)"); + src->add("as_type(1.0)"); } else if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_FLOAT ) { @@ -2116,7 +2116,7 @@ static const char* _texGprAccessElemTable[8] = {"x","y","z","w","_","_","_","_"} static char* _getTexGPRAccess(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, uint32 dataType, sint8 selX, sint8 selY, sint8 selZ, sint8 selW, char* tempBuffer) { - // intBitsToFloat(R{}i.w) + // as_type(R{}i.w) *tempBuffer = '\0'; uint8 elemCount = (selX > 0 ? 1 : 0) + (selY > 0 ? 1 : 0) + (selZ > 0 ? 1 : 0) + (selW > 0 ? 
1 : 0); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) @@ -2124,7 +2124,7 @@ static char* _getTexGPRAccess(LatteDecompilerShaderContext* shaderContext, sint3 if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) ; // no conversion else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) - strcat(tempBuffer, "intBitsToFloat("); + strcat(tempBuffer, "as_type("); else cemu_assert_unimplemented(); strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); @@ -2230,16 +2230,16 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if(numWrittenElements == 1) src->add(" = int("); else - shaderContext->shaderSource->addFmt(" = ivec{}(", numWrittenElements); + shaderContext->shaderSource->addFmt(" = int{}(", numWrittenElements); } else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->add(" = uintBitsToFloat("); + src->add(" = as_type("); } else { // float samplers if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add(" = floatBitsToInt("); + src->add(" = as_type("); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->add(" = ("); } @@ -2256,104 +2256,26 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if (numWrittenElements == 1) shaderContext->shaderSource->add("0.0"); else - shaderContext->shaderSource->addFmt("vec{}(0.0)", numWrittenElements); + shaderContext->shaderSource->addFmt("float{}(0.0)", numWrittenElements); shaderContext->shaderSource->add(");" _CRLF); return; } - - if (texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3]) ) + src->addFmt("tex{}.", texInstruction->textureFetch.textureIndex); + if ((texOpcode == GPU7_TEX_INST_SAMPLE && 
(texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || + texOpcode == GPU7_TEX_INST_LD) { // texture is likely a RECT if (hasOffset) cemu_assert_unimplemented(); - src->add("texelFetch("); + src->add("read("); unnormalizationHandled = true; useTexelCoordinates = true; } - else if( texOpcode == GPU7_TEX_INST_FETCH4 ) - { - if( hasOffset ) - cemu_assert_unimplemented(); - src->add("textureGather("); - } - else if( texOpcode == GPU7_TEX_INST_LD ) - { - if( hasOffset ) - cemu_assert_unimplemented(); - src->add("texelFetch("); - unnormalizationHandled = true; - useTexelCoordinates = true; - } - else if( texOpcode == GPU7_TEX_INST_SAMPLE_L ) - { - // sample with LOD value set in gpr.w (replaces computed LOD value) - if( hasOffset ) - src->add("textureLodOffset("); - else - src->add("textureLod("); - } - else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ) - { - // sample with LOD set to 0.0 (replaces computed LOD value) - if (hasOffset) - src->add("textureLodOffset("); - else - src->add("textureLod("); - } - else if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) - { - // sample with LOD biased - // note: AMD doc says LOD bias is calculated from instruction LOD_BIAS field. But it appears that LOD bias is taken from input register. Might actually be both? 
- if (hasOffset) - src->add("textureOffset("); - else - src->add("texture("); - } - else if (texOpcode == GPU7_TEX_INST_SAMPLE) - { - if (hasOffset) - src->add("textureOffset("); - else - src->add("texture("); - } - else if (texOpcode == GPU7_TEX_INST_SAMPLE_C_L) - { - // sample with LOD value set in gpr.w (replaces computed LOD value) - if (hasOffset) - src->add("textureLodOffset("); - else - src->add("textureLod("); - } - else if (texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) - { - // sample with LOD set to 0.0 (replaces computed LOD value) - if (hasOffset) - src->add("textureLodOffset("); - else - src->add("textureLod("); - } - else if (texOpcode == GPU7_TEX_INST_SAMPLE_C) - { - if (hasOffset) - src->add("textureOffset("); - else - src->add("texture("); - } - else if (texOpcode == GPU7_TEX_INST_SAMPLE_G) - { - if (hasOffset) - cemu_assert_unimplemented(); - src->add("textureGrad("); - } else { - if( hasOffset ) - cemu_assert_unimplemented(); - cemu_assert_unimplemented(); - src->add("texture("); + src->addFmt("sample(samplr{}, ", texInstruction->textureFetch.textureIndex); } - src->addFmt("tex{}, ", texInstruction->textureFetch.textureIndex); // for textureGather() add shift (todo: depends on rounding mode set in sampler registers?) 
if (texOpcode == GPU7_TEX_INST_FETCH4) @@ -2370,7 +2292,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // Vulkan uses truncate mode when point sampling (min and mag is both nearest) otherwise it uses rounding // adding a small fixed bias is enough to avoid vendor-specific cases where small inaccuracies cause the number to get rounded down due to truncation - src->addFmt("vec2(0.0001) + "); + src->addFmt("float2(0.0001) + "); } } @@ -2380,15 +2302,15 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // handle integer coordinates for texelFetch if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) { - src->add("ivec2("); - src->add("vec2("); + src->add("int2("); + src->add("float2("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, texCoordDataType); src->addFmt(", "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, texCoordDataType); - src->addFmt(")*uf_tex{}Scale", texInstruction->textureFetch.textureIndex); // close vec2 and scale + src->addFmt(")*supportBuffer.tex{}Scale", texInstruction->textureFetch.textureIndex); // close float2 and scale - src->add("), 0"); // close ivec2 and lod param + src->add("), 0"); // close int2 and lod param // todo - lod } else if (texDim == Latte::E_DIM::DIM_1D) @@ -2397,7 +2319,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->add("int("); src->add("float("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, (texOpcode == GPU7_TEX_INST_LD) ? 
LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT); - src->addFmt(")*uf_tex{}Scale.x", texInstruction->textureFetch.textureIndex); + src->addFmt(")*supportBuffer.tex{}Scale.x", texInstruction->textureFetch.textureIndex); src->add("), 0"); // todo - lod } @@ -2412,8 +2334,8 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // shadow sampler if (texDim == Latte::E_DIM::DIM_2D_ARRAY) { - // 3 coords + compare value (as vec4) - src->add("vec4("); + // 3 coords + compare value (as float4) + src->add("float4("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(","); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); @@ -2429,7 +2351,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex { debugBreakpoint(); } - src->add("vec4("); + src->add("float4("); src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); src->addFmt(")"); @@ -2442,22 +2364,22 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex { debugBreakpoint(); } - src->addFmt("vec3({},0.0,{})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + src->addFmt("float3({},0.0,{})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, 
LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); } else { - // 2 coords + compare value (as vec3) + // 2 coords + compare value (as float3) if (texInstruction->textureFetch.srcSel[0] >= 4 && texInstruction->textureFetch.srcSel[1] >= 4) { debugBreakpoint(); } - src->addFmt("vec3({}, {})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + src->addFmt("float3({}, {})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); } } else if( texDim == Latte::E_DIM::DIM_3D || texDim == Latte::E_DIM::DIM_2D_ARRAY ) { // 3 coords - src->add("vec3("); + src->add("float3("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(","); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); @@ -2470,7 +2392,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // 2 coords + faceId cemu_assert_debug(texInstruction->textureFetch.srcSel[0] < 4); cemu_assert_debug(texInstruction->textureFetch.srcSel[1] < 4); - src->add("vec4("); + src->add("float4("); src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, 
LATTE_DECOMPILER_DTYPE_SIGNED_INT); src->add(")"); @@ -2484,14 +2406,14 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex else { // 2 coords - src->add("vec2("); + src->add("float2("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(","); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(")"); // avoid truncate to effectively round downwards on texel edges if (ActiveSettings::ForceSamplerRoundToPrecision()) - src->addFmt("+ vec2(1.0)/vec2(textureSize(tex{}, 0))/512.0", texInstruction->textureFetch.textureIndex); + src->addFmt("+ float2(1.0)/float2(textureSize(tex{}, 0))/512.0", texInstruction->textureFetch.textureIndex); } // lod or lod bias parameter if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) @@ -2547,9 +2469,9 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if( offsetComponentCount == 1 ) src->addFmt(",{}", texInstruction->textureFetch.offsetX/2); else if( offsetComponentCount == 2 ) - src->addFmt(",ivec2({},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + src->addFmt(",int2({},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); else if( offsetComponentCount == 3 ) - src->addFmt(",ivec3({},{},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + src->addFmt(",int3({},{},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); } } // lod bias @@ -2661,17 +2583,17 @@ static void _emitTEXGetTextureResInfoCode(LatteDecompilerShaderContext* shaderCo auto texDim = 
shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; if (texDim == Latte::E_DIM::DIM_1D) - src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1,1).", texInstruction->textureFetch.textureIndex); + src->addFmt(" = int4(textureSize(tex{}, 0),1,1,1).", texInstruction->textureFetch.textureIndex); else if (texDim == Latte::E_DIM::DIM_1D_ARRAY) - src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); + src->addFmt(" = int4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); else if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) - src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); + src->addFmt(" = int4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); else if (texDim == Latte::E_DIM::DIM_2D_ARRAY) - src->addFmt(" = ivec4(textureSize(tex{}, 0),1).", texInstruction->textureFetch.textureIndex); + src->addFmt(" = int4(textureSize(tex{}, 0),1).", texInstruction->textureFetch.textureIndex); else { cemu_assert_debug(false); - src->addFmt(" = ivec4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); + src->addFmt(" = int4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); } for(sint32 f=0; f<4; f++) @@ -2725,16 +2647,16 @@ static void _emitTEXGetCompTexLodCode(LatteDecompilerShaderContext* shaderContex { // 3 coordinates if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("vec4(textureQueryLod(tex{}, {}.{}{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + src->addFmt("float4(textureQueryLod(tex{}, {}.{}{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, 
_getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); else - src->addFmt("vec4(textureQueryLod(tex{}, intBitsToFloat({}.{}{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + src->addFmt("float4(textureQueryLod(tex{}, as_type({}.{}{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); } else { if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("vec4(textureQueryLod(tex{}, {}.{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + src->addFmt("float4(textureQueryLod(tex{}, {}.{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); else - src->addFmt("vec4(textureQueryLod(tex{}, intBitsToFloat({}.{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + src->addFmt("float4(textureQueryLod(tex{}, as_type({}.{}{})),0.0,0.0)", 
texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); debugBreakpoint(); } @@ -2768,7 +2690,7 @@ static void _emitTEXSetCubemapIndexCode(LatteDecompilerShaderContext* shaderCont const char* resultElemTable[4] = {"x","y","z","w"}; if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt(" = intBitsToFloat(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); + src->addFmt(" = as_type(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt(" = R{}f.{};" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); else @@ -2942,7 +2864,7 @@ static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, Latt src->add(" = "); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("floatBitsToInt("); + src->add("as_type("); else src->add("("); @@ -2951,7 +2873,7 @@ static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, Latt if( shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) src->addFmt("{}.{}", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); else - src->addFmt("floatBitsToInt({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); + src->addFmt("as_type({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); src->add("]."); @@ -2983,7 +2905,7 @@ static void _emitTEXReadMemCode(LatteDecompilerShaderContext* shaderContext, Lat src->add(" = "); 
if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("floatBitsToInt("); + src->add("as_type("); else src->add("("); @@ -2999,13 +2921,13 @@ static void _emitTEXReadMemCode(LatteDecompilerShaderContext* shaderContext, Lat { readCount = 2; // todo - src->add("vec2(0.0,0.0)"); + src->add("float2(0.0,0.0)"); } else if (texInstruction->memRead.format == FMT_32_32_32_FLOAT) { readCount = 3; // todo - src->add("vec3(0.0,0.0,0.0)"); + src->add("float3(0.0,0.0,0.0)"); } else { @@ -3068,14 +2990,14 @@ static void _emitExportGPRReadCode(LatteDecompilerShaderContext* shaderContext, if(numOutputs == 1) src->add("float("); else - src->addFmt("vec{}(", numOutputs); + src->addFmt("float{}(", numOutputs); } else if (requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) { if (numOutputs == 1) src->add("int("); else - src->addFmt("ivec{}(", numOutputs); + src->addFmt("int{}(", numOutputs); } else cemu_assert_unimplemented(); @@ -3149,17 +3071,17 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe if (hasAnyViewportScaleDisabled) { - src->add("vec4 finalPos = "); + src->add("float4 finalPos = "); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); src->add(";" _CRLF); - src->add("finalPos.xy = finalPos.xy * uf_windowSpaceToClipSpaceTransform - vec2(1.0,1.0);"); - src->add("SET_POSITION(finalPos);"); + src->add("finalPos.xy = finalPos.xy * supportBuffer.windowSpaceToClipSpaceTransform - float2(1.0,1.0);"); + src->add("out.position = finalPos;"); } else { - src->add("SET_POSITION("); + src->add("out.position = "); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); - src->add(");" _CRLF); + src->add(";" _CRLF); } } else if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_POINT_SIZE ) @@ -3181,7 +3103,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe uint32 
vsSemanticId = _getVertexShaderOutParamSemanticId(shaderContext->contextRegisters, paramIndex); if (vsSemanticId != 0xFF) { - src->addFmt("passParameterSem{} = ", vsSemanticId); + src->addFmt("out.passParameterSem{} = ", vsSemanticId); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); src->add(";" _CRLF); } @@ -3235,7 +3157,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe src->add(">="); break; } - src->add(" uf_alphaTestRef"); + src->add(" supportBuffer.alphaTestRef"); src->add(") == false) discard;" _CRLF); } // pixel color output @@ -3395,13 +3317,13 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La if (parameterExportType == 1 && parameterExportBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION) { src->add("{" _CRLF); - src->addFmt("vec4 pos = vec4(0.0,0.0,0.0,1.0);" _CRLF); + src->addFmt("float4 pos = float4(0.0,0.0,0.0,1.0);" _CRLF); src->addFmt("pos."); _emitXYZWByMask(src, cfInstruction->memWriteCompMask); src->addFmt(" = "); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); src->add(";" _CRLF); - src->add("SET_POSITION(pos);" _CRLF); + src->add("out.position = pos;" _CRLF); src->add("}" _CRLF); } else if (parameterExportType == 2 && parameterExportBase < 16) @@ -3645,7 +3567,7 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); // write point size if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) - src->add("gl_PointSize = uf_pointSize;" _CRLF); + src->add("gl_PointSize = supportBuffer.pointSize;" _CRLF); // emit vertex src->add("EmitVertex();" _CRLF); // increment transform feedback pointer @@ -3681,11 +3603,11 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon { if( 
shaderContext->analyzer.hasRedcCUBE ) { - fCStr_shaderSource->add("void redcCUBE(vec4 src0, vec4 src1, out vec3 stm, out int faceId)\r\n" + fCStr_shaderSource->add("void redcCUBE(float4 src0, float4 src1, out float3 stm, out int faceId)\r\n" "{\r\n" "// stm -> x .. s, y .. t, z .. MajorAxis*2.0\r\n" - "vec3 inputCoord = normalize(vec3(src1.y, src1.x, src0.x));\r\n" + "float3 inputCoord = normalize(float3(src1.y, src1.x, src0.x));\r\n" "float rx = inputCoord.x;\r\n" "float ry = inputCoord.y;\r\n" @@ -3693,7 +3615,7 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "if( abs(rx) > abs(ry) && abs(rx) > abs(rz) )\r\n" "{\r\n" "stm.z = rx*2.0;\r\n" - "stm.xy = vec2(ry,rz); \r\n" + "stm.xy = float2(ry,rz); \r\n" "if( rx >= 0.0 )\r\n" "{\r\n" "faceId = 0;\r\n" @@ -3706,7 +3628,7 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "else if( abs(ry) > abs(rx) && abs(ry) > abs(rz) )\r\n" "{\r\n" "stm.z = ry*2.0;\r\n" - "stm.xy = vec2(rx,rz); \r\n" + "stm.xy = float2(rx,rz); \r\n" "if( ry >= 0.0 )\r\n" "{\r\n" "faceId = 2;\r\n" @@ -3719,7 +3641,7 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "else //if( abs(rz) > abs(ry) && abs(rz) > abs(rx) )\r\n" "{\r\n" "stm.z = rz*2.0;\r\n" - "stm.xy = vec2(rx,ry); \r\n" + "stm.xy = float2(rx,ry); \r\n" "if( rz >= 0.0 )\r\n" "{\r\n" "faceId = 4;\r\n" @@ -3734,39 +3656,39 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon if( shaderContext->analyzer.hasCubeMapTexture ) { - fCStr_shaderSource->add("vec3 redcCUBEReverse(vec2 st, int faceId)\r\n" + fCStr_shaderSource->add("float3 redcCUBEReverse(float2 st, int faceId)\r\n" "{\r\n" "st.yx = st.xy;\r\n" - "vec3 v;\r\n" + "float3 v;\r\n" "float majorAxis = 1.0;\r\n" "if( faceId == 0 )\r\n" "{\r\n" - "v.yz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.yz = (st-float2(1.5))*(majorAxis*2.0);\r\n" "v.x = 1.0;\r\n" "}\r\n" "else if( faceId == 1 )\r\n" "{\r\n" - "v.yz = 
(st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.yz = (st-float2(1.5))*(majorAxis*2.0);\r\n" "v.x = -1.0;\r\n" "}\r\n" "else if( faceId == 2 )\r\n" "{\r\n" - "v.xz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.xz = (st-float2(1.5))*(majorAxis*2.0);\r\n" "v.y = 1.0;\r\n" "}\r\n" "else if( faceId == 3 )\r\n" "{\r\n" - "v.xz = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.xz = (st-float2(1.5))*(majorAxis*2.0);\r\n" "v.y = -1.0;\r\n" "}\r\n" "else if( faceId == 4 )\r\n" "{\r\n" - "v.xy = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.xy = (st-float2(1.5))*(majorAxis*2.0);\r\n" "v.z = 1.0;\r\n" "}\r\n" "else\r\n" "{\r\n" - "v.xy = (st-vec2(1.5))*(majorAxis*2.0);\r\n" + "v.xy = (st-float2(1.5))*(majorAxis*2.0);\r\n" "v.z = -1.0;\r\n" "}\r\n" @@ -3779,10 +3701,10 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "int clampFI32(int v)\r\n" "{\r\n" "if( v == 0x7FFFFFFF )\r\n" - " return floatBitsToInt(1.0);\r\n" + " return as_type(1.0);\r\n" "else if( v == 0xFFFFFFFF )\r\n" - " return floatBitsToInt(0.0);\r\n" - "return floatBitsToInt(clamp(intBitsToFloat(v), 0.0, 1.0));\r\n" + " return as_type(0.0);\r\n" + "return as_type(clamp(as_type(v), 0.0, 1.0));\r\n" "}\r\n"); // mul non-ieee way (0*NaN/INF => 0.0) if (shaderContext->options->strictMul) @@ -3791,7 +3713,7 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(a*b,0.0,a==0.0||b==0.0); }" STR_LINEBREAK); //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(vec2(a*b,0.0),vec2(0.0,0.0),(equal(vec2(a),vec2(0.0,0.0))||equal(vec2(b),vec2(0.0,0.0)))).x; }" STR_LINEBREAK); //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" STR_LINEBREAK); - //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = intBitsToFloat(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 
0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = as_type(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works // for "min" it used to be: float mul_nonIEEE(float a, float b){ return min(a*b,min(abs(a)*3.40282347E+38F,abs(b)*3.40282347E+38F)); } @@ -3836,9 +3758,9 @@ static void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* sh LatteDecompiler_emitAttributeDecodeMSL(shaderContext->shader, src, &attrib); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = ivec4(", _getRegisterVarName(shaderContext, registerIndex)); + src->addFmt("{} = int4(", _getRegisterVarName(shaderContext, registerIndex)); else - src->addFmt("{} = vec4(", _getRegisterVarName(shaderContext, registerIndex)); + src->addFmt("{} = float4(", _getRegisterVarName(shaderContext, registerIndex)); for (sint32 f = 0; f < 4; f++) { uint8 ds = attrib.ds[f]; @@ -3874,23 +3796,28 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->addFmt("// usesIntegerValues: {}" _CRLF, shaderContext->analyzer.usesIntegerValues?"true":"false"); src->addFmt(_CRLF); #endif + // include metal standard library + src->add("#include " _CRLF); + src->add("using namespace metal;" _CRLF); // header part (definitions for inputs and outputs) LatteDecompiler::emitHeader(shaderContext); // helper functions LatteDecompiler_emitHelperFunctions(shaderContext, src); + const char* outputTypeName; switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: - src->add("VertexOut"); + outputTypeName = "VertexOut"; break; case LatteConst::ShaderType::Pixel: - src->add("FragmentOut"); + outputTypeName = "FragmentOut"; break; } // start of main - src->add(" main0("); + src->addFmt("{} main0(", outputTypeName); LatteDecompiler::emitInputs(shaderContext); src->add(") {" _CRLF); + 
src->addFmt("{} out;" _CRLF, outputTypeName); // variable definition if (shaderContext->typeTracker.useArrayGPRs == false) { @@ -3997,7 +3924,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, cemu_assert_debug((shaderContext->output->streamoutBufferStride[i]&3) == 0); if (shader->shaderType == LatteConst::ShaderType::Vertex) // vertex shader - src->addFmt("int sbBase{} = uf_streamoutBufferBase{}/4 + (vid + uf_verticesPerInstance * iid)*{};" _CRLF, i, i, shaderContext->output->streamoutBufferStride[i] / 4); + src->addFmt("int sbBase{} = supportBuffer.streamoutBufferBase{}/4 + (vid + supportBuffer.verticesPerInstance * iid)*{};" _CRLF, i, i, shaderContext->output->streamoutBufferStride[i] / 4); else // geometry shader { uint32 gsOutPrimType = shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; @@ -4006,7 +3933,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, cemu_assert_debug(gsOutPrimType == 0); // currently we only properly handle GS output primitive points - src->addFmt("int sbBase{} = uf_streamoutBufferBase{}/4 + (gl_PrimitiveIDIn * {})*{};" _CRLF, i, i, maxVerticesInGS, shaderContext->output->streamoutBufferStride[i] / 4); + src->addFmt("int sbBase{} = supportBuffer.streamoutBufferBase{}/4 + (gl_PrimitiveIDIn * {})*{};" _CRLF, i, i, maxVerticesInGS, shaderContext->output->streamoutBufferStride[i] / 4); } } @@ -4019,7 +3946,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) src->addFmt("{} = int4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = as_type(float4(vid, 0, 0, iid));" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: is this correct? 
+ src->addFmt("{} = float4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: as_type(float4(vid, 0, 0, iid))? else cemu_assert_unimplemented(); } @@ -4097,9 +4024,9 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, { // import from vertex shader if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = as_type(passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + src->addFmt("{} = as_type(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); else cemu_assert_unimplemented(); } @@ -4112,7 +4039,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (frontFace_allBits) cemu_assert_debug(false); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{}.{} = as_type(gl_FrontFacing?1.0:0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + src->addFmt("{}.{} = as_type(frontFacing?1.0:0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("{}.{} = frontFacing ? 
1.0 : 0.0;" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); else @@ -4128,8 +4055,10 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) { if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader == false) - src->add("out.pointSize = uf_pointSize;" _CRLF); + src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); } + // return + src->add("return out;" _CRLF); // end of shader main src->add("}" _CRLF); src->shrink_to_fit(); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp index 8219646aa..cb90e45d6 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp @@ -12,50 +12,50 @@ static void _readLittleEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder = attrDataSem{};" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder = in.attrDataSem{};" _CRLF, attributeInputIndex); } static void _readLittleEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder = uint4(attrDataSem{}.xyz,0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.xyz,0);" _CRLF, attributeInputIndex); } static void _readLittleEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder = uint4(attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); } static void 
_readLittleEndianAttributeU32x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder = uint4(attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); } static void _readLittleEndianAttributeU16x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder = uint4(attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); } static void _readLittleEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder = attrDataSem{};" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder = in.attrDataSem{};" _CRLF, attributeInputIndex); } static void _readBigEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder = attrDataSem{};" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder = in.attrDataSem{};" _CRLF, attributeInputIndex); src->add("attrDecoder = (attrDecoder>>24)|((attrDecoder>>8)&0xFF00)|((attrDecoder<<8)&0xFF0000)|((attrDecoder<<24));" _CRLF); } static void _readBigEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder.xyz = attrDataSem{}.xyz;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyz = in.attrDataSem{}.xyz;" _CRLF, attributeInputIndex); src->add("attrDecoder.xyz = (attrDecoder.xyz>>24)|((attrDecoder.xyz>>8)&0xFF00)|((attrDecoder.xyz<<8)&0xFF0000)|((attrDecoder.xyz<<24));" _CRLF); src->add("attrDecoder.w = 0;" _CRLF); } static void _readBigEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + 
src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); src->add("attrDecoder.xy = (attrDecoder.xy>>24)|((attrDecoder.xy>>8)&0xFF00)|((attrDecoder.xy<<8)&0xFF0000)|((attrDecoder.xy<<24));" _CRLF); src->add("attrDecoder.z = 0;" _CRLF); src->add("attrDecoder.w = 0;" _CRLF); @@ -63,7 +63,7 @@ static void _readBigEndianAttributeU32x2(LatteDecompilerShader* shaderContext, S static void _readBigEndianAttributeU32x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder.x = attrDataSem{}.x;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.x = in.attrDataSem{}.x;" _CRLF, attributeInputIndex); src->add("attrDecoder.x = (attrDecoder.x>>24)|((attrDecoder.x>>8)&0xFF00)|((attrDecoder.x<<8)&0xFF0000)|((attrDecoder.x<<24));" _CRLF); src->add("attrDecoder.y = 0;" _CRLF); src->add("attrDecoder.z = 0;" _CRLF); @@ -72,7 +72,7 @@ static void _readBigEndianAttributeU32x1(LatteDecompilerShader* shaderContext, S static void _readBigEndianAttributeU16x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); src->add("attrDecoder.x = ((attrDecoder.x>>8)&0xFF)|((attrDecoder.x<<8)&0xFF00);" _CRLF); src->add("attrDecoder.y = 0;" _CRLF); src->add("attrDecoder.z = 0;" _CRLF); @@ -81,7 +81,7 @@ static void _readBigEndianAttributeU16x1(LatteDecompilerShader* shaderContext, S static void _readBigEndianAttributeU16x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); src->add("attrDecoder.xy = ((attrDecoder.xy>>8)&0xFF)|((attrDecoder.xy<<8)&0xFF00);" _CRLF); src->add("attrDecoder.z = 0;" _CRLF); src->add("attrDecoder.w = 0;" 
_CRLF); @@ -89,7 +89,7 @@ static void _readBigEndianAttributeU16x2(LatteDecompilerShader* shaderContext, S static void _readBigEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) { - src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); src->add("attrDecoder = ((attrDecoder>>8)&0xFF)|((attrDecoder<<8)&0xFF00);" _CRLF); } @@ -167,12 +167,12 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) { // seen in Minecraft Wii U Edition - src->addFmt("attrDecoder.xyzw = as_type(vec4(attrDataSem{}.wzyx)/255.0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.wzyx)/255.0);" _CRLF, attributeInputIndex); } else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) { // seen in Minecraft Wii U Edition - src->addFmt("attrDecoder.xyzw = attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); @@ -185,12 +185,12 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) { // seen in Minecraft Wii U Edition - src->addFmt("attrDecoder.xyzw = attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); } else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) { // seen in Ben 10 Omniverse - 
src->addFmt("attrDecoder.xyzw = as_type(vec4(attrDataSem{}.wzyx));" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.wzyx));" _CRLF, attributeInputIndex); } else { @@ -275,11 +275,11 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext } else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) { - src->addFmt("attrDecoder.xyzw = as_type(float4(attrDataSem{}.xyzw)/255.0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.xyzw)/255.0);" _CRLF, attributeInputIndex); } else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) { - src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); @@ -291,12 +291,12 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext } else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0) { - src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); } else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned != 0) { // seen in Sonic Lost World - src->addFmt("attrDecoder.xyzw = attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 
0xFFFFFF00;" _CRLF); @@ -305,19 +305,19 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0 ) { // seen in One Piece - src->addFmt("attrDecoder.xyzw = as_type(float4(attrDataSem{}.xyzw));" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.xyzw));" _CRLF, attributeInputIndex); } else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned == 0) { if( (attrib->offset&3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL ) { // AMD workaround - src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.zw)/255.0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xy = as_type(float2(in.attrDataSem{}.zw)/255.0);" _CRLF, attributeInputIndex); src->add("attrDecoder.zw = uint2(0);" _CRLF); } else { - src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.xy)/255.0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xy = as_type(float2(in.attrDataSem{}.xy)/255.0);" _CRLF, attributeInputIndex); src->add("attrDecoder.zw = uint2(0);" _CRLF); } } @@ -327,12 +327,12 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) { // AMD workaround - src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.zw));" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xy = as_type(float2(in.attrDataSem{}.zw));" _CRLF, attributeInputIndex); src->add("attrDecoder.zw = uint2(0);" _CRLF); } else { - src->addFmt("attrDecoder.xy = as_type(float2(attrDataSem{}.xy));" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xy = as_type(float2(in.attrDataSem{}.xy));" _CRLF, attributeInputIndex); src->add("attrDecoder.zw = uint2(0);" _CRLF); } } @@ -341,7 +341,7 @@ void 
LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) { // AMD workaround - src->addFmt("attrDecoder.xy = attrDataSem{}.zw;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xy = in.attrDataSem{}.zw;" _CRLF, attributeInputIndex); src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); @@ -350,7 +350,7 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext } else { - src->addFmt("attrDecoder.xy = attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); src->add("attrDecoder.x = as_type(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); @@ -363,22 +363,22 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) { // AMD workaround - src->addFmt("attrDecoder.xyzw = uint4(attrDataSem{}.zw,0,0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = uint4(in.attrDataSem{}.zw,0,0);" _CRLF, attributeInputIndex); } else { - src->addFmt("attrDecoder.xyzw = uint4(attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = uint4(in.attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); } } else if( attrib->format == FMT_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) { // seen in Pikmin 3 - src->addFmt("attrDecoder.x = as_type(float(attrDataSem{}.x)/255.0);" _CRLF, attributeInputIndex); + 
src->addFmt("attrDecoder.x = as_type(float(in.attrDataSem{}.x)/255.0);" _CRLF, attributeInputIndex); src->add("attrDecoder.yzw = uint3(0);" _CRLF); } else if( attrib->format == FMT_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) { - src->addFmt("attrDecoder.xyzw = uint4(attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = uint4(in.attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); } else { diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 39798dc5f..45c886314 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -9,7 +9,7 @@ namespace LatteDecompiler LatteDecompilerShaderResourceMapping& resourceMapping = decompilerContext->output->resourceMappingGL; auto& uniformOffsets = decompilerContext->output->uniformOffsetsVK; - src->add("struct DefualtUniforms {" _CRLF); + src->add("struct SupportBuffer {" _CRLF); sint32 uniformCurrentOffset = 0; auto shader = decompilerContext->shader; @@ -19,11 +19,11 @@ namespace LatteDecompiler // uniform registers or buffers are accessed statically with predictable offsets // this allows us to remap the used entries into a more compact array if (shaderType == LatteConst::ShaderType::Vertex) - src->addFmt("ivec4 uf_remappedVS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + src->addFmt("int4 remappedVS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); else if (shaderType == LatteConst::ShaderType::Pixel) - src->addFmt("ivec4 uf_remappedPS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + src->addFmt("int4 remappedPS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); else if (shaderType == LatteConst::ShaderType::Geometry) - src->addFmt("ivec4 uf_remappedGS[{}];" _CRLF, 
(sint32)shader->list_remappedUniformEntries.size()); + src->addFmt("int4 remappedGS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); else debugBreakpoint(); uniformOffsets.offset_remapped = uniformCurrentOffset; @@ -34,11 +34,11 @@ namespace LatteDecompiler uint32 cfileSize = decompilerContext->analyzer.uniformRegisterAccessTracker.DetermineSize(decompilerContext->shaderBaseHash, 256); // full or partial uniform register file has to be present if (shaderType == LatteConst::ShaderType::Vertex) - src->addFmt("ivec4 uf_uniformRegisterVS[{}];" _CRLF, cfileSize); + src->addFmt("int4 uniformRegisterVS[{}];" _CRLF, cfileSize); else if (shaderType == LatteConst::ShaderType::Pixel) - src->addFmt("ivec4 uf_uniformRegisterPS[{}];" _CRLF, cfileSize); + src->addFmt("int4 uniformRegisterPS[{}];" _CRLF, cfileSize); else if (shaderType == LatteConst::ShaderType::Geometry) - src->addFmt("ivec4 uf_uniformRegisterGS[{}];" _CRLF, cfileSize); + src->addFmt("int4 uniformRegisterGS[{}];" _CRLF, cfileSize); uniformOffsets.offset_uniformRegister = uniformCurrentOffset; uniformOffsets.count_uniformRegister = cfileSize; uniformCurrentOffset += 16 * cfileSize; @@ -53,7 +53,7 @@ namespace LatteDecompiler { // aka GX2 special state 0 uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; - src->add("float2 uf_windowSpaceToClipSpaceTransform;" _CRLF); + src->add("float2 windowSpaceToClipSpaceTransform;" _CRLF); uniformOffsets.offset_windowSpaceToClipSpaceTransform = uniformCurrentOffset; uniformCurrentOffset += 8; } @@ -61,7 +61,7 @@ namespace LatteDecompiler if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel && alphaTestEnable) { uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; - src->add("float uf_alphaTestRef;" _CRLF); + src->add("float alphaTestRef;" _CRLF); uniformOffsets.offset_alphaTestRef = uniformCurrentOffset; uniformCurrentOffset += 4; } @@ -71,16 +71,16 @@ namespace LatteDecompiler decompilerContext->shaderType == LatteConst::ShaderType::Geometry) 
{ uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; - src->add("float uf_pointSize;" _CRLF); + src->add("float pointSize;" _CRLF); uniformOffsets.offset_pointSize = uniformCurrentOffset; uniformCurrentOffset += 4; } } - // define uf_fragCoordScale which holds the xy scale for render target resolution vs effective resolution + // define fragCoordScale which holds the xy scale for render target resolution vs effective resolution if (shader->shaderType == LatteConst::ShaderType::Pixel) { uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; - src->add("vec2 uf_fragCoordScale;" _CRLF); + src->add("float2 fragCoordScale;" _CRLF); uniformOffsets.offset_fragCoordScale = uniformCurrentOffset; uniformCurrentOffset += 8; } @@ -90,30 +90,30 @@ namespace LatteDecompiler if (decompilerContext->analyzer.texUnitUsesTexelCoordinates.test(t) == false) continue; uniformCurrentOffset = (uniformCurrentOffset + 7) & ~7; - src->addFmt("vec2 uf_tex{}Scale;" _CRLF, t); + src->addFmt("float2 tex{}Scale;" _CRLF, t); uniformOffsets.offset_texScale[t] = uniformCurrentOffset; uniformCurrentOffset += 8; } - // define uf_verticesPerInstance + uf_streamoutBufferBaseX + // define verticesPerInstance + streamoutBufferBaseX if (decompilerContext->analyzer.useSSBOForStreamout && (shader->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) || (shader->shaderType == LatteConst::ShaderType::Geometry) ) { - src->add("int uf_verticesPerInstance;" _CRLF); + src->add("int verticesPerInstance;" _CRLF); uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; uniformCurrentOffset += 4; for (uint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) { if (decompilerContext->output->streamoutBufferWriteMask[i]) { - src->addFmt("int uf_streamoutBufferBase{};" _CRLF, i); + src->addFmt("int streamoutBufferBase{};" _CRLF, i); uniformOffsets.offset_streamoutBufferBase[i] = uniformCurrentOffset; uniformCurrentOffset += 4; } } } - src->add("}" _CRLF _CRLF); + 
src->add("};" _CRLF _CRLF); uniformOffsets.offset_endOfBlock = uniformCurrentOffset; } @@ -173,7 +173,7 @@ namespace LatteDecompiler cemu_assert_debug(decompilerContext->output->resourceMappingVK.attributeMapping[i] >= 0); cemu_assert_debug(decompilerContext->output->resourceMappingGL.attributeMapping[i] == decompilerContext->output->resourceMappingVK.attributeMapping[i]); - src->addFmt("ATTR_LAYOUT({}, {}) in uvec4 attrDataSem{};" _CRLF, (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i], i); + src->addFmt("uint4 attrDataSem{} [[attribute({})]];" _CRLF, i, (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i]); } } src->add("};" _CRLF _CRLF); @@ -367,7 +367,17 @@ namespace LatteDecompiler break; } - src->add(" in [[stage_in]], DefaultVariables defaultVars [[buffer(29)]]"); + src->add(" in [[stage_in]], constant SupportBuffer& supportBuffer [[buffer(29)]]"); + switch (decompilerContext->shaderType) + { + case LatteConst::ShaderType::Vertex: + src->add(", uint vid [[vertex_id]]"); + src->add(", uint iid [[instance_id]]"); + break; + case LatteConst::ShaderType::Pixel: + src->add(", bool frontFacing [[front_facing]]"); + break; + } // uniform buffers _emitUniformBufferDefinitions(decompilerContext); // textures diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 1b140590d..bf6cab245 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -85,5 +85,5 @@ LatteTextureView* LatteTextureMtl::CreateView(Latte::E_DIM dim, Latte::E_GX2SURF void LatteTextureMtl::AllocateOnHost() { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 
fa3b03f20..a5b7e3a66 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -34,19 +34,19 @@ void MetalRenderer::Initialize() void MetalRenderer::Shutdown() { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } bool MetalRenderer::IsPadWindowActive() { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); return false; } bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); usageInMB = 1024; totalInMB = 1024; @@ -56,12 +56,12 @@ bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const void MetalRenderer::ClearColorbuffer(bool padView) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::DrawEmptyFrame(bool mainWindow) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) @@ -83,7 +83,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } bool MetalRenderer::BeginFrame(bool mainWindow) @@ -96,68 +96,68 @@ bool MetalRenderer::BeginFrame(bool mainWindow) void MetalRenderer::Flush(bool waitIdle) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::NotifyLatteCommandProcessorIdle() { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void 
MetalRenderer::AppendOverlayDebugInfo() { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); return nullptr; } void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); return nullptr; } void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); return nullptr; } void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) { - 
cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) @@ -171,12 +171,12 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) @@ -186,102 +186,101 @@ LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR phys void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } LatteTextureReadbackInfo* 
MetalRenderer::texture_createReadback(LatteTextureView* textureView) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); return nullptr; } void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::bufferCache_init(const sint32 bufferSize) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) { - //return new 
RendererShaderMtl(this, type, baseHash, auxHash, isGameShader, isGfxPackShader, source); - return nullptr; + return new RendererShaderMtl(this, type, baseHash, auxHash, isGameShader, isGfxPackShader, source); } void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::streamout_begin() { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::streamout_rendererFinishDrawcall() { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::draw_beginSequence() { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); LatteSHRC_UpdateActiveShaders(); } void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void MetalRenderer::draw_endSequence() { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); return nullptr; } void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) { - cemuLog_logDebug(LogType::Force, "not implemented"); + cemuLog_log(LogType::MetalLogging, "not implemented"); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 
49e11be92..d43cc2be9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -39,7 +39,7 @@ class MetalRenderer : public Renderer void SwapBuffers(bool swapTV, bool swapDRC) override; void HandleScreenshotRequest(LatteTextureView* texView, bool padView) override { - cemuLog_logDebug(LogType::Force, "Screenshots are not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "Screenshots are not yet supported on Metal"); } void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, @@ -53,27 +53,27 @@ class MetalRenderer : public Renderer // imgui bool ImguiBegin(bool mainWindow) override { - cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); return false; }; void ImguiEnd() override { - cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); }; ImTextureID GenerateTexture(const std::vector& data, const Vector2i& size) override { - cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); return nullptr; }; void DeleteTexture(ImTextureID id) override { - cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); }; void DeleteFontTextures() override { - cemuLog_logDebug(LogType::Force, "Imgui is not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); }; void AppendOverlayDebugInfo() override; @@ -135,21 +135,21 @@ class MetalRenderer : public Renderer // occlusion queries LatteQueryObject* occlusionQuery_create() override { - cemuLog_logDebug(LogType::Force, "Occlusion queries are not yet supported on Metal"); + 
cemuLog_log(LogType::MetalLogging, "Occlusion queries are not yet supported on Metal"); return nullptr; } void occlusionQuery_destroy(LatteQueryObject* queryObj) override { - cemuLog_logDebug(LogType::Force, "Occlusion queries are not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "Occlusion queries are not yet supported on Metal"); } void occlusionQuery_flush() override { - cemuLog_logDebug(LogType::Force, "Occlusion queries are not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "Occlusion queries are not yet supported on Metal"); } void occlusionQuery_updateState() override { - cemuLog_logDebug(LogType::Force, "Occlusion queries are not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "Occlusion queries are not yet supported on Metal"); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 50033d5c9..28b386123 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -9,11 +9,12 @@ RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type MTL::Library* library = mtlRenderer->GetDevice()->newLibrary(NS::String::string(mslCode.c_str(), NS::ASCIIStringEncoding), nullptr, &error); if (error) { - cemuLog_log(LogType::MetalLogging, "Failed to create library (error: {}) -> source:\n{}", error->localizedDescription()->utf8String(), mslCode.c_str()); + printf("Failed to create library (error: %s) -> source:\n%s", error->localizedDescription()->utf8String(), mslCode.c_str()); error->release(); return; } m_function = library->newFunction(NS::String::string("main0", NS::ASCIIStringEncoding)); + library->release(); } RendererShaderMtl::~RendererShaderMtl() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index 030bbff0b..e440d4dc2 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -29,10 +29,21 @@ class RendererShaderMtl : public RendererShader return m_function; } - // TODO: implement these - sint32 GetUniformLocation(const char* name) override { return 0; } - void SetUniform2fv(sint32 location, void* data, sint32 count) override {} - void SetUniform4iv(sint32 location, void* data, sint32 count) override {} + sint32 GetUniformLocation(const char* name) override + { + cemu_assert_suspicious(); + return 0; + } + + void SetUniform2fv(sint32 location, void* data, sint32 count) override + { + cemu_assert_suspicious(); + } + + void SetUniform4iv(sint32 location, void* data, sint32 count) override + { + cemu_assert_suspicious(); + } // TODO: implement this void PreponeCompilation(bool isRenderThread) override {} From 4976ff3084c91ade0b6d9a8a8c64d7bd6d51f8f8 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 27 Jul 2024 11:36:51 +0200 Subject: [PATCH 009/368] fix: sampling & capture boundaries --- src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp | 4 +- .../LatteDecompilerEmitMSL.cpp | 43 +++-- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 37 ++-- .../Renderer/Metal/LatteTextureViewMtl.cpp | 12 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 162 ++++++++++-------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 84 +++++++-- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 12 ++ .../Latte/Renderer/Vulkan/VulkanRenderer.cpp | 2 +- 8 files changed, 234 insertions(+), 122 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp b/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp index c06a3bf18..b80bd869c 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp @@ -602,7 +602,7 @@ void LatteTextureLoader_loadTextureDataIntoSlice(LatteTexture* hostTexture, sint void LatteTextureLoader_UpdateTextureSliceData(LatteTexture* tex, uint32 sliceIndex, uint32 mipIndex, MPTR physImagePtr, 
MPTR physMipPtr, Latte::E_DIM dim, uint32 width, uint32 height, uint32 depth, uint32 mipLevels, uint32 pitch, Latte::E_HWTILEMODE tileMode, uint32 swizzle, bool dumpTex) { LatteTextureLoaderCtx textureLoader = { 0 }; - + Latte::E_GX2SURFFMT format = tex->format; LatteTextureLoader_begin(&textureLoader, sliceIndex, mipIndex, physImagePtr, physMipPtr, format, dim, width, height, depth, mipLevels, pitch, tileMode, swizzle); @@ -853,7 +853,7 @@ void LatteTextureLoader_writeReadbackTextureToMemory(LatteTextureDefinition* tex pixelInput += 4; } } - } + } else { cemuLog_logDebug(LogType::Force, "Texture readback unsupported format {:04x} for tileMode 0x{:02x}", (uint32)textureData->format, textureData->tileMode); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 9981d5ea4..76cd6a18b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2335,14 +2335,14 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if (texDim == Latte::E_DIM::DIM_2D_ARRAY) { // 3 coords + compare value (as float4) - src->add("float4("); + src->add("float3("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(","); + src->add(", "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(","); + src->add(", "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); - src->addFmt(",{})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); + src->addFmt("), {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, 
tempBuffer0)); } else if (texDim == Latte::E_DIM::DIM_CUBEMAP) { @@ -2364,7 +2364,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex { debugBreakpoint(); } - src->addFmt("float3({},0.0,{})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + src->addFmt("{}, {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); } else { @@ -2373,17 +2373,27 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex { debugBreakpoint(); } - src->addFmt("float3({}, {})", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + src->addFmt("float2({}), {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); } } - else if( texDim == Latte::E_DIM::DIM_3D || texDim == Latte::E_DIM::DIM_2D_ARRAY ) + else if(texDim == Latte::E_DIM::DIM_2D_ARRAY) { // 3 coords - src->add("float3("); + src->add("float2("); _emitTEXSampleCoordInputComponent(shaderContext, 
texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(","); + src->add(", "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(","); + src->add("), "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + } + else if(texDim == Latte::E_DIM::DIM_3D) + { + // 3 coords + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(")"); } @@ -2392,11 +2402,9 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // 2 coords + faceId cemu_assert_debug(texInstruction->textureFetch.srcSel[0] < 4); cemu_assert_debug(texInstruction->textureFetch.srcSel[1] < 4); - src->add("float4("); src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); - src->add(")"); - src->addFmt(",cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + src->addFmt(", cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index } else if( texDim == Latte::E_DIM::DIM_1D ) { @@ -3161,7 +3169,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe src->add(") == false) discard;" _CRLF); } // pixel color output - src->addFmt("passPixelColor{} = ", pixelColorOutputIndex); + src->addFmt("out.passPixelColor{} = ", pixelColorOutputIndex); _emitExportGPRReadCode(shaderContext, cfInstruction, 
LATTE_DECOMPILER_DTYPE_FLOAT, i); src->add(";" _CRLF); @@ -3706,6 +3714,13 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon " return as_type(0.0);\r\n" "return as_type(clamp(as_type(v), 0.0, 1.0));\r\n" "}\r\n"); + + // round even + fCStr_shaderSource->add("" + "float roundEven(float x) {\r\n" + "return round(x / 2.0) * 2.0;\r\n" + "}\r\n"); + // mul non-ieee way (0*NaN/INF => 0.0) if (shaderContext->options->strictMul) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index bf6cab245..bbd714d9a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -46,22 +46,29 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM } desc->setUsage(usage); - if (dim == Latte::E_DIM::DIM_2D) - desc->setTextureType(MTL::TextureType2D); - else if (dim == Latte::E_DIM::DIM_1D) - desc->setTextureType(MTL::TextureType1D); - else if (dim == Latte::E_DIM::DIM_3D) - desc->setTextureType(MTL::TextureType3D); - else if (dim == Latte::E_DIM::DIM_2D_ARRAY) + switch (dim) + { + case Latte::E_DIM::DIM_1D: + desc->setTextureType(MTL::TextureType1D); + break; + case Latte::E_DIM::DIM_2D: + case Latte::E_DIM::DIM_2D_MSAA: + desc->setTextureType(MTL::TextureType2D); + break; + case Latte::E_DIM::DIM_2D_ARRAY: desc->setTextureType(MTL::TextureType2DArray); - else if (dim == Latte::E_DIM::DIM_CUBEMAP) - desc->setTextureType(MTL::TextureTypeCube); // TODO: is this correct? 
- else if (dim == Latte::E_DIM::DIM_2D_MSAA) - desc->setTextureType(MTL::TextureType2D); - else - { - cemu_assert_unimplemented(); - } + break; + case Latte::E_DIM::DIM_3D: + desc->setTextureType(MTL::TextureType3D); + break; + case Latte::E_DIM::DIM_CUBEMAP: + desc->setTextureType(MTL::TextureTypeCube); // TODO: check this + break; + default: + cemu_assert_unimplemented(); + desc->setTextureType(MTL::TextureType2D); + break; + } m_texture = mtlRenderer->GetDevice()->newTexture(desc); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index c8df8cf62..37399fca1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -1,29 +1,34 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_format(format) { - // TODO: don't hardcode the format - MTL::PixelFormat pixelFormat = MTL::PixelFormatRGBA8Unorm; MTL::TextureType textureType; switch (dim) { case Latte::E_DIM::DIM_1D: textureType = MTL::TextureType1D; + break; case Latte::E_DIM::DIM_2D: case Latte::E_DIM::DIM_2D_MSAA: textureType = MTL::TextureType2D; + break; case Latte::E_DIM::DIM_2D_ARRAY: textureType = MTL::TextureType2DArray; + break; case Latte::E_DIM::DIM_3D: textureType = MTL::TextureType3D; + break; case Latte::E_DIM::DIM_CUBEMAP: textureType = MTL::TextureTypeCube; // TODO: check this + break; default: cemu_assert_unimplemented(); textureType = 
MTL::TextureType2D; + break; } uint32 baseLevel = firstMip; @@ -47,7 +52,8 @@ LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextur // TODO: swizzle - m_texture = texture->GetTexture()->newTextureView(pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount)); + auto formatInfo = GetMtlPixelFormatInfo(format); + m_texture = texture->GetTexture()->newTextureView(formatInfo.pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount)); } LatteTextureViewMtl::~LatteTextureViewMtl() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index a1d398bdc..cc24348ae 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,96 +1,106 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Common/precompiled.h" +#include "Metal/MTLPixelFormat.hpp" -std::map MTL_FORMAT_TABLE = {{ - {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, 2}}, // TODO: correct? 
- {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, 2}}, - {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, 1}}, - {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, 1}}, - {Latte::E_GX2SURFFMT::R8_UINT, {MTL::PixelFormatR8Uint, 1}}, - {Latte::E_GX2SURFFMT::R8_SINT, {MTL::PixelFormatR8Sint, 1}}, - {Latte::E_GX2SURFFMT::R8_G8_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_SNORM, {MTL::PixelFormatRG8Snorm, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_UINT, {MTL::PixelFormatRG8Uint, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_SINT, {MTL::PixelFormatRG8Sint, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM, {MTL::PixelFormatRGBA8Unorm, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM, {MTL::PixelFormatRGBA8Snorm, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT, {MTL::PixelFormatRGBA8Uint, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, 4}}, // TODO: correct? 
- {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, 2}}, - {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, 2}}, - {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, 2}}, - {Latte::E_GX2SURFFMT::R16_SINT, {MTL::PixelFormatR16Sint, 2}}, - {Latte::E_GX2SURFFMT::R16_FLOAT, {MTL::PixelFormatR16Float, 2}}, - {Latte::E_GX2SURFFMT::R16_G16_UNORM, {MTL::PixelFormatRG16Unorm, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_SNORM, {MTL::PixelFormatRG16Snorm, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_UINT, {MTL::PixelFormatRG16Uint, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_SINT, {MTL::PixelFormatRG16Sint, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_FLOAT, {MTL::PixelFormatRG16Float, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM, {MTL::PixelFormatRGBA16Unorm, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM, {MTL::PixelFormatRGBA16Snorm, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, 8}}, - {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, 4}}, - {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, 4}}, - {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, 4}}, - {Latte::E_GX2SURFFMT::R32_FLOAT, {MTL::PixelFormatR32Float, 4}}, - {Latte::E_GX2SURFFMT::R32_G32_UINT, {MTL::PixelFormatRG32Uint, 8}}, - {Latte::E_GX2SURFFMT::R32_G32_SINT, {MTL::PixelFormatRG32Sint, 8}}, - {Latte::E_GX2SURFFMT::R32_G32_FLOAT, {MTL::PixelFormatRG32Float, 8}}, - 
{Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, 16}}, - {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, 16}}, - {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, 16}}, - {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, 4}}, // TODO: not supported on Apple sillicon, maybe find something else - {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 5}}, - {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, 2}}, - {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, 4}}, - {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC2_SRGB, {MTL::PixelFormatBC2_RGBA_sRGB, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC3_UNORM, {MTL::PixelFormatBC3_RGBA, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC3_SRGB, {MTL::PixelFormatBC3_RGBA_sRGB, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC4_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::BC4_SNORM, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::BC5_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatInvalid, 0}}, // TODO -}}; +std::map MTL_FORMAT_TABLE = { + {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, 2}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, 2}}, + {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, 1}}, + {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, 1}}, + {Latte::E_GX2SURFFMT::R8_UINT, {MTL::PixelFormatR8Uint, 1}}, + {Latte::E_GX2SURFFMT::R8_SINT, {MTL::PixelFormatR8Sint, 1}}, + {Latte::E_GX2SURFFMT::R8_G8_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SNORM, {MTL::PixelFormatRG8Snorm, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_UINT, {MTL::PixelFormatRG8Uint, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SINT, {MTL::PixelFormatRG8Sint, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM, {MTL::PixelFormatRGBA8Unorm, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM, {MTL::PixelFormatRGBA8Snorm, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT, {MTL::PixelFormatRGBA8Uint, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, 8}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, 8}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, 4}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, 2}}, + {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, 2}}, + {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, 2}}, + {Latte::E_GX2SURFFMT::R16_SINT, {MTL::PixelFormatR16Sint, 2}}, + {Latte::E_GX2SURFFMT::R16_FLOAT, {MTL::PixelFormatR16Float, 2}}, + {Latte::E_GX2SURFFMT::R16_G16_UNORM, {MTL::PixelFormatRG16Unorm, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SNORM, {MTL::PixelFormatRG16Snorm, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_UINT, {MTL::PixelFormatRG16Uint, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SINT, {MTL::PixelFormatRG16Sint, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_FLOAT, {MTL::PixelFormatRG16Float, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM, {MTL::PixelFormatRGBA16Unorm, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM, {MTL::PixelFormatRGBA16Snorm, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, 8}}, + {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatInvalid, 0}}, // TODO + {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, 4}}, + {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, 4}}, + {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, 4}}, + {Latte::E_GX2SURFFMT::R32_FLOAT, {MTL::PixelFormatR32Float, 4}}, + {Latte::E_GX2SURFFMT::R32_G32_UINT, {MTL::PixelFormatRG32Uint, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_SINT, {MTL::PixelFormatRG32Sint, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_FLOAT, {MTL::PixelFormatRG32Float, 8}}, + 
{Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, 16}}, + {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, 4}}, // TODO: not supported on Apple sillicon, maybe find something else + {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 5}}, + {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, 2}}, + {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, 4}}, + {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_SRGB, {MTL::PixelFormatBC2_RGBA_sRGB, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_UNORM, {MTL::PixelFormatBC3_RGBA, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_SRGB, {MTL::PixelFormatBC3_RGBA_sRGB, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC4_UNORM, {MTL::PixelFormatBC4_RUnorm, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC4_SNORM, {MTL::PixelFormatBC4_RSnorm, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC5_UNORM, {MTL::PixelFormatBC5_RGUnorm, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatBC5_RGSnorm, 16, {4, 4}}}, // TODO: correct? 
+}; -const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format) { +const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format) +{ cemu_assert_debug(static_cast(format) < MTL_FORMAT_TABLE.size()); - return MTL_FORMAT_TABLE[format]; + MtlPixelFormatInfo formatInfo = MTL_FORMAT_TABLE[format]; + if (formatInfo.pixelFormat == MTL::PixelFormatInvalid) + { + printf("invalid pixel format: %i\n", (int)format); + } + + return formatInfo; } inline uint32 CeilDivide(uint32 a, uint32 b) { return (a + b - 1) / b; } -size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, uint32 width) { +size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, uint32 width) +{ const auto& formatInfo = GetMtlPixelFormatInfo(format); return CeilDivide(width, formatInfo.blockTexelSize.x) * formatInfo.bytesPerBlock; } -size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, uint32 height, size_t bytesPerRow) { +size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, uint32 height, size_t bytesPerRow) +{ const auto& formatInfo = GetMtlPixelFormatInfo(format); return CeilDivide(height, formatInfo.blockTexelSize.y) * bytesPerRow; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a5b7e3a66..0923b9ef3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -7,6 +7,8 @@ #include "HW/Latte/Core/LatteShader.h" #include "gui/guiWrapper.h" +extern bool hasValidFramebufferAttached; + MetalRenderer::MetalRenderer() { m_device = MTL::CreateSystemDefaultDevice(); @@ -66,17 +68,27 @@ void MetalRenderer::DrawEmptyFrame(bool mainWindow) void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) { + CA::MetalDrawable* drawable = m_metalLayer->nextDrawable(); - if (!drawable) + if (drawable) + { + ensureCommandBuffer(); + m_commandBuffer->presentDrawable(drawable); + } else { - return; + printf("skipped present!\n"); } - 
m_commandBuffer->presentDrawable(drawable); - m_commandBuffer->commit(); + if (m_commandBuffer) + { + m_commandBuffer->commit(); + + m_commandBuffer->release(); + m_commandBuffer = nullptr; - m_commandBuffer->release(); - m_commandBuffer = nullptr; + // Debug + m_commandQueue->insertDebugCaptureBoundary(); + } } void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, @@ -88,8 +100,6 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput bool MetalRenderer::BeginFrame(bool mainWindow) { - m_commandBuffer = m_commandQueue->commandBuffer(); - // TODO return false; } @@ -162,6 +172,7 @@ void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIn void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { + std::cout << "TEXTURE LOAD SLICE" << std::endl; auto mtlTexture = (LatteTextureMtl*)hostTexture; size_t bytesPerRow = GetMtlTextureBytesPerRow(mtlTexture->GetFormat(), width); @@ -191,6 +202,7 @@ void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint3 void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) { + std::cout << "TEXTURE COPY IMAGE SUBDATA" << std::endl; cemuLog_log(LogType::MetalLogging, "not implemented"); } @@ -258,9 +270,59 @@ void MetalRenderer::streamout_rendererFinishDrawcall() void MetalRenderer::draw_beginSequence() { - cemuLog_log(LogType::MetalLogging, "not implemented"); - - LatteSHRC_UpdateActiveShaders(); + skipDraws = false; + + // update shader state + LatteSHRC_UpdateActiveShaders(); + if (LatteGPUState.activeShaderHasError) + 
{ + cemuLog_logDebugOnce(LogType::Force, "Skipping drawcalls due to shader error"); + skipDraws = true; + cemu_assert_debug(false); + return; + } + + // update render target and texture state + LatteGPUState.requiresTextureBarrier = false; + while (true) + { + LatteGPUState.repeatTextureInitialization = false; + if (!LatteMRT::UpdateCurrentFBO()) + { + debug_printf("Rendertarget invalid\n"); + skipDraws = true; + return; // no render target + } + + if (!hasValidFramebufferAttached) + { + debug_printf("Drawcall with no color buffer or depth buffer attached\n"); + skipDraws = true; + return; // no render target + } + LatteTexture_updateTextures(); + if (!LatteGPUState.repeatTextureInitialization) + break; + } + + // apply render target + // HACK: not implemented yet + //LatteMRT::ApplyCurrentState(); + + // viewport and scissor box + LatteRenderTarget_updateViewport(); + LatteRenderTarget_updateScissorBox(); + + // check for conditions which would turn the drawcalls into no-ops + bool rasterizerEnable = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL() == false; + + // GX2SetSpecialState(0, true) enables DX_RASTERIZATION_KILL, but still expects depth writes to happen? 
-> Research which stages are disabled by DX_RASTERIZATION_KILL exactly + // for now we use a workaround: + if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizerEnable = true; + + if (!rasterizerEnable == false) + skipDraws = true; } void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index d43cc2be9..a9177ab56 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -161,4 +161,16 @@ class MetalRenderer : public Renderer MTL::CommandQueue* m_commandQueue; MTL::CommandBuffer* m_commandBuffer = nullptr; + + // State + bool skipDraws = false; + + // Helpers + void ensureCommandBuffer() + { + if (!m_commandBuffer) + { + m_commandBuffer = m_commandQueue->commandBuffer(); + } + } }; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 9209e3cdd..ec27b4123 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -61,7 +61,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL DebugUtilsCallback(VkDebugUtilsMessageSeverityFla if (strstr(pCallbackData->pMessage, "consumes input location")) return VK_FALSE; // false means we dont care if (strstr(pCallbackData->pMessage, "blend")) - return VK_FALSE; // + return VK_FALSE; // // note: Check if previously used location in VK_EXT_debug_report callback is the same as messageIdNumber under the new extension // validation errors which are difficult to fix From 4c78c6afbb4db49a3ef8e2d5c0384c636d8dca48 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 27 Jul 2024 15:20:51 +0200 Subject: [PATCH 010/368] implement texture decoding --- src/Cafe/CMakeLists.txt | 1 + 
.../HW/Latte/LatteAddrLib/AddrLibFastDecode.h | 2 +- .../LatteDecompilerEmitMSL.cpp | 2 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 3 +- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 2 +- .../Latte/Renderer/Metal/MetalMemoryManager.h | 22 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 203 ++++++++++++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 3 + 8 files changed, 192 insertions(+), 46 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 2a3dda471..f34099dc1 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -547,6 +547,7 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/LatteTextureViewMtl.h HW/Latte/Renderer/Metal/RendererShaderMtl.cpp HW/Latte/Renderer/Metal/RendererShaderMtl.h + HW/Latte/Renderer/Metal/MetalMemoryManager.h ) #target_link_libraries(CemuCafe PRIVATE diff --git a/src/Cafe/HW/Latte/LatteAddrLib/AddrLibFastDecode.h b/src/Cafe/HW/Latte/LatteAddrLib/AddrLibFastDecode.h index b0e2cfb31..b54d6038e 100644 --- a/src/Cafe/HW/Latte/LatteAddrLib/AddrLibFastDecode.h +++ b/src/Cafe/HW/Latte/LatteAddrLib/AddrLibFastDecode.h @@ -381,4 +381,4 @@ void optimizedDecodeLoops(LatteTextureLoaderCtx* textureLoader, uint8* outputDat } } } -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 76cd6a18b..ce48bd63a 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2002,7 +2002,7 @@ static void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, Latt { _emitInstructionOutputVariableName(shaderContext, &aluInstruction); src->add(" = "); - src->add("as_type("); + src->add("as_type(as_type("); _emitInstructionOutputVariableName(shaderContext, &aluInstruction); src->add(")"); if( 
aluInstruction.omod == 1 ) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index cc24348ae..0d7d14c55 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -2,6 +2,7 @@ #include "Common/precompiled.h" #include "Metal/MTLPixelFormat.hpp" +// TODO: separate color and depth formats std::map MTL_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, // TODO: correct? {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, 2}}, // TODO: correct? @@ -82,7 +83,7 @@ const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format) MtlPixelFormatInfo formatInfo = MTL_FORMAT_TABLE[format]; if (formatInfo.pixelFormat == MTL::PixelFormatInvalid) { - printf("invalid pixel format: %i\n", (int)format); + printf("invalid pixel format: %u\n", (uint32)format); } return formatInfo; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index f8b1ee037..e4a821560 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -2,7 +2,7 @@ #include -#include "HW/Latte/ISA/LatteReg.h" +#include "Cafe/HW/Latte/ISA/LatteReg.h" struct Uvec2 { uint32 x; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h new file mode 100644 index 000000000..d767f2322 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -0,0 +1,22 @@ +#pragma once + +#include "Cafe/HW/Latte/ISA/LatteReg.h" + +class MetalMemoryManager +{ +public: + MetalMemoryManager() = default; + + void* GetTextureUploadBuffer(size_t size) + { + if (m_textureUploadBuffer.size() < size) + { + m_textureUploadBuffer.resize(size); + } + + return m_textureUploadBuffer.data(); + } + +private: + std::vector m_textureUploadBuffer; +}; diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 0923b9ef3..40ca1acf5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -36,19 +36,19 @@ void MetalRenderer::Initialize() void MetalRenderer::Shutdown() { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::Shutdown not implemented\n"); } bool MetalRenderer::IsPadWindowActive() { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::IsPadWindowActive not implemented\n"); return false; } bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::GetVRAMInfo not implemented\n"); usageInMB = 1024; totalInMB = 1024; @@ -58,12 +58,12 @@ bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const void MetalRenderer::ClearColorbuffer(bool padView) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::ClearColorbuffer not implemented\n"); } void MetalRenderer::DrawEmptyFrame(bool mainWindow) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::DrawEmptyFrame not implemented\n"); } void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) @@ -95,7 +95,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::DrawBackbufferQuad not implemented\n"); } bool MetalRenderer::BeginFrame(bool mainWindow) @@ -106,68 +106,188 @@ bool MetalRenderer::BeginFrame(bool mainWindow) void MetalRenderer::Flush(bool waitIdle) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::Flush not implemented\n"); } void 
MetalRenderer::NotifyLatteCommandProcessorIdle() { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::NotifyLatteCommandProcessorIdle not implemented\n"); } void MetalRenderer::AppendOverlayDebugInfo() { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::AppendOverlayDebugInfo not implemented\n"); } void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::renderTarget_setViewport not implemented\n"); } void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::renderTarget_setScissor not implemented\n"); } LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::rendertarget_createCachedFBO not implemented\n"); return nullptr; } void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::rendertarget_deleteCachedFBO not implemented\n"); } void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::rendertarget_bindFramebufferObject not implemented\n"); } void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) { - cemuLog_log(LogType::MetalLogging, "not implemented"); - - return nullptr; + return m_memoryManager.GetTextureUploadBuffer(size); } void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::texture_releaseTextureUploadBuffer not implemented\n"); } TextureDecoder* 
MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) { - cemuLog_log(LogType::MetalLogging, "not implemented"); - - return nullptr; + printf("decoding format %u\n", (uint32)format); + // TODO: move to LatteToMtl + if (isDepth) + { + switch (format) + { + case Latte::E_GX2SURFFMT::D24_S8_UNORM: + return TextureDecoder_D24_S8::getInstance(); + case Latte::E_GX2SURFFMT::D24_S8_FLOAT: + return TextureDecoder_NullData64::getInstance(); + case Latte::E_GX2SURFFMT::D32_FLOAT: + return TextureDecoder_R32_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::D16_UNORM: + return TextureDecoder_R16_UNORM::getInstance(); + case Latte::E_GX2SURFFMT::D32_S8_FLOAT: + return TextureDecoder_D32_S8_UINT_X24::getInstance(); + default: + printf("invalid depth texture format %u\n", (uint32)format); + cemu_assert_debug(false); + return nullptr; + } + } else + { + switch (format) + { + case Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT: + return TextureDecoder_R32_G32_B32_A32_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT: + return TextureDecoder_R32_G32_B32_A32_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT: + return TextureDecoder_R16_G16_B16_A16_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT: + return TextureDecoder_R16_G16_B16_A16_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM: + return TextureDecoder_R16_G16_B16_A16::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM: + return TextureDecoder_R16_G16_B16_A16::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM: + return TextureDecoder_R8_G8_B8_A8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM: + return TextureDecoder_R8_G8_B8_A8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB: + return TextureDecoder_R8_G8_B8_A8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT: + return 
TextureDecoder_R8_G8_B8_A8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT: + return TextureDecoder_R8_G8_B8_A8::getInstance(); + case Latte::E_GX2SURFFMT::R32_G32_FLOAT: + return TextureDecoder_R32_G32_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R32_G32_UINT: + return TextureDecoder_R32_G32_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_UNORM: + return TextureDecoder_R16_G16::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_FLOAT: + return TextureDecoder_R16_G16_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_UNORM: + return TextureDecoder_R8_G8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_SNORM: + return TextureDecoder_R8_G8::getInstance(); + case Latte::E_GX2SURFFMT::R4_G4_UNORM: + return TextureDecoder_R4_G4::getInstance(); + case Latte::E_GX2SURFFMT::R32_FLOAT: + return TextureDecoder_R32_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R32_UINT: + return TextureDecoder_R32_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R16_FLOAT: + return TextureDecoder_R16_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R16_UNORM: + return TextureDecoder_R16_UNORM::getInstance(); + case Latte::E_GX2SURFFMT::R16_SNORM: + return TextureDecoder_R16_SNORM::getInstance(); + case Latte::E_GX2SURFFMT::R16_UINT: + return TextureDecoder_R16_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R8_UNORM: + return TextureDecoder_R8::getInstance(); + case Latte::E_GX2SURFFMT::R8_SNORM: + return TextureDecoder_R8::getInstance(); + case Latte::E_GX2SURFFMT::R8_UINT: + return TextureDecoder_R8_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R5_G6_B5_UNORM: + return TextureDecoder_R5_G6_B5_swappedRB::getInstance(); + case Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM: + return TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance(); + case Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM: + return TextureDecoder_A1_B5_G5_R5_UNORM_vulkan::getInstance(); + case Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT: + return TextureDecoder_R11_G11_B10_FLOAT::getInstance(); + 
case Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM: + return TextureDecoder_R4_G4_B4_A4_UNORM::getInstance(); + case Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM: + return TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); + case Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM: + return TextureDecoder_R10_G10_B10_A2_SNORM_To_RGBA16::getInstance(); + case Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB: + return TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); + case Latte::E_GX2SURFFMT::BC1_SRGB: + return TextureDecoder_BC1::getInstance(); + case Latte::E_GX2SURFFMT::BC1_UNORM: + return TextureDecoder_BC1::getInstance(); + case Latte::E_GX2SURFFMT::BC2_UNORM: + return TextureDecoder_BC2::getInstance(); + case Latte::E_GX2SURFFMT::BC2_SRGB: + return TextureDecoder_BC2::getInstance(); + case Latte::E_GX2SURFFMT::BC3_UNORM: + return TextureDecoder_BC3::getInstance(); + case Latte::E_GX2SURFFMT::BC3_SRGB: + return TextureDecoder_BC3::getInstance(); + case Latte::E_GX2SURFFMT::BC4_UNORM: + return TextureDecoder_BC4::getInstance(); + case Latte::E_GX2SURFFMT::BC4_SNORM: + return TextureDecoder_BC4::getInstance(); + case Latte::E_GX2SURFFMT::BC5_UNORM: + return TextureDecoder_BC5::getInstance(); + case Latte::E_GX2SURFFMT::BC5_SNORM: + return TextureDecoder_BC5::getInstance(); + case Latte::E_GX2SURFFMT::R24_X8_UNORM: + return TextureDecoder_R24_X8::getInstance(); + case Latte::E_GX2SURFFMT::X24_G8_UINT: + return TextureDecoder_X24_G8_UINT::getInstance(); // todo - verify + default: + printf("invalid color texture format %u\n", (uint32)format); + cemu_assert_debug(false); + return nullptr; + } + } } void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::texture_clearSlice not implemented\n"); } void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 
compressedImageSize) @@ -182,12 +302,12 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::texture_clearColorSlice not implemented\n"); } void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::texture_clearDepthSlice not implemented\n"); } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) @@ -197,55 +317,54 @@ LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR phys void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::texture_setLatteTexture not implemented\n"); } void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) { - std::cout << "TEXTURE COPY IMAGE SUBDATA" << std::endl; - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::texture_copyImageSubData not implemented\n"); } LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + 
printf("MetalRenderer::texture_createReadback not implemented\n"); return nullptr; } void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion not implemented\n"); } void MetalRenderer::bufferCache_init(const sint32 bufferSize) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::bufferCache_init not implemented\n"); } void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::bufferCache_upload not implemented\n"); } void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::bufferCache_copy not implemented\n"); } void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::bufferCache_copyStreamoutToMainBuffer not implemented\n"); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::buffer_bindVertexBuffer not implemented\n"); } void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::buffer_bindUniformBuffer not implemented\n"); } RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) @@ -255,17 +374,17 
@@ RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, ui void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::streamout_setupXfbBuffer not implemented\n"); } void MetalRenderer::streamout_begin() { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::streamout_begin not implemented\n"); } void MetalRenderer::streamout_rendererFinishDrawcall() { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::streamout_rendererFinishDrawcall not implemented\n"); } void MetalRenderer::draw_beginSequence() @@ -327,22 +446,22 @@ void MetalRenderer::draw_beginSequence() void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::draw_execute not implemented\n"); } void MetalRenderer::draw_endSequence() { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::draw_endSequence not implemented\n"); } void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::indexData_reserveIndexMemory not implemented\n"); return nullptr; } void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) { - cemuLog_log(LogType::MetalLogging, "not implemented"); + printf("MetalRenderer::indexData_uploadIndexMemory not implemented\n"); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index a9177ab56..55a681581 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -5,6 +5,7 @@ #include #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" class MetalRenderer : public Renderer { @@ -156,6 +157,8 @@ class MetalRenderer : public Renderer private: CA::MetalLayer* m_metalLayer; + MetalMemoryManager m_memoryManager; + // Metal objects MTL::Device* m_device; MTL::CommandQueue* m_commandQueue; From e5395277a791ef5f67701f7a94fc1ef553fe1d64 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 27 Jul 2024 19:14:00 +0200 Subject: [PATCH 011/368] implement render passes --- src/Cafe/CMakeLists.txt | 2 + .../HW/Latte/Renderer/Metal/CachedFBOMtl.cpp | 37 +++++++ .../HW/Latte/Renderer/Metal/CachedFBOMtl.h | 27 ++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 97 ++++++++++++++++--- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 15 ++- 5 files changed, 164 insertions(+), 14 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index f34099dc1..bfb0d1b50 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -547,6 +547,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/LatteTextureViewMtl.h HW/Latte/Renderer/Metal/RendererShaderMtl.cpp HW/Latte/Renderer/Metal/RendererShaderMtl.h + HW/Latte/Renderer/Metal/CachedFBOMtl.cpp + HW/Latte/Renderer/Metal/CachedFBOMtl.h HW/Latte/Renderer/Metal/MetalMemoryManager.h ) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp new file mode 100644 index 000000000..ac34918e8 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp @@ -0,0 +1,37 @@ +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Metal/MTLRenderPass.hpp" + +void CachedFBOMtl::CreateRenderPass() +{ + 
m_renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + + for (int i = 0; i < 8; ++i) + { + auto& buffer = colorBuffer[i]; + auto textureView = (LatteTextureViewMtl*)buffer.texture; + if (!textureView) + { + continue; + } + auto colorAttachment = m_renderPassDescriptor->colorAttachments()->object(i); + colorAttachment->setTexture(textureView->GetTexture()); + colorAttachment->setLoadAction(MTL::LoadActionLoad); + colorAttachment->setStoreAction(MTL::StoreActionStore); + } + + // setup depth attachment + if (depthBuffer.texture) + { + auto textureView = static_cast(depthBuffer.texture); + auto depthAttachment = m_renderPassDescriptor->depthAttachment(); + depthAttachment->setTexture(textureView->GetTexture()); + depthAttachment->setLoadAction(MTL::LoadActionLoad); + depthAttachment->setStoreAction(MTL::StoreActionStore); + } +} + +CachedFBOMtl::~CachedFBOMtl() +{ + m_renderPassDescriptor->release(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h new file mode 100644 index 000000000..0d926e7ed --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/Core/LatteCachedFBO.h" + +class CachedFBOMtl : public LatteCachedFBO +{ +public: + CachedFBOMtl(uint64 key) : LatteCachedFBO(key) + { + CreateRenderPass(); + } + + ~CachedFBOMtl(); + + MTL::RenderPassDescriptor* GetRenderPassDescriptor() + { + return m_renderPassDescriptor; + } + +private: + MTL::RenderPassDescriptor* m_renderPassDescriptor = nullptr; + + void CreateRenderPass(); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 40ca1acf5..40c331d04 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -2,9 +2,11 @@ #include 
"Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/Core/LatteIndices.h" #include "gui/guiWrapper.h" extern bool hasValidFramebufferAttached; @@ -68,6 +70,12 @@ void MetalRenderer::DrawEmptyFrame(bool mainWindow) void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) { + if (m_renderCommandEncoder) + { + m_renderCommandEncoder->endEncoding(); + m_renderCommandEncoder->release(); + m_renderCommandEncoder = nullptr; + } CA::MetalDrawable* drawable = m_metalLayer->nextDrawable(); if (drawable) @@ -131,19 +139,18 @@ void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, si LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) { - printf("MetalRenderer::rendertarget_createCachedFBO not implemented\n"); - - return nullptr; + return new CachedFBOMtl(key); } -void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) +void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo) { - printf("MetalRenderer::rendertarget_deleteCachedFBO not implemented\n"); + if (cfbo == (LatteCachedFBO*)m_state.activeFBO) + m_state.activeFBO = nullptr; } void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) { - printf("MetalRenderer::rendertarget_bindFramebufferObject not implemented\n"); + m_state.activeFBO = (CachedFBOMtl*)cfbo; } void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) @@ -292,7 +299,6 @@ void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIn void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { - std::cout << "TEXTURE LOAD SLICE" << 
std::endl; auto mtlTexture = (LatteTextureMtl*)hostTexture; size_t bytesPerRow = GetMtlTextureBytesPerRow(mtlTexture->GetFormat(), width); @@ -389,14 +395,14 @@ void MetalRenderer::streamout_rendererFinishDrawcall() void MetalRenderer::draw_beginSequence() { - skipDraws = false; + m_state.skipDrawSequence = false; // update shader state LatteSHRC_UpdateActiveShaders(); if (LatteGPUState.activeShaderHasError) { cemuLog_logDebugOnce(LogType::Force, "Skipping drawcalls due to shader error"); - skipDraws = true; + m_state.skipDrawSequence = true; cemu_assert_debug(false); return; } @@ -409,14 +415,14 @@ void MetalRenderer::draw_beginSequence() if (!LatteMRT::UpdateCurrentFBO()) { debug_printf("Rendertarget invalid\n"); - skipDraws = true; + m_state.skipDrawSequence = true; return; // no render target } if (!hasValidFramebufferAttached) { debug_printf("Drawcall with no color buffer or depth buffer attached\n"); - skipDraws = true; + m_state.skipDrawSequence = true; return; // no render target } LatteTexture_updateTextures(); @@ -441,12 +447,77 @@ void MetalRenderer::draw_beginSequence() rasterizerEnable = true; if (!rasterizerEnable == false) - skipDraws = true; + m_state.skipDrawSequence = true; } void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { - printf("MetalRenderer::draw_execute not implemented\n"); + std::cout << "DRAW" << std::endl; + // TODO: uncomment + //if (m_state.skipDrawSequence) + //{ + // printf("skipping draw\n"); + // return; + //} + + // Render pass + LatteMRT::ApplyCurrentState(); + + if (!m_state.activeFBO) + { + printf("no active FBO, skipping draw\n"); + return; + } + + auto renderPassDescriptor = m_state.activeFBO->GetRenderPassDescriptor(); + m_renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); + + // Shaders + /* + LatteDecompilerShader* vertexShader = 
LatteSHRC_GetActiveVertexShader(); + LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); + + // Render pipeline state + MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(static_cast(vertexShader->shader)->GetFunction()); + renderPipelineDescriptor->setFragmentFunction(static_cast(pixelShader->shader)->GetFunction()); + + NS::Error* error = nullptr; + MTL::RenderPipelineState* renderPipelineState = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); + if (error) + { + printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); + return; + } + + // TODO: bind resources + + const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + // TODO: uncomment + //auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); + + Renderer::INDEX_TYPE hostIndexType; + uint32 hostIndexCount; + uint32 indexMin = 0; + uint32 indexMax = 0; + uint32 indexBufferOffset = 0; + uint32 indexBufferIndex = 0; + LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); + */ + + // Draw + // TODO: uncomment + /* + if (hostIndexType != INDEX_TYPE::NONE) + { + auto mtlIndexType = GetMtlIndexType(hostIndexType); + // TODO: get index buffer + m_renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, 0, instanceCount, baseVertex, baseInstance); + } else + { + m_renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); + } + */ } void MetalRenderer::draw_endSequence() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 55a681581..4adf0984b 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -6,6 +6,17 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" +#include "Metal/MTLRenderCommandEncoder.hpp" + +#define MAX_MTL_BUFFERS 31 + +struct MetalState +{ + bool skipDrawSequence = false; + class CachedFBOMtl* activeFBO = nullptr; + //MTL::Buffer* vertexBuffers[MAX_MTL_BUFFERS] = {nullptr}; + //MTL::Buffer* indexBuffer = nullptr; +}; class MetalRenderer : public Renderer { @@ -163,10 +174,12 @@ class MetalRenderer : public Renderer MTL::Device* m_device; MTL::CommandQueue* m_commandQueue; + // Active objects MTL::CommandBuffer* m_commandBuffer = nullptr; + MTL::RenderCommandEncoder* m_renderCommandEncoder = nullptr; // State - bool skipDraws = false; + MetalState m_state; // Helpers void ensureCommandBuffer() From cb525b22ff9c204cd3b6bf4ace689bd6c5ce8990 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 27 Jul 2024 21:26:26 +0200 Subject: [PATCH 012/368] implement vertex descriptors & draw --- src/Cafe/CMakeLists.txt | 1 + .../LatteDecompilerEmitMSL.cpp | 7 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 88 ++++++++++++++++++ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 9 ++ .../Renderer/Metal/MetalMemoryManager.cpp | 26 ++++++ .../Latte/Renderer/Metal/MetalMemoryManager.h | 27 ++++-- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 90 +++++++++++++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 15 +++- .../Renderer/Metal/RendererShaderMtl.cpp | 14 ++- 9 files changed, 247 insertions(+), 30 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index bfb0d1b50..6dbe8781b 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -549,6 +549,7 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/RendererShaderMtl.h HW/Latte/Renderer/Metal/CachedFBOMtl.cpp 
HW/Latte/Renderer/Metal/CachedFBOMtl.h + HW/Latte/Renderer/Metal/MetalMemoryManager.cpp HW/Latte/Renderer/Metal/MetalMemoryManager.h ) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index ce48bd63a..320d8e24c 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3818,18 +3818,21 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompiler::emitHeader(shaderContext); // helper functions LatteDecompiler_emitHelperFunctions(shaderContext, src); - const char* outputTypeName; + const char* functionType = ""; + const char* outputTypeName = ""; switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: + functionType = "vertex"; outputTypeName = "VertexOut"; break; case LatteConst::ShaderType::Pixel: + functionType = "fragment"; outputTypeName = "FragmentOut"; break; } // start of main - src->addFmt("{} main0(", outputTypeName); + src->addFmt("{} {} main0(", functionType, outputTypeName); LatteDecompiler::emitInputs(shaderContext); src->add(") {" _CRLF); src->addFmt("{} out;" _CRLF, outputTypeName); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 0d7d14c55..7c7b41870 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Common/precompiled.h" #include "Metal/MTLPixelFormat.hpp" +#include "Metal/MTLVertexDescriptor.hpp" // TODO: separate color and depth formats std::map MTL_FORMAT_TABLE = { @@ -106,3 +107,90 @@ size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, uint32 height, siz return CeilDivide(height, formatInfo.blockTexelSize.y) * bytesPerRow; } + +MTL::PrimitiveType 
GetMtlPrimitiveType(LattePrimitiveMode mode) +{ + switch (mode) + { + case LattePrimitiveMode::POINTS: + return MTL::PrimitiveTypePoint; + case LattePrimitiveMode::LINES: + return MTL::PrimitiveTypeLine; + case LattePrimitiveMode::TRIANGLES: + return MTL::PrimitiveTypeTriangle; + case LattePrimitiveMode::TRIANGLE_STRIP: + return MTL::PrimitiveTypeTriangleStrip; + default: + printf("unimplemented primitive type %u\n", (uint32)mode); + cemu_assert_debug(false); + return MTL::PrimitiveTypeTriangle; + } +} + +MTL::VertexFormat GetMtlVertexFormat(uint8 format) +{ + switch (format) + { + case FMT_32_32_32_32_FLOAT: + return MTL::VertexFormatUInt4; + case FMT_32_32_32_FLOAT: + return MTL::VertexFormatUInt3; + case FMT_32_32_FLOAT: + return MTL::VertexFormatUInt2; + case FMT_32_FLOAT: + return MTL::VertexFormatUInt; + case FMT_8_8_8_8: + return MTL::VertexFormatUChar4; + case FMT_8_8_8: + return MTL::VertexFormatUChar3; + case FMT_8_8: + return MTL::VertexFormatUChar2; + case FMT_8: + return MTL::VertexFormatUChar; + case FMT_32_32_32_32: + return MTL::VertexFormatUInt4; + case FMT_32_32_32: + return MTL::VertexFormatUInt3; + case FMT_32_32: + return MTL::VertexFormatUInt2; + case FMT_32: + return MTL::VertexFormatUInt; + case FMT_16_16_16_16: + return MTL::VertexFormatUShort4; // verified to match OpenGL + case FMT_16_16_16: + return MTL::VertexFormatUShort3; + case FMT_16_16: + return MTL::VertexFormatUShort2; + case FMT_16: + return MTL::VertexFormatUShort; + case FMT_16_16_16_16_FLOAT: + return MTL::VertexFormatUShort4; // verified to match OpenGL + case FMT_16_16_16_FLOAT: + return MTL::VertexFormatUShort3; + case FMT_16_16_FLOAT: + return MTL::VertexFormatUShort2; + case FMT_16_FLOAT: + return MTL::VertexFormatUShort; + case FMT_2_10_10_10: + return MTL::VertexFormatUInt; // verified to match OpenGL + default: + printf("unsupported vertex format: %u\n", (uint32)format); + assert_dbg(); + return MTL::VertexFormatInvalid; + } +} + +MTL::IndexType 
GetMtlIndexType(Renderer::INDEX_TYPE indexType) +{ + switch (indexType) + { + case Renderer::INDEX_TYPE::U16: + return MTL::IndexTypeUInt16; + case Renderer::INDEX_TYPE::U32: + return MTL::IndexTypeUInt32; + default: + printf("unsupported index type: %u\n", (uint32)indexType); + assert_dbg(); + return MTL::IndexTypeUInt32; + } +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index e4a821560..a0c1b9395 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -3,6 +3,9 @@ #include #include "Cafe/HW/Latte/ISA/LatteReg.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +//#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" struct Uvec2 { uint32 x; @@ -20,3 +23,9 @@ const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format); size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, uint32 width); size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, uint32 height, size_t bytesPerRow); + +MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode mode); + +MTL::VertexFormat GetMtlVertexFormat(uint8 format); + +MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp new file mode 100644 index 000000000..e85ede402 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -0,0 +1,26 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +void* MetalMemoryManager::GetTextureUploadBuffer(size_t size) +{ + if (m_textureUploadBuffer.size() < size) + { + m_textureUploadBuffer.resize(size); + } + + return m_textureUploadBuffer.data(); +} + +// TODO: optimize this +MetalBufferAllocation MetalMemoryManager::GetBufferAllocation(size_t size) +{ + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModeShared); + + 
MetalBufferAllocation allocation; + allocation.bufferIndex = m_buffers.size(); + allocation.bufferOffset = 0; + allocation.data = buffer->contents(); + + m_buffers.push_back(buffer); + + return allocation; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index d767f2322..89ccd2442 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -1,22 +1,33 @@ #pragma once +#include + #include "Cafe/HW/Latte/ISA/LatteReg.h" +struct MetalBufferAllocation +{ + void* data; + uint32 bufferIndex; + size_t bufferOffset; +}; + class MetalMemoryManager { public: - MetalMemoryManager() = default; + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} - void* GetTextureUploadBuffer(size_t size) + MTL::Buffer* GetBuffer(uint32 bufferIndex) { - if (m_textureUploadBuffer.size() < size) - { - m_textureUploadBuffer.resize(size); - } - - return m_textureUploadBuffer.data(); + return m_buffers[bufferIndex]; } + void* GetTextureUploadBuffer(size_t size); + + MetalBufferAllocation GetBufferAllocation(size_t size); + private: + class MetalRenderer* m_mtlr; + std::vector m_textureUploadBuffer; + std::vector m_buffers; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 40c331d04..21b8f9fa0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -5,8 +5,10 @@ #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" +#include "Metal/MTLVertexDescriptor.hpp" #include "gui/guiWrapper.h" extern bool hasValidFramebufferAttached; @@ -15,10 +17,14 @@ 
MetalRenderer::MetalRenderer() { m_device = MTL::CreateSystemDefaultDevice(); m_commandQueue = m_device->newCommandQueue(); + + m_memoryManager = new MetalMemoryManager(this); } MetalRenderer::~MetalRenderer() { + delete m_memoryManager; + m_commandQueue->release(); m_device->release(); } @@ -155,7 +161,7 @@ void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) { - return m_memoryManager.GetTextureUploadBuffer(size); + return m_memoryManager->GetTextureUploadBuffer(size); } void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) @@ -431,8 +437,7 @@ void MetalRenderer::draw_beginSequence() } // apply render target - // HACK: not implemented yet - //LatteMRT::ApplyCurrentState(); + LatteMRT::ApplyCurrentState(); // viewport and scissor box LatteRenderTarget_updateViewport(); @@ -453,6 +458,8 @@ void MetalRenderer::draw_beginSequence() void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { std::cout << "DRAW" << std::endl; + + ensureCommandBuffer(); // TODO: uncomment //if (m_state.skipDrawSequence) //{ @@ -461,8 +468,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 //} // Render pass - LatteMRT::ApplyCurrentState(); - if (!m_state.activeFBO) { printf("no active FBO, skipping draw\n"); @@ -470,17 +475,70 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } auto renderPassDescriptor = m_state.activeFBO->GetRenderPassDescriptor(); - m_renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); + beginRenderPassIfNeeded(renderPassDescriptor); // Shaders - /* LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); + auto fetchShader = 
vertexShader->compatibleFetchShader; + + // Vertex descriptor + MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); + for (auto& bufferGroup : fetchShader->bufferGroups) + { + std::optional fetchType; + + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& attr = bufferGroup.attrib[j]; + + uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not used? + + auto attribute = vertexDescriptor->attributes()->object(semanticId); + attribute->setOffset(attr.offset); + // Bind from the end to not conflict with uniform buffers + attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); + attribute->setFormat(GetMtlVertexFormat(attr.format)); + + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; + + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } + + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + // TODO: is LatteGPUState.contextNew correct? 
+ uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + printf("buffer %u has stride %u bytes\n", bufferIndex, bufferStride); + + auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + layout->setStride(bufferStride); + if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerVertex); + else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerInstance); + else + { + printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); + cemu_assert(false); + } + } + // Render pipeline state MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); renderPipelineDescriptor->setVertexFunction(static_cast(vertexShader->shader)->GetFunction()); renderPipelineDescriptor->setFragmentFunction(static_cast(pixelShader->shader)->GetFunction()); + // TODO: don't always set the vertex descriptor + renderPipelineDescriptor->setVertexDescriptor(vertexDescriptor); NS::Error* error = nullptr; MTL::RenderPipelineState* renderPipelineState = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); @@ -489,12 +547,12 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); return; } + m_renderCommandEncoder->setRenderPipelineState(renderPipelineState); // TODO: bind resources const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); - // TODO: uncomment - //auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); + auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); Renderer::INDEX_TYPE hostIndexType; uint32 hostIndexCount; @@ -503,21 +561,17 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 
baseInstance, uint32 uint32 indexBufferOffset = 0; uint32 indexBufferIndex = 0; LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); - */ // Draw - // TODO: uncomment - /* if (hostIndexType != INDEX_TYPE::NONE) { auto mtlIndexType = GetMtlIndexType(hostIndexType); - // TODO: get index buffer + MTL::Buffer* indexBuffer = m_memoryManager->GetBuffer(indexBufferIndex); m_renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, 0, instanceCount, baseVertex, baseInstance); } else { m_renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); } - */ } void MetalRenderer::draw_endSequence() @@ -527,9 +581,11 @@ void MetalRenderer::draw_endSequence() void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { - printf("MetalRenderer::indexData_reserveIndexMemory not implemented\n"); + auto allocation = m_memoryManager->GetBufferAllocation(size); + offset = allocation.bufferOffset; + bufferIndex = allocation.bufferIndex; - return nullptr; + return allocation.data; } void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 4adf0984b..0b1063f90 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -7,8 +7,10 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Metal/MTLRenderCommandEncoder.hpp" +#include "Metal/MTLRenderPass.hpp" #define MAX_MTL_BUFFERS 31 +#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 1) struct MetalState { @@ -168,7 +170,7 @@ class MetalRenderer : public Renderer private: CA::MetalLayer* 
m_metalLayer; - MetalMemoryManager m_memoryManager; + MetalMemoryManager* m_memoryManager; // Metal objects MTL::Device* m_device; @@ -186,7 +188,18 @@ class MetalRenderer : public Renderer { if (!m_commandBuffer) { + // Debug + m_commandQueue->insertDebugCaptureBoundary(); + m_commandBuffer = m_commandQueue->commandBuffer(); } } + + void beginRenderPassIfNeeded(MTL::RenderPassDescriptor* renderPassDescriptor) + { + if (!m_renderCommandEncoder) + { + m_renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); + } + } }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 28b386123..b870fc686 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cemu/Logging/CemuLogging.h" +#include "Metal/MTLFunctionDescriptor.hpp" RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader) @@ -9,11 +10,20 @@ RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type MTL::Library* library = mtlRenderer->GetDevice()->newLibrary(NS::String::string(mslCode.c_str(), NS::ASCIIStringEncoding), nullptr, &error); if (error) { - printf("Failed to create library (error: %s) -> source:\n%s", error->localizedDescription()->utf8String(), mslCode.c_str()); + printf("Failed to create library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); + error->release(); + return; + } + MTL::FunctionDescriptor* desc = MTL::FunctionDescriptor::alloc()->init(); + desc->setName(NS::String::string("main0", NS::ASCIIStringEncoding)); + 
error = nullptr; + m_function = library->newFunction(desc, &error); + if (error) + { + printf("Failed to create function (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); return; } - m_function = library->newFunction(NS::String::string("main0", NS::ASCIIStringEncoding)); library->release(); } From 7ea18d8a5f62dc7133c41e92e49ca49f53a89277 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 28 Jul 2024 07:36:32 +0200 Subject: [PATCH 013/368] bind resources to shaders --- .../LatteDecompilerEmitMSLHeader.hpp | 2 +- .../Renderer/Metal/MetalMemoryManager.cpp | 33 +++ .../Latte/Renderer/Metal/MetalMemoryManager.h | 11 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 221 +++++++++++++++++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 18 +- 5 files changed, 271 insertions(+), 14 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 45c886314..3bc8796e2 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -367,7 +367,7 @@ namespace LatteDecompiler break; } - src->add(" in [[stage_in]], constant SupportBuffer& supportBuffer [[buffer(29)]]"); + src->add(" in [[stage_in]], constant SupportBuffer& supportBuffer [[buffer(30)]]"); switch (decompilerContext->shaderType) { case LatteConst::ShaderType::Vertex: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index e85ede402..9ec94c0e3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -24,3 +24,36 @@ MetalBufferAllocation MetalMemoryManager::GetBufferAllocation(size_t size) return allocation; } + +void MetalMemoryManager::InitBufferCache(size_t size) +{ + if (m_bufferCache) + { + 
printf("MetalMemoryManager::InitBufferCache: buffer cache already initialized\n"); + return; + } + + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModeShared); +} + +void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) +{ + if (!m_bufferCache) + { + printf("MetalMemoryManager::UploadToBufferCache: buffer cache not initialized\n"); + return; + } + + memcpy((uint8*)m_bufferCache->contents() + offset, data, size); +} + +void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) +{ + if (!m_bufferCache) + { + printf("MetalMemoryManager::CopyBufferCache: buffer cache not initialized\n"); + return; + } + + memcpy((uint8*)m_bufferCache->contents() + dstOffset, (uint8*)m_bufferCache->contents() + srcOffset, size); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 89ccd2442..ee773047c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -21,13 +21,24 @@ class MetalMemoryManager return m_buffers[bufferIndex]; } + MTL::Buffer* GetBufferCache() + { + return m_bufferCache; + } + void* GetTextureUploadBuffer(size_t size); MetalBufferAllocation GetBufferAllocation(size_t size); + // Buffer cache + void InitBufferCache(size_t size); + void UploadToBufferCache(const void* data, size_t offset, size_t size); + void CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size); + private: class MetalRenderer* m_mtlr; std::vector m_textureUploadBuffer; std::vector m_buffers; + MTL::Buffer* m_bufferCache = nullptr; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 21b8f9fa0..0bd17f098 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1,6 +1,7 @@ #include 
"Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" @@ -13,6 +14,8 @@ extern bool hasValidFramebufferAttached; +float supportBufferData[512 * 4]; + MetalRenderer::MetalRenderer() { m_device = MTL::CreateSystemDefaultDevice(); @@ -329,7 +332,7 @@ LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR phys void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) { - printf("MetalRenderer::texture_setLatteTexture not implemented\n"); + m_state.textures[textureUnit] = static_cast(textureView); } void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) @@ -351,17 +354,17 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so void MetalRenderer::bufferCache_init(const sint32 bufferSize) { - printf("MetalRenderer::bufferCache_init not implemented\n"); + m_memoryManager->InitBufferCache(bufferSize); } void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) { - printf("MetalRenderer::bufferCache_upload not implemented\n"); + m_memoryManager->UploadToBufferCache(buffer, bufferOffset, size); } void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) { - printf("MetalRenderer::bufferCache_copy not implemented\n"); + m_memoryManager->CopyBufferCache(srcOffset, dstOffset, size); } void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, 
uint32 dstOffset, uint32 size) @@ -371,7 +374,11 @@ void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { - printf("MetalRenderer::buffer_bindVertexBuffer not implemented\n"); + if (m_state.vertexBuffers[bufferIndex].offset == offset) + return; + cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS); + m_state.vertexBuffers[bufferIndex].needsRebind = true; + m_state.vertexBuffers[bufferIndex].offset = offset; } void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) @@ -457,8 +464,6 @@ void MetalRenderer::draw_beginSequence() void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { - std::cout << "DRAW" << std::endl; - ensureCommandBuffer(); // TODO: uncomment //if (m_state.skipDrawSequence) @@ -478,6 +483,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 beginRenderPassIfNeeded(renderPassDescriptor); // Shaders + LatteSHRC_UpdateActiveShaders(); LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); @@ -518,7 +524,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; // TODO: is LatteGPUState.contextNew correct? 
uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - printf("buffer %u has stride %u bytes\n", bufferIndex, bufferStride); auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); layout->setStride(bufferStride); @@ -549,11 +554,13 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } m_renderCommandEncoder->setRenderPipelineState(renderPipelineState); - // TODO: bind resources - + // Primitive type const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); + // Resources + + // Index buffer Renderer::INDEX_TYPE hostIndexType; uint32 hostIndexCount; uint32 indexMin = 0; @@ -562,6 +569,25 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 indexBufferIndex = 0; LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); + // synchronize vertex and uniform cache and update buffer bindings + LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); + + // Vertex buffers + for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) + { + auto& vertexBufferRange = m_state.vertexBuffers[i]; + if (vertexBufferRange.needsRebind) + { + m_renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + // TODO: uncomment + //vertexBufferRange.needRebind = false; + } + } + + // Uniform buffers, textures and samplers + BindStageResources(vertexShader); + BindStageResources(pixelShader); + // Draw if (hostIndexType != INDEX_TYPE::NONE) { @@ -592,3 +618,178 @@ void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) { printf("MetalRenderer::indexData_uploadIndexMemory not 
implemented\n"); } + +void MetalRenderer::BindStageResources(LatteDecompilerShader* shader) +{ + sint32 textureCount = shader->resourceMapping.getTextureCount(); + + for (int i = 0; i < textureCount; ++i) + { + const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); + auto hostTextureUnit = relative_textureUnit; + auto textureDim = shader->textureUnitDim[relative_textureUnit]; + auto texUnitRegIndex = hostTextureUnit * 7; + + auto textureView = m_state.textures[hostTextureUnit]; + + //LatteTexture* baseTexture = textureView->baseTexture; + // get texture register word 0 + uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; + + // TODO: wht + //auto imageViewObj = textureView->GetSamplerView(word4); + //info.imageView = imageViewObj->m_textureImageView; + + uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; + // TODO: uncomment + MTL::SamplerState* sampler = nullptr;//basicSampler; + if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) + { + // TODO: bind the actual sampler + } + + uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + m_renderCommandEncoder->setVertexTexture(textureView->GetTexture(), binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + m_renderCommandEncoder->setFragmentTexture(textureView->GetTexture(), binding); + break; + } + default: + UNREACHABLE; + } + } + + // Support buffer + auto GET_UNIFORM_DATA_PTR = [&](size_t index) { return supportBufferData + (index / 4); }; + + sint32 shaderAluConst; + sint32 shaderUniformRegisterOffset; + + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + shaderAluConst = 0x400; + shaderUniformRegisterOffset = mmSQ_VTX_UNIFORM_BLOCK_START; + break; + case LatteConst::ShaderType::Pixel: + shaderAluConst = 0; + shaderUniformRegisterOffset = mmSQ_PS_UNIFORM_BLOCK_START; + break; + case 
LatteConst::ShaderType::Geometry: + shaderAluConst = 0; // geometry shader has no ALU const + shaderUniformRegisterOffset = mmSQ_GS_UNIFORM_BLOCK_START; + break; + default: + UNREACHABLE; + } + + //if (shader->resourceMapping.uniformVarsBufferBindingPoint >= 0) + //{ + if (shader->uniform.list_ufTexRescale.empty() == false) + { + for (auto& entry : shader->uniform.list_ufTexRescale) + { + float* xyScale = LatteTexture_getEffectiveTextureScale(shader->shaderType, entry.texUnit); + memcpy(entry.currentValue, xyScale, sizeof(float) * 2); + memcpy(GET_UNIFORM_DATA_PTR(entry.uniformLocation), xyScale, sizeof(float) * 2); + } + } + if (shader->uniform.loc_alphaTestRef >= 0) + { + *GET_UNIFORM_DATA_PTR(shader->uniform.loc_alphaTestRef) = LatteGPUState.contextNew.SX_ALPHA_REF.get_ALPHA_TEST_REF(); + } + if (shader->uniform.loc_pointSize >= 0) + { + const auto& pointSizeReg = LatteGPUState.contextNew.PA_SU_POINT_SIZE; + float pointWidth = (float)pointSizeReg.get_WIDTH() / 8.0f; + if (pointWidth == 0.0f) + pointWidth = 1.0f / 8.0f; // minimum size + *GET_UNIFORM_DATA_PTR(shader->uniform.loc_pointSize) = pointWidth; + } + if (shader->uniform.loc_remapped >= 0) + { + LatteBufferCache_LoadRemappedUniforms(shader, GET_UNIFORM_DATA_PTR(shader->uniform.loc_remapped)); + } + if (shader->uniform.loc_uniformRegister >= 0) + { + uint32* uniformRegData = (uint32*)(LatteGPUState.contextRegister + mmSQ_ALU_CONSTANT0_0 + shaderAluConst); + memcpy(GET_UNIFORM_DATA_PTR(shader->uniform.loc_uniformRegister), uniformRegData, shader->uniform.count_uniformRegister * 16); + } + if (shader->uniform.loc_windowSpaceToClipSpaceTransform >= 0) + { + sint32 viewportWidth; + sint32 viewportHeight; + LatteRenderTarget_GetCurrentVirtualViewportSize(&viewportWidth, &viewportHeight); // always call after _updateViewport() + float* v = GET_UNIFORM_DATA_PTR(shader->uniform.loc_windowSpaceToClipSpaceTransform); + v[0] = 2.0f / (float)viewportWidth; + v[1] = 2.0f / (float)viewportHeight; + } + if 
(shader->uniform.loc_fragCoordScale >= 0) + { + LatteMRT::GetCurrentFragCoordScale(GET_UNIFORM_DATA_PTR(shader->uniform.loc_fragCoordScale)); + } + // TODO: uncomment? + /* + if (shader->uniform.loc_verticesPerInstance >= 0) + { + *(int*)(supportBufferData + ((size_t)shader->uniform.loc_verticesPerInstance / 4)) = m_streamoutState.verticesPerInstance; + for (sint32 b = 0; b < LATTE_NUM_STREAMOUT_BUFFER; b++) + { + if (shader->uniform.loc_streamoutBufferBase[b] >= 0) + { + *(uint32*)GET_UNIFORM_DATA_PTR(shader->uniform.loc_streamoutBufferBase[b]) = m_streamoutState.buffer[b].ringBufferOffset; + } + } + } + */ + + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + m_renderCommandEncoder->setVertexBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); + break; + } + case LatteConst::ShaderType::Pixel: + { + m_renderCommandEncoder->setFragmentBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); + break; + } + default: + UNREACHABLE; + } + //} + + // Uniform buffers + for (sint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) + { + if (shader->resourceMapping.uniformBuffersBindingPoint[i] >= 0) + { + uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; + // TODO: don't hardcode + size_t offset = 0; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + m_renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), offset, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + m_renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); + break; + } + default: + UNREACHABLE; + } + } + } +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 0b1063f90..fd4463653 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -10,14 +10,24 @@ #include 
"Metal/MTLRenderPass.hpp" #define MAX_MTL_BUFFERS 31 -#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 1) +#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 2) +// TODO: don't harcdode the support buffer binding +#define MTL_SUPPORT_BUFFER_BINDING 30 + +#define MAX_MTL_TEXTURES 31 + +struct MetalBufferRange +{ + bool needsRebind = false; + sint32 offset = -1; +}; struct MetalState { bool skipDrawSequence = false; class CachedFBOMtl* activeFBO = nullptr; - //MTL::Buffer* vertexBuffers[MAX_MTL_BUFFERS] = {nullptr}; - //MTL::Buffer* indexBuffer = nullptr; + MetalBufferRange vertexBuffers[MAX_MTL_BUFFERS] = {{}}; + class LatteTextureViewMtl* textures[MAX_MTL_TEXTURES] = {nullptr}; }; class MetalRenderer : public Renderer @@ -202,4 +212,6 @@ class MetalRenderer : public Renderer m_renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); } } + + void BindStageResources(LatteDecompilerShader* shader); }; From 35eea12950fe653966fbca252562f9c63fb65d45 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 28 Jul 2024 16:32:37 +0200 Subject: [PATCH 014/368] bind default sampler --- .../HW/Latte/Renderer/Metal/CachedFBOMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 42 +++++++++++++++++-- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 3 ++ 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp index ac34918e8..2a0715b64 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp @@ -8,7 +8,7 @@ void CachedFBOMtl::CreateRenderPass() for (int i = 0; i < 8; ++i) { - auto& buffer = colorBuffer[i]; + const auto& buffer = colorBuffer[i]; auto textureView = (LatteTextureViewMtl*)buffer.texture; if (!textureView) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 
0bd17f098..a0f73c028 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -9,6 +9,7 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" +#include "Metal/MTLSampler.hpp" #include "Metal/MTLVertexDescriptor.hpp" #include "gui/guiWrapper.h" @@ -21,6 +22,9 @@ MetalRenderer::MetalRenderer() m_device = MTL::CreateSystemDefaultDevice(); m_commandQueue = m_device->newCommandQueue(); + MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); + m_nearestSampler = m_device->newSamplerState(samplerDescriptor); + m_memoryManager = new MetalMemoryManager(this); } @@ -544,6 +548,21 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 renderPipelineDescriptor->setFragmentFunction(static_cast(pixelShader->shader)->GetFunction()); // TODO: don't always set the vertex descriptor renderPipelineDescriptor->setVertexDescriptor(vertexDescriptor); + for (uint8 i = 0; i < 8; i++) + { + const auto& colorBuffer = m_state.activeFBO->colorBuffer[i]; + auto texture = static_cast(colorBuffer.texture); + if (!texture) + { + continue; + } + renderPipelineDescriptor->colorAttachments()->object(i)->setPixelFormat(texture->GetTexture()->pixelFormat()); + } + if (m_state.activeFBO->depthBuffer.texture) + { + auto texture = static_cast(m_state.activeFBO->depthBuffer.texture); + renderPipelineDescriptor->setDepthAttachmentPixelFormat(texture->GetTexture()->pixelFormat()); + } NS::Error* error = nullptr; MTL::RenderPipelineState* renderPipelineState = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); @@ -579,8 +598,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if (vertexBufferRange.needsRebind) { m_renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); - // TODO: 
uncomment - //vertexBufferRange.needRebind = false; + vertexBufferRange.needsRebind = false; } } @@ -640,15 +658,31 @@ void MetalRenderer::BindStageResources(LatteDecompilerShader* shader) //auto imageViewObj = textureView->GetSamplerView(word4); //info.imageView = imageViewObj->m_textureImageView; + uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i; + uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; // TODO: uncomment - MTL::SamplerState* sampler = nullptr;//basicSampler; if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) { // TODO: bind the actual sampler + MTL::SamplerState* sampler = m_nearestSampler; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + m_renderCommandEncoder->setVertexSamplerState(sampler, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + m_renderCommandEncoder->setFragmentSamplerState(sampler, binding); + break; + } + default: + UNREACHABLE; + } } - uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i; switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index fd4463653..ee1d865b1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -186,6 +186,9 @@ class MetalRenderer : public Renderer MTL::Device* m_device; MTL::CommandQueue* m_commandQueue; + // Basic + MTL::SamplerState* m_nearestSampler; + // Active objects MTL::CommandBuffer* m_commandBuffer = nullptr; MTL::RenderCommandEncoder* m_renderCommandEncoder = nullptr; From 7ae29a74cd39e67390f712edc0ae47fd7d0ab114 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 28 Jul 2024 18:43:47 +0200 Subject: [PATCH 015/368] present --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 106 +++++++++++++----- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 68 ++++++++++- 
.../Renderer/Metal/RendererShaderMtl.cpp | 4 +- .../Renderer/Metal/ShaderSourcePresent.h | 23 ++++ 4 files changed, 168 insertions(+), 33 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a0f73c028..cb363d61f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -6,11 +6,12 @@ #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h" + #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" -#include "Metal/MTLSampler.hpp" -#include "Metal/MTLVertexDescriptor.hpp" +#include "Foundation/NSTypes.hpp" #include "gui/guiWrapper.h" extern bool hasValidFramebufferAttached; @@ -43,6 +44,35 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) m_metalLayer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle); m_metalLayer->setDevice(m_device); + + // Present pipeline + NS::Error* error = nullptr; + MTL::Library* presentLibrary = m_device->newLibrary(NS::String::string(presentLibrarySource, NS::ASCIIStringEncoding), nullptr, &error); + if (error) + { + printf("failed to create present library (error: %s)\n", error->localizedDescription()->utf8String()); + error->release(); + throw; + return; + } + MTL::Function* presentVertexFunction = presentLibrary->newFunction(NS::String::string("presentVertex", NS::ASCIIStringEncoding)); + MTL::Function* presentFragmentFunction = presentLibrary->newFunction(NS::String::string("presentFragment", NS::ASCIIStringEncoding)); + presentLibrary->release(); + + MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + 
renderPipelineDescriptor->setVertexFunction(presentVertexFunction); + renderPipelineDescriptor->setFragmentFunction(presentFragmentFunction); + renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(m_metalLayer->pixelFormat()); + m_presentPipeline = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); + presentVertexFunction->release(); + presentFragmentFunction->release(); + if (error) + { + printf("failed to create present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); + error->release(); + throw; + return; + } } void MetalRenderer::Initialize() @@ -83,40 +113,48 @@ void MetalRenderer::DrawEmptyFrame(bool mainWindow) void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) { - if (m_renderCommandEncoder) - { - m_renderCommandEncoder->endEncoding(); - m_renderCommandEncoder->release(); - m_renderCommandEncoder = nullptr; - } + EndEncoding(); - CA::MetalDrawable* drawable = m_metalLayer->nextDrawable(); - if (drawable) + if (m_drawable) { - ensureCommandBuffer(); - m_commandBuffer->presentDrawable(drawable); + EnsureCommandBuffer(); + m_commandBuffer->presentDrawable(m_drawable); } else { printf("skipped present!\n"); } + m_drawable = nullptr; - if (m_commandBuffer) - { - m_commandBuffer->commit(); - - m_commandBuffer->release(); - m_commandBuffer = nullptr; - - // Debug - m_commandQueue->insertDebugCaptureBoundary(); - } + CommitCommandBuffer(); } void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) { - printf("MetalRenderer::DrawBackbufferQuad not implemented\n"); + // Acquire drawable + m_drawable = m_metalLayer->nextDrawable(); + if (!m_drawable) + { + return; + } + + MTL::Texture* presentTexture = static_cast(texView)->GetTexture(); + + // Create render pass + MTL::RenderPassDescriptor* renderPassDescriptor = 
MTL::RenderPassDescriptor::alloc()->init(); + renderPassDescriptor->colorAttachments()->object(0)->setTexture(m_drawable->texture()); + + MTL::Texture* colorRenderTargets[8] = {nullptr}; + colorRenderTargets[0] = m_drawable->texture(); + BeginRenderPassIfNeeded(renderPassDescriptor, colorRenderTargets, nullptr); + + // Draw to Metal layer + m_renderCommandEncoder->setRenderPipelineState(m_presentPipeline); + m_renderCommandEncoder->setFragmentTexture(presentTexture, 0); + m_renderCommandEncoder->setFragmentSamplerState(m_nearestSampler, 0); + + m_renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } bool MetalRenderer::BeginFrame(bool mainWindow) @@ -178,7 +216,6 @@ void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) { - printf("decoding format %u\n", (uint32)format); // TODO: move to LatteToMtl if (isDepth) { @@ -468,7 +505,6 @@ void MetalRenderer::draw_beginSequence() void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { - ensureCommandBuffer(); // TODO: uncomment //if (m_state.skipDrawSequence) //{ @@ -484,7 +520,22 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } auto renderPassDescriptor = m_state.activeFBO->GetRenderPassDescriptor(); - beginRenderPassIfNeeded(renderPassDescriptor); + MTL::Texture* colorRenderTargets[8] = {nullptr}; + MTL::Texture* depthRenderTarget = nullptr; + for (uint32 i = 0; i < 8; i++) + { + auto colorTexture = static_cast(m_state.activeFBO->colorBuffer[i].texture); + if (colorTexture) + { + colorRenderTargets[i] = colorTexture->GetTexture(); + } + } + auto depthTexture = static_cast(m_state.activeFBO->depthBuffer.texture); + if (depthTexture) + { + 
depthRenderTarget = depthTexture->GetTexture(); + } + BeginRenderPassIfNeeded(renderPassDescriptor, colorRenderTargets, depthRenderTarget); // Shaders LatteSHRC_UpdateActiveShaders(); @@ -598,7 +649,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if (vertexBufferRange.needsRebind) { m_renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); - vertexBufferRange.needsRebind = false; + // TODO: uncomment + //vertexBufferRange.needsRebind = false; } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index ee1d865b1..079e5bba6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -8,6 +8,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLRenderPass.hpp" +#include "Metal/MTLRenderPipeline.hpp" #define MAX_MTL_BUFFERS 31 #define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 2) @@ -28,6 +29,8 @@ struct MetalState class CachedFBOMtl* activeFBO = nullptr; MetalBufferRange vertexBuffers[MAX_MTL_BUFFERS] = {{}}; class LatteTextureViewMtl* textures[MAX_MTL_TEXTURES] = {nullptr}; + MTL::Texture* colorRenderTargets[8] = {nullptr}; + MTL::Texture* depthRenderTarget = nullptr; }; class MetalRenderer : public Renderer @@ -186,18 +189,22 @@ class MetalRenderer : public Renderer MTL::Device* m_device; MTL::CommandQueue* m_commandQueue; + // Pipelines + MTL::RenderPipelineState* m_presentPipeline; + // Basic MTL::SamplerState* m_nearestSampler; // Active objects MTL::CommandBuffer* m_commandBuffer = nullptr; MTL::RenderCommandEncoder* m_renderCommandEncoder = nullptr; + CA::MetalDrawable* m_drawable; // State MetalState m_state; // Helpers - void ensureCommandBuffer() + void EnsureCommandBuffer() { if (!m_commandBuffer) { @@ -208,11 +215,64 @@ class 
MetalRenderer : public Renderer } } - void beginRenderPassIfNeeded(MTL::RenderPassDescriptor* renderPassDescriptor) + void BeginRenderPassIfNeeded(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget) { - if (!m_renderCommandEncoder) + EnsureCommandBuffer(); + + // Check if we need to begin a new render pass + if (m_renderCommandEncoder) { - m_renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); + bool needsNewRenderPass = false; + for (uint8 i = 0; i < 8; i++) + { + if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.colorRenderTargets[i])) + { + needsNewRenderPass = true; + break; + } + } + + if (!needsNewRenderPass) + { + if (depthRenderTarget && (depthRenderTarget != m_state.depthRenderTarget)) + { + needsNewRenderPass = true; + } + } + + if (!needsNewRenderPass) + { + return; + } + + EndEncoding(); + } + + m_renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); + } + + void EndEncoding() + { + if (m_renderCommandEncoder) + { + m_renderCommandEncoder->endEncoding(); + m_renderCommandEncoder->release(); + m_renderCommandEncoder = nullptr; + } + } + + void CommitCommandBuffer() + { + EndEncoding(); + + if (m_commandBuffer) + { + m_commandBuffer->commit(); + m_commandBuffer->release(); + m_commandBuffer = nullptr; + + // Debug + m_commandQueue->insertDebugCaptureBoundary(); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index b870fc686..b4a43a7c7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -10,7 +10,7 @@ RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type MTL::Library* library = mtlRenderer->GetDevice()->newLibrary(NS::String::string(mslCode.c_str(), NS::ASCIIStringEncoding), nullptr, &error); if (error) { - printf("Failed to create 
library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); + printf("failed to create library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); error->release(); return; } @@ -20,7 +20,7 @@ RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type m_function = library->newFunction(desc, &error); if (error) { - printf("Failed to create function (error: %s)\n", error->localizedDescription()->utf8String()); + printf("failed to create function (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); return; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h b/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h new file mode 100644 index 000000000..0ca7cbbe8 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h @@ -0,0 +1,23 @@ +#include +const char* presentLibrarySource = \ +"#include \n" \ +"using namespace metal;\n" \ +"\n" \ +"constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)};\n" +"\n" \ +"struct VertexOut {\n" \ +" float4 position [[position]];\n" \ +" float2 texCoord;\n" \ +"};\n" \ +"\n" \ +"vertex VertexOut presentVertex(ushort vid [[vertex_id]]) {\n" \ +" VertexOut out;\n" \ +" out.position = float4(positions[vid], 0.0, 1.0);\n" \ +" out.texCoord = positions[vid] * 0.5 + 0.5;\n" \ +"\n" \ +" return out;\n" \ +"}\n" \ +"\n" \ +"fragment float4 presentFragment(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], sampler samplr [[sampler(0)]]) {\n" \ +" return tex.sample(samplr, in.texCoord);\n" \ +"}\n"; From 8800de0229973c80b062787c8b6ef96dac04533d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 29 Jul 2024 15:12:42 +0200 Subject: [PATCH 016/368] update CMakeLists.txt --- src/Cafe/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 6dbe8781b..49238c62f 100644 --- 
a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -551,6 +551,7 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/CachedFBOMtl.h HW/Latte/Renderer/Metal/MetalMemoryManager.cpp HW/Latte/Renderer/Metal/MetalMemoryManager.h + HW/Latte/Renderer/Metal/ShaderSourcePresent.h ) #target_link_libraries(CemuCafe PRIVATE From f01130022a0fb32517cfae531fac8e28208ff2f6 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 29 Jul 2024 19:00:13 +0200 Subject: [PATCH 017/368] rework command encoder system --- .../LatteDecompilerEmitMSLHeader.hpp | 4 +- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 4 +- .../HW/Latte/Renderer/Metal/LatteTextureMtl.h | 13 +-- .../Renderer/Metal/LatteTextureViewMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 40 ++++--- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 6 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 47 ++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 104 ++++++++++++++---- 8 files changed, 146 insertions(+), 74 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 3bc8796e2..38b153847 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -134,7 +134,7 @@ namespace LatteDecompiler //shaderSrc->addFmt("UNIFORM_BUFFER_LAYOUT({}, {}, {}) ", (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i], (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i]); - shaderSrc->addFmt("struct UBuff{} {" _CRLF, i); + shaderSrc->addFmt("struct UBuff{} {{" _CRLF, i); shaderSrc->addFmt("float4 d{}[{}];" _CRLF, i, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); shaderSrc->add("};" 
_CRLF _CRLF); } @@ -307,7 +307,7 @@ namespace LatteDecompiler cemu_assert_debug(decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] >= 0); cemu_assert_debug(decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] >= 0); - src->addFmt("constant UBuff{}& ubuff{} [[buffer({})]]" _CRLF, i, i, (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i]); + src->addFmt(", constant UBuff{}& ubuff{} [[buffer({})]]", i, i, (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i]); } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index bbd714d9a..9a8a39278 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -5,7 +5,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) - : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer), m_format(format) + : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer), m_format(format), m_isDepth(isDepth) { MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); desc->setStorageMode(MTL::StorageModeShared); // TODO: use private? @@ -34,7 +34,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM desc->setArrayLength(effectiveBaseDepth); } - auto formatInfo = GetMtlPixelFormatInfo(format); + auto formatInfo = GetMtlPixelFormatInfo(format, isDepth); desc->setPixelFormat(formatInfo.pixelFormat); // TODO: is write needed? 
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h index cc08d4690..e2187e1bf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h @@ -21,23 +21,20 @@ class LatteTextureMtl : public LatteTexture return m_format; } + bool IsDepth() const { + return m_isDepth; + } + void AllocateOnHost() override; protected: LatteTextureView* CreateView(Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) override; -public: - uint64 m_vkFlushIndex{}; // used to track read-write dependencies within the same renderpass - - uint64 m_vkFlushIndex_read{}; - uint64 m_vkFlushIndex_write{}; - - uint32 m_collisionCheckIndex{}; // used to track if texture is being both sampled and output to during drawcall - private: class MetalRenderer* m_mtlr; MTL::Texture* m_texture; Latte::E_GX2SURFFMT m_format; + bool m_isDepth; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 37399fca1..34dd6f9f1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -52,7 +52,7 @@ LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextur // TODO: swizzle - auto formatInfo = GetMtlPixelFormatInfo(format); + auto formatInfo = GetMtlPixelFormatInfo(format, texture->IsDepth()); m_texture = texture->GetTexture()->newTextureView(formatInfo.pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount)); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 7c7b41870..22b8a0697 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -3,8 +3,7 @@ #include 
"Metal/MTLPixelFormat.hpp" #include "Metal/MTLVertexDescriptor.hpp" -// TODO: separate color and depth formats -std::map MTL_FORMAT_TABLE = { +std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, // TODO: correct? {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, 2}}, // TODO: correct? {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, 2}}, // TODO: correct? @@ -60,11 +59,6 @@ std::map MTL_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, 16}}, {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, 16}}, {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, 16}}, - {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, 4}}, // TODO: not supported on Apple sillicon, maybe find something else - {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 5}}, - {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, 2}}, - {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, 4}}, {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, 8, {4, 4}}}, // TODO: correct? {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, 8, {4, 4}}}, // TODO: correct? {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, 16, {4, 4}}}, // TODO: correct? @@ -77,11 +71,29 @@ std::map MTL_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatBC5_RGSnorm, 16, {4, 4}}}, // TODO: correct? 
}; -const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format) +std::map MTL_DEPTH_FORMAT_TABLE = { + {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, 4}}, // TODO: not supported on Apple sillicon, maybe find something else + {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 5}}, + {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, 2}}, + {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, 4}}, +}; + +const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth) { - cemu_assert_debug(static_cast(format) < MTL_FORMAT_TABLE.size()); + MtlPixelFormatInfo formatInfo; + if (isDepth) + formatInfo = MTL_DEPTH_FORMAT_TABLE[format]; + else + formatInfo = MTL_COLOR_FORMAT_TABLE[format]; + + // Depth24Unorm_Stencil8 is not supported on Apple sillicon + // TODO: query if format is available instead + if (formatInfo.pixelFormat == MTL::PixelFormatDepth24Unorm_Stencil8) + { + formatInfo.pixelFormat = MTL::PixelFormatDepth32Float_Stencil8; + } - MtlPixelFormatInfo formatInfo = MTL_FORMAT_TABLE[format]; if (formatInfo.pixelFormat == MTL::PixelFormatInvalid) { printf("invalid pixel format: %u\n", (uint32)format); @@ -94,16 +106,16 @@ inline uint32 CeilDivide(uint32 a, uint32 b) { return (a + b - 1) / b; } -size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, uint32 width) +size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 width) { - const auto& formatInfo = GetMtlPixelFormatInfo(format); + const auto& formatInfo = GetMtlPixelFormatInfo(format, isDepth); return CeilDivide(width, formatInfo.blockTexelSize.x) * formatInfo.bytesPerBlock; } -size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, uint32 height, size_t bytesPerRow) +size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint32 height, 
size_t bytesPerRow) { - const auto& formatInfo = GetMtlPixelFormatInfo(format); + const auto& formatInfo = GetMtlPixelFormatInfo(format, isDepth); return CeilDivide(height, formatInfo.blockTexelSize.y) * bytesPerRow; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index a0c1b9395..e9eb0b91c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -18,11 +18,11 @@ struct MtlPixelFormatInfo { Uvec2 blockTexelSize = {1, 1}; }; -const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format); +const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth); -size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, uint32 width); +size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 width); -size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, uint32 height, size_t bytesPerRow); +size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint32 height, size_t bytesPerRow); MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode mode); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index cb363d61f..af022a0f9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -12,6 +12,7 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Foundation/NSTypes.hpp" +#include "Metal/MTLRenderCommandEncoder.hpp" #include "gui/guiWrapper.h" extern bool hasValidFramebufferAttached; @@ -147,14 +148,14 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput MTL::Texture* colorRenderTargets[8] = {nullptr}; colorRenderTargets[0] = m_drawable->texture(); - BeginRenderPassIfNeeded(renderPassDescriptor, colorRenderTargets, nullptr); + auto renderCommandEncoder = 
GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr); // Draw to Metal layer - m_renderCommandEncoder->setRenderPipelineState(m_presentPipeline); - m_renderCommandEncoder->setFragmentTexture(presentTexture, 0); - m_renderCommandEncoder->setFragmentSamplerState(m_nearestSampler, 0); + renderCommandEncoder->setRenderPipelineState(m_presentPipeline); + renderCommandEncoder->setFragmentTexture(presentTexture, 0); + renderCommandEncoder->setFragmentSamplerState(m_nearestSampler, 0); - m_renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } bool MetalRenderer::BeginFrame(bool mainWindow) @@ -351,8 +352,8 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s { auto mtlTexture = (LatteTextureMtl*)hostTexture; - size_t bytesPerRow = GetMtlTextureBytesPerRow(mtlTexture->GetFormat(), width); - size_t bytesPerImage = GetMtlTextureBytesPerImage(mtlTexture->GetFormat(), height, bytesPerRow); + size_t bytesPerRow = GetMtlTextureBytesPerRow(mtlTexture->GetFormat(), mtlTexture->IsDepth(), width); + size_t bytesPerImage = GetMtlTextureBytesPerImage(mtlTexture->GetFormat(), mtlTexture->IsDepth(), height, bytesPerRow); mtlTexture->GetTexture()->replaceRegion(MTL::Region(0, 0, width, height), mipIndex, sliceIndex, pixelData, bytesPerRow, bytesPerImage); } @@ -535,7 +536,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 { depthRenderTarget = depthTexture->GetTexture(); } - BeginRenderPassIfNeeded(renderPassDescriptor, colorRenderTargets, depthRenderTarget); + auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, depthRenderTarget); // Shaders LatteSHRC_UpdateActiveShaders(); @@ -622,7 +623,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 printf("error creating render pipeline 
state: %s\n", error->localizedDescription()->utf8String()); return; } - m_renderCommandEncoder->setRenderPipelineState(renderPipelineState); + renderCommandEncoder->setRenderPipelineState(renderPipelineState); // Primitive type const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); @@ -648,25 +649,25 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto& vertexBufferRange = m_state.vertexBuffers[i]; if (vertexBufferRange.needsRebind) { - m_renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); // TODO: uncomment //vertexBufferRange.needsRebind = false; } } // Uniform buffers, textures and samplers - BindStageResources(vertexShader); - BindStageResources(pixelShader); + BindStageResources(renderCommandEncoder, vertexShader); + BindStageResources(renderCommandEncoder, pixelShader); // Draw if (hostIndexType != INDEX_TYPE::NONE) { auto mtlIndexType = GetMtlIndexType(hostIndexType); MTL::Buffer* indexBuffer = m_memoryManager->GetBuffer(indexBufferIndex); - m_renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, 0, instanceCount, baseVertex, baseInstance); + renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, 0, instanceCount, baseVertex, baseInstance); } else { - m_renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); + renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); } } @@ -689,7 +690,7 @@ void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) printf("MetalRenderer::indexData_uploadIndexMemory not implemented\n"); } -void 
MetalRenderer::BindStageResources(LatteDecompilerShader* shader) +void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader) { sint32 textureCount = shader->resourceMapping.getTextureCount(); @@ -722,12 +723,12 @@ void MetalRenderer::BindStageResources(LatteDecompilerShader* shader) { case LatteConst::ShaderType::Vertex: { - m_renderCommandEncoder->setVertexSamplerState(sampler, binding); + renderCommandEncoder->setVertexSamplerState(sampler, binding); break; } case LatteConst::ShaderType::Pixel: { - m_renderCommandEncoder->setFragmentSamplerState(sampler, binding); + renderCommandEncoder->setFragmentSamplerState(sampler, binding); break; } default: @@ -739,12 +740,12 @@ void MetalRenderer::BindStageResources(LatteDecompilerShader* shader) { case LatteConst::ShaderType::Vertex: { - m_renderCommandEncoder->setVertexTexture(textureView->GetTexture(), binding); + renderCommandEncoder->setVertexTexture(textureView->GetTexture(), binding); break; } case LatteConst::ShaderType::Pixel: { - m_renderCommandEncoder->setFragmentTexture(textureView->GetTexture(), binding); + renderCommandEncoder->setFragmentTexture(textureView->GetTexture(), binding); break; } default: @@ -840,12 +841,12 @@ void MetalRenderer::BindStageResources(LatteDecompilerShader* shader) { case LatteConst::ShaderType::Vertex: { - m_renderCommandEncoder->setVertexBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); + renderCommandEncoder->setVertexBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); break; } case LatteConst::ShaderType::Pixel: { - m_renderCommandEncoder->setFragmentBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); + renderCommandEncoder->setFragmentBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); break; } default: @@ -865,12 +866,12 @@ void MetalRenderer::BindStageResources(LatteDecompilerShader* shader) { case 
LatteConst::ShaderType::Vertex: { - m_renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), offset, binding); + renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), offset, binding); break; } case LatteConst::ShaderType::Pixel: { - m_renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); + renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); break; } default: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 079e5bba6..0c09ee241 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -6,6 +6,7 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" +#include "Metal/MTLComputeCommandEncoder.hpp" #include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLRenderPass.hpp" #include "Metal/MTLRenderPipeline.hpp" @@ -33,6 +34,14 @@ struct MetalState MTL::Texture* depthRenderTarget = nullptr; }; +enum class MetalEncoderType +{ + None, + Render, + Compute, + Blit, +}; + class MetalRenderer : public Renderer { public: @@ -197,7 +206,8 @@ class MetalRenderer : public Renderer // Active objects MTL::CommandBuffer* m_commandBuffer = nullptr; - MTL::RenderCommandEncoder* m_renderCommandEncoder = nullptr; + MetalEncoderType m_encoderType = MetalEncoderType::None; + MTL::CommandEncoder* m_commandEncoder = nullptr; CA::MetalDrawable* m_drawable; // State @@ -215,49 +225,101 @@ class MetalRenderer : public Renderer } } - void BeginRenderPassIfNeeded(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget) + MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget) { EnsureCommandBuffer(); // Check if we need 
to begin a new render pass - if (m_renderCommandEncoder) + if (m_commandEncoder) { - bool needsNewRenderPass = false; - for (uint8 i = 0; i < 8; i++) + if (m_encoderType == MetalEncoderType::Render) { - if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.colorRenderTargets[i])) + bool needsNewRenderPass = false; + for (uint8 i = 0; i < 8; i++) { - needsNewRenderPass = true; - break; + if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.colorRenderTargets[i])) + { + needsNewRenderPass = true; + break; + } } - } - if (!needsNewRenderPass) - { - if (depthRenderTarget && (depthRenderTarget != m_state.depthRenderTarget)) + if (!needsNewRenderPass) + { + if (depthRenderTarget && (depthRenderTarget != m_state.depthRenderTarget)) + { + needsNewRenderPass = true; + } + } + + if (!needsNewRenderPass) { - needsNewRenderPass = true; + return (MTL::RenderCommandEncoder*)m_commandEncoder; } } - if (!needsNewRenderPass) + EndEncoding(); + } + + // Update state + for (uint8 i = 0; i < 8; i++) + { + m_state.colorRenderTargets[i] = colorRenderTargets[i]; + } + m_state.depthRenderTarget = depthRenderTarget; + + auto renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); + m_commandEncoder = renderCommandEncoder; + m_encoderType = MetalEncoderType::Render; + + return renderCommandEncoder; + } + + MTL::ComputeCommandEncoder* GetComputeCommandEncoder() + { + if (m_commandEncoder) + { + if (m_encoderType != MetalEncoderType::Compute) { - return; + return (MTL::ComputeCommandEncoder*)m_commandEncoder; } EndEncoding(); } - m_renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); + auto computeCommandEncoder = m_commandBuffer->computeCommandEncoder(); + m_commandEncoder = computeCommandEncoder; + m_encoderType = MetalEncoderType::Compute; + + return computeCommandEncoder; + } + + MTL::BlitCommandEncoder* GetBlitCommandEncoder() + { + if (m_commandEncoder) + { + if (m_encoderType != MetalEncoderType::Blit) + { + 
return (MTL::BlitCommandEncoder*)m_commandEncoder; + } + + EndEncoding(); + } + + auto blitCommandEncoder = m_commandBuffer->blitCommandEncoder(); + m_commandEncoder = blitCommandEncoder; + m_encoderType = MetalEncoderType::Blit; + + return blitCommandEncoder; } void EndEncoding() { - if (m_renderCommandEncoder) + if (m_commandEncoder) { - m_renderCommandEncoder->endEncoding(); - m_renderCommandEncoder->release(); - m_renderCommandEncoder = nullptr; + m_commandEncoder->endEncoding(); + m_commandEncoder->release(); + m_commandEncoder = nullptr; } } @@ -276,5 +338,5 @@ class MetalRenderer : public Renderer } } - void BindStageResources(LatteDecompilerShader* shader); + void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader); }; From 0cb83d4668f6d23b00c9cb7df545e505bed9e37d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 29 Jul 2024 20:04:41 +0200 Subject: [PATCH 018/368] fix: support buffer data & fix: depth --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 6 ++++-- .../LatteDecompilerEmitMSL.cpp | 19 +++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 6561e6420..486516efd 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -615,7 +615,8 @@ LatteDecompilerShader* LatteShader_CreateShaderFromDecompilerOutput(LatteDecompi LatteDecompilerShader* shader = decompilerOutput.shader; shader->baseHash = baseHash; // copy resource mapping - if(g_renderer->GetType() == RendererAPI::Vulkan) + // HACK + if (g_renderer->GetType() != RendererAPI::OpenGL) shader->resourceMapping = decompilerOutput.resourceMappingVK; else shader->resourceMapping = decompilerOutput.resourceMappingGL; @@ -626,7 +627,8 @@ LatteDecompilerShader* LatteShader_CreateShaderFromDecompilerOutput(LatteDecompi shader->hasStreamoutBufferWrite = decompilerOutput.streamoutBufferWriteMask.any(); // copy 
uniform offsets // for OpenGL these are retrieved in _prepareSeparableUniforms() - if (g_renderer->GetType() == RendererAPI::Vulkan) + // HACK + if (g_renderer->GetType() != RendererAPI::OpenGL) { shader->uniform.loc_remapped = decompilerOutput.uniformOffsetsVK.offset_remapped; shader->uniform.loc_uniformRegister = decompilerOutput.uniformOffsetsVK.offset_uniformRegister; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 320d8e24c..215874373 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -1531,7 +1531,7 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte debugBreakpoint(); _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); src->add(")"); - src->add(") discard;"); + src->add(") discard_fragment();"); src->add(_CRLF); } else if( aluInstruction->opcode == ALU_OP2_INST_KILLGT || @@ -1551,7 +1551,7 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte debugBreakpoint(); _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(")"); - src->add(") discard;"); + src->add(") discard_fragment();"); src->add(_CRLF); } else @@ -3136,7 +3136,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc == Latte::E_COMPAREFUNC::NEVER ) { // never pass alpha test - src->add("discard;" _CRLF); + src->add("discard_fragment();" _CRLF); } else if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc != Latte::E_COMPAREFUNC::ALWAYS) { @@ -3166,7 +3166,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe break; } src->add(" supportBuffer.alphaTestRef"); - src->add(") == false) discard;" _CRLF); + 
src->add(") == false) discard_fragment();" _CRLF); } // pixel color output src->addFmt("out.passPixelColor{} = ", pixelColorOutputIndex); @@ -4067,14 +4067,21 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } for(auto& cfInstruction : shaderContext->cfInstructions) LatteDecompiler_emitClauseCodeMSL(shaderContext, &cfInstruction, false); - if( shader->shaderType == LatteConst::ShaderType::Geometry ) - src->add("EndPrimitive();" _CRLF); + //if(shader->shaderType == LatteConst::ShaderType::Geometry) + // src->add("EndPrimitive();" _CRLF); // vertex shader should write renderstate point size at the end if required but not modified by shader if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) { if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader == false) src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); } + // HACK: this should be handled outside of the shader, because clipping currently wouldn't work + if (shader->shaderType == LatteConst::ShaderType::Vertex) + { + // Convert depth from the range of [-1, 1] to [0, 1] + src->add("out.position /= out.position.w;" _CRLF); + src->add("out.position.z = out.position.z * 0.5 + 0.5;" _CRLF); + } // return src->add("return out;" _CRLF); // end of shader main From be8a5604965d4bb91bfc2e2a7472f991a9969b09 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 30 Jul 2024 19:27:52 +0200 Subject: [PATCH 019/368] optimize index buffers --- .../Renderer/Metal/MetalMemoryManager.cpp | 57 +++++++++++++--- .../Latte/Renderer/Metal/MetalMemoryManager.h | 66 +++++++++++++++++-- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 +- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 7 +- .../Renderer/Metal/ShaderSourcePresent.h | 3 +- 5 files changed, 115 insertions(+), 22 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp 
index 9ec94c0e3..d09ab116f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,19 +1,35 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -void* MetalMemoryManager::GetTextureUploadBuffer(size_t size) +const size_t BUFFER_ALLOCATION_SIZE = 8 * 1024 * 1024; + +MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) { - if (m_textureUploadBuffer.size() < size) + // First, try to find a free range + for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) { - m_textureUploadBuffer.resize(size); - } + auto& range = m_freeBufferRanges[i]; + if (range.size >= size) + { + MetalBufferAllocation allocation; + allocation.bufferIndex = range.bufferIndex; + allocation.bufferOffset = range.offset; + allocation.data = (uint8*)m_buffers[range.bufferIndex]->contents() + range.offset; - return m_textureUploadBuffer.data(); -} + range.offset += size; + range.size -= size; -// TODO: optimize this -MetalBufferAllocation MetalMemoryManager::GetBufferAllocation(size_t size) -{ - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModeShared); + if (range.size == 0) + { + m_freeBufferRanges.erase(m_freeBufferRanges.begin() + i); + } + + return allocation; + } + } + + // If no free range was found, allocate a new buffer + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(std::max(size, BUFFER_ALLOCATION_SIZE), MTL::ResourceStorageModeShared); MetalBufferAllocation allocation; allocation.bufferIndex = m_buffers.size(); @@ -22,9 +38,30 @@ MetalBufferAllocation MetalMemoryManager::GetBufferAllocation(size_t size) m_buffers.push_back(buffer); + // If the buffer is larger than the requested size, add the remaining space to the free buffer ranges + if (size < BUFFER_ALLOCATION_SIZE) + { + MetalBufferRange range; + range.bufferIndex = allocation.bufferIndex; + range.offset = size; + range.size 
= BUFFER_ALLOCATION_SIZE - size; + + m_freeBufferRanges.push_back(range); + } + return allocation; } +void* MetalMemoryManager::GetTextureUploadBuffer(size_t size) +{ + if (m_textureUploadBuffer.size() < size) + { + m_textureUploadBuffer.resize(size); + } + + return m_textureUploadBuffer.data(); +} + void MetalMemoryManager::InitBufferCache(size_t size) { if (m_bufferCache) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index ee773047c..c099360fc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -4,6 +4,8 @@ #include "Cafe/HW/Latte/ISA/LatteReg.h" +//const uint32 bufferAllocatorIndexShift = 24; + struct MetalBufferAllocation { void* data; @@ -11,16 +13,69 @@ struct MetalBufferAllocation size_t bufferOffset; }; -class MetalMemoryManager +struct MetalBufferRange +{ + uint32 bufferIndex; + size_t offset; + size_t size; +}; + +class MetalBufferAllocator { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + MetalBufferAllocator(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + + void ResetTemporaryBuffers() + { + m_freeBufferRanges.clear(); + + // Register the free ranges + for (uint32 i = 0; i < m_buffers.size(); i++) + { + m_freeBufferRanges.push_back({i, 0, m_buffers[i]->length()}); + } + } MTL::Buffer* GetBuffer(uint32 bufferIndex) { return m_buffers[bufferIndex]; } + MetalBufferAllocation GetBufferAllocation(size_t size); + +private: + class MetalRenderer* m_mtlr; + + std::vector m_buffers; + std::vector m_freeBufferRanges; +}; + +class MetalMemoryManager +{ +public: + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer) {} + + void ResetTemporaryBuffers() + { + m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.ResetTemporaryBuffers(); + //m_bufferAllocatorIndex = (m_bufferAllocatorIndex + 1) % 2; + 
} + + MTL::Buffer* GetBuffer(uint32 bufferIndex) + { + //uint32 bufferAllocatorIndex = (bufferIndex >> bufferAllocatorIndexShift); + + return m_bufferAllocator/*s[bufferAllocatorIndex]*/.GetBuffer(bufferIndex); + } + + MetalBufferAllocation GetBufferAllocation(size_t size) + { + auto allocation = m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.GetBufferAllocation(size); + //allocation.bufferIndex |= (m_bufferAllocatorIndex << bufferAllocatorIndexShift); + + return allocation; + } + MTL::Buffer* GetBufferCache() { return m_bufferCache; @@ -28,8 +83,6 @@ class MetalMemoryManager void* GetTextureUploadBuffer(size_t size); - MetalBufferAllocation GetBufferAllocation(size_t size); - // Buffer cache void InitBufferCache(size_t size); void UploadToBufferCache(const void* data, size_t offset, size_t size); @@ -39,6 +92,9 @@ class MetalMemoryManager class MetalRenderer* m_mtlr; std::vector m_textureUploadBuffer; - std::vector m_buffers; + + MetalBufferAllocator m_bufferAllocator;//s[2]; + //uint8 m_bufferAllocatorIndex = 0; + MTL::Buffer* m_bufferCache = nullptr; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index af022a0f9..232130090 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -456,7 +456,7 @@ void MetalRenderer::draw_beginSequence() LatteSHRC_UpdateActiveShaders(); if (LatteGPUState.activeShaderHasError) { - cemuLog_logDebugOnce(LogType::Force, "Skipping drawcalls due to shader error"); + debug_printf("Skipping drawcalls due to shader error\n"); m_state.skipDrawSequence = true; cemu_assert_debug(false); return; @@ -506,10 +506,8 @@ void MetalRenderer::draw_beginSequence() void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { - // TODO: uncomment //if (m_state.skipDrawSequence) 
//{ - // printf("skipping draw\n"); // return; //} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 0c09ee241..45e9e703e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -18,7 +18,7 @@ #define MAX_MTL_TEXTURES 31 -struct MetalBufferRange +struct MetalBoundBuffer { bool needsRebind = false; sint32 offset = -1; @@ -28,7 +28,7 @@ struct MetalState { bool skipDrawSequence = false; class CachedFBOMtl* activeFBO = nullptr; - MetalBufferRange vertexBuffers[MAX_MTL_BUFFERS] = {{}}; + MetalBoundBuffer vertexBuffers[MAX_MTL_BUFFERS] = {{}}; class LatteTextureViewMtl* textures[MAX_MTL_TEXTURES] = {nullptr}; MTL::Texture* colorRenderTargets[8] = {nullptr}; MTL::Texture* depthRenderTarget = nullptr; @@ -333,6 +333,9 @@ class MetalRenderer : public Renderer m_commandBuffer->release(); m_commandBuffer = nullptr; + // Reset temporary buffers + m_memoryManager->ResetTemporaryBuffers(); + // Debug m_commandQueue->insertDebugCaptureBoundary(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h b/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h index 0ca7cbbe8..ad3f69715 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h @@ -1,5 +1,4 @@ -#include -const char* presentLibrarySource = \ +inline const char* presentLibrarySource = \ "#include \n" \ "using namespace metal;\n" \ "\n" \ From 1fbd6ad37651ea5a11c51ea2edbfc7c72b52666a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 31 Jul 2024 20:50:24 +0200 Subject: [PATCH 020/368] set viewport and scissor --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 4 +- .../Renderer/Metal/MetalMemoryManager.cpp | 7 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 85 +++++++++++++++++-- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 7 ++ 4 files changed, 91 insertions(+), 12 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 22b8a0697..1d0ad0a80 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -186,7 +186,7 @@ MTL::VertexFormat GetMtlVertexFormat(uint8 format) case FMT_2_10_10_10: return MTL::VertexFormatUInt; // verified to match OpenGL default: - printf("unsupported vertex format: %u\n", (uint32)format); + printf("unsupported vertex format %u\n", (uint32)format); assert_dbg(); return MTL::VertexFormatInvalid; } @@ -201,7 +201,7 @@ MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType) case Renderer::INDEX_TYPE::U32: return MTL::IndexTypeUInt32; default: - printf("unsupported index type: %u\n", (uint32)indexType); + printf("unsupported index type %u\n", (uint32)indexType); assert_dbg(); return MTL::IndexTypeUInt32; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index d09ab116f..6e609e4f7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -3,9 +3,11 @@ const size_t BUFFER_ALLOCATION_SIZE = 8 * 1024 * 1024; +// TODO: uncomment everything MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) { // First, try to find a free range + /* for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) { auto& range = m_freeBufferRanges[i]; @@ -27,9 +29,10 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) return allocation; } } + */ // If no free range was found, allocate a new buffer - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(std::max(size, BUFFER_ALLOCATION_SIZE), MTL::ResourceStorageModeShared); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(/*std::max(*/size/*, BUFFER_ALLOCATION_SIZE)*/, MTL::ResourceStorageModeShared); MetalBufferAllocation allocation; allocation.bufferIndex = 
m_buffers.size(); @@ -39,6 +42,7 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) m_buffers.push_back(buffer); // If the buffer is larger than the requested size, add the remaining space to the free buffer ranges + /* if (size < BUFFER_ALLOCATION_SIZE) { MetalBufferRange range; @@ -48,6 +52,7 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) m_freeBufferRanges.push_back(range); } + */ return allocation; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 232130090..9874f7713 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -11,6 +11,7 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" +#include "Cemu/Logging/CemuDebugLogging.h" #include "Foundation/NSTypes.hpp" #include "Metal/MTLRenderCommandEncoder.hpp" #include "gui/guiWrapper.h" @@ -171,7 +172,8 @@ void MetalRenderer::Flush(bool waitIdle) void MetalRenderer::NotifyLatteCommandProcessorIdle() { - printf("MetalRenderer::NotifyLatteCommandProcessorIdle not implemented\n"); + // TODO: should we? 
+ CommitCommandBuffer(); } void MetalRenderer::AppendOverlayDebugInfo() @@ -181,12 +183,20 @@ void MetalRenderer::AppendOverlayDebugInfo() void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) { - printf("MetalRenderer::renderTarget_setViewport not implemented\n"); + m_state.viewport = MTL::Viewport{x, y + height, width, -height, nearZ, farZ}; + if (m_encoderType == MetalEncoderType::Render) + { + static_cast(m_commandEncoder)->setViewport(m_state.viewport); + } } void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) { - printf("MetalRenderer::renderTarget_setScissor not implemented\n"); + m_state.scissor = MTL::ScissorRect{NS::UInteger(scissorX), NS::UInteger(scissorY), NS::UInteger(scissorWidth), NS::UInteger(scissorHeight)}; + if (m_encoderType == MetalEncoderType::Render) + { + static_cast(m_commandEncoder)->setScissorRect(m_state.scissor); + } } LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) @@ -456,7 +466,7 @@ void MetalRenderer::draw_beginSequence() LatteSHRC_UpdateActiveShaders(); if (LatteGPUState.activeShaderHasError) { - debug_printf("Skipping drawcalls due to shader error\n"); + printf("Skipping drawcalls due to shader error\n"); m_state.skipDrawSequence = true; cemu_assert_debug(false); return; @@ -469,14 +479,14 @@ void MetalRenderer::draw_beginSequence() LatteGPUState.repeatTextureInitialization = false; if (!LatteMRT::UpdateCurrentFBO()) { - debug_printf("Rendertarget invalid\n"); + printf("Rendertarget invalid\n"); m_state.skipDrawSequence = true; return; // no render target } if (!hasValidFramebufferAttached) { - debug_printf("Drawcall with no color buffer or depth buffer attached\n"); + printf("Drawcall with no color buffer or depth buffer attached\n"); m_state.skipDrawSequence = true; return; // no render target } @@ -540,6 +550,11 @@ void MetalRenderer::draw_execute(uint32 
baseVertex, uint32 baseInstance, uint32 LatteSHRC_UpdateActiveShaders(); LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); + if (!vertexShader) + { + printf("no vertex function, skipping draw\n"); + return; + } auto fetchShader = vertexShader->compatibleFetchShader; @@ -648,8 +663,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if (vertexBufferRange.needsRebind) { renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); - // TODO: uncomment - //vertexBufferRange.needsRebind = false; + vertexBufferRange.needsRebind = false; } } @@ -671,7 +685,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 void MetalRenderer::draw_endSequence() { - printf("MetalRenderer::draw_endSequence not implemented\n"); + // TODO: do something? } void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) @@ -878,3 +892,56 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE } } } + +void MetalRenderer::RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder) +{ + // Viewport + if (m_state.viewport.width != 0.0) + { + printf("setting previous viewport X: %f Y: %f width: %f height %f\n", m_state.viewport.originX, m_state.viewport.originY, m_state.viewport.width, m_state.viewport.height); + renderCommandEncoder->setViewport(m_state.viewport); + } + else + { + // Find the framebuffer dimensions + uint32 framebufferWidth = 0, framebufferHeight = 0; + if (m_state.activeFBO->hasDepthBuffer()) + { + framebufferHeight = m_state.activeFBO->depthBuffer.texture->baseTexture->width; + framebufferHeight = m_state.activeFBO->depthBuffer.texture->baseTexture->height; + } + else + { + for (uint8 i = 0; i < 8; i++) + { + auto texture = m_state.activeFBO->colorBuffer[i].texture; + if (texture) 
+ { + framebufferWidth = texture->baseTexture->width; + framebufferHeight = texture->baseTexture->height; + } + } + } + + MTL::Viewport viewport{0, (double)framebufferHeight, (double)framebufferWidth, -(double)framebufferHeight, 0.0, 1.0}; + printf("setting default viewport X: %f Y: %f width: %f height %f\n", viewport.originX, viewport.originY, viewport.width, viewport.height); + renderCommandEncoder->setViewport(viewport); + } + + // Scissor + if (m_state.scissor.width != 0) + { + renderCommandEncoder->setScissorRect(m_state.scissor); + } + + // Vertex buffers + for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) + { + auto& vertexBufferRange = m_state.vertexBuffers[i]; + if (vertexBufferRange.offset != -1) + { + renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + vertexBufferRange.needsRebind = false; + } + } +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 45e9e703e..2816be194 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -32,6 +32,8 @@ struct MetalState class LatteTextureViewMtl* textures[MAX_MTL_TEXTURES] = {nullptr}; MTL::Texture* colorRenderTargets[8] = {nullptr}; MTL::Texture* depthRenderTarget = nullptr; + MTL::Viewport viewport = {0, 0, 0, 0, 0, 0}; + MTL::ScissorRect scissor = {0, 0, 0, 0}; }; enum class MetalEncoderType @@ -272,6 +274,9 @@ class MetalRenderer : public Renderer m_commandEncoder = renderCommandEncoder; m_encoderType = MetalEncoderType::Render; + // Rebind all the render state + RebindRenderState(renderCommandEncoder); + return renderCommandEncoder; } @@ -342,4 +347,6 @@ class MetalRenderer : public Renderer } void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader); + + void RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder); }; From 
d4564c18f27065d0d48c5eacfb185460ce0064db Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 1 Aug 2024 17:55:16 +0200 Subject: [PATCH 021/368] fix: viewport and scissor --- .../Renderer/Metal/MetalMemoryManager.cpp | 5 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 68 ++++++++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 16 +++-- 3 files changed, 59 insertions(+), 30 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 6e609e4f7..53dd8b10d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -80,6 +80,11 @@ void MetalMemoryManager::InitBufferCache(size_t size) void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) { + if ((offset + size) > m_bufferCache->length()) + { + throw std::runtime_error(std::to_string(offset) + " + " + std::to_string(size) + " > " + std::to_string(m_bufferCache->length())); + } + if (!m_bufferCache) { printf("MetalMemoryManager::UploadToBufferCache: buffer cache not initialized\n"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 9874f7713..055b9aa06 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -29,6 +29,15 @@ MetalRenderer::MetalRenderer() m_nearestSampler = m_device->newSamplerState(samplerDescriptor); m_memoryManager = new MetalMemoryManager(this); + + // Initialize state + for (uint32 i = 0; i < (uint32)LatteConst::ShaderType::TotalCount; i++) + { + for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) + { + m_state.uniformBufferOffsets[i][j] = INVALID_OFFSET; + } + } } MetalRenderer::~MetalRenderer() @@ -149,7 +158,8 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput MTL::Texture* colorRenderTargets[8] = {nullptr}; colorRenderTargets[0] = 
m_drawable->texture(); - auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr); + // If there was already an encoder with these attachment, we should set the viewport and scissor to default, but that shouldn't happen + auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, false); // Draw to Metal layer renderCommandEncoder->setRenderPipelineState(m_presentPipeline); @@ -435,7 +445,7 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { - printf("MetalRenderer::buffer_bindUniformBuffer not implemented\n"); + m_state.uniformBufferOffsets[(uint32)shaderType][bufferIndex] = offset; } RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) @@ -872,22 +882,29 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE if (shader->resourceMapping.uniformBuffersBindingPoint[i] >= 0) { uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; - // TODO: don't hardcode - size_t offset = 0; - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: + if (binding >= MAX_MTL_BUFFERS) { - renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), offset, binding); - break; + printf("too big buffer index (%u), skipping binding\n", binding); + continue; } - case LatteConst::ShaderType::Pixel: + size_t offset = m_state.uniformBufferOffsets[(uint32)shader->shaderType][binding]; + if (offset != INVALID_OFFSET) { - renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); - break; - } - default: - UNREACHABLE; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + 
renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), offset, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); + break; + } + default: + UNREACHABLE; + } } } } @@ -896,10 +913,10 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE void MetalRenderer::RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder) { // Viewport - if (m_state.viewport.width != 0.0) - { - printf("setting previous viewport X: %f Y: %f width: %f height %f\n", m_state.viewport.originX, m_state.viewport.originY, m_state.viewport.width, m_state.viewport.height); - renderCommandEncoder->setViewport(m_state.viewport); + //if (m_state.viewport.width != 0.0) + //{ + renderCommandEncoder->setViewport(m_state.viewport); + /* } else { @@ -919,26 +936,27 @@ void MetalRenderer::RebindRenderState(MTL::RenderCommandEncoder* renderCommandEn { framebufferWidth = texture->baseTexture->width; framebufferHeight = texture->baseTexture->height; + break; } } } MTL::Viewport viewport{0, (double)framebufferHeight, (double)framebufferWidth, -(double)framebufferHeight, 0.0, 1.0}; - printf("setting default viewport X: %f Y: %f width: %f height %f\n", viewport.originX, viewport.originY, viewport.width, viewport.height); renderCommandEncoder->setViewport(viewport); } + */ // Scissor - if (m_state.scissor.width != 0) - { - renderCommandEncoder->setScissorRect(m_state.scissor); - } + //if (m_state.scissor.width != 0) + //{ + renderCommandEncoder->setScissorRect(m_state.scissor); + //} // Vertex buffers for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) { auto& vertexBufferRange = m_state.vertexBuffers[i]; - if (vertexBufferRange.offset != -1) + if (vertexBufferRange.offset != INVALID_OFFSET) { renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); vertexBufferRange.needsRebind = false; 
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 2816be194..fff6c6c2d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -18,10 +18,12 @@ #define MAX_MTL_TEXTURES 31 +constexpr size_t INVALID_OFFSET = std::numeric_limits::max(); + struct MetalBoundBuffer { bool needsRebind = false; - sint32 offset = -1; + size_t offset = INVALID_OFFSET; }; struct MetalState @@ -30,6 +32,7 @@ struct MetalState class CachedFBOMtl* activeFBO = nullptr; MetalBoundBuffer vertexBuffers[MAX_MTL_BUFFERS] = {{}}; class LatteTextureViewMtl* textures[MAX_MTL_TEXTURES] = {nullptr}; + size_t uniformBufferOffsets[(uint32)LatteConst::ShaderType::TotalCount][MAX_MTL_BUFFERS]; MTL::Texture* colorRenderTargets[8] = {nullptr}; MTL::Texture* depthRenderTarget = nullptr; MTL::Viewport viewport = {0, 0, 0, 0, 0, 0}; @@ -210,7 +213,7 @@ class MetalRenderer : public Renderer MTL::CommandBuffer* m_commandBuffer = nullptr; MetalEncoderType m_encoderType = MetalEncoderType::None; MTL::CommandEncoder* m_commandEncoder = nullptr; - CA::MetalDrawable* m_drawable; + CA::MetalDrawable* m_drawable = nullptr; // State MetalState m_state; @@ -227,7 +230,7 @@ class MetalRenderer : public Renderer } } - MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget) + MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool rebindStateIfNewEncoder = true) { EnsureCommandBuffer(); @@ -274,8 +277,11 @@ class MetalRenderer : public Renderer m_commandEncoder = renderCommandEncoder; m_encoderType = MetalEncoderType::Render; - // Rebind all the render state - RebindRenderState(renderCommandEncoder); + if (rebindStateIfNewEncoder) + { + // Rebind all the render state + 
RebindRenderState(renderCommandEncoder); + } return renderCommandEncoder; } From e4abb305ac831d253b203c846a2f388c059aef92 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 1 Aug 2024 20:45:24 +0200 Subject: [PATCH 022/368] implement blending --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 56 ++++++++++++++++++- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 5 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 39 ++++++++++++- src/gui/CemuApp.cpp | 5 +- 4 files changed, 97 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 1d0ad0a80..a7d5598c9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Common/precompiled.h" #include "Metal/MTLPixelFormat.hpp" +#include "Metal/MTLRenderPipeline.hpp" #include "Metal/MTLVertexDescriptor.hpp" std::map MTL_COLOR_FORMAT_TABLE = { @@ -134,7 +135,6 @@ MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode mode) return MTL::PrimitiveTypeTriangleStrip; default: printf("unimplemented primitive type %u\n", (uint32)mode); - cemu_assert_debug(false); return MTL::PrimitiveTypeTriangle; } } @@ -201,8 +201,58 @@ MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType) case Renderer::INDEX_TYPE::U32: return MTL::IndexTypeUInt32; default: - printf("unsupported index type %u\n", (uint32)indexType); - assert_dbg(); + cemu_assert_suspicious(); return MTL::IndexTypeUInt32; } } + +MTL::BlendOperation GetMtlBlendOp(Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC combineFunc) +{ + switch (combineFunc) + { + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::DST_PLUS_SRC: + return MTL::BlendOperationAdd; + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::SRC_MINUS_DST: + return MTL::BlendOperationSubtract; + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::MIN_DST_SRC: + return 
MTL::BlendOperationMin; + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::MAX_DST_SRC: + return MTL::BlendOperationMax; + case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::DST_MINUS_SRC: + return MTL::BlendOperationReverseSubtract; + default: + cemu_assert_suspicious(); + return MTL::BlendOperationAdd; + } +} + +const MTL::BlendFactor MTL_BLEND_FACTORS[] = +{ + /* 0x00 */ MTL::BlendFactorZero, + /* 0x01 */ MTL::BlendFactorOne, + /* 0x02 */ MTL::BlendFactorSourceColor, + /* 0x03 */ MTL::BlendFactorOneMinusSourceColor, + /* 0x04 */ MTL::BlendFactorSourceAlpha, + /* 0x05 */ MTL::BlendFactorOneMinusSourceAlpha, + /* 0x06 */ MTL::BlendFactorDestinationAlpha, + /* 0x07 */ MTL::BlendFactorOneMinusDestinationAlpha, + /* 0x08 */ MTL::BlendFactorDestinationColor, + /* 0x09 */ MTL::BlendFactorOneMinusDestinationColor, + /* 0x0A */ MTL::BlendFactorSourceAlphaSaturated, + /* 0x0B */ MTL::BlendFactorZero, // TODO + /* 0x0C */ MTL::BlendFactorZero, // TODO + /* 0x0D */ MTL::BlendFactorBlendColor, + /* 0x0E */ MTL::BlendFactorOneMinusBlendColor, + /* 0x0F */ MTL::BlendFactorSource1Color, + /* 0x10 */ MTL::BlendFactorOneMinusSource1Color, + /* 0x11 */ MTL::BlendFactorSource1Alpha, + /* 0x12 */ MTL::BlendFactorOneMinusSource1Alpha, + /* 0x13 */ MTL::BlendFactorBlendAlpha, + /* 0x14 */ MTL::BlendFactorOneMinusBlendAlpha +}; + +MTL::BlendFactor GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR factor) +{ + cemu_assert_debug((uint32)factor < std::size(MTL_BLEND_FACTORS)); + return MTL_BLEND_FACTORS[(uint32)factor]; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index e9eb0b91c..922d536cf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -6,6 +6,7 @@ #include "Cafe/HW/Latte/Core/LatteConst.h" //#include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Metal/MTLRenderPipeline.hpp" struct Uvec2 { uint32 
x; @@ -29,3 +30,7 @@ MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode mode); MTL::VertexFormat GetMtlVertexFormat(uint8 format); MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType); + +MTL::BlendOperation GetMtlBlendOp(Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC combineFunc); + +MTL::BlendFactor GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR factor); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 055b9aa06..00207d658 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -560,7 +560,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteSHRC_UpdateActiveShaders(); LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); - if (!vertexShader) + if (!vertexShader || !static_cast(vertexShader->shader)->GetFunction()) { printf("no vertex function, skipping draw\n"); return; @@ -631,7 +631,42 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 { continue; } - renderPipelineDescriptor->colorAttachments()->object(i)->setPixelFormat(texture->GetTexture()->pixelFormat()); + auto colorAttachment = renderPipelineDescriptor->colorAttachments()->object(i); + colorAttachment->setPixelFormat(texture->GetTexture()->pixelFormat()); + + // Blending + const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = LatteGPUState.contextNew.CB_COLOR_CONTROL; + uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); + uint32 renderTargetMask = LatteGPUState.contextNew.CB_TARGET_MASK.get_MASK(); + + bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; + if (blendEnabled) + { + colorAttachment->setBlendingEnabled(true); + + const auto& blendControlReg = LatteGPUState.contextNew.CB_BLENDN_CONTROL[i]; + + auto rgbBlendOp = 
GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); + auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); + auto dstRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); + + colorAttachment->setWriteMask((renderTargetMask >> (i * 4)) & 0xF); + colorAttachment->setRgbBlendOperation(rgbBlendOp); + colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); + if (blendControlReg.get_SEPARATE_ALPHA_BLEND()) + { + colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); + colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); + colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); + } + else + { + colorAttachment->setAlphaBlendOperation(rgbBlendOp); + colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor); + } + } } if (m_state.activeFBO->depthBuffer.texture) { diff --git a/src/gui/CemuApp.cpp b/src/gui/CemuApp.cpp index f91c1e3a1..322980e9d 100644 --- a/src/gui/CemuApp.cpp +++ b/src/gui/CemuApp.cpp @@ -368,7 +368,8 @@ void CemuApp::OnAssertFailure(const wxChar* file, int line, const wxChar* func, #if BOOST_OS_WINDOWS DumpThreadStackTrace(); #endif - cemu_assert_debug(false); + // HACK + //cemu_assert_debug(false); } int CemuApp::FilterEvent(wxEvent& event) @@ -545,5 +546,3 @@ void CemuApp::ActivateApp(wxActivateEvent& event) g_window_info.app_active = event.GetActive(); event.Skip(); } - - From a2d15858ad09da2ee5236f60e6f8ea05eb54ba9f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 2 Aug 2024 17:48:25 +0200 Subject: [PATCH 023/368] fix: shader errors --- .../LatteDecompilerEmitMSL.cpp | 26 ++++-- .../LatteDecompilerEmitMSLHeader.hpp | 9 +- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 56 +++++++------ 
.../Renderer/Metal/LatteTextureViewMtl.cpp | 4 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 4 - src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 1 - .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 82 ++++++++++--------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 4 +- 8 files changed, 106 insertions(+), 80 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 215874373..ae6ca8f38 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -672,7 +672,7 @@ static void _emitUniformAccessCode(LatteDecompilerShaderContext* shaderContext, uniformBufferIndex = aluInstruction->cfInstruction->cBank1Index; } _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); - src->addFmt("ubuff{}[", uniformBufferIndex); + src->addFmt("ubuff{}.d[", uniformBufferIndex); _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); src->addFmt("]"); @@ -2404,7 +2404,8 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex cemu_assert_debug(texInstruction->textureFetch.srcSel[1] < 4); src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); - src->addFmt(", cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + src->add(")"); + src->addFmt(", cubeMapArrayIndex{}", texInstruction->textureFetch.textureIndex); // cubemap index } else if( texDim == Latte::E_DIM::DIM_1D ) { @@ -2427,10 +2428,17 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if( texOpcode == 
GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) { src->add(","); - if(texOpcode == GPU7_TEX_INST_SAMPLE_LB) - src->add(_FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) + { + src->addFmt("bias({})", _FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + } else + { + // TODO: is this correct + src->add("level("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } } else if( texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) { @@ -2876,7 +2884,7 @@ static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, Latt else src->add("("); - src->addFmt("ubuff{}[", texInstruction->textureFetch.textureIndex - 0x80); + src->addFmt("ubuff{}.d[", texInstruction->textureFetch.textureIndex - 0x80); if( shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) src->addFmt("{}.{}", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); @@ -3611,7 +3619,7 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon { if( shaderContext->analyzer.hasRedcCUBE ) { - fCStr_shaderSource->add("void redcCUBE(float4 src0, float4 src1, out float3 stm, out int faceId)\r\n" + fCStr_shaderSource->add("void redcCUBE(float4 src0, float4 src1, thread float3& stm, thread int& faceId)\r\n" "{\r\n" "// stm -> x .. s, y .. t, z .. 
MajorAxis*2.0\r\n" @@ -3721,6 +3729,12 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "return round(x / 2.0) * 2.0;\r\n" "}\r\n"); + // unpackHalf2x16 + fCStr_shaderSource->add("" + "float2 unpackHalf2x16(float x) {\r\n" + "return float2(as_type(ushort(as_type(x) & 0x00FF)), as_type(ushort((as_type(x) & 0xFF00) >> 16)));\r\n" + "}\r\n"); + // mul non-ieee way (0*NaN/INF => 0.0) if (shaderContext->options->strictMul) { diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 38b153847..e62a7d1cd 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -135,7 +135,7 @@ namespace LatteDecompiler //shaderSrc->addFmt("UNIFORM_BUFFER_LAYOUT({}, {}, {}) ", (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i], (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i]); shaderSrc->addFmt("struct UBuff{} {{" _CRLF, i); - shaderSrc->addFmt("float4 d{}[{}];" _CRLF, i, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); + shaderSrc->addFmt("float4 d[{}];" _CRLF, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); shaderSrc->add("};" _CRLF _CRLF); } } @@ -348,8 +348,11 @@ namespace LatteDecompiler cemu_assert_unimplemented(); } - src->addFmt(" tex{} [[texture({})]]", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); - src->addFmt(", sampler samplr{} [[sampler({})]]", i, shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i]); + // HACK + uint32 textureBinding = 
shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i] % 31; + uint32 samplerBinding = textureBinding % 16; + src->addFmt(" tex{} [[texture({})]]", i, textureBinding); + src->addFmt(", sampler samplr{} [[sampler({})]]", i, samplerBinding); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 9a8a39278..933752ebe 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -25,10 +25,40 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM desc->setHeight(effectiveBaseHeight); desc->setMipmapLevelCount(mipLevels); - if (dim == Latte::E_DIM::DIM_3D) + MTL::TextureType textureType; + switch (dim) + { + case Latte::E_DIM::DIM_1D: + textureType = MTL::TextureType1D; + break; + case Latte::E_DIM::DIM_2D: + case Latte::E_DIM::DIM_2D_MSAA: + textureType = MTL::TextureType2D; + break; + case Latte::E_DIM::DIM_2D_ARRAY: + textureType = MTL::TextureType2DArray; + break; + case Latte::E_DIM::DIM_3D: + textureType = MTL::TextureType3D; + break; + case Latte::E_DIM::DIM_CUBEMAP: + textureType = MTL::TextureTypeCube; // TODO: check this + break; + default: + cemu_assert_unimplemented(); + textureType = MTL::TextureType2D; + break; + } + desc->setTextureType(textureType); + + if (textureType == MTL::TextureType3D) { desc->setDepth(effectiveBaseDepth); } + else if (textureType == MTL::TextureTypeCube || textureType == MTL::TextureTypeCubeArray) + { + desc->setArrayLength(effectiveBaseDepth / 6); + } else { desc->setArrayLength(effectiveBaseDepth); @@ -46,30 +76,6 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM } desc->setUsage(usage); - switch (dim) - { - case Latte::E_DIM::DIM_1D: - desc->setTextureType(MTL::TextureType1D); - break; - case Latte::E_DIM::DIM_2D: - case Latte::E_DIM::DIM_2D_MSAA: - desc->setTextureType(MTL::TextureType2D); - break; - case 
Latte::E_DIM::DIM_2D_ARRAY: - desc->setTextureType(MTL::TextureType2DArray); - break; - case Latte::E_DIM::DIM_3D: - desc->setTextureType(MTL::TextureType3D); - break; - case Latte::E_DIM::DIM_CUBEMAP: - desc->setTextureType(MTL::TextureTypeCube); // TODO: check this - break; - default: - cemu_assert_unimplemented(); - desc->setTextureType(MTL::TextureType2D); - break; - } - m_texture = mtlRenderer->GetDevice()->newTexture(desc); desc->release(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 34dd6f9f1..2e7a33099 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -2,6 +2,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Metal/MTLTexture.hpp" LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_format(format) @@ -39,8 +40,7 @@ LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextur if (textureType == MTL::TextureType3D) { cemu_assert_debug(firstMip == 0); - // TODO: uncomment - //cemu_assert_debug(this->numSlice == baseTexture->depth); + cemu_assert_debug(this->numSlice == baseTexture->depth); baseLayer = 0; layerCount = 1; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index a7d5598c9..049482c65 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,8 +1,4 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "Common/precompiled.h" 
-#include "Metal/MTLPixelFormat.hpp" -#include "Metal/MTLRenderPipeline.hpp" -#include "Metal/MTLVertexDescriptor.hpp" std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, // TODO: correct? diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 922d536cf..54e0de324 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -6,7 +6,6 @@ #include "Cafe/HW/Latte/Core/LatteConst.h" //#include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" -#include "Metal/MTLRenderPipeline.hpp" struct Uvec2 { uint32 x; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 00207d658..54bf1fee7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -61,7 +61,7 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) MTL::Library* presentLibrary = m_device->newLibrary(NS::String::string(presentLibrarySource, NS::ASCIIStringEncoding), nullptr, &error); if (error) { - printf("failed to create present library (error: %s)\n", error->localizedDescription()->utf8String()); + debug_printf("failed to create present library (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); throw; return; @@ -79,7 +79,7 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) presentFragmentFunction->release(); if (error) { - printf("failed to create present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); + debug_printf("failed to create present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); throw; return; @@ -92,19 +92,19 @@ void MetalRenderer::Initialize() void MetalRenderer::Shutdown() { - printf("MetalRenderer::Shutdown not implemented\n"); + 
debug_printf("MetalRenderer::Shutdown not implemented\n"); } bool MetalRenderer::IsPadWindowActive() { - printf("MetalRenderer::IsPadWindowActive not implemented\n"); + debug_printf("MetalRenderer::IsPadWindowActive not implemented\n"); return false; } bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { - printf("MetalRenderer::GetVRAMInfo not implemented\n"); + debug_printf("MetalRenderer::GetVRAMInfo not implemented\n"); usageInMB = 1024; totalInMB = 1024; @@ -114,12 +114,12 @@ bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const void MetalRenderer::ClearColorbuffer(bool padView) { - printf("MetalRenderer::ClearColorbuffer not implemented\n"); + debug_printf("MetalRenderer::ClearColorbuffer not implemented\n"); } void MetalRenderer::DrawEmptyFrame(bool mainWindow) { - printf("MetalRenderer::DrawEmptyFrame not implemented\n"); + debug_printf("MetalRenderer::DrawEmptyFrame not implemented\n"); } void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) @@ -132,7 +132,7 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) m_commandBuffer->presentDrawable(m_drawable); } else { - printf("skipped present!\n"); + debug_printf("skipped present!\n"); } m_drawable = nullptr; @@ -177,7 +177,7 @@ bool MetalRenderer::BeginFrame(bool mainWindow) void MetalRenderer::Flush(bool waitIdle) { - printf("MetalRenderer::Flush not implemented\n"); + debug_printf("MetalRenderer::Flush not implemented\n"); } void MetalRenderer::NotifyLatteCommandProcessorIdle() @@ -188,7 +188,7 @@ void MetalRenderer::NotifyLatteCommandProcessorIdle() void MetalRenderer::AppendOverlayDebugInfo() { - printf("MetalRenderer::AppendOverlayDebugInfo not implemented\n"); + debug_printf("MetalRenderer::AppendOverlayDebugInfo not implemented\n"); } void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) @@ -232,7 +232,7 @@ void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) void 
MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) { - printf("MetalRenderer::texture_releaseTextureUploadBuffer not implemented\n"); + debug_printf("MetalRenderer::texture_releaseTextureUploadBuffer not implemented\n"); } TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) @@ -253,7 +253,7 @@ TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT f case Latte::E_GX2SURFFMT::D32_S8_FLOAT: return TextureDecoder_D32_S8_UINT_X24::getInstance(); default: - printf("invalid depth texture format %u\n", (uint32)format); + debug_printf("invalid depth texture format %u\n", (uint32)format); cemu_assert_debug(false); return nullptr; } @@ -356,7 +356,7 @@ TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT f case Latte::E_GX2SURFFMT::X24_G8_UINT: return TextureDecoder_X24_G8_UINT::getInstance(); // todo - verify default: - printf("invalid color texture format %u\n", (uint32)format); + debug_printf("invalid color texture format %u\n", (uint32)format); cemu_assert_debug(false); return nullptr; } @@ -365,7 +365,7 @@ TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT f void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) { - printf("MetalRenderer::texture_clearSlice not implemented\n"); + debug_printf("MetalRenderer::texture_clearSlice not implemented\n"); } void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) @@ -379,12 +379,12 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { - printf("MetalRenderer::texture_clearColorSlice not 
implemented\n"); + debug_printf("MetalRenderer::texture_clearColorSlice not implemented\n"); } void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) { - printf("MetalRenderer::texture_clearDepthSlice not implemented\n"); + debug_printf("MetalRenderer::texture_clearDepthSlice not implemented\n"); } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) @@ -399,19 +399,19 @@ void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint3 void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) { - printf("MetalRenderer::texture_copyImageSubData not implemented\n"); + debug_printf("MetalRenderer::texture_copyImageSubData not implemented\n"); } LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView) { - printf("MetalRenderer::texture_createReadback not implemented\n"); + debug_printf("MetalRenderer::texture_createReadback not implemented\n"); return nullptr; } void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { - printf("MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion not implemented\n"); + debug_printf("MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion not implemented\n"); } void MetalRenderer::bufferCache_init(const sint32 bufferSize) @@ 
-431,7 +431,7 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { - printf("MetalRenderer::bufferCache_copyStreamoutToMainBuffer not implemented\n"); + debug_printf("MetalRenderer::bufferCache_copyStreamoutToMainBuffer not implemented\n"); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) @@ -455,17 +455,17 @@ RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, ui void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) { - printf("MetalRenderer::streamout_setupXfbBuffer not implemented\n"); + debug_printf("MetalRenderer::streamout_setupXfbBuffer not implemented\n"); } void MetalRenderer::streamout_begin() { - printf("MetalRenderer::streamout_begin not implemented\n"); + debug_printf("MetalRenderer::streamout_begin not implemented\n"); } void MetalRenderer::streamout_rendererFinishDrawcall() { - printf("MetalRenderer::streamout_rendererFinishDrawcall not implemented\n"); + debug_printf("MetalRenderer::streamout_rendererFinishDrawcall not implemented\n"); } void MetalRenderer::draw_beginSequence() @@ -476,7 +476,7 @@ void MetalRenderer::draw_beginSequence() LatteSHRC_UpdateActiveShaders(); if (LatteGPUState.activeShaderHasError) { - printf("Skipping drawcalls due to shader error\n"); + debug_printf("Skipping drawcalls due to shader error\n"); m_state.skipDrawSequence = true; cemu_assert_debug(false); return; @@ -489,14 +489,14 @@ void MetalRenderer::draw_beginSequence() LatteGPUState.repeatTextureInitialization = false; if (!LatteMRT::UpdateCurrentFBO()) { - printf("Rendertarget invalid\n"); + debug_printf("Rendertarget invalid\n"); m_state.skipDrawSequence = true; return; // no render target } if (!hasValidFramebufferAttached) { - printf("Drawcall with no color buffer or depth buffer 
attached\n"); + debug_printf("Drawcall with no color buffer or depth buffer attached\n"); m_state.skipDrawSequence = true; return; // no render target } @@ -534,7 +534,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Render pass if (!m_state.activeFBO) { - printf("no active FBO, skipping draw\n"); + debug_printf("no active FBO, skipping draw\n"); return; } @@ -562,7 +562,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); if (!vertexShader || !static_cast(vertexShader->shader)->GetFunction()) { - printf("no vertex function, skipping draw\n"); + debug_printf("no vertex function, skipping draw\n"); return; } @@ -612,7 +612,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 layout->setStepFunction(MTL::VertexStepFunctionPerInstance); else { - printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); + debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); cemu_assert(false); } } @@ -678,7 +678,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 MTL::RenderPipelineState* renderPipelineState = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); if (error) { - printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); + debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); return; } renderCommandEncoder->setRenderPipelineState(renderPipelineState); @@ -744,7 +744,7 @@ void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, u void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) { - printf("MetalRenderer::indexData_uploadIndexMemory not implemented\n"); + debug_printf("MetalRenderer::indexData_uploadIndexMemory not implemented\n"); } void 
MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader) @@ -759,6 +759,11 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE auto texUnitRegIndex = hostTextureUnit * 7; auto textureView = m_state.textures[hostTextureUnit]; + if (!textureView) + { + debug_printf("invalid bound texture view %u\n", hostTextureUnit); + continue; + } //LatteTexture* baseTexture = textureView->baseTexture; // get texture register word 0 @@ -768,10 +773,11 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE //auto imageViewObj = textureView->GetSamplerView(word4); //info.imageView = imageViewObj->m_textureImageView; - uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i; + // HACK + uint32 textureBinding = (shader->resourceMapping.getTextureBaseBindingPoint() + i) % MAX_MTL_TEXTURES; + uint32 samplerBinding = textureBinding % MAX_MTL_SAMPLERS; uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; - // TODO: uncomment if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) { // TODO: bind the actual sampler @@ -780,12 +786,12 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { case LatteConst::ShaderType::Vertex: { - renderCommandEncoder->setVertexSamplerState(sampler, binding); + renderCommandEncoder->setVertexSamplerState(sampler, samplerBinding); break; } case LatteConst::ShaderType::Pixel: { - renderCommandEncoder->setFragmentSamplerState(sampler, binding); + renderCommandEncoder->setFragmentSamplerState(sampler, samplerBinding); break; } default: @@ -797,12 +803,12 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { case LatteConst::ShaderType::Vertex: { - renderCommandEncoder->setVertexTexture(textureView->GetTexture(), binding); + renderCommandEncoder->setVertexTexture(textureView->GetTexture(), textureBinding); break; } case 
LatteConst::ShaderType::Pixel: { - renderCommandEncoder->setFragmentTexture(textureView->GetTexture(), binding); + renderCommandEncoder->setFragmentTexture(textureView->GetTexture(), textureBinding); break; } default: @@ -919,7 +925,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; if (binding >= MAX_MTL_BUFFERS) { - printf("too big buffer index (%u), skipping binding\n", binding); + debug_printf("too big buffer index (%u), skipping binding\n", binding); continue; } size_t offset = m_state.uniformBufferOffsets[(uint32)shader->shaderType][binding]; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index fff6c6c2d..5e6d6995a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -17,6 +17,7 @@ #define MTL_SUPPORT_BUFFER_BINDING 30 #define MAX_MTL_TEXTURES 31 +#define MAX_MTL_SAMPLERS 16 constexpr size_t INVALID_OFFSET = std::numeric_limits::max(); @@ -31,7 +32,8 @@ struct MetalState bool skipDrawSequence = false; class CachedFBOMtl* activeFBO = nullptr; MetalBoundBuffer vertexBuffers[MAX_MTL_BUFFERS] = {{}}; - class LatteTextureViewMtl* textures[MAX_MTL_TEXTURES] = {nullptr}; + // TODO: find out what is the max number of bound textures on the Wii U + class LatteTextureViewMtl* textures[64] = {nullptr}; size_t uniformBufferOffsets[(uint32)LatteConst::ShaderType::TotalCount][MAX_MTL_BUFFERS]; MTL::Texture* colorRenderTargets[8] = {nullptr}; MTL::Texture* depthRenderTarget = nullptr; From 4173675f03ca102df010eef2ea327d9624f16cc0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 2 Aug 2024 19:42:34 +0200 Subject: [PATCH 024/368] implement depth state & texture clears --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 20 +++ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 3 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 127 
+++++++++++++++++- 3 files changed, 145 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 049482c65..40eacdf5e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,4 +1,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Common/precompiled.h" +#include "Metal/MTLDepthStencil.hpp" std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, // TODO: correct? @@ -252,3 +254,21 @@ MTL::BlendFactor GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR cemu_assert_debug((uint32)factor < std::size(MTL_BLEND_FACTORS)); return MTL_BLEND_FACTORS[(uint32)factor]; } + +const MTL::CompareFunction MTL_COMPARE_FUNCTIONS[8] = +{ + MTL::CompareFunctionNever, + MTL::CompareFunctionLess, + MTL::CompareFunctionEqual, + MTL::CompareFunctionLessEqual, + MTL::CompareFunctionGreater, + MTL::CompareFunctionNotEqual, + MTL::CompareFunctionGreaterEqual, + MTL::CompareFunctionAlways +}; + +MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func) +{ + cemu_assert_debug((uint32)func < std::size(MTL_COMPARE_FUNCTIONS)); + return MTL_COMPARE_FUNCTIONS[(uint32)func]; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 54e0de324..ed99098df 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -6,6 +6,7 @@ #include "Cafe/HW/Latte/Core/LatteConst.h" //#include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Metal/MTLDepthStencil.hpp" struct Uvec2 { uint32 x; @@ -33,3 +34,5 @@ MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType); MTL::BlendOperation GetMtlBlendOp(Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC combineFunc); MTL::BlendFactor 
GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR factor); + +MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 54bf1fee7..aeedcd820 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -13,7 +13,9 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Foundation/NSTypes.hpp" +#include "Metal/MTLDepthStencil.hpp" #include "Metal/MTLRenderCommandEncoder.hpp" +#include "Metal/MTLRenderPass.hpp" #include "gui/guiWrapper.h" extern bool hasValidFramebufferAttached; @@ -377,14 +379,48 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s mtlTexture->GetTexture()->replaceRegion(MTL::Region(0, 0, width, height), mipIndex, sliceIndex, pixelData, bytesPerRow, bytesPerImage); } +// TODO: use sliceIndex and mipIndex void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { - debug_printf("MetalRenderer::texture_clearColorSlice not implemented\n"); + auto mtlTexture = static_cast(hostTexture)->GetTexture(); + + MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(mtlTexture); + colorAttachment->setClearColor(MTL::ClearColor(r, g, b, a)); + colorAttachment->setLoadAction(MTL::LoadActionClear); + colorAttachment->setStoreAction(MTL::StoreActionStore); + + MTL::Texture* colorRenderTargets[8] = {nullptr}; + colorRenderTargets[0] = mtlTexture; + GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr); } +// TODO: use sliceIndex and mipIndex void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 
mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) { - debug_printf("MetalRenderer::texture_clearDepthSlice not implemented\n"); + auto mtlTexture = static_cast(hostTexture)->GetTexture(); + + MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + if (clearDepth) + { + auto depthAttachment = renderPassDescriptor->depthAttachment(); + depthAttachment->setTexture(mtlTexture); + depthAttachment->setClearDepth(depthValue); + depthAttachment->setLoadAction(MTL::LoadActionClear); + depthAttachment->setStoreAction(MTL::StoreActionStore); + } + if (clearStencil) + { + auto stencilAttachment = renderPassDescriptor->stencilAttachment(); + stencilAttachment->setTexture(mtlTexture); + stencilAttachment->setClearStencil(stencilValue); + stencilAttachment->setLoadAction(MTL::LoadActionClear); + stencilAttachment->setStoreAction(MTL::StoreActionStore); + } + + MTL::Texture* colorRenderTargets[8] = {nullptr}; + GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, mtlTexture); } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) @@ -683,6 +719,87 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } renderCommandEncoder->setRenderPipelineState(renderPipelineState); + // Depth stencil state + bool depthEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_ENABLE(); + auto depthFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_FUNC(); + bool depthWriteEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_WRITE_ENABLE(); + + MTL::DepthStencilDescriptor* depthStencilDescriptor = MTL::DepthStencilDescriptor::alloc()->init(); + depthStencilDescriptor->setDepthWriteEnabled(depthWriteEnable); + + auto depthCompareFunc = 
GetMtlCompareFunc(depthFunc); + if (!depthEnable) + { + depthCompareFunc = MTL::CompareFunctionAlways; + } + depthStencilDescriptor->setDepthCompareFunction(depthCompareFunc); + + // TODO: stencil state + /* + // get stencil control parameters + bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); + bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); + auto frontStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_F(); + auto frontStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_F(); + auto frontStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_F(); + auto frontStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_F(); + auto backStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_B(); + auto backStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_B(); + auto backStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_B(); + auto backStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_B(); + // get stencil control parameters + uint32 stencilCompareMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILMASK_F(); + uint32 stencilWriteMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILWRITEMASK_F(); + uint32 stencilRefFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F(); + uint32 stencilCompareMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); + uint32 stencilWriteMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); + uint32 stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); + + static const VkStencilOp stencilOpTable[8] = { + VK_STENCIL_OP_KEEP, + VK_STENCIL_OP_ZERO, + VK_STENCIL_OP_REPLACE, + VK_STENCIL_OP_INCREMENT_AND_CLAMP, + VK_STENCIL_OP_DECREMENT_AND_CLAMP, + VK_STENCIL_OP_INVERT, + 
VK_STENCIL_OP_INCREMENT_AND_WRAP, + VK_STENCIL_OP_DECREMENT_AND_WRAP + }; + + depthStencilState.stencilTestEnable = stencilEnable ? VK_TRUE : VK_FALSE; + + depthStencilState.front.reference = stencilRefFront; + depthStencilState.front.compareMask = stencilCompareMaskFront; + depthStencilState.front.writeMask = stencilWriteMaskBack; + depthStencilState.front.compareOp = vkDepthCompareTable[(size_t)frontStencilFunc]; + depthStencilState.front.depthFailOp = stencilOpTable[(size_t)frontStencilZFail]; + depthStencilState.front.failOp = stencilOpTable[(size_t)frontStencilFail]; + depthStencilState.front.passOp = stencilOpTable[(size_t)frontStencilZPass]; + + if (backStencilEnable) + { + depthStencilState.back.reference = stencilRefBack; + depthStencilState.back.compareMask = stencilCompareMaskBack; + depthStencilState.back.writeMask = stencilWriteMaskBack; + depthStencilState.back.compareOp = vkDepthCompareTable[(size_t)backStencilFunc]; + depthStencilState.back.depthFailOp = stencilOpTable[(size_t)backStencilZFail]; + depthStencilState.back.failOp = stencilOpTable[(size_t)backStencilFail]; + depthStencilState.back.passOp = stencilOpTable[(size_t)backStencilZPass]; + } + else + { + depthStencilState.back.reference = stencilRefFront; + depthStencilState.back.compareMask = stencilCompareMaskFront; + depthStencilState.back.writeMask = stencilWriteMaskFront; + depthStencilState.back.compareOp = vkDepthCompareTable[(size_t)frontStencilFunc]; + depthStencilState.back.depthFailOp = stencilOpTable[(size_t)frontStencilZFail]; + depthStencilState.back.failOp = stencilOpTable[(size_t)frontStencilFail]; + depthStencilState.back.passOp = stencilOpTable[(size_t)frontStencilZPass]; + } + */ + MTL::DepthStencilState* depthStencilState = m_device->newDepthStencilState(depthStencilDescriptor); + renderCommandEncoder->setDepthStencilState(depthStencilState); + // Primitive type const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); auto 
mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); @@ -840,8 +957,8 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE UNREACHABLE; } - //if (shader->resourceMapping.uniformVarsBufferBindingPoint >= 0) - //{ + if (shader->resourceMapping.uniformVarsBufferBindingPoint >= 0) + { if (shader->uniform.list_ufTexRescale.empty() == false) { for (auto& entry : shader->uniform.list_ufTexRescale) @@ -915,7 +1032,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE default: UNREACHABLE; } - //} + } // Uniform buffers for (sint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) From 4022755a33c4434f5523cb6c5fbaa8f16931a18d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 2 Aug 2024 20:25:37 +0200 Subject: [PATCH 025/368] implement proper bit cast in shaders --- .../LatteDecompilerEmitMSL.cpp | 78 +++++++++++-------- .../LatteDecompilerEmitMSLAttrDecoder.cpp | 9 ++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 +- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 36 +++++---- 4 files changed, 76 insertions(+), 53 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index ae6ca8f38..0e55ffa13 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -802,7 +802,7 @@ static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, L if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) { // need to convert (not cast) from int bits to float - src->add("as_type("); + src->add("bitCast("); } else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_FLOAT ) { @@ -872,7 +872,7 @@ static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, L src->add(_FormatFloatAsConstant(*(float*)&constVal)); } else - src->addFmt("as_type(0x{:08x})", constVal); + 
src->addFmt("bitCast(0x{:08x})", constVal); } } else if( GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel) ) @@ -919,11 +919,11 @@ void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, s return; StringBuf* src = shaderContext->shaderSource; if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("as_type("); + src->add("bitCast("); else if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) - src->add("as_type("); + src->add("bitCast("); else if( sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_FLOAT ) - src->add("as_type("); + src->add("bitCast("); else if( sourceType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) src->add("int("); else if( sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) @@ -1026,7 +1026,7 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte src->add(" = "); if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT ) debugBreakpoint(); // todo - src->add("as_type(tempResultf)"); + src->add("bitCast(tempResultf)"); src->add(";" _CRLF); } else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_INT ) @@ -1123,9 +1123,9 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(");" _CRLF); // INF becomes 0.0 - src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + src->add("if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); // -INF becomes -0.0 - src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + src->add("else if( 
isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); // assign result to output _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); @@ -1145,14 +1145,14 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED) { // note: if( -INF < 0.0 ) does not resolve to true - src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF); - src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF); + src->add("if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF); } else if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF) { // untested (BotW bombs) - src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); - src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + src->add("if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); } // assign result to output _emitInstructionOutputVariableName(shaderContext, aluInstruction); @@ -2002,7 +2002,7 @@ static void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, Latt { _emitInstructionOutputVariableName(shaderContext, &aluInstruction); src->add(" = "); - src->add("as_type(as_type("); + src->add("bitCast(bitCast("); _emitInstructionOutputVariableName(shaderContext, &aluInstruction); src->add(")"); if( aluInstruction.omod == 1 ) @@ -2099,9 +2099,9 @@ 
static void _emitTEXSampleCoordInputComponent(LatteDecompilerShaderContext* shad if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) { if( elementSel == 4 ) - src->add("as_type(0.0)"); + src->add("bitCast(0.0)"); else if( elementSel == 5 ) - src->add("as_type(1.0)"); + src->add("bitCast(1.0)"); } else if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_FLOAT ) { @@ -2116,7 +2116,7 @@ static const char* _texGprAccessElemTable[8] = {"x","y","z","w","_","_","_","_"} static char* _getTexGPRAccess(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, uint32 dataType, sint8 selX, sint8 selY, sint8 selZ, sint8 selW, char* tempBuffer) { - // as_type(R{}i.w) + // bitCast(R{}i.w) *tempBuffer = '\0'; uint8 elemCount = (selX > 0 ? 1 : 0) + (selY > 0 ? 1 : 0) + (selZ > 0 ? 1 : 0) + (selW > 0 ? 1 : 0); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) @@ -2124,7 +2124,7 @@ static char* _getTexGPRAccess(LatteDecompilerShaderContext* shaderContext, sint3 if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) ; // no conversion else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) - strcat(tempBuffer, "as_type("); + strcat(tempBuffer, "bitCast("); else cemu_assert_unimplemented(); strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); @@ -2233,13 +2233,13 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex shaderContext->shaderSource->addFmt(" = int{}(", numWrittenElements); } else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->add(" = as_type("); + src->add(" = bitCast("); } else { // float samplers if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add(" = as_type("); + src->add(" = bitCast("); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->add(" = ("); } @@ -2665,14 +2665,14 @@ static void _emitTEXGetCompTexLodCode(LatteDecompilerShaderContext* shaderContex 
if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("float4(textureQueryLod(tex{}, {}.{}{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); else - src->addFmt("float4(textureQueryLod(tex{}, as_type({}.{}{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + src->addFmt("float4(textureQueryLod(tex{}, bitCast({}.{}{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); } else { if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("float4(textureQueryLod(tex{}, {}.{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); else - src->addFmt("float4(textureQueryLod(tex{}, as_type({}.{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + src->addFmt("float4(textureQueryLod(tex{}, bitCast({}.{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), 
resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); debugBreakpoint(); } @@ -2706,7 +2706,7 @@ static void _emitTEXSetCubemapIndexCode(LatteDecompilerShaderContext* shaderCont const char* resultElemTable[4] = {"x","y","z","w"}; if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt(" = as_type(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); + src->addFmt(" = bitCast(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt(" = R{}f.{};" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); else @@ -2880,7 +2880,7 @@ static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, Latt src->add(" = "); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("as_type("); + src->add("bitCast("); else src->add("("); @@ -2889,7 +2889,7 @@ static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, Latt if( shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) src->addFmt("{}.{}", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); else - src->addFmt("as_type({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); + src->addFmt("bitCast({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); src->add("]."); @@ -2921,7 +2921,7 @@ static void _emitTEXReadMemCode(LatteDecompilerShaderContext* shaderContext, Lat src->add(" = "); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("as_type("); + 
src->add("bitCast("); else src->add("("); @@ -3735,6 +3735,22 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "return float2(as_type(ushort(as_type(x) & 0x00FF)), as_type(ushort((as_type(x) & 0xFF00) >> 16)));\r\n" "}\r\n"); + // Bit cast + + // Scalar + fCStr_shaderSource->add("" + "template\r\n" + "ResultT bitCast(T x) {\r\n" + "return as_type(x);\r\n" + "}\r\n"); + + // Vector + fCStr_shaderSource->add("" + "template\r\n" + "vec bitCast(vec x) {\r\n" + "return as_type>(x);\r\n" + "}\r\n"); + // mul non-ieee way (0*NaN/INF => 0.0) if (shaderContext->options->strictMul) { @@ -3742,7 +3758,7 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(a*b,0.0,a==0.0||b==0.0); }" STR_LINEBREAK); //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(vec2(a*b,0.0),vec2(0.0,0.0),(equal(vec2(a),vec2(0.0,0.0))||equal(vec2(b),vec2(0.0,0.0)))).x; }" STR_LINEBREAK); //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" STR_LINEBREAK); - //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = as_type(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = bitCast(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works // for "min" it used to be: float mul_nonIEEE(float a, float b){ return min(a*b,min(abs(a)*3.40282347E+38F,abs(b)*3.40282347E+38F)); } @@ -3978,7 +3994,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) src->addFmt("{} = int4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); 
else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = float4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: as_type(float4(vid, 0, 0, iid))? + src->addFmt("{} = float4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: bitCast(float4(vid, 0, 0, iid))? else cemu_assert_unimplemented(); } @@ -4021,7 +4037,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("{} = in.position.xyxy;" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); else - src->addFmt("{} = as_type(gl_PointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + src->addFmt("{} = bitCast(gl_PointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); } for (sint32 i = 0; i < psInputTable->count; i++) @@ -4038,7 +4054,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("{} = GET_FRAGCOORD();" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); else - src->addFmt("{} = as_type(GET_FRAGCOORD());" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + src->addFmt("{} = bitCast(GET_FRAGCOORD());" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); continue; } @@ -4056,7 +4072,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, { // import from vertex shader if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = as_type(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + src->addFmt("{} = bitCast(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) 
src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); else @@ -4071,7 +4087,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (frontFace_allBits) cemu_assert_debug(false); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{}.{} = as_type(frontFacing?1.0:0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + src->addFmt("{}.{} = bitCast(frontFacing?1.0:0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("{}.{} = frontFacing ? 1.0 : 0.0;" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); else diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp index cb90e45d6..a9993964b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp @@ -255,7 +255,8 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext { // seen in Giana Sisters: Twisted Dreams _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); - src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); + // TODO: uint4? 
+ src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); } else if (attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) { @@ -305,7 +306,8 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0 ) { // seen in One Piece - src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.xyzw));" _CRLF, attributeInputIndex); + // TODO: uint4? + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.xyzw));" _CRLF, attributeInputIndex); } else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned == 0) { @@ -391,7 +393,8 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext if( attrib->format == FMT_16_16_16_16_FLOAT && attrib->nfa == 2 ) { _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); - src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); + // TODO: uint4? 
+ src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); } else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index aeedcd820..de8282fe2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -161,7 +161,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput MTL::Texture* colorRenderTargets[8] = {nullptr}; colorRenderTargets[0] = m_drawable->texture(); // If there was already an encoder with these attachment, we should set the viewport and scissor to default, but that shouldn't happen - auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, false); + auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, false, false); // Draw to Metal layer renderCommandEncoder->setRenderPipelineState(m_presentPipeline); @@ -393,7 +393,7 @@ void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sl MTL::Texture* colorRenderTargets[8] = {nullptr}; colorRenderTargets[0] = mtlTexture; - GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr); + GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, true); } // TODO: use sliceIndex and mipIndex @@ -420,7 +420,7 @@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl } MTL::Texture* colorRenderTargets[8] = {nullptr}; - GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, mtlTexture); + GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, mtlTexture, true); } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, 
Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 5e6d6995a..8ac871189 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -232,36 +232,40 @@ class MetalRenderer : public Renderer } } - MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool rebindStateIfNewEncoder = true) + // Some render passes clear the attachments, forceRecreate is supposed to be used in those cases + MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate = false, bool rebindStateIfNewEncoder = true) { EnsureCommandBuffer(); // Check if we need to begin a new render pass if (m_commandEncoder) { - if (m_encoderType == MetalEncoderType::Render) + if (!forceRecreate) { - bool needsNewRenderPass = false; - for (uint8 i = 0; i < 8; i++) + if (m_encoderType == MetalEncoderType::Render) { - if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.colorRenderTargets[i])) + bool needsNewRenderPass = false; + for (uint8 i = 0; i < 8; i++) { - needsNewRenderPass = true; - break; + if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.colorRenderTargets[i])) + { + needsNewRenderPass = true; + break; + } } - } - if (!needsNewRenderPass) - { - if (depthRenderTarget && (depthRenderTarget != m_state.depthRenderTarget)) + if (!needsNewRenderPass) { - needsNewRenderPass = true; + if (depthRenderTarget && (depthRenderTarget != m_state.depthRenderTarget)) + { + needsNewRenderPass = true; + } } - } - if (!needsNewRenderPass) - { - return 
(MTL::RenderCommandEncoder*)m_commandEncoder; + if (!needsNewRenderPass) + { + return (MTL::RenderCommandEncoder*)m_commandEncoder; + } } } From db709c34956a406ed9e31e8fe7dc246419a6e4f4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 3 Aug 2024 08:24:39 +0200 Subject: [PATCH 026/368] fix: present issue --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 9 +++++++++ src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + 2 files changed, 10 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index de8282fe2..ecb75c71f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -137,6 +137,7 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) debug_printf("skipped present!\n"); } m_drawable = nullptr; + m_drawableAcquired = false; CommitCommandBuffer(); } @@ -145,6 +146,14 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) { + if (m_drawableAcquired) + { + debug_printf("drawable already acquired this frame\n"); + return; + } + + m_drawableAcquired = true; + // Acquire drawable m_drawable = m_metalLayer->nextDrawable(); if (!m_drawable) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 8ac871189..a79a515f9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -216,6 +216,7 @@ class MetalRenderer : public Renderer MetalEncoderType m_encoderType = MetalEncoderType::None; MTL::CommandEncoder* m_commandEncoder = nullptr; CA::MetalDrawable* m_drawable = nullptr; + bool m_drawableAcquired = false; // State MetalState m_state; From c6f66167a58f6e04093f758b644d427967403acc Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 3 Aug 2024 08:36:34 +0200 
Subject: [PATCH 027/368] fix: viewport --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 19 +++++++++---------- .../Renderer/Metal/ShaderSourcePresent.h | 1 + 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ecb75c71f..f91a36a47 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -146,19 +146,18 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) { - if (m_drawableAcquired) + if (!m_drawableAcquired) { debug_printf("drawable already acquired this frame\n"); - return; - } - m_drawableAcquired = true; + m_drawableAcquired = true; - // Acquire drawable - m_drawable = m_metalLayer->nextDrawable(); - if (!m_drawable) - { - return; + // Acquire drawable + m_drawable = m_metalLayer->nextDrawable(); + if (!m_drawable) + { + return; + } } MTL::Texture* presentTexture = static_cast(texView)->GetTexture(); @@ -204,7 +203,7 @@ void MetalRenderer::AppendOverlayDebugInfo() void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) { - m_state.viewport = MTL::Viewport{x, y + height, width, -height, nearZ, farZ}; + m_state.viewport = MTL::Viewport{x, y, width, height, nearZ, farZ}; if (m_encoderType == MetalEncoderType::Render) { static_cast(m_commandEncoder)->setViewport(m_state.viewport); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h b/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h index ad3f69715..a77ce2cdb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h @@ -13,6 +13,7 @@ inline const char* presentLibrarySource = \ " VertexOut out;\n" \ " out.position = 
float4(positions[vid], 0.0, 1.0);\n" \ " out.texCoord = positions[vid] * 0.5 + 0.5;\n" \ +" out.texCoord.y = 1.0 - out.texCoord.y;\n" \ "\n" \ " return out;\n" \ "}\n" \ From d7411e27f773e5b9ef1278c11e87d35b9109725f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 3 Aug 2024 09:31:40 +0200 Subject: [PATCH 028/368] implement render pipeline cache --- src/Cafe/CMakeLists.txt | 2 + src/Cafe/HW/Latte/Core/LatteShader.cpp | 1 - .../LegacyShaderDecompiler/LatteDecompiler.h | 12 +- .../Renderer/Metal/LatteTextureViewMtl.cpp | 1 - .../Renderer/Metal/MetalPipelineCache.cpp | 207 ++++++++++++++ .../Latte/Renderer/Metal/MetalPipelineCache.h | 22 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 255 ++++++++++-------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 151 +---------- 8 files changed, 388 insertions(+), 263 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 82a1989f3..fb802c82f 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -553,6 +553,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/CachedFBOMtl.h HW/Latte/Renderer/Metal/MetalMemoryManager.cpp HW/Latte/Renderer/Metal/MetalMemoryManager.h + HW/Latte/Renderer/Metal/MetalPipelineCache.cpp + HW/Latte/Renderer/Metal/MetalPipelineCache.h HW/Latte/Renderer/Metal/ShaderSourcePresent.h ) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 486516efd..66730a9b6 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -615,7 +615,6 @@ LatteDecompilerShader* LatteShader_CreateShaderFromDecompilerOutput(LatteDecompi LatteDecompilerShader* shader = decompilerOutput.shader; shader->baseHash = baseHash; // copy resource mapping - // HACK if (g_renderer->GetType() != RendererAPI::OpenGL) shader->resourceMapping = 
decompilerOutput.resourceMappingVK; else diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 1159614e5..78af1deca 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -36,7 +36,7 @@ typedef struct uint16 mappedIndexOffset; // index in remapped uniform array }LatteFastAccessRemappedUniformEntry_buffer_t; -typedef struct +typedef struct { uint32 texUnit; sint32 uniformLocation; @@ -198,7 +198,7 @@ struct LatteDecompilerShader // resource mapping (binding points) LatteDecompilerShaderResourceMapping resourceMapping{}; // uniforms - struct + struct { sint32 loc_remapped; // uf_remappedVS/uf_remappedGS/uf_remappedPS sint32 loc_uniformRegister; // uf_uniformRegisterVS/uf_uniformRegisterGS/uf_uniformRegisterPS @@ -215,7 +215,7 @@ struct LatteDecompilerShader sint32 uniformRangeSize; // entire size of uniform variable block }uniform{ 0 }; // fast access - struct _RemappedUniformBufferGroup + struct _RemappedUniformBufferGroup { _RemappedUniformBufferGroup(uint32 _kcacheBankIdOffset) : kcacheBankIdOffset(_kcacheBankIdOffset) {}; @@ -255,14 +255,14 @@ struct LatteDecompilerOutputUniformOffsets } }; -struct LatteDecompilerOptions +struct LatteDecompilerOptions { bool usesGeometryShader{ false }; // floating point math bool strictMul{}; // if true, 0*anything=0 rule is emulated // Vulkan-specific bool useTFViaSSBO{ false }; - struct + struct { bool hasRoundingModeRTEFloat32{ false }; }spirvInstrinsics; @@ -322,4 +322,4 @@ struct LatteParsedGSCopyShader }; LatteParsedGSCopyShader* LatteGSCopyShaderParser_parse(uint8* programData, uint32 programSize); -bool LatteGSCopyShaderParser_getExportTypeByOffset(LatteParsedGSCopyShader* shaderContext, uint32 offset, uint32* exportType, uint32* exportParam); \ No newline at end of file +bool LatteGSCopyShaderParser_getExportTypeByOffset(LatteParsedGSCopyShader* 
shaderContext, uint32 offset, uint32* exportType, uint32* exportParam); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 2e7a33099..d48b17cc5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -2,7 +2,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "Metal/MTLTexture.hpp" LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_format(format) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp new file mode 100644 index 000000000..59dcdaeee --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -0,0 +1,207 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "HW/Latte/Core/FetchShader.h" +#include "HW/Latte/ISA/RegDefines.h" +#include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" + +MetalPipelineCache::~MetalPipelineCache() +{ + for (auto& pair : m_pipelineCache) + { + pair.second->release(); + } + m_pipelineCache.clear(); +} + +MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +{ + 
uint64 stateHash = CalculatePipelineHash(fetchShader, vertexShader, pixelShader, activeFBO, lcr); + auto& pipeline = m_pipelineCache[stateHash]; + if (pipeline) + { + return pipeline; + } + + // Vertex descriptor + MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); + for (auto& bufferGroup : fetchShader->bufferGroups) + { + std::optional fetchType; + + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& attr = bufferGroup.attrib[j]; + + uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not used? + + auto attribute = vertexDescriptor->attributes()->object(semanticId); + attribute->setOffset(attr.offset); + // Bind from the end to not conflict with uniform buffers + attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); + attribute->setFormat(GetMtlVertexFormat(attr.format)); + + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; + + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } + + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + // TODO: is LatteGPUState.contextNew correct? 
+ uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + layout->setStride(bufferStride); + if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerVertex); + else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerInstance); + else + { + debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); + cemu_assert(false); + } + } + + // Render pipeline state + MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(static_cast(vertexShader->shader)->GetFunction()); + renderPipelineDescriptor->setFragmentFunction(static_cast(pixelShader->shader)->GetFunction()); + // TODO: don't always set the vertex descriptor + renderPipelineDescriptor->setVertexDescriptor(vertexDescriptor); + for (uint8 i = 0; i < 8; i++) + { + const auto& colorBuffer = activeFBO->colorBuffer[i]; + auto texture = static_cast(colorBuffer.texture); + if (!texture) + { + continue; + } + auto colorAttachment = renderPipelineDescriptor->colorAttachments()->object(i); + colorAttachment->setPixelFormat(texture->GetTexture()->pixelFormat()); + + // Blending + const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = LatteGPUState.contextNew.CB_COLOR_CONTROL; + uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); + uint32 renderTargetMask = LatteGPUState.contextNew.CB_TARGET_MASK.get_MASK(); + + bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; + if (blendEnabled) + { + colorAttachment->setBlendingEnabled(true); + + const auto& blendControlReg = LatteGPUState.contextNew.CB_BLENDN_CONTROL[i]; + + auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); + auto srcRgbBlendFactor = 
GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); + auto dstRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); + + colorAttachment->setWriteMask((renderTargetMask >> (i * 4)) & 0xF); + colorAttachment->setRgbBlendOperation(rgbBlendOp); + colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); + if (blendControlReg.get_SEPARATE_ALPHA_BLEND()) + { + colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); + colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); + colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); + } + else + { + colorAttachment->setAlphaBlendOperation(rgbBlendOp); + colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor); + } + } + } + if (activeFBO->depthBuffer.texture) + { + auto texture = static_cast(activeFBO->depthBuffer.texture); + renderPipelineDescriptor->setDepthAttachmentPixelFormat(texture->GetTexture()->pixelFormat()); + } + + NS::Error* error = nullptr; + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); + if (error) + { + debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); + return nullptr; + } + + return pipeline; +} + +uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +{ + // Hash + uint64 stateHash = 0; + for (auto& group : fetchShader->bufferGroups) + { + uint32 bufferStride = group.getCurrentBufferStride(lcr.GetRawView()); + stateHash = std::rotl(stateHash, 7); + stateHash += bufferStride * 3; + } + + stateHash += 
fetchShader->getVkPipelineHashFragment(); + stateHash = std::rotl(stateHash, 7); + + stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE]; + stateHash = std::rotl(stateHash, 7); + + stateHash += lcr.GetRawView()[mmVGT_STRMOUT_EN]; + stateHash = std::rotl(stateHash, 7); + + if(lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL()) + stateHash += 0x333333; + + stateHash = (stateHash >> 8) + (stateHash * 0x370531ull) % 0x7F980D3BF9B4639Dull; + + uint32* ctxRegister = lcr.GetRawView(); + + if (vertexShader) + stateHash += vertexShader->baseHash; + + stateHash = std::rotl(stateHash, 13); + + if (pixelShader) + stateHash += pixelShader->baseHash + pixelShader->auxHash; + + stateHash = std::rotl(stateHash, 13); + + uint32 polygonCtrl = lcr.PA_SU_SC_MODE_CNTL.getRawValue(); + stateHash += polygonCtrl; + stateHash = std::rotl(stateHash, 7); + + stateHash += ctxRegister[Latte::REGADDR::PA_CL_CLIP_CNTL]; + stateHash = std::rotl(stateHash, 7); + + const auto colorControlReg = ctxRegister[Latte::REGADDR::CB_COLOR_CONTROL]; + stateHash += colorControlReg; + + stateHash += ctxRegister[Latte::REGADDR::CB_TARGET_MASK]; + + const uint32 blendEnableMask = (colorControlReg >> 8) & 0xFF; + if (blendEnableMask) + { + for (auto i = 0; i < 8; ++i) + { + if (((blendEnableMask & (1 << i))) == 0) + continue; + stateHash = std::rotl(stateHash, 7); + stateHash += ctxRegister[Latte::REGADDR::CB_BLEND0_CONTROL + i]; + } + } + + return stateHash; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h new file mode 100644 index 000000000..11f81f883 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "HW/Latte/ISA/LatteReg.h" +#include "HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" + +class MetalPipelineCache +{ +public: + MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalPipelineCache(); + + 
MTL::RenderPipelineState* GetPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + +private: + class MetalRenderer* m_mtlr; + + std::map m_pipelineCache; + + uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f91a36a47..5262360d4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -4,6 +4,8 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h" @@ -13,9 +15,11 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Foundation/NSTypes.hpp" +#include "HW/Latte/Core/Latte.h" #include "Metal/MTLDepthStencil.hpp" #include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLRenderPass.hpp" +#include "Metal/MTLRenderPipeline.hpp" #include "gui/guiWrapper.h" extern bool hasValidFramebufferAttached; @@ -31,6 +35,7 @@ MetalRenderer::MetalRenderer() m_nearestSampler = m_device->newSamplerState(samplerDescriptor); m_memoryManager = new MetalMemoryManager(this); + m_pipelineCache = new MetalPipelineCache(this); // Initialize state for (uint32 i = 0; i < (uint32)LatteConst::ShaderType::TotalCount; i++) @@ -612,119 +617,8 @@ void MetalRenderer::draw_execute(uint32 
baseVertex, uint32 baseInstance, uint32 auto fetchShader = vertexShader->compatibleFetchShader; - // Vertex descriptor - MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); - for (auto& bufferGroup : fetchShader->bufferGroups) - { - std::optional fetchType; - - for (sint32 j = 0; j < bufferGroup.attribCount; ++j) - { - auto& attr = bufferGroup.attrib[j]; - - uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; - if (semanticId == (uint32)-1) - continue; // attribute not used? - - auto attribute = vertexDescriptor->attributes()->object(semanticId); - attribute->setOffset(attr.offset); - // Bind from the end to not conflict with uniform buffers - attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); - attribute->setFormat(GetMtlVertexFormat(attr.format)); - - if (fetchType.has_value()) - cemu_assert_debug(fetchType == attr.fetchType); - else - fetchType = attr.fetchType; - - if (attr.fetchType == LatteConst::INSTANCE_DATA) - { - cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported - } - } - - uint32 bufferIndex = bufferGroup.attributeBufferIndex; - uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; - // TODO: is LatteGPUState.contextNew correct? 
- uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - - auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); - layout->setStride(bufferStride); - if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) - layout->setStepFunction(MTL::VertexStepFunctionPerVertex); - else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) - layout->setStepFunction(MTL::VertexStepFunctionPerInstance); - else - { - debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); - cemu_assert(false); - } - } - // Render pipeline state - MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); - renderPipelineDescriptor->setVertexFunction(static_cast(vertexShader->shader)->GetFunction()); - renderPipelineDescriptor->setFragmentFunction(static_cast(pixelShader->shader)->GetFunction()); - // TODO: don't always set the vertex descriptor - renderPipelineDescriptor->setVertexDescriptor(vertexDescriptor); - for (uint8 i = 0; i < 8; i++) - { - const auto& colorBuffer = m_state.activeFBO->colorBuffer[i]; - auto texture = static_cast(colorBuffer.texture); - if (!texture) - { - continue; - } - auto colorAttachment = renderPipelineDescriptor->colorAttachments()->object(i); - colorAttachment->setPixelFormat(texture->GetTexture()->pixelFormat()); - - // Blending - const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = LatteGPUState.contextNew.CB_COLOR_CONTROL; - uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); - uint32 renderTargetMask = LatteGPUState.contextNew.CB_TARGET_MASK.get_MASK(); - - bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; - if (blendEnabled) - { - colorAttachment->setBlendingEnabled(true); - - const auto& blendControlReg = LatteGPUState.contextNew.CB_BLENDN_CONTROL[i]; - - auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); - auto 
srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); - auto dstRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); - - colorAttachment->setWriteMask((renderTargetMask >> (i * 4)) & 0xF); - colorAttachment->setRgbBlendOperation(rgbBlendOp); - colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); - colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); - if (blendControlReg.get_SEPARATE_ALPHA_BLEND()) - { - colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); - colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); - colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); - } - else - { - colorAttachment->setAlphaBlendOperation(rgbBlendOp); - colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor); - colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor); - } - } - } - if (m_state.activeFBO->depthBuffer.texture) - { - auto texture = static_cast(m_state.activeFBO->depthBuffer.texture); - renderPipelineDescriptor->setDepthAttachmentPixelFormat(texture->GetTexture()->pixelFormat()); - } - - NS::Error* error = nullptr; - MTL::RenderPipelineState* renderPipelineState = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); - if (error) - { - debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); - return; - } + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.activeFBO, LatteGPUState.contextNew); renderCommandEncoder->setRenderPipelineState(renderPipelineState); // Depth stencil state @@ -872,6 +766,143 @@ void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) debug_printf("MetalRenderer::indexData_uploadIndexMemory not implemented\n"); } +void MetalRenderer::EnsureCommandBuffer() +{ + if 
(!m_commandBuffer)
+    {
+        // Debug
+        m_commandQueue->insertDebugCaptureBoundary();
+
+        m_commandBuffer = m_commandQueue->commandBuffer();
+    }
+}
+
+// Some render passes clear the attachments, forceRecreate is supposed to be used in those cases
+MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate, bool rebindStateIfNewEncoder)
+{
+    EnsureCommandBuffer();
+
+    // Check if we need to begin a new render pass
+    if (m_commandEncoder)
+    {
+        if (!forceRecreate)
+        {
+            if (m_encoderType == MetalEncoderType::Render)
+            {
+                bool needsNewRenderPass = false;
+                for (uint8 i = 0; i < 8; i++)
+                {
+                    if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.colorRenderTargets[i]))
+                    {
+                        needsNewRenderPass = true;
+                        break;
+                    }
+                }
+
+                if (!needsNewRenderPass)
+                {
+                    if (depthRenderTarget && (depthRenderTarget != m_state.depthRenderTarget))
+                    {
+                        needsNewRenderPass = true;
+                    }
+                }
+
+                if (!needsNewRenderPass)
+                {
+                    return (MTL::RenderCommandEncoder*)m_commandEncoder;
+                }
+            }
+        }
+
+        EndEncoding();
+    }
+
+    // Update state
+    for (uint8 i = 0; i < 8; i++)
+    {
+        m_state.colorRenderTargets[i] = colorRenderTargets[i];
+    }
+    m_state.depthRenderTarget = depthRenderTarget;
+
+    auto renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor);
+    m_commandEncoder = renderCommandEncoder;
+    m_encoderType = MetalEncoderType::Render;
+
+    if (rebindStateIfNewEncoder)
+    {
+        // Rebind all the render state
+        RebindRenderState(renderCommandEncoder);
+    }
+
+    return renderCommandEncoder;
+}
+
+MTL::ComputeCommandEncoder* MetalRenderer::GetComputeCommandEncoder()
+{
+    if (m_commandEncoder)
+    {
+        if (m_encoderType == MetalEncoderType::Compute) // reuse the active encoder only when it already is a compute encoder (was inverted: "!=")
+        {
+            return (MTL::ComputeCommandEncoder*)m_commandEncoder;
+        }
+
+        EndEncoding(); // the active encoder is of another kind: end it before opening a compute encoder
+    }
+
+    auto computeCommandEncoder = m_commandBuffer->computeCommandEncoder();
+    m_commandEncoder = computeCommandEncoder;
+    m_encoderType = MetalEncoderType::Compute;
+
+    return computeCommandEncoder;
+}
+
+MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder()
+{
+    if (m_commandEncoder)
+    {
+        if (m_encoderType == MetalEncoderType::Blit) // reuse the active encoder only when it already is a blit encoder (was inverted: "!=")
+        {
+            return (MTL::BlitCommandEncoder*)m_commandEncoder;
+        }
+
+        EndEncoding(); // the active encoder is of another kind: end it before opening a blit encoder
+    }
+
+    auto blitCommandEncoder = m_commandBuffer->blitCommandEncoder();
+    m_commandEncoder = blitCommandEncoder;
+    m_encoderType = MetalEncoderType::Blit;
+
+    return blitCommandEncoder;
+}
+
+void MetalRenderer::EndEncoding()
+{
+    if (m_commandEncoder)
+    {
+        m_commandEncoder->endEncoding();
+        m_commandEncoder->release();
+        m_commandEncoder = nullptr;
+    }
+}
+
+void MetalRenderer::CommitCommandBuffer()
+{
+    EndEncoding();
+
+    if (m_commandBuffer)
+    {
+        m_commandBuffer->commit();
+        m_commandBuffer->release();
+        m_commandBuffer = nullptr;
+
+        // Reset temporary buffers
+        m_memoryManager->ResetTemporaryBuffers();
+
+        // Debug
+        m_commandQueue->insertDebugCaptureBoundary();
+    }
+}
+
 void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader)
 {
     sint32 textureCount = shader->resourceMapping.getTextureCount();
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h
index a79a515f9..77aee9ee5 100644
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h
@@ -5,11 +5,6 @@
 #include
 
 #include "Cafe/HW/Latte/Renderer/Renderer.h"
-#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h"
-#include "Metal/MTLComputeCommandEncoder.hpp"
-#include "Metal/MTLRenderCommandEncoder.hpp"
-#include "Metal/MTLRenderPass.hpp"
-#include "Metal/MTLRenderPipeline.hpp"
 
 #define MAX_MTL_BUFFERS 31
 #define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 2)
@@ -199,7 +194,8 @@ class MetalRenderer : public Renderer
 private:
     CA::MetalLayer* m_metalLayer;
 
-    MetalMemoryManager* m_memoryManager;
+ class MetalMemoryManager* m_memoryManager; + class MetalPipelineCache* m_pipelineCache; // Metal objects MTL::Device* m_device; @@ -222,144 +218,13 @@ class MetalRenderer : public Renderer MetalState m_state; // Helpers - void EnsureCommandBuffer() - { - if (!m_commandBuffer) - { - // Debug - m_commandQueue->insertDebugCaptureBoundary(); - - m_commandBuffer = m_commandQueue->commandBuffer(); - } - } - - // Some render passes clear the attachments, forceRecreate is supposed to be used in those cases - MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate = false, bool rebindStateIfNewEncoder = true) - { - EnsureCommandBuffer(); - - // Check if we need to begin a new render pass - if (m_commandEncoder) - { - if (!forceRecreate) - { - if (m_encoderType == MetalEncoderType::Render) - { - bool needsNewRenderPass = false; - for (uint8 i = 0; i < 8; i++) - { - if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.colorRenderTargets[i])) - { - needsNewRenderPass = true; - break; - } - } - - if (!needsNewRenderPass) - { - if (depthRenderTarget && (depthRenderTarget != m_state.depthRenderTarget)) - { - needsNewRenderPass = true; - } - } - - if (!needsNewRenderPass) - { - return (MTL::RenderCommandEncoder*)m_commandEncoder; - } - } - } - - EndEncoding(); - } - - // Update state - for (uint8 i = 0; i < 8; i++) - { - m_state.colorRenderTargets[i] = colorRenderTargets[i]; - } - m_state.depthRenderTarget = depthRenderTarget; - - auto renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); - m_commandEncoder = renderCommandEncoder; - m_encoderType = MetalEncoderType::Render; - - if (rebindStateIfNewEncoder) - { - // Rebind all the render state - RebindRenderState(renderCommandEncoder); - } - - return renderCommandEncoder; - } - - MTL::ComputeCommandEncoder* GetComputeCommandEncoder() - { - if 
(m_commandEncoder) - { - if (m_encoderType != MetalEncoderType::Compute) - { - return (MTL::ComputeCommandEncoder*)m_commandEncoder; - } - - EndEncoding(); - } - - auto computeCommandEncoder = m_commandBuffer->computeCommandEncoder(); - m_commandEncoder = computeCommandEncoder; - m_encoderType = MetalEncoderType::Compute; - - return computeCommandEncoder; - } - - MTL::BlitCommandEncoder* GetBlitCommandEncoder() - { - if (m_commandEncoder) - { - if (m_encoderType != MetalEncoderType::Blit) - { - return (MTL::BlitCommandEncoder*)m_commandEncoder; - } - - EndEncoding(); - } - - auto blitCommandEncoder = m_commandBuffer->blitCommandEncoder(); - m_commandEncoder = blitCommandEncoder; - m_encoderType = MetalEncoderType::Blit; - - return blitCommandEncoder; - } - - void EndEncoding() - { - if (m_commandEncoder) - { - m_commandEncoder->endEncoding(); - m_commandEncoder->release(); - m_commandEncoder = nullptr; - } - } - - void CommitCommandBuffer() - { - EndEncoding(); - - if (m_commandBuffer) - { - m_commandBuffer->commit(); - m_commandBuffer->release(); - m_commandBuffer = nullptr; - - // Reset temporary buffers - m_memoryManager->ResetTemporaryBuffers(); - - // Debug - m_commandQueue->insertDebugCaptureBoundary(); - } - } + void EnsureCommandBuffer(); + MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate = false, bool rebindStateIfNewEncoder = true); + MTL::ComputeCommandEncoder* GetComputeCommandEncoder(); + MTL::BlitCommandEncoder* GetBlitCommandEncoder(); + void EndEncoding(); + void CommitCommandBuffer(); void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader); - void RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder); }; From 406a85672df6edc38a4f0ad686a576c446e2f846 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 3 Aug 2024 10:01:57 +0200 Subject: [PATCH 029/368] 
implement depth stencil cache & fix: textureSize --- src/Cafe/CMakeLists.txt | 2 + src/Cafe/HW/Latte/Core/LatteShader.cpp | 1 + .../LatteDecompilerEmitMSL.cpp | 12 +- .../Renderer/Metal/MetalDepthStencilCache.cpp | 138 ++++++++++++++++++ .../Renderer/Metal/MetalDepthStencilCache.h | 21 +++ .../Renderer/Metal/MetalPipelineCache.cpp | 15 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 89 +---------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + .../Renderer/Metal/RendererShaderMtl.cpp | 2 +- 9 files changed, 183 insertions(+), 98 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index fb802c82f..7d10788a6 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -555,6 +555,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalMemoryManager.h HW/Latte/Renderer/Metal/MetalPipelineCache.cpp HW/Latte/Renderer/Metal/MetalPipelineCache.h + HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp + HW/Latte/Renderer/Metal/MetalDepthStencilCache.h HW/Latte/Renderer/Metal/ShaderSourcePresent.h ) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 66730a9b6..486516efd 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -615,6 +615,7 @@ LatteDecompilerShader* LatteShader_CreateShaderFromDecompilerOutput(LatteDecompi LatteDecompilerShader* shader = decompilerOutput.shader; shader->baseHash = baseHash; // copy resource mapping + // HACK if (g_renderer->GetType() != RendererAPI::OpenGL) shader->resourceMapping = decompilerOutput.resourceMappingVK; else diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 0e55ffa13..3617c7c07 100644 --- 
a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2422,7 +2422,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->add(")"); // avoid truncate to effectively round downwards on texel edges if (ActiveSettings::ForceSamplerRoundToPrecision()) - src->addFmt("+ float2(1.0)/float2(textureSize(tex{}, 0))/512.0", texInstruction->textureFetch.textureIndex); + src->addFmt("+ float2(1.0)/float2(tex{}.get_width(), tex{}.get_height())/512.0", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); } // lod or lod bias parameter if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) @@ -2599,17 +2599,17 @@ static void _emitTEXGetTextureResInfoCode(LatteDecompilerShaderContext* shaderCo auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; if (texDim == Latte::E_DIM::DIM_1D) - src->addFmt(" = int4(textureSize(tex{}, 0),1,1,1).", texInstruction->textureFetch.textureIndex); + src->addFmt(" = int4(tex{}.get_width(), 1, 1, 1).", texInstruction->textureFetch.textureIndex); else if (texDim == Latte::E_DIM::DIM_1D_ARRAY) - src->addFmt(" = int4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_array_size(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); else if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) - src->addFmt(" = int4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex); + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); else if (texDim == Latte::E_DIM::DIM_2D_ARRAY) - src->addFmt(" = int4(textureSize(tex{}, 0),1).", 
texInstruction->textureFetch.textureIndex);
+        src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), tex{}.get_array_size(), 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex);
     else
     {
         cemu_assert_debug(false);
-        src->addFmt(" = int4(textureSize(tex{}, 0),1,1).", texInstruction->textureFetch.textureIndex);
+        src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex);
     }
 
     for(sint32 f=0; f<4; f++)
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp
new file mode 100644
index 000000000..87968ec3f
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp
@@ -0,0 +1,138 @@
+#include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h"
+#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h"
+#include "HW/Latte/ISA/RegDefines.h"
+#include "HW/Latte/Renderer/Metal/LatteToMtl.h"
+#include "Metal/MTLDepthStencil.hpp"
+
+MetalDepthStencilCache::~MetalDepthStencilCache()
+{
+    for (auto& pair : m_depthStencilCache)
+    {
+        pair.second->release();
+    }
+    m_depthStencilCache.clear();
+}
+
+MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const LatteContextRegister& lcr)
+{
+    uint64 stateHash = CalculateDepthStencilHash(lcr);
+    auto& depthStencilState = m_depthStencilCache[stateHash]; // reference into the map: assigning it below stores the new state in the cache
+    if (depthStencilState)
+    {
+        return depthStencilState;
+    }
+
+    // Depth stencil state
+    bool depthEnable = lcr.DB_DEPTH_CONTROL.get_Z_ENABLE(); // read from lcr, the same registers the cache key above was hashed from
+    auto depthFunc = lcr.DB_DEPTH_CONTROL.get_Z_FUNC();
+    bool depthWriteEnable = lcr.DB_DEPTH_CONTROL.get_Z_WRITE_ENABLE();
+
+    MTL::DepthStencilDescriptor* desc = MTL::DepthStencilDescriptor::alloc()->init();
+    desc->setDepthWriteEnabled(depthWriteEnable);
+
+    auto depthCompareFunc =
GetMtlCompareFunc(depthFunc); + if (!depthEnable) + { + depthCompareFunc = MTL::CompareFunctionAlways; + } + desc->setDepthCompareFunction(depthCompareFunc); + + // TODO: stencil state + /* + // get stencil control parameters + bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); + bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); + auto frontStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_F(); + auto frontStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_F(); + auto frontStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_F(); + auto frontStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_F(); + auto backStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_B(); + auto backStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_B(); + auto backStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_B(); + auto backStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_B(); + // get stencil control parameters + uint32 stencilCompareMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILMASK_F(); + uint32 stencilWriteMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILWRITEMASK_F(); + uint32 stencilRefFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F(); + uint32 stencilCompareMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); + uint32 stencilWriteMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); + uint32 stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); + + static const VkStencilOp stencilOpTable[8] = { + VK_STENCIL_OP_KEEP, + VK_STENCIL_OP_ZERO, + VK_STENCIL_OP_REPLACE, + VK_STENCIL_OP_INCREMENT_AND_CLAMP, + VK_STENCIL_OP_DECREMENT_AND_CLAMP, + VK_STENCIL_OP_INVERT, + 
VK_STENCIL_OP_INCREMENT_AND_WRAP, + VK_STENCIL_OP_DECREMENT_AND_WRAP + }; + + depthStencilState.stencilTestEnable = stencilEnable ? VK_TRUE : VK_FALSE; + + depthStencilState.front.reference = stencilRefFront; + depthStencilState.front.compareMask = stencilCompareMaskFront; + depthStencilState.front.writeMask = stencilWriteMaskBack; + depthStencilState.front.compareOp = vkDepthCompareTable[(size_t)frontStencilFunc]; + depthStencilState.front.depthFailOp = stencilOpTable[(size_t)frontStencilZFail]; + depthStencilState.front.failOp = stencilOpTable[(size_t)frontStencilFail]; + depthStencilState.front.passOp = stencilOpTable[(size_t)frontStencilZPass]; + + if (backStencilEnable) + { + depthStencilState.back.reference = stencilRefBack; + depthStencilState.back.compareMask = stencilCompareMaskBack; + depthStencilState.back.writeMask = stencilWriteMaskBack; + depthStencilState.back.compareOp = vkDepthCompareTable[(size_t)backStencilFunc]; + depthStencilState.back.depthFailOp = stencilOpTable[(size_t)backStencilZFail]; + depthStencilState.back.failOp = stencilOpTable[(size_t)backStencilFail]; + depthStencilState.back.passOp = stencilOpTable[(size_t)backStencilZPass]; + } + else + { + depthStencilState.back.reference = stencilRefFront; + depthStencilState.back.compareMask = stencilCompareMaskFront; + depthStencilState.back.writeMask = stencilWriteMaskFront; + depthStencilState.back.compareOp = vkDepthCompareTable[(size_t)frontStencilFunc]; + depthStencilState.back.depthFailOp = stencilOpTable[(size_t)frontStencilZFail]; + depthStencilState.back.failOp = stencilOpTable[(size_t)frontStencilFail]; + depthStencilState.back.passOp = stencilOpTable[(size_t)frontStencilZPass]; + } + */ + + depthStencilState = m_mtlr->GetDevice()->newDepthStencilState(desc); + desc->release(); + + return depthStencilState; +} + +uint64 MetalDepthStencilCache::CalculateDepthStencilHash(const LatteContextRegister& lcr) +{ + uint32* ctxRegister = lcr.GetRawView(); + + // Hash + uint64 stateHash = 0; + 
uint32 depthControl = ctxRegister[Latte::REGADDR::DB_DEPTH_CONTROL]; + bool stencilTestEnable = depthControl & 1; + if (stencilTestEnable) + { + stateHash += ctxRegister[mmDB_STENCILREFMASK]; + stateHash = std::rotl(stateHash, 17); + if(depthControl & (1<<7)) // back stencil enable + { + stateHash += ctxRegister[mmDB_STENCILREFMASK_BF]; + stateHash = std::rotl(stateHash, 13); + } + } + else + { + // zero out stencil related bits (8-31) + depthControl &= 0xFF; + } + + stateHash = std::rotl(stateHash, 17); + stateHash += depthControl; + + return stateHash; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h new file mode 100644 index 000000000..4ce05c286 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "HW/Latte/ISA/LatteReg.h" + +class MetalDepthStencilCache +{ +public: + MetalDepthStencilCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalDepthStencilCache(); + + MTL::DepthStencilState* GetDepthStencilState(const LatteContextRegister& lcr); + +private: + class MetalRenderer* m_mtlr; + + std::map m_depthStencilCache; + + uint64 CalculateDepthStencilHash(const LatteContextRegister& lcr); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 59dcdaeee..28f32193b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -75,11 +75,11 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS } // Render pipeline state - MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); - renderPipelineDescriptor->setVertexFunction(static_cast(vertexShader->shader)->GetFunction()); - 
renderPipelineDescriptor->setFragmentFunction(static_cast(pixelShader->shader)->GetFunction()); + MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); + desc->setVertexFunction(static_cast(vertexShader->shader)->GetFunction()); + desc->setFragmentFunction(static_cast(pixelShader->shader)->GetFunction()); // TODO: don't always set the vertex descriptor - renderPipelineDescriptor->setVertexDescriptor(vertexDescriptor); + desc->setVertexDescriptor(vertexDescriptor); for (uint8 i = 0; i < 8; i++) { const auto& colorBuffer = activeFBO->colorBuffer[i]; @@ -88,7 +88,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS { continue; } - auto colorAttachment = renderPipelineDescriptor->colorAttachments()->object(i); + auto colorAttachment = desc->colorAttachments()->object(i); colorAttachment->setPixelFormat(texture->GetTexture()->pixelFormat()); // Blending @@ -128,11 +128,12 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS if (activeFBO->depthBuffer.texture) { auto texture = static_cast(activeFBO->depthBuffer.texture); - renderPipelineDescriptor->setDepthAttachmentPixelFormat(texture->GetTexture()->pixelFormat()); + desc->setDepthAttachmentPixelFormat(texture->GetTexture()->pixelFormat()); } NS::Error* error = nullptr; - pipeline = m_mtlr->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error); + desc->release(); if (error) { debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 5262360d4..5e1a74ecf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -5,21 +5,16 @@ #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include 
"Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h" -#include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" -#include "Foundation/NSTypes.hpp" #include "HW/Latte/Core/Latte.h" -#include "Metal/MTLDepthStencil.hpp" -#include "Metal/MTLRenderCommandEncoder.hpp" -#include "Metal/MTLRenderPass.hpp" -#include "Metal/MTLRenderPipeline.hpp" #include "gui/guiWrapper.h" extern bool hasValidFramebufferAttached; @@ -36,6 +31,7 @@ MetalRenderer::MetalRenderer() m_memoryManager = new MetalMemoryManager(this); m_pipelineCache = new MetalPipelineCache(this); + m_depthStencilCache = new MetalDepthStencilCache(this); // Initialize state for (uint32 i = 0; i < (uint32)LatteConst::ShaderType::TotalCount; i++) @@ -49,6 +45,8 @@ MetalRenderer::MetalRenderer() MetalRenderer::~MetalRenderer() { + delete m_depthStencilCache; + delete m_pipelineCache; delete m_memoryManager; m_commandQueue->release(); @@ -622,84 +620,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 renderCommandEncoder->setRenderPipelineState(renderPipelineState); // Depth stencil state - bool depthEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_ENABLE(); - auto depthFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_FUNC(); - bool depthWriteEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_WRITE_ENABLE(); - - MTL::DepthStencilDescriptor* depthStencilDescriptor = MTL::DepthStencilDescriptor::alloc()->init(); - depthStencilDescriptor->setDepthWriteEnabled(depthWriteEnable); - - auto depthCompareFunc = GetMtlCompareFunc(depthFunc); - if (!depthEnable) - { - 
depthCompareFunc = MTL::CompareFunctionAlways; - } - depthStencilDescriptor->setDepthCompareFunction(depthCompareFunc); - - // TODO: stencil state - /* - // get stencil control parameters - bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); - bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); - auto frontStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_F(); - auto frontStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_F(); - auto frontStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_F(); - auto frontStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_F(); - auto backStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_B(); - auto backStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_B(); - auto backStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_B(); - auto backStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_B(); - // get stencil control parameters - uint32 stencilCompareMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILMASK_F(); - uint32 stencilWriteMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILWRITEMASK_F(); - uint32 stencilRefFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F(); - uint32 stencilCompareMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); - uint32 stencilWriteMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); - uint32 stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); - - static const VkStencilOp stencilOpTable[8] = { - VK_STENCIL_OP_KEEP, - VK_STENCIL_OP_ZERO, - VK_STENCIL_OP_REPLACE, - VK_STENCIL_OP_INCREMENT_AND_CLAMP, - VK_STENCIL_OP_DECREMENT_AND_CLAMP, - VK_STENCIL_OP_INVERT, - VK_STENCIL_OP_INCREMENT_AND_WRAP, - 
VK_STENCIL_OP_DECREMENT_AND_WRAP - }; - - depthStencilState.stencilTestEnable = stencilEnable ? VK_TRUE : VK_FALSE; - - depthStencilState.front.reference = stencilRefFront; - depthStencilState.front.compareMask = stencilCompareMaskFront; - depthStencilState.front.writeMask = stencilWriteMaskBack; - depthStencilState.front.compareOp = vkDepthCompareTable[(size_t)frontStencilFunc]; - depthStencilState.front.depthFailOp = stencilOpTable[(size_t)frontStencilZFail]; - depthStencilState.front.failOp = stencilOpTable[(size_t)frontStencilFail]; - depthStencilState.front.passOp = stencilOpTable[(size_t)frontStencilZPass]; - - if (backStencilEnable) - { - depthStencilState.back.reference = stencilRefBack; - depthStencilState.back.compareMask = stencilCompareMaskBack; - depthStencilState.back.writeMask = stencilWriteMaskBack; - depthStencilState.back.compareOp = vkDepthCompareTable[(size_t)backStencilFunc]; - depthStencilState.back.depthFailOp = stencilOpTable[(size_t)backStencilZFail]; - depthStencilState.back.failOp = stencilOpTable[(size_t)backStencilFail]; - depthStencilState.back.passOp = stencilOpTable[(size_t)backStencilZPass]; - } - else - { - depthStencilState.back.reference = stencilRefFront; - depthStencilState.back.compareMask = stencilCompareMaskFront; - depthStencilState.back.writeMask = stencilWriteMaskFront; - depthStencilState.back.compareOp = vkDepthCompareTable[(size_t)frontStencilFunc]; - depthStencilState.back.depthFailOp = stencilOpTable[(size_t)frontStencilZFail]; - depthStencilState.back.failOp = stencilOpTable[(size_t)frontStencilFail]; - depthStencilState.back.passOp = stencilOpTable[(size_t)frontStencilZPass]; - } - */ - MTL::DepthStencilState* depthStencilState = m_device->newDepthStencilState(depthStencilDescriptor); + MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); renderCommandEncoder->setDepthStencilState(depthStencilState); // Primitive type diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 77aee9ee5..ac1d55b05 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -196,6 +196,7 @@ class MetalRenderer : public Renderer class MetalMemoryManager* m_memoryManager; class MetalPipelineCache* m_pipelineCache; + class MetalDepthStencilCache* m_depthStencilCache; // Metal objects MTL::Device* m_device; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index b4a43a7c7..f0d5fda1a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -18,13 +18,13 @@ RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type desc->setName(NS::String::string("main0", NS::ASCIIStringEncoding)); error = nullptr; m_function = library->newFunction(desc, &error); + library->release(); if (error) { printf("failed to create function (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); return; } - library->release(); } RendererShaderMtl::~RendererShaderMtl() From fe3b84b4a7566c3bf2564051a9d0ad496a77d167 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 3 Aug 2024 15:01:52 +0200 Subject: [PATCH 030/368] fix: buffer allocator --- .../HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 14 ++++++-------- .../HW/Latte/Renderer/Metal/MetalMemoryManager.h | 6 +++--- .../HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 1 + src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 13 +++++++------ 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 53dd8b10d..9e615c22a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -3,15 
+3,16 @@ const size_t BUFFER_ALLOCATION_SIZE = 8 * 1024 * 1024; -// TODO: uncomment everything -MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) +MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, size_t alignment) { + // Align the size + size = (size + alignment - 1) & ~(alignment - 1); + // First, try to find a free range - /* for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) { auto& range = m_freeBufferRanges[i]; - if (range.size >= size) + if (size <= range.size) { MetalBufferAllocation allocation; allocation.bufferIndex = range.bufferIndex; @@ -29,10 +30,9 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) return allocation; } } - */ // If no free range was found, allocate a new buffer - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(/*std::max(*/size/*, BUFFER_ALLOCATION_SIZE)*/, MTL::ResourceStorageModeShared); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(std::max(size, BUFFER_ALLOCATION_SIZE), MTL::ResourceStorageModeShared); MetalBufferAllocation allocation; allocation.bufferIndex = m_buffers.size(); @@ -42,7 +42,6 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) m_buffers.push_back(buffer); // If the buffer is larger than the requested size, add the remaining space to the free buffer ranges - /* if (size < BUFFER_ALLOCATION_SIZE) { MetalBufferRange range; @@ -52,7 +51,6 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) m_freeBufferRanges.push_back(range); } - */ return allocation; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index c099360fc..58096eabd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -41,7 +41,7 @@ class MetalBufferAllocator return m_buffers[bufferIndex]; } - MetalBufferAllocation GetBufferAllocation(size_t size); + 
MetalBufferAllocation GetBufferAllocation(size_t size, size_t alignment); private: class MetalRenderer* m_mtlr; @@ -68,9 +68,9 @@ class MetalMemoryManager return m_bufferAllocator/*s[bufferAllocatorIndex]*/.GetBuffer(bufferIndex); } - MetalBufferAllocation GetBufferAllocation(size_t size) + MetalBufferAllocation GetBufferAllocation(size_t size, size_t alignment) { - auto allocation = m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.GetBufferAllocation(size); + auto allocation = m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.GetBufferAllocation(size, alignment); //allocation.bufferIndex |= (m_bufferAllocatorIndex << bufferAllocatorIndexShift); return allocation; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 28f32193b..328fd6d00 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -129,6 +129,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS { auto texture = static_cast(activeFBO->depthBuffer.texture); desc->setDepthAttachmentPixelFormat(texture->GetTexture()->pixelFormat()); + // TODO: stencil pixel format } NS::Error* error = nullptr; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 5e1a74ecf..efc9233a2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ #include "Cemu/Logging/CemuDebugLogging.h" #include "HW/Latte/Core/Latte.h" #include "gui/guiWrapper.h" +#include extern bool hasValidFramebufferAttached; @@ -143,6 +144,9 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) m_drawableAcquired = false; CommitCommandBuffer(); + + // Reset temporary buffers + m_memoryManager->ResetTemporaryBuffers(); } void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool 
useLinearTexFilter, @@ -661,7 +665,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 { auto mtlIndexType = GetMtlIndexType(hostIndexType); MTL::Buffer* indexBuffer = m_memoryManager->GetBuffer(indexBufferIndex); - renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, 0, instanceCount, baseVertex, baseInstance); + renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, indexBufferOffset, instanceCount, baseVertex, baseInstance); } else { renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); @@ -675,7 +679,7 @@ void MetalRenderer::draw_endSequence() void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { - auto allocation = m_memoryManager->GetBufferAllocation(size); + auto allocation = m_memoryManager->GetBufferAllocation(size, 4); offset = allocation.bufferOffset; bufferIndex = allocation.bufferIndex; @@ -684,7 +688,7 @@ void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, u void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) { - debug_printf("MetalRenderer::indexData_uploadIndexMemory not implemented\n"); + // Do nothing, since the buffer has shared storage mode } void MetalRenderer::EnsureCommandBuffer() @@ -816,9 +820,6 @@ void MetalRenderer::CommitCommandBuffer() m_commandBuffer->release(); m_commandBuffer = nullptr; - // Reset temporary buffers - m_memoryManager->ResetTemporaryBuffers(); - // Debug m_commandQueue->insertDebugCaptureBoundary(); } From fa53af54db1ac05acc952269968a50f586f8fa50 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 3 Aug 2024 15:26:57 +0200 Subject: [PATCH 031/368] fix: memory leaks --- .../Renderer/Metal/MetalMemoryManager.cpp | 16 +++++++++++++++ .../Latte/Renderer/Metal/MetalMemoryManager.h | 2 ++ .../Renderer/Metal/MetalPipelineCache.cpp | 2 ++ 
.../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 9 ++++++++- .../Renderer/Metal/RendererShaderMtl.cpp | 20 +++++++++---------- 5 files changed, 38 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 9e615c22a..f6f064f31 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -3,6 +3,14 @@ const size_t BUFFER_ALLOCATION_SIZE = 8 * 1024 * 1024; +MetalBufferAllocator::~MetalBufferAllocator() +{ + for (auto buffer : m_buffers) + { + buffer->release(); + } +} + MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, size_t alignment) { // Align the size @@ -55,6 +63,14 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, siz return allocation; } +MetalMemoryManager::~MetalMemoryManager() +{ + if (m_bufferCache) + { + m_bufferCache->release(); + } +} + void* MetalMemoryManager::GetTextureUploadBuffer(size_t size) { if (m_textureUploadBuffer.size() < size) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 58096eabd..b0be29486 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -24,6 +24,7 @@ class MetalBufferAllocator { public: MetalBufferAllocator(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalBufferAllocator(); void ResetTemporaryBuffers() { @@ -54,6 +55,7 @@ class MetalMemoryManager { public: MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer) {} + ~MetalMemoryManager(); void ResetTemporaryBuffers() { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 328fd6d00..d6976a8d7 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -135,9 +135,11 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS NS::Error* error = nullptr; pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error); desc->release(); + vertexDescriptor->release(); if (error) { debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); + error->release(); return nullptr; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index efc9233a2..e99f641c2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -29,6 +29,7 @@ MetalRenderer::MetalRenderer() MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); m_nearestSampler = m_device->newSamplerState(samplerDescriptor); + samplerDescriptor->release(); m_memoryManager = new MetalMemoryManager(this); m_pipelineCache = new MetalPipelineCache(this); @@ -50,6 +51,8 @@ MetalRenderer::~MetalRenderer() delete m_pipelineCache; delete m_memoryManager; + m_nearestSampler->release(); + m_commandQueue->release(); m_device->release(); } @@ -81,19 +84,20 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) renderPipelineDescriptor->setFragmentFunction(presentFragmentFunction); renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(m_metalLayer->pixelFormat()); m_presentPipeline = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); + renderPipelineDescriptor->release(); presentVertexFunction->release(); presentFragmentFunction->release(); if (error) { debug_printf("failed to create present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); - throw; return; } } void MetalRenderer::Initialize() { + Renderer::Initialize(); } void 
MetalRenderer::Shutdown() @@ -177,6 +181,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput colorRenderTargets[0] = m_drawable->texture(); // If there was already an encoder with these attachment, we should set the viewport and scissor to default, but that shouldn't happen auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, false, false); + renderPassDescriptor->release(); // Draw to Metal layer renderCommandEncoder->setRenderPipelineState(m_presentPipeline); @@ -409,6 +414,7 @@ void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sl MTL::Texture* colorRenderTargets[8] = {nullptr}; colorRenderTargets[0] = mtlTexture; GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, true); + renderPassDescriptor->release(); } // TODO: use sliceIndex and mipIndex @@ -436,6 +442,7 @@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl MTL::Texture* colorRenderTargets[8] = {nullptr}; GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, mtlTexture, true); + renderPassDescriptor->release(); } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index f0d5fda1a..dcceb18ad 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -14,17 +14,17 @@ RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type error->release(); return; } - MTL::FunctionDescriptor* desc = MTL::FunctionDescriptor::alloc()->init(); - desc->setName(NS::String::string("main0", NS::ASCIIStringEncoding)); - 
error = nullptr; - m_function = library->newFunction(desc, &error); + //MTL::FunctionDescriptor* desc = MTL::FunctionDescriptor::alloc()->init(); + //desc->setName(NS::String::string("main0", NS::ASCIIStringEncoding)); + //error = nullptr; + m_function = library->newFunction(NS::String::string("main0", NS::ASCIIStringEncoding)); library->release(); - if (error) - { - printf("failed to create function (error: %s)\n", error->localizedDescription()->utf8String()); - error->release(); - return; - } + //if (error) + //{ + // printf("failed to create function (error: %s)\n", error->localizedDescription()->utf8String()); + // error->release(); + // return; + //} } RendererShaderMtl::~RendererShaderMtl() From d2edc41680b290503d8547c0aac26903b7e046c2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 3 Aug 2024 16:15:22 +0200 Subject: [PATCH 032/368] fix: present issues --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 51 ++++++++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 3 +- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index e99f641c2..2c5b7d610 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -102,7 +102,8 @@ void MetalRenderer::Initialize() void MetalRenderer::Shutdown() { - debug_printf("MetalRenderer::Shutdown not implemented\n"); + Renderer::Shutdown(); + CommitCommandBuffer(); } bool MetalRenderer::IsPadWindowActive() @@ -129,7 +130,9 @@ void MetalRenderer::ClearColorbuffer(bool padView) void MetalRenderer::DrawEmptyFrame(bool mainWindow) { - debug_printf("MetalRenderer::DrawEmptyFrame not implemented\n"); + if (!BeginFrame(mainWindow)) + return; + SwapBuffers(mainWindow, !mainWindow); } void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) @@ -145,7 +148,6 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) debug_printf("skipped 
present!\n"); } m_drawable = nullptr; - m_drawableAcquired = false; CommitCommandBuffer(); @@ -157,19 +159,8 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) { - if (!m_drawableAcquired) - { - debug_printf("drawable already acquired this frame\n"); - - m_drawableAcquired = true; - - // Acquire drawable - m_drawable = m_metalLayer->nextDrawable(); - if (!m_drawable) - { - return; - } - } + if (!AcquireNextDrawable()) + return; MTL::Texture* presentTexture = static_cast(texView)->GetTexture(); @@ -193,13 +184,17 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput bool MetalRenderer::BeginFrame(bool mainWindow) { - // TODO - return false; + return AcquireNextDrawable(); } void MetalRenderer::Flush(bool waitIdle) { - debug_printf("MetalRenderer::Flush not implemented\n"); + // TODO: should we? + CommitCommandBuffer(); + if (waitIdle) + { + // TODO + } } void MetalRenderer::NotifyLatteCommandProcessorIdle() @@ -832,6 +827,24 @@ void MetalRenderer::CommitCommandBuffer() } } +bool MetalRenderer::AcquireNextDrawable() +{ + if (m_drawable) + { + // TODO: should this be true? 
+ return true; + } + + m_drawable = m_metalLayer->nextDrawable(); + if (!m_drawable) + { + printf("failed to acquire next drawable\n"); + return false; + } + + return true; +} + void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader) { sint32 textureCount = shader->resourceMapping.getTextureCount(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index ac1d55b05..f5064d54f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -213,7 +213,6 @@ class MetalRenderer : public Renderer MetalEncoderType m_encoderType = MetalEncoderType::None; MTL::CommandEncoder* m_commandEncoder = nullptr; CA::MetalDrawable* m_drawable = nullptr; - bool m_drawableAcquired = false; // State MetalState m_state; @@ -226,6 +225,8 @@ class MetalRenderer : public Renderer void EndEncoding(); void CommitCommandBuffer(); + bool AcquireNextDrawable(); + void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader); void RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder); }; From ce6d4cacd125db6f003a306c8800fead40122f5e Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 3 Aug 2024 19:58:40 +0200 Subject: [PATCH 033/368] implement sampler states --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 19 +++ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 3 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 109 +++++++++++++++++- 3 files changed, 127 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 40eacdf5e..e5913f5d5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Common/precompiled.h" #include 
"Metal/MTLDepthStencil.hpp" +#include "Metal/MTLSampler.hpp" std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, // TODO: correct? @@ -272,3 +273,21 @@ MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func) cemu_assert_debug((uint32)func < std::size(MTL_COMPARE_FUNCTIONS)); return MTL_COMPARE_FUNCTIONS[(uint32)func]; } + +// TODO: clamp to border color? (should be fine though) +const MTL::SamplerAddressMode MTL_SAMPLER_ADDRESS_MODES[] = { + MTL::SamplerAddressModeRepeat, // WRAP + MTL::SamplerAddressModeMirrorRepeat, // MIRROR + MTL::SamplerAddressModeClampToEdge, // CLAMP_LAST_TEXEL + MTL::SamplerAddressModeMirrorClampToEdge, // MIRROR_ONCE_LAST_TEXEL + MTL::SamplerAddressModeClampToEdge, // unsupported HALF_BORDER + MTL::SamplerAddressModeClampToBorderColor, // unsupported MIRROR_ONCE_HALF_BORDER + MTL::SamplerAddressModeClampToBorderColor, // CLAMP_BORDER + MTL::SamplerAddressModeClampToBorderColor // MIRROR_ONCE_BORDER +}; + +MTL::SamplerAddressMode GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clamp) +{ + cemu_assert_debug((uint32)clamp < std::size(MTL_SAMPLER_ADDRESS_MODES)); + return MTL_SAMPLER_ADDRESS_MODES[(uint32)clamp]; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index ed99098df..153b90a01 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -7,6 +7,7 @@ //#include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Metal/MTLDepthStencil.hpp" +#include "Metal/MTLSampler.hpp" struct Uvec2 { uint32 x; @@ -36,3 +37,5 @@ MTL::BlendOperation GetMtlBlendOp(Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC MTL::BlendFactor GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR factor); MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func); + +MTL::SamplerAddressMode 
GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clamp); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 2c5b7d610..395f3fd3c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -15,8 +15,8 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "HW/Latte/Core/Latte.h" +#include "HW/Latte/ISA/LatteReg.h" #include "gui/guiWrapper.h" -#include extern bool hasValidFramebufferAttached; @@ -863,7 +863,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE continue; } - //LatteTexture* baseTexture = textureView->baseTexture; + LatteTexture* baseTexture = textureView->baseTexture; // get texture register word 0 uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; @@ -878,8 +878,108 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) { - // TODO: bind the actual sampler - MTL::SamplerState* sampler = m_nearestSampler; + uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shader->shaderType); + const _LatteRegisterSetSampler* samplerWords = LatteGPUState.contextNew.SQ_TEX_SAMPLER + samplerIndex; + + // TODO: cache this instead + MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); + + // lod + uint32 iMinLOD = samplerWords->WORD1.get_MIN_LOD(); + uint32 iMaxLOD = samplerWords->WORD1.get_MAX_LOD(); + sint32 iLodBias = samplerWords->WORD1.get_LOD_BIAS(); + + // apply relative lod bias from graphic pack + if (baseTexture->overwriteInfo.hasRelativeLodBias) + iLodBias += baseTexture->overwriteInfo.relativeLodBias; + // apply absolute lod bias from graphic pack + if 
(baseTexture->overwriteInfo.hasLodBias) + iLodBias = baseTexture->overwriteInfo.lodBias; + + auto filterMip = samplerWords->WORD0.get_MIP_FILTER(); + if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::NONE) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); + samplerDescriptor->setLodMinClamp(0.0f); + samplerDescriptor->setLodMaxClamp(0.25f); + } + else if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::POINT) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + else if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::LINEAR) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + else + { + // fallback for invalid constants + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + + auto filterMin = samplerWords->WORD0.get_XY_MIN_FILTER(); + cemu_assert_debug(filterMin != Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::BICUBIC); // todo + samplerDescriptor->setMinFilter((filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); + + auto filterMag = samplerWords->WORD0.get_XY_MAG_FILTER(); + samplerDescriptor->setMagFilter((filterMag == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? 
MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); + + auto filterZ = samplerWords->WORD0.get_Z_FILTER(); + // todo: z-filter for texture array samplers is customizable for GPU7 but OpenGL/Vulkan doesn't expose this functionality? + + auto clampX = samplerWords->WORD0.get_CLAMP_X(); + auto clampY = samplerWords->WORD0.get_CLAMP_Y(); + auto clampZ = samplerWords->WORD0.get_CLAMP_Z(); + + samplerDescriptor->setRAddressMode(GetMtlSamplerAddressMode(clampX)); + samplerDescriptor->setSAddressMode(GetMtlSamplerAddressMode(clampY)); + samplerDescriptor->setTAddressMode(GetMtlSamplerAddressMode(clampZ)); + + auto maxAniso = samplerWords->WORD0.get_MAX_ANISO_RATIO(); + + if (baseTexture->overwriteInfo.anisotropicLevel >= 0) + maxAniso = baseTexture->overwriteInfo.anisotropicLevel; + + if (maxAniso > 0) + { + samplerDescriptor->setMaxAnisotropy(1 << maxAniso); + } + + // TODO: set lod bias + //samplerInfo.mipLodBias = (float)iLodBias / 64.0f; + + // depth compare + uint8 depthCompareMode = shader->textureUsesDepthCompare[relative_textureUnit] ? 1 : 0; + if (depthCompareMode == 1) + { + // TODO: is it okay to just cast? 
+ samplerDescriptor->setCompareFunction(GetMtlCompareFunc((Latte::E_COMPAREFUNC)samplerWords->WORD0.get_DEPTH_COMPARE_FUNCTION())); + } + + // border + auto borderType = samplerWords->WORD0.get_BORDER_COLOR_TYPE(); + + if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::TRANSPARENT_BLACK) + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorTransparentBlack); + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_BLACK) + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueBlack); + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_WHITE) + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueWhite); + else + { + // Metal doesn't support custom border color + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueBlack); + } + + MTL::SamplerState* sampler = m_device->newSamplerState(samplerDescriptor); + samplerDescriptor->release(); + switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: @@ -895,6 +995,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE default: UNREACHABLE; } + sampler->release(); } switch (shader->shaderType) From 763d57d9215f4b679d175f6f1ef5f60476d6af41 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 4 Aug 2024 13:30:34 +0200 Subject: [PATCH 034/368] implement texture copy and clear --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 127 ++++++++++ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 2 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 223 +++++++----------- 3 files changed, 220 insertions(+), 132 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index e5913f5d5..a60539dcc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -120,6 +120,133 @@ size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, 
uint return CeilDivide(height, formatInfo.blockTexelSize.y) * bytesPerRow; } +TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth) +{ + if (isDepth) + { + switch (format) + { + case Latte::E_GX2SURFFMT::D24_S8_UNORM: + return TextureDecoder_D24_S8::getInstance(); + case Latte::E_GX2SURFFMT::D24_S8_FLOAT: + return TextureDecoder_NullData64::getInstance(); + case Latte::E_GX2SURFFMT::D32_FLOAT: + return TextureDecoder_R32_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::D16_UNORM: + return TextureDecoder_R16_UNORM::getInstance(); + case Latte::E_GX2SURFFMT::D32_S8_FLOAT: + return TextureDecoder_D32_S8_UINT_X24::getInstance(); + default: + debug_printf("invalid depth texture format %u\n", (uint32)format); + cemu_assert_debug(false); + return nullptr; + } + } else + { + switch (format) + { + case Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT: + return TextureDecoder_R32_G32_B32_A32_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT: + return TextureDecoder_R32_G32_B32_A32_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT: + return TextureDecoder_R16_G16_B16_A16_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT: + return TextureDecoder_R16_G16_B16_A16_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM: + return TextureDecoder_R16_G16_B16_A16::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM: + return TextureDecoder_R16_G16_B16_A16::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM: + return TextureDecoder_R8_G8_B8_A8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM: + return TextureDecoder_R8_G8_B8_A8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB: + return TextureDecoder_R8_G8_B8_A8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT: + return TextureDecoder_R8_G8_B8_A8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT: + return TextureDecoder_R8_G8_B8_A8::getInstance(); + case 
Latte::E_GX2SURFFMT::R32_G32_FLOAT: + return TextureDecoder_R32_G32_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R32_G32_UINT: + return TextureDecoder_R32_G32_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_UNORM: + return TextureDecoder_R16_G16::getInstance(); + case Latte::E_GX2SURFFMT::R16_G16_FLOAT: + return TextureDecoder_R16_G16_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_UNORM: + return TextureDecoder_R8_G8::getInstance(); + case Latte::E_GX2SURFFMT::R8_G8_SNORM: + return TextureDecoder_R8_G8::getInstance(); + case Latte::E_GX2SURFFMT::R4_G4_UNORM: + return TextureDecoder_R4_G4::getInstance(); + case Latte::E_GX2SURFFMT::R32_FLOAT: + return TextureDecoder_R32_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R32_UINT: + return TextureDecoder_R32_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R16_FLOAT: + return TextureDecoder_R16_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R16_UNORM: + return TextureDecoder_R16_UNORM::getInstance(); + case Latte::E_GX2SURFFMT::R16_SNORM: + return TextureDecoder_R16_SNORM::getInstance(); + case Latte::E_GX2SURFFMT::R16_UINT: + return TextureDecoder_R16_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R8_UNORM: + return TextureDecoder_R8::getInstance(); + case Latte::E_GX2SURFFMT::R8_SNORM: + return TextureDecoder_R8::getInstance(); + case Latte::E_GX2SURFFMT::R8_UINT: + return TextureDecoder_R8_UINT::getInstance(); + case Latte::E_GX2SURFFMT::R5_G6_B5_UNORM: + return TextureDecoder_R5_G6_B5_swappedRB::getInstance(); + case Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM: + return TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance(); + case Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM: + return TextureDecoder_A1_B5_G5_R5_UNORM_vulkan::getInstance(); + case Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT: + return TextureDecoder_R11_G11_B10_FLOAT::getInstance(); + case Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM: + return TextureDecoder_R4_G4_B4_A4_UNORM::getInstance(); + case 
Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM: + return TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); + case Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM: + return TextureDecoder_R10_G10_B10_A2_SNORM_To_RGBA16::getInstance(); + case Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB: + return TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); + case Latte::E_GX2SURFFMT::BC1_SRGB: + return TextureDecoder_BC1::getInstance(); + case Latte::E_GX2SURFFMT::BC1_UNORM: + return TextureDecoder_BC1::getInstance(); + case Latte::E_GX2SURFFMT::BC2_UNORM: + return TextureDecoder_BC2::getInstance(); + case Latte::E_GX2SURFFMT::BC2_SRGB: + return TextureDecoder_BC2::getInstance(); + case Latte::E_GX2SURFFMT::BC3_UNORM: + return TextureDecoder_BC3::getInstance(); + case Latte::E_GX2SURFFMT::BC3_SRGB: + return TextureDecoder_BC3::getInstance(); + case Latte::E_GX2SURFFMT::BC4_UNORM: + return TextureDecoder_BC4::getInstance(); + case Latte::E_GX2SURFFMT::BC4_SNORM: + return TextureDecoder_BC4::getInstance(); + case Latte::E_GX2SURFFMT::BC5_UNORM: + return TextureDecoder_BC5::getInstance(); + case Latte::E_GX2SURFFMT::BC5_SNORM: + return TextureDecoder_BC5::getInstance(); + case Latte::E_GX2SURFFMT::R24_X8_UNORM: + return TextureDecoder_R24_X8::getInstance(); + case Latte::E_GX2SURFFMT::X24_G8_UINT: + return TextureDecoder_X24_G8_UINT::getInstance(); // todo - verify + default: + debug_printf("invalid color texture format %u\n", (uint32)format); + cemu_assert_debug(false); + return nullptr; + } + } +} + MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode mode) { switch (mode) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 153b90a01..f0348303c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -26,6 +26,8 @@ size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, 
uint32 height, size_t bytesPerRow); +TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth); + MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode mode); MTL::VertexFormat GetMtlVertexFormat(uint8 format); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 395f3fd3c..67e1cb469 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ #include "Cemu/Logging/CemuDebugLogging.h" #include "HW/Latte/Core/Latte.h" #include "HW/Latte/ISA/LatteReg.h" +#include "Metal/MTLTypes.hpp" #include "gui/guiWrapper.h" extern bool hasValidFramebufferAttached; @@ -249,140 +250,24 @@ void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) { - debug_printf("MetalRenderer::texture_releaseTextureUploadBuffer not implemented\n"); + // TODO: should the texture buffer get released? 
} TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) { - // TODO: move to LatteToMtl - if (isDepth) - { - switch (format) - { - case Latte::E_GX2SURFFMT::D24_S8_UNORM: - return TextureDecoder_D24_S8::getInstance(); - case Latte::E_GX2SURFFMT::D24_S8_FLOAT: - return TextureDecoder_NullData64::getInstance(); - case Latte::E_GX2SURFFMT::D32_FLOAT: - return TextureDecoder_R32_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::D16_UNORM: - return TextureDecoder_R16_UNORM::getInstance(); - case Latte::E_GX2SURFFMT::D32_S8_FLOAT: - return TextureDecoder_D32_S8_UINT_X24::getInstance(); - default: - debug_printf("invalid depth texture format %u\n", (uint32)format); - cemu_assert_debug(false); - return nullptr; - } - } else - { - switch (format) - { - case Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT: - return TextureDecoder_R32_G32_B32_A32_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT: - return TextureDecoder_R32_G32_B32_A32_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT: - return TextureDecoder_R16_G16_B16_A16_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT: - return TextureDecoder_R16_G16_B16_A16_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM: - return TextureDecoder_R16_G16_B16_A16::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM: - return TextureDecoder_R16_G16_B16_A16::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM: - return TextureDecoder_R8_G8_B8_A8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM: - return TextureDecoder_R8_G8_B8_A8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB: - return TextureDecoder_R8_G8_B8_A8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT: - return TextureDecoder_R8_G8_B8_A8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT: - return TextureDecoder_R8_G8_B8_A8::getInstance(); 
- case Latte::E_GX2SURFFMT::R32_G32_FLOAT: - return TextureDecoder_R32_G32_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R32_G32_UINT: - return TextureDecoder_R32_G32_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_UNORM: - return TextureDecoder_R16_G16::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_FLOAT: - return TextureDecoder_R16_G16_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_UNORM: - return TextureDecoder_R8_G8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_SNORM: - return TextureDecoder_R8_G8::getInstance(); - case Latte::E_GX2SURFFMT::R4_G4_UNORM: - return TextureDecoder_R4_G4::getInstance(); - case Latte::E_GX2SURFFMT::R32_FLOAT: - return TextureDecoder_R32_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R32_UINT: - return TextureDecoder_R32_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R16_FLOAT: - return TextureDecoder_R16_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R16_UNORM: - return TextureDecoder_R16_UNORM::getInstance(); - case Latte::E_GX2SURFFMT::R16_SNORM: - return TextureDecoder_R16_SNORM::getInstance(); - case Latte::E_GX2SURFFMT::R16_UINT: - return TextureDecoder_R16_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R8_UNORM: - return TextureDecoder_R8::getInstance(); - case Latte::E_GX2SURFFMT::R8_SNORM: - return TextureDecoder_R8::getInstance(); - case Latte::E_GX2SURFFMT::R8_UINT: - return TextureDecoder_R8_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R5_G6_B5_UNORM: - return TextureDecoder_R5_G6_B5_swappedRB::getInstance(); - case Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM: - return TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance(); - case Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM: - return TextureDecoder_A1_B5_G5_R5_UNORM_vulkan::getInstance(); - case Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT: - return TextureDecoder_R11_G11_B10_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM: - return TextureDecoder_R4_G4_B4_A4_UNORM::getInstance(); - case 
Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM: - return TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); - case Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM: - return TextureDecoder_R10_G10_B10_A2_SNORM_To_RGBA16::getInstance(); - case Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB: - return TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); - case Latte::E_GX2SURFFMT::BC1_SRGB: - return TextureDecoder_BC1::getInstance(); - case Latte::E_GX2SURFFMT::BC1_UNORM: - return TextureDecoder_BC1::getInstance(); - case Latte::E_GX2SURFFMT::BC2_UNORM: - return TextureDecoder_BC2::getInstance(); - case Latte::E_GX2SURFFMT::BC2_SRGB: - return TextureDecoder_BC2::getInstance(); - case Latte::E_GX2SURFFMT::BC3_UNORM: - return TextureDecoder_BC3::getInstance(); - case Latte::E_GX2SURFFMT::BC3_SRGB: - return TextureDecoder_BC3::getInstance(); - case Latte::E_GX2SURFFMT::BC4_UNORM: - return TextureDecoder_BC4::getInstance(); - case Latte::E_GX2SURFFMT::BC4_SNORM: - return TextureDecoder_BC4::getInstance(); - case Latte::E_GX2SURFFMT::BC5_UNORM: - return TextureDecoder_BC5::getInstance(); - case Latte::E_GX2SURFFMT::BC5_SNORM: - return TextureDecoder_BC5::getInstance(); - case Latte::E_GX2SURFFMT::R24_X8_UNORM: - return TextureDecoder_R24_X8::getInstance(); - case Latte::E_GX2SURFFMT::X24_G8_UINT: - return TextureDecoder_X24_G8_UINT::getInstance(); // todo - verify - default: - debug_printf("invalid color texture format %u\n", (uint32)format); - cemu_assert_debug(false); - return nullptr; - } - } + return GetMtlTextureDecoder(format, isDepth); } void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) { - debug_printf("MetalRenderer::texture_clearSlice not implemented\n"); + if (hostTexture->isDepth) + { + texture_clearDepthSlice(hostTexture, sliceIndex, mipIndex, true, hostTexture->hasStencil, 0.0f, 0); + } + else + { + texture_clearColorSlice(hostTexture, sliceIndex, mipIndex, 0.0f, 0.0f, 0.0f, 0.0f); + } } void 
MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) @@ -394,7 +279,6 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s mtlTexture->GetTexture()->replaceRegion(MTL::Region(0, 0, width, height), mipIndex, sliceIndex, pixelData, bytesPerRow, bytesPerImage); } -// TODO: use sliceIndex and mipIndex void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { auto mtlTexture = static_cast(hostTexture)->GetTexture(); @@ -405,6 +289,8 @@ void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sl colorAttachment->setClearColor(MTL::ClearColor(r, g, b, a)); colorAttachment->setLoadAction(MTL::LoadActionClear); colorAttachment->setStoreAction(MTL::StoreActionStore); + colorAttachment->setSlice(sliceIndex); + colorAttachment->setLevel(mipIndex); MTL::Texture* colorRenderTargets[8] = {nullptr}; colorRenderTargets[0] = mtlTexture; @@ -412,7 +298,6 @@ void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sl renderPassDescriptor->release(); } -// TODO: use sliceIndex and mipIndex void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) { auto mtlTexture = static_cast(hostTexture)->GetTexture(); @@ -425,6 +310,8 @@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl depthAttachment->setClearDepth(depthValue); depthAttachment->setLoadAction(MTL::LoadActionClear); depthAttachment->setStoreAction(MTL::StoreActionStore); + depthAttachment->setSlice(sliceIndex); + depthAttachment->setLevel(mipIndex); } if (clearStencil) { @@ -433,6 +320,8 @@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl 
stencilAttachment->setClearStencil(stencilValue); stencilAttachment->setLoadAction(MTL::LoadActionClear); stencilAttachment->setStoreAction(MTL::StoreActionStore); + stencilAttachment->setSlice(sliceIndex); + stencilAttachment->setLevel(mipIndex); } MTL::Texture* colorRenderTargets[8] = {nullptr}; @@ -450,9 +339,79 @@ void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint3 m_state.textures[textureUnit] = static_cast(textureView); } -void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) +void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth_) { - debug_printf("MetalRenderer::texture_copyImageSubData not implemented\n"); + auto blitCommandEncoder = GetBlitCommandEncoder(); + + auto mtlSrc = static_cast(src)->GetTexture(); + auto mtlDst = static_cast(dst)->GetTexture(); + + uint32 srcBaseLayer = 0; + uint32 dstBaseLayer = 0; + uint32 srcOffsetZ = 0; + uint32 dstOffsetZ = 0; + uint32 srcLayerCount = 1; + uint32 dstLayerCount = 1; + uint32 srcDepth = 1; + uint32 dstDepth = 1; + + if (src->Is3DTexture()) + { + srcOffsetZ = srcSlice; + srcDepth = srcDepth_; + } + else + { + srcBaseLayer = srcSlice; + srcLayerCount = srcDepth_; + } + + if (dst->Is3DTexture()) + { + dstOffsetZ = dstSlice; + dstDepth = srcDepth_; + } + else + { + dstBaseLayer = dstSlice; + dstLayerCount = srcDepth_; + } + + // If copying whole textures, we can do a more efficient copy + if (effectiveSrcX == 0 && effectiveSrcY == 0 && effectiveDstX == 0 && effectiveDstY == 0 && + 
effectiveCopyWidth == src->GetMipWidth(srcMip) && effectiveCopyHeight == src->GetMipHeight(srcMip) && + effectiveCopyWidth == dst->GetMipWidth(dstMip) && effectiveCopyHeight == dst->GetMipHeight(dstMip) && + srcLayerCount == dstLayerCount) + { + blitCommandEncoder->copyFromTexture(mtlSrc, srcSlice, srcMip, mtlDst, dstSlice, dstMip, srcLayerCount, 1); + } + else + { + if (srcLayerCount == dstLayerCount) + { + for (uint32 i = 0; i < srcLayerCount; i++) + { + blitCommandEncoder->copyFromTexture(mtlSrc, srcSlice + i, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, 1), mtlDst, dstSlice + i, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ)); + } + } + else + { + for (uint32 i = 0; i < std::max(srcLayerCount, dstLayerCount); i++) + { + if (srcLayerCount == 1) + srcOffsetZ++; + else + srcSlice++; + + if (dstLayerCount == 1) + dstOffsetZ++; + else + dstSlice++; + + blitCommandEncoder->copyFromTexture(mtlSrc, srcSlice, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, 1), mtlDst, dstSlice, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ)); + } + } + } } LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView) @@ -768,7 +727,7 @@ MTL::ComputeCommandEncoder* MetalRenderer::GetComputeCommandEncoder() { if (m_commandEncoder) { - if (m_encoderType != MetalEncoderType::Compute) + if (m_encoderType == MetalEncoderType::Compute) { return (MTL::ComputeCommandEncoder*)m_commandEncoder; } @@ -787,7 +746,7 @@ MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder() { if (m_commandEncoder) { - if (m_encoderType != MetalEncoderType::Blit) + if (m_encoderType == MetalEncoderType::Blit) { return (MTL::BlitCommandEncoder*)m_commandEncoder; } From 5030a2e84acedb1bb65b077da4e6a79a1ebc7c2a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 4 Aug 2024 15:09:49 +0200 Subject: [PATCH 035/368] 
implement color buffer clear --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 36 +++++++++++-------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 2 ++ 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 67e1cb469..ad1ce1261 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -126,7 +126,10 @@ bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const void MetalRenderer::ClearColorbuffer(bool padView) { - debug_printf("MetalRenderer::ClearColorbuffer not implemented\n"); + if (!AcquireNextDrawable()) + return; + + ClearColorTextureInternal(m_drawable->texture(), 0, 0, 0.0f, 0.0f, 0.0f, 0.0f); } void MetalRenderer::DrawEmptyFrame(bool mainWindow) @@ -283,19 +286,7 @@ void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sl { auto mtlTexture = static_cast(hostTexture)->GetTexture(); - MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); - auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); - colorAttachment->setTexture(mtlTexture); - colorAttachment->setClearColor(MTL::ClearColor(r, g, b, a)); - colorAttachment->setLoadAction(MTL::LoadActionClear); - colorAttachment->setStoreAction(MTL::StoreActionStore); - colorAttachment->setSlice(sliceIndex); - colorAttachment->setLevel(mipIndex); - - MTL::Texture* colorRenderTargets[8] = {nullptr}; - colorRenderTargets[0] = mtlTexture; - GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, true); - renderPassDescriptor->release(); + ClearColorTextureInternal(mtlTexture, sliceIndex, mipIndex, r, g, b, a); } void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) @@ -1162,3 +1153,20 @@ void 
MetalRenderer::RebindRenderState(MTL::RenderCommandEncoder* renderCommandEn } } } + +void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) +{ + MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(mtlTexture); + colorAttachment->setClearColor(MTL::ClearColor(r, g, b, a)); + colorAttachment->setLoadAction(MTL::LoadActionClear); + colorAttachment->setStoreAction(MTL::StoreActionStore); + colorAttachment->setSlice(sliceIndex); + colorAttachment->setLevel(mipIndex); + + MTL::Texture* colorRenderTargets[8] = {nullptr}; + colorRenderTargets[0] = mtlTexture; + GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, true); + renderPassDescriptor->release(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index f5064d54f..b6fb30ad1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -229,4 +229,6 @@ class MetalRenderer : public Renderer void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader); void RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder); + + void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); }; From f11526a244fc9637d1cbddc1be9713f22cdf5a9b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 5 Aug 2024 08:54:11 +0200 Subject: [PATCH 036/368] fix: texture bindings --- .../LatteDecompilerEmitMSL.cpp | 9 +++-- .../LatteDecompilerEmitMSLHeader.hpp | 19 +++------ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 40 +++++++++++++++---- 3 files changed, 44 insertions(+), 24 deletions(-) diff --git 
a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 3617c7c07..91558c3ba 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2193,7 +2193,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->add("."); const char* resultElemTable[4] = {"x","y","z","w"}; sint32 numWrittenElements = 0; - for(sint32 f=0; f<4; f++) + for(sint32 f = 0; f < 4; f++) { if( texInstruction->dstSel[f] < 4 ) { @@ -2493,18 +2493,21 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // lod bias if( texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) { - src->add(")"); + src->add(")."); if (numWrittenElements > 1) { // result is copied into multiple channels - src->add("."); for (sint32 f = 0; f < numWrittenElements; f++) { cemu_assert_debug(texInstruction->dstSel[f] == 0); // only x component is defined src->add("x"); } } + else + { + src->add("x"); + } } else { diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index e62a7d1cd..b3380d312 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -6,7 +6,6 @@ namespace LatteDecompiler { auto src = decompilerContext->shaderSource; - LatteDecompilerShaderResourceMapping& resourceMapping = decompilerContext->output->resourceMappingGL; auto& uniformOffsets = decompilerContext->output->uniformOffsetsVK; src->add("struct SupportBuffer {" _CRLF); @@ -129,11 +128,8 @@ namespace LatteDecompiler if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) continue; - 
cemu_assert_debug(decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] >= 0); cemu_assert_debug(decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] >= 0); - //shaderSrc->addFmt("UNIFORM_BUFFER_LAYOUT({}, {}, {}) ", (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i], (sint32)decompilerContext->output->resourceMappingVK.setIndex, (sint32)decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i]); - shaderSrc->addFmt("struct UBuff{} {{" _CRLF, i); shaderSrc->addFmt("float4 d[{}];" _CRLF, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); shaderSrc->add("};" _CRLF _CRLF); @@ -169,9 +165,7 @@ namespace LatteDecompiler { if (decompilerContext->analyzer.inputAttributSemanticMask[i]) { - cemu_assert_debug(decompilerContext->output->resourceMappingGL.attributeMapping[i] >= 0); cemu_assert_debug(decompilerContext->output->resourceMappingVK.attributeMapping[i] >= 0); - cemu_assert_debug(decompilerContext->output->resourceMappingGL.attributeMapping[i] == decompilerContext->output->resourceMappingVK.attributeMapping[i]); src->addFmt("uint4 attrDataSem{} [[attribute({})]];" _CRLF, i, (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i]); } @@ -304,10 +298,9 @@ namespace LatteDecompiler if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) continue; - cemu_assert_debug(decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] >= 0); cemu_assert_debug(decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] >= 0); - src->addFmt(", constant UBuff{}& ubuff{} [[buffer({})]]", i, i, (sint32)decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i]); + src->addFmt(", constant UBuff{}& ubuff{} [[buffer({})]]", i, i, (sint32)decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i]); } } } 
@@ -348,11 +341,11 @@ namespace LatteDecompiler cemu_assert_unimplemented(); } - // HACK - uint32 textureBinding = shaderContext->output->resourceMappingGL.textureUnitToBindingPoint[i] % 31; - uint32 samplerBinding = textureBinding % 16; - src->addFmt(" tex{} [[texture({})]]", i, textureBinding); - src->addFmt(", sampler samplr{} [[sampler({})]]", i, samplerBinding); + uint32 binding = shaderContext->output->resourceMappingVK.textureUnitToBindingPoint[i]; + //uint32 textureBinding = shaderContext->output->resourceMappingVK.textureUnitToBindingPoint[i] % 31; + //uint32 samplerBinding = textureBinding % 16; + src->addFmt(" tex{} [[texture({})]]", i, binding); + src->addFmt(", sampler samplr{} [[sampler({})]]", i, binding); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ad1ce1261..2a3707b6c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -107,9 +107,10 @@ void MetalRenderer::Shutdown() CommitCommandBuffer(); } +// TODO: what should this do? 
bool MetalRenderer::IsPadWindowActive() { - debug_printf("MetalRenderer::IsPadWindowActive not implemented\n"); + //debug_printf("MetalRenderer::IsPadWindowActive not implemented\n"); return false; } @@ -805,6 +806,23 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE auto hostTextureUnit = relative_textureUnit; auto textureDim = shader->textureUnitDim[relative_textureUnit]; auto texUnitRegIndex = hostTextureUnit * 7; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS; + break; + case LatteConst::ShaderType::Pixel: + hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS; + break; + case LatteConst::ShaderType::Geometry: + hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS; + break; + default: + UNREACHABLE; + } auto textureView = m_state.textures[hostTextureUnit]; if (!textureView) @@ -821,9 +839,15 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE //auto imageViewObj = textureView->GetSamplerView(word4); //info.imageView = imageViewObj->m_textureImageView; - // HACK - uint32 textureBinding = (shader->resourceMapping.getTextureBaseBindingPoint() + i) % MAX_MTL_TEXTURES; - uint32 samplerBinding = textureBinding % MAX_MTL_SAMPLERS; + // TODO: uncomment + uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i;//shader->resourceMapping.textureUnitToBindingPoint[hostTextureUnit]; + //uint32 textureBinding = binding % MAX_MTL_TEXTURES; + //uint32 samplerBinding = binding % MAX_MTL_SAMPLERS; + if (binding >= MAX_MTL_TEXTURES) + { + debug_printf("invalid texture binding %u\n", binding); + continue; + } uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; if (stageSamplerIndex != 
LATTE_DECOMPILER_SAMPLER_NONE) @@ -934,12 +958,12 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { case LatteConst::ShaderType::Vertex: { - renderCommandEncoder->setVertexSamplerState(sampler, samplerBinding); + renderCommandEncoder->setVertexSamplerState(sampler, binding); break; } case LatteConst::ShaderType::Pixel: { - renderCommandEncoder->setFragmentSamplerState(sampler, samplerBinding); + renderCommandEncoder->setFragmentSamplerState(sampler, binding); break; } default: @@ -952,12 +976,12 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { case LatteConst::ShaderType::Vertex: { - renderCommandEncoder->setVertexTexture(textureView->GetTexture(), textureBinding); + renderCommandEncoder->setVertexTexture(textureView->GetTexture(), binding); break; } case LatteConst::ShaderType::Pixel: { - renderCommandEncoder->setFragmentTexture(textureView->GetTexture(), textureBinding); + renderCommandEncoder->setFragmentTexture(textureView->GetTexture(), binding); break; } default: From d64e64e5ef30c39c56853fa556db653080149fb7 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 6 Aug 2024 06:43:22 +0200 Subject: [PATCH 037/368] fix: texture updates and buffer bindings --- src/Cafe/HW/Latte/Core/LatteTexture.cpp | 10 ++-- .../LatteDecompilerEmitMSLHeader.hpp | 2 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 3 +- .../Renderer/Metal/MetalPipelineCache.cpp | 1 - .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 60 ++++++++++++------- 5 files changed, 46 insertions(+), 30 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTexture.cpp b/src/Cafe/HW/Latte/Core/LatteTexture.cpp index d88528910..18e686ac0 100644 --- a/src/Cafe/HW/Latte/Core/LatteTexture.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTexture.cpp @@ -170,7 +170,7 @@ void LatteTexture_UnregisterTextureMemoryOccupancy(LatteTexture* texture) } // calculate the actually accessed data range -// the resulting range is an estimate and may be smaller than the 
actual slice size (but not larger) +// the resulting range is an estimate and may be smaller than the actual slice size (but not larger) void LatteTexture_EstimateMipSliceAccessedDataRange(LatteTexture* texture, sint32 sliceIndex, sint32 mipIndex, LatteTextureSliceMipInfo* sliceMipInfo) { uint32 estAddrStart; @@ -222,7 +222,7 @@ void LatteTexture_InitSliceAndMipInfo(LatteTexture* texture) LatteAddrLib::AddrSurfaceInfo_OUT surfaceInfo; LatteAddrLib::GX2CalculateSurfaceInfo(texture->format, texture->width, texture->height, texture->depth, texture->dim, Latte::MakeGX2TileMode(texture->tileMode), 0, mipIndex, &surfaceInfo); sliceMipInfo->tileMode = surfaceInfo.hwTileMode; - + if (mipIndex == 0) sliceMipInfo->pitch = texture->pitch; // for the base level, use the pitch value configured in hardware else @@ -352,6 +352,7 @@ void LatteTexture_CopySlice(LatteTexture* srcTexture, sint32 srcSlice, sint32 sr if (srcTexture->isDepth != dstTexture->isDepth) { g_renderer->surfaceCopy_copySurfaceWithFormatConversion(srcTexture, srcMip, srcSlice, dstTexture, dstMip, dstSlice, width, height); + throw std::runtime_error("1"); return; } // rescale copy size @@ -384,6 +385,7 @@ void LatteTexture_CopySlice(LatteTexture* srcTexture, sint32 srcSlice, sint32 sr cemuLog_log(LogType::Force, "Source: {:08x} origResolution {:4}/{:4} effectiveResolution {:4}/{:4} fmt {:04x} mipIndex {} ratioW/H: {:.4}/{:.4}", srcTexture->physAddress, srcTexture->width, srcTexture->height, effectiveWidth_src, effectiveHeight_src, (uint32)srcTexture->format, srcMip, ratioWidth_src, ratioHeight_src); cemuLog_log(LogType::Force, "Destination: {:08x} origResolution {:4}/{:4} effectiveResolution {:4}/{:4} fmt {:04x} mipIndex {} ratioW/H: {:.4}/{:.4}", dstTexture->physAddress, dstTexture->width, dstTexture->height, effectiveWidth_dst, effectiveHeight_dst, (uint32)dstTexture->format, dstMip, ratioWidth_dst, ratioHeight_dst); } + throw std::runtime_error("2"); //cemuLog_logDebug(LogType::Force, "If these textures are 
not meant to share data you can ignore this"); return; } @@ -877,7 +879,7 @@ VIEWCOMPATIBILITY LatteTexture_CanTextureBeRepresentedAsView(LatteTexture* baseT // check pitch if(sliceMipInfo->pitch != pitch) continue; - // check all slices + // check all slices if(LatteAddrLib::TM_IsThickAndMacroTiled(baseTexture->tileMode)) continue; // todo - check only every 4th slice? for (sint32 s=0; sGetMipDepth(m); s++) @@ -978,7 +980,7 @@ LatteTextureView* LatteTexture_CreateMapping(MPTR physAddr, MPTR physMipAddr, si } // note: When creating an existing texture, we only allow mip and slice expansion at the end cemu_assert_debug(depth); - + cemu_assert_debug(!(depth > 1 && dimBase == Latte::E_DIM::DIM_2D)); cemu_assert_debug(!(numSlice > 1 && dimView == Latte::E_DIM::DIM_2D)); // todo, depth and numSlice are redundant diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index b3380d312..92a5fb133 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -261,7 +261,7 @@ namespace LatteDecompiler // generate pixel outputs for pixel shader for (uint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) { - if ((decompilerContext->shader->pixelColorOutputMask&(1 << i)) != 0) + if ((decompilerContext->shader->pixelColorOutputMask & (1 << i)) != 0) { src->addFmt("float4 passPixelColor{} [[color({})]];" _CRLF, i, i); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index a60539dcc..f522439cd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -260,7 +260,8 @@ MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode mode) case LattePrimitiveMode::TRIANGLE_STRIP: return MTL::PrimitiveTypeTriangleStrip; default: - printf("unimplemented primitive type 
%u\n", (uint32)mode); + // TODO: uncomment + //printf("unimplemented primitive type %u\n", (uint32)mode); return MTL::PrimitiveTypeTriangle; } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index d6976a8d7..049d738bc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -58,7 +58,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS uint32 bufferIndex = bufferGroup.attributeBufferIndex; uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; - // TODO: is LatteGPUState.contextNew correct? uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 2a3707b6c..e8ce23f30 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -561,7 +561,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, depthRenderTarget); // Shaders - LatteSHRC_UpdateActiveShaders(); LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); if (!vertexShader || !static_cast(vertexShader->shader)->GetFunction()) @@ -627,7 +626,16 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 void MetalRenderer::draw_endSequence() { - // TODO: do something? 
+ LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); + // post-drawcall logic + if (pixelShader) + LatteRenderTarget_trackUpdates(); + bool hasReadback = LatteTextureReadback_Update(); + //m_recordedDrawcalls++; + //if (m_recordedDrawcalls >= m_submitThreshold || hasReadback) + //{ + // SubmitCommandBuffer(); + //} } void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) @@ -1095,33 +1103,39 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { if (shader->resourceMapping.uniformBuffersBindingPoint[i] >= 0) { - uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; - if (binding >= MAX_MTL_BUFFERS) - { - debug_printf("too big buffer index (%u), skipping binding\n", binding); - continue; - } - size_t offset = m_state.uniformBufferOffsets[(uint32)shader->shaderType][binding]; - if (offset != INVALID_OFFSET) - { - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { + uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; + if (binding >= MAX_MTL_BUFFERS) + { + debug_printf("too big buffer index (%u), skipping binding\n", binding); + continue; + } + size_t offset = m_state.uniformBufferOffsets[(uint32)shader->shaderType][i]; + if (offset != INVALID_OFFSET) + { + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), offset, binding); break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); break; - } - default: + } + default: UNREACHABLE; - } - } + } + } } } + + // Storage buffer + if (shader->resourceMapping.tfStorageBindingPoint >= 0) + { + debug_printf("storage buffer not implemented, index: 
%i\n", shader->resourceMapping.tfStorageBindingPoint); + } } void MetalRenderer::RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder) From 0a7f30c6a45235c03d9f4ebcfbce256996907654 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 6 Aug 2024 10:23:57 +0200 Subject: [PATCH 038/368] implement texture swizzle --- .../HW/Latte/Renderer/Metal/CachedFBOMtl.cpp | 4 +- .../Renderer/Metal/LatteTextureViewMtl.cpp | 163 ++++++++++++------ .../Renderer/Metal/LatteTextureViewMtl.h | 24 ++- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 17 ++ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 3 + .../Renderer/Metal/MetalPipelineCache.cpp | 4 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 17 +- 7 files changed, 164 insertions(+), 68 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp index 2a0715b64..f3b98f156 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp @@ -15,7 +15,7 @@ void CachedFBOMtl::CreateRenderPass() continue; } auto colorAttachment = m_renderPassDescriptor->colorAttachments()->object(i); - colorAttachment->setTexture(textureView->GetTexture()); + colorAttachment->setTexture(textureView->GetRGBAView()); colorAttachment->setLoadAction(MTL::LoadActionLoad); colorAttachment->setStoreAction(MTL::StoreActionStore); } @@ -25,7 +25,7 @@ void CachedFBOMtl::CreateRenderPass() { auto textureView = static_cast(depthBuffer.texture); auto depthAttachment = m_renderPassDescriptor->depthAttachment(); - depthAttachment->setTexture(textureView->GetTexture()); + depthAttachment->setTexture(textureView->GetRGBAView()); depthAttachment->setLoadAction(MTL::LoadActionLoad); depthAttachment->setStoreAction(MTL::StoreActionStore); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index d48b17cc5..ce2fec623 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -4,58 +4,123 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) - : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_format(format) + : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_baseTexture(texture) { - MTL::TextureType textureType; - switch (dim) - { - case Latte::E_DIM::DIM_1D: - textureType = MTL::TextureType1D; - break; - case Latte::E_DIM::DIM_2D: - case Latte::E_DIM::DIM_2D_MSAA: - textureType = MTL::TextureType2D; - break; - case Latte::E_DIM::DIM_2D_ARRAY: - textureType = MTL::TextureType2DArray; - break; - case Latte::E_DIM::DIM_3D: - textureType = MTL::TextureType3D; - break; - case Latte::E_DIM::DIM_CUBEMAP: - textureType = MTL::TextureTypeCube; // TODO: check this - break; - default: - cemu_assert_unimplemented(); - textureType = MTL::TextureType2D; - break; - } - - uint32 baseLevel = firstMip; - uint32 levelCount = this->numMip; - uint32 baseLayer; - uint32 layerCount; - // TODO: check if base texture is 3D texture as well - if (textureType == MTL::TextureType3D) - { - cemu_assert_debug(firstMip == 0); - cemu_assert_debug(this->numSlice == baseTexture->depth); - baseLayer = 0; - layerCount = 1; - } - else - { - baseLayer = firstSlice; - layerCount = this->numSlice; - } - - // TODO: swizzle - - auto formatInfo = GetMtlPixelFormatInfo(format, texture->IsDepth()); - m_texture = texture->GetTexture()->newTextureView(formatInfo.pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount)); } LatteTextureViewMtl::~LatteTextureViewMtl() { - 
m_texture->release(); + for (sint32 i = 0; i < std::size(m_viewCache); i++) + { + if (m_viewCache[i].key != INVALID_SWIZZLE) + m_viewCache[i].texture->release(); + } + + for (auto& [key, texture] : m_fallbackViewCache) + { + texture->release(); + } +} + +MTL::Texture* LatteTextureViewMtl::GetSwizzledView(uint32 gpuSamplerSwizzle) +{ + // Mask out + gpuSamplerSwizzle &= 0x0FFF0000; + + if (gpuSamplerSwizzle == RGBA_SWIZZLE) + { + return m_baseTexture->GetTexture(); + } + else + { + // First, try to find a view in the cache + + // Fast cache + sint32 freeIndex = -1; + for (sint32 i = 0; i < std::size(m_viewCache); i++) + { + if (m_viewCache[i].key == gpuSamplerSwizzle) + { + return m_viewCache[i].texture; + } + else if (m_viewCache[i].key == INVALID_SWIZZLE && freeIndex == -1) + { + freeIndex = i; + } + } + + // Fallback cache + auto it = m_fallbackViewCache.find(gpuSamplerSwizzle); + if (it != m_fallbackViewCache.end()) + { + return it->second; + } + + MTL::Texture* texture = CreateSwizzledView(gpuSamplerSwizzle); + if (freeIndex != -1) + m_viewCache[freeIndex] = {gpuSamplerSwizzle, texture}; + else + it->second = texture; + + return texture; + } +} + +MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) +{ + uint32 compSelR = (gpuSamplerSwizzle >> 16) & 0x7; + uint32 compSelG = (gpuSamplerSwizzle >> 19) & 0x7; + uint32 compSelB = (gpuSamplerSwizzle >> 22) & 0x7; + uint32 compSelA = (gpuSamplerSwizzle >> 25) & 0x7; + // TODO: adjust + + MTL::TextureType textureType; + switch (dim) + { + case Latte::E_DIM::DIM_1D: + textureType = MTL::TextureType1D; + break; + case Latte::E_DIM::DIM_2D: + case Latte::E_DIM::DIM_2D_MSAA: + textureType = MTL::TextureType2D; + break; + case Latte::E_DIM::DIM_2D_ARRAY: + textureType = MTL::TextureType2DArray; + break; + case Latte::E_DIM::DIM_3D: + textureType = MTL::TextureType3D; + break; + case Latte::E_DIM::DIM_CUBEMAP: + textureType = MTL::TextureTypeCube; // TODO: check this + break; + default: + 
cemu_assert_unimplemented(); + textureType = MTL::TextureType2D; + break; + } + + uint32 baseLevel = firstMip; + uint32 levelCount = this->numMip; + uint32 baseLayer; + uint32 layerCount; + // TODO: check if base texture is 3D texture as well + if (textureType == MTL::TextureType3D) + { + cemu_assert_debug(firstMip == 0); + cemu_assert_debug(this->numSlice == baseTexture->depth); + baseLayer = 0; + layerCount = 1; + } + else + { + baseLayer = firstSlice; + layerCount = this->numSlice; + } + + // TODO: swizzle + + auto formatInfo = GetMtlPixelFormatInfo(format, m_baseTexture->IsDepth()); + MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(formatInfo.pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount)); + + return texture; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h index 7df74b4f0..eb2241803 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h @@ -1,27 +1,37 @@ #pragma once #include +#include #include "Cafe/HW/Latte/Core/LatteTexture.h" +#define RGBA_SWIZZLE 0x06880000 +#define INVALID_SWIZZLE 0xFFFFFFFF + +// TODO: test the swizzle class LatteTextureViewMtl : public LatteTextureView { public: LatteTextureViewMtl(class MetalRenderer* mtlRenderer, class LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount); ~LatteTextureViewMtl(); - MTL::Texture* GetTexture() const { - return m_texture; - } + MTL::Texture* GetSwizzledView(uint32 gpuSamplerSwizzle); - Latte::E_GX2SURFFMT GetFormat() const { - return m_format; + MTL::Texture* GetRGBAView() + { + return GetSwizzledView(RGBA_SWIZZLE); } private: class MetalRenderer* m_mtlr; - MTL::Texture* m_texture; + class LatteTextureMtl* m_baseTexture; + + struct { + uint32 key; + MTL::Texture* texture; + } 
m_viewCache[4] = {{INVALID_SWIZZLE, nullptr}}; + std::unordered_map m_fallbackViewCache; - Latte::E_GX2SURFFMT m_format; + MTL::Texture* CreateSwizzledView(uint32 gpuSamplerSwizzle); }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index f522439cd..fdf18ef5e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -419,3 +419,20 @@ MTL::SamplerAddressMode GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WOR cemu_assert_debug((uint32)clamp < std::size(MTL_SAMPLER_ADDRESS_MODES)); return MTL_SAMPLER_ADDRESS_MODES[(uint32)clamp]; } + +const MTL::TextureSwizzle MTL_TEXTURE_SWIZZLES[] = { + MTL::TextureSwizzleRed, + MTL::TextureSwizzleGreen, + MTL::TextureSwizzleBlue, + MTL::TextureSwizzleAlpha, + MTL::TextureSwizzleZero, + MTL::TextureSwizzleOne, + MTL::TextureSwizzleZero, + MTL::TextureSwizzleZero +}; + +MTL::TextureSwizzle GetMtlTextureSwizzle(uint32 swizzle) +{ + cemu_assert_debug(swizzle < std::size(MTL_TEXTURE_SWIZZLES)); + return MTL_TEXTURE_SWIZZLES[swizzle]; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index f0348303c..7e75c35d6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -8,6 +8,7 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Metal/MTLDepthStencil.hpp" #include "Metal/MTLSampler.hpp" +#include "Metal/MTLTexture.hpp" struct Uvec2 { uint32 x; @@ -41,3 +42,5 @@ MTL::BlendFactor GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func); MTL::SamplerAddressMode GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clamp); + +MTL::TextureSwizzle GetMtlTextureSwizzle(uint32 swizzle); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp 
index 049d738bc..d18f8bb9c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -88,7 +88,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS continue; } auto colorAttachment = desc->colorAttachments()->object(i); - colorAttachment->setPixelFormat(texture->GetTexture()->pixelFormat()); + colorAttachment->setPixelFormat(texture->GetRGBAView()->pixelFormat()); // Blending const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = LatteGPUState.contextNew.CB_COLOR_CONTROL; @@ -127,7 +127,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS if (activeFBO->depthBuffer.texture) { auto texture = static_cast(activeFBO->depthBuffer.texture); - desc->setDepthAttachmentPixelFormat(texture->GetTexture()->pixelFormat()); + desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); // TODO: stencil pixel format } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index e8ce23f30..5ff58c680 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -167,7 +167,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput if (!AcquireNextDrawable()) return; - MTL::Texture* presentTexture = static_cast(texView)->GetTexture(); + MTL::Texture* presentTexture = static_cast(texView)->GetRGBAView(); // Create render pass MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); @@ -550,13 +550,13 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto colorTexture = static_cast(m_state.activeFBO->colorBuffer[i].texture); if (colorTexture) { - colorRenderTargets[i] = colorTexture->GetTexture(); + colorRenderTargets[i] = colorTexture->GetRGBAView(); } } auto depthTexture = 
static_cast(m_state.activeFBO->depthBuffer.texture); if (depthTexture) { - depthRenderTarget = depthTexture->GetTexture(); + depthRenderTarget = depthTexture->GetRGBAView(); } auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, depthRenderTarget); @@ -919,9 +919,9 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE auto clampY = samplerWords->WORD0.get_CLAMP_Y(); auto clampZ = samplerWords->WORD0.get_CLAMP_Z(); - samplerDescriptor->setRAddressMode(GetMtlSamplerAddressMode(clampX)); - samplerDescriptor->setSAddressMode(GetMtlSamplerAddressMode(clampY)); - samplerDescriptor->setTAddressMode(GetMtlSamplerAddressMode(clampZ)); + samplerDescriptor->setSAddressMode(GetMtlSamplerAddressMode(clampX)); + samplerDescriptor->setTAddressMode(GetMtlSamplerAddressMode(clampY)); + samplerDescriptor->setRAddressMode(GetMtlSamplerAddressMode(clampZ)); auto maxAniso = samplerWords->WORD0.get_MAX_ANISO_RATIO(); @@ -980,16 +980,17 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE sampler->release(); } + MTL::Texture* mtlTexture = textureView->GetSwizzledView(word4); switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: { - renderCommandEncoder->setVertexTexture(textureView->GetTexture(), binding); + renderCommandEncoder->setVertexTexture(mtlTexture, binding); break; } case LatteConst::ShaderType::Pixel: { - renderCommandEncoder->setFragmentTexture(textureView->GetTexture(), binding); + renderCommandEncoder->setFragmentTexture(mtlTexture, binding); break; } default: From 5fc45407db8f5f159fe231e6033cf03fd76a9ccb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 6 Aug 2024 13:42:06 +0200 Subject: [PATCH 039/368] fix: texture swizzle --- .../LatteDecompilerEmitMSL.cpp | 5 +- .../Renderer/Metal/LatteTextureViewMtl.cpp | 59 ++++++++++--------- .../Renderer/Metal/LatteTextureViewMtl.h | 1 - 3 files changed, 34 insertions(+), 31 deletions(-) diff --git 
a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 91558c3ba..e743f31c4 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2334,7 +2334,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // shadow sampler if (texDim == Latte::E_DIM::DIM_2D_ARRAY) { - // 3 coords + compare value (as float4) + // 3 coords + compare value src->add("float3("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(", "); @@ -2442,7 +2442,8 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } else if( texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) { - src->add(",0.0"); + // TODO: correct? + src->add(", level(0.0)"); } } // gradient parameters diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index ce2fec623..b9d001192 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -27,43 +27,42 @@ MTL::Texture* LatteTextureViewMtl::GetSwizzledView(uint32 gpuSamplerSwizzle) // Mask out gpuSamplerSwizzle &= 0x0FFF0000; + // RGBA swizzle == no swizzle if (gpuSamplerSwizzle == RGBA_SWIZZLE) { return m_baseTexture->GetTexture(); } - else - { - // First, try to find a view in the cache - // Fast cache - sint32 freeIndex = -1; - for (sint32 i = 0; i < std::size(m_viewCache); i++) + // First, try to find a view in the cache + + // Fast cache + sint32 freeIndex = -1; + for (sint32 i = 0; i < std::size(m_viewCache); i++) + { + if (m_viewCache[i].key == gpuSamplerSwizzle) { - if (m_viewCache[i].key == gpuSamplerSwizzle) - { - return m_viewCache[i].texture; - } - else if (m_viewCache[i].key == 
INVALID_SWIZZLE && freeIndex == -1) - { - freeIndex = i; - } + return m_viewCache[i].texture; } - - // Fallback cache - auto it = m_fallbackViewCache.find(gpuSamplerSwizzle); - if (it != m_fallbackViewCache.end()) + else if (m_viewCache[i].key == INVALID_SWIZZLE && freeIndex == -1) { - return it->second; + freeIndex = i; } + } - MTL::Texture* texture = CreateSwizzledView(gpuSamplerSwizzle); - if (freeIndex != -1) - m_viewCache[freeIndex] = {gpuSamplerSwizzle, texture}; - else - it->second = texture; - - return texture; + // Fallback cache + auto it = m_fallbackViewCache.find(gpuSamplerSwizzle); + if (it != m_fallbackViewCache.end()) + { + return it->second; } + + MTL::Texture* texture = CreateSwizzledView(gpuSamplerSwizzle); + if (freeIndex != -1) + m_viewCache[freeIndex] = {gpuSamplerSwizzle, texture}; + else + it->second = texture; + + return texture; } MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) @@ -117,10 +116,14 @@ MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) layerCount = this->numSlice; } - // TODO: swizzle + MTL::TextureSwizzleChannels swizzle; + swizzle.red = GetMtlTextureSwizzle(compSelR); + swizzle.green = GetMtlTextureSwizzle(compSelG); + swizzle.blue = GetMtlTextureSwizzle(compSelB); + swizzle.alpha = GetMtlTextureSwizzle(compSelA); auto formatInfo = GetMtlPixelFormatInfo(format, m_baseTexture->IsDepth()); - MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(formatInfo.pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount)); + MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(formatInfo.pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount), swizzle); return texture; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h index eb2241803..7a5a9dfa3 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h @@ -8,7 +8,6 @@ #define RGBA_SWIZZLE 0x06880000 #define INVALID_SWIZZLE 0xFFFFFFFF -// TODO: test the swizzle class LatteTextureViewMtl : public LatteTextureView { public: From 99ff28272009e0c2a40ad0242e0a0bc4a52b0ca5 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 6 Aug 2024 16:29:06 +0200 Subject: [PATCH 040/368] implement more primitive types & warn about vertex stride --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 45 +++++++++++++------ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 2 +- .../Renderer/Metal/MetalPipelineCache.cpp | 6 +++ 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index fdf18ef5e..2da259f31 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Common/precompiled.h" #include "Metal/MTLDepthStencil.hpp" +#include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLSampler.hpp" std::map MTL_COLOR_FORMAT_TABLE = { @@ -247,22 +248,38 @@ TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth) } } -MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode mode) +MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode) { - switch (mode) + switch (primitiveMode) { - case LattePrimitiveMode::POINTS: - return MTL::PrimitiveTypePoint; - case LattePrimitiveMode::LINES: - return MTL::PrimitiveTypeLine; - case LattePrimitiveMode::TRIANGLES: - return MTL::PrimitiveTypeTriangle; - case LattePrimitiveMode::TRIANGLE_STRIP: - return MTL::PrimitiveTypeTriangleStrip; - default: - // TODO: uncomment - //printf("unimplemented primitive type %u\n", (uint32)mode); - return MTL::PrimitiveTypeTriangle; + case 
Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::POINTS: + return MTL::PrimitiveTypePoint; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINES: + return MTL::PrimitiveTypeLine; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_STRIP: + return MTL::PrimitiveTypeLineStrip; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_LOOP: + return MTL::PrimitiveTypeLineStrip; // line loops are emulated as line strips with an extra connecting strip at the end + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_STRIP_ADJACENT: // Tropical Freeze level 3-6 + debug_printf("Metal doesn't support line strip adjacent primitive, using line strip instead\n"); + return MTL::PrimitiveTypeLineStrip; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLES: + return MTL::PrimitiveTypeTriangle; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLE_FAN: + debug_printf("Metal doesn't support triangle fan primitive, using triangle strip instead\n"); + return MTL::PrimitiveTypeTriangleStrip; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLE_STRIP: + return MTL::PrimitiveTypeTriangleStrip; + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::QUADS: + return MTL::PrimitiveTypeTriangle; // quads are emulated as 2 triangles + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::QUAD_STRIP: + return MTL::PrimitiveTypeTriangle; // quad strips are emulated as (count-2)/2 triangles + case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS: + return MTL::PrimitiveTypeTriangle; // rects are emulated as 2 triangles + default: + cemuLog_logDebug(LogType::Force, "Metal-Unsupported: Render pipeline with primitive mode {} created", primitiveMode); + cemu_assert_debug(false); + return MTL::PrimitiveTypeTriangle; } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 7e75c35d6..bd25ada5b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h 
+++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -29,7 +29,7 @@ size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth); -MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode mode); +MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode); MTL::VertexFormat GetMtlVertexFormat(uint8 format); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index d18f8bb9c..5ced9e9f0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -60,6 +60,12 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + uint32 strideRemainder = bufferStride % 4; + if (strideRemainder != 0) + { + debug_printf("vertex stride must be a multiple of 4, remainder: %u\n", strideRemainder); + } + auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); layout->setStride(bufferStride); if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) From 82dcbd98a6487e3285ff86d8ccfced57856d595b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 6 Aug 2024 19:08:22 +0200 Subject: [PATCH 041/368] prepare for vertex stride workaround --- src/Cafe/CMakeLists.txt | 1 + .../HW/Latte/Renderer/Metal/MetalCommon.h | 6 ++ .../Renderer/Metal/MetalMemoryManager.cpp | 71 +++++++++++++++- .../Latte/Renderer/Metal/MetalMemoryManager.h | 81 ++++++++++++++++++- .../Renderer/Metal/MetalPipelineCache.cpp | 13 ++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 49 +++++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 5 ++ 7 files changed, 197 insertions(+), 29 
deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 7d10788a6..ea2373484 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -538,6 +538,7 @@ if(ENABLE_METAL) target_sources(CemuCafe PRIVATE HW/Latte/Renderer/Metal/MetalRenderer.cpp HW/Latte/Renderer/Metal/MetalRenderer.h + HW/Latte/Renderer/Metal/MetalCommon.h HW/Latte/Renderer/Metal/MetalCppImpl.cpp HW/Latte/Renderer/Metal/MetalLayer.mm HW/Latte/Renderer/Metal/MetalLayer.h diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h new file mode 100644 index 000000000..c7011ab87 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -0,0 +1,6 @@ +#pragma once + +inline size_t align(size_t size, size_t alignment) +{ + return (size + alignment - 1) & ~(alignment - 1); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index f6f064f31..897e95939 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,3 +1,4 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" @@ -14,7 +15,7 @@ MetalBufferAllocator::~MetalBufferAllocator() MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, size_t alignment) { // Align the size - size = (size + alignment - 1) & ~(alignment - 1); + size = align(size, alignment); // First, try to find a free range for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) @@ -63,6 +64,65 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, siz return allocation; } +MetalVertexBufferCache::~MetalVertexBufferCache() +{ + for (uint32 i = 0; i < LATTE_MAX_VERTEX_BUFFERS; i++) + { + auto vertexBufferRange = 
m_bufferRanges[i]; + if (vertexBufferRange) + { + if (vertexBufferRange->restrideInfo.buffer) + { + vertexBufferRange->restrideInfo.buffer->release(); + } + } + } +} + +MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(uint32 bufferIndex, size_t stride) +{ + auto vertexBufferRange = m_bufferRanges[bufferIndex]; + auto& restrideInfo = vertexBufferRange->restrideInfo; + + if (stride % 4 == 0) + { + // No restride needed + return {nullptr, vertexBufferRange->offset}; + } + + if (restrideInfo.memoryInvalidated || stride != restrideInfo.lastStride) + { + // TODO: restride + throw std::runtime_error("restride needed"); + + restrideInfo.memoryInvalidated = false; + restrideInfo.lastStride = stride; + } + + // TODO: remove + throw std::runtime_error("restride unimplemented"); + + return {restrideInfo.buffer, 0}; +} + +void MetalVertexBufferCache::MemoryRangeChanged(size_t offset, size_t size) +{ + for (uint32 i = 0; i < LATTE_MAX_VERTEX_BUFFERS; i++) + { + auto vertexBufferRange = m_bufferRanges[i]; + if (vertexBufferRange) + { + if ((offset < vertexBufferRange->offset && (offset + size) < (vertexBufferRange->offset + vertexBufferRange->size)) || + (offset > vertexBufferRange->offset && (offset + size) > (vertexBufferRange->offset + vertexBufferRange->size))) + { + continue; + } + + vertexBufferRange->restrideInfo.memoryInvalidated = true; + } + } +} + MetalMemoryManager::~MetalMemoryManager() { if (m_bufferCache) @@ -85,7 +145,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) { if (m_bufferCache) { - printf("MetalMemoryManager::InitBufferCache: buffer cache already initialized\n"); + debug_printf("MetalMemoryManager::InitBufferCache: buffer cache already initialized\n"); return; } @@ -101,18 +161,21 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si if (!m_bufferCache) { - printf("MetalMemoryManager::UploadToBufferCache: buffer cache not initialized\n"); + 
debug_printf("MetalMemoryManager::UploadToBufferCache: buffer cache not initialized\n"); return; } memcpy((uint8*)m_bufferCache->contents() + offset, data, size); + + // Notify vertex buffer cache about the change + m_vertexBufferCache.MemoryRangeChanged(offset, size); } void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { if (!m_bufferCache) { - printf("MetalMemoryManager::CopyBufferCache: buffer cache not initialized\n"); + debug_printf("MetalMemoryManager::CopyBufferCache: buffer cache not initialized\n"); return; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index b0be29486..5ee0b37d2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -3,6 +3,7 @@ #include #include "Cafe/HW/Latte/ISA/LatteReg.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" //const uint32 bufferAllocatorIndexShift = 24; @@ -51,10 +52,65 @@ class MetalBufferAllocator std::vector m_freeBufferRanges; }; +struct MetalRestridedBufferRange +{ + MTL::Buffer* buffer; + size_t offset; +}; + +// TODO: use one big buffer for all the restrided vertex buffers? 
+struct MetalRestrideInfo +{ + bool memoryInvalidated = true; + size_t lastStride = 0; + MTL::Buffer* buffer = nullptr; +}; + +struct MetalVertexBufferRange +{ + size_t offset; + size_t size; + MetalRestrideInfo& restrideInfo; +}; + +class MetalVertexBufferCache +{ +public: + friend class MetalMemoryManager; + + MetalVertexBufferCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalVertexBufferCache(); + + // Vertex buffer cache + void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo& restrideInfo) + { + m_bufferRanges[bufferIndex] = new MetalVertexBufferRange{offset, size, restrideInfo}; + } + + void UntrackVertexBuffer(uint32 bufferIndex) + { + auto& range = m_bufferRanges[bufferIndex]; + if (range->restrideInfo.buffer) + { + range->restrideInfo.buffer->release(); + } + range = nullptr; + } + + MetalRestridedBufferRange RestrideBufferIfNeeded(uint32 bufferIndex, size_t stride); + +private: + class MetalRenderer* m_mtlr; + + MetalVertexBufferRange* m_bufferRanges[LATTE_MAX_VERTEX_BUFFERS] = {nullptr}; + + void MemoryRangeChanged(size_t offset, size_t size); +}; + class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer) {} ~MetalMemoryManager(); void ResetTemporaryBuffers() @@ -90,6 +146,28 @@ class MetalMemoryManager void UploadToBufferCache(const void* data, size_t offset, size_t size); void CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size); + // Vertex buffer cache + void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo& restrideInfo) + { + m_vertexBufferCache.TrackVertexBuffer(bufferIndex, offset, size, restrideInfo); + } + + void UntrackVertexBuffer(uint32 bufferIndex) + { + 
m_vertexBufferCache.UntrackVertexBuffer(bufferIndex); + } + + MetalRestridedBufferRange RestrideBufferIfNeeded(uint32 bufferIndex, size_t stride) + { + auto range = m_vertexBufferCache.RestrideBufferIfNeeded(bufferIndex, stride); + if (!range.buffer) + { + range.buffer = m_bufferCache; + } + + return range; + } + private: class MetalRenderer* m_mtlr; @@ -97,6 +175,7 @@ class MetalMemoryManager MetalBufferAllocator m_bufferAllocator;//s[2]; //uint8 m_bufferAllocatorIndex = 0; + MetalVertexBufferCache m_vertexBufferCache; MTL::Buffer* m_bufferCache = nullptr; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 5ced9e9f0..46918f8f0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -1,12 +1,14 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "HW/Latte/Core/FetchShader.h" -#include "HW/Latte/ISA/RegDefines.h" #include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "HW/Latte/Renderer/Metal/LatteToMtl.h" #include "HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "HW/Latte/Core/FetchShader.h" +#include "HW/Latte/ISA/RegDefines.h" + MetalPipelineCache::~MetalPipelineCache() { for (auto& pair : m_pipelineCache) @@ -59,12 +61,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS uint32 bufferIndex = bufferGroup.attributeBufferIndex; uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - - uint32 strideRemainder = bufferStride % 4; - if (strideRemainder != 0) - { - debug_printf("vertex stride must be a multiple of 4, remainder: %u\n", 
strideRemainder); - } + bufferStride = align(bufferStride, 4); auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); layout->setStride(bufferStride); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 5ff58c680..e4764590a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -6,7 +6,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h" @@ -440,11 +439,22 @@ void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { - if (m_state.vertexBuffers[bufferIndex].offset == offset) + cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS); + auto& buffer = m_state.vertexBuffers[bufferIndex]; + if (buffer.offset == offset && buffer.size == size) return; - cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS); - m_state.vertexBuffers[bufferIndex].needsRebind = true; - m_state.vertexBuffers[bufferIndex].offset = offset; + + if (buffer.offset != INVALID_OFFSET) + { + m_memoryManager->UntrackVertexBuffer(bufferIndex); + } + + buffer.needsRebind = true; + buffer.offset = offset; + buffer.size = size; + buffer.restrideInfo = {}; + + m_memoryManager->TrackVertexBuffer(bufferIndex, offset, size, buffer.restrideInfo); } void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) @@ -598,15 +608,25 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteBufferCache_Sync(indexMin + 
baseVertex, indexMax + baseVertex, baseInstance, instanceCount); // Vertex buffers - for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) - { - auto& vertexBufferRange = m_state.vertexBuffers[i]; - if (vertexBufferRange.needsRebind) + for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) + { + auto& vertexBufferRange = m_state.vertexBuffers[i]; + if (vertexBufferRange.offset != INVALID_OFFSET) { - renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); - vertexBufferRange.needsRebind = false; + // Restride + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + i * 7; + uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + auto restridedBuffer = m_memoryManager->RestrideBufferIfNeeded(i, bufferStride); + + // Bind + if (vertexBufferRange.needsRebind) + { + renderCommandEncoder->setVertexBuffer(restridedBuffer.buffer, restridedBuffer.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + vertexBufferRange.needsRebind = false; + } } - } + } // Uniform buffers, textures and samplers BindStageResources(renderCommandEncoder, vertexShader); @@ -1186,10 +1206,7 @@ void MetalRenderer::RebindRenderState(MTL::RenderCommandEncoder* renderCommandEn { auto& vertexBufferRange = m_state.vertexBuffers[i]; if (vertexBufferRange.offset != INVALID_OFFSET) - { - renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), vertexBufferRange.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); - vertexBufferRange.needsRebind = false; - } + vertexBufferRange.needsRebind = true; } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index b6fb30ad1..bee581d68 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -6,6 +6,8 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" + #define 
MAX_MTL_BUFFERS 31 #define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 2) // TODO: don't harcdode the support buffer binding @@ -20,6 +22,9 @@ struct MetalBoundBuffer { bool needsRebind = false; size_t offset = INVALID_OFFSET; + size_t size = 0; + // Memory manager will write restride info to this variable + MetalRestrideInfo restrideInfo; }; struct MetalState From ac651eba7762053a43ea8224b9d7ece78899b356 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 6 Aug 2024 20:42:38 +0200 Subject: [PATCH 042/368] implement vertex stride workaround --- .../Renderer/Metal/MetalMemoryManager.cpp | 24 ++++++++++++------- .../Latte/Renderer/Metal/MetalMemoryManager.h | 10 ++------ .../Renderer/Metal/MetalPipelineCache.cpp | 7 ++++++ 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 897e95939..bba887b99 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLResource.hpp" const size_t BUFFER_ALLOCATION_SIZE = 8 * 1024 * 1024; @@ -79,7 +80,7 @@ MetalVertexBufferCache::~MetalVertexBufferCache() } } -MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(uint32 bufferIndex, size_t stride) +MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride) { auto vertexBufferRange = m_bufferRanges[bufferIndex]; auto& restrideInfo = vertexBufferRange->restrideInfo; @@ -87,21 +88,28 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(uint32 if (stride % 4 == 0) { // No restride needed - return {nullptr, vertexBufferRange->offset}; + return 
{bufferCache, vertexBufferRange->offset}; } if (restrideInfo.memoryInvalidated || stride != restrideInfo.lastStride) { - // TODO: restride - throw std::runtime_error("restride needed"); + // TODO: use compute/void vertex function instead + size_t newStride = align(stride, 4); + size_t newSize = vertexBufferRange->size / stride * newStride; + restrideInfo.buffer = m_mtlr->GetDevice()->newBuffer(newSize, MTL::StorageModeShared); + + uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange->offset; + uint8* newPtr = (uint8*)restrideInfo.buffer->contents(); + + for (size_t elem = 0; elem < vertexBufferRange->size / stride; elem++) + { + memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); + } restrideInfo.memoryInvalidated = false; - restrideInfo.lastStride = stride; + restrideInfo.lastStride = newStride; } - // TODO: remove - throw std::runtime_error("restride unimplemented"); - return {restrideInfo.buffer, 0}; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 5ee0b37d2..93011ae36 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -97,7 +97,7 @@ class MetalVertexBufferCache range = nullptr; } - MetalRestridedBufferRange RestrideBufferIfNeeded(uint32 bufferIndex, size_t stride); + MetalRestridedBufferRange RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride); private: class MetalRenderer* m_mtlr; @@ -159,13 +159,7 @@ class MetalMemoryManager MetalRestridedBufferRange RestrideBufferIfNeeded(uint32 bufferIndex, size_t stride) { - auto range = m_vertexBufferCache.RestrideBufferIfNeeded(bufferIndex, stride); - if (!range.buffer) - { - range.buffer = m_bufferCache; - } - - return range; + return m_vertexBufferCache.RestrideBufferIfNeeded(m_bufferCache, bufferIndex, stride); } private: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 46918f8f0..d4d043129 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -63,6 +63,13 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; bufferStride = align(bufferStride, 4); + // HACK + if (bufferStride == 0) + { + debug_printf("vertex buffer %u has a vertex stride of 0 bytes, using 4 bytes instead\n", bufferIndex); + bufferStride = 4; + } + auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); layout->setStride(bufferStride); if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) From d7e9aff230f470deb169b56d1de57356f7f8f3c4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 7 Aug 2024 11:44:16 +0200 Subject: [PATCH 043/368] implement triangle fan reindexing --- src/Cafe/HW/Latte/Core/LatteIndices.cpp | 71 ++++++++- src/Cafe/HW/Latte/Core/LatteTexture.cpp | 2 - .../LatteDecompilerEmitMSL.cpp | 4 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 140 +++++++++--------- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 1 + .../Renderer/Metal/MetalPipelineCache.cpp | 2 +- 6 files changed, 141 insertions(+), 79 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index 6e1d74559..891dc3e1c 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -7,7 +7,7 @@ #include #endif -struct +struct { const void* lastPtr; uint32 lastCount; @@ -284,6 +284,46 @@ void LatteIndices_generateAutoLineLoopIndices(void* indexDataOutput, uint32 coun indexMax = std::max(count, 1u) - 1; } +template +void LatteIndices_unpackTriangleFanAndConvert(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) 
+{ + debug_printf("TRIANGLE FAN UNPACK\n"); + const betype* src = (betype*)indexDataInput; + T* dst = (T*)indexDataOutput; + // TODO: check this + for (sint32 i = 0; i < count; i++) + { + uint32 i0; + if (i % 2 == 0) + i0 = i / 2; + else + i0 = count - 1 - i / 2; + T idx = src[i0]; + indexMin = std::min(indexMin, (uint32)idx); + indexMax = std::max(indexMax, (uint32)idx); + dst[i] = idx; + } +} + +template +void LatteIndices_generateAutoTriangleFanIndices(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) +{ + debug_printf("TRIANGLE FAN AUTO\n"); + const betype* src = (betype*)indexDataInput; + T* dst = (T*)indexDataOutput; + for (sint32 i = 0; i < count; i++) + { + T idx = i; + if (idx % 2 == 0) + idx = idx / 2; + else + idx = count - 1 - idx / 2; + dst[i] = idx; + } + indexMin = 0; + indexMax = std::max(count, 1u) - 1; +} + #if defined(ARCH_X86_64) ATTRIBUTE_AVX2 void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) @@ -295,7 +335,7 @@ void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDat sint32 countRemaining = count & 15; if (count16) { - __m256i mMin = _mm256_set_epi16((sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, + __m256i mMin = _mm256_set_epi16((sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF); __m256i mMax = _mm256_set_epi16(0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000); __m256i mShuffle16Swap = _mm256_set_epi8(30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); @@ 
-659,6 +699,29 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 cemu_assert_debug(false); outputCount = count + 1; } + else if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN && g_renderer->GetType() == RendererAPI::Metal) + { + if (indexType == LatteIndexType::AUTO) + { + if (count <= 0xFFFF) + { + LatteIndices_generateAutoTriangleFanIndices(indexData, indexOutputPtr, count, indexMin, indexMax); + renderIndexType = Renderer::INDEX_TYPE::U16; + } + else + { + LatteIndices_generateAutoTriangleFanIndices(indexData, indexOutputPtr, count, indexMin, indexMax); + renderIndexType = Renderer::INDEX_TYPE::U32; + } + } + else if (indexType == LatteIndexType::U16_BE) + LatteIndices_unpackTriangleFanAndConvert(indexData, indexOutputPtr, count, indexMin, indexMax); + else if (indexType == LatteIndexType::U32_BE) + LatteIndices_unpackTriangleFanAndConvert(indexData, indexOutputPtr, count, indexMin, indexMax); + else + cemu_assert_debug(false); + outputCount = count; + } else { if (indexType == LatteIndexType::U16_BE) @@ -671,7 +734,7 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); #else - LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); + LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); #endif } else if (indexType == LatteIndexType::U32_BE) @@ -682,7 +745,7 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); #else - LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); + LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); #endif } else if (indexType == LatteIndexType::U16_LE) diff --git a/src/Cafe/HW/Latte/Core/LatteTexture.cpp b/src/Cafe/HW/Latte/Core/LatteTexture.cpp index 18e686ac0..3c5610006 100644 
--- a/src/Cafe/HW/Latte/Core/LatteTexture.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTexture.cpp @@ -352,7 +352,6 @@ void LatteTexture_CopySlice(LatteTexture* srcTexture, sint32 srcSlice, sint32 sr if (srcTexture->isDepth != dstTexture->isDepth) { g_renderer->surfaceCopy_copySurfaceWithFormatConversion(srcTexture, srcMip, srcSlice, dstTexture, dstMip, dstSlice, width, height); - throw std::runtime_error("1"); return; } // rescale copy size @@ -385,7 +384,6 @@ void LatteTexture_CopySlice(LatteTexture* srcTexture, sint32 srcSlice, sint32 sr cemuLog_log(LogType::Force, "Source: {:08x} origResolution {:4}/{:4} effectiveResolution {:4}/{:4} fmt {:04x} mipIndex {} ratioW/H: {:.4}/{:.4}", srcTexture->physAddress, srcTexture->width, srcTexture->height, effectiveWidth_src, effectiveHeight_src, (uint32)srcTexture->format, srcMip, ratioWidth_src, ratioHeight_src); cemuLog_log(LogType::Force, "Destination: {:08x} origResolution {:4}/{:4} effectiveResolution {:4}/{:4} fmt {:04x} mipIndex {} ratioW/H: {:.4}/{:.4}", dstTexture->physAddress, dstTexture->width, dstTexture->height, effectiveWidth_dst, effectiveHeight_dst, (uint32)dstTexture->format, dstMip, ratioWidth_dst, ratioHeight_dst); } - throw std::runtime_error("2"); //cemuLog_logDebug(LogType::Force, "If these textures are not meant to share data you can ignore this"); return; } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index e743f31c4..0975a4d00 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2389,7 +2389,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex else if(texDim == Latte::E_DIM::DIM_3D) { // 3 coords - src->add("float2("); + src->add("float3("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(", "); 
_emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); @@ -2434,7 +2434,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } else { - // TODO: is this correct + // TODO: is this correct? src->add("level("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(")"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 2da259f31..226a5c6d5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -5,79 +5,79 @@ #include "Metal/MTLSampler.hpp" std::map MTL_COLOR_FORMAT_TABLE = { - {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, 2}}, // TODO: correct? 
- {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, 2}}, - {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, 1}}, - {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, 1}}, - {Latte::E_GX2SURFFMT::R8_UINT, {MTL::PixelFormatR8Uint, 1}}, - {Latte::E_GX2SURFFMT::R8_SINT, {MTL::PixelFormatR8Sint, 1}}, - {Latte::E_GX2SURFFMT::R8_G8_UNORM, {MTL::PixelFormatRG8Unorm, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_SNORM, {MTL::PixelFormatRG8Snorm, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_UINT, {MTL::PixelFormatRG8Uint, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_SINT, {MTL::PixelFormatRG8Sint, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM, {MTL::PixelFormatRGBA8Unorm, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM, {MTL::PixelFormatRGBA8Snorm, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT, {MTL::PixelFormatRGBA8Uint, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, 8}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, 8}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, 4}}, // TODO: correct? 
- {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, 2}}, - {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, 2}}, - {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, 2}}, - {Latte::E_GX2SURFFMT::R16_SINT, {MTL::PixelFormatR16Sint, 2}}, - {Latte::E_GX2SURFFMT::R16_FLOAT, {MTL::PixelFormatR16Float, 2}}, - {Latte::E_GX2SURFFMT::R16_G16_UNORM, {MTL::PixelFormatRG16Unorm, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_SNORM, {MTL::PixelFormatRG16Snorm, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_UINT, {MTL::PixelFormatRG16Uint, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_SINT, {MTL::PixelFormatRG16Sint, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_FLOAT, {MTL::PixelFormatRG16Float, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM, {MTL::PixelFormatRGBA16Unorm, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM, {MTL::PixelFormatRGBA16Snorm, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, 8}}, - {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatInvalid, 0}}, // TODO - {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, 4}}, - {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, 4}}, - {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, 4}}, - {Latte::E_GX2SURFFMT::R32_FLOAT, {MTL::PixelFormatR32Float, 4}}, - {Latte::E_GX2SURFFMT::R32_G32_UINT, {MTL::PixelFormatRG32Uint, 8}}, - {Latte::E_GX2SURFFMT::R32_G32_SINT, {MTL::PixelFormatRG32Sint, 8}}, - {Latte::E_GX2SURFFMT::R32_G32_FLOAT, {MTL::PixelFormatRG32Float, 8}}, - 
{Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, 16}}, - {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, 16}}, - {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, 16}}, - {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC2_SRGB, {MTL::PixelFormatBC2_RGBA_sRGB, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC3_UNORM, {MTL::PixelFormatBC3_RGBA, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC3_SRGB, {MTL::PixelFormatBC3_RGBA_sRGB, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC4_UNORM, {MTL::PixelFormatBC4_RUnorm, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC4_SNORM, {MTL::PixelFormatBC4_RSnorm, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC5_UNORM, {MTL::PixelFormatBC5_RGUnorm, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatBC5_RGSnorm, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, true, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, true, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, true, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, true, 2}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, true, 2}}, + {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, true, 1}}, + {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, true, 1}}, + {Latte::E_GX2SURFFMT::R8_UINT, {MTL::PixelFormatR8Uint, false, 1}}, + {Latte::E_GX2SURFFMT::R8_SINT, {MTL::PixelFormatR8Sint, false, 1}}, + {Latte::E_GX2SURFFMT::R8_G8_UNORM, {MTL::PixelFormatRG8Unorm, true, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SNORM, {MTL::PixelFormatRG8Snorm, true, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_UINT, {MTL::PixelFormatRG8Uint, false, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SINT, {MTL::PixelFormatRG8Sint, false, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM, {MTL::PixelFormatRGBA8Unorm, true, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM, {MTL::PixelFormatRGBA8Snorm, true, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT, {MTL::PixelFormatRGBA8Uint, false, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, false, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, true, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, true, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, true, 8}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, false, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, false, 8}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, true, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, true, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, false, 4}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, true, 2}}, + {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, true, 2}}, + {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, false, 2}}, + {Latte::E_GX2SURFFMT::R16_SINT, {MTL::PixelFormatR16Sint, false, 2}}, + {Latte::E_GX2SURFFMT::R16_FLOAT, {MTL::PixelFormatR16Float, true, 2}}, + {Latte::E_GX2SURFFMT::R16_G16_UNORM, {MTL::PixelFormatRG16Unorm, true, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SNORM, {MTL::PixelFormatRG16Snorm, true, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_UINT, {MTL::PixelFormatRG16Uint, false, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SINT, {MTL::PixelFormatRG16Sint, false, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_FLOAT, {MTL::PixelFormatRG16Float, true, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM, {MTL::PixelFormatRGBA16Unorm, true, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM, {MTL::PixelFormatRGBA16Snorm, true, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, false, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, false, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, true, 8}}, + {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatInvalid, false, 0}}, // TODO + {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatInvalid, false, 0}}, // TODO + {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatInvalid, false, 0}}, // TODO + {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatInvalid, false, 0}}, // TODO + {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatInvalid, false, 0}}, // TODO + {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, true, 4}}, + {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, false, 4}}, + {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, false, 4}}, + {Latte::E_GX2SURFFMT::R32_FLOAT, {MTL::PixelFormatR32Float, true, 4}}, + {Latte::E_GX2SURFFMT::R32_G32_UINT, {MTL::PixelFormatRG32Uint, false, 8}}, + 
{Latte::E_GX2SURFFMT::R32_G32_SINT, {MTL::PixelFormatRG32Sint, false, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_FLOAT, {MTL::PixelFormatRG32Float, true, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, false, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, false, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, true, 16}}, + {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, true, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, true, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, true, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_SRGB, {MTL::PixelFormatBC2_RGBA_sRGB, true, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_UNORM, {MTL::PixelFormatBC3_RGBA, true, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_SRGB, {MTL::PixelFormatBC3_RGBA_sRGB, true, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC4_UNORM, {MTL::PixelFormatBC4_RUnorm, true, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC4_SNORM, {MTL::PixelFormatBC4_RSnorm, true, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC5_UNORM, {MTL::PixelFormatBC5_RGUnorm, true, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatBC5_RGSnorm, true, 16, {4, 4}}}, // TODO: correct? }; std::map MTL_DEPTH_FORMAT_TABLE = { - {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, 4}}, // TODO: not supported on Apple sillicon, maybe find something else - {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 4}}, // TODO: correct? 
- {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, 5}}, - {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, 2}}, - {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, 4}}, + {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, false, 4}}, // TODO: not supported on Apple sillicon, maybe find something else + {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, false, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, false, 5}}, + {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, false, 2}}, + {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, false, 4}}, }; const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index bd25ada5b..91ae1c2fd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -17,6 +17,7 @@ struct Uvec2 { struct MtlPixelFormatInfo { MTL::PixelFormat pixelFormat; + bool blendable; size_t bytesPerBlock; Uvec2 blockTexelSize = {1, 1}; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index d4d043129..ef59c1111 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -106,7 +106,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS uint32 renderTargetMask = LatteGPUState.contextNew.CB_TARGET_MASK.get_MASK(); bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; - if (blendEnabled) + if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).blendable) { colorAttachment->setBlendingEnabled(true); From e2ec602c437b69277e923d4ed2890cbd969bd431 Mon Sep 17 00:00:00 2001 From: Samuliak Date: 
Wed, 7 Aug 2024 13:25:38 +0200 Subject: [PATCH 044/368] specialize fragment shader output types & fix: shader errors --- src/Cafe/HW/Latte/Core/LatteBufferData.cpp | 15 +- .../LatteDecompilerEmitMSL.cpp | 13 +- .../LatteDecompilerEmitMSLHeader.hpp | 6 +- .../Renderer/Metal/LatteTextureViewMtl.cpp | 8 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 149 +++++++++--------- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 14 +- .../HW/Latte/Renderer/Metal/MetalCommon.h | 7 +- .../Renderer/Metal/MetalMemoryManager.cpp | 4 +- .../Renderer/Metal/MetalPipelineCache.cpp | 15 +- .../Renderer/Metal/RendererShaderMtl.cpp | 86 +++++++--- .../Latte/Renderer/Metal/RendererShaderMtl.h | 9 ++ 11 files changed, 205 insertions(+), 121 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteBufferData.cpp b/src/Cafe/HW/Latte/Core/LatteBufferData.cpp index 85d4cdf7a..7620e6a77 100644 --- a/src/Cafe/HW/Latte/Core/LatteBufferData.cpp +++ b/src/Cafe/HW/Latte/Core/LatteBufferData.cpp @@ -62,7 +62,7 @@ void rectGenerate4thVertex(uint32be* output, uint32be* input0, uint32be* input1, // order of rectangle vertices is // v0 v1 - // v2 v3 + // v2 v3 for (sint32 f = 0; f < vectorLen*4; f++) output[f] = _swapEndianU32(output[f]); @@ -199,11 +199,14 @@ bool LatteBufferCache_Sync(uint32 minIndex, uint32 maxIndex, uint32 baseInstance #if BOOST_OS_MACOS if(bufferStride % 4 != 0) { - if (VulkanRenderer* vkRenderer = VulkanRenderer::GetInstance()) + if (g_renderer->GetType() == RendererAPI::Vulkan) { - auto fixedBuffer = vkRenderer->buffer_genStrideWorkaroundVertexBuffer(bufferAddress, fixedBufferSize, bufferStride); - vkRenderer->buffer_bindVertexStrideWorkaroundBuffer(fixedBuffer.first, fixedBuffer.second, bufferIndex, fixedBufferSize); - continue; + if (VulkanRenderer* vkRenderer = VulkanRenderer::GetInstance()) + { + auto fixedBuffer = vkRenderer->buffer_genStrideWorkaroundVertexBuffer(bufferAddress, fixedBufferSize, bufferStride); + 
vkRenderer->buffer_bindVertexStrideWorkaroundBuffer(fixedBuffer.first, fixedBuffer.second, bufferIndex, fixedBufferSize); + continue; + } } } #endif @@ -222,4 +225,4 @@ bool LatteBufferCache_Sync(uint32 minIndex, uint32 maxIndex, uint32 baseInstance if (pixelShader) LatteBufferCache_syncGPUUniformBuffers(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START, LatteConst::ShaderType::Pixel); return true; -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 0975a4d00..84cba9092 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -10,6 +10,7 @@ #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "config/ActiveSettings.h" #include "util/helpers/StringBuf.h" @@ -2335,14 +2336,14 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if (texDim == Latte::E_DIM::DIM_2D_ARRAY) { // 3 coords + compare value - src->add("float3("); + src->add("float2("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(", "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(", "); + src->add("), "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); - src->addFmt("), {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); + src->addFmt(", {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); } 
else if (texDim == Latte::E_DIM::DIM_CUBEMAP) { @@ -3181,9 +3182,11 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe src->add(") == false) discard_fragment();" _CRLF); } // pixel color output - src->addFmt("out.passPixelColor{} = ", pixelColorOutputIndex); + src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(pixelColorOutputIndex)); + src->addFmt("out.passPixelColor{} = as_type<{}>(", pixelColorOutputIndex, GetColorAttachmentTypeStr(pixelColorOutputIndex)); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); - src->add(";" _CRLF); + src->add(");" _CRLF); + src->add("#endif" _CRLF); if( cfInstruction->exportArrayBase+i >= 8 ) cemu_assert_unimplemented(); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 92a5fb133..5384f0d4d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -222,6 +222,8 @@ namespace LatteDecompiler { auto* src = shaderContext->shaderSource; + src->add("#define GET_FRAGCOORD() vec4(in.position.xy * supportBuffer.fragCoordScale.xy, in.position.z, 1.0 / in.position.w)" _CRLF); + src->add("struct FragmentIn {" _CRLF); LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); @@ -263,7 +265,9 @@ namespace LatteDecompiler { if ((decompilerContext->shader->pixelColorOutputMask & (1 << i)) != 0) { - src->addFmt("float4 passPixelColor{} [[color({})]];" _CRLF, i, i); + src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(i)); + src->addFmt("{} passPixelColor{} [[color({})]];" _CRLF, GetColorAttachmentTypeStr(i), i, i); + src->add("#endif" _CRLF); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index b9d001192..edf720fb1 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -50,17 +50,17 @@ MTL::Texture* LatteTextureViewMtl::GetSwizzledView(uint32 gpuSamplerSwizzle) } // Fallback cache - auto it = m_fallbackViewCache.find(gpuSamplerSwizzle); - if (it != m_fallbackViewCache.end()) + auto& fallbackEntry = m_fallbackViewCache[gpuSamplerSwizzle]; + if (fallbackEntry) { - return it->second; + return fallbackEntry; } MTL::Texture* texture = CreateSwizzledView(gpuSamplerSwizzle); if (freeIndex != -1) m_viewCache[freeIndex] = {gpuSamplerSwizzle, texture}; else - it->second = texture; + fallbackEntry = texture; return texture; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 226a5c6d5..44cb1f3bc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -4,85 +4,85 @@ #include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLSampler.hpp" -std::map MTL_COLOR_FORMAT_TABLE = { - {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, true, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, true, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, true, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, true, 2}}, // TODO: correct? 
- {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, true, 2}}, - {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, true, 1}}, - {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, true, 1}}, - {Latte::E_GX2SURFFMT::R8_UINT, {MTL::PixelFormatR8Uint, false, 1}}, - {Latte::E_GX2SURFFMT::R8_SINT, {MTL::PixelFormatR8Sint, false, 1}}, - {Latte::E_GX2SURFFMT::R8_G8_UNORM, {MTL::PixelFormatRG8Unorm, true, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_SNORM, {MTL::PixelFormatRG8Snorm, true, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_UINT, {MTL::PixelFormatRG8Uint, false, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_SINT, {MTL::PixelFormatRG8Sint, false, 2}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM, {MTL::PixelFormatRGBA8Unorm, true, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM, {MTL::PixelFormatRGBA8Snorm, true, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT, {MTL::PixelFormatRGBA8Uint, false, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, false, 4}}, - {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, true, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, true, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, true, 8}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, false, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, false, 8}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, true, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, true, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, false, 4}}, // TODO: correct? 
- {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, true, 2}}, - {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, true, 2}}, - {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, false, 2}}, - {Latte::E_GX2SURFFMT::R16_SINT, {MTL::PixelFormatR16Sint, false, 2}}, - {Latte::E_GX2SURFFMT::R16_FLOAT, {MTL::PixelFormatR16Float, true, 2}}, - {Latte::E_GX2SURFFMT::R16_G16_UNORM, {MTL::PixelFormatRG16Unorm, true, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_SNORM, {MTL::PixelFormatRG16Snorm, true, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_UINT, {MTL::PixelFormatRG16Uint, false, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_SINT, {MTL::PixelFormatRG16Sint, false, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_FLOAT, {MTL::PixelFormatRG16Float, true, 4}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM, {MTL::PixelFormatRGBA16Unorm, true, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM, {MTL::PixelFormatRGBA16Snorm, true, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, false, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, false, 8}}, - {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, true, 8}}, - {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatInvalid, false, 0}}, // TODO - {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatInvalid, false, 0}}, // TODO - {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatInvalid, false, 0}}, // TODO - {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatInvalid, false, 0}}, // TODO - {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatInvalid, false, 0}}, // TODO - {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, true, 4}}, - {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, false, 4}}, - {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, false, 4}}, - {Latte::E_GX2SURFFMT::R32_FLOAT, {MTL::PixelFormatR32Float, true, 4}}, - {Latte::E_GX2SURFFMT::R32_G32_UINT, {MTL::PixelFormatRG32Uint, false, 8}}, - 
{Latte::E_GX2SURFFMT::R32_G32_SINT, {MTL::PixelFormatRG32Sint, false, 8}}, - {Latte::E_GX2SURFFMT::R32_G32_FLOAT, {MTL::PixelFormatRG32Float, true, 8}}, - {Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, false, 16}}, - {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, false, 16}}, - {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, true, 16}}, - {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, true, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, true, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, true, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC2_SRGB, {MTL::PixelFormatBC2_RGBA_sRGB, true, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC3_UNORM, {MTL::PixelFormatBC3_RGBA, true, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC3_SRGB, {MTL::PixelFormatBC3_RGBA_sRGB, true, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC4_UNORM, {MTL::PixelFormatBC4_RUnorm, true, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC4_SNORM, {MTL::PixelFormatBC4_RSnorm, true, 8, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC5_UNORM, {MTL::PixelFormatBC5_RGUnorm, true, 16, {4, 4}}}, // TODO: correct? - {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatBC5_RGSnorm, true, 16, {4, 4}}}, // TODO: correct? +std::map MTL_COLOR_FORMAT_TABLE = { + {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, MetalDataType::FLOAT, 1}}, + {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, MetalDataType::FLOAT, 1}}, + {Latte::E_GX2SURFFMT::R8_UINT, {MTL::PixelFormatR8Uint, MetalDataType::UINT, 1}}, + {Latte::E_GX2SURFFMT::R8_SINT, {MTL::PixelFormatR8Sint, MetalDataType::INT, 1}}, + {Latte::E_GX2SURFFMT::R8_G8_UNORM, {MTL::PixelFormatRG8Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SNORM, {MTL::PixelFormatRG8Snorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_UINT, {MTL::PixelFormatRG8Uint, MetalDataType::UINT, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_SINT, {MTL::PixelFormatRG8Sint, MetalDataType::INT, 2}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM, {MTL::PixelFormatRGBA8Unorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM, {MTL::PixelFormatRGBA8Snorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT, {MTL::PixelFormatRGBA8Uint, MetalDataType::UINT, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, MetalDataType::INT, 4}}, + {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, MetalDataType::FLOAT, 8}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}}, + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, MetalDataType::FLOAT, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, MetalDataType::FLOAT, 4}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, MetalDataType::UINT, 2}}, + {Latte::E_GX2SURFFMT::R16_SINT, {MTL::PixelFormatR16Sint, MetalDataType::INT, 2}}, + {Latte::E_GX2SURFFMT::R16_FLOAT, {MTL::PixelFormatR16Float, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R16_G16_UNORM, {MTL::PixelFormatRG16Unorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SNORM, {MTL::PixelFormatRG16Snorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_UINT, {MTL::PixelFormatRG16Uint, MetalDataType::UINT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_SINT, {MTL::PixelFormatRG16Sint, MetalDataType::INT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_FLOAT, {MTL::PixelFormatRG16Float, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM, {MTL::PixelFormatRGBA16Unorm, MetalDataType::FLOAT, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM, {MTL::PixelFormatRGBA16Snorm, MetalDataType::FLOAT, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, MetalDataType::UINT, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}}, + {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, MetalDataType::FLOAT, 8}}, + {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO + {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO + {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO + {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO + {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatInvalid, 
MetalDataType::NONE, 0}}, // TODO + {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, MetalDataType::UINT, 4}}, + {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, MetalDataType::INT, 4}}, + {Latte::E_GX2SURFFMT::R32_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::R32_G32_UINT, {MTL::PixelFormatRG32Uint, MetalDataType::UINT, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_SINT, {MTL::PixelFormatRG32Sint, MetalDataType::INT, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_FLOAT, {MTL::PixelFormatRG32Float, MetalDataType::FLOAT, 8}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, MetalDataType::UINT, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, MetalDataType::INT, 16}}, + {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, MetalDataType::FLOAT, 16}}, + {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC2_SRGB, {MTL::PixelFormatBC2_RGBA_sRGB, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_UNORM, {MTL::PixelFormatBC3_RGBA, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC3_SRGB, {MTL::PixelFormatBC3_RGBA_sRGB, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC4_UNORM, {MTL::PixelFormatBC4_RUnorm, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC4_SNORM, {MTL::PixelFormatBC4_RSnorm, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::BC5_UNORM, {MTL::PixelFormatBC5_RGUnorm, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? + {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatBC5_RGSnorm, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct? }; -std::map MTL_DEPTH_FORMAT_TABLE = { - {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, false, 4}}, // TODO: not supported on Apple sillicon, maybe find something else - {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, false, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, false, 5}}, - {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, false, 2}}, - {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, false, 4}}, +std::map MTL_DEPTH_FORMAT_TABLE = { + {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, MetalDataType::NONE, 4}}, // TODO: not supported on Apple sillicon, maybe find something else + {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 4}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 5}}, + {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, MetalDataType::NONE, 2}}, + {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, MetalDataType::NONE, 4}}, }; -const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth) +const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth) { - MtlPixelFormatInfo formatInfo; + MetalPixelFormatInfo formatInfo; if (isDepth) formatInfo = MTL_DEPTH_FORMAT_TABLE[format]; else @@ -266,7 +266,6 @@ MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode) case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLES: return MTL::PrimitiveTypeTriangle; case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLE_FAN: - debug_printf("Metal doesn't support triangle fan primitive, using triangle strip instead\n"); return MTL::PrimitiveTypeTriangleStrip; case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLE_STRIP: return MTL::PrimitiveTypeTriangleStrip; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 91ae1c2fd..6b3ae2afe 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -15,14 +15,22 @@ struct Uvec2 { uint32 y; }; -struct MtlPixelFormatInfo { +enum class MetalDataType +{ + NONE, + INT, + UINT, + FLOAT, +}; + +struct MetalPixelFormatInfo { MTL::PixelFormat pixelFormat; - bool blendable; + MetalDataType dataType; size_t bytesPerBlock; Uvec2 blockTexelSize = {1, 1}; }; -const MtlPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth); +const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth); size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 width); diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index c7011ab87..2a2713e50 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -1,6 +1,11 @@ #pragma once -inline size_t align(size_t size, size_t alignment) +inline size_t Align(size_t size, size_t alignment) { return (size + alignment - 1) & ~(alignment - 1); } + +inline std::string GetColorAttachmentTypeStr(uint32 index) +{ + return "COLOR_ATTACHMENT" + std::to_string(index) + "_TYPE"; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index bba887b99..c19d85961 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -16,7 +16,7 @@ MetalBufferAllocator::~MetalBufferAllocator() MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, size_t alignment) { // Align the size - size = align(size, alignment); + size = Align(size, alignment); // First, try to find a free range for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) @@ -94,7 +94,7 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu if (restrideInfo.memoryInvalidated || stride != restrideInfo.lastStride) { // TODO: use compute/void vertex function instead - size_t newStride = align(stride, 4); + size_t newStride = Align(stride, 4); size_t newSize = vertexBufferRange->size / stride * newStride; restrideInfo.buffer = m_mtlr->GetDevice()->newBuffer(newSize, MTL::StorageModeShared); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index ef59c1111..b76cd8f8c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -61,7 +61,7 @@ MTL::RenderPipelineState* 
MetalPipelineCache::GetPipelineState(const LatteFetchS uint32 bufferIndex = bufferGroup.attributeBufferIndex; uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - bufferStride = align(bufferStride, 4); + bufferStride = Align(bufferStride, 4); // HACK if (bufferStride == 0) @@ -83,11 +83,15 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS } } + auto mtlVertexShader = static_cast(vertexShader->shader); + auto mtlPixelShader = static_cast(pixelShader->shader); + mtlPixelShader->CompileFragmentFunction(activeFBO); + // Render pipeline state MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); - desc->setVertexFunction(static_cast(vertexShader->shader)->GetFunction()); - desc->setFragmentFunction(static_cast(pixelShader->shader)->GetFunction()); - // TODO: don't always set the vertex descriptor + desc->setVertexFunction(mtlVertexShader->GetFunction()); + desc->setFragmentFunction(mtlPixelShader->GetFunction()); + // TODO: don't always set the vertex descriptor? 
desc->setVertexDescriptor(vertexDescriptor); for (uint8 i = 0; i < 8; i++) { @@ -106,7 +110,8 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS uint32 renderTargetMask = LatteGPUState.contextNew.CB_TARGET_MASK.get_MASK(); bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; - if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).blendable) + // Only float data type is blendable + if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).dataType == MetalDataType::FLOAT) { colorAttachment->setBlendingEnabled(true); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index dcceb18ad..183340521 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -1,30 +1,23 @@ #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + #include "Cemu/Logging/CemuLogging.h" -#include "Metal/MTLFunctionDescriptor.hpp" +#include "Common/precompiled.h" RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) - : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader) + : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer} { - NS::Error* error = nullptr; - MTL::Library* library = mtlRenderer->GetDevice()->newLibrary(NS::String::string(mslCode.c_str(), NS::ASCIIStringEncoding), nullptr, &error); - if (error) + // Fragment functions are compiled just-in-time + if (m_type == ShaderType::kFragment) { - printf("failed to create library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); - 
error->release(); - return; + m_mslCode = mslCode; + } + else + { + Compile(mslCode); } - //MTL::FunctionDescriptor* desc = MTL::FunctionDescriptor::alloc()->init(); - //desc->setName(NS::String::string("main0", NS::ASCIIStringEncoding)); - //error = nullptr; - m_function = library->newFunction(NS::String::string("main0", NS::ASCIIStringEncoding)); - library->release(); - //if (error) - //{ - // printf("failed to create function (error: %s)\n", error->localizedDescription()->utf8String()); - // error->release(); - // return; - //} } RendererShaderMtl::~RendererShaderMtl() @@ -33,6 +26,47 @@ RendererShaderMtl::~RendererShaderMtl() m_function->release(); } +void RendererShaderMtl::CompileFragmentFunction(CachedFBOMtl* activeFBO) +{ + cemu_assert_debug(m_type == ShaderType::kFragment); + + if (m_function) + m_function->release(); + + std::string fullCode; + + // Define color attachment data types + for (uint8 i = 0; i < 8; i++) + { + const auto& colorBuffer = activeFBO->colorBuffer[i]; + if (!colorBuffer.texture) + { + continue; + } + auto dataType = GetMtlPixelFormatInfo(colorBuffer.texture->format, false).dataType; + fullCode += "#define " + GetColorAttachmentTypeStr(i) + " "; + switch (dataType) + { + case MetalDataType::INT: + fullCode += "int4"; + break; + case MetalDataType::UINT: + fullCode += "uint4"; + break; + case MetalDataType::FLOAT: + fullCode += "float4"; + break; + default: + cemu_assert_suspicious(); + break; + } + fullCode += "\n"; + } + + fullCode += m_mslCode; + Compile(fullCode); +} + void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) { cemuLog_log(LogType::MetalLogging, "RendererShaderMtl::ShaderCacheLoading_begin not implemented!"); @@ -47,3 +81,17 @@ void RendererShaderMtl::ShaderCacheLoading_Close() { cemuLog_log(LogType::MetalLogging, "RendererShaderMtl::ShaderCacheLoading_Close not implemented!"); } + +void RendererShaderMtl::Compile(const std::string& mslCode) +{ + NS::Error* error = nullptr; + MTL::Library* library = 
m_mtlr->GetDevice()->newLibrary(NS::String::string(mslCode.c_str(), NS::ASCIIStringEncoding), nullptr, &error); + if (error) + { + printf("failed to create library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); + error->release(); + return; + } + m_function = library->newFunction(NS::String::string("main0", NS::ASCIIStringEncoding)); + library->release(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index e440d4dc2..f70db1bda 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -1,6 +1,7 @@ #pragma once #include "Cafe/HW/Latte/Renderer/RendererShader.h" +#include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "HW/Latte/Renderer/Metal/MetalRenderer.h" #include "util/helpers/ConcurrentQueue.h" @@ -24,6 +25,8 @@ class RendererShaderMtl : public RendererShader RendererShaderMtl(class MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode); virtual ~RendererShaderMtl(); + void CompileFragmentFunction(CachedFBOMtl* activeFBO); + MTL::Function* GetFunction() const { return m_function; @@ -51,5 +54,11 @@ class RendererShaderMtl : public RendererShader bool WaitForCompiled() override { return true; } private: + class MetalRenderer* m_mtlr; + MTL::Function* m_function = nullptr; + + std::string m_mslCode; + + void Compile(const std::string& mslCode); }; From d3249dc324417f8c65a495020dd1e1ba5eafdaf2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 7 Aug 2024 18:20:09 +0200 Subject: [PATCH 045/368] implement texture readback --- src/Cafe/CMakeLists.txt | 2 + .../HW/Latte/Renderer/Metal/LatteTextureMtl.h | 2 +- .../Metal/LatteTextureReadbackMtl.cpp | 43 +++++++++++ .../Renderer/Metal/LatteTextureReadbackMtl.h | 22 ++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 21 ++++- 
.../HW/Latte/Renderer/Metal/MetalRenderer.h | 76 ++++++++++++++----- 6 files changed, 143 insertions(+), 23 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index ea2373484..3f2244915 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -548,6 +548,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/LatteTextureMtl.h HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp HW/Latte/Renderer/Metal/LatteTextureViewMtl.h + HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h HW/Latte/Renderer/Metal/RendererShaderMtl.cpp HW/Latte/Renderer/Metal/RendererShaderMtl.h HW/Latte/Renderer/Metal/CachedFBOMtl.cpp diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h index e2187e1bf..81942dfaf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h @@ -9,7 +9,7 @@ class LatteTextureMtl : public LatteTexture { public: - LatteTextureMtl(class MetalRenderer* vkRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, + LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth); ~LatteTextureMtl(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp new file mode 100644 index 000000000..608ff050d --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -0,0 +1,43 @@ +#include 
"Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "HW/Latte/Renderer/Metal/LatteToMtl.h" + +LatteTextureReadbackInfoMtl::~LatteTextureReadbackInfoMtl() +{ +} + +void LatteTextureReadbackInfoMtl::StartTransfer() +{ + cemu_assert(m_textureView); + + auto* baseTexture = (LatteTextureMtl*)m_textureView->baseTexture; + + cemu_assert_debug(m_textureView->firstSlice == 0); + cemu_assert_debug(m_textureView->firstMip == 0); + cemu_assert_debug(m_textureView->baseTexture->dim != Latte::E_DIM::DIM_3D); + + size_t bytesPerRow = GetMtlTextureBytesPerRow(baseTexture->format, baseTexture->IsDepth(), baseTexture->width); + size_t bytesPerImage = GetMtlTextureBytesPerImage(baseTexture->format, baseTexture->IsDepth(), baseTexture->height, bytesPerRow); + + auto blitCommandEncoder = m_mtlr->GetBlitCommandEncoder(); + + blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage); +} + +bool LatteTextureReadbackInfoMtl::IsFinished() +{ + // TODO: implement + + return true; +} + +void LatteTextureReadbackInfoMtl::ForceFinish() +{ + // TODO: implement +} + +uint8* LatteTextureReadbackInfoMtl::GetData() +{ + return (uint8*)m_mtlr->GetTextureReadbackBuffer()->contents() + m_bufferOffset; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h new file mode 100644 index 000000000..a03bbd499 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h @@ -0,0 +1,22 @@ +#pragma once + +#include "Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h" + +class LatteTextureReadbackInfoMtl : public LatteTextureReadbackInfo +{ +public: + LatteTextureReadbackInfoMtl(class MetalRenderer* mtlRenderer, 
LatteTextureView* textureView, uint32 bufferOffset) : LatteTextureReadbackInfo(textureView), m_mtlr{mtlRenderer}, m_bufferOffset{bufferOffset} {} + ~LatteTextureReadbackInfoMtl(); + + void StartTransfer() override; + + bool IsFinished() override; + void ForceFinish() override; + + uint8* GetData() override; + +private: + class MetalRenderer* m_mtlr; + + uint32 m_bufferOffset = 0; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index e4764590a..852f1993b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -6,6 +6,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h" @@ -35,6 +36,9 @@ MetalRenderer::MetalRenderer() m_pipelineCache = new MetalPipelineCache(this); m_depthStencilCache = new MetalDepthStencilCache(this); + // Texture readback + m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::StorageModeShared); + // Initialize state for (uint32 i = 0; i < (uint32)LatteConst::ShaderType::TotalCount; i++) { @@ -53,6 +57,8 @@ MetalRenderer::~MetalRenderer() m_nearestSampler->release(); + m_readbackBuffer->release(); + m_commandQueue->release(); m_device->release(); } @@ -407,9 +413,17 @@ void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, s LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView) { - debug_printf("MetalRenderer::texture_createReadback not implemented\n"); + size_t uploadSize = static_cast(textureView->baseTexture)->GetTexture()->allocatedSize(); - return nullptr; + if ((m_readbackBufferWriteOffset + uploadSize) > 
TEXTURE_READBACK_SIZE) + { + m_readbackBufferWriteOffset = 0; + } + + auto* result = new LatteTextureReadbackInfoMtl(this, textureView, m_readbackBufferWriteOffset); + m_readbackBufferWriteOffset += uploadSize; + + return result; } void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) @@ -801,6 +815,9 @@ void MetalRenderer::CommitCommandBuffer() m_commandBuffer->release(); m_commandBuffer = nullptr; + // TODO: where should this be called? + LatteTextureReadback_UpdateFinishedTransfers(false); + // Debug m_commandQueue->insertDebugCaptureBoundary(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index bee581d68..26ab52f11 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -49,9 +49,36 @@ enum class MetalEncoderType Blit, }; +class LatteQueryObjectMtl : public LatteQueryObject +{ +public: + LatteQueryObjectMtl(class MetalRenderer* mtlRenderer) : m_mtlr{mtlRenderer} {} + + bool getResult(uint64& numSamplesPassed) override + { + cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::getResult: occlusion queries are not yet supported on Metal"); + return false; + } + + void begin() override + { + cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::begin: occlusion queries are not yet supported on Metal"); + } + + void end() override + { + cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::end: occlusion queries are not yet supported on Metal"); + } + +private: + class MetalRenderer* m_mtlr; +}; + class MetalRenderer : public Renderer { public: + static const inline int TEXTURE_READBACK_SIZE = 32 * 1024 * 1024; // 32 MB + MetalRenderer(); ~MetalRenderer() override; @@ -178,23 +205,43 @@ class MetalRenderer : public Renderer // occlusion queries 
LatteQueryObject* occlusionQuery_create() override { - cemuLog_log(LogType::MetalLogging, "Occlusion queries are not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_create: Occlusion queries are not yet supported on Metal"); - return nullptr; + return new LatteQueryObjectMtl(this); } void occlusionQuery_destroy(LatteQueryObject* queryObj) override { - cemuLog_log(LogType::MetalLogging, "Occlusion queries are not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_destroy: occlusion queries are not yet supported on Metal"); } void occlusionQuery_flush() override { - cemuLog_log(LogType::MetalLogging, "Occlusion queries are not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_flush: occlusion queries are not yet supported on Metal"); } void occlusionQuery_updateState() override { - cemuLog_log(LogType::MetalLogging, "Occlusion queries are not yet supported on Metal"); + cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_updateState: occlusion queries are not yet supported on Metal"); } + // Helpers + void EnsureCommandBuffer(); + MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate = false, bool rebindStateIfNewEncoder = true); + MTL::ComputeCommandEncoder* GetComputeCommandEncoder(); + MTL::BlitCommandEncoder* GetBlitCommandEncoder(); + void EndEncoding(); + void CommitCommandBuffer(); + + bool AcquireNextDrawable(); + + void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader); + void RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder); + + void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); + + // Getters + MTL::Buffer* GetTextureReadbackBuffer() + 
{ + return m_readbackBuffer; + } private: CA::MetalLayer* m_metalLayer; @@ -213,6 +260,10 @@ class MetalRenderer : public Renderer // Basic MTL::SamplerState* m_nearestSampler; + // Texture readback + MTL::Buffer* m_readbackBuffer; + uint32 m_readbackBufferWriteOffset = 0; + // Active objects MTL::CommandBuffer* m_commandBuffer = nullptr; MetalEncoderType m_encoderType = MetalEncoderType::None; @@ -221,19 +272,4 @@ class MetalRenderer : public Renderer // State MetalState m_state; - - // Helpers - void EnsureCommandBuffer(); - MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate = false, bool rebindStateIfNewEncoder = true); - MTL::ComputeCommandEncoder* GetComputeCommandEncoder(); - MTL::BlitCommandEncoder* GetBlitCommandEncoder(); - void EndEncoding(); - void CommitCommandBuffer(); - - bool AcquireNextDrawable(); - - void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader); - void RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder); - - void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); }; From 1bcdade83e960860fe6f4a57024a6e361676987d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 7 Aug 2024 20:59:05 +0200 Subject: [PATCH 046/368] set stencil state --- .../LatteDecompilerEmitMSL.cpp | 2 +- .../LatteDecompilerEmitMSLHeader.hpp | 6 ++ .../HW/Latte/Renderer/Metal/CachedFBOMtl.cpp | 9 +++ .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 17 ++++ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 2 + .../Renderer/Metal/MetalDepthStencilCache.cpp | 81 +++++++++---------- .../Renderer/Metal/MetalPipelineCache.cpp | 5 +- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + 8 files changed, 79 insertions(+), 44 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 84cba9092..56253a716 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3215,7 +3215,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe cemu_assert_unimplemented(); // ukn } - src->add("gl_FragDepth = "); + src->add("out.depth = "); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); src->add(".x"); src->add(";" _CRLF); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 5384f0d4d..c3d63eacb 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -271,6 +271,12 @@ namespace LatteDecompiler } } + // generate depth output for pixel shader + if (decompilerContext->shader->pixelDepthOutputMask) + { + src->add("float passDepth [[depth(any)]];" _CRLF); + } + src->add("};" _CRLF _CRLF); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp index f3b98f156..1c02f7b40 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp @@ -28,6 +28,15 @@ void CachedFBOMtl::CreateRenderPass() depthAttachment->setTexture(textureView->GetRGBAView()); depthAttachment->setLoadAction(MTL::LoadActionLoad); depthAttachment->setStoreAction(MTL::StoreActionStore); + + // setup stencil attachment + if (depthBuffer.hasStencil) + { + auto stencilAttachment = m_renderPassDescriptor->stencilAttachment(); + stencilAttachment->setTexture(textureView->GetRGBAView()); + stencilAttachment->setLoadAction(MTL::LoadActionLoad); + stencilAttachment->setStoreAction(MTL::StoreActionStore); + } } } diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 44cb1f3bc..d03be2de8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -452,3 +452,20 @@ MTL::TextureSwizzle GetMtlTextureSwizzle(uint32 swizzle) cemu_assert_debug(swizzle < std::size(MTL_TEXTURE_SWIZZLES)); return MTL_TEXTURE_SWIZZLES[swizzle]; } + +const MTL::StencilOperation MTL_STENCIL_OPERATIONS[8] = { + MTL::StencilOperationKeep, + MTL::StencilOperationZero, + MTL::StencilOperationReplace, + MTL::StencilOperationIncrementClamp, + MTL::StencilOperationDecrementClamp, + MTL::StencilOperationInvert, + MTL::StencilOperationIncrementWrap, + MTL::StencilOperationDecrementWrap +}; + +MTL::StencilOperation GetMtlStencilOp(Latte::LATTE_DB_DEPTH_CONTROL::E_STENCILACTION action) +{ + cemu_assert_debug((uint32)action < std::size(MTL_STENCIL_OPERATIONS)); + return MTL_STENCIL_OPERATIONS[(uint32)action]; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 6b3ae2afe..5fd231866 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -53,3 +53,5 @@ MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func); MTL::SamplerAddressMode GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clamp); MTL::TextureSwizzle GetMtlTextureSwizzle(uint32 swizzle); + +MTL::StencilOperation GetMtlStencilOp(Latte::LATTE_DB_DEPTH_CONTROL::E_STENCILACTION action); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp index 87968ec3f..8f50c44cc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp @@ -37,8 +37,6 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte } 
desc->setDepthCompareFunction(depthCompareFunc); - // TODO: stencil state - /* // get stencil control parameters bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); @@ -58,48 +56,47 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte uint32 stencilWriteMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); uint32 stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); - static const VkStencilOp stencilOpTable[8] = { - VK_STENCIL_OP_KEEP, - VK_STENCIL_OP_ZERO, - VK_STENCIL_OP_REPLACE, - VK_STENCIL_OP_INCREMENT_AND_CLAMP, - VK_STENCIL_OP_DECREMENT_AND_CLAMP, - VK_STENCIL_OP_INVERT, - VK_STENCIL_OP_INCREMENT_AND_WRAP, - VK_STENCIL_OP_DECREMENT_AND_WRAP - }; - - depthStencilState.stencilTestEnable = stencilEnable ? VK_TRUE : VK_FALSE; - - depthStencilState.front.reference = stencilRefFront; - depthStencilState.front.compareMask = stencilCompareMaskFront; - depthStencilState.front.writeMask = stencilWriteMaskBack; - depthStencilState.front.compareOp = vkDepthCompareTable[(size_t)frontStencilFunc]; - depthStencilState.front.depthFailOp = stencilOpTable[(size_t)frontStencilZFail]; - depthStencilState.front.failOp = stencilOpTable[(size_t)frontStencilFail]; - depthStencilState.front.passOp = stencilOpTable[(size_t)frontStencilZPass]; - - if (backStencilEnable) - { - depthStencilState.back.reference = stencilRefBack; - depthStencilState.back.compareMask = stencilCompareMaskBack; - depthStencilState.back.writeMask = stencilWriteMaskBack; - depthStencilState.back.compareOp = vkDepthCompareTable[(size_t)backStencilFunc]; - depthStencilState.back.depthFailOp = stencilOpTable[(size_t)backStencilZFail]; - depthStencilState.back.failOp = stencilOpTable[(size_t)backStencilFail]; - depthStencilState.back.passOp = stencilOpTable[(size_t)backStencilZPass]; - } - else + if 
(stencilEnable) { - depthStencilState.back.reference = stencilRefFront; - depthStencilState.back.compareMask = stencilCompareMaskFront; - depthStencilState.back.writeMask = stencilWriteMaskFront; - depthStencilState.back.compareOp = vkDepthCompareTable[(size_t)frontStencilFunc]; - depthStencilState.back.depthFailOp = stencilOpTable[(size_t)frontStencilZFail]; - depthStencilState.back.failOp = stencilOpTable[(size_t)frontStencilFail]; - depthStencilState.back.passOp = stencilOpTable[(size_t)frontStencilZPass]; + MTL::StencilDescriptor* frontStencil = MTL::StencilDescriptor::alloc()->init(); + // TODO: set reference + //depthStencilState.front.reference = stencilRefFront; + frontStencil->setReadMask(stencilCompareMaskFront); + frontStencil->setWriteMask(stencilWriteMaskFront); + frontStencil->setStencilCompareFunction(GetMtlCompareFunc(frontStencilFunc)); + frontStencil->setDepthFailureOperation(GetMtlStencilOp(frontStencilZFail)); + frontStencil->setStencilFailureOperation(GetMtlStencilOp(frontStencilFail)); + frontStencil->setDepthStencilPassOperation(GetMtlStencilOp(frontStencilZPass)); + desc->setFrontFaceStencil(frontStencil); + + MTL::StencilDescriptor* backStencil = MTL::StencilDescriptor::alloc()->init(); + if (backStencilEnable) + { + // TODO: set reference + //depthStencilState.back.reference = stencilRefBack; + backStencil->setReadMask(stencilCompareMaskBack); + backStencil->setWriteMask(stencilWriteMaskBack); + backStencil->setStencilCompareFunction(GetMtlCompareFunc(backStencilFunc)); + backStencil->setDepthFailureOperation(GetMtlStencilOp(backStencilZFail)); + backStencil->setStencilFailureOperation(GetMtlStencilOp(backStencilFail)); + backStencil->setDepthStencilPassOperation(GetMtlStencilOp(backStencilZPass)); + } + else + { + // TODO: set reference + //depthStencilState.back.reference = stencilRefFront; + backStencil->setReadMask(stencilCompareMaskFront); + backStencil->setWriteMask(stencilWriteMaskFront); + 
backStencil->setStencilCompareFunction(GetMtlCompareFunc(frontStencilFunc)); + backStencil->setDepthFailureOperation(GetMtlStencilOp(frontStencilZFail)); + backStencil->setStencilFailureOperation(GetMtlStencilOp(frontStencilFail)); + backStencil->setDepthStencilPassOperation(GetMtlStencilOp(frontStencilZPass)); + } + desc->setBackFaceStencil(backStencil); + + frontStencil->release(); + backStencil->release(); } - */ depthStencilState = m_mtlr->GetDevice()->newDepthStencilState(desc); desc->release(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index b76cd8f8c..2462f370f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -143,7 +143,10 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS { auto texture = static_cast(activeFBO->depthBuffer.texture); desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - // TODO: stencil pixel format + if (activeFBO->depthBuffer.hasStencil) + { + desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); + } } NS::Error* error = nullptr; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 26ab52f11..c770017bb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -49,6 +49,7 @@ enum class MetalEncoderType Blit, }; +// HACK: Dummy occlusion query object for Metal class LatteQueryObjectMtl : public LatteQueryObject { public: From a50ce997df6709b5b84f360cb948fba3720e4c3b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 7 Aug 2024 21:14:31 +0200 Subject: [PATCH 047/368] fix: writing to depth from a fragment shader --- .../LegacyShaderDecompiler/LatteDecompiler.h | 2 ++ .../LatteDecompilerAnalyzer.cpp | 28 +++++++++---------- .../LatteDecompilerEmitMSL.cpp | 13 +++++---- 
.../LatteDecompilerEmitMSLHeader.hpp | 4 +-- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 78af1deca..57df13b16 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -182,6 +182,8 @@ struct LatteDecompilerShader // analyzer stage (pixel outputs) uint32 pixelColorOutputMask{ 0 }; // from LSB to MSB, 1 bit per written output. 1 if written (indices of color attachments) + // analyzer stage (depth write) + bool depthWritten{ false }; // analyzer stage (geometry shader parameters/inputs) uint32 ringParameterCount{ 0 }; uint32 ringParameterCountFromPrevStage{ 0 }; // used in geometry shader to hold VS ringParameterCount diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index 19604e0c5..e84e48519 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -287,15 +287,15 @@ void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContex LatteDecompilerShader* shader = shaderContext->shader; for(auto& texInstruction : cfInstruction->instructionsTEX) { - if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || + if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || 
texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ || - texInstruction.opcode == GPU7_TEX_INST_FETCH4 || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || + texInstruction.opcode == GPU7_TEX_INST_FETCH4 || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || texInstruction.opcode == GPU7_TEX_INST_LD ) { if (texInstruction.textureFetch.textureIndex < 0 || texInstruction.textureFetch.textureIndex >= LATTE_NUM_MAX_TEX_UNITS) @@ -313,7 +313,7 @@ void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContex shader->textureUnitSamplerAssignment[texInstruction.textureFetch.textureIndex] = texInstruction.textureFetch.samplerIndex; if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ) shader->textureUsesDepthCompare[texInstruction.textureFetch.textureIndex] = true; - + bool useTexelCoords = false; if (texInstruction.opcode == GPU7_TEX_INST_SAMPLE && (texInstruction.textureFetch.unnormalized[0] && texInstruction.textureFetch.unnormalized[1] && texInstruction.textureFetch.unnormalized[2] && texInstruction.textureFetch.unnormalized[3])) useTexelCoords = true; @@ -393,7 +393,7 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, } else if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase == 61 ) { - // writes pixel depth + shader->depthWritten = true; } else debugBreakpoint(); @@ -419,7 +419,7 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, void LatteDecompiler_analyzeSubroutine(LatteDecompilerShaderContext* shaderContext, uint32 cfAddr) { // analyze CF and clauses up to RET statement - + // todo - find cfInstruction index from cfAddr cemu_assert_debug(false); @@ -505,9 +505,9 @@ namespace LatteDecompiler decompilerContext->hasUniformVarBlock = true; else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) 
decompilerContext->hasUniformVarBlock = true; - - bool hasAnyViewportScaleDisabled = - !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + + bool hasAnyViewportScaleDisabled = + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); // we currently only support all on/off. Individual component scaling is not supported @@ -803,7 +803,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD for(sint32 i=0; ioutput->textureUnitMask[i]) + if (!shaderContext->output->textureUnitMask[i]) { // texture unit not used shader->textureUnitDim[i] = (Latte::E_DIM)0xFF; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 56253a716..efe5cf302 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2441,11 +2441,12 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->add(")"); } } - else if( texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) - { - // TODO: correct? - src->add(", level(0.0)"); - } + // TODO: uncomment? + //else if( texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) + //{ + // // TODO: correct? 
+ // src->add(", level(0.0)"); + //} } // gradient parameters if (texOpcode == GPU7_TEX_INST_SAMPLE_G) @@ -3215,7 +3216,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe cemu_assert_unimplemented(); // ukn } - src->add("out.depth = "); + src->add("out.passDepth = "); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); src->add(".x"); src->add(";" _CRLF); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index c3d63eacb..9d52196a6 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -272,9 +272,9 @@ namespace LatteDecompiler } // generate depth output for pixel shader - if (decompilerContext->shader->pixelDepthOutputMask) + if (decompilerContext->shader->depthWritten) { - src->add("float passDepth [[depth(any)]];" _CRLF); + src->add("float passDepth [[depth]];" _CRLF); } src->add("};" _CRLF _CRLF); From e0791c3bf402685fa8c0035f95fc1b6616f4ce12 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 8 Aug 2024 11:58:18 +0200 Subject: [PATCH 048/368] set stencil reference value --- .../LatteDecompilerEmitMSL.cpp | 10 +++--- .../Renderer/Metal/MetalDepthStencilCache.cpp | 35 +++++++++---------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 13 +++++++ 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index efe5cf302..39ff895c5 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3112,7 +3112,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe if (shaderContext->analyzer.outputPointSize) { 
cemu_assert_debug(shaderContext->analyzer.writesPointSize); - src->add("gl_PointSize = ("); + src->add("out.pointSize = ("); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); src->add(").x"); src->add(";" _CRLF); @@ -4113,12 +4113,12 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader == false) src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); } - // HACK: this should be handled outside of the shader, because clipping currently wouldn't work + // HACK: this should be handled outside of the shader, because clipping currently wouldn't work (or would it?) if (shader->shaderType == LatteConst::ShaderType::Vertex) { - // Convert depth from the range of [-1, 1] to [0, 1] - src->add("out.position /= out.position.w;" _CRLF); - src->add("out.position.z = out.position.z * 0.5 + 0.5;" _CRLF); + // TODO: check this + // MoltenVK does this + src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); } // return src->add("return out;" _CRLF); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp index 8f50c44cc..4734ae42c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp @@ -37,27 +37,26 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte } desc->setDepthCompareFunction(depthCompareFunc); - // get stencil control parameters + // Stencil state bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); - bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); - auto frontStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_F(); - auto frontStencilZPass = 
LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_F(); - auto frontStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_F(); - auto frontStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_F(); - auto backStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_B(); - auto backStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_B(); - auto backStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_B(); - auto backStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_B(); - // get stencil control parameters - uint32 stencilCompareMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILMASK_F(); - uint32 stencilWriteMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILWRITEMASK_F(); - uint32 stencilRefFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F(); - uint32 stencilCompareMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); - uint32 stencilWriteMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); - uint32 stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); - if (stencilEnable) { + // get stencil control parameters + bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); + auto frontStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_F(); + auto frontStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_F(); + auto frontStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_F(); + auto frontStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_F(); + auto backStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_B(); + auto backStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_B(); + auto backStencilZFail = 
LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_B(); + auto backStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_B(); + // get stencil control parameters + uint32 stencilCompareMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILMASK_F(); + uint32 stencilWriteMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILWRITEMASK_F(); + uint32 stencilCompareMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); + uint32 stencilWriteMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); + MTL::StencilDescriptor* frontStencil = MTL::StencilDescriptor::alloc()->init(); // TODO: set reference //depthStencilState.front.reference = stencilRefFront; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 852f1993b..b65160eba 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -603,6 +603,19 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); renderCommandEncoder->setDepthStencilState(depthStencilState); + // Stencil reference + bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); + if (stencilEnable) + { + bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); + uint32 stencilRefFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F(); + uint32 stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); + if (backStencilEnable) + renderCommandEncoder->setStencilReferenceValues(stencilRefFront, stencilRefBack); + else + renderCommandEncoder->setStencilReferenceValue(stencilRefFront); + } + // Primitive type const LattePrimitiveMode primitiveMode = 
static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); From 5c246d55bdaa0642e62a15a6dc18cd3a6665455e Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 8 Aug 2024 13:52:48 +0200 Subject: [PATCH 049/368] implement transform feedback --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 3 +- .../LatteDecompilerEmitMSL.cpp | 41 ++++++------------- .../LatteDecompilerEmitMSLHeader.hpp | 14 ++++--- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 20 ++++++++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 4 ++ src/Cafe/HW/Latte/Renderer/Renderer.h | 1 + .../HW/Latte/Renderer/Vulkan/VulkanRenderer.h | 36 ++++++++-------- 7 files changed, 65 insertions(+), 54 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 486516efd..66539a761 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -9,6 +9,7 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency #include "Cafe/GraphicPack/GraphicPack2.h" +#include "HW/Latte/Renderer/Renderer.h" #include "util/helpers/StringParser.h" #include "config/ActiveSettings.h" #include "Cafe/GameProfile/GameProfile.h" @@ -688,9 +689,9 @@ void LatteShader_GetDecompilerOptions(LatteDecompilerOptions& options, LatteCons { options.usesGeometryShader = geometryShaderEnabled; options.spirvInstrinsics.hasRoundingModeRTEFloat32 = false; + options.useTFViaSSBO = g_renderer->UseTFViaSSBO(); if (g_renderer->GetType() == RendererAPI::Vulkan) { - options.useTFViaSSBO = VulkanRenderer::GetInstance()->UseTFViaSSBO(); options.spirvInstrinsics.hasRoundingModeRTEFloat32 = VulkanRenderer::GetInstance()->HasSPRIVRoundingModeRTE32(); } options.strictMul = g_current_game_profile->GetAccurateShaderMul() != AccurateShaderMulOption::False; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 39ff895c5..8cbbbe376 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2752,9 +2752,9 @@ static void _emitTEXGetGradientsHV(LatteDecompilerShaderContext* shaderContext, const char* funcName; if (texInstruction->opcode == GPU7_TEX_INST_GET_GRADIENTS_H) - funcName = "dFdx"; + funcName = "dfdx"; else - funcName = "dFdy"; + funcName = "dfdy"; src->add(" = "); @@ -3273,15 +3273,8 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) continue; - if (shaderContext->options->useTFViaSSBO) - { - uint32 u32Offset = streamWrite->exportArrayBase + i; - src->addFmt("sb_buffer[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); - } - else - { - src->addFmt("sb{}[{}]", streamWrite->bufferIndex, streamWrite->exportArrayBase + i); - } + uint32 u32Offset = streamWrite->exportArrayBase + i; + src->addFmt("sb[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); src->add(" = "); @@ -3393,15 +3386,8 @@ static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, La if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) continue; - if (shaderContext->options->useTFViaSSBO) - { - uint32 u32Offset = cfInstruction->exportArrayBase + i; - src->addFmt("sb_buffer[sbBase{} + {}]", streamoutBufferIndex, u32Offset); - } - else - { - src->addFmt("sb{}[{}]", streamoutBufferIndex, cfInstruction->exportArrayBase + i); - } + uint32 u32Offset = cfInstruction->exportArrayBase + i; + src->addFmt("sb[sbBase{} + {}]", streamoutBufferIndex, u32Offset); src->add(" = "); @@ -3595,15 +3581,12 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte // emit vertex src->add("EmitVertex();" _CRLF); // increment transform feedback pointer - if (shaderContext->analyzer.useSSBOForStreamout) + for (sint32 i 
= 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) { - for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) - { - if (!shaderContext->output->streamoutBufferWriteMask[i]) - continue; - cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0); - src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4); - } + if (!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0); + src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4); } if( shaderContext->analyzer.modifiesPixelActiveState ) @@ -3970,7 +3953,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->addFmt("float cubeMapArrayIndex{} = 0.0;" _CRLF, i); } // init base offset for streamout buffer writes - if (shaderContext->analyzer.useSSBOForStreamout && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + if (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry) { for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) { diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 9d52196a6..114cd6fa1 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -94,9 +94,8 @@ namespace LatteDecompiler uniformCurrentOffset += 8; } // define verticesPerInstance + streamoutBufferBaseX - if (decompilerContext->analyzer.useSSBOForStreamout && - (shader->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) || - (shader->shaderType == LatteConst::ShaderType::Geometry) ) + if ((shader->shaderType == LatteConst::ShaderType::Vertex 
&& decompilerContext->options->usesGeometryShader == false) || + (shader->shaderType == LatteConst::ShaderType::Geometry)) { src->add("int verticesPerInstance;" _CRLF); uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; @@ -251,8 +250,6 @@ namespace LatteDecompiler { _emitAttributes(decompilerContext); _emitVSOutputs(decompilerContext); - - // TODO: transform feedback } else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) { @@ -379,6 +376,13 @@ namespace LatteDecompiler case LatteConst::ShaderType::Vertex: src->add(", uint vid [[vertex_id]]"); src->add(", uint iid [[instance_id]]"); + + // streamout buffer (transform feedback) + if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite) + { + src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.getTFStorageBufferBindingPoint()); + } + break; case LatteConst::ShaderType::Pixel: src->add(", bool frontFacing [[front_facing]]"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b65160eba..51323eec1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ #include "Cemu/Logging/CemuDebugLogging.h" #include "HW/Latte/Core/Latte.h" #include "HW/Latte/ISA/LatteReg.h" +#include "Metal/MTLResource.hpp" #include "Metal/MTLTypes.hpp" #include "gui/guiWrapper.h" @@ -39,6 +40,9 @@ MetalRenderer::MetalRenderer() // Texture readback m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::StorageModeShared); + // Transform feedback + m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize(), MTL::StorageModeShared); + // Initialize state for (uint32 i = 0; i < (uint32)LatteConst::ShaderType::TotalCount; i++) { @@ -1185,7 +1189,21 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE // Storage buffer if 
(shader->resourceMapping.tfStorageBindingPoint >= 0) { - debug_printf("storage buffer not implemented, index: %i\n", shader->resourceMapping.tfStorageBindingPoint); + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + renderCommandEncoder->setVertexBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); + break; + } + default: + UNREACHABLE; + } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index c770017bb..ef33c95d7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -147,6 +147,7 @@ class MetalRenderer : public Renderer cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); }; + bool UseTFViaSSBO() const override { return true; } void AppendOverlayDebugInfo() override; // rendertarget @@ -265,6 +266,9 @@ class MetalRenderer : public Renderer MTL::Buffer* m_readbackBuffer; uint32 m_readbackBufferWriteOffset = 0; + // Transform feedback + MTL::Buffer* m_xfbRingBuffer; + // Active objects MTL::CommandBuffer* m_commandBuffer = nullptr; MetalEncoderType m_encoderType = MetalEncoderType::None; diff --git a/src/Cafe/HW/Latte/Renderer/Renderer.h b/src/Cafe/HW/Latte/Renderer/Renderer.h index a94ad1550..7bd143d03 100644 --- a/src/Cafe/HW/Latte/Renderer/Renderer.h +++ b/src/Cafe/HW/Latte/Renderer/Renderer.h @@ -85,6 +85,7 @@ class Renderer virtual void DeleteFontTextures() = 0; GfxVendor GetVendor() const { return m_vendor; } + virtual bool UseTFViaSSBO() const { return false; } virtual void AppendOverlayDebugInfo() = 0; // rendertarget diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index 6df53da47..e4b4cbf94 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -73,11 +73,11 @@ class PipelineInfo return true; } - + template struct direct_hash { - size_t operator()(const uint64& k) const noexcept + size_t operator()(const uint64& k) const noexcept { return k; } @@ -277,7 +277,6 @@ class VulkanRenderer : public Renderer // texture functions void* texture_acquireTextureUploadBuffer(uint32 size) override; void texture_releaseTextureUploadBuffer(uint8* mem) override; - TextureDecoder* texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) override; @@ -370,7 +369,7 @@ class VulkanRenderer : public Renderer VkRect2D currentScissorRect{}; // vertex bindings - struct + struct { uint32 offset; }currentVertexBinding[LATTE_MAX_VERTEX_BUFFERS]{}; @@ -457,17 +456,17 @@ class VulkanRenderer : public Renderer bool shaderRoundingModeRTEFloat32{ false }; }shaderFloatControls; // from VK_KHR_shader_float_controls - struct + struct { bool debug_utils = false; // VK_EXT_DEBUG_UTILS }instanceExtensions; - struct + struct { bool useTFEmulationViaSSBO = true; // emulate transform feedback via shader writes to a storage buffer }mode; - struct + struct { uint32 minUniformBufferOffsetAlignment = 256; uint32 nonCoherentAtomSize = 256; @@ -497,7 +496,7 @@ class VulkanRenderer : public Renderer void CreateCommandBuffers(); void swapchain_createDescriptorSetLayout(); - + // shader bool IsAsyncPipelineAllowed(uint32 numIndices); @@ -512,6 +511,8 @@ class VulkanRenderer : public Renderer void DeleteFontTextures() override; bool BeginFrame(bool mainWindow) override; + bool UseTFViaSSBO() const override { return m_featureControl.mode.useTFEmulationViaSSBO; } + // drawcall emulation PipelineInfo* draw_createGraphicsPipeline(uint32 indexCount); PipelineInfo* draw_getOrCreateGraphicsPipeline(uint32 indexCount); @@ -574,7 +575,7 @@ class VulkanRenderer : public Renderer VkDevice 
m_logicalDevice = VK_NULL_HANDLE; VkDebugUtilsMessengerEXT m_debugCallback = nullptr; volatile bool m_destructionRequested = false; - + QueueFamilyIndices m_indices{}; Semaphore m_pipeline_cache_semaphore; @@ -583,7 +584,7 @@ class VulkanRenderer : public Renderer VkPipelineCache m_pipeline_cache{ nullptr }; VkPipelineLayout m_pipelineLayout{nullptr}; VkCommandPool m_commandPool{ nullptr }; - + // buffer to cache uniform vars VkBuffer m_uniformVarBuffer = VK_NULL_HANDLE; VkDeviceMemory m_uniformVarBufferMemory = VK_NULL_HANDLE; @@ -652,19 +653,19 @@ class VulkanRenderer : public Renderer bool m_submitOnIdle{}; // submit current buffer if Latte command processor goes into idle state (no more commands or waiting for externally signaled condition) // tracking for dynamic offsets - struct + struct { uint32 uniformVarBufferOffset[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT]; - struct + struct { uint32 unformBufferOffset[LATTE_NUM_MAX_UNIFORM_BUFFERS]; }shaderUB[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT]; }dynamicOffsetInfo{}; // streamout - struct + struct { - struct + struct { bool enabled; uint32 ringBufferOffset; @@ -714,11 +715,11 @@ class VulkanRenderer : public Renderer accessFlags = 0; if constexpr ((TSyncOp & BUFFER_SHADER_READ) != 0) { - // in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated + // in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated stages |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; accessFlags |= VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_SHADER_READ_BIT; } - + if constexpr ((TSyncOp & BUFFER_SHADER_WRITE) != 0) { stages |= VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; @@ -921,7 +922,6 @@ class VulkanRenderer : public Renderer public: bool 
GetDisableMultithreadedCompilation() const { return m_featureControl.disableMultithreadedCompilation; } - bool UseTFViaSSBO() const { return m_featureControl.mode.useTFEmulationViaSSBO; } bool HasSPRIVRoundingModeRTE32() const { return m_featureControl.shaderFloatControls.shaderRoundingModeRTEFloat32; } bool IsDebugUtilsEnabled() const { return m_featureControl.debugMarkersSupported && m_featureControl.instanceExtensions.debug_utils; } @@ -931,7 +931,7 @@ class VulkanRenderer : public Renderer void debug_genericBarrier(); // shaders - struct + struct { RendererShaderVk* copySurface_vs{}; RendererShaderVk* copySurface_psDepth2Color{}; From a38ddb5fc25315e3f5db9ada66126830d9ca1e17 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 8 Aug 2024 19:25:12 +0200 Subject: [PATCH 050/368] fix: shadows --- .../LatteDecompilerEmitMSL.cpp | 122 ++++++++++-------- .../LatteDecompilerEmitMSLHeader.hpp | 22 ++-- .../Renderer/Metal/LatteTextureViewMtl.cpp | 7 +- .../Renderer/Metal/LatteTextureViewMtl.h | 2 +- .../Renderer/Metal/MetalDepthStencilCache.cpp | 6 +- .../Renderer/Metal/MetalMemoryManager.cpp | 1 + .../Renderer/Metal/MetalPipelineCache.cpp | 1 - .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 +- 8 files changed, 91 insertions(+), 74 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 8cbbbe376..17ff27eab 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2187,6 +2187,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; + bool isCompare = shaderContext->shader->textureUsesDepthCompare[texInstruction->textureFetch.textureIndex]; char tempBuffer0[32]; char tempBuffer1[32]; @@ -2212,6 +2213,7 @@ static void 
_emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } // texture sampler opcode uint32 texOpcode = texInstruction->opcode; + // TODO: is this needed? if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) { // vertex shader forces LOD to zero, but certain sampler types don't support textureLod(...) API @@ -2275,7 +2277,10 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } else { - src->addFmt("sample(samplr{}, ", texInstruction->textureFetch.textureIndex); + src->add("sample"); + if (isCompare) + src->add("_compare"); + src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); } // for textureGather() add shift (todo: depends on rounding mode set in sampler registers?) @@ -2493,61 +2498,68 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->addFmt(",int3({},{},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); } } - // lod bias - if( texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) - { - src->add(")."); - if (numWrittenElements > 1) - { - // result is copied into multiple channels - for (sint32 f = 0; f < numWrittenElements; f++) - { - cemu_assert_debug(texInstruction->dstSel[f] == 0); // only x component is defined - src->add("x"); - } - } - else - { - src->add("x"); - } - } - else - { - src->add(")."); - for (sint32 f = 0; f < 4; f++) - { - if( texInstruction->dstSel[f] < 4 ) - { - uint8 elemIndex = texInstruction->dstSel[f]; - if (texOpcode == GPU7_TEX_INST_FETCH4) - { - // 's textureGather() and GPU7's FETCH4 instruction have a different order of elements - // xyzw: top-left, top-right, bottom-right, bottom-left - // textureGather xyzw - // fetch4 yzxw - // translate index from fetch4 to textureGather order - static uint8 fetchToGather[4] = - { - 2, // x -> z - 0, // y -> x - 1, // z -> y - 3, // w -> w - }; - elemIndex = 
fetchToGather[elemIndex]; - } - src->add(resultElemTable[elemIndex]); - numWrittenElements++; - } - else if( texInstruction->dstSel[f] == 7 ) - { - // masked and not written - } - else - { - cemu_assert_unimplemented(); - } - } + // lod bias (TODO: wht?) + + src->add(")"); + // sample_compare doesn't return a float + if (!isCompare) + { + if( texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) + { + src->add("."); + + if (numWrittenElements > 1) + { + // result is copied into multiple channels + for (sint32 f = 0; f < numWrittenElements; f++) + { + cemu_assert_debug(texInstruction->dstSel[f] == 0); // only x component is defined + src->add("x"); + } + } + else + { + src->add("x"); + } + } + else + { + src->add("."); + for (sint32 f = 0; f < 4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + uint8 elemIndex = texInstruction->dstSel[f]; + if (texOpcode == GPU7_TEX_INST_FETCH4) + { + // 's textureGather() and GPU7's FETCH4 instruction have a different order of elements + // xyzw: top-left, top-right, bottom-right, bottom-left + // textureGather xyzw + // fetch4 yzxw + // translate index from fetch4 to textureGather order + static uint8 fetchToGather[4] = + { + 2, // x -> z + 0, // y -> x + 1, // z -> y + 3, // w -> w + }; + elemIndex = fetchToGather[elemIndex]; + } + src->add(resultElemTable[elemIndex]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + } } src->add(");"); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 114cd6fa1..6a696e118 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -224,6 +224,7 @@ namespace LatteDecompiler src->add("#define GET_FRAGCOORD() vec4(in.position.xy * 
supportBuffer.fragCoordScale.xy, in.position.z, 1.0 / in.position.w)" _CRLF); src->add("struct FragmentIn {" _CRLF); + src->add("float4 position [[position]];" _CRLF); LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); for (sint32 i = 0; i < psInputTable->count; i++) @@ -271,7 +272,7 @@ namespace LatteDecompiler // generate depth output for pixel shader if (decompilerContext->shader->depthWritten) { - src->add("float passDepth [[depth]];" _CRLF); + src->add("float passDepth [[depth(any)]];" _CRLF); } src->add("};" _CRLF _CRLF); @@ -323,26 +324,31 @@ namespace LatteDecompiler src->add(", "); + if (shaderContext->shader->textureUsesDepthCompare[i]) + src->add("depth"); + else + src->add("texture"); + if (shaderContext->shader->textureIsIntegerFormat[i]) { // integer samplers if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) - src->add("texture1d"); + src->add("1d"); else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) - src->add("texture2d"); + src->add("2d"); else cemu_assert_unimplemented(); } else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) - src->add("texture2d"); + src->add("2d"); else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) - src->add("texture1d"); + src->add("1d"); else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_ARRAY) - src->add("texture2d_array"); + src->add("2d_array"); else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_CUBEMAP) - src->add("texturecube_array"); + src->add("cube_array"); else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_3D) - src->add("texture3d"); + src->add("3d"); else { cemu_assert_unimplemented(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index edf720fb1..4a6ceeb4f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -39,11 +39,12 @@ MTL::Texture* LatteTextureViewMtl::GetSwizzledView(uint32 gpuSamplerSwizzle) sint32 freeIndex = -1; for (sint32 i = 0; i < std::size(m_viewCache); i++) { - if (m_viewCache[i].key == gpuSamplerSwizzle) + const auto& entry = m_viewCache[i]; + if (entry.key == gpuSamplerSwizzle) { - return m_viewCache[i].texture; + return entry.texture; } - else if (m_viewCache[i].key == INVALID_SWIZZLE && freeIndex == -1) + else if (entry.key == INVALID_SWIZZLE && freeIndex == -1) { freeIndex = i; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h index 7a5a9dfa3..fc05de5f5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h @@ -29,7 +29,7 @@ class LatteTextureViewMtl : public LatteTextureView struct { uint32 key; MTL::Texture* texture; - } m_viewCache[4] = {{INVALID_SWIZZLE, nullptr}}; + } m_viewCache[4] = {{INVALID_SWIZZLE, nullptr}, {INVALID_SWIZZLE, nullptr}, {INVALID_SWIZZLE, nullptr}, {INVALID_SWIZZLE, nullptr}}; std::unordered_map m_fallbackViewCache; MTL::Texture* CreateSwizzledView(uint32 gpuSamplerSwizzle); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp index 4734ae42c..b8f3fc52d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp @@ -30,12 +30,10 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte MTL::DepthStencilDescriptor* desc = MTL::DepthStencilDescriptor::alloc()->init(); desc->setDepthWriteEnabled(depthWriteEnable); - auto depthCompareFunc = GetMtlCompareFunc(depthFunc); - if 
(!depthEnable) + if (depthEnable) { - depthCompareFunc = MTL::CompareFunctionAlways; + desc->setDepthCompareFunction(GetMtlCompareFunc(depthFunc)); } - desc->setDepthCompareFunction(depthCompareFunc); // Stencil state bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index c19d85961..40832aa3d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -105,6 +105,7 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu { memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); } + debug_printf("Restrided vertex buffer (old stride: %zu, new stride: %zu, old size: %zu, new size: %zu)\n", stride, newStride, vertexBufferRange->size, newSize); restrideInfo.memoryInvalidated = false; restrideInfo.lastStride = newStride; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 2462f370f..c1be4dbb1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -43,7 +43,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS auto attribute = vertexDescriptor->attributes()->object(semanticId); attribute->setOffset(attr.offset); - // Bind from the end to not conflict with uniform buffers attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); attribute->setFormat(GetMtlVertexFormat(attr.format)); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 51323eec1..5d35baa3d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -19,6 +19,7 @@ #include 
"Metal/MTLResource.hpp" #include "Metal/MTLTypes.hpp" #include "gui/guiWrapper.h" +#include extern bool hasValidFramebufferAttached; @@ -596,8 +597,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 debug_printf("no vertex function, skipping draw\n"); return; } - - auto fetchShader = vertexShader->compatibleFetchShader; + const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Render pipeline state MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.activeFBO, LatteGPUState.contextNew); From 6d34d24322bb859044fdd504b68c5ec78b44224b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 11 Aug 2024 13:35:26 +0200 Subject: [PATCH 051/368] fix: missing color attachments & bind some other state --- .../LatteDecompilerEmitMSLHeader.hpp | 2 +- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 59 +++++++++++++++++-- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 4 +- 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 6a696e118..0b23fecdc 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -221,7 +221,7 @@ namespace LatteDecompiler { auto* src = shaderContext->shaderSource; - src->add("#define GET_FRAGCOORD() vec4(in.position.xy * supportBuffer.fragCoordScale.xy, in.position.z, 1.0 / in.position.w)" _CRLF); + src->add("#define GET_FRAGCOORD() float4(in.position.xy * supportBuffer.fragCoordScale.xy, in.position.z, 1.0 / in.position.w)" _CRLF); src->add("struct FragmentIn {" _CRLF); src->add("float4 position [[position]];" _CRLF); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 933752ebe..63cd69f60 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -67,7 +67,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM auto formatInfo = GetMtlPixelFormatInfo(format, isDepth); desc->setPixelFormat(formatInfo.pixelFormat); - // TODO: is write needed? + // HACK: even though the textures are never written to from a shader, we still need to use `ShaderWrite` usage to prevent pink lines over the screen MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsageShaderWrite; // TODO: add more conditions if (!Latte::IsCompressedFormat(format)) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 5d35baa3d..cb71f90b9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ #include "Cemu/Logging/CemuDebugLogging.h" #include "HW/Latte/Core/Latte.h" #include "HW/Latte/ISA/LatteReg.h" +#include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLResource.hpp" #include "Metal/MTLTypes.hpp" #include "gui/guiWrapper.h" @@ -600,7 +601,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Render pipeline state - MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.activeFBO, LatteGPUState.contextNew); + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.lastUsedFBO, LatteGPUState.contextNew); renderCommandEncoder->setRenderPipelineState(renderPipelineState); // Depth stencil state @@ -620,9 +621,54 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 
renderCommandEncoder->setStencilReferenceValue(stencilRefFront); } - // Primitive type - const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); - auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); + // Primitive type + const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); + bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); + + // Blend color + float* blendColorConstant = (float*)LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; + renderCommandEncoder->setBlendColor(blendColorConstant[0], blendColorConstant[1], blendColorConstant[2], blendColorConstant[3]); + + // polygon control + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + const auto frontFace = polygonControlReg.get_FRONT_FACE(); + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + uint32 polyOffsetFrontEnable = polygonControlReg.get_OFFSET_FRONT_ENABLED(); + + // TODO + //cemu_assert_debug(LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_NEAR_DISABLE() == LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE()); // near or far clipping can be disabled individually + //bool zClipEnable = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE() == false; + + if (polyOffsetFrontEnable) + { + // TODO: set depth bias + } + + // todo - how does culling behave with rects? + // right now we just assume that their winding is always CW + if (isPrimitiveRect) + { + if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CW) + cullFront = cullBack; + else + cullBack = cullFront; + } + + if (cullFront && cullBack) + return; // We can just skip the draw (TODO: can we?) 
+ else if (cullFront) + renderCommandEncoder->setCullMode(MTL::CullModeFront); + else if (cullBack) + renderCommandEncoder->setCullMode(MTL::CullModeBack); + else + renderCommandEncoder->setCullMode(MTL::CullModeNone); + + if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CCW) + renderCommandEncoder->setFrontFacingWinding(MTL::WindingCounterClockwise); + else + renderCommandEncoder->setFrontFacingWinding(MTL::WindingClockwise); // Resources @@ -708,7 +754,7 @@ void MetalRenderer::EnsureCommandBuffer() if (!m_commandBuffer) { // Debug - m_commandQueue->insertDebugCaptureBoundary(); + //m_commandQueue->insertDebugCaptureBoundary(); m_commandBuffer = m_commandQueue->commandBuffer(); } @@ -755,6 +801,7 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPas } // Update state + m_state.lastUsedFBO = m_state.activeFBO; for (uint8 i = 0; i < 8; i++) { m_state.colorRenderTargets[i] = colorRenderTargets[i]; @@ -836,7 +883,7 @@ void MetalRenderer::CommitCommandBuffer() LatteTextureReadback_UpdateFinishedTransfers(false); // Debug - m_commandQueue->insertDebugCaptureBoundary(); + //m_commandQueue->insertDebugCaptureBoundary(); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index ef33c95d7..f70eec2c9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -31,6 +31,8 @@ struct MetalState { bool skipDrawSequence = false; class CachedFBOMtl* activeFBO = nullptr; + // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change' + class CachedFBOMtl* lastUsedFBO = nullptr; MetalBoundBuffer vertexBuffers[MAX_MTL_BUFFERS] = {{}}; // TODO: find out what is the max number of bound textures on the Wii U class LatteTextureViewMtl* textures[64] = {nullptr}; @@ -58,7 +60,7 @@ class LatteQueryObjectMtl : public LatteQueryObject bool getResult(uint64& 
numSamplesPassed) override { cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::getResult: occlusion queries are not yet supported on Metal"); - return false; + return true; } void begin() override From caba20da4b6d1f64391bf4b329b7ffc7e06a853c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 11 Aug 2024 13:47:06 +0200 Subject: [PATCH 052/368] apply gamma correction --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index cb71f90b9..3bef372b1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ #include "Cemu/Logging/CemuDebugLogging.h" #include "HW/Latte/Core/Latte.h" #include "HW/Latte/ISA/LatteReg.h" +#include "Metal/MTLPixelFormat.hpp" #include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLResource.hpp" #include "Metal/MTLTypes.hpp" @@ -76,6 +77,8 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) m_metalLayer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle); m_metalLayer->setDevice(m_device); + // TODO: shouldn't this be handled differently? 
+ m_metalLayer->setPixelFormat(MTL::PixelFormatRGBA8Unorm_sRGB); // Present pipeline NS::Error* error = nullptr; @@ -84,7 +87,6 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { debug_printf("failed to create present library (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); - throw; return; } MTL::Function* presentVertexFunction = presentLibrary->newFunction(NS::String::string("presentVertex", NS::ASCIIStringEncoding)); From c6ab45a098b8ab03e7f4cd32558c3e0245747c96 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 11 Aug 2024 14:17:40 +0200 Subject: [PATCH 053/368] fix: vertex buffer leaks --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 4 +++ .../Renderer/Metal/MetalMemoryManager.cpp | 27 ++++++++++--------- .../Latte/Renderer/Metal/MetalMemoryManager.h | 21 +++++++-------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 ++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 2 -- 5 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 2a2713e50..aa71731e1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -1,5 +1,9 @@ #pragma once +#include + +constexpr size_t INVALID_OFFSET = std::numeric_limits::max(); + inline size_t Align(size_t size, size_t alignment) { return (size + alignment - 1) & ~(alignment - 1); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 40832aa3d..09e07cd9f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -70,11 +70,11 @@ MetalVertexBufferCache::~MetalVertexBufferCache() for (uint32 i = 0; i < LATTE_MAX_VERTEX_BUFFERS; i++) { auto vertexBufferRange = m_bufferRanges[i]; - if (vertexBufferRange) + if (vertexBufferRange.offset != INVALID_OFFSET) { - if 
(vertexBufferRange->restrideInfo.buffer) + if (vertexBufferRange.restrideInfo->buffer) { - vertexBufferRange->restrideInfo.buffer->release(); + vertexBufferRange.restrideInfo->buffer->release(); } } } @@ -83,29 +83,30 @@ MetalVertexBufferCache::~MetalVertexBufferCache() MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride) { auto vertexBufferRange = m_bufferRanges[bufferIndex]; - auto& restrideInfo = vertexBufferRange->restrideInfo; + auto& restrideInfo = *vertexBufferRange.restrideInfo; if (stride % 4 == 0) { // No restride needed - return {bufferCache, vertexBufferRange->offset}; + return {bufferCache, vertexBufferRange.offset}; } if (restrideInfo.memoryInvalidated || stride != restrideInfo.lastStride) { // TODO: use compute/void vertex function instead size_t newStride = Align(stride, 4); - size_t newSize = vertexBufferRange->size / stride * newStride; + size_t newSize = vertexBufferRange.size / stride * newStride; restrideInfo.buffer = m_mtlr->GetDevice()->newBuffer(newSize, MTL::StorageModeShared); - uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange->offset; + uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; uint8* newPtr = (uint8*)restrideInfo.buffer->contents(); - for (size_t elem = 0; elem < vertexBufferRange->size / stride; elem++) + for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) { memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); } - debug_printf("Restrided vertex buffer (old stride: %zu, new stride: %zu, old size: %zu, new size: %zu)\n", stride, newStride, vertexBufferRange->size, newSize); + // TODO: remove + debug_printf("Restrided vertex buffer (old stride: %zu, new stride: %zu, old size: %zu, new size: %zu)\n", stride, newStride, vertexBufferRange.size, newSize); restrideInfo.memoryInvalidated = false; restrideInfo.lastStride = newStride; @@ -119,15 +120,15 @@ void 
MetalVertexBufferCache::MemoryRangeChanged(size_t offset, size_t size) for (uint32 i = 0; i < LATTE_MAX_VERTEX_BUFFERS; i++) { auto vertexBufferRange = m_bufferRanges[i]; - if (vertexBufferRange) + if (vertexBufferRange.offset != INVALID_OFFSET) { - if ((offset < vertexBufferRange->offset && (offset + size) < (vertexBufferRange->offset + vertexBufferRange->size)) || - (offset > vertexBufferRange->offset && (offset + size) > (vertexBufferRange->offset + vertexBufferRange->size))) + if ((offset < vertexBufferRange.offset && (offset + size) < (vertexBufferRange.offset + vertexBufferRange.size)) || + (offset > vertexBufferRange.offset && (offset + size) > (vertexBufferRange.offset + vertexBufferRange.size))) { continue; } - vertexBufferRange->restrideInfo.memoryInvalidated = true; + vertexBufferRange.restrideInfo->memoryInvalidated = true; } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 93011ae36..4f875687d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -1,9 +1,8 @@ #pragma once -#include - #include "Cafe/HW/Latte/ISA/LatteReg.h" #include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" //const uint32 bufferAllocatorIndexShift = 24; @@ -68,9 +67,9 @@ struct MetalRestrideInfo struct MetalVertexBufferRange { - size_t offset; + size_t offset = INVALID_OFFSET; size_t size; - MetalRestrideInfo& restrideInfo; + MetalRestrideInfo* restrideInfo; }; class MetalVertexBufferCache @@ -82,19 +81,19 @@ class MetalVertexBufferCache ~MetalVertexBufferCache(); // Vertex buffer cache - void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo& restrideInfo) + void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo* restrideInfo) { - m_bufferRanges[bufferIndex] = new MetalVertexBufferRange{offset, size, restrideInfo}; 
+ m_bufferRanges[bufferIndex] = MetalVertexBufferRange{offset, size, restrideInfo}; } void UntrackVertexBuffer(uint32 bufferIndex) { auto& range = m_bufferRanges[bufferIndex]; - if (range->restrideInfo.buffer) + if (range.restrideInfo->buffer) { - range->restrideInfo.buffer->release(); + range.restrideInfo->buffer->release(); } - range = nullptr; + range.offset = INVALID_OFFSET; } MetalRestridedBufferRange RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride); @@ -102,7 +101,7 @@ class MetalVertexBufferCache private: class MetalRenderer* m_mtlr; - MetalVertexBufferRange* m_bufferRanges[LATTE_MAX_VERTEX_BUFFERS] = {nullptr}; + MetalVertexBufferRange m_bufferRanges[LATTE_MAX_VERTEX_BUFFERS] = {}; void MemoryRangeChanged(size_t offset, size_t size); }; @@ -147,7 +146,7 @@ class MetalMemoryManager void CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size); // Vertex buffer cache - void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo& restrideInfo) + void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo* restrideInfo) { m_vertexBufferCache.TrackVertexBuffer(bufferIndex, offset, size, restrideInfo); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 3bef372b1..c3d278653 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -77,6 +77,7 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) m_metalLayer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle); m_metalLayer->setDevice(m_device); + // TODO: don't always force sRGB // TODO: shouldn't this be handled differently? 
m_metalLayer->setPixelFormat(MTL::PixelFormatRGBA8Unorm_sRGB); @@ -476,7 +477,7 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u buffer.size = size; buffer.restrideInfo = {}; - m_memoryManager->TrackVertexBuffer(bufferIndex, offset, size, buffer.restrideInfo); + m_memoryManager->TrackVertexBuffer(bufferIndex, offset, size, &buffer.restrideInfo); } void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index f70eec2c9..6bba8c745 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -16,8 +16,6 @@ #define MAX_MTL_TEXTURES 31 #define MAX_MTL_SAMPLERS 16 -constexpr size_t INVALID_OFFSET = std::numeric_limits::max(); - struct MetalBoundBuffer { bool needsRebind = false; From eb573fcacaa6bf762c6453169844c08ba11c09f7 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 11 Aug 2024 18:32:50 +0200 Subject: [PATCH 054/368] fix: color write mask and unpackHalf2x16 --- src/Cafe/HW/Latte/Core/LatteIndices.cpp | 6 +++--- .../LatteDecompilerEmitMSL.cpp | 5 +++-- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp | 12 ++++++++++++ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 3 +++ .../HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 13 ++++++++----- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 6 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index 891dc3e1c..75f0a5a2d 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -287,7 +287,7 @@ void LatteIndices_generateAutoLineLoopIndices(void* indexDataOutput, uint32 coun template void LatteIndices_unpackTriangleFanAndConvert(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& 
indexMin, uint32& indexMax) { - debug_printf("TRIANGLE FAN UNPACK\n"); + debug_printf("TRIANGLE FAN UNPACK %u\n", rand()); const betype* src = (betype*)indexDataInput; T* dst = (T*)indexDataOutput; // TODO: check this @@ -295,7 +295,7 @@ void LatteIndices_unpackTriangleFanAndConvert(const void* indexDataInput, void* { uint32 i0; if (i % 2 == 0) - i0 = i / 2; + i0 = i / 2; else i0 = count - 1 - i / 2; T idx = src[i0]; @@ -308,7 +308,7 @@ void LatteIndices_unpackTriangleFanAndConvert(const void* indexDataInput, void* template void LatteIndices_generateAutoTriangleFanIndices(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) { - debug_printf("TRIANGLE FAN AUTO\n"); + debug_printf("TRIANGLE FAN AUTO %u\n", rand()); const betype* src = (betype*)indexDataInput; T* dst = (T*)indexDataOutput; for (sint32 i = 0; i < count; i++) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 17ff27eab..c3cad925b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3734,8 +3734,9 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon // unpackHalf2x16 fCStr_shaderSource->add("" - "float2 unpackHalf2x16(float x) {\r\n" - "return float2(as_type(ushort(as_type(x) & 0x00FF)), as_type(ushort((as_type(x) & 0xFF00) >> 16)));\r\n" + "template\r\n" + "float2 unpackHalf2x16(T x) {\r\n" + "return float2(as_type(x));\r\n" "}\r\n"); // Bit cast diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index d03be2de8..0538650a3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -2,6 +2,7 @@ #include "Common/precompiled.h" #include "Metal/MTLDepthStencil.hpp" #include 
"Metal/MTLRenderCommandEncoder.hpp" +#include "Metal/MTLRenderPipeline.hpp" #include "Metal/MTLSampler.hpp" std::map MTL_COLOR_FORMAT_TABLE = { @@ -469,3 +470,14 @@ MTL::StencilOperation GetMtlStencilOp(Latte::LATTE_DB_DEPTH_CONTROL::E_STENCILAC cemu_assert_debug((uint32)action < std::size(MTL_STENCIL_OPERATIONS)); return MTL_STENCIL_OPERATIONS[(uint32)action]; } + +MTL::ColorWriteMask GetMtlColorWriteMask(uint8 mask) +{ + MTL::ColorWriteMask mtlMask = MTL::ColorWriteMaskNone; + if (mask & 0x1) mtlMask |= MTL::ColorWriteMaskRed; + if (mask & 0x2) mtlMask |= MTL::ColorWriteMaskGreen; + if (mask & 0x4) mtlMask |= MTL::ColorWriteMaskBlue; + if (mask & 0x8) mtlMask |= MTL::ColorWriteMaskAlpha; + + return mtlMask; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 5fd231866..c3f697bb3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -7,6 +7,7 @@ //#include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Metal/MTLDepthStencil.hpp" +#include "Metal/MTLRenderPipeline.hpp" #include "Metal/MTLSampler.hpp" #include "Metal/MTLTexture.hpp" @@ -55,3 +56,5 @@ MTL::SamplerAddressMode GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WOR MTL::TextureSwizzle GetMtlTextureSwizzle(uint32 swizzle); MTL::StencilOperation GetMtlStencilOp(Latte::LATTE_DB_DEPTH_CONTROL::E_STENCILACTION action); + +MTL::ColorWriteMask GetMtlColorWriteMask(uint8 mask); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index c1be4dbb1..289c1a609 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -92,6 +92,11 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS desc->setFragmentFunction(mtlPixelShader->GetFunction()); // TODO: don't always set the 
vertex descriptor? desc->setVertexDescriptor(vertexDescriptor); + + // Color attachments + const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = LatteGPUState.contextNew.CB_COLOR_CONTROL; + uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); + uint32 renderTargetMask = LatteGPUState.contextNew.CB_TARGET_MASK.get_MASK(); for (uint8 i = 0; i < 8; i++) { const auto& colorBuffer = activeFBO->colorBuffer[i]; @@ -102,12 +107,9 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS } auto colorAttachment = desc->colorAttachments()->object(i); colorAttachment->setPixelFormat(texture->GetRGBAView()->pixelFormat()); + colorAttachment->setWriteMask(GetMtlColorWriteMask((renderTargetMask >> (i * 4)) & 0xF)); // Blending - const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = LatteGPUState.contextNew.CB_COLOR_CONTROL; - uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); - uint32 renderTargetMask = LatteGPUState.contextNew.CB_TARGET_MASK.get_MASK(); - bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; // Only float data type is blendable if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).dataType == MetalDataType::FLOAT) @@ -120,7 +122,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); auto dstRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); - colorAttachment->setWriteMask((renderTargetMask >> (i * 4)) & 0xF); colorAttachment->setRgbBlendOperation(rgbBlendOp); colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); @@ -138,6 +139,8 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS } } } + + // Depth stencil attachment if (activeFBO->depthBuffer.texture) { auto texture = static_cast(activeFBO->depthBuffer.texture); diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index c3d278653..099f923b7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -79,7 +79,7 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) m_metalLayer->setDevice(m_device); // TODO: don't always force sRGB // TODO: shouldn't this be handled differently? - m_metalLayer->setPixelFormat(MTL::PixelFormatRGBA8Unorm_sRGB); + m_metalLayer->setPixelFormat(MTL::PixelFormatRGBA8Unorm/*_sRGB*/); // Present pipeline NS::Error* error = nullptr; From 8316cee59aabf018afbeb1562989b9c45b6cd85e Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 11 Aug 2024 20:09:48 +0200 Subject: [PATCH 055/368] prepare for surface copy --- src/Cafe/CMakeLists.txt | 4 +- .../LatteDecompilerEmitMSL.cpp | 7 ---- .../LatteDecompilerEmitMSLAttrDecoder.cpp | 8 ++-- .../Metal/MetalHybridComputePipeline.cpp | 31 ++++++++++++++ .../Metal/MetalHybridComputePipeline.h | 19 +++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 41 +++++++++++++------ .../HW/Latte/Renderer/Metal/MetalRenderer.h | 5 +++ .../Renderer/Metal/ShaderSourcePresent.h | 23 ----------- .../Renderer/Metal/UtilityShaderSource.h | 34 +++++++++++++++ 9 files changed, 124 insertions(+), 48 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h delete mode 100644 src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 3f2244915..d0e7d9210 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -560,7 +560,9 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalPipelineCache.h HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp 
HW/Latte/Renderer/Metal/MetalDepthStencilCache.h - HW/Latte/Renderer/Metal/ShaderSourcePresent.h + HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp + HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h + HW/Latte/Renderer/Metal/UtilityShaderSource.h ) #target_link_libraries(CemuCafe PRIVATE diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index c3cad925b..e769064fc 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3732,13 +3732,6 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "return round(x / 2.0) * 2.0;\r\n" "}\r\n"); - // unpackHalf2x16 - fCStr_shaderSource->add("" - "template\r\n" - "float2 unpackHalf2x16(T x) {\r\n" - "return float2(as_type(x));\r\n" - "}\r\n"); - // Bit cast // Scalar diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp index a9993964b..ee4382987 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp @@ -256,7 +256,7 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext // seen in Giana Sisters: Twisted Dreams _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); // TODO: uint4? 
- src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); + src->add("attrDecoder.xyzw = as_type(float4(float2(as_type(attrDecoder.x|(attrDecoder.y<<16))),float2(as_type(attrDecoder.z|(attrDecoder.w<<16)))));" _CRLF); } else if (attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) { @@ -271,7 +271,7 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext { // seen in Giana Sisters: Twisted Dreams _readLittleEndianAttributeU16x2(shaderContext, src, attributeInputIndex); - src->add("attrDecoder.xy = as_type(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)));" _CRLF); + src->add("attrDecoder.xy = as_type(float2(as_type(attrDecoder.x|(attrDecoder.y<<16))));" _CRLF); src->add("attrDecoder.zw = uint2(0);" _CRLF); } else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) @@ -394,7 +394,7 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext { _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); // TODO: uint4? 
- src->add("attrDecoder.xyzw = as_type(float4(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)),unpackHalf2x16(attrDecoder.z|(attrDecoder.w<<16))));" _CRLF); + src->add("attrDecoder.xyzw = as_type(float4(float2(as_type(attrDecoder.x|(attrDecoder.y<<16))),float2(as_type(attrDecoder.z|(attrDecoder.w<<16)))));" _CRLF); } else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) { @@ -446,7 +446,7 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext else if( attrib->format == FMT_16_16_FLOAT && attrib->nfa == 2 ) { _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); - src->add("attrDecoder.xy = as_type(unpackHalf2x16(attrDecoder.x|(attrDecoder.y<<16)));" _CRLF); + src->add("attrDecoder.xy = as_type(float2(as_type(attrDecoder.x|(attrDecoder.y<<16))));" _CRLF); src->add("attrDecoder.zw = uint2(0);" _CRLF); } else if( attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned == 0 ) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp new file mode 100644 index 000000000..3802939ba --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp @@ -0,0 +1,31 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" + +MetalHybridComputePipeline::MetalHybridComputePipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const char* vertexFunctionName, const char* kernelFunctionName) +{ + // Render pipeline state + MTL::Function* vertexFunction = library->newFunction(NS::String::string(vertexFunctionName, NS::ASCIIStringEncoding)); + + MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(vertexFunction); + renderPipelineDescriptor->setRasterizationEnabled(false); + + NS::Error* error = nullptr; + m_renderPipelineState = 
mtlRenderer->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); + renderPipelineDescriptor->release(); + vertexFunction->release(); + if (error) + { + printf("error creating hybrid render pipeline state: %s\n", error->localizedDescription()->utf8String()); + error->release(); + } + + // Compute pipeline state + // TODO +} + +MetalHybridComputePipeline::~MetalHybridComputePipeline() +{ + m_renderPipelineState->release(); + // TODO: uncomment + //m_computePipelineState->release(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h new file mode 100644 index 000000000..7d586e242 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h @@ -0,0 +1,19 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLLibrary.hpp" +#include "Metal/MTLRenderPipeline.hpp" + +class MetalHybridComputePipeline +{ +public: + MetalHybridComputePipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const char* vertexFunctionName, const char* kernelFunctionName); + ~MetalHybridComputePipeline(); + + MTL::RenderPipelineState* GetRenderPipelineState() const { return m_renderPipelineState; } + + MTL::RenderPipelineState* GetComputePipelineState() const { return m_computePipelineState; } + +private: + MTL::RenderPipelineState* m_renderPipelineState; + MTL::RenderPipelineState* m_computePipelineState; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 099f923b7..2f03d0a24 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -7,13 +7,15 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" 
+#include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h" +#include "Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" +#include "Foundation/NSError.hpp" #include "HW/Latte/Core/Latte.h" #include "HW/Latte/ISA/LatteReg.h" #include "Metal/MTLPixelFormat.hpp" @@ -54,10 +56,31 @@ MetalRenderer::MetalRenderer() m_state.uniformBufferOffsets[i][j] = INVALID_OFFSET; } } + + // Utility shader source + NS::Error* error = nullptr; + m_utilityLibrary = m_device->newLibrary(NS::String::string(utilityShaderSource, NS::ASCIIStringEncoding), nullptr, &error); + if (error) + { + debug_printf("failed to create present library (error: %s)\n", error->localizedDescription()->utf8String()); + error->release(); + return; + } + + // Hybrid pipelines + m_copyDepthToColorPipeline = new MetalHybridComputePipeline(this, m_utilityLibrary, "vertexCopyDepthToColor", "kernelCopyDepthToColor"); + m_copyColorToDepthPipeline = new MetalHybridComputePipeline(this, m_utilityLibrary, "vertexCopyColorToDepth", "kernelCopyColorToDepth"); } MetalRenderer::~MetalRenderer() { + delete m_copyDepthToColorPipeline; + delete m_copyColorToDepthPipeline; + + m_presentPipeline->release(); + + m_utilityLibrary->release(); + delete m_depthStencilCache; delete m_pipelineCache; delete m_memoryManager; @@ -82,22 +105,15 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) m_metalLayer->setPixelFormat(MTL::PixelFormatRGBA8Unorm/*_sRGB*/); // Present pipeline - NS::Error* error = nullptr; - MTL::Library* presentLibrary = m_device->newLibrary(NS::String::string(presentLibrarySource, NS::ASCIIStringEncoding), nullptr, &error); - if (error) - { - debug_printf("failed to create present library (error: %s)\n", 
error->localizedDescription()->utf8String()); - error->release(); - return; - } - MTL::Function* presentVertexFunction = presentLibrary->newFunction(NS::String::string("presentVertex", NS::ASCIIStringEncoding)); - MTL::Function* presentFragmentFunction = presentLibrary->newFunction(NS::String::string("presentFragment", NS::ASCIIStringEncoding)); - presentLibrary->release(); + MTL::Function* presentVertexFunction = m_utilityLibrary->newFunction(NS::String::string("vertexFullscreen", NS::ASCIIStringEncoding)); + MTL::Function* presentFragmentFunction = m_utilityLibrary->newFunction(NS::String::string("fragmentPresent", NS::ASCIIStringEncoding)); MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); renderPipelineDescriptor->setVertexFunction(presentVertexFunction); renderPipelineDescriptor->setFragmentFunction(presentFragmentFunction); renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(m_metalLayer->pixelFormat()); + + NS::Error* error = nullptr; m_presentPipeline = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); renderPipelineDescriptor->release(); presentVertexFunction->release(); @@ -106,7 +122,6 @@ void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { debug_printf("failed to create present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); - return; } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 6bba8c745..efff02649 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -257,8 +257,13 @@ class MetalRenderer : public Renderer MTL::CommandQueue* m_commandQueue; // Pipelines + MTL::Library* m_utilityLibrary; MTL::RenderPipelineState* m_presentPipeline; + // Hybrid pipelines + class MetalHybridComputePipeline* m_copyDepthToColorPipeline; + class MetalHybridComputePipeline* 
m_copyColorToDepthPipeline; + // Basic MTL::SamplerState* m_nearestSampler; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h b/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h deleted file mode 100644 index a77ce2cdb..000000000 --- a/src/Cafe/HW/Latte/Renderer/Metal/ShaderSourcePresent.h +++ /dev/null @@ -1,23 +0,0 @@ -inline const char* presentLibrarySource = \ -"#include \n" \ -"using namespace metal;\n" \ -"\n" \ -"constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)};\n" -"\n" \ -"struct VertexOut {\n" \ -" float4 position [[position]];\n" \ -" float2 texCoord;\n" \ -"};\n" \ -"\n" \ -"vertex VertexOut presentVertex(ushort vid [[vertex_id]]) {\n" \ -" VertexOut out;\n" \ -" out.position = float4(positions[vid], 0.0, 1.0);\n" \ -" out.texCoord = positions[vid] * 0.5 + 0.5;\n" \ -" out.texCoord.y = 1.0 - out.texCoord.y;\n" \ -"\n" \ -" return out;\n" \ -"}\n" \ -"\n" \ -"fragment float4 presentFragment(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], sampler samplr [[sampler(0)]]) {\n" \ -" return tex.sample(samplr, in.texCoord);\n" \ -"}\n"; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h new file mode 100644 index 000000000..edfee9bad --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -0,0 +1,34 @@ +inline const char* utilityShaderSource = \ +"#include \n" \ +"using namespace metal;\n" \ +"\n" \ +"constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)};\n" +"\n" \ +"struct VertexOut {\n" \ +" float4 position [[position]];\n" \ +" float2 texCoord;\n" \ +"};\n" \ +"\n" \ +"vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) {\n" \ +" VertexOut out;\n" \ +" out.position = float4(positions[vid], 0.0, 1.0);\n" \ +" out.texCoord = positions[vid] * 0.5 + 0.5;\n" \ +" out.texCoord.y = 1.0 - out.texCoord.y;\n" \ +"\n" \ +" return out;\n" \ +"}\n" \ +"\n" \ 
+"fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], sampler samplr [[sampler(0)]]) {\n" \ +" return tex.sample(samplr, in.texCoord);\n" \ +"}\n" \ +"\n" \ +"vertex void vertexCopyDepthToColor(uint vid [[vertex_id]], depth2d src [[texture(0)]], texture2d dst [[texture(1)]], constant uint& copyWidth) {\n" \ +" uint2 coord = uint2(vid % copyWidth, vid / copyWidth);\n" \ +" return dst.write(float4(src.read(coord), 0.0, 0.0, 0.0), coord);\n" \ +"}\n" \ +"\n" \ +"vertex void vertexCopyColorToDepth(uint vid [[vertex_id]], texture2d src [[texture(0)]], texture2d dst [[texture(1)]], constant uint& copyWidth) {\n" \ +" uint2 coord = uint2(vid % copyWidth, vid / copyWidth);\n" \ +" return dst.write(float4(src.read(coord).r), coord);\n" \ +"}\n" \ +"\n"; From 5bc9913bf482f4e26c068381de6801c37f60a06b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 11 Aug 2024 20:31:19 +0200 Subject: [PATCH 056/368] implement surface copy --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 72 ++++++++++++++++--- 1 file changed, 63 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 2f03d0a24..f3e3fdb13 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -15,15 +15,7 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" -#include "Foundation/NSError.hpp" -#include "HW/Latte/Core/Latte.h" -#include "HW/Latte/ISA/LatteReg.h" -#include "Metal/MTLPixelFormat.hpp" -#include "Metal/MTLRenderCommandEncoder.hpp" -#include "Metal/MTLResource.hpp" -#include "Metal/MTLTypes.hpp" #include "gui/guiWrapper.h" -#include extern bool hasValidFramebufferAttached; @@ -452,7 +444,68 @@ LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView void 
MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { - debug_printf("MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion not implemented\n"); + EnsureCommandBuffer(); + + // scale copy size to effective size + sint32 effectiveCopyWidth = width; + sint32 effectiveCopyHeight = height; + LatteTexture_scaleToEffectiveSize(sourceTexture, &effectiveCopyWidth, &effectiveCopyHeight, 0); + sint32 sourceEffectiveWidth, sourceEffectiveHeight; + sourceTexture->GetEffectiveSize(sourceEffectiveWidth, sourceEffectiveHeight, srcMip); + + sint32 texSrcMip = srcMip; + sint32 texSrcSlice = srcSlice; + sint32 texDstMip = dstMip; + sint32 texDstSlice = dstSlice; + + LatteTextureMtl* srcTextureMtl = static_cast(sourceTexture); + LatteTextureMtl* dstTextureMtl = static_cast(destinationTexture); + + // check if texture rescale ratios match + // todo - if not, we have to use drawcall based copying + if (!LatteTexture_doesEffectiveRescaleRatioMatch(srcTextureMtl, texSrcMip, dstTextureMtl, texDstMip)) + { + cemuLog_logDebug(LogType::Force, "surfaceCopy_copySurfaceWithFormatConversion(): Mismatching dimensions"); + return; + } + + // check if bpp size matches + if (srcTextureMtl->GetBPP() != dstTextureMtl->GetBPP()) + { + cemuLog_logDebug(LogType::Force, "surfaceCopy_copySurfaceWithFormatConversion(): Mismatching BPP"); + return; + } + + MetalHybridComputePipeline* copyPipeline; + if (srcTextureMtl->IsDepth()) + copyPipeline = m_copyDepthToColorPipeline; + else + copyPipeline = m_copyColorToDepthPipeline; + + MTL::Texture* textures[] = {srcTextureMtl->GetTexture(), dstTextureMtl->GetTexture()}; + + if (m_encoderType == MetalEncoderType::Render) + { + auto renderCommandEncoder = static_cast(m_commandEncoder); + + renderCommandEncoder->setRenderPipelineState(copyPipeline->GetRenderPipelineState()); + + 
renderCommandEncoder->setViewport(MTL::Viewport{0.0, 0.0, (double)effectiveCopyWidth, (double)effectiveCopyHeight, 0.0, 1.0}); + renderCommandEncoder->setScissorRect(MTL::ScissorRect{0, 0, (uint32)effectiveCopyWidth, (uint32)effectiveCopyHeight}); + + renderCommandEncoder->setVertexTextures(textures, NS::Range(0, 2)); + renderCommandEncoder->setVertexBytes(&effectiveCopyWidth, sizeof(uint32), 0); + // TODO: set slices and mips + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); + } + else + { + // TODO: do the copy in a compute shader + debug_printf("surfaceCopy_copySurfaceWithFormatConversion: no active render command encoder, skipping copy\n"); + } + + // TODO: restore state } void MetalRenderer::bufferCache_init(const sint32 bufferSize) @@ -884,6 +937,7 @@ void MetalRenderer::EndEncoding() m_commandEncoder->endEncoding(); m_commandEncoder->release(); m_commandEncoder = nullptr; + m_encoderType = MetalEncoderType::None; } } From 7fb3e1bd1e846f6d5079eebab8ec531bbbd79393 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 12 Aug 2024 10:27:52 +0200 Subject: [PATCH 057/368] support cubemap arrays --- .../HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 63cd69f60..5d5273caa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -42,7 +42,13 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM textureType = MTL::TextureType3D; break; case Latte::E_DIM::DIM_CUBEMAP: - textureType = MTL::TextureTypeCube; // TODO: check this + if (effectiveBaseDepth % 6 != 0) + debug_printf("cubemaps must have an array length multiple of 6, length: %u\n", effectiveBaseDepth); + + if (effectiveBaseDepth <= 6) + textureType = MTL::TextureTypeCube; + 
else + textureType = MTL::TextureTypeCubeArray; break; default: cemu_assert_unimplemented(); @@ -55,7 +61,11 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM { desc->setDepth(effectiveBaseDepth); } - else if (textureType == MTL::TextureTypeCube || textureType == MTL::TextureTypeCubeArray) + else if (textureType == MTL::TextureTypeCube) + { + // Do notjing + } + else if (textureType == MTL::TextureTypeCubeArray) { desc->setArrayLength(effectiveBaseDepth / 6); } From 5e748cdb48038c05b0a331da210a84aa9883daf9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 12 Aug 2024 11:14:55 +0200 Subject: [PATCH 058/368] support surface copy parameters --- .../Renderer/Metal/MetalMemoryManager.cpp | 1 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 26 +++++++++---------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 3 +-- .../Renderer/Metal/UtilityShaderSource.h | 17 +++++++----- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 09e07cd9f..aef458a7c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -96,6 +96,7 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu // TODO: use compute/void vertex function instead size_t newStride = Align(stride, 4); size_t newSize = vertexBufferRange.size / stride * newStride; + // TODO: use one big buffer for all restrided buffers restrideInfo.buffer = m_mtlr->GetDevice()->newBuffer(newSize, MTL::StorageModeShared); uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f3e3fdb13..7afbefd63 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -60,14 +60,12 
@@ MetalRenderer::MetalRenderer() } // Hybrid pipelines - m_copyDepthToColorPipeline = new MetalHybridComputePipeline(this, m_utilityLibrary, "vertexCopyDepthToColor", "kernelCopyDepthToColor"); - m_copyColorToDepthPipeline = new MetalHybridComputePipeline(this, m_utilityLibrary, "vertexCopyColorToDepth", "kernelCopyColorToDepth"); + m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, m_utilityLibrary, "vertexCopyTextureToTexture", "kernelCopyTextureToTexture"); } MetalRenderer::~MetalRenderer() { - delete m_copyDepthToColorPipeline; - delete m_copyColorToDepthPipeline; + delete m_copyTextureToTexturePipeline; m_presentPipeline->release(); @@ -476,26 +474,28 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so return; } - MetalHybridComputePipeline* copyPipeline; - if (srcTextureMtl->IsDepth()) - copyPipeline = m_copyDepthToColorPipeline; - else - copyPipeline = m_copyColorToDepthPipeline; - MTL::Texture* textures[] = {srcTextureMtl->GetTexture(), dstTextureMtl->GetTexture()}; + struct CopyParams + { + uint32 width; + uint32 srcMip; + uint32 srcSlice; + uint32 dstMip; + uint32 dstSlice; + } params{(uint32)effectiveCopyWidth, (uint32)texSrcMip, (uint32)texSrcSlice, (uint32)texDstMip, (uint32)texDstSlice}; + if (m_encoderType == MetalEncoderType::Render) { auto renderCommandEncoder = static_cast(m_commandEncoder); - renderCommandEncoder->setRenderPipelineState(copyPipeline->GetRenderPipelineState()); + renderCommandEncoder->setRenderPipelineState(m_copyTextureToTexturePipeline->GetRenderPipelineState()); renderCommandEncoder->setViewport(MTL::Viewport{0.0, 0.0, (double)effectiveCopyWidth, (double)effectiveCopyHeight, 0.0, 1.0}); renderCommandEncoder->setScissorRect(MTL::ScissorRect{0, 0, (uint32)effectiveCopyWidth, (uint32)effectiveCopyHeight}); renderCommandEncoder->setVertexTextures(textures, NS::Range(0, 2)); - renderCommandEncoder->setVertexBytes(&effectiveCopyWidth, sizeof(uint32), 0); - // TODO: set slices and 
mips + renderCommandEncoder->setVertexBytes(&params, sizeof(params), 0); renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index efff02649..0b2208ec5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -261,8 +261,7 @@ class MetalRenderer : public Renderer MTL::RenderPipelineState* m_presentPipeline; // Hybrid pipelines - class MetalHybridComputePipeline* m_copyDepthToColorPipeline; - class MetalHybridComputePipeline* m_copyColorToDepthPipeline; + class MetalHybridComputePipeline* m_copyTextureToTexturePipeline; // Basic MTL::SamplerState* m_nearestSampler; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index edfee9bad..2e49fa951 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -22,13 +22,16 @@ inline const char* utilityShaderSource = \ " return tex.sample(samplr, in.texCoord);\n" \ "}\n" \ "\n" \ -"vertex void vertexCopyDepthToColor(uint vid [[vertex_id]], depth2d src [[texture(0)]], texture2d dst [[texture(1)]], constant uint& copyWidth) {\n" \ -" uint2 coord = uint2(vid % copyWidth, vid / copyWidth);\n" \ -" return dst.write(float4(src.read(coord), 0.0, 0.0, 0.0), coord);\n" \ -"}\n" \ +"struct CopyParams {\n" \ +" uint width;\n" \ +" uint srcMip;\n" \ +" uint srcSlice;\n" \ +" uint dstMip;\n" \ +" uint dstSlice;\n" \ +"};\n" \ "\n" \ -"vertex void vertexCopyColorToDepth(uint vid [[vertex_id]], texture2d src [[texture(0)]], texture2d dst [[texture(1)]], constant uint& copyWidth) {\n" \ -" uint2 coord = uint2(vid % copyWidth, vid / copyWidth);\n" \ -" return dst.write(float4(src.read(coord).r), coord);\n" \ +"vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], 
texture2d_array src [[texture(0)]], texture2d_array dst [[texture(1)]], constant CopyParams& params) {\n" \ +" uint2 coord = uint2(vid % params.width, vid / params.width);\n" \ +" return dst.write(float4(src.read(coord, params.srcSlice, params.srcMip).r, 0.0, 0.0, 0.0), coord, params.dstSlice, params.dstMip);\n" \ "}\n" \ "\n"; From 34d8076ab6f97dd9745a5cde337480eca0a6b6ec Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 12 Aug 2024 11:36:12 +0200 Subject: [PATCH 059/368] fix: srgb --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 181 ++++++++---------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 32 ++-- 2 files changed, 101 insertions(+), 112 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 7afbefd63..ba50f0725 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -15,6 +15,7 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" +#include "Metal/MTLPixelFormat.hpp" #include "gui/guiWrapper.h" extern bool hasValidFramebufferAttached; @@ -45,13 +46,13 @@ MetalRenderer::MetalRenderer() { for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) { - m_state.uniformBufferOffsets[i][j] = INVALID_OFFSET; + m_state.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; } } // Utility shader source NS::Error* error = nullptr; - m_utilityLibrary = m_device->newLibrary(NS::String::string(utilityShaderSource, NS::ASCIIStringEncoding), nullptr, &error); + MTL::Library* utilityLibrary = m_device->newLibrary(NS::String::string(utilityShaderSource, NS::ASCIIStringEncoding), nullptr, &error); if (error) { debug_printf("failed to create present library (error: %s)\n", error->localizedDescription()->utf8String()); @@ -59,17 +60,46 @@ MetalRenderer::MetalRenderer() return; } + // Present pipeline + MTL::Function* presentVertexFunction = 
utilityLibrary->newFunction(NS::String::string("vertexFullscreen", NS::ASCIIStringEncoding)); + MTL::Function* presentFragmentFunction = utilityLibrary->newFunction(NS::String::string("fragmentPresent", NS::ASCIIStringEncoding)); + + MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(presentVertexFunction); + renderPipelineDescriptor->setFragmentFunction(presentFragmentFunction); + presentVertexFunction->release(); + presentFragmentFunction->release(); + + error = nullptr; + renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatRGBA8Unorm); + m_presentPipelineLinear = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); + if (error) + { + debug_printf("failed to create linear present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); + error->release(); + } + + error = nullptr; + renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatRGBA8Unorm_sRGB); + m_presentPipelineSRGB = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); + renderPipelineDescriptor->release(); + if (error) + { + debug_printf("failed to create sRGB present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); + error->release(); + } + // Hybrid pipelines - m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, m_utilityLibrary, "vertexCopyTextureToTexture", "kernelCopyTextureToTexture"); + m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture", "kernelCopyTextureToTexture"); + utilityLibrary->release(); } MetalRenderer::~MetalRenderer() { delete m_copyTextureToTexturePipeline; - m_presentPipeline->release(); - - m_utilityLibrary->release(); + m_presentPipelineLinear->release(); + m_presentPipelineSRGB->release(); delete m_depthStencilCache; delete m_pipelineCache; @@ -90,29 +120,6 @@ void 
MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) m_metalLayer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle); m_metalLayer->setDevice(m_device); - // TODO: don't always force sRGB - // TODO: shouldn't this be handled differently? - m_metalLayer->setPixelFormat(MTL::PixelFormatRGBA8Unorm/*_sRGB*/); - - // Present pipeline - MTL::Function* presentVertexFunction = m_utilityLibrary->newFunction(NS::String::string("vertexFullscreen", NS::ASCIIStringEncoding)); - MTL::Function* presentFragmentFunction = m_utilityLibrary->newFunction(NS::String::string("fragmentPresent", NS::ASCIIStringEncoding)); - - MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); - renderPipelineDescriptor->setVertexFunction(presentVertexFunction); - renderPipelineDescriptor->setFragmentFunction(presentFragmentFunction); - renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(m_metalLayer->pixelFormat()); - - NS::Error* error = nullptr; - m_presentPipeline = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); - renderPipelineDescriptor->release(); - presentVertexFunction->release(); - presentFragmentFunction->release(); - if (error) - { - debug_printf("failed to create present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); - error->release(); - } } void MetalRenderer::Initialize() @@ -146,7 +153,7 @@ bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const void MetalRenderer::ClearColorbuffer(bool padView) { - if (!AcquireNextDrawable()) + if (!AcquireNextDrawable(!padView)) return; ClearColorTextureInternal(m_drawable->texture(), 0, 0, 0.0f, 0.0f, 0.0f, 0.0f); @@ -183,9 +190,12 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) { - if (!AcquireNextDrawable()) + if (!AcquireNextDrawable(!padView)) return; + if 
(clearBackground) + ClearColorbuffer(padView); + MTL::Texture* presentTexture = static_cast(texView)->GetRGBAView(); // Create render pass @@ -199,7 +209,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput renderPassDescriptor->release(); // Draw to Metal layer - renderCommandEncoder->setRenderPipelineState(m_presentPipeline); + renderCommandEncoder->setRenderPipelineState(m_state.m_usesSRGB ? m_presentPipelineSRGB : m_presentPipelineLinear); renderCommandEncoder->setFragmentTexture(presentTexture, 0); renderCommandEncoder->setFragmentSamplerState(m_nearestSampler, 0); @@ -208,7 +218,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput bool MetalRenderer::BeginFrame(bool mainWindow) { - return AcquireNextDrawable(); + return AcquireNextDrawable(mainWindow); } void MetalRenderer::Flush(bool waitIdle) @@ -234,19 +244,19 @@ void MetalRenderer::AppendOverlayDebugInfo() void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) { - m_state.viewport = MTL::Viewport{x, y, width, height, nearZ, farZ}; + m_state.m_viewport = MTL::Viewport{x, y, width, height, nearZ, farZ}; if (m_encoderType == MetalEncoderType::Render) { - static_cast(m_commandEncoder)->setViewport(m_state.viewport); + static_cast(m_commandEncoder)->setViewport(m_state.m_viewport); } } void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) { - m_state.scissor = MTL::ScissorRect{NS::UInteger(scissorX), NS::UInteger(scissorY), NS::UInteger(scissorWidth), NS::UInteger(scissorHeight)}; + m_state.m_scissor = MTL::ScissorRect{NS::UInteger(scissorX), NS::UInteger(scissorY), NS::UInteger(scissorWidth), NS::UInteger(scissorHeight)}; if (m_encoderType == MetalEncoderType::Render) { - static_cast(m_commandEncoder)->setScissorRect(m_state.scissor); + static_cast(m_commandEncoder)->setScissorRect(m_state.m_scissor); } } @@ 
-257,13 +267,13 @@ LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo) { - if (cfbo == (LatteCachedFBO*)m_state.activeFBO) - m_state.activeFBO = nullptr; + if (cfbo == (LatteCachedFBO*)m_state.m_activeFBO) + m_state.m_activeFBO = nullptr; } void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) { - m_state.activeFBO = (CachedFBOMtl*)cfbo; + m_state.m_activeFBO = (CachedFBOMtl*)cfbo; } void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) @@ -347,7 +357,7 @@ LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR phys void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) { - m_state.textures[textureUnit] = static_cast(textureView); + m_state.m_textures[textureUnit] = static_cast(textureView); } void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth_) @@ -531,7 +541,7 @@ void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS); - auto& buffer = m_state.vertexBuffers[bufferIndex]; + auto& buffer = m_state.m_vertexBuffers[bufferIndex]; if (buffer.offset == offset && buffer.size == size) return; @@ -550,7 +560,7 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { - m_state.uniformBufferOffsets[(uint32)shaderType][bufferIndex] = offset; + m_state.m_uniformBufferOffsets[(uint32)shaderType][bufferIndex] = 
offset; } RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) @@ -575,14 +585,14 @@ void MetalRenderer::streamout_rendererFinishDrawcall() void MetalRenderer::draw_beginSequence() { - m_state.skipDrawSequence = false; + m_state.m_skipDrawSequence = false; // update shader state LatteSHRC_UpdateActiveShaders(); if (LatteGPUState.activeShaderHasError) { debug_printf("Skipping drawcalls due to shader error\n"); - m_state.skipDrawSequence = true; + m_state.m_skipDrawSequence = true; cemu_assert_debug(false); return; } @@ -595,14 +605,14 @@ void MetalRenderer::draw_beginSequence() if (!LatteMRT::UpdateCurrentFBO()) { debug_printf("Rendertarget invalid\n"); - m_state.skipDrawSequence = true; + m_state.m_skipDrawSequence = true; return; // no render target } if (!hasValidFramebufferAttached) { debug_printf("Drawcall with no color buffer or depth buffer attached\n"); - m_state.skipDrawSequence = true; + m_state.m_skipDrawSequence = true; return; // no render target } LatteTexture_updateTextures(); @@ -626,7 +636,7 @@ void MetalRenderer::draw_beginSequence() rasterizerEnable = true; if (!rasterizerEnable == false) - m_state.skipDrawSequence = true; + m_state.m_skipDrawSequence = true; } void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) @@ -637,24 +647,24 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 //} // Render pass - if (!m_state.activeFBO) + if (!m_state.m_activeFBO) { debug_printf("no active FBO, skipping draw\n"); return; } - auto renderPassDescriptor = m_state.activeFBO->GetRenderPassDescriptor(); + auto renderPassDescriptor = m_state.m_activeFBO->GetRenderPassDescriptor(); MTL::Texture* colorRenderTargets[8] = {nullptr}; MTL::Texture* depthRenderTarget = 
nullptr; for (uint32 i = 0; i < 8; i++) { - auto colorTexture = static_cast(m_state.activeFBO->colorBuffer[i].texture); + auto colorTexture = static_cast(m_state.m_activeFBO->colorBuffer[i].texture); if (colorTexture) { colorRenderTargets[i] = colorTexture->GetRGBAView(); } } - auto depthTexture = static_cast(m_state.activeFBO->depthBuffer.texture); + auto depthTexture = static_cast(m_state.m_activeFBO->depthBuffer.texture); if (depthTexture) { depthRenderTarget = depthTexture->GetRGBAView(); @@ -672,7 +682,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Render pipeline state - MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.lastUsedFBO, LatteGPUState.contextNew); + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew); renderCommandEncoder->setRenderPipelineState(renderPipelineState); // Depth stencil state @@ -758,7 +768,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Vertex buffers for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) { - auto& vertexBufferRange = m_state.vertexBuffers[i]; + auto& vertexBufferRange = m_state.m_vertexBuffers[i]; if (vertexBufferRange.offset != INVALID_OFFSET) { // Restride @@ -846,7 +856,7 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPas bool needsNewRenderPass = false; for (uint8 i = 0; i < 8; i++) { - if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.colorRenderTargets[i])) + if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.m_colorRenderTargets[i])) { needsNewRenderPass = true; break; @@ -855,7 +865,7 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPas if (!needsNewRenderPass) { - if (depthRenderTarget && 
(depthRenderTarget != m_state.depthRenderTarget)) + if (depthRenderTarget && (depthRenderTarget != m_state.m_depthRenderTarget)) { needsNewRenderPass = true; } @@ -872,12 +882,12 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPas } // Update state - m_state.lastUsedFBO = m_state.activeFBO; + m_state.m_lastUsedFBO = m_state.m_activeFBO; for (uint8 i = 0; i < 8; i++) { - m_state.colorRenderTargets[i] = colorRenderTargets[i]; + m_state.m_colorRenderTargets[i] = colorRenderTargets[i]; } - m_state.depthRenderTarget = depthRenderTarget; + m_state.m_depthRenderTarget = depthRenderTarget; auto renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); m_commandEncoder = renderCommandEncoder; @@ -959,8 +969,15 @@ void MetalRenderer::CommitCommandBuffer() } } -bool MetalRenderer::AcquireNextDrawable() +bool MetalRenderer::AcquireNextDrawable(bool mainWindow) { + const bool latteBufferUsesSRGB = mainWindow ? LatteGPUState.tvBufferUsesSRGB : LatteGPUState.drcBufferUsesSRGB; + if (latteBufferUsesSRGB != m_state.m_usesSRGB) + { + m_metalLayer->setPixelFormat(latteBufferUsesSRGB ? MTL::PixelFormatRGBA8Unorm_sRGB : MTL::PixelFormatRGBA8Unorm); + m_state.m_usesSRGB = latteBufferUsesSRGB; + } + if (m_drawable) { // TODO: should this be true? 
@@ -1005,7 +1022,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE UNREACHABLE; } - auto textureView = m_state.textures[hostTextureUnit]; + auto textureView = m_state.m_textures[hostTextureUnit]; if (!textureView) { debug_printf("invalid bound texture view %u\n", hostTextureUnit); @@ -1283,7 +1300,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE debug_printf("too big buffer index (%u), skipping binding\n", binding); continue; } - size_t offset = m_state.uniformBufferOffsets[(uint32)shader->shaderType][i]; + size_t offset = m_state.m_uniformBufferOffsets[(uint32)shader->shaderType][i]; if (offset != INVALID_OFFSET) { switch (shader->shaderType) @@ -1329,49 +1346,15 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE void MetalRenderer::RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder) { // Viewport - //if (m_state.viewport.width != 0.0) - //{ - renderCommandEncoder->setViewport(m_state.viewport); - /* - } - else - { - // Find the framebuffer dimensions - uint32 framebufferWidth = 0, framebufferHeight = 0; - if (m_state.activeFBO->hasDepthBuffer()) - { - framebufferHeight = m_state.activeFBO->depthBuffer.texture->baseTexture->width; - framebufferHeight = m_state.activeFBO->depthBuffer.texture->baseTexture->height; - } - else - { - for (uint8 i = 0; i < 8; i++) - { - auto texture = m_state.activeFBO->colorBuffer[i].texture; - if (texture) - { - framebufferWidth = texture->baseTexture->width; - framebufferHeight = texture->baseTexture->height; - break; - } - } - } - - MTL::Viewport viewport{0, (double)framebufferHeight, (double)framebufferWidth, -(double)framebufferHeight, 0.0, 1.0}; - renderCommandEncoder->setViewport(viewport); - } - */ + renderCommandEncoder->setViewport(m_state.m_viewport); // Scissor - //if (m_state.scissor.width != 0) - //{ - renderCommandEncoder->setScissorRect(m_state.scissor); - //} + 
renderCommandEncoder->setScissorRect(m_state.m_scissor); // Vertex buffers for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) { - auto& vertexBufferRange = m_state.vertexBuffers[i]; + auto& vertexBufferRange = m_state.m_vertexBuffers[i]; if (vertexBufferRange.offset != INVALID_OFFSET) vertexBufferRange.needsRebind = true; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 0b2208ec5..3868ae709 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -27,18 +27,24 @@ struct MetalBoundBuffer struct MetalState { - bool skipDrawSequence = false; - class CachedFBOMtl* activeFBO = nullptr; + bool m_usesSRGB = false; + + bool m_skipDrawSequence = false; + + class CachedFBOMtl* m_activeFBO = nullptr; // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change' - class CachedFBOMtl* lastUsedFBO = nullptr; - MetalBoundBuffer vertexBuffers[MAX_MTL_BUFFERS] = {{}}; + class CachedFBOMtl* m_lastUsedFBO = nullptr; + + MetalBoundBuffer m_vertexBuffers[MAX_MTL_BUFFERS] = {{}}; // TODO: find out what is the max number of bound textures on the Wii U - class LatteTextureViewMtl* textures[64] = {nullptr}; - size_t uniformBufferOffsets[(uint32)LatteConst::ShaderType::TotalCount][MAX_MTL_BUFFERS]; - MTL::Texture* colorRenderTargets[8] = {nullptr}; - MTL::Texture* depthRenderTarget = nullptr; - MTL::Viewport viewport = {0, 0, 0, 0, 0, 0}; - MTL::ScissorRect scissor = {0, 0, 0, 0}; + class LatteTextureViewMtl* m_textures[64] = {nullptr}; + size_t m_uniformBufferOffsets[(uint32)LatteConst::ShaderType::TotalCount][MAX_MTL_BUFFERS]; + + MTL::Texture* m_colorRenderTargets[8] = {nullptr}; + MTL::Texture* m_depthRenderTarget = nullptr; + + MTL::Viewport m_viewport = {0, 0, 0, 0, 0, 0}; + MTL::ScissorRect m_scissor = {0, 0, 0, 0}; }; enum class MetalEncoderType @@ -232,7 +238,7 @@ class MetalRenderer : public Renderer 
void EndEncoding(); void CommitCommandBuffer(); - bool AcquireNextDrawable(); + bool AcquireNextDrawable(bool mainWindow); void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader); void RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder); @@ -257,8 +263,8 @@ class MetalRenderer : public Renderer MTL::CommandQueue* m_commandQueue; // Pipelines - MTL::Library* m_utilityLibrary; - MTL::RenderPipelineState* m_presentPipeline; + MTL::RenderPipelineState* m_presentPipelineLinear; + MTL::RenderPipelineState* m_presentPipelineSRGB; // Hybrid pipelines class MetalHybridComputePipeline* m_copyTextureToTexturePipeline; From 0c73ff8452da2ce24b129ba199628e585ca66e1d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 12 Aug 2024 14:27:28 +0200 Subject: [PATCH 060/368] use multiple command buffers per frame --- .../Metal/LatteTextureReadbackMtl.cpp | 10 +- .../Renderer/Metal/LatteTextureReadbackMtl.h | 3 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 110 ++++++++++++------ .../HW/Latte/Renderer/Metal/MetalRenderer.h | 22 +++- 4 files changed, 105 insertions(+), 40 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index 608ff050d..ef1576642 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -23,18 +23,22 @@ void LatteTextureReadbackInfoMtl::StartTransfer() auto blitCommandEncoder = m_mtlr->GetBlitCommandEncoder(); blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage); + + m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); + // TODO: uncomment + //m_mtlr->RequestSoonCommit(); } bool LatteTextureReadbackInfoMtl::IsFinished() { - // TODO: 
implement - + // HACK: just return true for now, otherwise the game would freeze + //return m_mtlr->CommandBufferCompleted(m_commandBuffer); return true; } void LatteTextureReadbackInfoMtl::ForceFinish() { - // TODO: implement + m_mtlr->WaitForCommandBufferCompletion(m_commandBuffer); } uint8* LatteTextureReadbackInfoMtl::GetData() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h index a03bbd499..19ca6574a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h @@ -1,5 +1,6 @@ #pragma once +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h" class LatteTextureReadbackInfoMtl : public LatteTextureReadbackInfo @@ -18,5 +19,7 @@ class LatteTextureReadbackInfoMtl : public LatteTextureReadbackInfo private: class MetalRenderer* m_mtlr; + MTL::CommandBuffer* m_commandBuffer = nullptr; + uint32 m_bufferOffset = 0; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ba50f0725..ca74a1d8c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -15,9 +15,12 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" +#include "Common/precompiled.h" #include "Metal/MTLPixelFormat.hpp" #include "gui/guiWrapper.h" +#define COMMIT_TRESHOLD 256 + extern bool hasValidFramebufferAttached; float supportBufferData[512 * 4]; @@ -113,7 +116,7 @@ MetalRenderer::~MetalRenderer() m_device->release(); } -// TODO: don't ignore "mainWindow" argument +// TODO: don't ignore "mainWindow" argument and respect size void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { const auto& windowInfo = gui_getWindowInfo().window_main; @@ -168,19 +171,23 
@@ void MetalRenderer::DrawEmptyFrame(bool mainWindow) void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) { - EndEncoding(); if (m_drawable) { - EnsureCommandBuffer(); - m_commandBuffer->presentDrawable(m_drawable); - } else + auto commandBuffer = GetCommandBuffer(); + commandBuffer->presentDrawable(m_drawable); + } + else { debug_printf("skipped present!\n"); } m_drawable = nullptr; + // Release all the command buffers CommitCommandBuffer(); + for (uint32 i = 0; i < m_commandBuffers.size(); i++) + m_commandBuffers[i].m_commandBuffer->release(); + m_commandBuffers.clear(); // Reset temporary buffers m_memoryManager->ResetTemporaryBuffers(); @@ -223,18 +230,20 @@ bool MetalRenderer::BeginFrame(bool mainWindow) void MetalRenderer::Flush(bool waitIdle) { - // TODO: should we? - CommitCommandBuffer(); + // TODO: commit if commit on idle is requested + if (m_recordedDrawcalls > 0) + CommitCommandBuffer(); if (waitIdle) { - // TODO + // TODO: shouldn't we wait for all command buffers? + WaitForCommandBufferCompletion(GetCurrentCommandBuffer()); } } void MetalRenderer::NotifyLatteCommandProcessorIdle() { - // TODO: should we? 
- CommitCommandBuffer(); + // TODO: commit if commit on idle is requested + //CommitCommandBuffer(); } void MetalRenderer::AppendOverlayDebugInfo() @@ -452,7 +461,7 @@ LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { - EnsureCommandBuffer(); + GetCommandBuffer(); // scale copy size to effective size sint32 effectiveCopyWidth = width; @@ -809,11 +818,15 @@ void MetalRenderer::draw_endSequence() if (pixelShader) LatteRenderTarget_trackUpdates(); bool hasReadback = LatteTextureReadback_Update(); - //m_recordedDrawcalls++; - //if (m_recordedDrawcalls >= m_submitThreshold || hasReadback) - //{ - // SubmitCommandBuffer(); - //} + m_recordedDrawcalls++; + // The number of draw calls needs to twice as big, since we are interrupting the render pass + if (m_recordedDrawcalls >= COMMIT_TRESHOLD * 2 || hasReadback) + { + CommitCommandBuffer(); + + // TODO: where should this be called? 
+ LatteTextureReadback_UpdateFinishedTransfers(false); + } } void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) @@ -830,22 +843,38 @@ void MetalRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) // Do nothing, since the buffer has shared storage mode } -void MetalRenderer::EnsureCommandBuffer() +MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() { - if (!m_commandBuffer) + bool needsNewCommandBuffer = (m_commandBuffers.empty() || m_commandBuffers.back().m_commited); + if (needsNewCommandBuffer) { // Debug //m_commandQueue->insertDebugCaptureBoundary(); - m_commandBuffer = m_commandQueue->commandBuffer(); + MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer(); + m_commandBuffers.push_back({mtlCommandBuffer}); + + return mtlCommandBuffer; + } + else + { + return m_commandBuffers.back().m_commandBuffer; } } +bool MetalRenderer::CommandBufferCompleted(MTL::CommandBuffer* commandBuffer) +{ + return commandBuffer->status() == MTL::CommandBufferStatusCompleted; +} + +void MetalRenderer::WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer) +{ + commandBuffer->waitUntilCompleted(); +} + // Some render passes clear the attachments, forceRecreate is supposed to be used in those cases MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate, bool rebindStateIfNewEncoder) { - EnsureCommandBuffer(); - // Check if we need to begin a new render pass if (m_commandEncoder) { @@ -881,6 +910,8 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPas EndEncoding(); } + auto commandBuffer = GetCommandBuffer(); + // Update state m_state.m_lastUsedFBO = m_state.m_activeFBO; for (uint8 i = 0; i < 8; i++) @@ -889,7 +920,7 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPas } 
m_state.m_depthRenderTarget = depthRenderTarget; - auto renderCommandEncoder = m_commandBuffer->renderCommandEncoder(renderPassDescriptor); + auto renderCommandEncoder = commandBuffer->renderCommandEncoder(renderPassDescriptor); m_commandEncoder = renderCommandEncoder; m_encoderType = MetalEncoderType::Render; @@ -914,7 +945,9 @@ MTL::ComputeCommandEncoder* MetalRenderer::GetComputeCommandEncoder() EndEncoding(); } - auto computeCommandEncoder = m_commandBuffer->computeCommandEncoder(); + auto commandBuffer = GetCommandBuffer(); + + auto computeCommandEncoder = commandBuffer->computeCommandEncoder(); m_commandEncoder = computeCommandEncoder; m_encoderType = MetalEncoderType::Compute; @@ -933,7 +966,9 @@ MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder() EndEncoding(); } - auto blitCommandEncoder = m_commandBuffer->blitCommandEncoder(); + auto commandBuffer = GetCommandBuffer(); + + auto blitCommandEncoder = commandBuffer->blitCommandEncoder(); m_commandEncoder = blitCommandEncoder; m_encoderType = MetalEncoderType::Blit; @@ -942,30 +977,35 @@ MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder() void MetalRenderer::EndEncoding() { - if (m_commandEncoder) + if (m_encoderType != MetalEncoderType::None) { m_commandEncoder->endEncoding(); m_commandEncoder->release(); - m_commandEncoder = nullptr; m_encoderType = MetalEncoderType::None; + + // Commit the command buffer if enough draw calls have been recorded + if (m_recordedDrawcalls >= COMMIT_TRESHOLD) + CommitCommandBuffer(); } } void MetalRenderer::CommitCommandBuffer() { - EndEncoding(); + m_recordedDrawcalls = 0; - if (m_commandBuffer) + if (m_commandBuffers.size() != 0) { - m_commandBuffer->commit(); - m_commandBuffer->release(); - m_commandBuffer = nullptr; + EndEncoding(); - // TODO: where should this be called? 
- LatteTextureReadback_UpdateFinishedTransfers(false); + auto& commandBuffer = m_commandBuffers.back(); + if (!commandBuffer.m_commited) + { + commandBuffer.m_commandBuffer->commit(); + commandBuffer.m_commited = true; - // Debug - //m_commandQueue->insertDebugCaptureBoundary(); + // Debug + //m_commandQueue->insertDebugCaptureBoundary(); + } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 3868ae709..78e1dacaf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -7,6 +7,8 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" +#include "Common/precompiled.h" +#include "Metal/MTLCommandBuffer.hpp" #define MAX_MTL_BUFFERS 31 #define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 2) @@ -47,6 +49,12 @@ struct MetalState MTL::ScissorRect m_scissor = {0, 0, 0, 0}; }; +struct MetalCommandBuffer +{ + MTL::CommandBuffer* m_commandBuffer; + bool m_commited = false; +}; + enum class MetalEncoderType { None, @@ -231,7 +239,16 @@ class MetalRenderer : public Renderer } // Helpers - void EnsureCommandBuffer(); + MTL::CommandBuffer* GetCurrentCommandBuffer() + { + cemu_assert_debug(m_commandBuffers.size() != 0); + + return m_commandBuffers[m_commandBuffers.size() - 1].m_commandBuffer; + } + + MTL::CommandBuffer* GetCommandBuffer(); + bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer); + void WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer); MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate = false, bool rebindStateIfNewEncoder = true); MTL::ComputeCommandEncoder* GetComputeCommandEncoder(); MTL::BlitCommandEncoder* GetBlitCommandEncoder(); @@ -280,7 +297,8 @@ class MetalRenderer : public Renderer 
MTL::Buffer* m_xfbRingBuffer; // Active objects - MTL::CommandBuffer* m_commandBuffer = nullptr; + std::vector m_commandBuffers; + uint32 m_recordedDrawcalls = 0; MetalEncoderType m_encoderType = MetalEncoderType::None; MTL::CommandEncoder* m_commandEncoder = nullptr; CA::MetalDrawable* m_drawable = nullptr; From 3f52f3acfe5ba98e9aa8f0ee36cc3c03f022a52c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 13 Aug 2024 07:30:33 +0200 Subject: [PATCH 061/368] fix: cubemap sampling --- .../LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 5 ++--- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 9 +++------ src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp | 4 +++- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index e769064fc..06ca4ec47 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2357,11 +2357,10 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex { debugBreakpoint(); } - src->add("float4("); src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); src->addFmt(")"); - src->addFmt(",cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index } else if (texDim == Latte::E_DIM::DIM_1D) { @@ -2411,7 +2410,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, 
texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); src->add(")"); - src->addFmt(", cubeMapArrayIndex{}", texInstruction->textureFetch.textureIndex); // cubemap index + src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index } else if( texDim == Latte::E_DIM::DIM_1D ) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 5d5273caa..645973df5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -2,6 +2,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Common/precompiled.h" LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) @@ -42,13 +43,9 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM textureType = MTL::TextureType3D; break; case Latte::E_DIM::DIM_CUBEMAP: - if (effectiveBaseDepth % 6 != 0) - debug_printf("cubemaps must have an array length multiple of 6, length: %u\n", effectiveBaseDepth); + cemu_assert_debug(effectiveBaseDepth % 6 == 0 && "cubemaps must have an array length multiple of 6"); - if (effectiveBaseDepth <= 6) - textureType = MTL::TextureTypeCube; - else - textureType = MTL::TextureTypeCubeArray; + textureType = MTL::TextureTypeCubeArray; break; default: cemu_assert_unimplemented(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 4a6ceeb4f..7e13738a3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -91,7 +91,9 @@ MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) textureType = MTL::TextureType3D; break; case Latte::E_DIM::DIM_CUBEMAP: - textureType = MTL::TextureTypeCube; // TODO: check this + cemu_assert_debug(this->numSlice % 6 == 0 && "cubemaps must have an array length multiple of 6"); + + textureType = MTL::TextureTypeCubeArray; break; default: cemu_assert_unimplemented(); From 4b7c01ee2a0d6e42fb3bb03ec781f407535f3b23 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 13 Aug 2024 11:44:49 +0200 Subject: [PATCH 062/368] fix: 3D texture copies & fix: present filter --- src/Cafe/HW/Latte/Core/LatteIndices.cpp | 2 + src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp | 2 +- .../Renderer/Metal/LatteTextureViewMtl.cpp | 4 +- .../Renderer/Metal/LatteTextureViewMtl.h | 1 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 46 ++++++++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + 6 files changed, 38 insertions(+), 18 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index 75f0a5a2d..0f12356b5 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -699,6 +699,7 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 cemu_assert_debug(false); outputCount = count + 1; } + /* else if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN && g_renderer->GetType() == RendererAPI::Metal) { if (indexType == LatteIndexType::AUTO) @@ -722,6 +723,7 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 cemu_assert_debug(false); outputCount = count; } + */ else { if (indexType == LatteIndexType::U16_BE) diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp 
b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index f8a53b6d4..4f88a0a0c 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -934,7 +934,7 @@ void LatteRenderTarget_copyToBackbuffer(LatteTextureView* textureView, bool isPa { sint32 scaling_filter = downscaling ? GetConfig().downscale_filter : GetConfig().upscale_filter; - if (g_renderer->GetType() == RendererAPI::Vulkan) + if (g_renderer->GetType() == RendererAPI::Vulkan || g_renderer->GetType() == RendererAPI::Metal) { // force linear or nearest neighbor filter if(scaling_filter != kLinearFilter && scaling_filter != kNearestNeighborFilter) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 7e13738a3..adb77643f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -6,10 +6,12 @@ LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_baseTexture(texture) { + m_rgbaView = CreateSwizzledView(RGBA_SWIZZLE); } LatteTextureViewMtl::~LatteTextureViewMtl() { + m_rgbaView->release(); for (sint32 i = 0; i < std::size(m_viewCache); i++) { if (m_viewCache[i].key != INVALID_SWIZZLE) @@ -30,7 +32,7 @@ MTL::Texture* LatteTextureViewMtl::GetSwizzledView(uint32 gpuSamplerSwizzle) // RGBA swizzle == no swizzle if (gpuSamplerSwizzle == RGBA_SWIZZLE) { - return m_baseTexture->GetTexture(); + return m_rgbaView; } // First, try to find a view in the cache diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h index fc05de5f5..2634735ef 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h @@ -26,6 +26,7 @@ class LatteTextureViewMtl : public LatteTextureView class LatteTextureMtl* m_baseTexture; + MTL::Texture* m_rgbaView; struct { uint32 key; MTL::Texture* texture; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ca74a1d8c..dc33dc6a7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,7 +16,6 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" -#include "Metal/MTLPixelFormat.hpp" #include "gui/guiWrapper.h" #define COMMIT_TRESHOLD 256 @@ -32,6 +31,10 @@ MetalRenderer::MetalRenderer() MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); m_nearestSampler = m_device->newSamplerState(samplerDescriptor); + + samplerDescriptor->setMinFilter(MTL::SamplerMinMagFilterLinear); + samplerDescriptor->setMagFilter(MTL::SamplerMinMagFilterLinear); + m_linearSampler = m_device->newSamplerState(samplerDescriptor); samplerDescriptor->release(); m_memoryManager = new MetalMemoryManager(this); @@ -109,6 +112,7 @@ MetalRenderer::~MetalRenderer() delete m_memoryManager; m_nearestSampler->release(); + m_linearSampler->release(); m_readbackBuffer->release(); @@ -200,25 +204,26 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput if (!AcquireNextDrawable(!padView)) return; - if (clearBackground) - ClearColorbuffer(padView); - MTL::Texture* presentTexture = static_cast(texView)->GetRGBAView(); // Create render pass MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); - renderPassDescriptor->colorAttachments()->object(0)->setTexture(m_drawable->texture()); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + 
colorAttachment->setTexture(m_drawable->texture()); + // TODO: shouldn't it be LoadActionLoad when not clearing? + colorAttachment->setLoadAction(clearBackground ? MTL::LoadActionClear : MTL::LoadActionDontCare); + colorAttachment->setStoreAction(MTL::StoreActionStore); MTL::Texture* colorRenderTargets[8] = {nullptr}; colorRenderTargets[0] = m_drawable->texture(); // If there was already an encoder with these attachment, we should set the viewport and scissor to default, but that shouldn't happen - auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, false, false); + auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, clearBackground, false); renderPassDescriptor->release(); // Draw to Metal layer renderCommandEncoder->setRenderPipelineState(m_state.m_usesSRGB ? m_presentPipelineSRGB : m_presentPipelineLinear); renderCommandEncoder->setFragmentTexture(presentTexture, 0); - renderCommandEncoder->setFragmentSamplerState(m_nearestSampler, 0); + renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? 
m_linearSampler : m_nearestSampler), 0); renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } @@ -314,11 +319,19 @@ void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIn void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { - auto mtlTexture = (LatteTextureMtl*)hostTexture; + auto textureMtl = (LatteTextureMtl*)hostTexture; + + uint32 offsetZ = 0; + if (textureMtl->Is3DTexture()) + { + offsetZ = sliceIndex; + sliceIndex = 0; + } - size_t bytesPerRow = GetMtlTextureBytesPerRow(mtlTexture->GetFormat(), mtlTexture->IsDepth(), width); - size_t bytesPerImage = GetMtlTextureBytesPerImage(mtlTexture->GetFormat(), mtlTexture->IsDepth(), height, bytesPerRow); - mtlTexture->GetTexture()->replaceRegion(MTL::Region(0, 0, width, height), mipIndex, sliceIndex, pixelData, bytesPerRow, bytesPerImage); + size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->GetFormat(), textureMtl->IsDepth(), width); + // No need to calculate bytesPerImage for 3D textures, since we always load just one slice + //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->IsDepth(), height, bytesPerRow); + textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); } void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) @@ -409,11 +422,12 @@ void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, s // If copying whole textures, we can do a more efficient copy if (effectiveSrcX == 0 && effectiveSrcY == 0 && effectiveDstX == 0 && effectiveDstY == 0 && - effectiveCopyWidth == src->GetMipWidth(srcMip) && effectiveCopyHeight == src->GetMipHeight(srcMip) && - effectiveCopyWidth == 
dst->GetMipWidth(dstMip) && effectiveCopyHeight == dst->GetMipHeight(dstMip) && + srcOffsetZ == 0 && dstOffsetZ == 0 && + effectiveCopyWidth == src->GetMipWidth(srcMip) && effectiveCopyHeight == src->GetMipHeight(srcMip) && srcDepth == src->GetMipDepth(srcMip) && + effectiveCopyWidth == dst->GetMipWidth(dstMip) && effectiveCopyHeight == dst->GetMipHeight(dstMip) && dstDepth == dst->GetMipDepth(dstMip) && srcLayerCount == dstLayerCount) { - blitCommandEncoder->copyFromTexture(mtlSrc, srcSlice, srcMip, mtlDst, dstSlice, dstMip, srcLayerCount, 1); + blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer, srcMip, mtlDst, dstBaseLayer, dstMip, srcLayerCount, 1); } else { @@ -421,7 +435,7 @@ void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, s { for (uint32 i = 0; i < srcLayerCount; i++) { - blitCommandEncoder->copyFromTexture(mtlSrc, srcSlice + i, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, 1), mtlDst, dstSlice + i, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ)); + blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer + i, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, srcDepth), mtlDst, dstBaseLayer + i, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ)); } } else @@ -438,7 +452,7 @@ void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, s else dstSlice++; - blitCommandEncoder->copyFromTexture(mtlSrc, srcSlice, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, 1), mtlDst, dstSlice, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ)); + blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, 1), mtlDst, dstBaseLayer, dstMip, MTL::Origin(effectiveDstX, 
effectiveDstY, dstOffsetZ)); } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 78e1dacaf..fd3ca61fe 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -288,6 +288,7 @@ class MetalRenderer : public Renderer // Basic MTL::SamplerState* m_nearestSampler; + MTL::SamplerState* m_linearSampler; // Texture readback MTL::Buffer* m_readbackBuffer; From 5e9537c777aff906a88487e6d3626a3e48f278f0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 13 Aug 2024 16:40:25 +0200 Subject: [PATCH 063/368] fix: render pass mess --- .../Renderer/Metal/MetalPipelineCache.cpp | 10 +++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 76 +++++++++---------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 7 +- 3 files changed, 50 insertions(+), 43 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 289c1a609..51642c9b1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -169,6 +169,16 @@ uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchSh { // Hash uint64 stateHash = 0; + for (int i = 0; i < Latte::GPU_LIMITS::NUM_COLOR_ATTACHMENTS; ++i) + { + auto textureView = static_cast(activeFBO->colorBuffer[i].texture); + if (!textureView) + continue; + + stateHash += textureView->GetRGBAView()->pixelFormat() + i * 31; + stateHash = std::rotl(stateHash, 7); + } + for (auto& group : fetchShader->bufferGroups) { uint32 bufferStride = group.getCurrentBufferStride(lcr.GetRawView()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index dc33dc6a7..80246aab7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ 
#include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" +#include "Metal/MTLRenderPass.hpp" #include "gui/guiWrapper.h" #define COMMIT_TRESHOLD 256 @@ -214,10 +215,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput colorAttachment->setLoadAction(clearBackground ? MTL::LoadActionClear : MTL::LoadActionDontCare); colorAttachment->setStoreAction(MTL::StoreActionStore); - MTL::Texture* colorRenderTargets[8] = {nullptr}; - colorRenderTargets[0] = m_drawable->texture(); - // If there was already an encoder with these attachment, we should set the viewport and scissor to default, but that shouldn't happen - auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, clearBackground, false); + auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor); renderPassDescriptor->release(); // Draw to Metal layer @@ -226,6 +224,8 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? 
m_linearSampler : m_nearestSampler), 0); renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); + + EndEncoding(); } bool MetalRenderer::BeginFrame(bool mainWindow) @@ -367,9 +367,9 @@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl stencilAttachment->setLevel(mipIndex); } - MTL::Texture* colorRenderTargets[8] = {nullptr}; - GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, mtlTexture, true); + GetTemporaryRenderCommandEncoder(renderPassDescriptor); renderPassDescriptor->release(); + EndEncoding(); } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) @@ -676,23 +676,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 return; } - auto renderPassDescriptor = m_state.m_activeFBO->GetRenderPassDescriptor(); - MTL::Texture* colorRenderTargets[8] = {nullptr}; - MTL::Texture* depthRenderTarget = nullptr; - for (uint32 i = 0; i < 8; i++) - { - auto colorTexture = static_cast(m_state.m_activeFBO->colorBuffer[i].texture); - if (colorTexture) - { - colorRenderTargets[i] = colorTexture->GetRGBAView(); - } - } - auto depthTexture = static_cast(m_state.m_activeFBO->depthBuffer.texture); - if (depthTexture) - { - depthRenderTarget = depthTexture->GetRGBAView(); - } - auto renderCommandEncoder = GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, depthRenderTarget); + auto renderCommandEncoder = GetRenderCommandEncoder(); // Shaders LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); @@ -705,7 +689,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Render pipeline state - MTL::RenderPipelineState* 
renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew); + // TODO: use `m_lastUsedFBO` instead of `m_activeFBO` + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_activeFBO, LatteGPUState.contextNew); renderCommandEncoder->setRenderPipelineState(renderPipelineState); // Depth stencil state @@ -886,8 +871,21 @@ void MetalRenderer::WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBu commandBuffer->waitUntilCompleted(); } +MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor) +{ + EndEncoding(); + + auto commandBuffer = GetCommandBuffer(); + + auto renderCommandEncoder = commandBuffer->renderCommandEncoder(renderPassDescriptor); + m_commandEncoder = renderCommandEncoder; + m_encoderType = MetalEncoderType::Render; + + return renderCommandEncoder; +} + // Some render passes clear the attachments, forceRecreate is supposed to be used in those cases -MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate, bool rebindStateIfNewEncoder) +MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecreate, bool rebindStateIfNewEncoder) { // Check if we need to begin a new render pass if (m_commandEncoder) @@ -896,19 +894,22 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPas { if (m_encoderType == MetalEncoderType::Render) { - bool needsNewRenderPass = false; - for (uint8 i = 0; i < 8; i++) + bool needsNewRenderPass = (m_state.m_lastUsedFBO == nullptr); + if (!needsNewRenderPass) { - if (colorRenderTargets[i] && (colorRenderTargets[i] != m_state.m_colorRenderTargets[i])) + for (uint8 i = 0; i < 8; i++) { - needsNewRenderPass = true; 
- break; + if (m_state.m_activeFBO->colorBuffer[i].texture && m_state.m_activeFBO->colorBuffer[i].texture != m_state.m_lastUsedFBO->colorBuffer[i].texture) + { + needsNewRenderPass = true; + break; + } } } if (!needsNewRenderPass) { - if (depthRenderTarget && (depthRenderTarget != m_state.m_depthRenderTarget)) + if (m_state.m_activeFBO->depthBuffer.texture && m_state.m_activeFBO->depthBuffer.texture != m_state.m_lastUsedFBO->depthBuffer.texture) { needsNewRenderPass = true; } @@ -928,13 +929,8 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(MTL::RenderPas // Update state m_state.m_lastUsedFBO = m_state.m_activeFBO; - for (uint8 i = 0; i < 8; i++) - { - m_state.m_colorRenderTargets[i] = colorRenderTargets[i]; - } - m_state.m_depthRenderTarget = depthRenderTarget; - auto renderCommandEncoder = commandBuffer->renderCommandEncoder(renderPassDescriptor); + auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO->GetRenderPassDescriptor()); m_commandEncoder = renderCommandEncoder; m_encoderType = MetalEncoderType::Render; @@ -991,10 +987,11 @@ MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder() void MetalRenderer::EndEncoding() { - if (m_encoderType != MetalEncoderType::None) + if (m_commandEncoder) { m_commandEncoder->endEncoding(); m_commandEncoder->release(); + m_commandEncoder = nullptr; m_encoderType = MetalEncoderType::None; // Commit the command buffer if enough draw calls have been recorded @@ -1427,6 +1424,7 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s MTL::Texture* colorRenderTargets[8] = {nullptr}; colorRenderTargets[0] = mtlTexture; - GetRenderCommandEncoder(renderPassDescriptor, colorRenderTargets, nullptr, true); + GetTemporaryRenderCommandEncoder(renderPassDescriptor); renderPassDescriptor->release(); + EndEncoding(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index fd3ca61fe..067788ff1 
100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -9,6 +9,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Common/precompiled.h" #include "Metal/MTLCommandBuffer.hpp" +#include "Metal/MTLRenderPass.hpp" #define MAX_MTL_BUFFERS 31 #define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 2) @@ -42,9 +43,6 @@ struct MetalState class LatteTextureViewMtl* m_textures[64] = {nullptr}; size_t m_uniformBufferOffsets[(uint32)LatteConst::ShaderType::TotalCount][MAX_MTL_BUFFERS]; - MTL::Texture* m_colorRenderTargets[8] = {nullptr}; - MTL::Texture* m_depthRenderTarget = nullptr; - MTL::Viewport m_viewport = {0, 0, 0, 0, 0, 0}; MTL::ScissorRect m_scissor = {0, 0, 0, 0}; }; @@ -249,7 +247,8 @@ class MetalRenderer : public Renderer MTL::CommandBuffer* GetCommandBuffer(); bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer); void WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer); - MTL::RenderCommandEncoder* GetRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* colorRenderTargets[8], MTL::Texture* depthRenderTarget, bool forceRecreate = false, bool rebindStateIfNewEncoder = true); + MTL::RenderCommandEncoder* GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor); + MTL::RenderCommandEncoder* GetRenderCommandEncoder(bool forceRecreate = false, bool rebindStateIfNewEncoder = true); MTL::ComputeCommandEncoder* GetComputeCommandEncoder(); MTL::BlitCommandEncoder* GetBlitCommandEncoder(); void EndEncoding(); From 27925a4fd9f13394bc3a92eb51d54354f621e1ea Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 13 Aug 2024 18:28:10 +0200 Subject: [PATCH 064/368] do vertex buffer restride in a void vertex function --- .../Renderer/Metal/MetalMemoryManager.cpp | 61 ++++++++++++++----- .../Latte/Renderer/Metal/MetalMemoryManager.h | 14 ++++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 18 +++--- 
.../HW/Latte/Renderer/Metal/MetalRenderer.h | 12 ++++ .../Renderer/Metal/UtilityShaderSource.h | 14 ++++- 5 files changed, 96 insertions(+), 23 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index aef458a7c..ef6871a7e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,7 +1,10 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Metal/MTLResource.hpp" +#include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" +#include "Common/precompiled.h" +#include "Foundation/NSRange.hpp" +#include "Metal/MTLRenderCommandEncoder.hpp" const size_t BUFFER_ALLOCATION_SIZE = 8 * 1024 * 1024; @@ -93,21 +96,51 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu if (restrideInfo.memoryInvalidated || stride != restrideInfo.lastStride) { - // TODO: use compute/void vertex function instead size_t newStride = Align(stride, 4); size_t newSize = vertexBufferRange.size / stride * newStride; - // TODO: use one big buffer for all restrided buffers - restrideInfo.buffer = m_mtlr->GetDevice()->newBuffer(newSize, MTL::StorageModeShared); - - uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; - uint8* newPtr = (uint8*)restrideInfo.buffer->contents(); - - for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) - { - memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); - } - // TODO: remove - debug_printf("Restrided vertex buffer (old stride: %zu, new stride: %zu, old size: %zu, new size: %zu)\n", stride, newStride, vertexBufferRange.size, newSize); + if (!restrideInfo.buffer || newSize != restrideInfo.buffer->length()) + { + if (restrideInfo.buffer) + restrideInfo.buffer->release(); + // TODO: use 
one big buffer for all restrided buffers + restrideInfo.buffer = m_mtlr->GetDevice()->newBuffer(newSize, MTL::StorageModeShared); + } + + //uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; + //uint8* newPtr = (uint8*)restrideInfo.buffer->contents(); + + //for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) + //{ + // memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); + //} + //debug_printf("Restrided vertex buffer (old stride: %zu, new stride: %zu, old size: %zu, new size: %zu)\n", stride, newStride, vertexBufferRange.size, newSize); + + if (m_mtlr->GetEncoderType() == MetalEncoderType::Render) + { + auto renderCommandEncoder = static_cast(m_mtlr->GetCommandEncoder()); + + renderCommandEncoder->setRenderPipelineState(m_restrideBufferPipeline->GetRenderPipelineState()); + MTL::Buffer* buffers[] = {bufferCache, restrideInfo.buffer}; + size_t offsets[] = {vertexBufferRange.offset, 0}; + renderCommandEncoder->setVertexBuffers(buffers, offsets, NS::Range(0, 2)); + + struct + { + uint32 oldStride; + uint32 newStride; + } strideData = {static_cast(stride), static_cast(newStride)}; + renderCommandEncoder->setVertexBytes(&strideData, sizeof(strideData), 2); + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), vertexBufferRange.size / stride); + + MTL::Resource* barrierBuffers[] = {restrideInfo.buffer}; + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex); + } + else + { + debug_printf("vertex buffer restride needs an active render encoder\n"); + cemu_assert_suspicious(); + } restrideInfo.memoryInvalidated = false; restrideInfo.lastStride = newStride; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 4f875687d..0fc55936a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -80,7 
+80,11 @@ class MetalVertexBufferCache MetalVertexBufferCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalVertexBufferCache(); - // Vertex buffer cache + void SetRestrideBufferPipeline(class MetalHybridComputePipeline* restrideBufferPipeline) + { + m_restrideBufferPipeline = restrideBufferPipeline; + } + void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo* restrideInfo) { m_bufferRanges[bufferIndex] = MetalVertexBufferRange{offset, size, restrideInfo}; @@ -101,6 +105,8 @@ class MetalVertexBufferCache private: class MetalRenderer* m_mtlr; + class MetalHybridComputePipeline* m_restrideBufferPipeline = nullptr; + MetalVertexBufferRange m_bufferRanges[LATTE_MAX_VERTEX_BUFFERS] = {}; void MemoryRangeChanged(size_t offset, size_t size); @@ -112,6 +118,12 @@ class MetalMemoryManager MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer) {} ~MetalMemoryManager(); + // Pipelines + void SetRestrideBufferPipeline(class MetalHybridComputePipeline* restrideBufferPipeline) + { + m_vertexBufferCache.SetRestrideBufferPipeline(restrideBufferPipeline); + } + void ResetTemporaryBuffers() { m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.ResetTemporaryBuffers(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 80246aab7..3c1b59e5c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -62,8 +62,9 @@ MetalRenderer::MetalRenderer() MTL::Library* utilityLibrary = m_device->newLibrary(NS::String::string(utilityShaderSource, NS::ASCIIStringEncoding), nullptr, &error); if (error) { - debug_printf("failed to create present library (error: %s)\n", error->localizedDescription()->utf8String()); + debug_printf("failed to create utility library (error: %s)\n", 
error->localizedDescription()->utf8String()); error->release(); + throw; return; } @@ -98,12 +99,16 @@ MetalRenderer::MetalRenderer() // Hybrid pipelines m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture", "kernelCopyTextureToTexture"); + m_restrideBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexRestrideBuffer", "kernelRestrideBuffer"); utilityLibrary->release(); + + m_memoryManager->SetRestrideBufferPipeline(m_restrideBufferPipeline); } MetalRenderer::~MetalRenderer() { delete m_copyTextureToTexturePipeline; + delete m_restrideBufferPipeline; m_presentPipelineLinear->release(); m_presentPipelineSRGB->release(); @@ -688,11 +693,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } const auto fetchShader = LatteSHRC_GetActiveFetchShader(); - // Render pipeline state - // TODO: use `m_lastUsedFBO` instead of `m_activeFBO` - MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_activeFBO, LatteGPUState.contextNew); - renderCommandEncoder->setRenderPipelineState(renderPipelineState); - // Depth stencil state MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); renderCommandEncoder->setDepthStencilState(depthStencilState); @@ -794,6 +794,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } } + // Render pipeline state + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_activeFBO, LatteGPUState.contextNew); + renderCommandEncoder->setRenderPipelineState(renderPipelineState); + // Uniform buffers, textures and samplers BindStageResources(renderCommandEncoder, vertexShader); BindStageResources(renderCommandEncoder, pixelShader); @@ -1308,7 +1312,7 @@ void 
MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { LatteMRT::GetCurrentFragCoordScale(GET_UNIFORM_DATA_PTR(shader->uniform.loc_fragCoordScale)); } - // TODO: uncomment? + // TODO: uncomment /* if (shader->uniform.loc_verticesPerInstance >= 0) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 067788ff1..5e86d7d9f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -9,6 +9,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Common/precompiled.h" #include "Metal/MTLCommandBuffer.hpp" +#include "Metal/MTLCommandEncoder.hpp" #include "Metal/MTLRenderPass.hpp" #define MAX_MTL_BUFFERS 31 @@ -244,6 +245,16 @@ class MetalRenderer : public Renderer return m_commandBuffers[m_commandBuffers.size() - 1].m_commandBuffer; } + MTL::CommandEncoder* GetCommandEncoder() + { + return m_commandEncoder; + } + + MetalEncoderType GetEncoderType() + { + return m_encoderType; + } + MTL::CommandBuffer* GetCommandBuffer(); bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer); void WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer); @@ -284,6 +295,7 @@ class MetalRenderer : public Renderer // Hybrid pipelines class MetalHybridComputePipeline* m_copyTextureToTexturePipeline; + class MetalHybridComputePipeline* m_restrideBufferPipeline; // Basic MTL::SamplerState* m_nearestSampler; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index 2e49fa951..3bc2ff756 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -30,8 +30,20 @@ inline const char* utilityShaderSource = \ " uint dstSlice;\n" \ "};\n" \ "\n" \ -"vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(0)]], texture2d_array dst 
[[texture(1)]], constant CopyParams& params) {\n" \ +"vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(0)]], texture2d_array dst [[texture(1)]], constant CopyParams& params [[buffer(0)]]) {\n" \ " uint2 coord = uint2(vid % params.width, vid / params.width);\n" \ " return dst.write(float4(src.read(coord, params.srcSlice, params.srcMip).r, 0.0, 0.0, 0.0), coord, params.dstSlice, params.dstMip);\n" \ "}\n" \ +"\n" \ +"struct RestrideParams {\n" \ +" uint oldStride;\n" \ +" uint newStride;\n" \ +"};\n" \ +"\n" \ +/* TODO: use uint32? Since that would require less iterations */ \ +"vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(0)]], device uint8_t* dst [[buffer(1)]], constant RestrideParams& params [[buffer(2)]]) {\n" \ +" for (uint32_t i = 0; i < params.oldStride; i++) {\n" \ +" dst[vid * params.newStride + i] = src[vid * params.oldStride + i];\n" \ +" }\n" \ +"}\n" \ "\n"; From bba2bbcefbbc2c2bda29c27045da3040f5bba5cf Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 13 Aug 2024 19:00:39 +0200 Subject: [PATCH 065/368] use buffer allocator for restrided vertex buffers --- .../Renderer/Metal/MetalMemoryManager.cpp | 29 +++++-------------- .../Latte/Renderer/Metal/MetalMemoryManager.h | 18 +++++++----- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 3 files changed, 18 insertions(+), 31 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index ef6871a7e..d14e9678a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -70,17 +70,6 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, siz MetalVertexBufferCache::~MetalVertexBufferCache() { - for (uint32 i = 0; i < LATTE_MAX_VERTEX_BUFFERS; i++) - { - auto vertexBufferRange = m_bufferRanges[i]; - if (vertexBufferRange.offset != INVALID_OFFSET) - 
{ - if (vertexBufferRange.restrideInfo->buffer) - { - vertexBufferRange.restrideInfo->buffer->release(); - } - } - } } MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride) @@ -94,17 +83,12 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu return {bufferCache, vertexBufferRange.offset}; } + auto buffer = m_bufferAllocator->GetBuffer(restrideInfo.allocation.bufferIndex); if (restrideInfo.memoryInvalidated || stride != restrideInfo.lastStride) { size_t newStride = Align(stride, 4); size_t newSize = vertexBufferRange.size / stride * newStride; - if (!restrideInfo.buffer || newSize != restrideInfo.buffer->length()) - { - if (restrideInfo.buffer) - restrideInfo.buffer->release(); - // TODO: use one big buffer for all restrided buffers - restrideInfo.buffer = m_mtlr->GetDevice()->newBuffer(newSize, MTL::StorageModeShared); - } + restrideInfo.allocation = m_bufferAllocator->GetBufferAllocation(newSize, 4); //uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; //uint8* newPtr = (uint8*)restrideInfo.buffer->contents(); @@ -120,8 +104,8 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu auto renderCommandEncoder = static_cast(m_mtlr->GetCommandEncoder()); renderCommandEncoder->setRenderPipelineState(m_restrideBufferPipeline->GetRenderPipelineState()); - MTL::Buffer* buffers[] = {bufferCache, restrideInfo.buffer}; - size_t offsets[] = {vertexBufferRange.offset, 0}; + MTL::Buffer* buffers[] = {bufferCache, buffer}; + size_t offsets[] = {vertexBufferRange.offset, restrideInfo.allocation.bufferOffset}; renderCommandEncoder->setVertexBuffers(buffers, offsets, NS::Range(0, 2)); struct @@ -133,7 +117,8 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), vertexBufferRange.size / stride); - MTL::Resource* 
barrierBuffers[] = {restrideInfo.buffer}; + // TODO: restride in one call? + MTL::Resource* barrierBuffers[] = {buffer}; renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex); } else @@ -146,7 +131,7 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu restrideInfo.lastStride = newStride; } - return {restrideInfo.buffer, 0}; + return {buffer, restrideInfo.allocation.bufferOffset}; } void MetalVertexBufferCache::MemoryRangeChanged(size_t offset, size_t size) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 0fc55936a..d3588fd56 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -10,7 +10,12 @@ struct MetalBufferAllocation { void* data; uint32 bufferIndex; - size_t bufferOffset; + size_t bufferOffset = INVALID_OFFSET; + + bool IsValid() const + { + return bufferOffset != INVALID_OFFSET; + } }; struct MetalBufferRange @@ -62,7 +67,7 @@ struct MetalRestrideInfo { bool memoryInvalidated = true; size_t lastStride = 0; - MTL::Buffer* buffer = nullptr; + MetalBufferAllocation allocation{}; }; struct MetalVertexBufferRange @@ -77,7 +82,7 @@ class MetalVertexBufferCache public: friend class MetalMemoryManager; - MetalVertexBufferCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + MetalVertexBufferCache(class MetalRenderer* metalRenderer, MetalBufferAllocator* bufferAllocator) : m_mtlr{metalRenderer}, m_bufferAllocator{bufferAllocator} {} ~MetalVertexBufferCache(); void SetRestrideBufferPipeline(class MetalHybridComputePipeline* restrideBufferPipeline) @@ -93,10 +98,6 @@ class MetalVertexBufferCache void UntrackVertexBuffer(uint32 bufferIndex) { auto& range = m_bufferRanges[bufferIndex]; - if (range.restrideInfo->buffer) - { - range.restrideInfo->buffer->release(); - } range.offset = INVALID_OFFSET; } @@ -104,6 +105,7 @@ 
class MetalVertexBufferCache private: class MetalRenderer* m_mtlr; + MetalBufferAllocator* m_bufferAllocator; class MetalHybridComputePipeline* m_restrideBufferPipeline = nullptr; @@ -115,7 +117,7 @@ class MetalVertexBufferCache class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, &m_bufferAllocator) {} ~MetalMemoryManager(); // Pipelines diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 3c1b59e5c..e55be3e2f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -795,7 +795,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } // Render pipeline state - MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_activeFBO, LatteGPUState.contextNew); + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew); renderCommandEncoder->setRenderPipelineState(renderPipelineState); // Uniform buffers, textures and samplers From 13834ca9cb29e9ed878764b4e06067d7077a0286 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 14 Aug 2024 07:59:22 +0200 Subject: [PATCH 066/368] correct comment --- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index d14e9678a..d48a91233 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -117,7 +117,7 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), vertexBufferRange.size / stride); - // TODO: restride in one call? + // TODO: do the barrier in one call? MTL::Resource* barrierBuffers[] = {buffer}; renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex); } From 53efb9e5b2bc5c4a34b4cdbd36e058cc1d7f2dda Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 14 Aug 2024 11:57:01 +0200 Subject: [PATCH 067/368] prepare for shader cache --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 13 ++-- .../Renderer/Metal/RendererShaderMtl.cpp | 68 +++++++++++++++++-- .../Latte/Renderer/Metal/RendererShaderMtl.h | 4 ++ 3 files changed, 74 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index e55be3e2f..b94f73bdf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -665,22 +665,21 @@ void MetalRenderer::draw_beginSequence() if (!rasterizerEnable == false) m_state.m_skipDrawSequence = true; + + // TODO: is this even needed? 
+ if (!m_state.m_activeFBO) + m_state.m_skipDrawSequence = true; } void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { - //if (m_state.skipDrawSequence) + // TODO: uncomment + //if (m_state.m_skipDrawSequence) //{ // return; //} // Render pass - if (!m_state.m_activeFBO) - { - debug_printf("no active FBO, skipping draw\n"); - return; - } - auto renderCommandEncoder = GetRenderCommandEncoder(); // Shaders diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 183340521..08d036a08 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -2,22 +2,40 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cemu/FileCache/FileCache.h" +#include "config/ActiveSettings.h" #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" +bool s_isLoadingShadersMtl{ false }; +class FileCache* s_mslCache{nullptr}; + +extern std::atomic_int g_compiled_shaders_total; +extern std::atomic_int g_compiled_shaders_async; + RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer} { - // Fragment functions are compiled just-in-time + if (LoadBinary()) + return; + if (m_type == ShaderType::kFragment) { + // Fragment functions are compiled just-in-time m_mslCode = mslCode; } else { Compile(mslCode); } + + // Store the compiled shader in the cache + StoreBinary(); + + // Count shader compilation + if (!s_isLoadingShadersMtl) + 
g_compiled_shaders_total++; } RendererShaderMtl::~RendererShaderMtl() @@ -69,17 +87,29 @@ void RendererShaderMtl::CompileFragmentFunction(CachedFBOMtl* activeFBO) void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) { - cemuLog_log(LogType::MetalLogging, "RendererShaderMtl::ShaderCacheLoading_begin not implemented!"); + if (s_mslCache) + { + delete s_mslCache; + } + uint32 spirvCacheMagic = GeneratePrecompiledCacheId(); + const std::string cacheFilename = fmt::format("{:016x}_msl.bin", cacheTitleId); + const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}", cacheFilename); + s_mslCache = FileCache::Open(cachePath, true, spirvCacheMagic); + if (!s_mslCache) + cemuLog_log(LogType::Force, "Unable to open MSL cache {}", cacheFilename); + s_isLoadingShadersMtl = true; } void RendererShaderMtl::ShaderCacheLoading_end() { - cemuLog_log(LogType::MetalLogging, "RendererShaderMtl::ShaderCacheLoading_end not implemented!"); + s_isLoadingShadersMtl = false; } void RendererShaderMtl::ShaderCacheLoading_Close() { - cemuLog_log(LogType::MetalLogging, "RendererShaderMtl::ShaderCacheLoading_Close not implemented!"); + delete s_mslCache; + g_compiled_shaders_total = 0; + g_compiled_shaders_async = 0; } void RendererShaderMtl::Compile(const std::string& mslCode) @@ -95,3 +125,33 @@ void RendererShaderMtl::Compile(const std::string& mslCode) m_function = library->newFunction(NS::String::string("main0", NS::ASCIIStringEncoding)); library->release(); } + +bool RendererShaderMtl::LoadBinary() +{ + // HACK: since fragment functions are compiled just-in-time, we cannot load them from the cache + if (m_type == ShaderType::kFragment) + return false; + + uint64 h1, h2; + GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); + if (!s_mslCache->GetFile({h1, h2 }, m_binary)) + return false; + + // TODO: implement + return false; + + return true; +} + +void RendererShaderMtl::StoreBinary() +{ + if (m_binary.size() == 0) + { + 
// TODO: retrieve the binary from the function + return; + } + + uint64 h1, h2; + GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); + s_mslCache->AddFileAsync({h1, h2 }, m_binary.data(), m_binary.size()); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index f70db1bda..f788c1452 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -58,7 +58,11 @@ class RendererShaderMtl : public RendererShader MTL::Function* m_function = nullptr; + std::vector m_binary; std::string m_mslCode; void Compile(const std::string& mslCode); + + bool LoadBinary(); + void StoreBinary(); }; From 3c1a7479c2d3e6633ccf931ee5707b392d209b2a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 14 Aug 2024 15:16:10 +0200 Subject: [PATCH 068/368] save pipelines to a binary archive --- src/Cafe/HW/Latte/Core/LatteBufferCache.cpp | 11 +- src/Cafe/HW/Latte/Core/LatteShaderCache.cpp | 8 +- .../Renderer/Metal/MetalPipelineCache.cpp | 131 +++++++++++++++++- .../Latte/Renderer/Metal/MetalPipelineCache.h | 13 ++ .../Renderer/Metal/RendererShaderMtl.cpp | 73 +--------- .../Latte/Renderer/Metal/RendererShaderMtl.h | 7 - 6 files changed, 150 insertions(+), 93 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp index 716312a39..174a86846 100644 --- a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp @@ -290,7 +290,8 @@ class BufferCacheNode { if (m_hasCacheAlloc) { - cemu_assert_debug(isInUse() == false); + // HACK + //cemu_assert_debug(isInUse() == false); g_gpuBufferHeap->freeOffset(m_cacheOffset); m_hasCacheAlloc = false; } @@ -441,7 +442,7 @@ class BufferCacheNode if (uploadBegin >= uploadEnd) return; // reserve range not within invalidation or range is zero sized - + if (uploadBegin == m_invalidationRangeBegin) { 
m_invalidationRangeBegin = uploadEnd; @@ -536,7 +537,7 @@ class BufferCacheNode MPTR m_invalidationRangeBegin; MPTR m_invalidationRangeEnd; - BufferCacheNode(MPTR rangeBegin, MPTR rangeEnd): m_rangeBegin(rangeBegin), m_rangeEnd(rangeEnd) + BufferCacheNode(MPTR rangeBegin, MPTR rangeEnd): m_rangeBegin(rangeBegin), m_rangeEnd(rangeEnd) { flagInUse(); cemu_assert_debug(rangeBegin < rangeEnd); @@ -740,7 +741,7 @@ class BufferCacheNode cemu_assert_debug(rangeEnd <= pageRangeEnd); cemu_assert_debug((rangeBegin & 0xF) == 0); cemu_assert_debug((rangeEnd & 0xF) == 0); - + auto pageInfo = m_pageInfo.data() + pageIndex; pageInfo->hasStreamoutData = true; @@ -805,7 +806,7 @@ class BufferCacheNode s_allCacheNodes.clear(); g_deallocateQueue.clear(); } - + static void ProcessDeallocations() { for(auto& itr : g_deallocateQueue) diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index 98d970f6d..1ba50dec5 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -11,7 +11,7 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/OpenGL/RendererShaderGL.h" #include "Cafe/HW/Latte/Renderer/Vulkan/RendererShaderVk.h" -#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanPipelineStableCache.h" #include @@ -163,7 +163,7 @@ void LatteShaderCache_finish() else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_end(); else if (g_renderer->GetType() == RendererAPI::Metal) - RendererShaderMtl::ShaderCacheLoading_end(); + MetalPipelineCache::ShaderCacheLoading_end(); } uint32 LatteShaderCache_getShaderCacheExtraVersion(uint64 titleId) @@ -247,7 +247,7 @@ void LatteShaderCache_Load() else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_begin(cacheTitleId); else if 
(g_renderer->GetType() == RendererAPI::Metal) - RendererShaderMtl::ShaderCacheLoading_begin(cacheTitleId); + MetalPipelineCache::ShaderCacheLoading_begin(cacheTitleId); // get cache file name const auto pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); const auto pathGenericPre1_25_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}.bin", cacheTitleId); // before 1.25.0 @@ -780,7 +780,7 @@ void LatteShaderCache_Close() else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_Close(); else if (g_renderer->GetType() == RendererAPI::Metal) - RendererShaderMtl::ShaderCacheLoading_Close(); + MetalPipelineCache::ShaderCacheLoading_Close(); // if Vulkan then also close pipeline cache if (g_renderer->GetType() == RendererAPI::Vulkan) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 51642c9b1..39a7ec8d3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Foundation/NSObject.hpp" #include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "HW/Latte/Renderer/Metal/LatteToMtl.h" #include "HW/Latte/Renderer/Metal/RendererShaderMtl.h" @@ -8,6 +9,29 @@ #include "HW/Latte/Core/FetchShader.h" #include "HW/Latte/ISA/RegDefines.h" +#include "config/ActiveSettings.h" + +#define INVALID_TITLE_ID 0xFFFFFFFFFFFFFFFF + +uint64 s_cacheTitleId = INVALID_TITLE_ID; + +extern std::atomic_int g_compiled_shaders_total; +extern std::atomic_int g_compiled_shaders_async; + +void MetalPipelineCache::ShaderCacheLoading_begin(uint64 cacheTitleId) +{ + s_cacheTitleId = cacheTitleId; +} + +void MetalPipelineCache::ShaderCacheLoading_end() +{ +} + 
+void MetalPipelineCache::ShaderCacheLoading_Close() +{ + g_compiled_shaders_total = 0; + g_compiled_shaders_async = 0; +} MetalPipelineCache::~MetalPipelineCache() { @@ -16,6 +40,17 @@ MetalPipelineCache::~MetalPipelineCache() pair.second->release(); } m_pipelineCache.clear(); + + NS::Error* error = nullptr; + m_binaryArchive->serializeToURL(m_binaryArchiveURL, &error); + if (error) + { + debug_printf("failed to serialize binary archive: %s\n", error->localizedDescription()->utf8String()); + error->release(); + } + m_binaryArchive->release(); + + m_binaryArchiveURL->release(); } MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) @@ -151,16 +186,41 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS } } - NS::Error* error = nullptr; - pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error); - desc->release(); - vertexDescriptor->release(); + LoadBinary(desc); + + NS::Error* error = nullptr; + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionFailOnBinaryArchiveMiss, nullptr, &error); + + //static uint32 oldPipelineCount = 0; + //static uint32 newPipelineCount = 0; + + // Pipeline wasn't found in the binary archive, we need to compile it if (error) { - debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); - error->release(); - return nullptr; + desc->setBinaryArchives(nullptr); + + error->release(); + error = nullptr; + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error); + if (error) + { + debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); + error->release(); + } + else + { + SaveBinary(desc); + } + + //newPipelineCount++; } + //else + //{ + // oldPipelineCount++; + //} + 
//debug_printf("%u pipelines were found in the binary archive, %u new were created\n", oldPipelineCount, newPipelineCount); + desc->release(); + vertexDescriptor->release(); return pipeline; } @@ -238,3 +298,60 @@ uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchSh return stateHash; } + +void MetalPipelineCache::TryLoadBinaryArchive() +{ + if (m_binaryArchive || s_cacheTitleId == INVALID_TITLE_ID) + return; + + const std::string cacheFilename = fmt::format("{:016x}_mtl_pipelines.bin", s_cacheTitleId); + const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}", cacheFilename); + m_binaryArchiveURL = NS::URL::fileURLWithPath(NS::String::string((const char*)cachePath.generic_u8string().c_str(), NS::ASCIIStringEncoding)); + + MTL::BinaryArchiveDescriptor* desc = MTL::BinaryArchiveDescriptor::alloc()->init(); + desc->setUrl(m_binaryArchiveURL); + + NS::Error* error = nullptr; + m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error); + if (error) + { + desc->setUrl(nullptr); + + error->release(); + error = nullptr; + m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error); + if (error) + { + debug_printf("failed to create binary archive: %s\n", error->localizedDescription()->utf8String()); + error->release(); + } + } + desc->release(); +} + +void MetalPipelineCache::LoadBinary(MTL::RenderPipelineDescriptor* desc) +{ + TryLoadBinaryArchive(); + + if (!m_binaryArchive) + return; + + NS::Object* binArchives[] = {m_binaryArchive}; + auto binaryArchives = NS::Array::alloc()->init(binArchives, 1); + desc->setBinaryArchives(binaryArchives); + binaryArchives->release(); +} + +void MetalPipelineCache::SaveBinary(MTL::RenderPipelineDescriptor* desc) +{ + if (!m_binaryArchive) + return; + + NS::Error* error = nullptr; + m_binaryArchive->addRenderPipelineFunctions(desc, &error); + if (error) + { + debug_printf("error saving render pipeline functions: %s\n", 
error->localizedDescription()->utf8String()); + error->release(); + } +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index 11f81f883..1fa1f87c0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -8,6 +8,10 @@ class MetalPipelineCache { public: + static void ShaderCacheLoading_begin(uint64 cacheTitleId); + static void ShaderCacheLoading_end(); + static void ShaderCacheLoading_Close(); + MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalPipelineCache(); @@ -18,5 +22,14 @@ class MetalPipelineCache std::map m_pipelineCache; + NS::URL* m_binaryArchiveURL; + MTL::BinaryArchive* m_binaryArchive; + uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + + void TryLoadBinaryArchive(); + + void LoadBinary(MTL::RenderPipelineDescriptor* desc); + + void SaveBinary(MTL::RenderPipelineDescriptor* desc); }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 08d036a08..47c796bfd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -2,24 +2,18 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" -#include "Cemu/FileCache/FileCache.h" -#include "config/ActiveSettings.h" +//#include "Cemu/FileCache/FileCache.h" +//#include "config/ActiveSettings.h" #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" -bool s_isLoadingShadersMtl{ false }; -class FileCache* s_mslCache{nullptr}; - extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int 
g_compiled_shaders_async; RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer} { - if (LoadBinary()) - return; - if (m_type == ShaderType::kFragment) { // Fragment functions are compiled just-in-time @@ -30,12 +24,8 @@ RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type Compile(mslCode); } - // Store the compiled shader in the cache - StoreBinary(); - // Count shader compilation - if (!s_isLoadingShadersMtl) - g_compiled_shaders_total++; + g_compiled_shaders_total++; } RendererShaderMtl::~RendererShaderMtl() @@ -85,33 +75,6 @@ void RendererShaderMtl::CompileFragmentFunction(CachedFBOMtl* activeFBO) Compile(fullCode); } -void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) -{ - if (s_mslCache) - { - delete s_mslCache; - } - uint32 spirvCacheMagic = GeneratePrecompiledCacheId(); - const std::string cacheFilename = fmt::format("{:016x}_msl.bin", cacheTitleId); - const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}", cacheFilename); - s_mslCache = FileCache::Open(cachePath, true, spirvCacheMagic); - if (!s_mslCache) - cemuLog_log(LogType::Force, "Unable to open MSL cache {}", cacheFilename); - s_isLoadingShadersMtl = true; -} - -void RendererShaderMtl::ShaderCacheLoading_end() -{ - s_isLoadingShadersMtl = false; -} - -void RendererShaderMtl::ShaderCacheLoading_Close() -{ - delete s_mslCache; - g_compiled_shaders_total = 0; - g_compiled_shaders_async = 0; -} - void RendererShaderMtl::Compile(const std::string& mslCode) { NS::Error* error = nullptr; @@ -125,33 +88,3 @@ void RendererShaderMtl::Compile(const std::string& mslCode) m_function = library->newFunction(NS::String::string("main0", NS::ASCIIStringEncoding)); library->release(); } - -bool 
RendererShaderMtl::LoadBinary() -{ - // HACK: since fragment functions are compiled just-in-time, we cannot load them from the cache - if (m_type == ShaderType::kFragment) - return false; - - uint64 h1, h2; - GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); - if (!s_mslCache->GetFile({h1, h2 }, m_binary)) - return false; - - // TODO: implement - return false; - - return true; -} - -void RendererShaderMtl::StoreBinary() -{ - if (m_binary.size() == 0) - { - // TODO: retrieve the binary from the function - return; - } - - uint64 h1, h2; - GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); - s_mslCache->AddFileAsync({h1, h2 }, m_binary.data(), m_binary.size()); -} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index f788c1452..eea12ae7d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -18,10 +18,6 @@ class RendererShaderMtl : public RendererShader //}; public: - static void ShaderCacheLoading_begin(uint64 cacheTitleId); - static void ShaderCacheLoading_end(); - static void ShaderCacheLoading_Close(); - RendererShaderMtl(class MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode); virtual ~RendererShaderMtl(); @@ -62,7 +58,4 @@ class RendererShaderMtl : public RendererShader std::string m_mslCode; void Compile(const std::string& mslCode); - - bool LoadBinary(); - void StoreBinary(); }; From d48de5b56f5c35e1a31da995a79522dc15fbdfdb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 14 Aug 2024 15:37:06 +0200 Subject: [PATCH 069/368] fix: buffer cache crash on shutdown --- src/Cafe/HW/Latte/Core/LatteBufferCache.cpp | 3 +-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp index 174a86846..821651ddf 100644 --- a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp @@ -290,8 +290,7 @@ class BufferCacheNode { if (m_hasCacheAlloc) { - // HACK - //cemu_assert_debug(isInUse() == false); + cemu_assert_debug(isInUse() == false); g_gpuBufferHeap->freeOffset(m_cacheOffset); m_hasCacheAlloc = false; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b94f73bdf..ed9e90cb9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -676,6 +676,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // TODO: uncomment //if (m_state.m_skipDrawSequence) //{ + // LatteGPUState.drawCallCounter++; // return; //} @@ -811,6 +812,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 { renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); } + + LatteGPUState.drawCallCounter++; } void MetalRenderer::draw_endSequence() From c4688e1ad1688b4ec167d943084dc5fbdeb87841 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 14 Aug 2024 16:21:43 +0200 Subject: [PATCH 070/368] stringify utility shader source --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 12 ++- .../Renderer/Metal/UtilityShaderSource.h | 102 +++++++++--------- 2 files changed, 63 insertions(+), 51 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ed9e90cb9..096e3201d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -57,9 +57,17 @@ MetalRenderer::MetalRenderer() } } - // Utility shader source + // Utility shader library + + // Process the source first + std::string 
processedUtilityShaderSource = utilityShaderSource; + processedUtilityShaderSource.pop_back(); + processedUtilityShaderSource.erase(processedUtilityShaderSource.begin()); + processedUtilityShaderSource = "#include \n" + processedUtilityShaderSource; + + // Create the library NS::Error* error = nullptr; - MTL::Library* utilityLibrary = m_device->newLibrary(NS::String::string(utilityShaderSource, NS::ASCIIStringEncoding), nullptr, &error); + MTL::Library* utilityLibrary = m_device->newLibrary(NS::String::string(processedUtilityShaderSource.c_str(), NS::ASCIIStringEncoding), nullptr, &error); if (error) { debug_printf("failed to create utility library (error: %s)\n", error->localizedDescription()->utf8String()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index 3bc2ff756..a3e5bae10 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -1,49 +1,53 @@ -inline const char* utilityShaderSource = \ -"#include \n" \ -"using namespace metal;\n" \ -"\n" \ -"constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)};\n" -"\n" \ -"struct VertexOut {\n" \ -" float4 position [[position]];\n" \ -" float2 texCoord;\n" \ -"};\n" \ -"\n" \ -"vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) {\n" \ -" VertexOut out;\n" \ -" out.position = float4(positions[vid], 0.0, 1.0);\n" \ -" out.texCoord = positions[vid] * 0.5 + 0.5;\n" \ -" out.texCoord.y = 1.0 - out.texCoord.y;\n" \ -"\n" \ -" return out;\n" \ -"}\n" \ -"\n" \ -"fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], sampler samplr [[sampler(0)]]) {\n" \ -" return tex.sample(samplr, in.texCoord);\n" \ -"}\n" \ -"\n" \ -"struct CopyParams {\n" \ -" uint width;\n" \ -" uint srcMip;\n" \ -" uint srcSlice;\n" \ -" uint dstMip;\n" \ -" uint dstSlice;\n" \ -"};\n" \ -"\n" \ -"vertex void 
vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(0)]], texture2d_array dst [[texture(1)]], constant CopyParams& params [[buffer(0)]]) {\n" \ -" uint2 coord = uint2(vid % params.width, vid / params.width);\n" \ -" return dst.write(float4(src.read(coord, params.srcSlice, params.srcMip).r, 0.0, 0.0, 0.0), coord, params.dstSlice, params.dstMip);\n" \ -"}\n" \ -"\n" \ -"struct RestrideParams {\n" \ -" uint oldStride;\n" \ -" uint newStride;\n" \ -"};\n" \ -"\n" \ -/* TODO: use uint32? Since that would require less iterations */ \ -"vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(0)]], device uint8_t* dst [[buffer(1)]], constant RestrideParams& params [[buffer(2)]]) {\n" \ -" for (uint32_t i = 0; i < params.oldStride; i++) {\n" \ -" dst[vid * params.newStride + i] = src[vid * params.oldStride + i];\n" \ -" }\n" \ -"}\n" \ -"\n"; +#pragma once + +#define __STRINGIFY(x) #x +#define _STRINGIFY(x) __STRINGIFY(x) + +constexpr const char* utilityShaderSource = _STRINGIFY(( +using namespace metal; + +constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; + +struct VertexOut { + float4 position [[position]]; + float2 texCoord; +}; + +vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { + VertexOut out; + out.position = float4(positions[vid], 0.0, 1.0); + out.texCoord = positions[vid] * 0.5 + 0.5; + out.texCoord.y = 1.0 - out.texCoord.y; + + return out; +} + +fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], sampler samplr [[sampler(0)]]) { + return tex.sample(samplr, in.texCoord); +} + +struct CopyParams { + uint width; + uint srcMip; + uint srcSlice; + uint dstMip; + uint dstSlice; +}; + +vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(0)]], texture2d_array dst [[texture(1)]], constant CopyParams& params [[buffer(0)]]) { + uint2 coord = uint2(vid % params.width, vid / params.width); + 
return dst.write(float4(src.read(coord, params.srcSlice, params.srcMip).r, 0.0, 0.0, 0.0), coord, params.dstSlice, params.dstMip); +} + +struct RestrideParams { + uint oldStride; + uint newStride; +}; + +/* TODO: use uint32? Since that would require less iterations */ +vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(0)]], device uint8_t* dst [[buffer(1)]], constant RestrideParams& params [[buffer(2)]]) { + for (uint32_t i = 0; i < params.oldStride; i++) { + dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; + } +} +)); From 0bb99aaa755061dcb21c8c680420fcb459dda340 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 14 Aug 2024 20:28:28 +0200 Subject: [PATCH 071/368] set depth bias --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 096e3201d..224e908df 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -740,7 +740,17 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if (polyOffsetFrontEnable) { - // TODO: set depth bias + //uint32 frontScaleU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.getRawValue(); + //uint32 frontOffsetU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.getRawValue(); + //uint32 offsetClampU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.getRawValue(); + + float frontScale = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.get_SCALE(); + float frontOffset = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.get_OFFSET(); + float offsetClamp = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.get_CLAMP(); + + frontScale /= 16.0f; + + renderCommandEncoder->setDepthBias(frontOffset, frontScale, offsetClamp); } // todo - how does culling behave with rects? 
From be987343582fbe0aa002c22546cdc6aafd0742aa Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 14 Aug 2024 20:58:47 +0200 Subject: [PATCH 072/368] fix: incorrect roundEven implementation --- .../LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 06ca4ec47..95e91d40d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -1231,8 +1231,7 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); - src->add("roundEven"); - src->add("("); + src->add("rint("); _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(")"); _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -3725,12 +3724,6 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "return as_type(clamp(as_type(v), 0.0, 1.0));\r\n" "}\r\n"); - // round even - fCStr_shaderSource->add("" - "float roundEven(float x) {\r\n" - "return round(x / 2.0) * 2.0;\r\n" - "}\r\n"); - // Bit cast // Scalar From ed7354fa1bdeeff66e9905db1bbcaec097b639e9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 14 Aug 2024 21:23:18 +0200 Subject: [PATCH 073/368] reset depth bias --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 224e908df..f9427b54e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -752,6 +752,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 renderCommandEncoder->setDepthBias(frontOffset, frontScale, offsetClamp); } + else + { + renderCommandEncoder->setDepthBias(0.0f, 0.0f, 0.0f); + } // todo - how does culling behave with rects? // right now we just assume that their winding is always CW From 9a215e064ff13ed6bf633433c06401838ab93d00 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 15 Aug 2024 10:15:05 +0200 Subject: [PATCH 074/368] don't bind textures and buffers which are already bound --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 14 +++ .../Renderer/Metal/MetalMemoryManager.cpp | 4 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 94 +++++++++++-------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 59 +++++++++--- .../Renderer/Metal/UtilityShaderSource.h | 8 +- 5 files changed, 119 insertions(+), 60 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index aa71731e1..0e2c6ac93 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -2,6 +2,20 @@ #include +#define MAX_MTL_BUFFERS 31 +// Buffer index 30 is reserved for the support buffer, buffer indices 27-29 are reserved for the helper shaders +#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 5) +// TODO: don't harcdode the support buffer binding +#define MTL_SUPPORT_BUFFER_BINDING 30 + +#define MAX_MTL_TEXTURES 31 +#define MAX_MTL_SAMPLERS 16 + +#define GET_HELPER_BUFFER_BINDING(index) (27 + index) +#define GET_HELPER_TEXTURE_BINDING(index) (29 + index) +#define GET_HELPER_SAMPLER_BINDING(index) (14 + index) + +constexpr uint32 INVALID_UINT32 = std::numeric_limits::max(); constexpr size_t INVALID_OFFSET = std::numeric_limits::max(); inline size_t Align(size_t size, size_t alignment) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index d48a91233..022fda8d6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -106,14 +106,14 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu renderCommandEncoder->setRenderPipelineState(m_restrideBufferPipeline->GetRenderPipelineState()); MTL::Buffer* buffers[] = {bufferCache, buffer}; size_t offsets[] = {vertexBufferRange.offset, restrideInfo.allocation.bufferOffset}; - renderCommandEncoder->setVertexBuffers(buffers, offsets, NS::Range(0, 2)); + renderCommandEncoder->setVertexBuffers(buffers, offsets, NS::Range(GET_HELPER_BUFFER_BINDING(0), 2)); struct { uint32 oldStride; uint32 newStride; } strideData = {static_cast(stride), static_cast(newStride)}; - renderCommandEncoder->setVertexBytes(&strideData, sizeof(strideData), 2); + renderCommandEncoder->setVertexBytes(&strideData, sizeof(strideData), GET_HELPER_BUFFER_BINDING(2)); renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), vertexBufferRange.size / stride); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f9427b54e..06d74d470 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" +#include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "Metal/MTLRenderPass.hpp" #include "gui/guiWrapper.h" @@ -49,12 +50,10 @@ MetalRenderer::MetalRenderer() m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize(), MTL::StorageModeShared); // Initialize state - for (uint32 i = 0; i < (uint32)LatteConst::ShaderType::TotalCount; i++) + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; 
i++) { for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) - { m_state.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; - } } // Utility shader library @@ -63,7 +62,7 @@ MetalRenderer::MetalRenderer() std::string processedUtilityShaderSource = utilityShaderSource; processedUtilityShaderSource.pop_back(); processedUtilityShaderSource.erase(processedUtilityShaderSource.begin()); - processedUtilityShaderSource = "#include \n" + processedUtilityShaderSource; + processedUtilityShaderSource = "#include \nusing namespace metal;\n#define GET_BUFFER_BINDING(index) (27 + index)\n#define GET_TEXTURE_BINDING(index) (29 + index)\n#define GET_SAMPLER_BINDING(index) (14 + index)\n" + processedUtilityShaderSource; // Create the library NS::Error* error = nullptr; @@ -233,8 +232,8 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput // Draw to Metal layer renderCommandEncoder->setRenderPipelineState(m_state.m_usesSRGB ? m_presentPipelineSRGB : m_presentPipelineLinear); - renderCommandEncoder->setFragmentTexture(presentTexture, 0); - renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? m_linearSampler : m_nearestSampler), 0); + renderCommandEncoder->setFragmentTexture(presentTexture, GET_HELPER_TEXTURE_BINDING(0)); + renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? 
m_linearSampler : m_nearestSampler), GET_HELPER_SAMPLER_BINDING(0)); renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); @@ -540,8 +539,8 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so renderCommandEncoder->setViewport(MTL::Viewport{0.0, 0.0, (double)effectiveCopyWidth, (double)effectiveCopyHeight, 0.0, 1.0}); renderCommandEncoder->setScissorRect(MTL::ScissorRect{0, 0, (uint32)effectiveCopyWidth, (uint32)effectiveCopyHeight}); - renderCommandEncoder->setVertexTextures(textures, NS::Range(0, 2)); - renderCommandEncoder->setVertexBytes(¶ms, sizeof(params), 0); + renderCommandEncoder->setVertexTextures(textures, NS::Range(GET_HELPER_BUFFER_BINDING(0), 2)); + renderCommandEncoder->setVertexBytes(¶ms, sizeof(params), GET_HELPER_BUFFER_BINDING(0)); renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } @@ -596,7 +595,7 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { - m_state.m_uniformBufferOffsets[(uint32)shaderType][bufferIndex] = offset; + m_state.m_uniformBufferOffsets[GetMtlShaderType(shaderType)][bufferIndex] = offset; } RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) @@ -962,6 +961,8 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr m_commandEncoder = renderCommandEncoder; m_encoderType = MetalEncoderType::Render; + ResetEncoderState(); + if (rebindStateIfNewEncoder) { // Rebind all the render state @@ -989,6 +990,8 @@ MTL::ComputeCommandEncoder* MetalRenderer::GetComputeCommandEncoder() m_commandEncoder = computeCommandEncoder; m_encoderType = MetalEncoderType::Compute; + ResetEncoderState(); + return 
computeCommandEncoder; } @@ -1010,6 +1013,8 @@ MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder() m_commandEncoder = blitCommandEncoder; m_encoderType = MetalEncoderType::Blit; + ResetEncoderState(); + return blitCommandEncoder; } @@ -1075,8 +1080,9 @@ bool MetalRenderer::AcquireNextDrawable(bool mainWindow) void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader) { - sint32 textureCount = shader->resourceMapping.getTextureCount(); + auto mtlShaderType = GetMtlShaderType(shader->shaderType); + sint32 textureCount = shader->resourceMapping.getTextureCount(); for (int i = 0; i < textureCount; ++i) { const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); @@ -1108,24 +1114,16 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE continue; } - LatteTexture* baseTexture = textureView->baseTexture; - // get texture register word 0 - uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; - - // TODO: wht - //auto imageViewObj = textureView->GetSamplerView(word4); - //info.imageView = imageViewObj->m_textureImageView; - // TODO: uncomment uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i;//shader->resourceMapping.textureUnitToBindingPoint[hostTextureUnit]; - //uint32 textureBinding = binding % MAX_MTL_TEXTURES; - //uint32 samplerBinding = binding % MAX_MTL_SAMPLERS; if (binding >= MAX_MTL_TEXTURES) { debug_printf("invalid texture binding %u\n", binding); continue; } + LatteTexture* baseTexture = textureView->baseTexture; + uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) { @@ -1249,6 +1247,14 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE sampler->release(); } + // get texture register word 0 + uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; + auto& 
boundTexture = m_state.m_encoderState.m_textures[mtlShaderType][binding]; + if (textureView == boundTexture.m_textureView && word4 == boundTexture.m_word4) + continue; + + boundTexture = {textureView, word4}; + MTL::Texture* mtlTexture = textureView->GetSwizzledView(word4); switch (shader->shaderType) { @@ -1376,28 +1382,36 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; if (binding >= MAX_MTL_BUFFERS) { - debug_printf("too big buffer index (%u), skipping binding\n", binding); + debug_printf("invalid buffer binding%u\n", binding); continue; } - size_t offset = m_state.m_uniformBufferOffsets[(uint32)shader->shaderType][i]; - if (offset != INVALID_OFFSET) - { - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), offset, binding); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); - break; - } - default: - UNREACHABLE; - } - } + + size_t offset = m_state.m_uniformBufferOffsets[mtlShaderType][i]; + if (offset == INVALID_OFFSET) + continue; + + auto& boundOffset = m_state.m_encoderState.m_uniformBufferOffsets[mtlShaderType][binding]; + if (offset == boundOffset) + continue; + + boundOffset = offset; + + // TODO: only set the offset if already bound + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), offset, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); + break; + } + default: + UNREACHABLE; + } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 5e86d7d9f..d151af9a8 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -7,18 +7,6 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" -#include "Common/precompiled.h" -#include "Metal/MTLCommandBuffer.hpp" -#include "Metal/MTLCommandEncoder.hpp" -#include "Metal/MTLRenderPass.hpp" - -#define MAX_MTL_BUFFERS 31 -#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 2) -// TODO: don't harcdode the support buffer binding -#define MTL_SUPPORT_BUFFER_BINDING 30 - -#define MAX_MTL_TEXTURES 31 -#define MAX_MTL_SAMPLERS 16 struct MetalBoundBuffer { @@ -29,8 +17,40 @@ struct MetalBoundBuffer MetalRestrideInfo restrideInfo; }; +enum MetalShaderType +{ + METAL_SHADER_TYPE_VERTEX, + METAL_SHADER_TYPE_FRAGMENT, + + METAL_SHADER_TYPE_TOTAL +}; + +inline MetalShaderType GetMtlShaderType(LatteConst::ShaderType shaderType) +{ + switch (shaderType) + { + case LatteConst::ShaderType::Vertex: + return METAL_SHADER_TYPE_VERTEX; + case LatteConst::ShaderType::Pixel: + return METAL_SHADER_TYPE_FRAGMENT; + default: + return METAL_SHADER_TYPE_TOTAL; + } +} + +struct MetalEncoderState +{ + struct { + class LatteTextureViewMtl* m_textureView = nullptr; + uint32 m_word4 = INVALID_UINT32; + } m_textures[METAL_SHADER_TYPE_TOTAL][MAX_MTL_TEXTURES]; + size_t m_uniformBufferOffsets[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; +}; + struct MetalState { + MetalEncoderState m_encoderState{}; + bool m_usesSRGB = false; bool m_skipDrawSequence = false; @@ -42,7 +62,7 @@ struct MetalState MetalBoundBuffer m_vertexBuffers[MAX_MTL_BUFFERS] = {{}}; // TODO: find out what is the max number of bound textures on the Wii U class LatteTextureViewMtl* m_textures[64] = {nullptr}; - size_t m_uniformBufferOffsets[(uint32)LatteConst::ShaderType::TotalCount][MAX_MTL_BUFFERS]; + size_t m_uniformBufferOffsets[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; MTL::Viewport m_viewport = {0, 0, 0, 0, 0, 0}; 
MTL::ScissorRect m_scissor = {0, 0, 0, 0}; @@ -255,6 +275,19 @@ class MetalRenderer : public Renderer return m_encoderType; } + void ResetEncoderState() + { + m_state.m_encoderState = {}; + + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) + { + for (uint32 j = 0; j < MAX_MTL_TEXTURES; j++) + m_state.m_encoderState.m_textures[i][j] = {nullptr}; + for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) + m_state.m_encoderState.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; + } + } + MTL::CommandBuffer* GetCommandBuffer(); bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer); void WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index a3e5bae10..d96d32948 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -4,8 +4,6 @@ #define _STRINGIFY(x) __STRINGIFY(x) constexpr const char* utilityShaderSource = _STRINGIFY(( -using namespace metal; - constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; struct VertexOut { @@ -22,7 +20,7 @@ vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { return out; } -fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], sampler samplr [[sampler(0)]]) { +fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(GET_TEXTURE_BINDING(0))]], sampler samplr [[sampler(GET_SAMPLER_BINDING(0))]]) { return tex.sample(samplr, in.texCoord); } @@ -34,7 +32,7 @@ struct CopyParams { uint dstSlice; }; -vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(0)]], texture2d_array dst [[texture(1)]], constant CopyParams& params [[buffer(0)]]) { +vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(GET_TEXTURE_BINDING(0))]], texture2d_array dst 
[[texture(GET_TEXTURE_BINDING(1))]], constant CopyParams& params [[buffer(GET_BUFFER_BINDING(0))]]) { uint2 coord = uint2(vid % params.width, vid / params.width); return dst.write(float4(src.read(coord, params.srcSlice, params.srcMip).r, 0.0, 0.0, 0.0), coord, params.dstSlice, params.dstMip); } @@ -45,7 +43,7 @@ struct RestrideParams { }; /* TODO: use uint32? Since that would require less iterations */ -vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(0)]], device uint8_t* dst [[buffer(1)]], constant RestrideParams& params [[buffer(2)]]) { +vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]], constant RestrideParams& params [[buffer(GET_BUFFER_BINDING(2))]]) { for (uint32_t i = 0; i < params.oldStride; i++) { dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; } From 9982ac7acb28e4dc81ed18b956c06043ef162602 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 15 Aug 2024 10:58:33 +0200 Subject: [PATCH 075/368] remove more unnecessary rebinds --- .../Renderer/Metal/MetalMemoryManager.cpp | 5 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 111 +++++++++++++----- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 16 +++ .../Renderer/Metal/UtilityShaderSource.h | 2 +- 4 files changed, 103 insertions(+), 31 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 022fda8d6..b059adf49 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -104,9 +104,13 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu auto renderCommandEncoder = static_cast(m_mtlr->GetCommandEncoder()); renderCommandEncoder->setRenderPipelineState(m_restrideBufferPipeline->GetRenderPipelineState()); + 
m_mtlr->GetEncoderState().m_renderPipelineState = m_restrideBufferPipeline->GetRenderPipelineState(); + MTL::Buffer* buffers[] = {bufferCache, buffer}; size_t offsets[] = {vertexBufferRange.offset, restrideInfo.allocation.bufferOffset}; renderCommandEncoder->setVertexBuffers(buffers, offsets, NS::Range(GET_HELPER_BUFFER_BINDING(0), 2)); + m_mtlr->GetEncoderState().m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(0)] = INVALID_OFFSET; + m_mtlr->GetEncoderState().m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(1)] = INVALID_OFFSET; struct { @@ -114,6 +118,7 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu uint32 newStride; } strideData = {static_cast(stride), static_cast(newStride)}; renderCommandEncoder->setVertexBytes(&strideData, sizeof(strideData), GET_HELPER_BUFFER_BINDING(2)); + m_mtlr->GetEncoderState().m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(2)] = INVALID_OFFSET; renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), vertexBufferRange.size / stride); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 06d74d470..0870d2ed0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -17,6 +17,7 @@ #include "Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLRenderPass.hpp" #include "gui/guiWrapper.h" @@ -232,8 +233,8 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput // Draw to Metal layer renderCommandEncoder->setRenderPipelineState(m_state.m_usesSRGB ? 
m_presentPipelineSRGB : m_presentPipelineLinear); - renderCommandEncoder->setFragmentTexture(presentTexture, GET_HELPER_TEXTURE_BINDING(0)); - renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? m_linearSampler : m_nearestSampler), GET_HELPER_SAMPLER_BINDING(0)); + renderCommandEncoder->setFragmentTexture(presentTexture, 0); + renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? m_linearSampler : m_nearestSampler), 0); renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); @@ -535,12 +536,13 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so auto renderCommandEncoder = static_cast(m_commandEncoder); renderCommandEncoder->setRenderPipelineState(m_copyTextureToTexturePipeline->GetRenderPipelineState()); + m_state.m_encoderState.m_renderPipelineState = m_copyTextureToTexturePipeline->GetRenderPipelineState(); - renderCommandEncoder->setViewport(MTL::Viewport{0.0, 0.0, (double)effectiveCopyWidth, (double)effectiveCopyHeight, 0.0, 1.0}); - renderCommandEncoder->setScissorRect(MTL::ScissorRect{0, 0, (uint32)effectiveCopyWidth, (uint32)effectiveCopyHeight}); - - renderCommandEncoder->setVertexTextures(textures, NS::Range(GET_HELPER_BUFFER_BINDING(0), 2)); + renderCommandEncoder->setVertexTextures(textures, NS::Range(GET_HELPER_TEXTURE_BINDING(0), 2)); + m_state.m_encoderState.m_textures[METAL_SHADER_TYPE_VERTEX][GET_HELPER_TEXTURE_BINDING(0)] = {(LatteTextureViewMtl*)textures[0]}; + m_state.m_encoderState.m_textures[METAL_SHADER_TYPE_VERTEX][GET_HELPER_TEXTURE_BINDING(1)] = {(LatteTextureViewMtl*)textures[1]}; renderCommandEncoder->setVertexBytes(¶ms, sizeof(params), GET_HELPER_BUFFER_BINDING(0)); + m_state.m_encoderState.m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(0)] = INVALID_OFFSET; renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } @@ -687,6 +689,8 @@ void 
MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // return; //} + auto& encoderState = m_state.m_encoderState; + // Render pass auto renderCommandEncoder = GetRenderCommandEncoder(); @@ -702,7 +706,11 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Depth stencil state MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); - renderCommandEncoder->setDepthStencilState(depthStencilState); + if (depthStencilState != encoderState.m_depthStencilState) + { + renderCommandEncoder->setDepthStencilState(depthStencilState); + encoderState.m_depthStencilState = depthStencilState; + } // Stencil reference bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); @@ -710,11 +718,19 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 { bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); uint32 stencilRefFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F(); - uint32 stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); - if (backStencilEnable) - renderCommandEncoder->setStencilReferenceValues(stencilRefFront, stencilRefBack); - else - renderCommandEncoder->setStencilReferenceValue(stencilRefFront); + uint32 stencilRefBack; + if (backStencilEnable) + stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); + else + stencilRefBack = stencilRefFront; + + if (stencilRefFront != encoderState.m_stencilRefFront || stencilRefBack != encoderState.m_stencilRefBack) + { + renderCommandEncoder->setStencilReferenceValues(stencilRefFront, stencilRefBack); + + encoderState.m_stencilRefFront = stencilRefFront; + encoderState.m_stencilRefBack = stencilRefBack; + } } // Primitive type @@ -739,21 +755,35 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if 
(polyOffsetFrontEnable) { - //uint32 frontScaleU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.getRawValue(); - //uint32 frontOffsetU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.getRawValue(); - //uint32 offsetClampU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.getRawValue(); + uint32 frontScaleU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.getRawValue(); + uint32 frontOffsetU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.getRawValue(); + uint32 offsetClampU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.getRawValue(); + + if (frontOffsetU32 != encoderState.m_depthBias || frontScaleU32 != encoderState.m_depthSlope || offsetClampU32 != encoderState.m_depthClamp) + { + float frontScale = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.get_SCALE(); + float frontOffset = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.get_OFFSET(); + float offsetClamp = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.get_CLAMP(); - float frontScale = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.get_SCALE(); - float frontOffset = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.get_OFFSET(); - float offsetClamp = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.get_CLAMP(); + frontScale /= 16.0f; - frontScale /= 16.0f; + renderCommandEncoder->setDepthBias(frontOffset, frontScale, offsetClamp); - renderCommandEncoder->setDepthBias(frontOffset, frontScale, offsetClamp); + encoderState.m_depthBias = frontOffsetU32; + encoderState.m_depthSlope = frontScaleU32; + encoderState.m_depthClamp = offsetClampU32; + } } else { - renderCommandEncoder->setDepthBias(0.0f, 0.0f, 0.0f); + if (0 != encoderState.m_depthBias || 0 != encoderState.m_depthSlope || 0 != encoderState.m_depthClamp) + { + renderCommandEncoder->setDepthBias(0.0f, 0.0f, 0.0f); + + encoderState.m_depthBias = 0; + encoderState.m_depthSlope = 0; + encoderState.m_depthClamp = 0; + } } // todo - how does culling behave with rects? 
@@ -766,19 +796,36 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 cullBack = cullFront; } + // Cull mode if (cullFront && cullBack) return; // We can just skip the draw (TODO: can we?) - else if (cullFront) - renderCommandEncoder->setCullMode(MTL::CullModeFront); - else if (cullBack) - renderCommandEncoder->setCullMode(MTL::CullModeBack); - else - renderCommandEncoder->setCullMode(MTL::CullModeNone); + MTL::CullMode cullMode; + if (cullFront) + cullMode = MTL::CullModeFront; + else if (cullBack) + cullMode = MTL::CullModeBack; + else + cullMode = MTL::CullModeNone; + + if (cullMode != encoderState.m_cullMode) + { + renderCommandEncoder->setCullMode(cullMode); + encoderState.m_cullMode = cullMode; + } + + // Front face + MTL::Winding frontFaceWinding; if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CCW) - renderCommandEncoder->setFrontFacingWinding(MTL::WindingCounterClockwise); + frontFaceWinding = MTL::WindingCounterClockwise; else - renderCommandEncoder->setFrontFacingWinding(MTL::WindingClockwise); + frontFaceWinding = MTL::WindingClockwise; + + if (frontFaceWinding != encoderState.m_frontFaceWinding) + { + renderCommandEncoder->setFrontFacingWinding(frontFaceWinding); + encoderState.m_frontFaceWinding = frontFaceWinding; + } // Resources @@ -817,7 +864,11 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Render pipeline state MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew); - renderCommandEncoder->setRenderPipelineState(renderPipelineState); + if (renderPipelineState != encoderState.m_renderPipelineState) + { + renderCommandEncoder->setRenderPipelineState(renderPipelineState); + encoderState.m_renderPipelineState = renderPipelineState; + } // Uniform buffers, textures and samplers BindStageResources(renderCommandEncoder, vertexShader); diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index d151af9a8..bbd531944 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -7,6 +7,8 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" +#include "Metal/MTLDepthStencil.hpp" +#include "Metal/MTLRenderCommandEncoder.hpp" struct MetalBoundBuffer { @@ -40,6 +42,15 @@ inline MetalShaderType GetMtlShaderType(LatteConst::ShaderType shaderType) struct MetalEncoderState { + MTL::RenderPipelineState* m_renderPipelineState = nullptr; + MTL::DepthStencilState* m_depthStencilState = nullptr; + MTL::CullMode m_cullMode = MTL::CullModeNone; + MTL::Winding m_frontFaceWinding = MTL::WindingClockwise; + uint32 m_stencilRefFront = 0; + uint32 m_stencilRefBack = 0; + uint32 m_depthBias = 0; + uint32 m_depthSlope = 0; + uint32 m_depthClamp = 0; struct { class LatteTextureViewMtl* m_textureView = nullptr; uint32 m_word4 = INVALID_UINT32; @@ -288,6 +299,11 @@ class MetalRenderer : public Renderer } } + MetalEncoderState& GetEncoderState() + { + return m_state.m_encoderState; + } + MTL::CommandBuffer* GetCommandBuffer(); bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer); void WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index d96d32948..2e94807d3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -20,7 +20,7 @@ vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { return out; } -fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(GET_TEXTURE_BINDING(0))]], sampler samplr [[sampler(GET_SAMPLER_BINDING(0))]]) { +fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex 
[[texture(0)]], sampler samplr [[sampler(0)]]) { return tex.sample(samplr, in.texCoord); } From d79d7fea63e2f849b072d4dcaddf6b98720516b2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 15 Aug 2024 11:44:06 +0200 Subject: [PATCH 076/368] implement sampler cache --- src/Cafe/CMakeLists.txt | 2 + .../Renderer/Metal/MetalDepthStencilCache.cpp | 2 - .../Renderer/Metal/MetalPipelineCache.cpp | 2 - .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 144 ++++-------------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 6 +- .../Renderer/Metal/MetalSamplerCache.cpp | 128 ++++++++++++++++ .../Latte/Renderer/Metal/MetalSamplerCache.h | 21 +++ 7 files changed, 182 insertions(+), 123 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index d0e7d9210..fb4672d27 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -560,6 +560,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalPipelineCache.h HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp HW/Latte/Renderer/Metal/MetalDepthStencilCache.h + HW/Latte/Renderer/Metal/MetalSamplerCache.cpp + HW/Latte/Renderer/Metal/MetalSamplerCache.h HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h HW/Latte/Renderer/Metal/UtilityShaderSource.h diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp index b8f3fc52d..96375e0bf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp @@ -18,9 +18,7 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte uint64 stateHash = CalculateDepthStencilHash(lcr); auto& depthStencilState = m_depthStencilCache[stateHash]; if (depthStencilState) - { return depthStencilState; - } // Depth 
stencil state bool depthEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_ENABLE(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 39a7ec8d3..faf67c3ce 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -58,9 +58,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS uint64 stateHash = CalculatePipelineHash(fetchShader, vertexShader, pixelShader, activeFBO, lcr); auto& pipeline = m_pipelineCache[stateHash]; if (pipeline) - { return pipeline; - } // Vertex descriptor MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 0870d2ed0..4489b0208 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -6,6 +6,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" @@ -16,6 +17,7 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" +#include "HW/Latte/Core/Latte.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLRenderPass.hpp" @@ -43,6 +45,7 @@ MetalRenderer::MetalRenderer() m_memoryManager = new MetalMemoryManager(this); m_pipelineCache = new MetalPipelineCache(this); m_depthStencilCache = new MetalDepthStencilCache(this); + m_samplerCache = new 
MetalSamplerCache(this); // Texture readback m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::StorageModeShared); @@ -121,8 +124,9 @@ MetalRenderer::~MetalRenderer() m_presentPipelineLinear->release(); m_presentPipelineSRGB->release(); - delete m_depthStencilCache; delete m_pipelineCache; + delete m_depthStencilCache; + delete m_samplerCache; delete m_memoryManager; m_nearestSampler->release(); @@ -1179,123 +1183,29 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) { uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shader->shaderType); - const _LatteRegisterSetSampler* samplerWords = LatteGPUState.contextNew.SQ_TEX_SAMPLER + samplerIndex; - - // TODO: cache this instead - MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); - - // lod - uint32 iMinLOD = samplerWords->WORD1.get_MIN_LOD(); - uint32 iMaxLOD = samplerWords->WORD1.get_MAX_LOD(); - sint32 iLodBias = samplerWords->WORD1.get_LOD_BIAS(); - - // apply relative lod bias from graphic pack - if (baseTexture->overwriteInfo.hasRelativeLodBias) - iLodBias += baseTexture->overwriteInfo.relativeLodBias; - // apply absolute lod bias from graphic pack - if (baseTexture->overwriteInfo.hasLodBias) - iLodBias = baseTexture->overwriteInfo.lodBias; - - auto filterMip = samplerWords->WORD0.get_MIP_FILTER(); - if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::NONE) - { - samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); - samplerDescriptor->setLodMinClamp(0.0f); - samplerDescriptor->setLodMaxClamp(0.25f); - } - else if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::POINT) - { - samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); - samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); - samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); - } - else if (filterMip == 
Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::LINEAR) - { - samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); - samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); - samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); - } - else - { - // fallback for invalid constants - samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); - samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); - samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); - } - - auto filterMin = samplerWords->WORD0.get_XY_MIN_FILTER(); - cemu_assert_debug(filterMin != Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::BICUBIC); // todo - samplerDescriptor->setMinFilter((filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); - - auto filterMag = samplerWords->WORD0.get_XY_MAG_FILTER(); - samplerDescriptor->setMagFilter((filterMag == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); - - auto filterZ = samplerWords->WORD0.get_Z_FILTER(); - // todo: z-filter for texture array samplers is customizable for GPU7 but OpenGL/Vulkan doesn't expose this functionality? 
- - auto clampX = samplerWords->WORD0.get_CLAMP_X(); - auto clampY = samplerWords->WORD0.get_CLAMP_Y(); - auto clampZ = samplerWords->WORD0.get_CLAMP_Z(); - - samplerDescriptor->setSAddressMode(GetMtlSamplerAddressMode(clampX)); - samplerDescriptor->setTAddressMode(GetMtlSamplerAddressMode(clampY)); - samplerDescriptor->setRAddressMode(GetMtlSamplerAddressMode(clampZ)); - - auto maxAniso = samplerWords->WORD0.get_MAX_ANISO_RATIO(); - - if (baseTexture->overwriteInfo.anisotropicLevel >= 0) - maxAniso = baseTexture->overwriteInfo.anisotropicLevel; - - if (maxAniso > 0) - { - samplerDescriptor->setMaxAnisotropy(1 << maxAniso); - } - - // TODO: set lod bias - //samplerInfo.mipLodBias = (float)iLodBias / 64.0f; - - // depth compare - uint8 depthCompareMode = shader->textureUsesDepthCompare[relative_textureUnit] ? 1 : 0; - if (depthCompareMode == 1) - { - // TODO: is it okay to just cast? - samplerDescriptor->setCompareFunction(GetMtlCompareFunc((Latte::E_COMPAREFUNC)samplerWords->WORD0.get_DEPTH_COMPARE_FUNCTION())); - } - - // border - auto borderType = samplerWords->WORD0.get_BORDER_COLOR_TYPE(); - - if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::TRANSPARENT_BLACK) - samplerDescriptor->setBorderColor(MTL::SamplerBorderColorTransparentBlack); - else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_BLACK) - samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueBlack); - else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_WHITE) - samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueWhite); - else - { - // Metal doesn't support custom border color - samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueBlack); - } - - MTL::SamplerState* sampler = m_device->newSamplerState(samplerDescriptor); - samplerDescriptor->release(); + auto sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, samplerIndex); - switch (shader->shaderType) - { - 
case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexSamplerState(sampler, binding); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentSamplerState(sampler, binding); - break; - } - default: - UNREACHABLE; - } - sampler->release(); + auto& boundSampler = m_state.m_encoderState.m_samplers[mtlShaderType][binding]; + if (sampler != boundSampler) + { + boundSampler = sampler; + + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + renderCommandEncoder->setVertexSamplerState(sampler, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentSamplerState(sampler, binding); + break; + } + default: + UNREACHABLE; + } + } } // get texture register word 0 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index bbd531944..451a29af5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -7,8 +7,6 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" -#include "Metal/MTLDepthStencil.hpp" -#include "Metal/MTLRenderCommandEncoder.hpp" struct MetalBoundBuffer { @@ -55,6 +53,7 @@ struct MetalEncoderState class LatteTextureViewMtl* m_textureView = nullptr; uint32 m_word4 = INVALID_UINT32; } m_textures[METAL_SHADER_TYPE_TOTAL][MAX_MTL_TEXTURES]; + MTL::SamplerState* m_samplers[METAL_SHADER_TYPE_TOTAL][MAX_MTL_SAMPLERS]; size_t m_uniformBufferOffsets[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; }; @@ -294,6 +293,8 @@ class MetalRenderer : public Renderer { for (uint32 j = 0; j < MAX_MTL_TEXTURES; j++) m_state.m_encoderState.m_textures[i][j] = {nullptr}; + for (uint32 j = 0; j < MAX_MTL_SAMPLERS; j++) + m_state.m_encoderState.m_samplers[i][j] = nullptr; for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) m_state.m_encoderState.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; } @@ -333,6 
+334,7 @@ class MetalRenderer : public Renderer class MetalMemoryManager* m_memoryManager; class MetalPipelineCache* m_pipelineCache; class MetalDepthStencilCache* m_depthStencilCache; + class MetalSamplerCache* m_samplerCache; // Metal objects MTL::Device* m_device; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp new file mode 100644 index 000000000..4f987d83c --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp @@ -0,0 +1,128 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "HW/Latte/Renderer/Metal/LatteToMtl.h" + +MetalSamplerCache::~MetalSamplerCache() +{ + for (auto& pair : m_samplerCache) + { + pair.second->release(); + } + m_samplerCache.clear(); +} + +MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister& lcr, uint32 samplerIndex) +{ + uint64 stateHash = CalculateSamplerHash(lcr, samplerIndex); + auto& samplerState = m_samplerCache[stateHash]; + if (samplerState) + return samplerState; + + // Sampler state + const _LatteRegisterSetSampler* samplerWords = lcr.SQ_TEX_SAMPLER + samplerIndex; + + MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); + + // lod + uint32 iMinLOD = samplerWords->WORD1.get_MIN_LOD(); + uint32 iMaxLOD = samplerWords->WORD1.get_MAX_LOD(); + sint32 iLodBias = samplerWords->WORD1.get_LOD_BIAS(); + + // TODO: uncomment + // apply relative lod bias from graphic pack + //if (baseTexture->overwriteInfo.hasRelativeLodBias) + // iLodBias += baseTexture->overwriteInfo.relativeLodBias; + // apply absolute lod bias from graphic pack + //if (baseTexture->overwriteInfo.hasLodBias) + // iLodBias = baseTexture->overwriteInfo.lodBias; + + auto filterMip = samplerWords->WORD0.get_MIP_FILTER(); + if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::NONE) + { + 
samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); + samplerDescriptor->setLodMinClamp(0.0f); + samplerDescriptor->setLodMaxClamp(0.25f); + } + else if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::POINT) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + else if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::LINEAR) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + else + { + // fallback for invalid constants + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + + auto filterMin = samplerWords->WORD0.get_XY_MIN_FILTER(); + cemu_assert_debug(filterMin != Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::BICUBIC); // todo + samplerDescriptor->setMinFilter((filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); + + auto filterMag = samplerWords->WORD0.get_XY_MAG_FILTER(); + samplerDescriptor->setMagFilter((filterMag == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); + + auto filterZ = samplerWords->WORD0.get_Z_FILTER(); + // todo: z-filter for texture array samplers is customizable for GPU7 but OpenGL/Vulkan doesn't expose this functionality? 
+ + auto clampX = samplerWords->WORD0.get_CLAMP_X(); + auto clampY = samplerWords->WORD0.get_CLAMP_Y(); + auto clampZ = samplerWords->WORD0.get_CLAMP_Z(); + + samplerDescriptor->setSAddressMode(GetMtlSamplerAddressMode(clampX)); + samplerDescriptor->setTAddressMode(GetMtlSamplerAddressMode(clampY)); + samplerDescriptor->setRAddressMode(GetMtlSamplerAddressMode(clampZ)); + + auto maxAniso = samplerWords->WORD0.get_MAX_ANISO_RATIO(); + + // TODO: uncomment + //if (baseTexture->overwriteInfo.anisotropicLevel >= 0) + // maxAniso = baseTexture->overwriteInfo.anisotropicLevel; + + if (maxAniso > 0) + samplerDescriptor->setMaxAnisotropy(1 << maxAniso); + + // TODO: set lod bias + //samplerInfo.mipLodBias = (float)iLodBias / 64.0f; + + // depth compare + //uint8 depthCompareMode = shader->textureUsesDepthCompare[relative_textureUnit] ? 1 : 0; + // TODO: is it okay to just cast? + samplerDescriptor->setCompareFunction(GetMtlCompareFunc((Latte::E_COMPAREFUNC)samplerWords->WORD0.get_DEPTH_COMPARE_FUNCTION())); + + // border + auto borderType = samplerWords->WORD0.get_BORDER_COLOR_TYPE(); + + if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::TRANSPARENT_BLACK) + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorTransparentBlack); + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_BLACK) + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueBlack); + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_WHITE) + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueWhite); + else + { + // Metal doesn't support custom border color + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueBlack); + } + + samplerState = m_mtlr->GetDevice()->newSamplerState(samplerDescriptor); + samplerDescriptor->release(); + + return samplerState; +} + +uint64 MetalSamplerCache::CalculateSamplerHash(const LatteContextRegister& lcr, uint32 samplerIndex) +{ + const 
_LatteRegisterSetSampler* samplerWords = lcr.SQ_TEX_SAMPLER + samplerIndex; + + // TODO: check this + return *((uint64*)samplerWords); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h new file mode 100644 index 000000000..891d7e035 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "HW/Latte/ISA/LatteReg.h" + +class MetalSamplerCache +{ +public: + MetalSamplerCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalSamplerCache(); + + MTL::SamplerState* GetSamplerState(const LatteContextRegister& lcr, uint32 samplerIndex); + +private: + class MetalRenderer* m_mtlr; + + std::map m_samplerCache; + + uint64 CalculateSamplerHash(const LatteContextRegister& lcr, uint32 samplerIndex); +}; From 0d0152d6e06620831604d7a9d8c3544eff9f3c73 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 16 Aug 2024 15:33:49 +0200 Subject: [PATCH 077/368] fix: present --- CMakeLists.txt | 5 ++++- src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h | 2 +- src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm | 8 +++++++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 15 +++++++++++++-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 2 ++ src/gui/canvas/MetalCanvas.cpp | 3 +++ 6 files changed, 30 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58d8d6a45..ee4f19bf0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,8 @@ cmake_minimum_required(VERSION 3.21.1) +# TODO: remove this +set(CMAKE_CXX_FLAGS_DEBUG "-g") + option(ENABLE_VCPKG "Enable the vcpkg package manager" ON) option(MACOS_BUNDLE "The executable when built on macOS will be created as an application bundle" OFF) set(EXPERIMENTAL_VERSION "" CACHE STRING "") # used by CI script to set experimental version @@ -23,7 +26,7 @@ if (ENABLE_VCPKG) OUTPUT_VARIABLE is_vcpkg_shallow OUTPUT_STRIP_TRAILING_WHITESPACE ) - + if(is_vcpkg_shallow 
STREQUAL "true") message(STATUS "vcpkg is shallow. Unshallowing it now...") execute_process( diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h index 56a302246..d2b30667c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h @@ -1,3 +1,3 @@ #pragma once -void* CreateMetalLayer(void* handle); +void* CreateMetalLayer(void* handle, float& scaleX, float& scaleY); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm index 8ce3202ed..16a7aa676 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm @@ -2,7 +2,7 @@ #include "Cafe/HW/Latte/Renderer/MetalView.h" -void* CreateMetalLayer(void* handle) +void* CreateMetalLayer(void* handle, float& scaleX, float& scaleY) { NSView* view = (NSView*)handle; @@ -12,5 +12,11 @@ [view addSubview:childView]; + const NSRect points = [childView frame]; + const NSRect pixels = [childView convertRectToBacking:points]; + + scaleX = (float)(pixels.size.width / points.size.width); + scaleY = (float)(pixels.size.height / points.size.height); + return childView.layer; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 4489b0208..9454d62c4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -138,13 +138,20 @@ MetalRenderer::~MetalRenderer() m_device->release(); } -// TODO: don't ignore "mainWindow" argument and respect size +// TODO: don't ignore "mainWindow" argument void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { const auto& windowInfo = gui_getWindowInfo().window_main; - m_metalLayer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle); + m_metalLayer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle, m_layerScaleX, m_layerScaleY); 
m_metalLayer->setDevice(m_device); + m_metalLayer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); +} + +// TODO: don't ignore "mainWindow" argument +void MetalRenderer::ResizeLayer(const Vector2i& size, bool mainWindow) +{ + m_metalLayer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); } void MetalRenderer::Initialize() @@ -215,6 +222,7 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) m_memoryManager->ResetTemporaryBuffers(); } +// TODO: use `shader` for drawing void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) @@ -240,6 +248,9 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput renderCommandEncoder->setFragmentTexture(presentTexture, 0); renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? 
m_linearSampler : m_nearestSampler), 0); + renderCommandEncoder->setViewport(MTL::Viewport{(double)imageX, (double)imageY, (double)imageWidth, (double)imageHeight, 0.0, 1.0}); + renderCommandEncoder->setScissorRect(MTL::ScissorRect{(uint32)imageX, (uint32)imageY, (uint32)imageWidth, (uint32)imageHeight}); + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); EndEncoding(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 451a29af5..f315963e9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -141,6 +141,7 @@ class MetalRenderer : public Renderer } void InitializeLayer(const Vector2i& size, bool mainWindow); + void ResizeLayer(const Vector2i& size, bool mainWindow); void Initialize() override; void Shutdown() override; @@ -330,6 +331,7 @@ class MetalRenderer : public Renderer private: CA::MetalLayer* m_metalLayer; + float m_layerScaleX, m_layerScaleY; class MetalMemoryManager* m_memoryManager; class MetalPipelineCache* m_pipelineCache; diff --git a/src/gui/canvas/MetalCanvas.cpp b/src/gui/canvas/MetalCanvas.cpp index fe8dc4eec..2c89f8822 100644 --- a/src/gui/canvas/MetalCanvas.cpp +++ b/src/gui/canvas/MetalCanvas.cpp @@ -60,4 +60,7 @@ void MetalCanvas::OnResize(wxSizeEvent& event) const wxRect refreshRect(size); RefreshRect(refreshRect, false); + + auto metal_renderer = MetalRenderer::GetInstance(); + metal_renderer->InitializeLayer({size.x, size.y}, m_is_main_window); } From 502d5b8b2ffb37454af77682fcaa72e78dbe7533 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 16 Aug 2024 16:38:01 +0200 Subject: [PATCH 078/368] include debug information --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 18 ++++++++++++++ .../Metal/MetalHybridComputePipeline.cpp | 2 +- .../Renderer/Metal/MetalMemoryManager.cpp | 16 +++++++++---- .../Renderer/Metal/MetalPipelineCache.cpp | 8 ++++++- 
.../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 24 ++++++++++++++++--- .../Renderer/Metal/RendererShaderMtl.cpp | 4 ++-- 6 files changed, 60 insertions(+), 12 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 0e2c6ac93..19ac7ab79 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -1,5 +1,6 @@ #pragma once +#include "Foundation/NSString.hpp" #include #define MAX_MTL_BUFFERS 31 @@ -27,3 +28,20 @@ inline std::string GetColorAttachmentTypeStr(uint32 index) { return "COLOR_ATTACHMENT" + std::to_string(index) + "_TYPE"; } + +// Cast from const char* to NS::String* +inline NS::String* ToNSString(const char* str) +{ + return NS::String::string(str, NS::ASCIIStringEncoding); +} + +// Cast from std::string to NS::String* +inline NS::String* ToNSString(const std::string& str) +{ + return ToNSString(str.c_str()); +} + +inline NS::String* GetLabel(const std::string& label, const void* identifier) +{ + return ToNSString(label + " (" + std::to_string(reinterpret_cast(identifier)) + ")"); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp index 3802939ba..3be1cf521 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp @@ -3,7 +3,7 @@ MetalHybridComputePipeline::MetalHybridComputePipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const char* vertexFunctionName, const char* kernelFunctionName) { // Render pipeline state - MTL::Function* vertexFunction = library->newFunction(NS::String::string(vertexFunctionName, NS::ASCIIStringEncoding)); + MTL::Function* vertexFunction = library->newFunction(ToNSString(vertexFunctionName)); MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); 
renderPipelineDescriptor->setVertexFunction(vertexFunction); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index b059adf49..63ded223d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -46,6 +46,9 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, siz // If no free range was found, allocate a new buffer MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(std::max(size, BUFFER_ALLOCATION_SIZE), MTL::ResourceStorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); +#endif MetalBufferAllocation allocation; allocation.bufferIndex = m_buffers.size(); @@ -184,21 +187,24 @@ void MetalMemoryManager::InitBufferCache(size_t size) } m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); +#endif } void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) { - if ((offset + size) > m_bufferCache->length()) - { - throw std::runtime_error(std::to_string(offset) + " + " + std::to_string(size) + " > " + std::to_string(m_bufferCache->length())); - } - if (!m_bufferCache) { debug_printf("MetalMemoryManager::UploadToBufferCache: buffer cache not initialized\n"); return; } + if ((offset + size) > m_bufferCache->length()) + { + debug_printf("MetalMemoryManager::UploadToBufferCache: out of bounds access (offset: %zu, size: %zu, buffer size: %zu)\n", offset, size, m_bufferCache->length()); + } + memcpy((uint8*)m_bufferCache->contents() + offset, data, size); // Notify vertex buffer cache about the change diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index faf67c3ce..616ceeb19 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -187,6 +187,9 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS LoadBinary(desc); NS::Error* error = nullptr; +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Cached render pipeline state", desc)); +#endif pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionFailOnBinaryArchiveMiss, nullptr, &error); //static uint32 oldPipelineCount = 0; @@ -199,6 +202,9 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS error->release(); error = nullptr; +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("New render pipeline state", desc)); +#endif pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error); if (error) { @@ -304,7 +310,7 @@ void MetalPipelineCache::TryLoadBinaryArchive() const std::string cacheFilename = fmt::format("{:016x}_mtl_pipelines.bin", s_cacheTitleId); const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}", cacheFilename); - m_binaryArchiveURL = NS::URL::fileURLWithPath(NS::String::string((const char*)cachePath.generic_u8string().c_str(), NS::ASCIIStringEncoding)); + m_binaryArchiveURL = NS::URL::fileURLWithPath(ToNSString((const char*)cachePath.generic_u8string().c_str())); MTL::BinaryArchiveDescriptor* desc = MTL::BinaryArchiveDescriptor::alloc()->init(); desc->setUrl(m_binaryArchiveURL); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 9454d62c4..bdd3c2acf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -49,9 +49,15 @@ MetalRenderer::MetalRenderer() // Texture readback m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::StorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + m_readbackBuffer->setLabel(GetLabel("Texture readback buffer", 
m_readbackBuffer)); +#endif // Transform feedback m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize(), MTL::StorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + m_xfbRingBuffer->setLabel(GetLabel("Transform feedback buffer", m_xfbRingBuffer)); +#endif // Initialize state for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) @@ -70,7 +76,7 @@ MetalRenderer::MetalRenderer() // Create the library NS::Error* error = nullptr; - MTL::Library* utilityLibrary = m_device->newLibrary(NS::String::string(processedUtilityShaderSource.c_str(), NS::ASCIIStringEncoding), nullptr, &error); + MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(processedUtilityShaderSource.c_str()), nullptr, &error); if (error) { debug_printf("failed to create utility library (error: %s)\n", error->localizedDescription()->utf8String()); @@ -80,8 +86,8 @@ MetalRenderer::MetalRenderer() } // Present pipeline - MTL::Function* presentVertexFunction = utilityLibrary->newFunction(NS::String::string("vertexFullscreen", NS::ASCIIStringEncoding)); - MTL::Function* presentFragmentFunction = utilityLibrary->newFunction(NS::String::string("fragmentPresent", NS::ASCIIStringEncoding)); + MTL::Function* presentVertexFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); + MTL::Function* presentFragmentFunction = utilityLibrary->newFunction(ToNSString("fragmentPresent")); MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); renderPipelineDescriptor->setVertexFunction(presentVertexFunction); @@ -91,6 +97,9 @@ MetalRenderer::MetalRenderer() error = nullptr; renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatRGBA8Unorm); +#ifdef CEMU_DEBUG_ASSERT + renderPipelineDescriptor->setLabel(GetLabel("Present pipeline linear", renderPipelineDescriptor)); +#endif m_presentPipelineLinear = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); if (error) { @@ -100,6 +109,9 @@ 
MetalRenderer::MetalRenderer() error = nullptr; renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatRGBA8Unorm_sRGB); +#ifdef CEMU_DEBUG_ASSERT + renderPipelineDescriptor->setLabel(GetLabel("Present pipeline sRGB", renderPipelineDescriptor)); +#endif m_presentPipelineSRGB = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); renderPipelineDescriptor->release(); if (error) @@ -971,6 +983,9 @@ MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL:: auto commandBuffer = GetCommandBuffer(); auto renderCommandEncoder = commandBuffer->renderCommandEncoder(renderPassDescriptor); +#ifdef CEMU_DEBUG_ASSERT + renderCommandEncoder->setLabel(GetLabel("Temporary render command encoder", renderCommandEncoder)); +#endif m_commandEncoder = renderCommandEncoder; m_encoderType = MetalEncoderType::Render; @@ -1024,6 +1039,9 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr m_state.m_lastUsedFBO = m_state.m_activeFBO; auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO->GetRenderPassDescriptor()); +#ifdef CEMU_DEBUG_ASSERT + renderCommandEncoder->setLabel(GetLabel("Render command encoder", renderCommandEncoder)); +#endif m_commandEncoder = renderCommandEncoder; m_encoderType = MetalEncoderType::Render; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 47c796bfd..465e93163 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -78,13 +78,13 @@ void RendererShaderMtl::CompileFragmentFunction(CachedFBOMtl* activeFBO) void RendererShaderMtl::Compile(const std::string& mslCode) { NS::Error* error = nullptr; - MTL::Library* library = m_mtlr->GetDevice()->newLibrary(NS::String::string(mslCode.c_str(), NS::ASCIIStringEncoding), nullptr, &error); + MTL::Library* library = 
m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), nullptr, &error); if (error) { printf("failed to create library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); error->release(); return; } - m_function = library->newFunction(NS::String::string("main0", NS::ASCIIStringEncoding)); + m_function = library->newFunction(ToNSString("main0")); library->release(); } From 83a08b224726c114a7bd0107f9e8f84df0efac10 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 17 Aug 2024 17:51:37 +0200 Subject: [PATCH 079/368] fix: some Metal validation errors --- .../HW/Latte/Renderer/Metal/CachedFBOMtl.cpp | 3 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 11 +- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 1 + .../Renderer/Metal/MetalMemoryManager.cpp | 37 ++-- .../Latte/Renderer/Metal/MetalMemoryManager.h | 8 +- .../Renderer/Metal/MetalPipelineCache.cpp | 7 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 193 +++++++++++++----- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 15 +- .../Renderer/Metal/UtilityShaderSource.h | 40 ++-- 9 files changed, 216 insertions(+), 99 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp index 1c02f7b40..a9e673f62 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp @@ -1,5 +1,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Metal/MTLRenderPass.hpp" void CachedFBOMtl::CreateRenderPass() @@ -30,7 +31,7 @@ void CachedFBOMtl::CreateRenderPass() depthAttachment->setStoreAction(MTL::StoreActionStore); // setup stencil attachment - if (depthBuffer.hasStencil) + if (depthBuffer.hasStencil && GetMtlPixelFormatInfo(depthBuffer.texture->format, true).hasStencil) { auto stencilAttachment = m_renderPassDescriptor->stencilAttachment(); 
stencilAttachment->setTexture(textureView->GetRGBAView()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 0538650a3..704cf883a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -74,11 +74,12 @@ std::map MTL_COLOR_FORMAT_TABLE = { }; std::map MTL_DEPTH_FORMAT_TABLE = { - {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, MetalDataType::NONE, 4}}, // TODO: not supported on Apple sillicon, maybe find something else - {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 5}}, - {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, MetalDataType::NONE, 2}}, - {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, MetalDataType::NONE, 4}}, + // TODO: one of these 2 formats is not supported on Apple silicon + {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}}, + {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}}, + {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 5, {1, 1}, true}}, + {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, MetalDataType::NONE, 2, {1, 1}}}, + {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, MetalDataType::NONE, 4, {1, 1}}}, }; const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index c3f697bb3..5fcd2d229 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -29,6 +29,7 @@ struct MetalPixelFormatInfo { 
MetalDataType dataType; size_t bytesPerBlock; Uvec2 blockTexelSize = {1, 1}; + bool hasStencil = false; }; const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 63ded223d..fb9419d64 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -3,10 +3,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" #include "Common/precompiled.h" -#include "Foundation/NSRange.hpp" -#include "Metal/MTLRenderCommandEncoder.hpp" - -const size_t BUFFER_ALLOCATION_SIZE = 8 * 1024 * 1024; MetalBufferAllocator::~MetalBufferAllocator() { @@ -16,10 +12,10 @@ MetalBufferAllocator::~MetalBufferAllocator() } } -MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, size_t alignment) +MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) { // Align the size - size = Align(size, alignment); + size = Align(size, 16); // First, try to find a free range for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) @@ -45,7 +41,8 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, siz } // If no free range was found, allocate a new buffer - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(std::max(size, BUFFER_ALLOCATION_SIZE), MTL::ResourceStorageModeShared); + m_allocationSize = std::max(m_allocationSize, size); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, MTL::ResourceStorageModeShared); #ifdef CEMU_DEBUG_ASSERT buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); #endif @@ -58,16 +55,20 @@ MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size, siz m_buffers.push_back(buffer); // If the buffer is larger than the requested size, add the 
remaining space to the free buffer ranges - if (size < BUFFER_ALLOCATION_SIZE) + if (size < m_allocationSize) { MetalBufferRange range; range.bufferIndex = allocation.bufferIndex; range.offset = size; - range.size = BUFFER_ALLOCATION_SIZE - size; + range.size = m_allocationSize - size; m_freeBufferRanges.push_back(range); } + // Increase the allocation size for the next buffer + if (m_allocationSize < 128 * 1024 * 1024) + m_allocationSize *= 2; + return allocation; } @@ -91,10 +92,11 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu { size_t newStride = Align(stride, 4); size_t newSize = vertexBufferRange.size / stride * newStride; - restrideInfo.allocation = m_bufferAllocator->GetBufferAllocation(newSize, 4); + restrideInfo.allocation = m_bufferAllocator->GetBufferAllocation(newSize); + buffer = m_bufferAllocator->GetBuffer(restrideInfo.allocation.bufferIndex); //uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; - //uint8* newPtr = (uint8*)restrideInfo.buffer->contents(); + //uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.bufferOffset; //for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) //{ @@ -123,9 +125,18 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu renderCommandEncoder->setVertexBytes(&strideData, sizeof(strideData), GET_HELPER_BUFFER_BINDING(2)); m_mtlr->GetEncoderState().m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(2)] = INVALID_OFFSET; - renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), vertexBufferRange.size / stride); + // TODO: remove + uint32 vertexCount = vertexBufferRange.size / stride; + if (vertexCount * strideData.oldStride > buffers[0]->length() - offsets[0]) { + throw std::runtime_error("Source buffer overflow (" + std::to_string(vertexCount) + " * " + std::to_string(strideData.oldStride) + " > " + std::to_string(buffers[0]->length()) + " - " + 
std::to_string(offsets[0]) + ")"); + } + if (vertexCount * strideData.newStride > buffers[1]->length() - offsets[1]) { + throw std::runtime_error("Destination buffer overflow (" + std::to_string(vertexCount) + " * " + std::to_string(strideData.newStride) + " > " + std::to_string(buffers[1]->length()) + " - " + std::to_string(offsets[1]) + ")"); + } + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangleStrip, NS::UInteger(0), vertexBufferRange.size / stride); - // TODO: do the barrier in one call? + // TODO: do the barriers in one call? MTL::Resource* barrierBuffers[] = {buffer}; renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index d3588fd56..9bffe9f25 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -47,11 +47,13 @@ class MetalBufferAllocator return m_buffers[bufferIndex]; } - MetalBufferAllocation GetBufferAllocation(size_t size, size_t alignment); + MetalBufferAllocation GetBufferAllocation(size_t size); private: class MetalRenderer* m_mtlr; + size_t m_allocationSize = 8 * 1024 * 1024; + std::vector m_buffers; std::vector m_freeBufferRanges; }; @@ -139,9 +141,9 @@ class MetalMemoryManager return m_bufferAllocator/*s[bufferAllocatorIndex]*/.GetBuffer(bufferIndex); } - MetalBufferAllocation GetBufferAllocation(size_t size, size_t alignment) + MetalBufferAllocation GetBufferAllocation(size_t size) { - auto allocation = m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.GetBufferAllocation(size, alignment); + auto allocation = m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.GetBufferAllocation(size); //allocation.bufferIndex |= (m_bufferAllocatorIndex << bufferAllocatorIndexShift); return allocation; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 616ceeb19..94ab37210 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -243,6 +243,13 @@ uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchSh stateHash = std::rotl(stateHash, 7); } + if (activeFBO->depthBuffer.texture) + { + auto textureView = static_cast(activeFBO->depthBuffer.texture); + stateHash += textureView->GetRGBAView()->pixelFormat(); + stateHash = std::rotl(stateHash, 7); + } + for (auto& group : fetchShader->bufferGroups) { uint32 bufferStride = group.getCurrentBufferStride(lcr.GetRawView()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index bdd3c2acf..a9245383c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -17,10 +17,6 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" -#include "HW/Latte/Core/Latte.h" -#include "HW/Latte/Renderer/Metal/MetalCommon.h" -#include "Metal/MTLRenderCommandEncoder.hpp" -#include "Metal/MTLRenderPass.hpp" #include "gui/guiWrapper.h" #define COMMIT_TRESHOLD 256 @@ -34,14 +30,38 @@ MetalRenderer::MetalRenderer() m_device = MTL::CreateSystemDefaultDevice(); m_commandQueue = m_device->newCommandQueue(); + // Resources MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); +#ifdef CEMU_DEBUG_ASSERT + samplerDescriptor->setLabel(GetLabel("Nearest sampler state", samplerDescriptor)); +#endif m_nearestSampler = m_device->newSamplerState(samplerDescriptor); samplerDescriptor->setMinFilter(MTL::SamplerMinMagFilterLinear); samplerDescriptor->setMagFilter(MTL::SamplerMinMagFilterLinear); +#ifdef CEMU_DEBUG_ASSERT + samplerDescriptor->setLabel(GetLabel("Linear sampler state", samplerDescriptor)); +#endif 
m_linearSampler = m_device->newSamplerState(samplerDescriptor); samplerDescriptor->release(); + // Null resources + MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init(); + textureDescriptor->setTextureType(MTL::TextureType1D); + textureDescriptor->setWidth(4); + m_nullTexture1D = m_device->newTexture(textureDescriptor); +#ifdef CEMU_DEBUG_ASSERT + m_nullTexture1D->setLabel(GetLabel("Null texture 1D", m_nullTexture1D)); +#endif + + textureDescriptor->setTextureType(MTL::TextureType2D); + textureDescriptor->setHeight(4); + m_nullTexture2D = m_device->newTexture(textureDescriptor); +#ifdef CEMU_DEBUG_ASSERT + m_nullTexture2D->setLabel(GetLabel("Null texture 2D", m_nullTexture2D)); +#endif + textureDescriptor->release(); + m_memoryManager = new MetalMemoryManager(this); m_pipelineCache = new MetalPipelineCache(this); m_depthStencilCache = new MetalDepthStencilCache(this); @@ -296,22 +316,15 @@ void MetalRenderer::AppendOverlayDebugInfo() debug_printf("MetalRenderer::AppendOverlayDebugInfo not implemented\n"); } +// TODO: halfZ void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) { m_state.m_viewport = MTL::Viewport{x, y, width, height, nearZ, farZ}; - if (m_encoderType == MetalEncoderType::Render) - { - static_cast(m_commandEncoder)->setViewport(m_state.m_viewport); - } } void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) { - m_state.m_scissor = MTL::ScissorRect{NS::UInteger(scissorX), NS::UInteger(scissorY), NS::UInteger(scissorWidth), NS::UInteger(scissorHeight)}; - if (m_encoderType == MetalEncoderType::Render) - { - static_cast(m_commandEncoder)->setScissorRect(m_state.m_scissor); - } + m_state.m_scissor = MTL::ScissorRect{(uint32)scissorX, (uint32)scissorY, (uint32)scissorWidth, (uint32)scissorHeight}; } LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) @@ -396,7 +409,7 
@@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl depthAttachment->setSlice(sliceIndex); depthAttachment->setLevel(mipIndex); } - if (clearStencil) + if (clearStencil && GetMtlPixelFormatInfo(hostTexture->format, true).hasStencil) { auto stencilAttachment = renderPassDescriptor->stencilAttachment(); stencilAttachment->setTexture(mtlTexture); @@ -854,6 +867,33 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 encoderState.m_frontFaceWinding = frontFaceWinding; } + // Viewport + if (m_state.m_viewport.originX != encoderState.m_viewport.originX || + m_state.m_viewport.originY != encoderState.m_viewport.originY || + m_state.m_viewport.width != encoderState.m_viewport.width || + m_state.m_viewport.height != encoderState.m_viewport.height || + m_state.m_viewport.znear != encoderState.m_viewport.znear || + m_state.m_viewport.zfar != encoderState.m_viewport.zfar) + { + renderCommandEncoder->setViewport(m_state.m_viewport); + + encoderState.m_viewport = m_state.m_viewport; + } + + // Scissor + if (m_state.m_scissor.x != encoderState.m_scissor.x || + m_state.m_scissor.y != encoderState.m_scissor.y || + m_state.m_scissor.width != encoderState.m_scissor.width || + m_state.m_scissor.height != encoderState.m_scissor.height) + { + encoderState.m_scissor = m_state.m_scissor; + + // TODO: clamp scissor to render target dimensions + //scissor.width = ; + //scissor.height = ; + renderCommandEncoder->setScissorRect(encoderState.m_scissor); + } + // Resources // Index buffer @@ -935,7 +975,7 @@ void MetalRenderer::draw_endSequence() void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { - auto allocation = m_memoryManager->GetBufferAllocation(size, 4); + auto allocation = m_memoryManager->GetBufferAllocation(size); offset = allocation.bufferOffset; bufferIndex = allocation.bufferIndex; @@ -1017,7 +1057,7 @@ MTL::RenderCommandEncoder* 
MetalRenderer::GetRenderCommandEncoder(bool forceRecr if (!needsNewRenderPass) { - if (m_state.m_activeFBO->depthBuffer.texture && m_state.m_activeFBO->depthBuffer.texture != m_state.m_lastUsedFBO->depthBuffer.texture) + if (m_state.m_activeFBO->depthBuffer.texture && (m_state.m_activeFBO->depthBuffer.texture != m_state.m_lastUsedFBO->depthBuffer.texture || ( m_state.m_activeFBO->depthBuffer.hasStencil && !m_state.m_lastUsedFBO->depthBuffer.hasStencil))) { needsNewRenderPass = true; } @@ -1155,7 +1195,7 @@ bool MetalRenderer::AcquireNextDrawable(bool mainWindow) m_drawable = m_metalLayer->nextDrawable(); if (!m_drawable) { - printf("failed to acquire next drawable\n"); + debug_printf("failed to acquire next drawable\n"); return false; } @@ -1191,13 +1231,6 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE UNREACHABLE; } - auto textureView = m_state.m_textures[hostTextureUnit]; - if (!textureView) - { - debug_printf("invalid bound texture view %u\n", hostTextureUnit); - continue; - } - // TODO: uncomment uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i;//shader->resourceMapping.textureUnitToBindingPoint[hostTextureUnit]; if (binding >= MAX_MTL_TEXTURES) @@ -1206,37 +1239,89 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE continue; } + auto textureView = m_state.m_textures[hostTextureUnit]; + if (!textureView) + { + // TODO: don't bind if already bound + if (textureDim == Latte::E_DIM::DIM_1D) + { + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + renderCommandEncoder->setVertexTexture(m_nullTexture1D, binding); + renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentTexture(m_nullTexture1D, binding); + renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); + break; + } + default: + UNREACHABLE; + } + } + else + { + switch 
(shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + renderCommandEncoder->setVertexTexture(m_nullTexture2D, binding); + renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentTexture(m_nullTexture2D, binding); + renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); + break; + } + default: + UNREACHABLE; + } + } + continue; + } + LatteTexture* baseTexture = textureView->baseTexture; uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; + MTL::SamplerState* sampler; if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) { uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shader->shaderType); - auto sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, samplerIndex); - - auto& boundSampler = m_state.m_encoderState.m_samplers[mtlShaderType][binding]; - if (sampler != boundSampler) - { - boundSampler = sampler; - - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexSamplerState(sampler, binding); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentSamplerState(sampler, binding); - break; - } - default: - UNREACHABLE; - } - } + sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, samplerIndex); + } + else + { + sampler = m_nearestSampler; } + auto& boundSampler = m_state.m_encoderState.m_samplers[mtlShaderType][binding]; + if (sampler != boundSampler) + { + boundSampler = sampler; + + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + renderCommandEncoder->setVertexSamplerState(sampler, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentSamplerState(sampler, binding); + break; + } + default: + UNREACHABLE; + } + } + // get texture register word 0 uint32 word4 = 
LatteGPUState.contextRegister[texUnitRegIndex + 4]; auto& boundTexture = m_state.m_encoderState.m_textures[mtlShaderType][binding]; @@ -1347,16 +1432,22 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE } */ + // TODO: uncomment + //auto supportBuffer = m_memoryManager->GetBufferAllocation(sizeof(supportBufferData)); + //memcpy(supportBuffer.data, supportBufferData, sizeof(supportBufferData)); + switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: { + //renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBuffer(supportBuffer.bufferIndex), supportBuffer.bufferOffset, MTL_SUPPORT_BUFFER_BINDING); renderCommandEncoder->setVertexBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); break; } case LatteConst::ShaderType::Pixel: { - renderCommandEncoder->setFragmentBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); + //renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBuffer(supportBuffer.bufferIndex), supportBuffer.bufferOffset, MTL_SUPPORT_BUFFER_BINDING); + renderCommandEncoder->setFragmentBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); break; } default: @@ -1428,12 +1519,6 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE void MetalRenderer::RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder) { - // Viewport - renderCommandEncoder->setViewport(m_state.m_viewport); - - // Scissor - renderCommandEncoder->setScissorRect(m_state.m_scissor); - // Vertex buffers for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index f315963e9..664e8815a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -7,6 +7,7 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include 
"Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" +#include "Metal/MTLRenderCommandEncoder.hpp" struct MetalBoundBuffer { @@ -44,6 +45,8 @@ struct MetalEncoderState MTL::DepthStencilState* m_depthStencilState = nullptr; MTL::CullMode m_cullMode = MTL::CullModeNone; MTL::Winding m_frontFaceWinding = MTL::WindingClockwise; + MTL::Viewport m_viewport; + MTL::ScissorRect m_scissor; uint32 m_stencilRefFront = 0; uint32 m_stencilRefBack = 0; uint32 m_depthBias = 0; @@ -74,8 +77,8 @@ struct MetalState class LatteTextureViewMtl* m_textures[64] = {nullptr}; size_t m_uniformBufferOffsets[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; - MTL::Viewport m_viewport = {0, 0, 0, 0, 0, 0}; - MTL::ScissorRect m_scissor = {0, 0, 0, 0}; + MTL::Viewport m_viewport; + MTL::ScissorRect m_scissor; }; struct MetalCommandBuffer @@ -290,6 +293,8 @@ class MetalRenderer : public Renderer { m_state.m_encoderState = {}; + // TODO: set viewport and scissor to render target dimensions if render commands + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) { for (uint32 j = 0; j < MAX_MTL_TEXTURES; j++) @@ -350,10 +355,14 @@ class MetalRenderer : public Renderer class MetalHybridComputePipeline* m_copyTextureToTexturePipeline; class MetalHybridComputePipeline* m_restrideBufferPipeline; - // Basic + // Resources MTL::SamplerState* m_nearestSampler; MTL::SamplerState* m_linearSampler; + // Null resources + MTL::Texture* m_nullTexture1D; + MTL::Texture* m_nullTexture2D; + // Texture readback MTL::Buffer* m_readbackBuffer; uint32 m_readbackBufferWriteOffset = 0; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index 2e94807d3..c298150e9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -7,45 +7,45 @@ constexpr const char* utilityShaderSource = _STRINGIFY(( constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; 
struct VertexOut { - float4 position [[position]]; - float2 texCoord; + float4 position [[position]]; + float2 texCoord; }; vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { - VertexOut out; - out.position = float4(positions[vid], 0.0, 1.0); - out.texCoord = positions[vid] * 0.5 + 0.5; - out.texCoord.y = 1.0 - out.texCoord.y; + VertexOut out; + out.position = float4(positions[vid], 0.0, 1.0); + out.texCoord = positions[vid] * 0.5 + 0.5; + out.texCoord.y = 1.0 - out.texCoord.y; - return out; + return out; } fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], sampler samplr [[sampler(0)]]) { - return tex.sample(samplr, in.texCoord); + return tex.sample(samplr, in.texCoord); } struct CopyParams { - uint width; - uint srcMip; - uint srcSlice; - uint dstMip; - uint dstSlice; + uint width; + uint srcMip; + uint srcSlice; + uint dstMip; + uint dstSlice; }; vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(GET_TEXTURE_BINDING(0))]], texture2d_array dst [[texture(GET_TEXTURE_BINDING(1))]], constant CopyParams& params [[buffer(GET_BUFFER_BINDING(0))]]) { - uint2 coord = uint2(vid % params.width, vid / params.width); - return dst.write(float4(src.read(coord, params.srcSlice, params.srcMip).r, 0.0, 0.0, 0.0), coord, params.dstSlice, params.dstMip); + uint2 coord = uint2(vid % params.width, vid / params.width); + return dst.write(float4(src.read(coord, params.srcSlice, params.srcMip).r, 0.0, 0.0, 0.0), coord, params.dstSlice, params.dstMip); } struct RestrideParams { - uint oldStride; - uint newStride; + uint oldStride; + uint newStride; }; /* TODO: use uint32? 
Since that would require less iterations */ vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]], constant RestrideParams& params [[buffer(GET_BUFFER_BINDING(2))]]) { - for (uint32_t i = 0; i < params.oldStride; i++) { - dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; - } + for (uint32_t i = 0; i < params.oldStride; i++) { + dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; + } } )); From b8e9bb355c3c2a30b960deb084cd63e70b938e63 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 17 Aug 2024 19:05:00 +0200 Subject: [PATCH 080/368] fix: GPU hangs (hack) --- .../LatteDecompilerEmitMSL.cpp | 4 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 53 +++++++++++++++++-- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 95e91d40d..a46da96c4 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3284,6 +3284,8 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La continue; uint32 u32Offset = streamWrite->exportArrayBase + i; + // HACK: disable streamout temporarily, since it causes GPU hangs + continue; src->addFmt("sb[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); src->add(" = "); @@ -3397,6 +3399,8 @@ static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, La continue; uint32 u32Offset = cfInstruction->exportArrayBase + i; + // HACK: disable streamout temporarily, since it causes GPU hangs + continue; src->addFmt("sb[sbBase{} + {}]", streamoutBufferIndex, u32Offset); src->add(" = "); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 
a9245383c..ca4c9938b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -591,8 +591,6 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so // TODO: do the copy in a compute shader debug_printf("surfaceCopy_copySurfaceWithFormatConversion: no active render command encoder, skipping copy\n"); } - - // TODO: restore state } void MetalRenderer::bufferCache_init(const sint32 bufferSize) @@ -745,7 +743,13 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Depth stencil state - MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); + auto depthStencilContext = LatteGPUState.contextNew; + + // Disable depth write when there is no depth attachment + if (!m_state.m_lastUsedFBO->depthBuffer.texture) + depthStencilContext.DB_DEPTH_CONTROL.set_Z_WRITE_ENABLE(false); + + MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(depthStencilContext); if (depthStencilState != encoderState.m_depthStencilState) { renderCommandEncoder->setDepthStencilState(depthStencilState); @@ -1286,6 +1290,49 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE continue; } + if (textureDim == Latte::E_DIM::DIM_1D && (textureView->dim != Latte::E_DIM::DIM_1D)) + { + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + renderCommandEncoder->setVertexTexture(m_nullTexture1D, binding); + renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentTexture(m_nullTexture1D, binding); + renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); + break; + } + default: + UNREACHABLE; + } + continue; + } + else if (textureDim == Latte::E_DIM::DIM_2D && 
(textureView->dim != Latte::E_DIM::DIM_2D && textureView->dim != Latte::E_DIM::DIM_2D_MSAA)) + { + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + renderCommandEncoder->setVertexTexture(m_nullTexture2D, binding); + renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentTexture(m_nullTexture2D, binding); + renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); + break; + } + default: + UNREACHABLE; + } + continue; + } + LatteTexture* baseTexture = textureView->baseTexture; uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; From 07989d828c3a7989624fff638ffc10f58cfaf8c4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 17 Aug 2024 19:54:55 +0200 Subject: [PATCH 081/368] fix: performance regression --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ca4c9938b..94d7d916d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -161,6 +161,9 @@ MetalRenderer::~MetalRenderer() delete m_samplerCache; delete m_memoryManager; + m_nullTexture1D->release(); + m_nullTexture2D->release(); + m_nearestSampler->release(); m_linearSampler->release(); @@ -743,13 +746,14 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Depth stencil state - auto depthStencilContext = LatteGPUState.contextNew; + // TODO: implement this somehow + //auto depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL; // Disable depth write when there is no depth attachment - if (!m_state.m_lastUsedFBO->depthBuffer.texture) - 
depthStencilContext.DB_DEPTH_CONTROL.set_Z_WRITE_ENABLE(false); + //if (!m_state.m_lastUsedFBO->depthBuffer.texture) + // depthControl.set_Z_WRITE_ENABLE(false); - MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(depthStencilContext); + MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); if (depthStencilState != encoderState.m_depthStencilState) { renderCommandEncoder->setDepthStencilState(depthStencilState); From 265785772aa07da9c1bdbca3fa7c1ba5910b12ed Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 18 Aug 2024 11:13:49 +0200 Subject: [PATCH 082/368] rework buffer allocator --- .../Renderer/Metal/MetalMemoryManager.cpp | 83 +----- .../Latte/Renderer/Metal/MetalMemoryManager.h | 243 ++++++++++++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 32 ++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 23 +- 4 files changed, 243 insertions(+), 138 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index fb9419d64..c102bcf29 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,77 +1,8 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" #include "Common/precompiled.h" -MetalBufferAllocator::~MetalBufferAllocator() -{ - for (auto buffer : m_buffers) - { - buffer->release(); - } -} - -MetalBufferAllocation MetalBufferAllocator::GetBufferAllocation(size_t size) -{ - // Align the size - size = Align(size, 16); - - // First, try to find a free range - for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) - { - auto& range = m_freeBufferRanges[i]; - if (size <= range.size) - { - MetalBufferAllocation allocation; - 
allocation.bufferIndex = range.bufferIndex; - allocation.bufferOffset = range.offset; - allocation.data = (uint8*)m_buffers[range.bufferIndex]->contents() + range.offset; - - range.offset += size; - range.size -= size; - - if (range.size == 0) - { - m_freeBufferRanges.erase(m_freeBufferRanges.begin() + i); - } - - return allocation; - } - } - - // If no free range was found, allocate a new buffer - m_allocationSize = std::max(m_allocationSize, size); - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, MTL::ResourceStorageModeShared); -#ifdef CEMU_DEBUG_ASSERT - buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); -#endif - - MetalBufferAllocation allocation; - allocation.bufferIndex = m_buffers.size(); - allocation.bufferOffset = 0; - allocation.data = buffer->contents(); - - m_buffers.push_back(buffer); - - // If the buffer is larger than the requested size, add the remaining space to the free buffer ranges - if (size < m_allocationSize) - { - MetalBufferRange range; - range.bufferIndex = allocation.bufferIndex; - range.offset = size; - range.size = m_allocationSize - size; - - m_freeBufferRanges.push_back(range); - } - - // Increase the allocation size for the next buffer - if (m_allocationSize < 128 * 1024 * 1024) - m_allocationSize *= 2; - - return allocation; -} - MetalVertexBufferCache::~MetalVertexBufferCache() { } @@ -87,13 +18,13 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu return {bufferCache, vertexBufferRange.offset}; } - auto buffer = m_bufferAllocator->GetBuffer(restrideInfo.allocation.bufferIndex); + MTL::Buffer* buffer; if (restrideInfo.memoryInvalidated || stride != restrideInfo.lastStride) { size_t newStride = Align(stride, 4); size_t newSize = vertexBufferRange.size / stride * newStride; - restrideInfo.allocation = m_bufferAllocator->GetBufferAllocation(newSize); - buffer = m_bufferAllocator->GetBuffer(restrideInfo.allocation.bufferIndex); + restrideInfo.allocation = 
m_bufferAllocator.GetBufferAllocation(newSize); + buffer = m_bufferAllocator.GetBuffer(restrideInfo.allocation.bufferIndex); //uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; //uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.bufferOffset; @@ -112,7 +43,7 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu m_mtlr->GetEncoderState().m_renderPipelineState = m_restrideBufferPipeline->GetRenderPipelineState(); MTL::Buffer* buffers[] = {bufferCache, buffer}; - size_t offsets[] = {vertexBufferRange.offset, restrideInfo.allocation.bufferOffset}; + size_t offsets[] = {vertexBufferRange.offset, restrideInfo.allocation.offset}; renderCommandEncoder->setVertexBuffers(buffers, offsets, NS::Range(GET_HELPER_BUFFER_BINDING(0), 2)); m_mtlr->GetEncoderState().m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(0)] = INVALID_OFFSET; m_mtlr->GetEncoderState().m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(1)] = INVALID_OFFSET; @@ -149,8 +80,12 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu restrideInfo.memoryInvalidated = false; restrideInfo.lastStride = newStride; } + else + { + buffer = m_bufferAllocator.GetBuffer(restrideInfo.allocation.bufferIndex); + } - return {buffer, restrideInfo.allocation.bufferOffset}; + return {buffer, restrideInfo.allocation.offset}; } void MetalVertexBufferCache::MemoryRangeChanged(size_t offset, size_t size) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 9bffe9f25..ea8b7554c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -3,20 +3,7 @@ #include "Cafe/HW/Latte/ISA/LatteReg.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" - -//const uint32 bufferAllocatorIndexShift = 24; - 
-struct MetalBufferAllocation -{ - void* data; - uint32 bufferIndex; - size_t bufferOffset = INVALID_OFFSET; - - bool IsValid() const - { - return bufferOffset != INVALID_OFFSET; - } -}; +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" struct MetalBufferRange { @@ -25,51 +12,210 @@ struct MetalBufferRange size_t size; }; +template class MetalBufferAllocator { public: MetalBufferAllocator(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} - ~MetalBufferAllocator(); - void ResetTemporaryBuffers() + ~MetalBufferAllocator() { - m_freeBufferRanges.clear(); - - // Register the free ranges - for (uint32 i = 0; i < m_buffers.size(); i++) + for (auto buffer : m_buffers) { - m_freeBufferRanges.push_back({i, 0, m_buffers[i]->length()}); + buffer.m_buffer->release(); } } + void ResetAllocations() + { + m_freeBufferRanges.clear(); + for (uint32_t i = 0; i < m_buffers.size(); i++) + m_freeBufferRanges.push_back({i, 0, m_buffers[i].m_buffer->length()}); + } + MTL::Buffer* GetBuffer(uint32 bufferIndex) { - return m_buffers[bufferIndex]; + return m_buffers[bufferIndex].m_buffer; } - MetalBufferAllocation GetBufferAllocation(size_t size); + MetalBufferAllocation GetBufferAllocation(size_t size) + { + // Align the size + size = Align(size, 16); + + // First, try to find a free range + for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) + { + auto& range = m_freeBufferRanges[i]; + if (size <= range.size) + { + auto& buffer = m_buffers[range.bufferIndex]; + + MetalBufferAllocation allocation; + allocation.bufferIndex = range.bufferIndex; + allocation.offset = range.offset; + allocation.size = size; + allocation.data = (uint8*)buffer.m_buffer->contents() + range.offset; + + range.offset += size; + range.size -= size; + + if (range.size == 0) + { + m_freeBufferRanges.erase(m_freeBufferRanges.begin() + i); + } + + return allocation; + } + } -private: + // If no free range was found, allocate a new buffer + m_allocationSize = std::max(m_allocationSize, size); + 
MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, MTL::ResourceStorageModeShared); + #ifdef CEMU_DEBUG_ASSERT + buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); + #endif + + MetalBufferAllocation allocation; + allocation.bufferIndex = m_buffers.size(); + allocation.offset = 0; + allocation.size = size; + allocation.data = buffer->contents(); + + m_buffers.push_back({buffer}); + + // If the buffer is larger than the requested size, add the remaining space to the free buffer ranges + if (size < m_allocationSize) + { + MetalBufferRange range; + range.bufferIndex = allocation.bufferIndex; + range.offset = size; + range.size = m_allocationSize - size; + + m_freeBufferRanges.push_back(range); + } + + // Increase the allocation size for the next buffer + if (m_allocationSize < 128 * 1024 * 1024) + m_allocationSize *= 2; + + return allocation; + } + + void FreeAllocation(MetalBufferAllocation& allocation) + { + MetalBufferRange range; + range.bufferIndex = allocation.bufferIndex; + range.offset = allocation.offset; + range.size = allocation.size; + + allocation.offset = INVALID_OFFSET; + + // Find the correct position to insert the free range + for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) + { + auto& freeRange = m_freeBufferRanges[i]; + if (freeRange.bufferIndex == range.bufferIndex && freeRange.offset + freeRange.size == range.offset) + { + freeRange.size += range.size; + return; + } + } + + m_freeBufferRanges.push_back(range); + } + +protected: class MetalRenderer* m_mtlr; size_t m_allocationSize = 8 * 1024 * 1024; - std::vector m_buffers; + std::vector m_buffers; std::vector m_freeBufferRanges; }; -struct MetalRestridedBufferRange +struct MetalBuffer { - MTL::Buffer* buffer; - size_t offset; + MTL::Buffer* m_buffer; +}; + +typedef MetalBufferAllocator MetalDefaultBufferAllocator; + +struct MetalSyncedBuffer +{ + MTL::Buffer* m_buffer; + std::vector m_commandBuffers; +}; + +class MetalTemporaryBufferAllocator : public 
MetalBufferAllocator +{ +public: + MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer) {} + + void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) + { + m_activeCommandBuffer = commandBuffer; + } + + void CommandBufferFinished(MTL::CommandBuffer* commandBuffer) + { + for (uint32_t i = 0; i < m_buffers.size(); i++) + { + auto& buffer = m_buffers[i]; + for (uint32_t j = 0; j < buffer.m_commandBuffers.size(); j++) + { + if (commandBuffer == buffer.m_commandBuffers[j]) + { + if (buffer.m_commandBuffers.size() == 1) + { + // All command buffers using it have finished execution, we can use it again + m_freeBufferRanges.push_back({i, 0, buffer.m_buffer->length()}); + + buffer.m_commandBuffers.clear(); + } + else + { + buffer.m_commandBuffers.erase(buffer.m_commandBuffers.begin() + j); + } + break; + } + } + } + } + + // TODO: should this be here? It's just to ensure safety + MTL::Buffer* GetBuffer(uint32 bufferIndex) + { + auto& buffer = m_buffers[bufferIndex]; + if (buffer.m_commandBuffers.back() != m_activeCommandBuffer) + buffer.m_commandBuffers.push_back(m_activeCommandBuffer); + + return buffer.m_buffer; + } + + MetalBufferAllocation GetBufferAllocation(size_t size) + { + // TODO: remove this + if (!m_activeCommandBuffer) + throw std::runtime_error("No active command buffer when allocating a buffer!"); + + auto allocation = MetalBufferAllocator::GetBufferAllocation(size); + + auto& buffer = m_buffers[allocation.bufferIndex]; + if (buffer.m_commandBuffers.empty() || buffer.m_commandBuffers.back() != m_activeCommandBuffer) + buffer.m_commandBuffers.push_back(m_activeCommandBuffer); + + return allocation; + } + +private: + MTL::CommandBuffer* m_activeCommandBuffer = nullptr; }; -// TODO: use one big buffer for all the restrided vertex buffers? 
-struct MetalRestrideInfo +struct MetalRestridedBufferRange { - bool memoryInvalidated = true; - size_t lastStride = 0; - MetalBufferAllocation allocation{}; + MTL::Buffer* buffer; + size_t offset; }; struct MetalVertexBufferRange @@ -84,7 +230,7 @@ class MetalVertexBufferCache public: friend class MetalMemoryManager; - MetalVertexBufferCache(class MetalRenderer* metalRenderer, MetalBufferAllocator* bufferAllocator) : m_mtlr{metalRenderer}, m_bufferAllocator{bufferAllocator} {} + MetalVertexBufferCache(class MetalRenderer* metalRenderer, MetalDefaultBufferAllocator& bufferAllocator) : m_mtlr{metalRenderer}, m_bufferAllocator{bufferAllocator} {} ~MetalVertexBufferCache(); void SetRestrideBufferPipeline(class MetalHybridComputePipeline* restrideBufferPipeline) @@ -100,6 +246,8 @@ class MetalVertexBufferCache void UntrackVertexBuffer(uint32 bufferIndex) { auto& range = m_bufferRanges[bufferIndex]; + //if (range.restrideInfo->allocation.offset != INVALID_OFFSET) + // m_bufferAllocator.FreeAllocation(range.restrideInfo->allocation); range.offset = INVALID_OFFSET; } @@ -107,7 +255,7 @@ class MetalVertexBufferCache private: class MetalRenderer* m_mtlr; - MetalBufferAllocator* m_bufferAllocator; + MetalDefaultBufferAllocator& m_bufferAllocator; class MetalHybridComputePipeline* m_restrideBufferPipeline = nullptr; @@ -119,7 +267,7 @@ class MetalVertexBufferCache class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, &m_bufferAllocator) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer), m_framePersistentBufferAllocator(metalRenderer), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} ~MetalMemoryManager(); // Pipelines @@ -128,25 +276,19 @@ class MetalMemoryManager 
m_vertexBufferCache.SetRestrideBufferPipeline(restrideBufferPipeline); } - void ResetTemporaryBuffers() + MetalDefaultBufferAllocator& GetBufferAllocator() { - m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.ResetTemporaryBuffers(); - //m_bufferAllocatorIndex = (m_bufferAllocatorIndex + 1) % 2; + return m_bufferAllocator; } - MTL::Buffer* GetBuffer(uint32 bufferIndex) + MetalDefaultBufferAllocator& GetFramePersistentBufferAllocator() { - //uint32 bufferAllocatorIndex = (bufferIndex >> bufferAllocatorIndexShift); - - return m_bufferAllocator/*s[bufferAllocatorIndex]*/.GetBuffer(bufferIndex); + return m_framePersistentBufferAllocator; } - MetalBufferAllocation GetBufferAllocation(size_t size) + MetalTemporaryBufferAllocator& GetTemporaryBufferAllocator() { - auto allocation = m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.GetBufferAllocation(size); - //allocation.bufferIndex |= (m_bufferAllocatorIndex << bufferAllocatorIndexShift); - - return allocation; + return m_tempBufferAllocator; } MTL::Buffer* GetBufferCache() @@ -182,8 +324,9 @@ class MetalMemoryManager std::vector m_textureUploadBuffer; - MetalBufferAllocator m_bufferAllocator;//s[2]; - //uint8 m_bufferAllocatorIndex = 0; + MetalDefaultBufferAllocator m_bufferAllocator; + MetalDefaultBufferAllocator m_framePersistentBufferAllocator; + MetalTemporaryBufferAllocator m_tempBufferAllocator; MetalVertexBufferCache m_vertexBufferCache; MTL::Buffer* m_bufferCache = nullptr; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 94d7d916d..20123f0df 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1,5 +1,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include 
"Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" @@ -253,8 +254,8 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) m_commandBuffers[i].m_commandBuffer->release(); m_commandBuffers.clear(); - // Reset temporary buffers - m_memoryManager->ResetTemporaryBuffers(); + // Release frame persistent buffers + m_memoryManager->GetFramePersistentBufferAllocator().ResetAllocations(); } // TODO: use `shader` for drawing @@ -953,7 +954,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if (hostIndexType != INDEX_TYPE::NONE) { auto mtlIndexType = GetMtlIndexType(hostIndexType); - MTL::Buffer* indexBuffer = m_memoryManager->GetBuffer(indexBufferIndex); + MTL::Buffer* indexBuffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(indexBufferIndex); renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, indexBufferOffset, instanceCount, baseVertex, baseInstance); } else { @@ -983,8 +984,8 @@ void MetalRenderer::draw_endSequence() void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { - auto allocation = m_memoryManager->GetBufferAllocation(size); - offset = allocation.bufferOffset; + auto allocation = m_memoryManager->GetTemporaryBufferAllocator().GetBufferAllocation(size); + offset = allocation.offset; bufferIndex = allocation.bufferIndex; return allocation.data; @@ -1006,6 +1007,9 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer(); m_commandBuffers.push_back({mtlCommandBuffer}); + // Notify memory manager about the new command buffer + m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer); + return mtlCommandBuffer; } else @@ -1176,6 +1180,10 @@ void MetalRenderer::CommitCommandBuffer() auto& commandBuffer = m_commandBuffers.back(); if 
(!commandBuffer.m_commited) { + commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer* cmd) { + m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer); + }); + commandBuffer.m_commandBuffer->commit(); commandBuffer.m_commited = true; @@ -1483,22 +1491,22 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE } */ - // TODO: uncomment - //auto supportBuffer = m_memoryManager->GetBufferAllocation(sizeof(supportBufferData)); - //memcpy(supportBuffer.data, supportBufferData, sizeof(supportBufferData)); + auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); + auto supportBuffer = bufferAllocator.GetBufferAllocation(sizeof(supportBufferData)); + memcpy(supportBuffer.data, supportBufferData, sizeof(supportBufferData)); switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: { - //renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBuffer(supportBuffer.bufferIndex), supportBuffer.bufferOffset, MTL_SUPPORT_BUFFER_BINDING); - renderCommandEncoder->setVertexBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); + renderCommandEncoder->setVertexBuffer(bufferAllocator.GetBuffer(supportBuffer.bufferIndex), supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); + //renderCommandEncoder->setVertexBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); break; } case LatteConst::ShaderType::Pixel: { - //renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBuffer(supportBuffer.bufferIndex), supportBuffer.bufferOffset, MTL_SUPPORT_BUFFER_BINDING); - renderCommandEncoder->setFragmentBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); + renderCommandEncoder->setFragmentBuffer(bufferAllocator.GetBuffer(supportBuffer.bufferIndex), supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); + //renderCommandEncoder->setFragmentBytes(supportBufferData, sizeof(supportBufferData), 
MTL_SUPPORT_BUFFER_BINDING); break; } default: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 664e8815a..8d63f6daa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -6,8 +6,27 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" -#include "Metal/MTLRenderCommandEncoder.hpp" +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +struct MetalBufferAllocation +{ + void* data; + uint32 bufferIndex; + size_t offset = INVALID_OFFSET; + size_t size; + + bool IsValid() const + { + return offset != INVALID_OFFSET; + } +}; + +struct MetalRestrideInfo +{ + bool memoryInvalidated = true; + size_t lastStride = 0; + MetalBufferAllocation allocation{}; +}; struct MetalBoundBuffer { From 269e0721394cc9b87b08e301c0c5016b87d0aa8f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 18 Aug 2024 11:32:24 +0200 Subject: [PATCH 083/368] move buffer allocators to separate file & fix: high memory usage --- src/Cafe/CMakeLists.txt | 1 + .../Renderer/Metal/MetalBufferAllocator.h | 210 +++++++++++++++++ .../Latte/Renderer/Metal/MetalMemoryManager.h | 212 +----------------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 5 +- 4 files changed, 215 insertions(+), 213 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index fb4672d27..37bef0e85 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -554,6 +554,7 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/RendererShaderMtl.h HW/Latte/Renderer/Metal/CachedFBOMtl.cpp HW/Latte/Renderer/Metal/CachedFBOMtl.h + HW/Latte/Renderer/Metal/MetalBufferAllocator.h HW/Latte/Renderer/Metal/MetalMemoryManager.cpp HW/Latte/Renderer/Metal/MetalMemoryManager.h HW/Latte/Renderer/Metal/MetalPipelineCache.cpp diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h new file mode 100644 index 000000000..20467e654 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -0,0 +1,210 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +struct MetalBufferRange +{ + uint32 bufferIndex; + size_t offset; + size_t size; +}; + +template +class MetalBufferAllocator +{ +public: + MetalBufferAllocator(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + + ~MetalBufferAllocator() + { + for (auto buffer : m_buffers) + { + buffer.m_buffer->release(); + } + } + + void ResetAllocations() + { + m_freeBufferRanges.clear(); + for (uint32_t i = 0; i < m_buffers.size(); i++) + m_freeBufferRanges.push_back({i, 0, m_buffers[i].m_buffer->length()}); + } + + MTL::Buffer* GetBuffer(uint32 bufferIndex) + { + return m_buffers[bufferIndex].m_buffer; + } + + MetalBufferAllocation GetBufferAllocation(size_t size) + { + // Align the size + size = Align(size, 16); + + // First, try to find a free range + for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) + { + auto& range = m_freeBufferRanges[i]; + if (size <= range.size) + { + auto& buffer = m_buffers[range.bufferIndex]; + + MetalBufferAllocation allocation; + allocation.bufferIndex = range.bufferIndex; + allocation.offset = range.offset; + allocation.size = size; + allocation.data = (uint8*)buffer.m_buffer->contents() + range.offset; + + range.offset += size; + range.size -= size; + + if (range.size == 0) + { + m_freeBufferRanges.erase(m_freeBufferRanges.begin() + i); + } + + return allocation; + } + } + + // If no free range was found, allocate a new buffer + m_allocationSize = std::max(m_allocationSize, size); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, MTL::ResourceStorageModeShared); + #ifdef CEMU_DEBUG_ASSERT + buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); + #endif + + 
MetalBufferAllocation allocation; + allocation.bufferIndex = m_buffers.size(); + allocation.offset = 0; + allocation.size = size; + allocation.data = buffer->contents(); + + m_buffers.push_back({buffer}); + + // If the buffer is larger than the requested size, add the remaining space to the free buffer ranges + if (size < m_allocationSize) + { + MetalBufferRange range; + range.bufferIndex = allocation.bufferIndex; + range.offset = size; + range.size = m_allocationSize - size; + + m_freeBufferRanges.push_back(range); + } + + // Increase the allocation size for the next buffer + if (m_allocationSize < 128 * 1024 * 1024) + m_allocationSize *= 2; + + return allocation; + } + + void FreeAllocation(MetalBufferAllocation& allocation) + { + MetalBufferRange range; + range.bufferIndex = allocation.bufferIndex; + range.offset = allocation.offset; + range.size = allocation.size; + + allocation.offset = INVALID_OFFSET; + + // Find the correct position to insert the free range + for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) + { + auto& freeRange = m_freeBufferRanges[i]; + if (freeRange.bufferIndex == range.bufferIndex && freeRange.offset + freeRange.size == range.offset) + { + freeRange.size += range.size; + return; + } + } + + m_freeBufferRanges.push_back(range); + } + +protected: + class MetalRenderer* m_mtlr; + + size_t m_allocationSize = 8 * 1024 * 1024; + + std::vector m_buffers; + std::vector m_freeBufferRanges; +}; + +struct MetalBuffer +{ + MTL::Buffer* m_buffer; +}; + +typedef MetalBufferAllocator MetalDefaultBufferAllocator; + +struct MetalSyncedBuffer +{ + MTL::Buffer* m_buffer; + std::vector m_commandBuffers; +}; + +class MetalTemporaryBufferAllocator : public MetalBufferAllocator +{ +public: + MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer) {} + + void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) + { + m_activeCommandBuffer = commandBuffer; + } + + void 
CommandBufferFinished(MTL::CommandBuffer* commandBuffer) + { + for (uint32_t i = 0; i < m_buffers.size(); i++) + { + auto& buffer = m_buffers[i]; + for (uint32_t j = 0; j < buffer.m_commandBuffers.size(); j++) + { + if (commandBuffer == buffer.m_commandBuffers[j]) + { + if (buffer.m_commandBuffers.size() == 1) + { + // All command buffers using it have finished execution, we can use it again + m_freeBufferRanges.push_back({i, 0, buffer.m_buffer->length()}); + + buffer.m_commandBuffers.clear(); + } + else + { + buffer.m_commandBuffers.erase(buffer.m_commandBuffers.begin() + j); + } + break; + } + } + } + } + + // TODO: should this be here? It's just to ensure safety + MTL::Buffer* GetBuffer(uint32 bufferIndex) + { + auto& buffer = m_buffers[bufferIndex]; + if (buffer.m_commandBuffers.back() != m_activeCommandBuffer) + buffer.m_commandBuffers.push_back(m_activeCommandBuffer); + + return buffer.m_buffer; + } + + MetalBufferAllocation GetBufferAllocation(size_t size) + { + // TODO: remove this + if (!m_activeCommandBuffer) + throw std::runtime_error("No active command buffer when allocating a buffer!"); + + auto allocation = MetalBufferAllocator::GetBufferAllocation(size); + + auto& buffer = m_buffers[allocation.bufferIndex]; + if (buffer.m_commandBuffers.empty() || buffer.m_commandBuffers.back() != m_activeCommandBuffer) + buffer.m_commandBuffers.push_back(m_activeCommandBuffer); + + return allocation; + } + +private: + MTL::CommandBuffer* m_activeCommandBuffer = nullptr; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index ea8b7554c..cc89f5ce3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -1,216 +1,6 @@ #pragma once -#include "Cafe/HW/Latte/ISA/LatteReg.h" -#include "Cafe/HW/Latte/Core/LatteConst.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" -#include 
"Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" - -struct MetalBufferRange -{ - uint32 bufferIndex; - size_t offset; - size_t size; -}; - -template -class MetalBufferAllocator -{ -public: - MetalBufferAllocator(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} - - ~MetalBufferAllocator() - { - for (auto buffer : m_buffers) - { - buffer.m_buffer->release(); - } - } - - void ResetAllocations() - { - m_freeBufferRanges.clear(); - for (uint32_t i = 0; i < m_buffers.size(); i++) - m_freeBufferRanges.push_back({i, 0, m_buffers[i].m_buffer->length()}); - } - - MTL::Buffer* GetBuffer(uint32 bufferIndex) - { - return m_buffers[bufferIndex].m_buffer; - } - - MetalBufferAllocation GetBufferAllocation(size_t size) - { - // Align the size - size = Align(size, 16); - - // First, try to find a free range - for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) - { - auto& range = m_freeBufferRanges[i]; - if (size <= range.size) - { - auto& buffer = m_buffers[range.bufferIndex]; - - MetalBufferAllocation allocation; - allocation.bufferIndex = range.bufferIndex; - allocation.offset = range.offset; - allocation.size = size; - allocation.data = (uint8*)buffer.m_buffer->contents() + range.offset; - - range.offset += size; - range.size -= size; - - if (range.size == 0) - { - m_freeBufferRanges.erase(m_freeBufferRanges.begin() + i); - } - - return allocation; - } - } - - // If no free range was found, allocate a new buffer - m_allocationSize = std::max(m_allocationSize, size); - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, MTL::ResourceStorageModeShared); - #ifdef CEMU_DEBUG_ASSERT - buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); - #endif - - MetalBufferAllocation allocation; - allocation.bufferIndex = m_buffers.size(); - allocation.offset = 0; - allocation.size = size; - allocation.data = buffer->contents(); - - m_buffers.push_back({buffer}); - - // If the buffer is larger than the requested size, add the remaining space to the 
free buffer ranges - if (size < m_allocationSize) - { - MetalBufferRange range; - range.bufferIndex = allocation.bufferIndex; - range.offset = size; - range.size = m_allocationSize - size; - - m_freeBufferRanges.push_back(range); - } - - // Increase the allocation size for the next buffer - if (m_allocationSize < 128 * 1024 * 1024) - m_allocationSize *= 2; - - return allocation; - } - - void FreeAllocation(MetalBufferAllocation& allocation) - { - MetalBufferRange range; - range.bufferIndex = allocation.bufferIndex; - range.offset = allocation.offset; - range.size = allocation.size; - - allocation.offset = INVALID_OFFSET; - - // Find the correct position to insert the free range - for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) - { - auto& freeRange = m_freeBufferRanges[i]; - if (freeRange.bufferIndex == range.bufferIndex && freeRange.offset + freeRange.size == range.offset) - { - freeRange.size += range.size; - return; - } - } - - m_freeBufferRanges.push_back(range); - } - -protected: - class MetalRenderer* m_mtlr; - - size_t m_allocationSize = 8 * 1024 * 1024; - - std::vector m_buffers; - std::vector m_freeBufferRanges; -}; - -struct MetalBuffer -{ - MTL::Buffer* m_buffer; -}; - -typedef MetalBufferAllocator MetalDefaultBufferAllocator; - -struct MetalSyncedBuffer -{ - MTL::Buffer* m_buffer; - std::vector m_commandBuffers; -}; - -class MetalTemporaryBufferAllocator : public MetalBufferAllocator -{ -public: - MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer) {} - - void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) - { - m_activeCommandBuffer = commandBuffer; - } - - void CommandBufferFinished(MTL::CommandBuffer* commandBuffer) - { - for (uint32_t i = 0; i < m_buffers.size(); i++) - { - auto& buffer = m_buffers[i]; - for (uint32_t j = 0; j < buffer.m_commandBuffers.size(); j++) - { - if (commandBuffer == buffer.m_commandBuffers[j]) - { - if (buffer.m_commandBuffers.size() == 1) - { - // All 
command buffers using it have finished execution, we can use it again - m_freeBufferRanges.push_back({i, 0, buffer.m_buffer->length()}); - - buffer.m_commandBuffers.clear(); - } - else - { - buffer.m_commandBuffers.erase(buffer.m_commandBuffers.begin() + j); - } - break; - } - } - } - } - - // TODO: should this be here? It's just to ensure safety - MTL::Buffer* GetBuffer(uint32 bufferIndex) - { - auto& buffer = m_buffers[bufferIndex]; - if (buffer.m_commandBuffers.back() != m_activeCommandBuffer) - buffer.m_commandBuffers.push_back(m_activeCommandBuffer); - - return buffer.m_buffer; - } - - MetalBufferAllocation GetBufferAllocation(size_t size) - { - // TODO: remove this - if (!m_activeCommandBuffer) - throw std::runtime_error("No active command buffer when allocating a buffer!"); - - auto allocation = MetalBufferAllocator::GetBufferAllocation(size); - - auto& buffer = m_buffers[allocation.bufferIndex]; - if (buffer.m_commandBuffers.empty() || buffer.m_commandBuffers.back() != m_activeCommandBuffer) - buffer.m_commandBuffers.push_back(m_activeCommandBuffer); - - return allocation; - } - -private: - MTL::CommandBuffer* m_activeCommandBuffer = nullptr; -}; +#include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h" struct MetalRestridedBufferRange { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 20123f0df..46b5cfd01 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1492,8 +1492,9 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE */ auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - auto supportBuffer = bufferAllocator.GetBufferAllocation(sizeof(supportBufferData)); - memcpy(supportBuffer.data, supportBufferData, sizeof(supportBufferData)); + size_t size = shader->uniform.uniformRangeSize; + auto supportBuffer = bufferAllocator.GetBufferAllocation(size); + 
memcpy(supportBuffer.data, supportBufferData, size); switch (shader->shaderType) { From 485a652c85660d830dc6ac9e360322b1139e540d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 18 Aug 2024 14:37:39 +0200 Subject: [PATCH 084/368] use managed storage mode when dedicated memory --- src/Cafe/HW/Latte/Core/LatteIndices.cpp | 2 +- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 +- .../Renderer/Metal/MetalBufferAllocator.h | 8 +++-- .../Renderer/Metal/MetalMemoryManager.cpp | 4 ++- .../Latte/Renderer/Metal/MetalMemoryManager.h | 3 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 19 ++++++++--- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 32 +++++++++++++++---- .../HW/Latte/Renderer/OpenGL/OpenGLRenderer.h | 6 ++-- src/Cafe/HW/Latte/Renderer/Renderer.h | 2 +- .../HW/Latte/Renderer/Vulkan/VulkanRenderer.h | 2 +- .../Renderer/Vulkan/VulkanRendererCore.cpp | 20 ++++++------ 11 files changed, 67 insertions(+), 33 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index 0f12356b5..dc6408f9c 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -769,7 +769,7 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 // recalculate index range but filter out primitive restart index LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax); } - g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize); + g_renderer->indexData_uploadIndexMemory(indexBufferIndex, indexBufferOffset, indexOutputSize); // update cache LatteIndexCache.lastPtr = indexData; LatteIndexCache.lastCount = count; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 645973df5..c15080859 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -9,7 +9,7 @@ 
LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer), m_format(format), m_isDepth(isDepth) { MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); - desc->setStorageMode(MTL::StorageModeShared); // TODO: use private? + desc->setStorageMode(m_mtlr->GetOptimalStorageMode()); sint32 effectiveBaseWidth = width; sint32 effectiveBaseHeight = height; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 20467e654..a7e54e1e2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -1,6 +1,7 @@ #pragma once #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLResource.hpp" struct MetalBufferRange { @@ -13,7 +14,7 @@ template class MetalBufferAllocator { public: - MetalBufferAllocator(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + MetalBufferAllocator(class MetalRenderer* metalRenderer, MTL::ResourceOptions storageMode) : m_mtlr{metalRenderer}, m_storageMode{storageMode} {} ~MetalBufferAllocator() { @@ -68,7 +69,7 @@ class MetalBufferAllocator // If no free range was found, allocate a new buffer m_allocationSize = std::max(m_allocationSize, size); - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, MTL::ResourceStorageModeShared); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, m_storageMode); #ifdef CEMU_DEBUG_ASSERT buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); #endif @@ -124,6 +125,7 @@ class MetalBufferAllocator protected: class MetalRenderer* m_mtlr; + MTL::ResourceOptions m_storageMode; size_t m_allocationSize = 8 * 1024 * 1024; @@ -147,7 +149,7 @@ struct MetalSyncedBuffer class MetalTemporaryBufferAllocator : public 
MetalBufferAllocator { public: - MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer) {} + MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer, metalRenderer->GetOptimalResourceStorageMode()) {} void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index c102bcf29..534b9831b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -132,7 +132,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) return; } - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModeShared); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, m_mtlr->GetOptimalResourceStorageMode()); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif @@ -152,6 +152,8 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si } memcpy((uint8*)m_bufferCache->contents() + offset, data, size); + if (!m_mtlr->HasUnifiedMemory()) + m_bufferCache->didModifyRange(NS::Range(offset, size)); // Notify vertex buffer cache about the change m_vertexBufferCache.MemoryRangeChanged(offset, size); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index cc89f5ce3..5abc7c623 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -1,6 +1,7 @@ #pragma once #include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h" +#include "Metal/MTLResource.hpp" struct MetalRestridedBufferRange { @@ -57,7 +58,7 @@ class MetalVertexBufferCache class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, 
m_bufferAllocator(metalRenderer), m_framePersistentBufferAllocator(metalRenderer), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalResourceStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} ~MetalMemoryManager(); // Pipelines diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 46b5cfd01..7c6c6a2de 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -18,6 +18,7 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" +#include "Metal/MTLResource.hpp" #include "gui/guiWrapper.h" #define COMMIT_TRESHOLD 256 @@ -31,6 +32,9 @@ MetalRenderer::MetalRenderer() m_device = MTL::CreateSystemDefaultDevice(); m_commandQueue = m_device->newCommandQueue(); + // Feature support + m_hasUnifiedMemory = m_device->hasUnifiedMemory(); + // Resources MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); #ifdef CEMU_DEBUG_ASSERT @@ -75,7 +79,7 @@ MetalRenderer::MetalRenderer() #endif // Transform feedback - m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize(), MTL::StorageModeShared); + m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize(), MTL::ResourceStorageModePrivate); #ifdef CEMU_DEBUG_ASSERT m_xfbRingBuffer->setLabel(GetLabel("Transform feedback buffer", m_xfbRingBuffer)); #endif @@ -991,9 +995,11 @@ void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, u return allocation.data; } -void MetalRenderer::indexData_uploadIndexMemory(uint32 
offset, uint32 size) +void MetalRenderer::indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) { - // Do nothing, since the buffer has shared storage mode + auto buffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(bufferIndex); + if (!HasUnifiedMemory()) + buffer->didModifyRange(NS::Range(offset, size)); } MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() @@ -1495,18 +1501,21 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE size_t size = shader->uniform.uniformRangeSize; auto supportBuffer = bufferAllocator.GetBufferAllocation(size); memcpy(supportBuffer.data, supportBufferData, size); + auto buffer = bufferAllocator.GetBuffer(supportBuffer.bufferIndex); + if (!HasUnifiedMemory()) + buffer->didModifyRange(NS::Range(supportBuffer.offset, size)); switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: { - renderCommandEncoder->setVertexBuffer(bufferAllocator.GetBuffer(supportBuffer.bufferIndex), supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); + renderCommandEncoder->setVertexBuffer(buffer, supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); //renderCommandEncoder->setVertexBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); break; } case LatteConst::ShaderType::Pixel: { - renderCommandEncoder->setFragmentBuffer(bufferAllocator.GetBuffer(supportBuffer.bufferIndex), supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); + renderCommandEncoder->setFragmentBuffer(buffer, supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); //renderCommandEncoder->setFragmentBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); break; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 8d63f6daa..c3eb9ab7b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -7,6 +7,7 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" 
#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Metal/MTLResource.hpp" struct MetalBufferAllocation { @@ -269,7 +270,7 @@ class MetalRenderer : public Renderer // index void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; - void indexData_uploadIndexMemory(uint32 offset, uint32 size) override; + void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override; // occlusion queries LatteQueryObject* occlusionQuery_create() override { @@ -348,7 +349,22 @@ class MetalRenderer : public Renderer void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); // Getters - MTL::Buffer* GetTextureReadbackBuffer() + bool HasUnifiedMemory() const + { + return m_hasUnifiedMemory; + } + + MTL::StorageMode GetOptimalStorageMode() const + { + return (m_hasUnifiedMemory ? MTL::StorageModeShared : MTL::StorageModeManaged); + } + + MTL::ResourceOptions GetOptimalResourceStorageMode() const + { + return (m_hasUnifiedMemory ? 
MTL::ResourceStorageModeShared : MTL::ResourceStorageModeManaged); + } + + MTL::Buffer* GetTextureReadbackBuffer() const { return m_readbackBuffer; } @@ -357,15 +373,19 @@ class MetalRenderer : public Renderer CA::MetalLayer* m_metalLayer; float m_layerScaleX, m_layerScaleY; + // Metal objects + MTL::Device* m_device; + MTL::CommandQueue* m_commandQueue; + + // Feature support + bool m_hasUnifiedMemory; + + // Managers and caches class MetalMemoryManager* m_memoryManager; class MetalPipelineCache* m_pipelineCache; class MetalDepthStencilCache* m_depthStencilCache; class MetalSamplerCache* m_samplerCache; - // Metal objects - MTL::Device* m_device; - MTL::CommandQueue* m_commandQueue; - // Pipelines MTL::RenderPipelineState* m_presentPipelineLinear; MTL::RenderPipelineState* m_presentPipelineSRGB; diff --git a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h index 313ea3c0a..196403b49 100644 --- a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h @@ -109,7 +109,7 @@ class OpenGLRenderer : public Renderer return nullptr; } - void indexData_uploadIndexMemory(uint32 offset, uint32 size) override + void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override { assert_dbg(); } @@ -243,11 +243,11 @@ class OpenGLRenderer : public Renderer // occlusion queries std::vector list_queryCacheOcclusion; // cache for unused queries - // resource garbage collection + // resource garbage collection struct BufferCacheReleaseQueueEntry { BufferCacheReleaseQueueEntry(VirtualBufferHeap_t* heap, VirtualBufferHeapEntry_t* entry) : m_heap(heap), m_entry(entry) {}; - + void free() { virtualBufferHeap_free(m_heap, m_entry); diff --git a/src/Cafe/HW/Latte/Renderer/Renderer.h b/src/Cafe/HW/Latte/Renderer/Renderer.h index 7bd143d03..1dba52c8d 100644 --- a/src/Cafe/HW/Latte/Renderer/Renderer.h +++ b/src/Cafe/HW/Latte/Renderer/Renderer.h @@ -141,7 +141,7 @@ 
class Renderer // index virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0; - virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0; + virtual void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) = 0; // occlusion queries virtual LatteQueryObject* occlusionQuery_create() = 0; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index e4b4cbf94..2b819e152 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -328,7 +328,7 @@ class VulkanRenderer : public Renderer RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) override; void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; - void indexData_uploadIndexMemory(uint32 offset, uint32 size) override; + void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override; // externally callable void GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, sint32 width, sint32 height, FormatInfoVK* formatInfoOut); diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp index 6500f7d37..d41022ac3 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp @@ -60,7 +60,7 @@ uint64 VulkanRenderer::draw_calculateGraphicsPipelineHash(const LatteFetchShader uint64 stateHash; stateHash = draw_calculateMinimalGraphicsPipelineHash(fetchShader, lcr); stateHash = (stateHash >> 8) + (stateHash * 0x370531ull) % 0x7F980D3BF9B4639Dull; - + uint32* ctxRegister = lcr.GetRawView(); if (vertexShader) @@ -103,7 +103,7 @@ uint64 
VulkanRenderer::draw_calculateGraphicsPipelineHash(const LatteFetchShader } stateHash += renderPassObj->m_hashForPipeline; - + uint32 depthControl = ctxRegister[Latte::REGADDR::DB_DEPTH_CONTROL]; bool stencilTestEnable = depthControl & 1; if (stencilTestEnable) @@ -111,7 +111,7 @@ uint64 VulkanRenderer::draw_calculateGraphicsPipelineHash(const LatteFetchShader stateHash += ctxRegister[mmDB_STENCILREFMASK]; stateHash = std::rotl(stateHash, 17); if(depthControl & (1<<7)) // back stencil enable - { + { stateHash += ctxRegister[mmDB_STENCILREFMASK_BF]; stateHash = std::rotl(stateHash, 13); } @@ -302,7 +302,7 @@ PipelineInfo* VulkanRenderer::draw_createGraphicsPipeline(uint32 indexCount) pipelineCompiler->TrackAsCached(vsBaseHash, pipelineHash); // use heuristics based on parameter patterns to determine if the current drawcall is essential (non-skipable) - bool allowAsyncCompile = false; + bool allowAsyncCompile = false; if (GetConfig().async_compile) allowAsyncCompile = IsAsyncPipelineAllowed(indexCount); @@ -366,7 +366,7 @@ void* VulkanRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, return resv.memPtr; } -void VulkanRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) +void VulkanRenderer::indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) { // does nothing since the index buffer memory is coherent } @@ -701,8 +701,8 @@ VkDescriptorSetInfo* VulkanRenderer::draw_getOrCreateDescriptorSet(PipelineInfo* LatteTexture* baseTexture = textureView->baseTexture; // get texture register word 0 uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; - - auto imageViewObj = textureView->GetSamplerView(word4); + + auto imageViewObj = textureView->GetSamplerView(word4); info.imageView = imageViewObj->m_textureImageView; vkObjDS->addRef(imageViewObj); @@ -772,7 +772,7 @@ VkDescriptorSetInfo* VulkanRenderer::draw_getOrCreateDescriptorSet(PipelineInfo* VK_SAMPLER_ADDRESS_MODE_REPEAT, // WRAP 
VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, // MIRROR VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, // CLAMP_LAST_TEXEL - VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, // MIRROR_ONCE_LAST_TEXEL + VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, // MIRROR_ONCE_LAST_TEXEL VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, // unsupported HALF_BORDER VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, // unsupported MIRROR_ONCE_HALF_BORDER VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, // CLAMP_BORDER @@ -900,7 +900,7 @@ VkDescriptorSetInfo* VulkanRenderer::draw_getOrCreateDescriptorSet(PipelineInfo* uniformVarsBufferInfo.buffer = m_uniformVarBuffer; uniformVarsBufferInfo.offset = 0; // fixed offset is always zero since we only use dynamic offsets uniformVarsBufferInfo.range = shader->uniform.uniformRangeSize; - + VkWriteDescriptorSet write_descriptor{}; write_descriptor.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; write_descriptor.dstSet = result; @@ -1211,7 +1211,7 @@ void VulkanRenderer::draw_setRenderPass() draw_endRenderPass(); if (m_state.descriptorSetsChanged) sync_inputTexturesChanged(); - + // assume that FBO changed, update self-dependency state m_state.hasRenderSelfDependency = fboVk->CheckForCollision(m_state.activeVertexDS, m_state.activeGeometryDS, m_state.activePixelDS); From be0a69a5e268644a06faf58b3076f04cb4ae80ae Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 18 Aug 2024 14:49:25 +0200 Subject: [PATCH 085/368] do texture load on GPU if no unified memory --- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 3 ++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 22 +++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index c15080859..9bc5b63bf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -3,13 +3,14 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include 
"Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Common/precompiled.h" +#include "Metal/MTLResource.hpp" LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer), m_format(format), m_isDepth(isDepth) { MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); - desc->setStorageMode(m_mtlr->GetOptimalStorageMode()); + desc->setStorageMode(m_mtlr->HasUnifiedMemory() ? MTL::StorageModeShared : MTL::StorageModePrivate); sint32 effectiveBaseWidth = width; sint32 effectiveBaseHeight = height; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 7c6c6a2de..21ee47ebf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -378,6 +378,7 @@ void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIn } } +// TODO: do a GPU blit even on unified memory? 
That would mean we could use private storage mode for all textures void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { auto textureMtl = (LatteTextureMtl*)hostTexture; @@ -390,9 +391,26 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s } size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->GetFormat(), textureMtl->IsDepth(), width); - // No need to calculate bytesPerImage for 3D textures, since we always load just one slice + // No need to set bytesPerImage for 3D textures, since we always load just one slice //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->IsDepth(), height, bytesPerRow); - textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); + if (HasUnifiedMemory()) + { + textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); + } + else + { + auto blitCommandEncoder = GetBlitCommandEncoder(); + + // Allocate a temporary buffer + auto allocation = m_memoryManager->GetTemporaryBufferAllocator().GetBufferAllocation(compressedImageSize); + auto buffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(allocation.bufferIndex); + + // Copy the data to the temporary buffer + memcpy(allocation.data, pixelData, compressedImageSize); + + // Copy the data from the temporary buffer to the texture + blitCommandEncoder->copyFromBuffer(buffer, allocation.offset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); + } } void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) From 9bb7479d169aab19b528d87ee696bcf691db3816 Mon Sep 17 00:00:00 
2001 From: Samuliak Date: Sun, 18 Aug 2024 17:40:41 +0200 Subject: [PATCH 086/368] use more efficient cpu cache mode when possible --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 1 + src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h | 2 +- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 9bc5b63bf..c1e7149e0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -11,6 +11,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM { MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); desc->setStorageMode(m_mtlr->HasUnifiedMemory() ? MTL::StorageModeShared : MTL::StorageModePrivate); + desc->setCpuCacheMode(MTL::CPUCacheModeWriteCombined); sint32 effectiveBaseWidth = width; sint32 effectiveBaseHeight = height; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index a7e54e1e2..15f0e7cfc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -69,7 +69,7 @@ class MetalBufferAllocator // If no free range was found, allocate a new buffer m_allocationSize = std::max(m_allocationSize, size); - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, m_storageMode); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, m_storageMode | MTL::ResourceCPUCacheModeWriteCombined); #ifdef CEMU_DEBUG_ASSERT buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); #endif diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 
534b9831b..8986805d4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -132,7 +132,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) return; } - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, m_mtlr->GetOptimalResourceStorageMode()); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, m_mtlr->GetOptimalResourceStorageMode() | MTL::ResourceCPUCacheModeWriteCombined); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 21ee47ebf..1536c88be 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -73,7 +73,7 @@ MetalRenderer::MetalRenderer() m_samplerCache = new MetalSamplerCache(this); // Texture readback - m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::StorageModeShared); + m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::ResourceStorageModeShared); #ifdef CEMU_DEBUG_ASSERT m_readbackBuffer->setLabel(GetLabel("Texture readback buffer", m_readbackBuffer)); #endif From 9aa72e62953015ce4410d8f6a74763a20c9ee2f4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 18 Aug 2024 20:03:37 +0200 Subject: [PATCH 087/368] report vram usage --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 1536c88be..76139e63e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -215,12 +215,10 @@ bool MetalRenderer::IsPadWindowActive() bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { - debug_printf("MetalRenderer::GetVRAMInfo not implemented\n"); + 
usageInMB = m_device->currentAllocatedSize(); + totalInMB = usageInMB; - usageInMB = 1024; - totalInMB = 1024; - - return false; + return true; } void MetalRenderer::ClearColorbuffer(bool padView) From e2f66b8aa3bec12df4571df088a636cbd1996102 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 19 Aug 2024 13:07:30 +0200 Subject: [PATCH 088/368] fix: streamout --- .../LatteDecompilerEmitMSL.cpp | 4 ---- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 24 ++++++++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 12 ++++++++++ 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index a46da96c4..95e91d40d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3284,8 +3284,6 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La continue; uint32 u32Offset = streamWrite->exportArrayBase + i; - // HACK: disable streamout temporarily, since it causes GPU hangs - continue; src->addFmt("sb[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); src->add(" = "); @@ -3399,8 +3397,6 @@ static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, La continue; uint32 u32Offset = cfInstruction->exportArrayBase + i; - // HACK: disable streamout temporarily, since it causes GPU hangs - continue; src->addFmt("sb[sbBase{} + {}]", streamoutBufferIndex, u32Offset); src->add(" = "); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 76139e63e..af092c155 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -634,7 +634,9 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void 
MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { - debug_printf("MetalRenderer::bufferCache_copyStreamoutToMainBuffer not implemented\n"); + auto blitCommandEncoder = GetBlitCommandEncoder(); + + blitCommandEncoder->copyFromBuffer(m_xfbRingBuffer, srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) @@ -669,17 +671,18 @@ RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, ui void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) { - debug_printf("MetalRenderer::streamout_setupXfbBuffer not implemented\n"); + m_state.m_streamoutState.buffers[bufferIndex].enabled = true; + m_state.m_streamoutState.buffers[bufferIndex].ringBufferOffset = ringBufferOffset; } void MetalRenderer::streamout_begin() { - debug_printf("MetalRenderer::streamout_begin not implemented\n"); + // Do nothing } void MetalRenderer::streamout_rendererFinishDrawcall() { - debug_printf("MetalRenderer::streamout_rendererFinishDrawcall not implemented\n"); + // Do nothing } void MetalRenderer::draw_beginSequence() @@ -966,6 +969,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 encoderState.m_renderPipelineState = renderPipelineState; } + // Prepare streamout + m_state.m_streamoutState.verticesPerInstance = count; + LatteStreamout_PrepareDrawcall(count, instanceCount); + // Uniform buffers, textures and samplers BindStageResources(renderCommandEncoder, vertexShader); BindStageResources(renderCommandEncoder, pixelShader); @@ -981,6 +988,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); } + LatteStreamout_FinishDrawcall(false); + LatteGPUState.drawCallCounter++; } @@ -1498,20 
+1507,17 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { LatteMRT::GetCurrentFragCoordScale(GET_UNIFORM_DATA_PTR(shader->uniform.loc_fragCoordScale)); } - // TODO: uncomment - /* if (shader->uniform.loc_verticesPerInstance >= 0) { - *(int*)(supportBufferData + ((size_t)shader->uniform.loc_verticesPerInstance / 4)) = m_streamoutState.verticesPerInstance; + *(int*)(supportBufferData + ((size_t)shader->uniform.loc_verticesPerInstance / 4)) = m_state.m_streamoutState.verticesPerInstance; for (sint32 b = 0; b < LATTE_NUM_STREAMOUT_BUFFER; b++) { if (shader->uniform.loc_streamoutBufferBase[b] >= 0) { - *(uint32*)GET_UNIFORM_DATA_PTR(shader->uniform.loc_streamoutBufferBase[b]) = m_streamoutState.buffer[b].ringBufferOffset; + *(uint32*)GET_UNIFORM_DATA_PTR(shader->uniform.loc_streamoutBufferBase[b]) = m_state.m_streamoutState.buffers[b].ringBufferOffset; } } } - */ auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); size_t size = shader->uniform.uniformRangeSize; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index c3eb9ab7b..e18e619a7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -80,6 +80,16 @@ struct MetalEncoderState size_t m_uniformBufferOffsets[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; }; +struct MetalStreamoutState +{ + struct + { + bool enabled; + uint32 ringBufferOffset; + } buffers[LATTE_NUM_STREAMOUT_BUFFER]; + sint32 verticesPerInstance; +}; + struct MetalState { MetalEncoderState m_encoderState{}; @@ -99,6 +109,8 @@ struct MetalState MTL::Viewport m_viewport; MTL::ScissorRect m_scissor; + + MetalStreamoutState m_streamoutState; }; struct MetalCommandBuffer From b59dbfc9c37c73ac1c44f8211a12c9a9e6b28dbe Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 19 Aug 2024 16:02:36 +0200 Subject: [PATCH 089/368] hack: increase xfb ring buffer size --- 
.../LatteDecompilerEmitMSLHeader.hpp | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 22 +++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 0b23fecdc..8527adde9 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -386,7 +386,7 @@ namespace LatteDecompiler // streamout buffer (transform feedback) if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite) { - src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.getTFStorageBufferBindingPoint()); + src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.tfStorageBindingPoint); } break; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index af092c155..f69057f5a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -18,7 +18,7 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" -#include "Metal/MTLResource.hpp" +#include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "gui/guiWrapper.h" #define COMMIT_TRESHOLD 256 @@ -79,7 +79,7 @@ MetalRenderer::MetalRenderer() #endif // Transform feedback - m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize(), MTL::ResourceStorageModePrivate); + m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize() * 32, MTL::ResourceStorageModePrivate); #ifdef CEMU_DEBUG_ASSERT m_xfbRingBuffer->setLabel(GetLabel("Transform feedback buffer", m_xfbRingBuffer)); #endif @@ -761,12 +761,19 @@ void 
MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Shaders LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); + LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); if (!vertexShader || !static_cast(vertexShader->shader)->GetFunction()) { debug_printf("no vertex function, skipping draw\n"); return; } + // TODO: remove this? + if (geometryShader) + { + debug_printf("geometry shader aren't supported on Metal yet, skipping draw\n"); + return; + } const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Depth stencil state @@ -1590,21 +1597,22 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE // Storage buffer if (shader->resourceMapping.tfStorageBindingPoint >= 0) { - switch (shader->shaderType) + switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: { - renderCommandEncoder->setVertexBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); - break; + renderCommandEncoder->setVertexBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); + break; } case LatteConst::ShaderType::Pixel: { renderCommandEncoder->setFragmentBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); - break; + break; } default: - UNREACHABLE; + UNREACHABLE; } + m_state.m_encoderState.m_uniformBufferOffsets[mtlShaderType][shader->resourceMapping.tfStorageBindingPoint] = INVALID_OFFSET; } } From 69a36246fba33bcdc750d909eea5aeffafe8eec9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 19 Aug 2024 16:14:06 +0200 Subject: [PATCH 090/368] do GPU texture loading on non-apple GPUs --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalBufferAllocator.h | 2 +- .../HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalMemoryManager.h | 2 +- 
src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 +++- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 12 +++++++++--- 6 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index c1e7149e0..eeeee0ba8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -10,7 +10,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer), m_format(format), m_isDepth(isDepth) { MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); - desc->setStorageMode(m_mtlr->HasUnifiedMemory() ? MTL::StorageModeShared : MTL::StorageModePrivate); + desc->setStorageMode(m_mtlr->GetOptimalTextureStorageMode()); desc->setCpuCacheMode(MTL::CPUCacheModeWriteCombined); sint32 effectiveBaseWidth = width; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 15f0e7cfc..9853ae7fd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -149,7 +149,7 @@ struct MetalSyncedBuffer class MetalTemporaryBufferAllocator : public MetalBufferAllocator { public: - MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer, metalRenderer->GetOptimalResourceStorageMode()) {} + MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer, metalRenderer->GetOptimalBufferStorageMode()) {} void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 8986805d4..93c6ec860 
100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -132,7 +132,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) return; } - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, m_mtlr->GetOptimalResourceStorageMode() | MTL::ResourceCPUCacheModeWriteCombined); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, m_mtlr->GetOptimalBufferStorageMode() | MTL::ResourceCPUCacheModeWriteCombined); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 5abc7c623..62254b213 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -58,7 +58,7 @@ class MetalVertexBufferCache class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalResourceStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} ~MetalMemoryManager(); // Pipelines diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f69057f5a..fc4f98362 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -19,6 +19,7 @@ #include 
"Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Metal/MTLDevice.hpp" #include "gui/guiWrapper.h" #define COMMIT_TRESHOLD 256 @@ -34,6 +35,7 @@ MetalRenderer::MetalRenderer() // Feature support m_hasUnifiedMemory = m_device->hasUnifiedMemory(); + m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); // Resources MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); @@ -391,7 +393,7 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->GetFormat(), textureMtl->IsDepth(), width); // No need to set bytesPerImage for 3D textures, since we always load just one slice //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->IsDepth(), height, bytesPerRow); - if (HasUnifiedMemory()) + if (IsAppleGPU()) { textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index e18e619a7..49d4af83b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -366,12 +366,17 @@ class MetalRenderer : public Renderer return m_hasUnifiedMemory; } - MTL::StorageMode GetOptimalStorageMode() const + bool IsAppleGPU() const { - return (m_hasUnifiedMemory ? MTL::StorageModeShared : MTL::StorageModeManaged); + return m_isAppleGPU; } - MTL::ResourceOptions GetOptimalResourceStorageMode() const + MTL::StorageMode GetOptimalTextureStorageMode() const + { + return (m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModePrivate); + } + + MTL::ResourceOptions GetOptimalBufferStorageMode() const { return (m_hasUnifiedMemory ? 
MTL::ResourceStorageModeShared : MTL::ResourceStorageModeManaged); } @@ -391,6 +396,7 @@ class MetalRenderer : public Renderer // Feature support bool m_hasUnifiedMemory; + bool m_isAppleGPU; // Managers and caches class MetalMemoryManager* m_memoryManager; From ca256eb764d8c08d5c54029821aedffe8dd1465c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 19 Aug 2024 16:40:36 +0200 Subject: [PATCH 091/368] check for pixel format support --- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 4 +-- .../Renderer/Metal/LatteTextureViewMtl.cpp | 4 +-- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 33 ++++++++++++++++++- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 8 ++--- .../HW/Latte/Renderer/Metal/MetalCommon.h | 19 ++++++++++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 1 + .../HW/Latte/Renderer/Metal/MetalRenderer.h | 6 ++++ 7 files changed, 64 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index eeeee0ba8..c588a21e4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -73,8 +73,8 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM desc->setArrayLength(effectiveBaseDepth); } - auto formatInfo = GetMtlPixelFormatInfo(format, isDepth); - desc->setPixelFormat(formatInfo.pixelFormat); + auto pixelFormat = GetMtlPixelFormat(format, isDepth, m_mtlr->GetPixelFormatSupport()); + desc->setPixelFormat(pixelFormat); // HACK: even though the textures are never written to from a shader, we still need to use `ShaderWrite` usage to prevent pink lines over the screen MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsageShaderWrite; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index adb77643f..0607370bd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -127,8 +127,8 @@ MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) swizzle.blue = GetMtlTextureSwizzle(compSelB); swizzle.alpha = GetMtlTextureSwizzle(compSelA); - auto formatInfo = GetMtlPixelFormatInfo(format, m_baseTexture->IsDepth()); - MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(formatInfo.pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount), swizzle); + auto pixelFormat = GetMtlPixelFormat(format, m_baseTexture->IsDepth(), m_mtlr->GetPixelFormatSupport()); + MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount), swizzle); return texture; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 704cf883a..771aa0599 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Common/precompiled.h" #include "Metal/MTLDepthStencil.hpp" +#include "Metal/MTLPixelFormat.hpp" #include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLRenderPipeline.hpp" #include "Metal/MTLSampler.hpp" @@ -74,7 +75,6 @@ std::map MTL_COLOR_FORMAT_TABLE = { }; std::map MTL_DEPTH_FORMAT_TABLE = { - // TODO: one of these 2 formats is not supported on Apple silicon {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}}, {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}}, {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 5, {1, 1}, true}}, @@ -105,6 +105,37 @@ const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, boo 
return formatInfo; } +MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth, const MetalPixelFormatSupport& pixelFormatSupport) +{ + auto pixelFormat = GetMtlPixelFormatInfo(format, isDepth).pixelFormat; + + if (!pixelFormatSupport.m_supportsR8Unorm_sRGB && pixelFormat == MTL::PixelFormatR8Unorm_sRGB) + pixelFormat = MTL::PixelFormatRGBA8Unorm_sRGB; + + if (!pixelFormatSupport.m_supportsRG8Unorm_sRGB && pixelFormat == MTL::PixelFormatRG8Unorm_sRGB) + pixelFormat = MTL::PixelFormatRGBA8Unorm_sRGB; + + if (!pixelFormatSupport.m_supportsPacked16BitFormats) + { + switch (pixelFormat) + { + case MTL::PixelFormatB5G6R5Unorm: + case MTL::PixelFormatA1BGR5Unorm: + case MTL::PixelFormatABGR4Unorm: + case MTL::PixelFormatBGR5A1Unorm: + pixelFormat = MTL::PixelFormatRGBA8Unorm; + break; + default: + break; + } + } + + if (!pixelFormatSupport.m_supportsDepth24Unorm_Stencil8 && pixelFormat == MTL::PixelFormatDepth24Unorm_Stencil8) + pixelFormat = MTL::PixelFormatDepth32Float_Stencil8; + + return pixelFormat; +} + inline uint32 CeilDivide(uint32 a, uint32 b) { return (a + b - 1) / b; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 5fcd2d229..cc0c5e02d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -1,15 +1,11 @@ #pragma once -#include +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/ISA/LatteReg.h" #include "Cafe/HW/Latte/Core/LatteConst.h" //#include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" -#include "Metal/MTLDepthStencil.hpp" -#include "Metal/MTLRenderPipeline.hpp" -#include "Metal/MTLSampler.hpp" -#include "Metal/MTLTexture.hpp" struct Uvec2 { uint32 x; @@ -34,6 +30,8 @@ struct MetalPixelFormatInfo { const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth); +MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, 
bool isDepth, const MetalPixelFormatSupport& pixelFormatSupport); + size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 width); size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint32 height, size_t bytesPerRow); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 19ac7ab79..926af5f43 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -1,8 +1,25 @@ #pragma once -#include "Foundation/NSString.hpp" +#include #include +struct MetalPixelFormatSupport +{ + bool m_supportsR8Unorm_sRGB; + bool m_supportsRG8Unorm_sRGB; + bool m_supportsPacked16BitFormats; + bool m_supportsDepth24Unorm_Stencil8; + + MetalPixelFormatSupport() = default; + MetalPixelFormatSupport(MTL::Device* device) + { + m_supportsR8Unorm_sRGB = device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsRG8Unorm_sRGB = device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsPacked16BitFormats = device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsDepth24Unorm_Stencil8 = device->supportsFamily(MTL::GPUFamilyMac2); + } +}; + #define MAX_MTL_BUFFERS 31 // Buffer index 30 is reserved for the support buffer, buffer indices 27-29 are reserved for the helper shaders #define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 5) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index fc4f98362..f5a11118a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -36,6 +36,7 @@ MetalRenderer::MetalRenderer() // Feature support m_hasUnifiedMemory = m_device->hasUnifiedMemory(); m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); + m_pixelFormatSupport = MetalPixelFormatSupport(m_device); // Resources MTL::SamplerDescriptor* samplerDescriptor = 
MTL::SamplerDescriptor::alloc()->init(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 49d4af83b..4ea13fb2d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -371,6 +371,11 @@ class MetalRenderer : public Renderer return m_isAppleGPU; } + const MetalPixelFormatSupport& GetPixelFormatSupport() const + { + return m_pixelFormatSupport; + } + MTL::StorageMode GetOptimalTextureStorageMode() const { return (m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModePrivate); @@ -397,6 +402,7 @@ class MetalRenderer : public Renderer // Feature support bool m_hasUnifiedMemory; bool m_isAppleGPU; + MetalPixelFormatSupport m_pixelFormatSupport; // Managers and caches class MetalMemoryManager* m_memoryManager; From 6b1360415bbc1115a180338c453a48d16aaf8d27 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 20 Aug 2024 08:22:24 +0200 Subject: [PATCH 092/368] emit object shaders --- .../LatteDecompilerEmitMSL.cpp | 50 ++++++++++++--- .../LatteDecompilerEmitMSLHeader.hpp | 62 +++++++++++++------ 2 files changed, 83 insertions(+), 29 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 95e91d40d..5bf3c1e54 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2822,14 +2822,14 @@ static void _emitGSReadInputVFetchCode(LatteDecompilerShaderContext* shaderConte src->add(" = "); _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); - src->add("(v2g["); + src->add("(objectPayload["); if (texInstruction->textureFetch.srcSel[0] >= 4) cemu_assert_unimplemented(); if (texInstruction->textureFetch.srcSel[1] >= 4) cemu_assert_unimplemented(); // todo: Index type 
src->add("0"); - src->addFmt("].passV2GParameter{}.", texInstruction->textureFetch.offset/16); + src->addFmt("].passParameterSem{}.", texInstruction->textureFetch.offset/16); for(sint32 f=0; f<4; f++) @@ -3316,7 +3316,7 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La cemu_assert_unimplemented(); for (sint32 burstIndex = 0; burstIndex < (sint32)(cfInstruction->exportBurstCount + 1); burstIndex++) { - src->addFmt("v2g.passV2GParameter{}.", (cfInstruction->exportArrayBase) / 4 + burstIndex); + src->addFmt("out.passParameterSem{}.", (cfInstruction->exportArrayBase) / 4 + burstIndex); _emitXYZWByMask(src, cfInstruction->memWriteCompMask); src->addFmt(" = "); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_SIGNED_INT, burstIndex); @@ -3842,8 +3842,20 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: - functionType = "vertex"; - outputTypeName = "VertexOut"; + if (shaderContext->options->usesGeometryShader) + { + // Defined just-in-time + // Will also modify vid in case of an indexed draw + src->add("ObjectIn fetchInput(VERTEX_BUFFER_DEFINITIONS, thread uint& vid);" _CRLF); + + functionType = "[[object, max_total_threads_per_threadgroup(MAX_THREADS_PER_THREADGROUP), max_total_threadgroups_per_mesh_grid(1)]]"; + outputTypeName = "void"; + } + else + { + functionType = "vertex"; + outputTypeName = "VertexOut"; + } break; case LatteConst::ShaderType::Pixel: functionType = "fragment"; @@ -3854,7 +3866,21 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->addFmt("{} {} main0(", functionType, outputTypeName); LatteDecompiler::emitInputs(shaderContext); src->add(") {" _CRLF); - src->addFmt("{} out;" _CRLF, outputTypeName); + if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader) + { + // Calculate the imaginary vertex id + 
src->add("uint vid = tig * PRIMITIVE_VERTEX_COUNT + tid;" _CRLF); + // TODO: don't hardcode the instance index + src->add("uint iid = 0;" _CRLF); + // Fetch the input + src->add("ObjectIn in = fetchInput(VERTEX_BUFFERS, vid);" _CRLF); + // Output is defined as object payload + src->add("object_payload ObjectPayload& out = objectPayload[tid];" _CRLF); + } + else + { + src->addFmt("{} out;" _CRLF, outputTypeName); + } // variable definition if (shaderContext->typeTracker.useArrayGPRs == false) { @@ -4094,13 +4120,17 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader == false) src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); } - // HACK: this should be handled outside of the shader, because clipping currently wouldn't work (or would it?) + // TODO: this should be handled outside of the shader, because clipping currently wouldn't work (or would it?) 
if (shader->shaderType == LatteConst::ShaderType::Vertex) - { - // TODO: check this - // MoltenVK does this src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); + + if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader) + { + src->add("if (tid == 0) {" _CRLF); + src->add("meshGridProperties.set_threadgroups_per_grid(uint3(1, 1, 1));" _CRLF); + src->add("}" _CRLF); } + // return src->add("return out;" _CRLF); // end of shader main diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 8527adde9..4a36cf650 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -158,7 +158,10 @@ namespace LatteDecompiler if (decompilerContext->shader->shaderType == LatteConst::ShaderType::Vertex) { - src->add("struct VertexIn {" _CRLF); + if (decompilerContext->options->usesGeometryShader) + src->add("struct ObjectIn {" _CRLF); + else + src->add("struct VertexIn {" _CRLF); // attribute inputs for (uint32 i = 0; i < LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS; i++) { @@ -166,7 +169,10 @@ namespace LatteDecompiler { cemu_assert_debug(decompilerContext->output->resourceMappingVK.attributeMapping[i] >= 0); - src->addFmt("uint4 attrDataSem{} [[attribute({})]];" _CRLF, i, (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i]); + src->addFmt("uint4 attrDataSem{}", i); + if (!decompilerContext->options->usesGeometryShader) + src->addFmt(" [[attribute({})]]", (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i]); + src->add(";" _CRLF); } } src->add("};" _CRLF _CRLF); @@ -177,11 +183,14 @@ namespace LatteDecompiler { auto* src = shaderContext->shaderSource; - src->add("struct VertexOut {" _CRLF); + if (shaderContext->options->usesGeometryShader) + src->add("struct 
ObjectPayload {" _CRLF); + else + src->add("struct VertexOut {" _CRLF); src->add("float4 position [[position]];" _CRLF); if (shaderContext->analyzer.outputPointSize) - src->add("float pointSize[[point_size]];" _CRLF); + src->add("float pointSize [[point_size]];" _CRLF); LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); auto parameterMask = shaderContext->shader->outputParameterMask; @@ -206,11 +215,14 @@ namespace LatteDecompiler continue; // no ps input src->addFmt("float4 passParameterSem{}", psInputTable->import[psInputIndex].semanticId); - src->addFmt(" [[user(locn{})]]", psInputIndex); - if (psInputTable->import[psInputIndex].isFlat) - src->add(" [[flat]]"); - if (psInputTable->import[psInputIndex].isNoPerspective) - src->add(" [[center_no_perspective]]"); + if (!shaderContext->options->usesGeometryShader) + { + src->addFmt(" [[user(locn{})]]", psInputIndex); + if (psInputTable->import[psInputIndex].isFlat) + src->add(" [[flat]]"); + if (psInputTable->import[psInputIndex].isNoPerspective) + src->add(" [[center_no_perspective]]"); + } src->addFmt(";" _CRLF); } @@ -369,26 +381,38 @@ namespace LatteDecompiler switch (decompilerContext->shaderType) { case LatteConst::ShaderType::Vertex: - src->add("VertexIn"); + if (!decompilerContext->options->usesGeometryShader) + src->add("VertexIn in [[stage_in]], "); break; case LatteConst::ShaderType::Pixel: - src->add("FragmentIn"); + src->add("FragmentIn in [[stage_in]], "); + break; + default: break; } - src->add(" in [[stage_in]], constant SupportBuffer& supportBuffer [[buffer(30)]]"); + src->add("constant SupportBuffer& supportBuffer [[buffer(30)]]"); switch (decompilerContext->shaderType) { case LatteConst::ShaderType::Vertex: - src->add(", uint vid [[vertex_id]]"); - src->add(", uint iid [[instance_id]]"); - - // streamout buffer (transform feedback) - if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite) + if 
(decompilerContext->options->usesGeometryShader) { - src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.tfStorageBindingPoint); + src->add(", object_data ObjectPayload* objectPayload [[payload]]"); + src->add(", mesh_grid_properties meshGridProperties"); + src->add(", uint tig [[threadgroup_position_in_grid]]"); + src->add(", uint tid [[thread_index_in_threadgroup]]"); + } + else + { + src->add(", uint vid [[vertex_id]]"); + src->add(", uint iid [[instance_id]]"); + + // streamout buffer (transform feedback) + if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite) + { + src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.tfStorageBindingPoint); + } } - break; case LatteConst::ShaderType::Pixel: src->add(", bool frontFacing [[front_facing]]"); From b10bcd422e946fdb2cf708cc877e94984202bccc Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 20 Aug 2024 09:20:22 +0200 Subject: [PATCH 093/368] emit mesh shaders --- .../LatteDecompilerEmitMSL.cpp | 45 +++++--- .../LatteDecompilerEmitMSLHeader.hpp | 101 +++++++++++++----- 2 files changed, 104 insertions(+), 42 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 5bf3c1e54..2061790a7 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3355,7 +3355,7 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La } else if (parameterExportType == 2 && parameterExportBase < 16) { - src->addFmt("passG2PParameter{}.", parameterExportBase); + src->addFmt("out.passParameterSem{}.", parameterExportBase); _emitXYZWByMask(src, cfInstruction->memWriteCompMask); src->addFmt(" = "); _emitExportGPRReadCode(shaderContext, cfInstruction, 
LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); @@ -3587,9 +3587,10 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); // write point size if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) - src->add("gl_PointSize = supportBuffer.pointSize;" _CRLF); + src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); // emit vertex - src->add("EmitVertex();" _CRLF); + src->add("mesh.set_vertex(out);" _CRLF); + src->add("mesh.set_index(tid, tid);" _CRLF); // increment transform feedback pointer for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) { @@ -3846,7 +3847,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, { // Defined just-in-time // Will also modify vid in case of an indexed draw - src->add("ObjectIn fetchInput(VERTEX_BUFFER_DEFINITIONS, thread uint& vid);" _CRLF); + src->add("VertexIn fetchInput(VERTEX_BUFFER_DEFINITIONS, thread uint& vid);" _CRLF); functionType = "[[object, max_total_threads_per_threadgroup(MAX_THREADS_PER_THREADGROUP), max_total_threadgroups_per_mesh_grid(1)]]"; outputTypeName = "void"; @@ -3857,6 +3858,10 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, outputTypeName = "VertexOut"; } break; + case LatteConst::ShaderType::Geometry: + functionType = "[[mesh, max_total_threads_per_threadgroup(MAX_THREADS_PER_THREADGROUP)]]"; + outputTypeName = "void"; + break; case LatteConst::ShaderType::Pixel: functionType = "fragment"; outputTypeName = "FragmentOut"; @@ -3866,16 +3871,23 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->addFmt("{} {} main0(", functionType, outputTypeName); LatteDecompiler::emitInputs(shaderContext); src->add(") {" _CRLF); - if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader) + 
if (shaderContext->options->usesGeometryShader) { - // Calculate the imaginary vertex id - src->add("uint vid = tig * PRIMITIVE_VERTEX_COUNT + tid;" _CRLF); - // TODO: don't hardcode the instance index - src->add("uint iid = 0;" _CRLF); - // Fetch the input - src->add("ObjectIn in = fetchInput(VERTEX_BUFFERS, vid);" _CRLF); - // Output is defined as object payload - src->add("object_payload ObjectPayload& out = objectPayload[tid];" _CRLF); + if (shader->shaderType == LatteConst::ShaderType::Vertex) + { + // Calculate the imaginary vertex id + src->add("uint vid = tig * PRIMITIVE_VERTEX_COUNT + tid;" _CRLF); + // TODO: don't hardcode the instance index + src->add("uint iid = 0;" _CRLF); + // Fetch the input + src->add("VertexIn in = fetchInput(VERTEX_BUFFERS, vid);" _CRLF); + // Output is defined as object payload + src->add("object_data VertexOut& out = objectPayload.vertexOut[tid];" _CRLF); + } + else + { + src->add("GeometryOut out;" _CRLF); + } } else { @@ -4077,9 +4089,9 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, { // import from geometry shader if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = asy_type(passG2PParameter{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); + src->addFmt("{} = asy_type(passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = passG2PParameter{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); + src->addFmt("{} = passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); else cemu_assert_unimplemented(); } @@ -4132,7 +4144,8 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } // return - src->add("return out;" _CRLF); + if 
(!shaderContext->options->usesGeometryShader) + src->add("return out;" _CRLF); // end of shader main src->add("}" _CRLF); src->shrink_to_fit(); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 4a36cf650..71d83710f 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -1,5 +1,6 @@ #pragma once +#include "HW/Latte/Core/LatteConst.h" namespace LatteDecompiler { static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext) @@ -158,10 +159,7 @@ namespace LatteDecompiler if (decompilerContext->shader->shaderType == LatteConst::ShaderType::Vertex) { - if (decompilerContext->options->usesGeometryShader) - src->add("struct ObjectIn {" _CRLF); - else - src->add("struct VertexIn {" _CRLF); + src->add("struct VertexIn {" _CRLF); // attribute inputs for (uint32 i = 0; i < LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS; i++) { @@ -183,11 +181,7 @@ namespace LatteDecompiler { auto* src = shaderContext->shaderSource; - if (shaderContext->options->usesGeometryShader) - src->add("struct ObjectPayload {" _CRLF); - else - src->add("struct VertexOut {" _CRLF); - + src->add("struct VertexOut {" _CRLF); src->add("float4 position [[position]];" _CRLF); if (shaderContext->analyzer.outputPointSize) src->add("float pointSize [[point_size]];" _CRLF); @@ -215,14 +209,11 @@ namespace LatteDecompiler continue; // no ps input src->addFmt("float4 passParameterSem{}", psInputTable->import[psInputIndex].semanticId); - if (!shaderContext->options->usesGeometryShader) - { - src->addFmt(" [[user(locn{})]]", psInputIndex); - if (psInputTable->import[psInputIndex].isFlat) - src->add(" [[flat]]"); - if (psInputTable->import[psInputIndex].isNoPerspective) - src->add(" [[center_no_perspective]]"); - } + src->addFmt(" [[user(locn{})]]", psInputIndex); + if 
(psInputTable->import[psInputIndex].isFlat) + src->add(" [[flat]]"); + if (psInputTable->import[psInputIndex].isNoPerspective) + src->add(" [[center_no_perspective]]"); src->addFmt(";" _CRLF); } @@ -262,11 +253,10 @@ namespace LatteDecompiler if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) { _emitAttributes(decompilerContext); - _emitVSOutputs(decompilerContext); } else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) { - _emitPSInputs(decompilerContext); + _emitPSInputs(decompilerContext); src->add("struct FragmentOut {" _CRLF); @@ -289,6 +279,57 @@ namespace LatteDecompiler src->add("};" _CRLF _CRLF); } + + if (!decompilerContext->options->usesGeometryShader) + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + _emitVSOutputs(decompilerContext); + } + else + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + src->add("struct VertexOut {" _CRLF); + src->add("float4 position [[position]];" _CRLF); + uint32 ringParameterCountVS2GS = 0; + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCount; + } + else + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCountFromPrevStage; + } + for (uint32 f = 0; f < ringParameterCountVS2GS; f++) + src->addFmt("int4 passParameterSem{};" _CRLF, f); + src->add("};" _CRLF _CRLF); + src->add("struct ObjectPayload {" _CRLF); + src->add("VertexOut vertexOut[PRIMITIVE_VERTEX_COUNT];" _CRLF); + src->add("};" _CRLF _CRLF); + } + if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + // parameters shared between geometry and pixel shader + uint32 ringItemSize = decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF; + if ((ringItemSize & 0xF) != 0) + debugBreakpoint(); + if (((decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] 
& 0x7FFF) & 0xF) != 0) + debugBreakpoint(); + + src->add("struct GeometryOut {" _CRLF); + src->add("float4 position [[position]];" _CRLF); + for (sint32 p = 0; p < decompilerContext->parsedGSCopyShader->numParam; p++) + { + if (decompilerContext->parsedGSCopyShader->paramMapping[p].exportType != 2) + continue; + src->addFmt("float4 passParameterSem{} [[user(locn)]];" _CRLF, (sint32)decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam, decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam & 0x7F); + } + src->add("};" _CRLF _CRLF); + + // Define the mesh shader output type + src->add("using MeshType = mesh;" _CRLF); + } + } } static void emitHeader(LatteDecompilerShaderContext* decompilerContext) @@ -397,27 +438,35 @@ namespace LatteDecompiler case LatteConst::ShaderType::Vertex: if (decompilerContext->options->usesGeometryShader) { - src->add(", object_data ObjectPayload* objectPayload [[payload]]"); + src->add(", object_data ObjectPayload& objectPayload [[payload]]"); src->add(", mesh_grid_properties meshGridProperties"); src->add(", uint tig [[threadgroup_position_in_grid]]"); src->add(", uint tid [[thread_index_in_threadgroup]]"); + src->add(", VERTEX_BUFFER_DEFINITIONS"); } else { src->add(", uint vid [[vertex_id]]"); src->add(", uint iid [[instance_id]]"); - - // streamout buffer (transform feedback) - if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite) - { - src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.tfStorageBindingPoint); - } } break; + case LatteConst::ShaderType::Geometry: + src->add(", MeshType mesh"); + src->add(", const object_data ObjectPayload& objectPayload [[payload]]"); + src->add(", uint tid [[thread_index_in_threadgroup]]"); + break; case LatteConst::ShaderType::Pixel: src->add(", bool frontFacing [[front_facing]]"); break; } + + // streamout buffer (transform feedback) + if ((decompilerContext->shaderType == 
LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite) + src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.tfStorageBindingPoint); + } + // uniform buffers _emitUniformBufferDefinitions(decompilerContext); // textures From 46269c0069e75b2133bab7329ce9e66751f7d3a4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 20 Aug 2024 09:52:54 +0200 Subject: [PATCH 094/368] fix: mesh shader errors --- .../LatteDecompilerEmitMSL.cpp | 47 ++++++++++++------- .../LatteDecompilerEmitMSLHeader.hpp | 1 - 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 2061790a7..c4ad46ad9 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2822,13 +2822,12 @@ static void _emitGSReadInputVFetchCode(LatteDecompilerShaderContext* shaderConte src->add(" = "); _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); - src->add("(objectPayload["); + src->add("(in["); if (texInstruction->textureFetch.srcSel[0] >= 4) cemu_assert_unimplemented(); if (texInstruction->textureFetch.srcSel[1] >= 4) cemu_assert_unimplemented(); - // todo: Index type - src->add("0"); + src->add("vertexIndex"); src->addFmt("].passParameterSem{}.", texInstruction->textureFetch.offset/16); @@ -3588,9 +3587,10 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte // write point size if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) src->add("out.pointSize = 
supportBuffer.pointSize;" _CRLF); - // emit vertex - src->add("mesh.set_vertex(out);" _CRLF); - src->add("mesh.set_index(tid, tid);" _CRLF); + // Emit vertex (if the vertex index matches thread id) + src->add("mesh.set_vertex(vertexIndex, out);" _CRLF); + src->add("mesh.set_index(vertexIndex, vertexIndex);" _CRLF); + src->add("vertexIndex++;" _CRLF); // increment transform feedback pointer for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) { @@ -3859,7 +3859,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } break; case LatteConst::ShaderType::Geometry: - functionType = "[[mesh, max_total_threads_per_threadgroup(MAX_THREADS_PER_THREADGROUP)]]"; + functionType = "[[mesh, max_total_threads_per_threadgroup(1)]]"; outputTypeName = "void"; break; case LatteConst::ShaderType::Pixel: @@ -3886,7 +3886,11 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } else { + // Input is defined as object payload + src->add("object_data VertexOut* in = objectPayload.vertexOut;" _CRLF); src->add("GeometryOut out;" _CRLF); + // The index of the current vertex that is being emitted + src->add("uint vertexIndex = 0;" _CRLF); } } else @@ -4132,20 +4136,29 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader == false) src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); } - // TODO: this should be handled outside of the shader, because clipping currently wouldn't work (or would it?) 
- if (shader->shaderType == LatteConst::ShaderType::Vertex) - src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); - if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader) + if (shaderContext->options->usesGeometryShader) { - src->add("if (tid == 0) {" _CRLF); - src->add("meshGridProperties.set_threadgroups_per_grid(uint3(1, 1, 1));" _CRLF); - src->add("}" _CRLF); + if (shader->shaderType == LatteConst::ShaderType::Vertex) + { + src->add("if (tid == 0) {" _CRLF); + src->add("meshGridProperties.set_threadgroups_per_grid(uint3(1, 1, 1));" _CRLF); + src->add("}" _CRLF); + } + } + else + { + if (shader->shaderType == LatteConst::ShaderType::Vertex) + { + // TODO: this should be handled outside of the shader, because clipping currently wouldn't work (or would it?) + if (shader->shaderType == LatteConst::ShaderType::Vertex) + src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); + } + + // Return + src->add("return out;" _CRLF); } - // return - if (!shaderContext->options->usesGeometryShader) - src->add("return out;" _CRLF); // end of shader main src->add("}" _CRLF); src->shrink_to_fit(); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 71d83710f..95fd4cef5 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -290,7 +290,6 @@ namespace LatteDecompiler if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) { src->add("struct VertexOut {" _CRLF); - src->add("float4 position [[position]];" _CRLF); uint32 ringParameterCountVS2GS = 0; if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) { From 9679c6b7e819c688b2091289acf28bd548191eba Mon Sep 17 
00:00:00 2001 From: Samuliak Date: Tue, 20 Aug 2024 17:33:55 +0200 Subject: [PATCH 095/368] set primitive count in mesh shaders --- .../LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 8 ++++++-- .../LatteDecompilerEmitMSLHeader.hpp | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index c4ad46ad9..734aa2e45 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3849,7 +3849,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // Will also modify vid in case of an indexed draw src->add("VertexIn fetchInput(VERTEX_BUFFER_DEFINITIONS, thread uint& vid);" _CRLF); - functionType = "[[object, max_total_threads_per_threadgroup(MAX_THREADS_PER_THREADGROUP), max_total_threadgroups_per_mesh_grid(1)]]"; + functionType = "[[object, max_total_threads_per_threadgroup(VERTICES_PER_PRIMITIVE), max_total_threadgroups_per_mesh_grid(1)]]"; outputTypeName = "void"; } else @@ -3876,7 +3876,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shader->shaderType == LatteConst::ShaderType::Vertex) { // Calculate the imaginary vertex id - src->add("uint vid = tig * PRIMITIVE_VERTEX_COUNT + tid;" _CRLF); + src->add("uint vid = tig * VERTICES_PER_PRIMITIVE + tid;" _CRLF); // TODO: don't hardcode the instance index src->add("uint iid = 0;" _CRLF); // Fetch the input @@ -4145,6 +4145,10 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->add("meshGridProperties.set_threadgroups_per_grid(uint3(1, 1, 1));" _CRLF); src->add("}" _CRLF); } + else if (shader->shaderType == LatteConst::ShaderType::Geometry) + { + src->add("mesh.set_primitive_count(vertexIndex / VERTICES_PER_PRIMITIVE);" _CRLF); + } } else { diff --git 
a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 95fd4cef5..9f8b62ae4 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -303,7 +303,7 @@ namespace LatteDecompiler src->addFmt("int4 passParameterSem{};" _CRLF, f); src->add("};" _CRLF _CRLF); src->add("struct ObjectPayload {" _CRLF); - src->add("VertexOut vertexOut[PRIMITIVE_VERTEX_COUNT];" _CRLF); + src->add("VertexOut vertexOut[VERTICES_PER_PRIMITIVE];" _CRLF); src->add("};" _CRLF _CRLF); } if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) @@ -325,8 +325,10 @@ namespace LatteDecompiler } src->add("};" _CRLF _CRLF); + const uint32 MAX_PRIMITIVE_COUNT = 8; + // Define the mesh shader output type - src->add("using MeshType = mesh;" _CRLF); + src->addFmt("using MeshType = mesh;" _CRLF, MAX_PRIMITIVE_COUNT, MAX_PRIMITIVE_COUNT); } } } From 2f4ceb33e07a18ca4ab0e8cf56e6fec5dd29bfa8 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 20 Aug 2024 18:59:25 +0200 Subject: [PATCH 096/368] patch object and mesh shaders --- .../LatteDecompilerEmitMSL.cpp | 4 +- .../LatteDecompilerEmitMSLHeader.hpp | 21 +- .../Renderer/Metal/MetalPipelineCache.cpp | 9 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 8 +- .../Renderer/Metal/RendererShaderMtl.cpp | 185 ++++++++++++++++-- .../Latte/Renderer/Metal/RendererShaderMtl.h | 7 + 6 files changed, 208 insertions(+), 26 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 734aa2e45..9945725a0 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3847,7 +3847,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* 
shaderContext, { // Defined just-in-time // Will also modify vid in case of an indexed draw - src->add("VertexIn fetchInput(VERTEX_BUFFER_DEFINITIONS, thread uint& vid);" _CRLF); + src->add("VertexIn fetchInput(thread uint& vid VERTEX_BUFFER_DEFINITIONS);" _CRLF); functionType = "[[object, max_total_threads_per_threadgroup(VERTICES_PER_PRIMITIVE), max_total_threadgroups_per_mesh_grid(1)]]"; outputTypeName = "void"; @@ -3880,7 +3880,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // TODO: don't hardcode the instance index src->add("uint iid = 0;" _CRLF); // Fetch the input - src->add("VertexIn in = fetchInput(VERTEX_BUFFERS, vid);" _CRLF); + src->add("VertexIn in = fetchInput(vid VERTEX_BUFFERS);" _CRLF); // Output is defined as object payload src->add("object_data VertexOut& out = objectPayload.vertexOut[tid];" _CRLF); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 9f8b62ae4..51385c2e8 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -156,6 +156,7 @@ namespace LatteDecompiler static void _emitAttributes(LatteDecompilerShaderContext* decompilerContext) { auto src = decompilerContext->shaderSource; + std::string attributeNames; if (decompilerContext->shader->shaderType == LatteConst::ShaderType::Vertex) { @@ -168,13 +169,16 @@ namespace LatteDecompiler cemu_assert_debug(decompilerContext->output->resourceMappingVK.attributeMapping[i] >= 0); src->addFmt("uint4 attrDataSem{}", i); - if (!decompilerContext->options->usesGeometryShader) + if (decompilerContext->options->usesGeometryShader) + attributeNames += "#define ATTRIBUTE_NAME" + std::to_string((sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i]) + " attrDataSem" + std::to_string(i) + "\n"; + else src->addFmt(" 
[[attribute({})]]", (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i]); src->add(";" _CRLF); } } src->add("};" _CRLF _CRLF); } + src->addFmt("{}", attributeNames); } static void _emitVSOutputs(LatteDecompilerShaderContext* shaderContext) @@ -335,6 +339,21 @@ namespace LatteDecompiler static void emitHeader(LatteDecompilerShaderContext* decompilerContext) { + auto src = decompilerContext->shaderSource; + + if (decompilerContext->options->usesGeometryShader && (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) + { + src->add("#if PRIMITIVE_TYPE == point" _CRLF); + src->add("#define VERTICES_PER_PRIMITIVE 1" _CRLF); + src->add("#if PRIMITIVE_TYPE == line" _CRLF); + src->add("#define VERTICES_PER_PRIMITIVE 2" _CRLF); + src->add("#if PRIMITIVE_TYPE == triangle" _CRLF); + src->add("#define VERTICES_PER_PRIMITIVE 3" _CRLF); + src->add("#else" _CRLF); + src->add("#error unsupported primitive type" _CRLF); + src->add("#endif" _CRLF); + } + const bool dump_shaders_enabled = ActiveSettings::DumpShadersEnabled(); if(dump_shaders_enabled) decompilerContext->shaderSource->add("// start of shader inputs/outputs, predetermined by Cemu. 
Do not touch" _CRLF); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 94ab37210..d68ddf5ef 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -92,7 +92,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS uint32 bufferIndex = bufferGroup.attributeBufferIndex; uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; - uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; bufferStride = Align(bufferStride, 4); // HACK @@ -117,6 +117,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS auto mtlVertexShader = static_cast(vertexShader->shader); auto mtlPixelShader = static_cast(pixelShader->shader); + mtlVertexShader->CompileVertexFunction(); mtlPixelShader->CompileFragmentFunction(activeFBO); // Render pipeline state @@ -127,9 +128,9 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS desc->setVertexDescriptor(vertexDescriptor); // Color attachments - const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = LatteGPUState.contextNew.CB_COLOR_CONTROL; + const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); - uint32 renderTargetMask = LatteGPUState.contextNew.CB_TARGET_MASK.get_MASK(); + uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); for (uint8 i = 0; i < 8; i++) { const auto& colorBuffer = activeFBO->colorBuffer[i]; @@ -149,7 +150,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS { colorAttachment->setBlendingEnabled(true); - const auto& blendControlReg = LatteGPUState.contextNew.CB_BLENDN_CONTROL[i]; + const auto& 
blendControlReg = lcr.CB_BLENDN_CONTROL[i]; auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f5a11118a..0fa60eea6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -766,17 +766,11 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); - if (!vertexShader || !static_cast(vertexShader->shader)->GetFunction()) + if (!vertexShader) { debug_printf("no vertex function, skipping draw\n"); return; } - // TODO: remove this? - if (geometryShader) - { - debug_printf("geometry shader aren't supported on Metal yet, skipping draw\n"); - return; - } const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Depth stencil state diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 465e93163..17322b197 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -7,6 +7,8 @@ #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" +#include "HW/Latte/Core/FetchShader.h" +#include "HW/Latte/ISA/RegDefines.h" extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; @@ -14,15 +16,8 @@ extern std::atomic_int g_compiled_shaders_async; RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) : RendererShader(type, 
baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer} { - if (m_type == ShaderType::kFragment) - { - // Fragment functions are compiled just-in-time - m_mslCode = mslCode; - } - else - { - Compile(mslCode); - } + // TODO: don't compile just-in-time + m_mslCode = mslCode; // Count shader compilation g_compiled_shaders_total++; @@ -34,13 +29,176 @@ RendererShaderMtl::~RendererShaderMtl() m_function->release(); } +void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, Renderer::INDEX_TYPE hostIndexType) +{ + cemu_assert_debug(m_type == ShaderType::kVertex); + + std::string fullCode; + + // Primitive type + const LattePrimitiveMode primitiveMode = static_cast(lcr.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE()); + fullCode += "#define PRIMITIVE_TYPE "; + switch (primitiveMode) + { + case LattePrimitiveMode::POINTS: + fullCode += "point"; + break; + case LattePrimitiveMode::LINES: + fullCode += "line"; + break; + case LattePrimitiveMode::TRIANGLES: + fullCode += "triangle"; + break; + default: + break; + } + fullCode += "\n"; + + // Vertex buffers + std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; + std::string vertexBuffers = "#define VERTEX_BUFFERS "; + std::string inputFetchDefinition = "VertexIn fetchInput(thread uint& vid VERTEX_BUFFER_DEFINITIONS) {\n"; + inputFetchDefinition += "VertexIn in;\n"; + if (hostIndexType != Renderer::INDEX_TYPE::NONE) + { + vertexBufferDefinitions += ", device "; + switch (hostIndexType) + { + case Renderer::INDEX_TYPE::U16: + vertexBufferDefinitions += "ushort"; + break; + case Renderer::INDEX_TYPE::U32: + vertexBufferDefinitions += "uint"; + break; + default: + cemu_assert_suspicious(); + break; + } + // TODO: don't hardcode the index + vertexBufferDefinitions += "* indexBuffer [[buffer(20)]]"; + vertexBuffers += ", indexBuffer"; + inputFetchDefinition += "vid = indexBuffer[vid]\n"; + } + for 
(auto& bufferGroup : fetchShader->bufferGroups) + { + std::optional fetchType; + + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& attr = bufferGroup.attrib[j]; + + uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not used? + + std::string formatName; + switch (GetMtlVertexFormat(attr.format)) + { + case MTL::VertexFormatUChar: + formatName = "uchar"; + break; + case MTL::VertexFormatUChar2: + formatName = "uchar2"; + break; + case MTL::VertexFormatUChar3: + formatName = "uchar3"; + break; + case MTL::VertexFormatUChar4: + formatName = "uchar4"; + break; + case MTL::VertexFormatUShort: + formatName = "ushort"; + break; + case MTL::VertexFormatUShort2: + formatName = "ushort2"; + break; + case MTL::VertexFormatUShort3: + formatName = "ushort3"; + break; + case MTL::VertexFormatUShort4: + formatName = "ushort4"; + break; + case MTL::VertexFormatUInt: + formatName = "uint"; + break; + case MTL::VertexFormatUInt2: + formatName = "uint2"; + break; + case MTL::VertexFormatUInt3: + formatName = "uint3"; + break; + case MTL::VertexFormatUInt4: + formatName = "uint4"; + break; + } + + // Fetch the attribute + inputFetchDefinition += "in.ATTRIBUTE_NAME" + std::to_string(semanticId) + " = "; + inputFetchDefinition += "*(device " + formatName + "*)"; + inputFetchDefinition += "(vertexBuffer" + std::to_string(attr.attributeBufferIndex); + inputFetchDefinition += " + vid + " + std::to_string(attr.offset) + ");\n"; + + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; + + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } + + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = 
(lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + fullCode += ", device uchar* vertexBuffer" + std::to_string(bufferIndex) + " [[buffer(" + std::to_string(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)) + ")]]"; + } + inputFetchDefinition += "return in;\n"; + inputFetchDefinition += "}\n"; + + fullCode += vertexBufferDefinitions + "\n"; + fullCode += vertexBuffers + "\n"; + fullCode += m_mslCode; + fullCode += inputFetchDefinition; + + Compile(fullCode); +} + +void RendererShaderMtl::CompileMeshFunction(const LatteContextRegister& lcr, const LatteFetchShader* fetchShader) +{ + cemu_assert_debug(m_type == ShaderType::kGeometry); + + std::string fullCode; + + // Primitive type + const LattePrimitiveMode primitiveMode = static_cast(lcr.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE()); + fullCode += "#define PRIMITIVE_TYPE "; + switch (primitiveMode) + { + case LattePrimitiveMode::POINTS: + fullCode += "point"; + break; + case LattePrimitiveMode::LINES: + fullCode += "line"; + break; + case LattePrimitiveMode::TRIANGLES: + fullCode += "triangle"; + break; + default: + break; + } + fullCode += "\n"; + + fullCode += m_mslCode; + Compile(fullCode); +} + void RendererShaderMtl::CompileFragmentFunction(CachedFBOMtl* activeFBO) { cemu_assert_debug(m_type == ShaderType::kFragment); - if (m_function) - m_function->release(); - std::string fullCode; // Define color attachment data types @@ -77,6 +235,9 @@ void RendererShaderMtl::CompileFragmentFunction(CachedFBOMtl* activeFBO) void RendererShaderMtl::Compile(const std::string& mslCode) { + if (m_function) + m_function->release(); + NS::Error* error = nullptr; MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), nullptr, &error); if (error) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index eea12ae7d..1a53313a0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -21,6 +21,13 @@ class RendererShaderMtl : public RendererShader RendererShaderMtl(class MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode); virtual ~RendererShaderMtl(); + void CompileVertexFunction() + { + Compile(m_mslCode); + } + + void CompileObjectFunction(const LatteContextRegister& lcr, const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, Renderer::INDEX_TYPE hostIndexType); + void CompileMeshFunction(const LatteContextRegister& lcr, const LatteFetchShader* fetchShader); void CompileFragmentFunction(CachedFBOMtl* activeFBO); MTL::Function* GetFunction() const From 97f441ecf1ffa93951287d0f2b7eaf31c9e23ea0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 20 Aug 2024 20:14:02 +0200 Subject: [PATCH 097/368] draw with geometry shaders --- .../LatteDecompilerEmitMSL.cpp | 14 +- .../LatteDecompilerEmitMSLHeader.hpp | 8 +- .../Renderer/Metal/MetalPipelineCache.cpp | 230 +++++++------ .../Latte/Renderer/Metal/MetalPipelineCache.h | 11 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 303 +++++++----------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 43 ++- .../Renderer/Metal/RendererShaderMtl.cpp | 27 +- 7 files changed, 332 insertions(+), 304 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 9945725a0..02581b03d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2822,7 +2822,7 @@ static void _emitGSReadInputVFetchCode(LatteDecompilerShaderContext* shaderConte src->add(" = "); _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); - src->add("(in["); + src->add("(objectPayload.vertexOut["); if 
(texInstruction->textureFetch.srcSel[0] >= 4) cemu_assert_unimplemented(); if (texInstruction->textureFetch.srcSel[1] >= 4) @@ -3871,7 +3871,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->addFmt("{} {} main0(", functionType, outputTypeName); LatteDecompiler::emitInputs(shaderContext); src->add(") {" _CRLF); - if (shaderContext->options->usesGeometryShader) + if (shaderContext->options->usesGeometryShader && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) { if (shader->shaderType == LatteConst::ShaderType::Vertex) { @@ -3884,10 +3884,8 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // Output is defined as object payload src->add("object_data VertexOut& out = objectPayload.vertexOut[tid];" _CRLF); } - else + else if (shader->shaderType == LatteConst::ShaderType::Geometry) { - // Input is defined as object payload - src->add("object_data VertexOut* in = objectPayload.vertexOut;" _CRLF); src->add("GeometryOut out;" _CRLF); // The index of the current vertex that is being emitted src->add("uint vertexIndex = 0;" _CRLF); @@ -4093,9 +4091,9 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, { // import from geometry shader if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = asy_type(passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); + src->addFmt("{} = as_type(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); + src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); else 
cemu_assert_unimplemented(); } @@ -4137,7 +4135,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); } - if (shaderContext->options->usesGeometryShader) + if (shaderContext->options->usesGeometryShader && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) { if (shader->shaderType == LatteConst::ShaderType::Vertex) { diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 51385c2e8..b5e16ec7c 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -325,7 +325,7 @@ namespace LatteDecompiler { if (decompilerContext->parsedGSCopyShader->paramMapping[p].exportType != 2) continue; - src->addFmt("float4 passParameterSem{} [[user(locn)]];" _CRLF, (sint32)decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam, decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam & 0x7F); + src->addFmt("float4 passParameterSem{} [[user(locn{})]];" _CRLF, (sint32)decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam, decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam & 0x7F); } src->add("};" _CRLF _CRLF); @@ -345,9 +345,9 @@ namespace LatteDecompiler { src->add("#if PRIMITIVE_TYPE == point" _CRLF); src->add("#define VERTICES_PER_PRIMITIVE 1" _CRLF); - src->add("#if PRIMITIVE_TYPE == line" _CRLF); + src->add("#elif PRIMITIVE_TYPE == line" _CRLF); src->add("#define VERTICES_PER_PRIMITIVE 2" _CRLF); - src->add("#if PRIMITIVE_TYPE == triangle" _CRLF); + src->add("#elif PRIMITIVE_TYPE == triangle" _CRLF); src->add("#define VERTICES_PER_PRIMITIVE 3" _CRLF); src->add("#else" _CRLF); src->add("#error unsupported primitive type" _CRLF); @@ -462,7 +462,7 @@ namespace 
LatteDecompiler src->add(", mesh_grid_properties meshGridProperties"); src->add(", uint tig [[threadgroup_position_in_grid]]"); src->add(", uint tid [[thread_index_in_threadgroup]]"); - src->add(", VERTEX_BUFFER_DEFINITIONS"); + src->add(" VERTEX_BUFFER_DEFINITIONS"); } else { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index d68ddf5ef..d8d39b79e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -9,6 +9,8 @@ #include "HW/Latte/Core/FetchShader.h" #include "HW/Latte/ISA/RegDefines.h" +#include "Metal/MTLDevice.hpp" +#include "Metal/MTLRenderPipeline.hpp" #include "config/ActiveSettings.h" #define INVALID_TITLE_ID 0xFFFFFFFFFFFFFFFF @@ -18,6 +20,68 @@ uint64 s_cacheTitleId = INVALID_TITLE_ID; extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; +template +void SetFragmentState(T* desc, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +{ + // Color attachments + const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; + uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); + uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); + for (uint8 i = 0; i < 8; i++) + { + const auto& colorBuffer = activeFBO->colorBuffer[i]; + auto texture = static_cast(colorBuffer.texture); + if (!texture) + { + continue; + } + auto colorAttachment = desc->colorAttachments()->object(i); + colorAttachment->setPixelFormat(texture->GetRGBAView()->pixelFormat()); + colorAttachment->setWriteMask(GetMtlColorWriteMask((renderTargetMask >> (i * 4)) & 0xF)); + + // Blending + bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; + // Only float data type is blendable + if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).dataType == MetalDataType::FLOAT) + { + colorAttachment->setBlendingEnabled(true); + + const auto& blendControlReg 
= lcr.CB_BLENDN_CONTROL[i]; + + auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); + auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); + auto dstRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); + + colorAttachment->setRgbBlendOperation(rgbBlendOp); + colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); + if (blendControlReg.get_SEPARATE_ALPHA_BLEND()) + { + colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); + colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); + colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); + } + else + { + colorAttachment->setAlphaBlendOperation(rgbBlendOp); + colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor); + } + } + } + + // Depth stencil attachment + if (activeFBO->depthBuffer.texture) + { + auto texture = static_cast(activeFBO->depthBuffer.texture); + desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); + if (activeFBO->depthBuffer.hasStencil) + { + desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); + } + } +} + void MetalPipelineCache::ShaderCacheLoading_begin(uint64 cacheTitleId) { s_cacheTitleId = cacheTitleId; @@ -53,9 +117,9 @@ MetalPipelineCache::~MetalPipelineCache() m_binaryArchiveURL->release(); } -MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const 
LatteDecompilerShader* pixelShader, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) { - uint64 stateHash = CalculatePipelineHash(fetchShader, vertexShader, pixelShader, activeFBO, lcr); + uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, activeFBO, lcr); auto& pipeline = m_pipelineCache[stateHash]; if (pipeline) return pipeline; @@ -127,65 +191,18 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS // TODO: don't always set the vertex descriptor? desc->setVertexDescriptor(vertexDescriptor); - // Color attachments - const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; - uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); - uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); - for (uint8 i = 0; i < 8; i++) - { - const auto& colorBuffer = activeFBO->colorBuffer[i]; - auto texture = static_cast(colorBuffer.texture); - if (!texture) - { - continue; - } - auto colorAttachment = desc->colorAttachments()->object(i); - colorAttachment->setPixelFormat(texture->GetRGBAView()->pixelFormat()); - colorAttachment->setWriteMask(GetMtlColorWriteMask((renderTargetMask >> (i * 4)) & 0xF)); - - // Blending - bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; - // Only float data type is blendable - if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).dataType == MetalDataType::FLOAT) - { - colorAttachment->setBlendingEnabled(true); - - const auto& blendControlReg = lcr.CB_BLENDN_CONTROL[i]; - - auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); - auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); - auto dstRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); - - colorAttachment->setRgbBlendOperation(rgbBlendOp); - colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); - colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); - if 
(blendControlReg.get_SEPARATE_ALPHA_BLEND()) - { - colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); - colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); - colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); - } - else - { - colorAttachment->setAlphaBlendOperation(rgbBlendOp); - colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor); - colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor); - } - } - } + SetFragmentState(desc, activeFBO, lcr); - // Depth stencil attachment - if (activeFBO->depthBuffer.texture) - { - auto texture = static_cast(activeFBO->depthBuffer.texture); - desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - if (activeFBO->depthBuffer.hasStencil) - { - desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - } - } + TryLoadBinaryArchive(); - LoadBinary(desc); + // Load binary + if (m_binaryArchive) + { + NS::Object* binArchives[] = {m_binaryArchive}; + auto binaryArchives = NS::Array::alloc()->init(binArchives, 1); + desc->setBinaryArchives(binaryArchives); + binaryArchives->release(); + } NS::Error* error = nullptr; #ifdef CEMU_DEBUG_ASSERT @@ -211,10 +228,21 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS { debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); error->release(); + return nullptr; } else { - SaveBinary(desc); + // Save binary + if (m_binaryArchive) + { + NS::Error* error = nullptr; + m_binaryArchive->addRenderPipelineFunctions(desc, &error); + if (error) + { + debug_printf("error saving render pipeline functions: %s\n", error->localizedDescription()->utf8String()); + error->release(); + } + } } //newPipelineCount++; @@ -230,7 +258,57 @@ MTL::RenderPipelineState* MetalPipelineCache::GetPipelineState(const LatteFetchS return 
pipeline; } -uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType) +{ + uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, activeFBO, lcr); + + stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE]; + stateHash = std::rotl(stateHash, 7); + + stateHash += (uint8)hostIndexType; + stateHash = std::rotl(stateHash, 7); // TODO: 7?s + + auto& pipeline = m_pipelineCache[stateHash]; + if (pipeline) + return pipeline; + + auto mtlObjectShader = static_cast(vertexShader->shader); + auto mtlMeshShader = static_cast(geometryShader->shader); + auto mtlPixelShader = static_cast(pixelShader->shader); + mtlObjectShader->CompileObjectFunction(lcr, fetchShader, vertexShader, hostIndexType); + mtlMeshShader->CompileMeshFunction(lcr, fetchShader); + mtlPixelShader->CompileFragmentFunction(activeFBO); + + // Render pipeline state + MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); + desc->setObjectFunction(mtlObjectShader->GetFunction()); + desc->setMeshFunction(mtlMeshShader->GetFunction()); + desc->setFragmentFunction(mtlPixelShader->GetFunction()); + + SetFragmentState(desc, activeFBO, lcr); + + TryLoadBinaryArchive(); + + // Load binary + // TODO: no binary archives? 
:( + + NS::Error* error = nullptr; +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Mesh pipeline state", desc)); +#endif + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); + if (error) + { + debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); + error->release(); + return nullptr; + } + desc->release(); + + return pipeline; +} + +uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) { // Hash uint64 stateHash = 0; @@ -261,9 +339,6 @@ uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchSh stateHash += fetchShader->getVkPipelineHashFragment(); stateHash = std::rotl(stateHash, 7); - stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE]; - stateHash = std::rotl(stateHash, 7); - stateHash += lcr.GetRawView()[mmVGT_STRMOUT_EN]; stateHash = std::rotl(stateHash, 7); @@ -340,30 +415,3 @@ void MetalPipelineCache::TryLoadBinaryArchive() } desc->release(); } - -void MetalPipelineCache::LoadBinary(MTL::RenderPipelineDescriptor* desc) -{ - TryLoadBinaryArchive(); - - if (!m_binaryArchive) - return; - - NS::Object* binArchives[] = {m_binaryArchive}; - auto binaryArchives = NS::Array::alloc()->init(binArchives, 1); - desc->setBinaryArchives(binaryArchives); - binaryArchives->release(); -} - -void MetalPipelineCache::SaveBinary(MTL::RenderPipelineDescriptor* desc) -{ - if (!m_binaryArchive) - return; - - NS::Error* error = nullptr; - m_binaryArchive->addRenderPipelineFunctions(desc, &error); - if (error) - { - debug_printf("error saving render pipeline functions: %s\n", error->localizedDescription()->utf8String()); - error->release(); - } -} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h 
index 1fa1f87c0..30f40208b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -4,6 +4,7 @@ #include "HW/Latte/ISA/LatteReg.h" #include "HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" class MetalPipelineCache { @@ -15,7 +16,9 @@ class MetalPipelineCache MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalPipelineCache(); - MTL::RenderPipelineState* GetPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + + MTL::RenderPipelineState* GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType); private: class MetalRenderer* m_mtlr; @@ -25,11 +28,7 @@ class MetalPipelineCache NS::URL* m_binaryArchiveURL; MTL::BinaryArchive* m_binaryArchive; - uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + uint64 CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); void TryLoadBinaryArchive(); - - void LoadBinary(MTL::RenderPipelineDescriptor* desc); - - void 
SaveBinary(MTL::RenderPipelineDescriptor* desc); }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 0fa60eea6..323c67dd5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,6 +20,7 @@ #include "Common/precompiled.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "Metal/MTLDevice.hpp" +#include "Metal/MTLRenderCommandEncoder.hpp" #include "gui/guiWrapper.h" #define COMMIT_TRESHOLD 256 @@ -28,6 +29,63 @@ extern bool hasValidFramebufferAttached; float supportBufferData[512 * 4]; +void SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) +{ + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentBuffer(buffer, offset, index); + break; + } +} + +void SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index) +{ + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexTexture(texture, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectTexture(texture, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshTexture(texture, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentTexture(texture, index); + break; + } +} + +void SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index) +{ + switch (shaderType) + { + case 
METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentSamplerState(samplerState, index); + break; + } +} + MetalRenderer::MetalRenderer() { m_device = MTL::CreateSystemDefaultDevice(); @@ -654,7 +712,6 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u m_memoryManager->UntrackVertexBuffer(bufferIndex); } - buffer.needsRebind = true; buffer.offset = offset; buffer.size = size; buffer.restrideInfo = {}; @@ -664,7 +721,7 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { - m_state.m_uniformBufferOffsets[GetMtlShaderType(shaderType)][bufferIndex] = offset; + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shaderType)][bufferIndex] = offset; } RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) @@ -957,16 +1014,19 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto restridedBuffer = m_memoryManager->RestrideBufferIfNeeded(i, bufferStride); // Bind - if (vertexBufferRange.needsRebind) + if (true) { - renderCommandEncoder->setVertexBuffer(restridedBuffer.buffer, restridedBuffer.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); - vertexBufferRange.needsRebind = false; + SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, (geometryShader != nullptr)),restridedBuffer.buffer, restridedBuffer.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); } } } // Render pipeline state - 
MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew); + MTL::RenderPipelineState* renderPipelineState; + if (geometryShader) + renderPipelineState = m_pipelineCache->GetMeshPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew, hostIndexType); + else + renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew); if (renderPipelineState != encoderState.m_renderPipelineState) { renderCommandEncoder->setRenderPipelineState(renderPipelineState); @@ -978,18 +1038,51 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteStreamout_PrepareDrawcall(count, instanceCount); // Uniform buffers, textures and samplers - BindStageResources(renderCommandEncoder, vertexShader); - BindStageResources(renderCommandEncoder, pixelShader); + BindStageResources(renderCommandEncoder, vertexShader, (geometryShader != nullptr)); + if (geometryShader) + BindStageResources(renderCommandEncoder, geometryShader, (geometryShader != nullptr)); + BindStageResources(renderCommandEncoder, pixelShader, (geometryShader != nullptr)); // Draw + MTL::Buffer* indexBuffer = nullptr; if (hostIndexType != INDEX_TYPE::NONE) + indexBuffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(indexBufferIndex); + if (geometryShader) { - auto mtlIndexType = GetMtlIndexType(hostIndexType); - MTL::Buffer* indexBuffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(indexBufferIndex); - renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, indexBufferOffset, instanceCount, baseVertex, baseInstance); - } else + // TODO: don't hardcode the index + if (indexBuffer) + renderCommandEncoder->setObjectBuffer(indexBuffer, indexBufferOffset, 20); + + uint32 
verticesPerPrimitive = 0; + switch (primitiveMode) + { + case LattePrimitiveMode::POINTS: + verticesPerPrimitive = 1; + break; + case LattePrimitiveMode::LINES: + verticesPerPrimitive = 2; + break; + case LattePrimitiveMode::TRIANGLES: + verticesPerPrimitive = 3; + break; + default: + throw std::runtime_error("Invalid primitive mode"); + break; + } + + renderCommandEncoder->drawMeshThreadgroups(MTL::Size(count / verticesPerPrimitive, 1, 1), MTL::Size(verticesPerPrimitive, 1, 1), MTL::Size(1, 1, 1)); + } + else { - renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); + if (indexBuffer) + { + auto mtlIndexType = GetMtlIndexType(hostIndexType); + renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, indexBufferOffset, instanceCount, baseVertex, baseInstance); + } + else + { + renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); + } } LatteStreamout_FinishDrawcall(false); @@ -1080,7 +1173,7 @@ MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL:: } // Some render passes clear the attachments, forceRecreate is supposed to be used in those cases -MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecreate, bool rebindStateIfNewEncoder) +MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecreate) { // Check if we need to begin a new render pass if (m_commandEncoder) @@ -1134,12 +1227,6 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr ResetEncoderState(); - if (rebindStateIfNewEncoder) - { - // Rebind all the render state - RebindRenderState(renderCommandEncoder); - } - return renderCommandEncoder; } @@ -1253,9 +1340,9 @@ bool MetalRenderer::AcquireNextDrawable(bool mainWindow) return true; } -void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader) 
+void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader) { - auto mtlShaderType = GetMtlShaderType(shader->shaderType); + auto mtlShaderType = GetMtlShaderType(shader->shaderType, usesGeometryShader); sint32 textureCount = shader->resourceMapping.getTextureCount(); for (int i = 0; i < textureCount; ++i) @@ -1295,88 +1382,21 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { // TODO: don't bind if already bound if (textureDim == Latte::E_DIM::DIM_1D) - { - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexTexture(m_nullTexture1D, binding); - renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentTexture(m_nullTexture1D, binding); - renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); - break; - } - default: - UNREACHABLE; - } - } + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding); else - { - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexTexture(m_nullTexture2D, binding); - renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentTexture(m_nullTexture2D, binding); - renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); - break; - } - default: - UNREACHABLE; - } - } + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding); + SetSamplerState(renderCommandEncoder, mtlShaderType, m_nearestSampler, binding); continue; } if (textureDim == Latte::E_DIM::DIM_1D && (textureView->dim != Latte::E_DIM::DIM_1D)) { - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexTexture(m_nullTexture1D, 
binding); - renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentTexture(m_nullTexture1D, binding); - renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); - break; - } - default: - UNREACHABLE; - } + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding); continue; } else if (textureDim == Latte::E_DIM::DIM_2D && (textureView->dim != Latte::E_DIM::DIM_2D && textureView->dim != Latte::E_DIM::DIM_2D_MSAA)) { - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexTexture(m_nullTexture2D, binding); - renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentTexture(m_nullTexture2D, binding); - renderCommandEncoder->setVertexSamplerState(m_nearestSampler, binding); - break; - } - default: - UNREACHABLE; - } + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding); continue; } @@ -1399,21 +1419,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { boundSampler = sampler; - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexSamplerState(sampler, binding); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentSamplerState(sampler, binding); - break; - } - default: - UNREACHABLE; - } + SetSamplerState(renderCommandEncoder, mtlShaderType, sampler, binding); } // get texture register word 0 @@ -1425,21 +1431,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE boundTexture = {textureView, word4}; MTL::Texture* mtlTexture = textureView->GetSwizzledView(word4); - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexTexture(mtlTexture, binding); - 
break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentTexture(mtlTexture, binding); - break; - } - default: - UNREACHABLE; - } + SetTexture(renderCommandEncoder, mtlShaderType, mtlTexture, binding); } // Support buffer @@ -1531,23 +1523,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE if (!HasUnifiedMemory()) buffer->didModifyRange(NS::Range(supportBuffer.offset, size)); - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexBuffer(buffer, supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); - //renderCommandEncoder->setVertexBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentBuffer(buffer, supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); - //renderCommandEncoder->setFragmentBytes(supportBufferData, sizeof(supportBufferData), MTL_SUPPORT_BUFFER_BINDING); - break; - } - default: - UNREACHABLE; - } + SetBuffer(renderCommandEncoder, mtlShaderType, buffer, supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); } // Uniform buffers @@ -1562,7 +1538,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE continue; } - size_t offset = m_state.m_uniformBufferOffsets[mtlShaderType][i]; + size_t offset = m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][i]; if (offset == INVALID_OFFSET) continue; @@ -1573,57 +1549,18 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE boundOffset = offset; // TODO: only set the offset if already bound - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexBuffer(m_memoryManager->GetBufferCache(), offset, binding); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentBuffer(m_memoryManager->GetBufferCache(), offset, binding); - 
break; - } - default: - UNREACHABLE; - } + SetBuffer(renderCommandEncoder, mtlShaderType, m_memoryManager->GetBufferCache(), offset, binding); } } // Storage buffer if (shader->resourceMapping.tfStorageBindingPoint >= 0) { - switch (shader->shaderType) - { - case LatteConst::ShaderType::Vertex: - { - renderCommandEncoder->setVertexBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); - break; - } - case LatteConst::ShaderType::Pixel: - { - renderCommandEncoder->setFragmentBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); - break; - } - default: - UNREACHABLE; - } + SetBuffer(renderCommandEncoder, mtlShaderType, m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); m_state.m_encoderState.m_uniformBufferOffsets[mtlShaderType][shader->resourceMapping.tfStorageBindingPoint] = INVALID_OFFSET; } } -void MetalRenderer::RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder) -{ - // Vertex buffers - for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) - { - auto& vertexBufferRange = m_state.m_vertexBuffers[i]; - if (vertexBufferRange.offset != INVALID_OFFSET) - vertexBufferRange.needsRebind = true; - } -} - void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 4ea13fb2d..682685341 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -31,27 +31,57 @@ struct MetalRestrideInfo struct MetalBoundBuffer { - bool needsRebind = false; size_t offset = INVALID_OFFSET; size_t size = 0; // Memory manager will write restride info to this variable MetalRestrideInfo restrideInfo; }; +enum MetalGeneralShaderType +{ + METAL_GENERAL_SHADER_TYPE_VERTEX, + 
METAL_GENERAL_SHADER_TYPE_GEOMETRY, + METAL_GENERAL_SHADER_TYPE_FRAGMENT, + + METAL_GENERAL_SHADER_TYPE_TOTAL +}; + +inline MetalGeneralShaderType GetMtlGeneralShaderType(LatteConst::ShaderType shaderType) +{ + switch (shaderType) + { + case LatteConst::ShaderType::Vertex: + return METAL_GENERAL_SHADER_TYPE_VERTEX; + case LatteConst::ShaderType::Geometry: + return METAL_GENERAL_SHADER_TYPE_GEOMETRY; + case LatteConst::ShaderType::Pixel: + return METAL_GENERAL_SHADER_TYPE_FRAGMENT; + default: + return METAL_GENERAL_SHADER_TYPE_TOTAL; + } +} + enum MetalShaderType { METAL_SHADER_TYPE_VERTEX, + METAL_SHADER_TYPE_OBJECT, + METAL_SHADER_TYPE_MESH, METAL_SHADER_TYPE_FRAGMENT, METAL_SHADER_TYPE_TOTAL }; -inline MetalShaderType GetMtlShaderType(LatteConst::ShaderType shaderType) +inline MetalShaderType GetMtlShaderType(LatteConst::ShaderType shaderType, bool usesGeometryShader) { switch (shaderType) { case LatteConst::ShaderType::Vertex: - return METAL_SHADER_TYPE_VERTEX; + if (usesGeometryShader) + return METAL_SHADER_TYPE_OBJECT; + else + return METAL_SHADER_TYPE_VERTEX; + case LatteConst::ShaderType::Geometry: + return METAL_SHADER_TYPE_MESH; case LatteConst::ShaderType::Pixel: return METAL_SHADER_TYPE_FRAGMENT; default: @@ -105,7 +135,7 @@ struct MetalState MetalBoundBuffer m_vertexBuffers[MAX_MTL_BUFFERS] = {{}}; // TODO: find out what is the max number of bound textures on the Wii U class LatteTextureViewMtl* m_textures[64] = {nullptr}; - size_t m_uniformBufferOffsets[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; + size_t m_uniformBufferOffsets[METAL_GENERAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; MTL::Viewport m_viewport; MTL::ScissorRect m_scissor; @@ -347,7 +377,7 @@ class MetalRenderer : public Renderer bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer); void WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer); MTL::RenderCommandEncoder* GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor); - 
MTL::RenderCommandEncoder* GetRenderCommandEncoder(bool forceRecreate = false, bool rebindStateIfNewEncoder = true); + MTL::RenderCommandEncoder* GetRenderCommandEncoder(bool forceRecreate = false); MTL::ComputeCommandEncoder* GetComputeCommandEncoder(); MTL::BlitCommandEncoder* GetBlitCommandEncoder(); void EndEncoding(); @@ -355,8 +385,7 @@ class MetalRenderer : public Renderer bool AcquireNextDrawable(bool mainWindow); - void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader); - void RebindRenderState(MTL::RenderCommandEncoder* renderCommandEncoder); + void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader); void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 17322b197..883a85c6b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -58,7 +58,6 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; std::string vertexBuffers = "#define VERTEX_BUFFERS "; std::string inputFetchDefinition = "VertexIn fetchInput(thread uint& vid VERTEX_BUFFER_DEFINITIONS) {\n"; - inputFetchDefinition += "VertexIn in;\n"; if (hostIndexType != Renderer::INDEX_TYPE::NONE) { vertexBufferDefinitions += ", device "; @@ -77,8 +76,9 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c // TODO: don't hardcode the index vertexBufferDefinitions += "* indexBuffer [[buffer(20)]]"; vertexBuffers += ", indexBuffer"; - inputFetchDefinition += "vid = indexBuffer[vid]\n"; + inputFetchDefinition += "vid = indexBuffer[vid];\n"; } + inputFetchDefinition += "VertexIn 
in;\n"; for (auto& bufferGroup : fetchShader->bufferGroups) { std::optional fetchType; @@ -92,51 +92,67 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c continue; // attribute not used? std::string formatName; + uint8 componentCount = 0; switch (GetMtlVertexFormat(attr.format)) { case MTL::VertexFormatUChar: formatName = "uchar"; + componentCount = 1; break; case MTL::VertexFormatUChar2: formatName = "uchar2"; + componentCount = 2; break; case MTL::VertexFormatUChar3: formatName = "uchar3"; + componentCount = 3; break; case MTL::VertexFormatUChar4: formatName = "uchar4"; + componentCount = 4; break; case MTL::VertexFormatUShort: formatName = "ushort"; + componentCount = 1; break; case MTL::VertexFormatUShort2: formatName = "ushort2"; + componentCount = 2; break; case MTL::VertexFormatUShort3: formatName = "ushort3"; + componentCount = 3; break; case MTL::VertexFormatUShort4: formatName = "ushort4"; + componentCount = 4; break; case MTL::VertexFormatUInt: formatName = "uint"; + componentCount = 1; break; case MTL::VertexFormatUInt2: formatName = "uint2"; + componentCount = 2; break; case MTL::VertexFormatUInt3: formatName = "uint3"; + componentCount = 3; break; case MTL::VertexFormatUInt4: formatName = "uint4"; + componentCount = 4; break; } // Fetch the attribute inputFetchDefinition += "in.ATTRIBUTE_NAME" + std::to_string(semanticId) + " = "; - inputFetchDefinition += "*(device " + formatName + "*)"; + inputFetchDefinition += "uint4(*(device " + formatName + "*)"; inputFetchDefinition += "(vertexBuffer" + std::to_string(attr.attributeBufferIndex); - inputFetchDefinition += " + vid + " + std::to_string(attr.offset) + ");\n"; + inputFetchDefinition += " + vid + " + std::to_string(attr.offset) + ")"; + for (uint8 i = 0; i < (4 - componentCount); i++) + inputFetchDefinition += ", 0"; + inputFetchDefinition += ");\n"; if (fetchType.has_value()) cemu_assert_debug(fetchType == attr.fetchType); @@ -153,7 +169,8 @@ void 
RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - fullCode += ", device uchar* vertexBuffer" + std::to_string(bufferIndex) + " [[buffer(" + std::to_string(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)) + ")]]"; + vertexBufferDefinitions += ", device uchar* vertexBuffer" + std::to_string(bufferIndex) + " [[buffer(" + std::to_string(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)) + ")]]"; + vertexBuffers += ", vertexBuffer" + std::to_string(bufferIndex); } inputFetchDefinition += "return in;\n"; inputFetchDefinition += "}\n"; From 76cf3de80b4d5919eb3c66e74132a5585cfc69b9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 20 Aug 2024 20:27:41 +0200 Subject: [PATCH 098/368] disable vertex buffer restride for object shaders --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 323c67dd5..91df236c0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1007,16 +1007,31 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto& vertexBufferRange = m_state.m_vertexBuffers[i]; if (vertexBufferRange.offset != INVALID_OFFSET) { + MTL::Buffer* buffer; + size_t offset; + // Restride - uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + i * 7; - uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + if (geometryShader) + { + // Object shaders don't need restriding, since the attribute are fetched in the shader + buffer = m_memoryManager->GetBufferCache(); + offset = m_state.m_vertexBuffers[i].offset; + } + else + { + uint32 
bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + i * 7; + uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + auto restridedBuffer = m_memoryManager->RestrideBufferIfNeeded(i, bufferStride); - auto restridedBuffer = m_memoryManager->RestrideBufferIfNeeded(i, bufferStride); + buffer = restridedBuffer.buffer; + offset = restridedBuffer.offset; + } // Bind if (true) { - SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, (geometryShader != nullptr)),restridedBuffer.buffer, restridedBuffer.offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, (geometryShader != nullptr)), buffer, offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); } } } From 7500a54b38990d08316506933def7e64af6d1a9d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 21 Aug 2024 08:17:41 +0200 Subject: [PATCH 099/368] fix: geometry shaders --- .../LatteDecompilerEmitMSL.cpp | 27 ++++++-- .../LatteDecompilerEmitMSLHeader.hpp | 56 +++++++++++---- .../Renderer/Metal/MetalPipelineCache.cpp | 1 - .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- .../Renderer/Metal/RendererShaderMtl.cpp | 69 ++++--------------- .../Latte/Renderer/Metal/RendererShaderMtl.h | 1 - 6 files changed, 82 insertions(+), 74 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 02581b03d..1b0f3f717 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3589,7 +3589,6 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); // Emit vertex (if the vertex index matches thread id) src->add("mesh.set_vertex(vertexIndex, out);" _CRLF); - src->add("mesh.set_index(vertexIndex, vertexIndex);" 
_CRLF); src->add("vertexIndex++;" _CRLF); // increment transform feedback pointer for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) @@ -3849,7 +3848,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // Will also modify vid in case of an indexed draw src->add("VertexIn fetchInput(thread uint& vid VERTEX_BUFFER_DEFINITIONS);" _CRLF); - functionType = "[[object, max_total_threads_per_threadgroup(VERTICES_PER_PRIMITIVE), max_total_threadgroups_per_mesh_grid(1)]]"; + functionType = "[[object, max_total_threads_per_threadgroup(VERTICES_PER_VERTEX_PRIMITIVE), max_total_threadgroups_per_mesh_grid(1)]]"; outputTypeName = "void"; } else @@ -3876,7 +3875,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shader->shaderType == LatteConst::ShaderType::Vertex) { // Calculate the imaginary vertex id - src->add("uint vid = tig * VERTICES_PER_PRIMITIVE + tid;" _CRLF); + src->add("uint vid = tig * VERTICES_PER_VERTEX_PRIMITIVE + tid;" _CRLF); // TODO: don't hardcode the instance index src->add("uint iid = 0;" _CRLF); // Fetch the input @@ -4145,7 +4144,27 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } else if (shader->shaderType == LatteConst::ShaderType::Geometry) { - src->add("mesh.set_primitive_count(vertexIndex / VERTICES_PER_PRIMITIVE);" _CRLF); + src->add("mesh.set_primitive_count(GET_PRIMITIVE_COUNT(vertexIndex));" _CRLF); + + // Set indices + if (shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE] == 1) // Line strip + { + src->add("for (uint8_t i = 0; i < GET_PRIMITIVE_COUNT(vertexIndex) * 2; i++) {" _CRLF); + src->add("mesh.set_index(i, (i 2 3) + i % 2);" _CRLF); + src->add("}" _CRLF); + } + else if (shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE] == 2) // Triangle strip + { + src->add("for (uint8_t i = 0; i < GET_PRIMITIVE_COUNT(vertexIndex) * 3; i++) {" _CRLF); + src->add("mesh.set_index(i, (i / 3) + i % 3);" _CRLF); + src->add("}" _CRLF); + } 
+ else + { + src->add("for (uint8_t i = 0; i < vertexIndex; i++) {" _CRLF); + src->add("mesh.set_index(i, i);" _CRLF); + src->add("}" _CRLF); + } } } else diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index b5e16ec7c..20f75c95a 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -1,5 +1,6 @@ #pragma once +#include "Common/precompiled.h" #include "HW/Latte/Core/LatteConst.h" namespace LatteDecompiler { @@ -307,7 +308,7 @@ namespace LatteDecompiler src->addFmt("int4 passParameterSem{};" _CRLF, f); src->add("};" _CRLF _CRLF); src->add("struct ObjectPayload {" _CRLF); - src->add("VertexOut vertexOut[VERTICES_PER_PRIMITIVE];" _CRLF); + src->add("VertexOut vertexOut[VERTICES_PER_VERTEX_PRIMITIVE];" _CRLF); src->add("};" _CRLF _CRLF); } if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) @@ -329,10 +330,10 @@ namespace LatteDecompiler } src->add("};" _CRLF _CRLF); - const uint32 MAX_PRIMITIVE_COUNT = 8; + const uint32 MAX_VERTEX_COUNT = 32; // Define the mesh shader output type - src->addFmt("using MeshType = mesh;" _CRLF, MAX_PRIMITIVE_COUNT, MAX_PRIMITIVE_COUNT); + src->addFmt("using MeshType = mesh;" _CRLF, MAX_VERTEX_COUNT, MAX_VERTEX_COUNT); } } } @@ -343,15 +344,46 @@ namespace LatteDecompiler if (decompilerContext->options->usesGeometryShader && (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) { - src->add("#if PRIMITIVE_TYPE == point" _CRLF); - src->add("#define VERTICES_PER_PRIMITIVE 1" _CRLF); - src->add("#elif PRIMITIVE_TYPE == line" _CRLF); - src->add("#define VERTICES_PER_PRIMITIVE 2" _CRLF); - src->add("#elif PRIMITIVE_TYPE == triangle" _CRLF); - src->add("#define VERTICES_PER_PRIMITIVE 3" _CRLF); - src->add("#else" _CRLF); 
- src->add("#error unsupported primitive type" _CRLF); - src->add("#endif" _CRLF); + // TODO: make vsOutPrimType parth of the shader hash + LattePrimitiveMode vsOutPrimType = static_cast(decompilerContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]); + uint32 gsOutPrimType = decompilerContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; + + switch (vsOutPrimType) + { + case LattePrimitiveMode::POINTS: + src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 1" _CRLF); + break; + case LattePrimitiveMode::LINES: + src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 2" _CRLF); + break; + case LattePrimitiveMode::TRIANGLES: + src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 3" _CRLF); + break; + default: + cemu_assert_suspicious(); + break; + } + if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + switch (gsOutPrimType) + { + case 0: // Point + src->add("#define MTL_PRIMITIVE_TYPE point" _CRLF); + src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount / 1)" _CRLF); + break; + case 1: // Line strip + src->add("#define MTL_PRIMITIVE_TYPE line" _CRLF); + src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount - 1)" _CRLF); + break; + case 2: // Triangle strip + src->add("#define MTL_PRIMITIVE_TYPE triangle" _CRLF); + src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount - 2)" _CRLF); + break; + default: + cemu_assert_suspicious(); + break; + } + } } const bool dump_shaders_enabled = ActiveSettings::DumpShadersEnabled(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index d8d39b79e..8e115e587 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -276,7 +276,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFe auto mtlMeshShader = static_cast(geometryShader->shader); auto mtlPixelShader = static_cast(pixelShader->shader); 
mtlObjectShader->CompileObjectFunction(lcr, fetchShader, vertexShader, hostIndexType); - mtlMeshShader->CompileMeshFunction(lcr, fetchShader); mtlPixelShader->CompileFragmentFunction(activeFBO); // Render pipeline state diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 91df236c0..89c9c2a3a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1013,7 +1013,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Restride if (geometryShader) { - // Object shaders don't need restriding, since the attribute are fetched in the shader + // Object shaders don't need restriding, since the attributes are fetched in the shader buffer = m_memoryManager->GetBufferCache(); offset = m_state.m_vertexBuffers[i].offset; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 883a85c6b..dc2846ef5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -16,8 +16,15 @@ extern std::atomic_int g_compiled_shaders_async; RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer} { - // TODO: don't compile just-in-time - m_mslCode = mslCode; + if (type == ShaderType::kGeometry) + { + Compile(mslCode); + } + else + { + // TODO: don't compile just-in-time + m_mslCode = mslCode; + } // Count shader compilation g_compiled_shaders_total++; @@ -35,25 +42,6 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c std::string fullCode; - // Primitive type - const LattePrimitiveMode primitiveMode = 
static_cast(lcr.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE()); - fullCode += "#define PRIMITIVE_TYPE "; - switch (primitiveMode) - { - case LattePrimitiveMode::POINTS: - fullCode += "point"; - break; - case LattePrimitiveMode::LINES: - fullCode += "line"; - break; - case LattePrimitiveMode::TRIANGLES: - fullCode += "triangle"; - break; - default: - break; - } - fullCode += "\n"; - // Vertex buffers std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; std::string vertexBuffers = "#define VERTEX_BUFFERS "; @@ -83,6 +71,10 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c { std::optional fetchType; + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) { auto& attr = bufferGroup.attrib[j]; @@ -149,7 +141,7 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c inputFetchDefinition += "in.ATTRIBUTE_NAME" + std::to_string(semanticId) + " = "; inputFetchDefinition += "uint4(*(device " + formatName + "*)"; inputFetchDefinition += "(vertexBuffer" + std::to_string(attr.attributeBufferIndex); - inputFetchDefinition += " + vid + " + std::to_string(attr.offset) + ")"; + inputFetchDefinition += " + vid * " + std::to_string(bufferStride) + " + " + std::to_string(attr.offset) + ")"; for (uint8 i = 0; i < (4 - componentCount); i++) inputFetchDefinition += ", 0"; inputFetchDefinition += ");\n"; @@ -165,10 +157,6 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c } } - uint32 bufferIndex = bufferGroup.attributeBufferIndex; - uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; - uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - vertexBufferDefinitions += ", device uchar* 
vertexBuffer" + std::to_string(bufferIndex) + " [[buffer(" + std::to_string(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)) + ")]]"; vertexBuffers += ", vertexBuffer" + std::to_string(bufferIndex); } @@ -183,35 +171,6 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c Compile(fullCode); } -void RendererShaderMtl::CompileMeshFunction(const LatteContextRegister& lcr, const LatteFetchShader* fetchShader) -{ - cemu_assert_debug(m_type == ShaderType::kGeometry); - - std::string fullCode; - - // Primitive type - const LattePrimitiveMode primitiveMode = static_cast(lcr.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE()); - fullCode += "#define PRIMITIVE_TYPE "; - switch (primitiveMode) - { - case LattePrimitiveMode::POINTS: - fullCode += "point"; - break; - case LattePrimitiveMode::LINES: - fullCode += "line"; - break; - case LattePrimitiveMode::TRIANGLES: - fullCode += "triangle"; - break; - default: - break; - } - fullCode += "\n"; - - fullCode += m_mslCode; - Compile(fullCode); -} - void RendererShaderMtl::CompileFragmentFunction(CachedFBOMtl* activeFBO) { cemu_assert_debug(m_type == ShaderType::kFragment); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index 1a53313a0..e21db55ed 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -27,7 +27,6 @@ class RendererShaderMtl : public RendererShader } void CompileObjectFunction(const LatteContextRegister& lcr, const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, Renderer::INDEX_TYPE hostIndexType); - void CompileMeshFunction(const LatteContextRegister& lcr, const LatteFetchShader* fetchShader); void CompileFragmentFunction(CachedFBOMtl* activeFBO); MTL::Function* GetFunction() const From a832bc225ed35913029f57b9bee1210b6ef799f0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 21 Aug 2024 12:12:45 +0200 Subject: [PATCH 100/368] support 
rect primitive emulation --- .../LatteDecompilerEmitMSL.cpp | 35 ++-- .../LatteDecompilerEmitMSLHeader.hpp | 44 +++-- .../Renderer/Metal/MetalPipelineCache.cpp | 183 +++++++++++++++++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 29 +-- 4 files changed, 241 insertions(+), 50 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 1b0f3f717..faf20065d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3821,20 +3821,22 @@ static void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* sh void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader) { + bool isRectVertexShader = (static_cast(shaderContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]) == LattePrimitiveMode::RECTS); + StringBuf* src = new StringBuf(1024*1024*12); // reserve 12MB for generated source (we resize-to-fit at the end) shaderContext->shaderSource = src; // debug info src->addFmt("// shader {:016x}" _CRLF, shaderContext->shaderBaseHash); #ifdef CEMU_DEBUG_ASSERT - src->addFmt("// usesIntegerValues: {}" _CRLF, shaderContext->analyzer.usesIntegerValues?"true":"false"); + src->addFmt("// usesIntegerValues: {}" _CRLF, shaderContext->analyzer.usesIntegerValues ? 
"true" : "false"); src->addFmt(_CRLF); #endif // include metal standard library src->add("#include " _CRLF); src->add("using namespace metal;" _CRLF); // header part (definitions for inputs and outputs) - LatteDecompiler::emitHeader(shaderContext); + LatteDecompiler::emitHeader(shaderContext, isRectVertexShader); // helper functions LatteDecompiler_emitHelperFunctions(shaderContext, src); const char* functionType = ""; @@ -3842,7 +3844,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: - if (shaderContext->options->usesGeometryShader) + if (shaderContext->options->usesGeometryShader || isRectVertexShader) { // Defined just-in-time // Will also modify vid in case of an indexed draw @@ -3868,9 +3870,9 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } // start of main src->addFmt("{} {} main0(", functionType, outputTypeName); - LatteDecompiler::emitInputs(shaderContext); + LatteDecompiler::emitInputs(shaderContext, isRectVertexShader); src->add(") {" _CRLF); - if (shaderContext->options->usesGeometryShader && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + if ((shaderContext->options->usesGeometryShader || isRectVertexShader) && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) { if (shader->shaderType == LatteConst::ShaderType::Vertex) { @@ -4086,7 +4088,8 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, continue; } - if (shaderContext->options->usesGeometryShader) + // TODO: is the if statement even needed? 
+ if (shaderContext->options->usesGeometryShader || isRectVertexShader) { // import from geometry shader if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) @@ -4130,11 +4133,11 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // vertex shader should write renderstate point size at the end if required but not modified by shader if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) { - if (shader->shaderType == LatteConst::ShaderType::Vertex && shaderContext->options->usesGeometryShader == false) + if (shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader) src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); } - if (shaderContext->options->usesGeometryShader && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + if ((shaderContext->options->usesGeometryShader || isRectVertexShader) && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) { if (shader->shaderType == LatteConst::ShaderType::Vertex) { @@ -4167,18 +4170,14 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } } } - else - { - if (shader->shaderType == LatteConst::ShaderType::Vertex) - { - // TODO: this should be handled outside of the shader, because clipping currently wouldn't work (or would it?) - if (shader->shaderType == LatteConst::ShaderType::Vertex) - src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); - } - // Return + // TODO: this should be handled outside of the shader, because clipping currently wouldn't work (or would it?) 
+ if ((shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader) || shader->shaderType == LatteConst::ShaderType::Geometry) + src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); + + // Return + if (!(shaderContext->options->usesGeometryShader || isRectVertexShader) || shader->shaderType == LatteConst::ShaderType::Pixel) src->add("return out;" _CRLF); - } // end of shader main src->add("}" _CRLF); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 20f75c95a..5e1b4c113 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -96,7 +96,7 @@ namespace LatteDecompiler uniformCurrentOffset += 8; } // define verticesPerInstance + streamoutBufferBaseX - if ((shader->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) || + if ((shader->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || (shader->shaderType == LatteConst::ShaderType::Geometry)) { src->add("int verticesPerInstance;" _CRLF); @@ -182,7 +182,7 @@ namespace LatteDecompiler src->addFmt("{}", attributeNames); } - static void _emitVSOutputs(LatteDecompilerShaderContext* shaderContext) + static void _emitVSOutputs(LatteDecompilerShaderContext* shaderContext, bool isRectVertexShader) { auto* src = shaderContext->shaderSource; @@ -214,15 +214,25 @@ namespace LatteDecompiler continue; // no ps input src->addFmt("float4 passParameterSem{}", psInputTable->import[psInputIndex].semanticId); - src->addFmt(" [[user(locn{})]]", psInputIndex); - if (psInputTable->import[psInputIndex].isFlat) - src->add(" [[flat]]"); - if (psInputTable->import[psInputIndex].isNoPerspective) - src->add(" [[center_no_perspective]]"); + if 
(!isRectVertexShader) + { + src->addFmt(" [[user(locn{})]]", psInputIndex); + if (psInputTable->import[psInputIndex].isFlat) + src->add(" [[flat]]"); + if (psInputTable->import[psInputIndex].isNoPerspective) + src->add(" [[center_no_perspective]]"); + } src->addFmt(";" _CRLF); } src->add("};" _CRLF _CRLF); + + if (isRectVertexShader) + { + src->add("struct ObjectPayload {" _CRLF); + src->add("VertexOut vertexOut[VERTICES_PER_VERTEX_PRIMITIVE];" _CRLF); + src->add("};" _CRLF _CRLF); + } } static void _emitPSInputs(LatteDecompilerShaderContext* shaderContext) @@ -251,7 +261,7 @@ namespace LatteDecompiler src->add("};" _CRLF _CRLF); } - static void _emitInputsAndOutputs(LatteDecompilerShaderContext* decompilerContext) + static void _emitInputsAndOutputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) { auto src = decompilerContext->shaderSource; @@ -288,7 +298,7 @@ namespace LatteDecompiler if (!decompilerContext->options->usesGeometryShader) { if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) - _emitVSOutputs(decompilerContext); + _emitVSOutputs(decompilerContext, isRectVertexShader); } else { @@ -338,11 +348,11 @@ namespace LatteDecompiler } } - static void emitHeader(LatteDecompilerShaderContext* decompilerContext) + static void emitHeader(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) { auto src = decompilerContext->shaderSource; - if (decompilerContext->options->usesGeometryShader && (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) + if ((decompilerContext->options->usesGeometryShader || isRectVertexShader) && (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) { // TODO: make vsOutPrimType parth of the shader hash LattePrimitiveMode vsOutPrimType = static_cast(decompilerContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]); @@ 
-359,6 +369,9 @@ namespace LatteDecompiler case LattePrimitiveMode::TRIANGLES: src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 3" _CRLF); break; + case LattePrimitiveMode::RECTS: + src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 3" _CRLF); + break; default: cemu_assert_suspicious(); break; @@ -394,7 +407,7 @@ namespace LatteDecompiler // uniform buffers _emitUniformBuffers(decompilerContext); // inputs and outputs - _emitInputsAndOutputs(decompilerContext); + _emitInputsAndOutputs(decompilerContext, isRectVertexShader); if (dump_shaders_enabled) decompilerContext->shaderSource->add("// end of shader inputs/outputs" _CRLF); @@ -467,14 +480,14 @@ namespace LatteDecompiler } } - static void emitInputs(LatteDecompilerShaderContext* decompilerContext) + static void emitInputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) { auto src = decompilerContext->shaderSource; switch (decompilerContext->shaderType) { case LatteConst::ShaderType::Vertex: - if (!decompilerContext->options->usesGeometryShader) + if (!(decompilerContext->options->usesGeometryShader || isRectVertexShader)) src->add("VertexIn in [[stage_in]], "); break; case LatteConst::ShaderType::Pixel: @@ -488,7 +501,7 @@ namespace LatteDecompiler switch (decompilerContext->shaderType) { case LatteConst::ShaderType::Vertex: - if (decompilerContext->options->usesGeometryShader) + if (decompilerContext->options->usesGeometryShader || isRectVertexShader) { src->add(", object_data ObjectPayload& objectPayload [[payload]]"); src->add(", mesh_grid_properties meshGridProperties"); @@ -505,7 +518,6 @@ namespace LatteDecompiler case LatteConst::ShaderType::Geometry: src->add(", MeshType mesh"); src->add(", const object_data ObjectPayload& objectPayload [[payload]]"); - src->add(", uint tid [[thread_index_in_threadgroup]]"); break; case LatteConst::ShaderType::Pixel: src->add(", bool frontFacing [[front_facing]]"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 8e115e587..ba5770129 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -2,6 +2,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Foundation/NSObject.hpp" +#include "HW/Latte/Core/LatteShader.h" #include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "HW/Latte/Renderer/Metal/LatteToMtl.h" #include "HW/Latte/Renderer/Metal/RendererShaderMtl.h" @@ -9,10 +10,177 @@ #include "HW/Latte/Core/FetchShader.h" #include "HW/Latte/ISA/RegDefines.h" -#include "Metal/MTLDevice.hpp" -#include "Metal/MTLRenderPipeline.hpp" #include "config/ActiveSettings.h" +static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) +{ + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + // make sure PS has matching input + if (!psInputTable->hasPSImportForSemanticId(vsSemanticId)) + continue; + gsSrc.append(fmt::format("out.passParameterSem{} = objectPayload.vertexOut[{}].passParameterSem{};\r\n", vsSemanticId, vIdx, vsSemanticId)); + } + gsSrc.append(fmt::format("out.position = objectPayload.vertexOut[{}].position;\r\n", vIdx)); + gsSrc.append(fmt::format("mesh.set_vertex({}, out);\r\n", vIdx)); +} + +static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, const char* variant, const LatteContextRegister& latteRegister) +{ + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 
0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + // make sure PS has matching input + if (!psInputTable->hasPSImportForSemanticId(vsSemanticId)) + continue; + gsSrc.append(fmt::format("passParameterSem{}Out = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", vsSemanticId, variant, vsSemanticId, vsSemanticId, vsSemanticId)); + } + gsSrc.append(fmt::format("out.position = gen4thVertex{}(objectPayload.vertexOut[0].position, objectPayload.vertexOut[1].position, objectPayload.vertexOut[2].position);\r\n", variant)); + gsSrc.append(fmt::format("mesh.set_vertex(3, out);\r\n")); +} + +static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 p0, sint32 p1, sint32 p2, sint32 p3, const char* variant, const LatteContextRegister& latteRegister) +{ + sint32 pList[4] = { p0, p1, p2, p3 }; + for (sint32 i = 0; i < 4; i++) + { + if (pList[i] == 3) + rectsEmulationGS_outputGeneratedVertex(gsSrc, vertexShader, psInputTable, variant, latteRegister); + else + rectsEmulationGS_outputSingleVertex(gsSrc, vertexShader, psInputTable, pList[i], latteRegister); + } +} + +static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer, const LatteDecompilerShader* vertexShader, const LatteContextRegister& latteRegister) +{ + std::string gsSrc; + gsSrc.append("#include \r\n"); + gsSrc.append("using namespace metal;\r\n"); + + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + + // inputs & outputs + std::string vertexOutDefinition = "struct VertexOut {\r\n"; + vertexOutDefinition += "float4 position;\r\n"; + std::string geometryOutDefinition = "struct GeometryOut {\r\n"; + 
geometryOutDefinition += "float4 position [[position]];\r\n"; + auto parameterMask = vertexShader->outputParameterMask; + for (sint32 f = 0; f < 2; f++) + { + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + auto psImport = psInputTable->getPSImportBySemanticId(vsSemanticId); + if (psImport == nullptr) + continue; + + if (f == 0) + { + vertexOutDefinition += fmt::format("float4 passParameterSem{};\r\n", vsSemanticId); + } + else + { + geometryOutDefinition += fmt::format("float4 passParameterSem{}", vsSemanticId); + + geometryOutDefinition += fmt::format(" [[user(locn{})]]", psInputTable->getPSImportLocationBySemanticId(vsSemanticId)); + if (psImport->isFlat) + geometryOutDefinition += " [[flat]]"; + if (psImport->isNoPerspective) + geometryOutDefinition += " [[center_no_perspective]]"; + geometryOutDefinition += ";\r\n"; + } + } + } + vertexOutDefinition += "};\r\n"; + geometryOutDefinition += "};\r\n"; + + gsSrc.append(vertexOutDefinition); + gsSrc.append(geometryOutDefinition); + + gsSrc.append("struct ObjectPayload {\r\n"); + gsSrc.append("VertexOut vertexOut[3];\r\n"); + gsSrc.append("};\r\n"); + + // gen function + gsSrc.append("float4 gen4thVertexA(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return b - (c - a);\r\n"); + gsSrc.append("}\r\n"); + + gsSrc.append("float4 gen4thVertexB(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return c - (b - a);\r\n"); + gsSrc.append("}\r\n"); + + gsSrc.append("float4 gen4thVertexC(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return c + (b - a);\r\n"); + gsSrc.append("}\r\n"); + + // main + gsSrc.append("using MeshType = mesh;\r\n"); + gsSrc.append("[[mesh, max_total_threads_per_threadgroup(1)]]\r\n"); + gsSrc.append("void main0(MeshType mesh, const 
object_data ObjectPayload& objectPayload [[payload]])\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("GeometryOut out;\r\n"); + + // there are two possible winding orders that need different triangle generation: + // 0 1 + // 2 3 + // and + // 0 1 + // 3 2 + // all others are just symmetries of these cases + + // we can determine the case by comparing the distance 0<->1 and 0<->2 + + gsSrc.append("float dist0_1 = length(objectPayload.vertexOut[1].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); + gsSrc.append("float dist0_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); + gsSrc.append("float dist1_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[1].position.xy);\r\n"); + + // emit vertices + gsSrc.append("if(dist0_1 > dist0_2 && dist0_1 > dist1_2)\r\n"); + gsSrc.append("{\r\n"); + // p0 to p1 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 2, 1, 0, 3, "A", latteRegister); + gsSrc.append("} else if ( dist0_2 > dist0_1 && dist0_2 > dist1_2 ) {\r\n"); + // p0 to p2 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 1, 2, 0, 3, "B", latteRegister); + gsSrc.append("} else {\r\n"); + // p1 to p2 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 0, 1, 2, 3, "C", latteRegister); + gsSrc.append("}\r\n"); + + gsSrc.append("mesh.set_primitive_count(2);\r\n"); + gsSrc.append("mesh.set_index(0, 0);\r\n"); + gsSrc.append("mesh.set_index(1, 1);\r\n"); + gsSrc.append("mesh.set_index(2, 2);\r\n"); + gsSrc.append("mesh.set_index(3, 1);\r\n"); + gsSrc.append("mesh.set_index(4, 2);\r\n"); + gsSrc.append("mesh.set_index(5, 3);\r\n"); + + gsSrc.append("}\r\n"); + + auto mtlShader = new RendererShaderMtl(metalRenderer, RendererShader::ShaderType::kGeometry, 0, 0, false, false, gsSrc); + + return mtlShader; +} + #define INVALID_TITLE_ID 0xFFFFFFFFFFFFFFFF uint64 s_cacheTitleId = 
INVALID_TITLE_ID; @@ -273,7 +441,16 @@ MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFe return pipeline; auto mtlObjectShader = static_cast(vertexShader->shader); - auto mtlMeshShader = static_cast(geometryShader->shader); + RendererShaderMtl* mtlMeshShader; + if (geometryShader) + { + mtlMeshShader = static_cast(geometryShader->shader); + } + else + { + // If there is no geometry shader, it means that we are emulating rects + mtlMeshShader = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); + } auto mtlPixelShader = static_cast(pixelShader->shader); mtlObjectShader->CompileObjectFunction(lcr, fetchShader, vertexShader, hostIndexType); mtlPixelShader->CompileFragmentFunction(activeFBO); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 89c9c2a3a..aef5445aa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -819,6 +819,11 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Render pass auto renderCommandEncoder = GetRenderCommandEncoder(); + // Primitive type + const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); + bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); + // Shaders LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); @@ -830,6 +835,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } const auto fetchShader = LatteSHRC_GetActiveFetchShader(); + bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + // Depth stencil state // TODO: implement this somehow //auto depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL; @@ -866,11 
+873,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } } - // Primitive type - const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); - auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); - bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); - // Blend color float* blendColorConstant = (float*)LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; renderCommandEncoder->setBlendColor(blendColorConstant[0], blendColorConstant[1], blendColorConstant[2], blendColorConstant[3]); @@ -1011,7 +1013,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 size_t offset; // Restride - if (geometryShader) + if (usesGeometryShader) { // Object shaders don't need restriding, since the attributes are fetched in the shader buffer = m_memoryManager->GetBufferCache(); @@ -1031,14 +1033,14 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Bind if (true) { - SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, (geometryShader != nullptr)), buffer, offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), buffer, offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); } } } // Render pipeline state MTL::RenderPipelineState* renderPipelineState; - if (geometryShader) + if (usesGeometryShader) renderPipelineState = m_pipelineCache->GetMeshPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew, hostIndexType); else renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew); @@ -1053,16 +1055,16 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteStreamout_PrepareDrawcall(count, instanceCount); // 
Uniform buffers, textures and samplers - BindStageResources(renderCommandEncoder, vertexShader, (geometryShader != nullptr)); + BindStageResources(renderCommandEncoder, vertexShader, usesGeometryShader); if (geometryShader) - BindStageResources(renderCommandEncoder, geometryShader, (geometryShader != nullptr)); - BindStageResources(renderCommandEncoder, pixelShader, (geometryShader != nullptr)); + BindStageResources(renderCommandEncoder, geometryShader, usesGeometryShader); + BindStageResources(renderCommandEncoder, pixelShader, usesGeometryShader); // Draw MTL::Buffer* indexBuffer = nullptr; if (hostIndexType != INDEX_TYPE::NONE) indexBuffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(indexBufferIndex); - if (geometryShader) + if (usesGeometryShader) { // TODO: don't hardcode the index if (indexBuffer) @@ -1078,10 +1080,11 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 verticesPerPrimitive = 2; break; case LattePrimitiveMode::TRIANGLES: + case LattePrimitiveMode::RECTS: verticesPerPrimitive = 3; break; default: - throw std::runtime_error("Invalid primitive mode"); + debug_printf("invalid primitive mode %u\n", (uint32)primitiveMode); break; } From 339af5c4c50447db2e9a0a2a71aa5f869f378f52 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 21 Aug 2024 13:49:15 +0200 Subject: [PATCH 101/368] support formats with X components --- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 771aa0599..80e44ef46 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -47,10 +47,10 @@ std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, MetalDataType::UINT, 8}}, {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, 
MetalDataType::INT, 8}}, {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, MetalDataType::FLOAT, 8}}, - {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO + {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 0}}, // TODO {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO - {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO - {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO + {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatRGBA8Uint, MetalDataType::UINT, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct? {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, MetalDataType::FLOAT, 4}}, {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, MetalDataType::UINT, 4}}, From 4f7288d9372302f31949c725c341cf407ff63c7c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 21 Aug 2024 14:51:36 +0200 Subject: [PATCH 102/368] fix: rect mesh shader errors --- .../LatteDecompilerEmitMSL.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index faf20065d..dcca05604 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3587,7 +3587,7 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte // write point size if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize 
== false) src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); - // Emit vertex (if the vertex index matches thread id) + src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); src->add("mesh.set_vertex(vertexIndex, out);" _CRLF); src->add("vertexIndex++;" _CRLF); // increment transform feedback pointer diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index ba5770129..ad63a041a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -44,7 +44,7 @@ static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const Lat // make sure PS has matching input if (!psInputTable->hasPSImportForSemanticId(vsSemanticId)) continue; - gsSrc.append(fmt::format("passParameterSem{}Out = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", vsSemanticId, variant, vsSemanticId, vsSemanticId, vsSemanticId)); + gsSrc.append(fmt::format("out.passParameterSem{} = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", vsSemanticId, variant, vsSemanticId, vsSemanticId, vsSemanticId)); } gsSrc.append(fmt::format("out.position = gen4thVertex{}(objectPayload.vertexOut[0].position, objectPayload.vertexOut[1].position, objectPayload.vertexOut[2].position);\r\n", variant)); gsSrc.append(fmt::format("mesh.set_vertex(3, out);\r\n")); @@ -60,6 +60,12 @@ static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteD else rectsEmulationGS_outputSingleVertex(gsSrc, vertexShader, psInputTable, pList[i], latteRegister); } + gsSrc.append(fmt::format("mesh.set_index(0, {});\r\n", pList[0])); + gsSrc.append(fmt::format("mesh.set_index(1, {});\r\n", pList[1])); + 
gsSrc.append(fmt::format("mesh.set_index(2, {});\r\n", pList[2])); + gsSrc.append(fmt::format("mesh.set_index(3, {});\r\n", pList[1])); + gsSrc.append(fmt::format("mesh.set_index(4, {});\r\n", pList[2])); + gsSrc.append(fmt::format("mesh.set_index(5, {});\r\n", pList[3])); } static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer, const LatteDecompilerShader* vertexShader, const LatteContextRegister& latteRegister) @@ -167,12 +173,6 @@ static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer gsSrc.append("}\r\n"); gsSrc.append("mesh.set_primitive_count(2);\r\n"); - gsSrc.append("mesh.set_index(0, 0);\r\n"); - gsSrc.append("mesh.set_index(1, 1);\r\n"); - gsSrc.append("mesh.set_index(2, 2);\r\n"); - gsSrc.append("mesh.set_index(3, 1);\r\n"); - gsSrc.append("mesh.set_index(4, 2);\r\n"); - gsSrc.append("mesh.set_index(5, 3);\r\n"); gsSrc.append("}\r\n"); From 3d0055af6a18d5e2846f60a1ba8db47c158fbe31 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 21 Aug 2024 15:53:40 +0200 Subject: [PATCH 103/368] end render pass if attachment is being read --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 90 +++++++++++++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 2 + 3 files changed, 78 insertions(+), 16 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 80e44ef46..51885e518 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -29,7 +29,7 @@ std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, MetalDataType::FLOAT, 8}}, // TODO: correct? {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}}, {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}}, // TODO: correct? 
- {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, MetalDataType::FLOAT, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGB10A2Unorm, MetalDataType::FLOAT, 4}}, // TODO: sRGB? {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, MetalDataType::FLOAT, 4}}, // TODO: correct? {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}}, // TODO: correct? {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, MetalDataType::FLOAT, 2}}, diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index aef5445aa..64141ed10 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -18,6 +18,7 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Common/precompiled.h" +#include "HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "Metal/MTLDevice.hpp" #include "Metal/MTLRenderCommandEncoder.hpp" @@ -816,6 +817,28 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto& encoderState = m_state.m_encoderState; + // Shaders + LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); + LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); + LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); + if (!vertexShader) + { + debug_printf("no vertex function, skipping draw\n"); + return; + } + const auto fetchShader = LatteSHRC_GetActiveFetchShader(); + + // Check if we need to end the render pass + // Fragment shader is most likely to require a render pass flush, so check for it first + bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); + if (!endRenderPass) + endRenderPass = CheckIfRenderPassNeedsFlush(vertexShader); + if (!endRenderPass && geometryShader) 
+ endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader); + + if (endRenderPass) + EndEncoding(); + // Render pass auto renderCommandEncoder = GetRenderCommandEncoder(); @@ -824,23 +847,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); - // Shaders - LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); - LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); - LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); - if (!vertexShader) - { - debug_printf("no vertex function, skipping draw\n"); - return; - } - const auto fetchShader = LatteSHRC_GetActiveFetchShader(); - - bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); // Depth stencil state - // TODO: implement this somehow - //auto depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL; - + // TODO // Disable depth write when there is no depth attachment //if (!m_state.m_lastUsedFBO->depthBuffer.texture) // depthControl.set_Z_WRITE_ENABLE(false); @@ -1103,6 +1113,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } } + m_state.m_isFirstDrawInRenderPass = false; + LatteStreamout_FinishDrawcall(false); LatteGPUState.drawCallCounter++; @@ -1235,6 +1247,7 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr // Update state m_state.m_lastUsedFBO = m_state.m_activeFBO; + m_state.m_isFirstDrawInRenderPass = true; auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO->GetRenderPassDescriptor()); #ifdef CEMU_DEBUG_ASSERT @@ -1358,6 +1371,53 @@ bool MetalRenderer::AcquireNextDrawable(bool mainWindow) return true; } +bool 
MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader) +{ + sint32 textureCount = shader->resourceMapping.getTextureCount(); + for (int i = 0; i < textureCount; ++i) + { + const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); + auto hostTextureUnit = relative_textureUnit; + auto textureDim = shader->textureUnitDim[relative_textureUnit]; + auto texUnitRegIndex = hostTextureUnit * 7; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS; + break; + case LatteConst::ShaderType::Pixel: + hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS; + break; + case LatteConst::ShaderType::Geometry: + hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS; + break; + default: + UNREACHABLE; + } + + auto textureView = m_state.m_textures[hostTextureUnit]; + if (!textureView) + continue; + + LatteTexture* baseTexture = textureView->baseTexture; + if (!m_state.m_isFirstDrawInRenderPass) + { + // If the texture is also used in the current render pass, we need to end the render pass to "flush" the texture + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto colorTarget = m_state.m_activeFBO->colorBuffer[i].texture; + if (colorTarget && colorTarget->baseTexture == baseTexture) + return true; + } + } + } + + return false; +} + void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader) { auto mtlShaderType = GetMtlShaderType(shader->shaderType, usesGeometryShader); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 682685341..896ef43ac 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -127,6 +127,7 @@ struct MetalState bool m_usesSRGB = false; bool m_skipDrawSequence = false; + bool m_isFirstDrawInRenderPass = true; class CachedFBOMtl* m_activeFBO = nullptr; // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change' @@ -385,6 +386,7 @@ class MetalRenderer : public Renderer bool AcquireNextDrawable(bool mainWindow); + bool CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader); void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader); void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); From 67a64c9fe9d5466ab9b52c2bd49881e79925b78f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 22 Aug 2024 13:58:50 +0200 Subject: [PATCH 104/368] rework the binding system --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 6 +- .../LegacyShaderDecompiler/LatteDecompiler.h | 5 +- .../LatteDecompilerAnalyzer.cpp | 24 ++- .../LatteDecompilerEmitMSLHeader.hpp | 51 +++-- .../LatteDecompilerInternal.h | 2 + .../HW/Latte/Renderer/Metal/MetalCommon.h | 2 - .../Renderer/Metal/MetalMemoryManager.cpp | 19 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 179 +++++++++--------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 19 +- .../Renderer/Metal/RendererShaderMtl.cpp | 22 ++- 10 files changed, 168 insertions(+), 161 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 66539a761..d17fd57d6 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -617,10 +617,12 @@ LatteDecompilerShader* LatteShader_CreateShaderFromDecompilerOutput(LatteDecompi shader->baseHash = baseHash; // copy resource mapping // HACK - if (g_renderer->GetType() != RendererAPI::OpenGL) + if (g_renderer->GetType() == RendererAPI::Vulkan) 
shader->resourceMapping = decompilerOutput.resourceMappingVK; - else + else if (g_renderer->GetType() == RendererAPI::OpenGL) shader->resourceMapping = decompilerOutput.resourceMappingGL; + else + shader->resourceMapping = decompilerOutput.resourceMappingMTL; // copy texture info shader->textureUnitMask2 = decompilerOutput.textureUnitMask; // copy streamout info diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 57df13b16..5d8b2c6f3 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -57,12 +57,14 @@ struct LatteDecompilerShaderResourceMapping // texture sint8 textureUnitToBindingPoint[LATTE_NUM_MAX_TEX_UNITS]; // uniform buffer - sint8 uniformVarsBufferBindingPoint{}; // special block for uniform registers/remapped array/custom variables + sint8 uniformVarsBufferBindingPoint{-1}; // special block for uniform registers/remapped array/custom variables sint8 uniformBuffersBindingPoint[LATTE_NUM_MAX_UNIFORM_BUFFERS]; // shader storage buffer for transform feedback (if alternative mode is used) sint8 tfStorageBindingPoint{-1}; // attributes (vertex shader only) sint8 attributeMapping[LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS]; + // Metal exclusive + sint8 indexBufferBinding{-1}; sint32 getTextureCount() { @@ -288,6 +290,7 @@ struct LatteDecompilerOutput_t // mapping and binding information LatteDecompilerShaderResourceMapping resourceMappingGL; LatteDecompilerShaderResourceMapping resourceMappingVK; + LatteDecompilerShaderResourceMapping resourceMappingMTL; }; struct LatteDecompilerSubroutineInfo; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index e84e48519..9a3db895b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -498,6 +498,18 @@ namespace LatteDecompiler } } + void _initTextureBindingPointsMTL(LatteDecompilerShaderContext* decompilerContext) + { + // for Vulkan we use consecutive indices + for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) + { + if (!decompilerContext->output->textureUnitMask[i]) + continue; + decompilerContext->output->resourceMappingMTL.textureUnitToBindingPoint[i] = decompilerContext->currentTextureBindingPointMTL; + decompilerContext->currentTextureBindingPointMTL++; + } + } + void _initHasUniformVarBlock(LatteDecompilerShaderContext* decompilerContext) { decompilerContext->hasUniformVarBlock = false; @@ -552,14 +564,13 @@ namespace LatteDecompiler } } // assign binding point to uniform var block - decompilerContext->output->resourceMappingGL.uniformVarsBufferBindingPoint = -1; // OpenGL currently doesnt use a uniform block if (decompilerContext->hasUniformVarBlock) { decompilerContext->output->resourceMappingVK.uniformVarsBufferBindingPoint = decompilerContext->currentBindingPointVK; decompilerContext->currentBindingPointVK++; + decompilerContext->output->resourceMappingMTL.uniformVarsBufferBindingPoint = decompilerContext->currentBufferBindingPointMTL; + decompilerContext->currentBufferBindingPointMTL++; } - else - decompilerContext->output->resourceMappingVK.uniformVarsBufferBindingPoint = -1; // assign binding points to uniform buffers if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) { @@ -578,6 +589,8 @@ namespace LatteDecompiler decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] = decompilerContext->currentBindingPointVK; decompilerContext->currentBindingPointVK++; + decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i] = decompilerContext->currentBufferBindingPointMTL; + decompilerContext->currentBufferBindingPointMTL++; } // for OpenGL we use the relative buffer index for (uint32 i = 
0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) @@ -599,6 +612,8 @@ namespace LatteDecompiler { decompilerContext->output->resourceMappingVK.tfStorageBindingPoint = decompilerContext->currentBindingPointVK; decompilerContext->currentBindingPointVK++; + decompilerContext->output->resourceMappingMTL.tfStorageBindingPoint = decompilerContext->currentBufferBindingPointMTL; + decompilerContext->currentBufferBindingPointMTL++; } } @@ -615,6 +630,7 @@ namespace LatteDecompiler { decompilerContext->output->resourceMappingGL.attributeMapping[i] = bindingIndex; decompilerContext->output->resourceMappingVK.attributeMapping[i] = bindingIndex; + decompilerContext->output->resourceMappingMTL.attributeMapping[i] = bindingIndex; bindingIndex++; } } @@ -1000,6 +1016,8 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD shaderContext->output->resourceMappingVK.setIndex = 2; LatteDecompiler::_initTextureBindingPointsGL(shaderContext); LatteDecompiler::_initTextureBindingPointsVK(shaderContext); + LatteDecompiler::_initTextureBindingPointsMTL(shaderContext); LatteDecompiler::_initUniformBindingPoints(shaderContext); LatteDecompiler::_initAttributeBindingPoints(shaderContext); + shaderContext->output->resourceMappingMTL.indexBufferBinding = shaderContext->currentBufferBindingPointMTL++; } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 5e1b4c113..53332f7ce 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -129,7 +129,7 @@ namespace LatteDecompiler if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) continue; - cemu_assert_debug(decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] >= 0); + 
cemu_assert_debug(decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i] >= 0); shaderSrc->addFmt("struct UBuff{} {{" _CRLF, i); shaderSrc->addFmt("float4 d[{}];" _CRLF, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); @@ -167,13 +167,13 @@ namespace LatteDecompiler { if (decompilerContext->analyzer.inputAttributSemanticMask[i]) { - cemu_assert_debug(decompilerContext->output->resourceMappingVK.attributeMapping[i] >= 0); + cemu_assert_debug(decompilerContext->output->resourceMappingMTL.attributeMapping[i] >= 0); src->addFmt("uint4 attrDataSem{}", i); if (decompilerContext->options->usesGeometryShader) - attributeNames += "#define ATTRIBUTE_NAME" + std::to_string((sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i]) + " attrDataSem" + std::to_string(i) + "\n"; + attributeNames += "#define ATTRIBUTE_NAME" + std::to_string((sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]) + " attrDataSem" + std::to_string(i) + "\n"; else - src->addFmt(" [[attribute({})]]", (sint32)decompilerContext->output->resourceMappingVK.attributeMapping[i]); + src->addFmt(" [[attribute({})]]", (sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]); src->add(";" _CRLF); } } @@ -424,9 +424,9 @@ namespace LatteDecompiler if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) continue; - cemu_assert_debug(decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] >= 0); + cemu_assert_debug(decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i] >= 0); - src->addFmt(", constant UBuff{}& ubuff{} [[buffer({})]]", i, i, (sint32)decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i]); + src->addFmt(", constant UBuff{}& ubuff{} [[buffer({})]]", i, i, (sint32)decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i]); } } } @@ 
-472,8 +472,8 @@ namespace LatteDecompiler cemu_assert_unimplemented(); } - uint32 binding = shaderContext->output->resourceMappingVK.textureUnitToBindingPoint[i]; - //uint32 textureBinding = shaderContext->output->resourceMappingVK.textureUnitToBindingPoint[i] % 31; + uint32 binding = shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i]; + //uint32 textureBinding = shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i] % 31; //uint32 samplerBinding = textureBinding % 16; src->addFmt(" tex{} [[texture({})]]", i, binding); src->addFmt(", sampler samplr{} [[sampler({})]]", i, binding); @@ -484,51 +484,44 @@ namespace LatteDecompiler { auto src = decompilerContext->shaderSource; - switch (decompilerContext->shaderType) - { - case LatteConst::ShaderType::Vertex: - if (!(decompilerContext->options->usesGeometryShader || isRectVertexShader)) - src->add("VertexIn in [[stage_in]], "); - break; - case LatteConst::ShaderType::Pixel: - src->add("FragmentIn in [[stage_in]], "); - break; - default: - break; - } - - src->add("constant SupportBuffer& supportBuffer [[buffer(30)]]"); switch (decompilerContext->shaderType) { case LatteConst::ShaderType::Vertex: if (decompilerContext->options->usesGeometryShader || isRectVertexShader) { - src->add(", object_data ObjectPayload& objectPayload [[payload]]"); - src->add(", mesh_grid_properties meshGridProperties"); - src->add(", uint tig [[threadgroup_position_in_grid]]"); - src->add(", uint tid [[thread_index_in_threadgroup]]"); - src->add(" VERTEX_BUFFER_DEFINITIONS"); + src->add("object_data ObjectPayload& objectPayload [[payload]]"); + src->add(", mesh_grid_properties meshGridProperties"); + src->add(", uint tig [[threadgroup_position_in_grid]]"); + src->add(", uint tid [[thread_index_in_threadgroup]]"); + src->add(" VERTEX_BUFFER_DEFINITIONS"); } else { + src->add("VertexIn in [[stage_in]]"); src->add(", uint vid [[vertex_id]]"); src->add(", uint iid [[instance_id]]"); } break; case 
LatteConst::ShaderType::Geometry: - src->add(", MeshType mesh"); + src->add("MeshType mesh"); src->add(", const object_data ObjectPayload& objectPayload [[payload]]"); break; case LatteConst::ShaderType::Pixel: + src->add("FragmentIn in [[stage_in]]"); src->add(", bool frontFacing [[front_facing]]"); + break; + default: break; } + if (decompilerContext->output->resourceMappingMTL.uniformVarsBufferBindingPoint >= 0) + src->addFmt(", constant SupportBuffer& supportBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.uniformVarsBufferBindingPoint); + // streamout buffer (transform feedback) if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) { if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite) - src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.tfStorageBindingPoint); + src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingMTL.tfStorageBindingPoint); } // uniform buffers diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h index 4b85d4586..f4135640f 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h @@ -260,6 +260,8 @@ struct LatteDecompilerShaderContext // emitter bool hasUniformVarBlock; sint32 currentBindingPointVK{}; + sint32 currentBufferBindingPointMTL{}; + sint32 currentTextureBindingPointMTL{}; struct ALUClauseTemporariesState* aluPVPSState{nullptr}; // misc std::vector list_subroutines; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 926af5f43..849567863 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -23,8 +23,6 @@ struct MetalPixelFormatSupport #define MAX_MTL_BUFFERS 31 // Buffer index 30 is reserved for the support buffer, buffer indices 27-29 are reserved for the helper shaders #define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 5) -// TODO: don't harcdode the support buffer binding -#define MTL_SUPPORT_BUFFER_BINDING 30 #define MAX_MTL_TEXTURES 31 #define MAX_MTL_SAMPLERS 16 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 93c6ec860..f0c122179 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -2,6 +2,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" #include "Common/precompiled.h" +#include "HW/Latte/Renderer/Metal/MetalRenderer.h" MetalVertexBufferCache::~MetalVertexBufferCache() { @@ -42,11 +43,8 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu renderCommandEncoder->setRenderPipelineState(m_restrideBufferPipeline->GetRenderPipelineState()); m_mtlr->GetEncoderState().m_renderPipelineState = m_restrideBufferPipeline->GetRenderPipelineState(); - MTL::Buffer* buffers[] = {bufferCache, buffer}; - size_t offsets[] = {vertexBufferRange.offset, restrideInfo.allocation.offset}; - renderCommandEncoder->setVertexBuffers(buffers, offsets, NS::Range(GET_HELPER_BUFFER_BINDING(0), 2)); - m_mtlr->GetEncoderState().m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(0)] = INVALID_OFFSET; - m_mtlr->GetEncoderState().m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(1)] = INVALID_OFFSET; + m_mtlr->SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, bufferCache, vertexBufferRange.offset, GET_HELPER_BUFFER_BINDING(0)); 
+ m_mtlr->SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, buffer, restrideInfo.allocation.offset, GET_HELPER_BUFFER_BINDING(1)); struct { @@ -54,16 +52,7 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu uint32 newStride; } strideData = {static_cast(stride), static_cast(newStride)}; renderCommandEncoder->setVertexBytes(&strideData, sizeof(strideData), GET_HELPER_BUFFER_BINDING(2)); - m_mtlr->GetEncoderState().m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(2)] = INVALID_OFFSET; - - // TODO: remove - uint32 vertexCount = vertexBufferRange.size / stride; - if (vertexCount * strideData.oldStride > buffers[0]->length() - offsets[0]) { - throw std::runtime_error("Source buffer overflow (" + std::to_string(vertexCount) + " * " + std::to_string(strideData.oldStride) + " > " + std::to_string(buffers[0]->length()) + " - " + std::to_string(offsets[0]) + ")"); - } - if (vertexCount * strideData.newStride > buffers[1]->length() - offsets[1]) { - throw std::runtime_error("Destination buffer overflow (" + std::to_string(vertexCount) + " * " + std::to_string(strideData.newStride) + " > " + std::to_string(buffers[1]->length()) + " - " + std::to_string(offsets[1]) + ")"); - } + m_mtlr->GetEncoderState().m_buffers[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(2)] = {nullptr}; renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangleStrip, NS::UInteger(0), vertexBufferRange.size / stride); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 64141ed10..1996ff46e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -30,63 +30,6 @@ extern bool hasValidFramebufferAttached; float supportBufferData[512 * 4]; -void SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) -{ - switch 
(shaderType) - { - case METAL_SHADER_TYPE_VERTEX: - renderCommandEncoder->setVertexBuffer(buffer, offset, index); - break; - case METAL_SHADER_TYPE_OBJECT: - renderCommandEncoder->setObjectBuffer(buffer, offset, index); - break; - case METAL_SHADER_TYPE_MESH: - renderCommandEncoder->setMeshBuffer(buffer, offset, index); - break; - case METAL_SHADER_TYPE_FRAGMENT: - renderCommandEncoder->setFragmentBuffer(buffer, offset, index); - break; - } -} - -void SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index) -{ - switch (shaderType) - { - case METAL_SHADER_TYPE_VERTEX: - renderCommandEncoder->setVertexTexture(texture, index); - break; - case METAL_SHADER_TYPE_OBJECT: - renderCommandEncoder->setObjectTexture(texture, index); - break; - case METAL_SHADER_TYPE_MESH: - renderCommandEncoder->setMeshTexture(texture, index); - break; - case METAL_SHADER_TYPE_FRAGMENT: - renderCommandEncoder->setFragmentTexture(texture, index); - break; - } -} - -void SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index) -{ - switch (shaderType) - { - case METAL_SHADER_TYPE_VERTEX: - renderCommandEncoder->setVertexSamplerState(samplerState, index); - break; - case METAL_SHADER_TYPE_OBJECT: - renderCommandEncoder->setObjectSamplerState(samplerState, index); - break; - case METAL_SHADER_TYPE_MESH: - renderCommandEncoder->setMeshSamplerState(samplerState, index); - break; - case METAL_SHADER_TYPE_FRAGMENT: - renderCommandEncoder->setFragmentSamplerState(samplerState, index); - break; - } -} - MetalRenderer::MetalRenderer() { m_device = MTL::CreateSystemDefaultDevice(); @@ -646,8 +589,6 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so return; } - MTL::Texture* textures[] = {srcTextureMtl->GetTexture(), dstTextureMtl->GetTexture()}; - struct CopyParams { uint32 width; @@ -664,11 +605,10 @@ void 
MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so renderCommandEncoder->setRenderPipelineState(m_copyTextureToTexturePipeline->GetRenderPipelineState()); m_state.m_encoderState.m_renderPipelineState = m_copyTextureToTexturePipeline->GetRenderPipelineState(); - renderCommandEncoder->setVertexTextures(textures, NS::Range(GET_HELPER_TEXTURE_BINDING(0), 2)); - m_state.m_encoderState.m_textures[METAL_SHADER_TYPE_VERTEX][GET_HELPER_TEXTURE_BINDING(0)] = {(LatteTextureViewMtl*)textures[0]}; - m_state.m_encoderState.m_textures[METAL_SHADER_TYPE_VERTEX][GET_HELPER_TEXTURE_BINDING(1)] = {(LatteTextureViewMtl*)textures[1]}; + SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, srcTextureMtl->GetTexture(), GET_HELPER_TEXTURE_BINDING(0)); + SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, dstTextureMtl->GetTexture(), GET_HELPER_TEXTURE_BINDING(1)); renderCommandEncoder->setVertexBytes(¶ms, sizeof(params), GET_HELPER_BUFFER_BINDING(0)); - m_state.m_encoderState.m_uniformBufferOffsets[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(0)] = INVALID_OFFSET; + m_state.m_encoderState.m_buffers[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(0)] = {nullptr}; renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } @@ -1041,10 +981,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } // Bind - if (true) - { - SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), buffer, offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); - } + SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), buffer, offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); } } @@ -1076,9 +1013,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 indexBuffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(indexBufferIndex); if (usesGeometryShader) { - // TODO: don't hardcode the index if 
(indexBuffer) - renderCommandEncoder->setObjectBuffer(indexBuffer, indexBufferOffset, 20); + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexBuffer, indexBufferOffset, vertexShader->resourceMapping.indexBufferBinding); uint32 verticesPerPrimitive = 0; switch (primitiveMode) @@ -1154,6 +1090,83 @@ void MetalRenderer::indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offse buffer->didModifyRange(NS::Range(offset, size)); } +void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) +{ + auto& boundBuffer = m_state.m_encoderState.m_buffers[shaderType][index]; + if (buffer == boundBuffer.m_buffer && offset == boundBuffer.m_offset) + return; + + // TODO: only set the offset if only offset changed + + boundBuffer = {buffer, offset}; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentBuffer(buffer, offset, index); + break; + } +} + +void MetalRenderer::SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index) +{ + auto& boundTexture = m_state.m_encoderState.m_textures[shaderType][index]; + if (texture == boundTexture) + return; + + boundTexture = texture; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexTexture(texture, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectTexture(texture, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshTexture(texture, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + 
renderCommandEncoder->setFragmentTexture(texture, index); + break; + } +} + +void MetalRenderer::SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index) +{ + auto& boundSamplerState = m_state.m_encoderState.m_samplers[shaderType][index]; + if (samplerState == boundSamplerState) + return; + + boundSamplerState = samplerState; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentSamplerState(samplerState, index); + break; + } +} + MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() { bool needsNewCommandBuffer = (m_commandBuffers.empty() || m_commandBuffers.back().m_commited); @@ -1447,8 +1460,8 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE UNREACHABLE; } - // TODO: uncomment - uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i;//shader->resourceMapping.textureUnitToBindingPoint[hostTextureUnit]; + // TODO: correct? 
+ uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i; if (binding >= MAX_MTL_TEXTURES) { debug_printf("invalid texture binding %u\n", binding); @@ -1491,23 +1504,11 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { sampler = m_nearestSampler; } - - auto& boundSampler = m_state.m_encoderState.m_samplers[mtlShaderType][binding]; - if (sampler != boundSampler) - { - boundSampler = sampler; - - SetSamplerState(renderCommandEncoder, mtlShaderType, sampler, binding); - } + SetSamplerState(renderCommandEncoder, mtlShaderType, sampler, binding); // get texture register word 0 uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; auto& boundTexture = m_state.m_encoderState.m_textures[mtlShaderType][binding]; - if (textureView == boundTexture.m_textureView && word4 == boundTexture.m_word4) - continue; - - boundTexture = {textureView, word4}; - MTL::Texture* mtlTexture = textureView->GetSwizzledView(word4); SetTexture(renderCommandEncoder, mtlShaderType, mtlTexture, binding); } @@ -1601,7 +1602,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE if (!HasUnifiedMemory()) buffer->didModifyRange(NS::Range(supportBuffer.offset, size)); - SetBuffer(renderCommandEncoder, mtlShaderType, buffer, supportBuffer.offset, MTL_SUPPORT_BUFFER_BINDING); + SetBuffer(renderCommandEncoder, mtlShaderType, buffer, supportBuffer.offset, shader->resourceMapping.uniformVarsBufferBindingPoint); } // Uniform buffers @@ -1620,13 +1621,6 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE if (offset == INVALID_OFFSET) continue; - auto& boundOffset = m_state.m_encoderState.m_uniformBufferOffsets[mtlShaderType][binding]; - if (offset == boundOffset) - continue; - - boundOffset = offset; - - // TODO: only set the offset if already bound SetBuffer(renderCommandEncoder, mtlShaderType, m_memoryManager->GetBufferCache(), offset, binding); } } @@ -1635,7 +1629,6 @@ void 
MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE if (shader->resourceMapping.tfStorageBindingPoint >= 0) { SetBuffer(renderCommandEncoder, mtlShaderType, m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); - m_state.m_encoderState.m_uniformBufferOffsets[mtlShaderType][shader->resourceMapping.tfStorageBindingPoint] = INVALID_OFFSET; } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 896ef43ac..f8e12bd6f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -8,6 +8,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Metal/MTLResource.hpp" +#include "Metal/MTLSampler.hpp" struct MetalBufferAllocation { @@ -103,11 +104,11 @@ struct MetalEncoderState uint32 m_depthSlope = 0; uint32 m_depthClamp = 0; struct { - class LatteTextureViewMtl* m_textureView = nullptr; - uint32 m_word4 = INVALID_UINT32; - } m_textures[METAL_SHADER_TYPE_TOTAL][MAX_MTL_TEXTURES]; + MTL::Buffer* m_buffer; + size_t m_offset; + } m_buffers[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; + MTL::Texture* m_textures[METAL_SHADER_TYPE_TOTAL][MAX_MTL_TEXTURES]; MTL::SamplerState* m_samplers[METAL_SHADER_TYPE_TOTAL][MAX_MTL_SAMPLERS]; - size_t m_uniformBufferOffsets[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; }; struct MetalStreamoutState @@ -360,12 +361,12 @@ class MetalRenderer : public Renderer for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) { + for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) + m_state.m_encoderState.m_buffers[i][j] = {nullptr}; for (uint32 j = 0; j < MAX_MTL_TEXTURES; j++) - m_state.m_encoderState.m_textures[i][j] = {nullptr}; + m_state.m_encoderState.m_textures[i][j] = nullptr; for (uint32 j = 0; j < MAX_MTL_SAMPLERS; j++) m_state.m_encoderState.m_samplers[i][j] = nullptr; - for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) - m_state.m_encoderState.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; } } 
@@ -374,6 +375,10 @@ class MetalRenderer : public Renderer return m_state.m_encoderState; } + void SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index); + void SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index); + void SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index); + MTL::CommandBuffer* GetCommandBuffer(); bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer); void WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index dc2846ef5..4c968d1cb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -46,7 +46,9 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; std::string vertexBuffers = "#define VERTEX_BUFFERS "; std::string inputFetchDefinition = "VertexIn fetchInput(thread uint& vid VERTEX_BUFFER_DEFINITIONS) {\n"; - if (hostIndexType != Renderer::INDEX_TYPE::NONE) + + // Index buffer + if (hostIndexType != Renderer::INDEX_TYPE::NONE) { vertexBufferDefinitions += ", device "; switch (hostIndexType) @@ -61,11 +63,12 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c cemu_assert_suspicious(); break; } - // TODO: don't hardcode the index - vertexBufferDefinitions += "* indexBuffer [[buffer(20)]]"; + + vertexBufferDefinitions += fmt::format("* indexBuffer [[buffer({})]]", vertexShader->resourceMapping.indexBufferBinding); vertexBuffers += ", indexBuffer"; inputFetchDefinition += "vid = indexBuffer[vid];\n"; } + inputFetchDefinition += "VertexIn in;\n"; for 
(auto& bufferGroup : fetchShader->bufferGroups) { @@ -138,10 +141,10 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c } // Fetch the attribute - inputFetchDefinition += "in.ATTRIBUTE_NAME" + std::to_string(semanticId) + " = "; - inputFetchDefinition += "uint4(*(device " + formatName + "*)"; - inputFetchDefinition += "(vertexBuffer" + std::to_string(attr.attributeBufferIndex); - inputFetchDefinition += " + vid * " + std::to_string(bufferStride) + " + " + std::to_string(attr.offset) + ")"; + inputFetchDefinition += fmt::format("in.ATTRIBUTE_NAME{} = ", semanticId); + inputFetchDefinition += fmt::format("uint4(*(device {}*)", formatName); + inputFetchDefinition += fmt::format("(vertexBuffer{}", attr.attributeBufferIndex); + inputFetchDefinition += fmt::format(" + vid * {} + {})", bufferStride, attr.offset); for (uint8 i = 0; i < (4 - componentCount); i++) inputFetchDefinition += ", 0"; inputFetchDefinition += ");\n"; @@ -157,9 +160,10 @@ void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, c } } - vertexBufferDefinitions += ", device uchar* vertexBuffer" + std::to_string(bufferIndex) + " [[buffer(" + std::to_string(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)) + ")]]"; - vertexBuffers += ", vertexBuffer" + std::to_string(bufferIndex); + vertexBufferDefinitions += fmt::format(", device uchar* vertexBuffer{} [[buffer({})]]", bufferIndex, GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + vertexBuffers += fmt::format(", vertexBuffer{}", bufferIndex); } + inputFetchDefinition += "return in;\n"; inputFetchDefinition += "}\n"; From c05b2d0b4831d5e7f84caaa7856c5077ca2c985a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 22 Aug 2024 15:02:49 +0200 Subject: [PATCH 105/368] don't stringify shader source & move vertex buffer bindings by 1 --- src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h | 6 +++--- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 11 +---------- .../HW/Latte/Renderer/Metal/UtilityShaderSource.h | 11 
+++++++++-- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 849567863..44d4d873b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -21,13 +21,13 @@ struct MetalPixelFormatSupport }; #define MAX_MTL_BUFFERS 31 -// Buffer index 30 is reserved for the support buffer, buffer indices 27-29 are reserved for the helper shaders -#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 5) +// Buffer indices 28-30 are reserved for the helper shaders +#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 4) #define MAX_MTL_TEXTURES 31 #define MAX_MTL_SAMPLERS 16 -#define GET_HELPER_BUFFER_BINDING(index) (27 + index) +#define GET_HELPER_BUFFER_BINDING(index) (28 + index) #define GET_HELPER_TEXTURE_BINDING(index) (29 + index) #define GET_HELPER_SAMPLER_BINDING(index) (14 + index) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 1996ff46e..7604406fa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -17,11 +17,8 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" -#include "Common/precompiled.h" #include "HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" -#include "Metal/MTLDevice.hpp" -#include "Metal/MTLRenderCommandEncoder.hpp" #include "gui/guiWrapper.h" #define COMMIT_TRESHOLD 256 @@ -98,15 +95,9 @@ MetalRenderer::MetalRenderer() // Utility shader library - // Process the source first - std::string processedUtilityShaderSource = utilityShaderSource; - processedUtilityShaderSource.pop_back(); - processedUtilityShaderSource.erase(processedUtilityShaderSource.begin()); - processedUtilityShaderSource = "#include 
\nusing namespace metal;\n#define GET_BUFFER_BINDING(index) (27 + index)\n#define GET_TEXTURE_BINDING(index) (29 + index)\n#define GET_SAMPLER_BINDING(index) (14 + index)\n" + processedUtilityShaderSource; - // Create the library NS::Error* error = nullptr; - MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(processedUtilityShaderSource.c_str()), nullptr, &error); + MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(utilityShaderSource), nullptr, &error); if (error) { debug_printf("failed to create utility library (error: %s)\n", error->localizedDescription()->utf8String()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index c298150e9..7f8f3dc7f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -3,7 +3,14 @@ #define __STRINGIFY(x) #x #define _STRINGIFY(x) __STRINGIFY(x) -constexpr const char* utilityShaderSource = _STRINGIFY(( +constexpr const char* utilityShaderSource = R"V0G0N( +#include +using namespace metal; + +#define GET_BUFFER_BINDING(index) (28 + index) +#define GET_TEXTURE_BINDING(index) (29 + index) +#define GET_SAMPLER_BINDING(index) (14 + index)\n + constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; struct VertexOut { @@ -48,4 +55,4 @@ vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[b dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; } } -)); +)V0G0N"; From 8e87b9676a4294c81b132ca43145d43eb217dfc4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 22 Aug 2024 18:20:26 +0200 Subject: [PATCH 106/368] fix: missing lod parameter --- .../LatteDecompilerEmitMSL.cpp | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp 
index dcca05604..8aa2ebeea 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2429,27 +2429,29 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->addFmt("+ float2(1.0)/float2(tex{}.get_width(), tex{}.get_height())/512.0", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); } // lod or lod bias parameter - if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + // 1D textures don't support lod + if (texDim != Latte::E_DIM::DIM_1D && texDim != Latte::E_DIM::DIM_1D_ARRAY) { - src->add(","); - if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) - { - src->addFmt("bias({})", _FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); - } - else - { - // TODO: is this correct? - src->add("level("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(")"); - } + if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + { + src->add(", "); + if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) + { + src->addFmt("bias({})", _FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + } + else + { + // TODO: is this correct? + src->add("level("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + } + else if( texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) + { + src->add(", level(0.0)"); + } } - // TODO: uncomment? - //else if( texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) - //{ - // // TODO: correct? 
- // src->add(", level(0.0)"); - //} } // gradient parameters if (texOpcode == GPU7_TEX_INST_SAMPLE_G) From 28aef858f2adde5f32da31e56c0b6c240f146ba6 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 23 Aug 2024 09:00:00 +0200 Subject: [PATCH 107/368] fix: texture readback --- .../Renderer/Metal/LatteTextureReadbackMtl.cpp | 17 ++++++++++++----- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 15 ++++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 15 ++++++++++++++- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index ef1576642..491017c04 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -25,15 +25,22 @@ void LatteTextureReadbackInfoMtl::StartTransfer() blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage); m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); - // TODO: uncomment - //m_mtlr->RequestSoonCommit(); + m_mtlr->RequestSoonCommit(); + m_mtlr->RequestCommitOnIdle(); } bool LatteTextureReadbackInfoMtl::IsFinished() { - // HACK: just return true for now, otherwise the game would freeze - //return m_mtlr->CommandBufferCompleted(m_commandBuffer); - return true; + // TODO: is this needed? + if (!m_commandBuffer) + return false; + + // TODO: remove this? 
+ // Command buffer wasn't even comitted, let's commit immediately + if (m_mtlr->GetCurrentCommandBuffer() == m_commandBuffer) + m_mtlr->CommitCommandBuffer(); + + return m_mtlr->CommandBufferCompleted(m_commandBuffer); } void LatteTextureReadbackInfoMtl::ForceFinish() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 7604406fa..a348409d9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -21,7 +21,7 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "gui/guiWrapper.h" -#define COMMIT_TRESHOLD 256 +#define DEFAULT_COMMIT_TRESHOLD 256 extern bool hasValidFramebufferAttached; @@ -297,8 +297,7 @@ bool MetalRenderer::BeginFrame(bool mainWindow) void MetalRenderer::Flush(bool waitIdle) { - // TODO: commit if commit on idle is requested - if (m_recordedDrawcalls > 0) + if (m_commitOnIdle || m_recordedDrawcalls > 0) CommitCommandBuffer(); if (waitIdle) { @@ -309,8 +308,8 @@ void MetalRenderer::Flush(bool waitIdle) void MetalRenderer::NotifyLatteCommandProcessorIdle() { - // TODO: commit if commit on idle is requested - //CommitCommandBuffer(); + if (m_commitOnIdle) + CommitCommandBuffer(); } void MetalRenderer::AppendOverlayDebugInfo() @@ -1056,7 +1055,7 @@ void MetalRenderer::draw_endSequence() bool hasReadback = LatteTextureReadback_Update(); m_recordedDrawcalls++; // The number of draw calls needs to twice as big, since we are interrupting the render pass - if (m_recordedDrawcalls >= COMMIT_TRESHOLD * 2 || hasReadback) + if (m_recordedDrawcalls >= m_commitTreshold * 2 || hasReadback) { CommitCommandBuffer(); @@ -1321,14 +1320,16 @@ void MetalRenderer::EndEncoding() m_encoderType = MetalEncoderType::None; // Commit the command buffer if enough draw calls have been recorded - if (m_recordedDrawcalls >= COMMIT_TRESHOLD) + if (m_recordedDrawcalls >= m_commitTreshold) CommitCommandBuffer(); } } void 
MetalRenderer::CommitCommandBuffer() { + m_commitTreshold = DEFAULT_COMMIT_TRESHOLD; m_recordedDrawcalls = 0; + m_commitOnIdle = false; if (m_commandBuffers.size() != 0) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index f8e12bd6f..108901f3d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -375,6 +375,16 @@ class MetalRenderer : public Renderer return m_state.m_encoderState; } + void RequestSoonCommit() + { + m_commitTreshold = m_recordedDrawcalls + 8; + } + + void RequestCommitOnIdle() + { + m_commitOnIdle = true; + } + void SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index); void SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index); void SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index); @@ -471,11 +481,14 @@ class MetalRenderer : public Renderer // Active objects std::vector m_commandBuffers; - uint32 m_recordedDrawcalls = 0; MetalEncoderType m_encoderType = MetalEncoderType::None; MTL::CommandEncoder* m_commandEncoder = nullptr; CA::MetalDrawable* m_drawable = nullptr; + uint32 m_commitTreshold = 0; + uint32 m_recordedDrawcalls = 0; + bool m_commitOnIdle = false; + // State MetalState m_state; }; From 6bb191212b28d38e5d84cb422bd9599b0993697c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 23 Aug 2024 09:41:00 +0200 Subject: [PATCH 108/368] simplify texture readback --- .../Renderer/Metal/LatteTextureReadbackMtl.cpp | 2 -- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 14 ++++++-------- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 12 ------------ 3 files changed, 6 insertions(+), 22 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index 491017c04..f2c03709a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -25,8 +25,6 @@ void LatteTextureReadbackInfoMtl::StartTransfer() blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage); m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); - m_mtlr->RequestSoonCommit(); - m_mtlr->RequestCommitOnIdle(); } bool LatteTextureReadbackInfoMtl::IsFinished() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a348409d9..0031b4e28 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -21,7 +21,7 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "gui/guiWrapper.h" -#define DEFAULT_COMMIT_TRESHOLD 256 +#define COMMIT_TRESHOLD 256 extern bool hasValidFramebufferAttached; @@ -297,7 +297,7 @@ bool MetalRenderer::BeginFrame(bool mainWindow) void MetalRenderer::Flush(bool waitIdle) { - if (m_commitOnIdle || m_recordedDrawcalls > 0) + if (m_recordedDrawcalls > 0) CommitCommandBuffer(); if (waitIdle) { @@ -308,8 +308,8 @@ void MetalRenderer::Flush(bool waitIdle) void MetalRenderer::NotifyLatteCommandProcessorIdle() { - if (m_commitOnIdle) - CommitCommandBuffer(); + //if (m_commitOnIdle) + // CommitCommandBuffer(); } void MetalRenderer::AppendOverlayDebugInfo() @@ -1055,7 +1055,7 @@ void MetalRenderer::draw_endSequence() bool hasReadback = LatteTextureReadback_Update(); m_recordedDrawcalls++; // The number of draw calls needs to twice as big, since we are interrupting the render pass - if (m_recordedDrawcalls >= 
m_commitTreshold * 2 || hasReadback) + if (m_recordedDrawcalls >= COMMIT_TRESHOLD * 2 || hasReadback) { CommitCommandBuffer(); @@ -1320,16 +1320,14 @@ void MetalRenderer::EndEncoding() m_encoderType = MetalEncoderType::None; // Commit the command buffer if enough draw calls have been recorded - if (m_recordedDrawcalls >= m_commitTreshold) + if (m_recordedDrawcalls >= COMMIT_TRESHOLD) CommitCommandBuffer(); } } void MetalRenderer::CommitCommandBuffer() { - m_commitTreshold = DEFAULT_COMMIT_TRESHOLD; m_recordedDrawcalls = 0; - m_commitOnIdle = false; if (m_commandBuffers.size() != 0) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 108901f3d..4da66653a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -375,16 +375,6 @@ class MetalRenderer : public Renderer return m_state.m_encoderState; } - void RequestSoonCommit() - { - m_commitTreshold = m_recordedDrawcalls + 8; - } - - void RequestCommitOnIdle() - { - m_commitOnIdle = true; - } - void SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index); void SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index); void SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index); @@ -485,9 +475,7 @@ class MetalRenderer : public Renderer MTL::CommandEncoder* m_commandEncoder = nullptr; CA::MetalDrawable* m_drawable = nullptr; - uint32 m_commitTreshold = 0; uint32 m_recordedDrawcalls = 0; - bool m_commitOnIdle = false; // State MetalState m_state; From d4a10744253470649eba7e5d5d66e99cab24efd6 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 23 Aug 2024 10:52:20 +0200 Subject: [PATCH 109/368] rework the present system --- src/Cafe/CMakeLists.txt | 2 + 
.../Latte/Renderer/Metal/MetalLayerHandle.cpp | 45 +++++++++++++ .../Latte/Renderer/Metal/MetalLayerHandle.h | 33 +++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 67 +++++++++---------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 20 +++--- 5 files changed, 120 insertions(+), 47 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 37bef0e85..100f00d83 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -542,6 +542,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalCppImpl.cpp HW/Latte/Renderer/Metal/MetalLayer.mm HW/Latte/Renderer/Metal/MetalLayer.h + HW/Latte/Renderer/Metal/MetalLayerHandle.cpp + HW/Latte/Renderer/Metal/MetalLayerHandle.h HW/Latte/Renderer/Metal/LatteToMtl.cpp HW/Latte/Renderer/Metal/LatteToMtl.h HW/Latte/Renderer/Metal/LatteTextureMtl.cpp diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp new file mode 100644 index 000000000..f4d4490e1 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp @@ -0,0 +1,45 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" + +#include "gui/guiWrapper.h" + +MetalLayerHandle::MetalLayerHandle(MTL::Device* device, const Vector2i& size) +{ + const auto& windowInfo = gui_getWindowInfo().window_main; + + m_layer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle, m_layerScaleX, m_layerScaleY); + m_layer->setDevice(device); + m_layer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); +} + +MetalLayerHandle::~MetalLayerHandle() +{ + if (m_layer) + m_layer->release(); +} + +void MetalLayerHandle::Resize(const Vector2i& size) +{ + m_layer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); +} + +bool 
MetalLayerHandle::AcquireDrawable() +{ + if (m_drawable) + return true; + + m_drawable = m_layer->nextDrawable(); + if (!m_drawable) + { + debug_printf("failed to acquire next drawable\n"); + return false; + } + + return true; +} + +void MetalLayerHandle::PresentDrawable(MTL::CommandBuffer* commandBuffer) +{ + commandBuffer->presentDrawable(m_drawable); + m_drawable = nullptr; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h new file mode 100644 index 000000000..f95bd8005 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h @@ -0,0 +1,33 @@ +#pragma once + +#include + +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "QuartzCore/CAMetalDrawable.hpp" +#include "QuartzCore/CAMetalLayer.hpp" +#include "util/math/vector2.h" + +class MetalLayerHandle +{ +public: + MetalLayerHandle() = default; + MetalLayerHandle(MTL::Device* device, const Vector2i& size); + + ~MetalLayerHandle(); + + void Resize(const Vector2i& size); + + bool AcquireDrawable(); + + void PresentDrawable(MTL::CommandBuffer* commandBuffer); + + CA::MetalLayer* GetLayer() const { return m_layer; } + + CA::MetalDrawable* GetDrawable() const { return m_drawable; } + +private: + CA::MetalLayer* m_layer = nullptr; + float m_layerScaleX, m_layerScaleY; + + CA::MetalDrawable* m_drawable = nullptr; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 0031b4e28..42ecbacf6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1,5 +1,4 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" @@ -19,6 +18,7 @@ #include 
"Cemu/Logging/CemuDebugLogging.h" #include "HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" +#include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "gui/guiWrapper.h" #define COMMIT_TRESHOLD 256 @@ -174,20 +174,14 @@ MetalRenderer::~MetalRenderer() m_device->release(); } -// TODO: don't ignore "mainWindow" argument void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { - const auto& windowInfo = gui_getWindowInfo().window_main; - - m_metalLayer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle, m_layerScaleX, m_layerScaleY); - m_metalLayer->setDevice(m_device); - m_metalLayer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); + GetLayer(mainWindow) = MetalLayerHandle(m_device, size); } -// TODO: don't ignore "mainWindow" argument void MetalRenderer::ResizeLayer(const Vector2i& size, bool mainWindow) { - m_metalLayer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); + GetLayer(mainWindow).Resize(size); } void MetalRenderer::Initialize() @@ -222,7 +216,7 @@ void MetalRenderer::ClearColorbuffer(bool padView) if (!AcquireNextDrawable(!padView)) return; - ClearColorTextureInternal(m_drawable->texture(), 0, 0, 0.0f, 0.0f, 0.0f, 0.0f); + ClearColorTextureInternal(GetLayer(!padView).GetDrawable()->texture(), 0, 0, 0.0f, 0.0f, 0.0f, 0.0f); } void MetalRenderer::DrawEmptyFrame(bool mainWindow) @@ -234,17 +228,10 @@ void MetalRenderer::DrawEmptyFrame(bool mainWindow) void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) { - - if (m_drawable) - { - auto commandBuffer = GetCommandBuffer(); - commandBuffer->presentDrawable(m_drawable); - } - else - { - debug_printf("skipped present!\n"); - } - m_drawable = nullptr; + if (swapTV) + SwapBuffer(true); + if (swapDRC) + SwapBuffer(false); // Release all the command buffers CommitCommandBuffer(); @@ -269,7 +256,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, 
RendererOutput // Create render pass MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); - colorAttachment->setTexture(m_drawable->texture()); + colorAttachment->setTexture(GetLayer(!padView).GetDrawable()->texture()); // TODO: shouldn't it be LoadActionLoad when not clearing? colorAttachment->setLoadAction(clearBackground ? MTL::LoadActionClear : MTL::LoadActionDontCare); colorAttachment->setStoreAction(MTL::StoreActionStore); @@ -1351,27 +1338,16 @@ void MetalRenderer::CommitCommandBuffer() bool MetalRenderer::AcquireNextDrawable(bool mainWindow) { + auto& layer = GetLayer(mainWindow); + const bool latteBufferUsesSRGB = mainWindow ? LatteGPUState.tvBufferUsesSRGB : LatteGPUState.drcBufferUsesSRGB; if (latteBufferUsesSRGB != m_state.m_usesSRGB) { - m_metalLayer->setPixelFormat(latteBufferUsesSRGB ? MTL::PixelFormatRGBA8Unorm_sRGB : MTL::PixelFormatRGBA8Unorm); + layer.GetLayer()->setPixelFormat(latteBufferUsesSRGB ? MTL::PixelFormatRGBA8Unorm_sRGB : MTL::PixelFormatRGBA8Unorm); m_state.m_usesSRGB = latteBufferUsesSRGB; } - if (m_drawable) - { - // TODO: should this be true? 
- return true; - } - - m_drawable = m_metalLayer->nextDrawable(); - if (!m_drawable) - { - debug_printf("failed to acquire next drawable\n"); - return false; - } - - return true; + return layer.AcquireDrawable(); } bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader) @@ -1639,3 +1615,20 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s renderPassDescriptor->release(); EndEncoding(); } + + + +void MetalRenderer::SwapBuffer(bool mainWindow) +{ + auto& layer = GetLayer(mainWindow); + + if (layer.GetDrawable()) + { + auto commandBuffer = GetCommandBuffer(); + layer.PresentDrawable(commandBuffer); + } + else + { + debug_printf("skipped present!\n"); + } +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 4da66653a..7607de1da 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -1,14 +1,8 @@ #pragma once -#include -#include -#include - #include "Cafe/HW/Latte/Renderer/Renderer.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" -#include "Metal/MTLResource.hpp" -#include "Metal/MTLSampler.hpp" +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" struct MetalBufferAllocation { @@ -428,8 +422,8 @@ class MetalRenderer : public Renderer } private: - CA::MetalLayer* m_metalLayer; - float m_layerScaleX, m_layerScaleY; + MetalLayerHandle m_mainLayer; + MetalLayerHandle m_padLayer; // Metal objects MTL::Device* m_device; @@ -473,10 +467,16 @@ class MetalRenderer : public Renderer std::vector m_commandBuffers; MetalEncoderType m_encoderType = MetalEncoderType::None; MTL::CommandEncoder* m_commandEncoder = nullptr; - CA::MetalDrawable* m_drawable = nullptr; uint32 m_recordedDrawcalls = 0; // State MetalState m_state; + + MetalLayerHandle& GetLayer(bool mainWindow) + { + return (mainWindow ? 
m_mainLayer : m_padLayer); + } + + void SwapBuffer(bool mainWindow); }; From 7499c3fbe74c5b6c04856b53c004d95984b97631 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 24 Aug 2024 17:03:49 +0200 Subject: [PATCH 110/368] add: ImGui support --- .../Latte/Renderer/Metal/MetalLayerHandle.cpp | 20 + .../Latte/Renderer/Metal/MetalLayerHandle.h | 5 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 139 ++++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 32 +- src/imgui/CMakeLists.txt | 9 +- src/imgui/imgui_impl_metal.h | 64 ++ src/imgui/imgui_impl_metal.mm | 579 ++++++++++++++++++ 7 files changed, 805 insertions(+), 43 deletions(-) create mode 100644 src/imgui/imgui_impl_metal.h create mode 100644 src/imgui/imgui_impl_metal.mm diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp index f4d4490e1..9349dc895 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp @@ -16,6 +16,8 @@ MetalLayerHandle::~MetalLayerHandle() { if (m_layer) m_layer->release(); + if (m_renderPassDescriptor) + m_renderPassDescriptor->release(); } void MetalLayerHandle::Resize(const Vector2i& size) @@ -35,9 +37,27 @@ bool MetalLayerHandle::AcquireDrawable() return false; } + if (m_renderPassDescriptor) + { + m_renderPassDescriptor->release(); + m_renderPassDescriptor = nullptr; + } + return true; } +void MetalLayerHandle::CreateRenderPassDescriptor(bool clear) +{ + if (m_renderPassDescriptor) + m_renderPassDescriptor->release(); + + m_renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = m_renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(m_drawable->texture()); + colorAttachment->setLoadAction(clear ? 
MTL::LoadActionClear : MTL::LoadActionLoad); + colorAttachment->setStoreAction(MTL::StoreActionStore); +} + void MetalLayerHandle::PresentDrawable(MTL::CommandBuffer* commandBuffer) { commandBuffer->presentDrawable(m_drawable); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h index f95bd8005..b4aff7dfb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h @@ -19,15 +19,20 @@ class MetalLayerHandle bool AcquireDrawable(); + void CreateRenderPassDescriptor(bool clear); + void PresentDrawable(MTL::CommandBuffer* commandBuffer); CA::MetalLayer* GetLayer() const { return m_layer; } CA::MetalDrawable* GetDrawable() const { return m_drawable; } + MTL::RenderPassDescriptor* GetRenderPassDescriptor() const { return m_renderPassDescriptor; } + private: CA::MetalLayer* m_layer = nullptr; float m_layerScaleX, m_layerScaleY; CA::MetalDrawable* m_drawable = nullptr; + MTL::RenderPassDescriptor* m_renderPassDescriptor = nullptr; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 42ecbacf6..f4a35eb54 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -19,7 +19,12 @@ #include "HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" -#include "gui/guiWrapper.h" +#include "HW/Latte/Renderer/Renderer.h" +#include "imgui.h" + +#define IMGUI_IMPL_METAL_CPP +#include "imgui/imgui_extension.h" +#include "imgui/imgui_impl_metal.h" #define COMMIT_TRESHOLD 256 @@ -191,6 +196,8 @@ void MetalRenderer::Initialize() void MetalRenderer::Shutdown() { + // TODO: should shutdown both layers + ImGui_ImplMetal_Shutdown(); Renderer::Shutdown(); CommitCommandBuffer(); } @@ -205,7 +212,7 @@ bool MetalRenderer::IsPadWindowActive() bool 
MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { - usageInMB = m_device->currentAllocatedSize(); + usageInMB = m_device->currentAllocatedSize() / 1024 / 1024; totalInMB = usageInMB; return true; @@ -213,7 +220,7 @@ bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const void MetalRenderer::ClearColorbuffer(bool padView) { - if (!AcquireNextDrawable(!padView)) + if (!AcquireDrawable(!padView)) return; ClearColorTextureInternal(GetLayer(!padView).GetDrawable()->texture(), 0, 0, 0.0f, 0.0f, 0.0f, 0.0f); @@ -248,21 +255,16 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) { - if (!AcquireNextDrawable(!padView)) + if (!AcquireDrawable(!padView)) return; MTL::Texture* presentTexture = static_cast(texView)->GetRGBAView(); // Create render pass - MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); - auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); - colorAttachment->setTexture(GetLayer(!padView).GetDrawable()->texture()); - // TODO: shouldn't it be LoadActionLoad when not clearing? - colorAttachment->setLoadAction(clearBackground ? MTL::LoadActionClear : MTL::LoadActionDontCare); - colorAttachment->setStoreAction(MTL::StoreActionStore); + auto& layer = GetLayer(!padView); + layer.CreateRenderPassDescriptor(clearBackground); - auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor); - renderPassDescriptor->release(); + auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(layer.GetRenderPassDescriptor()); // Draw to Metal layer renderCommandEncoder->setRenderPipelineState(m_state.m_usesSRGB ? 
m_presentPipelineSRGB : m_presentPipelineLinear); @@ -279,14 +281,14 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput bool MetalRenderer::BeginFrame(bool mainWindow) { - return AcquireNextDrawable(mainWindow); + return AcquireDrawable(mainWindow); } void MetalRenderer::Flush(bool waitIdle) { if (m_recordedDrawcalls > 0) CommitCommandBuffer(); - if (waitIdle) + if (waitIdle && m_commandBuffers.size() != 0) { // TODO: shouldn't we wait for all command buffers? WaitForCommandBufferCompletion(GetCurrentCommandBuffer()); @@ -299,9 +301,101 @@ void MetalRenderer::NotifyLatteCommandProcessorIdle() // CommitCommandBuffer(); } +bool MetalRenderer::ImguiBegin(bool mainWindow) +{ + EnsureImGuiBackend(); + + if (!Renderer::ImguiBegin(mainWindow)) + return false; + + if (!AcquireDrawable(mainWindow)) + return false; + + auto& layer = GetLayer(mainWindow); + if (!layer.GetRenderPassDescriptor()) + layer.CreateRenderPassDescriptor(true); // TODO: should we clear? + + ImGui_ImplMetal_CreateFontsTexture(m_device); + ImGui_ImplMetal_NewFrame(layer.GetRenderPassDescriptor()); + ImGui_UpdateWindowInformation(mainWindow); + ImGui::NewFrame(); + + if (m_encoderType != MetalEncoderType::Render) + GetTemporaryRenderCommandEncoder(layer.GetRenderPassDescriptor()); + + return true; +} + +void MetalRenderer::ImguiEnd() +{ + EnsureImGuiBackend(); + + if (m_encoderType != MetalEncoderType::Render) + { + debug_printf("no render command encoder, cannot draw ImGui\n"); + return; + } + + ImGui::Render(); + ImGui_ImplMetal_RenderDrawData(ImGui::GetDrawData(), GetCurrentCommandBuffer(), (MTL::RenderCommandEncoder*)m_commandEncoder); + //ImGui::EndFrame(); + + EndEncoding(); +} + +ImTextureID MetalRenderer::GenerateTexture(const std::vector& data, const Vector2i& size) +{ + try + { + std::vector tmp(size.x * size.y * 4); + for (size_t i = 0; i < data.size() / 3; ++i) + { + tmp[(i * 4) + 0] = data[(i * 3) + 0]; + tmp[(i * 4) + 1] = data[(i * 3) + 1]; + tmp[(i * 4) + 
2] = data[(i * 3) + 2]; + tmp[(i * 4) + 3] = 0xFF; + } + + MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); + desc->setTextureType(MTL::TextureType2D); + desc->setPixelFormat(MTL::PixelFormatRGBA8Unorm); + desc->setWidth(size.x); + desc->setHeight(size.y); + desc->setStorageMode(MTL::StorageModeShared); + desc->setUsage(MTL::TextureUsageShaderRead); + + MTL::Texture* texture = m_device->newTexture(desc); + desc->release(); + + // TODO: do a GPU copy? + texture->replaceRegion(MTL::Region(0, 0, size.x, size.y), 0, 0, tmp.data(), size.x * 4, 0); + + return (ImTextureID)texture; + } + catch (const std::exception& ex) + { + cemuLog_log(LogType::Force, "can't generate imgui texture: {}", ex.what()); + return nullptr; + } +} + +void MetalRenderer::DeleteTexture(ImTextureID id) +{ + EnsureImGuiBackend(); + + ((MTL::Texture*)id)->release(); +} + +void MetalRenderer::DeleteFontTextures() +{ + EnsureImGuiBackend(); + + ImGui_ImplMetal_DestroyFontsTexture(); +} + void MetalRenderer::AppendOverlayDebugInfo() { - debug_printf("MetalRenderer::AppendOverlayDebugInfo not implemented\n"); + // TODO: implement } // TODO: halfZ @@ -1336,7 +1430,7 @@ void MetalRenderer::CommitCommandBuffer() } } -bool MetalRenderer::AcquireNextDrawable(bool mainWindow) +bool MetalRenderer::AcquireDrawable(bool mainWindow) { auto& layer = GetLayer(mainWindow); @@ -1616,11 +1710,11 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s EndEncoding(); } - - void MetalRenderer::SwapBuffer(bool mainWindow) { auto& layer = GetLayer(mainWindow); + if (!layer.AcquireDrawable()) + return; if (layer.GetDrawable()) { @@ -1632,3 +1726,12 @@ void MetalRenderer::SwapBuffer(bool mainWindow) debug_printf("skipped present!\n"); } } + +void MetalRenderer::EnsureImGuiBackend() +{ + if (!ImGui::GetIO().BackendRendererUserData) + { + ImGui_ImplMetal_Init(m_device); + //ImGui_ImplMetal_CreateFontsTexture(m_device); + } +} diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 7607de1da..d43dcdd38 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -228,29 +228,11 @@ class MetalRenderer : public Renderer void NotifyLatteCommandProcessorIdle() override; // called when command processor has no more commands available or when stalled // imgui - bool ImguiBegin(bool mainWindow) override { - cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); - - return false; - }; - - void ImguiEnd() override { - cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); - }; - - ImTextureID GenerateTexture(const std::vector& data, const Vector2i& size) override { - cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); - - return nullptr; - }; - - void DeleteTexture(ImTextureID id) override { - cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); - }; - - void DeleteFontTextures() override { - cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); - }; + bool ImguiBegin(bool mainWindow) override; + void ImguiEnd() override; + ImTextureID GenerateTexture(const std::vector& data, const Vector2i& size) override; + void DeleteTexture(ImTextureID id) override; + void DeleteFontTextures() override; bool UseTFViaSSBO() const override { return true; } void AppendOverlayDebugInfo() override; @@ -383,7 +365,7 @@ class MetalRenderer : public Renderer void EndEncoding(); void CommitCommandBuffer(); - bool AcquireNextDrawable(bool mainWindow); + bool AcquireDrawable(bool mainWindow); bool CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader); void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader); @@ -479,4 +461,6 @@ class MetalRenderer : public Renderer } void SwapBuffer(bool mainWindow); + + void EnsureImGuiBackend(); 
}; diff --git a/src/imgui/CMakeLists.txt b/src/imgui/CMakeLists.txt index c3fc4a0ea..86aeb130f 100644 --- a/src/imgui/CMakeLists.txt +++ b/src/imgui/CMakeLists.txt @@ -7,7 +7,14 @@ add_library(imguiImpl imgui_extension.h ) -# TODO: add Metal +if (ENABLE_METAL) + target_sources(imguiImpl PRIVATE + imgui_impl_metal.mm + imgui_impl_metal.h + ) + + target_compile_definitions(imguiImpl PRIVATE IMGUI_IMPL_METAL_CPP) +endif () set_property(TARGET imguiImpl PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/src/imgui/imgui_impl_metal.h b/src/imgui/imgui_impl_metal.h new file mode 100644 index 000000000..3aaacb9e0 --- /dev/null +++ b/src/imgui/imgui_impl_metal.h @@ -0,0 +1,64 @@ +// dear imgui: Renderer Backend for Metal +// This needs to be used along with a Platform Backend (e.g. OSX) + +// Implemented features: +// [X] Renderer: User texture binding. Use 'MTLTexture' as ImTextureID. Read the FAQ about ImTextureID! +// [X] Renderer: Large meshes support (64k+ vertices) with 16-bit indices. + +// You can use unmodified imgui_impl_* files in your project. See examples/ folder for examples of using this. +// Prefer including the entire imgui/ repository into your project (either as a copy or as a submodule), and only build the backends you need. +// If you are new to Dear ImGui, read documentation from the docs/ folder + read the top of imgui.cpp. 
+// Read online: https://github.com/ocornut/imgui/tree/master/docs + +#include "imgui.h" // IMGUI_IMPL_API + +//----------------------------------------------------------------------------- +// ObjC API +//----------------------------------------------------------------------------- + +#ifdef __OBJC__ + +@class MTLRenderPassDescriptor; +@protocol MTLDevice, MTLCommandBuffer, MTLRenderCommandEncoder; + +IMGUI_IMPL_API bool ImGui_ImplMetal_Init(id device); +IMGUI_IMPL_API void ImGui_ImplMetal_Shutdown(); +IMGUI_IMPL_API void ImGui_ImplMetal_NewFrame(MTLRenderPassDescriptor* renderPassDescriptor); +IMGUI_IMPL_API void ImGui_ImplMetal_RenderDrawData(ImDrawData* drawData, + id commandBuffer, + id commandEncoder); + +// Called by Init/NewFrame/Shutdown +IMGUI_IMPL_API bool ImGui_ImplMetal_CreateFontsTexture(id device); +IMGUI_IMPL_API void ImGui_ImplMetal_DestroyFontsTexture(); +IMGUI_IMPL_API bool ImGui_ImplMetal_CreateDeviceObjects(id device); +IMGUI_IMPL_API void ImGui_ImplMetal_DestroyDeviceObjects(); + +#endif + +//----------------------------------------------------------------------------- +// C++ API +//----------------------------------------------------------------------------- + +// Enable Metal C++ binding support with '#define IMGUI_IMPL_METAL_CPP' in your imconfig.h file +// More info about using Metal from C++: https://developer.apple.com/metal/cpp/ + +#ifdef IMGUI_IMPL_METAL_CPP +#include +#ifndef __OBJC__ + +IMGUI_IMPL_API bool ImGui_ImplMetal_Init(MTL::Device* device); +IMGUI_IMPL_API void ImGui_ImplMetal_Shutdown(); +IMGUI_IMPL_API void ImGui_ImplMetal_NewFrame(MTL::RenderPassDescriptor* renderPassDescriptor); +IMGUI_IMPL_API void ImGui_ImplMetal_RenderDrawData(ImDrawData* draw_data, + MTL::CommandBuffer* commandBuffer, + MTL::RenderCommandEncoder* commandEncoder); + +// Called by Init/NewFrame/Shutdown +IMGUI_IMPL_API bool ImGui_ImplMetal_CreateFontsTexture(MTL::Device* device); +IMGUI_IMPL_API void ImGui_ImplMetal_DestroyFontsTexture(); 
+IMGUI_IMPL_API bool ImGui_ImplMetal_CreateDeviceObjects(MTL::Device* device); +IMGUI_IMPL_API void ImGui_ImplMetal_DestroyDeviceObjects(); + +#endif +#endif diff --git a/src/imgui/imgui_impl_metal.mm b/src/imgui/imgui_impl_metal.mm new file mode 100644 index 000000000..8a070a6c2 --- /dev/null +++ b/src/imgui/imgui_impl_metal.mm @@ -0,0 +1,579 @@ +// dear imgui: Renderer Backend for Metal +// This needs to be used along with a Platform Backend (e.g. OSX) + +// Implemented features: +// [X] Renderer: User texture binding. Use 'MTLTexture' as ImTextureID. Read the FAQ about ImTextureID! +// [X] Renderer: Large meshes support (64k+ vertices) with 16-bit indices. + +// You can use unmodified imgui_impl_* files in your project. See examples/ folder for examples of using this. +// Prefer including the entire imgui/ repository into your project (either as a copy or as a submodule), and only build the backends you need. +// If you are new to Dear ImGui, read documentation from the docs/ folder + read the top of imgui.cpp. +// Read online: https://github.com/ocornut/imgui/tree/master/docs + +// CHANGELOG +// (minor and older changes stripped away, please see git history for details) +// 2022-08-23: Metal: Update deprecated property 'sampleCount'->'rasterSampleCount'. +// 2022-07-05: Metal: Add dispatch synchronization. +// 2022-06-30: Metal: Use __bridge for ARC based systems. +// 2022-06-01: Metal: Fixed null dereference on exit inside command buffer completion handler. +// 2022-04-27: Misc: Store backend data in a per-context struct, allowing to use this backend with multiple contexts. +// 2022-01-03: Metal: Ignore ImDrawCmd where ElemCount == 0 (very rare but can technically be manufactured by user code). +// 2021-12-30: Metal: Added Metal C++ support. Enable with '#define IMGUI_IMPL_METAL_CPP' in your imconfig.h file. +// 2021-08-24: Metal: Fixed a crash when clipping rect larger than framebuffer is submitted. 
(#4464) +// 2021-05-19: Metal: Replaced direct access to ImDrawCmd::TextureId with a call to ImDrawCmd::GetTexID(). (will become a requirement) +// 2021-02-18: Metal: Change blending equation to preserve alpha in output buffer. +// 2021-01-25: Metal: Fixed texture storage mode when building on Mac Catalyst. +// 2019-05-29: Metal: Added support for large mesh (64K+ vertices), enable ImGuiBackendFlags_RendererHasVtxOffset flag. +// 2019-04-30: Metal: Added support for special ImDrawCallback_ResetRenderState callback to reset render state. +// 2019-02-11: Metal: Projecting clipping rectangles correctly using draw_data->FramebufferScale to allow multi-viewports for retina display. +// 2018-11-30: Misc: Setting up io.BackendRendererName so it can be displayed in the About Window. +// 2018-07-05: Metal: Added new Metal backend implementation. + +#include "imgui.h" +#include "imgui_impl_metal.h" +#import +#import + +#pragma mark - Support classes + +// A wrapper around a MTLBuffer object that knows the last time it was reused +@interface MetalBuffer : NSObject +@property (nonatomic, strong) id buffer; +@property (nonatomic, assign) double lastReuseTime; +- (instancetype)initWithBuffer:(id)buffer; +@end + +// An object that encapsulates the data necessary to uniquely identify a +// render pipeline state. These are used as cache keys. +@interface FramebufferDescriptor : NSObject +@property (nonatomic, assign) unsigned long sampleCount; +@property (nonatomic, assign) MTLPixelFormat colorPixelFormat; +@property (nonatomic, assign) MTLPixelFormat depthPixelFormat; +@property (nonatomic, assign) MTLPixelFormat stencilPixelFormat; +- (instancetype)initWithRenderPassDescriptor:(MTLRenderPassDescriptor*)renderPassDescriptor; +@end + +// A singleton that stores long-lived objects that are needed by the Metal +// renderer backend. Stores the render pipeline state cache and the default +// font texture, and manages the reusable buffer cache. 
+@interface MetalContext : NSObject +@property (nonatomic, strong) id device; +@property (nonatomic, strong) id depthStencilState; +@property (nonatomic, strong) FramebufferDescriptor* framebufferDescriptor; // framebuffer descriptor for current frame; transient +@property (nonatomic, strong) NSMutableDictionary* renderPipelineStateCache; // pipeline cache; keyed on framebuffer descriptors +@property (nonatomic, strong, nullable) id fontTexture; +@property (nonatomic, strong) NSMutableArray* bufferCache; +@property (nonatomic, assign) double lastBufferCachePurge; +- (MetalBuffer*)dequeueReusableBufferOfLength:(NSUInteger)length device:(id)device; +- (id)renderPipelineStateForFramebufferDescriptor:(FramebufferDescriptor*)descriptor device:(id)device; +@end + +struct ImGui_ImplMetal_Data +{ + MetalContext* SharedMetalContext; + + ImGui_ImplMetal_Data() { memset(this, 0, sizeof(*this)); } +}; + +static ImGui_ImplMetal_Data* ImGui_ImplMetal_CreateBackendData() { return IM_NEW(ImGui_ImplMetal_Data)(); } +static ImGui_ImplMetal_Data* ImGui_ImplMetal_GetBackendData() { return ImGui::GetCurrentContext() ? 
(ImGui_ImplMetal_Data*)ImGui::GetIO().BackendRendererUserData : nullptr; } +static void ImGui_ImplMetal_DestroyBackendData(){ IM_DELETE(ImGui_ImplMetal_GetBackendData()); } + +static inline CFTimeInterval GetMachAbsoluteTimeInSeconds() { return (CFTimeInterval)(double)(clock_gettime_nsec_np(CLOCK_UPTIME_RAW) / 1e9); } + +#ifdef IMGUI_IMPL_METAL_CPP + +#pragma mark - Dear ImGui Metal C++ Backend API + +bool ImGui_ImplMetal_Init(MTL::Device* device) +{ + return ImGui_ImplMetal_Init((__bridge id)(device)); +} + +void ImGui_ImplMetal_NewFrame(MTL::RenderPassDescriptor* renderPassDescriptor) +{ + ImGui_ImplMetal_NewFrame((__bridge MTLRenderPassDescriptor*)(renderPassDescriptor)); +} + +void ImGui_ImplMetal_RenderDrawData(ImDrawData* draw_data, + MTL::CommandBuffer* commandBuffer, + MTL::RenderCommandEncoder* commandEncoder) +{ + ImGui_ImplMetal_RenderDrawData(draw_data, + (__bridge id)(commandBuffer), + (__bridge id)(commandEncoder)); + +} + +bool ImGui_ImplMetal_CreateFontsTexture(MTL::Device* device) +{ + return ImGui_ImplMetal_CreateFontsTexture((__bridge id)(device)); +} + +bool ImGui_ImplMetal_CreateDeviceObjects(MTL::Device* device) +{ + return ImGui_ImplMetal_CreateDeviceObjects((__bridge id)(device)); +} + +#endif // #ifdef IMGUI_IMPL_METAL_CPP + +#pragma mark - Dear ImGui Metal Backend API + +bool ImGui_ImplMetal_Init(id device) +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_CreateBackendData(); + ImGuiIO& io = ImGui::GetIO(); + io.BackendRendererUserData = (void*)bd; + io.BackendRendererName = "imgui_impl_metal"; + io.BackendFlags |= ImGuiBackendFlags_RendererHasVtxOffset; // We can honor the ImDrawCmd::VtxOffset field, allowing for large meshes. 
+ + bd->SharedMetalContext = [[MetalContext alloc] init]; + bd->SharedMetalContext.device = device; + + return true; +} + +void ImGui_ImplMetal_Shutdown() +{ + ImGui_ImplMetal_DestroyDeviceObjects(); + ImGui_ImplMetal_DestroyBackendData(); +} + +void ImGui_ImplMetal_NewFrame(MTLRenderPassDescriptor* renderPassDescriptor) +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + IM_ASSERT(bd->SharedMetalContext != nil && "No Metal context. Did you call ImGui_ImplMetal_Init() ?"); + bd->SharedMetalContext.framebufferDescriptor = [[FramebufferDescriptor alloc] initWithRenderPassDescriptor:renderPassDescriptor]; + + if (bd->SharedMetalContext.depthStencilState == nil) + ImGui_ImplMetal_CreateDeviceObjects(bd->SharedMetalContext.device); +} + +static void ImGui_ImplMetal_SetupRenderState(ImDrawData* drawData, id commandBuffer, + id commandEncoder, id renderPipelineState, + MetalBuffer* vertexBuffer, size_t vertexBufferOffset) +{ + IM_UNUSED(commandBuffer); + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + [commandEncoder setCullMode:MTLCullModeNone]; + [commandEncoder setDepthStencilState:bd->SharedMetalContext.depthStencilState]; + + // Setup viewport, orthographic projection matrix + // Our visible imgui space lies from draw_data->DisplayPos (top left) to + // draw_data->DisplayPos+data_data->DisplaySize (bottom right). DisplayMin is typically (0,0) for single viewport apps. 
+ MTLViewport viewport = + { + .originX = 0.0, + .originY = 0.0, + .width = (double)(drawData->DisplaySize.x * drawData->FramebufferScale.x), + .height = (double)(drawData->DisplaySize.y * drawData->FramebufferScale.y), + .znear = 0.0, + .zfar = 1.0 + }; + [commandEncoder setViewport:viewport]; + + float L = drawData->DisplayPos.x; + float R = drawData->DisplayPos.x + drawData->DisplaySize.x; + float T = drawData->DisplayPos.y; + float B = drawData->DisplayPos.y + drawData->DisplaySize.y; + float N = (float)viewport.znear; + float F = (float)viewport.zfar; + const float ortho_projection[4][4] = + { + { 2.0f/(R-L), 0.0f, 0.0f, 0.0f }, + { 0.0f, 2.0f/(T-B), 0.0f, 0.0f }, + { 0.0f, 0.0f, 1/(F-N), 0.0f }, + { (R+L)/(L-R), (T+B)/(B-T), N/(F-N), 1.0f }, + }; + [commandEncoder setVertexBytes:&ortho_projection length:sizeof(ortho_projection) atIndex:1]; + + [commandEncoder setRenderPipelineState:renderPipelineState]; + + [commandEncoder setVertexBuffer:vertexBuffer.buffer offset:0 atIndex:0]; + [commandEncoder setVertexBufferOffset:vertexBufferOffset atIndex:0]; +} + +// Metal Render function. +void ImGui_ImplMetal_RenderDrawData(ImDrawData* drawData, id commandBuffer, id commandEncoder) +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + MetalContext* ctx = bd->SharedMetalContext; + + // Avoid rendering when minimized, scale coordinates for retina displays (screen coordinates != framebuffer coordinates) + int fb_width = (int)(drawData->DisplaySize.x * drawData->FramebufferScale.x); + int fb_height = (int)(drawData->DisplaySize.y * drawData->FramebufferScale.y); + if (fb_width <= 0 || fb_height <= 0 || drawData->CmdListsCount == 0) + return; + + // Try to retrieve a render pipeline state that is compatible with the framebuffer config for this frame + // The hit rate for this cache should be very near 100%. 
+ id renderPipelineState = ctx.renderPipelineStateCache[ctx.framebufferDescriptor]; + if (renderPipelineState == nil) + { + // No luck; make a new render pipeline state + renderPipelineState = [ctx renderPipelineStateForFramebufferDescriptor:ctx.framebufferDescriptor device:commandBuffer.device]; + + // Cache render pipeline state for later reuse + ctx.renderPipelineStateCache[ctx.framebufferDescriptor] = renderPipelineState; + } + + size_t vertexBufferLength = (size_t)drawData->TotalVtxCount * sizeof(ImDrawVert); + size_t indexBufferLength = (size_t)drawData->TotalIdxCount * sizeof(ImDrawIdx); + MetalBuffer* vertexBuffer = [ctx dequeueReusableBufferOfLength:vertexBufferLength device:commandBuffer.device]; + MetalBuffer* indexBuffer = [ctx dequeueReusableBufferOfLength:indexBufferLength device:commandBuffer.device]; + + ImGui_ImplMetal_SetupRenderState(drawData, commandBuffer, commandEncoder, renderPipelineState, vertexBuffer, 0); + + // Will project scissor/clipping rectangles into framebuffer space + ImVec2 clip_off = drawData->DisplayPos; // (0,0) unless using multi-viewports + ImVec2 clip_scale = drawData->FramebufferScale; // (1,1) unless using retina display which are often (2,2) + + // Render command lists + size_t vertexBufferOffset = 0; + size_t indexBufferOffset = 0; + for (int n = 0; n < drawData->CmdListsCount; n++) + { + const ImDrawList* cmd_list = drawData->CmdLists[n]; + + memcpy((char*)vertexBuffer.buffer.contents + vertexBufferOffset, cmd_list->VtxBuffer.Data, (size_t)cmd_list->VtxBuffer.Size * sizeof(ImDrawVert)); + memcpy((char*)indexBuffer.buffer.contents + indexBufferOffset, cmd_list->IdxBuffer.Data, (size_t)cmd_list->IdxBuffer.Size * sizeof(ImDrawIdx)); + + for (int cmd_i = 0; cmd_i < cmd_list->CmdBuffer.Size; cmd_i++) + { + const ImDrawCmd* pcmd = &cmd_list->CmdBuffer[cmd_i]; + if (pcmd->UserCallback) + { + // User callback, registered via ImDrawList::AddCallback() + // (ImDrawCallback_ResetRenderState is a special callback value used by the 
user to request the renderer to reset render state.) + if (pcmd->UserCallback == ImDrawCallback_ResetRenderState) + ImGui_ImplMetal_SetupRenderState(drawData, commandBuffer, commandEncoder, renderPipelineState, vertexBuffer, vertexBufferOffset); + else + pcmd->UserCallback(cmd_list, pcmd); + } + else + { + // Project scissor/clipping rectangles into framebuffer space + ImVec2 clip_min((pcmd->ClipRect.x - clip_off.x) * clip_scale.x, (pcmd->ClipRect.y - clip_off.y) * clip_scale.y); + ImVec2 clip_max((pcmd->ClipRect.z - clip_off.x) * clip_scale.x, (pcmd->ClipRect.w - clip_off.y) * clip_scale.y); + + // Clamp to viewport as setScissorRect() won't accept values that are off bounds + if (clip_min.x < 0.0f) { clip_min.x = 0.0f; } + if (clip_min.y < 0.0f) { clip_min.y = 0.0f; } + if (clip_max.x > fb_width) { clip_max.x = (float)fb_width; } + if (clip_max.y > fb_height) { clip_max.y = (float)fb_height; } + if (clip_max.x <= clip_min.x || clip_max.y <= clip_min.y) + continue; + if (pcmd->ElemCount == 0) // drawIndexedPrimitives() validation doesn't accept this + continue; + + // Apply scissor/clipping rectangle + MTLScissorRect scissorRect = + { + .x = NSUInteger(clip_min.x), + .y = NSUInteger(clip_min.y), + .width = NSUInteger(clip_max.x - clip_min.x), + .height = NSUInteger(clip_max.y - clip_min.y) + }; + [commandEncoder setScissorRect:scissorRect]; + + // Bind texture, Draw + if (ImTextureID tex_id = pcmd->GetTexID()) + [commandEncoder setFragmentTexture:(__bridge id)(tex_id) atIndex:0]; + + [commandEncoder setVertexBufferOffset:(vertexBufferOffset + pcmd->VtxOffset * sizeof(ImDrawVert)) atIndex:0]; + [commandEncoder drawIndexedPrimitives:MTLPrimitiveTypeTriangle + indexCount:pcmd->ElemCount + indexType:sizeof(ImDrawIdx) == 2 ? 
MTLIndexTypeUInt16 : MTLIndexTypeUInt32 + indexBuffer:indexBuffer.buffer + indexBufferOffset:indexBufferOffset + pcmd->IdxOffset * sizeof(ImDrawIdx)]; + } + } + + vertexBufferOffset += (size_t)cmd_list->VtxBuffer.Size * sizeof(ImDrawVert); + indexBufferOffset += (size_t)cmd_list->IdxBuffer.Size * sizeof(ImDrawIdx); + } + + [commandBuffer addCompletedHandler:^(id) + { + dispatch_async(dispatch_get_main_queue(), ^{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + if (bd != nullptr) + { + @synchronized(bd->SharedMetalContext.bufferCache) + { + [bd->SharedMetalContext.bufferCache addObject:vertexBuffer]; + [bd->SharedMetalContext.bufferCache addObject:indexBuffer]; + } + } + }); + }]; +} + +bool ImGui_ImplMetal_CreateFontsTexture(id device) +{ + // HACK: check if the font atlas has been built already + ImGuiIO& io = ImGui::GetIO(); + if (io.Fonts->IsBuilt()) + return true; + + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + + // We are retrieving and uploading the font atlas as a 4-channels RGBA texture here. + // In theory we could call GetTexDataAsAlpha8() and upload a 1-channel texture to save on memory access bandwidth. + // However, using a shader designed for 1-channel texture would make it less obvious to use the ImTextureID facility to render users own textures. + // You can make that change in your implementation. 
+ unsigned char* pixels; + int width, height; + io.Fonts->GetTexDataAsRGBA32(&pixels, &width, &height); + MTLTextureDescriptor* textureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA8Unorm + width:(NSUInteger)width + height:(NSUInteger)height + mipmapped:NO]; + textureDescriptor.usage = MTLTextureUsageShaderRead; +#if TARGET_OS_OSX || TARGET_OS_MACCATALYST + textureDescriptor.storageMode = MTLStorageModeManaged; +#else + textureDescriptor.storageMode = MTLStorageModeShared; +#endif + id texture = [device newTextureWithDescriptor:textureDescriptor]; + [texture replaceRegion:MTLRegionMake2D(0, 0, (NSUInteger)width, (NSUInteger)height) mipmapLevel:0 withBytes:pixels bytesPerRow:(NSUInteger)width * 4]; + bd->SharedMetalContext.fontTexture = texture; + io.Fonts->SetTexID((__bridge void*)bd->SharedMetalContext.fontTexture); // ImTextureID == void* + + return (bd->SharedMetalContext.fontTexture != nil); +} + +void ImGui_ImplMetal_DestroyFontsTexture() +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + ImGuiIO& io = ImGui::GetIO(); + bd->SharedMetalContext.fontTexture = nil; + io.Fonts->SetTexID(nullptr); +} + +bool ImGui_ImplMetal_CreateDeviceObjects(id device) +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + MTLDepthStencilDescriptor* depthStencilDescriptor = [[MTLDepthStencilDescriptor alloc] init]; + depthStencilDescriptor.depthWriteEnabled = NO; + depthStencilDescriptor.depthCompareFunction = MTLCompareFunctionAlways; + bd->SharedMetalContext.depthStencilState = [device newDepthStencilStateWithDescriptor:depthStencilDescriptor]; + ImGui_ImplMetal_CreateFontsTexture(device); + + return true; +} + +void ImGui_ImplMetal_DestroyDeviceObjects() +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + ImGui_ImplMetal_DestroyFontsTexture(); + [bd->SharedMetalContext.renderPipelineStateCache removeAllObjects]; +} + +#pragma mark - MetalBuffer implementation + +@implementation MetalBuffer 
+- (instancetype)initWithBuffer:(id)buffer +{ + if ((self = [super init])) + { + _buffer = buffer; + _lastReuseTime = GetMachAbsoluteTimeInSeconds(); + } + return self; +} +@end + +#pragma mark - FramebufferDescriptor implementation + +@implementation FramebufferDescriptor +- (instancetype)initWithRenderPassDescriptor:(MTLRenderPassDescriptor*)renderPassDescriptor +{ + if ((self = [super init])) + { + _sampleCount = renderPassDescriptor.colorAttachments[0].texture.sampleCount; + _colorPixelFormat = renderPassDescriptor.colorAttachments[0].texture.pixelFormat; + _depthPixelFormat = renderPassDescriptor.depthAttachment.texture.pixelFormat; + _stencilPixelFormat = renderPassDescriptor.stencilAttachment.texture.pixelFormat; + } + return self; +} + +- (nonnull id)copyWithZone:(nullable NSZone*)zone +{ + FramebufferDescriptor* copy = [[FramebufferDescriptor allocWithZone:zone] init]; + copy.sampleCount = self.sampleCount; + copy.colorPixelFormat = self.colorPixelFormat; + copy.depthPixelFormat = self.depthPixelFormat; + copy.stencilPixelFormat = self.stencilPixelFormat; + return copy; +} + +- (NSUInteger)hash +{ + NSUInteger sc = _sampleCount & 0x3; + NSUInteger cf = _colorPixelFormat & 0x3FF; + NSUInteger df = _depthPixelFormat & 0x3FF; + NSUInteger sf = _stencilPixelFormat & 0x3FF; + NSUInteger hash = (sf << 22) | (df << 12) | (cf << 2) | sc; + return hash; +} + +- (BOOL)isEqual:(id)object +{ + FramebufferDescriptor* other = object; + if (![other isKindOfClass:[FramebufferDescriptor class]]) + return NO; + return other.sampleCount == self.sampleCount && + other.colorPixelFormat == self.colorPixelFormat && + other.depthPixelFormat == self.depthPixelFormat && + other.stencilPixelFormat == self.stencilPixelFormat; +} + +@end + +#pragma mark - MetalContext implementation + +@implementation MetalContext +- (instancetype)init +{ + if ((self = [super init])) + { + self.renderPipelineStateCache = [NSMutableDictionary dictionary]; + self.bufferCache = [NSMutableArray array]; + 
_lastBufferCachePurge = GetMachAbsoluteTimeInSeconds(); + } + return self; +} + +- (MetalBuffer*)dequeueReusableBufferOfLength:(NSUInteger)length device:(id)device +{ + uint64_t now = GetMachAbsoluteTimeInSeconds(); + + @synchronized(self.bufferCache) + { + // Purge old buffers that haven't been useful for a while + if (now - self.lastBufferCachePurge > 1.0) + { + NSMutableArray* survivors = [NSMutableArray array]; + for (MetalBuffer* candidate in self.bufferCache) + if (candidate.lastReuseTime > self.lastBufferCachePurge) + [survivors addObject:candidate]; + self.bufferCache = [survivors mutableCopy]; + self.lastBufferCachePurge = now; + } + + // See if we have a buffer we can reuse + MetalBuffer* bestCandidate = nil; + for (MetalBuffer* candidate in self.bufferCache) + if (candidate.buffer.length >= length && (bestCandidate == nil || bestCandidate.lastReuseTime > candidate.lastReuseTime)) + bestCandidate = candidate; + + if (bestCandidate != nil) + { + [self.bufferCache removeObject:bestCandidate]; + bestCandidate.lastReuseTime = now; + return bestCandidate; + } + } + + // No luck; make a new buffer + id backing = [device newBufferWithLength:length options:MTLResourceStorageModeShared]; + return [[MetalBuffer alloc] initWithBuffer:backing]; +} + +// Bilinear sampling is required by default. Set 'io.Fonts->Flags |= ImFontAtlasFlags_NoBakedLines' or 'style.AntiAliasedLinesUseTex = false' to allow point/nearest sampling. 
+- (id)renderPipelineStateForFramebufferDescriptor:(FramebufferDescriptor*)descriptor device:(id)device +{ + NSError* error = nil; + + NSString* shaderSource = @"" + "#include \n" + "using namespace metal;\n" + "\n" + "struct Uniforms {\n" + " float4x4 projectionMatrix;\n" + "};\n" + "\n" + "struct VertexIn {\n" + " float2 position [[attribute(0)]];\n" + " float2 texCoords [[attribute(1)]];\n" + " uchar4 color [[attribute(2)]];\n" + "};\n" + "\n" + "struct VertexOut {\n" + " float4 position [[position]];\n" + " float2 texCoords;\n" + " float4 color;\n" + "};\n" + "\n" + "vertex VertexOut vertex_main(VertexIn in [[stage_in]],\n" + " constant Uniforms &uniforms [[buffer(1)]]) {\n" + " VertexOut out;\n" + " out.position = uniforms.projectionMatrix * float4(in.position, 0, 1);\n" + " out.texCoords = in.texCoords;\n" + " out.color = float4(in.color) / float4(255.0);\n" + " return out;\n" + "}\n" + "\n" + "fragment half4 fragment_main(VertexOut in [[stage_in]],\n" + " texture2d texture [[texture(0)]]) {\n" + " constexpr sampler linearSampler(coord::normalized, min_filter::linear, mag_filter::linear, mip_filter::linear);\n" + " half4 texColor = texture.sample(linearSampler, in.texCoords);\n" + " return half4(in.color) * texColor;\n" + "}\n"; + + id library = [device newLibraryWithSource:shaderSource options:nil error:&error]; + if (library == nil) + { + NSLog(@"Error: failed to create Metal library: %@", error); + return nil; + } + + id vertexFunction = [library newFunctionWithName:@"vertex_main"]; + id fragmentFunction = [library newFunctionWithName:@"fragment_main"]; + + if (vertexFunction == nil || fragmentFunction == nil) + { + NSLog(@"Error: failed to find Metal shader functions in library: %@", error); + return nil; + } + + MTLVertexDescriptor* vertexDescriptor = [MTLVertexDescriptor vertexDescriptor]; + vertexDescriptor.attributes[0].offset = IM_OFFSETOF(ImDrawVert, pos); + vertexDescriptor.attributes[0].format = MTLVertexFormatFloat2; // position + 
vertexDescriptor.attributes[0].bufferIndex = 0; + vertexDescriptor.attributes[1].offset = IM_OFFSETOF(ImDrawVert, uv); + vertexDescriptor.attributes[1].format = MTLVertexFormatFloat2; // texCoords + vertexDescriptor.attributes[1].bufferIndex = 0; + vertexDescriptor.attributes[2].offset = IM_OFFSETOF(ImDrawVert, col); + vertexDescriptor.attributes[2].format = MTLVertexFormatUChar4; // color + vertexDescriptor.attributes[2].bufferIndex = 0; + vertexDescriptor.layouts[0].stepRate = 1; + vertexDescriptor.layouts[0].stepFunction = MTLVertexStepFunctionPerVertex; + vertexDescriptor.layouts[0].stride = sizeof(ImDrawVert); + + MTLRenderPipelineDescriptor* pipelineDescriptor = [[MTLRenderPipelineDescriptor alloc] init]; + pipelineDescriptor.vertexFunction = vertexFunction; + pipelineDescriptor.fragmentFunction = fragmentFunction; + pipelineDescriptor.vertexDescriptor = vertexDescriptor; + pipelineDescriptor.rasterSampleCount = self.framebufferDescriptor.sampleCount; + pipelineDescriptor.colorAttachments[0].pixelFormat = self.framebufferDescriptor.colorPixelFormat; + pipelineDescriptor.colorAttachments[0].blendingEnabled = YES; + pipelineDescriptor.colorAttachments[0].rgbBlendOperation = MTLBlendOperationAdd; + pipelineDescriptor.colorAttachments[0].sourceRGBBlendFactor = MTLBlendFactorSourceAlpha; + pipelineDescriptor.colorAttachments[0].destinationRGBBlendFactor = MTLBlendFactorOneMinusSourceAlpha; + pipelineDescriptor.colorAttachments[0].alphaBlendOperation = MTLBlendOperationAdd; + pipelineDescriptor.colorAttachments[0].sourceAlphaBlendFactor = MTLBlendFactorOne; + pipelineDescriptor.colorAttachments[0].destinationAlphaBlendFactor = MTLBlendFactorOneMinusSourceAlpha; + pipelineDescriptor.depthAttachmentPixelFormat = self.framebufferDescriptor.depthPixelFormat; + pipelineDescriptor.stencilAttachmentPixelFormat = self.framebufferDescriptor.stencilPixelFormat; + + id renderPipelineState = [device newRenderPipelineStateWithDescriptor:pipelineDescriptor error:&error]; + if 
(error != nil) + NSLog(@"Error: failed to create Metal pipeline state: %@", error); + + return renderPipelineState; +} + +@end From c4a26d4c7055c47ea6a3d0f3c5b0733bc3efce82 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 24 Aug 2024 17:06:50 +0200 Subject: [PATCH 111/368] report total vram correctly --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 +++- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f4a35eb54..644061719 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -40,6 +40,7 @@ MetalRenderer::MetalRenderer() // Feature support m_hasUnifiedMemory = m_device->hasUnifiedMemory(); m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); + m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize(); m_pixelFormatSupport = MetalPixelFormatSupport(m_device); // Resources @@ -213,7 +214,8 @@ bool MetalRenderer::IsPadWindowActive() bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { usageInMB = m_device->currentAllocatedSize() / 1024 / 1024; - totalInMB = usageInMB; + // TODO: get the total VRAM size? 
Though would be pretty useless on Apple Silicon + totalInMB = m_recommendedMaxVRAMUsage / 1024 / 1024; return true; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index d43dcdd38..5b34d630b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -414,6 +414,7 @@ class MetalRenderer : public Renderer // Feature support bool m_hasUnifiedMemory; bool m_isAppleGPU; + uint32 m_recommendedMaxVRAMUsage; MetalPixelFormatSupport m_pixelFormatSupport; // Managers and caches From 6bf3406793d324bdf8c85643c5e71d4edd6b4249 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 24 Aug 2024 17:30:56 +0200 Subject: [PATCH 112/368] implement IsPadWindowActive --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 644061719..bc5b83ac2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -206,9 +206,7 @@ void MetalRenderer::Shutdown() // TODO: what should this do? 
bool MetalRenderer::IsPadWindowActive() { - //debug_printf("MetalRenderer::IsPadWindowActive not implemented\n"); - - return false; + return (GetLayer(false).GetDrawable() != nullptr); } bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const From b105a383aa87cb7fc59bcf3e92f25731926fa234 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 24 Aug 2024 17:48:02 +0200 Subject: [PATCH 113/368] add: basic debug overlay --- src/Cafe/CMakeLists.txt | 1 + .../Renderer/Metal/MetalBufferAllocator.h | 3 +++ .../Renderer/Metal/MetalPerformanceMonitor.h | 18 ++++++++++++++++++ .../Latte/Renderer/Metal/MetalPipelineCache.h | 3 +++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 11 ++++++++++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 5 +++++ 6 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 100f00d83..634014b7b 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -567,6 +567,7 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalSamplerCache.h HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h + HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h HW/Latte/Renderer/Metal/UtilityShaderSource.h ) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 9853ae7fd..3ec0acbd4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -93,6 +93,9 @@ class MetalBufferAllocator m_freeBufferRanges.push_back(range); } + // Debug + m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory += m_allocationSize; + // Increase the allocation size for the next buffer if (m_allocationSize < 128 * 1024 * 1024) m_allocationSize *= 2; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h new file mode 100644 index 000000000..64e94d38b --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h @@ -0,0 +1,18 @@ +#pragma once + +class MetalPerformanceMonitor +{ +public: + size_t m_bufferAllocatorMemory = 0; + + // Per frame data + uint32 m_renderPasses = 0; + + MetalPerformanceMonitor() = default; + ~MetalPerformanceMonitor() = default; + + void ResetPerFrameData() + { + m_renderPasses = 0; + } +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index 30f40208b..ec4cabbf4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -20,6 +20,9 @@ class MetalPipelineCache MTL::RenderPipelineState* GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType); + // Debug + size_t GetPipelineCacheSize() const { return m_pipelineCache.size(); } + private: class MetalRenderer* m_mtlr; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index bc5b83ac2..dfbbb2cca 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -395,7 +395,13 @@ void MetalRenderer::DeleteFontTextures() void MetalRenderer::AppendOverlayDebugInfo() { - // TODO: implement + ImGui::Text("--- Metal info ---"); + ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize()); + ImGui::Text("Buffer allocator memory %zuMB", m_performanceMonitor.m_bufferAllocatorMemory / 1024 / 1024); + + ImGui::Text("--- Metal info (per frame) ---"); + ImGui::Text("Command buffers %zu", m_commandBuffers.size()); + 
ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); } // TODO: halfZ @@ -1725,6 +1731,9 @@ void MetalRenderer::SwapBuffer(bool mainWindow) { debug_printf("skipped present!\n"); } + + // Debug + m_performanceMonitor.ResetPerFrameData(); } void MetalRenderer::EnsureImGuiBackend() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 5b34d630b..d1f0eaeb8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -3,6 +3,7 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" struct MetalBufferAllocation { @@ -312,6 +313,8 @@ class MetalRenderer : public Renderer } // Helpers + MetalPerformanceMonitor& GetPerformanceMonitor() { return m_performanceMonitor; } + MTL::CommandBuffer* GetCurrentCommandBuffer() { cemu_assert_debug(m_commandBuffers.size() != 0); @@ -407,6 +410,8 @@ class MetalRenderer : public Renderer MetalLayerHandle m_mainLayer; MetalLayerHandle m_padLayer; + MetalPerformanceMonitor m_performanceMonitor; + // Metal objects MTL::Device* m_device; MTL::CommandQueue* m_commandQueue; From c905399f1f8b255ae858da730f893911efb7679d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 25 Aug 2024 10:09:18 +0200 Subject: [PATCH 114/368] fix: ImGui issues --- .../Latte/Renderer/Metal/MetalLayerHandle.cpp | 20 -------- .../Latte/Renderer/Metal/MetalLayerHandle.h | 7 --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 48 ++++++++++++++----- src/imgui/imgui_impl_metal.mm | 6 +-- 4 files changed, 37 insertions(+), 44 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp index 9349dc895..f4d4490e1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp @@ -16,8 
+16,6 @@ MetalLayerHandle::~MetalLayerHandle() { if (m_layer) m_layer->release(); - if (m_renderPassDescriptor) - m_renderPassDescriptor->release(); } void MetalLayerHandle::Resize(const Vector2i& size) @@ -37,27 +35,9 @@ bool MetalLayerHandle::AcquireDrawable() return false; } - if (m_renderPassDescriptor) - { - m_renderPassDescriptor->release(); - m_renderPassDescriptor = nullptr; - } - return true; } -void MetalLayerHandle::CreateRenderPassDescriptor(bool clear) -{ - if (m_renderPassDescriptor) - m_renderPassDescriptor->release(); - - m_renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); - auto colorAttachment = m_renderPassDescriptor->colorAttachments()->object(0); - colorAttachment->setTexture(m_drawable->texture()); - colorAttachment->setLoadAction(clear ? MTL::LoadActionClear : MTL::LoadActionLoad); - colorAttachment->setStoreAction(MTL::StoreActionStore); -} - void MetalLayerHandle::PresentDrawable(MTL::CommandBuffer* commandBuffer) { commandBuffer->presentDrawable(m_drawable); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h index b4aff7dfb..39a7cd1f3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h @@ -3,8 +3,6 @@ #include #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" -#include "QuartzCore/CAMetalDrawable.hpp" -#include "QuartzCore/CAMetalLayer.hpp" #include "util/math/vector2.h" class MetalLayerHandle @@ -19,20 +17,15 @@ class MetalLayerHandle bool AcquireDrawable(); - void CreateRenderPassDescriptor(bool clear); - void PresentDrawable(MTL::CommandBuffer* commandBuffer); CA::MetalLayer* GetLayer() const { return m_layer; } CA::MetalDrawable* GetDrawable() const { return m_drawable; } - MTL::RenderPassDescriptor* GetRenderPassDescriptor() const { return m_renderPassDescriptor; } - private: CA::MetalLayer* m_layer = nullptr; float m_layerScaleX, m_layerScaleY; CA::MetalDrawable* m_drawable = 
nullptr; - MTL::RenderPassDescriptor* m_renderPassDescriptor = nullptr; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index dfbbb2cca..ee15a5a56 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,6 +20,7 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" +#include "Metal/MTLRenderPass.hpp" #include "imgui.h" #define IMGUI_IMPL_METAL_CPP @@ -237,8 +238,8 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) { if (swapTV) SwapBuffer(true); - if (swapDRC) - SwapBuffer(false); + //if (swapDRC) + // SwapBuffer(false); // Release all the command buffers CommitCommandBuffer(); @@ -262,9 +263,15 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput // Create render pass auto& layer = GetLayer(!padView); - layer.CreateRenderPassDescriptor(clearBackground); - auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(layer.GetRenderPassDescriptor()); + MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(layer.GetDrawable()->texture()); + colorAttachment->setLoadAction(clearBackground ? MTL::LoadActionClear : MTL::LoadActionLoad); + colorAttachment->setStoreAction(MTL::StoreActionStore); + + auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor); + renderPassDescriptor->release(); // Draw to Metal layer renderCommandEncoder->setRenderPipelineState(m_state.m_usesSRGB ? 
m_presentPipelineSRGB : m_presentPipelineLinear); @@ -303,25 +310,36 @@ void MetalRenderer::NotifyLatteCommandProcessorIdle() bool MetalRenderer::ImguiBegin(bool mainWindow) { - EnsureImGuiBackend(); - if (!Renderer::ImguiBegin(mainWindow)) return false; if (!AcquireDrawable(mainWindow)) return false; + EnsureImGuiBackend(); + + // Check if the font texture needs to be built + ImGuiIO& io = ImGui::GetIO(); + if (!io.Fonts->IsBuilt()) + ImGui_ImplMetal_CreateFontsTexture(m_device); + auto& layer = GetLayer(mainWindow); - if (!layer.GetRenderPassDescriptor()) - layer.CreateRenderPassDescriptor(true); // TODO: should we clear? - ImGui_ImplMetal_CreateFontsTexture(m_device); - ImGui_ImplMetal_NewFrame(layer.GetRenderPassDescriptor()); + // Render pass descriptor + MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(layer.GetDrawable()->texture()); + colorAttachment->setLoadAction(MTL::LoadActionLoad); + colorAttachment->setStoreAction(MTL::StoreActionStore); + + // New frame + ImGui_ImplMetal_NewFrame(renderPassDescriptor); ImGui_UpdateWindowInformation(mainWindow); ImGui::NewFrame(); if (m_encoderType != MetalEncoderType::Render) - GetTemporaryRenderCommandEncoder(layer.GetRenderPassDescriptor()); + GetTemporaryRenderCommandEncoder(renderPassDescriptor); + renderPassDescriptor->release(); return true; } @@ -401,7 +419,7 @@ void MetalRenderer::AppendOverlayDebugInfo() ImGui::Text("--- Metal info (per frame) ---"); ImGui::Text("Command buffers %zu", m_commandBuffers.size()); - ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); + ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); } // TODO: halfZ @@ -1289,6 +1307,9 @@ MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL:: m_commandEncoder = renderCommandEncoder; m_encoderType = MetalEncoderType::Render; + // 
Debug + m_performanceMonitor.m_renderPasses++; + return renderCommandEncoder; } @@ -1348,6 +1369,9 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr ResetEncoderState(); + // Debug + m_performanceMonitor.m_renderPasses++; + return renderCommandEncoder; } diff --git a/src/imgui/imgui_impl_metal.mm b/src/imgui/imgui_impl_metal.mm index 8a070a6c2..5f0588573 100644 --- a/src/imgui/imgui_impl_metal.mm +++ b/src/imgui/imgui_impl_metal.mm @@ -311,12 +311,8 @@ void ImGui_ImplMetal_RenderDrawData(ImDrawData* drawData, id c bool ImGui_ImplMetal_CreateFontsTexture(id device) { - // HACK: check if the font atlas has been built already - ImGuiIO& io = ImGui::GetIO(); - if (io.Fonts->IsBuilt()) - return true; - ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + ImGuiIO& io = ImGui::GetIO(); // We are retrieving and uploading the font atlas as a 4-channels RGBA texture here. // In theory we could call GetTexDataAsAlpha8() and upload a 1-channel texture to save on memory access bandwidth. 
From 96d6168c502ddf6ce6c1317d4b0bc767c2c26168 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 25 Aug 2024 10:15:10 +0200 Subject: [PATCH 115/368] add: additional debug info --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 11 +++++++++-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 12 +++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ee15a5a56..eb85f1ea1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,6 +20,7 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" +#include "Metal/MTLDevice.hpp" #include "Metal/MTLRenderPass.hpp" #include "imgui.h" @@ -39,8 +40,9 @@ MetalRenderer::MetalRenderer() m_commandQueue = m_device->newCommandQueue(); // Feature support - m_hasUnifiedMemory = m_device->hasUnifiedMemory(); m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); + m_hasUnifiedMemory = m_device->hasUnifiedMemory(); + m_supportsMetal3 = m_device->supportsFamily(MTL::GPUFamilyMetal3); m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize(); m_pixelFormatSupport = MetalPixelFormatSupport(m_device); @@ -413,13 +415,18 @@ void MetalRenderer::DeleteFontTextures() void MetalRenderer::AppendOverlayDebugInfo() { + ImGui::Text("--- GPU info ---"); + ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no")); + ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no")); + ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? 
"yes" : "no")); + ImGui::Text("--- Metal info ---"); ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize()); ImGui::Text("Buffer allocator memory %zuMB", m_performanceMonitor.m_bufferAllocatorMemory / 1024 / 1024); ImGui::Text("--- Metal info (per frame) ---"); ImGui::Text("Command buffers %zu", m_commandBuffers.size()); - ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); + ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); } // TODO: halfZ diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index d1f0eaeb8..7a9b41e48 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -376,14 +376,19 @@ class MetalRenderer : public Renderer void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); // Getters + bool IsAppleGPU() const + { + return m_isAppleGPU; + } + bool HasUnifiedMemory() const { return m_hasUnifiedMemory; } - bool IsAppleGPU() const + bool SupportsMetal3() const { - return m_isAppleGPU; + return m_supportsMetal3; } const MetalPixelFormatSupport& GetPixelFormatSupport() const @@ -417,8 +422,9 @@ class MetalRenderer : public Renderer MTL::CommandQueue* m_commandQueue; // Feature support - bool m_hasUnifiedMemory; bool m_isAppleGPU; + bool m_hasUnifiedMemory; + bool m_supportsMetal3; uint32 m_recommendedMaxVRAMUsage; MetalPixelFormatSupport m_pixelFormatSupport; From a693bf564f21af42fdcca1b5f867dafa4349a30c Mon Sep 17 00:00:00 2001 From: Samo Z Date: Sun, 25 Aug 2024 20:32:46 +0200 Subject: [PATCH 116/368] fix: issues on Intel Macs --- .../Renderer/Metal/MetalBufferAllocator.h | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 3ec0acbd4..d3b0cc5e5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -39,7 +39,7 @@ class MetalBufferAllocator MetalBufferAllocation GetBufferAllocation(size_t size) { // Align the size - size = Align(size, 16); + size = Align(size, 128); // First, try to find a free range for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index eb85f1ea1..b01e41487 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -126,7 +126,7 @@ MetalRenderer::MetalRenderer() presentFragmentFunction->release(); error = nullptr; - renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatRGBA8Unorm); + renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatBGRA8Unorm); #ifdef CEMU_DEBUG_ASSERT renderPipelineDescriptor->setLabel(GetLabel("Present pipeline linear", renderPipelineDescriptor)); #endif @@ -138,7 +138,7 @@ MetalRenderer::MetalRenderer() } error = nullptr; - renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatRGBA8Unorm_sRGB); + renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatBGRA8Unorm_sRGB); #ifdef CEMU_DEBUG_ASSERT renderPipelineDescriptor->setLabel(GetLabel("Present pipeline sRGB", renderPipelineDescriptor)); #endif @@ -185,7 +185,9 @@ MetalRenderer::~MetalRenderer() void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { - GetLayer(mainWindow) = MetalLayerHandle(m_device, size); + auto& layer = GetLayer(mainWindow); + layer = MetalLayerHandle(m_device, size); + layer.GetLayer()->setPixelFormat(MTL::PixelFormatBGRA8Unorm); } void MetalRenderer::ResizeLayer(const Vector2i& size, bool 
mainWindow) @@ -381,7 +383,7 @@ ImTextureID MetalRenderer::GenerateTexture(const std::vector& data, const desc->setPixelFormat(MTL::PixelFormatRGBA8Unorm); desc->setWidth(size.x); desc->setHeight(size.y); - desc->setStorageMode(MTL::StorageModeShared); + desc->setStorageMode(m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModeManaged); desc->setUsage(MTL::TextureUsageShaderRead); MTL::Texture* texture = m_device->newTexture(desc); @@ -507,11 +509,14 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s auto blitCommandEncoder = GetBlitCommandEncoder(); // Allocate a temporary buffer - auto allocation = m_memoryManager->GetTemporaryBufferAllocator().GetBufferAllocation(compressedImageSize); - auto buffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(allocation.bufferIndex); + // HACK: use the persistent buffer allocator so as to avoid any issues + auto& bufferAllocator = m_memoryManager->GetBufferAllocator(); + auto allocation = bufferAllocator.GetBufferAllocation(compressedImageSize); + auto buffer = bufferAllocator.GetBuffer(allocation.bufferIndex); // Copy the data to the temporary buffer memcpy(allocation.data, pixelData, compressedImageSize); + buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); // Copy the data from the temporary buffer to the texture blitCommandEncoder->copyFromBuffer(buffer, allocation.offset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); @@ -1474,7 +1479,7 @@ bool MetalRenderer::AcquireDrawable(bool mainWindow) const bool latteBufferUsesSRGB = mainWindow ? LatteGPUState.tvBufferUsesSRGB : LatteGPUState.drcBufferUsesSRGB; if (latteBufferUsesSRGB != m_state.m_usesSRGB) { - layer.GetLayer()->setPixelFormat(latteBufferUsesSRGB ? MTL::PixelFormatRGBA8Unorm_sRGB : MTL::PixelFormatRGBA8Unorm); + layer.GetLayer()->setPixelFormat(latteBufferUsesSRGB ? 
MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm); m_state.m_usesSRGB = latteBufferUsesSRGB; } From 6c8947d0e5f95941cecc7456d087ab3d976b2761 Mon Sep 17 00:00:00 2001 From: Samo Z Date: Mon, 26 Aug 2024 08:07:49 +0200 Subject: [PATCH 117/368] fix: temporary buffer allocator --- .../HW/Latte/Renderer/Metal/MetalBufferAllocator.h | 12 +++++++++++- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 9 +++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index d3b0cc5e5..96724e883 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -152,7 +152,7 @@ struct MetalSyncedBuffer class MetalTemporaryBufferAllocator : public MetalBufferAllocator { public: - MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer, metalRenderer->GetOptimalBufferStorageMode()) {} + MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer, MTL::ResourceStorageModeShared) {} void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) { @@ -170,6 +170,16 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorlength()}); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b01e41487..854a6e006 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -509,8 +509,7 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s auto blitCommandEncoder = GetBlitCommandEncoder(); // Allocate a temporary buffer - // HACK: use the persistent buffer allocator so as to avoid any issues - auto& bufferAllocator = m_memoryManager->GetBufferAllocator(); + auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); 
auto allocation = bufferAllocator.GetBufferAllocation(compressedImageSize); auto buffer = bufferAllocator.GetBuffer(allocation.bufferIndex); @@ -1192,9 +1191,11 @@ void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, u void MetalRenderer::indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) { - auto buffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(bufferIndex); if (!HasUnifiedMemory()) + { + auto buffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(bufferIndex); buffer->didModifyRange(NS::Range(offset, size)); + } } void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) @@ -1459,7 +1460,7 @@ void MetalRenderer::CommitCommandBuffer() auto& commandBuffer = m_commandBuffers.back(); if (!commandBuffer.m_commited) { - commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer* cmd) { + commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) { m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer); }); From b7f88d093a5aa5c4bc8ac206fad9d921d094030a Mon Sep 17 00:00:00 2001 From: Samo Z Date: Mon, 26 Aug 2024 12:22:33 +0200 Subject: [PATCH 118/368] implement buffer locking system --- .../Renderer/Metal/MetalBufferAllocator.h | 111 +++++++++++++++--- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 40 +++++-- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 5 + 3 files changed, 133 insertions(+), 23 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 96724e883..51c119d95 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -1,6 +1,7 @@ #pragma once #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Common/precompiled.h" #include 
"Metal/MTLResource.hpp" struct MetalBufferRange @@ -146,20 +147,86 @@ typedef MetalBufferAllocator MetalDefaultBufferAllocator; struct MetalSyncedBuffer { MTL::Buffer* m_buffer; - std::vector m_commandBuffers; + std::vector m_commandBuffers; + uint32 m_lock = 0; + + bool IsLocked() const + { + return (m_lock != 0); + } }; +//constexpr uint16 MAX_COMMAND_BUFFER_FRAMES = 1024; + class MetalTemporaryBufferAllocator : public MetalBufferAllocator { public: MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer, MTL::ResourceStorageModeShared) {} - void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) + void LockBuffer(uint32 bufferIndex) + { + m_buffers[bufferIndex].m_lock++; + } + + void UnlockBuffer(uint32 bufferIndex) + { + auto& buffer = m_buffers[bufferIndex]; + + buffer.m_lock--; + + // TODO: is this really necessary? + // Release the buffer if it wasn't released due to the lock + if (!buffer.IsLocked() && buffer.m_commandBuffers.empty()) + m_freeBufferRanges.push_back({bufferIndex, 0, buffer.m_buffer->length()}); + } + + void UnlockAllBuffers() + { + for (uint32_t i = 0; i < m_buffers.size(); i++) + { + auto& buffer = m_buffers[i]; + + if (buffer.m_lock != 0) + { + if (buffer.m_commandBuffers.empty()) + m_freeBufferRanges.push_back({i, 0, buffer.m_buffer->length()}); + + buffer.m_lock = 0; + } + } + + /* + auto it = m_commandBuffersFrames.begin(); + while (it != m_commandBuffersFrames.end()) + { + it->second++; + + if (it->second > MAX_COMMAND_BUFFER_FRAMES) + { + debug_printf("command buffer %u remained unfinished for more than %u frames\n", it->first, MAX_COMMAND_BUFFER_FRAMES); + + // Pretend like the command buffer has finished + CommandBufferFinished(it->first, false); + + it = m_commandBuffersFrames.erase(it); + } + else + { + it++; + } + } + */ + } + + void SetActiveCommandBuffer(uint32 commandBuffer) { m_activeCommandBuffer = commandBuffer; + + //if (commandBuffer != INVALID_COMMAND_BUFFER_ID) + // 
m_commandBuffersFrames[commandBuffer] = 0; } - void CommandBufferFinished(MTL::CommandBuffer* commandBuffer) + void CommandBufferFinished(uint32 commandBuffer/*, bool erase = true*/) { for (uint32_t i = 0; i < m_buffers.size(); i++) { @@ -170,18 +237,21 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorlength()}); + // All command buffers using it have finished execution, we can use it again + m_freeBufferRanges.push_back({i, 0, buffer.m_buffer->length()}); + } buffer.m_commandBuffers.clear(); } @@ -193,18 +263,28 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator m_commandBuffersFrames; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 854a6e006..273b4c628 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -247,12 +247,16 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) // Release all the command buffers CommitCommandBuffer(); + // TODO: should this be released here? 
for (uint32 i = 0; i < m_commandBuffers.size(); i++) m_commandBuffers[i].m_commandBuffer->release(); m_commandBuffers.clear(); // Release frame persistent buffers m_memoryManager->GetFramePersistentBufferAllocator().ResetAllocations(); + + // Unlock all temporary buffers + m_memoryManager->GetTemporaryBufferAllocator().UnlockAllBuffers(); } // TODO: use `shader` for drawing @@ -515,7 +519,7 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s // Copy the data to the temporary buffer memcpy(allocation.data, pixelData, compressedImageSize); - buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); + //buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); // Copy the data from the temporary buffer to the texture blitCommandEncoder->copyFromBuffer(buffer, allocation.offset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); @@ -1116,7 +1120,13 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Draw MTL::Buffer* indexBuffer = nullptr; if (hostIndexType != INDEX_TYPE::NONE) - indexBuffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(indexBufferIndex); + { + auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); + indexBuffer = bufferAllocator.GetBuffer(indexBufferIndex); + + // We have already retrieved the buffer, no need for it to be locked anymore + bufferAllocator.UnlockBuffer(indexBufferIndex); + } if (usesGeometryShader) { if (indexBuffer) @@ -1182,20 +1192,27 @@ void MetalRenderer::draw_endSequence() void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { - auto allocation = m_memoryManager->GetTemporaryBufferAllocator().GetBufferAllocation(size); + auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); + auto allocation = bufferAllocator.GetBufferAllocation(size); offset = allocation.offset; bufferIndex = 
allocation.bufferIndex; + // Lock the buffer so that it doesn't get released + bufferAllocator.LockBuffer(allocation.bufferIndex); + return allocation.data; } void MetalRenderer::indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) { + // Do nothing + /* if (!HasUnifiedMemory()) { - auto buffer = m_memoryManager->GetTemporaryBufferAllocator().GetBuffer(bufferIndex); + auto buffer = m_memoryManager->GetTemporaryBufferAllocator().GetBufferOutsideOfCommandBuffer(bufferIndex); buffer->didModifyRange(NS::Range(offset, size)); } + */ } void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) @@ -1284,10 +1301,13 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() //m_commandQueue->insertDebugCaptureBoundary(); MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer(); - m_commandBuffers.push_back({mtlCommandBuffer}); + MetalCommandBuffer commandBuffer = {mtlCommandBuffer, m_commandBufferID}; + m_commandBuffers.push_back(commandBuffer); + + m_commandBufferID = (m_commandBufferID + 1) % 65536; // Notify memory manager about the new command buffer - m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer); + m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(commandBuffer.m_id); return mtlCommandBuffer; } @@ -1461,12 +1481,14 @@ void MetalRenderer::CommitCommandBuffer() if (!commandBuffer.m_commited) { commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) { - m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer); + m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_id); }); commandBuffer.m_commandBuffer->commit(); commandBuffer.m_commited = true; + m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(INVALID_COMMAND_BUFFER_ID); + // Debug 
//m_commandQueue->insertDebugCaptureBoundary(); } @@ -1702,8 +1724,8 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE auto supportBuffer = bufferAllocator.GetBufferAllocation(size); memcpy(supportBuffer.data, supportBufferData, size); auto buffer = bufferAllocator.GetBuffer(supportBuffer.bufferIndex); - if (!HasUnifiedMemory()) - buffer->didModifyRange(NS::Range(supportBuffer.offset, size)); + //if (!HasUnifiedMemory()) + // buffer->didModifyRange(NS::Range(supportBuffer.offset, size)); SetBuffer(renderCommandEncoder, mtlShaderType, buffer, supportBuffer.offset, shader->resourceMapping.uniformVarsBufferBindingPoint); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 7a9b41e48..3d494cbe5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -143,9 +143,12 @@ struct MetalState struct MetalCommandBuffer { MTL::CommandBuffer* m_commandBuffer; + uint32 m_id; bool m_commited = false; }; +constexpr uint32 INVALID_COMMAND_BUFFER_ID = std::numeric_limits::max(); + enum class MetalEncoderType { None, @@ -417,6 +420,8 @@ class MetalRenderer : public Renderer MetalPerformanceMonitor m_performanceMonitor; + uint32 m_commandBufferID = 0; + // Metal objects MTL::Device* m_device; MTL::CommandQueue* m_commandQueue; From cd8b74ba32ac569e81a3a497d78c284641e1e3f0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 26 Aug 2024 18:31:22 +0200 Subject: [PATCH 119/368] fix: buffer allocator leaks --- .../Renderer/Metal/MetalBufferAllocator.h | 88 +++++++++++++------ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 23 ++--- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 5 -- 3 files changed, 74 insertions(+), 42 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 51c119d95..445fb8231 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -147,7 +147,7 @@ typedef MetalBufferAllocator MetalDefaultBufferAllocator; struct MetalSyncedBuffer { MTL::Buffer* m_buffer; - std::vector m_commandBuffers; + std::vector m_commandBuffers; uint32 m_lock = 0; bool IsLocked() const @@ -156,7 +156,7 @@ struct MetalSyncedBuffer } }; -//constexpr uint16 MAX_COMMAND_BUFFER_FRAMES = 1024; +constexpr uint16 MAX_COMMAND_BUFFER_FRAMES = 8; class MetalTemporaryBufferAllocator : public MetalBufferAllocator { @@ -177,7 +177,7 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorlength()}); + FreeBuffer(bufferIndex); } void UnlockAllBuffers() @@ -189,7 +189,7 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorlength()}); + FreeBuffer(i); buffer.m_lock = 0; } @@ -203,7 +203,7 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorsecond > MAX_COMMAND_BUFFER_FRAMES) { - debug_printf("command buffer %u remained unfinished for more than %u frames\n", it->first, MAX_COMMAND_BUFFER_FRAMES); + debug_printf("command buffer %p remained unfinished for more than %u frames\n", it->first, MAX_COMMAND_BUFFER_FRAMES); // Pretend like the command buffer has finished CommandBufferFinished(it->first, false); @@ -218,48 +218,39 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorCommandBufferCompleted(buffer.m_commandBuffers[j])) { if (buffer.m_commandBuffers.size() == 1) { if (!buffer.IsLocked()) { - // First remove any free ranges that use this buffer - for (uint32 k = 0; k < m_freeBufferRanges.size(); k++) - { - if (m_freeBufferRanges[k].bufferIndex == i) - { - m_freeBufferRanges.erase(m_freeBufferRanges.begin() + k); - k--; - } - } - // All command buffers using it have finished execution, we can use it again - m_freeBufferRanges.push_back({i, 0, buffer.m_buffer->length()}); + FreeBuffer(i); } buffer.m_commandBuffers.clear(); + break; } else { 
buffer.m_commandBuffers.erase(buffer.m_commandBuffers.begin() + j); + j--; } - break; } } } @@ -270,10 +261,10 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator size: %lu, command buffers: %zu\n", buffer.m_buffer, buffer.m_buffer->length(), buffer.m_commandBuffers.size()); + uint32 same = 0; + uint32 completed = 0; + for (uint32 i = 0; i < buffer.m_commandBuffers.size(); i++) + { + if (m_mtlr->CommandBufferCompleted(buffer.m_commandBuffers[i])) + completed++; + for (uint32 j = 0; j < buffer.m_commandBuffers.size(); j++) + { + if (i != j && buffer.m_commandBuffers[i] == buffer.m_commandBuffers[j]) + same++; + } + } + debug_printf(" same: %u\n", same); + debug_printf(" completed: %u\n", completed); + } + + debug_printf("FREE RANGES:\n"); + for (auto& range : m_freeBufferRanges) + { + debug_printf(" %u -> offset: %zu, size: %zu\n", range.bufferIndex, range.offset, range.size); + } + } + */ + private: - uint32 m_activeCommandBuffer = INVALID_COMMAND_BUFFER_ID; + MTL::CommandBuffer* m_activeCommandBuffer = nullptr; + + //std::map m_commandBuffersFrames; + + void FreeBuffer(uint32 bufferIndex) + { + // First remove any free ranges that use this buffer + for (uint32 k = 0; k < m_freeBufferRanges.size(); k++) + { + if (m_freeBufferRanges[k].bufferIndex == bufferIndex) + { + m_freeBufferRanges.erase(m_freeBufferRanges.begin() + k); + k--; + } + } - //std::map m_commandBuffersFrames; + m_freeBufferRanges.push_back({bufferIndex, 0, m_buffers[bufferIndex].m_buffer->length()}); + } }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 273b4c628..5bd5040c2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -248,8 +248,8 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) // Release all the command buffers CommitCommandBuffer(); // TODO: should this be released here? 
- for (uint32 i = 0; i < m_commandBuffers.size(); i++) - m_commandBuffers[i].m_commandBuffer->release(); + //for (uint32 i = 0; i < m_commandBuffers.size(); i++) + // m_commandBuffers[i].m_commandBuffer->release(); m_commandBuffers.clear(); // Release frame persistent buffers @@ -257,6 +257,9 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) // Unlock all temporary buffers m_memoryManager->GetTemporaryBufferAllocator().UnlockAllBuffers(); + + // Check for completed command buffers + m_memoryManager->GetTemporaryBufferAllocator().CheckForCompletedCommandBuffers(); } // TODO: use `shader` for drawing @@ -1301,13 +1304,10 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() //m_commandQueue->insertDebugCaptureBoundary(); MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer(); - MetalCommandBuffer commandBuffer = {mtlCommandBuffer, m_commandBufferID}; - m_commandBuffers.push_back(commandBuffer); - - m_commandBufferID = (m_commandBufferID + 1) % 65536; + m_commandBuffers.push_back({mtlCommandBuffer}); // Notify memory manager about the new command buffer - m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(commandBuffer.m_id); + m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer); return mtlCommandBuffer; } @@ -1480,14 +1480,15 @@ void MetalRenderer::CommitCommandBuffer() auto& commandBuffer = m_commandBuffers.back(); if (!commandBuffer.m_commited) { - commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) { - m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_id); - }); + // Handled differently, since it seems like Metal doesn't always call the completion handler + //commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) { + // m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer); + //}); commandBuffer.m_commandBuffer->commit(); commandBuffer.m_commited = true; - 
m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(INVALID_COMMAND_BUFFER_ID); + m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(nullptr); // Debug //m_commandQueue->insertDebugCaptureBoundary(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 3d494cbe5..7a9b41e48 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -143,12 +143,9 @@ struct MetalState struct MetalCommandBuffer { MTL::CommandBuffer* m_commandBuffer; - uint32 m_id; bool m_commited = false; }; -constexpr uint32 INVALID_COMMAND_BUFFER_ID = std::numeric_limits::max(); - enum class MetalEncoderType { None, @@ -420,8 +417,6 @@ class MetalRenderer : public Renderer MetalPerformanceMonitor m_performanceMonitor; - uint32 m_commandBufferID = 0; - // Metal objects MTL::Device* m_device; MTL::CommandQueue* m_commandQueue; From 3439b3259e481743ff47c3376deaa827a75a1013 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 27 Aug 2024 08:18:17 +0200 Subject: [PATCH 120/368] uncomment drc swap buffer --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 5bd5040c2..5fac079f6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -242,12 +242,12 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) { if (swapTV) SwapBuffer(true); - //if (swapDRC) - // SwapBuffer(false); + if (swapDRC) + SwapBuffer(false); // Release all the command buffers CommitCommandBuffer(); - // TODO: should this be released here? 
+ // TODO: release //for (uint32 i = 0; i < m_commandBuffers.size(); i++) // m_commandBuffers[i].m_commandBuffer->release(); m_commandBuffers.clear(); From b7a1adec91f4c4c7b5526282883b3127c90e1a83 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 27 Aug 2024 14:39:52 +0200 Subject: [PATCH 121/368] clip mode --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 14 ++++++++++---- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 5fac079f6..7b6be6cea 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -947,10 +947,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 cullBack = polygonControlReg.get_CULL_BACK(); uint32 polyOffsetFrontEnable = polygonControlReg.get_OFFSET_FRONT_ENABLED(); - // TODO - //cemu_assert_debug(LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_NEAR_DISABLE() == LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE()); // near or far clipping can be disabled individually - //bool zClipEnable = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE() == false; - if (polyOffsetFrontEnable) { uint32 frontScaleU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.getRawValue(); @@ -984,6 +980,16 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } } + // Depth clip mode + cemu_assert_debug(LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_NEAR_DISABLE() == LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE()); // near or far clipping can be disabled individually + bool zClipEnable = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE() == false; + + if (zClipEnable != encoderState.m_depthClipEnable) + { + renderCommandEncoder->setDepthClipMode(zClipEnable ? 
MTL::DepthClipModeClip : MTL::DepthClipModeClamp); + encoderState.m_depthClipEnable = zClipEnable; + } + // todo - how does culling behave with rects? // right now we just assume that their winding is always CW if (isPrimitiveRect) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 7a9b41e48..8fe3a8d90 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -98,6 +98,7 @@ struct MetalEncoderState uint32 m_depthBias = 0; uint32 m_depthSlope = 0; uint32 m_depthClamp = 0; + bool m_depthClipEnable = true; struct { MTL::Buffer* m_buffer; size_t m_offset; From 4572f906a3afeac483f0e2db0c0690baae4d2e94 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 27 Aug 2024 16:01:27 +0200 Subject: [PATCH 122/368] fix: infinite command buffer wait --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 7b6be6cea..e5a6a9f6c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,6 +20,7 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" +#include "Metal/MTLCommandBuffer.hpp" #include "Metal/MTLDevice.hpp" #include "Metal/MTLRenderPass.hpp" #include "imgui.h" @@ -306,10 +307,13 @@ void MetalRenderer::Flush(bool waitIdle) { if (m_recordedDrawcalls > 0) CommitCommandBuffer(); - if (waitIdle && m_commandBuffers.size() != 0) + if (waitIdle) { - // TODO: shouldn't we wait for all command buffers? 
- WaitForCommandBufferCompletion(GetCurrentCommandBuffer()); + for (auto commandBuffer : m_commandBuffers) + { + if (commandBuffer.m_commited) + WaitForCommandBufferCompletion(commandBuffer.m_commandBuffer); + } } } @@ -1325,7 +1329,8 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() bool MetalRenderer::CommandBufferCompleted(MTL::CommandBuffer* commandBuffer) { - return commandBuffer->status() == MTL::CommandBufferStatusCompleted; + auto status = commandBuffer->status(); + return (status == MTL::CommandBufferStatusCompleted || status == MTL::CommandBufferStatusError); } void MetalRenderer::WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer) From 0e61471c5e0706572b52fb38d0af5ad7727d5c99 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 27 Aug 2024 16:01:50 +0200 Subject: [PATCH 123/368] skip draws when pipeline is invalid --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index e5a6a9f6c..008cac998 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1114,6 +1114,14 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 renderPipelineState = m_pipelineCache->GetMeshPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew, hostIndexType); else renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew); + + // HACK + if (!renderPipelineState) + { + debug_printf("invalid render pipeline state, skipping draw\n"); + return; + } + if (renderPipelineState != encoderState.m_renderPipelineState) { renderCommandEncoder->setRenderPipelineState(renderPipelineState); From 366be049a49c818a71daf0068bce217490d8c539 Mon Sep 17 00:00:00 2001 From: 
Samuliak Date: Wed, 28 Aug 2024 11:51:31 +0200 Subject: [PATCH 124/368] fix: validation errors --- .../Latte/Renderer/Metal/MetalBufferAllocator.h | 17 ++++++++++++----- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 17 +++++++++++------ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 445fb8231..702278ca6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -15,7 +15,13 @@ template class MetalBufferAllocator { public: - MetalBufferAllocator(class MetalRenderer* metalRenderer, MTL::ResourceOptions storageMode) : m_mtlr{metalRenderer}, m_storageMode{storageMode} {} + MetalBufferAllocator(class MetalRenderer* metalRenderer, MTL::ResourceOptions storageMode) : m_mtlr{metalRenderer} { + m_isCPUAccessible = (storageMode == MTL::ResourceStorageModeShared) || (storageMode == MTL::ResourceStorageModeManaged); + + m_options = storageMode; + if (m_isCPUAccessible) + m_options |= MTL::ResourceCPUCacheModeWriteCombined; + } ~MetalBufferAllocator() { @@ -54,7 +60,7 @@ class MetalBufferAllocator allocation.bufferIndex = range.bufferIndex; allocation.offset = range.offset; allocation.size = size; - allocation.data = (uint8*)buffer.m_buffer->contents() + range.offset; + allocation.data = (m_isCPUAccessible ? 
(uint8*)buffer.m_buffer->contents() + range.offset : nullptr); range.offset += size; range.size -= size; @@ -70,7 +76,7 @@ class MetalBufferAllocator // If no free range was found, allocate a new buffer m_allocationSize = std::max(m_allocationSize, size); - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, m_storageMode | MTL::ResourceCPUCacheModeWriteCombined); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, m_options); #ifdef CEMU_DEBUG_ASSERT buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); #endif @@ -79,7 +85,7 @@ class MetalBufferAllocator allocation.bufferIndex = m_buffers.size(); allocation.offset = 0; allocation.size = size; - allocation.data = buffer->contents(); + allocation.data = (m_isCPUAccessible ? buffer->contents() : nullptr); m_buffers.push_back({buffer}); @@ -129,7 +135,8 @@ class MetalBufferAllocator protected: class MetalRenderer* m_mtlr; - MTL::ResourceOptions m_storageMode; + bool m_isCPUAccessible; + MTL::ResourceOptions m_options; size_t m_allocationSize = 8 * 1024 * 1024; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 008cac998..44090a050 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -41,7 +41,7 @@ MetalRenderer::MetalRenderer() m_commandQueue = m_device->newCommandQueue(); // Feature support - m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); + m_isAppleGPU = false;//m_device->supportsFamily(MTL::GPUFamilyApple1); m_hasUnifiedMemory = m_device->hasUnifiedMemory(); m_supportsMetal3 = m_device->supportsFamily(MTL::GPUFamilyMetal3); m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize(); @@ -511,7 +511,7 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->GetFormat(), textureMtl->IsDepth(), width); // No need to 
set bytesPerImage for 3D textures, since we always load just one slice //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->IsDepth(), height, bytesPerRow); - if (IsAppleGPU()) + if (m_isAppleGPU) { textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); } @@ -674,7 +674,7 @@ LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { - GetCommandBuffer(); + return; // scale copy size to effective size sint32 effectiveCopyWidth = width; @@ -907,10 +907,12 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); // Depth stencil state - // TODO + // Disable depth write when there is no depth attachment - //if (!m_state.m_lastUsedFBO->depthBuffer.texture) - // depthControl.set_Z_WRITE_ENABLE(false); + auto& depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL; + bool depthWriteEnable = depthControl.get_Z_WRITE_ENABLE(); + if (!m_state.m_lastUsedFBO->depthBuffer.texture) + depthControl.set_Z_WRITE_ENABLE(false); MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); if (depthStencilState != encoderState.m_depthStencilState) @@ -919,6 +921,9 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 encoderState.m_depthStencilState = depthStencilState; } + // Restore the original depth write state + depthControl.set_Z_WRITE_ENABLE(depthWriteEnable); + // Stencil reference bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); if (stencilEnable) From 163eeea10207d3a1bf14c4419bf2e21e9c5b3d6d Mon Sep 17 
00:00:00 2001 From: Samuliak Date: Wed, 28 Aug 2024 13:19:51 +0200 Subject: [PATCH 125/368] use texture views for surface copies --- .../Metal/MetalHybridComputePipeline.cpp | 4 +- .../Metal/MetalHybridComputePipeline.h | 7 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 71 ++++++++++++------- .../Renderer/Metal/UtilityShaderSource.h | 14 +--- 4 files changed, 54 insertions(+), 42 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp index 3be1cf521..f822fb13c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp @@ -1,6 +1,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" -MetalHybridComputePipeline::MetalHybridComputePipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const char* vertexFunctionName, const char* kernelFunctionName) +MetalHybridComputePipeline::MetalHybridComputePipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName/*, const std::string& kernelFunctionName*/) { // Render pipeline state MTL::Function* vertexFunction = library->newFunction(ToNSString(vertexFunctionName)); @@ -20,12 +20,10 @@ MetalHybridComputePipeline::MetalHybridComputePipeline(class MetalRenderer* mtlR } // Compute pipeline state - // TODO } MetalHybridComputePipeline::~MetalHybridComputePipeline() { m_renderPipelineState->release(); - // TODO: uncomment //m_computePipelineState->release(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h index 7d586e242..5aeee65f2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h @@ -3,17 +3,18 @@ #include "Metal/MTLLibrary.hpp" #include "Metal/MTLRenderPipeline.hpp" +// TODO: rename to 
MetalVoidVertexPipeline class MetalHybridComputePipeline { public: - MetalHybridComputePipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const char* vertexFunctionName, const char* kernelFunctionName); + MetalHybridComputePipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName/*, const std::string& kernelFunctionName*/); ~MetalHybridComputePipeline(); MTL::RenderPipelineState* GetRenderPipelineState() const { return m_renderPipelineState; } - MTL::RenderPipelineState* GetComputePipelineState() const { return m_computePipelineState; } + //MTL::RenderPipelineState* GetComputePipelineState() const { return m_computePipelineState; } private: MTL::RenderPipelineState* m_renderPipelineState; - MTL::RenderPipelineState* m_computePipelineState; + //MTL::RenderPipelineState* m_computePipelineState; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 44090a050..f917e889a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" +#include "Foundation/NSTypes.hpp" #include "HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" @@ -23,7 +24,9 @@ #include "Metal/MTLCommandBuffer.hpp" #include "Metal/MTLDevice.hpp" #include "Metal/MTLRenderPass.hpp" +#include "Metal/MTLRenderPipeline.hpp" #include "imgui.h" +#include #define IMGUI_IMPL_METAL_CPP #include "imgui/imgui_extension.h" @@ -117,13 +120,13 @@ MetalRenderer::MetalRenderer() } // Present pipeline - MTL::Function* presentVertexFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); + MTL::Function* fullscreenVertexFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); 
MTL::Function* presentFragmentFunction = utilityLibrary->newFunction(ToNSString("fragmentPresent")); MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); - renderPipelineDescriptor->setVertexFunction(presentVertexFunction); + renderPipelineDescriptor->setVertexFunction(fullscreenVertexFunction); renderPipelineDescriptor->setFragmentFunction(presentFragmentFunction); - presentVertexFunction->release(); + fullscreenVertexFunction->release(); presentFragmentFunction->release(); error = nullptr; @@ -151,9 +154,12 @@ MetalRenderer::MetalRenderer() error->release(); } + // Copy texture pipelines + auto copyTextureToColorPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + // Hybrid pipelines - m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture", "kernelCopyTextureToTexture"); - m_restrideBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexRestrideBuffer", "kernelRestrideBuffer"); + m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture"); + m_restrideBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexRestrideBuffer"); utilityLibrary->release(); m_memoryManager->SetRestrideBufferPipeline(m_restrideBufferPipeline); @@ -674,8 +680,6 @@ LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) { - return; - // scale copy size to effective size sint32 effectiveCopyWidth = width; sint32 effectiveCopyHeight = height; @@ -688,33 +692,25 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so sint32 texDstMip = dstMip; sint32 texDstSlice = dstSlice; - LatteTextureMtl* 
srcTextureMtl = static_cast(sourceTexture); - LatteTextureMtl* dstTextureMtl = static_cast(destinationTexture); + // Create texture views + LatteTextureViewMtl* srcTextureMtl = static_cast(sourceTexture->GetOrCreateView(srcMip, 1, srcSlice, 1)); + LatteTextureViewMtl* dstTextureMtl = static_cast(destinationTexture->GetOrCreateView(dstMip, 1, dstSlice, 1)); // check if texture rescale ratios match // todo - if not, we have to use drawcall based copying - if (!LatteTexture_doesEffectiveRescaleRatioMatch(srcTextureMtl, texSrcMip, dstTextureMtl, texDstMip)) + if (!LatteTexture_doesEffectiveRescaleRatioMatch(sourceTexture, texSrcMip, destinationTexture, texDstMip)) { cemuLog_logDebug(LogType::Force, "surfaceCopy_copySurfaceWithFormatConversion(): Mismatching dimensions"); return; } // check if bpp size matches - if (srcTextureMtl->GetBPP() != dstTextureMtl->GetBPP()) + if (sourceTexture->GetBPP() != destinationTexture->GetBPP()) { cemuLog_logDebug(LogType::Force, "surfaceCopy_copySurfaceWithFormatConversion(): Mismatching BPP"); return; } - struct CopyParams - { - uint32 width; - uint32 srcMip; - uint32 srcSlice; - uint32 dstMip; - uint32 dstSlice; - } params{(uint32)effectiveCopyWidth, (uint32)texSrcMip, (uint32)texSrcSlice, (uint32)texDstMip, (uint32)texDstSlice}; - if (m_encoderType == MetalEncoderType::Render) { auto renderCommandEncoder = static_cast(m_commandEncoder); @@ -722,17 +718,42 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so renderCommandEncoder->setRenderPipelineState(m_copyTextureToTexturePipeline->GetRenderPipelineState()); m_state.m_encoderState.m_renderPipelineState = m_copyTextureToTexturePipeline->GetRenderPipelineState(); - SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, srcTextureMtl->GetTexture(), GET_HELPER_TEXTURE_BINDING(0)); - SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, dstTextureMtl->GetTexture(), GET_HELPER_TEXTURE_BINDING(1)); - renderCommandEncoder->setVertexBytes(¶ms, 
sizeof(params), GET_HELPER_BUFFER_BINDING(0)); + SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, srcTextureMtl->GetRGBAView(), GET_HELPER_TEXTURE_BINDING(0)); + SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, dstTextureMtl->GetRGBAView(), GET_HELPER_TEXTURE_BINDING(1)); + renderCommandEncoder->setVertexBytes(&effectiveCopyWidth, sizeof(effectiveCopyWidth), GET_HELPER_BUFFER_BINDING(0)); m_state.m_encoderState.m_buffers[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(0)] = {nullptr}; renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } else { - // TODO: do the copy in a compute shader - debug_printf("surfaceCopy_copySurfaceWithFormatConversion: no active render command encoder, skipping copy\n"); + // TODO: uncomment + /* + bool copyingToWholeRegion = ((effectiveCopyWidth == dstTextureMtl->GetMipWidth(dstMip) && effectiveCopyHeight == dstTextureMtl->GetMipHeight(dstMip))); + + auto renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(dstTextureMtl->GetTexture()); + // We don't care about previous contents if we are about to overwrite the whole region + colorAttachment->setLoadAction(copyingToWholeRegion ? MTL::LoadActionDontCare : MTL::LoadActionLoad); + colorAttachment->setStoreAction(MTL::StoreActionStore); + colorAttachment->setSlice(dstSlice); + colorAttachment->setLevel(dstMip); + + auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor); + + auto pipeline = (srcTextureMtl->IsDepth() ? 
m_copyTextureToColorPipeline : m_copyTextureToDepthPipeline); + renderCommandEncoder->setRenderPipelineState(pipeline); + + renderCommandEncoder->setFragmentTexture(srcTextureMtl->GetTexture(), GET_HELPER_TEXTURE_BINDING(0)); + renderCommandEncoder->setFragmentBytes(&effectiveCopyWidth, offsetof(effectiveCopyWidth), GET_HELPER_BUFFER_BINDING(0)); + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); + + EndEncoding(); + */ + + debug_printf("surface copy with no render command encoder, skipping copy\n"); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index 7f8f3dc7f..c7a625466 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -31,17 +31,9 @@ fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex return tex.sample(samplr, in.texCoord); } -struct CopyParams { - uint width; - uint srcMip; - uint srcSlice; - uint dstMip; - uint dstSlice; -}; - -vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(GET_TEXTURE_BINDING(0))]], texture2d_array dst [[texture(GET_TEXTURE_BINDING(1))]], constant CopyParams& params [[buffer(GET_BUFFER_BINDING(0))]]) { - uint2 coord = uint2(vid % params.width, vid / params.width); - return dst.write(float4(src.read(coord, params.srcSlice, params.srcMip).r, 0.0, 0.0, 0.0), coord, params.dstSlice, params.dstMip); +vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d src [[texture(GET_TEXTURE_BINDING(0))]], texture2d dst [[texture(GET_TEXTURE_BINDING(1))]], constant uint32_t& width [[buffer(GET_BUFFER_BINDING(0))]]) { + uint2 coord = uint2(vid % width, vid / width); + return dst.write(float4(src.read(coord).r, 0.0, 0.0, 0.0), coord); } struct RestrideParams { From 1b44269d70a315770ff4ec78ccfd6701141b00bd Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 28 Aug 
2024 13:24:01 +0200 Subject: [PATCH 126/368] simplify flush --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f917e889a..eb009ecd4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" +#include "Common/precompiled.h" #include "Foundation/NSTypes.hpp" #include "HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" @@ -311,14 +312,15 @@ bool MetalRenderer::BeginFrame(bool mainWindow) void MetalRenderer::Flush(bool waitIdle) { - if (m_recordedDrawcalls > 0) + if (m_recordedDrawcalls > 0 || waitIdle) CommitCommandBuffer(); if (waitIdle) { for (auto commandBuffer : m_commandBuffers) { - if (commandBuffer.m_commited) - WaitForCommandBufferCompletion(commandBuffer.m_commandBuffer); + cemu_assert_debug(commandBuffer.m_commited); + + WaitForCommandBufferCompletion(commandBuffer.m_commandBuffer); } } } From be76dadb901b411aee6bb8c2a9b0b165b4940a01 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 28 Aug 2024 13:43:06 +0200 Subject: [PATCH 127/368] do surface copy properly --- .../HW/Latte/Renderer/Metal/MetalLayerHandle.cpp | 1 + .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 15 ++++++++------- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 2 +- .../HW/Latte/Renderer/Metal/UtilityShaderSource.h | 4 +++- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp index f4d4490e1..3a3a6b547 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp @@ -10,6 +10,7 @@ 
MetalLayerHandle::MetalLayerHandle(MTL::Device* device, const Vector2i& size) m_layer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle, m_layerScaleX, m_layerScaleY); m_layer->setDevice(device); m_layer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); + m_layer->setFramebufferOnly(true); } MetalLayerHandle::~MetalLayerHandle() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index eb009ecd4..95b72c40c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -159,7 +159,7 @@ MetalRenderer::MetalRenderer() auto copyTextureToColorPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); // Hybrid pipelines - m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture"); + //m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture"); m_restrideBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexRestrideBuffer"); utilityLibrary->release(); @@ -168,7 +168,7 @@ MetalRenderer::MetalRenderer() MetalRenderer::~MetalRenderer() { - delete m_copyTextureToTexturePipeline; + //delete m_copyTextureToTexturePipeline; delete m_restrideBufferPipeline; m_presentPipelineLinear->release(); @@ -686,9 +686,12 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so sint32 effectiveCopyWidth = width; sint32 effectiveCopyHeight = height; LatteTexture_scaleToEffectiveSize(sourceTexture, &effectiveCopyWidth, &effectiveCopyHeight, 0); - sint32 sourceEffectiveWidth, sourceEffectiveHeight; - sourceTexture->GetEffectiveSize(sourceEffectiveWidth, sourceEffectiveHeight, srcMip); + //sint32 sourceEffectiveWidth, sourceEffectiveHeight; + //sourceTexture->GetEffectiveSize(sourceEffectiveWidth, sourceEffectiveHeight, srcMip); + 
texture_copyImageSubData(sourceTexture, srcMip, 0, 0, srcSlice, destinationTexture, dstMip, 0, 0, dstSlice, effectiveCopyWidth, effectiveCopyHeight, 0); + + /* sint32 texSrcMip = srcMip; sint32 texSrcSlice = srcSlice; sint32 texDstMip = dstMip; @@ -729,8 +732,6 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so } else { - // TODO: uncomment - /* bool copyingToWholeRegion = ((effectiveCopyWidth == dstTextureMtl->GetMipWidth(dstMip) && effectiveCopyHeight == dstTextureMtl->GetMipHeight(dstMip))); auto renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); @@ -753,10 +754,10 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); EndEncoding(); - */ debug_printf("surface copy with no render command encoder, skipping copy\n"); } + */ } void MetalRenderer::bufferCache_init(const sint32 bufferSize) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 8fe3a8d90..2ada50c3f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -440,7 +440,7 @@ class MetalRenderer : public Renderer MTL::RenderPipelineState* m_presentPipelineSRGB; // Hybrid pipelines - class MetalHybridComputePipeline* m_copyTextureToTexturePipeline; + //class MetalHybridComputePipeline* m_copyTextureToTexturePipeline; class MetalHybridComputePipeline* m_restrideBufferPipeline; // Resources diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index c7a625466..cf355f611 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -31,17 +31,19 @@ fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex return tex.sample(samplr, in.texCoord); } +/* vertex 
void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d src [[texture(GET_TEXTURE_BINDING(0))]], texture2d dst [[texture(GET_TEXTURE_BINDING(1))]], constant uint32_t& width [[buffer(GET_BUFFER_BINDING(0))]]) { uint2 coord = uint2(vid % width, vid / width); return dst.write(float4(src.read(coord).r, 0.0, 0.0, 0.0), coord); } +*/ struct RestrideParams { uint oldStride; uint newStride; }; -/* TODO: use uint32? Since that would require less iterations */ +// TODO: use uint32? Since that would require less iterations vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]], constant RestrideParams& params [[buffer(GET_BUFFER_BINDING(2))]]) { for (uint32_t i = 0; i < params.oldStride; i++) { dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; From 35740c5c8ebb16c1dc95cf4c209abc11ed6c5fdd Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 28 Aug 2024 15:26:42 +0200 Subject: [PATCH 128/368] always do texture copies on gpu --- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 36 +++++++++---------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 8 ++--- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index c588a21e4..4510571ab 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -10,7 +10,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer), m_format(format), m_isDepth(isDepth) { MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); - desc->setStorageMode(m_mtlr->GetOptimalTextureStorageMode()); + 
desc->setStorageMode(MTL::StorageModePrivate); desc->setCpuCacheMode(MTL::CPUCacheModeWriteCombined); sint32 effectiveBaseWidth = width; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 95b72c40c..2e541c2b8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -45,7 +45,7 @@ MetalRenderer::MetalRenderer() m_commandQueue = m_device->newCommandQueue(); // Feature support - m_isAppleGPU = false;//m_device->supportsFamily(MTL::GPUFamilyApple1); + m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); m_hasUnifiedMemory = m_device->hasUnifiedMemory(); m_supportsMetal3 = m_device->supportsFamily(MTL::GPUFamilyMetal3); m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize(); @@ -519,26 +519,26 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->GetFormat(), textureMtl->IsDepth(), width); // No need to set bytesPerImage for 3D textures, since we always load just one slice //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->IsDepth(), height, bytesPerRow); - if (m_isAppleGPU) - { - textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); - } - else - { - auto blitCommandEncoder = GetBlitCommandEncoder(); + //if (m_isAppleGPU) + //{ + // textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); + //} + //else + //{ + auto blitCommandEncoder = GetBlitCommandEncoder(); - // Allocate a temporary buffer - auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - auto allocation = bufferAllocator.GetBufferAllocation(compressedImageSize); - auto buffer = bufferAllocator.GetBuffer(allocation.bufferIndex); + // Allocate a 
temporary buffer + auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); + auto allocation = bufferAllocator.GetBufferAllocation(compressedImageSize); + auto buffer = bufferAllocator.GetBuffer(allocation.bufferIndex); - // Copy the data to the temporary buffer - memcpy(allocation.data, pixelData, compressedImageSize); - //buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); + // Copy the data to the temporary buffer + memcpy(allocation.data, pixelData, compressedImageSize); + //buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); - // Copy the data from the temporary buffer to the texture - blitCommandEncoder->copyFromBuffer(buffer, allocation.offset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); - } + // Copy the data from the temporary buffer to the texture + blitCommandEncoder->copyFromBuffer(buffer, allocation.offset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); + //} } void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 2ada50c3f..586eb79ab 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -397,10 +397,10 @@ class MetalRenderer : public Renderer return m_pixelFormatSupport; } - MTL::StorageMode GetOptimalTextureStorageMode() const - { - return (m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModePrivate); - } + //MTL::StorageMode GetOptimalTextureStorageMode() const + //{ + // return (m_isAppleGPU ? 
MTL::StorageModeShared : MTL::StorageModePrivate); + //} MTL::ResourceOptions GetOptimalBufferStorageMode() const { From a6c8d834364d92c49bd393e50c8cfb9099e08e82 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 28 Aug 2024 16:42:55 +0200 Subject: [PATCH 129/368] release unused buffers --- .../Renderer/Metal/MetalBufferAllocator.h | 211 +++++++++--------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 7 +- 2 files changed, 107 insertions(+), 111 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 702278ca6..2e264b53f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -6,15 +6,23 @@ struct MetalBufferRange { - uint32 bufferIndex; size_t offset; size_t size; }; +constexpr size_t BASE_ALLOCATION_SIZE = 8 * 1024 * 1024; + template class MetalBufferAllocator { public: + struct Buffer + { + MTL::Buffer* m_buffer; + std::vector m_freeRanges; + BufferT m_data; + }; + MetalBufferAllocator(class MetalRenderer* metalRenderer, MTL::ResourceOptions storageMode) : m_mtlr{metalRenderer} { m_isCPUAccessible = (storageMode == MTL::ResourceStorageModeShared) || (storageMode == MTL::ResourceStorageModeManaged); @@ -33,9 +41,8 @@ class MetalBufferAllocator void ResetAllocations() { - m_freeBufferRanges.clear(); - for (uint32_t i = 0; i < m_buffers.size(); i++) - m_freeBufferRanges.push_back({i, 0, m_buffers[i].m_buffer->length()}); + for (uint32 i = 0; i < m_buffers.size(); i++) + FreeBuffer(i); } MTL::Buffer* GetBuffer(uint32 bufferIndex) @@ -49,63 +56,62 @@ class MetalBufferAllocator size = Align(size, 128); // First, try to find a free range - for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) + for (uint32 i = 0; i < m_buffers.size(); i++) { - auto& range = m_freeBufferRanges[i]; - if (size <= range.size) + auto& buffer = m_buffers[i]; + for (uint32 j = 0; j < buffer.m_freeRanges.size(); j++) { - 
auto& buffer = m_buffers[range.bufferIndex]; + auto& range = buffer.m_freeRanges[j]; + if (size <= range.size) + { + MetalBufferAllocation allocation; + allocation.bufferIndex = i; + allocation.offset = range.offset; + allocation.size = size; + allocation.data = (m_isCPUAccessible ? (uint8*)buffer.m_buffer->contents() + range.offset : nullptr); - MetalBufferAllocation allocation; - allocation.bufferIndex = range.bufferIndex; - allocation.offset = range.offset; - allocation.size = size; - allocation.data = (m_isCPUAccessible ? (uint8*)buffer.m_buffer->contents() + range.offset : nullptr); + range.offset += size; + range.size -= size; - range.offset += size; - range.size -= size; + if (range.size == 0) + { + buffer.m_freeRanges.erase(buffer.m_freeRanges.begin() + j); + } - if (range.size == 0) - { - m_freeBufferRanges.erase(m_freeBufferRanges.begin() + i); + return allocation; } - - return allocation; } } // If no free range was found, allocate a new buffer - m_allocationSize = std::max(m_allocationSize, size); - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, m_options); + size_t allocationSize = BASE_ALLOCATION_SIZE * (1u << m_buffers.size()); + allocationSize = std::max(allocationSize, size); + MTL::Buffer* mtlBuffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options); #ifdef CEMU_DEBUG_ASSERT - buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); + mtlBuffer->setLabel(GetLabel("Buffer from buffer allocator", mtlBuffer)); #endif MetalBufferAllocation allocation; allocation.bufferIndex = m_buffers.size(); allocation.offset = 0; allocation.size = size; - allocation.data = (m_isCPUAccessible ? buffer->contents() : nullptr); + allocation.data = (m_isCPUAccessible ? 
mtlBuffer->contents() : nullptr); - m_buffers.push_back({buffer}); + m_buffers.push_back({mtlBuffer}); + auto& buffer = m_buffers.back(); // If the buffer is larger than the requested size, add the remaining space to the free buffer ranges - if (size < m_allocationSize) + if (size < allocationSize) { MetalBufferRange range; - range.bufferIndex = allocation.bufferIndex; range.offset = size; - range.size = m_allocationSize - size; + range.size = allocationSize - size; - m_freeBufferRanges.push_back(range); + buffer.m_freeRanges.push_back(range); } // Debug - m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory += m_allocationSize; - - // Increase the allocation size for the next buffer - if (m_allocationSize < 128 * 1024 * 1024) - m_allocationSize *= 2; + m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory += allocationSize; return allocation; } @@ -113,24 +119,24 @@ class MetalBufferAllocator void FreeAllocation(MetalBufferAllocation& allocation) { MetalBufferRange range; - range.bufferIndex = allocation.bufferIndex; range.offset = allocation.offset; range.size = allocation.size; allocation.offset = INVALID_OFFSET; // Find the correct position to insert the free range - for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) + auto& buffer = m_buffers[allocation.bufferIndex]; + for (uint32 i = 0; i < buffer.m_freeRanges.size(); i++) { - auto& freeRange = m_freeBufferRanges[i]; - if (freeRange.bufferIndex == range.bufferIndex && freeRange.offset + freeRange.size == range.offset) + auto& freeRange = buffer.m_freeRanges[i]; + if (freeRange.offset + freeRange.size == range.offset) { freeRange.size += range.size; return; } } - m_freeBufferRanges.push_back(range); + buffer.m_freeRanges.push_back(range); } protected: @@ -138,22 +144,22 @@ class MetalBufferAllocator bool m_isCPUAccessible; MTL::ResourceOptions m_options; - size_t m_allocationSize = 8 * 1024 * 1024; + std::vector m_buffers; - std::vector m_buffers; - std::vector m_freeBufferRanges; -}; - -struct 
MetalBuffer -{ - MTL::Buffer* m_buffer; + void FreeBuffer(uint32 bufferIndex) + { + auto& buffer = m_buffers[bufferIndex]; + buffer.m_freeRanges.clear(); + buffer.m_freeRanges.reserve(1); + buffer.m_freeRanges.push_back({0, m_buffers[bufferIndex].m_buffer->length()}); + } }; -typedef MetalBufferAllocator MetalDefaultBufferAllocator; +struct Empty {}; +typedef MetalBufferAllocator MetalDefaultBufferAllocator; struct MetalSyncedBuffer { - MTL::Buffer* m_buffer; std::vector m_commandBuffers; uint32 m_lock = 0; @@ -163,7 +169,7 @@ struct MetalSyncedBuffer } }; -constexpr uint16 MAX_COMMAND_BUFFER_FRAMES = 8; +constexpr uint16 BUFFER_RELEASE_FRAME_TRESHOLD = 1024; class MetalTemporaryBufferAllocator : public MetalBufferAllocator { @@ -172,65 +178,72 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorsecond++; - - if (it->second > MAX_COMMAND_BUFFER_FRAMES) + auto& backBuffer = m_buffers.back(); + if (backBuffer.m_data.m_commandBuffers.empty()) { - debug_printf("command buffer %p remained unfinished for more than %u frames\n", it->first, MAX_COMMAND_BUFFER_FRAMES); + // Release the back buffer if it hasn't been accessed for a while + if (m_framesSinceBackBufferAccess >= BUFFER_RELEASE_FRAME_TRESHOLD) + { + // Debug + m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory -= backBuffer.m_buffer->length(); - // Pretend like the command buffer has finished - CommandBufferFinished(it->first, false); + backBuffer.m_buffer->release(); + m_buffers.pop_back(); - it = m_commandBuffersFrames.erase(it); + m_framesSinceBackBufferAccess = 0; + } + else + { + m_framesSinceBackBufferAccess++; + } } else { - it++; + m_framesSinceBackBufferAccess = 0; } } - */ } void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) { m_activeCommandBuffer = commandBuffer; - - //if (commandBuffer) - // m_commandBuffersFrames[commandBuffer] = 0; } void CheckForCompletedCommandBuffers(/*MTL::CommandBuffer* commandBuffer, bool erase = true*/) @@ -238,24 +251,24 @@ class 
MetalTemporaryBufferAllocator : public MetalBufferAllocatorCommandBufferCompleted(buffer.m_commandBuffers[j])) + if (m_mtlr->CommandBufferCompleted(buffer.m_data.m_commandBuffers[j])) { - if (buffer.m_commandBuffers.size() == 1) + if (buffer.m_data.m_commandBuffers.size() == 1) { - if (!buffer.IsLocked()) + if (!buffer.m_data.IsLocked()) { // All command buffers using it have finished execution, we can use it again FreeBuffer(i); } - buffer.m_commandBuffers.clear(); + buffer.m_data.m_commandBuffers.clear(); break; } else { - buffer.m_commandBuffers.erase(buffer.m_commandBuffers.begin() + j); + buffer.m_data.m_commandBuffers.erase(buffer.m_data.m_commandBuffers.begin() + j); j--; } } @@ -271,8 +284,8 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator size: %lu, command buffers: %zu\n", buffer.m_buffer, buffer.m_buffer->length(), buffer.m_commandBuffers.size()); + debug_printf(" %p -> size: %lu, command buffers: %zu\n", buffer.m_buffer, buffer.m_buffer->length(), buffer.m_data.m_commandBuffers.size()); uint32 same = 0; uint32 completed = 0; - for (uint32 i = 0; i < buffer.m_commandBuffers.size(); i++) + for (uint32 i = 0; i < buffer.m_data.m_commandBuffers.size(); i++) { - if (m_mtlr->CommandBufferCompleted(buffer.m_commandBuffers[i])) + if (m_mtlr->CommandBufferCompleted(buffer.m_data.m_commandBuffers[i])) completed++; - for (uint32 j = 0; j < buffer.m_commandBuffers.size(); j++) + for (uint32 j = 0; j < buffer.m_data.m_commandBuffers.size(); j++) { - if (i != j && buffer.m_commandBuffers[i] == buffer.m_commandBuffers[j]) + if (i != j && buffer.m_data.m_commandBuffers[i] == buffer.m_data.m_commandBuffers[j]) same++; } } debug_printf(" same: %u\n", same); debug_printf(" completed: %u\n", completed); - } - debug_printf("FREE RANGES:\n"); - for (auto& range : m_freeBufferRanges) - { - debug_printf(" %u -> offset: %zu, size: %zu\n", range.bufferIndex, range.offset, range.size); + debug_printf(" FREE RANGES:\n"); + for (auto& range : 
buffer.m_freeRanges) + { + debug_printf(" offset: %zu, size: %zu\n", range.offset, range.size); + } } } */ @@ -332,20 +346,5 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator m_commandBuffersFrames; - - void FreeBuffer(uint32 bufferIndex) - { - // First remove any free ranges that use this buffer - for (uint32 k = 0; k < m_freeBufferRanges.size(); k++) - { - if (m_freeBufferRanges[k].bufferIndex == bufferIndex) - { - m_freeBufferRanges.erase(m_freeBufferRanges.begin() + k); - k--; - } - } - - m_freeBufferRanges.push_back({bufferIndex, 0, m_buffers[bufferIndex].m_buffer->length()}); - } + uint16 m_framesSinceBackBufferAccess = 0; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 2e541c2b8..b604cdd50 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -264,10 +264,7 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) m_memoryManager->GetFramePersistentBufferAllocator().ResetAllocations(); // Unlock all temporary buffers - m_memoryManager->GetTemporaryBufferAllocator().UnlockAllBuffers(); - - // Check for completed command buffers - m_memoryManager->GetTemporaryBufferAllocator().CheckForCompletedCommandBuffers(); + m_memoryManager->GetTemporaryBufferAllocator().EndFrame(); } // TODO: use `shader` for drawing @@ -504,7 +501,7 @@ void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIn } } -// TODO: do a GPU blit even on unified memory? That would mean we could use private storage mode for all textures +// TODO: do a cpu copy on Apple Silicon? 
void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { auto textureMtl = (LatteTextureMtl*)hostTexture; From bbed00751f459241852d95b98cc544b7f79d9d63 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 28 Aug 2024 20:09:45 +0200 Subject: [PATCH 130/368] hack: don't attempt to compile shaders with errors --- src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 6 ++++++ src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 8 ++++++++ src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h | 3 +++ 3 files changed, 17 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index ad63a041a..ade541bac 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -350,6 +350,12 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte auto mtlVertexShader = static_cast(vertexShader->shader); auto mtlPixelShader = static_cast(pixelShader->shader); mtlVertexShader->CompileVertexFunction(); + // HACK + if (!mtlVertexShader->GetFunction()) + { + debug_printf("no vertex function, skipping draw\n"); + return nullptr; + } mtlPixelShader->CompileFragmentFunction(activeFBO); // Render pipeline state diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 4c968d1cb..6ad72d879 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -218,12 +218,20 @@ void RendererShaderMtl::Compile(const std::string& mslCode) if (m_function) m_function->release(); + // HACK + if (m_hasError) + return; + NS::Error* error = nullptr; MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), nullptr, &error); if (error) 
{ printf("failed to create library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); error->release(); + + // HACK + m_hasError = true; + return; } m_function = library->newFunction(ToNSString("main0")); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index e21db55ed..ca5a0ff97 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -63,5 +63,8 @@ class RendererShaderMtl : public RendererShader std::vector m_binary; std::string m_mslCode; + // HACK + bool m_hasError = false; + void Compile(const std::string& mslCode); }; From 64610c161fe08dce07896f654f7a4b9ee2b526a9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 28 Aug 2024 20:14:56 +0200 Subject: [PATCH 131/368] limit buffer allocation size --- src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 2e264b53f..b8a3c7607 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -10,7 +10,8 @@ struct MetalBufferRange size_t size; }; -constexpr size_t BASE_ALLOCATION_SIZE = 8 * 1024 * 1024; +constexpr size_t BASE_ALLOCATION_SIZE = 8 * 1024 * 1024; // 8 MB +constexpr size_t MAX_ALLOCATION_SIZE = 64 * 1024 * 1024; // 64 MB template class MetalBufferAllocator @@ -85,6 +86,7 @@ class MetalBufferAllocator // If no free range was found, allocate a new buffer size_t allocationSize = BASE_ALLOCATION_SIZE * (1u << m_buffers.size()); + allocationSize = std::min(allocationSize, MAX_ALLOCATION_SIZE); // Limit the allocation size allocationSize = std::max(allocationSize, size); MTL::Buffer* mtlBuffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options); #ifdef 
CEMU_DEBUG_ASSERT From b011d756ee60987deb84808c390a392271042c1e Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 29 Aug 2024 08:47:22 +0200 Subject: [PATCH 132/368] adjust texture swizzle --- .../Renderer/Metal/LatteTextureViewMtl.cpp | 55 ++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 0607370bd..950cadc1d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -3,6 +3,56 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +uint32 LatteTextureMtl_AdjustTextureCompSel(Latte::E_GX2SURFFMT format, uint32 compSel) +{ + switch (format) + { + case Latte::E_GX2SURFFMT::R8_UNORM: // R8 is replicated on all channels (while OpenGL would return 1.0 for BGA instead) + case Latte::E_GX2SURFFMT::R8_SNORM: // probably the same as _UNORM, but needs testing + if (compSel >= 1 && compSel <= 3) + compSel = 0; + break; + case Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM: // order of components is reversed (RGBA -> ABGR) + if (compSel >= 0 && compSel <= 3) + compSel = 3 - compSel; + break; + case Latte::E_GX2SURFFMT::BC4_UNORM: + case Latte::E_GX2SURFFMT::BC4_SNORM: + if (compSel >= 1 && compSel <= 3) + compSel = 0; + break; + case Latte::E_GX2SURFFMT::BC5_UNORM: + case Latte::E_GX2SURFFMT::BC5_SNORM: + // RG maps to RG + // B maps to ? + // A maps to G (guessed) + if (compSel == 3) + compSel = 1; // read Alpha as Green + break; + case Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM: + // reverse components (Wii U: ABGR, OpenGL: RGBA) + // used in Resident Evil Revelations + if (compSel >= 0 && compSel <= 3) + compSel = 3 - compSel; + break; + case Latte::E_GX2SURFFMT::X24_G8_UINT: + // map everything to alpha? 
+ if (compSel >= 0 && compSel <= 3) + compSel = 3; + break; + case Latte::E_GX2SURFFMT::R4_G4_UNORM: + // red and green swapped + if (compSel == 0) + compSel = 1; + else if (compSel == 1) + compSel = 0; + break; + default: + break; + } + return compSel; +} + LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_baseTexture(texture) { @@ -74,7 +124,10 @@ MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) uint32 compSelG = (gpuSamplerSwizzle >> 19) & 0x7; uint32 compSelB = (gpuSamplerSwizzle >> 22) & 0x7; uint32 compSelA = (gpuSamplerSwizzle >> 25) & 0x7; - // TODO: adjust + compSelR = LatteTextureMtl_AdjustTextureCompSel(format, compSelR); + compSelG = LatteTextureMtl_AdjustTextureCompSel(format, compSelG); + compSelB = LatteTextureMtl_AdjustTextureCompSel(format, compSelB); + compSelA = LatteTextureMtl_AdjustTextureCompSel(format, compSelA); MTL::TextureType textureType; switch (dim) From 7a28985454a4e2620dbad595bd8fd0468e4cb0b2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 29 Aug 2024 09:23:34 +0200 Subject: [PATCH 133/368] properly implement 0 stride vertex buffers --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 51 +++++++++++++++++++ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 2 + .../Renderer/Metal/MetalPipelineCache.cpp | 33 +++++++----- 3 files changed, 74 insertions(+), 12 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 51885e518..6f54272cd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -368,6 +368,57 @@ MTL::VertexFormat GetMtlVertexFormat(uint8 format) } } +uint32 GetMtlVertexFormatSize(uint8 format) +{ + 
switch (format) + { + case FMT_32_32_32_32_FLOAT: + return 16; + case FMT_32_32_32_FLOAT: + return 12; + case FMT_32_32_FLOAT: + return 8; + case FMT_32_FLOAT: + return 4; + case FMT_8_8_8_8: + return 4; + case FMT_8_8_8: + return 3; + case FMT_8_8: + return 2; + case FMT_8: + return 1; + case FMT_32_32_32_32: + return 16; + case FMT_32_32_32: + return 12; + case FMT_32_32: + return 8; + case FMT_32: + return 4; + case FMT_16_16_16_16: + return 8; + case FMT_16_16_16: + return 6; + case FMT_16_16: + return 4; + case FMT_16: + return 2; + case FMT_16_16_16_16_FLOAT: + return 8; + case FMT_16_16_16_FLOAT: + return 6; + case FMT_16_16_FLOAT: + return 4; + case FMT_16_FLOAT: + return 2; + case FMT_2_10_10_10: + return 4; + default: + return 0; + } +} + MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType) { switch (indexType) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index cc0c5e02d..2c805527f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -42,6 +42,8 @@ MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode); MTL::VertexFormat GetMtlVertexFormat(uint8 format); +uint32 GetMtlVertexFormatSize(uint8 format); + MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType); MTL::BlendOperation GetMtlBlendOp(Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC combineFunc); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index ade541bac..1d0684260 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -298,6 +298,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte { std::optional fetchType; + uint32 minBufferStride = 0; for (sint32 j = 0; j < bufferGroup.attribCount; ++j) { auto& attr = bufferGroup.attrib[j]; @@ -311,6 +312,8 @@ 
MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); attribute->setFormat(GetMtlVertexFormat(attr.format)); + minBufferStride = std::max(minBufferStride, attr.offset + GetMtlVertexFormatSize(attr.format)); + if (fetchType.has_value()) cemu_assert_debug(fetchType == attr.fetchType); else @@ -327,24 +330,30 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; bufferStride = Align(bufferStride, 4); - // HACK + auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); if (bufferStride == 0) { - debug_printf("vertex buffer %u has a vertex stride of 0 bytes, using 4 bytes instead\n", bufferIndex); - bufferStride = 4; - } + // Buffer stride cannot be zero, let's use the minimum stride + bufferStride = minBufferStride; + debug_printf("vertex buffer %u has a vertex stride of 0 bytes, using %u bytes instead\n", bufferIndex, bufferStride); - auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); - layout->setStride(bufferStride); - if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) - layout->setStepFunction(MTL::VertexStepFunctionPerVertex); - else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) - layout->setStepFunction(MTL::VertexStepFunctionPerInstance); + // Additionally, constant vertex function must be used + layout->setStepFunction(MTL::VertexStepFunctionConstant); + layout->setStepRate(0); + } else { - debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); - cemu_assert(false); + if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerVertex); + else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + 
layout->setStepFunction(MTL::VertexStepFunctionPerInstance); + else + { + debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); + cemu_assert(false); + } } + layout->setStride(bufferStride); } auto mtlVertexShader = static_cast(vertexShader->shader); From 074f9f6f3e04c0d49a97b74c1be935ae1a9541c7 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 29 Aug 2024 09:28:38 +0200 Subject: [PATCH 134/368] align buffer stride after setting to min stride --- src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 1d0684260..6a7465151 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -328,7 +328,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte uint32 bufferIndex = bufferGroup.attributeBufferIndex; uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - bufferStride = Align(bufferStride, 4); auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); if (bufferStride == 0) @@ -353,6 +352,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte cemu_assert(false); } } + bufferStride = Align(bufferStride, 4); layout->setStride(bufferStride); } From ccd72bfe45472af77d2e86f20c0cd2cd50bf47eb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 29 Aug 2024 14:03:48 +0200 Subject: [PATCH 135/368] implement gather sampling --- .../LatteDecompilerEmitMSL.cpp | 12 ++++++++---- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 ++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 8aa2ebeea..7b8755bfe 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2186,7 +2186,6 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; - bool isCompare = shaderContext->shader->textureUsesDepthCompare[texInstruction->textureFetch.textureIndex]; char tempBuffer0[32]; char tempBuffer1[32]; @@ -2246,6 +2245,9 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->add(" = ("); } + bool isCompare = shaderContext->shader->textureUsesDepthCompare[texInstruction->textureFetch.textureIndex]; + bool isGather = (texOpcode == GPU7_TEX_INST_FETCH4); + bool unnormalizationHandled = false; bool useTexelCoordinates = false; @@ -2267,7 +2269,6 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if ((texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || texOpcode == GPU7_TEX_INST_LD) { - // texture is likely a RECT if (hasOffset) cemu_assert_unimplemented(); src->add("read("); @@ -2276,7 +2277,10 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } else { - src->add("sample"); + if (isGather) + src->add("gather"); + else + src->add("sample"); if (isCompare) src->add("_compare"); src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); @@ -2531,7 +2535,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if( texInstruction->dstSel[f] < 4 ) { uint8 elemIndex = texInstruction->dstSel[f]; - if (texOpcode == GPU7_TEX_INST_FETCH4) + if (isGather) { // 's 
textureGather() and GPU7's FETCH4 instruction have a different order of elements // xyzw: top-left, top-right, bottom-right, bottom-left diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b604cdd50..f570d03b6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -968,6 +968,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Blend color float* blendColorConstant = (float*)LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; + + // TODO: only set when changed renderCommandEncoder->setBlendColor(blendColorConstant[0], blendColorConstant[1], blendColorConstant[2], blendColorConstant[3]); // polygon control From a4ff855585f868c85e41a420451083df32ccdb37 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 29 Aug 2024 15:32:32 +0200 Subject: [PATCH 136/368] do buffer copying in a void vertex function --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 31 +++++++++++++++++-- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + .../Renderer/Metal/UtilityShaderSource.h | 4 +++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f570d03b6..83759fe9c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -24,6 +24,7 @@ #include "HW/Latte/Renderer/Renderer.h" #include "Metal/MTLCommandBuffer.hpp" #include "Metal/MTLDevice.hpp" +#include "Metal/MTLRenderCommandEncoder.hpp" #include "Metal/MTLRenderPass.hpp" #include "Metal/MTLRenderPipeline.hpp" #include "imgui.h" @@ -95,7 +96,8 @@ MetalRenderer::MetalRenderer() #endif // Transform feedback - m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize() * 32, MTL::ResourceStorageModePrivate); + // HACK: using just LatteStreamout_GetRingBufferSize will 
cause page faults + m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize() * 4, MTL::ResourceStorageModePrivate); #ifdef CEMU_DEBUG_ASSERT m_xfbRingBuffer->setLabel(GetLabel("Transform feedback buffer", m_xfbRingBuffer)); #endif @@ -159,6 +161,7 @@ MetalRenderer::MetalRenderer() auto copyTextureToColorPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); // Hybrid pipelines + m_copyBufferToBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); //m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture"); m_restrideBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexRestrideBuffer"); utilityLibrary->release(); @@ -168,6 +171,7 @@ MetalRenderer::MetalRenderer() MetalRenderer::~MetalRenderer() { + delete m_copyBufferToBufferPipeline; //delete m_copyTextureToTexturePipeline; delete m_restrideBufferPipeline; @@ -774,9 +778,30 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { - auto blitCommandEncoder = GetBlitCommandEncoder(); + if (m_encoderType == MetalEncoderType::Render) + { + auto renderCommandEncoder = static_cast(m_commandEncoder); + + MTL::Resource* barrierBuffers[] = {m_xfbRingBuffer}; + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex | MTL::RenderStageFragment | MTL::RenderStageObject | MTL::RenderStageMesh, MTL::RenderStageVertex); + + renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState()); + m_state.m_encoderState.m_renderPipelineState = m_copyBufferToBufferPipeline->GetRenderPipelineState(); + + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, m_xfbRingBuffer, srcOffset, GET_HELPER_BUFFER_BINDING(0)); + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, 
m_memoryManager->GetBufferCache(), dstOffset, GET_HELPER_BUFFER_BINDING(1)); + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), NS::UInteger(size)); - blitCommandEncoder->copyFromBuffer(m_xfbRingBuffer, srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size); + barrierBuffers[0] = m_memoryManager->GetBufferCache(); + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex | MTL::RenderStageFragment | MTL::RenderStageObject | MTL::RenderStageMesh); + } + else + { + auto blitCommandEncoder = GetBlitCommandEncoder(); + + blitCommandEncoder->copyFromBuffer(m_xfbRingBuffer, srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size); + } } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 586eb79ab..f9acb7813 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -440,6 +440,7 @@ class MetalRenderer : public Renderer MTL::RenderPipelineState* m_presentPipelineSRGB; // Hybrid pipelines + class MetalHybridComputePipeline* m_copyBufferToBufferPipeline; //class MetalHybridComputePipeline* m_copyTextureToTexturePipeline; class MetalHybridComputePipeline* m_restrideBufferPipeline; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index cf355f611..95606df7b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -31,6 +31,10 @@ fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex return tex.sample(samplr, in.texCoord); } +vertex void vertexCopyBufferToBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]]) { + 
dst[vid] = src[vid]; +} + /* vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d src [[texture(GET_TEXTURE_BINDING(0))]], texture2d dst [[texture(GET_TEXTURE_BINDING(1))]], constant uint32_t& width [[buffer(GET_BUFFER_BINDING(0))]]) { uint2 coord = uint2(vid % width, vid / width); From 2403cf948a86f4a0daa0bf9f9780886d1ab69635 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 29 Aug 2024 17:44:20 +0200 Subject: [PATCH 137/368] always use blit commands for buffer copies on non-apple GPUs --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 83759fe9c..af251a5de 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -161,7 +161,8 @@ MetalRenderer::MetalRenderer() auto copyTextureToColorPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); // Hybrid pipelines - m_copyBufferToBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); + if (m_isAppleGPU) + m_copyBufferToBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); //m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture"); m_restrideBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexRestrideBuffer"); utilityLibrary->release(); @@ -171,7 +172,8 @@ MetalRenderer::MetalRenderer() MetalRenderer::~MetalRenderer() { - delete m_copyBufferToBufferPipeline; + if (m_isAppleGPU) + delete m_copyBufferToBufferPipeline; //delete m_copyTextureToTexturePipeline; delete m_restrideBufferPipeline; @@ -778,7 +780,8 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 
dstOffset, uint32 size) { - if (m_encoderType == MetalEncoderType::Render) + // Do the copy in a vertex shader on Apple GPUs + if (m_isAppleGPU && m_encoderType == MetalEncoderType::Render) { auto renderCommandEncoder = static_cast(m_commandEncoder); From 1cfb841b5fe58ce25ea39df14e1ea109725601e4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 29 Aug 2024 19:06:01 +0200 Subject: [PATCH 138/368] do buffer cache uploading on the GPU --- .../Renderer/Metal/MetalMemoryManager.cpp | 44 ++++----- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 93 ++++++++++--------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 2 + 3 files changed, 70 insertions(+), 69 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index f0c122179..ef55e96d2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -3,6 +3,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" #include "Common/precompiled.h" #include "HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLResource.hpp" MetalVertexBufferCache::~MetalVertexBufferCache() { @@ -115,13 +116,9 @@ void* MetalMemoryManager::GetTextureUploadBuffer(size_t size) void MetalMemoryManager::InitBufferCache(size_t size) { - if (m_bufferCache) - { - debug_printf("MetalMemoryManager::InitBufferCache: buffer cache already initialized\n"); - return; - } + cemu_assert_debug(!m_bufferCache); - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, m_mtlr->GetOptimalBufferStorageMode() | MTL::ResourceCPUCacheModeWriteCombined); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif @@ -129,20 +126,23 @@ void MetalMemoryManager::InitBufferCache(size_t size) void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t 
size) { - if (!m_bufferCache) - { - debug_printf("MetalMemoryManager::UploadToBufferCache: buffer cache not initialized\n"); - return; - } + cemu_assert_debug(m_bufferCache); + cemu_assert_debug((offset + size) <= m_bufferCache->length()); - if ((offset + size) > m_bufferCache->length()) - { - debug_printf("MetalMemoryManager::UploadToBufferCache: out of bounds access (offset: %zu, size: %zu, buffer size: %zu)\n", offset, size, m_bufferCache->length()); - } + auto allocation = m_tempBufferAllocator.GetBufferAllocation(size); + auto buffer = m_tempBufferAllocator.GetBufferOutsideOfCommandBuffer(allocation.bufferIndex); + memcpy((uint8*)buffer->contents() + allocation.offset, data, size); + + // Lock the buffer to make sure it's not deallocated before the copy is done + m_tempBufferAllocator.LockBuffer(allocation.bufferIndex); + + m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size); - memcpy((uint8*)m_bufferCache->contents() + offset, data, size); - if (!m_mtlr->HasUnifiedMemory()) - m_bufferCache->didModifyRange(NS::Range(offset, size)); + // Make sure the buffer has the right command buffer + m_tempBufferAllocator.GetBuffer(allocation.bufferIndex); // TODO: make a helper function for this + + // We can now safely unlock the buffer + m_tempBufferAllocator.UnlockBuffer(allocation.bufferIndex); // Notify vertex buffer cache about the change m_vertexBufferCache.MemoryRangeChanged(offset, size); @@ -150,11 +150,7 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { - if (!m_bufferCache) - { - debug_printf("MetalMemoryManager::CopyBufferCache: buffer cache not initialized\n"); - return; - } + cemu_assert_debug(m_bufferCache); - memcpy((uint8*)m_bufferCache->contents() + dstOffset, (uint8*)m_bufferCache->contents() + srcOffset, size); + m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size); 
} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index af251a5de..a755ba31b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -22,11 +22,6 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" -#include "Metal/MTLCommandBuffer.hpp" -#include "Metal/MTLDevice.hpp" -#include "Metal/MTLRenderCommandEncoder.hpp" -#include "Metal/MTLRenderPass.hpp" -#include "Metal/MTLRenderPipeline.hpp" #include "imgui.h" #include @@ -780,31 +775,7 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { - // Do the copy in a vertex shader on Apple GPUs - if (m_isAppleGPU && m_encoderType == MetalEncoderType::Render) - { - auto renderCommandEncoder = static_cast(m_commandEncoder); - - MTL::Resource* barrierBuffers[] = {m_xfbRingBuffer}; - renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex | MTL::RenderStageFragment | MTL::RenderStageObject | MTL::RenderStageMesh, MTL::RenderStageVertex); - - renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState()); - m_state.m_encoderState.m_renderPipelineState = m_copyBufferToBufferPipeline->GetRenderPipelineState(); - - SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, m_xfbRingBuffer, srcOffset, GET_HELPER_BUFFER_BINDING(0)); - SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, m_memoryManager->GetBufferCache(), dstOffset, GET_HELPER_BUFFER_BINDING(1)); - - renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), NS::UInteger(size)); - - barrierBuffers[0] = m_memoryManager->GetBufferCache(); - renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, 
MTL::RenderStageVertex | MTL::RenderStageFragment | MTL::RenderStageObject | MTL::RenderStageMesh); - } - else - { - auto blitCommandEncoder = GetBlitCommandEncoder(); - - blitCommandEncoder->copyFromBuffer(m_xfbRingBuffer, srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size); - } + CopyBufferToBuffer(m_xfbRingBuffer, srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) @@ -945,9 +916,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if (endRenderPass) EndEncoding(); - // Render pass - auto renderCommandEncoder = GetRenderCommandEncoder(); - // Primitive type const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); @@ -955,6 +923,22 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + // Index buffer + Renderer::INDEX_TYPE hostIndexType; + uint32 hostIndexCount; + uint32 indexMin = 0; + uint32 indexMax = 0; + uint32 indexBufferOffset = 0; + uint32 indexBufferIndex = 0; + LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); + + // synchronize vertex and uniform cache and update buffer bindings + // We need to call this before getting the render command encoder, since it can cause buffer copies + LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); + + // Render pass + auto renderCommandEncoder = GetRenderCommandEncoder(); + // Depth stencil state // Disable depth write when there is no depth attachment @@ -1120,18 +1104,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Resources - // 
Index buffer - Renderer::INDEX_TYPE hostIndexType; - uint32 hostIndexCount; - uint32 indexMin = 0; - uint32 indexMax = 0; - uint32 indexBufferOffset = 0; - uint32 indexBufferIndex = 0; - LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); - - // synchronize vertex and uniform cache and update buffer bindings - LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); - // Vertex buffers for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) { @@ -1851,6 +1823,37 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s EndEncoding(); } +void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size) +{ + // Do the copy in a vertex shader on Apple GPUs + if (m_isAppleGPU && m_encoderType == MetalEncoderType::Render) + { + auto renderCommandEncoder = static_cast(m_commandEncoder); + + MTL::Resource* barrierBuffers[] = {src}; + // TODO: let the caller choose the stages + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex | MTL::RenderStageFragment | MTL::RenderStageObject | MTL::RenderStageMesh, MTL::RenderStageVertex); + + renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState()); + m_state.m_encoderState.m_renderPipelineState = m_copyBufferToBufferPipeline->GetRenderPipelineState(); + + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, src, srcOffset, GET_HELPER_BUFFER_BINDING(0)); + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, dst, dstOffset, GET_HELPER_BUFFER_BINDING(1)); + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), NS::UInteger(size)); + + barrierBuffers[0] = dst; + // TODO: let the caller choose the stages + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, 
MTL::RenderStageVertex, MTL::RenderStageVertex | MTL::RenderStageFragment | MTL::RenderStageObject | MTL::RenderStageMesh); + } + else + { + auto blitCommandEncoder = GetBlitCommandEncoder(); + + blitCommandEncoder->copyFromBuffer(src, srcOffset, dst, dstOffset, size); + } +} + void MetalRenderer::SwapBuffer(bool mainWindow) { auto& layer = GetLayer(mainWindow); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index f9acb7813..e8c15133f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -376,6 +376,8 @@ class MetalRenderer : public Renderer void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); + void CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size); + // Getters bool IsAppleGPU() const { From 2f822d07c73595b6787389cad936afdfda230ace Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 30 Aug 2024 08:57:50 +0200 Subject: [PATCH 139/368] support game pad view in a separate window --- .../Latte/Renderer/Metal/MetalLayerHandle.cpp | 6 ++-- .../Latte/Renderer/Metal/MetalLayerHandle.h | 2 +- .../Renderer/Metal/MetalPipelineCache.cpp | 1 - .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 36 ++++++++----------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + src/gui/canvas/MetalCanvas.cpp | 12 +++---- 6 files changed, 24 insertions(+), 34 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp index 3a3a6b547..ad16b89ae 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp @@ -3,9 +3,9 @@ #include "gui/guiWrapper.h" -MetalLayerHandle::MetalLayerHandle(MTL::Device* device, const Vector2i& size) +MetalLayerHandle::MetalLayerHandle(MTL::Device* device, const Vector2i& 
size, bool mainWindow) { - const auto& windowInfo = gui_getWindowInfo().window_main; + const auto& windowInfo = (mainWindow ? gui_getWindowInfo().window_main : gui_getWindowInfo().window_pad); m_layer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle, m_layerScaleX, m_layerScaleY); m_layer->setDevice(device); @@ -32,7 +32,7 @@ bool MetalLayerHandle::AcquireDrawable() m_drawable = m_layer->nextDrawable(); if (!m_drawable) { - debug_printf("failed to acquire next drawable\n"); + debug_printf("layer %p failed to acquire next drawable\n", this); return false; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h index 39a7cd1f3..014d2d432 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h @@ -9,7 +9,7 @@ class MetalLayerHandle { public: MetalLayerHandle() = default; - MetalLayerHandle(MTL::Device* device, const Vector2i& size); + MetalLayerHandle(MTL::Device* device, const Vector2i& size, bool mainWindow); ~MetalLayerHandle(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 6a7465151..48a048ec9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -334,7 +334,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte { // Buffer stride cannot be zero, let's use the minimum stride bufferStride = minBufferStride; - debug_printf("vertex buffer %u has a vertex stride of 0 bytes, using %u bytes instead\n", bufferIndex, bufferStride); // Additionally, constant vertex function must be used layout->setStepFunction(MTL::VertexStepFunctionConstant); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a755ba31b..392ede6dc 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,14 +16,11 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" -#include "Common/precompiled.h" -#include "Foundation/NSTypes.hpp" #include "HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" #include "imgui.h" -#include #define IMGUI_IMPL_METAL_CPP #include "imgui/imgui_extension.h" @@ -195,10 +192,15 @@ MetalRenderer::~MetalRenderer() void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) { auto& layer = GetLayer(mainWindow); - layer = MetalLayerHandle(m_device, size); + layer = MetalLayerHandle(m_device, size, mainWindow); layer.GetLayer()->setPixelFormat(MTL::PixelFormatBGRA8Unorm); } +void MetalRenderer::ShutdownLayer(bool mainWindow) +{ + GetLayer(mainWindow) = MetalLayerHandle(); +} + void MetalRenderer::ResizeLayer(const Vector2i& size, bool mainWindow) { GetLayer(mainWindow).Resize(size); @@ -217,16 +219,14 @@ void MetalRenderer::Shutdown() CommitCommandBuffer(); } -// TODO: what should this do? bool MetalRenderer::IsPadWindowActive() { - return (GetLayer(false).GetDrawable() != nullptr); + return (GetLayer(false).GetLayer() != nullptr); } bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { usageInMB = m_device->currentAllocatedSize() / 1024 / 1024; - // TODO: get the total VRAM size? 
Though would be pretty useless on Apple Silicon totalInMB = m_recommendedMaxVRAMUsage / 1024 / 1024; return true; @@ -266,6 +266,9 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) // Unlock all temporary buffers m_memoryManager->GetTemporaryBufferAllocator().EndFrame(); + + // Debug + m_performanceMonitor.ResetPerFrameData(); } // TODO: use `shader` for drawing @@ -1546,6 +1549,8 @@ void MetalRenderer::CommitCommandBuffer() bool MetalRenderer::AcquireDrawable(bool mainWindow) { auto& layer = GetLayer(mainWindow); + if (!layer.GetLayer()) + return false; const bool latteBufferUsesSRGB = mainWindow ? LatteGPUState.tvBufferUsesSRGB : LatteGPUState.drcBufferUsesSRGB; if (latteBufferUsesSRGB != m_state.m_usesSRGB) @@ -1856,22 +1861,11 @@ void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL:: void MetalRenderer::SwapBuffer(bool mainWindow) { - auto& layer = GetLayer(mainWindow); - if (!layer.AcquireDrawable()) + if (!AcquireDrawable(mainWindow)) return; - if (layer.GetDrawable()) - { - auto commandBuffer = GetCommandBuffer(); - layer.PresentDrawable(commandBuffer); - } - else - { - debug_printf("skipped present!\n"); - } - - // Debug - m_performanceMonitor.ResetPerFrameData(); + auto commandBuffer = GetCommandBuffer(); + GetLayer(mainWindow).PresentDrawable(commandBuffer); } void MetalRenderer::EnsureImGuiBackend() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index e8c15133f..3281ccf2f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -204,6 +204,7 @@ class MetalRenderer : public Renderer } void InitializeLayer(const Vector2i& size, bool mainWindow); + void ShutdownLayer(bool mainWindow); void ResizeLayer(const Vector2i& size, bool mainWindow); void Initialize() override; diff --git a/src/gui/canvas/MetalCanvas.cpp b/src/gui/canvas/MetalCanvas.cpp index 2c89f8822..a9d1cb9dc 100644 --- 
a/src/gui/canvas/MetalCanvas.cpp +++ b/src/gui/canvas/MetalCanvas.cpp @@ -39,13 +39,9 @@ MetalCanvas::~MetalCanvas() Unbind(wxEVT_PAINT, &MetalCanvas::OnPaint, this); Unbind(wxEVT_SIZE, &MetalCanvas::OnResize, this); - if(!m_is_main_window) - { - // TODO - //MetalRenderer* vkr = (MetalRenderer*)g_renderer.get(); - //if(vkr) - // vkr->StopUsingPadAndWait(); - } + MetalRenderer* mtlr = (MetalRenderer*)g_renderer.get(); + if (mtlr) + mtlr->ShutdownLayer(m_is_main_window); } void MetalCanvas::OnPaint(wxPaintEvent& event) @@ -62,5 +58,5 @@ void MetalCanvas::OnResize(wxSizeEvent& event) RefreshRect(refreshRect, false); auto metal_renderer = MetalRenderer::GetInstance(); - metal_renderer->InitializeLayer({size.x, size.y}, m_is_main_window); + metal_renderer->ResizeLayer({size.x, size.y}, m_is_main_window); } From 1412d1e70a2c5087eebb8a7cf7a1246205d0b71d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 30 Aug 2024 11:02:09 +0200 Subject: [PATCH 140/368] enable triangle fan support --- src/Cafe/HW/Latte/Core/LatteIndices.cpp | 4 ---- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 3 +++ src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h | 4 ++++ src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 ++++++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index dc6408f9c..e92f0f732 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -287,7 +287,6 @@ void LatteIndices_generateAutoLineLoopIndices(void* indexDataOutput, uint32 coun template void LatteIndices_unpackTriangleFanAndConvert(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) { - debug_printf("TRIANGLE FAN UNPACK %u\n", rand()); const betype* src = (betype*)indexDataInput; T* dst = (T*)indexDataOutput; // TODO: check this @@ -308,7 +307,6 @@ void LatteIndices_unpackTriangleFanAndConvert(const void* 
indexDataInput, void* template void LatteIndices_generateAutoTriangleFanIndices(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) { - debug_printf("TRIANGLE FAN AUTO %u\n", rand()); const betype* src = (betype*)indexDataInput; T* dst = (T*)indexDataOutput; for (sint32 i = 0; i < count; i++) @@ -699,7 +697,6 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 cemu_assert_debug(false); outputCount = count + 1; } - /* else if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN && g_renderer->GetType() == RendererAPI::Metal) { if (indexType == LatteIndexType::AUTO) @@ -723,7 +720,6 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 cemu_assert_debug(false); outputCount = count; } - */ else { if (indexType == LatteIndexType::U16_BE) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index ef55e96d2..07bd2a228 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -60,6 +60,9 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu // TODO: do the barriers in one call? 
MTL::Resource* barrierBuffers[] = {buffer}; renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex); + + // Debug + m_mtlr->GetPerformanceMonitor().m_vertexBufferRestrides++; } else { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h index 64e94d38b..100b4b399 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h @@ -7,6 +7,8 @@ class MetalPerformanceMonitor // Per frame data uint32 m_renderPasses = 0; + uint32 m_vertexBufferRestrides = 0; + uint32 m_triangleFans = 0; MetalPerformanceMonitor() = default; ~MetalPerformanceMonitor() = default; @@ -14,5 +16,7 @@ class MetalPerformanceMonitor void ResetPerFrameData() { m_renderPasses = 0; + m_vertexBufferRestrides = 0; + m_triangleFans = 0; } }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 392ede6dc..59b18ac54 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -449,6 +449,8 @@ void MetalRenderer::AppendOverlayDebugInfo() ImGui::Text("--- Metal info (per frame) ---"); ImGui::Text("Command buffers %zu", m_commandBuffers.size()); ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); + ImGui::Text("Vertex buffer restrides %u", m_performanceMonitor.m_vertexBufferRestrides); + ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans); } // TODO: halfZ @@ -1221,6 +1223,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteStreamout_FinishDrawcall(false); + // Debug + if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN) + m_performanceMonitor.m_triangleFans++; + LatteGPUState.drawCallCounter++; } From 9c29acc635c7ef873fd2361e53de183eb2177945 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 30 Aug 2024 
15:27:55 +0200 Subject: [PATCH 141/368] synchronize buffer copying correctly --- src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h | 2 ++ .../HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 4 ++-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 11 +++++------ src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 44d4d873b..a47ecf9a1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -60,3 +60,5 @@ inline NS::String* GetLabel(const std::string& label, const void* identifier) { return ToNSString(label + " (" + std::to_string(reinterpret_cast(identifier)) + ")"); } + +constexpr MTL::RenderStages ALL_MTL_RENDER_STAGES = MTL::RenderStageVertex | MTL::RenderStageObject | MTL::RenderStageMesh | MTL::RenderStageFragment; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 07bd2a228..6173532c2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -139,7 +139,7 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si // Lock the buffer to make sure it's not deallocated before the copy is done m_tempBufferAllocator.LockBuffer(allocation.bufferIndex); - m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size); + m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); // Make sure the buffer has the right command buffer m_tempBufferAllocator.GetBuffer(allocation.bufferIndex); // TODO: make a helper function for this @@ -155,5 +155,5 @@ void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, siz { cemu_assert_debug(m_bufferCache); - 
m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size); + m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 59b18ac54..fbee6f3b2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,6 +20,7 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" +#include "Metal/MTLRenderCommandEncoder.hpp" #include "imgui.h" #define IMGUI_IMPL_METAL_CPP @@ -780,7 +781,7 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { - CopyBufferToBuffer(m_xfbRingBuffer, srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size); + CopyBufferToBuffer(m_xfbRingBuffer, srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex, ALL_MTL_RENDER_STAGES); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) @@ -1834,7 +1835,7 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s EndEncoding(); } -void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size) +void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before) { // Do the copy in a vertex shader on Apple GPUs if (m_isAppleGPU && m_encoderType == MetalEncoderType::Render) @@ -1842,8 +1843,7 @@ void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL:: auto renderCommandEncoder = static_cast(m_commandEncoder); 
MTL::Resource* barrierBuffers[] = {src}; - // TODO: let the caller choose the stages - renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex | MTL::RenderStageFragment | MTL::RenderStageObject | MTL::RenderStageMesh, MTL::RenderStageVertex); + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, after, MTL::RenderStageVertex); renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState()); m_state.m_encoderState.m_renderPipelineState = m_copyBufferToBufferPipeline->GetRenderPipelineState(); @@ -1854,8 +1854,7 @@ void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL:: renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), NS::UInteger(size)); barrierBuffers[0] = dst; - // TODO: let the caller choose the stages - renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex | MTL::RenderStageFragment | MTL::RenderStageObject | MTL::RenderStageMesh); + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, before); } else { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 3281ccf2f..25051a975 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -377,7 +377,7 @@ class MetalRenderer : public Renderer void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); - void CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size); + void CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before); // Getters bool IsAppleGPU() const From cda4799b546d2e2be9f8ea8561b17e47bc3ef32b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 30 Aug 2024 16:44:26 +0200 Subject: 
[PATCH 142/368] disable writes for color attachments which are not in the active FBO --- .../LatteDecompilerEmitMSLAttrDecoder.cpp | 4 +- .../Renderer/Metal/MetalPipelineCache.cpp | 45 +++++++++++-------- .../Latte/Renderer/Metal/MetalPipelineCache.h | 6 +-- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 +- 4 files changed, 34 insertions(+), 25 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp index ee4382987..585309a1e 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp @@ -167,7 +167,7 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) { // seen in Minecraft Wii U Edition - src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.wzyx)/255.0);" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.wzyx)/255.0);" _CRLF, attributeInputIndex); } else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) { @@ -190,7 +190,7 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) { // seen in Ben 10 Omniverse - src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.wzyx));" _CRLF, attributeInputIndex); + src->addFmt("attrDecoder.xyzw = as_type(float4(in.attrDataSem{}.wzyx));" _CRLF, attributeInputIndex); } else { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 48a048ec9..4357117d6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ 
-10,6 +10,7 @@ #include "HW/Latte/Core/FetchShader.h" #include "HW/Latte/ISA/RegDefines.h" +#include "Metal/MTLRenderPipeline.hpp" #include "config/ActiveSettings.h" static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) @@ -189,7 +190,7 @@ extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; template -void SetFragmentState(T* desc, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) { // Color attachments const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; @@ -197,7 +198,7 @@ void SetFragmentState(T* desc, class CachedFBOMtl* activeFBO, const LatteContext uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); for (uint8 i = 0; i < 8; i++) { - const auto& colorBuffer = activeFBO->colorBuffer[i]; + const auto& colorBuffer = lastUsedFBO->colorBuffer[i]; auto texture = static_cast(colorBuffer.texture); if (!texture) { @@ -205,6 +206,14 @@ void SetFragmentState(T* desc, class CachedFBOMtl* activeFBO, const LatteContext } auto colorAttachment = desc->colorAttachments()->object(i); colorAttachment->setPixelFormat(texture->GetRGBAView()->pixelFormat()); + + // Disable writes if not in the active FBO + if (!activeFBO->colorBuffer[i].texture) + { + colorAttachment->setWriteMask(MTL::ColorWriteMaskNone); + continue; + } + colorAttachment->setWriteMask(GetMtlColorWriteMask((renderTargetMask >> (i * 4)) & 0xF)); // Blending @@ -239,11 +248,11 @@ void SetFragmentState(T* desc, class CachedFBOMtl* activeFBO, const LatteContext } // Depth stencil attachment - if (activeFBO->depthBuffer.texture) + if (lastUsedFBO->depthBuffer.texture) { - auto texture = static_cast(activeFBO->depthBuffer.texture); + auto texture = 
static_cast(lastUsedFBO->depthBuffer.texture); desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - if (activeFBO->depthBuffer.hasStencil) + if (lastUsedFBO->depthBuffer.hasStencil) { desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); } @@ -285,9 +294,9 @@ MetalPipelineCache::~MetalPipelineCache() m_binaryArchiveURL->release(); } -MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) { - uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, activeFBO, lcr); + uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, lastUsedFBO, lcr); auto& pipeline = m_pipelineCache[stateHash]; if (pipeline) return pipeline; @@ -364,7 +373,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte debug_printf("no vertex function, skipping draw\n"); return nullptr; } - mtlPixelShader->CompileFragmentFunction(activeFBO); + mtlPixelShader->CompileFragmentFunction(lastUsedFBO); // Render pipeline state MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); @@ -373,7 +382,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte // TODO: don't always set the vertex descriptor? 
desc->setVertexDescriptor(vertexDescriptor); - SetFragmentState(desc, activeFBO, lcr); + SetFragmentState(desc, lastUsedFBO, activeFBO, lcr); TryLoadBinaryArchive(); @@ -440,15 +449,15 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte return pipeline; } -MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType) +MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType) { - uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, activeFBO, lcr); + uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, lastUsedFBO, lcr); stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE]; stateHash = std::rotl(stateHash, 7); stateHash += (uint8)hostIndexType; - stateHash = std::rotl(stateHash, 7); // TODO: 7?s + stateHash = std::rotl(stateHash, 7); auto& pipeline = m_pipelineCache[stateHash]; if (pipeline) @@ -467,7 +476,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFe } auto mtlPixelShader = static_cast(pixelShader->shader); mtlObjectShader->CompileObjectFunction(lcr, fetchShader, vertexShader, hostIndexType); - mtlPixelShader->CompileFragmentFunction(activeFBO); + mtlPixelShader->CompileFragmentFunction(lastUsedFBO); // Render pipeline state MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); @@ -475,7 +484,7 @@ MTL::RenderPipelineState* 
MetalPipelineCache::GetMeshPipelineState(const LatteFe desc->setMeshFunction(mtlMeshShader->GetFunction()); desc->setFragmentFunction(mtlPixelShader->GetFunction()); - SetFragmentState(desc, activeFBO, lcr); + SetFragmentState(desc, lastUsedFBO, activeFBO, lcr); TryLoadBinaryArchive(); @@ -498,13 +507,13 @@ MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFe return pipeline; } -uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, const LatteContextRegister& lcr) { // Hash uint64 stateHash = 0; for (int i = 0; i < Latte::GPU_LIMITS::NUM_COLOR_ATTACHMENTS; ++i) { - auto textureView = static_cast(activeFBO->colorBuffer[i].texture); + auto textureView = static_cast(lastUsedFBO->colorBuffer[i].texture); if (!textureView) continue; @@ -512,9 +521,9 @@ uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* f stateHash = std::rotl(stateHash, 7); } - if (activeFBO->depthBuffer.texture) + if (lastUsedFBO->depthBuffer.texture) { - auto textureView = static_cast(activeFBO->depthBuffer.texture); + auto textureView = static_cast(lastUsedFBO->depthBuffer.texture); stateHash += textureView->GetRGBAView()->pixelFormat(); stateHash = std::rotl(stateHash, 7); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index ec4cabbf4..916a90728 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -16,9 +16,9 @@ class MetalPipelineCache MetalPipelineCache(class MetalRenderer* metalRenderer) : 
m_mtlr{metalRenderer} {} ~MetalPipelineCache(); - MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); - MTL::RenderPipelineState* GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType); + MTL::RenderPipelineState* GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType); // Debug size_t GetPipelineCacheSize() const { return m_pipelineCache.size(); } @@ -31,7 +31,7 @@ class MetalPipelineCache NS::URL* m_binaryArchiveURL; MTL::BinaryArchive* m_binaryArchive; - uint64 CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + uint64 CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, const LatteContextRegister& lcr); void TryLoadBinaryArchive(); }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index fbee6f3b2..bdd0d93dd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1145,9 +1145,9 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Render pipeline state MTL::RenderPipelineState* renderPipelineState; if (usesGeometryShader) - renderPipelineState = m_pipelineCache->GetMeshPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew, hostIndexType); + renderPipelineState = m_pipelineCache->GetMeshPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO, m_state.m_activeFBO, LatteGPUState.contextNew, hostIndexType); else - renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, LatteGPUState.contextNew); + renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, m_state.m_activeFBO, LatteGPUState.contextNew); // HACK if (!renderPipelineState) From a9a4d7b4f8f8b8df35f3924519fb0212bd6e8065 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 30 Aug 2024 16:53:00 +0200 Subject: [PATCH 143/368] disable depth write if active FBO doesn't have a depth attachment --- .../HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 10 +++++----- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 4357117d6..8f7740b9b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -251,11 +251,11 @@ void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFB if (lastUsedFBO->depthBuffer.texture) { auto texture = 
static_cast(lastUsedFBO->depthBuffer.texture); - desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - if (lastUsedFBO->depthBuffer.hasStencil) - { - desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - } + desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); + if (lastUsedFBO->depthBuffer.hasStencil) + { + desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); + } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index bdd0d93dd..f7e98c478 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -950,7 +950,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Disable depth write when there is no depth attachment auto& depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL; bool depthWriteEnable = depthControl.get_Z_WRITE_ENABLE(); - if (!m_state.m_lastUsedFBO->depthBuffer.texture) + if (!m_state.m_activeFBO->depthBuffer.texture) depthControl.set_Z_WRITE_ENABLE(false); MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); From 5eb30262a0f680e547e191b313dc4586a694d3c5 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 31 Aug 2024 08:40:58 +0200 Subject: [PATCH 144/368] skip unnecessary draws --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f7e98c478..17e805ba7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -833,6 +833,8 @@ void MetalRenderer::draw_beginSequence() { m_state.m_skipDrawSequence = false; + bool streamoutEnable = 
LatteGPUState.contextRegister[mmVGT_STRMOUT_EN] != 0; + // update shader state LatteSHRC_UpdateActiveShaders(); if (LatteGPUState.activeShaderHasError) @@ -855,7 +857,7 @@ void MetalRenderer::draw_beginSequence() return; // no render target } - if (!hasValidFramebufferAttached) + if (!hasValidFramebufferAttached && !streamoutEnable) { debug_printf("Drawcall with no color buffer or depth buffer attached\n"); m_state.m_skipDrawSequence = true; @@ -881,9 +883,17 @@ void MetalRenderer::draw_beginSequence() if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) rasterizerEnable = true; - if (!rasterizerEnable == false) + if (!rasterizerEnable && !streamoutEnable) m_state.m_skipDrawSequence = true; + // Both faces are culled + // TODO: can we really skip the draw? + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + m_state.m_skipDrawSequence = true; + // TODO: is this even needed? 
if (!m_state.m_activeFBO) m_state.m_skipDrawSequence = true; @@ -891,12 +901,11 @@ void MetalRenderer::draw_beginSequence() void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { - // TODO: uncomment - //if (m_state.m_skipDrawSequence) - //{ - // LatteGPUState.drawCallCounter++; - // return; - //} + if (m_state.m_skipDrawSequence) + { + LatteGPUState.drawCallCounter++; + return; + } auto& encoderState = m_state.m_encoderState; @@ -904,9 +913,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); + // TODO: is this even needed? Also, should go to draw_beginSequence if (!vertexShader) { - debug_printf("no vertex function, skipping draw\n"); + printf("no vertex function, skipping draw\n"); return; } const auto fetchShader = LatteSHRC_GetActiveFetchShader(); @@ -1051,9 +1061,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } // Cull mode - if (cullFront && cullBack) - return; // We can just skip the draw (TODO: can we?) 
- MTL::CullMode cullMode; if (cullFront) cullMode = MTL::CullModeFront; @@ -1152,7 +1159,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // HACK if (!renderPipelineState) { - debug_printf("invalid render pipeline state, skipping draw\n"); + printf("invalid render pipeline state, skipping draw\n"); return; } From f9f62605800691ada8303691a89205bedfbe3d0f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 31 Aug 2024 10:34:42 +0200 Subject: [PATCH 145/368] monitor clears --- src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h | 2 ++ src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h index 100b4b399..e9e1690c4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h @@ -7,6 +7,7 @@ class MetalPerformanceMonitor // Per frame data uint32 m_renderPasses = 0; + uint32 m_clears = 0; uint32 m_vertexBufferRestrides = 0; uint32 m_triangleFans = 0; @@ -16,6 +17,7 @@ class MetalPerformanceMonitor void ResetPerFrameData() { m_renderPasses = 0; + m_clears = 0; m_vertexBufferRestrides = 0; m_triangleFans = 0; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 17e805ba7..a60b13635 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -450,6 +450,7 @@ void MetalRenderer::AppendOverlayDebugInfo() ImGui::Text("--- Metal info (per frame) ---"); ImGui::Text("Command buffers %zu", m_commandBuffers.size()); ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); + ImGui::Text("Clears %u", m_performanceMonitor.m_clears); ImGui::Text("Vertex buffer restrides %u", m_performanceMonitor.m_vertexBufferRestrides); ImGui::Text("Triangle fans %u", 
m_performanceMonitor.m_triangleFans); } @@ -581,6 +582,9 @@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl GetTemporaryRenderCommandEncoder(renderPassDescriptor); renderPassDescriptor->release(); EndEncoding(); + + // Debug + m_performanceMonitor.m_clears++; } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) @@ -1840,6 +1844,9 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s GetTemporaryRenderCommandEncoder(renderPassDescriptor); renderPassDescriptor->release(); EndEncoding(); + + // Debug + m_performanceMonitor.m_clears++; } void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before) From 5d07d115a69ae4107a42abe2731268bf823c193f Mon Sep 17 00:00:00 2001 From: Samo Z Date: Sun, 1 Sep 2024 14:36:39 +0200 Subject: [PATCH 146/368] emulate unsupported shadow sampler types --- .../LatteDecompilerEmitMSL.cpp | 59 ++++++++++++++----- .../LatteDecompilerEmitMSLHeader.hpp | 6 +- .../HW/Latte/Renderer/Metal/MetalCommon.h | 7 +++ 3 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 7b8755bfe..1c75b737c 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2246,6 +2246,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } bool isCompare = shaderContext->shader->textureUsesDepthCompare[texInstruction->textureFetch.textureIndex]; + bool emulateCompare = (isCompare && 
!IsValidDepthTextureType(texDim)); bool isGather = (texOpcode == GPU7_TEX_INST_FETCH4); bool unnormalizationHandled = false; @@ -2265,25 +2266,40 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex return; } - src->addFmt("tex{}.", texInstruction->textureFetch.textureIndex); - if ((texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || - texOpcode == GPU7_TEX_INST_LD) + if (emulateCompare) { - if (hasOffset) - cemu_assert_unimplemented(); - src->add("read("); - unnormalizationHandled = true; - useTexelCoordinates = true; + cemu_assert_debug(!isGather); + + src->add("sampleCompareEmulate("); + } + + src->addFmt("tex{}", texInstruction->textureFetch.textureIndex); + if (!emulateCompare) + { + src->add("."); + if ((texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || + texOpcode == GPU7_TEX_INST_LD) + { + if (hasOffset) + cemu_assert_unimplemented(); + src->add("read("); + unnormalizationHandled = true; + useTexelCoordinates = true; + } + else + { + if (isGather) + src->add("gather"); + else + src->add("sample"); + if (isCompare) + src->add("_compare"); + src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); + } } else { - if (isGather) - src->add("gather"); - else - src->add("sample"); - if (isCompare) - src->add("_compare"); - src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); + src->addFmt(", samplr{}, ", texInstruction->textureFetch.textureIndex); } // for textureGather() add shift (todo: depends on rounding mode set in sampler registers?) 
@@ -3719,6 +3735,19 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "}\r\n"); } + // Sample compare emulate + // TODO: only add when needed + + // TODO: lod_options overload + // TODO: when the sampler has linear min mag filter, use gather and filter manually + // TODO: offset? + fCStr_shaderSource->add("" + "template\r\n" + "float sampleCompareEmulate(TextureT tex, sampler samplr, CoordT coord, float compareValue) {\r\n" + "return compareValue < tex.sample(samplr, coord).x ? 1.0 : 0.0;\r\n" + "}\r\n" + ); + // clamp fCStr_shaderSource->add("" "int clampFI32(int v)\r\n" diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 53332f7ce..615e86f93 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -1,7 +1,8 @@ #pragma once #include "Common/precompiled.h" -#include "HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + namespace LatteDecompiler { static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext) @@ -442,7 +443,8 @@ namespace LatteDecompiler src->add(", "); - if (shaderContext->shader->textureUsesDepthCompare[i]) + // Only 2D and 2D array textures can be used with comparison samplers + if (shaderContext->shader->textureUsesDepthCompare[i] && IsValidDepthTextureType(shaderContext->shader->textureUnitDim[i])) src->add("depth"); else src->add("texture"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index a47ecf9a1..a2ecc7e98 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -3,6 +3,8 @@ #include #include +#include "Cafe/HW/Latte/Core/LatteConst.h" + struct MetalPixelFormatSupport { bool m_supportsR8Unorm_sRGB; @@ -62,3 
+64,8 @@ inline NS::String* GetLabel(const std::string& label, const void* identifier) } constexpr MTL::RenderStages ALL_MTL_RENDER_STAGES = MTL::RenderStageVertex | MTL::RenderStageObject | MTL::RenderStageMesh | MTL::RenderStageFragment; + +inline bool IsValidDepthTextureType(Latte::E_DIM dim) +{ + return (dim == Latte::E_DIM::DIM_2D || dim == Latte::E_DIM::DIM_2D_MSAA || dim == Latte::E_DIM::DIM_2D_ARRAY || dim == Latte::E_DIM::DIM_2D_ARRAY_MSAA || dim == Latte::E_DIM::DIM_CUBEMAP); +} From 491ac694ab85607b291abb3a3efa017379c90cb6 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 1 Sep 2024 18:58:48 +0200 Subject: [PATCH 147/368] fix: object shader error when used with rect primitive --- .../LatteDecompilerEmitMSLHeader.hpp | 6 +++--- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 615e86f93..38392bdb9 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -155,7 +155,7 @@ namespace LatteDecompiler } } - static void _emitAttributes(LatteDecompilerShaderContext* decompilerContext) + static void _emitAttributes(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) { auto src = decompilerContext->shaderSource; std::string attributeNames; @@ -171,7 +171,7 @@ namespace LatteDecompiler cemu_assert_debug(decompilerContext->output->resourceMappingMTL.attributeMapping[i] >= 0); src->addFmt("uint4 attrDataSem{}", i); - if (decompilerContext->options->usesGeometryShader) + if (decompilerContext->options->usesGeometryShader || isRectVertexShader) attributeNames += "#define ATTRIBUTE_NAME" + std::to_string((sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]) + " attrDataSem" + 
std::to_string(i) + "\n"; else src->addFmt(" [[attribute({})]]", (sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]); @@ -268,7 +268,7 @@ namespace LatteDecompiler if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) { - _emitAttributes(decompilerContext); + _emitAttributes(decompilerContext, isRectVertexShader); } else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a60b13635..6b667ec0d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,8 +20,6 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" -#include "Metal/MTLRenderCommandEncoder.hpp" -#include "imgui.h" #define IMGUI_IMPL_METAL_CPP #include "imgui/imgui_extension.h" @@ -1065,6 +1063,11 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } // Cull mode + + // Handled in draw_beginSequence + if (cullFront && cullBack) + cemu_assert_suspicious(); + MTL::CullMode cullMode; if (cullFront) cullMode = MTL::CullModeFront; From 45737e82d97cc5ce122f5db371da50d664e23c2b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 2 Sep 2024 08:10:40 +0200 Subject: [PATCH 148/368] disable shader write usage on textures --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 4510571ab..27fd9dbeb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -76,9 +76,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM auto pixelFormat = GetMtlPixelFormat(format, isDepth, 
m_mtlr->GetPixelFormatSupport()); desc->setPixelFormat(pixelFormat); - // HACK: even though the textures are never written to from a shader, we still need to use `ShaderWrite` usage to prevent pink lines over the screen - MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsageShaderWrite; - // TODO: add more conditions + MTL::TextureUsage usage = MTL::TextureUsageShaderRead; if (!Latte::IsCompressedFormat(format)) { usage |= MTL::TextureUsageRenderTarget; From c4eb19579706c4171540a78c60c85991adbfd8c3 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 2 Sep 2024 08:25:36 +0200 Subject: [PATCH 149/368] make all textures be at least 1x1x1 --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 ++ src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 27fd9dbeb..eebacd45f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -22,6 +22,8 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM effectiveBaseHeight = overwriteInfo.height; effectiveBaseDepth = overwriteInfo.depth; } + effectiveBaseWidth = std::max(1, effectiveBaseWidth); + effectiveBaseHeight = std::max(1, effectiveBaseHeight); effectiveBaseDepth = std::max(1, effectiveBaseDepth); desc->setWidth(effectiveBaseWidth); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 6b667ec0d..6ccbdb691 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -61,14 +61,14 @@ MetalRenderer::MetalRenderer() // Null resources MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init(); textureDescriptor->setTextureType(MTL::TextureType1D); - 
textureDescriptor->setWidth(4); + textureDescriptor->setWidth(1); m_nullTexture1D = m_device->newTexture(textureDescriptor); #ifdef CEMU_DEBUG_ASSERT m_nullTexture1D->setLabel(GetLabel("Null texture 1D", m_nullTexture1D)); #endif textureDescriptor->setTextureType(MTL::TextureType2D); - textureDescriptor->setHeight(4); + textureDescriptor->setHeight(1); m_nullTexture2D = m_device->newTexture(textureDescriptor); #ifdef CEMU_DEBUG_ASSERT m_nullTexture2D->setLabel(GetLabel("Null texture 2D", m_nullTexture2D)); From 953975f5ecdef8218453152ddb397f523bee9aba Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 3 Sep 2024 13:59:52 +0200 Subject: [PATCH 150/368] don't jit compile vertex shaders --- src/Cafe/HW/Latte/Core/FetchShader.cpp | 15 +- .../LegacyShaderDecompiler/LatteDecompiler.h | 1 + .../LatteDecompilerAnalyzer.cpp | 1 + .../LatteDecompilerEmitMSL.cpp | 127 +++++++++++++++- .../LatteDecompilerEmitMSLHeader.hpp | 4 + .../Renderer/Metal/MetalPipelineCache.cpp | 8 - .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 +- .../Renderer/Metal/RendererShaderMtl.cpp | 142 +----------------- .../Latte/Renderer/Metal/RendererShaderMtl.h | 6 - 9 files changed, 146 insertions(+), 162 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/FetchShader.cpp b/src/Cafe/HW/Latte/Core/FetchShader.cpp index 6c9893f92..272b7c0b6 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.cpp +++ b/src/Cafe/HW/Latte/Core/FetchShader.cpp @@ -8,6 +8,7 @@ #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/LatteInstructions.h" +#include "HW/Latte/Renderer/Renderer.h" #include "util/containers/LookupTableL3.h" #include "util/helpers/fspinlock.h" #include /* SHA1_DIGEST_LENGTH */ @@ -107,6 +108,14 @@ void LatteShader_calculateFSKey(LatteFetchShader* fetchShader) key += (uint64)(attrib->offset & 3); key = std::rotl(key, 2); } + + // TODO: also check if geometry shader is used + if (g_renderer->GetType() == 
RendererAPI::Metal) + { + key += (uint64)group.attributeBufferIndex; + key = std::rotl(key, 5); + // TODO: hash the stride as well + } } // todo - also hash invalid buffer groups? fetchShader->key = key; @@ -161,7 +170,7 @@ void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* pars auto nfa = instr->getField_NUM_FORMAT_ALL(); bool isSigned = instr->getField_FORMAT_COMP_ALL() == LatteClauseInstruction_VTX::FORMAT_COMP::COMP_SIGNED; auto endianSwap = instr->getField_ENDIAN_SWAP(); - + // get buffer cemu_assert_debug(bufferId >= 0xA0 && bufferId < 0xB0); uint32 bufferIndex = (bufferId - 0xA0); @@ -316,7 +325,7 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach // {0x00000002, 0x01800c00, 0x00000000, 0x8a000000, 0x2c00a001, 0x2c151000, 0x000a0000, ...} // size 0x50 // {0x00000002, 0x01801000, 0x00000000, 0x8a000000, 0x1c00a001, 0x280d1000, 0x00090000, ...} // size 0x60 // {0x00000002, 0x01801c00, 0x00000000, 0x8a000000, 0x1c00a001, 0x280d1000, 0x00090000, ...} // size 0x90 - + // our new implementation: // {0x00000002, 0x01800400, 0x00000000, 0x8a000000, 0x0000a001, 0x2c151000, 0x00020000, ...} @@ -411,7 +420,7 @@ LatteFetchShader::~LatteFetchShader() UnregisterInCache(); } -struct FetchShaderLookupInfo +struct FetchShaderLookupInfo { LatteFetchShader* fetchShader; uint32 programSize; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 5d8b2c6f3..29e65c58d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -65,6 +65,7 @@ struct LatteDecompilerShaderResourceMapping sint8 attributeMapping[LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS]; // Metal exclusive sint8 indexBufferBinding{-1}; + sint8 indexTypeBinding{-1}; sint32 getTextureCount() { diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index 9a3db895b..ec3d8aa7b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -1020,4 +1020,5 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD LatteDecompiler::_initUniformBindingPoints(shaderContext); LatteDecompiler::_initAttributeBindingPoints(shaderContext); shaderContext->output->resourceMappingMTL.indexBufferBinding = shaderContext->currentBufferBindingPointMTL++; + shaderContext->output->resourceMappingMTL.indexTypeBinding = shaderContext->currentBufferBindingPointMTL++; } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 1c75b737c..c40d97c67 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -11,6 +11,7 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "config/ActiveSettings.h" #include "util/helpers/StringBuf.h" @@ -3856,6 +3857,8 @@ static void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* sh void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader) { + LatteShaderSHRC_UpdateFetchShader(); + auto fetchShader = LatteSHRC_GetActiveFetchShader(); bool isRectVertexShader = (static_cast(shaderContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]) == LattePrimitiveMode::RECTS); StringBuf* src = new StringBuf(1024*1024*12); // reserve 12MB for generated source (we resize-to-fit at the end) @@ -3881,9 +3884,125 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, case 
LatteConst::ShaderType::Vertex: if (shaderContext->options->usesGeometryShader || isRectVertexShader) { - // Defined just-in-time - // Will also modify vid in case of an indexed draw - src->add("VertexIn fetchInput(thread uint& vid VERTEX_BUFFER_DEFINITIONS);" _CRLF); + // TODO: clean this up + // Will modify vid in case of an indexed draw + + // Vertex buffers + std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; + std::string vertexBuffers = "#define VERTEX_BUFFERS "; + std::string inputFetchDefinition = "VertexIn fetchInput(thread uint& vid, device uint* indexBuffer, uint indexType VERTEX_BUFFER_DEFINITIONS) {\n"; + + // Index buffer + inputFetchDefinition += "if (indexType == 1) // UShort\n"; + inputFetchDefinition += "vid = ((device ushort*)indexBuffer)[vid];\n"; + inputFetchDefinition += "else if (indexType == 2)\n"; + inputFetchDefinition += "vid = ((device uint*)indexBuffer)[vid]; // UInt\n"; + + inputFetchDefinition += "VertexIn in;\n"; + for (auto& bufferGroup : fetchShader->bufferGroups) + { + std::optional fetchType; + + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (shaderContext->contextRegisters[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& attr = bufferGroup.attrib[j]; + + uint32 semanticId = shaderContext->output->resourceMappingMTL.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not used? 
+ + std::string formatName; + uint8 componentCount = 0; + switch (GetMtlVertexFormat(attr.format)) + { + case MTL::VertexFormatUChar: + formatName = "uchar"; + componentCount = 1; + break; + case MTL::VertexFormatUChar2: + formatName = "uchar2"; + componentCount = 2; + break; + case MTL::VertexFormatUChar3: + formatName = "uchar3"; + componentCount = 3; + break; + case MTL::VertexFormatUChar4: + formatName = "uchar4"; + componentCount = 4; + break; + case MTL::VertexFormatUShort: + formatName = "ushort"; + componentCount = 1; + break; + case MTL::VertexFormatUShort2: + formatName = "ushort2"; + componentCount = 2; + break; + case MTL::VertexFormatUShort3: + formatName = "ushort3"; + componentCount = 3; + break; + case MTL::VertexFormatUShort4: + formatName = "ushort4"; + componentCount = 4; + break; + case MTL::VertexFormatUInt: + formatName = "uint"; + componentCount = 1; + break; + case MTL::VertexFormatUInt2: + formatName = "uint2"; + componentCount = 2; + break; + case MTL::VertexFormatUInt3: + formatName = "uint3"; + componentCount = 3; + break; + case MTL::VertexFormatUInt4: + formatName = "uint4"; + componentCount = 4; + break; + } + + // Fetch the attribute + inputFetchDefinition += fmt::format("in.ATTRIBUTE_NAME{} = ", semanticId); + inputFetchDefinition += fmt::format("uint4(*(device {}*)", formatName); + inputFetchDefinition += fmt::format("(vertexBuffer{}", attr.attributeBufferIndex); + inputFetchDefinition += fmt::format(" + vid * {} + {})", bufferStride, attr.offset); + for (uint8 i = 0; i < (4 - componentCount); i++) + inputFetchDefinition += ", 0"; + inputFetchDefinition += ");\n"; + + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; + + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } + + // TODO: fetch type + + vertexBufferDefinitions += fmt::format(", device uchar* vertexBuffer{} 
[[buffer({})]]", bufferIndex, GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + vertexBuffers += fmt::format(", vertexBuffer{}", bufferIndex); + } + + inputFetchDefinition += "return in;\n"; + inputFetchDefinition += "}\n"; + + src->add(vertexBufferDefinitions.c_str()); + src->add("\n"); + src->add(vertexBuffers.c_str()); + src->add("\n"); + src->add(inputFetchDefinition.c_str()); functionType = "[[object, max_total_threads_per_threadgroup(VERTICES_PER_VERTEX_PRIMITIVE), max_total_threadgroups_per_mesh_grid(1)]]"; outputTypeName = "void"; @@ -3916,7 +4035,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // TODO: don't hardcode the instance index src->add("uint iid = 0;" _CRLF); // Fetch the input - src->add("VertexIn in = fetchInput(vid VERTEX_BUFFERS);" _CRLF); + src->add("VertexIn in = fetchInput(vid, indexBuffer, indexType VERTEX_BUFFERS);" _CRLF); // Output is defined as object payload src->add("object_data VertexOut& out = objectPayload.vertexOut[tid];" _CRLF); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 38392bdb9..aed7e9f1f 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -495,6 +495,10 @@ namespace LatteDecompiler src->add(", mesh_grid_properties meshGridProperties"); src->add(", uint tig [[threadgroup_position_in_grid]]"); src->add(", uint tid [[thread_index_in_threadgroup]]"); + // TODO: inly include index buffer if needed + src->addFmt(", device uint* indexBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexBufferBinding); + // TODO: use uchar? 
+ src->addFmt(", constant uint& indexType [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexTypeBinding); src->add(" VERTEX_BUFFER_DEFINITIONS"); } else diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 8f7740b9b..a138ec8c7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -366,13 +366,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte auto mtlVertexShader = static_cast(vertexShader->shader); auto mtlPixelShader = static_cast(pixelShader->shader); - mtlVertexShader->CompileVertexFunction(); - // HACK - if (!mtlVertexShader->GetFunction()) - { - debug_printf("no vertex function, skipping draw\n"); - return nullptr; - } mtlPixelShader->CompileFragmentFunction(lastUsedFBO); // Render pipeline state @@ -475,7 +468,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFe mtlMeshShader = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); } auto mtlPixelShader = static_cast(pixelShader->shader); - mtlObjectShader->CompileObjectFunction(lcr, fetchShader, vertexShader, hostIndexType); mtlPixelShader->CompileFragmentFunction(lastUsedFBO); // Render pipeline state diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 6ccbdb691..c40fbabb2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -916,7 +916,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); // TODO: is this even needed? 
Also, should go to draw_beginSequence - if (!vertexShader) + if (!vertexShader || !static_cast(vertexShader->shader)->GetFunction()) { printf("no vertex function, skipping draw\n"); return; @@ -1200,6 +1200,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 { if (indexBuffer) SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexBuffer, indexBufferOffset, vertexShader->resourceMapping.indexBufferBinding); + renderCommandEncoder->setObjectBytes(&hostIndexType, sizeof(hostIndexType), vertexShader->resourceMapping.indexTypeBinding); + encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.indexTypeBinding] = {nullptr}; uint32 verticesPerPrimitive = 0; switch (primitiveMode) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 6ad72d879..8905ddee2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -16,7 +16,8 @@ extern std::atomic_int g_compiled_shaders_async; RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer} { - if (type == ShaderType::kGeometry) + // TODO: don't compile fragment function just-in-time + if (type != ShaderType::kFragment) { Compile(mslCode); } @@ -36,145 +37,6 @@ RendererShaderMtl::~RendererShaderMtl() m_function->release(); } -void RendererShaderMtl::CompileObjectFunction(const LatteContextRegister& lcr, const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, Renderer::INDEX_TYPE hostIndexType) -{ - cemu_assert_debug(m_type == ShaderType::kVertex); - - std::string fullCode; - - // Vertex buffers - std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; - 
std::string vertexBuffers = "#define VERTEX_BUFFERS "; - std::string inputFetchDefinition = "VertexIn fetchInput(thread uint& vid VERTEX_BUFFER_DEFINITIONS) {\n"; - - // Index buffer - if (hostIndexType != Renderer::INDEX_TYPE::NONE) - { - vertexBufferDefinitions += ", device "; - switch (hostIndexType) - { - case Renderer::INDEX_TYPE::U16: - vertexBufferDefinitions += "ushort"; - break; - case Renderer::INDEX_TYPE::U32: - vertexBufferDefinitions += "uint"; - break; - default: - cemu_assert_suspicious(); - break; - } - - vertexBufferDefinitions += fmt::format("* indexBuffer [[buffer({})]]", vertexShader->resourceMapping.indexBufferBinding); - vertexBuffers += ", indexBuffer"; - inputFetchDefinition += "vid = indexBuffer[vid];\n"; - } - - inputFetchDefinition += "VertexIn in;\n"; - for (auto& bufferGroup : fetchShader->bufferGroups) - { - std::optional fetchType; - - uint32 bufferIndex = bufferGroup.attributeBufferIndex; - uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; - uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - - for (sint32 j = 0; j < bufferGroup.attribCount; ++j) - { - auto& attr = bufferGroup.attrib[j]; - - uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; - if (semanticId == (uint32)-1) - continue; // attribute not used? 
- - std::string formatName; - uint8 componentCount = 0; - switch (GetMtlVertexFormat(attr.format)) - { - case MTL::VertexFormatUChar: - formatName = "uchar"; - componentCount = 1; - break; - case MTL::VertexFormatUChar2: - formatName = "uchar2"; - componentCount = 2; - break; - case MTL::VertexFormatUChar3: - formatName = "uchar3"; - componentCount = 3; - break; - case MTL::VertexFormatUChar4: - formatName = "uchar4"; - componentCount = 4; - break; - case MTL::VertexFormatUShort: - formatName = "ushort"; - componentCount = 1; - break; - case MTL::VertexFormatUShort2: - formatName = "ushort2"; - componentCount = 2; - break; - case MTL::VertexFormatUShort3: - formatName = "ushort3"; - componentCount = 3; - break; - case MTL::VertexFormatUShort4: - formatName = "ushort4"; - componentCount = 4; - break; - case MTL::VertexFormatUInt: - formatName = "uint"; - componentCount = 1; - break; - case MTL::VertexFormatUInt2: - formatName = "uint2"; - componentCount = 2; - break; - case MTL::VertexFormatUInt3: - formatName = "uint3"; - componentCount = 3; - break; - case MTL::VertexFormatUInt4: - formatName = "uint4"; - componentCount = 4; - break; - } - - // Fetch the attribute - inputFetchDefinition += fmt::format("in.ATTRIBUTE_NAME{} = ", semanticId); - inputFetchDefinition += fmt::format("uint4(*(device {}*)", formatName); - inputFetchDefinition += fmt::format("(vertexBuffer{}", attr.attributeBufferIndex); - inputFetchDefinition += fmt::format(" + vid * {} + {})", bufferStride, attr.offset); - for (uint8 i = 0; i < (4 - componentCount); i++) - inputFetchDefinition += ", 0"; - inputFetchDefinition += ");\n"; - - if (fetchType.has_value()) - cemu_assert_debug(fetchType == attr.fetchType); - else - fetchType = attr.fetchType; - - if (attr.fetchType == LatteConst::INSTANCE_DATA) - { - cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported - } - } - - vertexBufferDefinitions += fmt::format(", device uchar* vertexBuffer{} [[buffer({})]]", bufferIndex, 
GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); - vertexBuffers += fmt::format(", vertexBuffer{}", bufferIndex); - } - - inputFetchDefinition += "return in;\n"; - inputFetchDefinition += "}\n"; - - fullCode += vertexBufferDefinitions + "\n"; - fullCode += vertexBuffers + "\n"; - fullCode += m_mslCode; - fullCode += inputFetchDefinition; - - Compile(fullCode); -} - void RendererShaderMtl::CompileFragmentFunction(CachedFBOMtl* activeFBO) { cemu_assert_debug(m_type == ShaderType::kFragment); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index ca5a0ff97..6ae2b9287 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -21,12 +21,6 @@ class RendererShaderMtl : public RendererShader RendererShaderMtl(class MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode); virtual ~RendererShaderMtl(); - void CompileVertexFunction() - { - Compile(m_mslCode); - } - - void CompileObjectFunction(const LatteContextRegister& lcr, const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, Renderer::INDEX_TYPE hostIndexType); void CompileFragmentFunction(CachedFBOMtl* activeFBO); MTL::Function* GetFunction() const From 2ee92e53e9f254e8461c688487074cc03e6e93d9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 3 Sep 2024 14:26:04 +0200 Subject: [PATCH 151/368] use the correct fetch shader --- .../Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index c40d97c67..5dae21319 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ 
-3857,8 +3857,6 @@ static void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* sh void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader) { - LatteShaderSHRC_UpdateFetchShader(); - auto fetchShader = LatteSHRC_GetActiveFetchShader(); bool isRectVertexShader = (static_cast(shaderContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]) == LattePrimitiveMode::RECTS); StringBuf* src = new StringBuf(1024*1024*12); // reserve 12MB for generated source (we resize-to-fit at the end) @@ -3899,7 +3897,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, inputFetchDefinition += "vid = ((device uint*)indexBuffer)[vid]; // UInt\n"; inputFetchDefinition += "VertexIn in;\n"; - for (auto& bufferGroup : fetchShader->bufferGroups) + for (auto& bufferGroup : shaderContext->fetchShader->bufferGroups) { std::optional fetchType; From b13ba58aad6ebc6231f14a7479d8327ccbf37036 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 4 Sep 2024 07:36:40 +0200 Subject: [PATCH 152/368] correct the object shader hash --- src/Cafe/HW/Latte/Core/FetchShader.cpp | 28 +++++++++++++------ src/Cafe/HW/Latte/Core/FetchShader.h | 7 ++++- src/Cafe/HW/Latte/Core/LatteShader.cpp | 2 ++ .../Renderer/Metal/MetalPipelineCache.cpp | 2 -- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/FetchShader.cpp b/src/Cafe/HW/Latte/Core/FetchShader.cpp index 272b7c0b6..5933fe055 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.cpp +++ b/src/Cafe/HW/Latte/Core/FetchShader.cpp @@ -8,7 +8,6 @@ #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/LatteInstructions.h" -#include "HW/Latte/Renderer/Renderer.h" #include "util/containers/LookupTableL3.h" #include "util/helpers/fspinlock.h" #include /* SHA1_DIGEST_LENGTH */ @@ -108,14 +107,6 @@ void LatteShader_calculateFSKey(LatteFetchShader* fetchShader) 
key += (uint64)(attrib->offset & 3); key = std::rotl(key, 2); } - - // TODO: also check if geometry shader is used - if (g_renderer->GetType() == RendererAPI::Metal) - { - key += (uint64)group.attributeBufferIndex; - key = std::rotl(key, 5); - // TODO: hash the stride as well - } } // todo - also hash invalid buffer groups? fetchShader->key = key; @@ -155,6 +146,23 @@ void LatteFetchShader::CalculateFetchShaderVkHash() this->vkPipelineHashFragment = h; } +void LatteFetchShader::CalculateFetchShaderMtlObjectShaderHash(uint32* contextRegister) +{uint64 key = 0; + for (sint32 g = 0; g < bufferGroups.size(); g++) + { + LatteParsedFetchShaderBufferGroup_t& group = bufferGroups[g]; + uint32 bufferIndex = group.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + key += (uint64)bufferIndex; + key = std::rotl(key, 5); + key += (uint64)bufferStride; + key = std::rotl(key, 5); + } + mtlShaderHashObject = key; +} + void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* parsedFetchShader, uint32* contextRegister, const LatteClauseInstruction_VTX* instr) { uint32 semanticId = instr->getFieldSEM_SEMANTIC_ID(); // location (attribute index inside shader) @@ -337,6 +345,7 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach // these only make sense when vertex shader does not call FS? 
LatteShader_calculateFSKey(newFetchShader); newFetchShader->CalculateFetchShaderVkHash(); + newFetchShader->CalculateFetchShaderMtlObjectShaderHash(contextRegister); return newFetchShader; } @@ -396,6 +405,7 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach } LatteShader_calculateFSKey(newFetchShader); newFetchShader->CalculateFetchShaderVkHash(); + newFetchShader->CalculateFetchShaderMtlObjectShaderHash(contextRegister); // register in cache // its possible that during multi-threaded shader cache loading, two identical (same hash) fetch shaders get created simultaneously diff --git a/src/Cafe/HW/Latte/Core/FetchShader.h b/src/Cafe/HW/Latte/Core/FetchShader.h index ac57714d0..9aeed6bde 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.h +++ b/src/Cafe/HW/Latte/Core/FetchShader.h @@ -46,6 +46,9 @@ struct LatteFetchShader // Vulkan uint64 vkPipelineHashFragment{}; // hash of all fetch shader state that influences the Vulkan graphics pipeline + // Metal + uint64 mtlShaderHashObject{}; + // cache info CacheHash m_cacheHash{}; bool m_isRegistered{}; // if true, fetch shader is referenced by cache (RegisterInCache() succeeded) @@ -53,6 +56,8 @@ struct LatteFetchShader void CalculateFetchShaderVkHash(); + void CalculateFetchShaderMtlObjectShaderHash(uint32* contextRegister); + uint64 getVkPipelineHashFragment() const { return vkPipelineHashFragment; }; static bool isValidBufferIndex(const uint32 index) { return index < 0x10; }; @@ -69,4 +74,4 @@ struct LatteFetchShader static std::unordered_map s_fetchShaderByHash; }; -LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::CacheHash fsHash, uint32* contextRegister, uint32* fsProgramCode, uint32 fsProgramSize); \ No newline at end of file +LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::CacheHash fsHash, uint32* contextRegister, uint32* fsProgramCode, uint32 fsProgramSize); diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp 
b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 3ac0e9d27..d20067a65 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -498,6 +498,8 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, uint64 vsHash2 = 0; _calculateShaderProgramHash(vsProgramCode, vertexShaderSize, &hashCacheVS, &vsHash1, &vsHash2); uint64 vsHash = vsHash1 + vsHash2 + _activeFetchShader->key + _activePSImportTable.key + (usesGeometryShader ? 0x1111ULL : 0ULL); + if (g_renderer->GetType() == RendererAPI::Metal && usesGeometryShader) + vsHash += _activeFetchShader->mtlShaderHashObject; uint32 tmp = LatteGPUState.contextNew.PA_CL_VTE_CNTL.getRawValue() ^ 0x43F; vsHash += tmp; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index a138ec8c7..9a99f1386 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -1,7 +1,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Foundation/NSObject.hpp" #include "HW/Latte/Core/LatteShader.h" #include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "HW/Latte/Renderer/Metal/LatteToMtl.h" @@ -10,7 +9,6 @@ #include "HW/Latte/Core/FetchShader.h" #include "HW/Latte/ISA/RegDefines.h" -#include "Metal/MTLRenderPipeline.hpp" #include "config/ActiveSettings.h" static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) From 8a74445a9632935d218c653377d189d607f5b9cb Mon Sep 17 00:00:00 2001 From: Samo Z Date: Wed, 4 Sep 2024 19:05:07 +0200 Subject: [PATCH 153/368] don't compile fragment shaders just-in-time --- .../LatteDecompilerEmitMSL.cpp | 12 +-- 
.../LatteDecompilerEmitMSLHeader.hpp | 6 +- .../HW/Latte/Renderer/Metal/MetalCommon.h | 8 +- .../Renderer/Metal/MetalPipelineCache.cpp | 2 - .../Renderer/Metal/RendererShaderMtl.cpp | 79 ++----------------- .../Latte/Renderer/Metal/RendererShaderMtl.h | 8 -- 6 files changed, 21 insertions(+), 94 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 5dae21319..1d6ab1a4c 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3215,11 +3215,11 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe src->add(") == false) discard_fragment();" _CRLF); } // pixel color output - src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(pixelColorOutputIndex)); - src->addFmt("out.passPixelColor{} = as_type<{}>(", pixelColorOutputIndex, GetColorAttachmentTypeStr(pixelColorOutputIndex)); + //src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(pixelColorOutputIndex)); + src->addFmt("out.passPixelColor{} = as_type(", pixelColorOutputIndex/*, GetColorAttachmentTypeStr(pixelColorOutputIndex)*/); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); src->add(");" _CRLF); - src->add("#endif" _CRLF); + //src->add("#endif" _CRLF); if( cfInstruction->exportArrayBase+i >= 8 ) cemu_assert_unimplemented(); @@ -3883,12 +3883,12 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shaderContext->options->usesGeometryShader || isRectVertexShader) { // TODO: clean this up - // Will modify vid in case of an indexed draw + // fetchVertex will modify vid in case of an indexed draw // Vertex buffers std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; std::string vertexBuffers = "#define VERTEX_BUFFERS "; - std::string inputFetchDefinition = "VertexIn 
fetchInput(thread uint& vid, device uint* indexBuffer, uint indexType VERTEX_BUFFER_DEFINITIONS) {\n"; + std::string inputFetchDefinition = "VertexIn fetchVertex(thread uint& vid, device uint* indexBuffer, uint indexType VERTEX_BUFFER_DEFINITIONS) {\n"; // Index buffer inputFetchDefinition += "if (indexType == 1) // UShort\n"; @@ -4033,7 +4033,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // TODO: don't hardcode the instance index src->add("uint iid = 0;" _CRLF); // Fetch the input - src->add("VertexIn in = fetchInput(vid, indexBuffer, indexType VERTEX_BUFFERS);" _CRLF); + src->add("VertexIn in = fetchVertex(vid, indexBuffer, indexType VERTEX_BUFFERS);" _CRLF); // Output is defined as object payload src->add("object_data VertexOut& out = objectPayload.vertexOut[tid];" _CRLF); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index aed7e9f1f..5f88f2468 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -281,9 +281,9 @@ namespace LatteDecompiler { if ((decompilerContext->shader->pixelColorOutputMask & (1 << i)) != 0) { - src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(i)); - src->addFmt("{} passPixelColor{} [[color({})]];" _CRLF, GetColorAttachmentTypeStr(i), i, i); - src->add("#endif" _CRLF); + //src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(i)); + src->addFmt("float4 passPixelColor{} [[color({})]];" _CRLF/*, GetColorAttachmentTypeStr(i)*/, i, i); + //src->add("#endif" _CRLF); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index a2ecc7e98..ede0bed68 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -41,10 +41,10 @@ inline size_t Align(size_t size, 
size_t alignment) return (size + alignment - 1) & ~(alignment - 1); } -inline std::string GetColorAttachmentTypeStr(uint32 index) -{ - return "COLOR_ATTACHMENT" + std::to_string(index) + "_TYPE"; -} +//inline std::string GetColorAttachmentTypeStr(uint32 index) +//{ +// return "COLOR_ATTACHMENT" + std::to_string(index) + "_TYPE"; +//} // Cast from const char* to NS::String* inline NS::String* ToNSString(const char* str) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 9a99f1386..1842142e9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -364,7 +364,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte auto mtlVertexShader = static_cast(vertexShader->shader); auto mtlPixelShader = static_cast(pixelShader->shader); - mtlPixelShader->CompileFragmentFunction(lastUsedFBO); // Render pipeline state MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); @@ -466,7 +465,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFe mtlMeshShader = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); } auto mtlPixelShader = static_cast(pixelShader->shader); - mtlPixelShader->CompileFragmentFunction(lastUsedFBO); // Render pipeline state MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 8905ddee2..d343ef453 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -16,16 +16,16 @@ extern std::atomic_int g_compiled_shaders_async; RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, 
const std::string& mslCode) : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer} { - // TODO: don't compile fragment function just-in-time - if (type != ShaderType::kFragment) - { - Compile(mslCode); - } - else + NS::Error* error = nullptr; + MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), nullptr, &error); + if (error) { - // TODO: don't compile just-in-time - m_mslCode = mslCode; + printf("failed to create library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); + error->release(); + return; } + m_function = library->newFunction(ToNSString("main0")); + library->release(); // Count shader compilation g_compiled_shaders_total++; @@ -36,66 +36,3 @@ RendererShaderMtl::~RendererShaderMtl() if (m_function) m_function->release(); } - -void RendererShaderMtl::CompileFragmentFunction(CachedFBOMtl* activeFBO) -{ - cemu_assert_debug(m_type == ShaderType::kFragment); - - std::string fullCode; - - // Define color attachment data types - for (uint8 i = 0; i < 8; i++) - { - const auto& colorBuffer = activeFBO->colorBuffer[i]; - if (!colorBuffer.texture) - { - continue; - } - auto dataType = GetMtlPixelFormatInfo(colorBuffer.texture->format, false).dataType; - fullCode += "#define " + GetColorAttachmentTypeStr(i) + " "; - switch (dataType) - { - case MetalDataType::INT: - fullCode += "int4"; - break; - case MetalDataType::UINT: - fullCode += "uint4"; - break; - case MetalDataType::FLOAT: - fullCode += "float4"; - break; - default: - cemu_assert_suspicious(); - break; - } - fullCode += "\n"; - } - - fullCode += m_mslCode; - Compile(fullCode); -} - -void RendererShaderMtl::Compile(const std::string& mslCode) -{ - if (m_function) - m_function->release(); - - // HACK - if (m_hasError) - return; - - NS::Error* error = nullptr; - MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), nullptr, &error); - if (error) - { - printf("failed to create 
library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); - error->release(); - - // HACK - m_hasError = true; - - return; - } - m_function = library->newFunction(ToNSString("main0")); - library->release(); -} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index 6ae2b9287..0758b0e63 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -21,8 +21,6 @@ class RendererShaderMtl : public RendererShader RendererShaderMtl(class MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode); virtual ~RendererShaderMtl(); - void CompileFragmentFunction(CachedFBOMtl* activeFBO); - MTL::Function* GetFunction() const { return m_function; @@ -54,11 +52,5 @@ class RendererShaderMtl : public RendererShader MTL::Function* m_function = nullptr; - std::vector m_binary; - std::string m_mslCode; - - // HACK - bool m_hasError = false; - void Compile(const std::string& mslCode); }; From cd72ad80d219a4bf25dfc3e72098239d900280b2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 5 Sep 2024 14:34:39 +0200 Subject: [PATCH 154/368] include color format data types in ps hash --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 17 ++++++++++++++++ .../LatteDecompilerEmitMSL.cpp | 2 +- .../LatteDecompilerEmitMSLHeader.hpp | 2 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 12 +++++------ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 17 ++++++++++++++++ .../Renderer/Metal/MetalPipelineCache.cpp | 20 +++++++++---------- 6 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index d20067a65..2bc719cdd 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -14,6 +14,9 @@ #include 
"config/ActiveSettings.h" #include "Cafe/GameProfile/GameProfile.h" #include "util/containers/flat_hash_map.hpp" +#if BOOST_OS_MACOS +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#endif #include // experimental new decompiler (WIP) @@ -544,6 +547,20 @@ void LatteSHRC_UpdatePSBaseHash(uint8* pixelShaderPtr, uint32 pixelShaderSize, b _calculateShaderProgramHash(psProgramCode, pixelShaderSize, &hashCachePS, &psHash1, &psHash2); // get vertex shader uint64 psHash = psHash1 + psHash2 + _activePSImportTable.key + (usesGeometryShader ? hashCacheGS.prevHash1 : 0ULL); + +#if BOOST_OS_MACOS + if (g_renderer->GetType() == RendererAPI::Metal) + { + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto format = LatteMRT::GetColorBufferFormat(i, LatteGPUState.contextNew); + uint8 dataType = (uint8)GetMtlPixelFormatInfo(format, false).dataType; + psHash += (uint64)dataType; + psHash = std::rotl(psHash, 7); + } + } +#endif + _shaderBaseHash_ps = psHash; } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 1d6ab1a4c..aea421fb9 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3216,7 +3216,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe } // pixel color output //src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(pixelColorOutputIndex)); - src->addFmt("out.passPixelColor{} = as_type(", pixelColorOutputIndex/*, GetColorAttachmentTypeStr(pixelColorOutputIndex)*/); + src->addFmt("out.passPixelColor{} = as_type<{}>(", pixelColorOutputIndex, GetColorBufferDataTypeStr(pixelColorOutputIndex, *shaderContext->contextRegistersNew)/*, GetColorAttachmentTypeStr(pixelColorOutputIndex)*/); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); src->add(");" _CRLF); //src->add("#endif" _CRLF); 
diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 5f88f2468..066e3b5c2 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -282,7 +282,7 @@ namespace LatteDecompiler if ((decompilerContext->shader->pixelColorOutputMask & (1 << i)) != 0) { //src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(i)); - src->addFmt("float4 passPixelColor{} [[color({})]];" _CRLF/*, GetColorAttachmentTypeStr(i)*/, i, i); + src->addFmt("{} passPixelColor{} [[color({})]];" _CRLF, GetColorBufferDataTypeStr(i, *decompilerContext->contextRegistersNew)/*, GetColorAttachmentTypeStr(i)*/, i, i); //src->add("#endif" _CRLF); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 6f54272cd..581c0e190 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -84,19 +84,17 @@ std::map MTL_DEPTH_FORMAT_TABLE = { const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth) { + if (format == Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + return {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}; + } + MetalPixelFormatInfo formatInfo; if (isDepth) formatInfo = MTL_DEPTH_FORMAT_TABLE[format]; else formatInfo = MTL_COLOR_FORMAT_TABLE[format]; - // Depth24Unorm_Stencil8 is not supported on Apple sillicon - // TODO: query if format is available instead - if (formatInfo.pixelFormat == MTL::PixelFormatDepth24Unorm_Stencil8) - { - formatInfo.pixelFormat = MTL::PixelFormatDepth32Float_Stencil8; - } - if (formatInfo.pixelFormat == MTL::PixelFormatInvalid) { printf("invalid pixel format: %u\n", (uint32)format); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 
2c805527f..218a74f5f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -32,6 +32,23 @@ const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, boo MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth, const MetalPixelFormatSupport& pixelFormatSupport); +inline const char* GetColorBufferDataTypeStr(const uint32 index, const LatteContextRegister& lcr) +{ + auto format = LatteMRT::GetColorBufferFormat(index, lcr); + auto dataType = GetMtlPixelFormatInfo(format, false).dataType; + switch (dataType) + { + case MetalDataType::INT: + return "int4"; + case MetalDataType::UINT: + return "uint4"; + case MetalDataType::FLOAT: + return "float4"; + default: + return "unknown"; + } +} + size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 width); size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint32 height, size_t bytesPerRow); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 1842142e9..8273ec162 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -1,14 +1,14 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "HW/Latte/Core/LatteShader.h" -#include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" -#include "HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "HW/Latte/Renderer/Metal/RendererShaderMtl.h" -#include "HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" - -#include "HW/Latte/Core/FetchShader.h" -#include "HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include 
"Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" + +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" #include "config/ActiveSettings.h" static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) @@ -409,7 +409,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte { debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); error->release(); - return nullptr; } else { @@ -484,13 +483,12 @@ MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFe desc->setLabel(GetLabel("Mesh pipeline state", desc)); #endif pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); + desc->release(); if (error) { debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); error->release(); - return nullptr; } - desc->release(); return pipeline; } From 4251f3fe551ee7b1c2ea521e2ae30b7576405e28 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 6 Sep 2024 17:16:47 +0200 Subject: [PATCH 155/368] check for invalid color formats --- .../LatteDecompilerEmitMSL.cpp | 12 +++++++----- .../LatteDecompilerEmitMSLHeader.hpp | 8 +++++--- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 12 +++++++++--- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index aea421fb9..137f8e87a 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3215,11 +3215,13 @@ static void _emitExportCode(LatteDecompilerShaderContext* 
shaderContext, LatteDe src->add(") == false) discard_fragment();" _CRLF); } // pixel color output - //src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(pixelColorOutputIndex)); - src->addFmt("out.passPixelColor{} = as_type<{}>(", pixelColorOutputIndex, GetColorBufferDataTypeStr(pixelColorOutputIndex, *shaderContext->contextRegistersNew)/*, GetColorAttachmentTypeStr(pixelColorOutputIndex)*/); - _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); - src->add(");" _CRLF); - //src->add("#endif" _CRLF); + auto dataType = GetColorBufferDataType(pixelColorOutputIndex, *shaderContext->contextRegistersNew); + if (dataType != MetalDataType::NONE) + { + src->addFmt("out.passPixelColor{} = as_type<{}>(", pixelColorOutputIndex, GetDataTypeStr(dataType)); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); + src->add(");" _CRLF); + } if( cfInstruction->exportArrayBase+i >= 8 ) cemu_assert_unimplemented(); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 066e3b5c2..412c99922 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -281,9 +281,11 @@ namespace LatteDecompiler { if ((decompilerContext->shader->pixelColorOutputMask & (1 << i)) != 0) { - //src->addFmt("#ifdef {}" _CRLF, GetColorAttachmentTypeStr(i)); - src->addFmt("{} passPixelColor{} [[color({})]];" _CRLF, GetColorBufferDataTypeStr(i, *decompilerContext->contextRegistersNew)/*, GetColorAttachmentTypeStr(i)*/, i, i); - //src->add("#endif" _CRLF); + auto dataType = GetColorBufferDataType(i, *decompilerContext->contextRegistersNew); + if (dataType != MetalDataType::NONE) + { + src->addFmt("{} passPixelColor{} [[color({})]];" _CRLF, GetDataTypeStr(dataType), i, i); + } } } diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 218a74f5f..c1b1c75c3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -6,6 +6,7 @@ #include "Cafe/HW/Latte/Core/LatteConst.h" //#include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Common/precompiled.h" struct Uvec2 { uint32 x; @@ -32,10 +33,14 @@ const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, boo MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth, const MetalPixelFormatSupport& pixelFormatSupport); -inline const char* GetColorBufferDataTypeStr(const uint32 index, const LatteContextRegister& lcr) +inline MetalDataType GetColorBufferDataType(const uint32 index, const LatteContextRegister& lcr) { auto format = LatteMRT::GetColorBufferFormat(index, lcr); - auto dataType = GetMtlPixelFormatInfo(format, false).dataType; + return GetMtlPixelFormatInfo(format, false).dataType; +} + +inline const char* GetDataTypeStr(MetalDataType dataType) +{ switch (dataType) { case MetalDataType::INT: @@ -45,7 +50,8 @@ inline const char* GetColorBufferDataTypeStr(const uint32 index, const LatteCont case MetalDataType::FLOAT: return "float4"; default: - return "unknown"; + cemu_assert_suspicious(); + return ""; } } From 6a3bdd49e9e3e14290f2760526c27e4d8fb42af9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 6 Sep 2024 17:38:19 +0200 Subject: [PATCH 156/368] refactor pixel format support --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 51 +++++++++---------- .../HW/Latte/Renderer/Metal/MetalCommon.h | 2 +- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 581c0e190..daa283e40 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,4 +1,5 @@ 
#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" #include "Metal/MTLDepthStencil.hpp" #include "Metal/MTLPixelFormat.hpp" @@ -95,42 +96,40 @@ const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, boo else formatInfo = MTL_COLOR_FORMAT_TABLE[format]; - if (formatInfo.pixelFormat == MTL::PixelFormatInvalid) - { - printf("invalid pixel format: %u\n", (uint32)format); - } - return formatInfo; } MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth, const MetalPixelFormatSupport& pixelFormatSupport) { auto pixelFormat = GetMtlPixelFormatInfo(format, isDepth).pixelFormat; + if (pixelFormat == MTL::PixelFormatInvalid) + cemuLog_logDebug(LogType::Force, "invalid pixel format {}\n", pixelFormat); - if (!pixelFormatSupport.m_supportsR8Unorm_sRGB && pixelFormat == MTL::PixelFormatR8Unorm_sRGB) - pixelFormat = MTL::PixelFormatRGBA8Unorm_sRGB; - - if (!pixelFormatSupport.m_supportsRG8Unorm_sRGB && pixelFormat == MTL::PixelFormatRG8Unorm_sRGB) - pixelFormat = MTL::PixelFormatRGBA8Unorm_sRGB; - - if (!pixelFormatSupport.m_supportsPacked16BitFormats) + switch (pixelFormat) { - switch (pixelFormat) - { - case MTL::PixelFormatB5G6R5Unorm: - case MTL::PixelFormatA1BGR5Unorm: - case MTL::PixelFormatABGR4Unorm: - case MTL::PixelFormatBGR5A1Unorm: - pixelFormat = MTL::PixelFormatRGBA8Unorm; - break; - default: - break; - } + case MTL::PixelFormatR8Unorm_sRGB: + if (!pixelFormatSupport.m_supportsR8Unorm_sRGB) + return MTL::PixelFormatRGBA8Unorm_sRGB; + break; + case MTL::PixelFormatRG8Unorm_sRGB: + if (!pixelFormatSupport.m_supportsRG8Unorm_sRGB) + return MTL::PixelFormatRGBA8Unorm_sRGB; + break; + case MTL::PixelFormatB5G6R5Unorm: + case MTL::PixelFormatA1BGR5Unorm: + case MTL::PixelFormatABGR4Unorm: + case MTL::PixelFormatBGR5A1Unorm: + if (!pixelFormatSupport.m_supportsPacked16BitFormats) + return MTL::PixelFormatRGBA8Unorm; + break; + case 
MTL::PixelFormatDepth24Unorm_Stencil8: + if (!pixelFormatSupport.m_supportsDepth24Unorm_Stencil8) + return MTL::PixelFormatDepth32Float_Stencil8; + break; + default: + break; } - if (!pixelFormatSupport.m_supportsDepth24Unorm_Stencil8 && pixelFormat == MTL::PixelFormatDepth24Unorm_Stencil8) - pixelFormat = MTL::PixelFormatDepth32Float_Stencil8; - return pixelFormat; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index ede0bed68..8a6daa928 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -18,7 +18,7 @@ struct MetalPixelFormatSupport m_supportsR8Unorm_sRGB = device->supportsFamily(MTL::GPUFamilyApple1); m_supportsRG8Unorm_sRGB = device->supportsFamily(MTL::GPUFamilyApple1); m_supportsPacked16BitFormats = device->supportsFamily(MTL::GPUFamilyApple1); - m_supportsDepth24Unorm_Stencil8 = device->supportsFamily(MTL::GPUFamilyMac2); + m_supportsDepth24Unorm_Stencil8 = device->depth24Stencil8PixelFormatSupported(); } }; From 548ffb6b575caa6318b8f9296db3700360f5dd35 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 6 Sep 2024 19:15:56 +0200 Subject: [PATCH 157/368] add: todo notices --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 1 + src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index eebacd45f..86c44efee 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -78,6 +78,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM auto pixelFormat = GetMtlPixelFormat(format, isDepth, m_mtlr->GetPixelFormatSupport()); desc->setPixelFormat(pixelFormat); + // TODO: using MTL::TextureUsageShaderWrite as well fixes Mario Tennis: Ultra Smash, investigate why MTL::TextureUsage usage = 
MTL::TextureUsageShaderRead; if (!Latte::IsCompressedFormat(format)) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index c40fbabb2..545295dc1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -859,6 +859,7 @@ void MetalRenderer::draw_beginSequence() return; // no render target } + // TODO: not checking for !streamoutEnable fixes Super Smash Bros. for Wii U, investigate why if (!hasValidFramebufferAttached && !streamoutEnable) { debug_printf("Drawcall with no color buffer or depth buffer attached\n"); From 3dc233fb56061b0468a1fb25248f53432a0bc0a4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 7 Sep 2024 11:00:10 +0200 Subject: [PATCH 158/368] support rasterization kill --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 23 +++++- .../LatteDecompilerEmitMSL.cpp | 53 +++++++++++--- .../LatteDecompilerEmitMSLHeader.hpp | 8 +-- .../Renderer/Metal/MetalPipelineCache.cpp | 70 +++++++++++-------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 50 +++++-------- 5 files changed, 124 insertions(+), 80 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 2bc719cdd..c530dc43c 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -501,8 +501,27 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, uint64 vsHash2 = 0; _calculateShaderProgramHash(vsProgramCode, vertexShaderSize, &hashCacheVS, &vsHash1, &vsHash2); uint64 vsHash = vsHash1 + vsHash2 + _activeFetchShader->key + _activePSImportTable.key + (usesGeometryShader ? 
0x1111ULL : 0ULL); - if (g_renderer->GetType() == RendererAPI::Metal && usesGeometryShader) - vsHash += _activeFetchShader->mtlShaderHashObject; + if (g_renderer->GetType() == RendererAPI::Metal) + { + if (usesGeometryShader) + vsHash += _activeFetchShader->mtlShaderHashObject; + + // Rasterization + bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // HACK + if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; + + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; + + if (rasterizationEnabled) + vsHash += 51ULL; + } uint32 tmp = LatteGPUState.contextNew.PA_CL_VTE_CNTL.getRawValue() ^ 0x43F; vsHash += tmp; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 137f8e87a..4c62c2444 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -20,6 +20,8 @@ #define _CRLF "\r\n" +static bool rasterizationEnabled; + void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext, StringBuf* src, LatteParsedFetchShaderAttribute_t* attrib); /* @@ -3108,6 +3110,9 @@ static void _emitExportGPRReadCode(LatteDecompilerShaderContext* shaderContext, static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) { + if (!rasterizationEnabled) + return; + StringBuf* src = shaderContext->shaderSource; src->add("// export" _CRLF); if(shaderContext->shaderType == LatteConst::ShaderType::Vertex ) @@ -3332,6 +3337,9 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La return; } + if 
(!rasterizationEnabled) + return; + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) { if (cfInstruction->memWriteElemSize != 3) @@ -3861,6 +3869,23 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, { bool isRectVertexShader = (static_cast(shaderContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]) == LattePrimitiveMode::RECTS); + // Rasterization + rasterizationEnabled = true; + if (shader->shaderType == LatteConst::ShaderType::Vertex) + { + rasterizationEnabled = !shaderContext->contextRegistersNew->PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // HACK + if (!shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; + + const auto& polygonControlReg = shaderContext->contextRegistersNew->PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; + } + StringBuf* src = new StringBuf(1024*1024*12); // reserve 12MB for generated source (we resize-to-fit at the end) shaderContext->shaderSource = src; @@ -3874,7 +3899,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->add("#include " _CRLF); src->add("using namespace metal;" _CRLF); // header part (definitions for inputs and outputs) - LatteDecompiler::emitHeader(shaderContext, isRectVertexShader); + LatteDecompiler::emitHeader(shaderContext, isRectVertexShader, rasterizationEnabled); // helper functions LatteDecompiler_emitHelperFunctions(shaderContext, src); const char* functionType = ""; @@ -4010,7 +4035,10 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, else { functionType = "vertex"; - outputTypeName = "VertexOut"; + if (rasterizationEnabled) + outputTypeName = "VertexOut"; + else + outputTypeName = "void"; } break; case LatteConst::ShaderType::Geometry: @@ -4048,7 +4076,8 @@ void 
LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } else { - src->addFmt("{} out;" _CRLF, outputTypeName); + if (rasterizationEnabled) + src->addFmt("{} out;" _CRLF, outputTypeName); } // variable definition if (shaderContext->typeTracker.useArrayGPRs == false) @@ -4285,9 +4314,9 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, //if(shader->shaderType == LatteConst::ShaderType::Geometry) // src->add("EndPrimitive();" _CRLF); // vertex shader should write renderstate point size at the end if required but not modified by shader - if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) + if (shaderContext->analyzer.outputPointSize && !shaderContext->analyzer.writesPointSize) { - if (shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader) + if (shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader && rasterizationEnabled) src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); } @@ -4325,13 +4354,15 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } } - // TODO: this should be handled outside of the shader, because clipping currently wouldn't work (or would it?) 
- if ((shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader) || shader->shaderType == LatteConst::ShaderType::Geometry) - src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); + if (rasterizationEnabled) + { + if (shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader) + src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); - // Return - if (!(shaderContext->options->usesGeometryShader || isRectVertexShader) || shader->shaderType == LatteConst::ShaderType::Pixel) - src->add("return out;" _CRLF); + // Return + if (!(shaderContext->options->usesGeometryShader || isRectVertexShader) || shader->shaderType == LatteConst::ShaderType::Pixel) + src->add("return out;" _CRLF); + } // end of shader main src->add("}" _CRLF); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 412c99922..1342a2776 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -262,7 +262,7 @@ namespace LatteDecompiler src->add("};" _CRLF _CRLF); } - static void _emitInputsAndOutputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) + static void _emitInputsAndOutputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool rasterizationEnabled) { auto src = decompilerContext->shaderSource; @@ -300,7 +300,7 @@ namespace LatteDecompiler if (!decompilerContext->options->usesGeometryShader) { - if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && rasterizationEnabled) _emitVSOutputs(decompilerContext, isRectVertexShader); } else @@ -351,7 +351,7 @@ namespace LatteDecompiler } } - static void 
emitHeader(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) + static void emitHeader(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool rasterizationEnabled) { auto src = decompilerContext->shaderSource; @@ -410,7 +410,7 @@ namespace LatteDecompiler // uniform buffers _emitUniformBuffers(decompilerContext); // inputs and outputs - _emitInputsAndOutputs(decompilerContext, isRectVertexShader); + _emitInputsAndOutputs(decompilerContext, isRectVertexShader, rasterizationEnabled); if (dump_shaders_enabled) decompilerContext->shaderSource->add("// end of shader inputs/outputs" _CRLF); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 8273ec162..931b61491 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -9,6 +9,7 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cemu/Logging/CemuLogging.h" #include "config/ActiveSettings.h" static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) @@ -188,8 +189,31 @@ extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; template -void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteDecompilerShader* pixelShader, const LatteContextRegister& lcr) { + // Rasterization + bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // HACK + // TODO: include this in the hash? 
+ if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; + + // Culling both front and back faces effectively disables rasterization + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; + + desc->setRasterizationEnabled(rasterizationEnabled); + + if (rasterizationEnabled) + { + auto pixelShaderMtl = static_cast(pixelShader->shader); + desc->setFragmentFunction(pixelShaderMtl->GetFunction()); + } + // Color attachments const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); @@ -284,7 +308,7 @@ MetalPipelineCache::~MetalPipelineCache() m_binaryArchive->serializeToURL(m_binaryArchiveURL, &error); if (error) { - debug_printf("failed to serialize binary archive: %s\n", error->localizedDescription()->utf8String()); + cemuLog_log(LogType::Force, "failed to serialize binary archive: {}", error->localizedDescription()->utf8String()); error->release(); } m_binaryArchive->release(); @@ -362,17 +386,15 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte layout->setStride(bufferStride); } - auto mtlVertexShader = static_cast(vertexShader->shader); - auto mtlPixelShader = static_cast(pixelShader->shader); + auto vertexShaderMtl = static_cast(vertexShader->shader); // Render pipeline state MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); - desc->setVertexFunction(mtlVertexShader->GetFunction()); - desc->setFragmentFunction(mtlPixelShader->GetFunction()); + desc->setVertexFunction(vertexShaderMtl->GetFunction()); // TODO: don't always set the vertex descriptor? 
desc->setVertexDescriptor(vertexDescriptor); - SetFragmentState(desc, lastUsedFBO, activeFBO, lcr); + SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr); TryLoadBinaryArchive(); @@ -391,9 +413,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte #endif pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionFailOnBinaryArchiveMiss, nullptr, &error); - //static uint32 oldPipelineCount = 0; - //static uint32 newPipelineCount = 0; - // Pipeline wasn't found in the binary archive, we need to compile it if (error) { @@ -407,7 +426,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error); if (error) { - debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); + cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); error->release(); } else @@ -419,19 +438,12 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte m_binaryArchive->addRenderPipelineFunctions(desc, &error); if (error) { - debug_printf("error saving render pipeline functions: %s\n", error->localizedDescription()->utf8String()); + cemuLog_log(LogType::Force, "error saving render pipeline functions: {}", error->localizedDescription()->utf8String()); error->release(); } } } - - //newPipelineCount++; } - //else - //{ - // oldPipelineCount++; - //} - //debug_printf("%u pipelines were found in the binary archive, %u new were created\n", oldPipelineCount, newPipelineCount); desc->release(); vertexDescriptor->release(); @@ -452,26 +464,24 @@ MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFe if (pipeline) return pipeline; - auto mtlObjectShader = static_cast(vertexShader->shader); - RendererShaderMtl* mtlMeshShader; + auto objectShaderMtl = static_cast(vertexShader->shader); + 
RendererShaderMtl* meshShaderMtl; if (geometryShader) { - mtlMeshShader = static_cast(geometryShader->shader); + meshShaderMtl = static_cast(geometryShader->shader); } else { // If there is no geometry shader, it means that we are emulating rects - mtlMeshShader = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); + meshShaderMtl = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); } - auto mtlPixelShader = static_cast(pixelShader->shader); // Render pipeline state MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); - desc->setObjectFunction(mtlObjectShader->GetFunction()); - desc->setMeshFunction(mtlMeshShader->GetFunction()); - desc->setFragmentFunction(mtlPixelShader->GetFunction()); + desc->setObjectFunction(objectShaderMtl->GetFunction()); + desc->setMeshFunction(meshShaderMtl->GetFunction()); - SetFragmentState(desc, lastUsedFBO, activeFBO, lcr); + SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr); TryLoadBinaryArchive(); @@ -486,7 +496,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFe desc->release(); if (error) { - debug_printf("error creating render pipeline state: %s\n", error->localizedDescription()->utf8String()); + cemuLog_log(LogType::Force, "error creating mesh render pipeline state: {}", error->localizedDescription()->utf8String()); error->release(); } @@ -594,7 +604,7 @@ void MetalPipelineCache::TryLoadBinaryArchive() m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error); if (error) { - debug_printf("failed to create binary archive: %s\n", error->localizedDescription()->utf8String()); + cemuLog_log(LogType::Force, "failed to create binary archive: {}", error->localizedDescription()->utf8String()); error->release(); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 545295dc1..3c3ed1066 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -879,7 +879,7 @@ void MetalRenderer::draw_beginSequence() LatteRenderTarget_updateScissorBox(); // check for conditions which would turn the drawcalls into no-ops - bool rasterizerEnable = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL() == false; + bool rasterizerEnable = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); // GX2SetSpecialState(0, true) enables DX_RASTERIZATION_KILL, but still expects depth writes to happen? -> Research which stages are disabled by DX_RASTERIZATION_KILL exactly // for now we use a workaround: @@ -888,18 +888,6 @@ void MetalRenderer::draw_beginSequence() if (!rasterizerEnable && !streamoutEnable) m_state.m_skipDrawSequence = true; - - // Both faces are culled - // TODO: can we really skip the draw? - const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; - uint32 cullFront = polygonControlReg.get_CULL_FRONT(); - uint32 cullBack = polygonControlReg.get_CULL_BACK(); - if (cullFront && cullBack) - m_state.m_skipDrawSequence = true; - - // TODO: is this even needed? 
- if (!m_state.m_activeFBO) - m_state.m_skipDrawSequence = true; } void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) @@ -1065,23 +1053,23 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Cull mode - // Handled in draw_beginSequence - if (cullFront && cullBack) - cemu_assert_suspicious(); - - MTL::CullMode cullMode; - if (cullFront) - cullMode = MTL::CullModeFront; - else if (cullBack) - cullMode = MTL::CullModeBack; - else - cullMode = MTL::CullModeNone; + // Cull front and back is handled by disabling rasterization + if (!(cullFront && cullBack)) + { + MTL::CullMode cullMode; + if (cullFront) + cullMode = MTL::CullModeFront; + else if (cullBack) + cullMode = MTL::CullModeBack; + else + cullMode = MTL::CullModeNone; - if (cullMode != encoderState.m_cullMode) - { - renderCommandEncoder->setCullMode(cullMode); - encoderState.m_cullMode = cullMode; - } + if (cullMode != encoderState.m_cullMode) + { + renderCommandEncoder->setCullMode(cullMode); + encoderState.m_cullMode = cullMode; + } + } // Front face MTL::Winding frontFaceWinding; @@ -1164,12 +1152,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 else renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, m_state.m_activeFBO, LatteGPUState.contextNew); - // HACK if (!renderPipelineState) - { - printf("invalid render pipeline state, skipping draw\n"); return; - } if (renderPipelineState != encoderState.m_renderPipelineState) { From cf5602466cecdc46129347c5bf37f3d4e2d168ce Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 7 Sep 2024 11:05:13 +0200 Subject: [PATCH 159/368] optimize rasterization kill --- .../HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 931b61491..a3b826158 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -206,14 +206,15 @@ void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFB if (cullFront && cullBack) rasterizationEnabled = false; - desc->setRasterizationEnabled(rasterizationEnabled); - - if (rasterizationEnabled) + if (!rasterizationEnabled) { - auto pixelShaderMtl = static_cast(pixelShader->shader); - desc->setFragmentFunction(pixelShaderMtl->GetFunction()); + desc->setRasterizationEnabled(false); + return; } + auto pixelShaderMtl = static_cast(pixelShader->shader); + desc->setFragmentFunction(pixelShaderMtl->GetFunction()); + // Color attachments const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); From 84049814f2631b74b948727a3dd26a032c43870a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 7 Sep 2024 13:23:38 +0200 Subject: [PATCH 160/368] make binary archives device and os version specific --- .../Renderer/Metal/MetalPipelineCache.cpp | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index a3b826158..aa0339760 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -309,7 +309,7 @@ MetalPipelineCache::~MetalPipelineCache() m_binaryArchive->serializeToURL(m_binaryArchiveURL, &error); if (error) { - cemuLog_log(LogType::Force, "failed to serialize binary archive: {}", error->localizedDescription()->utf8String()); + cemuLog_log(LogType::Force, "error serializing binary archive: {}", error->localizedDescription()->utf8String()); error->release(); } 
m_binaryArchive->release(); @@ -587,8 +587,28 @@ void MetalPipelineCache::TryLoadBinaryArchive() if (m_binaryArchive || s_cacheTitleId == INVALID_TITLE_ID) return; + // GPU name + const char* deviceName1 = m_mtlr->GetDevice()->name()->utf8String(); + std::string deviceName; + deviceName.assign(deviceName1); + + // Replace spaces with underscores + for (auto& c : deviceName) + { + if (c == ' ') + c = '_'; + } + + // OS version + auto osVersion = NS::ProcessInfo::processInfo()->operatingSystemVersion(); + + // Precompiled binaries cannot be shared between different devices or OS versions const std::string cacheFilename = fmt::format("{:016x}_mtl_pipelines.bin", s_cacheTitleId); - const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}", cacheFilename); + const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}/{}-{}-{}/{}", deviceName, osVersion.majorVersion, osVersion.minorVersion, osVersion.patchVersion, cacheFilename); + + // Create the directory if it doesn't exist + std::filesystem::create_directories(cachePath.parent_path()); + m_binaryArchiveURL = NS::URL::fileURLWithPath(ToNSString((const char*)cachePath.generic_u8string().c_str())); MTL::BinaryArchiveDescriptor* desc = MTL::BinaryArchiveDescriptor::alloc()->init(); From e8f726ecd7b8b29e9bfa8e21d8c48a054457bcfb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 7 Sep 2024 13:42:31 +0200 Subject: [PATCH 161/368] use lcr --- src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index aa0339760..ee855135b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -192,15 +192,15 @@ template void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteDecompilerShader* 
pixelShader, const LatteContextRegister& lcr) { // Rasterization - bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + bool rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); // HACK // TODO: include this in the hash? - if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + if (!lcr.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) rasterizationEnabled = true; // Culling both front and back faces effectively disables rasterization - const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + const auto& polygonControlReg = lcr.PA_SU_SC_MODE_CNTL; uint32 cullFront = polygonControlReg.get_CULL_FRONT(); uint32 cullBack = polygonControlReg.get_CULL_BACK(); if (cullFront && cullBack) From 395cd1cd1172158cb5689749b0ae32627565fbf9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 11 Sep 2024 10:55:10 +0200 Subject: [PATCH 162/368] handle rasterization kill for mesh shaders --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 30 +++++++++++-------- .../LatteDecompilerEmitMSL.cpp | 2 +- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index c530dc43c..0af3b577b 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -504,23 +504,27 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, if (g_renderer->GetType() == RendererAPI::Metal) { if (usesGeometryShader) + { vsHash += _activeFetchShader->mtlShaderHashObject; + } + else + { + // Rasterization + bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - // Rasterization - bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - - // HACK - if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) - rasterizationEnabled = true; + // HACK + if 
(!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; - const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; - uint32 cullFront = polygonControlReg.get_CULL_FRONT(); - uint32 cullBack = polygonControlReg.get_CULL_BACK(); - if (cullFront && cullBack) - rasterizationEnabled = false; + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; - if (rasterizationEnabled) - vsHash += 51ULL; + if (rasterizationEnabled) + vsHash += 51ULL; + } } uint32 tmp = LatteGPUState.contextNew.PA_CL_VTE_CNTL.getRawValue() ^ 0x43F; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 4c62c2444..71e3f0df4 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3871,7 +3871,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // Rasterization rasterizationEnabled = true; - if (shader->shaderType == LatteConst::ShaderType::Vertex) + if (shader->shaderType == LatteConst::ShaderType::Vertex && !(shaderContext->options->usesGeometryShader || isRectVertexShader)) { rasterizationEnabled = !shaderContext->contextRegistersNew->PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); From 950f04d4446f6a5b58bb1c2170205b6f5e4c8283 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 11 Sep 2024 12:22:45 +0200 Subject: [PATCH 163/368] support instancing for mesh shaders --- .../HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h | 1 + .../LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 1 + .../LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 8 ++++---- .../LatteDecompilerEmitMSLHeader.hpp | 3 +++ 
src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 8 ++++++-- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 29e65c58d..2812facc7 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -64,6 +64,7 @@ struct LatteDecompilerShaderResourceMapping // attributes (vertex shader only) sint8 attributeMapping[LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS]; // Metal exclusive + sint8 verticesPerInstanceBinding{-1}; sint8 indexBufferBinding{-1}; sint8 indexTypeBinding{-1}; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index ec3d8aa7b..b5697d42e 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -1019,6 +1019,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD LatteDecompiler::_initTextureBindingPointsMTL(shaderContext); LatteDecompiler::_initUniformBindingPoints(shaderContext); LatteDecompiler::_initAttributeBindingPoints(shaderContext); + shaderContext->output->resourceMappingMTL.verticesPerInstanceBinding = shaderContext->currentBufferBindingPointMTL++; shaderContext->output->resourceMappingMTL.indexBufferBinding = shaderContext->currentBufferBindingPointMTL++; shaderContext->output->resourceMappingMTL.indexTypeBinding = shaderContext->currentBufferBindingPointMTL++; } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 71e3f0df4..3f022c617 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3920,8 +3920,8 
@@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // Index buffer inputFetchDefinition += "if (indexType == 1) // UShort\n"; inputFetchDefinition += "vid = ((device ushort*)indexBuffer)[vid];\n"; - inputFetchDefinition += "else if (indexType == 2)\n"; - inputFetchDefinition += "vid = ((device uint*)indexBuffer)[vid]; // UInt\n"; + inputFetchDefinition += "else if (indexType == 2) // UInt\n"; + inputFetchDefinition += "vid = ((device uint*)indexBuffer)[vid];\n"; inputFetchDefinition += "VertexIn in;\n"; for (auto& bufferGroup : shaderContext->fetchShader->bufferGroups) @@ -4060,8 +4060,8 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, { // Calculate the imaginary vertex id src->add("uint vid = tig * VERTICES_PER_VERTEX_PRIMITIVE + tid;" _CRLF); - // TODO: don't hardcode the instance index - src->add("uint iid = 0;" _CRLF); + src->add("uint iid = vid / verticesPerInstance;" _CRLF); + src->add("vid %= verticesPerInstance;" _CRLF); // Fetch the input src->add("VertexIn in = fetchVertex(vid, indexBuffer, indexType VERTEX_BUFFERS);" _CRLF); // Output is defined as object payload diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 1342a2776..a7121f52d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -497,8 +497,11 @@ namespace LatteDecompiler src->add(", mesh_grid_properties meshGridProperties"); src->add(", uint tig [[threadgroup_position_in_grid]]"); src->add(", uint tid [[thread_index_in_threadgroup]]"); + // TODO: put into the support buffer? 
+ src->addFmt(", constant uint& verticesPerInstance [[buffer({})]]", decompilerContext->output->resourceMappingMTL.verticesPerInstanceBinding); // TODO: inly include index buffer if needed src->addFmt(", device uint* indexBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexBufferBinding); + // TODO: put into the support buffer? // TODO: use uchar? src->addFmt(", constant uint& indexType [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexTypeBinding); src->add(" VERTEX_BUFFER_DEFINITIONS"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 3c3ed1066..46bcf6dcb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1183,10 +1183,14 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } if (usesGeometryShader) { + uint32 verticesPerInstance = count / instanceCount; + // TODO: make a helper function for this + renderCommandEncoder->setObjectBytes(&verticesPerInstance, sizeof(verticesPerInstance), vertexShader->resourceMapping.verticesPerInstanceBinding); + encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.verticesPerInstanceBinding] = {nullptr}; if (indexBuffer) SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexBuffer, indexBufferOffset, vertexShader->resourceMapping.indexBufferBinding); renderCommandEncoder->setObjectBytes(&hostIndexType, sizeof(hostIndexType), vertexShader->resourceMapping.indexTypeBinding); - encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.indexTypeBinding] = {nullptr}; + encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.indexTypeBinding] = {nullptr}; uint32 verticesPerPrimitive = 0; switch (primitiveMode) @@ -1206,7 +1210,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 break; } - 
renderCommandEncoder->drawMeshThreadgroups(MTL::Size(count / verticesPerPrimitive, 1, 1), MTL::Size(verticesPerPrimitive, 1, 1), MTL::Size(1, 1, 1)); + renderCommandEncoder->drawMeshThreadgroups(MTL::Size(count * instanceCount / verticesPerPrimitive, 1, 1), MTL::Size(verticesPerPrimitive, 1, 1), MTL::Size(1, 1, 1)); } else { From e7f8f0ee4cff283a2e4ee5308d29fee6c73023f0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 11 Sep 2024 12:28:35 +0200 Subject: [PATCH 164/368] simplify uniform names --- .../LatteDecompilerEmitMSL.cpp | 18 ++---------------- .../LatteDecompilerEmitMSLHeader.hpp | 16 ++-------------- 2 files changed, 4 insertions(+), 30 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 3f022c617..58b2615be 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -631,14 +631,7 @@ static void _emitUniformAccessCode(LatteDecompilerShaderContext* shaderContext, } cemu_assert_debug(remappedUniformEntry); _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); - if(shaderContext->shader->shaderType == LatteConst::ShaderType::Vertex ) - src->addFmt("supportBuffer.remappedVS[{}]", remappedUniformEntry->mappedIndex); - else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Pixel ) - src->addFmt("supportBuffer.remappedPS[{}]", remappedUniformEntry->mappedIndex); - else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Geometry ) - src->addFmt("supportBuffer.remappedGS[{}]", remappedUniformEntry->mappedIndex); - else - debugBreakpoint(); + src->addFmt("supportBuffer.remapped[{}]", remappedUniformEntry->mappedIndex); _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); 
} @@ -646,14 +639,7 @@ static void _emitUniformAccessCode(LatteDecompilerShaderContext* shaderContext, { // uniform registers are accessed with unpredictable (dynamic) offset _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); - if(shaderContext->shader->shaderType == LatteConst::ShaderType::Vertex ) - src->add("supportBuffer.uniformRegisterVS["); - else if (shaderContext->shader->shaderType == LatteConst::ShaderType::Pixel) - src->add("supportBuffer.uniformRegisterPS["); - else if(shaderContext->shader->shaderType == LatteConst::ShaderType::Geometry ) - src->add("supportBuffer.uniformRegisterGS["); - else - debugBreakpoint(); + src->add("supportBuffer.uniformRegister["); _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); src->add("]"); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index a7121f52d..37a509c1c 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -20,14 +20,7 @@ namespace LatteDecompiler { // uniform registers or buffers are accessed statically with predictable offsets // this allows us to remap the used entries into a more compact array - if (shaderType == LatteConst::ShaderType::Vertex) - src->addFmt("int4 remappedVS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); - else if (shaderType == LatteConst::ShaderType::Pixel) - src->addFmt("int4 remappedPS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); - else if (shaderType == LatteConst::ShaderType::Geometry) - src->addFmt("int4 remappedGS[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); - else - debugBreakpoint(); + src->addFmt("int4 remapped[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); uniformOffsets.offset_remapped = 
uniformCurrentOffset; uniformCurrentOffset += 16 * shader->list_remappedUniformEntries.size(); } @@ -35,12 +28,7 @@ namespace LatteDecompiler { uint32 cfileSize = decompilerContext->analyzer.uniformRegisterAccessTracker.DetermineSize(decompilerContext->shaderBaseHash, 256); // full or partial uniform register file has to be present - if (shaderType == LatteConst::ShaderType::Vertex) - src->addFmt("int4 uniformRegisterVS[{}];" _CRLF, cfileSize); - else if (shaderType == LatteConst::ShaderType::Pixel) - src->addFmt("int4 uniformRegisterPS[{}];" _CRLF, cfileSize); - else if (shaderType == LatteConst::ShaderType::Geometry) - src->addFmt("int4 uniformRegisterGS[{}];" _CRLF, cfileSize); + src->addFmt("int4 uniformRegister[{}];" _CRLF, cfileSize); uniformOffsets.offset_uniformRegister = uniformCurrentOffset; uniformOffsets.count_uniformRegister = cfileSize; uniformCurrentOffset += 16 * cfileSize; From a328c5e7538fca0673358ce8609aa1c8f5b9e38d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 11 Sep 2024 12:43:15 +0200 Subject: [PATCH 165/368] use uchar for index type --- .../Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 2 +- .../LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp | 3 +-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 5 ++++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 58b2615be..615c8a5f1 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3901,7 +3901,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, // Vertex buffers std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; std::string vertexBuffers = "#define VERTEX_BUFFERS "; - std::string inputFetchDefinition = "VertexIn fetchVertex(thread uint& vid, device uint* 
indexBuffer, uint indexType VERTEX_BUFFER_DEFINITIONS) {\n"; + std::string inputFetchDefinition = "VertexIn fetchVertex(thread uint& vid, device uint* indexBuffer, uchar indexType VERTEX_BUFFER_DEFINITIONS) {\n"; // Index buffer inputFetchDefinition += "if (indexType == 1) // UShort\n"; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 37a509c1c..9770c595d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -490,8 +490,7 @@ namespace LatteDecompiler // TODO: inly include index buffer if needed src->addFmt(", device uint* indexBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexBufferBinding); // TODO: put into the support buffer? - // TODO: use uchar? - src->addFmt(", constant uint& indexType [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexTypeBinding); + src->addFmt(", constant uchar& indexType [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexTypeBinding); src->add(" VERTEX_BUFFER_DEFINITIONS"); } else diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 46bcf6dcb..4ff1a3b03 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1187,9 +1187,12 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // TODO: make a helper function for this renderCommandEncoder->setObjectBytes(&verticesPerInstance, sizeof(verticesPerInstance), vertexShader->resourceMapping.verticesPerInstanceBinding); encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.verticesPerInstanceBinding] = {nullptr}; + if (indexBuffer) SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexBuffer, 
indexBufferOffset, vertexShader->resourceMapping.indexBufferBinding); - renderCommandEncoder->setObjectBytes(&hostIndexType, sizeof(hostIndexType), vertexShader->resourceMapping.indexTypeBinding); + + uint8 hostIndexTypeU8 = (uint8)hostIndexType; + renderCommandEncoder->setObjectBytes(&hostIndexTypeU8, sizeof(hostIndexTypeU8), vertexShader->resourceMapping.indexTypeBinding); encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.indexTypeBinding] = {nullptr}; uint32 verticesPerPrimitive = 0; From 4cce3699f35ac4472fa6abcf085cb76ec64f4c67 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 12 Sep 2024 08:05:27 +0200 Subject: [PATCH 166/368] put query object into a separate file --- src/Cafe/CMakeLists.txt | 2 + .../HW/Latte/Renderer/Metal/MetalQuery.cpp | 17 +++++++ src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h | 19 ++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 19 ++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.h | 47 ++----------------- 5 files changed, 61 insertions(+), 43 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 634014b7b..10c852703 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -567,6 +567,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalSamplerCache.h HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h + HW/Latte/Renderer/Metal/MetalQuery.cpp + HW/Latte/Renderer/Metal/MetalQuery.h HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h HW/Latte/Renderer/Metal/UtilityShaderSource.h ) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp new file mode 100644 index 000000000..40c73fd4f --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -0,0 +1,17 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" + +bool 
LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) +{ + cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::getResult: occlusion queries are not yet supported on Metal"); + return true; +} + +void LatteQueryObjectMtl::begin() +{ + cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::begin: occlusion queries are not yet supported on Metal"); +} + +void LatteQueryObjectMtl::end() +{ + cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::end: occlusion queries are not yet supported on Metal"); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h new file mode 100644 index 000000000..ea2be227e --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h @@ -0,0 +1,19 @@ +#pragma once + +#include "Cafe/HW/Latte/Core/LatteQueryObject.h" + +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +// HACK: Dummy occlusion query object +class LatteQueryObjectMtl : public LatteQueryObject +{ +public: + LatteQueryObjectMtl(class MetalRenderer* mtlRenderer) : m_mtlr{mtlRenderer} {} + + bool getResult(uint64& numSamplesPassed) override; + void begin() override; + void end() override; + +private: + class MetalRenderer* m_mtlr; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 4ff1a3b03..ffb8fb724 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -9,6 +9,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h" @@ -1282,6 +1283,24 @@ void MetalRenderer::indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offse */ } +LatteQueryObject* 
MetalRenderer::occlusionQuery_create() { + cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_create: Occlusion queries are not yet supported on Metal"); + + return new LatteQueryObjectMtl(this); +} + +void MetalRenderer::occlusionQuery_destroy(LatteQueryObject* queryObj) { + cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_destroy: occlusion queries are not yet supported on Metal"); +} + +void MetalRenderer::occlusionQuery_flush() { + cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_flush: occlusion queries are not yet supported on Metal"); +} + +void MetalRenderer::occlusionQuery_updateState() { + cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_updateState: occlusion queries are not yet supported on Metal"); +} + void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) { auto& boundBuffer = m_state.m_encoderState.m_buffers[shaderType][index]; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 25051a975..f00f814c3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -155,32 +155,6 @@ enum class MetalEncoderType Blit, }; -// HACK: Dummy occlusion query object for Metal -class LatteQueryObjectMtl : public LatteQueryObject -{ -public: - LatteQueryObjectMtl(class MetalRenderer* mtlRenderer) : m_mtlr{mtlRenderer} {} - - bool getResult(uint64& numSamplesPassed) override - { - cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::getResult: occlusion queries are not yet supported on Metal"); - return true; - } - - void begin() override - { - cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::begin: occlusion queries are not yet supported on Metal"); - } - - void end() override - { - cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::end: occlusion queries are 
not yet supported on Metal"); - } - -private: - class MetalRenderer* m_mtlr; -}; - class MetalRenderer : public Renderer { public: @@ -296,23 +270,10 @@ class MetalRenderer : public Renderer void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override; // occlusion queries - LatteQueryObject* occlusionQuery_create() override { - cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_create: Occlusion queries are not yet supported on Metal"); - - return new LatteQueryObjectMtl(this); - } - - void occlusionQuery_destroy(LatteQueryObject* queryObj) override { - cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_destroy: occlusion queries are not yet supported on Metal"); - } - - void occlusionQuery_flush() override { - cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_flush: occlusion queries are not yet supported on Metal"); - } - - void occlusionQuery_updateState() override { - cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_updateState: occlusion queries are not yet supported on Metal"); - } + LatteQueryObject* occlusionQuery_create() override; + void occlusionQuery_destroy(LatteQueryObject* queryObj) override; + void occlusionQuery_flush() override; + void occlusionQuery_updateState() override; // Helpers MetalPerformanceMonitor& GetPerformanceMonitor() { return m_performanceMonitor; } From eb7c10e89f27d04b5f8a13886ee3c64244a08bfd Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 13 Sep 2024 10:41:03 +0200 Subject: [PATCH 167/368] implement occlusion queries --- .../HW/Latte/Renderer/Metal/CachedFBOMtl.cpp | 9 ++-- .../HW/Latte/Renderer/Metal/CachedFBOMtl.h | 7 +--- .../HW/Latte/Renderer/Metal/MetalQuery.cpp | 14 +++++-- src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h | 5 ++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 34 ++++++++++++--- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 41 +++++++++++++++++++ 6 files changed, 91 insertions(+), 19 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp index a9e673f62..85adbfb97 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp @@ -1,9 +1,9 @@ #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" -#include "HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "Metal/MTLRenderPass.hpp" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -void CachedFBOMtl::CreateRenderPass() +CachedFBOMtl::CachedFBOMtl(class MetalRenderer* metalRenderer, uint64 key) : LatteCachedFBO(key) { m_renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); @@ -39,6 +39,9 @@ void CachedFBOMtl::CreateRenderPass() stencilAttachment->setStoreAction(MTL::StoreActionStore); } } + + // Visibility buffer + m_renderPassDescriptor->setVisibilityResultBuffer(metalRenderer->GetOcclusionQueryResultBuffer()); } CachedFBOMtl::~CachedFBOMtl() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h index 0d926e7ed..f1221eb22 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h @@ -8,10 +8,7 @@ class CachedFBOMtl : public LatteCachedFBO { public: - CachedFBOMtl(uint64 key) : LatteCachedFBO(key) - { - CreateRenderPass(); - } + CachedFBOMtl(class MetalRenderer* metalRenderer, uint64 key); ~CachedFBOMtl(); @@ -22,6 +19,4 @@ class CachedFBOMtl : public LatteCachedFBO private: MTL::RenderPassDescriptor* m_renderPassDescriptor = nullptr; - - void CreateRenderPass(); }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp index 40c73fd4f..c27a5620e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -1,17 +1,25 @@ #include 
"Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "HW/Latte/Renderer/Metal/MetalCommon.h" bool LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) { - cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::getResult: occlusion queries are not yet supported on Metal"); + if (!m_mtlr->CommandBufferCompleted(m_commandBuffer)) + return false; + + numSamplesPassed = m_mtlr->GetOcclusionQueryResultsPtr()[m_queryIndex]; + return true; } void LatteQueryObjectMtl::begin() { - cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::begin: occlusion queries are not yet supported on Metal"); + m_queryIndex = m_mtlr->GetAvailableOcclusionQueryIndex(); + m_mtlr->SetActiveOcclusionQueryIndex(m_queryIndex); } void LatteQueryObjectMtl::end() { - cemuLog_log(LogType::MetalLogging, "LatteQueryObjectMtl::end: occlusion queries are not yet supported on Metal"); + m_mtlr->SetActiveOcclusionQueryIndex(INVALID_UINT32); + // TODO: request soon submit of the command buffer } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h index ea2be227e..8fa534974 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h @@ -4,7 +4,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" -// HACK: Dummy occlusion query object class LatteQueryObjectMtl : public LatteQueryObject { public: @@ -16,4 +15,8 @@ class LatteQueryObjectMtl : public LatteQueryObject private: class MetalRenderer* m_mtlr; + + uint32 m_queryIndex; + MTL::CommandBuffer* m_commandBuffer; + uint64 m_acccumulatedSum; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ffb8fb724..045ca77ed 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -21,12 +21,14 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include 
"HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" +#include "Metal/MTLRenderCommandEncoder.hpp" #define IMGUI_IMPL_METAL_CPP #include "imgui/imgui_extension.h" #include "imgui/imgui_impl_metal.h" #define COMMIT_TRESHOLD 256 +#define OCCLUSION_QUERY_POOL_SIZE 1024 extern bool hasValidFramebufferAttached; @@ -94,6 +96,17 @@ MetalRenderer::MetalRenderer() m_xfbRingBuffer->setLabel(GetLabel("Transform feedback buffer", m_xfbRingBuffer)); #endif + // Occlusion queries + m_occlusionQuery.m_resultBuffer = m_device->newBuffer(OCCLUSION_QUERY_POOL_SIZE * sizeof(uint64), MTL::ResourceStorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + m_occlusionQuery.m_resultBuffer->setLabel(GetLabel("Occlusion query result buffer", m_occlusionQuery.m_resultBuffer)); +#endif + m_occlusionQuery.m_resultsPtr = (uint64*)m_occlusionQuery.m_resultBuffer->contents(); + + m_occlusionQuery.m_availableIndices.reserve(OCCLUSION_QUERY_POOL_SIZE); + for (uint32 i = 0; i < OCCLUSION_QUERY_POOL_SIZE; i++) + m_occlusionQuery.m_availableIndices.push_back(i); + // Initialize state for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) { @@ -467,7 +480,7 @@ void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, si LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) { - return new CachedFBOMtl(key); + return new CachedFBOMtl(this, key); } void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo) @@ -1042,6 +1055,14 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 encoderState.m_depthClipEnable = zClipEnable; } + // Visibility result mode + if (m_occlusionQuery.m_activeIndex != encoderState.m_visibilityResultOffset) + { + auto mode = (m_occlusionQuery.m_activeIndex == INVALID_UINT32 ? 
MTL::VisibilityResultModeDisabled : MTL::VisibilityResultModeCounting); + renderCommandEncoder->setVisibilityResultMode(mode, m_occlusionQuery.m_activeIndex); + encoderState.m_visibilityResultOffset = m_occlusionQuery.m_activeIndex; + } + // todo - how does culling behave with rects? // right now we just assume that their winding is always CW if (isPrimitiveRect) @@ -1284,21 +1305,20 @@ void MetalRenderer::indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offse } LatteQueryObject* MetalRenderer::occlusionQuery_create() { - cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_create: Occlusion queries are not yet supported on Metal"); - return new LatteQueryObjectMtl(this); } void MetalRenderer::occlusionQuery_destroy(LatteQueryObject* queryObj) { - cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_destroy: occlusion queries are not yet supported on Metal"); + // TODO: do something? } void MetalRenderer::occlusionQuery_flush() { - cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_flush: occlusion queries are not yet supported on Metal"); + // TODO: implement + debug_printf("Occlusion query flush is not implemented\n"); } void MetalRenderer::occlusionQuery_updateState() { - cemuLog_log(LogType::MetalLogging, "MetalRenderer::occlusionQuery_updateState: occlusion queries are not yet supported on Metal"); + // TODO } void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) @@ -1574,6 +1594,8 @@ void MetalRenderer::CommitCommandBuffer() m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(nullptr); + m_occlusionQuery.m_availableIndices.insert(m_occlusionQuery.m_availableIndices.end(), m_occlusionQuery.m_crntCmdBuffIndices.begin(), m_occlusionQuery.m_crntCmdBuffIndices.end()); + // Debug //m_commandQueue->insertDebugCaptureBoundary(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index f00f814c3..4f7376b67 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -99,6 +99,7 @@ struct MetalEncoderState uint32 m_depthSlope = 0; uint32 m_depthClamp = 0; bool m_depthClipEnable = true; + uint32 m_visibilityResultOffset = INVALID_UINT32; struct { MTL::Buffer* m_buffer; size_t m_offset; @@ -376,6 +377,36 @@ class MetalRenderer : public Renderer return m_readbackBuffer; } + MTL::Buffer* GetOcclusionQueryResultBuffer() const + { + return m_occlusionQuery.m_resultBuffer; + } + + uint64* GetOcclusionQueryResultsPtr() + { + return m_occlusionQuery.m_resultsPtr; + } + + uint32 GetAvailableOcclusionQueryIndex() + { + if (m_occlusionQuery.m_availableIndices.empty()) + { + cemuLog_log(LogType::Force, "No occlusion query index available"); + return 0; + } + + uint32 queryIndex = m_occlusionQuery.m_availableIndices.back(); + m_occlusionQuery.m_availableIndices.pop_back(); + m_occlusionQuery.m_crntCmdBuffIndices.push_back(queryIndex); + + return queryIndex; + } + + void SetActiveOcclusionQueryIndex(uint32 queryIndex) + { + m_occlusionQuery.m_activeIndex = queryIndex; + } + private: MetalLayerHandle m_mainLayer; MetalLayerHandle m_padLayer; @@ -423,6 +454,16 @@ class MetalRenderer : public Renderer // Transform feedback MTL::Buffer* m_xfbRingBuffer; + // Occlusion queries + struct + { + MTL::Buffer* m_resultBuffer; + uint64* m_resultsPtr; + std::vector m_availableIndices; + std::vector m_crntCmdBuffIndices; + uint32 m_activeIndex = INVALID_UINT32; + } m_occlusionQuery; + // Active objects std::vector m_commandBuffers; MetalEncoderType m_encoderType = MetalEncoderType::None; From fd16488e4c238e83a5faf52fc2f3ed1ebca365d7 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 13 Sep 2024 11:19:56 +0200 Subject: [PATCH 168/368] improve command buffer sync --- .../Metal/LatteTextureReadbackMtl.cpp | 4 ++-- .../Renderer/Metal/MetalBufferAllocator.h | 2 +- 
.../HW/Latte/Renderer/Metal/MetalCommon.h | 6 +++++ .../HW/Latte/Renderer/Metal/MetalQuery.cpp | 13 +++++++++-- src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h | 5 ++-- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 23 ++++--------------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 9 ++++---- 7 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index f2c03709a..d7adc25ca 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -38,12 +38,12 @@ bool LatteTextureReadbackInfoMtl::IsFinished() if (m_mtlr->GetCurrentCommandBuffer() == m_commandBuffer) m_mtlr->CommitCommandBuffer(); - return m_mtlr->CommandBufferCompleted(m_commandBuffer); + return CommandBufferCompleted(m_commandBuffer); } void LatteTextureReadbackInfoMtl::ForceFinish() { - m_mtlr->WaitForCommandBufferCompletion(m_commandBuffer); + m_commandBuffer->waitUntilCompleted(); } uint8* LatteTextureReadbackInfoMtl::GetData() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index b8a3c7607..198d9978d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -255,7 +255,7 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorCommandBufferCompleted(buffer.m_data.m_commandBuffers[j])) + if (CommandBufferCompleted(buffer.m_data.m_commandBuffers[j])) { if (buffer.m_data.m_commandBuffers.size() == 1) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 8a6daa928..a1fe7f826 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -69,3 +69,9 @@ inline bool IsValidDepthTextureType(Latte::E_DIM dim) { return (dim == 
Latte::E_DIM::DIM_2D || dim == Latte::E_DIM::DIM_2D_MSAA || dim == Latte::E_DIM::DIM_2D_ARRAY || dim == Latte::E_DIM::DIM_2D_ARRAY_MSAA || dim == Latte::E_DIM::DIM_CUBEMAP); } + +inline bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer) +{ + auto status = commandBuffer->status(); + return (status == MTL::CommandBufferStatusCompleted || status == MTL::CommandBufferStatusError); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp index c27a5620e..0119209b3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -4,14 +4,21 @@ bool LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) { - if (!m_mtlr->CommandBufferCompleted(m_commandBuffer)) + if (!CommandBufferCompleted(m_commandBuffer)) return false; numSamplesPassed = m_mtlr->GetOcclusionQueryResultsPtr()[m_queryIndex]; + printf("Num samples: %llu\n", numSamplesPassed); return true; } +LatteQueryObjectMtl::~LatteQueryObjectMtl() +{ + if (m_queryIndex != INVALID_UINT32) + m_mtlr->ReleaseOcclusionQueryIndex(m_queryIndex); +} + void LatteQueryObjectMtl::begin() { m_queryIndex = m_mtlr->GetAvailableOcclusionQueryIndex(); @@ -21,5 +28,7 @@ void LatteQueryObjectMtl::begin() void LatteQueryObjectMtl::end() { m_mtlr->SetActiveOcclusionQueryIndex(INVALID_UINT32); - // TODO: request soon submit of the command buffer + m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); + // TODO: request soon submit instead? 
+ m_mtlr->CommitCommandBuffer(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h index 8fa534974..58b4e266a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h @@ -8,6 +8,7 @@ class LatteQueryObjectMtl : public LatteQueryObject { public: LatteQueryObjectMtl(class MetalRenderer* mtlRenderer) : m_mtlr{mtlRenderer} {} + ~LatteQueryObjectMtl(); bool getResult(uint64& numSamplesPassed) override; void begin() override; @@ -16,7 +17,7 @@ class LatteQueryObjectMtl : public LatteQueryObject private: class MetalRenderer* m_mtlr; - uint32 m_queryIndex; + uint32 m_queryIndex = INVALID_UINT32; + // TODO: make this a list of command buffers MTL::CommandBuffer* m_commandBuffer; - uint64 m_acccumulatedSum; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 045ca77ed..e4b25d552 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -334,7 +334,7 @@ void MetalRenderer::Flush(bool waitIdle) { cemu_assert_debug(commandBuffer.m_commited); - WaitForCommandBufferCompletion(commandBuffer.m_commandBuffer); + commandBuffer.m_commandBuffer->waitUntilCompleted(); } } } @@ -1059,7 +1059,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if (m_occlusionQuery.m_activeIndex != encoderState.m_visibilityResultOffset) { auto mode = (m_occlusionQuery.m_activeIndex == INVALID_UINT32 ? 
MTL::VisibilityResultModeDisabled : MTL::VisibilityResultModeCounting); - renderCommandEncoder->setVisibilityResultMode(mode, m_occlusionQuery.m_activeIndex); + renderCommandEncoder->setVisibilityResultMode(mode, m_occlusionQuery.m_activeIndex * sizeof(uint64)); encoderState.m_visibilityResultOffset = m_occlusionQuery.m_activeIndex; } @@ -1309,16 +1309,16 @@ LatteQueryObject* MetalRenderer::occlusionQuery_create() { } void MetalRenderer::occlusionQuery_destroy(LatteQueryObject* queryObj) { - // TODO: do something? + auto queryObjMtl = static_cast(queryObj); + delete queryObjMtl; } void MetalRenderer::occlusionQuery_flush() { // TODO: implement - debug_printf("Occlusion query flush is not implemented\n"); } void MetalRenderer::occlusionQuery_updateState() { - // TODO + // TODO: implement } void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) @@ -1420,17 +1420,6 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() } } -bool MetalRenderer::CommandBufferCompleted(MTL::CommandBuffer* commandBuffer) -{ - auto status = commandBuffer->status(); - return (status == MTL::CommandBufferStatusCompleted || status == MTL::CommandBufferStatusError); -} - -void MetalRenderer::WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer) -{ - commandBuffer->waitUntilCompleted(); -} - MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor) { EndEncoding(); @@ -1594,8 +1583,6 @@ void MetalRenderer::CommitCommandBuffer() m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(nullptr); - m_occlusionQuery.m_availableIndices.insert(m_occlusionQuery.m_availableIndices.end(), m_occlusionQuery.m_crntCmdBuffIndices.begin(), m_occlusionQuery.m_crntCmdBuffIndices.end()); - // Debug //m_commandQueue->insertDebugCaptureBoundary(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 4f7376b67..f46a4be35 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -323,8 +323,6 @@ class MetalRenderer : public Renderer void SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index); MTL::CommandBuffer* GetCommandBuffer(); - bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer); - void WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer); MTL::RenderCommandEncoder* GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor); MTL::RenderCommandEncoder* GetRenderCommandEncoder(bool forceRecreate = false); MTL::ComputeCommandEncoder* GetComputeCommandEncoder(); @@ -397,11 +395,15 @@ class MetalRenderer : public Renderer uint32 queryIndex = m_occlusionQuery.m_availableIndices.back(); m_occlusionQuery.m_availableIndices.pop_back(); - m_occlusionQuery.m_crntCmdBuffIndices.push_back(queryIndex); return queryIndex; } + void ReleaseOcclusionQueryIndex(uint32 queryIndex) + { + m_occlusionQuery.m_availableIndices.push_back(queryIndex); + } + void SetActiveOcclusionQueryIndex(uint32 queryIndex) { m_occlusionQuery.m_activeIndex = queryIndex; @@ -460,7 +462,6 @@ class MetalRenderer : public Renderer MTL::Buffer* m_resultBuffer; uint64* m_resultsPtr; std::vector m_availableIndices; - std::vector m_crntCmdBuffIndices; uint32 m_activeIndex = INVALID_UINT32; } m_occlusionQuery; From 934b1f8b55d85450fa6e156157f786b845486e91 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 13 Sep 2024 15:28:51 +0200 Subject: [PATCH 169/368] handle occlusion queries with no draws --- src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp | 17 ++++++++++++----- src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 5 +++++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp index 0119209b3..895bab224 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -4,12 +4,16 @@ bool LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) { + if (!m_commandBuffer) + { + numSamplesPassed = 0; + return true; + } + if (!CommandBufferCompleted(m_commandBuffer)) return false; numSamplesPassed = m_mtlr->GetOcclusionQueryResultsPtr()[m_queryIndex]; - printf("Num samples: %llu\n", numSamplesPassed); - return true; } @@ -28,7 +32,10 @@ void LatteQueryObjectMtl::begin() void LatteQueryObjectMtl::end() { m_mtlr->SetActiveOcclusionQueryIndex(INVALID_UINT32); - m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); - // TODO: request soon submit instead? - m_mtlr->CommitCommandBuffer(); + if (m_mtlr->IsCommandBufferActive()) + { + m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); + // TODO: request soon submit instead? 
+ m_mtlr->CommitCommandBuffer(); + } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h index 58b4e266a..554cdacec 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h @@ -19,5 +19,5 @@ class LatteQueryObjectMtl : public LatteQueryObject uint32 m_queryIndex = INVALID_UINT32; // TODO: make this a list of command buffers - MTL::CommandBuffer* m_commandBuffer; + MTL::CommandBuffer* m_commandBuffer = nullptr; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index f46a4be35..0c7c78f58 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -279,6 +279,11 @@ class MetalRenderer : public Renderer // Helpers MetalPerformanceMonitor& GetPerformanceMonitor() { return m_performanceMonitor; } + bool IsCommandBufferActive() const + { + return (m_commandBuffers.size() != 0); + } + MTL::CommandBuffer* GetCurrentCommandBuffer() { cemu_assert_debug(m_commandBuffers.size() != 0); From e89efed7434fe4029861a589d57f3521cd61100c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 13 Sep 2024 16:12:47 +0200 Subject: [PATCH 170/368] request soon commit instead of committing directly --- .../Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp | 1 + src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp | 4 ++-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 12 +++++++----- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 8 +++++++- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index d7adc25ca..dd2d8aebd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -25,6 +25,7 @@ void LatteTextureReadbackInfoMtl::StartTransfer() 
blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage); m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); + m_mtlr->RequestSoonCommit(); } bool LatteTextureReadbackInfoMtl::IsFinished() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp index 895bab224..ab24b4db8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -14,6 +14,7 @@ bool LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) return false; numSamplesPassed = m_mtlr->GetOcclusionQueryResultsPtr()[m_queryIndex]; + return true; } @@ -35,7 +36,6 @@ void LatteQueryObjectMtl::end() if (m_mtlr->IsCommandBufferActive()) { m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); - // TODO: request soon submit instead? - m_mtlr->CommitCommandBuffer(); + m_mtlr->RequestSoonCommit(); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index e4b25d552..a063a0b73 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -27,7 +27,7 @@ #include "imgui/imgui_extension.h" #include "imgui/imgui_impl_metal.h" -#define COMMIT_TRESHOLD 256 +#define DEFAULT_COMMIT_TRESHOLD 256 #define OCCLUSION_QUERY_POOL_SIZE 1024 extern bool hasValidFramebufferAttached; @@ -1270,7 +1270,8 @@ void MetalRenderer::draw_endSequence() bool hasReadback = LatteTextureReadback_Update(); m_recordedDrawcalls++; // The number of draw calls needs to twice as big, since we are interrupting the render pass - if (m_recordedDrawcalls >= COMMIT_TRESHOLD * 2 || hasReadback) + // TODO: ucomment? 
+ if (m_recordedDrawcalls >= m_commitTreshold * 2/* || hasReadback*/) { CommitCommandBuffer(); @@ -1409,6 +1410,9 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer(); m_commandBuffers.push_back({mtlCommandBuffer}); + m_recordedDrawcalls = 0; + m_commitTreshold = DEFAULT_COMMIT_TRESHOLD; + // Notify memory manager about the new command buffer m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer); @@ -1557,15 +1561,13 @@ void MetalRenderer::EndEncoding() m_encoderType = MetalEncoderType::None; // Commit the command buffer if enough draw calls have been recorded - if (m_recordedDrawcalls >= COMMIT_TRESHOLD) + if (m_recordedDrawcalls >= m_commitTreshold) CommitCommandBuffer(); } } void MetalRenderer::CommitCommandBuffer() { - m_recordedDrawcalls = 0; - if (m_commandBuffers.size() != 0) { EndEncoding(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 0c7c78f58..3217d09d2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -291,6 +291,11 @@ class MetalRenderer : public Renderer return m_commandBuffers[m_commandBuffers.size() - 1].m_commandBuffer; } + void RequestSoonCommit() + { + m_commitTreshold = m_recordedDrawcalls + 8; + } + MTL::CommandEncoder* GetCommandEncoder() { return m_commandEncoder; @@ -475,7 +480,8 @@ class MetalRenderer : public Renderer MetalEncoderType m_encoderType = MetalEncoderType::None; MTL::CommandEncoder* m_commandEncoder = nullptr; - uint32 m_recordedDrawcalls = 0; + uint32 m_recordedDrawcalls; + uint32 m_commitTreshold; // State MetalState m_state; From b5954d8f5bd90a7ec2eac80b60b5506807753784 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 14 Sep 2024 08:23:45 +0200 Subject: [PATCH 171/368] release command buffers properly --- .../Metal/LatteTextureReadbackMtl.cpp | 4 +- 
.../Renderer/Metal/MetalBufferAllocator.h | 61 +++++++++++-------- .../HW/Latte/Renderer/Metal/MetalQuery.cpp | 5 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 +- 4 files changed, 44 insertions(+), 32 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index dd2d8aebd..0bcab09fb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -5,6 +5,8 @@ LatteTextureReadbackInfoMtl::~LatteTextureReadbackInfoMtl() { + if (m_commandBuffer) + m_commandBuffer->release(); } void LatteTextureReadbackInfoMtl::StartTransfer() @@ -24,7 +26,7 @@ void LatteTextureReadbackInfoMtl::StartTransfer() blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage); - m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); + m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain(); m_mtlr->RequestSoonCommit(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 198d9978d..ef65c4583 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -3,6 +3,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Common/precompiled.h" #include "Metal/MTLResource.hpp" +#include struct MetalBufferRange { @@ -162,7 +163,8 @@ typedef MetalBufferAllocator MetalDefaultBufferAllocator; struct MetalSyncedBuffer { - std::vector m_commandBuffers; + uint32 m_commandBufferCount = 0; + MTL::CommandBuffer* m_lastCommandBuffer = nullptr; uint32 m_lock = 0; bool IsLocked() const @@ -191,7 +193,7 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator= 
BUFFER_RELEASE_FRAME_TRESHOLD) @@ -246,34 +248,34 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator{})); + cemu_assert_debug(result.second); + m_activeCommandBufferIt = result.first; + commandBuffer->retain(); } void CheckForCompletedCommandBuffers(/*MTL::CommandBuffer* commandBuffer, bool erase = true*/) { - for (uint32_t i = 0; i < m_buffers.size(); i++) + for (auto it = m_executingCommandBuffers.begin(); it != m_executingCommandBuffers.end();) { - auto& buffer = m_buffers[i]; - for (uint32_t j = 0; j < buffer.m_data.m_commandBuffers.size(); j++) + if (CommandBufferCompleted(it->first)) { - if (CommandBufferCompleted(buffer.m_data.m_commandBuffers[j])) + for (auto bufferIndex : it->second) { - if (buffer.m_data.m_commandBuffers.size() == 1) - { - if (!buffer.m_data.IsLocked()) - { - // All command buffers using it have finished execution, we can use it again - FreeBuffer(i); - } - - buffer.m_data.m_commandBuffers.clear(); - break; - } - else - { - buffer.m_data.m_commandBuffers.erase(buffer.m_data.m_commandBuffers.begin() + j); - j--; - } + auto& buffer = m_buffers[bufferIndex]; + buffer.m_data.m_commandBufferCount--; + + if (!buffer.m_data.IsLocked() && buffer.m_data.m_commandBufferCount == 0) + FreeBuffer(bufferIndex); } + + it->first->release(); + + it = m_executingCommandBuffers.erase(it); + } + else + { + ++it; } } @@ -286,8 +288,12 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorsecond.push_back(bufferIndex); + buffer.m_data.m_commandBufferCount++; + buffer.m_data.m_lastCommandBuffer = m_activeCommandBuffer; + } return buffer.m_buffer; } @@ -348,5 +354,8 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator> m_executingCommandBuffers; + std::map>::iterator m_activeCommandBufferIt; + uint16 m_framesSinceBackBufferAccess = 0; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp index ab24b4db8..91f252e8a 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -22,6 +22,9 @@ LatteQueryObjectMtl::~LatteQueryObjectMtl() { if (m_queryIndex != INVALID_UINT32) m_mtlr->ReleaseOcclusionQueryIndex(m_queryIndex); + + if (m_commandBuffer) + m_commandBuffer->release(); } void LatteQueryObjectMtl::begin() @@ -35,7 +38,7 @@ void LatteQueryObjectMtl::end() m_mtlr->SetActiveOcclusionQueryIndex(INVALID_UINT32); if (m_mtlr->IsCommandBufferActive()) { - m_commandBuffer = m_mtlr->GetCurrentCommandBuffer(); + m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain(); m_mtlr->RequestSoonCommit(); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a063a0b73..f33e527f7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -267,11 +267,8 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) if (swapDRC) SwapBuffer(false); - // Release all the command buffers + // Reset the command buffers (they are released in) CommitCommandBuffer(); - // TODO: release - //for (uint32 i = 0; i < m_commandBuffers.size(); i++) - // m_commandBuffers[i].m_commandBuffer->release(); m_commandBuffers.clear(); // Release frame persistent buffers @@ -1581,6 +1578,7 @@ void MetalRenderer::CommitCommandBuffer() //}); commandBuffer.m_commandBuffer->commit(); + commandBuffer.m_commandBuffer->release(); commandBuffer.m_commited = true; m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(nullptr); From 02254d4e57ab9fa29e4d62f669d97b9cec1100b7 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 14 Sep 2024 16:42:42 +0200 Subject: [PATCH 172/368] set pixel format view usage for textures --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 86c44efee..5546241f7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -4,6 +4,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Common/precompiled.h" #include "Metal/MTLResource.hpp" +#include "Metal/MTLTexture.hpp" LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) @@ -78,8 +79,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM auto pixelFormat = GetMtlPixelFormat(format, isDepth, m_mtlr->GetPixelFormatSupport()); desc->setPixelFormat(pixelFormat); - // TODO: using MTL::TextureUsageShaderWrite as well fixes Mario Tennis: Ultra Smash, investigate why - MTL::TextureUsage usage = MTL::TextureUsageShaderRead; + MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsagePixelFormatView; if (!Latte::IsCompressedFormat(format)) { usage |= MTL::TextureUsageRenderTarget; From 008c11ce71ade8a5f0759d3f38c7817dac287b06 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 14 Sep 2024 17:12:02 +0200 Subject: [PATCH 173/368] allocate special buffers only when needed --- .../Renderer/Metal/MetalBufferAllocator.h | 15 +++++++--- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 23 ++++++--------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 28 +++++++++++++++++-- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index ef65c4583..9998ac894 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -248,10 +248,17 @@ class MetalTemporaryBufferAllocator 
: public MetalBufferAllocator{})); - cemu_assert_debug(result.second); - m_activeCommandBufferIt = result.first; - commandBuffer->retain(); + if (commandBuffer) + { + auto result = m_executingCommandBuffers.emplace(std::make_pair(m_activeCommandBuffer, std::vector{})); + cemu_assert_debug(result.second); + m_activeCommandBufferIt = result.first; + commandBuffer->retain(); + } + else + { + m_activeCommandBufferIt = m_executingCommandBuffers.end(); + } } void CheckForCompletedCommandBuffers(/*MTL::CommandBuffer* commandBuffer, bool erase = true*/) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f33e527f7..8e4d4bc5b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -83,19 +83,6 @@ MetalRenderer::MetalRenderer() m_depthStencilCache = new MetalDepthStencilCache(this); m_samplerCache = new MetalSamplerCache(this); - // Texture readback - m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::ResourceStorageModeShared); -#ifdef CEMU_DEBUG_ASSERT - m_readbackBuffer->setLabel(GetLabel("Texture readback buffer", m_readbackBuffer)); -#endif - - // Transform feedback - // HACK: using just LatteStreamout_GetRingBufferSize will cause page faults - m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize() * 4, MTL::ResourceStorageModePrivate); -#ifdef CEMU_DEBUG_ASSERT - m_xfbRingBuffer->setLabel(GetLabel("Transform feedback buffer", m_xfbRingBuffer)); -#endif - // Occlusion queries m_occlusionQuery.m_resultBuffer = m_device->newBuffer(OCCLUSION_QUERY_POOL_SIZE * sizeof(uint64), MTL::ResourceStorageModeShared); #ifdef CEMU_DEBUG_ASSERT @@ -196,7 +183,13 @@ MetalRenderer::~MetalRenderer() m_nearestSampler->release(); m_linearSampler->release(); - m_readbackBuffer->release(); + if (m_readbackBuffer) + m_readbackBuffer->release(); + + if (m_xfbRingBuffer) + m_xfbRingBuffer->release(); + + 
m_occlusionQuery.m_resultBuffer->release(); m_commandQueue->release(); m_device->release(); @@ -794,7 +787,7 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { - CopyBufferToBuffer(m_xfbRingBuffer, srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex, ALL_MTL_RENDER_STAGES); + CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex, ALL_MTL_RENDER_STAGES); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 3217d09d2..abb7f7e53 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -380,11 +380,33 @@ class MetalRenderer : public Renderer return (m_hasUnifiedMemory ? 
MTL::ResourceStorageModeShared : MTL::ResourceStorageModeManaged); } - MTL::Buffer* GetTextureReadbackBuffer() const + MTL::Buffer* GetTextureReadbackBuffer() { + if (!m_readbackBuffer) + { + m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::ResourceStorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + m_readbackBuffer->setLabel(GetLabel("Texture readback buffer", m_readbackBuffer)); +#endif + } + return m_readbackBuffer; } + MTL::Buffer* GetXfbRingBuffer() + { + if (!m_xfbRingBuffer) + { + // HACK: using just LatteStreamout_GetRingBufferSize will cause page faults + m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize() * 4, MTL::ResourceStorageModePrivate); +#ifdef CEMU_DEBUG_ASSERT + m_xfbRingBuffer->setLabel(GetLabel("Transform feedback buffer", m_xfbRingBuffer)); +#endif + } + + return m_xfbRingBuffer; + } + MTL::Buffer* GetOcclusionQueryResultBuffer() const { return m_occlusionQuery.m_resultBuffer; @@ -460,11 +482,11 @@ class MetalRenderer : public Renderer MTL::Texture* m_nullTexture2D; // Texture readback - MTL::Buffer* m_readbackBuffer; + MTL::Buffer* m_readbackBuffer = nullptr; uint32 m_readbackBufferWriteOffset = 0; // Transform feedback - MTL::Buffer* m_xfbRingBuffer; + MTL::Buffer* m_xfbRingBuffer = nullptr; // Occlusion queries struct From 2961151f25f4a6dc000e3b92b2102b1f3c0664e1 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 14 Sep 2024 17:21:23 +0200 Subject: [PATCH 174/368] correct comment --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 8e4d4bc5b..3b809100e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -260,7 +260,7 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) if (swapDRC) SwapBuffer(false); - // Reset the command buffers (they are released 
in) + // Reset the command buffers (they are released by TemporaryBufferAllocator) CommitCommandBuffer(); m_commandBuffers.clear(); From 358567ad4a275239bc00a1581b3dc94e12fd10b0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 14 Sep 2024 22:06:30 +0200 Subject: [PATCH 175/368] make a workaround for streamout with no fbo --- src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp | 14 ++++++++++++++ .../HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 10 ++++++---- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 9 ++++----- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 5 +++++ 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp index 85adbfb97..a7e87c794 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp @@ -7,6 +7,7 @@ CachedFBOMtl::CachedFBOMtl(class MetalRenderer* metalRenderer, uint64 key) : Lat { m_renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + bool hasAttachment = false; for (int i = 0; i < 8; ++i) { const auto& buffer = colorBuffer[i]; @@ -19,6 +20,8 @@ CachedFBOMtl::CachedFBOMtl(class MetalRenderer* metalRenderer, uint64 key) : Lat colorAttachment->setTexture(textureView->GetRGBAView()); colorAttachment->setLoadAction(MTL::LoadActionLoad); colorAttachment->setStoreAction(MTL::StoreActionStore); + + hasAttachment = true; } // setup depth attachment @@ -38,6 +41,17 @@ CachedFBOMtl::CachedFBOMtl(class MetalRenderer* metalRenderer, uint64 key) : Lat stencilAttachment->setLoadAction(MTL::LoadActionLoad); stencilAttachment->setStoreAction(MTL::StoreActionStore); } + + hasAttachment = true; + } + + // HACK: setup a dummy color attachment to prevent Metal from discarding draws for stremout draws in Super Smash Bros. 
for Wii U (works fine on MoltenVK without this hack though) + if (!hasAttachment) + { + auto colorAttachment = m_renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(metalRenderer->GetNullTexture2D()); + colorAttachment->setLoadAction(MTL::LoadActionDontCare); + colorAttachment->setStoreAction(MTL::StoreActionDontCare); } // Visibility buffer diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index ee855135b..436ef99cc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -10,6 +10,7 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cemu/Logging/CemuLogging.h" +#include "HW/Latte/Core/LatteConst.h" #include "config/ActiveSettings.h" static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) @@ -206,20 +207,21 @@ void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFB if (cullFront && cullBack) rasterizationEnabled = false; - if (!rasterizationEnabled) + auto pixelShaderMtl = static_cast(pixelShader->shader); + + if (!rasterizationEnabled || !pixelShaderMtl) { desc->setRasterizationEnabled(false); return; } - auto pixelShaderMtl = static_cast(pixelShader->shader); - desc->setFragmentFunction(pixelShaderMtl->GetFunction()); + desc->setFragmentFunction(pixelShaderMtl->GetFunction()); // Color attachments const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); - for (uint8 i = 0; i < 8; i++) + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) { const auto& colorBuffer = lastUsedFBO->colorBuffer[i]; auto texture = 
static_cast(colorBuffer.texture); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 3b809100e..9a88c2497 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -863,7 +863,6 @@ void MetalRenderer::draw_beginSequence() return; // no render target } - // TODO: not checking for !streamoutEnable fixes Super Smash Bros. for Wii U, investigate why if (!hasValidFramebufferAttached && !streamoutEnable) { debug_printf("Drawcall with no color buffer or depth buffer attached\n"); @@ -1476,10 +1475,6 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr auto commandBuffer = GetCommandBuffer(); - // Update state - m_state.m_lastUsedFBO = m_state.m_activeFBO; - m_state.m_isFirstDrawInRenderPass = true; - auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO->GetRenderPassDescriptor()); #ifdef CEMU_DEBUG_ASSERT renderCommandEncoder->setLabel(GetLabel("Render command encoder", renderCommandEncoder)); @@ -1487,6 +1482,10 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr m_commandEncoder = renderCommandEncoder; m_encoderType = MetalEncoderType::Render; + // Update state + m_state.m_lastUsedFBO = m_state.m_activeFBO; + m_state.m_isFirstDrawInRenderPass = true; + ResetEncoderState(); // Debug diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index abb7f7e53..5b8406d2f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -380,6 +380,11 @@ class MetalRenderer : public Renderer return (m_hasUnifiedMemory ? 
MTL::ResourceStorageModeShared : MTL::ResourceStorageModeManaged); } + MTL::Texture* GetNullTexture2D() const + { + return m_nullTexture2D; + } + MTL::Buffer* GetTextureReadbackBuffer() { if (!m_readbackBuffer) From 8ac90cef7e9a33de9585e14cad9e1d04b6d9129b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 16 Sep 2024 08:38:08 +0200 Subject: [PATCH 176/368] use correct stages for buffer sync --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 9a88c2497..47660e637 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -787,7 +787,7 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { - CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex, ALL_MTL_RENDER_STAGES); + CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex | MTL::RenderStageMesh, ALL_MTL_RENDER_STAGES); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) @@ -1874,7 +1874,7 @@ void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL:: auto renderCommandEncoder = static_cast(m_commandEncoder); MTL::Resource* barrierBuffers[] = {src}; - renderCommandEncoder->memoryBarrier(barrierBuffers, 1, after, MTL::RenderStageVertex); + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, after, after | MTL::RenderStageVertex); renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState()); m_state.m_encoderState.m_renderPipelineState = m_copyBufferToBufferPipeline->GetRenderPipelineState(); 
@@ -1885,7 +1885,7 @@ void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL:: renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), NS::UInteger(size)); barrierBuffers[0] = dst; - renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, before); + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, before | MTL::RenderStageVertex, before); } else { From 535107e1f4f8132372a6759cdaece34a850ed57e Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 16 Sep 2024 09:50:39 +0200 Subject: [PATCH 177/368] fix: invalid surface copy depth --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 47660e637..d6c366769 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -21,7 +21,6 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" -#include "Metal/MTLRenderCommandEncoder.hpp" #define IMGUI_IMPL_METAL_CPP #include "imgui/imgui_extension.h" @@ -699,7 +698,7 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so //sint32 sourceEffectiveWidth, sourceEffectiveHeight; //sourceTexture->GetEffectiveSize(sourceEffectiveWidth, sourceEffectiveHeight, srcMip); - texture_copyImageSubData(sourceTexture, srcMip, 0, 0, srcSlice, destinationTexture, dstMip, 0, 0, dstSlice, effectiveCopyWidth, effectiveCopyHeight, 0); + texture_copyImageSubData(sourceTexture, srcMip, 0, 0, srcSlice, destinationTexture, dstMip, 0, 0, dstSlice, effectiveCopyWidth, effectiveCopyHeight, 1); /* sint32 texSrcMip = srcMip; From aa81070e874bb8aa86fa4ecd62198330201bcf04 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 16 Sep 2024 19:44:49 +0200 Subject: [PATCH 178/368] refactor pixel 
formats --- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 +- .../Renderer/Metal/LatteTextureViewMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 107 +++++++++++------- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 4 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 + .../HW/Latte/Renderer/Metal/MetalRenderer.h | 5 - 6 files changed, 71 insertions(+), 51 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 5546241f7..142870501 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -76,7 +76,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM desc->setArrayLength(effectiveBaseDepth); } - auto pixelFormat = GetMtlPixelFormat(format, isDepth, m_mtlr->GetPixelFormatSupport()); + auto pixelFormat = GetMtlPixelFormat(format, isDepth); desc->setPixelFormat(pixelFormat); MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsagePixelFormatView; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 950cadc1d..5374126ac 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -180,7 +180,7 @@ MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) swizzle.blue = GetMtlTextureSwizzle(compSelB); swizzle.alpha = GetMtlTextureSwizzle(compSelA); - auto pixelFormat = GetMtlPixelFormat(format, m_baseTexture->IsDepth(), m_mtlr->GetPixelFormatSupport()); + auto pixelFormat = GetMtlPixelFormat(format, m_baseTexture->IsDepth()); MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount), swizzle); return texture; diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index daa283e40..a62ecdc43 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,13 +1,11 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cemu/Logging/CemuLogging.h" -#include "Common/precompiled.h" -#include "Metal/MTLDepthStencil.hpp" +#include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "Metal/MTLPixelFormat.hpp" -#include "Metal/MTLRenderCommandEncoder.hpp" -#include "Metal/MTLRenderPipeline.hpp" -#include "Metal/MTLSampler.hpp" std::map MTL_COLOR_FORMAT_TABLE = { + {Latte::E_GX2SURFFMT::INVALID_FORMAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, + {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? @@ -48,11 +46,11 @@ std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, MetalDataType::UINT, 8}}, {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}}, {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, MetalDataType::FLOAT, 8}}, - {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 0}}, // TODO - {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO + {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct? {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatRGBA8Uint, MetalDataType::UINT, 4}}, // TODO: correct? 
{Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, // TODO + {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatRGBA16Uint, MetalDataType::UINT, 8}}, // TODO: correct? {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, MetalDataType::FLOAT, 4}}, {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, MetalDataType::UINT, 4}}, {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, MetalDataType::INT, 4}}, @@ -76,6 +74,8 @@ std::map MTL_COLOR_FORMAT_TABLE = { }; std::map MTL_DEPTH_FORMAT_TABLE = { + {Latte::E_GX2SURFFMT::INVALID_FORMAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, + {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}}, {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}}, {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 5, {1, 1}, true}}, @@ -83,52 +83,73 @@ std::map MTL_DEPTH_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, MetalDataType::NONE, 4, {1, 1}}}, }; -const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth) +void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support) { - if (format == Latte::E_GX2SURFFMT::INVALID_FORMAT) + // Color formats + for (auto& [fmt, formatInfo] : MTL_COLOR_FORMAT_TABLE) + { + switch (formatInfo.pixelFormat) + { + case MTL::PixelFormatR8Unorm_sRGB: + if (!support.m_supportsR8Unorm_sRGB) + formatInfo.pixelFormat = MTL::PixelFormatRGBA8Unorm_sRGB; + break; + case MTL::PixelFormatRG8Unorm_sRGB: + if (!support.m_supportsRG8Unorm_sRGB) + formatInfo.pixelFormat = MTL::PixelFormatRGBA8Unorm_sRGB; + break; + case MTL::PixelFormatB5G6R5Unorm: + case MTL::PixelFormatA1BGR5Unorm: + case 
MTL::PixelFormatABGR4Unorm: + case MTL::PixelFormatBGR5A1Unorm: + if (!support.m_supportsPacked16BitFormats) + formatInfo.pixelFormat = MTL::PixelFormatRGBA8Unorm; + break; + default: + break; + } + } + + // Depth formats + for (auto& [fmt, formatInfo] : MTL_DEPTH_FORMAT_TABLE) { - return {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}; + switch (formatInfo.pixelFormat) + { + case MTL::PixelFormatDepth24Unorm_Stencil8: + if (!support.m_supportsDepth24Unorm_Stencil8) + formatInfo.pixelFormat = MTL::PixelFormatDepth32Float_Stencil8; + break; + default: + break; + } } +} - MetalPixelFormatInfo formatInfo; +const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth) +{ if (isDepth) - formatInfo = MTL_DEPTH_FORMAT_TABLE[format]; + { + auto it = MTL_DEPTH_FORMAT_TABLE.find(format); + if (it == MTL_DEPTH_FORMAT_TABLE.end()) + return {MTL::PixelFormatDepth16Unorm, MetalDataType::NONE, 2}; // Fallback + else + return it->second; + } else - formatInfo = MTL_COLOR_FORMAT_TABLE[format]; - - return formatInfo; + { + auto it = MTL_COLOR_FORMAT_TABLE.find(format); + if (it == MTL_COLOR_FORMAT_TABLE.end()) + return {MTL::PixelFormatR8Unorm, MetalDataType::FLOAT, 1}; // Fallback + else + return it->second; + } } -MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth, const MetalPixelFormatSupport& pixelFormatSupport) +MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth) { auto pixelFormat = GetMtlPixelFormatInfo(format, isDepth).pixelFormat; if (pixelFormat == MTL::PixelFormatInvalid) - cemuLog_logDebug(LogType::Force, "invalid pixel format {}\n", pixelFormat); - - switch (pixelFormat) - { - case MTL::PixelFormatR8Unorm_sRGB: - if (!pixelFormatSupport.m_supportsR8Unorm_sRGB) - return MTL::PixelFormatRGBA8Unorm_sRGB; - break; - case MTL::PixelFormatRG8Unorm_sRGB: - if (!pixelFormatSupport.m_supportsRG8Unorm_sRGB) - return MTL::PixelFormatRGBA8Unorm_sRGB; - break; - case MTL::PixelFormatB5G6R5Unorm: - case 
MTL::PixelFormatA1BGR5Unorm: - case MTL::PixelFormatABGR4Unorm: - case MTL::PixelFormatBGR5A1Unorm: - if (!pixelFormatSupport.m_supportsPacked16BitFormats) - return MTL::PixelFormatRGBA8Unorm; - break; - case MTL::PixelFormatDepth24Unorm_Stencil8: - if (!pixelFormatSupport.m_supportsDepth24Unorm_Stencil8) - return MTL::PixelFormatDepth32Float_Stencil8; - break; - default: - break; - } + cemuLog_log(LogType::Force, "invalid pixel format 0x{:x}, is depth: {}\n", format, isDepth); return pixelFormat; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index c1b1c75c3..726d13eb7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -29,9 +29,11 @@ struct MetalPixelFormatInfo { bool hasStencil = false; }; +void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support); + const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth); -MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth, const MetalPixelFormatSupport& pixelFormatSupport); +MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth); inline MetalDataType GetColorBufferDataType(const uint32 index, const LatteContextRegister& lcr) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index d6c366769..3191b90bf 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -45,6 +45,8 @@ MetalRenderer::MetalRenderer() m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize(); m_pixelFormatSupport = MetalPixelFormatSupport(m_device); + CheckForPixelFormatSupport(m_pixelFormatSupport); + // Resources MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); #ifdef CEMU_DEBUG_ASSERT diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 5b8406d2f..8eb0e319d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -365,11 +365,6 @@ class MetalRenderer : public Renderer return m_supportsMetal3; } - const MetalPixelFormatSupport& GetPixelFormatSupport() const - { - return m_pixelFormatSupport; - } - //MTL::StorageMode GetOptimalTextureStorageMode() const //{ // return (m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModePrivate); From f03c0a2769c3e0e939631075055cd14dfc267945 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 19 Sep 2024 10:05:48 +0200 Subject: [PATCH 179/368] only set buffer offset if needed --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 3191b90bf..784049493 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1318,7 +1318,27 @@ void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, M if (buffer == boundBuffer.m_buffer && offset == boundBuffer.m_offset) return; - // TODO: only set the offset if only offset changed + if (buffer == boundBuffer.m_buffer) + { + // Just update the offset + boundBuffer.m_offset = offset; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexBufferOffset(offset, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectBufferOffset(offset, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshBufferOffset(offset, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentBufferOffset(offset, index); + break; + } + } boundBuffer = {buffer, offset}; @@ -1874,7 +1894,7 @@ void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, 
uint32 srcOffset, MTL:: { auto renderCommandEncoder = static_cast(m_commandEncoder); - MTL::Resource* barrierBuffers[] = {src}; + MTL::Resource* barrierBuffers[] = {src}; renderCommandEncoder->memoryBarrier(barrierBuffers, 1, after, after | MTL::RenderStageVertex); renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState()); From a460a5d28abeecfe2c1364b5f63720f0e51f5319 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 21 Sep 2024 09:28:46 +0200 Subject: [PATCH 180/368] do vertex restride on the CPU --- .../Metal/LatteTextureReadbackMtl.cpp | 13 ++++----- .../Renderer/Metal/MetalMemoryManager.cpp | 24 ++++++++-------- .../Latte/Renderer/Metal/MetalMemoryManager.h | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 28 +++++++++++-------- 4 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index 0bcab09fb..ca4e31a7f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -27,19 +27,16 @@ void LatteTextureReadbackInfoMtl::StartTransfer() blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage); m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain(); - m_mtlr->RequestSoonCommit(); + // TODO: uncomment? + //m_mtlr->RequestSoonCommit(); + m_mtlr->CommitCommandBuffer(); } bool LatteTextureReadbackInfoMtl::IsFinished() { - // TODO: is this needed? - if (!m_commandBuffer) - return false; - - // TODO: remove this? 
// Command buffer wasn't even comitted, let's commit immediately - if (m_mtlr->GetCurrentCommandBuffer() == m_commandBuffer) - m_mtlr->CommitCommandBuffer(); + //if (m_mtlr->GetCurrentCommandBuffer() == m_commandBuffer) + // m_mtlr->CommitCommandBuffer(); return CommandBufferCompleted(m_commandBuffer); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 6173532c2..e406abf6d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -28,15 +28,14 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu restrideInfo.allocation = m_bufferAllocator.GetBufferAllocation(newSize); buffer = m_bufferAllocator.GetBuffer(restrideInfo.allocation.bufferIndex); - //uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; - //uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.bufferOffset; + // HACK: the restriding is done on the CPU, since doing it on the GPU was causing over-synchronization + uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; + uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.offset; - //for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) - //{ - // memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); - //} - //debug_printf("Restrided vertex buffer (old stride: %zu, new stride: %zu, old size: %zu, new size: %zu)\n", stride, newStride, vertexBufferRange.size, newSize); + for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) + memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); + /* if (m_mtlr->GetEncoderType() == MetalEncoderType::Render) { auto renderCommandEncoder = static_cast(m_mtlr->GetCommandEncoder()); @@ -60,18 +59,19 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu // TODO: do the barriers in 
one call? MTL::Resource* barrierBuffers[] = {buffer}; renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex); - - // Debug - m_mtlr->GetPerformanceMonitor().m_vertexBufferRestrides++; } else { - debug_printf("vertex buffer restride needs an active render encoder\n"); + debug_printf("vertex buffer restride needs an active render command encoder\n"); cemu_assert_suspicious(); } + */ restrideInfo.memoryInvalidated = false; restrideInfo.lastStride = newStride; + + // Debug + m_mtlr->GetPerformanceMonitor().m_vertexBufferRestrides++; } else { @@ -121,7 +121,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModeShared); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 62254b213..87327c4c8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -58,7 +58,7 @@ class MetalVertexBufferCache class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModeShared), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, 
m_framePersistentBufferAllocator) {} ~MetalMemoryManager(); // Pipelines diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 784049493..363d5e6d8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -26,7 +26,7 @@ #include "imgui/imgui_extension.h" #include "imgui/imgui_impl_metal.h" -#define DEFAULT_COMMIT_TRESHOLD 256 +#define DEFAULT_COMMIT_TRESHOLD 196 #define OCCLUSION_QUERY_POOL_SIZE 1024 extern bool hasValidFramebufferAttached; @@ -917,15 +917,18 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Check if we need to end the render pass - // Fragment shader is most likely to require a render pass flush, so check for it first - bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); - if (!endRenderPass) - endRenderPass = CheckIfRenderPassNeedsFlush(vertexShader); - if (!endRenderPass && geometryShader) - endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader); - - if (endRenderPass) - EndEncoding(); + if (!m_state.m_isFirstDrawInRenderPass) + { + // Fragment shader is most likely to require a render pass flush, so check for it first + bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); + if (!endRenderPass) + endRenderPass = CheckIfRenderPassNeedsFlush(vertexShader); + if (!endRenderPass && geometryShader) + endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader); + + if (endRenderPass) + EndEncoding(); + } // Primitive type const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); @@ -1889,7 +1892,9 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before) 
{ + // TODO: uncomment and fix performance issues // Do the copy in a vertex shader on Apple GPUs + /* if (m_isAppleGPU && m_encoderType == MetalEncoderType::Render) { auto renderCommandEncoder = static_cast(m_commandEncoder); @@ -1910,10 +1915,11 @@ void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL:: } else { + */ auto blitCommandEncoder = GetBlitCommandEncoder(); blitCommandEncoder->copyFromBuffer(src, srcOffset, dst, dstOffset, size); - } + //} } void MetalRenderer::SwapBuffer(bool mainWindow) From 7ad57f5cc8d3146f31e0d55d31352499dcea423a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 21 Sep 2024 14:02:30 +0200 Subject: [PATCH 181/368] add the option to disable accurate barriers --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 363d5e6d8..96d43adf1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -11,7 +11,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" - #include "Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h" #include "Cafe/HW/Latte/Core/LatteShader.h" @@ -21,6 +20,7 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" +#include "config/CemuConfig.h" #define IMGUI_IMPL_METAL_CPP #include "imgui/imgui_extension.h" @@ -917,7 +917,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Check if we need to end the render pass - if (!m_state.m_isFirstDrawInRenderPass) + if (!m_state.m_isFirstDrawInRenderPass && GetConfig().vk_accurate_barriers) { // Fragment 
shader is most likely to require a render pass flush, so check for it first bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); From 4b548f948216bf01afdcd7dc24e77fccecc2d6eb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 21 Sep 2024 14:58:31 +0200 Subject: [PATCH 182/368] never disable accurate barriers for certain shaders --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 96d43adf1..4d10b2b83 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -916,8 +916,20 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } const auto fetchShader = LatteSHRC_GetActiveFetchShader(); + bool neverSkipAccurateBarrier = false; + + // "Accurate barriers" is usually enabled globally but since the CPU cost is substantial we allow users to disable it (debug -> 'Accurate barriers' option) + // We always force accurate barriers for known problematic shaders + if (pixelShader) + { + if (pixelShader->baseHash == 0x6f6f6e7b9aae57af && pixelShader->auxHash == 0x00078787f9249249) // BotW lava + neverSkipAccurateBarrier = true; + if (pixelShader->baseHash == 0x4c0bd596e3aef4a6 && pixelShader->auxHash == 0x003c3c3fc9269249) // BotW foam layer for water on the bottom of waterfalls + neverSkipAccurateBarrier = true; + } + // Check if we need to end the render pass - if (!m_state.m_isFirstDrawInRenderPass && GetConfig().vk_accurate_barriers) + if (!m_state.m_isFirstDrawInRenderPass && (GetConfig().vk_accurate_barriers || neverSkipAccurateBarrier)) { // Fragment shader is most likely to require a render pass flush, so check for it first bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); @@ -1323,7 +1335,7 @@ void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* 
renderCommandEncoder, M if (buffer == boundBuffer.m_buffer) { - // Just update the offset + // Update just the offset boundBuffer.m_offset = offset; switch (shaderType) @@ -1341,6 +1353,8 @@ void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, M renderCommandEncoder->setFragmentBufferOffset(offset, index); break; } + + return; } boundBuffer = {buffer, offset}; From 8b68df0c591f2fe151d34d48a6b4da0da7d0bd02 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 22 Sep 2024 08:58:24 +0200 Subject: [PATCH 183/368] use correct texture decoders --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index a62ecdc43..69ec3fb6e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -6,10 +6,10 @@ std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::INVALID_FORMAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, - {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? - {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, MetalDataType::FLOAT, 2}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, MetalDataType::FLOAT, 2}}, {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, MetalDataType::FLOAT, 2}}, {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, MetalDataType::FLOAT, 1}}, {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, MetalDataType::FLOAT, 1}}, @@ -25,12 +25,12 @@ std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, MetalDataType::INT, 4}}, {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, MetalDataType::FLOAT, 4}}, {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, MetalDataType::FLOAT, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, MetalDataType::FLOAT, 8}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, MetalDataType::FLOAT, 8}}, {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}}, - {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}}, // TODO: correct? + {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}}, {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGB10A2Unorm, MetalDataType::FLOAT, 4}}, // TODO: sRGB? - {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, MetalDataType::FLOAT, 4}}, // TODO: correct? - {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}}, // TODO: correct? 
+ {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, MetalDataType::FLOAT, 4}}, + {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}}, {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, MetalDataType::FLOAT, 2}}, {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, MetalDataType::FLOAT, 2}}, {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, MetalDataType::UINT, 2}}, @@ -172,6 +172,9 @@ size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint return CeilDivide(height, formatInfo.blockTexelSize.y) * bytesPerRow; } +// TODO: change the decoders when a format is not supported +// TODO: R10_G10_B10_A2_UINT and R10_G10_B10_A2_SINT +// TODO: A2_B10_G10_R10_UNORM and A2_B10_G10_R10_UINT TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth) { if (isDepth) @@ -232,7 +235,7 @@ TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth) case Latte::E_GX2SURFFMT::R8_G8_SNORM: return TextureDecoder_R8_G8::getInstance(); case Latte::E_GX2SURFFMT::R4_G4_UNORM: - return TextureDecoder_R4_G4::getInstance(); + return TextureDecoder_R4_G4::getInstance(); // TODO: unpack to 8 bits case Latte::E_GX2SURFFMT::R32_FLOAT: return TextureDecoder_R32_FLOAT::getInstance(); case Latte::E_GX2SURFFMT::R32_UINT: @@ -256,11 +259,11 @@ TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth) case Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM: return TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance(); case Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM: - return TextureDecoder_A1_B5_G5_R5_UNORM_vulkan::getInstance(); + return TextureDecoder_A1_B5_G5_R5_UNORM::getInstance(); case Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT: return TextureDecoder_R11_G11_B10_FLOAT::getInstance(); case Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM: - return TextureDecoder_R4_G4_B4_A4_UNORM::getInstance(); + return 
TextureDecoder_R4_G4_B4_A4_UNORM::getInstance(); // TODO: ABGR4 case Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM: return TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); case Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM: @@ -288,9 +291,9 @@ TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth) case Latte::E_GX2SURFFMT::BC5_SNORM: return TextureDecoder_BC5::getInstance(); case Latte::E_GX2SURFFMT::R24_X8_UNORM: - return TextureDecoder_R24_X8::getInstance(); + return TextureDecoder_R24_X8::getInstance(); // TODO: correct? case Latte::E_GX2SURFFMT::X24_G8_UINT: - return TextureDecoder_X24_G8_UINT::getInstance(); // todo - verify + return TextureDecoder_X24_G8_UINT::getInstance(); // todo: correct? default: debug_printf("invalid color texture format %u\n", (uint32)format); cemu_assert_debug(false); From 3cf831d46a1c1328f1c2f7e94803db84a4791d2a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 23 Sep 2024 16:47:22 +0200 Subject: [PATCH 184/368] do vertex restride on the GPU & don't over-sync --- .../HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 12 ++++-------- .../HW/Latte/Renderer/Metal/MetalMemoryManager.h | 6 +++--- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 8 +++++++- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index e406abf6d..c2f08532e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -9,7 +9,7 @@ MetalVertexBufferCache::~MetalVertexBufferCache() { } -MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride) +MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride, std::vector& barrierBuffers) { auto vertexBufferRange = m_bufferRanges[bufferIndex]; auto& 
restrideInfo = *vertexBufferRange.restrideInfo; @@ -28,14 +28,14 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu restrideInfo.allocation = m_bufferAllocator.GetBufferAllocation(newSize); buffer = m_bufferAllocator.GetBuffer(restrideInfo.allocation.bufferIndex); - // HACK: the restriding is done on the CPU, since doing it on the GPU was causing over-synchronization + /* uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.offset; for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); + */ - /* if (m_mtlr->GetEncoderType() == MetalEncoderType::Render) { auto renderCommandEncoder = static_cast(m_mtlr->GetCommandEncoder()); @@ -56,16 +56,12 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangleStrip, NS::UInteger(0), vertexBufferRange.size / stride); - // TODO: do the barriers in one call? 
- MTL::Resource* barrierBuffers[] = {buffer}; - renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex); + vectorAppendUnique(barrierBuffers, static_cast(buffer)); } else { - debug_printf("vertex buffer restride needs an active render command encoder\n"); cemu_assert_suspicious(); } - */ restrideInfo.memoryInvalidated = false; restrideInfo.lastStride = newStride; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 87327c4c8..ff74a8ee6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -42,7 +42,7 @@ class MetalVertexBufferCache range.offset = INVALID_OFFSET; } - MetalRestridedBufferRange RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride); + MetalRestridedBufferRange RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride, std::vector& barrierBuffers); private: class MetalRenderer* m_mtlr; @@ -105,9 +105,9 @@ class MetalMemoryManager m_vertexBufferCache.UntrackVertexBuffer(bufferIndex); } - MetalRestridedBufferRange RestrideBufferIfNeeded(uint32 bufferIndex, size_t stride) + MetalRestridedBufferRange RestrideBufferIfNeeded(uint32 bufferIndex, size_t stride, std::vector& barrierBuffers) { - return m_vertexBufferCache.RestrideBufferIfNeeded(m_bufferCache, bufferIndex, stride); + return m_vertexBufferCache.RestrideBufferIfNeeded(m_bufferCache, bufferIndex, stride, barrierBuffers); } private: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 4d10b2b83..589fb20af 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1141,6 +1141,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Resources // Vertex buffers + std::vector barrierBuffers; 
for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) { auto& vertexBufferRange = m_state.m_vertexBuffers[i]; @@ -1161,7 +1162,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + i * 7; uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - auto restridedBuffer = m_memoryManager->RestrideBufferIfNeeded(i, bufferStride); + auto restridedBuffer = m_memoryManager->RestrideBufferIfNeeded(i, bufferStride, barrierBuffers); buffer = restridedBuffer.buffer; offset = restridedBuffer.offset; @@ -1172,6 +1173,11 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } } + if (!barrierBuffers.empty()) + { + renderCommandEncoder->memoryBarrier(barrierBuffers.data(), barrierBuffers.size(), MTL::RenderStageVertex, MTL::RenderStageVertex); + } + // Render pipeline state MTL::RenderPipelineState* renderPipelineState; if (usesGeometryShader) From 3de2b0325beb4ccab270c3ee5379d55d8ea6f68f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 28 Sep 2024 11:51:50 +0200 Subject: [PATCH 185/368] only enable metal on apple platforms by default --- CMakeLists.txt | 11 +++++- src/Cafe/CMakeLists.txt | 86 ++++++++++++++++++++--------------------- 2 files changed, 51 insertions(+), 46 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b028d843..898f00868 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,11 +103,20 @@ if (UNIX AND NOT APPLE) option(ENABLE_FERAL_GAMEMODE "Enables Feral Interactive GameMode Support" ON) endif() +if (APPLE) + set(ENABLE_METAL_DEFAULT ON) +else() + set(ENABLE_METAL_DEFAULT OFF) +endif() + option(ENABLE_OPENGL "Enables the OpenGL backend" ON) option(ENABLE_VULKAN "Enables the Vulkan backend" ON) -option(ENABLE_METAL "Enables the Metal backend" ON) +option(ENABLE_METAL "Enables the Metal backend" $ENABLE_METAL_DEFAULT) option(ENABLE_DISCORD_RPC "Enables the Discord Rich Presence 
feature" ON) +if (ENABLE_METAL AND NOT APPLE) + message(FATAL_ERROR "Metal backend is only supported on Apple platforms") +endif() # input backends if (WIN32) diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 10c852703..8dd401b7a 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -534,52 +534,48 @@ if(APPLE) endif() if(ENABLE_METAL) - if(APPLE) - target_sources(CemuCafe PRIVATE - HW/Latte/Renderer/Metal/MetalRenderer.cpp - HW/Latte/Renderer/Metal/MetalRenderer.h - HW/Latte/Renderer/Metal/MetalCommon.h - HW/Latte/Renderer/Metal/MetalCppImpl.cpp - HW/Latte/Renderer/Metal/MetalLayer.mm - HW/Latte/Renderer/Metal/MetalLayer.h - HW/Latte/Renderer/Metal/MetalLayerHandle.cpp - HW/Latte/Renderer/Metal/MetalLayerHandle.h - HW/Latte/Renderer/Metal/LatteToMtl.cpp - HW/Latte/Renderer/Metal/LatteToMtl.h - HW/Latte/Renderer/Metal/LatteTextureMtl.cpp - HW/Latte/Renderer/Metal/LatteTextureMtl.h - HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp - HW/Latte/Renderer/Metal/LatteTextureViewMtl.h - HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp - HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h - HW/Latte/Renderer/Metal/RendererShaderMtl.cpp - HW/Latte/Renderer/Metal/RendererShaderMtl.h - HW/Latte/Renderer/Metal/CachedFBOMtl.cpp - HW/Latte/Renderer/Metal/CachedFBOMtl.h - HW/Latte/Renderer/Metal/MetalBufferAllocator.h - HW/Latte/Renderer/Metal/MetalMemoryManager.cpp - HW/Latte/Renderer/Metal/MetalMemoryManager.h - HW/Latte/Renderer/Metal/MetalPipelineCache.cpp - HW/Latte/Renderer/Metal/MetalPipelineCache.h - HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp - HW/Latte/Renderer/Metal/MetalDepthStencilCache.h - HW/Latte/Renderer/Metal/MetalSamplerCache.cpp - HW/Latte/Renderer/Metal/MetalSamplerCache.h - HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp - HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h - HW/Latte/Renderer/Metal/MetalQuery.cpp - HW/Latte/Renderer/Metal/MetalQuery.h - HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h - 
HW/Latte/Renderer/Metal/UtilityShaderSource.h - ) + target_sources(CemuCafe PRIVATE + HW/Latte/Renderer/Metal/MetalRenderer.cpp + HW/Latte/Renderer/Metal/MetalRenderer.h + HW/Latte/Renderer/Metal/MetalCommon.h + HW/Latte/Renderer/Metal/MetalCppImpl.cpp + HW/Latte/Renderer/Metal/MetalLayer.mm + HW/Latte/Renderer/Metal/MetalLayer.h + HW/Latte/Renderer/Metal/MetalLayerHandle.cpp + HW/Latte/Renderer/Metal/MetalLayerHandle.h + HW/Latte/Renderer/Metal/LatteToMtl.cpp + HW/Latte/Renderer/Metal/LatteToMtl.h + HW/Latte/Renderer/Metal/LatteTextureMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureMtl.h + HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureViewMtl.h + HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h + HW/Latte/Renderer/Metal/RendererShaderMtl.cpp + HW/Latte/Renderer/Metal/RendererShaderMtl.h + HW/Latte/Renderer/Metal/CachedFBOMtl.cpp + HW/Latte/Renderer/Metal/CachedFBOMtl.h + HW/Latte/Renderer/Metal/MetalBufferAllocator.h + HW/Latte/Renderer/Metal/MetalMemoryManager.cpp + HW/Latte/Renderer/Metal/MetalMemoryManager.h + HW/Latte/Renderer/Metal/MetalPipelineCache.cpp + HW/Latte/Renderer/Metal/MetalPipelineCache.h + HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp + HW/Latte/Renderer/Metal/MetalDepthStencilCache.h + HW/Latte/Renderer/Metal/MetalSamplerCache.cpp + HW/Latte/Renderer/Metal/MetalSamplerCache.h + HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp + HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h + HW/Latte/Renderer/Metal/MetalQuery.cpp + HW/Latte/Renderer/Metal/MetalQuery.h + HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h + HW/Latte/Renderer/Metal/UtilityShaderSource.h + ) - #target_link_libraries(CemuCafe PRIVATE - # "-framework Metal" - # "-framework QuartzCore" - #) - else() - message(FATAL_ERROR "Metal is only supported on macOS") - endif() + #target_link_libraries(CemuCafe PRIVATE + # "-framework Metal" + # "-framework QuartzCore" + #) endif() 
set_property(TARGET CemuCafe PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") From ed5f72a1ca60d11805542d517dc6988a3c43c257 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 28 Sep 2024 12:06:49 +0200 Subject: [PATCH 186/368] include texture decoder in pixel format info --- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 193 ++++++------------ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 4 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 3 files changed, 65 insertions(+), 134 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 69ec3fb6e..b765f0e10 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -6,7 +6,7 @@ std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::INVALID_FORMAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, - {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatRG8Unorm, MetalDataType::FLOAT, 2}}, + {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatABGR4Unorm, MetalDataType::FLOAT, 2}}, {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, MetalDataType::FLOAT, 2}}, {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, MetalDataType::FLOAT, 2}}, {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, MetalDataType::FLOAT, 2}}, @@ -83,8 +83,69 @@ std::map MTL_DEPTH_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, MetalDataType::NONE, 4, {1, 1}}}, }; +// TODO: change the decoders when a format is not supported +// TODO: R10_G10_B10_A2_UINT and R10_G10_B10_A2_SINT +// TODO: A2_B10_G10_R10_UNORM and A2_B10_G10_R10_UINT void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support) { + // Texture decoders + + // Color + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT].textureDecoder = TextureDecoder_R32_G32_B32_A32_FLOAT::getInstance(); + 
MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT].textureDecoder = TextureDecoder_R32_G32_B32_A32_UINT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT].textureDecoder = TextureDecoder_R16_G16_B16_A16_FLOAT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT].textureDecoder = TextureDecoder_R16_G16_B16_A16_UINT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM].textureDecoder = TextureDecoder_R16_G16_B16_A16::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM].textureDecoder = TextureDecoder_R16_G16_B16_A16::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_FLOAT].textureDecoder = TextureDecoder_R32_G32_FLOAT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_UINT].textureDecoder = TextureDecoder_R32_G32_UINT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_UNORM].textureDecoder = TextureDecoder_R16_G16::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_FLOAT].textureDecoder = TextureDecoder_R16_G16_FLOAT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_UNORM].textureDecoder = TextureDecoder_R8_G8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_SNORM].textureDecoder = 
TextureDecoder_R8_G8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].textureDecoder = TextureDecoder_R4_G4_UNORM_To_RGBA4::getInstance(); // TODO: to ABGR4 + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_FLOAT].textureDecoder = TextureDecoder_R32_FLOAT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_UINT].textureDecoder = TextureDecoder_R32_UINT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_FLOAT].textureDecoder = TextureDecoder_R16_FLOAT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_UNORM].textureDecoder = TextureDecoder_R16_UNORM::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_SNORM].textureDecoder = TextureDecoder_R16_SNORM::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_UINT].textureDecoder = TextureDecoder_R16_UINT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_UNORM].textureDecoder = TextureDecoder_R8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_SNORM].textureDecoder = TextureDecoder_R8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_UINT].textureDecoder = TextureDecoder_R8_UINT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].textureDecoder = TextureDecoder_R5_G6_B5_swappedRB::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].textureDecoder = TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].textureDecoder = TextureDecoder_A1_B5_G5_R5_UNORM::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT].textureDecoder = TextureDecoder_R11_G11_B10_FLOAT::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4_G4_B4_A4_UNORM::getInstance(); // TODO: ABGR4 + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM].textureDecoder = 
TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM].textureDecoder = TextureDecoder_R10_G10_B10_A2_SNORM_To_RGBA16::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB].textureDecoder = TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC1_SRGB].textureDecoder = TextureDecoder_BC1::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC1_UNORM].textureDecoder = TextureDecoder_BC1::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC2_UNORM].textureDecoder = TextureDecoder_BC2::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC2_SRGB].textureDecoder = TextureDecoder_BC2::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC3_UNORM].textureDecoder = TextureDecoder_BC3::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC3_SRGB].textureDecoder = TextureDecoder_BC3::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC4_UNORM].textureDecoder = TextureDecoder_BC4::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC4_SNORM].textureDecoder = TextureDecoder_BC4::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC5_UNORM].textureDecoder = TextureDecoder_BC5::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC5_SNORM].textureDecoder = TextureDecoder_BC5::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R24_X8_UNORM].textureDecoder = TextureDecoder_R24_X8::getInstance(); // TODO: correct? + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::X24_G8_UINT].textureDecoder = TextureDecoder_X24_G8_UINT::getInstance(); // todo: correct? 
+ + // Depth + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].textureDecoder = TextureDecoder_D24_S8::getInstance(); + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_FLOAT].textureDecoder = TextureDecoder_NullData64::getInstance(); + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_FLOAT].textureDecoder = TextureDecoder_R32_FLOAT::getInstance(); + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D16_UNORM].textureDecoder = TextureDecoder_R16_UNORM::getInstance(); + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_S8_FLOAT].textureDecoder = TextureDecoder_D32_S8_UINT_X24::getInstance(); + // Color formats for (auto& [fmt, formatInfo] : MTL_COLOR_FORMAT_TABLE) { @@ -172,136 +233,6 @@ size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint return CeilDivide(height, formatInfo.blockTexelSize.y) * bytesPerRow; } -// TODO: change the decoders when a format is not supported -// TODO: R10_G10_B10_A2_UINT and R10_G10_B10_A2_SINT -// TODO: A2_B10_G10_R10_UNORM and A2_B10_G10_R10_UINT -TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth) -{ - if (isDepth) - { - switch (format) - { - case Latte::E_GX2SURFFMT::D24_S8_UNORM: - return TextureDecoder_D24_S8::getInstance(); - case Latte::E_GX2SURFFMT::D24_S8_FLOAT: - return TextureDecoder_NullData64::getInstance(); - case Latte::E_GX2SURFFMT::D32_FLOAT: - return TextureDecoder_R32_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::D16_UNORM: - return TextureDecoder_R16_UNORM::getInstance(); - case Latte::E_GX2SURFFMT::D32_S8_FLOAT: - return TextureDecoder_D32_S8_UINT_X24::getInstance(); - default: - debug_printf("invalid depth texture format %u\n", (uint32)format); - cemu_assert_debug(false); - return nullptr; - } - } else - { - switch (format) - { - case Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT: - return TextureDecoder_R32_G32_B32_A32_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT: - return TextureDecoder_R32_G32_B32_A32_UINT::getInstance(); - case 
Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT: - return TextureDecoder_R16_G16_B16_A16_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT: - return TextureDecoder_R16_G16_B16_A16_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM: - return TextureDecoder_R16_G16_B16_A16::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM: - return TextureDecoder_R16_G16_B16_A16::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM: - return TextureDecoder_R8_G8_B8_A8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM: - return TextureDecoder_R8_G8_B8_A8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB: - return TextureDecoder_R8_G8_B8_A8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT: - return TextureDecoder_R8_G8_B8_A8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT: - return TextureDecoder_R8_G8_B8_A8::getInstance(); - case Latte::E_GX2SURFFMT::R32_G32_FLOAT: - return TextureDecoder_R32_G32_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R32_G32_UINT: - return TextureDecoder_R32_G32_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_UNORM: - return TextureDecoder_R16_G16::getInstance(); - case Latte::E_GX2SURFFMT::R16_G16_FLOAT: - return TextureDecoder_R16_G16_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_UNORM: - return TextureDecoder_R8_G8::getInstance(); - case Latte::E_GX2SURFFMT::R8_G8_SNORM: - return TextureDecoder_R8_G8::getInstance(); - case Latte::E_GX2SURFFMT::R4_G4_UNORM: - return TextureDecoder_R4_G4::getInstance(); // TODO: unpack to 8 bits - case Latte::E_GX2SURFFMT::R32_FLOAT: - return TextureDecoder_R32_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R32_UINT: - return TextureDecoder_R32_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R16_FLOAT: - return TextureDecoder_R16_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R16_UNORM: - return TextureDecoder_R16_UNORM::getInstance(); - case Latte::E_GX2SURFFMT::R16_SNORM: - return 
TextureDecoder_R16_SNORM::getInstance(); - case Latte::E_GX2SURFFMT::R16_UINT: - return TextureDecoder_R16_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R8_UNORM: - return TextureDecoder_R8::getInstance(); - case Latte::E_GX2SURFFMT::R8_SNORM: - return TextureDecoder_R8::getInstance(); - case Latte::E_GX2SURFFMT::R8_UINT: - return TextureDecoder_R8_UINT::getInstance(); - case Latte::E_GX2SURFFMT::R5_G6_B5_UNORM: - return TextureDecoder_R5_G6_B5_swappedRB::getInstance(); - case Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM: - return TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance(); - case Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM: - return TextureDecoder_A1_B5_G5_R5_UNORM::getInstance(); - case Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT: - return TextureDecoder_R11_G11_B10_FLOAT::getInstance(); - case Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM: - return TextureDecoder_R4_G4_B4_A4_UNORM::getInstance(); // TODO: ABGR4 - case Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM: - return TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); - case Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM: - return TextureDecoder_R10_G10_B10_A2_SNORM_To_RGBA16::getInstance(); - case Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB: - return TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); - case Latte::E_GX2SURFFMT::BC1_SRGB: - return TextureDecoder_BC1::getInstance(); - case Latte::E_GX2SURFFMT::BC1_UNORM: - return TextureDecoder_BC1::getInstance(); - case Latte::E_GX2SURFFMT::BC2_UNORM: - return TextureDecoder_BC2::getInstance(); - case Latte::E_GX2SURFFMT::BC2_SRGB: - return TextureDecoder_BC2::getInstance(); - case Latte::E_GX2SURFFMT::BC3_UNORM: - return TextureDecoder_BC3::getInstance(); - case Latte::E_GX2SURFFMT::BC3_SRGB: - return TextureDecoder_BC3::getInstance(); - case Latte::E_GX2SURFFMT::BC4_UNORM: - return TextureDecoder_BC4::getInstance(); - case Latte::E_GX2SURFFMT::BC4_SNORM: - return TextureDecoder_BC4::getInstance(); - case Latte::E_GX2SURFFMT::BC5_UNORM: - return 
TextureDecoder_BC5::getInstance(); - case Latte::E_GX2SURFFMT::BC5_SNORM: - return TextureDecoder_BC5::getInstance(); - case Latte::E_GX2SURFFMT::R24_X8_UNORM: - return TextureDecoder_R24_X8::getInstance(); // TODO: correct? - case Latte::E_GX2SURFFMT::X24_G8_UINT: - return TextureDecoder_X24_G8_UINT::getInstance(); // todo: correct? - default: - debug_printf("invalid color texture format %u\n", (uint32)format); - cemu_assert_debug(false); - return nullptr; - } - } -} - MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode) { switch (primitiveMode) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 726d13eb7..7544ceed9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -7,6 +7,7 @@ //#include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Common/precompiled.h" +#include "HW/Latte/Core/LatteTextureLoader.h" struct Uvec2 { uint32 x; @@ -27,6 +28,7 @@ struct MetalPixelFormatInfo { size_t bytesPerBlock; Uvec2 blockTexelSize = {1, 1}; bool hasStencil = false; + TextureDecoder* textureDecoder = nullptr; }; void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support); @@ -61,8 +63,6 @@ size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint32 height, size_t bytesPerRow); -TextureDecoder* GetMtlTextureDecoder(Latte::E_GX2SURFFMT format, bool isDepth); - MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode); MTL::VertexFormat GetMtlVertexFormat(uint8 format); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 589fb20af..61a533550 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -497,7 +497,7 @@ void 
MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) { - return GetMtlTextureDecoder(format, isDepth); + return GetMtlPixelFormatInfo(format, isDepth).textureDecoder; } void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) From f9e71e9eb2f1f54dda5e05ed73f0f223782cd674 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 28 Sep 2024 12:53:34 +0200 Subject: [PATCH 187/368] implement ABGR4 texture decoders --- src/Cafe/HW/Latte/Core/LatteTextureLoader.h | 86 ++++++++++++++++++- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 10 +-- 2 files changed, 90 insertions(+), 6 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h index f6de57d68..5df2a0e46 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h @@ -632,6 +632,47 @@ class TextureDecoder_R4_G4_UNORM_To_RGBA4_vk : public TextureDecoder, public Sin } }; +class TextureDecoder_R4_G4_UNORM_To_ABGR4 : public TextureDecoder, public SingletonClass +{ +public: + sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override + { + return 2; + } + + void decode(LatteTextureLoaderCtx* textureLoader, uint8* outputData) override + { + for (sint32 y = 0; y < textureLoader->height; y += textureLoader->stepY) + { + sint32 yc = y; + for (sint32 x = 0; x < textureLoader->width; x += textureLoader->stepX) + { + uint8* blockData = LatteTextureLoader_GetInput(textureLoader, x, y); + sint32 pixelOffset = (x + yc * textureLoader->width) * 2; + uint8 v = (*(uint8*)(blockData + 0)); + uint8 c0 = (v & 0xF); + uint8 c1 = (v >> 4) & 0xF; + v = (c0 << 4) | c1; + *(uint8*)(outputData + pixelOffset + 0) = v; + *(uint8*)(outputData + pixelOffset + 1) = 0; + } + } + } + + void decodePixelToRGBA(uint8* blockData, uint8* 
outputPixel, uint8 blockOffsetX, uint8 blockOffsetY) override + { + uint8 v0 = *(blockData + 0); + uint8 c0 = (v0 & 0xF); + uint8 c1 = (v0 >> 4) & 0xF; + c0 = (c0 << 4) | c0; + c1 = (c1 << 4) | c1; + *(outputPixel + 0) = c0; + *(outputPixel + 1) = c1; + *(outputPixel + 2) = 0; + *(outputPixel + 3) = 255; + } +}; + class TextureDecoder_R4G4_UNORM_To_RGBA8 : public TextureDecoder, public SingletonClass { public: @@ -723,6 +764,49 @@ class TextureDecoder_R4_G4_B4_A4_UNORM : public TextureDecoder, public Singleton } }; +class TextureDecoder_R4G4B4A4_UNORM_To_ABGR4 : public TextureDecoder, public SingletonClass +{ +public: + sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override + { + return 2; + } + + void decode(LatteTextureLoaderCtx* textureLoader, uint8* outputData) override + { + for (sint32 y = 0; y < textureLoader->height; y += textureLoader->stepY) + { + sint32 yc = y; + for (sint32 x = 0; x < textureLoader->width; x += textureLoader->stepX) + { + uint8* blockData = LatteTextureLoader_GetInput(textureLoader, x, y); + sint32 pixelOffset = (x + yc * textureLoader->width) * 2; + uint8 v0 = (*(uint8*)(blockData + 0)); + uint8 v1 = (*(uint8*)(blockData + 1)); + *(uint8*)(outputData + pixelOffset + 0) = v0; // todo: Verify + *(uint8*)(outputData + pixelOffset + 1) = v1; // todo: Verify + } + } + } + + void decodePixelToRGBA(uint8* blockData, uint8* outputPixel, uint8 blockOffsetX, uint8 blockOffsetY) override + { + uint8 v0 = *(blockData + 0); + uint8 v1 = *(blockData + 1); + uint8 c0 = (v0 & 0xF); + uint8 c1 = (v0 >> 4) & 0xF; + uint8 c2 = (v1 & 0xF); + uint8 c3 = (v1 >> 4) & 0xF; + c0 = (c0 << 4) | c0; + c1 = (c1 << 4) | c1; + c2 = (c2 << 4) | c2; + c3 = (c3 << 4) | c3; + *(outputPixel + 0) = c0; + *(outputPixel + 1) = c1; + *(outputPixel + 2) = c2; + *(outputPixel + 3) = c3; + } +}; class TextureDecoder_R4G4B4A4_UNORM_To_RGBA8 : public TextureDecoder, public SingletonClass { @@ -2121,4 +2205,4 @@ class TextureDecoder_BC5 : public TextureDecoder, 
public SingletonClass Date: Sat, 28 Sep 2024 15:24:10 +0200 Subject: [PATCH 188/368] use alternative texture decoders for unsupported formats --- src/Cafe/HW/Latte/Core/LatteTextureLoader.h | 2 +- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 79 +++++++++---------- 2 files changed, 39 insertions(+), 42 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h index 5df2a0e46..e1c1b413c 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h @@ -764,7 +764,7 @@ class TextureDecoder_R4_G4_B4_A4_UNORM : public TextureDecoder, public Singleton } }; -class TextureDecoder_R4G4B4A4_UNORM_To_ABGR4 : public TextureDecoder, public SingletonClass +class TextureDecoder_R4_G4_B4_A4_UNORM_To_ABGR4 : public TextureDecoder, public SingletonClass { public: sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 550e652f0..dcf61e880 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -1,5 +1,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cemu/Logging/CemuLogging.h" +#include "HW/Latte/Core/LatteTextureLoader.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "Metal/MTLPixelFormat.hpp" @@ -122,7 +123,7 @@ void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support) MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].textureDecoder = TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].textureDecoder = TextureDecoder_A1_B5_G5_R5_UNORM::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT].textureDecoder = TextureDecoder_R11_G11_B10_FLOAT::getInstance(); - MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = 
TextureDecoder_R4G4B4A4_UNORM_To_ABGR4::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4_G4_B4_A4_UNORM_To_ABGR4::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM].textureDecoder = TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM].textureDecoder = TextureDecoder_R10_G10_B10_A2_SNORM_To_RGBA16::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB].textureDecoder = TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); @@ -139,50 +140,46 @@ void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support) MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R24_X8_UNORM].textureDecoder = TextureDecoder_R24_X8::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::X24_G8_UINT].textureDecoder = TextureDecoder_X24_G8_UINT::getInstance(); - // Depth - MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].textureDecoder = TextureDecoder_D24_S8::getInstance(); - MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_FLOAT].textureDecoder = TextureDecoder_NullData64::getInstance(); // TODO: why? 
- MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_FLOAT].textureDecoder = TextureDecoder_R32_FLOAT::getInstance(); - MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D16_UNORM].textureDecoder = TextureDecoder_R16_UNORM::getInstance(); - MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_S8_FLOAT].textureDecoder = TextureDecoder_D32_S8_UINT_X24::getInstance(); - - // Color formats - for (auto& [fmt, formatInfo] : MTL_COLOR_FORMAT_TABLE) + if (!support.m_supportsPacked16BitFormats) { - switch (formatInfo.pixelFormat) - { - case MTL::PixelFormatR8Unorm_sRGB: - if (!support.m_supportsR8Unorm_sRGB) - formatInfo.pixelFormat = MTL::PixelFormatRGBA8Unorm_sRGB; - break; - case MTL::PixelFormatRG8Unorm_sRGB: - if (!support.m_supportsRG8Unorm_sRGB) - formatInfo.pixelFormat = MTL::PixelFormatRGBA8Unorm_sRGB; - break; - case MTL::PixelFormatB5G6R5Unorm: - case MTL::PixelFormatA1BGR5Unorm: - case MTL::PixelFormatABGR4Unorm: - case MTL::PixelFormatBGR5A1Unorm: - if (!support.m_supportsPacked16BitFormats) - formatInfo.pixelFormat = MTL::PixelFormatRGBA8Unorm; - break; - default: - break; - } + // B5G6R5Unorm + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm; + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].bytesPerBlock = 4; + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].textureDecoder = TextureDecoder_R5G6B5_UNORM_To_RGBA8::getInstance(); + + // A1BGR5Unorm + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm; + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].textureDecoder = TextureDecoder_A1_B5_G5_R5_UNORM_vulkan_To_RGBA8::getInstance(); + + // ABGR4Unorm + // TODO: use RG8Unorm + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm; + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].bytesPerBlock = 4; + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].textureDecoder = 
TextureDecoder_R4G4_UNORM_To_RGBA8::getInstance(); + + // ABGR4Unorm + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm; + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].bytesPerBlock = 4; + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4G4B4A4_UNORM_To_RGBA8::getInstance(); + + // BGR5A1Unorm + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm; + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].textureDecoder = TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB_To_RGBA8::getInstance(); } - // Depth formats - for (auto& [fmt, formatInfo] : MTL_DEPTH_FORMAT_TABLE) + // Depth + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].textureDecoder = TextureDecoder_D24_S8::getInstance(); + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_FLOAT].textureDecoder = TextureDecoder_NullData64::getInstance(); // TODO: why? 
+ MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_FLOAT].textureDecoder = TextureDecoder_R32_FLOAT::getInstance(); + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D16_UNORM].textureDecoder = TextureDecoder_R16_UNORM::getInstance(); + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_S8_FLOAT].textureDecoder = TextureDecoder_D32_S8_UINT_X24::getInstance(); + + if (!support.m_supportsDepth24Unorm_Stencil8) { - switch (formatInfo.pixelFormat) - { - case MTL::PixelFormatDepth24Unorm_Stencil8: - if (!support.m_supportsDepth24Unorm_Stencil8) - formatInfo.pixelFormat = MTL::PixelFormatDepth32Float_Stencil8; - break; - default: - break; - } + // Depth24Unorm_Stencil8 + MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].pixelFormat = MTL::PixelFormatDepth32Float_Stencil8; + // TODO: implement the decoder + //MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].textureDecoder = TextureDecoder_D24_S8_To_D32_S8::getInstance(); } } From 8c30bc805ca05dfc7c256f7a8b8d0feb26c64217 Mon Sep 17 00:00:00 2001 From: Samo Z Date: Sat, 28 Sep 2024 17:42:27 +0200 Subject: [PATCH 189/368] use RG8Unorm format if ABGR4Unorm is not available --- src/Cafe/HW/Latte/Core/LatteTextureLoader.h | 45 +++++++++++++++++++ .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 7 ++- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h index e1c1b413c..656d8a3a6 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h @@ -720,6 +720,51 @@ class TextureDecoder_R4G4_UNORM_To_RGBA8 : public TextureDecoder, public Singlet } }; +class TextureDecoder_R4G4_UNORM_To_RG8 : public TextureDecoder, public SingletonClass +{ +public: + sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override + { + return 2; + } + + void decode(LatteTextureLoaderCtx* textureLoader, uint8* outputData) override + { + for (sint32 y = 0; y < textureLoader->height; y += 
textureLoader->stepY) + { + sint32 yc = y; + for (sint32 x = 0; x < textureLoader->width; x += textureLoader->stepX) + { + uint8* blockData = LatteTextureLoader_GetInput(textureLoader, x, y); + sint32 pixelOffset = (x + yc * textureLoader->width) * 2; + uint8 v0 = (*(uint8*)(blockData + 0)); + + uint8 red4 = (v0 >> 4) & 0xF; + uint8 green4 = (v0 & 0xF); + + red4 = (red4 << 4) | red4; + green4 = (green4 << 4) | green4; + + *(uint8*)(outputData + pixelOffset + 0) = red4; + *(uint8*)(outputData + pixelOffset + 1) = green4; + } + } + } + + void decodePixelToRGBA(uint8* blockData, uint8* outputPixel, uint8 blockOffsetX, uint8 blockOffsetY) override + { + uint8 v0 = *(blockData + 0); + uint8 red4 = (v0 >> 4) & 0xF; + uint8 green4 = (v0 & 0xF); + red4 = (red4 << 4) | red4; + green4 = (green4 << 4) | green4; + *(outputPixel + 0) = red4; + *(outputPixel + 1) = green4; + *(outputPixel + 2) = 0; + *(outputPixel + 3) = 255; + } +}; + class TextureDecoder_R4_G4_B4_A4_UNORM : public TextureDecoder, public SingletonClass { public: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index dcf61e880..d3a5850c5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -152,10 +152,9 @@ void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support) MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].textureDecoder = TextureDecoder_A1_B5_G5_R5_UNORM_vulkan_To_RGBA8::getInstance(); // ABGR4Unorm - // TODO: use RG8Unorm - MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm; - MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].bytesPerBlock = 4; - MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].textureDecoder = TextureDecoder_R4G4_UNORM_To_RGBA8::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].pixelFormat = MTL::PixelFormatRG8Unorm; + 
MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].bytesPerBlock = 2; + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].textureDecoder = TextureDecoder_R4G4_UNORM_To_RG8::getInstance(); // ABGR4Unorm MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm; From 6c8cd5c4452db2f7d68f3409f769e2e70ba142c3 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 30 Sep 2024 13:31:28 +0200 Subject: [PATCH 190/368] use private storage mode when possible --- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index c2f08532e..d324d6522 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -117,7 +117,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModeShared); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index ff74a8ee6..ee0652492 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -58,7 +58,7 @@ class MetalVertexBufferCache class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModeShared), m_tempBufferAllocator(metalRenderer), 
m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} ~MetalMemoryManager(); // Pipelines From b088ddcfabb9e2748e99553f0f8509d6fb7021ff Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 30 Sep 2024 14:18:00 +0200 Subject: [PATCH 191/368] implement output shaders --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 20 +- .../HW/Latte/Renderer/RendererOuputShader.cpp | 182 +++++++++++++++++- .../HW/Latte/Renderer/RendererOuputShader.h | 7 +- 3 files changed, 205 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 61a533550..9e0b46413 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,6 +20,7 @@ #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "HW/Latte/Renderer/Renderer.h" +#include "Metal/MTLRenderPipeline.hpp" #include "config/CemuConfig.h" #define IMGUI_IMPL_METAL_CPP @@ -297,8 +298,25 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor); renderPassDescriptor->release(); + // Get a render pipeline + auto vertexShaderMtl = static_cast(shader->GetVertexShader())->GetFunction(); + auto fragmentShaderMtl = static_cast(shader->GetFragmentShader())->GetFunction(); + + auto renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(vertexShaderMtl); + 
renderPipelineDescriptor->setFragmentFunction(fragmentShaderMtl); + renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(m_state.m_usesSRGB ? MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm); + + NS::Error* error = nullptr; + auto renderPipelineState = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); + if (error) + { + printf("AAA: %s\n", error->localizedDescription()->utf8String()); + error->release(); + } + // Draw to Metal layer - renderCommandEncoder->setRenderPipelineState(m_state.m_usesSRGB ? m_presentPipelineSRGB : m_presentPipelineLinear); + renderCommandEncoder->setRenderPipelineState(renderPipelineState); renderCommandEncoder->setFragmentTexture(presentTexture, 0); renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? m_linearSampler : m_nearestSampler), 0); diff --git a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp index ab468055c..d77b19c57 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp @@ -20,6 +20,19 @@ void main() } )"; +const std::string RendererOutputShader::s_copy_shader_source_mtl = +R"(#include +using namespace metal; + +struct VertexOut { + float2 uv; +}; + +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]]) { + return float4(textureSrc.sample(samplr, in.uv).rgb, 1.0); +} +)"; + const std::string RendererOutputShader::s_bicubic_shader_source = R"( #version 420 @@ -80,6 +93,56 @@ void main(){ } )"; +const std::string RendererOutputShader::s_bicubic_shader_source_mtl = +R"(#include +using namespace metal; + +float4 cubic(float x) { + float x2 = x * x; + float x3 = x2 * x; + float4 w; + w.x = -x3 + 3 * x2 - 3 * x + 1; + w.y = 3 * x3 - 6 * x2 + 4; + w.z = -3 * x3 + 3 * x2 + 3 * x + 1; + w.w = x3; + return w / 6.0; +} + +float4 bcFilter(texture2d textureSrc, sampler samplr, float2 texcoord, 
float2 texscale) { + float fx = fract(texcoord.x); + float fy = fract(texcoord.y); + texcoord.x -= fx; + texcoord.y -= fy; + + float4 xcubic = cubic(fx); + float4 ycubic = cubic(fy); + + float4 c = float4(texcoord.x - 0.5, texcoord.x + 1.5, texcoord.y - 0.5, texcoord.y + 1.5); + float4 s = float4(xcubic.x + xcubic.y, xcubic.z + xcubic.w, ycubic.x + ycubic.y, ycubic.z + ycubic.w); + float4 offset = c + float4(xcubic.y, xcubic.w, ycubic.y, ycubic.w) / s; + + float4 sample0 = textureSrc.sample(samplr, float2(offset.x, offset.z) * texscale); + float4 sample1 = textureSrc.sample(samplr, float2(offset.y, offset.z) * texscale); + float4 sample2 = textureSrc.sample(samplr, float2(offset.x, offset.w) * texscale); + float4 sample3 = textureSrc.sample(samplr, float2(offset.y, offset.w) * texscale); + + float sx = s.x / (s.x + s.y); + float sy = s.z / (s.z + s.w); + + return mix( + mix(sample3, sample2, sx), + mix(sample1, sample0, sx), sy); +} + +struct VertexOut { + float2 uv; +}; + +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]], constant float2& textureSrcResolution [[buffer(0)]]) { + return float4(bcFilter(textureSrc, samplr, in.uv * textureSrcResolution, float2(1.0, 1.0) / textureSrcResolution).rgb, 1.0); +} +)"; + const std::string RendererOutputShader::s_hermite_shader_source = R"(#version 420 @@ -147,6 +210,70 @@ void main(){ } )"; +const std::string RendererOutputShader::s_hermite_shader_source_mtl = +R"(#include +using namespace metal; + +// https://www.shadertoy.com/view/MllSzX + +float3 CubicHermite(float3 A, float3 B, float3 C, float3 D, float t) { + float t2 = t*t; + float t3 = t*t*t; + float3 a = -A/2.0 + (3.0*B)/2.0 - (3.0*C)/2.0 + D/2.0; + float3 b = A - (5.0*B)/2.0 + 2.0*C - D / 2.0; + float3 c = -A/2.0 + C/2.0; + float3 d = B; + + return a*t3 + b*t2 + c*t + d; +} + + +float3 BicubicHermiteTexture(texture2d textureSrc, sampler samplr, float2 uv, float4 texelSize) { + float2 pixel = 
uv*texelSize.zw + 0.5; + float2 frac = fract(pixel); + pixel = floor(pixel) / texelSize.zw - float2(texelSize.xy/2.0); + + float4 doubleSize = texelSize*texelSize; + + float3 C00 = textureSrc.sample(samplr, pixel + float2(-texelSize.x ,-texelSize.y)).rgb; + float3 C10 = textureSrc.sample(samplr, pixel + float2( 0.0 ,-texelSize.y)).rgb; + float3 C20 = textureSrc.sample(samplr, pixel + float2( texelSize.x ,-texelSize.y)).rgb; + float3 C30 = textureSrc.sample(samplr, pixel + float2( doubleSize.x,-texelSize.y)).rgb; + + float3 C01 = textureSrc.sample(samplr, pixel + float2(-texelSize.x , 0.0)).rgb; + float3 C11 = textureSrc.sample(samplr, pixel + float2( 0.0 , 0.0)).rgb; + float3 C21 = textureSrc.sample(samplr, pixel + float2( texelSize.x , 0.0)).rgb; + float3 C31 = textureSrc.sample(samplr, pixel + float2( doubleSize.x, 0.0)).rgb; + + float3 C02 = textureSrc.sample(samplr, pixel + float2(-texelSize.x , texelSize.y)).rgb; + float3 C12 = textureSrc.sample(samplr, pixel + float2( 0.0 , texelSize.y)).rgb; + float3 C22 = textureSrc.sample(samplr, pixel + float2( texelSize.x , texelSize.y)).rgb; + float3 C32 = textureSrc.sample(samplr, pixel + float2( doubleSize.x, texelSize.y)).rgb; + + float3 C03 = textureSrc.sample(samplr, pixel + float2(-texelSize.x , doubleSize.y)).rgb; + float3 C13 = textureSrc.sample(samplr, pixel + float2( 0.0 , doubleSize.y)).rgb; + float3 C23 = textureSrc.sample(samplr, pixel + float2( texelSize.x , doubleSize.y)).rgb; + float3 C33 = textureSrc.sample(samplr, pixel + float2( doubleSize.x, doubleSize.y)).rgb; + + float3 CP0X = CubicHermite(C00, C10, C20, C30, frac.x); + float3 CP1X = CubicHermite(C01, C11, C21, C31, frac.x); + float3 CP2X = CubicHermite(C02, C12, C22, C32, frac.x); + float3 CP3X = CubicHermite(C03, C13, C23, C33, frac.x); + + return CubicHermite(CP0X, CP1X, CP2X, CP3X, frac.y); +} + +struct VertexOut { + float4 position [[position]]; + float2 uv; +}; + +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc 
[[texture(0)]], sampler samplr [[sampler(0)]], constant float2& outputResolution [[buffer(0)]]) { + float4 texelSize = float4(1.0 / outputResolution.xy, outputResolution.xy); + return float4(BicubicHermiteTexture(textureSrc, samplr, in.uv, texelSize), 1.0); +} +)"; + RendererOutputShader::RendererOutputShader(const std::string& vertex_source, const std::string& fragment_source) { m_vertex_shader = g_renderer->shader_create(RendererShader::ShaderType::kVertex, 0, 0, vertex_source, false, false); @@ -341,6 +468,45 @@ void main(){ )"; return vertex_source.str(); } + +std::string RendererOutputShader::GetMetalVertexSource(bool render_upside_down) +{ + // vertex shader + std::ostringstream vertex_source; + vertex_source << + R"(#include +using namespace metal; + +struct VertexOut { + float4 position [[position]]; + float2 uv; +}; + +vertex VertexOut main0(ushort vid [[vertex_id]]) { + VertexOut out; + float2 pos; + if (vid == 0) pos = float2(-1.0, -3.0); + else if (vid == 1) pos = float2(-1.0, 1.0); + else if (vid == 2) pos = float2(3.0, 1.0); + out.uv = pos * 0.5 + 0.5; + out.uv.y = 1.0 - out.uv.y; +)"; + + if (render_upside_down) + { + vertex_source << + R"( pos.y = -pos.y; + )"; + } + + vertex_source << + R"( out.position = float4(pos, 0.0, 1.0); + return out; +} +)"; + return vertex_source.str(); +} + void RendererOutputShader::InitializeStatic() { std::string vertex_source, vertex_source_ud; @@ -372,7 +538,19 @@ void RendererOutputShader::InitializeStatic() s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source); s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source);*/ - } else { - cemuLog_logDebug(LogType::Force, "Output shader not implemented for Metal"); + } + else + { + vertex_source = GetMetalVertexSource(false); + vertex_source_ud = GetMetalVertexSource(true); + + s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source_mtl); + s_copy_shader_ud = new 
RendererOutputShader(vertex_source_ud, s_copy_shader_source_mtl); + + s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source_mtl); + s_bicubic_shader_ud = new RendererOutputShader(vertex_source_ud, s_bicubic_shader_source_mtl); + + s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source_mtl); + s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source_mtl); } } diff --git a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h index 398ac663a..e175dfe87 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h @@ -40,8 +40,9 @@ class RendererOutputShader static RendererOutputShader* s_hermit_shader; static RendererOutputShader* s_hermit_shader_ud; - static std::string GetVulkanVertexSource(bool render_upside_down); static std::string GetOpenGlVertexSource(bool render_upside_down); + static std::string GetVulkanVertexSource(bool render_upside_down); + static std::string GetMetalVertexSource(bool render_upside_down); protected: RendererShader* m_vertex_shader; @@ -61,4 +62,8 @@ class RendererOutputShader static const std::string s_bicubic_shader_source_vk; static const std::string s_hermite_shader_source_vk; + + static const std::string s_copy_shader_source_mtl; + static const std::string s_bicubic_shader_source_mtl; + static const std::string s_hermite_shader_source_mtl; }; From 28e553eb1afad155ba7d975107b75801d1e77701 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 30 Sep 2024 16:28:07 +0200 Subject: [PATCH 192/368] cache output shaders --- src/Cafe/CMakeLists.txt | 2 + .../Renderer/Metal/MetalOutputShaderCache.cpp | 38 +++++++++++++++++++ .../Renderer/Metal/MetalOutputShaderCache.h | 20 ++++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 29 +++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 2 + 5 files changed, 77 insertions(+), 14 deletions(-) create mode 
100644 src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 8dd401b7a..c73604589 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -558,6 +558,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalBufferAllocator.h HW/Latte/Renderer/Metal/MetalMemoryManager.cpp HW/Latte/Renderer/Metal/MetalMemoryManager.h + HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp + HW/Latte/Renderer/Metal/MetalOutputShaderCache.h HW/Latte/Renderer/Metal/MetalPipelineCache.cpp HW/Latte/Renderer/Metal/MetalPipelineCache.h HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp new file mode 100644 index 000000000..8a69a442a --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp @@ -0,0 +1,38 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" + +MetalOutputShaderCache::~MetalOutputShaderCache() +{ + for (uint8 i = 0; i < METAL_OUTPUT_SHADER_CACHE_SIZE; i++) + { + if (m_cache[i]) + m_cache[i]->release(); + } +} + +MTL::RenderPipelineState* MetalOutputShaderCache::GetPipeline(RendererOutputShader* shader, uint8 shaderIndex, bool usesSRGB) +{ + uint8 cacheIndex = (usesSRGB ? 
METAL_SHADER_TYPE_COUNT : 0) + shaderIndex; + auto& renderPipelineState = m_cache[cacheIndex]; + if (renderPipelineState) + return renderPipelineState; + + // Create a new render pipeline state + auto vertexShaderMtl = static_cast(shader->GetVertexShader())->GetFunction(); + auto fragmentShaderMtl = static_cast(shader->GetFragmentShader())->GetFunction(); + + auto renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(vertexShaderMtl); + renderPipelineDescriptor->setFragmentFunction(fragmentShaderMtl); + renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(usesSRGB ? MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm); + + NS::Error* error = nullptr; + renderPipelineState = m_mtlr->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); + if (error) + { + cemuLog_log(LogType::Force, "error creating output render pipeline state: {}", error->localizedDescription()->utf8String()); + error->release(); + } + + return renderPipelineState; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h new file mode 100644 index 000000000..85b9e8b24 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h @@ -0,0 +1,20 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +constexpr uint8 METAL_SHADER_TYPE_COUNT = 6; +constexpr uint8 METAL_OUTPUT_SHADER_CACHE_SIZE = 2 * METAL_SHADER_TYPE_COUNT; + +class MetalOutputShaderCache +{ +public: + MetalOutputShaderCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalOutputShaderCache(); + + MTL::RenderPipelineState* GetPipeline(RendererOutputShader* shader, uint8 shaderIndex, bool usesSRGB); + +private: + class MetalRenderer* m_mtlr; + + MTL::RenderPipelineState* m_cache[METAL_OUTPUT_SHADER_CACHE_SIZE] = {nullptr}; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 9e0b46413..117245445 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -4,6 +4,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" @@ -81,6 +82,7 @@ MetalRenderer::MetalRenderer() textureDescriptor->release(); m_memoryManager = new MetalMemoryManager(this); + m_outputShaderCache = new MetalOutputShaderCache(this); m_pipelineCache = new MetalPipelineCache(this); m_depthStencilCache = new MetalDepthStencilCache(this); m_samplerCache = new MetalSamplerCache(this); @@ -174,6 +176,7 @@ MetalRenderer::~MetalRenderer() m_presentPipelineLinear->release(); m_presentPipelineSRGB->release(); + delete m_outputShaderCache; delete m_pipelineCache; delete m_depthStencilCache; delete m_samplerCache; @@ -276,7 +279,6 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) m_performanceMonitor.ResetPerFrameData(); } -// TODO: use `shader` for drawing void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) @@ -299,21 +301,20 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput renderPassDescriptor->release(); // Get a render pipeline - auto vertexShaderMtl = static_cast(shader->GetVertexShader())->GetFunction(); - auto fragmentShaderMtl = static_cast(shader->GetFragmentShader())->GetFunction(); - auto renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); 
- renderPipelineDescriptor->setVertexFunction(vertexShaderMtl); - renderPipelineDescriptor->setFragmentFunction(fragmentShaderMtl); - renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(m_state.m_usesSRGB ? MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm); + // Find out which shader we are using + uint8 shaderIndex = 255; + if (shader == RendererOutputShader::s_copy_shader) shaderIndex = 0; + else if (shader == RendererOutputShader::s_bicubic_shader) shaderIndex = 1; + else if (shader == RendererOutputShader::s_hermit_shader) shaderIndex = 2; + else if (shader == RendererOutputShader::s_copy_shader_ud) shaderIndex = 3; + else if (shader == RendererOutputShader::s_bicubic_shader_ud) shaderIndex = 4; + else if (shader == RendererOutputShader::s_hermit_shader_ud) shaderIndex = 5; - NS::Error* error = nullptr; - auto renderPipelineState = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); - if (error) - { - printf("AAA: %s\n", error->localizedDescription()->utf8String()); - error->release(); - } + uint8 shaderType = shaderIndex % 3; + + // Get the render pipeline state + auto renderPipelineState = m_outputShaderCache->GetPipeline(shader, shaderIndex, m_state.m_usesSRGB); // Draw to Metal layer renderCommandEncoder->setRenderPipelineState(renderPipelineState); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 8eb0e319d..04956ac77 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -4,6 +4,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" struct MetalBufferAllocation { @@ -460,6 +461,7 @@ class MetalRenderer : public Renderer // Managers and caches class MetalMemoryManager* m_memoryManager; + class MetalOutputShaderCache* m_outputShaderCache; 
class MetalPipelineCache* m_pipelineCache; class MetalDepthStencilCache* m_depthStencilCache; class MetalSamplerCache* m_samplerCache; From 07cb8b800abb5fe8f61055c62e0be31645bf8c6d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 30 Sep 2024 16:39:26 +0200 Subject: [PATCH 193/368] set output shader uniforms --- src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp | 5 ++--- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 12 ++++++++++++ src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp | 3 ++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index 4f88a0a0c..aca804690 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -934,7 +934,7 @@ void LatteRenderTarget_copyToBackbuffer(LatteTextureView* textureView, bool isPa { sint32 scaling_filter = downscaling ? GetConfig().downscale_filter : GetConfig().upscale_filter; - if (g_renderer->GetType() == RendererAPI::Vulkan || g_renderer->GetType() == RendererAPI::Metal) + if (g_renderer->GetType() == RendererAPI::Vulkan) { // force linear or nearest neighbor filter if(scaling_filter != kLinearFilter && scaling_filter != kNearestNeighborFilter) @@ -978,8 +978,7 @@ void LatteRenderTarget_copyToBackbuffer(LatteTextureView* textureView, bool isPa filter = LatteTextureView::MagFilter::kNearestNeighbor; } } - // HACK: comment out the assert - //cemu_assert(shader); + cemu_assert(shader); g_renderer->DrawBackbufferQuad(textureView, shader, filter==LatteTextureView::MagFilter::kLinear, imageX, imageY, imageWidth, imageHeight, isPadView, clearBackground); g_renderer->HandleScreenshotRequest(textureView, isPadView); if (!g_renderer->ImguiBegin(!isPadView)) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 117245445..ff2d716d8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -310,6 +310,7 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput else if (shader == RendererOutputShader::s_copy_shader_ud) shaderIndex = 3; else if (shader == RendererOutputShader::s_bicubic_shader_ud) shaderIndex = 4; else if (shader == RendererOutputShader::s_hermit_shader_ud) shaderIndex = 5; + printf("Shader index: %u\n", shaderIndex); uint8 shaderType = shaderIndex % 3; @@ -321,6 +322,17 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput renderCommandEncoder->setFragmentTexture(presentTexture, 0); renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? m_linearSampler : m_nearestSampler), 0); + // Set uniforms + float outputSize[2] = {(float)imageWidth, (float)imageHeight}; + switch (shaderType) + { + case 2: + renderCommandEncoder->setFragmentBytes(outputSize, sizeof(outputSize), 0); + break; + default: + break; + } + renderCommandEncoder->setViewport(MTL::Viewport{(double)imageX, (double)imageY, (double)imageWidth, (double)imageHeight, 0.0, 1.0}); renderCommandEncoder->setScissorRect(MTL::ScissorRect{(uint32)imageX, (uint32)imageY, (uint32)imageWidth, (uint32)imageHeight}); diff --git a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp index d77b19c57..3a0b9b46d 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp @@ -138,7 +138,8 @@ struct VertexOut { float2 uv; }; -fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]], constant float2& textureSrcResolution [[buffer(0)]]) { +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]]) { + float2 textureSrcResolution = float2(textureSrc.get_width(), textureSrc.get_height()); return float4(bcFilter(textureSrc, samplr, in.uv * 
textureSrcResolution, float2(1.0, 1.0) / textureSrcResolution).rgb, 1.0); } )"; From 778037f335d79061df1f61c8b5b4c216889cb27a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 30 Sep 2024 19:28:09 +0200 Subject: [PATCH 194/368] remove useless print statement --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ff2d716d8..8bd906617 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -310,7 +310,6 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput else if (shader == RendererOutputShader::s_copy_shader_ud) shaderIndex = 3; else if (shader == RendererOutputShader::s_bicubic_shader_ud) shaderIndex = 4; else if (shader == RendererOutputShader::s_hermit_shader_ud) shaderIndex = 5; - printf("Shader index: %u\n", shaderIndex); uint8 shaderType = shaderIndex % 3; From c65123bbbe8bbf7cf07a1a521eb485f794d59f36 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 30 Sep 2024 19:36:13 +0200 Subject: [PATCH 195/368] use logging instead of printing --- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 7 +------ src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 2 +- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index d3a5850c5..8a73da2bd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -310,7 +310,7 @@ MTL::VertexFormat GetMtlVertexFormat(uint8 format) case FMT_2_10_10_10: return MTL::VertexFormatUInt; // verified to match OpenGL default: - printf("unsupported vertex format %u\n", (uint32)format); + 
cemuLog_log(LogType::Force, "unsupported vertex format {}", (uint32)format); assert_dbg(); return MTL::VertexFormatInvalid; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp index f822fb13c..7043b99fe 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp @@ -15,7 +15,7 @@ MetalHybridComputePipeline::MetalHybridComputePipeline(class MetalRenderer* mtlR vertexFunction->release(); if (error) { - printf("error creating hybrid render pipeline state: %s\n", error->localizedDescription()->utf8String()); + cemuLog_log(LogType::Force, "error creating hybrid render pipeline state: {}", error->localizedDescription()->utf8String()); error->release(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 8bd906617..cfe45fb87 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -17,6 +17,7 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" +#include "Cemu/Logging/CemuLogging.h" #include "HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" @@ -938,12 +939,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); - // TODO: is this even needed? 
Also, should go to draw_beginSequence - if (!vertexShader || !static_cast(vertexShader->shader)->GetFunction()) - { - printf("no vertex function, skipping draw\n"); - return; - } const auto fetchShader = LatteSHRC_GetActiveFetchShader(); bool neverSkipAccurateBarrier = false; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index d343ef453..84751eee1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -20,7 +20,7 @@ RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), nullptr, &error); if (error) { - printf("failed to create library (error: %s) -> source:\n%s\n", error->localizedDescription()->utf8String(), mslCode.c_str()); + cemuLog_log(LogType::Force, "failed to create library: {}", error->localizedDescription()->utf8String()); error->release(); return; } From 5d01c77efc5d464377dfb99ea90ac96057882f81 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 30 Sep 2024 19:43:53 +0200 Subject: [PATCH 196/368] rename hybrid pipeline to void vertex pipeline --- src/Cafe/CMakeLists.txt | 4 ++-- .../Metal/MetalHybridComputePipeline.h | 20 ------------------- .../Renderer/Metal/MetalMemoryManager.cpp | 2 +- .../Latte/Renderer/Metal/MetalMemoryManager.h | 6 +++--- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 8 ++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 6 +++--- ...peline.cpp => MetalVoidVertexPipeline.cpp} | 9 +++------ .../Renderer/Metal/MetalVoidVertexPipeline.h | 16 +++++++++++++++ 8 files changed, 32 insertions(+), 39 deletions(-) delete mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h rename src/Cafe/HW/Latte/Renderer/Metal/{MetalHybridComputePipeline.cpp => MetalVoidVertexPipeline.cpp} (67%) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h diff 
--git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index c73604589..9b5f8d3ed 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -566,8 +566,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalDepthStencilCache.h HW/Latte/Renderer/Metal/MetalSamplerCache.cpp HW/Latte/Renderer/Metal/MetalSamplerCache.h - HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp - HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h + HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp + HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h HW/Latte/Renderer/Metal/MetalQuery.cpp HW/Latte/Renderer/Metal/MetalQuery.h HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h deleted file mode 100644 index 5aeee65f2..000000000 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h +++ /dev/null @@ -1,20 +0,0 @@ -#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" -#include "HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Metal/MTLLibrary.hpp" -#include "Metal/MTLRenderPipeline.hpp" - -// TODO: rename to MetalVoidVertexPipeline -class MetalHybridComputePipeline -{ -public: - MetalHybridComputePipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName/*, const std::string& kernelFunctionName*/); - ~MetalHybridComputePipeline(); - - MTL::RenderPipelineState* GetRenderPipelineState() const { return m_renderPipelineState; } - - //MTL::RenderPipelineState* GetComputePipelineState() const { return m_computePipelineState; } - -private: - MTL::RenderPipelineState* m_renderPipelineState; - //MTL::RenderPipelineState* m_computePipelineState; -}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index d324d6522..9765b24ac 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,6 +1,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" #include "Common/precompiled.h" #include "HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Metal/MTLResource.hpp" diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index ee0652492..a79111fd3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -24,7 +24,7 @@ class MetalVertexBufferCache MetalVertexBufferCache(class MetalRenderer* metalRenderer, MetalDefaultBufferAllocator& bufferAllocator) : m_mtlr{metalRenderer}, m_bufferAllocator{bufferAllocator} {} ~MetalVertexBufferCache(); - void SetRestrideBufferPipeline(class MetalHybridComputePipeline* restrideBufferPipeline) + void SetRestrideBufferPipeline(class MetalVoidVertexPipeline* restrideBufferPipeline) { m_restrideBufferPipeline = restrideBufferPipeline; } @@ -48,7 +48,7 @@ class MetalVertexBufferCache class MetalRenderer* m_mtlr; MetalDefaultBufferAllocator& m_bufferAllocator; - class MetalHybridComputePipeline* m_restrideBufferPipeline = nullptr; + class MetalVoidVertexPipeline* m_restrideBufferPipeline = nullptr; MetalVertexBufferRange m_bufferRanges[LATTE_MAX_VERTEX_BUFFERS] = {}; @@ -62,7 +62,7 @@ class MetalMemoryManager ~MetalMemoryManager(); // Pipelines - void SetRestrideBufferPipeline(class MetalHybridComputePipeline* restrideBufferPipeline) + void SetRestrideBufferPipeline(class MetalVoidVertexPipeline* restrideBufferPipeline) { m_vertexBufferCache.SetRestrideBufferPipeline(restrideBufferPipeline); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 
cfe45fb87..0587b2680 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -9,7 +9,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h" @@ -159,9 +159,9 @@ MetalRenderer::MetalRenderer() // Hybrid pipelines if (m_isAppleGPU) - m_copyBufferToBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); - //m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture"); - m_restrideBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexRestrideBuffer"); + m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); + //m_copyTextureToTexturePipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyTextureToTexture"); + m_restrideBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexRestrideBuffer"); utilityLibrary->release(); m_memoryManager->SetRestrideBufferPipeline(m_restrideBufferPipeline); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 04956ac77..327104a19 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -471,9 +471,9 @@ class MetalRenderer : public Renderer MTL::RenderPipelineState* m_presentPipelineSRGB; // Hybrid pipelines - class MetalHybridComputePipeline* m_copyBufferToBufferPipeline; - //class 
MetalHybridComputePipeline* m_copyTextureToTexturePipeline; - class MetalHybridComputePipeline* m_restrideBufferPipeline; + class MetalVoidVertexPipeline* m_copyBufferToBufferPipeline; + //class MetalVoidVertexPipeline* m_copyTextureToTexturePipeline; + class MetalVoidVertexPipeline* m_restrideBufferPipeline; // Resources MTL::SamplerState* m_nearestSampler; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp similarity index 67% rename from src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp rename to src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp index 7043b99fe..ded711f9f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp @@ -1,6 +1,6 @@ -#include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" -MetalHybridComputePipeline::MetalHybridComputePipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName/*, const std::string& kernelFunctionName*/) +MetalVoidVertexPipeline::MetalVoidVertexPipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName) { // Render pipeline state MTL::Function* vertexFunction = library->newFunction(ToNSString(vertexFunctionName)); @@ -18,12 +18,9 @@ MetalHybridComputePipeline::MetalHybridComputePipeline(class MetalRenderer* mtlR cemuLog_log(LogType::Force, "error creating hybrid render pipeline state: {}", error->localizedDescription()->utf8String()); error->release(); } - - // Compute pipeline state } -MetalHybridComputePipeline::~MetalHybridComputePipeline() +MetalVoidVertexPipeline::~MetalVoidVertexPipeline() { m_renderPipelineState->release(); - //m_computePipelineState->release(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h new file mode 100644 index 000000000..57666a57a --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h @@ -0,0 +1,16 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLLibrary.hpp" +#include "Metal/MTLRenderPipeline.hpp" + +class MetalVoidVertexPipeline +{ +public: + MetalVoidVertexPipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName); + ~MetalVoidVertexPipeline(); + + MTL::RenderPipelineState* GetRenderPipelineState() const { return m_renderPipelineState; } + +private: + MTL::RenderPipelineState* m_renderPipelineState; +}; From a3bfde80b08d489e2291563598d41217d3c321a2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 30 Sep 2024 19:59:30 +0200 Subject: [PATCH 197/368] remove old present pipelines --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 ++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 4 +- .../Renderer/Metal/UtilityShaderSource.h | 54 +++++++++---------- 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 0587b2680..f33f8af2e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -120,6 +120,7 @@ MetalRenderer::MetalRenderer() } // Present pipeline + /* MTL::Function* fullscreenVertexFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); MTL::Function* presentFragmentFunction = utilityLibrary->newFunction(ToNSString("fragmentPresent")); @@ -153,6 +154,7 @@ MetalRenderer::MetalRenderer() debug_printf("failed to create sRGB present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); } + */ // Copy texture pipelines auto copyTextureToColorPipelineDescriptor = 
MTL::RenderPipelineDescriptor::alloc()->init(); @@ -174,8 +176,8 @@ MetalRenderer::~MetalRenderer() //delete m_copyTextureToTexturePipeline; delete m_restrideBufferPipeline; - m_presentPipelineLinear->release(); - m_presentPipelineSRGB->release(); + //m_presentPipelineLinear->release(); + //m_presentPipelineSRGB->release(); delete m_outputShaderCache; delete m_pipelineCache; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 327104a19..2c9b150f8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -467,8 +467,8 @@ class MetalRenderer : public Renderer class MetalSamplerCache* m_samplerCache; // Pipelines - MTL::RenderPipelineState* m_presentPipelineLinear; - MTL::RenderPipelineState* m_presentPipelineSRGB; + //MTL::RenderPipelineState* m_presentPipelineLinear; + //MTL::RenderPipelineState* m_presentPipelineSRGB; // Hybrid pipelines class MetalVoidVertexPipeline* m_copyBufferToBufferPipeline; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index 95606df7b..9fba19467 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -3,54 +3,50 @@ #define __STRINGIFY(x) #x #define _STRINGIFY(x) __STRINGIFY(x) -constexpr const char* utilityShaderSource = R"V0G0N( -#include +constexpr const char* utilityShaderSource = R"(#include using namespace metal; #define GET_BUFFER_BINDING(index) (28 + index) #define GET_TEXTURE_BINDING(index) (29 + index) #define GET_SAMPLER_BINDING(index) (14 + index)\n -constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; - -struct VertexOut { - float4 position [[position]]; - float2 texCoord; -}; - -vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { - VertexOut out; - out.position = float4(positions[vid], 0.0, 1.0); - 
out.texCoord = positions[vid] * 0.5 + 0.5; - out.texCoord.y = 1.0 - out.texCoord.y; - - return out; -} - -fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], sampler samplr [[sampler(0)]]) { - return tex.sample(samplr, in.texCoord); -} +//constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; +// +//struct VertexOut { +// float4 position [[position]]; +// float2 texCoord; +//}; +// +//vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { +// VertexOut out; +// out.position = float4(positions[vid], 0.0, 1.0); +// out.texCoord = positions[vid] * 0.5 + 0.5; +// out.texCoord.y = 1.0 - out.texCoord.y; +// +// return out; +//} +// +//fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], //sampler samplr [[sampler(0)]]) { +// return tex.sample(samplr, in.texCoord); +//} vertex void vertexCopyBufferToBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]]) { dst[vid] = src[vid]; } -/* -vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d src [[texture(GET_TEXTURE_BINDING(0))]], texture2d dst [[texture(GET_TEXTURE_BINDING(1))]], constant uint32_t& width [[buffer(GET_BUFFER_BINDING(0))]]) { - uint2 coord = uint2(vid % width, vid / width); - return dst.write(float4(src.read(coord).r, 0.0, 0.0, 0.0), coord); -} -*/ +//vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d src [[texture(GET_TEXTURE_BINDING(0))]], texture2d dst [[texture(GET_TEXTURE_BINDING(1))]], constant uint32_t& width [[buffer(GET_BUFFER_BINDING(0))]]) { +// uint2 coord = uint2(vid % width, vid / width); +// return dst.write(float4(src.read(coord).r, 0.0, 0.0, 0.0), coord); +//} struct RestrideParams { uint oldStride; uint newStride; }; -// TODO: use uint32? 
Since that would require less iterations vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]], constant RestrideParams& params [[buffer(GET_BUFFER_BINDING(2))]]) { for (uint32_t i = 0; i < params.oldStride; i++) { dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; } } -)V0G0N"; +)"; From 94e8ed5a4681d2d31870d0c91d675cee5c82eba8 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 1 Oct 2024 17:38:14 +0200 Subject: [PATCH 198/368] fetch vertices manually if needed --- src/Cafe/HW/Latte/Core/FetchShader.cpp | 50 ++++++-- src/Cafe/HW/Latte/Core/FetchShader.h | 5 +- src/Cafe/HW/Latte/Core/LatteShader.cpp | 21 ++- .../LatteDecompilerEmitMSL.cpp | 92 ++++++++++---- .../LatteDecompilerEmitMSLHeader.hpp | 25 ++-- .../Renderer/Metal/MetalPipelineCache.cpp | 120 +++++++++--------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 33 ++--- .../Renderer/Metal/RendererShaderMtl.cpp | 2 +- 8 files changed, 207 insertions(+), 141 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/FetchShader.cpp b/src/Cafe/HW/Latte/Core/FetchShader.cpp index 5933fe055..6da6100b2 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.cpp +++ b/src/Cafe/HW/Latte/Core/FetchShader.cpp @@ -8,8 +8,12 @@ #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/LatteInstructions.h" +#include "HW/Latte/Renderer/Renderer.h" #include "util/containers/LookupTableL3.h" #include "util/helpers/fspinlock.h" +#if BOOST_OS_MACOS +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#endif #include /* SHA1_DIGEST_LENGTH */ #include /* EVP_Digest */ @@ -71,7 +75,7 @@ uint32 LatteShaderRecompiler_getAttributeAlignment(LatteParsedFetchShaderAttribu return 4; } -void LatteShader_calculateFSKey(LatteFetchShader* fetchShader) +void LatteShader_calculateFSKey(LatteFetchShader* fetchShader, uint32* contextRegister) { 
uint64 key = 0; for (sint32 g = 0; g < fetchShader->bufferGroups.size(); g++) @@ -104,11 +108,25 @@ void LatteShader_calculateFSKey(LatteFetchShader* fetchShader) key = std::rotl(key, 8); key += (uint64)attrib->semanticId; key = std::rotl(key, 8); - key += (uint64)(attrib->offset & 3); - key = std::rotl(key, 2); + if (g_renderer->GetType() == RendererAPI::Metal) + key += (uint64)attrib->offset; + else + key += (uint64)(attrib->offset & 3); + key = std::rotl(key, 7); } } // todo - also hash invalid buffer groups? + + if (g_renderer->GetType() == RendererAPI::Metal) + { + for (sint32 g = 0; g < fetchShader->bufferGroups.size(); g++) + { + LatteParsedFetchShaderBufferGroup_t& group = fetchShader->bufferGroups[g]; + key += (uint64)group.attributeBufferIndex; + key = std::rotl(key, 5); + } + } + fetchShader->key = key; } @@ -146,8 +164,8 @@ void LatteFetchShader::CalculateFetchShaderVkHash() this->vkPipelineHashFragment = h; } -void LatteFetchShader::CalculateFetchShaderMtlObjectShaderHash(uint32* contextRegister) -{uint64 key = 0; +void LatteFetchShader::CheckIfVerticesNeedManualFetchMtl(uint32* contextRegister) +{ for (sint32 g = 0; g < bufferGroups.size(); g++) { LatteParsedFetchShaderBufferGroup_t& group = bufferGroups[g]; @@ -155,12 +173,16 @@ void LatteFetchShader::CalculateFetchShaderMtlObjectShaderHash(uint32* contextRe uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; uint32 bufferStride = (contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - key += (uint64)bufferIndex; - key = std::rotl(key, 5); - key += (uint64)bufferStride; - key = std::rotl(key, 5); + if (bufferStride % 4 != 0) + mtlFetchVertexManually = true; + + for (sint32 f = 0; f < group.attribCount; f++) + { + auto& attr = group.attrib[f]; + if (attr.offset + GetMtlVertexFormatSize(attr.format) > bufferStride) + mtlFetchVertexManually = true; + } } - mtlShaderHashObject = key; } void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* 
parsedFetchShader, uint32* contextRegister, const LatteClauseInstruction_VTX* instr) @@ -343,9 +365,9 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach { // empty fetch shader, seen in Minecraft // these only make sense when vertex shader does not call FS? - LatteShader_calculateFSKey(newFetchShader); + LatteShader_calculateFSKey(newFetchShader, contextRegister); newFetchShader->CalculateFetchShaderVkHash(); - newFetchShader->CalculateFetchShaderMtlObjectShaderHash(contextRegister); + newFetchShader->CheckIfVerticesNeedManualFetchMtl(contextRegister); return newFetchShader; } @@ -403,9 +425,9 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach } bufferGroup.vboStride = vboOffset; } - LatteShader_calculateFSKey(newFetchShader); + LatteShader_calculateFSKey(newFetchShader, contextRegister); newFetchShader->CalculateFetchShaderVkHash(); - newFetchShader->CalculateFetchShaderMtlObjectShaderHash(contextRegister); + newFetchShader->CheckIfVerticesNeedManualFetchMtl(contextRegister); // register in cache // its possible that during multi-threaded shader cache loading, two identical (same hash) fetch shaders get created simultaneously diff --git a/src/Cafe/HW/Latte/Core/FetchShader.h b/src/Cafe/HW/Latte/Core/FetchShader.h index 9aeed6bde..1e580f430 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.h +++ b/src/Cafe/HW/Latte/Core/FetchShader.h @@ -47,16 +47,15 @@ struct LatteFetchShader uint64 vkPipelineHashFragment{}; // hash of all fetch shader state that influences the Vulkan graphics pipeline // Metal - uint64 mtlShaderHashObject{}; + bool mtlFetchVertexManually{}; // cache info CacheHash m_cacheHash{}; bool m_isRegistered{}; // if true, fetch shader is referenced by cache (RegisterInCache() succeeded) - void CalculateFetchShaderVkHash(); - void CalculateFetchShaderMtlObjectShaderHash(uint32* contextRegister); + void CheckIfVerticesNeedManualFetchMtl(uint32* contextRegister); uint64 
getVkPipelineHashFragment() const { return vkPipelineHashFragment; }; diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 0af3b577b..bc1279c32 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -503,11 +503,21 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, uint64 vsHash = vsHash1 + vsHash2 + _activeFetchShader->key + _activePSImportTable.key + (usesGeometryShader ? 0x1111ULL : 0ULL); if (g_renderer->GetType() == RendererAPI::Metal) { - if (usesGeometryShader) + if (usesGeometryShader || _activeFetchShader->mtlFetchVertexManually) { - vsHash += _activeFetchShader->mtlShaderHashObject; + for (sint32 g = 0; g < _activeFetchShader->bufferGroups.size(); g++) + { + LatteParsedFetchShaderBufferGroup_t& group = _activeFetchShader->bufferGroups[g]; + uint32 bufferIndex = group.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + vsHash += (uint64)bufferStride; + vsHash = std::rotl(vsHash, 7); + } } - else + + if (!usesGeometryShader) { // Rasterization bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); @@ -524,6 +534,10 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, if (rasterizationEnabled) vsHash += 51ULL; + + // Vertex fetch + if (_activeFetchShader->mtlFetchVertexManually) + vsHash += 349ULL; } } @@ -531,6 +545,7 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, vsHash += tmp; auto primitiveType = LatteGPUState.contextNew.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + // TODO: include always in the hash in case of geometry shader or rect shader if (primitiveType == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS) { vsHash += 13ULL; diff --git 
a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 615c8a5f1..05ba6abaa 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3854,10 +3854,12 @@ static void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* sh void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader) { bool isRectVertexShader = (static_cast(shaderContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]) == LattePrimitiveMode::RECTS); + bool usesGeometryShader = (shaderContext->options->usesGeometryShader || isRectVertexShader); + bool fetchVertexManually = (usesGeometryShader || (shaderContext->fetchShader && shaderContext->fetchShader->mtlFetchVertexManually)); // Rasterization rasterizationEnabled = true; - if (shader->shaderType == LatteConst::ShaderType::Vertex && !(shaderContext->options->usesGeometryShader || isRectVertexShader)) + if (shader->shaderType == LatteConst::ShaderType::Vertex && !usesGeometryShader) { rasterizationEnabled = !shaderContext->contextRegistersNew->PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); @@ -3885,7 +3887,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->add("#include " _CRLF); src->add("using namespace metal;" _CRLF); // header part (definitions for inputs and outputs) - LatteDecompiler::emitHeader(shaderContext, isRectVertexShader, rasterizationEnabled); + LatteDecompiler::emitHeader(shaderContext, isRectVertexShader, fetchVertexManually, rasterizationEnabled); // helper functions LatteDecompiler_emitHelperFunctions(shaderContext, src); const char* functionType = ""; @@ -3893,21 +3895,32 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: - if 
(shaderContext->options->usesGeometryShader || isRectVertexShader) + if (fetchVertexManually) { // TODO: clean this up - // fetchVertex will modify vid in case of an indexed draw + // fetchVertex will modify vid in case of an object shader and an indexed draw // Vertex buffers std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; std::string vertexBuffers = "#define VERTEX_BUFFERS "; - std::string inputFetchDefinition = "VertexIn fetchVertex(thread uint& vid, device uint* indexBuffer, uchar indexType VERTEX_BUFFER_DEFINITIONS) {\n"; + std::string inputFetchDefinition = "VertexIn fetchVertex("; + if (usesGeometryShader) + inputFetchDefinition += "thread uint&"; + else + inputFetchDefinition += "uint"; + inputFetchDefinition += " vid, uint iid"; + if (usesGeometryShader) + inputFetchDefinition += ", device uint* indexBuffer, uchar indexType"; + inputFetchDefinition += " VERTEX_BUFFER_DEFINITIONS) {\n"; // Index buffer - inputFetchDefinition += "if (indexType == 1) // UShort\n"; - inputFetchDefinition += "vid = ((device ushort*)indexBuffer)[vid];\n"; - inputFetchDefinition += "else if (indexType == 2) // UInt\n"; - inputFetchDefinition += "vid = ((device uint*)indexBuffer)[vid];\n"; + if (usesGeometryShader) + { + inputFetchDefinition += "if (indexType == 1) // UShort\n"; + inputFetchDefinition += "vid = ((device ushort*)indexBuffer)[vid];\n"; + inputFetchDefinition += "else if (indexType == 2) // UInt\n"; + inputFetchDefinition += "vid = ((device uint*)indexBuffer)[vid];\n"; + } inputFetchDefinition += "VertexIn in;\n"; for (auto& bufferGroup : shaderContext->fetchShader->bufferGroups) @@ -3980,11 +3993,22 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, break; } + // Get the fetch type + std::string fetchTypeStr; + if (attr.fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + fetchTypeStr = "vid"; + else if (attr.fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + fetchTypeStr = "iid"; + else if 
(attr.fetchType == LatteConst::VertexFetchType2::NO_INDEX_OFFSET_DATA) + fetchTypeStr = "0"; // TODO: correct? + // Fetch the attribute - inputFetchDefinition += fmt::format("in.ATTRIBUTE_NAME{} = ", semanticId); - inputFetchDefinition += fmt::format("uint4(*(device {}*)", formatName); + inputFetchDefinition += fmt::format("in.ATTRIBUTE_NAME{} = uint4(uint", semanticId); + if (componentCount != 1) + inputFetchDefinition += fmt::format("{}", componentCount); + inputFetchDefinition += fmt::format("(*(device {}*)", formatName); inputFetchDefinition += fmt::format("(vertexBuffer{}", attr.attributeBufferIndex); - inputFetchDefinition += fmt::format(" + vid * {} + {})", bufferStride, attr.offset); + inputFetchDefinition += fmt::format(" + {} * {} + {}))", fetchTypeStr, bufferStride, attr.offset); for (uint8 i = 0; i < (4 - componentCount); i++) inputFetchDefinition += ", 0"; inputFetchDefinition += ");\n"; @@ -4014,7 +4038,10 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->add(vertexBuffers.c_str()); src->add("\n"); src->add(inputFetchDefinition.c_str()); + } + if (usesGeometryShader) + { functionType = "[[object, max_total_threads_per_threadgroup(VERTICES_PER_VERTEX_PRIMITIVE), max_total_threadgroups_per_mesh_grid(1)]]"; outputTypeName = "void"; } @@ -4038,20 +4065,33 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } // start of main src->addFmt("{} {} main0(", functionType, outputTypeName); - LatteDecompiler::emitInputs(shaderContext, isRectVertexShader); + LatteDecompiler::emitInputs(shaderContext, isRectVertexShader, fetchVertexManually); src->add(") {" _CRLF); - if ((shaderContext->options->usesGeometryShader || isRectVertexShader) && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + if (fetchVertexManually && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) { if 
(shader->shaderType == LatteConst::ShaderType::Vertex) { - // Calculate the imaginary vertex id - src->add("uint vid = tig * VERTICES_PER_VERTEX_PRIMITIVE + tid;" _CRLF); - src->add("uint iid = vid / verticesPerInstance;" _CRLF); - src->add("vid %= verticesPerInstance;" _CRLF); - // Fetch the input - src->add("VertexIn in = fetchVertex(vid, indexBuffer, indexType VERTEX_BUFFERS);" _CRLF); - // Output is defined as object payload - src->add("object_data VertexOut& out = objectPayload.vertexOut[tid];" _CRLF); + if (usesGeometryShader) + { + // Calculate the imaginary vertex id + src->add("uint vid = tig * VERTICES_PER_VERTEX_PRIMITIVE + tid;" _CRLF); + src->add("uint iid = vid / verticesPerInstance;" _CRLF); + src->add("vid %= verticesPerInstance;" _CRLF); + + // Fetch the input + src->add("VertexIn in = fetchVertex(vid, iid, indexBuffer, indexType VERTEX_BUFFERS);" _CRLF); + + // Output is defined as object payload + src->add("object_data VertexOut& out = objectPayload.vertexOut[tid];" _CRLF); + } + else + { + // Fetch the input + src->add("VertexIn in = fetchVertex(vid, iid VERTEX_BUFFERS);" _CRLF); + + if (rasterizationEnabled) + src->add("VertexOut out;" _CRLF); + } } else if (shader->shaderType == LatteConst::ShaderType::Geometry) { @@ -4258,11 +4298,11 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } // TODO: is the if statement even needed? 
- if (shaderContext->options->usesGeometryShader || isRectVertexShader) + if (usesGeometryShader) { // import from geometry shader if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = as_type(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); + src->addFmt("{} = bitCast(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); else @@ -4306,7 +4346,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); } - if ((shaderContext->options->usesGeometryShader || isRectVertexShader) && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + if (usesGeometryShader && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) { if (shader->shaderType == LatteConst::ShaderType::Vertex) { @@ -4346,7 +4386,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); // Return - if (!(shaderContext->options->usesGeometryShader || isRectVertexShader) || shader->shaderType == LatteConst::ShaderType::Pixel) + if (!usesGeometryShader || shader->shaderType == LatteConst::ShaderType::Pixel) src->add("return out;" _CRLF); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 9770c595d..5a2c54aca 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -143,7 +143,7 @@ namespace LatteDecompiler } } - static void _emitAttributes(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) + static void _emitAttributes(LatteDecompilerShaderContext* decompilerContext, bool fetchVertexManually) { auto src = decompilerContext->shaderSource; std::string attributeNames; @@ -159,7 +159,7 @@ namespace LatteDecompiler cemu_assert_debug(decompilerContext->output->resourceMappingMTL.attributeMapping[i] >= 0); src->addFmt("uint4 attrDataSem{}", i); - if (decompilerContext->options->usesGeometryShader || isRectVertexShader) + if (fetchVertexManually) attributeNames += "#define ATTRIBUTE_NAME" + std::to_string((sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]) + " attrDataSem" + std::to_string(i) + "\n"; else src->addFmt(" [[attribute({})]]", (sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]); @@ -250,13 +250,13 @@ namespace LatteDecompiler src->add("};" _CRLF _CRLF); } - static void _emitInputsAndOutputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool rasterizationEnabled) + static void _emitInputsAndOutputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool fetchVertexManually, bool rasterizationEnabled) { auto src = decompilerContext->shaderSource; if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) { - _emitAttributes(decompilerContext, isRectVertexShader); + _emitAttributes(decompilerContext, fetchVertexManually); } else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) { @@ -339,13 +339,12 @@ namespace LatteDecompiler } } - static void emitHeader(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool rasterizationEnabled) + static void emitHeader(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool fetchVertexManually, bool 
rasterizationEnabled) { auto src = decompilerContext->shaderSource; if ((decompilerContext->options->usesGeometryShader || isRectVertexShader) && (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) { - // TODO: make vsOutPrimType parth of the shader hash LattePrimitiveMode vsOutPrimType = static_cast(decompilerContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]); uint32 gsOutPrimType = decompilerContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; @@ -398,7 +397,7 @@ namespace LatteDecompiler // uniform buffers _emitUniformBuffers(decompilerContext); // inputs and outputs - _emitInputsAndOutputs(decompilerContext, isRectVertexShader, rasterizationEnabled); + _emitInputsAndOutputs(decompilerContext, isRectVertexShader, fetchVertexManually, rasterizationEnabled); if (dump_shaders_enabled) decompilerContext->shaderSource->add("// end of shader inputs/outputs" _CRLF); @@ -472,7 +471,7 @@ namespace LatteDecompiler } } - static void emitInputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) + static void emitInputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool fetchVertexManually) { auto src = decompilerContext->shaderSource; @@ -491,14 +490,18 @@ namespace LatteDecompiler src->addFmt(", device uint* indexBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexBufferBinding); // TODO: put into the support buffer? 
src->addFmt(", constant uchar& indexType [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexTypeBinding); - src->add(" VERTEX_BUFFER_DEFINITIONS"); } else { - src->add("VertexIn in [[stage_in]]"); - src->add(", uint vid [[vertex_id]]"); + src->add("uint vid [[vertex_id]]"); src->add(", uint iid [[instance_id]]"); } + + if (fetchVertexManually) + src->add(" VERTEX_BUFFER_DEFINITIONS"); + else + src->add(", VertexIn in [[stage_in]]"); + break; case LatteConst::ShaderType::Geometry: src->add("MeshType mesh"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 436ef99cc..b7f5c88c0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -326,76 +326,81 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte if (pipeline) return pipeline; - // Vertex descriptor - MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); - for (auto& bufferGroup : fetchShader->bufferGroups) - { - std::optional fetchType; + auto vertexShaderMtl = static_cast(vertexShader->shader); - uint32 minBufferStride = 0; - for (sint32 j = 0; j < bufferGroup.attribCount; ++j) - { - auto& attr = bufferGroup.attrib[j]; + // Render pipeline state + MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); + desc->setVertexFunction(vertexShaderMtl->GetFunction()); - uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; - if (semanticId == (uint32)-1) - continue; // attribute not used? 
+ // Vertex descriptor + if (!fetchShader->mtlFetchVertexManually) + { + MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); + for (auto& bufferGroup : fetchShader->bufferGroups) + { + std::optional fetchType; - auto attribute = vertexDescriptor->attributes()->object(semanticId); - attribute->setOffset(attr.offset); - attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); - attribute->setFormat(GetMtlVertexFormat(attr.format)); + uint32 minBufferStride = 0; + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& attr = bufferGroup.attrib[j]; - minBufferStride = std::max(minBufferStride, attr.offset + GetMtlVertexFormatSize(attr.format)); + uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not used? - if (fetchType.has_value()) - cemu_assert_debug(fetchType == attr.fetchType); - else - fetchType = attr.fetchType; + auto attribute = vertexDescriptor->attributes()->object(semanticId); + attribute->setOffset(attr.offset); + attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); + attribute->setFormat(GetMtlVertexFormat(attr.format)); - if (attr.fetchType == LatteConst::INSTANCE_DATA) - { - cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported - } - } + minBufferStride = std::max(minBufferStride, attr.offset + GetMtlVertexFormatSize(attr.format)); - uint32 bufferIndex = bufferGroup.attributeBufferIndex; - uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; - uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; - auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); - if (bufferStride == 0) - { - // Buffer stride cannot be zero, let's use the 
minimum stride - bufferStride = minBufferStride; + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } - // Additionally, constant vertex function must be used - layout->setStepFunction(MTL::VertexStepFunctionConstant); - layout->setStepRate(0); - } - else - { - if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) - layout->setStepFunction(MTL::VertexStepFunctionPerVertex); - else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) - layout->setStepFunction(MTL::VertexStepFunctionPerInstance); + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + if (bufferStride == 0) + { + // Buffer stride cannot be zero, let's use the minimum stride + bufferStride = minBufferStride; + + // Additionally, constant vertex function must be used + layout->setStepFunction(MTL::VertexStepFunctionConstant); + layout->setStepRate(0); + } else { - debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); - cemu_assert(false); + if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerVertex); + else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerInstance); + else + { + debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); + cemu_assert(false); + } } - } - bufferStride = Align(bufferStride, 4); - layout->setStride(bufferStride); - } - - auto vertexShaderMtl = static_cast(vertexShader->shader); + bufferStride = Align(bufferStride, 4); + layout->setStride(bufferStride); + } - // Render pipeline state 
- MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); - desc->setVertexFunction(vertexShaderMtl->GetFunction()); - // TODO: don't always set the vertex descriptor? - desc->setVertexDescriptor(vertexDescriptor); + // TODO: don't always set the vertex descriptor? + desc->setVertexDescriptor(vertexDescriptor); + vertexDescriptor->release(); + } SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr); @@ -448,7 +453,6 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte } } desc->release(); - vertexDescriptor->release(); return pipeline; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f33f8af2e..41f48b111 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -18,11 +18,11 @@ #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cemu/Logging/CemuDebugLogging.h" #include "Cemu/Logging/CemuLogging.h" -#include "HW/Latte/Core/LatteConst.h" -#include "HW/Latte/Renderer/Metal/MetalCommon.h" -#include "HW/Latte/Renderer/Metal/MetalLayerHandle.h" -#include "HW/Latte/Renderer/Renderer.h" -#include "Metal/MTLRenderPipeline.hpp" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" #include "config/CemuConfig.h" #define IMGUI_IMPL_METAL_CPP @@ -975,6 +975,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + //bool fetchVertexManually = (usesGeometryShader || fetchShader->mtlFetchVertexManually); // Index buffer Renderer::INDEX_TYPE hostIndexType; @@ -1174,26 +1175,8 
@@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto& vertexBufferRange = m_state.m_vertexBuffers[i]; if (vertexBufferRange.offset != INVALID_OFFSET) { - MTL::Buffer* buffer; - size_t offset; - - // Restride - if (usesGeometryShader) - { - // Object shaders don't need restriding, since the attributes are fetched in the shader - buffer = m_memoryManager->GetBufferCache(); - offset = m_state.m_vertexBuffers[i].offset; - } - else - { - uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + i * 7; - uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - - auto restridedBuffer = m_memoryManager->RestrideBufferIfNeeded(i, bufferStride, barrierBuffers); - - buffer = restridedBuffer.buffer; - offset = restridedBuffer.offset; - } + MTL::Buffer* buffer = m_memoryManager->GetBufferCache(); + size_t offset = m_state.m_vertexBuffers[i].offset; // Bind SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), buffer, offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 84751eee1..359b9fd0c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -20,7 +20,7 @@ RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), nullptr, &error); if (error) { - cemuLog_log(LogType::Force, "failed to create library: {}", error->localizedDescription()->utf8String()); + cemuLog_log(LogType::Force, "failed to create library: {} -> {}", error->localizedDescription()->utf8String(), mslCode.c_str()); error->release(); return; } From f2096dedddb9a605c467a043df569f7f3c0c212c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 1 Oct 2024 18:11:05 +0200 Subject: [PATCH 199/368] 
remove vertex restriding code --- .../Renderer/Metal/MetalMemoryManager.cpp | 16 +++--- .../Latte/Renderer/Metal/MetalMemoryManager.h | 11 ++-- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 51 ++++++++++++++----- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 6 ++- 4 files changed, 56 insertions(+), 28 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 9765b24ac..1c788e215 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -2,9 +2,8 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" #include "Common/precompiled.h" -#include "HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Metal/MTLResource.hpp" +/* MetalVertexBufferCache::~MetalVertexBufferCache() { } @@ -28,13 +27,11 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu restrideInfo.allocation = m_bufferAllocator.GetBufferAllocation(newSize); buffer = m_bufferAllocator.GetBuffer(restrideInfo.allocation.bufferIndex); - /* - uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; - uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.offset; + //uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; + //uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.offset; - for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) - memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); - */ + //for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) + // memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); if (m_mtlr->GetEncoderType() == MetalEncoderType::Render) { @@ -94,6 +91,7 @@ void MetalVertexBufferCache::MemoryRangeChanged(size_t offset, size_t size) } } } +*/ MetalMemoryManager::~MetalMemoryManager() { @@ 
-144,7 +142,7 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si m_tempBufferAllocator.UnlockBuffer(allocation.bufferIndex); // Notify vertex buffer cache about the change - m_vertexBufferCache.MemoryRangeChanged(offset, size); + //m_vertexBufferCache.MemoryRangeChanged(offset, size); } void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index a79111fd3..4ea5769e8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -1,8 +1,8 @@ #pragma once #include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h" -#include "Metal/MTLResource.hpp" +/* struct MetalRestridedBufferRange { MTL::Buffer* buffer; @@ -54,18 +54,21 @@ class MetalVertexBufferCache void MemoryRangeChanged(size_t offset, size_t size); }; +*/ class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer)/*, m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator)*/ {} ~MetalMemoryManager(); // Pipelines + /* void SetRestrideBufferPipeline(class MetalVoidVertexPipeline* restrideBufferPipeline) { m_vertexBufferCache.SetRestrideBufferPipeline(restrideBufferPipeline); } + */ MetalDefaultBufferAllocator& GetBufferAllocator() { @@ -95,6 +98,7 @@ class 
MetalMemoryManager void CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size); // Vertex buffer cache + /* void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo* restrideInfo) { m_vertexBufferCache.TrackVertexBuffer(bufferIndex, offset, size, restrideInfo); @@ -109,6 +113,7 @@ class MetalMemoryManager { return m_vertexBufferCache.RestrideBufferIfNeeded(m_bufferCache, bufferIndex, stride, barrierBuffers); } + */ private: class MetalRenderer* m_mtlr; @@ -118,7 +123,7 @@ class MetalMemoryManager MetalDefaultBufferAllocator m_bufferAllocator; MetalDefaultBufferAllocator m_framePersistentBufferAllocator; MetalTemporaryBufferAllocator m_tempBufferAllocator; - MetalVertexBufferCache m_vertexBufferCache; + //MetalVertexBufferCache m_vertexBufferCache; MTL::Buffer* m_bufferCache = nullptr; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 41f48b111..7f10365ba 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -163,10 +163,10 @@ MetalRenderer::MetalRenderer() if (m_isAppleGPU) m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); //m_copyTextureToTexturePipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyTextureToTexture"); - m_restrideBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexRestrideBuffer"); + //m_restrideBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexRestrideBuffer"); utilityLibrary->release(); - m_memoryManager->SetRestrideBufferPipeline(m_restrideBufferPipeline); + //m_memoryManager->SetRestrideBufferPipeline(m_restrideBufferPipeline); } MetalRenderer::~MetalRenderer() @@ -174,7 +174,7 @@ MetalRenderer::~MetalRenderer() if (m_isAppleGPU) delete m_copyBufferToBufferPipeline; //delete m_copyTextureToTexturePipeline; - delete 
m_restrideBufferPipeline; + //delete m_restrideBufferPipeline; //m_presentPipelineLinear->release(); //m_presentPipelineSRGB->release(); @@ -831,16 +831,16 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u if (buffer.offset == offset && buffer.size == size) return; - if (buffer.offset != INVALID_OFFSET) - { - m_memoryManager->UntrackVertexBuffer(bufferIndex); - } + //if (buffer.offset != INVALID_OFFSET) + //{ + // m_memoryManager->UntrackVertexBuffer(bufferIndex); + //} buffer.offset = offset; buffer.size = size; - buffer.restrideInfo = {}; + //buffer.restrideInfo = {}; - m_memoryManager->TrackVertexBuffer(bufferIndex, offset, size, &buffer.restrideInfo); + //m_memoryManager->TrackVertexBuffer(bufferIndex, offset, size, &buffer.restrideInfo); } void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) @@ -1169,12 +1169,35 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Resources // Vertex buffers - std::vector barrierBuffers; + //std::vector barrierBuffers; for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) { auto& vertexBufferRange = m_state.m_vertexBuffers[i]; if (vertexBufferRange.offset != INVALID_OFFSET) { + /* + MTL::Buffer* buffer; + size_t offset; + + // Restride + if (usesGeometryShader) + { + // Object shaders don't need restriding, since the attributes are fetched in the shader + buffer = m_memoryManager->GetBufferCache(); + offset = m_state.m_vertexBuffers[i].offset; + } + else + { + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + i * 7; + uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + auto restridedBuffer = m_memoryManager->RestrideBufferIfNeeded(i, bufferStride, barrierBuffers); + + buffer = restridedBuffer.buffer; + offset = restridedBuffer.offset; + } + */ + MTL::Buffer* buffer = m_memoryManager->GetBufferCache(); size_t offset = 
m_state.m_vertexBuffers[i].offset; @@ -1183,10 +1206,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } } - if (!barrierBuffers.empty()) - { - renderCommandEncoder->memoryBarrier(barrierBuffers.data(), barrierBuffers.size(), MTL::RenderStageVertex, MTL::RenderStageVertex); - } + //if (!barrierBuffers.empty()) + //{ + // renderCommandEncoder->memoryBarrier(barrierBuffers.data(), barrierBuffers.size(), MTL::RenderStageVertex, MTL::RenderStageVertex); + //} // Render pipeline state MTL::RenderPipelineState* renderPipelineState; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 2c9b150f8..526f33a5c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -19,19 +19,21 @@ struct MetalBufferAllocation } }; +/* struct MetalRestrideInfo { bool memoryInvalidated = true; size_t lastStride = 0; MetalBufferAllocation allocation{}; }; +*/ struct MetalBoundBuffer { size_t offset = INVALID_OFFSET; size_t size = 0; // Memory manager will write restride info to this variable - MetalRestrideInfo restrideInfo; + //MetalRestrideInfo restrideInfo; }; enum MetalGeneralShaderType @@ -473,7 +475,7 @@ class MetalRenderer : public Renderer // Hybrid pipelines class MetalVoidVertexPipeline* m_copyBufferToBufferPipeline; //class MetalVoidVertexPipeline* m_copyTextureToTexturePipeline; - class MetalVoidVertexPipeline* m_restrideBufferPipeline; + //class MetalVoidVertexPipeline* m_restrideBufferPipeline; // Resources MTL::SamplerState* m_nearestSampler; From 50175fce66dbb6ec3b15dc4c4c19bcaa5e877366 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 1 Oct 2024 18:18:39 +0200 Subject: [PATCH 200/368] report manual vertex fetch draws per frame --- .../Renderer/Metal/MetalPerformanceMonitor.h | 6 +++-- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 26 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h index e9e1690c4..1bf017b25 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h @@ -8,7 +8,8 @@ class MetalPerformanceMonitor // Per frame data uint32 m_renderPasses = 0; uint32 m_clears = 0; - uint32 m_vertexBufferRestrides = 0; + uint32 m_manualVertexFetchDraws = 0; + uint32 m_meshDraws = 0; uint32 m_triangleFans = 0; MetalPerformanceMonitor() = default; @@ -18,7 +19,8 @@ class MetalPerformanceMonitor { m_renderPasses = 0; m_clears = 0; - m_vertexBufferRestrides = 0; + m_manualVertexFetchDraws = 0; + m_meshDraws = 0; m_triangleFans = 0; } }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 7f10365ba..1e5711f9f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -475,20 +475,20 @@ void MetalRenderer::DeleteFontTextures() void MetalRenderer::AppendOverlayDebugInfo() { ImGui::Text("--- GPU info ---"); - ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no")); - ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no")); - ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? "yes" : "no")); + ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no")); + ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no")); + ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? 
"yes" : "no")); ImGui::Text("--- Metal info ---"); - ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize()); - ImGui::Text("Buffer allocator memory %zuMB", m_performanceMonitor.m_bufferAllocatorMemory / 1024 / 1024); + ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize()); + ImGui::Text("Buffer allocator memory %zuMB", m_performanceMonitor.m_bufferAllocatorMemory / 1024 / 1024); ImGui::Text("--- Metal info (per frame) ---"); - ImGui::Text("Command buffers %zu", m_commandBuffers.size()); - ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); - ImGui::Text("Clears %u", m_performanceMonitor.m_clears); - ImGui::Text("Vertex buffer restrides %u", m_performanceMonitor.m_vertexBufferRestrides); - ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans); + ImGui::Text("Command buffers %zu", m_commandBuffers.size()); + ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); + ImGui::Text("Clears %u", m_performanceMonitor.m_clears); + ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws); + ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans); } // TODO: halfZ @@ -975,7 +975,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); - //bool fetchVertexManually = (usesGeometryShader || fetchShader->mtlFetchVertexManually); + bool fetchVertexManually = (usesGeometryShader || fetchShader->mtlFetchVertexManually); // Index buffer Renderer::INDEX_TYPE hostIndexType; @@ -1299,6 +1299,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteStreamout_FinishDrawcall(false); // Debug + if (fetchVertexManually) + 
m_performanceMonitor.m_manualVertexFetchDraws++; + if (usesGeometryShader) + m_performanceMonitor.m_meshDraws++; if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN) m_performanceMonitor.m_triangleFans++; From 2fb4d83a5ff3753e2b5684f80cc315b82b23457e Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 1 Oct 2024 19:07:19 +0200 Subject: [PATCH 201/368] don't set verticesPerInstance twice --- .../LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 4 ++-- .../LatteDecompilerEmitMSLHeader.hpp | 7 ++----- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 +----- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 05ba6abaa..13f7bccff 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -4075,8 +4075,8 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, { // Calculate the imaginary vertex id src->add("uint vid = tig * VERTICES_PER_VERTEX_PRIMITIVE + tid;" _CRLF); - src->add("uint iid = vid / verticesPerInstance;" _CRLF); - src->add("vid %= verticesPerInstance;" _CRLF); + src->add("uint iid = vid / supportBuffer.verticesPerInstance;" _CRLF); + src->add("vid %= supportBuffer.verticesPerInstance;" _CRLF); // Fetch the input src->add("VertexIn in = fetchVertex(vid, iid, indexBuffer, indexType VERTEX_BUFFERS);" _CRLF); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 5a2c54aca..1e3091a6e 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -85,8 +85,7 @@ namespace LatteDecompiler uniformCurrentOffset += 8; } // define verticesPerInstance + 
streamoutBufferBaseX - if ((shader->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || - (shader->shaderType == LatteConst::ShaderType::Geometry)) + if (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry) { src->add("int verticesPerInstance;" _CRLF); uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; @@ -484,9 +483,7 @@ namespace LatteDecompiler src->add(", mesh_grid_properties meshGridProperties"); src->add(", uint tig [[threadgroup_position_in_grid]]"); src->add(", uint tid [[thread_index_in_threadgroup]]"); - // TODO: put into the support buffer? - src->addFmt(", constant uint& verticesPerInstance [[buffer({})]]", decompilerContext->output->resourceMappingMTL.verticesPerInstanceBinding); - // TODO: inly include index buffer if needed + // TODO: only include index buffer if needed src->addFmt(", device uint* indexBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexBufferBinding); // TODO: put into the support buffer? 
src->addFmt(", constant uchar& indexType [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexTypeBinding); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 1e5711f9f..0416f2d9b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1247,13 +1247,9 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // We have already retrieved the buffer, no need for it to be locked anymore bufferAllocator.UnlockBuffer(indexBufferIndex); } + if (usesGeometryShader) { - uint32 verticesPerInstance = count / instanceCount; - // TODO: make a helper function for this - renderCommandEncoder->setObjectBytes(&verticesPerInstance, sizeof(verticesPerInstance), vertexShader->resourceMapping.verticesPerInstanceBinding); - encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.verticesPerInstanceBinding] = {nullptr}; - if (indexBuffer) SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexBuffer, indexBufferOffset, vertexShader->resourceMapping.indexBufferBinding); From 6ba6157bae298adac1b72b307faa865c65d72275 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 1 Oct 2024 19:37:11 +0200 Subject: [PATCH 202/368] properly implement ABGR4 decoder --- src/Cafe/HW/Latte/Core/LatteTextureLoader.h | 43 +------------------ .../Latte/Renderer/Vulkan/VulkanRenderer.cpp | 2 +- 2 files changed, 2 insertions(+), 43 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h index 656d8a3a6..0f558945e 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h @@ -594,7 +594,7 @@ class TextureDecoder_R4_G4_UNORM_To_RGBA4 : public TextureDecoder, public Single } }; -class TextureDecoder_R4_G4_UNORM_To_RGBA4_vk : public TextureDecoder, public SingletonClass +class 
TextureDecoder_R4_G4_UNORM_To_ABGR4 : public TextureDecoder, public SingletonClass { public: sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override @@ -632,47 +632,6 @@ class TextureDecoder_R4_G4_UNORM_To_RGBA4_vk : public TextureDecoder, public Sin } }; -class TextureDecoder_R4_G4_UNORM_To_ABGR4 : public TextureDecoder, public SingletonClass -{ -public: - sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override - { - return 2; - } - - void decode(LatteTextureLoaderCtx* textureLoader, uint8* outputData) override - { - for (sint32 y = 0; y < textureLoader->height; y += textureLoader->stepY) - { - sint32 yc = y; - for (sint32 x = 0; x < textureLoader->width; x += textureLoader->stepX) - { - uint8* blockData = LatteTextureLoader_GetInput(textureLoader, x, y); - sint32 pixelOffset = (x + yc * textureLoader->width) * 2; - uint8 v = (*(uint8*)(blockData + 0)); - uint8 c0 = (v & 0xF); - uint8 c1 = (v >> 4) & 0xF; - v = (c0 << 4) | c1; - *(uint8*)(outputData + pixelOffset + 0) = v; - *(uint8*)(outputData + pixelOffset + 1) = 0; - } - } - } - - void decodePixelToRGBA(uint8* blockData, uint8* outputPixel, uint8 blockOffsetX, uint8 blockOffsetY) override - { - uint8 v0 = *(blockData + 0); - uint8 c0 = (v0 & 0xF); - uint8 c1 = (v0 >> 4) & 0xF; - c0 = (c0 << 4) | c0; - c1 = (c1 << 4) | c1; - *(outputPixel + 0) = c0; - *(outputPixel + 1) = c1; - *(outputPixel + 2) = 0; - *(outputPixel + 3) = 255; - } -}; - class TextureDecoder_R4G4_UNORM_To_RGBA8 : public TextureDecoder, public SingletonClass { public: diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index f966a6128..2f776f7a3 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -2283,7 +2283,7 @@ void VulkanRenderer::GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isD } else { formatInfoOut->vkImageFormat = VK_FORMAT_R4G4B4A4_UNORM_PACK16; - 
formatInfoOut->decoder = TextureDecoder_R4_G4_UNORM_To_RGBA4_vk::getInstance(); + formatInfoOut->decoder = TextureDecoder_R4_G4_UNORM_To_ABGR4::getInstance(); } } else From ebcb62a7851d94a4a1105da1e4ef1eaf8e74e58a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 2 Oct 2024 17:18:20 +0200 Subject: [PATCH 203/368] add: an option to turn fast math off --- src/Cafe/CafeSystem.cpp | 3 +- .../Renderer/Metal/RendererShaderMtl.cpp | 10 +++++-- src/config/CemuConfig.cpp | 20 +++++++------ src/config/CemuConfig.h | 1 + src/gui/GeneralSettings2.cpp | 30 +++++++++++++++++-- src/gui/GeneralSettings2.h | 7 ++--- 6 files changed, 52 insertions(+), 19 deletions(-) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 958a5a576..96e5621dd 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -251,6 +251,7 @@ void InfoLog_PrintActiveSettings() if (ActiveSettings::GetGraphicsAPI() == GraphicAPI::kVulkan) { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); + cemuLog_log(LogType::Force, "Fast math: {}", GetConfig().fast_math.GetValue() ? "true" : "false"); if(!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } @@ -1013,7 +1014,7 @@ namespace CafeSystem { // starting with Cemu 1.27.0 /vol/storage_mlc01/ is virtualized, meaning that it doesn't point to one singular host os folder anymore // instead it now uses a more complex solution to source titles with various formats (folder, wud, wua) from the game paths and host mlc path - + // todo - mount /vol/storage_mlc01/ with base priority to the host mlc? 
// since mounting titles is an expensive operation we have to avoid mounting all titles at once diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 359b9fd0c..4aff3e14c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -7,8 +7,7 @@ #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" -#include "HW/Latte/Core/FetchShader.h" -#include "HW/Latte/ISA/RegDefines.h" +#include "config/CemuConfig.h" extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; @@ -16,8 +15,13 @@ extern std::atomic_int g_compiled_shaders_async; RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer} { + MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); + if (GetConfig().fast_math) + options->setFastMathEnabled(true); + NS::Error* error = nullptr; - MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), nullptr, &error); + MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), options, &error); + options->release(); if (error) { cemuLog_log(LogType::Force, "failed to create library: {} -> {}", error->localizedDescription()->utf8String(), mslCode.c_str()); diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index e7920e84b..06ad94d5e 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -32,7 +32,7 @@ void CemuConfig::Load(XMLConfigParser& parser) mlc_path = mlc; permanent_storage = parser.get("permanent_storage", permanent_storage); - + language = parser.get("language", wxLANGUAGE_DEFAULT); use_discord_presence = parser.get("use_discord_presence", true); 
fullscreen_menubar = parser.get("fullscreen_menubar", false); @@ -102,7 +102,7 @@ void CemuConfig::Load(XMLConfigParser& parser) cemuLog_log(LogType::Force, "config load error: can't load recently launched game file: {}", path); } } - + recent_nfc_files.clear(); auto nfc_parser = parser.get("RecentNFCFiles"); for (auto element = nfc_parser.get("Entry"); element.valid(); element = nfc_parser.get("Entry", element)) @@ -198,7 +198,7 @@ void CemuConfig::Load(XMLConfigParser& parser) { graphic_pack_entries[path].try_emplace("_disabled", "true"); } - + for (auto preset = element.get("Preset"); preset.valid(); preset = element.get("Preset", preset)) { const std::string category = preset.get("category", ""); @@ -206,7 +206,7 @@ void CemuConfig::Load(XMLConfigParser& parser) graphic_pack_entries[path].try_emplace(category, active_preset); } } - + } // graphics @@ -219,6 +219,7 @@ void CemuConfig::Load(XMLConfigParser& parser) downscale_filter = graphic.get("DownscaleFilter", kLinearFilter); fullscreen_scaling = graphic.get("FullscreenScaling", kKeepAspectRatio); async_compile = graphic.get("AsyncCompile", async_compile); + fast_math = graphic.get("FastMath", fast_math); vk_accurate_barriers = graphic.get("vkAccurateBarriers", true); // this used to be "VulkanAccurateBarriers" but because we changed the default to true in 1.27.1 the option name had to be changed auto overlay_node = graphic.get("Overlay"); @@ -373,7 +374,7 @@ void CemuConfig::Save(XMLConfigParser& parser) // config.set("cpu_mode", cpu_mode.GetValue()); //config.set("console_region", console_region.GetValue()); config.set("console_language", console_language.GetValue()); - + auto wpos = config.set("window_position"); wpos.set("x", window_position.x); wpos.set("y", window_position.y); @@ -408,13 +409,13 @@ void CemuConfig::Save(XMLConfigParser& parser) { launch_files_parser.set("Entry", entry.c_str()); } - + auto nfc_files_parser = config.set("RecentNFCFiles"); for (const auto& entry : recent_nfc_files) { 
nfc_files_parser.set("Entry", entry.c_str()); } - + // game paths auto game_path_parser = config.set("GamePaths"); for (const auto& entry : game_paths) @@ -455,11 +456,11 @@ void CemuConfig::Save(XMLConfigParser& parser) entry.set_attribute("disabled", true); continue; } - + auto preset = entry.set("Preset"); if(!kv.first.empty()) preset.set("category", kv.first.c_str()); - + preset.set("preset", kv.second.c_str()); } } @@ -475,6 +476,7 @@ void CemuConfig::Save(XMLConfigParser& parser) graphic.set("DownscaleFilter", downscale_filter); graphic.set("FullscreenScaling", fullscreen_scaling); graphic.set("AsyncCompile", async_compile.GetValue()); + graphic.set("FastMath", fast_math.GetValue()); graphic.set("vkAccurateBarriers", vk_accurate_barriers); auto overlay_node = graphic.set("Overlay"); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 842350774..c231ae08c 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -444,6 +444,7 @@ struct CemuConfig ConfigValue gx2drawdone_sync {true}; ConfigValue render_upside_down{ false }; ConfigValue async_compile{ true }; + ConfigValue fast_math{ true }; ConfigValue vk_accurate_barriers{ true }; diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 63121e292..4cd0be9a4 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -344,6 +344,10 @@ wxPanel* GeneralSettings2::AddGraphicsPage(wxNotebook* notebook) m_async_compile->SetToolTip(_("Enables async shader and pipeline compilation. Reduces stutter at the cost of objects not rendering for a short time.\nVulkan only")); graphic_misc_row->Add(m_async_compile, 0, wxALL, 5); + m_fast_math = new wxCheckBox(box, wxID_ANY, _("Fast math")); + m_fast_math->SetToolTip(_("Enables fast math for all shaders. 
May cause minor inaccuracies in some games.\nMetal only")); + graphic_misc_row->Add(m_fast_math, 0, wxALL, 5); + m_gx2drawdone_sync = new wxCheckBox(box, wxID_ANY, _("Full sync at GX2DrawDone()")); m_gx2drawdone_sync->SetToolTip(_("If synchronization is requested by the game, the emulated CPU will wait for the GPU to finish all operations.\nThis is more accurate behavior, but may cause lower performance")); graphic_misc_row->Add(m_gx2drawdone_sync, 0, wxALL, 5); @@ -1038,6 +1042,7 @@ void GeneralSettings2::StoreConfig() config.vsync = m_vsync->GetSelection(); config.gx2drawdone_sync = m_gx2drawdone_sync->IsChecked(); config.async_compile = m_async_compile->IsChecked(); + config.fast_math = m_fast_math->IsChecked(); config.upscale_filter = m_upscale_filter->GetSelection(); config.downscale_filter = m_downscale_filter->GetSelection(); @@ -1500,7 +1505,7 @@ void GeneralSettings2::HandleGraphicsApiSelection() selection = GetConfig().vsync; m_vsync->Clear(); - if(m_graphic_api->GetSelection() == 0) + if (m_graphic_api->GetSelection() == 0) { // OpenGL m_vsync->AppendString(_("Off")); @@ -1515,12 +1520,14 @@ void GeneralSettings2::HandleGraphicsApiSelection() m_gx2drawdone_sync->Enable(); m_async_compile->Disable(); + m_fast_math->Disable(); } - else + else if (m_graphic_api->GetSelection() == 1) { // Vulkan m_gx2drawdone_sync->Disable(); m_async_compile->Enable(); + m_fast_math->Disable(); m_vsync->AppendString(_("Off")); m_vsync->AppendString(_("Double buffering")); @@ -1553,6 +1560,24 @@ void GeneralSettings2::HandleGraphicsApiSelection() } } } + else + { + // Metal + m_gx2drawdone_sync->Disable(); + m_async_compile->Enable(); + m_fast_math->Enable(); + + // TODO: vsync options + m_vsync->AppendString(_("Off")); + m_vsync->AppendString(_("Double buffering")); + m_vsync->AppendString(_("Triple buffering")); + + m_vsync->Select(selection); + + // TODO: add an option to select the graphic device + m_graphic_device->Clear(); + m_graphic_device->Disable(); + } } void 
GeneralSettings2::ApplyConfig() @@ -1604,6 +1629,7 @@ void GeneralSettings2::ApplyConfig() m_graphic_api->SetSelection(config.graphic_api); m_vsync->SetSelection(config.vsync); m_async_compile->SetValue(config.async_compile); + m_fast_math->SetValue(config.fast_math); m_gx2drawdone_sync->SetValue(config.gx2drawdone_sync); m_upscale_filter->SetSelection(config.upscale_filter); m_downscale_filter->SetSelection(config.downscale_filter); diff --git a/src/gui/GeneralSettings2.h b/src/gui/GeneralSettings2.h index b1ab01e86..01c4845fa 100644 --- a/src/gui/GeneralSettings2.h +++ b/src/gui/GeneralSettings2.h @@ -28,7 +28,7 @@ class GeneralSettings2 : public wxDialog bool m_has_account_change = false; // keep track of dirty state of accounts - + wxPanel* AddGeneralPage(wxNotebook* notebook); wxPanel* AddGraphicsPage(wxNotebook* notebook); wxPanel* AddAudioPage(wxNotebook* notebook); @@ -52,7 +52,7 @@ class GeneralSettings2 : public wxDialog // Graphics wxChoice* m_graphic_api, * m_graphic_device; wxChoice* m_vsync; - wxCheckBox *m_async_compile, *m_gx2drawdone_sync; + wxCheckBox *m_async_compile, *m_fast_math, *m_gx2drawdone_sync; wxRadioBox* m_upscale_filter, *m_downscale_filter, *m_fullscreen_scaling; wxChoice* m_overlay_position, *m_notification_position, *m_overlay_scale, *m_notification_scale; wxCheckBox* m_controller_profile_name, *m_controller_low_battery, *m_shader_compiling, *m_friends_data; @@ -106,11 +106,10 @@ class GeneralSettings2 : public wxDialog void UpdateAudioDevice(); // refreshes audio device list for dropdown void UpdateAudioDeviceList(); - + void ResetAccountInformation(); void UpdateAccountInformation(); void UpdateOnlineAccounts(); void HandleGraphicsApiSelection(); void ApplyConfig(); }; - From 6b784ad949eaadd4d4f03c928dba66fa0a754230 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 2 Oct 2024 17:28:50 +0200 Subject: [PATCH 204/368] log relevant settings --- src/Cafe/CafeSystem.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff 
--git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 96e5621dd..a6691c3d5 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -251,8 +251,13 @@ void InfoLog_PrintActiveSettings() if (ActiveSettings::GetGraphicsAPI() == GraphicAPI::kVulkan) { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); + if (!GetConfig().vk_accurate_barriers.GetValue()) + cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); + } + else if (ActiveSettings::GetGraphicsAPI() == GraphicAPI::kMetal) + { cemuLog_log(LogType::Force, "Fast math: {}", GetConfig().fast_math.GetValue() ? "true" : "false"); - if(!GetConfig().vk_accurate_barriers.GetValue()) + if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } cemuLog_log(LogType::Force, "Console language: {}", stdx::to_underlying(config.console_language.GetValue())); From 756470f1a7e6926850211283b5c1bbd0b2f362c6 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 9 Oct 2024 16:47:15 +0200 Subject: [PATCH 205/368] remove CMake hacks --- CMakeLists.txt | 11 +++++------ src/Cafe/CafeSystem.cpp | 1 + src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp | 1 - 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 898f00868..d840ffc92 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,7 +111,7 @@ endif() option(ENABLE_OPENGL "Enables the OpenGL backend" ON) option(ENABLE_VULKAN "Enables the Vulkan backend" ON) -option(ENABLE_METAL "Enables the Metal backend" $ENABLE_METAL_DEFAULT) +option(ENABLE_METAL "Enables the Metal backend" ${ENABLE_METAL_DEFAULT}) option(ENABLE_DISCORD_RPC "Enables the Discord Rich Presence feature" ON) if (ENABLE_METAL AND NOT APPLE) @@ -202,8 +202,7 @@ if (ENABLE_OPENGL) find_package(OpenGL REQUIRED) endif() -# TODO: handle this differently? 
-if (ENABLE_METAL AND APPLE) +if (ENABLE_METAL) include_directories(${CMAKE_SOURCE_DIR}/dependencies/metal-cpp) endif() @@ -232,9 +231,9 @@ if (ENABLE_WXWIDGETS) endif() if (ENABLE_CUBEB) - #if (NOT ENABLE_VCPKG) - #find_package(cubeb) - #endif() + if (NOT ENABLE_VCPKG) + find_package(cubeb) + endif() if (NOT cubeb_FOUND) option(BUILD_TESTS "" OFF) option(BUILD_TOOLS "" OFF) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index a6691c3d5..1d9ebd063 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -256,6 +256,7 @@ void InfoLog_PrintActiveSettings() } else if (ActiveSettings::GetGraphicsAPI() == GraphicAPI::kMetal) { + cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); cemuLog_log(LogType::Force, "Fast math: {}", GetConfig().fast_math.GetValue() ? "true" : "false"); if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 8a73da2bd..477213350 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -156,7 +156,6 @@ void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support) MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].bytesPerBlock = 2; MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].textureDecoder = TextureDecoder_R4G4_UNORM_To_RG8::getInstance(); - // ABGR4Unorm MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm; MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].bytesPerBlock = 4; MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4G4B4A4_UNORM_To_RGBA8::getInstance(); From bfd9059eecf4225306ad070fdef30b6c82346ed9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 9 Oct 2024 16:54:02 +0200 Subject: 
[PATCH 206/368] remove the -g flag on debug builds --- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d840ffc92..8a04c7719 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,5 @@ cmake_minimum_required(VERSION 3.21.1) -# TODO: remove this -set(CMAKE_CXX_FLAGS_DEBUG "-g") - option(ENABLE_VCPKG "Enable the vcpkg package manager" ON) option(MACOS_BUNDLE "The executable when built on macOS will be created as an application bundle" OFF) From bdfac965e0c24e9bb3555f47d5e0478be74277b0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 10 Oct 2024 18:26:14 +0200 Subject: [PATCH 207/368] implement async shader compilation --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 12 +- .../Renderer/Metal/RendererShaderMtl.cpp | 151 +++++++++++++++++- .../Latte/Renderer/Metal/RendererShaderMtl.h | 34 ++-- 3 files changed, 177 insertions(+), 20 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 0416f2d9b..7cd858576 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -223,14 +223,16 @@ void MetalRenderer::ResizeLayer(const Vector2i& size, bool mainWindow) void MetalRenderer::Initialize() { Renderer::Initialize(); + RendererShaderMtl::Initialize(); } void MetalRenderer::Shutdown() { // TODO: should shutdown both layers ImGui_ImplMetal_Shutdown(); - Renderer::Shutdown(); CommitCommandBuffer(); + Renderer::Shutdown(); + RendererShaderMtl::Shutdown(); } bool MetalRenderer::IsPadWindowActive() @@ -935,13 +937,21 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 return; } + // TODO: special state 8 and 5 + auto& encoderState = m_state.m_encoderState; // Shaders LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); + if (vertexShader && !vertexShader->shader->IsCompiled()) + return; LatteDecompilerShader* geometryShader 
= LatteSHRC_GetActiveGeometryShader(); + if (geometryShader && !geometryShader->shader->IsCompiled()) + return; LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); const auto fetchShader = LatteSHRC_GetActiveFetchShader(); + if (pixelShader && !pixelShader->shader->IsCompiled()) + return; bool neverSkipAccurateBarrier = false; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 4aff3e14c..0f6740806 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -1,6 +1,5 @@ #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" //#include "Cemu/FileCache/FileCache.h" //#include "config/ActiveSettings.h" @@ -8,35 +7,173 @@ #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" #include "config/CemuConfig.h" +#include "util/helpers/helpers.h" extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; +class ShaderMtlThreadPool +{ +public: + void StartThreads() + { + if (m_threadsActive.exchange(true)) + return; + // create thread pool + const uint32 threadCount = 2; + for (uint32 i = 0; i < threadCount; ++i) + s_threads.emplace_back(&ShaderMtlThreadPool::CompilerThreadFunc, this); + } + + void StopThreads() + { + if (!m_threadsActive.exchange(false)) + return; + for (uint32 i = 0; i < s_threads.size(); ++i) + s_compilationQueueCount.increment(); + for (auto& it : s_threads) + it.join(); + s_threads.clear(); + } + + ~ShaderMtlThreadPool() + { + StopThreads(); + } + + void CompilerThreadFunc() + { + SetThreadName("mtlShaderComp"); + while (m_threadsActive.load(std::memory_order::relaxed)) + { + s_compilationQueueCount.decrementWithWait(); + s_compilationQueueMutex.lock(); + if
(s_compilationQueue.empty()) + { + // queue empty again, shaders compiled synchronously via PreponeCompilation() + s_compilationQueueMutex.unlock(); + continue; + } + RendererShaderMtl* job = s_compilationQueue.front(); + s_compilationQueue.pop_front(); + // set compilation state + cemu_assert_debug(job->m_compilationState.getValue() == RendererShaderMtl::COMPILATION_STATE::QUEUED); + job->m_compilationState.setValue(RendererShaderMtl::COMPILATION_STATE::COMPILING); + s_compilationQueueMutex.unlock(); + // compile + job->CompileInternal(); + ++g_compiled_shaders_async; + // mark as compiled + cemu_assert_debug(job->m_compilationState.getValue() == RendererShaderMtl::COMPILATION_STATE::COMPILING); + job->m_compilationState.setValue(RendererShaderMtl::COMPILATION_STATE::DONE); + } + } + + bool HasThreadsRunning() const { return m_threadsActive; } + +public: + std::vector s_threads; + + std::deque s_compilationQueue; + CounterSemaphore s_compilationQueueCount; + std::mutex s_compilationQueueMutex; + +private: + std::atomic m_threadsActive; +} shaderMtlThreadPool; + +void RendererShaderMtl::Initialize() +{ + shaderMtlThreadPool.StartThreads(); +} + +void RendererShaderMtl::Shutdown() +{ + shaderMtlThreadPool.StopThreads(); +} + RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) - : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer} + : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer}, m_mslCode{mslCode} +{ + // start async compilation + shaderMtlThreadPool.s_compilationQueueMutex.lock(); + m_compilationState.setValue(COMPILATION_STATE::QUEUED); + shaderMtlThreadPool.s_compilationQueue.push_back(this); + shaderMtlThreadPool.s_compilationQueueCount.increment(); + shaderMtlThreadPool.s_compilationQueueMutex.unlock(); + 
cemu_assert_debug(shaderMtlThreadPool.HasThreadsRunning()); // make sure .StartThreads() was called +} + +RendererShaderMtl::~RendererShaderMtl() +{ + if (m_function) + m_function->release(); +} + +void RendererShaderMtl::PreponeCompilation(bool isRenderThread) +{ + shaderMtlThreadPool.s_compilationQueueMutex.lock(); + bool isStillQueued = m_compilationState.hasState(COMPILATION_STATE::QUEUED); + if (isStillQueued) + { + // remove from queue + shaderMtlThreadPool.s_compilationQueue.erase(std::remove(shaderMtlThreadPool.s_compilationQueue.begin(), shaderMtlThreadPool.s_compilationQueue.end(), this), shaderMtlThreadPool.s_compilationQueue.end()); + m_compilationState.setValue(COMPILATION_STATE::COMPILING); + } + shaderMtlThreadPool.s_compilationQueueMutex.unlock(); + if (!isStillQueued) + { + m_compilationState.waitUntilValue(COMPILATION_STATE::DONE); + --g_compiled_shaders_async; // compilation caused a stall so we don't consider this one async + return; + } + else + { + // compile synchronously + CompileInternal(); + m_compilationState.setValue(COMPILATION_STATE::DONE); + } +} + +bool RendererShaderMtl::IsCompiled() +{ + return m_compilationState.hasState(COMPILATION_STATE::DONE); +}; + +bool RendererShaderMtl::WaitForCompiled() +{ + m_compilationState.waitUntilValue(COMPILATION_STATE::DONE); + return true; +} + +void RendererShaderMtl::CompileInternal() { MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); + // TODO: always disable fast math for problematic shaders if (GetConfig().fast_math) options->setFastMathEnabled(true); NS::Error* error = nullptr; - MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(mslCode), options, &error); + MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(m_mslCode), options, &error); options->release(); if (error) { - cemuLog_log(LogType::Force, "failed to create library: {} -> {}", error->localizedDescription()->utf8String(), mslCode.c_str()); + cemuLog_log(LogType::Force, "failed to 
create library: {} -> {}", error->localizedDescription()->utf8String(), m_mslCode.c_str()); error->release(); + FinishCompilation(); return; } m_function = library->newFunction(ToNSString("main0")); library->release(); + FinishCompilation(); + // Count shader compilation g_compiled_shaders_total++; } -RendererShaderMtl::~RendererShaderMtl() +void RendererShaderMtl::FinishCompilation() { - if (m_function) - m_function->release(); + m_mslCode.clear(); + m_mslCode.shrink_to_fit(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index 0758b0e63..ddf72d81c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -4,20 +4,26 @@ #include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" #include "HW/Latte/Renderer/Metal/MetalRenderer.h" #include "util/helpers/ConcurrentQueue.h" +#include "util/helpers/Semaphore.h" #include class RendererShaderMtl : public RendererShader { - //enum class COMPILATION_STATE : uint32 - //{ - // NONE, - // QUEUED, - // COMPILING, - // DONE - //}; + friend class ShaderMtlThreadPool; + + enum class COMPILATION_STATE : uint32 + { + NONE, + QUEUED, + COMPILING, + DONE + }; public: + static void Initialize(); + static void Shutdown(); + RendererShaderMtl(class MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode); virtual ~RendererShaderMtl(); @@ -42,15 +48,19 @@ class RendererShaderMtl : public RendererShader cemu_assert_suspicious(); } - // TODO: implement this - void PreponeCompilation(bool isRenderThread) override {} - bool IsCompiled() override { return true; } - bool WaitForCompiled() override { return true; } + void PreponeCompilation(bool isRenderThread) override; + bool IsCompiled() override; + bool WaitForCompiled() override; private: class MetalRenderer* m_mtlr; MTL::Function* m_function = nullptr; - void 
Compile(const std::string& mslCode); + StateSemaphore m_compilationState{ COMPILATION_STATE::NONE }; + + std::string m_mslCode; + + void CompileInternal(); + void FinishCompilation(); }; From 03bc647e1cfd9597ef99a135ea6525db8c52e1c0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 10 Oct 2024 18:53:22 +0200 Subject: [PATCH 208/368] prepone compilation for rect geometry shaders --- src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index b7f5c88c0..29459539f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -178,6 +178,7 @@ static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer gsSrc.append("}\r\n"); auto mtlShader = new RendererShaderMtl(metalRenderer, RendererShader::ShaderType::kGeometry, 0, 0, false, false, gsSrc); + mtlShader->PreponeCompilation(true); return mtlShader; } From 641ef71cab311fec5afa6d70a405e058d38ad1d5 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 10 Oct 2024 19:38:14 +0200 Subject: [PATCH 209/368] count compiled shaders properly --- src/Cafe/HW/Latte/Core/LatteShaderCache.cpp | 34 +++++++++++++++---- .../Renderer/Metal/RendererShaderMtl.cpp | 31 +++++++++++++++-- .../Latte/Renderer/Metal/RendererShaderMtl.h | 7 ++++ 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index 1ba50dec5..4659ff10a 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -11,6 +11,7 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/OpenGL/RendererShaderGL.h" #include "Cafe/HW/Latte/Renderer/Vulkan/RendererShaderVk.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include 
"Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanPipelineStableCache.h" @@ -158,12 +159,19 @@ bool LoadTGAFile(const std::vector& buffer, TGAFILE *tgaFile) void LatteShaderCache_finish() { - if (g_renderer->GetType() == RendererAPI::Vulkan) + if (g_renderer->GetType() == RendererAPI::Vulkan) + { RendererShaderVk::ShaderCacheLoading_end(); + } else if (g_renderer->GetType() == RendererAPI::OpenGL) + { RendererShaderGL::ShaderCacheLoading_end(); + } else if (g_renderer->GetType() == RendererAPI::Metal) + { + RendererShaderMtl::ShaderCacheLoading_end(); MetalPipelineCache::ShaderCacheLoading_end(); + } } uint32 LatteShaderCache_getShaderCacheExtraVersion(uint64 titleId) @@ -243,11 +251,18 @@ void LatteShaderCache_Load() fs::create_directories(ActiveSettings::GetCachePath("shaderCache/precompiled"), ec); // initialize renderer specific caches if (g_renderer->GetType() == RendererAPI::Vulkan) + { RendererShaderVk::ShaderCacheLoading_begin(cacheTitleId); + } else if (g_renderer->GetType() == RendererAPI::OpenGL) + { RendererShaderGL::ShaderCacheLoading_begin(cacheTitleId); + } else if (g_renderer->GetType() == RendererAPI::Metal) + { + RendererShaderMtl::ShaderCacheLoading_begin(cacheTitleId); MetalPipelineCache::ShaderCacheLoading_begin(cacheTitleId); + } // get cache file name const auto pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); const auto pathGenericPre1_25_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}.bin", cacheTitleId); // before 1.25.0 @@ -776,11 +791,18 @@ void LatteShaderCache_Close() s_shaderCacheGeneric = nullptr; } if (g_renderer->GetType() == RendererAPI::Vulkan) - RendererShaderVk::ShaderCacheLoading_Close(); - else if (g_renderer->GetType() == RendererAPI::OpenGL) - RendererShaderGL::ShaderCacheLoading_Close(); - else if (g_renderer->GetType() == RendererAPI::Metal) - MetalPipelineCache::ShaderCacheLoading_Close(); + { + 
RendererShaderVk::ShaderCacheLoading_Close(); + } + else if (g_renderer->GetType() == RendererAPI::OpenGL) + { + RendererShaderGL::ShaderCacheLoading_Close(); + } + else if (g_renderer->GetType() == RendererAPI::Metal) + { + RendererShaderMtl::ShaderCacheLoading_Close(); + MetalPipelineCache::ShaderCacheLoading_Close(); + } // if Vulkan then also close pipeline cache if (g_renderer->GetType() == RendererAPI::Vulkan) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 0f6740806..c8babb14e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -9,6 +9,8 @@ #include "config/CemuConfig.h" #include "util/helpers/helpers.h" +static bool s_isLoadingShadersMtl{false}; + extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; @@ -62,7 +64,8 @@ class ShaderMtlThreadPool s_compilationQueueMutex.unlock(); // compile job->CompileInternal(); - ++g_compiled_shaders_async; + if (job->ShouldCountCompilation()) + ++g_compiled_shaders_async; // mark as compiled cemu_assert_debug(job->m_compilationState.getValue() == RendererShaderMtl::COMPILATION_STATE::COMPILING); job->m_compilationState.setValue(RendererShaderMtl::COMPILATION_STATE::DONE); @@ -82,6 +85,21 @@ class ShaderMtlThreadPool std::atomic m_threadsActive; } shaderMtlThreadPool; +// TODO: find out if it would be possible to cache compiled Metal shaders +void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) +{ + s_isLoadingShadersMtl = true; +} + +void RendererShaderMtl::ShaderCacheLoading_end() +{ + s_isLoadingShadersMtl = false; +} + +void RendererShaderMtl::ShaderCacheLoading_Close() +{ +} + void RendererShaderMtl::Initialize() { shaderMtlThreadPool.StartThreads(); @@ -124,7 +142,8 @@ void RendererShaderMtl::PreponeCompilation(bool isRenderThread) if (!isStillQueued) { 
m_compilationState.waitUntilValue(COMPILATION_STATE::DONE); - --g_compiled_shaders_async; // compilation caused a stall so we don't consider this one async + if (ShouldCountCompilation()) + --g_compiled_shaders_async; // compilation caused a stall so we don't consider this one async return; } else @@ -146,6 +165,11 @@ bool RendererShaderMtl::WaitForCompiled() return true; } +bool RendererShaderMtl::ShouldCountCompilation() const +{ + return !s_isLoadingShadersMtl && m_isGameShader; +} + void RendererShaderMtl::CompileInternal() { MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); @@ -169,7 +193,8 @@ void RendererShaderMtl::CompileInternal() FinishCompilation(); // Count shader compilation - g_compiled_shaders_total++; + if (ShouldCountCompilation()) + g_compiled_shaders_total++; } void RendererShaderMtl::FinishCompilation() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index ddf72d81c..40d04c870 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -21,6 +21,10 @@ class RendererShaderMtl : public RendererShader }; public: + static void ShaderCacheLoading_begin(uint64 cacheTitleId); + static void ShaderCacheLoading_end(); + static void ShaderCacheLoading_Close(); + static void Initialize(); static void Shutdown(); @@ -61,6 +65,9 @@ class RendererShaderMtl : public RendererShader std::string m_mslCode; + bool ShouldCountCompilation() const; + void CompileInternal(); + void FinishCompilation(); }; From 8b783e63dc8bbd23cc5eb5fd0033295e10175869 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 13 Oct 2024 09:20:45 +0200 Subject: [PATCH 210/368] refactor pipeline cache --- src/Cafe/CMakeLists.txt | 2 + src/Cafe/HW/Latte/Core/LatteShaderCache.cpp | 21 - .../Renderer/Metal/MetalPipelineCache.cpp | 587 ++--------------- .../Latte/Renderer/Metal/MetalPipelineCache.h | 22 +- 
.../Renderer/Metal/MetalPipelineCompiler.cpp | 596 ++++++++++++++++++ .../Renderer/Metal/MetalPipelineCompiler.h | 38 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 7 +- 7 files changed, 680 insertions(+), 593 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 9b5f8d3ed..3d1a02305 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -560,6 +560,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/MetalMemoryManager.h HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp HW/Latte/Renderer/Metal/MetalOutputShaderCache.h + HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp + HW/Latte/Renderer/Metal/MetalPipelineCompiler.h HW/Latte/Renderer/Metal/MetalPipelineCache.cpp HW/Latte/Renderer/Metal/MetalPipelineCache.h HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index 4659ff10a..cdb41184e 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -160,18 +160,11 @@ bool LoadTGAFile(const std::vector& buffer, TGAFILE *tgaFile) void LatteShaderCache_finish() { if (g_renderer->GetType() == RendererAPI::Vulkan) - { RendererShaderVk::ShaderCacheLoading_end(); - } else if (g_renderer->GetType() == RendererAPI::OpenGL) - { RendererShaderGL::ShaderCacheLoading_end(); - } else if (g_renderer->GetType() == RendererAPI::Metal) - { RendererShaderMtl::ShaderCacheLoading_end(); - MetalPipelineCache::ShaderCacheLoading_end(); - } } uint32 LatteShaderCache_getShaderCacheExtraVersion(uint64 titleId) @@ -251,18 +244,11 @@ void LatteShaderCache_Load() fs::create_directories(ActiveSettings::GetCachePath("shaderCache/precompiled"), ec); // initialize renderer specific caches if (g_renderer->GetType() == RendererAPI::Vulkan) - { 
RendererShaderVk::ShaderCacheLoading_begin(cacheTitleId); - } else if (g_renderer->GetType() == RendererAPI::OpenGL) - { RendererShaderGL::ShaderCacheLoading_begin(cacheTitleId); - } else if (g_renderer->GetType() == RendererAPI::Metal) - { RendererShaderMtl::ShaderCacheLoading_begin(cacheTitleId); - MetalPipelineCache::ShaderCacheLoading_begin(cacheTitleId); - } // get cache file name const auto pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); const auto pathGenericPre1_25_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}.bin", cacheTitleId); // before 1.25.0 @@ -791,18 +777,11 @@ void LatteShaderCache_Close() s_shaderCacheGeneric = nullptr; } if (g_renderer->GetType() == RendererAPI::Vulkan) - { RendererShaderVk::ShaderCacheLoading_Close(); - } else if (g_renderer->GetType() == RendererAPI::OpenGL) - { RendererShaderGL::ShaderCacheLoading_Close(); - } else if (g_renderer->GetType() == RendererAPI::Metal) - { RendererShaderMtl::ShaderCacheLoading_Close(); - MetalPipelineCache::ShaderCacheLoading_Close(); - } // if Vulkan then also close pipeline cache if (g_renderer->GetType() == RendererAPI::Vulkan) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 29459539f..a70f75418 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -1,517 +1,13 @@ -#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" -#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include 
"Cafe/HW/Latte/ISA/RegDefines.h" -#include "Cemu/Logging/CemuLogging.h" -#include "HW/Latte/Core/LatteConst.h" -#include "config/ActiveSettings.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" -static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) -{ - auto parameterMask = vertexShader->outputParameterMask; - for (uint32 i = 0; i < 32; i++) - { - if ((parameterMask & (1 << i)) == 0) - continue; - sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); - if (vsSemanticId < 0) - continue; - // make sure PS has matching input - if (!psInputTable->hasPSImportForSemanticId(vsSemanticId)) - continue; - gsSrc.append(fmt::format("out.passParameterSem{} = objectPayload.vertexOut[{}].passParameterSem{};\r\n", vsSemanticId, vIdx, vsSemanticId)); - } - gsSrc.append(fmt::format("out.position = objectPayload.vertexOut[{}].position;\r\n", vIdx)); - gsSrc.append(fmt::format("mesh.set_vertex({}, out);\r\n", vIdx)); -} - -static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, const char* variant, const LatteContextRegister& latteRegister) -{ - auto parameterMask = vertexShader->outputParameterMask; - for (uint32 i = 0; i < 32; i++) - { - if ((parameterMask & (1 << i)) == 0) - continue; - sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); - if (vsSemanticId < 0) - continue; - // make sure PS has matching input - if (!psInputTable->hasPSImportForSemanticId(vsSemanticId)) - continue; - gsSrc.append(fmt::format("out.passParameterSem{} = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", vsSemanticId, variant, vsSemanticId, vsSemanticId, 
vsSemanticId)); - } - gsSrc.append(fmt::format("out.position = gen4thVertex{}(objectPayload.vertexOut[0].position, objectPayload.vertexOut[1].position, objectPayload.vertexOut[2].position);\r\n", variant)); - gsSrc.append(fmt::format("mesh.set_vertex(3, out);\r\n")); -} - -static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 p0, sint32 p1, sint32 p2, sint32 p3, const char* variant, const LatteContextRegister& latteRegister) -{ - sint32 pList[4] = { p0, p1, p2, p3 }; - for (sint32 i = 0; i < 4; i++) - { - if (pList[i] == 3) - rectsEmulationGS_outputGeneratedVertex(gsSrc, vertexShader, psInputTable, variant, latteRegister); - else - rectsEmulationGS_outputSingleVertex(gsSrc, vertexShader, psInputTable, pList[i], latteRegister); - } - gsSrc.append(fmt::format("mesh.set_index(0, {});\r\n", pList[0])); - gsSrc.append(fmt::format("mesh.set_index(1, {});\r\n", pList[1])); - gsSrc.append(fmt::format("mesh.set_index(2, {});\r\n", pList[2])); - gsSrc.append(fmt::format("mesh.set_index(3, {});\r\n", pList[1])); - gsSrc.append(fmt::format("mesh.set_index(4, {});\r\n", pList[2])); - gsSrc.append(fmt::format("mesh.set_index(5, {});\r\n", pList[3])); -} - -static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer, const LatteDecompilerShader* vertexShader, const LatteContextRegister& latteRegister) -{ - std::string gsSrc; - gsSrc.append("#include \r\n"); - gsSrc.append("using namespace metal;\r\n"); - - LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); - - // inputs & outputs - std::string vertexOutDefinition = "struct VertexOut {\r\n"; - vertexOutDefinition += "float4 position;\r\n"; - std::string geometryOutDefinition = "struct GeometryOut {\r\n"; - geometryOutDefinition += "float4 position [[position]];\r\n"; - auto parameterMask = vertexShader->outputParameterMask; - for (sint32 f = 0; f < 2; f++) - { - for (uint32 i = 0; i < 32; 
i++) - { - if ((parameterMask & (1 << i)) == 0) - continue; - sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); - if (vsSemanticId < 0) - continue; - auto psImport = psInputTable->getPSImportBySemanticId(vsSemanticId); - if (psImport == nullptr) - continue; - - if (f == 0) - { - vertexOutDefinition += fmt::format("float4 passParameterSem{};\r\n", vsSemanticId); - } - else - { - geometryOutDefinition += fmt::format("float4 passParameterSem{}", vsSemanticId); - - geometryOutDefinition += fmt::format(" [[user(locn{})]]", psInputTable->getPSImportLocationBySemanticId(vsSemanticId)); - if (psImport->isFlat) - geometryOutDefinition += " [[flat]]"; - if (psImport->isNoPerspective) - geometryOutDefinition += " [[center_no_perspective]]"; - geometryOutDefinition += ";\r\n"; - } - } - } - vertexOutDefinition += "};\r\n"; - geometryOutDefinition += "};\r\n"; - - gsSrc.append(vertexOutDefinition); - gsSrc.append(geometryOutDefinition); - - gsSrc.append("struct ObjectPayload {\r\n"); - gsSrc.append("VertexOut vertexOut[3];\r\n"); - gsSrc.append("};\r\n"); - - // gen function - gsSrc.append("float4 gen4thVertexA(float4 a, float4 b, float4 c)\r\n"); - gsSrc.append("{\r\n"); - gsSrc.append("return b - (c - a);\r\n"); - gsSrc.append("}\r\n"); - - gsSrc.append("float4 gen4thVertexB(float4 a, float4 b, float4 c)\r\n"); - gsSrc.append("{\r\n"); - gsSrc.append("return c - (b - a);\r\n"); - gsSrc.append("}\r\n"); - - gsSrc.append("float4 gen4thVertexC(float4 a, float4 b, float4 c)\r\n"); - gsSrc.append("{\r\n"); - gsSrc.append("return c + (b - a);\r\n"); - gsSrc.append("}\r\n"); - - // main - gsSrc.append("using MeshType = mesh;\r\n"); - gsSrc.append("[[mesh, max_total_threads_per_threadgroup(1)]]\r\n"); - gsSrc.append("void main0(MeshType mesh, const object_data ObjectPayload& objectPayload [[payload]])\r\n"); - gsSrc.append("{\r\n"); - gsSrc.append("GeometryOut out;\r\n"); - - // there are two possible winding orders that need 
different triangle generation: - // 0 1 - // 2 3 - // and - // 0 1 - // 3 2 - // all others are just symmetries of these cases - - // we can determine the case by comparing the distance 0<->1 and 0<->2 - - gsSrc.append("float dist0_1 = length(objectPayload.vertexOut[1].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); - gsSrc.append("float dist0_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); - gsSrc.append("float dist1_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[1].position.xy);\r\n"); - - // emit vertices - gsSrc.append("if(dist0_1 > dist0_2 && dist0_1 > dist1_2)\r\n"); - gsSrc.append("{\r\n"); - // p0 to p1 is diagonal - rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 2, 1, 0, 3, "A", latteRegister); - gsSrc.append("} else if ( dist0_2 > dist0_1 && dist0_2 > dist1_2 ) {\r\n"); - // p0 to p2 is diagonal - rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 1, 2, 0, 3, "B", latteRegister); - gsSrc.append("} else {\r\n"); - // p1 to p2 is diagonal - rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 0, 1, 2, 3, "C", latteRegister); - gsSrc.append("}\r\n"); - - gsSrc.append("mesh.set_primitive_count(2);\r\n"); - - gsSrc.append("}\r\n"); - - auto mtlShader = new RendererShaderMtl(metalRenderer, RendererShader::ShaderType::kGeometry, 0, 0, false, false, gsSrc); - mtlShader->PreponeCompilation(true); - - return mtlShader; -} - -#define INVALID_TITLE_ID 0xFFFFFFFFFFFFFFFF - -uint64 s_cacheTitleId = INVALID_TITLE_ID; - -extern std::atomic_int g_compiled_shaders_total; -extern std::atomic_int g_compiled_shaders_async; - -template -void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteDecompilerShader* pixelShader, const LatteContextRegister& lcr) -{ - // Rasterization - bool rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - - // HACK - // TODO: include 
this in the hash? - if (!lcr.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) - rasterizationEnabled = true; - - // Culling both front and back faces effectively disables rasterization - const auto& polygonControlReg = lcr.PA_SU_SC_MODE_CNTL; - uint32 cullFront = polygonControlReg.get_CULL_FRONT(); - uint32 cullBack = polygonControlReg.get_CULL_BACK(); - if (cullFront && cullBack) - rasterizationEnabled = false; - - auto pixelShaderMtl = static_cast(pixelShader->shader); - - if (!rasterizationEnabled || !pixelShaderMtl) - { - desc->setRasterizationEnabled(false); - return; - } - - desc->setFragmentFunction(pixelShaderMtl->GetFunction()); - - // Color attachments - const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; - uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); - uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); - for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) - { - const auto& colorBuffer = lastUsedFBO->colorBuffer[i]; - auto texture = static_cast(colorBuffer.texture); - if (!texture) - { - continue; - } - auto colorAttachment = desc->colorAttachments()->object(i); - colorAttachment->setPixelFormat(texture->GetRGBAView()->pixelFormat()); - - // Disable writes if not in the active FBO - if (!activeFBO->colorBuffer[i].texture) - { - colorAttachment->setWriteMask(MTL::ColorWriteMaskNone); - continue; - } - - colorAttachment->setWriteMask(GetMtlColorWriteMask((renderTargetMask >> (i * 4)) & 0xF)); - - // Blending - bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; - // Only float data type is blendable - if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).dataType == MetalDataType::FLOAT) - { - colorAttachment->setBlendingEnabled(true); - - const auto& blendControlReg = lcr.CB_BLENDN_CONTROL[i]; - - auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); - auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); - auto dstRgbBlendFactor = 
GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); - - colorAttachment->setRgbBlendOperation(rgbBlendOp); - colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); - colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); - if (blendControlReg.get_SEPARATE_ALPHA_BLEND()) - { - colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); - colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); - colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); - } - else - { - colorAttachment->setAlphaBlendOperation(rgbBlendOp); - colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor); - colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor); - } - } - } - - // Depth stencil attachment - if (lastUsedFBO->depthBuffer.texture) - { - auto texture = static_cast(lastUsedFBO->depthBuffer.texture); - desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - if (lastUsedFBO->depthBuffer.hasStencil) - { - desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - } - } -} - -void MetalPipelineCache::ShaderCacheLoading_begin(uint64 cacheTitleId) -{ - s_cacheTitleId = cacheTitleId; -} - -void MetalPipelineCache::ShaderCacheLoading_end() -{ -} - -void MetalPipelineCache::ShaderCacheLoading_Close() -{ - g_compiled_shaders_total = 0; - g_compiled_shaders_async = 0; -} - -MetalPipelineCache::~MetalPipelineCache() -{ - for (auto& pair : m_pipelineCache) - { - pair.second->release(); - } - m_pipelineCache.clear(); - - NS::Error* error = nullptr; - m_binaryArchive->serializeToURL(m_binaryArchiveURL, &error); - if (error) - { - cemuLog_log(LogType::Force, "error serializing binary archive: {}", error->localizedDescription()->utf8String()); - error->release(); - } - m_binaryArchive->release(); - - m_binaryArchiveURL->release(); -} - -MTL::RenderPipelineState* 
MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) -{ - uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, lastUsedFBO, lcr); - auto& pipeline = m_pipelineCache[stateHash]; - if (pipeline) - return pipeline; - - auto vertexShaderMtl = static_cast(vertexShader->shader); - - // Render pipeline state - MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); - desc->setVertexFunction(vertexShaderMtl->GetFunction()); - - // Vertex descriptor - if (!fetchShader->mtlFetchVertexManually) - { - MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); - for (auto& bufferGroup : fetchShader->bufferGroups) - { - std::optional fetchType; - - uint32 minBufferStride = 0; - for (sint32 j = 0; j < bufferGroup.attribCount; ++j) - { - auto& attr = bufferGroup.attrib[j]; - - uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; - if (semanticId == (uint32)-1) - continue; // attribute not used? 
- - auto attribute = vertexDescriptor->attributes()->object(semanticId); - attribute->setOffset(attr.offset); - attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); - attribute->setFormat(GetMtlVertexFormat(attr.format)); - - minBufferStride = std::max(minBufferStride, attr.offset + GetMtlVertexFormatSize(attr.format)); - - if (fetchType.has_value()) - cemu_assert_debug(fetchType == attr.fetchType); - else - fetchType = attr.fetchType; - - if (attr.fetchType == LatteConst::INSTANCE_DATA) - { - cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported - } - } - - uint32 bufferIndex = bufferGroup.attributeBufferIndex; - uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; - uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - - auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); - if (bufferStride == 0) - { - // Buffer stride cannot be zero, let's use the minimum stride - bufferStride = minBufferStride; - - // Additionally, constant vertex function must be used - layout->setStepFunction(MTL::VertexStepFunctionConstant); - layout->setStepRate(0); - } - else - { - if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) - layout->setStepFunction(MTL::VertexStepFunctionPerVertex); - else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) - layout->setStepFunction(MTL::VertexStepFunctionPerInstance); - else - { - debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); - cemu_assert(false); - } - } - bufferStride = Align(bufferStride, 4); - layout->setStride(bufferStride); - } - - // TODO: don't always set the vertex descriptor? 
- desc->setVertexDescriptor(vertexDescriptor); - vertexDescriptor->release(); - } - - SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr); - - TryLoadBinaryArchive(); - - // Load binary - if (m_binaryArchive) - { - NS::Object* binArchives[] = {m_binaryArchive}; - auto binaryArchives = NS::Array::alloc()->init(binArchives, 1); - desc->setBinaryArchives(binaryArchives); - binaryArchives->release(); - } - - NS::Error* error = nullptr; -#ifdef CEMU_DEBUG_ASSERT - desc->setLabel(GetLabel("Cached render pipeline state", desc)); -#endif - pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionFailOnBinaryArchiveMiss, nullptr, &error); - - // Pipeline wasn't found in the binary archive, we need to compile it - if (error) - { - desc->setBinaryArchives(nullptr); - - error->release(); - error = nullptr; -#ifdef CEMU_DEBUG_ASSERT - desc->setLabel(GetLabel("New render pipeline state", desc)); -#endif - pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error); - if (error) - { - cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); - error->release(); - } - else - { - // Save binary - if (m_binaryArchive) - { - NS::Error* error = nullptr; - m_binaryArchive->addRenderPipelineFunctions(desc, &error); - if (error) - { - cemuLog_log(LogType::Force, "error saving render pipeline functions: {}", error->localizedDescription()->utf8String()); - error->release(); - } - } - } - } - desc->release(); - - return pipeline; -} - -MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType) -{ - uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, lastUsedFBO, 
lcr); - - stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE]; - stateHash = std::rotl(stateHash, 7); - - stateHash += (uint8)hostIndexType; - stateHash = std::rotl(stateHash, 7); - - auto& pipeline = m_pipelineCache[stateHash]; - if (pipeline) - return pipeline; - - auto objectShaderMtl = static_cast(vertexShader->shader); - RendererShaderMtl* meshShaderMtl; - if (geometryShader) - { - meshShaderMtl = static_cast(geometryShader->shader); - } - else - { - // If there is no geometry shader, it means that we are emulating rects - meshShaderMtl = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); - } - - // Render pipeline state - MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); - desc->setObjectFunction(objectShaderMtl->GetFunction()); - desc->setMeshFunction(meshShaderMtl->GetFunction()); - - SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr); - - TryLoadBinaryArchive(); - - // Load binary - // TODO: no binary archives? :( - - NS::Error* error = nullptr; -#ifdef CEMU_DEBUG_ASSERT - desc->setLabel(GetLabel("Mesh pipeline state", desc)); -#endif - pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); - desc->release(); - if (error) - { - cemuLog_log(LogType::Force, "error creating mesh render pipeline state: {}", error->localizedDescription()->utf8String()); - error->release(); - } - - return pipeline; -} - -uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, const LatteContextRegister& lcr) +uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) { 
// Hash uint64 stateHash = 0; @@ -523,6 +19,12 @@ uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* f stateHash += textureView->GetRGBAView()->pixelFormat() + i * 31; stateHash = std::rotl(stateHash, 7); + + if (activeFBO->colorBuffer[i].texture) + { + stateHash += 1; + stateHash = std::rotl(stateHash, 1); + } } if (lastUsedFBO->depthBuffer.texture) @@ -530,6 +32,12 @@ uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* f auto textureView = static_cast(lastUsedFBO->depthBuffer.texture); stateHash += textureView->GetRGBAView()->pixelFormat(); stateHash = std::rotl(stateHash, 7); + + if (activeFBO->depthBuffer.texture) + { + stateHash += 1; + stateHash = std::rotl(stateHash, 1); + } } for (auto& group : fetchShader->bufferGroups) @@ -586,55 +94,38 @@ uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* f } } - return stateHash; -} + // Mesh pipeline + const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); -void MetalPipelineCache::TryLoadBinaryArchive() -{ - if (m_binaryArchive || s_cacheTitleId == INVALID_TITLE_ID) - return; + bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); - // GPU name - const char* deviceName1 = m_mtlr->GetDevice()->name()->utf8String(); - std::string deviceName; - deviceName.assign(deviceName1); - - // Replace spaces with underscores - for (auto& c : deviceName) + if (usesGeometryShader) { - if (c == ' ') - c = '_'; + stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE]; + stateHash = std::rotl(stateHash, 7); } - // OS version - auto osVersion = NS::ProcessInfo::processInfo()->operatingSystemVersion(); - - // Precompiled binaries cannot be shared between different devices or OS versions - const std::string cacheFilename = fmt::format("{:016x}_mtl_pipelines.bin", s_cacheTitleId); - const 
fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}/{}-{}-{}/{}", deviceName, osVersion.majorVersion, osVersion.minorVersion, osVersion.patchVersion, cacheFilename); - - // Create the directory if it doesn't exist - std::filesystem::create_directories(cachePath.parent_path()); + return stateHash; +} - m_binaryArchiveURL = NS::URL::fileURLWithPath(ToNSString((const char*)cachePath.generic_u8string().c_str())); +MetalPipelineCache::~MetalPipelineCache() +{ + for (auto& [key, value] : m_pipelineCache) + { + value->release(); + } +} - MTL::BinaryArchiveDescriptor* desc = MTL::BinaryArchiveDescriptor::alloc()->init(); - desc->setUrl(m_binaryArchiveURL); +MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +{ + auto& pipeline = m_pipelineCache[CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, lcr)]; + if (pipeline) + return pipeline; - NS::Error* error = nullptr; - m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error); - if (error) - { - desc->setUrl(nullptr); + MetalPipelineCompiler compiler(m_mtlr); + compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, lcr); + pipeline = compiler.Compile(false, true); - error->release(); - error = nullptr; - m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error); - if (error) - { - cemuLog_log(LogType::Force, "failed to create binary archive: {}", error->localizedDescription()->utf8String()); - error->release(); - } - } - desc->release(); + return pipeline; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index 
916a90728..18b163f6d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -1,24 +1,17 @@ #pragma once -#include - -#include "HW/Latte/ISA/LatteReg.h" -#include "HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" -#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" +// TODO: binary archives class MetalPipelineCache { public: - static void ShaderCacheLoading_begin(uint64 cacheTitleId); - static void ShaderCacheLoading_end(); - static void ShaderCacheLoading_Close(); + static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalPipelineCache(); - MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); - - MTL::RenderPipelineState* GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType); + MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); // Debug size_t GetPipelineCacheSize() const { 
return m_pipelineCache.size(); } @@ -27,11 +20,4 @@ class MetalPipelineCache class MetalRenderer* m_mtlr; std::map m_pipelineCache; - - NS::URL* m_binaryArchiveURL; - MTL::BinaryArchive* m_binaryArchive; - - uint64 CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, const LatteContextRegister& lcr); - - void TryLoadBinaryArchive(); }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp new file mode 100644 index 000000000..9eb29cb6a --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -0,0 +1,596 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" + +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" + +static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) +{ + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + // make sure PS has matching input + if (!psInputTable->hasPSImportForSemanticId(vsSemanticId)) + continue; + gsSrc.append(fmt::format("out.passParameterSem{} = 
objectPayload.vertexOut[{}].passParameterSem{};\r\n", vsSemanticId, vIdx, vsSemanticId)); + } + gsSrc.append(fmt::format("out.position = objectPayload.vertexOut[{}].position;\r\n", vIdx)); + gsSrc.append(fmt::format("mesh.set_vertex({}, out);\r\n", vIdx)); +} + +static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, const char* variant, const LatteContextRegister& latteRegister) +{ + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + // make sure PS has matching input + if (!psInputTable->hasPSImportForSemanticId(vsSemanticId)) + continue; + gsSrc.append(fmt::format("out.passParameterSem{} = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", vsSemanticId, variant, vsSemanticId, vsSemanticId, vsSemanticId)); + } + gsSrc.append(fmt::format("out.position = gen4thVertex{}(objectPayload.vertexOut[0].position, objectPayload.vertexOut[1].position, objectPayload.vertexOut[2].position);\r\n", variant)); + gsSrc.append(fmt::format("mesh.set_vertex(3, out);\r\n")); +} + +static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 p0, sint32 p1, sint32 p2, sint32 p3, const char* variant, const LatteContextRegister& latteRegister) +{ + sint32 pList[4] = { p0, p1, p2, p3 }; + for (sint32 i = 0; i < 4; i++) + { + if (pList[i] == 3) + rectsEmulationGS_outputGeneratedVertex(gsSrc, vertexShader, psInputTable, variant, latteRegister); + else + rectsEmulationGS_outputSingleVertex(gsSrc, vertexShader, psInputTable, pList[i], latteRegister); + } + 
gsSrc.append(fmt::format("mesh.set_index(0, {});\r\n", pList[0])); + gsSrc.append(fmt::format("mesh.set_index(1, {});\r\n", pList[1])); + gsSrc.append(fmt::format("mesh.set_index(2, {});\r\n", pList[2])); + gsSrc.append(fmt::format("mesh.set_index(3, {});\r\n", pList[1])); + gsSrc.append(fmt::format("mesh.set_index(4, {});\r\n", pList[2])); + gsSrc.append(fmt::format("mesh.set_index(5, {});\r\n", pList[3])); +} + +static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer, const LatteDecompilerShader* vertexShader, const LatteContextRegister& latteRegister) +{ + std::string gsSrc; + gsSrc.append("#include \r\n"); + gsSrc.append("using namespace metal;\r\n"); + + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + + // inputs & outputs + std::string vertexOutDefinition = "struct VertexOut {\r\n"; + vertexOutDefinition += "float4 position;\r\n"; + std::string geometryOutDefinition = "struct GeometryOut {\r\n"; + geometryOutDefinition += "float4 position [[position]];\r\n"; + auto parameterMask = vertexShader->outputParameterMask; + for (sint32 f = 0; f < 2; f++) + { + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + auto psImport = psInputTable->getPSImportBySemanticId(vsSemanticId); + if (psImport == nullptr) + continue; + + if (f == 0) + { + vertexOutDefinition += fmt::format("float4 passParameterSem{};\r\n", vsSemanticId); + } + else + { + geometryOutDefinition += fmt::format("float4 passParameterSem{}", vsSemanticId); + + geometryOutDefinition += fmt::format(" [[user(locn{})]]", psInputTable->getPSImportLocationBySemanticId(vsSemanticId)); + if (psImport->isFlat) + geometryOutDefinition += " [[flat]]"; + if (psImport->isNoPerspective) + geometryOutDefinition += " [[center_no_perspective]]"; + geometryOutDefinition += ";\r\n"; + } + } + } + 
vertexOutDefinition += "};\r\n"; + geometryOutDefinition += "};\r\n"; + + gsSrc.append(vertexOutDefinition); + gsSrc.append(geometryOutDefinition); + + gsSrc.append("struct ObjectPayload {\r\n"); + gsSrc.append("VertexOut vertexOut[3];\r\n"); + gsSrc.append("};\r\n"); + + // gen function + gsSrc.append("float4 gen4thVertexA(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return b - (c - a);\r\n"); + gsSrc.append("}\r\n"); + + gsSrc.append("float4 gen4thVertexB(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return c - (b - a);\r\n"); + gsSrc.append("}\r\n"); + + gsSrc.append("float4 gen4thVertexC(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return c + (b - a);\r\n"); + gsSrc.append("}\r\n"); + + // main + gsSrc.append("using MeshType = mesh;\r\n"); + gsSrc.append("[[mesh, max_total_threads_per_threadgroup(1)]]\r\n"); + gsSrc.append("void main0(MeshType mesh, const object_data ObjectPayload& objectPayload [[payload]])\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("GeometryOut out;\r\n"); + + // there are two possible winding orders that need different triangle generation: + // 0 1 + // 2 3 + // and + // 0 1 + // 3 2 + // all others are just symmetries of these cases + + // we can determine the case by comparing the distance 0<->1 and 0<->2 + + gsSrc.append("float dist0_1 = length(objectPayload.vertexOut[1].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); + gsSrc.append("float dist0_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); + gsSrc.append("float dist1_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[1].position.xy);\r\n"); + + // emit vertices + gsSrc.append("if(dist0_1 > dist0_2 && dist0_1 > dist1_2)\r\n"); + gsSrc.append("{\r\n"); + // p0 to p1 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 2, 1, 0, 3, "A", latteRegister); + 
gsSrc.append("} else if ( dist0_2 > dist0_1 && dist0_2 > dist1_2 ) {\r\n"); + // p0 to p2 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 1, 2, 0, 3, "B", latteRegister); + gsSrc.append("} else {\r\n"); + // p1 to p2 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 0, 1, 2, 3, "C", latteRegister); + gsSrc.append("}\r\n"); + + gsSrc.append("mesh.set_primitive_count(2);\r\n"); + + gsSrc.append("}\r\n"); + + auto mtlShader = new RendererShaderMtl(metalRenderer, RendererShader::ShaderType::kGeometry, 0, 0, false, false, gsSrc); + mtlShader->PreponeCompilation(true); + + return mtlShader; +} + +#define INVALID_TITLE_ID 0xFFFFFFFFFFFFFFFF + +uint64 s_cacheTitleId = INVALID_TITLE_ID; + +extern std::atomic_int g_compiled_shaders_total; +extern std::atomic_int g_compiled_shaders_async; + +template +void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteDecompilerShader* pixelShader, const LatteContextRegister& lcr) +{ + // Rasterization + bool rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // HACK + // TODO: include this in the hash? 
+ if (!lcr.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; + + // Culling both front and back faces effectively disables rasterization + const auto& polygonControlReg = lcr.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; + + auto pixelShaderMtl = static_cast(pixelShader->shader); + + if (!rasterizationEnabled || !pixelShaderMtl) + { + desc->setRasterizationEnabled(false); + return; + } + + desc->setFragmentFunction(pixelShaderMtl->GetFunction()); + + // Color attachments + const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; + uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); + uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + const auto& colorBuffer = lastUsedFBO->colorBuffer[i]; + auto texture = static_cast(colorBuffer.texture); + if (!texture) + { + continue; + } + auto colorAttachment = desc->colorAttachments()->object(i); + colorAttachment->setPixelFormat(texture->GetRGBAView()->pixelFormat()); + + // Disable writes if not in the active FBO + if (!activeFBO->colorBuffer[i].texture) + { + colorAttachment->setWriteMask(MTL::ColorWriteMaskNone); + continue; + } + + colorAttachment->setWriteMask(GetMtlColorWriteMask((renderTargetMask >> (i * 4)) & 0xF)); + + // Blending + bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; + // Only float data type is blendable + if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).dataType == MetalDataType::FLOAT) + { + colorAttachment->setBlendingEnabled(true); + + const auto& blendControlReg = lcr.CB_BLENDN_CONTROL[i]; + + auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); + auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); + auto dstRgbBlendFactor = 
GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); + + colorAttachment->setRgbBlendOperation(rgbBlendOp); + colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); + if (blendControlReg.get_SEPARATE_ALPHA_BLEND()) + { + colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); + colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); + colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); + } + else + { + colorAttachment->setAlphaBlendOperation(rgbBlendOp); + colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor); + } + } + } + + // Depth stencil attachment + if (lastUsedFBO->depthBuffer.texture) + { + auto texture = static_cast(lastUsedFBO->depthBuffer.texture); + desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); + if (lastUsedFBO->depthBuffer.hasStencil) + { + desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); + } + } +} + +MetalPipelineCompiler::~MetalPipelineCompiler() +{ + /* + for (auto& pair : m_pipelineCache) + { + pair.second->release(); + } + m_pipelineCache.clear(); + + NS::Error* error = nullptr; + m_binaryArchive->serializeToURL(m_binaryArchiveURL, &error); + if (error) + { + cemuLog_log(LogType::Force, "error serializing binary archive: {}", error->localizedDescription()->utf8String()); + error->release(); + } + m_binaryArchive->release(); + + m_binaryArchiveURL->release(); + */ + m_pipelineDescriptor->release(); +} + +void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& 
lcr) +{ + const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); + + m_usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + + if (m_usesGeometryShader) + InitFromStateMesh(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, lcr); + else + InitFromStateRender(fetchShader, vertexShader, pixelShader, lastUsedFBO, activeFBO, lcr); +} + +MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread) +{ + if (m_usesGeometryShader) + { + auto desc = static_cast(m_pipelineDescriptor); + + NS::Error* error = nullptr; +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Mesh render pipeline state", desc)); +#endif + MTL::RenderPipelineState* pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); + desc->release(); + if (error) + { + cemuLog_log(LogType::Force, "error creating mesh render pipeline state: {}", error->localizedDescription()->utf8String()); + error->release(); + } + + return pipeline; + } + else + { + auto desc = static_cast(m_pipelineDescriptor); + + NS::Error* error = nullptr; +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Render pipeline state", desc)); +#endif + MTL::RenderPipelineState* pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); + if (error) + { + cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); + error->release(); + } + + return pipeline; + } +} + +void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +{ + // Shaders + auto vertexShaderMtl = 
static_cast(vertexShader->shader); + + // Render pipeline state + MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); + desc->setVertexFunction(vertexShaderMtl->GetFunction()); + + // Vertex descriptor + if (!fetchShader->mtlFetchVertexManually) + { + MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); + for (auto& bufferGroup : fetchShader->bufferGroups) + { + std::optional fetchType; + + uint32 minBufferStride = 0; + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& attr = bufferGroup.attrib[j]; + + uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not used? + + auto attribute = vertexDescriptor->attributes()->object(semanticId); + attribute->setOffset(attr.offset); + attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); + attribute->setFormat(GetMtlVertexFormat(attr.format)); + + minBufferStride = std::max(minBufferStride, attr.offset + GetMtlVertexFormatSize(attr.format)); + + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; + + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } + + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + if (bufferStride == 0) + { + // Buffer stride cannot be zero, let's use the minimum stride + bufferStride = minBufferStride; + + // Additionally, constant vertex function must be used + layout->setStepFunction(MTL::VertexStepFunctionConstant); + layout->setStepRate(0); + } + else + { + if (!fetchType.has_value() || 
fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerVertex); + else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerInstance); + else + { + debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); + cemu_assert(false); + } + } + bufferStride = Align(bufferStride, 4); + layout->setStride(bufferStride); + } + + // TODO: don't always set the vertex descriptor? + desc->setVertexDescriptor(vertexDescriptor); + vertexDescriptor->release(); + } + + SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr); + + m_pipelineDescriptor = desc; + + //TryLoadBinaryArchive(); + + // Load binary + /* + if (m_binaryArchive) + { + NS::Object* binArchives[] = {m_binaryArchive}; + auto binaryArchives = NS::Array::alloc()->init(binArchives, 1); + desc->setBinaryArchives(binaryArchives); + binaryArchives->release(); + } + */ + + /* + NS::Error* error = nullptr; +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Cached render pipeline state", desc)); +#endif + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionFailOnBinaryArchiveMiss, nullptr, &error); + + // Pipeline wasn't found in the binary archive, we need to compile it + if (error) + { + desc->setBinaryArchives(nullptr); + + error->release(); + error = nullptr; +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("New render pipeline state", desc)); +#endif + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error); + if (error) + { + cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); + error->release(); + } + else + { + // Save binary + if (m_binaryArchive) + { + NS::Error* error = nullptr; + m_binaryArchive->addRenderPipelineFunctions(desc, &error); + if (error) + { + cemuLog_log(LogType::Force, "error saving render pipeline functions: {}", 
error->localizedDescription()->utf8String()); + error->release(); + } + } + } + } + desc->release(); + + return pipeline; + */ +} + +void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +{ + auto objectShaderMtl = static_cast(vertexShader->shader); + RendererShaderMtl* meshShaderMtl; + if (geometryShader) + { + meshShaderMtl = static_cast(geometryShader->shader); + } + else + { + // If there is no geometry shader, it means that we are emulating rects + meshShaderMtl = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); + } + + // Render pipeline state + MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); + desc->setObjectFunction(objectShaderMtl->GetFunction()); + desc->setMeshFunction(meshShaderMtl->GetFunction()); + + SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr); + + m_pipelineDescriptor = desc; + + //TryLoadBinaryArchive(); + + // Load binary + // TODO: no binary archives? 
:( + + /* + NS::Error* error = nullptr; +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Mesh pipeline state", desc)); +#endif + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); + desc->release(); + if (error) + { + cemuLog_log(LogType::Force, "error creating mesh render pipeline state: {}", error->localizedDescription()->utf8String()); + error->release(); + } + + return pipeline; + */ +} + +/* +void MetalPipelineCache::TryLoadBinaryArchive() +{ + if (m_binaryArchive || s_cacheTitleId == INVALID_TITLE_ID) + return; + + // GPU name + const char* deviceName1 = m_mtlr->GetDevice()->name()->utf8String(); + std::string deviceName; + deviceName.assign(deviceName1); + + // Replace spaces with underscores + for (auto& c : deviceName) + { + if (c == ' ') + c = '_'; + } + + // OS version + auto osVersion = NS::ProcessInfo::processInfo()->operatingSystemVersion(); + + // Precompiled binaries cannot be shared between different devices or OS versions + const std::string cacheFilename = fmt::format("{:016x}_mtl_pipelines.bin", s_cacheTitleId); + const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}/{}-{}-{}/{}", deviceName, osVersion.majorVersion, osVersion.minorVersion, osVersion.patchVersion, cacheFilename); + + // Create the directory if it doesn't exist + std::filesystem::create_directories(cachePath.parent_path()); + + m_binaryArchiveURL = NS::URL::fileURLWithPath(ToNSString((const char*)cachePath.generic_u8string().c_str())); + + MTL::BinaryArchiveDescriptor* desc = MTL::BinaryArchiveDescriptor::alloc()->init(); + desc->setUrl(m_binaryArchiveURL); + + NS::Error* error = nullptr; + m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error); + if (error) + { + desc->setUrl(nullptr); + + error->release(); + error = nullptr; + m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create binary archive: 
{}", error->localizedDescription()->utf8String()); + error->release(); + } + } + desc->release(); +} +*/ diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h new file mode 100644 index 000000000..282c174d0 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +#include "Foundation/NSObject.hpp" +#include "HW/Latte/ISA/LatteReg.h" +#include "HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" + +class MetalPipelineCompiler +{ +public: + MetalPipelineCompiler(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalPipelineCompiler(); + + void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + + MTL::RenderPipelineState* Compile(bool forceCompile, bool isRenderThread); + +private: + class MetalRenderer* m_mtlr; + + bool m_usesGeometryShader; + + /* + std::map m_pipelineCache; + + NS::URL* m_binaryArchiveURL; + MTL::BinaryArchive* m_binaryArchive; + */ + NS::Object* m_pipelineDescriptor; + + void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + + void InitFromStateMesh(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + + //void TryLoadBinaryArchive(); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 7cd858576..76ed4c551 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1222,12 +1222,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 //} // Render pipeline state - MTL::RenderPipelineState* renderPipelineState; - if (usesGeometryShader) - renderPipelineState = m_pipelineCache->GetMeshPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO, m_state.m_activeFBO, LatteGPUState.contextNew, hostIndexType); - else - renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, m_state.m_activeFBO, LatteGPUState.contextNew); - + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO, m_state.m_activeFBO, LatteGPUState.contextNew); if (!renderPipelineState) return; From e9e510d2cd72083f29c432767816a9ed112ced38 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 14 Oct 2024 20:00:37 +0200 Subject: [PATCH 211/368] add: base for pipeline caching --- .../Renderer/Metal/MetalPipelineCache.cpp | 378 ++++++++++++++++++ .../Latte/Renderer/Metal/MetalPipelineCache.h | 53 ++- 2 files changed, 430 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index a70f75418..ea95c2662 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -6,6 +6,14 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteCachedFBO.h" +#include "Cafe/HW/Latte/Common/RegisterSerializer.h" +#include "Cafe/HW/Latte/Core/LatteShaderCache.h" +#include 
"Cemu/FileCache/FileCache.h" +#include "HW/Latte/Core/LatteShader.h" +#include "util/helpers/helpers.h" +#include "config/ActiveSettings.h" +#include uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) { @@ -129,3 +137,373 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte return pipeline; } + +struct +{ + uint32 pipelineLoadIndex; + uint32 pipelineMaxFileIndex; + + std::atomic_uint32_t pipelinesQueued; + std::atomic_uint32_t pipelinesLoaded; +} g_mtlCacheState; + +uint32 MetalPipelineCache::BeginLoading(uint64 cacheTitleId) +{ + std::error_code ec; + fs::create_directories(ActiveSettings::GetCachePath("shaderCache/transferable"), ec); + const auto pathCacheFile = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_mtlpipeline.bin", cacheTitleId); + + // init cache loader state + g_mtlCacheState.pipelineLoadIndex = 0; + g_mtlCacheState.pipelineMaxFileIndex = 0; + g_mtlCacheState.pipelinesLoaded = 0; + g_mtlCacheState.pipelinesQueued = 0; + + // start async compilation threads + m_compilationCount.store(0); + m_compilationQueue.clear(); + + // get core count + uint32 cpuCoreCount = GetPhysicalCoreCount(); + m_numCompilationThreads = std::clamp(cpuCoreCount, 1u, 8u); + // TODO: uncomment? 
+ //if (VulkanRenderer::GetInstance()->GetDisableMultithreadedCompilation()) + // m_numCompilationThreads = 1; + + for (uint32 i = 0; i < m_numCompilationThreads; i++) + { + std::thread compileThread(&MetalPipelineCache::CompilerThread, this); + compileThread.detach(); + } + + // open cache file or create it + cemu_assert_debug(s_cache == nullptr); + s_cache = FileCache::Open(pathCacheFile, true, LatteShaderCache_getPipelineCacheExtraVersion(cacheTitleId)); + if (!s_cache) + { + cemuLog_log(LogType::Force, "Failed to open or create Vulkan pipeline cache file: {}", _pathToUtf8(pathCacheFile)); + return 0; + } + else + { + s_cache->UseCompression(false); + g_mtlCacheState.pipelineMaxFileIndex = s_cache->GetMaximumFileIndex(); + } + return s_cache->GetFileCount(); +} + +bool MetalPipelineCache::UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders) +{ + pipelinesLoadedTotal = g_mtlCacheState.pipelinesLoaded; + pipelinesMissingShaders = 0; + while (g_mtlCacheState.pipelineLoadIndex <= g_mtlCacheState.pipelineMaxFileIndex) + { + if (m_compilationQueue.size() >= 50) + { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + return true; // queue up to 50 entries at a time + } + + uint64 fileNameA, fileNameB; + std::vector fileData; + if (s_cache->GetFileByIndex(g_mtlCacheState.pipelineLoadIndex, &fileNameA, &fileNameB, fileData)) + { + // queue for async compilation + g_mtlCacheState.pipelinesQueued++; + m_compilationQueue.push(std::move(fileData)); + g_mtlCacheState.pipelineLoadIndex++; + return true; + } + g_mtlCacheState.pipelineLoadIndex++; + } + if (g_mtlCacheState.pipelinesLoaded != g_mtlCacheState.pipelinesQueued) + { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + return true; // pipelines still compiling + } + return false; // done +} + +void MetalPipelineCache::EndLoading() +{ + // shut down compilation threads + uint32 threadCount = m_numCompilationThreads; + m_numCompilationThreads = 0; // signal thread shutdown 
+ for (uint32 i = 0; i < threadCount; i++) + { + m_compilationQueue.push({}); // push empty workload for every thread. Threads then will shutdown after checking for m_numCompilationThreads == 0 + } + // keep cache file open for writing of new pipelines +} + +void MetalPipelineCache::Close() +{ + if(s_cache) + { + delete s_cache; + s_cache = nullptr; + } +} + +struct CachedPipeline +{ + struct ShaderHash + { + uint64 baseHash; + uint64 auxHash; + bool isPresent{}; + + void set(uint64 baseHash, uint64 auxHash) + { + this->baseHash = baseHash; + this->auxHash = auxHash; + this->isPresent = true; + } + }; + + ShaderHash vsHash; // includes fetch shader + ShaderHash gsHash; + ShaderHash psHash; + + Latte::GPUCompactedRegisterState gpuState; +}; + +void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) +{ + static FSpinlock s_spinlockSharedInternal; + + // deserialize file + LatteContextRegister* lcr = new LatteContextRegister(); + s_spinlockSharedInternal.lock(); + CachedPipeline* cachedPipeline = new CachedPipeline(); + s_spinlockSharedInternal.unlock(); + + MemStreamReader streamReader(fileData.data(), fileData.size()); + if (!DeserializePipeline(streamReader, *cachedPipeline)) + { + // failed to deserialize + s_spinlockSharedInternal.lock(); + delete lcr; + delete cachedPipeline; + s_spinlockSharedInternal.unlock(); + return; + } + // restored register view from compacted state + Latte::LoadGPURegisterState(*lcr, cachedPipeline->gpuState); + + LatteDecompilerShader* vertexShader = nullptr; + LatteDecompilerShader* geometryShader = nullptr; + LatteDecompilerShader* pixelShader = nullptr; + // find vertex shader + if (cachedPipeline->vsHash.isPresent) + { + vertexShader = LatteSHRC_FindVertexShader(cachedPipeline->vsHash.baseHash, cachedPipeline->vsHash.auxHash); + if (!vertexShader) + { + cemuLog_logDebug(LogType::Force, "Vertex shader not found in cache"); + return; + } + } + // find geometry shader + if (cachedPipeline->gsHash.isPresent) + { + 
geometryShader = LatteSHRC_FindGeometryShader(cachedPipeline->gsHash.baseHash, cachedPipeline->gsHash.auxHash); + if (!geometryShader) + { + cemuLog_logDebug(LogType::Force, "Geometry shader not found in cache"); + return; + } + } + // find pixel shader + if (cachedPipeline->psHash.isPresent) + { + pixelShader = LatteSHRC_FindPixelShader(cachedPipeline->psHash.baseHash, cachedPipeline->psHash.auxHash); + if (!pixelShader) + { + cemuLog_logDebug(LogType::Force, "Pixel shader not found in cache"); + return; + } + } + + if (!pixelShader) + { + cemu_assert_debug(false); + return; + } + + // create pipeline info + m_pipelineIsCachedLock.lock(); + m_pipelineIsCachedLock.unlock(); + throw; + // TODO: uncomment + /* + // compile + { + MetalPipelineCompiler pp(m_mtlr); + if (!pp.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, activeFBO, activeFBO, *lcr)) + { + s_spinlockSharedInternal.lock(); + delete lcr; + delete cachedPipeline; + s_spinlockSharedInternal.unlock(); + return; + } + pp.Compile(true, true); + // destroy pp early + } + // on success, calculate pipeline hash and flag as present in cache + uint64 pipelineBaseHash = vertexShader->baseHash; + uint64 pipelineStateHash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, activeFBO, activeFBO, *lcr); + m_pipelineIsCachedLock.lock(); + m_pipelineIsCached.emplace(pipelineBaseHash, pipelineStateHash); + m_pipelineIsCachedLock.unlock(); + */ + + // clean up + s_spinlockSharedInternal.lock(); + delete lcr; + delete cachedPipeline; + s_spinlockSharedInternal.unlock(); +} + +bool MetalPipelineCache::HasPipelineCached(uint64 baseHash, uint64 pipelineStateHash) +{ + PipelineHash ph(baseHash, pipelineStateHash); + return m_pipelineIsCached.find(ph) != m_pipelineIsCached.end(); +} + +ConcurrentQueue g_mtlPipelineCachingQueue; + +void MetalPipelineCache::AddCurrentStateToCache(uint64 baseHash, uint64 pipelineStateHash) +{ + m_pipelineIsCached.emplace(baseHash, pipelineStateHash); 
+ if (!m_pipelineCacheStoreThread) + { + m_pipelineCacheStoreThread = new std::thread(&MetalPipelineCache::WorkerThread, this); + m_pipelineCacheStoreThread->detach(); + } + // fill job structure with cached GPU state + // for each cached pipeline we store: + // - Active shaders (referenced by hash) + // - An almost-complete register state of the GPU (minus some ALU uniform constants which aren't relevant) + CachedPipeline* job = new CachedPipeline(); + auto vs = LatteSHRC_GetActiveVertexShader(); + auto gs = LatteSHRC_GetActiveGeometryShader(); + auto ps = LatteSHRC_GetActivePixelShader(); + if (vs) + job->vsHash.set(vs->baseHash, vs->auxHash); + if (gs) + job->gsHash.set(gs->baseHash, gs->auxHash); + if (ps) + job->psHash.set(ps->baseHash, ps->auxHash); + Latte::StoreGPURegisterState(LatteGPUState.contextNew, job->gpuState); + // queue job + g_mtlPipelineCachingQueue.push(job); +} + +bool MetalPipelineCache::SerializePipeline(MemStreamWriter& memWriter, CachedPipeline& cachedPipeline) +{ + memWriter.writeBE(0x01); // version + uint8 presentMask = 0; + if (cachedPipeline.vsHash.isPresent) + presentMask |= 1; + if (cachedPipeline.gsHash.isPresent) + presentMask |= 2; + if (cachedPipeline.psHash.isPresent) + presentMask |= 4; + memWriter.writeBE(presentMask); + if (cachedPipeline.vsHash.isPresent) + { + memWriter.writeBE(cachedPipeline.vsHash.baseHash); + memWriter.writeBE(cachedPipeline.vsHash.auxHash); + } + if (cachedPipeline.gsHash.isPresent) + { + memWriter.writeBE(cachedPipeline.gsHash.baseHash); + memWriter.writeBE(cachedPipeline.gsHash.auxHash); + } + if (cachedPipeline.psHash.isPresent) + { + memWriter.writeBE(cachedPipeline.psHash.baseHash); + memWriter.writeBE(cachedPipeline.psHash.auxHash); + } + Latte::SerializeRegisterState(cachedPipeline.gpuState, memWriter); + return true; +} + +bool MetalPipelineCache::DeserializePipeline(MemStreamReader& memReader, CachedPipeline& cachedPipeline) +{ + // version + if (memReader.readBE() != 1) + { + 
cemuLog_log(LogType::Force, "Cached Vulkan pipeline corrupted or has unknown version"); + return false; + } + // shader hashes + uint8 presentMask = memReader.readBE(); + if (presentMask & 1) + { + uint64 baseHash = memReader.readBE(); + uint64 auxHash = memReader.readBE(); + cachedPipeline.vsHash.set(baseHash, auxHash); + } + if (presentMask & 2) + { + uint64 baseHash = memReader.readBE(); + uint64 auxHash = memReader.readBE(); + cachedPipeline.gsHash.set(baseHash, auxHash); + } + if (presentMask & 4) + { + uint64 baseHash = memReader.readBE(); + uint64 auxHash = memReader.readBE(); + cachedPipeline.psHash.set(baseHash, auxHash); + } + // deserialize GPU state + if (!Latte::DeserializeRegisterState(cachedPipeline.gpuState, memReader)) + { + return false; + } + cemu_assert_debug(!memReader.hasError()); + return true; +} + +int MetalPipelineCache::CompilerThread() +{ + SetThreadName("plCacheCompiler"); + while (m_numCompilationThreads != 0) + { + std::vector pipelineData = m_compilationQueue.pop(); + if(pipelineData.empty()) + continue; + LoadPipelineFromCache(pipelineData); + ++g_mtlCacheState.pipelinesLoaded; + } + return 0; +} + +void MetalPipelineCache::WorkerThread() +{ + SetThreadName("plCacheWriter"); + while (true) + { + CachedPipeline* job; + g_mtlPipelineCachingQueue.pop(job); + if (!s_cache) + { + delete job; + continue; + } + // serialize + MemStreamWriter memWriter(1024 * 4); + SerializePipeline(memWriter, *job); + auto blob = memWriter.getResult(); + // file name is derived from data hash + uint8 hash[SHA256_DIGEST_LENGTH]; + SHA256(blob.data(), blob.size(), hash); + uint64 nameA = *(uint64be*)(hash + 0); + uint64 nameB = *(uint64be*)(hash + 8); + s_cache->AddFileAsync({ nameA, nameB }, blob.data(), blob.size()); + delete job; + } +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index 18b163f6d..5e6d476fe 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -1,18 +1,54 @@ #pragma once #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" +#include "util/helpers/ConcurrentQueue.h" +#include "util/helpers/fspinlock.h" // TODO: binary archives class MetalPipelineCache { public: - static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + struct PipelineHash + { + PipelineHash(uint64 h0, uint64 h1) : h0(h0), h1(h1) {}; + + uint64 h0; + uint64 h1; + + bool operator==(const PipelineHash& r) const + { + return h0 == r.h0 && h1 == r.h1; + } + + struct HashFunc + { + size_t operator()(const PipelineHash& v) const + { + static_assert(sizeof(uint64) == sizeof(size_t)); + return v.h0 ^ v.h1; + } + }; + }; MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalPipelineCache(); MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + // Cache loading + uint32 BeginLoading(uint64 cacheTitleId); // returns count of pipelines stored in cache + bool UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders); + void EndLoading(); + void LoadPipelineFromCache(std::span fileData); + void Close(); // called on title exit + + bool HasPipelineCached(uint64 baseHash, uint64 pipelineStateHash); + void AddCurrentStateToCache(uint64 baseHash, uint64 pipelineStateHash); + + // pipeline serialization for file + bool SerializePipeline(class MemStreamWriter& memWriter, struct CachedPipeline& cachedPipeline); + bool 
DeserializePipeline(class MemStreamReader& memReader, struct CachedPipeline& cachedPipeline); + // Debug size_t GetPipelineCacheSize() const { return m_pipelineCache.size(); } @@ -20,4 +56,19 @@ class MetalPipelineCache class MetalRenderer* m_mtlr; std::map m_pipelineCache; + + std::thread* m_pipelineCacheStoreThread; + + std::unordered_set m_pipelineIsCached; + FSpinlock m_pipelineIsCachedLock; + class FileCache* s_cache; + + std::atomic_uint32_t m_numCompilationThreads{ 0 }; + ConcurrentQueue> m_compilationQueue; + std::atomic_uint32_t m_compilationCount; + + static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + + int CompilerThread(); + void WorkerThread(); }; From 6b47d4f61e503a7f3e43c5894e593003882917f9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 15 Oct 2024 07:48:59 +0200 Subject: [PATCH 212/368] implement pipeline cache serializing --- src/Cafe/HW/Latte/Core/LatteShaderCache.cpp | 32 +++++++--- .../Renderer/Metal/MetalPipelineCache.cpp | 62 ++++++++++++------- .../Latte/Renderer/Metal/MetalPipelineCache.h | 7 ++- 3 files changed, 66 insertions(+), 35 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index cdb41184e..126dcc500 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -64,7 +64,7 @@ FileCache* s_shaderCacheGeneric = nullptr; // contains hardware and version inde #define SHADER_CACHE_TYPE_PIXEL (2) bool LatteShaderCache_readSeparableShader(uint8* shaderInfoData, sint32 shaderInfoSize); -void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId); +void LatteShaderCache_LoadPipelineCache(uint64 cacheTitleId); bool LatteShaderCache_updatePipelineLoadingProgress(); void 
LatteShaderCache_ShowProgress(const std::function & loadUpdateFunc, bool isPipelines); @@ -347,9 +347,9 @@ void LatteShaderCache_Load() cemuLog_log(LogType::Force, "Shader cache loaded with {} shaders. Commited mem {}MB. Took {}ms", numLoadedShaders, (sint32)(memCommited/1024/1024), timeLoad); #endif LatteShaderCache_finish(); - // if Vulkan then also load pipeline cache - if (g_renderer->GetType() == RendererAPI::Vulkan) - LatteShaderCache_LoadVulkanPipelineCache(cacheTitleId); + // if Vulkan or Metal then also load pipeline cache + if (g_renderer->GetType() == RendererAPI::Vulkan || g_renderer->GetType() == RendererAPI::Metal) + LatteShaderCache_LoadPipelineCache(cacheTitleId); g_renderer->BeginFrame(true); @@ -504,13 +504,18 @@ void LatteShaderCache_ShowProgress(const std::function & loadUpdateF } } -void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId) +void LatteShaderCache_LoadPipelineCache(uint64 cacheTitleId) { - auto& pipelineCache = VulkanPipelineStableCache::GetInstance(); - g_shaderCacheLoaderState.pipelineFileCount = pipelineCache.BeginLoading(cacheTitleId); + if (g_renderer->GetType() == RendererAPI::Vulkan) + g_shaderCacheLoaderState.pipelineFileCount = VulkanPipelineStableCache::GetInstance().BeginLoading(cacheTitleId); + else if (g_renderer->GetType() == RendererAPI::Metal) + g_shaderCacheLoaderState.pipelineFileCount = MetalPipelineCache::GetInstance().BeginLoading(cacheTitleId); g_shaderCacheLoaderState.loadedPipelines = 0; LatteShaderCache_ShowProgress(LatteShaderCache_updatePipelineLoadingProgress, true); - pipelineCache.EndLoading(); + if (g_renderer->GetType() == RendererAPI::Vulkan) + VulkanPipelineStableCache::GetInstance().EndLoading(); + else if (g_renderer->GetType() == RendererAPI::Metal) + MetalPipelineCache::GetInstance().EndLoading(); if(Latte_GetStopSignal()) LatteThread_Exit(); } @@ -518,7 +523,12 @@ void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId) bool LatteShaderCache_updatePipelineLoadingProgress() 
{ uint32 pipelinesMissingShaders = 0; - return VulkanPipelineStableCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); + if (g_renderer->GetType() == RendererAPI::Vulkan) + return VulkanPipelineStableCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); + else if (g_renderer->GetType() == RendererAPI::Metal) + return MetalPipelineCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); + + return false; } uint64 LatteShaderCache_getShaderNameInTransferableCache(uint64 baseHash, uint32 shaderType) @@ -783,9 +793,11 @@ void LatteShaderCache_Close() else if (g_renderer->GetType() == RendererAPI::Metal) RendererShaderMtl::ShaderCacheLoading_Close(); - // if Vulkan then also close pipeline cache + // if Vulkan or Metal then also close pipeline cache if (g_renderer->GetType() == RendererAPI::Vulkan) VulkanPipelineStableCache::GetInstance().Close(); + else if (g_renderer->GetType() == RendererAPI::Metal) + MetalPipelineCache::GetInstance().Close(); } #include diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index ea95c2662..2922d70ce 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -15,6 +15,43 @@ #include "config/ActiveSettings.h" #include +MetalPipelineCache* g_mtlPipelineCache = nullptr; + +MetalPipelineCache& MetalPipelineCache::GetInstance() +{ + return *g_mtlPipelineCache; +} + +MetalPipelineCache::MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} +{ + g_mtlPipelineCache = this; +} + +MetalPipelineCache::~MetalPipelineCache() +{ + for (auto& [key, value] : m_pipelineCache) + { + value->release(); + } +} + +MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const 
LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +{ + uint64 hash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, lcr); + auto& pipeline = m_pipelineCache[hash]; + if (pipeline) + return pipeline; + + MetalPipelineCompiler compiler(m_mtlr); + compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, lcr); + pipeline = compiler.Compile(false, true); + + if (!HasPipelineCached(vertexShader->baseHash, hash)) + AddCurrentStateToCache(vertexShader->baseHash, hash); + + return pipeline; +} + uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) { // Hash @@ -117,27 +154,6 @@ uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchSh return stateHash; } -MetalPipelineCache::~MetalPipelineCache() -{ - for (auto& [key, value] : m_pipelineCache) - { - value->release(); - } -} - -MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) -{ - auto& pipeline = m_pipelineCache[CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, lcr)]; - if (pipeline) - return pipeline; - - MetalPipelineCompiler compiler(m_mtlr); - compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, 
lcr); - pipeline = compiler.Compile(false, true); - - return pipeline; -} - struct { uint32 pipelineLoadIndex; @@ -181,7 +197,7 @@ uint32 MetalPipelineCache::BeginLoading(uint64 cacheTitleId) s_cache = FileCache::Open(pathCacheFile, true, LatteShaderCache_getPipelineCacheExtraVersion(cacheTitleId)); if (!s_cache) { - cemuLog_log(LogType::Force, "Failed to open or create Vulkan pipeline cache file: {}", _pathToUtf8(pathCacheFile)); + cemuLog_log(LogType::Force, "Failed to open or create Metal pipeline cache file: {}", _pathToUtf8(pathCacheFile)); return 0; } else @@ -436,7 +452,7 @@ bool MetalPipelineCache::DeserializePipeline(MemStreamReader& memReader, CachedP // version if (memReader.readBE() != 1) { - cemuLog_log(LogType::Force, "Cached Vulkan pipeline corrupted or has unknown version"); + cemuLog_log(LogType::Force, "Cached Metal pipeline corrupted or has unknown version"); return false; } // shader hashes diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index 5e6d476fe..59f61a15e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -7,7 +7,7 @@ // TODO: binary archives class MetalPipelineCache { -public: +private: struct PipelineHash { PipelineHash(uint64 h0, uint64 h1) : h0(h0), h1(h1) {}; @@ -30,7 +30,10 @@ class MetalPipelineCache }; }; - MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} +public: + static MetalPipelineCache& GetInstance(); + + MetalPipelineCache(class MetalRenderer* metalRenderer); ~MetalPipelineCache(); MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); From cd21d957b3832c78ac5364cb5f8406efc91c204a Mon 
Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 15 Oct 2024 17:15:46 +0200 Subject: [PATCH 213/368] refactor fbos --- src/Cafe/CMakeLists.txt | 3 + .../Renderer/Metal/MetalAttachmentsInfo.cpp | 48 ++++++++++++++++ .../Renderer/Metal/MetalAttachmentsInfo.h | 15 +++++ .../Renderer/Metal/MetalPipelineCache.cpp | 55 +++++++++---------- .../Latte/Renderer/Metal/MetalPipelineCache.h | 4 +- .../Renderer/Metal/MetalPipelineCompiler.cpp | 45 ++++++++------- .../Renderer/Metal/MetalPipelineCompiler.h | 14 ++--- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 21 +++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 13 ++++- 9 files changed, 143 insertions(+), 75 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 3d1a02305..b30f8efef 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -534,6 +534,7 @@ if(APPLE) endif() if(ENABLE_METAL) + # TODO: sort alphabetically target_sources(CemuCafe PRIVATE HW/Latte/Renderer/Metal/MetalRenderer.cpp HW/Latte/Renderer/Metal/MetalRenderer.h @@ -555,6 +556,8 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/RendererShaderMtl.h HW/Latte/Renderer/Metal/CachedFBOMtl.cpp HW/Latte/Renderer/Metal/CachedFBOMtl.h + HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp + HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h HW/Latte/Renderer/Metal/MetalBufferAllocator.h HW/Latte/Renderer/Metal/MetalMemoryManager.cpp HW/Latte/Renderer/Metal/MetalMemoryManager.h diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp new file mode 100644 index 000000000..88a2dface --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp @@ -0,0 +1,48 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include 
"Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +MetalAttachmentsInfo::MetalAttachmentsInfo(class CachedFBOMtl* fbo) +{ + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + const auto& colorBuffer = fbo->colorBuffer[i]; + auto texture = static_cast(colorBuffer.texture); + if (!texture) + continue; + + colorFormats[i] = texture->format; + } + + // Depth stencil attachment + if (fbo->depthBuffer.texture) + { + auto texture = static_cast(fbo->depthBuffer.texture); + depthFormat = texture->format; + hasStencil = fbo->depthBuffer.hasStencil; + } +} + +MetalAttachmentsInfo::MetalAttachmentsInfo(const LatteContextRegister& lcr, const LatteDecompilerShader* pixelShader) +{ + uint8 cbMask = LatteMRT::GetActiveColorBufferMask(pixelShader, lcr); + bool dbMask = LatteMRT::GetActiveDepthBufferMask(lcr); + + // Color attachments + for (int i = 0; i < 8; ++i) + { + if ((cbMask & (1 << i)) == 0) + continue; + + colorFormats[i] = LatteMRT::GetColorBufferFormat(i, lcr); + } + + // Depth stencil attachment + if (dbMask) + { + Latte::E_GX2SURFFMT format = LatteMRT::GetDepthBufferFormat(lcr); + depthFormat = format; + hasStencil = GetMtlPixelFormatInfo(format, true).hasStencil; + } +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h new file mode 100644 index 000000000..c8ebe7c11 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h @@ -0,0 +1,15 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +class MetalAttachmentsInfo +{ +public: + MetalAttachmentsInfo() = default; + MetalAttachmentsInfo(class CachedFBOMtl* fbo); + MetalAttachmentsInfo(const LatteContextRegister& lcr, const class LatteDecompilerShader* pixelShader); + + Latte::E_GX2SURFFMT colorFormats[LATTE_NUM_COLOR_TARGET] = {Latte::E_GX2SURFFMT::INVALID_FORMAT}; + Latte::E_GX2SURFFMT depthFormat = Latte::E_GX2SURFFMT::INVALID_FORMAT; + 
bool hasStencil = false; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 2922d70ce..bb533b7f2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -11,6 +11,9 @@ #include "Cafe/HW/Latte/Core/LatteShaderCache.h" #include "Cemu/FileCache/FileCache.h" #include "HW/Latte/Core/LatteShader.h" +#include "HW/Latte/ISA/LatteReg.h" +#include "HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" #include "util/helpers/helpers.h" #include "config/ActiveSettings.h" #include @@ -35,15 +38,15 @@ MetalPipelineCache::~MetalPipelineCache() } } -MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { - uint64 hash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, lcr); + uint64 hash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); auto& pipeline = m_pipelineCache[hash]; if (pipeline) return pipeline; MetalPipelineCompiler compiler(m_mtlr); - compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, lcr); + compiler.InitFromState(fetchShader, 
vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); pipeline = compiler.Compile(false, true); if (!HasPipelineCached(vertexShader->baseHash, hash)) @@ -52,33 +55,32 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte return pipeline; } -uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { // Hash uint64 stateHash = 0; for (int i = 0; i < Latte::GPU_LIMITS::NUM_COLOR_ATTACHMENTS; ++i) { - auto textureView = static_cast(lastUsedFBO->colorBuffer[i].texture); - if (!textureView) - continue; + Latte::E_GX2SURFFMT format = lastUsedAttachmentsInfo.colorFormats[i]; + if (format == Latte::E_GX2SURFFMT::INVALID_FORMAT) + continue; - stateHash += textureView->GetRGBAView()->pixelFormat() + i * 31; + stateHash += GetMtlPixelFormat(format, false) + i * 31; stateHash = std::rotl(stateHash, 7); - if (activeFBO->colorBuffer[i].texture) + if (activeAttachmentsInfo.colorFormats[i] == Latte::E_GX2SURFFMT::INVALID_FORMAT) { stateHash += 1; stateHash = std::rotl(stateHash, 1); } } - if (lastUsedFBO->depthBuffer.texture) + if (lastUsedAttachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT) { - auto textureView = static_cast(lastUsedFBO->depthBuffer.texture); - stateHash += textureView->GetRGBAView()->pixelFormat(); + stateHash += GetMtlPixelFormat(lastUsedAttachmentsInfo.depthFormat, true); 
stateHash = std::rotl(stateHash, 7); - if (activeFBO->depthBuffer.texture) + if (activeAttachmentsInfo.depthFormat == Latte::E_GX2SURFFMT::INVALID_FORMAT) { stateHash += 1; stateHash = std::rotl(stateHash, 1); @@ -347,33 +349,28 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) return; } - // create pipeline info - m_pipelineIsCachedLock.lock(); - m_pipelineIsCachedLock.unlock(); - throw; - // TODO: uncomment - /* + MetalAttachmentsInfo attachmentsInfo(*lcr, pixelShader); + // compile { MetalPipelineCompiler pp(m_mtlr); - if (!pp.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, activeFBO, activeFBO, *lcr)) - { - s_spinlockSharedInternal.lock(); - delete lcr; - delete cachedPipeline; - s_spinlockSharedInternal.unlock(); - return; - } + pp.InitFromState(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr); + //{ + // s_spinlockSharedInternal.lock(); + // delete lcr; + // delete cachedPipeline; + // s_spinlockSharedInternal.unlock(); + // return; + //} pp.Compile(true, true); // destroy pp early } // on success, calculate pipeline hash and flag as present in cache uint64 pipelineBaseHash = vertexShader->baseHash; - uint64 pipelineStateHash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, activeFBO, activeFBO, *lcr); + uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr); m_pipelineIsCachedLock.lock(); m_pipelineIsCached.emplace(pipelineBaseHash, pipelineStateHash); m_pipelineIsCachedLock.unlock(); - */ // clean up s_spinlockSharedInternal.lock(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index 59f61a15e..d74b50904 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h 
@@ -36,7 +36,7 @@ class MetalPipelineCache MetalPipelineCache(class MetalRenderer* metalRenderer); ~MetalPipelineCache(); - MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); // Cache loading uint32 BeginLoading(uint64 cacheTitleId); // returns count of pipelines stored in cache @@ -70,7 +70,7 @@ class MetalPipelineCache ConcurrentQueue> m_compilationQueue; std::atomic_uint32_t m_compilationCount; - static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); int CompilerThread(); void WorkerThread(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 9eb29cb6a..e715ae26d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -10,6 +10,8 @@ #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Core/LatteShader.h" +#include "HW/Latte/ISA/LatteReg.h" +#include "Metal/MTLPixelFormat.hpp" static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) { @@ -189,7 +191,7 @@ extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; template -void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteDecompilerShader* pixelShader, const LatteContextRegister& lcr) +void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteDecompilerShader* pixelShader, const LatteContextRegister& lcr) { // Rasterization bool rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); @@ -222,17 +224,16 @@ void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFB uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) { - const auto& colorBuffer = lastUsedFBO->colorBuffer[i]; - auto texture = static_cast(colorBuffer.texture); - if (!texture) - { + Latte::E_GX2SURFFMT format = lastUsedAttachmentsInfo.colorFormats[i]; + if (format == Latte::E_GX2SURFFMT::INVALID_FORMAT) continue; - } + + MTL::PixelFormat pixelFormat = GetMtlPixelFormat(format, false); auto colorAttachment = desc->colorAttachments()->object(i); - colorAttachment->setPixelFormat(texture->GetRGBAView()->pixelFormat()); + colorAttachment->setPixelFormat(pixelFormat); // Disable writes if not in the active FBO - if (!activeFBO->colorBuffer[i].texture) + if (activeAttachmentsInfo.colorFormats[i] == Latte::E_GX2SURFFMT::INVALID_FORMAT) { 
colorAttachment->setWriteMask(MTL::ColorWriteMaskNone); continue; @@ -243,7 +244,7 @@ void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFB // Blending bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; // Only float data type is blendable - if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).dataType == MetalDataType::FLOAT) + if (blendEnabled && GetMtlPixelFormatInfo(format, false).dataType == MetalDataType::FLOAT) { colorAttachment->setBlendingEnabled(true); @@ -272,14 +273,12 @@ void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFB } // Depth stencil attachment - if (lastUsedFBO->depthBuffer.texture) + if (lastUsedAttachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT) { - auto texture = static_cast(lastUsedFBO->depthBuffer.texture); - desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - if (lastUsedFBO->depthBuffer.hasStencil) - { - desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat()); - } + MTL::PixelFormat pixelFormat = GetMtlPixelFormat(lastUsedAttachmentsInfo.depthFormat, true); + desc->setDepthAttachmentPixelFormat(pixelFormat); + if (lastUsedAttachmentsInfo.hasStencil) + desc->setStencilAttachmentPixelFormat(pixelFormat); } } @@ -306,7 +305,7 @@ MetalPipelineCompiler::~MetalPipelineCompiler() m_pipelineDescriptor->release(); } -void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& 
activeAttachmentsInfo, const LatteContextRegister& lcr) { const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); @@ -314,9 +313,9 @@ void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, c m_usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); if (m_usesGeometryShader) - InitFromStateMesh(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedFBO, activeFBO, lcr); + InitFromStateMesh(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); else - InitFromStateRender(fetchShader, vertexShader, pixelShader, lastUsedFBO, activeFBO, lcr); + InitFromStateRender(fetchShader, vertexShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); } MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread) @@ -358,7 +357,7 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool } } -void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { // Shaders auto vertexShaderMtl = static_cast(vertexShader->shader); @@ -437,7 +436,7 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha vertexDescriptor->release(); } - SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr); + SetFragmentState(desc, 
lastUsedAttachmentsInfo, activeAttachmentsInfo, pixelShader, lcr); m_pipelineDescriptor = desc; @@ -498,7 +497,7 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha */ } -void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr) +void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { auto objectShaderMtl = static_cast(vertexShader->shader); RendererShaderMtl* meshShaderMtl; @@ -517,7 +516,7 @@ void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShade desc->setObjectFunction(objectShaderMtl->GetFunction()); desc->setMeshFunction(meshShaderMtl->GetFunction()); - SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr); + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, pixelShader, lcr); m_pipelineDescriptor = desc; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index 282c174d0..e1e3e7543 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -1,11 +1,9 @@ #pragma once -#include +#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" -#include "Foundation/NSObject.hpp" -#include "HW/Latte/ISA/LatteReg.h" -#include "HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" -#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Cafe/HW/Latte/ISA/LatteReg.h" +#include 
"Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" class MetalPipelineCompiler { @@ -13,7 +11,7 @@ class MetalPipelineCompiler MetalPipelineCompiler(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalPipelineCompiler(); - void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); MTL::RenderPipelineState* Compile(bool forceCompile, bool isRenderThread); @@ -30,9 +28,9 @@ class MetalPipelineCompiler */ NS::Object* m_pipelineDescriptor; - void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); - void InitFromStateMesh(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr); + void InitFromStateMesh(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* 
geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); //void TryLoadBinaryArchive(); }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 76ed4c551..17050326f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -23,6 +23,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" #include "config/CemuConfig.h" #define IMGUI_IMPL_METAL_CPP @@ -511,13 +512,13 @@ LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo) { - if (cfbo == (LatteCachedFBO*)m_state.m_activeFBO) - m_state.m_activeFBO = nullptr; + if (cfbo == (LatteCachedFBO*)m_state.m_activeFBO.m_fbo) + m_state.m_activeFBO = {nullptr}; } void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) { - m_state.m_activeFBO = (CachedFBOMtl*)cfbo; + m_state.m_activeFBO = {(CachedFBOMtl*)cfbo, MetalAttachmentsInfo((CachedFBOMtl*)cfbo)}; } void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) @@ -1008,7 +1009,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Disable depth write when there is no depth attachment auto& depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL; bool depthWriteEnable = depthControl.get_Z_WRITE_ENABLE(); - if (!m_state.m_activeFBO->depthBuffer.texture) + if (!m_state.m_activeFBO.m_fbo->depthBuffer.texture) depthControl.set_Z_WRITE_ENABLE(false); MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); @@ -1222,7 +1223,7 @@ void 
MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 //} // Render pipeline state - MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO, m_state.m_activeFBO, LatteGPUState.contextNew); + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, LatteGPUState.contextNew); if (!renderPipelineState) return; @@ -1524,12 +1525,12 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr { if (m_encoderType == MetalEncoderType::Render) { - bool needsNewRenderPass = (m_state.m_lastUsedFBO == nullptr); + bool needsNewRenderPass = (m_state.m_lastUsedFBO.m_fbo == nullptr); if (!needsNewRenderPass) { for (uint8 i = 0; i < 8; i++) { - if (m_state.m_activeFBO->colorBuffer[i].texture && m_state.m_activeFBO->colorBuffer[i].texture != m_state.m_lastUsedFBO->colorBuffer[i].texture) + if (m_state.m_activeFBO.m_fbo->colorBuffer[i].texture && m_state.m_activeFBO.m_fbo->colorBuffer[i].texture != m_state.m_lastUsedFBO.m_fbo->colorBuffer[i].texture) { needsNewRenderPass = true; break; @@ -1539,7 +1540,7 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr if (!needsNewRenderPass) { - if (m_state.m_activeFBO->depthBuffer.texture && (m_state.m_activeFBO->depthBuffer.texture != m_state.m_lastUsedFBO->depthBuffer.texture || ( m_state.m_activeFBO->depthBuffer.hasStencil && !m_state.m_lastUsedFBO->depthBuffer.hasStencil))) + if (m_state.m_activeFBO.m_fbo->depthBuffer.texture && (m_state.m_activeFBO.m_fbo->depthBuffer.texture != m_state.m_lastUsedFBO.m_fbo->depthBuffer.texture || ( m_state.m_activeFBO.m_fbo->depthBuffer.hasStencil && !m_state.m_lastUsedFBO.m_fbo->depthBuffer.hasStencil))) { needsNewRenderPass = true; } @@ -1557,7 +1558,7 @@ 
MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr auto commandBuffer = GetCommandBuffer(); - auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO->GetRenderPassDescriptor()); + auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO.m_fbo->GetRenderPassDescriptor()); #ifdef CEMU_DEBUG_ASSERT renderCommandEncoder->setLabel(GetLabel("Render command encoder", renderCommandEncoder)); #endif @@ -1716,7 +1717,7 @@ bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader) // If the texture is also used in the current render pass, we need to end the render pass to "flush" the texture for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) { - auto colorTarget = m_state.m_activeFBO->colorBuffer[i].texture; + auto colorTarget = m_state.m_activeFBO.m_fbo->colorBuffer[i].texture; if (colorTarget && colorTarget->baseTexture == baseTexture) return true; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 526f33a5c..93c9a56d8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -5,6 +5,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" struct MetalBufferAllocation { @@ -121,6 +122,12 @@ struct MetalStreamoutState sint32 verticesPerInstance; }; +struct MetalActiveFBOState +{ + class CachedFBOMtl* m_fbo = nullptr; + MetalAttachmentsInfo m_attachmentsInfo; +}; + struct MetalState { MetalEncoderState m_encoderState{}; @@ -130,9 +137,9 @@ struct MetalState bool m_skipDrawSequence = false; bool m_isFirstDrawInRenderPass = true; - class CachedFBOMtl* m_activeFBO = nullptr; - // If the FBO changes, but it's the same FBO as the last one with some 
omitted attachments, this FBO doesn't change' - class CachedFBOMtl* m_lastUsedFBO = nullptr; + MetalActiveFBOState m_activeFBO; + // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change + MetalActiveFBOState m_lastUsedFBO; MetalBoundBuffer m_vertexBuffers[MAX_MTL_BUFFERS] = {{}}; // TODO: find out what is the max number of bound textures on the Wii U From 944cc8be7d4f721dad7d3320f3c8eefe87197f17 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 15 Oct 2024 17:47:47 +0200 Subject: [PATCH 214/368] store loaded pipelines --- .../Renderer/Metal/MetalPipelineCache.cpp | 29 +++++++------- .../Latte/Renderer/Metal/MetalPipelineCache.h | 39 ++++--------------- 2 files changed, 20 insertions(+), 48 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index bb533b7f2..910794aa5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -14,6 +14,7 @@ #include "HW/Latte/ISA/LatteReg.h" #include "HW/Latte/Renderer/Metal/LatteToMtl.h" #include "HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" +#include "Metal/MTLRenderPipeline.hpp" #include "util/helpers/helpers.h" #include "config/ActiveSettings.h" #include @@ -49,8 +50,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); pipeline = compiler.Compile(false, true); - if (!HasPipelineCached(vertexShader->baseHash, hash)) - AddCurrentStateToCache(vertexShader->baseHash, hash); + AddCurrentStateToCache(hash); return pipeline; } @@ -351,6 +351,7 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) MetalAttachmentsInfo attachmentsInfo(*lcr, pixelShader); + MTL::RenderPipelineState* pipeline = nullptr; // compile { MetalPipelineCompiler 
pp(m_mtlr); @@ -362,15 +363,18 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) // s_spinlockSharedInternal.unlock(); // return; //} - pp.Compile(true, true); + pipeline = pp.Compile(true, true); // destroy pp early } + // on success, calculate pipeline hash and flag as present in cache - uint64 pipelineBaseHash = vertexShader->baseHash; - uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr); - m_pipelineIsCachedLock.lock(); - m_pipelineIsCached.emplace(pipelineBaseHash, pipelineStateHash); - m_pipelineIsCachedLock.unlock(); + if (pipeline) + { + uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr); + m_pipelineCacheLock.lock(); + m_pipelineCache[pipelineStateHash] = pipeline; + m_pipelineCacheLock.unlock(); + } // clean up s_spinlockSharedInternal.lock(); @@ -379,17 +383,10 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) s_spinlockSharedInternal.unlock(); } -bool MetalPipelineCache::HasPipelineCached(uint64 baseHash, uint64 pipelineStateHash) -{ - PipelineHash ph(baseHash, pipelineStateHash); - return m_pipelineIsCached.find(ph) != m_pipelineIsCached.end(); -} - ConcurrentQueue g_mtlPipelineCachingQueue; -void MetalPipelineCache::AddCurrentStateToCache(uint64 baseHash, uint64 pipelineStateHash) +void MetalPipelineCache::AddCurrentStateToCache(uint64 pipelineStateHash) { - m_pipelineIsCached.emplace(baseHash, pipelineStateHash); if (!m_pipelineCacheStoreThread) { m_pipelineCacheStoreThread = new std::thread(&MetalPipelineCache::WorkerThread, this); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index d74b50904..be26bdee0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -7,29 +7,6 @@ // TODO: binary archives class MetalPipelineCache { -private: - struct PipelineHash - { - PipelineHash(uint64 h0, uint64 h1) : h0(h0), h1(h1) {}; - - uint64 h0; - uint64 h1; - - bool operator==(const PipelineHash& r) const - { - return h0 == r.h0 && h1 == r.h1; - } - - struct HashFunc - { - size_t operator()(const PipelineHash& v) const - { - static_assert(sizeof(uint64) == sizeof(size_t)); - return v.h0 ^ v.h1; - } - }; - }; - public: static MetalPipelineCache& GetInstance(); @@ -45,13 +22,6 @@ class MetalPipelineCache void LoadPipelineFromCache(std::span fileData); void Close(); // called on title exit - bool HasPipelineCached(uint64 baseHash, uint64 pipelineStateHash); - void AddCurrentStateToCache(uint64 baseHash, uint64 pipelineStateHash); - - // pipeline serialization for file - bool SerializePipeline(class MemStreamWriter& memWriter, struct CachedPipeline& cachedPipeline); - bool DeserializePipeline(class MemStreamReader& memReader, struct CachedPipeline& cachedPipeline); - // Debug size_t GetPipelineCacheSize() const { return m_pipelineCache.size(); } @@ -59,11 +29,10 @@ class MetalPipelineCache class MetalRenderer* m_mtlr; std::map m_pipelineCache; + FSpinlock m_pipelineCacheLock; std::thread* m_pipelineCacheStoreThread; - std::unordered_set m_pipelineIsCached; - FSpinlock m_pipelineIsCachedLock; class FileCache* s_cache; std::atomic_uint32_t m_numCompilationThreads{ 0 }; @@ -72,6 +41,12 @@ class MetalPipelineCache static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + void AddCurrentStateToCache(uint64 pipelineStateHash); + + // pipeline serialization for file + bool 
SerializePipeline(class MemStreamWriter& memWriter, struct CachedPipeline& cachedPipeline); + bool DeserializePipeline(class MemStreamReader& memReader, struct CachedPipeline& cachedPipeline); + int CompilerThread(); void WorkerThread(); }; From 79f5586c6ce51835bc73ddcdbb38748b87611e56 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 15 Oct 2024 18:32:12 +0200 Subject: [PATCH 215/368] report pipeline compilation count --- .../Renderer/Metal/MetalPipelineCache.cpp | 4 +- .../Renderer/Metal/MetalPipelineCompiler.cpp | 51 +++++++++++-------- .../Renderer/Metal/MetalPipelineCompiler.h | 2 +- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 910794aa5..4a202cc44 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -48,7 +48,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte MetalPipelineCompiler compiler(m_mtlr); compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); - pipeline = compiler.Compile(false, true); + pipeline = compiler.Compile(false, true, true); AddCurrentStateToCache(hash); @@ -363,7 +363,7 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) // s_spinlockSharedInternal.unlock(); // return; //} - pipeline = pp.Compile(true, true); + pipeline = pp.Compile(true, true, false); // destroy pp early } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index e715ae26d..33d1ee7fc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -10,8 +10,11 @@ #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include 
"Cafe/HW/Latte/Core/LatteShader.h" -#include "HW/Latte/ISA/LatteReg.h" -#include "Metal/MTLPixelFormat.hpp" +#include + +extern std::atomic_int g_compiling_pipelines; +extern std::atomic_int g_compiling_pipelines_async; +extern std::atomic_uint64_t g_compiling_pipelines_syncTimeSum; static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) { @@ -318,8 +321,12 @@ void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, c InitFromStateRender(fetchShader, vertexShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); } -MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread) +MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay) { + MTL::RenderPipelineState* pipeline = nullptr; + NS::Error* error = nullptr; + + auto start = std::chrono::high_resolution_clock::now(); if (m_usesGeometryShader) { auto desc = static_cast(m_pipelineDescriptor); @@ -328,15 +335,7 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool #ifdef CEMU_DEBUG_ASSERT desc->setLabel(GetLabel("Mesh render pipeline state", desc)); #endif - MTL::RenderPipelineState* pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); - desc->release(); - if (error) - { - cemuLog_log(LogType::Force, "error creating mesh render pipeline state: {}", error->localizedDescription()->utf8String()); - error->release(); - } - - return pipeline; + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); } else { @@ -346,15 +345,27 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool #ifdef CEMU_DEBUG_ASSERT desc->setLabel(GetLabel("Render pipeline state", desc)); #endif - 
MTL::RenderPipelineState* pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); - if (error) - { - cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); - error->release(); - } - - return pipeline; + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); } + auto end = std::chrono::high_resolution_clock::now(); + + auto creationDuration = std::chrono::duration_cast(end - start).count(); + + if (error) + { + cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); + error->release(); + } + else if (showInOverlay) + { + if (isRenderThread) + g_compiling_pipelines_syncTimeSum += creationDuration; + else + g_compiling_pipelines_async++; + g_compiling_pipelines++; + } + + return pipeline; } void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index e1e3e7543..39a4b8a4c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -13,7 +13,7 @@ class MetalPipelineCompiler void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); - MTL::RenderPipelineState* Compile(bool forceCompile, bool isRenderThread); + 
MTL::RenderPipelineState* Compile(bool forceCompile, bool isRenderThread, bool showInOverlay); private: class MetalRenderer* m_mtlr; From d1c69e99459f90b86e9fe1b9ae5bd2b70b766d38 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 15 Oct 2024 19:19:16 +0200 Subject: [PATCH 216/368] set shader just before compiling --- .../Renderer/Metal/MetalPipelineCompiler.cpp | 64 ++++++++++--------- .../Renderer/Metal/MetalPipelineCompiler.h | 7 +- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 33d1ee7fc..ee01f04bb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -10,6 +10,8 @@ #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Core/LatteShader.h" +#include "HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "HW/Latte/Renderer/RendererShader.h" #include extern std::atomic_int g_compiling_pipelines; @@ -194,7 +196,7 @@ extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; template -void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteDecompilerShader* pixelShader, const LatteContextRegister& lcr) +void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { // Rasterization bool rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); @@ -211,16 +213,13 @@ void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsIn if (cullFront && cullBack) rasterizationEnabled = false; - auto pixelShaderMtl = static_cast(pixelShader->shader); - - if (!rasterizationEnabled || !pixelShaderMtl) + // TODO: check if the pixel 
shader is valid as well? + if (!rasterizationEnabled/* || !pixelShaderMtl*/) { desc->setRasterizationEnabled(false); return; } - desc->setFragmentFunction(pixelShaderMtl->GetFunction()); - // Color attachments const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); @@ -310,15 +309,29 @@ MetalPipelineCompiler::~MetalPipelineCompiler() void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { + // Shaders + m_vertexShader = static_cast(vertexShader->shader); + if (geometryShader) + { + m_geometryShader = static_cast(geometryShader->shader); + } + else + { + // If there is no geometry shader, it means that we are emulating rects + m_geometryShader = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); + } + m_pixelShader = static_cast(pixelShader->shader); + + // Check if the pipeline uses a geometry shader const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); m_usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); if (m_usesGeometryShader) - InitFromStateMesh(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + InitFromStateMesh(fetchShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); else - InitFromStateRender(fetchShader, vertexShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); } MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool 
forceCompile, bool isRenderThread, bool showInOverlay) @@ -331,6 +344,11 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool { auto desc = static_cast(m_pipelineDescriptor); + // Shaders + desc->setObjectFunction(m_vertexShader->GetFunction()); + desc->setMeshFunction(m_geometryShader->GetFunction()); + desc->setFragmentFunction(m_pixelShader->GetFunction()); + NS::Error* error = nullptr; #ifdef CEMU_DEBUG_ASSERT desc->setLabel(GetLabel("Mesh render pipeline state", desc)); @@ -341,6 +359,10 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool { auto desc = static_cast(m_pipelineDescriptor); + // Shaders + desc->setVertexFunction(m_vertexShader->GetFunction()); + desc->setFragmentFunction(m_pixelShader->GetFunction()); + NS::Error* error = nullptr; #ifdef CEMU_DEBUG_ASSERT desc->setLabel(GetLabel("Render pipeline state", desc)); @@ -368,14 +390,10 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool return pipeline; } -void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { - // Shaders - auto vertexShaderMtl = static_cast(vertexShader->shader); - // Render pipeline state MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); - desc->setVertexFunction(vertexShaderMtl->GetFunction()); // Vertex descriptor if (!fetchShader->mtlFetchVertexManually) @@ -447,7 +465,7 @@ void MetalPipelineCompiler::InitFromStateRender(const 
LatteFetchShader* fetchSha vertexDescriptor->release(); } - SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, pixelShader, lcr); + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); m_pipelineDescriptor = desc; @@ -508,26 +526,12 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha */ } -void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { - auto objectShaderMtl = static_cast(vertexShader->shader); - RendererShaderMtl* meshShaderMtl; - if (geometryShader) - { - meshShaderMtl = static_cast(geometryShader->shader); - } - else - { - // If there is no geometry shader, it means that we are emulating rects - meshShaderMtl = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); - } - // Render pipeline state MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); - desc->setObjectFunction(objectShaderMtl->GetFunction()); - desc->setMeshFunction(meshShaderMtl->GetFunction()); - SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, pixelShader, lcr); + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); m_pipelineDescriptor = desc; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index 39a4b8a4c..4f0febefb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -18,6 +18,9 @@ class MetalPipelineCompiler private: class MetalRenderer* m_mtlr; + const class RendererShaderMtl* m_vertexShader; + const class RendererShaderMtl* m_geometryShader; + const class RendererShaderMtl* m_pixelShader; bool m_usesGeometryShader; /* @@ -28,9 +31,9 @@ class MetalPipelineCompiler */ NS::Object* m_pipelineDescriptor; - void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); - void InitFromStateMesh(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + void InitFromStateMesh(const LatteFetchShader* fetchShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); //void TryLoadBinaryArchive(); }; From cbde7f983cdd2ec5736281a79257f759349b41b4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 15 Oct 2024 19:48:32 +0200 Subject: [PATCH 217/368] force compile shaders if needed --- .../Renderer/Metal/MetalPipelineCompiler.cpp | 55 +++++++++++++------ .../Renderer/Metal/MetalPipelineCompiler.h | 6 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 28 ++++------ 3 files changed, 51 insertions(+), 38 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index ee01f04bb..54aa83b1b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -309,25 +309,22 @@ MetalPipelineCompiler::~MetalPipelineCompiler() void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { - // Shaders - m_vertexShader = static_cast(vertexShader->shader); - if (geometryShader) - { - m_geometryShader = static_cast(geometryShader->shader); - } - else - { - // If there is no geometry shader, it means that we are emulating rects - m_geometryShader = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); - } - m_pixelShader = static_cast(pixelShader->shader); - // Check if the pipeline uses a geometry shader const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); m_usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + // Shaders + m_vertexShaderMtl = static_cast(vertexShader->shader); + if (geometryShader) + m_geometryShaderMtl = static_cast(geometryShader->shader); + else if (isPrimitiveRect) + m_geometryShaderMtl = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); + else + m_geometryShaderMtl = nullptr; + m_pixelShaderMtl = static_cast(pixelShader->shader); + if (m_usesGeometryShader) InitFromStateMesh(fetchShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); else @@ -336,6 +333,28 @@ void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, c 
MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay) { + if (forceCompile) + { + // if some shader stages are not compiled yet, compile them now + if (m_vertexShaderMtl && !m_vertexShaderMtl->IsCompiled()) + m_vertexShaderMtl->PreponeCompilation(isRenderThread); + if (m_geometryShaderMtl && !m_geometryShaderMtl->IsCompiled()) + m_geometryShaderMtl->PreponeCompilation(isRenderThread); + if (m_pixelShaderMtl && !m_pixelShaderMtl->IsCompiled()) + m_pixelShaderMtl->PreponeCompilation(isRenderThread); + } + else + { + // fail early if some shader stages are not compiled + if (m_vertexShaderMtl && !m_vertexShaderMtl->IsCompiled()) + return nullptr; + if (m_geometryShaderMtl && !m_geometryShaderMtl->IsCompiled()) + return nullptr; + if (m_pixelShaderMtl && !m_pixelShaderMtl->IsCompiled()) + return nullptr; + } + + // Compile MTL::RenderPipelineState* pipeline = nullptr; NS::Error* error = nullptr; @@ -345,9 +364,9 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool auto desc = static_cast(m_pipelineDescriptor); // Shaders - desc->setObjectFunction(m_vertexShader->GetFunction()); - desc->setMeshFunction(m_geometryShader->GetFunction()); - desc->setFragmentFunction(m_pixelShader->GetFunction()); + desc->setObjectFunction(m_vertexShaderMtl->GetFunction()); + desc->setMeshFunction(m_geometryShaderMtl->GetFunction()); + desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); NS::Error* error = nullptr; #ifdef CEMU_DEBUG_ASSERT @@ -360,8 +379,8 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool auto desc = static_cast(m_pipelineDescriptor); // Shaders - desc->setVertexFunction(m_vertexShader->GetFunction()); - desc->setFragmentFunction(m_pixelShader->GetFunction()); + desc->setVertexFunction(m_vertexShaderMtl->GetFunction()); + desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); NS::Error* error = nullptr; #ifdef CEMU_DEBUG_ASSERT diff 
--git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index 4f0febefb..f39b1fb5e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -18,9 +18,9 @@ class MetalPipelineCompiler private: class MetalRenderer* m_mtlr; - const class RendererShaderMtl* m_vertexShader; - const class RendererShaderMtl* m_geometryShader; - const class RendererShaderMtl* m_pixelShader; + class RendererShaderMtl* m_vertexShaderMtl; + class RendererShaderMtl* m_geometryShaderMtl; + class RendererShaderMtl* m_pixelShaderMtl; bool m_usesGeometryShader; /* diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 17050326f..2b420e6e2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -944,15 +944,9 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Shaders LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); - if (vertexShader && !vertexShader->shader->IsCompiled()) - return; LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); - if (geometryShader && !geometryShader->shader->IsCompiled()) - return; LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); const auto fetchShader = LatteSHRC_GetActiveFetchShader(); - if (vertexShader && !pixelShader->shader->IsCompiled()) - return; bool neverSkipAccurateBarrier = false; @@ -1004,6 +998,17 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Render pass auto renderCommandEncoder = GetRenderCommandEncoder(); + // Render pipeline state + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, 
m_state.m_activeFBO.m_attachmentsInfo, LatteGPUState.contextNew); + if (!renderPipelineState) + return; + + if (renderPipelineState != encoderState.m_renderPipelineState) + { + renderCommandEncoder->setRenderPipelineState(renderPipelineState); + encoderState.m_renderPipelineState = renderPipelineState; + } + // Depth stencil state // Disable depth write when there is no depth attachment @@ -1222,17 +1227,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // renderCommandEncoder->memoryBarrier(barrierBuffers.data(), barrierBuffers.size(), MTL::RenderStageVertex, MTL::RenderStageVertex); //} - // Render pipeline state - MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, LatteGPUState.contextNew); - if (!renderPipelineState) - return; - - if (renderPipelineState != encoderState.m_renderPipelineState) - { - renderCommandEncoder->setRenderPipelineState(renderPipelineState); - encoderState.m_renderPipelineState = renderPipelineState; - } - // Prepare streamout m_state.m_streamoutState.verticesPerInstance = count; LatteStreamout_PrepareDrawcall(count, instanceCount); From 4dcb858ab8acf6036c29132fcb2b9d1149e28f86 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 15 Oct 2024 20:03:26 +0200 Subject: [PATCH 218/368] check if pipeline is eligible for serializing --- .../Renderer/Metal/MetalPipelineCache.cpp | 12 +++++++++--- .../Renderer/Metal/MetalPipelineCompiler.cpp | 18 ++++++++++-------- .../Renderer/Metal/MetalPipelineCompiler.h | 6 +++--- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 4a202cc44..476417d3f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -10,6 +10,7 @@ #include "Cafe/HW/Latte/Common/RegisterSerializer.h" #include "Cafe/HW/Latte/Core/LatteShaderCache.h" #include "Cemu/FileCache/FileCache.h" +#include "Common/precompiled.h" #include "HW/Latte/Core/LatteShader.h" #include "HW/Latte/ISA/LatteReg.h" #include "HW/Latte/Renderer/Metal/LatteToMtl.h" @@ -47,10 +48,13 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte return pipeline; MetalPipelineCompiler compiler(m_mtlr); - compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + bool fbosMatch; + compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); pipeline = compiler.Compile(false, true, true); - AddCurrentStateToCache(hash); + // If FBOs don't match, it wouldn't be possible to reconstruct the pipeline from the cache + if (fbosMatch) + AddCurrentStateToCache(hash); return pipeline; } @@ -355,7 +359,9 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) // compile { MetalPipelineCompiler pp(m_mtlr); - pp.InitFromState(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr); + bool fbosMatch; + pp.InitFromState(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr, fbosMatch); + cemu_assert_debug(fbosMatch); //{ // s_spinlockSharedInternal.lock(); // delete lcr; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 54aa83b1b..9b865fb51 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -196,7 +196,7 @@ extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int 
g_compiled_shaders_async; template -void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) { // Rasterization bool rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); @@ -221,6 +221,7 @@ void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsIn } // Color attachments + fbosMatch = true; const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); @@ -238,6 +239,7 @@ void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsIn if (activeAttachmentsInfo.colorFormats[i] == Latte::E_GX2SURFFMT::INVALID_FORMAT) { colorAttachment->setWriteMask(MTL::ColorWriteMaskNone); + fbosMatch = false; continue; } @@ -307,7 +309,7 @@ MetalPipelineCompiler::~MetalPipelineCompiler() m_pipelineDescriptor->release(); } -void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) { // Check if the pipeline uses a geometry shader const LattePrimitiveMode 
primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); @@ -326,9 +328,9 @@ void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, c m_pixelShaderMtl = static_cast(pixelShader->shader); if (m_usesGeometryShader) - InitFromStateMesh(fetchShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + InitFromStateMesh(fetchShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); else - InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); } MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay) @@ -409,7 +411,7 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool return pipeline; } -void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) { // Render pipeline state MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); @@ -484,7 +486,7 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha vertexDescriptor->release(); } - SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); m_pipelineDescriptor = desc; @@ -545,12 +547,12 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* 
fetchSha */ } -void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) { // Render pipeline state MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); - SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); m_pipelineDescriptor = desc; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index f39b1fb5e..3b9731a3e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -11,7 +11,7 @@ class MetalPipelineCompiler MetalPipelineCompiler(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalPipelineCompiler(); - void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); MTL::RenderPipelineState* Compile(bool forceCompile, bool 
isRenderThread, bool showInOverlay); @@ -31,9 +31,9 @@ class MetalPipelineCompiler */ NS::Object* m_pipelineDescriptor; - void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); - void InitFromStateMesh(const LatteFetchShader* fetchShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + void InitFromStateMesh(const LatteFetchShader* fetchShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); //void TryLoadBinaryArchive(); }; From 7d9194a738abf9697c77167e37fc96c18c581254 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 15 Oct 2024 20:24:04 +0200 Subject: [PATCH 219/368] don't overshadow error --- src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 9b865fb51..94292e046 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -370,7 +370,6 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool desc->setMeshFunction(m_geometryShaderMtl->GetFunction()); desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); - NS::Error* error = nullptr; #ifdef CEMU_DEBUG_ASSERT 
desc->setLabel(GetLabel("Mesh render pipeline state", desc)); #endif @@ -384,7 +383,6 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool desc->setVertexFunction(m_vertexShaderMtl->GetFunction()); desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); - NS::Error* error = nullptr; #ifdef CEMU_DEBUG_ASSERT desc->setLabel(GetLabel("Render pipeline state", desc)); #endif From 8f2385a69071d5febbe1359001c34eb5e49cc927 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 16 Oct 2024 19:20:25 +0200 Subject: [PATCH 220/368] use lcr instead of contextNew --- src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 94292e046..910b354b1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -264,8 +264,8 @@ void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsIn if (blendControlReg.get_SEPARATE_ALPHA_BLEND()) { colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); - colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); - colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); + colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); + colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); } else { @@ -312,7 +312,7 @@ MetalPipelineCompiler::~MetalPipelineCompiler() void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& 
lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) { // Check if the pipeline uses a geometry shader - const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + const LattePrimitiveMode primitiveMode = static_cast(lcr.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE()); bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); m_usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); From 15eb6bb37f0011257d040791dac9b549baecbf4f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 19 Oct 2024 09:29:14 +0200 Subject: [PATCH 221/368] fix: pipeline cache with mesh shaders --- .../Renderer/Metal/MetalPipelineCache.cpp | 13 +++-- .../Renderer/Metal/MetalPipelineCompiler.cpp | 48 ++++++++++--------- .../Renderer/Metal/MetalPipelineCompiler.h | 1 + 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 476417d3f..214c822ff 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -43,19 +43,21 @@ MetalPipelineCache::~MetalPipelineCache() MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { uint64 hash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); - auto& pipeline = m_pipelineCache[hash]; - if (pipeline) - return pipeline; + auto it = m_pipelineCache.find(hash); + if (it != m_pipelineCache.end()) + return 
it->second; MetalPipelineCompiler compiler(m_mtlr); bool fbosMatch; compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); - pipeline = compiler.Compile(false, true, true); + MTL::RenderPipelineState* pipeline = compiler.Compile(false, true, true); // If FBOs don't match, it wouldn't be possible to reconstruct the pipeline from the cache if (fbosMatch) AddCurrentStateToCache(hash); + m_pipelineCache.insert({hash, pipeline}); + return pipeline; } @@ -355,6 +357,9 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) MetalAttachmentsInfo attachmentsInfo(*lcr, pixelShader); + // TODO: this shouldn't probably be called directly + LatteShader_UpdatePSInputs(lcr->GetRawView()); + MTL::RenderPipelineState* pipeline = nullptr; // compile { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 910b354b1..d46358853 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -10,8 +10,7 @@ #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Core/LatteShader.h" -#include "HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" -#include "HW/Latte/Renderer/RendererShader.h" + #include extern std::atomic_int g_compiling_pipelines; @@ -196,23 +195,8 @@ extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; template -void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) +void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, bool rasterizationEnabled, const LatteContextRegister& lcr, bool& fbosMatch) { - // 
Rasterization - bool rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - - // HACK - // TODO: include this in the hash? - if (!lcr.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) - rasterizationEnabled = true; - - // Culling both front and back faces effectively disables rasterization - const auto& polygonControlReg = lcr.PA_SU_SC_MODE_CNTL; - uint32 cullFront = polygonControlReg.get_CULL_FRONT(); - uint32 cullBack = polygonControlReg.get_CULL_BACK(); - if (cullFront && cullBack) - rasterizationEnabled = false; - // TODO: check if the pixel shader is valid as well? if (!rasterizationEnabled/* || !pixelShaderMtl*/) { @@ -317,6 +301,21 @@ void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, c m_usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + // Rasterization + m_rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // HACK + // TODO: include this in the hash? + if (!lcr.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + m_rasterizationEnabled = true; + + // Culling both front and back faces effectively disables rasterization + const auto& polygonControlReg = lcr.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + m_rasterizationEnabled = false; + // Shaders m_vertexShaderMtl = static_cast(vertexShader->shader); if (geometryShader) @@ -368,7 +367,8 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool // Shaders desc->setObjectFunction(m_vertexShaderMtl->GetFunction()); desc->setMeshFunction(m_geometryShaderMtl->GetFunction()); - desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); + if (m_rasterizationEnabled) + desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); #ifdef CEMU_DEBUG_ASSERT desc->setLabel(GetLabel("Mesh render pipeline state", desc)); @@ -381,7 +381,8 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool 
forceCompile, bool // Shaders desc->setVertexFunction(m_vertexShaderMtl->GetFunction()); - desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); + if (m_rasterizationEnabled) + desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); #ifdef CEMU_DEBUG_ASSERT desc->setLabel(GetLabel("Render pipeline state", desc)); @@ -397,7 +398,8 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); error->release(); } - else if (showInOverlay) + + if (showInOverlay) { if (isRenderThread) g_compiling_pipelines_syncTimeSum += creationDuration; @@ -484,7 +486,7 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha vertexDescriptor->release(); } - SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr, fbosMatch); m_pipelineDescriptor = desc; @@ -550,7 +552,7 @@ void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShade // Render pipeline state MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); - SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr, fbosMatch); m_pipelineDescriptor = desc; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index 3b9731a3e..5965c764a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -22,6 +22,7 @@ class MetalPipelineCompiler class RendererShaderMtl* m_geometryShaderMtl; class RendererShaderMtl* m_pixelShaderMtl; bool m_usesGeometryShader; + bool m_rasterizationEnabled; /* 
std::map m_pipelineCache; From 295a6ed9fd07f3791c2a71db5ae1776775a4a5bf Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 19 Oct 2024 09:39:55 +0200 Subject: [PATCH 222/368] only add pipeline to cache if compilation was attempted --- .../HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 11 ++++++++--- .../HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp | 5 ++++- .../HW/Latte/Renderer/Metal/MetalPipelineCompiler.h | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 214c822ff..07277e68f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -50,13 +50,16 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte MetalPipelineCompiler compiler(m_mtlr); bool fbosMatch; compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); - MTL::RenderPipelineState* pipeline = compiler.Compile(false, true, true); + bool attemptedCompilation = false; + MTL::RenderPipelineState* pipeline = compiler.Compile(false, true, true, attemptedCompilation); // If FBOs don't match, it wouldn't be possible to reconstruct the pipeline from the cache if (fbosMatch) AddCurrentStateToCache(hash); - m_pipelineCache.insert({hash, pipeline}); + // Place the pipeline to the cache if the compilation was at least attempted + if (attemptedCompilation) + m_pipelineCache.insert({hash, pipeline}); return pipeline; } @@ -374,7 +377,9 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) // s_spinlockSharedInternal.unlock(); // return; //} - pipeline = pp.Compile(true, true, false); + bool attemptedCompilation = false; + pipeline = pp.Compile(true, true, false, attemptedCompilation); + cemu_assert_debug(attemptedCompilation); // destroy pp early } diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index d46358853..73b86fe91 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -332,7 +332,7 @@ void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, c InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); } -MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay) +MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay, bool& attemptedCompilation) { if (forceCompile) { @@ -408,6 +408,9 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool g_compiling_pipelines++; } + // Inform the pipeline cache that compilation was at least attempted + attemptedCompilation = true; + return pipeline; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index 5965c764a..e40675559 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -13,7 +13,7 @@ class MetalPipelineCompiler void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); - MTL::RenderPipelineState* Compile(bool forceCompile, bool isRenderThread, bool showInOverlay); + MTL::RenderPipelineState* Compile(bool forceCompile, bool isRenderThread, bool showInOverlay, bool& attemptedCompilation); private: class MetalRenderer* m_mtlr; From 
17507157914a4f0f30eb7f1454d9ec845ea7610e Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 19 Oct 2024 15:32:45 +0200 Subject: [PATCH 223/368] retrieve ps input table without using global variable --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 35 +++++----- src/Cafe/HW/Latte/Core/LatteShader.h | 3 +- .../Renderer/Metal/MetalPipelineCache.cpp | 5 +- .../Renderer/Metal/MetalPipelineCompiler.cpp | 67 +++++++++---------- 4 files changed, 54 insertions(+), 56 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index bc1279c32..9e3e6b1f6 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -209,11 +209,9 @@ void LatteShader_free(LatteDecompilerShader* shader) delete shader; } -// both vertex and geometry/pixel shader depend on PS inputs -// we prepare the PS import info in advance -void LatteShader_UpdatePSInputs(uint32* contextRegisters) +void LatteShader_CreatePSInputTable(LatteShaderPSInputTable* psInputTable, uint32* contextRegisters) { - // PS control + // PS control uint32 psControl0 = contextRegisters[mmSPI_PS_IN_CONTROL_0]; uint32 spi0_positionEnable = (psControl0 >> 8) & 1; uint32 spi0_positionCentroid = (psControl0 >> 9) & 1; @@ -242,12 +240,12 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters) { key += std::rotr(spi0_paramGen, 7); key += std::rotr(spi0_paramGenAddr, 3); - _activePSImportTable.paramGen = spi0_paramGen; - _activePSImportTable.paramGenGPR = spi0_paramGenAddr; + psInputTable->paramGen = spi0_paramGen; + psInputTable->paramGenGPR = spi0_paramGenAddr; } else { - _activePSImportTable.paramGen = 0; + psInputTable->paramGen = 0; } // semantic imports from vertex shader @@ -281,9 +279,9 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters) key = std::rotl(key, 7); if (spi0_positionEnable && f == spi0_positionAddr) { - _activePSImportTable.import[f].semanticId = LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION; - 
_activePSImportTable.import[f].isFlat = false; - _activePSImportTable.import[f].isNoPerspective = false; + psInputTable->import[f].semanticId = LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION; + psInputTable->import[f].isFlat = false; + psInputTable->import[f].isNoPerspective = false; key += (uint64)0x33; } else @@ -296,13 +294,20 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters) semanticMask[psSemanticId >> 3] |= (1 << (psSemanticId & 7)); #endif - _activePSImportTable.import[f].semanticId = psSemanticId; - _activePSImportTable.import[f].isFlat = (psInputControl&(1 << 10)) != 0; - _activePSImportTable.import[f].isNoPerspective = (psInputControl&(1 << 12)) != 0; + psInputTable->import[f].semanticId = psSemanticId; + psInputTable->import[f].isFlat = (psInputControl&(1 << 10)) != 0; + psInputTable->import[f].isNoPerspective = (psInputControl&(1 << 12)) != 0; } } - _activePSImportTable.key = key; - _activePSImportTable.count = numPSInputs; + psInputTable->key = key; + psInputTable->count = numPSInputs; +} + +// both vertex and geometry/pixel shader depend on PS inputs +// we prepare the PS import info in advance +void LatteShader_UpdatePSInputs(uint32* contextRegisters) +{ + LatteShader_CreatePSInputTable(&_activePSImportTable, contextRegisters); } void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compileAsync) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.h b/src/Cafe/HW/Latte/Core/LatteShader.h index f8dc6d1a3..85d53b01b 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.h +++ b/src/Cafe/HW/Latte/Core/LatteShader.h @@ -84,6 +84,7 @@ struct LatteShaderPSInputTable } }; +void LatteShader_CreatePSInputTable(LatteShaderPSInputTable* psInputTable, uint32* contextRegisters); void LatteShader_UpdatePSInputs(uint32* contextRegisters); LatteShaderPSInputTable* LatteSHRC_GetPSInputTable(); @@ -126,4 +127,4 @@ void LatteShaderCache_writeSeparableGeometryShader(uint64 shaderBaseHash, uint64 void LatteShaderCache_writeSeparablePixelShader(uint64 
shaderBaseHash, uint64 shaderAuxHash, uint8* pixelShader, uint32 pixelShaderSize, uint32* contextRegisters, bool usesGeometryShader); // todo - refactor this -sint32 LatteDecompiler_getTextureSamplerBaseIndex(LatteConst::ShaderType shaderType); \ No newline at end of file +sint32 LatteDecompiler_getTextureSamplerBaseIndex(LatteConst::ShaderType shaderType); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 07277e68f..9e49959c9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -54,7 +54,7 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte MTL::RenderPipelineState* pipeline = compiler.Compile(false, true, true, attemptedCompilation); // If FBOs don't match, it wouldn't be possible to reconstruct the pipeline from the cache - if (fbosMatch) + if (pipeline && fbosMatch) AddCurrentStateToCache(hash); // Place the pipeline to the cache if the compilation was at least attempted @@ -360,9 +360,6 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) MetalAttachmentsInfo attachmentsInfo(*lcr, pixelShader); - // TODO: this shouldn't probably be called directly - LatteShader_UpdatePSInputs(lcr->GetRawView()); - MTL::RenderPipelineState* pipeline = nullptr; // compile { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 73b86fe91..a8bce2913 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -17,18 +17,18 @@ extern std::atomic_int g_compiling_pipelines; extern std::atomic_int g_compiling_pipelines_async; extern std::atomic_uint64_t g_compiling_pipelines_syncTimeSum; -static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, 
LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) +static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) { auto parameterMask = vertexShader->outputParameterMask; for (uint32 i = 0; i < 32; i++) { if ((parameterMask & (1 << i)) == 0) continue; - sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); if (vsSemanticId < 0) continue; // make sure PS has matching input - if (!psInputTable->hasPSImportForSemanticId(vsSemanticId)) + if (!psInputTable.hasPSImportForSemanticId(vsSemanticId)) continue; gsSrc.append(fmt::format("out.passParameterSem{} = objectPayload.vertexOut[{}].passParameterSem{};\r\n", vsSemanticId, vIdx, vsSemanticId)); } @@ -36,18 +36,18 @@ static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteD gsSrc.append(fmt::format("mesh.set_vertex({}, out);\r\n", vIdx)); } -static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, const char* variant, const LatteContextRegister& latteRegister) +static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, const char* variant, const LatteContextRegister& latteRegister) { auto parameterMask = vertexShader->outputParameterMask; for (uint32 i = 0; i < 32; i++) { if ((parameterMask & (1 << i)) == 0) continue; - sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); if (vsSemanticId < 0) continue; // make sure PS has 
matching input - if (!psInputTable->hasPSImportForSemanticId(vsSemanticId)) + if (!psInputTable.hasPSImportForSemanticId(vsSemanticId)) continue; gsSrc.append(fmt::format("out.passParameterSem{} = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", vsSemanticId, variant, vsSemanticId, vsSemanticId, vsSemanticId)); } @@ -55,7 +55,7 @@ static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const Lat gsSrc.append(fmt::format("mesh.set_vertex(3, out);\r\n")); } -static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 p0, sint32 p1, sint32 p2, sint32 p3, const char* variant, const LatteContextRegister& latteRegister) +static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, sint32 p0, sint32 p1, sint32 p2, sint32 p3, const char* variant, const LatteContextRegister& latteRegister) { sint32 pList[4] = { p0, p1, p2, p3 }; for (sint32 i = 0; i < 4; i++) @@ -79,7 +79,8 @@ static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer gsSrc.append("#include \r\n"); gsSrc.append("using namespace metal;\r\n"); - LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + LatteShaderPSInputTable psInputTable; + LatteShader_CreatePSInputTable(&psInputTable, latteRegister.GetRawView()); // inputs & outputs std::string vertexOutDefinition = "struct VertexOut {\r\n"; @@ -87,35 +88,29 @@ static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer std::string geometryOutDefinition = "struct GeometryOut {\r\n"; geometryOutDefinition += "float4 position [[position]];\r\n"; auto parameterMask = vertexShader->outputParameterMask; - for (sint32 f = 0; f < 2; f++) + for (uint32 i = 0; i < 32; i++) { - for (uint32 i = 0; i < 
32; i++) - { - if ((parameterMask & (1 << i)) == 0) - continue; - sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); - if (vsSemanticId < 0) - continue; - auto psImport = psInputTable->getPSImportBySemanticId(vsSemanticId); - if (psImport == nullptr) - continue; - - if (f == 0) - { - vertexOutDefinition += fmt::format("float4 passParameterSem{};\r\n", vsSemanticId); - } - else - { - geometryOutDefinition += fmt::format("float4 passParameterSem{}", vsSemanticId); - - geometryOutDefinition += fmt::format(" [[user(locn{})]]", psInputTable->getPSImportLocationBySemanticId(vsSemanticId)); - if (psImport->isFlat) - geometryOutDefinition += " [[flat]]"; - if (psImport->isNoPerspective) - geometryOutDefinition += " [[center_no_perspective]]"; - geometryOutDefinition += ";\r\n"; - } - } + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + auto psImport = psInputTable.getPSImportBySemanticId(vsSemanticId); + if (psImport == nullptr) + continue; + + // VertexOut + vertexOutDefinition += fmt::format("float4 passParameterSem{};\r\n", vsSemanticId); + + // GeometryOut + geometryOutDefinition += fmt::format("float4 passParameterSem{}", vsSemanticId); + + geometryOutDefinition += fmt::format(" [[user(locn{})]]", psInputTable.getPSImportLocationBySemanticId(vsSemanticId)); + if (psImport->isFlat) + geometryOutDefinition += " [[flat]]"; + if (psImport->isNoPerspective) + geometryOutDefinition += " [[center_no_perspective]]"; + geometryOutDefinition += ";\r\n"; } vertexOutDefinition += "};\r\n"; geometryOutDefinition += "};\r\n"; From b8021b642d31d0ff7514460690a84e3959c010da Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 24 Oct 2024 17:15:24 +0200 Subject: [PATCH 224/368] fix: incorrect texture usages --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 5 +---- 
src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 ++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 142870501..c6a5012bd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -65,7 +65,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM } else if (textureType == MTL::TextureTypeCube) { - // Do notjing + // Do nothing } else if (textureType == MTL::TextureTypeCubeArray) { @@ -81,13 +81,10 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsagePixelFormatView; if (!Latte::IsCompressedFormat(format)) - { usage |= MTL::TextureUsageRenderTarget; - } desc->setUsage(usage); m_texture = mtlRenderer->GetDevice()->newTexture(desc); - desc->release(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 2b420e6e2..e560c2c33 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -23,7 +23,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" -#include "HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" #include "config/CemuConfig.h" #define IMGUI_IMPL_METAL_CPP @@ -70,6 +69,7 @@ MetalRenderer::MetalRenderer() MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init(); textureDescriptor->setTextureType(MTL::TextureType1D); textureDescriptor->setWidth(1); + textureDescriptor->setUsage(MTL::TextureUsageShaderRead); m_nullTexture1D = m_device->newTexture(textureDescriptor); #ifdef CEMU_DEBUG_ASSERT m_nullTexture1D->setLabel(GetLabel("Null texture 1D", m_nullTexture1D)); @@ 
-77,6 +77,7 @@ MetalRenderer::MetalRenderer() textureDescriptor->setTextureType(MTL::TextureType2D); textureDescriptor->setHeight(1); + textureDescriptor->setUsage(MTL::TextureUsageShaderRead | MTL::TextureUsageRenderTarget); m_nullTexture2D = m_device->newTexture(textureDescriptor); #ifdef CEMU_DEBUG_ASSERT m_nullTexture2D->setLabel(GetLabel("Null texture 2D", m_nullTexture2D)); From 665eb23e4a48d480cf5cbc08a9e77350ddf85e36 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 28 Oct 2024 16:11:47 +0100 Subject: [PATCH 225/368] fix: incorrect pipeline compilation time report --- .../Renderer/Metal/MetalPipelineCache.cpp | 2 - .../Latte/Renderer/Metal/MetalPipelineCache.h | 1 - .../Renderer/Metal/MetalPipelineCompiler.cpp | 132 +----------------- .../Renderer/Metal/MetalPipelineCompiler.h | 8 -- 4 files changed, 1 insertion(+), 142 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 9e49959c9..bc77e00f3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -35,9 +35,7 @@ MetalPipelineCache::MetalPipelineCache(class MetalRenderer* metalRenderer) : m_m MetalPipelineCache::~MetalPipelineCache() { for (auto& [key, value] : m_pipelineCache) - { value->release(); - } } MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index be26bdee0..b1307568d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -4,7 +4,6 @@ #include "util/helpers/ConcurrentQueue.h" #include "util/helpers/fspinlock.h" -// TODO: binary archives class MetalPipelineCache { public: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index a8bce2913..6dd6087b1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -386,7 +386,7 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool } auto end = std::chrono::high_resolution_clock::now(); - auto creationDuration = std::chrono::duration_cast(end - start).count(); + auto creationDuration = std::chrono::duration_cast(end - start).count(); if (error) { @@ -479,7 +479,6 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha layout->setStride(bufferStride); } - // TODO: don't always set the vertex descriptor? 
desc->setVertexDescriptor(vertexDescriptor); vertexDescriptor->release(); } @@ -487,62 +486,6 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr, fbosMatch); m_pipelineDescriptor = desc; - - //TryLoadBinaryArchive(); - - // Load binary - /* - if (m_binaryArchive) - { - NS::Object* binArchives[] = {m_binaryArchive}; - auto binaryArchives = NS::Array::alloc()->init(binArchives, 1); - desc->setBinaryArchives(binaryArchives); - binaryArchives->release(); - } - */ - - /* - NS::Error* error = nullptr; -#ifdef CEMU_DEBUG_ASSERT - desc->setLabel(GetLabel("Cached render pipeline state", desc)); -#endif - pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionFailOnBinaryArchiveMiss, nullptr, &error); - - // Pipeline wasn't found in the binary archive, we need to compile it - if (error) - { - desc->setBinaryArchives(nullptr); - - error->release(); - error = nullptr; -#ifdef CEMU_DEBUG_ASSERT - desc->setLabel(GetLabel("New render pipeline state", desc)); -#endif - pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error); - if (error) - { - cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); - error->release(); - } - else - { - // Save binary - if (m_binaryArchive) - { - NS::Error* error = nullptr; - m_binaryArchive->addRenderPipelineFunctions(desc, &error); - if (error) - { - cemuLog_log(LogType::Force, "error saving render pipeline functions: {}", error->localizedDescription()->utf8String()); - error->release(); - } - } - } - } - desc->release(); - - return pipeline; - */ } void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) @@ -553,77 +496,4 @@ void 
MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShade SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr, fbosMatch); m_pipelineDescriptor = desc; - - //TryLoadBinaryArchive(); - - // Load binary - // TODO: no binary archives? :( - - /* - NS::Error* error = nullptr; -#ifdef CEMU_DEBUG_ASSERT - desc->setLabel(GetLabel("Mesh pipeline state", desc)); -#endif - pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); - desc->release(); - if (error) - { - cemuLog_log(LogType::Force, "error creating mesh render pipeline state: {}", error->localizedDescription()->utf8String()); - error->release(); - } - - return pipeline; - */ -} - -/* -void MetalPipelineCache::TryLoadBinaryArchive() -{ - if (m_binaryArchive || s_cacheTitleId == INVALID_TITLE_ID) - return; - - // GPU name - const char* deviceName1 = m_mtlr->GetDevice()->name()->utf8String(); - std::string deviceName; - deviceName.assign(deviceName1); - - // Replace spaces with underscores - for (auto& c : deviceName) - { - if (c == ' ') - c = '_'; - } - - // OS version - auto osVersion = NS::ProcessInfo::processInfo()->operatingSystemVersion(); - - // Precompiled binaries cannot be shared between different devices or OS versions - const std::string cacheFilename = fmt::format("{:016x}_mtl_pipelines.bin", s_cacheTitleId); - const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}/{}-{}-{}/{}", deviceName, osVersion.majorVersion, osVersion.minorVersion, osVersion.patchVersion, cacheFilename); - - // Create the directory if it doesn't exist - std::filesystem::create_directories(cachePath.parent_path()); - - m_binaryArchiveURL = NS::URL::fileURLWithPath(ToNSString((const char*)cachePath.generic_u8string().c_str())); - - MTL::BinaryArchiveDescriptor* desc = MTL::BinaryArchiveDescriptor::alloc()->init(); - desc->setUrl(m_binaryArchiveURL); - - NS::Error* error = nullptr; - 
m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error); - if (error) - { - desc->setUrl(nullptr); - - error->release(); - error = nullptr; - m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error); - if (error) - { - cemuLog_log(LogType::Force, "failed to create binary archive: {}", error->localizedDescription()->utf8String()); - error->release(); - } - } - desc->release(); } -*/ diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index e40675559..e3fab932a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -24,17 +24,9 @@ class MetalPipelineCompiler bool m_usesGeometryShader; bool m_rasterizationEnabled; - /* - std::map m_pipelineCache; - - NS::URL* m_binaryArchiveURL; - MTL::BinaryArchive* m_binaryArchive; - */ NS::Object* m_pipelineDescriptor; void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); void InitFromStateMesh(const LatteFetchShader* fetchShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); - - //void TryLoadBinaryArchive(); }; From bca32c43d0023ae298fc1a3e6235a0f50f3932cb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 28 Oct 2024 16:38:17 +0100 Subject: [PATCH 226/368] refactor the way pipelines are stored --- .../Renderer/Metal/MetalPipelineCache.cpp | 53 ++++++++----------- .../Latte/Renderer/Metal/MetalPipelineCache.h | 4 +- .../Renderer/Metal/MetalPipelineCompiler.cpp | 13 +++-- .../Renderer/Metal/MetalPipelineCompiler.h | 10 +++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 11 ++-- 5 files changed, 45 insertions(+), 46 
deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index bc77e00f3..58c432f5f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -15,6 +15,7 @@ #include "HW/Latte/ISA/LatteReg.h" #include "HW/Latte/Renderer/Metal/LatteToMtl.h" #include "HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" +#include "HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" #include "Metal/MTLRenderPipeline.hpp" #include "util/helpers/helpers.h" #include "config/ActiveSettings.h" @@ -34,32 +35,32 @@ MetalPipelineCache::MetalPipelineCache(class MetalRenderer* metalRenderer) : m_m MetalPipelineCache::~MetalPipelineCache() { - for (auto& [key, value] : m_pipelineCache) - value->release(); + for (auto& [key, pipelineObj] : m_pipelineCache) + { + pipelineObj->m_pipeline->release(); + delete pipelineObj; + } } -MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { uint64 hash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); - auto it = m_pipelineCache.find(hash); - if (it != m_pipelineCache.end()) - return it->second; + PipelineObject*& pipelineObj = m_pipelineCache[hash]; + if 
(pipelineObj) + return pipelineObj; + + pipelineObj = new PipelineObject(); - MetalPipelineCompiler compiler(m_mtlr); + MetalPipelineCompiler compiler(m_mtlr, *pipelineObj); bool fbosMatch; compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); - bool attemptedCompilation = false; - MTL::RenderPipelineState* pipeline = compiler.Compile(false, true, true, attemptedCompilation); + compiler.Compile(false, true, true); // If FBOs don't match, it wouldn't be possible to reconstruct the pipeline from the cache - if (pipeline && fbosMatch) + if (fbosMatch) AddCurrentStateToCache(hash); - // Place the pipeline to the cache if the compilation was at least attempted - if (attemptedCompilation) - m_pipelineCache.insert({hash, pipeline}); - - return pipeline; + return pipelineObj; } uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) @@ -358,32 +359,24 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) MetalAttachmentsInfo attachmentsInfo(*lcr, pixelShader); - MTL::RenderPipelineState* pipeline = nullptr; + PipelineObject* pipelineObject = new PipelineObject(); + // compile { - MetalPipelineCompiler pp(m_mtlr); + MetalPipelineCompiler pp(m_mtlr, *pipelineObject); bool fbosMatch; pp.InitFromState(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr, fbosMatch); cemu_assert_debug(fbosMatch); - //{ - // s_spinlockSharedInternal.lock(); - // delete lcr; - // delete cachedPipeline; - // s_spinlockSharedInternal.unlock(); - // return; - //} - bool attemptedCompilation = false; - pipeline = pp.Compile(true, true, 
false, attemptedCompilation); - cemu_assert_debug(attemptedCompilation); + pp.Compile(true, true, false); // destroy pp early } - // on success, calculate pipeline hash and flag as present in cache - if (pipeline) + // on success, cache the pipeline + if (pipelineObject->m_pipeline) { uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr); m_pipelineCacheLock.lock(); - m_pipelineCache[pipelineStateHash] = pipeline; + m_pipelineCache[pipelineStateHash] = pipelineObject; m_pipelineCacheLock.unlock(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index b1307568d..f4f5e9635 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -12,7 +12,7 @@ class MetalPipelineCache MetalPipelineCache(class MetalRenderer* metalRenderer); ~MetalPipelineCache(); - MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + PipelineObject* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); // Cache loading uint32 BeginLoading(uint64 cacheTitleId); // returns count of pipelines stored in cache @@ -27,7 +27,7 @@ class MetalPipelineCache private: class MetalRenderer* m_mtlr; - std::map m_pipelineCache; + std::map m_pipelineCache; FSpinlock 
m_pipelineCacheLock; std::thread* m_pipelineCacheStoreThread; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 6dd6087b1..611d190dd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -327,7 +327,7 @@ void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, c InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); } -MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay, bool& attemptedCompilation) +bool MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay) { if (forceCompile) { @@ -343,11 +343,11 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool { // fail early if some shader stages are not compiled if (m_vertexShaderMtl && !m_vertexShaderMtl->IsCompiled()) - return nullptr; + return false; if (m_geometryShaderMtl && !m_geometryShaderMtl->IsCompiled()) - return nullptr; + return false; if (m_pixelShaderMtl && !m_pixelShaderMtl->IsCompiled()) - return nullptr; + return false; } // Compile @@ -403,10 +403,9 @@ MTL::RenderPipelineState* MetalPipelineCompiler::Compile(bool forceCompile, bool g_compiling_pipelines++; } - // Inform the pipeline cache that compilation was at least attempted - attemptedCompilation = true; + m_pipelineObj.m_pipeline = pipeline; - return pipeline; + return true; } void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index e3fab932a..d762d8025 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -5,18 +5,24 @@ #include "Cafe/HW/Latte/ISA/LatteReg.h" #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +struct PipelineObject +{ + MTL::RenderPipelineState* m_pipeline = nullptr; +}; + class MetalPipelineCompiler { public: - MetalPipelineCompiler(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + MetalPipelineCompiler(class MetalRenderer* metalRenderer, PipelineObject& pipelineObj) : m_mtlr{metalRenderer}, m_pipelineObj{pipelineObj} {} ~MetalPipelineCompiler(); void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); - MTL::RenderPipelineState* Compile(bool forceCompile, bool isRenderThread, bool showInOverlay, bool& attemptedCompilation); + bool Compile(bool forceCompile, bool isRenderThread, bool showInOverlay); private: class MetalRenderer* m_mtlr; + PipelineObject& m_pipelineObj; class RendererShaderMtl* m_vertexShaderMtl; class RendererShaderMtl* m_geometryShaderMtl; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index e560c2c33..b34747441 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -23,6 +23,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" #include "config/CemuConfig.h" #define IMGUI_IMPL_METAL_CPP @@ 
-1000,14 +1001,14 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto renderCommandEncoder = GetRenderCommandEncoder(); // Render pipeline state - MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, LatteGPUState.contextNew); - if (!renderPipelineState) + PipelineObject* pipelineObj = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, LatteGPUState.contextNew); + if (!pipelineObj->m_pipeline) return; - if (renderPipelineState != encoderState.m_renderPipelineState) + if (pipelineObj->m_pipeline != encoderState.m_renderPipelineState) { - renderCommandEncoder->setRenderPipelineState(renderPipelineState); - encoderState.m_renderPipelineState = renderPipelineState; + renderCommandEncoder->setRenderPipelineState(pipelineObj->m_pipeline); + encoderState.m_renderPipelineState = pipelineObj->m_pipeline; } // Depth stencil state From 4e3f94e87003d6061c2958f76288dfe4bae9efa9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 28 Oct 2024 17:32:43 +0100 Subject: [PATCH 227/368] compile pipelines async --- .../Renderer/Metal/MetalPipelineCache.cpp | 96 ++++++++++++++++--- 1 file changed, 84 insertions(+), 12 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 58c432f5f..73951cf8c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -1,26 +1,78 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" -#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" 
+#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cafe/HW/Latte/Core/LatteConst.h" -#include "Cafe/HW/Latte/Core/LatteCachedFBO.h" #include "Cafe/HW/Latte/Common/RegisterSerializer.h" #include "Cafe/HW/Latte/Core/LatteShaderCache.h" #include "Cemu/FileCache/FileCache.h" #include "Common/precompiled.h" -#include "HW/Latte/Core/LatteShader.h" -#include "HW/Latte/ISA/LatteReg.h" -#include "HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" -#include "HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" -#include "Metal/MTLRenderPipeline.hpp" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/ISA/LatteReg.h" #include "util/helpers/helpers.h" #include "config/ActiveSettings.h" + #include +static bool g_compilePipelineThreadInit{false}; +static std::mutex g_compilePipelineMutex; +static std::condition_variable g_compilePipelineCondVar; +static std::queue g_compilePipelineRequests; + +static void compileThreadFunc(sint32 threadIndex) +{ + SetThreadName("compilePl"); + + // one thread runs at normal priority while the others run at lower priority + if(threadIndex != 0) + ; // TODO: set thread priority + + while (true) + { + std::unique_lock lock(g_compilePipelineMutex); + while (g_compilePipelineRequests.empty()) + g_compilePipelineCondVar.wait(lock); + + MetalPipelineCompiler* request = g_compilePipelineRequests.front(); + + g_compilePipelineRequests.pop(); + + lock.unlock(); + + request->Compile(true, false, true); + delete request; + } +} + +static void initCompileThread() +{ + uint32 numCompileThreads; + + uint32 cpuCoreCount = GetPhysicalCoreCount(); + if (cpuCoreCount <= 2) + numCompileThreads = 1; + else + numCompileThreads = 2 + (cpuCoreCount - 3); // 2 plus one additionally for every extra core above 3 + + numCompileThreads = std::min(numCompileThreads, 8u); // cap at 8 + + for (uint32 i = 0; i < 
numCompileThreads; i++) + { + std::thread compileThread(compileThreadFunc, i); + compileThread.detach(); + } +} + +static void queuePipeline(MetalPipelineCompiler* v) +{ + std::unique_lock lock(g_compilePipelineMutex); + g_compilePipelineRequests.push(std::move(v)); + lock.unlock(); + g_compilePipelineCondVar.notify_one(); +} + MetalPipelineCache* g_mtlPipelineCache = nullptr; MetalPipelineCache& MetalPipelineCache::GetInstance() @@ -51,10 +103,30 @@ PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShade pipelineObj = new PipelineObject(); - MetalPipelineCompiler compiler(m_mtlr, *pipelineObj); + MetalPipelineCompiler* compiler = new MetalPipelineCompiler(m_mtlr, *pipelineObj); bool fbosMatch; - compiler.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); - compiler.Compile(false, true, true); + compiler->InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); + + bool allowAsyncCompile = false; + // TODO: uncomment + if (GetConfig().async_compile) + allowAsyncCompile = true;//IsAsyncPipelineAllowed(indexCount); + + if (allowAsyncCompile) + { + if (!g_compilePipelineThreadInit) + { + initCompileThread(); + g_compilePipelineThreadInit = true; + } + + queuePipeline(compiler); + } + else + { + compiler->Compile(false, true, true); + delete compiler; + } // If FBOs don't match, it wouldn't be possible to reconstruct the pipeline from the cache if (fbosMatch) From 7906733bfa9bd62f1fe7b1d0ece967a32adc9587 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 28 Oct 2024 19:02:44 +0100 Subject: [PATCH 228/368] don't compile certain pipelines async --- .../Renderer/Metal/MetalPipelineCache.cpp | 25 ++++++++++++++++--- .../Latte/Renderer/Metal/MetalPipelineCache.h | 3 ++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 73951cf8c..101b6d688 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -73,6 +73,23 @@ static void queuePipeline(MetalPipelineCompiler* v) g_compilePipelineCondVar.notify_one(); } +// make a guess if a pipeline is not essential +// non-essential means that skipping these drawcalls shouldn't lead to permanently corrupted graphics +bool IsAsyncPipelineAllowed(const MetalAttachmentsInfo& attachmentsInfo, Vector2i extend, uint32 indexCount) +{ + if (extend.x == 1600 && extend.y == 1600) + return false; // Splatoon ink mechanics use 1600x1600 R8 and R8G8 framebuffers, this resolution is rare enough that we can just blacklist it globally + + if (attachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT) + return true; // aggressive filter but seems to work well so far + + // small index count (3,4,5,6) is often associated with full-viewport quads (which are considered essential due to often being used to generate persistent textures) + if (indexCount <= 6) + return false; + + return true; +} + MetalPipelineCache* g_mtlPipelineCache = nullptr; MetalPipelineCache& MetalPipelineCache::GetInstance() @@ -94,7 +111,7 @@ MetalPipelineCache::~MetalPipelineCache() } } -PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const 
MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, Vector2i extend, uint32 indexCount, const LatteContextRegister& lcr) { uint64 hash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); PipelineObject*& pipelineObj = m_pipelineCache[hash]; @@ -108,9 +125,8 @@ PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShade compiler->InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); bool allowAsyncCompile = false; - // TODO: uncomment if (GetConfig().async_compile) - allowAsyncCompile = true;//IsAsyncPipelineAllowed(indexCount); + allowAsyncCompile = IsAsyncPipelineAllowed(activeAttachmentsInfo, extend, indexCount); if (allowAsyncCompile) { @@ -124,7 +140,8 @@ PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShade } else { - compiler->Compile(false, true, true); + // Also force compile to ensure that the pipeline is ready + cemu_assert_debug(compiler->Compile(true, true, true)); delete compiler; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index f4f5e9635..d49ec6a25 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -3,6 +3,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" #include "util/helpers/ConcurrentQueue.h" #include "util/helpers/fspinlock.h" +#include "util/math/vector2.h" class MetalPipelineCache { @@ -12,7 +13,7 @@ class MetalPipelineCache MetalPipelineCache(class MetalRenderer* metalRenderer); ~MetalPipelineCache(); - PipelineObject* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const 
class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + PipelineObject* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, Vector2i extend, uint32 indexCount, const LatteContextRegister& lcr); // Cache loading uint32 BeginLoading(uint64 cacheTitleId); // returns count of pipelines stored in cache diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b34747441..dc4244ec0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1001,7 +1001,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 auto renderCommandEncoder = GetRenderCommandEncoder(); // Render pipeline state - PipelineObject* pipelineObj = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, LatteGPUState.contextNew); + PipelineObject* pipelineObj = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, m_state.m_activeFBO.m_fbo->m_size, count, LatteGPUState.contextNew); if (!pipelineObj->m_pipeline) return; From 85db0dc4685bf4b4d9bb761e4232b5e32cb577c1 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 29 Oct 2024 07:44:47 +0100 Subject: [PATCH 229/368] cache all pipelines --- .../Renderer/Metal/MetalPipelineCache.cpp | 32 +++++++++++++------ .../Latte/Renderer/Metal/MetalPipelineCache.h | 2 +- .../Renderer/Metal/MetalPipelineCompiler.cpp | 18 +++++------ 
.../Renderer/Metal/MetalPipelineCompiler.h | 6 ++-- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 101b6d688..d49060fbd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -11,6 +11,7 @@ #include "Common/precompiled.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/ISA/LatteReg.h" +#include "HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" #include "util/helpers/helpers.h" #include "config/ActiveSettings.h" @@ -121,8 +122,7 @@ PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShade pipelineObj = new PipelineObject(); MetalPipelineCompiler* compiler = new MetalPipelineCompiler(m_mtlr, *pipelineObj); - bool fbosMatch; - compiler->InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); + compiler->InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); bool allowAsyncCompile = false; if (GetConfig().async_compile) @@ -145,9 +145,8 @@ PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShade delete compiler; } - // If FBOs don't match, it wouldn't be possible to reconstruct the pipeline from the cache - if (fbosMatch) - AddCurrentStateToCache(hash); + // Save to cache + AddCurrentStateToCache(hash, lastUsedAttachmentsInfo); return pipelineObj; } @@ -380,6 +379,8 @@ struct CachedPipeline ShaderHash gsHash; ShaderHash psHash; + MetalAttachmentsInfo lastUsedAttachmentsInfo; + Latte::GPUCompactedRegisterState gpuState; }; @@ -453,9 +454,7 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) // compile { MetalPipelineCompiler pp(m_mtlr, *pipelineObject); - bool fbosMatch; - pp.InitFromState(vertexShader->compatibleFetchShader, 
vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr, fbosMatch); - cemu_assert_debug(fbosMatch); + pp.InitFromState(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr); pp.Compile(true, true, false); // destroy pp early } @@ -463,7 +462,7 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) // on success, cache the pipeline if (pipelineObject->m_pipeline) { - uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, attachmentsInfo, attachmentsInfo, *lcr); + uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr); m_pipelineCacheLock.lock(); m_pipelineCache[pipelineStateHash] = pipelineObject; m_pipelineCacheLock.unlock(); @@ -478,7 +477,7 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) ConcurrentQueue g_mtlPipelineCachingQueue; -void MetalPipelineCache::AddCurrentStateToCache(uint64 pipelineStateHash) +void MetalPipelineCache::AddCurrentStateToCache(uint64 pipelineStateHash, const MetalAttachmentsInfo& lastUsedAttachmentsInfo) { if (!m_pipelineCacheStoreThread) { @@ -499,6 +498,7 @@ void MetalPipelineCache::AddCurrentStateToCache(uint64 pipelineStateHash) job->gsHash.set(gs->baseHash, gs->auxHash); if (ps) job->psHash.set(ps->baseHash, ps->auxHash); + job->lastUsedAttachmentsInfo = lastUsedAttachmentsInfo; Latte::StoreGPURegisterState(LatteGPUState.contextNew, job->gpuState); // queue job g_mtlPipelineCachingQueue.push(job); @@ -530,7 +530,13 @@ bool MetalPipelineCache::SerializePipeline(MemStreamWriter& memWriter, CachedPip memWriter.writeBE(cachedPipeline.psHash.baseHash); memWriter.writeBE(cachedPipeline.psHash.auxHash); } + + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + 
memWriter.writeBE((uint16)cachedPipeline.lastUsedAttachmentsInfo.colorFormats[i]); + memWriter.writeBE((uint16)cachedPipeline.lastUsedAttachmentsInfo.depthFormat); + Latte::SerializeRegisterState(cachedPipeline.gpuState, memWriter); + return true; } @@ -562,12 +568,18 @@ bool MetalPipelineCache::DeserializePipeline(MemStreamReader& memReader, CachedP uint64 auxHash = memReader.readBE(); cachedPipeline.psHash.set(baseHash, auxHash); } + + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + cachedPipeline.lastUsedAttachmentsInfo.colorFormats[i] = (Latte::E_GX2SURFFMT)memReader.readBE(); + cachedPipeline.lastUsedAttachmentsInfo.depthFormat = (Latte::E_GX2SURFFMT)memReader.readBE(); + // deserialize GPU state if (!Latte::DeserializeRegisterState(cachedPipeline.gpuState, memReader)) { return false; } cemu_assert_debug(!memReader.hasError()); + return true; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h index d49ec6a25..270c2db72 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -41,7 +41,7 @@ class MetalPipelineCache static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); - void AddCurrentStateToCache(uint64 pipelineStateHash); + void AddCurrentStateToCache(uint64 pipelineStateHash, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo); // pipeline serialization for file bool SerializePipeline(class MemStreamWriter& memWriter, struct CachedPipeline& cachedPipeline); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 
611d190dd..9d74e2d92 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -190,7 +190,7 @@ extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; template -void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, bool rasterizationEnabled, const LatteContextRegister& lcr, bool& fbosMatch) +void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, bool rasterizationEnabled, const LatteContextRegister& lcr) { // TODO: check if the pixel shader is valid as well? if (!rasterizationEnabled/* || !pixelShaderMtl*/) @@ -200,7 +200,6 @@ void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsIn } // Color attachments - fbosMatch = true; const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); @@ -218,7 +217,6 @@ void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsIn if (activeAttachmentsInfo.colorFormats[i] == Latte::E_GX2SURFFMT::INVALID_FORMAT) { colorAttachment->setWriteMask(MTL::ColorWriteMaskNone); - fbosMatch = false; continue; } @@ -288,7 +286,7 @@ MetalPipelineCompiler::~MetalPipelineCompiler() m_pipelineDescriptor->release(); } -void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) +void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, 
const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { // Check if the pipeline uses a geometry shader const LattePrimitiveMode primitiveMode = static_cast(lcr.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE()); @@ -322,9 +320,9 @@ void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, c m_pixelShaderMtl = static_cast(pixelShader->shader); if (m_usesGeometryShader) - InitFromStateMesh(fetchShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); + InitFromStateMesh(fetchShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); else - InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr, fbosMatch); + InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); } bool MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay) @@ -408,7 +406,7 @@ bool MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool return true; } -void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) +void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { // Render pipeline state MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); @@ -482,17 +480,17 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha vertexDescriptor->release(); } - SetFragmentState(desc, 
lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr, fbosMatch); + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr); m_pipelineDescriptor = desc; } -void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch) +void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) { // Render pipeline state MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); - SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr, fbosMatch); + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr); m_pipelineDescriptor = desc; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h index d762d8025..5006ed595 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -16,7 +16,7 @@ class MetalPipelineCompiler MetalPipelineCompiler(class MetalRenderer* metalRenderer, PipelineObject& pipelineObj) : m_mtlr{metalRenderer}, m_pipelineObj{pipelineObj} {} ~MetalPipelineCompiler(); - void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); + void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* 
vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); bool Compile(bool forceCompile, bool isRenderThread, bool showInOverlay); @@ -32,7 +32,7 @@ class MetalPipelineCompiler NS::Object* m_pipelineDescriptor; - void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); + void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); - void InitFromStateMesh(const LatteFetchShader* fetchShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr, bool& fbosMatch); + void InitFromStateMesh(const LatteFetchShader* fetchShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); }; From 00256e5589438f2226d2966bee471acf1ed33f3c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 29 Oct 2024 17:43:29 +0100 Subject: [PATCH 230/368] only set blend color when changed --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 16 ++++++++++++---- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index dc4244ec0..4252bea34 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1051,10 +1051,18 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } // Blend color - float* blendColorConstant = (float*)LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; + uint32* blendColorConstantU32 = LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; - // TODO: only set when changed - renderCommandEncoder->setBlendColor(blendColorConstant[0], blendColorConstant[1], blendColorConstant[2], blendColorConstant[3]); + if (blendColorConstantU32[0] != encoderState.m_blendColor[0] || blendColorConstantU32[1] != encoderState.m_blendColor[1] || blendColorConstantU32[2] != encoderState.m_blendColor[2] || blendColorConstantU32[3] != encoderState.m_blendColor[3]) + { + float* blendColorConstant = (float*)LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; + renderCommandEncoder->setBlendColor(blendColorConstant[0], blendColorConstant[1], blendColorConstant[2], blendColorConstant[3]); + + encoderState.m_blendColor[0] = blendColorConstantU32[0]; + encoderState.m_blendColor[1] = blendColorConstantU32[1]; + encoderState.m_blendColor[2] = blendColorConstantU32[2]; + encoderState.m_blendColor[3] = blendColorConstantU32[3]; + } // polygon control const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; @@ -1178,7 +1186,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 { encoderState.m_scissor = m_state.m_scissor; - // TODO: clamp scissor to render target dimensions + // TODO: clamp scissor to render target dimensions? 
//scissor.width = ; //scissor.height = ; renderCommandEncoder->setScissorRect(encoderState.m_scissor); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 93c9a56d8..99d95ac7e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -99,6 +99,7 @@ struct MetalEncoderState MTL::ScissorRect m_scissor; uint32 m_stencilRefFront = 0; uint32 m_stencilRefBack = 0; + uint32 m_blendColor[4] = {0}; uint32 m_depthBias = 0; uint32 m_depthSlope = 0; uint32 m_depthClamp = 0; From e7ac19979dce98f28947c8761ce4391fc7abe91b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 30 Oct 2024 09:20:50 +0100 Subject: [PATCH 231/368] make occlusion queries accumulate with draws --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 7 +++++ .../HW/Latte/Renderer/Metal/MetalQuery.cpp | 17 ++++++----- src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h | 7 ++++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 17 +++++------ .../HW/Latte/Renderer/Metal/MetalRenderer.h | 29 +++++++------------ 5 files changed, 40 insertions(+), 37 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index a1fe7f826..d7de0a288 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -22,6 +22,13 @@ struct MetalPixelFormatSupport } }; +// TODO: don't define a new struct for this +struct MetalQueryRange +{ + uint32 begin; + uint32 end; +}; + #define MAX_MTL_BUFFERS 31 // Buffer indices 28-30 are reserved for the helper shaders #define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 4) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp index 91f252e8a..6e6b14c33 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -1,6 +1,5 @@ #include 
"Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "HW/Latte/Renderer/Metal/MetalCommon.h" bool LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) { @@ -13,29 +12,31 @@ bool LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) if (!CommandBufferCompleted(m_commandBuffer)) return false; - numSamplesPassed = m_mtlr->GetOcclusionQueryResultsPtr()[m_queryIndex]; + uint64* resultPtr = m_mtlr->GetOcclusionQueryResultsPtr(); + + numSamplesPassed = 0; + for (uint32 i = m_range.begin; i != m_range.end; i = (i + 1) % MetalRenderer::OCCLUSION_QUERY_POOL_SIZE) + numSamplesPassed += resultPtr[i]; return true; } LatteQueryObjectMtl::~LatteQueryObjectMtl() { - if (m_queryIndex != INVALID_UINT32) - m_mtlr->ReleaseOcclusionQueryIndex(m_queryIndex); - if (m_commandBuffer) m_commandBuffer->release(); } void LatteQueryObjectMtl::begin() { - m_queryIndex = m_mtlr->GetAvailableOcclusionQueryIndex(); - m_mtlr->SetActiveOcclusionQueryIndex(m_queryIndex); + m_range.begin = m_mtlr->GetOcclusionQueryIndex(); + m_mtlr->BeginOcclusionQuery(); } void LatteQueryObjectMtl::end() { - m_mtlr->SetActiveOcclusionQueryIndex(INVALID_UINT32); + m_range.end = m_mtlr->GetOcclusionQueryIndex(); + m_mtlr->EndOcclusionQuery(); if (m_mtlr->IsCommandBufferActive()) { m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h index 554cdacec..7c9bc2cfa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h @@ -14,10 +14,15 @@ class LatteQueryObjectMtl : public LatteQueryObject void begin() override; void end() override; + void GrowRange() + { + m_range.end++; + } + private: class MetalRenderer* m_mtlr; - uint32 m_queryIndex = INVALID_UINT32; + MetalQueryRange m_range = {INVALID_UINT32, INVALID_UINT32}; // TODO: make this a list of command buffers MTL::CommandBuffer* 
m_commandBuffer = nullptr; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 4252bea34..8b3377ac9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -31,7 +31,6 @@ #include "imgui/imgui_impl_metal.h" #define DEFAULT_COMMIT_TRESHOLD 196 -#define OCCLUSION_QUERY_POOL_SIZE 1024 extern bool hasValidFramebufferAttached; @@ -98,10 +97,6 @@ MetalRenderer::MetalRenderer() #endif m_occlusionQuery.m_resultsPtr = (uint64*)m_occlusionQuery.m_resultBuffer->contents(); - m_occlusionQuery.m_availableIndices.reserve(OCCLUSION_QUERY_POOL_SIZE); - for (uint32 i = 0; i < OCCLUSION_QUERY_POOL_SIZE; i++) - m_occlusionQuery.m_availableIndices.push_back(i); - // Initialize state for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) { @@ -1115,11 +1110,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } // Visibility result mode - if (m_occlusionQuery.m_activeIndex != encoderState.m_visibilityResultOffset) + if (m_occlusionQuery.m_active) { - auto mode = (m_occlusionQuery.m_activeIndex == INVALID_UINT32 ? MTL::VisibilityResultModeDisabled : MTL::VisibilityResultModeCounting); - renderCommandEncoder->setVisibilityResultMode(mode, m_occlusionQuery.m_activeIndex * sizeof(uint64)); - encoderState.m_visibilityResultOffset = m_occlusionQuery.m_activeIndex; + auto mode = (m_occlusionQuery.m_currentIndex == INVALID_UINT32 ? MTL::VisibilityResultModeDisabled : MTL::VisibilityResultModeCounting); + renderCommandEncoder->setVisibilityResultMode(mode, m_occlusionQuery.m_currentIndex * sizeof(uint64)); } // todo - how does culling behave with rects? 
@@ -1302,6 +1296,11 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 m_state.m_isFirstDrawInRenderPass = false; + // Occlusion queries + if (m_occlusionQuery.m_active) + m_occlusionQuery.m_currentIndex = (m_occlusionQuery.m_currentIndex + 1) % OCCLUSION_QUERY_POOL_SIZE; + + // Streamout LatteStreamout_FinishDrawcall(false); // Debug diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 99d95ac7e..9c1bb2dcd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -104,7 +104,6 @@ struct MetalEncoderState uint32 m_depthSlope = 0; uint32 m_depthClamp = 0; bool m_depthClipEnable = true; - uint32 m_visibilityResultOffset = INVALID_UINT32; struct { MTL::Buffer* m_buffer; size_t m_offset; @@ -170,7 +169,8 @@ enum class MetalEncoderType class MetalRenderer : public Renderer { public: - static const inline int TEXTURE_READBACK_SIZE = 32 * 1024 * 1024; // 32 MB + static constexpr uint32 OCCLUSION_QUERY_POOL_SIZE = 1024; + static constexpr uint32 TEXTURE_READBACK_SIZE = 32 * 1024 * 1024; // 32 MB MetalRenderer(); ~MetalRenderer() override; @@ -428,28 +428,19 @@ class MetalRenderer : public Renderer return m_occlusionQuery.m_resultsPtr; } - uint32 GetAvailableOcclusionQueryIndex() + uint32 GetOcclusionQueryIndex() { - if (m_occlusionQuery.m_availableIndices.empty()) - { - cemuLog_log(LogType::Force, "No occlusion query index available"); - return 0; - } - - uint32 queryIndex = m_occlusionQuery.m_availableIndices.back(); - m_occlusionQuery.m_availableIndices.pop_back(); - - return queryIndex; + return m_occlusionQuery.m_currentIndex; } - void ReleaseOcclusionQueryIndex(uint32 queryIndex) + void BeginOcclusionQuery() { - m_occlusionQuery.m_availableIndices.push_back(queryIndex); + m_occlusionQuery.m_active = true; } - void SetActiveOcclusionQueryIndex(uint32 queryIndex) + void EndOcclusionQuery() { - 
m_occlusionQuery.m_activeIndex = queryIndex; + m_occlusionQuery.m_active = false; } private: @@ -505,8 +496,8 @@ class MetalRenderer : public Renderer { MTL::Buffer* m_resultBuffer; uint64* m_resultsPtr; - std::vector m_availableIndices; - uint32 m_activeIndex = INVALID_UINT32; + uint32 m_currentIndex = 0; + bool m_active = false; } m_occlusionQuery; // Active objects From 7eb8508db41a206a87d7f70ec871b8090427886a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 31 Oct 2024 17:13:22 +0100 Subject: [PATCH 232/368] fix: CI when metal is disabled --- CMakeLists.txt | 2 ++ src/Cafe/CMakeLists.txt | 9 ++++++--- src/Cafe/HW/Latte/Core/FetchShader.cpp | 4 +++- src/Cafe/HW/Latte/Core/LatteShader.cpp | 4 ++-- src/Cafe/HW/Latte/Core/LatteShaderCache.cpp | 17 +++++++++++++++++ .../LegacyShaderDecompiler/LatteDecompiler.cpp | 8 +++++--- src/gui/CMakeLists.txt | 9 +++++++-- src/gui/GameProfileWindow.cpp | 6 +++--- src/gui/GeneralSettings2.cpp | 2 +- src/gui/MainWindow.cpp | 2 ++ src/gui/PadViewFrame.cpp | 2 ++ 11 files changed, 50 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a04c7719..5fdc359a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,6 +201,8 @@ endif() if (ENABLE_METAL) include_directories(${CMAKE_SOURCE_DIR}/dependencies/metal-cpp) + + add_definitions(-DENABLE_METAL=1) endif() if (ENABLE_DISCORD_RPC) diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index b30f8efef..5b1e6fde8 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -151,9 +151,6 @@ add_library(CemuCafe HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLAttrDecoder.cpp HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSL.cpp HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLHeader.hpp - HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp - HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp - HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp 
HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h @@ -579,6 +576,12 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/UtilityShaderSource.h ) + target_sources(CemuCafe PRIVATE + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp + ) + #target_link_libraries(CemuCafe PRIVATE # "-framework Metal" # "-framework QuartzCore" diff --git a/src/Cafe/HW/Latte/Core/FetchShader.cpp b/src/Cafe/HW/Latte/Core/FetchShader.cpp index 6da6100b2..d50447b31 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.cpp +++ b/src/Cafe/HW/Latte/Core/FetchShader.cpp @@ -11,7 +11,7 @@ #include "HW/Latte/Renderer/Renderer.h" #include "util/containers/LookupTableL3.h" #include "util/helpers/fspinlock.h" -#if BOOST_OS_MACOS +#if ENABLE_METAL #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #endif #include /* SHA1_DIGEST_LENGTH */ @@ -166,6 +166,7 @@ void LatteFetchShader::CalculateFetchShaderVkHash() void LatteFetchShader::CheckIfVerticesNeedManualFetchMtl(uint32* contextRegister) { +#if ENABLE_METAL for (sint32 g = 0; g < bufferGroups.size(); g++) { LatteParsedFetchShaderBufferGroup_t& group = bufferGroups[g]; @@ -183,6 +184,7 @@ void LatteFetchShader::CheckIfVerticesNeedManualFetchMtl(uint32* contextRegister mtlFetchVertexManually = true; } } +#endif } void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* parsedFetchShader, uint32* contextRegister, const LatteClauseInstruction_VTX* instr) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 9e3e6b1f6..504bddcc0 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -14,7 +14,7 @@ #include "config/ActiveSettings.h" #include "Cafe/GameProfile/GameProfile.h" #include 
"util/containers/flat_hash_map.hpp" -#if BOOST_OS_MACOS +#if ENABLE_METAL #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #endif #include @@ -591,7 +591,7 @@ void LatteSHRC_UpdatePSBaseHash(uint8* pixelShaderPtr, uint32 pixelShaderSize, b // get vertex shader uint64 psHash = psHash1 + psHash2 + _activePSImportTable.key + (usesGeometryShader ? hashCacheGS.prevHash1 : 0ULL); -#if BOOST_OS_MACOS +#if ENABLE_METAL if (g_renderer->GetType() == RendererAPI::Metal) { for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index 126dcc500..ed9dc4671 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -11,8 +11,10 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/OpenGL/RendererShaderGL.h" #include "Cafe/HW/Latte/Renderer/Vulkan/RendererShaderVk.h" +#if ENABLE_METAL #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#endif #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanPipelineStableCache.h" #include @@ -163,8 +165,10 @@ void LatteShaderCache_finish() RendererShaderVk::ShaderCacheLoading_end(); else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_end(); +#if ENABLE_METAL else if (g_renderer->GetType() == RendererAPI::Metal) RendererShaderMtl::ShaderCacheLoading_end(); +#endif } uint32 LatteShaderCache_getShaderCacheExtraVersion(uint64 titleId) @@ -247,8 +251,11 @@ void LatteShaderCache_Load() RendererShaderVk::ShaderCacheLoading_begin(cacheTitleId); else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_begin(cacheTitleId); +#if ENABLE_METAL else if (g_renderer->GetType() == RendererAPI::Metal) RendererShaderMtl::ShaderCacheLoading_begin(cacheTitleId); +#endif + // get cache file name const auto pathGeneric = 
ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); const auto pathGenericPre1_25_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}.bin", cacheTitleId); // before 1.25.0 @@ -508,14 +515,18 @@ void LatteShaderCache_LoadPipelineCache(uint64 cacheTitleId) { if (g_renderer->GetType() == RendererAPI::Vulkan) g_shaderCacheLoaderState.pipelineFileCount = VulkanPipelineStableCache::GetInstance().BeginLoading(cacheTitleId); +#if ENABLE_METAL else if (g_renderer->GetType() == RendererAPI::Metal) g_shaderCacheLoaderState.pipelineFileCount = MetalPipelineCache::GetInstance().BeginLoading(cacheTitleId); +#endif g_shaderCacheLoaderState.loadedPipelines = 0; LatteShaderCache_ShowProgress(LatteShaderCache_updatePipelineLoadingProgress, true); if (g_renderer->GetType() == RendererAPI::Vulkan) VulkanPipelineStableCache::GetInstance().EndLoading(); +#if ENABLE_METAL else if (g_renderer->GetType() == RendererAPI::Metal) MetalPipelineCache::GetInstance().EndLoading(); +#endif if(Latte_GetStopSignal()) LatteThread_Exit(); } @@ -525,8 +536,10 @@ bool LatteShaderCache_updatePipelineLoadingProgress() uint32 pipelinesMissingShaders = 0; if (g_renderer->GetType() == RendererAPI::Vulkan) return VulkanPipelineStableCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); +#if ENABLE_METAL else if (g_renderer->GetType() == RendererAPI::Metal) return MetalPipelineCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); +#endif return false; } @@ -790,14 +803,18 @@ void LatteShaderCache_Close() RendererShaderVk::ShaderCacheLoading_Close(); else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_Close(); +#if ENABLE_METAL else if (g_renderer->GetType() == RendererAPI::Metal) RendererShaderMtl::ShaderCacheLoading_Close(); +#endif // if Vulkan or Metal then also close pipeline cache if (g_renderer->GetType() == 
RendererAPI::Vulkan) VulkanPipelineStableCache::GetInstance().Close(); +#if ENABLE_METAL else if (g_renderer->GetType() == RendererAPI::Metal) MetalPipelineCache::GetInstance().Close(); +#endif } #include diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp index c2051090b..7359a1ff1 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.cpp @@ -1067,10 +1067,12 @@ void _LatteDecompiler_Process(LatteDecompilerShaderContext* shaderContext, uint8 // emit code if (shaderContext->shader->hasError == false) { - if (g_renderer->GetType() == RendererAPI::Metal) - LatteDecompiler_emitMSLShader(shaderContext, shaderContext->shader); + if (g_renderer->GetType() == RendererAPI::OpenGL || g_renderer->GetType() == RendererAPI::Vulkan) + LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); +#if ENABLE_METAL else - LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); + LatteDecompiler_emitMSLShader(shaderContext, shaderContext->shader); +#endif } LatteDecompiler_cleanup(shaderContext); // fast access diff --git a/src/gui/CMakeLists.txt b/src/gui/CMakeLists.txt index df98c1f1e..f12f120ec 100644 --- a/src/gui/CMakeLists.txt +++ b/src/gui/CMakeLists.txt @@ -4,8 +4,6 @@ add_library(CemuGui canvas/OpenGLCanvas.h canvas/VulkanCanvas.cpp canvas/VulkanCanvas.h - canvas/MetalCanvas.cpp - canvas/MetalCanvas.h CemuApp.cpp CemuApp.h CemuUpdateWindow.cpp @@ -131,6 +129,13 @@ add_library(CemuGui wxHelper.h ) +if(ENABLE_METAL) + target_sources(CemuGui PRIVATE + canvas/MetalCanvas.cpp + canvas/MetalCanvas.h + ) +endif() + set_property(TARGET CemuGui PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index fe3327025..76b8801c4 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -113,7 +113,7 
@@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) first_row->Add(new wxStaticText(panel, wxID_ANY, _("Graphics API")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString gapi_values[] = { "", "OpenGL", "Vulkan", -#ifdef __APPLE__ +#if ENABLE_METAL "Metal" #endif }; @@ -271,7 +271,7 @@ void GameProfileWindow::ApplyProfile() if (!m_game_profile.m_graphics_api.has_value()) m_graphic_api->SetSelection(0); // selecting "" else - m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan + m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan, Metal m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); //// audio @@ -338,7 +338,7 @@ void GameProfileWindow::SaveProfile() if (m_graphic_api->GetSelection() == 0) m_game_profile.m_graphics_api = {}; else - m_game_profile.m_graphics_api = (GraphicAPI)(m_graphic_api->GetSelection() - 1); // "", OpenGL, Vulkan + m_game_profile.m_graphics_api = (GraphicAPI)(m_graphic_api->GetSelection() - 1); // "", OpenGL, Vulkan, Metal // controller for (int i = 0; i < 8; ++i) diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 4cd0be9a4..4812a8846 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -314,7 +314,7 @@ wxPanel* GeneralSettings2::AddGraphicsPage(wxNotebook* notebook) { choices[api_size++] = "Vulkan"; } -#ifdef __APPLE__ +#if ENABLE_METAL choices[api_size++] = "Metal"; #endif diff --git a/src/gui/MainWindow.cpp b/src/gui/MainWindow.cpp index 7f738c2ec..2e44a4c76 100644 --- a/src/gui/MainWindow.cpp +++ b/src/gui/MainWindow.cpp @@ -1570,8 +1570,10 @@ void MainWindow::CreateCanvas() m_render_canvas = new VulkanCanvas(m_game_panel, wxSize(1280, 720), true); else if (ActiveSettings::GetGraphicsAPI() == kOpenGL) m_render_canvas = GLCanvas_Create(m_game_panel, wxSize(1280, 720), true); +#if ENABLE_METAL else m_render_canvas = new MetalCanvas(m_game_panel, 
wxSize(1280, 720), true); +#endif // mouse events m_render_canvas->Bind(wxEVT_MOTION, &MainWindow::OnMouseMove, this); diff --git a/src/gui/PadViewFrame.cpp b/src/gui/PadViewFrame.cpp index 6d1ec7d62..94319299e 100644 --- a/src/gui/PadViewFrame.cpp +++ b/src/gui/PadViewFrame.cpp @@ -77,8 +77,10 @@ void PadViewFrame::InitializeRenderCanvas() m_render_canvas = new VulkanCanvas(this, wxSize(854, 480), false); else if (ActiveSettings::GetGraphicsAPI() == kOpenGL) m_render_canvas = GLCanvas_Create(this, wxSize(854, 480), false); +#if ENABLE_METAL else m_render_canvas = new MetalCanvas(this, wxSize(854, 480), false); +#endif sizer->Add(m_render_canvas, 1, wxEXPAND, 0, nullptr); } SetSizer(sizer); From 927c52e268d2c70df060ba554f0ad41de88464f1 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 1 Nov 2024 15:30:15 +0100 Subject: [PATCH 233/368] implement calculate texture lod --- .../LatteDecompilerEmitMSL.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 13f7bccff..e81856c3d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2690,16 +2690,16 @@ static void _emitTEXGetCompTexLodCode(LatteDecompilerShaderContext* shaderContex { // 3 coordinates if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("float4(textureQueryLod(tex{}, {}.{}{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, 
texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); else - src->addFmt("float4(textureQueryLod(tex{}, bitCast({}.{}{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, bitCast({}.{}{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); } else { if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("float4(textureQueryLod(tex{}, {}.{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); else - src->addFmt("float4(textureQueryLod(tex{}, bitCast({}.{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], 
resultElemTable[texInstruction->textureFetch.srcSel[1]]); + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, bitCast({}.{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); debugBreakpoint(); } @@ -3734,7 +3734,6 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon // Sample compare emulate // TODO: only add when needed - // TODO: lod_options overload // TODO: when the sampler has linear min mag filter, use gather and filter manually // TODO: offset? @@ -3745,6 +3744,15 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "}\r\n" ); + // Texture calculate lod + // TODO: only add when needed + fCStr_shaderSource->add("" + "template\r\n" + "float2 textureCalculateLod(TextureT tex, sampler samplr, CoordT coord) {\r\n" + "float lod = tex.calculate_unclamped_lod(samplr, coord);\r\n" + "return float2(floor(lod), fract(lod));\r\n" + "}\r\n"); + // clamp fCStr_shaderSource->add("" "int clampFI32(int v)\r\n" From 6eb46ed6eb565961a436d84bb939ef713641e874 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 1 Nov 2024 15:46:49 +0100 Subject: [PATCH 234/368] remove 'Vulkan' from cache loading message --- src/Cafe/HW/Latte/Core/LatteShaderCache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index ed9dc4671..f1299e406 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -437,7 +437,7 @@ void LatteShaderCache_ShowProgress(const std::function & loadUpdateF std::string text; if (isPipelines) { - text = "Loading cached Vulkan pipelines..."; + text = "Loading cached pipelines..."; } else { From 
61de5a3d3a05bf313be45b05677da63239e1ce66 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 1 Nov 2024 15:51:44 +0100 Subject: [PATCH 235/368] only log todo for vulkan output shader --- src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp index 3a0b9b46d..7208ce9ff 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp @@ -301,7 +301,8 @@ RendererOutputShader::RendererOutputShader(const std::string& vertex_source, con } else { - cemuLog_logDebug(LogType::Force, "RendererOutputShader() - todo for Vulkan"); + if (g_renderer->GetType() == RendererAPI::Vulkan) + cemuLog_logDebug(LogType::Force, "RendererOutputShader() - todo for Vulkan"); m_attributes[0].m_loc_texture_src_resolution = -1; m_attributes[0].m_loc_input_resolution = -1; m_attributes[0].m_loc_output_resolution = -1; From ab41de4f9feffa277ddcf3710d300d78f81dfd2b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 11:53:16 +0100 Subject: [PATCH 236/368] use host memory instead of buffer cache when possible --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 4 +- .../Renderer/Metal/MetalMemoryManager.cpp | 21 +++- .../Latte/Renderer/Metal/MetalMemoryManager.h | 13 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 116 ++++++++++++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 8 +- 5 files changed, 138 insertions(+), 24 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index d7de0a288..20fd6b9de 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -31,7 +31,9 @@ struct MetalQueryRange #define MAX_MTL_BUFFERS 31 // Buffer indices 28-30 are reserved for the helper shaders -#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - 
index - 4) +#define MTL_RESERVED_BUFFERS 3 +#define MAX_MTL_VERTEX_BUFFERS (MAX_MTL_BUFFERS - MTL_RESERVED_BUFFERS) +#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_VERTEX_BUFFERS - index - 1) #define MAX_MTL_TEXTURES 31 #define MAX_MTL_SAMPLERS 16 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 1c788e215..4eb4d1056 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" +#include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" /* @@ -115,7 +116,23 @@ void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + // First, try to import the host memory as a buffer + // TODO: only import if the option is ticked in game profile + if (m_mtlr->IsAppleGPU()) + { + m_importedMemBaseAddress = 0x10000000; + size_t hostAllocationSize = 0x40000000ull; + // TODO: get size of allocation + m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); + if (m_bufferCache) + m_useHostMemoryForCache = true; + else + cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer"); + } + + if (!m_useHostMemoryForCache) + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif @@ -123,6 +140,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) { + 
cemu_assert_debug(!m_useHostMemoryForCache); cemu_assert_debug(m_bufferCache); cemu_assert_debug((offset + size) <= m_bufferCache->length()); @@ -147,6 +165,7 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { + cemu_assert_debug(!m_useHostMemoryForCache); cemu_assert_debug(m_bufferCache); m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 4ea5769e8..4e8b25940 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -115,6 +115,17 @@ class MetalMemoryManager } */ + // Getters + bool UseHostMemoryForCache() const + { + return m_useHostMemoryForCache; + } + + MPTR GetImportedMemBaseAddress() const + { + return m_importedMemBaseAddress; + } + private: class MetalRenderer* m_mtlr; @@ -126,4 +137,6 @@ class MetalMemoryManager //MetalVertexBufferCache m_vertexBufferCache; MTL::Buffer* m_bufferCache = nullptr; + bool m_useHostMemoryForCache = false; + MPTR m_importedMemBaseAddress; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 8b3377ac9..7c80a0bc8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -30,8 +30,6 @@ #include "imgui/imgui_extension.h" #include "imgui/imgui_impl_metal.h" -#define DEFAULT_COMMIT_TRESHOLD 196 - extern bool hasValidFramebufferAttached; float supportBufferData[512 * 4]; @@ -90,6 +88,12 @@ MetalRenderer::MetalRenderer() m_depthStencilCache = new MetalDepthStencilCache(this); m_samplerCache = new MetalSamplerCache(this); + // Lower the commit treshold when host memory is used for cache to reduce 
latency + if (m_memoryManager->UseHostMemoryForCache()) + m_defaultCommitTreshlod = 64; + else + m_defaultCommitTreshlod = 196; + // Occlusion queries m_occlusionQuery.m_resultBuffer = m_device->newBuffer(OCCLUSION_QUERY_POOL_SIZE * sizeof(uint64), MTL::ResourceStorageModeShared); #ifdef CEMU_DEBUG_ASSERT @@ -97,8 +101,11 @@ MetalRenderer::MetalRenderer() #endif m_occlusionQuery.m_resultsPtr = (uint64*)m_occlusionQuery.m_resultBuffer->contents(); - // Initialize state - for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) + // Reset vertex and uniform buffers + for (uint32 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) + m_state.m_vertexBufferOffsets[i] = INVALID_OFFSET; + + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) { for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) m_state.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; @@ -821,23 +828,28 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { + if (m_memoryManager->UseHostMemoryForCache()) + dstOffset -= m_memoryManager->GetImportedMemBaseAddress(); + CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex | MTL::RenderStageMesh, ALL_MTL_RENDER_STAGES); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { + cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache()); cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS); - auto& buffer = m_state.m_vertexBuffers[bufferIndex]; - if (buffer.offset == offset && buffer.size == size) - return; + + m_state.m_vertexBufferOffsets[bufferIndex] = offset; + //if (buffer.offset == offset && buffer.size == size) + // return; //if (buffer.offset != INVALID_OFFSET) //{ // m_memoryManager->UntrackVertexBuffer(bufferIndex); //} - buffer.offset = offset; - buffer.size = size; + //buffer.offset = offset; + //buffer.size = size; 
//buffer.restrideInfo = {}; //m_memoryManager->TrackVertexBuffer(bufferIndex, offset, size, &buffer.restrideInfo); @@ -845,6 +857,8 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { + cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache()); + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shaderType)][bufferIndex] = offset; } @@ -988,9 +1002,24 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 indexBufferIndex = 0; LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); - // synchronize vertex and uniform cache and update buffer bindings - // We need to call this before getting the render command encoder, since it can cause buffer copies - LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); + // Buffer cache + if (m_memoryManager->UseHostMemoryForCache()) + { + // direct memory access (Wii U memory space imported as a buffer), update buffer bindings + draw_updateVertexBuffersDirectAccess(); + if (vertexShader) + draw_updateUniformBuffersDirectAccess(vertexShader, mmSQ_VTX_UNIFORM_BLOCK_START); + if (geometryShader) + draw_updateUniformBuffersDirectAccess(geometryShader, mmSQ_GS_UNIFORM_BLOCK_START); + if (pixelShader) + draw_updateUniformBuffersDirectAccess(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START); + } + else + { + // synchronize vertex and uniform cache and update buffer bindings + // We need to call this before getting the render command encoder, since it can cause buffer copies + LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); + } // Render pass auto renderCommandEncoder = GetRenderCommandEncoder(); @@ -1190,10 +1219,10 
@@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Vertex buffers //std::vector barrierBuffers; - for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) + for (uint8 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) { - auto& vertexBufferRange = m_state.m_vertexBuffers[i]; - if (vertexBufferRange.offset != INVALID_OFFSET) + size_t offset = m_state.m_vertexBufferOffsets[i]; + if (offset != INVALID_OFFSET) { /* MTL::Buffer* buffer; @@ -1218,11 +1247,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } */ - MTL::Buffer* buffer = m_memoryManager->GetBufferCache(); - size_t offset = m_state.m_vertexBuffers[i].offset; - // Bind - SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), buffer, offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), m_memoryManager->GetBufferCache(), offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); } } @@ -1301,7 +1327,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 m_occlusionQuery.m_currentIndex = (m_occlusionQuery.m_currentIndex + 1) % OCCLUSION_QUERY_POOL_SIZE; // Streamout - LatteStreamout_FinishDrawcall(false); + LatteStreamout_FinishDrawcall(m_memoryManager->UseHostMemoryForCache()); // Debug if (fetchVertexManually) @@ -1333,6 +1359,54 @@ void MetalRenderer::draw_endSequence() } } +void MetalRenderer::draw_updateVertexBuffersDirectAccess() +{ + LatteFetchShader* parsedFetchShader = LatteSHRC_GetActiveFetchShader(); + if (!parsedFetchShader) + return; + + for (auto& bufferGroup : parsedFetchShader->bufferGroups) + { + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0]; + //uint32 bufferSize = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 1] + 1; + //uint32 
bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + if (bufferAddress == MPTR_NULL) [[unlikely]] + bufferAddress = 0x10000000; // TODO: really? + + m_state.m_vertexBufferOffsets[bufferIndex] = bufferAddress - m_memoryManager->GetImportedMemBaseAddress(); + } +} + +void MetalRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset) +{ + if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (const auto& buf : shader->list_quickBufferList) + { + sint32 i = buf.index; + MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0]; + uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1; + + if (physicalAddr == MPTR_NULL) [[unlikely]] + { + cemu_assert_unimplemented(); + continue; + } + uniformSize = std::min(uniformSize, buf.size); + + cemu_assert_debug(physicalAddr < 0x50000000); + + uint32 bufferIndex = i; + cemu_assert_debug(bufferIndex < 16); + + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][bufferIndex] = physicalAddr - m_memoryManager->GetImportedMemBaseAddress(); + } + } +} + void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); @@ -1486,7 +1560,7 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() m_commandBuffers.push_back({mtlCommandBuffer}); m_recordedDrawcalls = 0; - m_commitTreshold = DEFAULT_COMMIT_TRESHOLD; + m_commitTreshold = m_defaultCommitTreshlod; // Notify memory manager about the new command buffer m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 9c1bb2dcd..9ddc5e93f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -29,6 +29,7 @@ struct MetalRestrideInfo }; */ +/* struct MetalBoundBuffer { size_t offset = INVALID_OFFSET; @@ -36,6 +37,7 @@ struct MetalBoundBuffer // Memory manager will write restride info to this variable //MetalRestrideInfo restrideInfo; }; +*/ enum MetalGeneralShaderType { @@ -141,7 +143,7 @@ struct MetalState // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change MetalActiveFBOState m_lastUsedFBO; - MetalBoundBuffer m_vertexBuffers[MAX_MTL_BUFFERS] = {{}}; + size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS] = {INVALID_OFFSET}; // TODO: find out what is the max number of bound textures on the Wii U class LatteTextureViewMtl* m_textures[64] = {nullptr}; size_t m_uniformBufferOffsets[METAL_GENERAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; @@ -277,6 +279,9 @@ class MetalRenderer : public Renderer void draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) override; void draw_endSequence() override; + void draw_updateVertexBuffersDirectAccess(); + void draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset); + // index void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override; @@ -506,6 +511,7 @@ class MetalRenderer : public Renderer MTL::CommandEncoder* m_commandEncoder = nullptr; uint32 m_recordedDrawcalls; + uint32 m_defaultCommitTreshlod; uint32 m_commitTreshold; // State From 03d4e86b617835e12664a6db24a561d52d8238fc Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 12:09:47 +0100 Subject: [PATCH 237/368] add an option to use the host memory instead of buffer cache --- src/Cafe/CafeSystem.cpp | 1 + src/Cafe/GameProfile/GameProfile.cpp | 14 
+++++++++----- src/Cafe/GameProfile/GameProfile.h | 2 ++ .../HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 6 +++--- src/gui/GameProfileWindow.cpp | 9 +++++++++ src/gui/GameProfileWindow.h | 3 ++- 6 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 40d26a671..08228b621 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -258,6 +258,7 @@ void InfoLog_PrintActiveSettings() { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); cemuLog_log(LogType::Force, "Fast math: {}", GetConfig().fast_math.GetValue() ? "true" : "false"); + cemuLog_log(LogType::Force, "Use host memory for cache: {}", g_current_game_profile->UseHostMemForCache() ? "true" : "false"); if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index ee92107a7..337786edd 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -127,7 +127,7 @@ bool gameProfile_loadIntegerOption(IniParser& iniParser, const char* optionName, { cemuLog_log(LogType::Force, "Value '{}' is out of range for option '{}' in game profile", *option_value, optionName); return false; - } + } } template @@ -224,8 +224,9 @@ bool GameProfile::Load(uint64_t title_id) gameProfile_loadIntegerOption(&iniParser, "graphics_api", &graphicsApi, -1, 0, 1); if (graphicsApi.value != -1) m_graphics_api = (GraphicAPI)graphicsApi.value; - + gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); + gameProfile_loadBooleanOption2(iniParser, "useHostMemForCache", m_useHostMemForCache); // legacy support auto option_precompiledShaders = iniParser.FindOption("precompiledShaders"); @@ -277,7 +278,7 @@ bool GameProfile::Load(uint64_t title_id) void GameProfile::Save(uint64_t title_id) { auto gameProfileDir = 
ActiveSettings::GetConfigPath("gameProfiles"); - if (std::error_code ex_ec; !fs::exists(gameProfileDir, ex_ec)) + if (std::error_code ex_ec; !fs::exists(gameProfileDir, ex_ec)) fs::create_directories(gameProfileDir, ex_ec); auto gameProfilePath = gameProfileDir / fmt::format("{:016x}.ini", title_id); FileStream* fs = FileStream::createFile2(gameProfilePath); @@ -308,6 +309,7 @@ void GameProfile::Save(uint64_t title_id) fs->writeLine("[Graphics]"); WRITE_ENTRY(accurateShaderMul); + WRITE_ENTRY(useHostMemForCache); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); fs->writeLine(""); @@ -337,6 +339,7 @@ void GameProfile::ResetOptional() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_useHostMemForCache = false; // cpu settings m_threadQuantum = kThreadQuantumDefault; m_cpuMode.reset(); // CPUModeOption::kSingleCoreRecompiler; @@ -354,9 +357,10 @@ void GameProfile::Reset() // general settings m_loadSharedLibraries = true; m_startWithPadView = false; - + // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_useHostMemForCache = false; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; @@ -366,4 +370,4 @@ void GameProfile::Reset() // controller settings for (auto& profile : m_controllerProfile) profile.reset(); -} \ No newline at end of file +} diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index 6a1f2ebd6..e2ab29f7a 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ b/src/Cafe/GameProfile/GameProfile.h @@ -31,6 +31,7 @@ class GameProfile [[nodiscard]] const std::optional& GetGraphicsAPI() const { return m_graphics_api; } [[nodiscard]] const AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } + [[nodiscard]] bool UseHostMemForCache() const { return m_useHostMemForCache; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return 
m_precompiledShaders; } [[nodiscard]] uint32 GetThreadQuantum() const { return m_threadQuantum; } @@ -54,6 +55,7 @@ class GameProfile // graphic settings std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; + bool m_useHostMemForCache = false; std::optional m_precompiledShaders{}; // cpu settings uint32 m_threadQuantum = kThreadQuantumDefault; // values: 20000 45000 60000 80000 100000 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 4eb4d1056..5f02847a8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,8 +1,10 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" + #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" +#include "GameProfile/GameProfile.h" /* MetalVertexBufferCache::~MetalVertexBufferCache() @@ -117,8 +119,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) cemu_assert_debug(!m_bufferCache); // First, try to import the host memory as a buffer - // TODO: only import if the option is ticked in game profile - if (m_mtlr->IsAppleGPU()) + if (g_current_game_profile->UseHostMemForCache() && m_mtlr->IsAppleGPU()) { m_importedMemBaseAddress = 0x10000000; size_t hostAllocationSize = 0x40000000ull; @@ -165,7 +166,6 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { - cemu_assert_debug(!m_useHostMemoryForCache); cemu_assert_debug(m_bufferCache); m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index 
76b8801c4..c1aa63e42 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -127,6 +127,13 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) m_shader_mul_accuracy->SetToolTip(_("EXPERT OPTION\nControls the accuracy of floating point multiplication in shaders.\n\nRecommended: true")); first_row->Add(m_shader_mul_accuracy, 0, wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Use host memory for cache")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString mem_values[] = { _("false"), _("true")}; + m_use_host_mem_for_cache = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(mem_values), mem_values); + m_use_host_mem_for_cache->SetToolTip(_("EXPERT OPTION\nAllows the GPU to access data directly without the need for an intermediate cache. May increase performance and reduce memory usage, but can also cause flickering.\n\nMetal only\n\nRecommended: false")); + first_row->Add(m_use_host_mem_for_cache, 0, wxALL, 5); + /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString accuarcy_values[] = { _("high"), _("medium"), _("low") }; m_cache_accuracy = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(accuarcy_values), accuarcy_values); @@ -273,6 +280,7 @@ void GameProfileWindow::ApplyProfile() else m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan, Metal m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); + m_use_host_mem_for_cache->SetSelection((int)m_game_profile.m_useHostMemForCache); //// audio //m_disable_audio->Set3StateValue(GetCheckboxState(m_game_profile.disableAudio)); @@ -332,6 +340,7 @@ void GameProfileWindow::SaveProfile() // gpu m_game_profile.m_accurateShaderMul = (AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); + m_game_profile.m_useHostMemForCache = 
(bool)m_use_host_mem_for_cache->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value diff --git a/src/gui/GameProfileWindow.h b/src/gui/GameProfileWindow.h index 6ca36de68..a1fe8132c 100644 --- a/src/gui/GameProfileWindow.h +++ b/src/gui/GameProfileWindow.h @@ -40,6 +40,7 @@ class GameProfileWindow : public wxFrame wxChoice* m_graphic_api; wxChoice* m_shader_mul_accuracy; + wxChoice* m_use_host_mem_for_cache; //wxChoice* m_cache_accuracy; // audio @@ -47,4 +48,4 @@ class GameProfileWindow : public wxFrame // controller wxComboBox* m_controller_profile[8]; -}; \ No newline at end of file +}; From b38ca6a58ad757368088d58ea2400eb1825719d7 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 12:43:35 +0100 Subject: [PATCH 238/368] add an option to choose buffer cache type --- src/Cafe/CafeSystem.cpp | 2 +- src/Cafe/GameProfile/GameProfile.cpp | 8 +-- src/Cafe/GameProfile/GameProfile.h | 4 +- .../Renderer/Metal/MetalMemoryManager.cpp | 51 ++++++++++++------- .../Latte/Renderer/Metal/MetalMemoryManager.h | 6 ++- src/config/CemuConfig.h | 23 +++++++++ src/gui/GameProfileWindow.cpp | 15 +++--- src/gui/GameProfileWindow.h | 2 +- 8 files changed, 75 insertions(+), 36 deletions(-) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 08228b621..7ba93fc8b 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -258,7 +258,7 @@ void InfoLog_PrintActiveSettings() { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); cemuLog_log(LogType::Force, "Fast math: {}", GetConfig().fast_math.GetValue() ? "true" : "false"); - cemuLog_log(LogType::Force, "Use host memory for cache: {}", g_current_game_profile->UseHostMemForCache() ? 
"true" : "false"); + cemuLog_log(LogType::Force, "Buffer cache type: {}", g_current_game_profile->GetBufferCacheType()); if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index 337786edd..a4ce8fe84 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -226,7 +226,7 @@ bool GameProfile::Load(uint64_t title_id) m_graphics_api = (GraphicAPI)graphicsApi.value; gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); - gameProfile_loadBooleanOption2(iniParser, "useHostMemForCache", m_useHostMemForCache); + gameProfile_loadEnumOption(iniParser, "bufferCacheType", m_bufferCacheType); // legacy support auto option_precompiledShaders = iniParser.FindOption("precompiledShaders"); @@ -309,7 +309,7 @@ void GameProfile::Save(uint64_t title_id) fs->writeLine("[Graphics]"); WRITE_ENTRY(accurateShaderMul); - WRITE_ENTRY(useHostMemForCache); + WRITE_ENTRY(bufferCacheType); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); fs->writeLine(""); @@ -339,7 +339,7 @@ void GameProfile::ResetOptional() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; - m_useHostMemForCache = false; + m_bufferCacheType = BufferCacheType::DevicePrivate; // cpu settings m_threadQuantum = kThreadQuantumDefault; m_cpuMode.reset(); // CPUModeOption::kSingleCoreRecompiler; @@ -360,7 +360,7 @@ void GameProfile::Reset() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; - m_useHostMemForCache = false; + m_bufferCacheType = BufferCacheType::DevicePrivate; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index e2ab29f7a..5c2d28d70 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ 
b/src/Cafe/GameProfile/GameProfile.h @@ -31,7 +31,7 @@ class GameProfile [[nodiscard]] const std::optional& GetGraphicsAPI() const { return m_graphics_api; } [[nodiscard]] const AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } - [[nodiscard]] bool UseHostMemForCache() const { return m_useHostMemForCache; } + [[nodiscard]] BufferCacheType GetBufferCacheType() const { return m_bufferCacheType; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return m_precompiledShaders; } [[nodiscard]] uint32 GetThreadQuantum() const { return m_threadQuantum; } @@ -55,7 +55,7 @@ class GameProfile // graphic settings std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; - bool m_useHostMemForCache = false; + BufferCacheType m_bufferCacheType = BufferCacheType::DevicePrivate; std::optional m_precompiledShaders{}; // cpu settings uint32 m_threadQuantum = kThreadQuantumDefault; // values: 20000 45000 60000 80000 100000 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 5f02847a8..cd041c5aa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -4,7 +4,6 @@ #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" -#include "GameProfile/GameProfile.h" /* MetalVertexBufferCache::~MetalVertexBufferCache() @@ -118,21 +117,24 @@ void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); + m_bufferCacheType = g_current_game_profile->GetBufferCacheType(); + // First, try to import the host memory as a buffer - if (g_current_game_profile->UseHostMemForCache() && m_mtlr->IsAppleGPU()) + if (m_bufferCacheType == BufferCacheType::Host && m_mtlr->IsAppleGPU()) { m_importedMemBaseAddress = 0x10000000; size_t hostAllocationSize = 0x40000000ull; // TODO: get size of allocation 
m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); - if (m_bufferCache) - m_useHostMemoryForCache = true; - else + if (!m_bufferCache) + { cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer"); + m_bufferCacheType = BufferCacheType::DevicePrivate; + } } - if (!m_useHostMemoryForCache) - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + if (!m_bufferCache) + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, (m_bufferCacheType == BufferCacheType::DevicePrivate ? MTL::ResourceStorageModePrivate : MTL::ResourceStorageModeShared)); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); @@ -141,24 +143,31 @@ void MetalMemoryManager::InitBufferCache(size_t size) void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) { - cemu_assert_debug(!m_useHostMemoryForCache); + cemu_assert_debug(m_bufferCacheType != BufferCacheType::Host); cemu_assert_debug(m_bufferCache); cemu_assert_debug((offset + size) <= m_bufferCache->length()); - auto allocation = m_tempBufferAllocator.GetBufferAllocation(size); - auto buffer = m_tempBufferAllocator.GetBufferOutsideOfCommandBuffer(allocation.bufferIndex); - memcpy((uint8*)buffer->contents() + allocation.offset, data, size); + if (m_bufferCacheType == BufferCacheType::DevicePrivate) + { + auto allocation = m_tempBufferAllocator.GetBufferAllocation(size); + auto buffer = m_tempBufferAllocator.GetBufferOutsideOfCommandBuffer(allocation.bufferIndex); + memcpy((uint8*)buffer->contents() + allocation.offset, data, size); - // Lock the buffer to make sure it's not deallocated before the copy is done - m_tempBufferAllocator.LockBuffer(allocation.bufferIndex); + // Lock the buffer to make sure it's not deallocated before the copy is done + m_tempBufferAllocator.LockBuffer(allocation.bufferIndex); - 
m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); - // Make sure the buffer has the right command buffer - m_tempBufferAllocator.GetBuffer(allocation.bufferIndex); // TODO: make a helper function for this + // Make sure the buffer has the right command buffer + m_tempBufferAllocator.GetBuffer(allocation.bufferIndex); // TODO: make a helper function for this - // We can now safely unlock the buffer - m_tempBufferAllocator.UnlockBuffer(allocation.bufferIndex); + // We can now safely unlock the buffer + m_tempBufferAllocator.UnlockBuffer(allocation.bufferIndex); + } + else + { + memcpy((uint8*)m_bufferCache->contents() + offset, data, size); + } // Notify vertex buffer cache about the change //m_vertexBufferCache.MemoryRangeChanged(offset, size); @@ -166,7 +175,11 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { + cemu_assert_debug(m_bufferCacheType != BufferCacheType::Host); cemu_assert_debug(m_bufferCache); - m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + if (m_bufferCacheType == BufferCacheType::DevicePrivate) + m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + else + memcpy((uint8*)m_bufferCache->contents() + dstOffset, (uint8*)m_bufferCache->contents() + srcOffset, size); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 4e8b25940..6cc4ab1ee 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -2,6 +2,8 @@ 
#include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h" +#include "GameProfile/GameProfile.h" + /* struct MetalRestridedBufferRange { @@ -118,7 +120,7 @@ class MetalMemoryManager // Getters bool UseHostMemoryForCache() const { - return m_useHostMemoryForCache; + return (m_bufferCacheType == BufferCacheType::Host); } MPTR GetImportedMemBaseAddress() const @@ -137,6 +139,6 @@ class MetalMemoryManager //MetalVertexBufferCache m_vertexBufferCache; MTL::Buffer* m_bufferCache = nullptr; - bool m_useHostMemoryForCache = false; + BufferCacheType m_bufferCacheType; MPTR m_importedMemBaseAddress; }; diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 988916eb6..02dc873ab 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -124,6 +124,14 @@ enum class AccurateShaderMulOption }; ENABLE_ENUM_ITERATORS(AccurateShaderMulOption, AccurateShaderMulOption::False, AccurateShaderMulOption::True); +enum class BufferCacheType +{ + DevicePrivate, + DeviceShared, + Host, +}; +ENABLE_ENUM_ITERATORS(BufferCacheType, BufferCacheType::DevicePrivate, BufferCacheType::Host); + enum class CPUMode { SinglecoreInterpreter = 0, @@ -222,6 +230,21 @@ struct fmt::formatter : formatter { } }; template <> +struct fmt::formatter : formatter { + template + auto format(const BufferCacheType c, FormatContext &ctx) const { + string_view name; + switch (c) + { + case BufferCacheType::DevicePrivate: name = "device private"; break; + case BufferCacheType::DeviceShared: name = "device shared"; break; + case BufferCacheType::Host: name = "host"; break; + default: name = "unknown"; break; + } + return formatter::format(name, ctx); + } +}; +template <> struct fmt::formatter : formatter { template auto format(const CPUMode c, FormatContext &ctx) const { diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index c1aa63e42..f54a8fb4e 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -8,6 +8,7 @@ #include #include +#include 
"config/CemuConfig.h" #include "gui/helpers/wxHelpers.h" #include "input/InputManager.h" @@ -127,12 +128,12 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) m_shader_mul_accuracy->SetToolTip(_("EXPERT OPTION\nControls the accuracy of floating point multiplication in shaders.\n\nRecommended: true")); first_row->Add(m_shader_mul_accuracy, 0, wxALL, 5); - first_row->Add(new wxStaticText(panel, wxID_ANY, _("Use host memory for cache")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Buffer cache type")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); - wxString mem_values[] = { _("false"), _("true")}; - m_use_host_mem_for_cache = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(mem_values), mem_values); - m_use_host_mem_for_cache->SetToolTip(_("EXPERT OPTION\nAllows the GPU to access data directly without the need for an intermediate cache. May increase performance and reduce memory usage, but can also cause flickering.\n\nMetal only\n\nRecommended: false")); - first_row->Add(m_use_host_mem_for_cache, 0, wxALL, 5); + wxString cache_values[] = { _("device private"), _("device shared"), _("host")}; + m_buffer_cache_type = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(cache_values), cache_values); + m_buffer_cache_type->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: device private")); + first_row->Add(m_buffer_cache_type, 0, wxALL, 5); /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString accuarcy_values[] = { _("high"), _("medium"), _("low") }; @@ -280,7 +281,7 @@ void GameProfileWindow::ApplyProfile() else m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan, Metal m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); - 
m_use_host_mem_for_cache->SetSelection((int)m_game_profile.m_useHostMemForCache); + m_buffer_cache_type->SetSelection((int)m_game_profile.m_bufferCacheType); //// audio //m_disable_audio->Set3StateValue(GetCheckboxState(m_game_profile.disableAudio)); @@ -340,7 +341,7 @@ void GameProfileWindow::SaveProfile() // gpu m_game_profile.m_accurateShaderMul = (AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); - m_game_profile.m_useHostMemForCache = (bool)m_use_host_mem_for_cache->GetSelection(); + m_game_profile.m_bufferCacheType = (BufferCacheType)m_buffer_cache_type->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value diff --git a/src/gui/GameProfileWindow.h b/src/gui/GameProfileWindow.h index a1fe8132c..22eda48d8 100644 --- a/src/gui/GameProfileWindow.h +++ b/src/gui/GameProfileWindow.h @@ -40,7 +40,7 @@ class GameProfileWindow : public wxFrame wxChoice* m_graphic_api; wxChoice* m_shader_mul_accuracy; - wxChoice* m_use_host_mem_for_cache; + wxChoice* m_buffer_cache_type; //wxChoice* m_cache_accuracy; // audio From 31c10bd288de2524549d2af694c1738d5606eda9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 16:35:47 +0100 Subject: [PATCH 239/368] move fast math option to game profile --- src/Cafe/CafeSystem.cpp | 2 +- src/Cafe/GameProfile/GameProfile.cpp | 4 ++++ src/Cafe/GameProfile/GameProfile.h | 2 ++ .../HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 6 +++--- src/config/CemuConfig.cpp | 2 -- src/config/CemuConfig.h | 1 - src/gui/GameProfileWindow.cpp | 11 ++++++++++- src/gui/GameProfileWindow.h | 1 + src/gui/GeneralSettings2.cpp | 9 --------- src/gui/GeneralSettings2.h | 2 +- 10 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 7ba93fc8b..a0c072b5a 100644 --- 
a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -257,7 +257,7 @@ void InfoLog_PrintActiveSettings() else if (ActiveSettings::GetGraphicsAPI() == GraphicAPI::kMetal) { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); - cemuLog_log(LogType::Force, "Fast math: {}", GetConfig().fast_math.GetValue() ? "true" : "false"); + cemuLog_log(LogType::Force, "Fast math: {}", g_current_game_profile->GetFastMath() ? "true" : "false"); cemuLog_log(LogType::Force, "Buffer cache type: {}", g_current_game_profile->GetBufferCacheType()); if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index a4ce8fe84..9389b2793 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -226,6 +226,7 @@ bool GameProfile::Load(uint64_t title_id) m_graphics_api = (GraphicAPI)graphicsApi.value; gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); + gameProfile_loadBooleanOption2(iniParser, "fastMath", m_fastMath); gameProfile_loadEnumOption(iniParser, "bufferCacheType", m_bufferCacheType); // legacy support @@ -309,6 +310,7 @@ void GameProfile::Save(uint64_t title_id) fs->writeLine("[Graphics]"); WRITE_ENTRY(accurateShaderMul); + WRITE_ENTRY(fastMath); WRITE_ENTRY(bufferCacheType); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); @@ -339,6 +341,7 @@ void GameProfile::ResetOptional() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_fastMath = true; m_bufferCacheType = BufferCacheType::DevicePrivate; // cpu settings m_threadQuantum = kThreadQuantumDefault; @@ -360,6 +363,7 @@ void GameProfile::Reset() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_fastMath = true; m_bufferCacheType = BufferCacheType::DevicePrivate; m_precompiledShaders = 
PrecompiledShaderOption::Auto; // cpu settings diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index 5c2d28d70..0f68bc3a5 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ b/src/Cafe/GameProfile/GameProfile.h @@ -31,6 +31,7 @@ class GameProfile [[nodiscard]] const std::optional& GetGraphicsAPI() const { return m_graphics_api; } [[nodiscard]] const AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } + [[nodiscard]] bool GetFastMath() const { return m_fastMath; } [[nodiscard]] BufferCacheType GetBufferCacheType() const { return m_bufferCacheType; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return m_precompiledShaders; } @@ -55,6 +56,7 @@ class GameProfile // graphic settings std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; + bool m_fastMath = false; BufferCacheType m_bufferCacheType = BufferCacheType::DevicePrivate; std::optional m_precompiledShaders{}; // cpu settings diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index c8babb14e..f8b5efe91 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -1,12 +1,12 @@ #include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + //#include "Cemu/FileCache/FileCache.h" //#include "config/ActiveSettings.h" - #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" -#include "config/CemuConfig.h" +#include "GameProfile/GameProfile.h" #include "util/helpers/helpers.h" static bool s_isLoadingShadersMtl{false}; @@ -174,7 +174,7 @@ void RendererShaderMtl::CompileInternal() { MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); // TODO: always disable fast math for problematic shaders 
- if (GetConfig().fast_math) + if (g_current_game_profile->GetFastMath()) options->setFastMathEnabled(true); NS::Error* error = nullptr; diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index 06ad94d5e..64c2f3559 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -219,7 +219,6 @@ void CemuConfig::Load(XMLConfigParser& parser) downscale_filter = graphic.get("DownscaleFilter", kLinearFilter); fullscreen_scaling = graphic.get("FullscreenScaling", kKeepAspectRatio); async_compile = graphic.get("AsyncCompile", async_compile); - fast_math = graphic.get("FastMath", fast_math); vk_accurate_barriers = graphic.get("vkAccurateBarriers", true); // this used to be "VulkanAccurateBarriers" but because we changed the default to true in 1.27.1 the option name had to be changed auto overlay_node = graphic.get("Overlay"); @@ -476,7 +475,6 @@ void CemuConfig::Save(XMLConfigParser& parser) graphic.set("DownscaleFilter", downscale_filter); graphic.set("FullscreenScaling", fullscreen_scaling); graphic.set("AsyncCompile", async_compile.GetValue()); - graphic.set("FastMath", fast_math.GetValue()); graphic.set("vkAccurateBarriers", vk_accurate_barriers); auto overlay_node = graphic.set("Overlay"); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 02dc873ab..32c87aa66 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -467,7 +467,6 @@ struct CemuConfig ConfigValue gx2drawdone_sync {true}; ConfigValue render_upside_down{ false }; ConfigValue async_compile{ true }; - ConfigValue fast_math{ true }; ConfigValue vk_accurate_barriers{ true }; diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index f54a8fb4e..b26a866d5 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -128,9 +128,16 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) m_shader_mul_accuracy->SetToolTip(_("EXPERT OPTION\nControls the accuracy of floating point multiplication 
in shaders.\n\nRecommended: true")); first_row->Add(m_shader_mul_accuracy, 0, wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Fast math")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString math_values[] = { _("false"), _("true") }; + m_fast_math = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(math_values), math_values); + m_fast_math->SetToolTip(_("Enables fast math for all shaders. May (rarely) cause graphical bugs.\n\nMetal only\n\nRecommended: true")); + first_row->Add(m_fast_math, 0, wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Buffer cache type")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); - wxString cache_values[] = { _("device private"), _("device shared"), _("host")}; + wxString cache_values[] = { _("device private"), _("device shared"), _("host") }; m_buffer_cache_type = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(cache_values), cache_values); m_buffer_cache_type->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: device private")); first_row->Add(m_buffer_cache_type, 0, wxALL, 5); @@ -281,6 +288,7 @@ void GameProfileWindow::ApplyProfile() else m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan, Metal m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); + m_fast_math->SetSelection((int)m_game_profile.m_fastMath); m_buffer_cache_type->SetSelection((int)m_game_profile.m_bufferCacheType); //// audio @@ -341,6 +349,7 @@ void GameProfileWindow::SaveProfile() // gpu m_game_profile.m_accurateShaderMul = (AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); + m_game_profile.m_fastMath = (bool)m_fast_math->GetSelection(); m_game_profile.m_bufferCacheType = (BufferCacheType)m_buffer_cache_type->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != 
AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value diff --git a/src/gui/GameProfileWindow.h b/src/gui/GameProfileWindow.h index 22eda48d8..88d5b4381 100644 --- a/src/gui/GameProfileWindow.h +++ b/src/gui/GameProfileWindow.h @@ -40,6 +40,7 @@ class GameProfileWindow : public wxFrame wxChoice* m_graphic_api; wxChoice* m_shader_mul_accuracy; + wxChoice* m_fast_math; wxChoice* m_buffer_cache_type; //wxChoice* m_cache_accuracy; diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 4812a8846..8b6e0ee15 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -344,10 +344,6 @@ wxPanel* GeneralSettings2::AddGraphicsPage(wxNotebook* notebook) m_async_compile->SetToolTip(_("Enables async shader and pipeline compilation. Reduces stutter at the cost of objects not rendering for a short time.\nVulkan only")); graphic_misc_row->Add(m_async_compile, 0, wxALL, 5); - m_fast_math = new wxCheckBox(box, wxID_ANY, _("Fast math")); - m_fast_math->SetToolTip(_("Enables fast math for all shaders. 
May cause minor inaccuracies in some games.\nMetal only")); - graphic_misc_row->Add(m_fast_math, 0, wxALL, 5); - m_gx2drawdone_sync = new wxCheckBox(box, wxID_ANY, _("Full sync at GX2DrawDone()")); m_gx2drawdone_sync->SetToolTip(_("If synchronization is requested by the game, the emulated CPU will wait for the GPU to finish all operations.\nThis is more accurate behavior, but may cause lower performance")); graphic_misc_row->Add(m_gx2drawdone_sync, 0, wxALL, 5); @@ -1042,7 +1038,6 @@ void GeneralSettings2::StoreConfig() config.vsync = m_vsync->GetSelection(); config.gx2drawdone_sync = m_gx2drawdone_sync->IsChecked(); config.async_compile = m_async_compile->IsChecked(); - config.fast_math = m_fast_math->IsChecked(); config.upscale_filter = m_upscale_filter->GetSelection(); config.downscale_filter = m_downscale_filter->GetSelection(); @@ -1520,14 +1515,12 @@ void GeneralSettings2::HandleGraphicsApiSelection() m_gx2drawdone_sync->Enable(); m_async_compile->Disable(); - m_fast_math->Disable(); } else if (m_graphic_api->GetSelection() == 1) { // Vulkan m_gx2drawdone_sync->Disable(); m_async_compile->Enable(); - m_fast_math->Disable(); m_vsync->AppendString(_("Off")); m_vsync->AppendString(_("Double buffering")); @@ -1565,7 +1558,6 @@ void GeneralSettings2::HandleGraphicsApiSelection() // Metal m_gx2drawdone_sync->Disable(); m_async_compile->Enable(); - m_fast_math->Enable(); // TODO: vsync options m_vsync->AppendString(_("Off")); @@ -1629,7 +1621,6 @@ void GeneralSettings2::ApplyConfig() m_graphic_api->SetSelection(config.graphic_api); m_vsync->SetSelection(config.vsync); m_async_compile->SetValue(config.async_compile); - m_fast_math->SetValue(config.fast_math); m_gx2drawdone_sync->SetValue(config.gx2drawdone_sync); m_upscale_filter->SetSelection(config.upscale_filter); m_downscale_filter->SetSelection(config.downscale_filter); diff --git a/src/gui/GeneralSettings2.h b/src/gui/GeneralSettings2.h index 01c4845fa..83ede03b0 100644 --- a/src/gui/GeneralSettings2.h +++ 
b/src/gui/GeneralSettings2.h @@ -52,7 +52,7 @@ class GeneralSettings2 : public wxDialog // Graphics wxChoice* m_graphic_api, * m_graphic_device; wxChoice* m_vsync; - wxCheckBox *m_async_compile, *m_fast_math, *m_gx2drawdone_sync; + wxCheckBox *m_async_compile, *m_gx2drawdone_sync; wxRadioBox* m_upscale_filter, *m_downscale_filter, *m_fullscreen_scaling; wxChoice* m_overlay_position, *m_notification_position, *m_overlay_scale, *m_notification_scale; wxCheckBox* m_controller_profile_name, *m_controller_low_battery, *m_shader_compiling, *m_friends_data; From e00d244e0df7c6844a556e4c3c64bcb95592fa44 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 16:45:13 +0100 Subject: [PATCH 240/368] remove unused code --- .../Renderer/Metal/MetalMemoryManager.cpp | 93 ---------- .../Latte/Renderer/Metal/MetalMemoryManager.h | 83 +-------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 159 +----------------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 29 +--- 4 files changed, 5 insertions(+), 359 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index cd041c5aa..4de12549b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -5,96 +5,6 @@ #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" -/* -MetalVertexBufferCache::~MetalVertexBufferCache() -{ -} - -MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride, std::vector& barrierBuffers) -{ - auto vertexBufferRange = m_bufferRanges[bufferIndex]; - auto& restrideInfo = *vertexBufferRange.restrideInfo; - - if (stride % 4 == 0) - { - // No restride needed - return {bufferCache, vertexBufferRange.offset}; - } - - MTL::Buffer* buffer; - if (restrideInfo.memoryInvalidated || stride != restrideInfo.lastStride) - { - size_t newStride = Align(stride, 4); - size_t 
newSize = vertexBufferRange.size / stride * newStride; - restrideInfo.allocation = m_bufferAllocator.GetBufferAllocation(newSize); - buffer = m_bufferAllocator.GetBuffer(restrideInfo.allocation.bufferIndex); - - //uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; - //uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.offset; - - //for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) - // memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); - - if (m_mtlr->GetEncoderType() == MetalEncoderType::Render) - { - auto renderCommandEncoder = static_cast(m_mtlr->GetCommandEncoder()); - - renderCommandEncoder->setRenderPipelineState(m_restrideBufferPipeline->GetRenderPipelineState()); - m_mtlr->GetEncoderState().m_renderPipelineState = m_restrideBufferPipeline->GetRenderPipelineState(); - - m_mtlr->SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, bufferCache, vertexBufferRange.offset, GET_HELPER_BUFFER_BINDING(0)); - m_mtlr->SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, buffer, restrideInfo.allocation.offset, GET_HELPER_BUFFER_BINDING(1)); - - struct - { - uint32 oldStride; - uint32 newStride; - } strideData = {static_cast(stride), static_cast(newStride)}; - renderCommandEncoder->setVertexBytes(&strideData, sizeof(strideData), GET_HELPER_BUFFER_BINDING(2)); - m_mtlr->GetEncoderState().m_buffers[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(2)] = {nullptr}; - - renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangleStrip, NS::UInteger(0), vertexBufferRange.size / stride); - - vectorAppendUnique(barrierBuffers, static_cast(buffer)); - } - else - { - cemu_assert_suspicious(); - } - - restrideInfo.memoryInvalidated = false; - restrideInfo.lastStride = newStride; - - // Debug - m_mtlr->GetPerformanceMonitor().m_vertexBufferRestrides++; - } - else - { - buffer = m_bufferAllocator.GetBuffer(restrideInfo.allocation.bufferIndex); - } - - return {buffer, 
restrideInfo.allocation.offset}; -} - -void MetalVertexBufferCache::MemoryRangeChanged(size_t offset, size_t size) -{ - for (uint32 i = 0; i < LATTE_MAX_VERTEX_BUFFERS; i++) - { - auto vertexBufferRange = m_bufferRanges[i]; - if (vertexBufferRange.offset != INVALID_OFFSET) - { - if ((offset < vertexBufferRange.offset && (offset + size) < (vertexBufferRange.offset + vertexBufferRange.size)) || - (offset > vertexBufferRange.offset && (offset + size) > (vertexBufferRange.offset + vertexBufferRange.size))) - { - continue; - } - - vertexBufferRange.restrideInfo->memoryInvalidated = true; - } - } -} -*/ - MetalMemoryManager::~MetalMemoryManager() { if (m_bufferCache) @@ -168,9 +78,6 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si { memcpy((uint8*)m_bufferCache->contents() + offset, data, size); } - - // Notify vertex buffer cache about the change - //m_vertexBufferCache.MemoryRangeChanged(offset, size); } void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 6cc4ab1ee..ecae3d3ee 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -4,74 +4,12 @@ #include "GameProfile/GameProfile.h" -/* -struct MetalRestridedBufferRange -{ - MTL::Buffer* buffer; - size_t offset; -}; - -struct MetalVertexBufferRange -{ - size_t offset = INVALID_OFFSET; - size_t size; - MetalRestrideInfo* restrideInfo; -}; - -class MetalVertexBufferCache -{ -public: - friend class MetalMemoryManager; - - MetalVertexBufferCache(class MetalRenderer* metalRenderer, MetalDefaultBufferAllocator& bufferAllocator) : m_mtlr{metalRenderer}, m_bufferAllocator{bufferAllocator} {} - ~MetalVertexBufferCache(); - - void SetRestrideBufferPipeline(class MetalVoidVertexPipeline* restrideBufferPipeline) - { - m_restrideBufferPipeline = 
restrideBufferPipeline; - } - - void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo* restrideInfo) - { - m_bufferRanges[bufferIndex] = MetalVertexBufferRange{offset, size, restrideInfo}; - } - - void UntrackVertexBuffer(uint32 bufferIndex) - { - auto& range = m_bufferRanges[bufferIndex]; - //if (range.restrideInfo->allocation.offset != INVALID_OFFSET) - // m_bufferAllocator.FreeAllocation(range.restrideInfo->allocation); - range.offset = INVALID_OFFSET; - } - - MetalRestridedBufferRange RestrideBufferIfNeeded(MTL::Buffer* bufferCache, uint32 bufferIndex, size_t stride, std::vector& barrierBuffers); - -private: - class MetalRenderer* m_mtlr; - MetalDefaultBufferAllocator& m_bufferAllocator; - - class MetalVoidVertexPipeline* m_restrideBufferPipeline = nullptr; - - MetalVertexBufferRange m_bufferRanges[LATTE_MAX_VERTEX_BUFFERS] = {}; - - void MemoryRangeChanged(size_t offset, size_t size); -}; -*/ - class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer)/*, m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator)*/ {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer) {} ~MetalMemoryManager(); - // Pipelines - /* - void SetRestrideBufferPipeline(class MetalVoidVertexPipeline* restrideBufferPipeline) - { - m_vertexBufferCache.SetRestrideBufferPipeline(restrideBufferPipeline); - } - */ - MetalDefaultBufferAllocator& GetBufferAllocator() { return m_bufferAllocator; @@ -99,24 +37,6 @@ class MetalMemoryManager void UploadToBufferCache(const void* data, 
size_t offset, size_t size); void CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size); - // Vertex buffer cache - /* - void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo* restrideInfo) - { - m_vertexBufferCache.TrackVertexBuffer(bufferIndex, offset, size, restrideInfo); - } - - void UntrackVertexBuffer(uint32 bufferIndex) - { - m_vertexBufferCache.UntrackVertexBuffer(bufferIndex); - } - - MetalRestridedBufferRange RestrideBufferIfNeeded(uint32 bufferIndex, size_t stride, std::vector& barrierBuffers) - { - return m_vertexBufferCache.RestrideBufferIfNeeded(m_bufferCache, bufferIndex, stride, barrierBuffers); - } - */ - // Getters bool UseHostMemoryForCache() const { @@ -136,7 +56,6 @@ class MetalMemoryManager MetalDefaultBufferAllocator m_bufferAllocator; MetalDefaultBufferAllocator m_framePersistentBufferAllocator; MetalTemporaryBufferAllocator m_tempBufferAllocator; - //MetalVertexBufferCache m_vertexBufferCache; MTL::Buffer* m_bufferCache = nullptr; BufferCacheType m_bufferCacheType; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 7c80a0bc8..93d3c08d7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -124,54 +124,11 @@ MetalRenderer::MetalRenderer() return; } - // Present pipeline - /* - MTL::Function* fullscreenVertexFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); - MTL::Function* presentFragmentFunction = utilityLibrary->newFunction(ToNSString("fragmentPresent")); - - MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); - renderPipelineDescriptor->setVertexFunction(fullscreenVertexFunction); - renderPipelineDescriptor->setFragmentFunction(presentFragmentFunction); - fullscreenVertexFunction->release(); - presentFragmentFunction->release(); - - error = nullptr; - 
renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatBGRA8Unorm); -#ifdef CEMU_DEBUG_ASSERT - renderPipelineDescriptor->setLabel(GetLabel("Present pipeline linear", renderPipelineDescriptor)); -#endif - m_presentPipelineLinear = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); - if (error) - { - debug_printf("failed to create linear present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); - error->release(); - } - - error = nullptr; - renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatBGRA8Unorm_sRGB); -#ifdef CEMU_DEBUG_ASSERT - renderPipelineDescriptor->setLabel(GetLabel("Present pipeline sRGB", renderPipelineDescriptor)); -#endif - m_presentPipelineSRGB = m_device->newRenderPipelineState(renderPipelineDescriptor, &error); - renderPipelineDescriptor->release(); - if (error) - { - debug_printf("failed to create sRGB present pipeline (error: %s)\n", error->localizedDescription()->utf8String()); - error->release(); - } - */ - - // Copy texture pipelines - auto copyTextureToColorPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); - - // Hybrid pipelines + // Void vertex pipelines if (m_isAppleGPU) m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); - //m_copyTextureToTexturePipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyTextureToTexture"); - //m_restrideBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexRestrideBuffer"); - utilityLibrary->release(); - //m_memoryManager->SetRestrideBufferPipeline(m_restrideBufferPipeline); + utilityLibrary->release(); } MetalRenderer::~MetalRenderer() @@ -741,74 +698,6 @@ void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* so //sourceTexture->GetEffectiveSize(sourceEffectiveWidth, sourceEffectiveHeight, srcMip); texture_copyImageSubData(sourceTexture, srcMip, 0, 0, srcSlice, 
destinationTexture, dstMip, 0, 0, dstSlice, effectiveCopyWidth, effectiveCopyHeight, 1); - - /* - sint32 texSrcMip = srcMip; - sint32 texSrcSlice = srcSlice; - sint32 texDstMip = dstMip; - sint32 texDstSlice = dstSlice; - - // Create texture views - LatteTextureViewMtl* srcTextureMtl = static_cast(sourceTexture->GetOrCreateView(srcMip, 1, srcSlice, 1)); - LatteTextureViewMtl* dstTextureMtl = static_cast(destinationTexture->GetOrCreateView(dstMip, 1, dstSlice, 1)); - - // check if texture rescale ratios match - // todo - if not, we have to use drawcall based copying - if (!LatteTexture_doesEffectiveRescaleRatioMatch(sourceTexture, texSrcMip, destinationTexture, texDstMip)) - { - cemuLog_logDebug(LogType::Force, "surfaceCopy_copySurfaceWithFormatConversion(): Mismatching dimensions"); - return; - } - - // check if bpp size matches - if (sourceTexture->GetBPP() != destinationTexture->GetBPP()) - { - cemuLog_logDebug(LogType::Force, "surfaceCopy_copySurfaceWithFormatConversion(): Mismatching BPP"); - return; - } - - if (m_encoderType == MetalEncoderType::Render) - { - auto renderCommandEncoder = static_cast(m_commandEncoder); - - renderCommandEncoder->setRenderPipelineState(m_copyTextureToTexturePipeline->GetRenderPipelineState()); - m_state.m_encoderState.m_renderPipelineState = m_copyTextureToTexturePipeline->GetRenderPipelineState(); - - SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, srcTextureMtl->GetRGBAView(), GET_HELPER_TEXTURE_BINDING(0)); - SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, dstTextureMtl->GetRGBAView(), GET_HELPER_TEXTURE_BINDING(1)); - renderCommandEncoder->setVertexBytes(&effectiveCopyWidth, sizeof(effectiveCopyWidth), GET_HELPER_BUFFER_BINDING(0)); - m_state.m_encoderState.m_buffers[METAL_SHADER_TYPE_VERTEX][GET_HELPER_BUFFER_BINDING(0)] = {nullptr}; - - renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); - } - else - { - bool copyingToWholeRegion = ((effectiveCopyWidth == 
dstTextureMtl->GetMipWidth(dstMip) && effectiveCopyHeight == dstTextureMtl->GetMipHeight(dstMip))); - - auto renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); - auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); - colorAttachment->setTexture(dstTextureMtl->GetTexture()); - // We don't care about previous contents if we are about to overwrite the whole region - colorAttachment->setLoadAction(copyingToWholeRegion ? MTL::LoadActionDontCare : MTL::LoadActionLoad); - colorAttachment->setStoreAction(MTL::StoreActionStore); - colorAttachment->setSlice(dstSlice); - colorAttachment->setLevel(dstMip); - - auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor); - - auto pipeline = (srcTextureMtl->IsDepth() ? m_copyTextureToColorPipeline : m_copyTextureToDepthPipeline); - renderCommandEncoder->setRenderPipelineState(pipeline); - - renderCommandEncoder->setFragmentTexture(srcTextureMtl->GetTexture(), GET_HELPER_TEXTURE_BINDING(0)); - renderCommandEncoder->setFragmentBytes(&effectiveCopyWidth, offsetof(effectiveCopyWidth), GET_HELPER_BUFFER_BINDING(0)); - - renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); - - EndEncoding(); - - debug_printf("surface copy with no render command encoder, skipping copy\n"); - } - */ } void MetalRenderer::bufferCache_init(const sint32 bufferSize) @@ -840,19 +729,6 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS); m_state.m_vertexBufferOffsets[bufferIndex] = offset; - //if (buffer.offset == offset && buffer.size == size) - // return; - - //if (buffer.offset != INVALID_OFFSET) - //{ - // m_memoryManager->UntrackVertexBuffer(bufferIndex); - //} - - //buffer.offset = offset; - //buffer.size = size; - //buffer.restrideInfo = {}; - - //m_memoryManager->TrackVertexBuffer(bufferIndex, offset, size, &buffer.restrideInfo); } void 
MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) @@ -1218,45 +1094,16 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Resources // Vertex buffers - //std::vector barrierBuffers; for (uint8 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) { size_t offset = m_state.m_vertexBufferOffsets[i]; if (offset != INVALID_OFFSET) { - /* - MTL::Buffer* buffer; - size_t offset; - - // Restride - if (usesGeometryShader) - { - // Object shaders don't need restriding, since the attributes are fetched in the shader - buffer = m_memoryManager->GetBufferCache(); - offset = m_state.m_vertexBuffers[i].offset; - } - else - { - uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + i * 7; - uint32 bufferStride = (LatteGPUState.contextNew.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - - auto restridedBuffer = m_memoryManager->RestrideBufferIfNeeded(i, bufferStride, barrierBuffers); - - buffer = restridedBuffer.buffer; - offset = restridedBuffer.offset; - } - */ - // Bind SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), m_memoryManager->GetBufferCache(), offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); } } - //if (!barrierBuffers.empty()) - //{ - // renderCommandEncoder->memoryBarrier(barrierBuffers.data(), barrierBuffers.size(), MTL::RenderStageVertex, MTL::RenderStageVertex); - //} - // Prepare streamout m_state.m_streamoutState.verticesPerInstance = count; LatteStreamout_PrepareDrawcall(count, instanceCount); @@ -1370,8 +1217,6 @@ void MetalRenderer::draw_updateVertexBuffersDirectAccess() uint32 bufferIndex = bufferGroup.attributeBufferIndex; uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0]; - //uint32 bufferSize = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 1] + 1; - //uint32 bufferStride = 
(LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; if (bufferAddress == MPTR_NULL) [[unlikely]] bufferAddress = 0x10000000; // TODO: really? diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 9ddc5e93f..de416825c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -20,25 +20,6 @@ struct MetalBufferAllocation } }; -/* -struct MetalRestrideInfo -{ - bool memoryInvalidated = true; - size_t lastStride = 0; - MetalBufferAllocation allocation{}; -}; -*/ - -/* -struct MetalBoundBuffer -{ - size_t offset = INVALID_OFFSET; - size_t size = 0; - // Memory manager will write restride info to this variable - //MetalRestrideInfo restrideInfo; -}; -*/ - enum MetalGeneralShaderType { METAL_GENERAL_SHADER_TYPE_VERTEX, @@ -143,7 +124,7 @@ struct MetalState // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change MetalActiveFBOState m_lastUsedFBO; - size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS] = {INVALID_OFFSET}; + size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS]; // TODO: find out what is the max number of bound textures on the Wii U class LatteTextureViewMtl* m_textures[64] = {nullptr}; size_t m_uniformBufferOffsets[METAL_GENERAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; @@ -472,14 +453,8 @@ class MetalRenderer : public Renderer class MetalDepthStencilCache* m_depthStencilCache; class MetalSamplerCache* m_samplerCache; - // Pipelines - //MTL::RenderPipelineState* m_presentPipelineLinear; - //MTL::RenderPipelineState* m_presentPipelineSRGB; - - // Hybrid pipelines + // Void vertex pipelines class MetalVoidVertexPipeline* m_copyBufferToBufferPipeline; - //class MetalVoidVertexPipeline* m_copyTextureToTexturePipeline; - //class MetalVoidVertexPipeline* m_restrideBufferPipeline; // Resources MTL::SamplerState* m_nearestSampler; From 
fbea328b9be0af9ac2ed719f5497a544403627c5 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 16:58:32 +0100 Subject: [PATCH 241/368] correctly report memory usage for host buffer cache --- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 5 ++--- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h | 6 ++++++ src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 ++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 4de12549b..4f27ccf57 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -33,9 +33,8 @@ void MetalMemoryManager::InitBufferCache(size_t size) if (m_bufferCacheType == BufferCacheType::Host && m_mtlr->IsAppleGPU()) { m_importedMemBaseAddress = 0x10000000; - size_t hostAllocationSize = 0x40000000ull; - // TODO: get size of allocation - m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); + m_hostAllocationSize = 0x40000000ull; // TODO: get size of allocation + m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), m_hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); if (!m_bufferCache) { cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index ecae3d3ee..8f3c4d4ca 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -48,6 +48,11 @@ class MetalMemoryManager return m_importedMemBaseAddress; } + size_t GetHostAllocationSize() const + { + return m_hostAllocationSize; + } + private: class MetalRenderer* m_mtlr; @@ -60,4 +65,5 @@ 
class MetalMemoryManager MTL::Buffer* m_bufferCache = nullptr; BufferCacheType m_bufferCacheType; MPTR m_importedMemBaseAddress; + size_t m_hostAllocationSize = 0; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 93d3c08d7..b19ef07a7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -204,7 +204,8 @@ bool MetalRenderer::IsPadWindowActive() bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const { - usageInMB = m_device->currentAllocatedSize() / 1024 / 1024; + // Subtract host memory from total VRAM, since it's shared with the CPU + usageInMB = (m_device->currentAllocatedSize() - m_memoryManager->GetHostAllocationSize()) / 1024 / 1024; totalInMB = m_recommendedMaxVRAMUsage / 1024 / 1024; return true; From a00d409ab764feae40d7a896e11b872b88c2ca2a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 4 Nov 2024 16:03:34 +0100 Subject: [PATCH 242/368] improve host and device shared buffer cache modes --- .../Renderer/Metal/MetalMemoryManager.cpp | 23 +++++++++++++------ .../Latte/Renderer/Metal/MetalMemoryManager.h | 5 ++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 ++-- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 4f27ccf57..2d0688845 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -4,6 +4,7 @@ #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" +#include "HW/MMU/MMU.h" MetalMemoryManager::~MetalMemoryManager() { @@ -30,15 +31,23 @@ void MetalMemoryManager::InitBufferCache(size_t size) m_bufferCacheType = g_current_game_profile->GetBufferCacheType(); // First, try to import the host memory as a buffer - if (m_bufferCacheType == BufferCacheType::Host && 
m_mtlr->IsAppleGPU()) + if (m_bufferCacheType == BufferCacheType::Host) { - m_importedMemBaseAddress = 0x10000000; - m_hostAllocationSize = 0x40000000ull; // TODO: get size of allocation - m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), m_hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); - if (!m_bufferCache) + if (m_mtlr->HasUnifiedMemory()) { - cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer"); - m_bufferCacheType = BufferCacheType::DevicePrivate; + m_importedMemBaseAddress = mmuRange_MEM2.getBase(); + m_hostAllocationSize = mmuRange_MEM2.getSize(); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), m_hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); + if (!m_bufferCache) + { + cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer, using device shared mode instead"); + m_bufferCacheType = BufferCacheType::DeviceShared; + } + } + else + { + cemuLog_logDebug(LogType::Force, "Host buffer cache mode is only available on unified memory systems, using device shared mode instead"); + m_bufferCacheType = BufferCacheType::DeviceShared; } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 8f3c4d4ca..a35df24c8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -43,6 +43,11 @@ class MetalMemoryManager return (m_bufferCacheType == BufferCacheType::Host); } + bool NeedsReducedLatency() const + { + return (m_bufferCacheType == BufferCacheType::DeviceShared || m_bufferCacheType == BufferCacheType::Host); + } + MPTR GetImportedMemBaseAddress() const { return m_importedMemBaseAddress; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b19ef07a7..20f8f6785 100644 
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -88,8 +88,8 @@ MetalRenderer::MetalRenderer() m_depthStencilCache = new MetalDepthStencilCache(this); m_samplerCache = new MetalSamplerCache(this); - // Lower the commit treshold when host memory is used for cache to reduce latency - if (m_memoryManager->UseHostMemoryForCache()) + // Lower the commit treshold when buffer cache needs reduced latency + if (m_memoryManager->NeedsReducedLatency()) m_defaultCommitTreshlod = 64; else m_defaultCommitTreshlod = 196; From 2e93b08b3991370390ede71368a59f4f01a5e484 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 4 Nov 2024 16:10:37 +0100 Subject: [PATCH 243/368] rename buffer cache type to buffer cache mode --- src/Cafe/CafeSystem.cpp | 2 +- src/Cafe/GameProfile/GameProfile.cpp | 8 ++++---- src/Cafe/GameProfile/GameProfile.h | 4 ++-- .../Renderer/Metal/MetalMemoryManager.cpp | 18 +++++++++--------- .../Latte/Renderer/Metal/MetalMemoryManager.h | 6 +++--- src/config/CemuConfig.h | 14 +++++++------- src/gui/GameProfileWindow.cpp | 12 ++++++------ src/gui/GameProfileWindow.h | 2 +- 8 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index a0c072b5a..f8c650cb3 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -258,7 +258,7 @@ void InfoLog_PrintActiveSettings() { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); cemuLog_log(LogType::Force, "Fast math: {}", g_current_game_profile->GetFastMath() ? 
"true" : "false"); - cemuLog_log(LogType::Force, "Buffer cache type: {}", g_current_game_profile->GetBufferCacheType()); + cemuLog_log(LogType::Force, "Buffer cache type: {}", g_current_game_profile->GetBufferCacheMode()); if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index 9389b2793..f8e1305c7 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -227,7 +227,7 @@ bool GameProfile::Load(uint64_t title_id) gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); gameProfile_loadBooleanOption2(iniParser, "fastMath", m_fastMath); - gameProfile_loadEnumOption(iniParser, "bufferCacheType", m_bufferCacheType); + gameProfile_loadEnumOption(iniParser, "bufferCacheMode", m_bufferCacheMode); // legacy support auto option_precompiledShaders = iniParser.FindOption("precompiledShaders"); @@ -311,7 +311,7 @@ void GameProfile::Save(uint64_t title_id) fs->writeLine("[Graphics]"); WRITE_ENTRY(accurateShaderMul); WRITE_ENTRY(fastMath); - WRITE_ENTRY(bufferCacheType); + WRITE_ENTRY(bufferCacheMode); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); fs->writeLine(""); @@ -342,7 +342,7 @@ void GameProfile::ResetOptional() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; m_fastMath = true; - m_bufferCacheType = BufferCacheType::DevicePrivate; + m_bufferCacheMode = BufferCacheMode::DevicePrivate; // cpu settings m_threadQuantum = kThreadQuantumDefault; m_cpuMode.reset(); // CPUModeOption::kSingleCoreRecompiler; @@ -364,7 +364,7 @@ void GameProfile::Reset() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; m_fastMath = true; - m_bufferCacheType = BufferCacheType::DevicePrivate; + m_bufferCacheMode = BufferCacheMode::DevicePrivate; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu 
settings m_threadQuantum = kThreadQuantumDefault; diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index 0f68bc3a5..078a70a24 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ b/src/Cafe/GameProfile/GameProfile.h @@ -32,7 +32,7 @@ class GameProfile [[nodiscard]] const std::optional& GetGraphicsAPI() const { return m_graphics_api; } [[nodiscard]] const AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } [[nodiscard]] bool GetFastMath() const { return m_fastMath; } - [[nodiscard]] BufferCacheType GetBufferCacheType() const { return m_bufferCacheType; } + [[nodiscard]] BufferCacheMode GetBufferCacheMode() const { return m_bufferCacheMode; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return m_precompiledShaders; } [[nodiscard]] uint32 GetThreadQuantum() const { return m_threadQuantum; } @@ -57,7 +57,7 @@ class GameProfile std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; bool m_fastMath = false; - BufferCacheType m_bufferCacheType = BufferCacheType::DevicePrivate; + BufferCacheMode m_bufferCacheMode = BufferCacheMode::DevicePrivate; std::optional m_precompiledShaders{}; // cpu settings uint32 m_threadQuantum = kThreadQuantumDefault; // values: 20000 45000 60000 80000 100000 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 2d0688845..8e0c141fa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -28,10 +28,10 @@ void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); - m_bufferCacheType = g_current_game_profile->GetBufferCacheType(); + m_bufferCacheMode = g_current_game_profile->GetBufferCacheMode(); // First, try to import the host memory as a buffer - if (m_bufferCacheType == BufferCacheType::Host) + if 
(m_bufferCacheMode == BufferCacheMode::Host) { if (m_mtlr->HasUnifiedMemory()) { @@ -41,18 +41,18 @@ void MetalMemoryManager::InitBufferCache(size_t size) if (!m_bufferCache) { cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer, using device shared mode instead"); - m_bufferCacheType = BufferCacheType::DeviceShared; + m_bufferCacheMode = BufferCacheMode::DeviceShared; } } else { cemuLog_logDebug(LogType::Force, "Host buffer cache mode is only available on unified memory systems, using device shared mode instead"); - m_bufferCacheType = BufferCacheType::DeviceShared; + m_bufferCacheMode = BufferCacheMode::DeviceShared; } } if (!m_bufferCache) - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, (m_bufferCacheType == BufferCacheType::DevicePrivate ? MTL::ResourceStorageModePrivate : MTL::ResourceStorageModeShared)); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, (m_bufferCacheMode == BufferCacheMode::DevicePrivate ? MTL::ResourceStorageModePrivate : MTL::ResourceStorageModeShared)); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); @@ -61,11 +61,11 @@ void MetalMemoryManager::InitBufferCache(size_t size) void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) { - cemu_assert_debug(m_bufferCacheType != BufferCacheType::Host); + cemu_assert_debug(m_bufferCacheMode != BufferCacheMode::Host); cemu_assert_debug(m_bufferCache); cemu_assert_debug((offset + size) <= m_bufferCache->length()); - if (m_bufferCacheType == BufferCacheType::DevicePrivate) + if (m_bufferCacheMode == BufferCacheMode::DevicePrivate) { auto allocation = m_tempBufferAllocator.GetBufferAllocation(size); auto buffer = m_tempBufferAllocator.GetBufferOutsideOfCommandBuffer(allocation.bufferIndex); @@ -90,10 +90,10 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { - 
cemu_assert_debug(m_bufferCacheType != BufferCacheType::Host); + cemu_assert_debug(m_bufferCacheMode != BufferCacheMode::Host); cemu_assert_debug(m_bufferCache); - if (m_bufferCacheType == BufferCacheType::DevicePrivate) + if (m_bufferCacheMode == BufferCacheMode::DevicePrivate) m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); else memcpy((uint8*)m_bufferCache->contents() + dstOffset, (uint8*)m_bufferCache->contents() + srcOffset, size); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index a35df24c8..f74d30d06 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -40,12 +40,12 @@ class MetalMemoryManager // Getters bool UseHostMemoryForCache() const { - return (m_bufferCacheType == BufferCacheType::Host); + return (m_bufferCacheMode == BufferCacheMode::Host); } bool NeedsReducedLatency() const { - return (m_bufferCacheType == BufferCacheType::DeviceShared || m_bufferCacheType == BufferCacheType::Host); + return (m_bufferCacheMode == BufferCacheMode::DeviceShared || m_bufferCacheMode == BufferCacheMode::Host); } MPTR GetImportedMemBaseAddress() const @@ -68,7 +68,7 @@ class MetalMemoryManager MetalTemporaryBufferAllocator m_tempBufferAllocator; MTL::Buffer* m_bufferCache = nullptr; - BufferCacheType m_bufferCacheType; + BufferCacheMode m_bufferCacheMode; MPTR m_importedMemBaseAddress; size_t m_hostAllocationSize = 0; }; diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 32c87aa66..68170456d 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -124,13 +124,13 @@ enum class AccurateShaderMulOption }; ENABLE_ENUM_ITERATORS(AccurateShaderMulOption, AccurateShaderMulOption::False, AccurateShaderMulOption::True); -enum class BufferCacheType +enum class BufferCacheMode { DevicePrivate, DeviceShared, Host, }; 
-ENABLE_ENUM_ITERATORS(BufferCacheType, BufferCacheType::DevicePrivate, BufferCacheType::Host); +ENABLE_ENUM_ITERATORS(BufferCacheMode, BufferCacheMode::DevicePrivate, BufferCacheMode::Host); enum class CPUMode { @@ -230,15 +230,15 @@ struct fmt::formatter : formatter { } }; template <> -struct fmt::formatter : formatter { +struct fmt::formatter : formatter { template - auto format(const BufferCacheType c, FormatContext &ctx) const { + auto format(const BufferCacheMode c, FormatContext &ctx) const { string_view name; switch (c) { - case BufferCacheType::DevicePrivate: name = "device private"; break; - case BufferCacheType::DeviceShared: name = "device shared"; break; - case BufferCacheType::Host: name = "host"; break; + case BufferCacheMode::DevicePrivate: name = "device private"; break; + case BufferCacheMode::DeviceShared: name = "device shared"; break; + case BufferCacheMode::Host: name = "host"; break; default: name = "unknown"; break; } return formatter::format(name, ctx); diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index b26a866d5..120dd5e9a 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -135,12 +135,12 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) m_fast_math->SetToolTip(_("Enables fast math for all shaders. 
May (rarely) cause graphical bugs.\n\nMetal only\n\nRecommended: true")); first_row->Add(m_fast_math, 0, wxALL, 5); - first_row->Add(new wxStaticText(panel, wxID_ANY, _("Buffer cache type")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Buffer cache mode")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString cache_values[] = { _("device private"), _("device shared"), _("host") }; - m_buffer_cache_type = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(cache_values), cache_values); - m_buffer_cache_type->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: device private")); - first_row->Add(m_buffer_cache_type, 0, wxALL, 5); + m_buffer_cache_mode = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(cache_values), cache_values); + m_buffer_cache_mode->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: device private")); + first_row->Add(m_buffer_cache_mode, 0, wxALL, 5); /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString accuarcy_values[] = { _("high"), _("medium"), _("low") }; @@ -289,7 +289,7 @@ void GameProfileWindow::ApplyProfile() m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan, Metal m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); m_fast_math->SetSelection((int)m_game_profile.m_fastMath); - m_buffer_cache_type->SetSelection((int)m_game_profile.m_bufferCacheType); + m_buffer_cache_mode->SetSelection((int)m_game_profile.m_bufferCacheMode); //// audio //m_disable_audio->Set3StateValue(GetCheckboxState(m_game_profile.disableAudio)); @@ -350,7 +350,7 @@ void GameProfileWindow::SaveProfile() // gpu m_game_profile.m_accurateShaderMul = 
(AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); m_game_profile.m_fastMath = (bool)m_fast_math->GetSelection(); - m_game_profile.m_bufferCacheType = (BufferCacheType)m_buffer_cache_type->GetSelection(); + m_game_profile.m_bufferCacheMode = (BufferCacheMode)m_buffer_cache_mode->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value diff --git a/src/gui/GameProfileWindow.h b/src/gui/GameProfileWindow.h index 88d5b4381..8bf0c91c8 100644 --- a/src/gui/GameProfileWindow.h +++ b/src/gui/GameProfileWindow.h @@ -41,7 +41,7 @@ class GameProfileWindow : public wxFrame wxChoice* m_shader_mul_accuracy; wxChoice* m_fast_math; - wxChoice* m_buffer_cache_type; + wxChoice* m_buffer_cache_mode; //wxChoice* m_cache_accuracy; // audio From 27902009c6980af5c262931de9a7c45559eabe03 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 4 Nov 2024 19:03:59 +0100 Subject: [PATCH 244/368] separate metal shader cache --- src/Cafe/HW/Latte/Core/LatteShaderCache.cpp | 6 +++++- src/gui/components/wxGameList.cpp | 20 +++++++++++--------- src/tools/ShaderCacheMerger.cpp | 4 ++++ 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index f1299e406..a76df8c60 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -257,7 +257,11 @@ void LatteShaderCache_Load() #endif // get cache file name - const auto pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); + fs::path pathGeneric; + if (g_renderer->GetType() == RendererAPI::Metal) + pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_mtlshaders.bin", cacheTitleId); + else + pathGeneric = 
ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); const auto pathGenericPre1_25_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}.bin", cacheTitleId); // before 1.25.0 const auto pathGenericPre1_16_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:08x}.bin", CafeSystem::GetRPXHashBase()); // before 1.16.0 diff --git a/src/gui/components/wxGameList.cpp b/src/gui/components/wxGameList.cpp index 6cbb58594..509c46622 100644 --- a/src/gui/components/wxGameList.cpp +++ b/src/gui/components/wxGameList.cpp @@ -70,7 +70,9 @@ std::list _getCachesPaths(const TitleId& titleId) ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_spirv.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_gl.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_shaders.bin", titleId), - ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_vkpipeline.bin", titleId)}; + ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_mtlshaders.bin", titleId), + ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_vkpipeline.bin", titleId), + ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_mtlpipeline.bin", titleId)}; cachePaths.remove_if( [](const fs::path& cachePath) @@ -200,13 +202,13 @@ void wxGameList::OnGameListSize(wxSizeEvent &event) for(int i = GetColumnCount() - 1; i > 0; i--) { #ifdef wxHAS_LISTCTRL_COLUMN_ORDER - if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) + if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) { last_col_index = GetColumnIndexFromOrder(i); break; } #else - if(GetColumnWidth(i) > 0) + if(GetColumnWidth(i) > 0) { last_col_index = i; break; @@ -938,13 +940,13 @@ void wxGameList::OnColumnBeginResize(wxListEvent& event) for(int i = GetColumnCount() - 1; i > 0; i--) { #ifdef wxHAS_LISTCTRL_COLUMN_ORDER - if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) + if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 
0) { last_col_index = GetColumnIndexFromOrder(i); break; } #else - if(GetColumnWidth(i) > 0) + if(GetColumnWidth(i) > 0) { last_col_index = i; break; @@ -1076,7 +1078,7 @@ void wxGameList::OnGameEntryUpdatedByTitleId(wxTitleIdEvent& event) wxString minutesText = formatWxString(wxPLURAL("{} minute", "{} minutes", minutes), minutes); SetItem(index, ColumnGameTime, hoursText + " " + minutesText); } - + // last played if (playTimeStat.last_played.year != 0) { @@ -1290,7 +1292,7 @@ bool wxGameList::QueryIconForTitle(TitleId titleId, int& icon, int& iconSmall) return true; } -void wxGameList::DeleteCachedStrings() +void wxGameList::DeleteCachedStrings() { m_name_cache.clear(); } @@ -1448,7 +1450,7 @@ void wxGameList::CreateShortcut(GameInfo2& gameInfo) if (SUCCEEDED(hres)) { hres = shellLinkFile->Save(outputPath.wc_str(), TRUE); - shellLinkFile->Release(); + shellLinkFile->Release(); } shellLink->Release(); } @@ -1457,4 +1459,4 @@ void wxGameList::CreateShortcut(GameInfo2& gameInfo) wxMessageBox(errorMsg, _("Error"), wxOK | wxCENTRE | wxICON_ERROR); } } -#endif \ No newline at end of file +#endif diff --git a/src/tools/ShaderCacheMerger.cpp b/src/tools/ShaderCacheMerger.cpp index 14a54252a..7a2727dd0 100644 --- a/src/tools/ShaderCacheMerger.cpp +++ b/src/tools/ShaderCacheMerger.cpp @@ -106,6 +106,8 @@ void MergeShaderAndPipelineCacheFiles() auto filename = it.path().filename().generic_string(); if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_shaders.bin)"))) MergeShaderCacheFile(filename); + if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_mtlshaders.bin)"))) + MergeShaderCacheFile(filename); } printf("\nScanning for pipeline cache files to merge...\n"); for (const auto& it : fs::directory_iterator("shaderCache/transferable/")) @@ -115,6 +117,8 @@ void MergeShaderAndPipelineCacheFiles() auto filename = it.path().filename().generic_string(); if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_vkpipeline.bin)"))) 
MergePipelineCacheFile(filename); + if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_mtlpipeline.bin)"))) + MergePipelineCacheFile(filename); } } From 07c62d5f9526349c44e038fb43e76e78851f0d48 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 5 Nov 2024 17:06:08 +0100 Subject: [PATCH 245/368] release texture upload buffer --- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 8 +++++++- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h | 4 +++- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 7 ++++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 8e0c141fa..01cee8b9a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -14,7 +14,7 @@ MetalMemoryManager::~MetalMemoryManager() } } -void* MetalMemoryManager::GetTextureUploadBuffer(size_t size) +void* MetalMemoryManager::AcquireTextureUploadBuffer(size_t size) { if (m_textureUploadBuffer.size() < size) { @@ -24,6 +24,12 @@ void* MetalMemoryManager::GetTextureUploadBuffer(size_t size) return m_textureUploadBuffer.data(); } +void MetalMemoryManager::ReleaseTextureUploadBuffer(uint8* mem) +{ + cemu_assert_debug(m_textureUploadBuffer.data() == mem); + m_textureUploadBuffer.clear(); +} + void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index f74d30d06..3d70e0db6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -30,7 +30,9 @@ class MetalMemoryManager return m_bufferCache; } - void* GetTextureUploadBuffer(size_t size); + // Texture upload buffer + void* AcquireTextureUploadBuffer(size_t size); + void ReleaseTextureUploadBuffer(uint8* mem); 
// Buffer cache void InitBufferCache(size_t size); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 20f8f6785..6ee3d9b65 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -456,9 +456,10 @@ void MetalRenderer::AppendOverlayDebugInfo() ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans); } -// TODO: halfZ void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) { + // halfZ is handled in the shader + m_state.m_viewport = MTL::Viewport{x, y, width, height, nearZ, farZ}; } @@ -485,12 +486,12 @@ void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) { - return m_memoryManager->GetTextureUploadBuffer(size); + return m_memoryManager->AcquireTextureUploadBuffer(size); } void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) { - // TODO: should the texture buffer get released? 
+ m_memoryManager->ReleaseTextureUploadBuffer(mem); } TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) From ea0a30b5649ae4272cb1ff0f3be66d76372fb155 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 5 Nov 2024 17:13:34 +0100 Subject: [PATCH 246/368] handle special state 8 and 5 --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 38 ++++++++++++++++--- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 2 + 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 6ee3d9b65..9c2a040e8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,10 +20,6 @@ #include "Cemu/Logging/CemuLogging.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" -#include "Cafe/HW/Latte/Renderer/Renderer.h" -#include "HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" #include "config/CemuConfig.h" #define IMGUI_IMPL_METAL_CPP @@ -34,6 +30,9 @@ extern bool hasValidFramebufferAttached; float supportBufferData[512 * 4]; +// Defined in the OpenGL renderer +void LatteDraw_handleSpecialState8_clearAsDepth(); + MetalRenderer::MetalRenderer() { m_device = MTL::CreateSystemDefaultDevice(); @@ -827,7 +826,19 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 return; } - // TODO: special state 8 and 5 + // fast clear color as depth + if (LatteGPUState.contextNew.GetSpecialStateValues()[8] != 0) + { + LatteDraw_handleSpecialState8_clearAsDepth(); + LatteGPUState.drawCallCounter++; + return; + } + else if (LatteGPUState.contextNew.GetSpecialStateValues()[5] != 0) + { + draw_handleSpecialState5(); + LatteGPUState.drawCallCounter++; + return; + } 
auto& encoderState = m_state.m_encoderState; @@ -1254,6 +1265,23 @@ void MetalRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* } } +void MetalRenderer::draw_handleSpecialState5() +{ + LatteMRT::UpdateCurrentFBO(); + LatteRenderTarget_updateViewport(); + + LatteTextureView* colorBuffer = LatteMRT::GetColorAttachment(0); + LatteTextureView* depthBuffer = LatteMRT::GetDepthAttachment(); + + sint32 vpWidth, vpHeight; + LatteMRT::GetVirtualViewportDimensions(vpWidth, vpHeight); + + surfaceCopy_copySurfaceWithFormatConversion( + depthBuffer->baseTexture, depthBuffer->firstMip, depthBuffer->firstSlice, + colorBuffer->baseTexture, colorBuffer->firstMip, colorBuffer->firstSlice, + vpWidth, vpHeight); +} + void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index de416825c..ad45c3267 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -263,6 +263,8 @@ class MetalRenderer : public Renderer void draw_updateVertexBuffersDirectAccess(); void draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset); + void draw_handleSpecialState5(); + // index void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override; From 552c4901b9cfd1c5aba91e7061cdcfea99ad2ca9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 5 Nov 2024 17:42:02 +0100 Subject: [PATCH 247/368] implement occlusion query flush --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 ++++-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 9c2a040e8..969562823 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1232,7 +1232,7 @@ void MetalRenderer::draw_updateVertexBuffersDirectAccess() MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0]; if (bufferAddress == MPTR_NULL) [[unlikely]] - bufferAddress = 0x10000000; // TODO: really? + bufferAddress = m_memoryManager->GetImportedMemBaseAddress(); m_state.m_vertexBufferOffsets[bufferIndex] = bufferAddress - m_memoryManager->GetImportedMemBaseAddress(); } @@ -1317,7 +1317,9 @@ void MetalRenderer::occlusionQuery_destroy(LatteQueryObject* queryObj) { } void MetalRenderer::occlusionQuery_flush() { - // TODO: implement + // TODO: wait for all command buffers with occlusion queries? + if (m_occlusionQuery.m_lastCommandBuffer) + m_occlusionQuery.m_lastCommandBuffer->waitUntilCompleted(); } void MetalRenderer::occlusionQuery_updateState() { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index ad45c3267..baae288d4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -429,6 +429,12 @@ class MetalRenderer : public Renderer void EndOcclusionQuery() { m_occlusionQuery.m_active = false; + if (m_occlusionQuery.m_lastCommandBuffer) + m_occlusionQuery.m_lastCommandBuffer->release(); + if (IsCommandBufferActive()) + m_occlusionQuery.m_lastCommandBuffer = GetCurrentCommandBuffer()->retain(); + else + m_occlusionQuery.m_lastCommandBuffer = nullptr; } private: @@ -480,6 +486,7 @@ class MetalRenderer : public Renderer uint64* m_resultsPtr; uint32 m_currentIndex = 0; bool m_active = false; + MTL::CommandBuffer* m_lastCommandBuffer = nullptr; } m_occlusionQuery; // Active objects From c46c8214f5acaaea0d4f0b12b569e2b39d9c4812 Mon Sep 17 
00:00:00 2001 From: Samuliak Date: Tue, 5 Nov 2024 17:57:20 +0100 Subject: [PATCH 248/368] refactor command buffers --- .../Renderer/Metal/MetalPerformanceMonitor.h | 2 + .../HW/Latte/Renderer/Metal/MetalQuery.cpp | 5 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 52 +++++++++---------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 15 +++--- 4 files changed, 35 insertions(+), 39 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h index 1bf017b25..cb65162e0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h @@ -6,6 +6,7 @@ class MetalPerformanceMonitor size_t m_bufferAllocatorMemory = 0; // Per frame data + uint32 m_commandBuffers = 0; uint32 m_renderPasses = 0; uint32 m_clears = 0; uint32 m_manualVertexFetchDraws = 0; @@ -17,6 +18,7 @@ class MetalPerformanceMonitor void ResetPerFrameData() { + m_commandBuffers = 0; m_renderPasses = 0; m_clears = 0; m_manualVertexFetchDraws = 0; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp index 6e6b14c33..5a60d4eaa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -37,9 +37,8 @@ void LatteQueryObjectMtl::end() { m_range.end = m_mtlr->GetOcclusionQueryIndex(); m_mtlr->EndOcclusionQuery(); + + m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain(); if (m_mtlr->IsCommandBufferActive()) - { - m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain(); m_mtlr->RequestSoonCommit(); - } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 969562823..b2b230ba6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -234,7 +234,6 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool 
swapDRC) // Reset the command buffers (they are released by TemporaryBufferAllocator) CommitCommandBuffer(); - m_commandBuffers.clear(); // Release frame persistent buffers m_memoryManager->GetFramePersistentBufferAllocator().ResetAllocations(); @@ -318,12 +317,9 @@ void MetalRenderer::Flush(bool waitIdle) CommitCommandBuffer(); if (waitIdle) { - for (auto commandBuffer : m_commandBuffers) - { - cemu_assert_debug(commandBuffer.m_commited); + cemu_assert_debug(m_currentCommandBuffer.m_commited); - commandBuffer.m_commandBuffer->waitUntilCompleted(); - } + m_currentCommandBuffer.m_commandBuffer->waitUntilCompleted(); } } @@ -448,7 +444,7 @@ void MetalRenderer::AppendOverlayDebugInfo() ImGui::Text("Buffer allocator memory %zuMB", m_performanceMonitor.m_bufferAllocatorMemory / 1024 / 1024); ImGui::Text("--- Metal info (per frame) ---"); - ImGui::Text("Command buffers %zu", m_commandBuffers.size()); + ImGui::Text("Command buffers %u", m_performanceMonitor.m_commandBuffers); ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); ImGui::Text("Clears %u", m_performanceMonitor.m_clears); ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws); @@ -1427,14 +1423,14 @@ void MetalRenderer::SetSamplerState(MTL::RenderCommandEncoder* renderCommandEnco MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() { - bool needsNewCommandBuffer = (m_commandBuffers.empty() || m_commandBuffers.back().m_commited); + bool needsNewCommandBuffer = (!m_currentCommandBuffer.m_commandBuffer || m_currentCommandBuffer.m_commited); if (needsNewCommandBuffer) { // Debug //m_commandQueue->insertDebugCaptureBoundary(); MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer(); - m_commandBuffers.push_back({mtlCommandBuffer}); + m_currentCommandBuffer = {mtlCommandBuffer}; m_recordedDrawcalls = 0; m_commitTreshold = m_defaultCommitTreshlod; @@ -1442,11 +1438,14 @@ MTL::CommandBuffer* 
MetalRenderer::GetCommandBuffer() // Notify memory manager about the new command buffer m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer); + // Debug + m_performanceMonitor.m_commandBuffers++; + return mtlCommandBuffer; } else { - return m_commandBuffers.back().m_commandBuffer; + return m_currentCommandBuffer.m_commandBuffer; } } @@ -1594,27 +1593,26 @@ void MetalRenderer::EndEncoding() void MetalRenderer::CommitCommandBuffer() { - if (m_commandBuffers.size() != 0) - { - EndEncoding(); + if (!m_currentCommandBuffer.m_commandBuffer) + return; - auto& commandBuffer = m_commandBuffers.back(); - if (!commandBuffer.m_commited) - { - // Handled differently, since it seems like Metal doesn't always call the completion handler - //commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) { - // m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer); - //}); + EndEncoding(); - commandBuffer.m_commandBuffer->commit(); - commandBuffer.m_commandBuffer->release(); - commandBuffer.m_commited = true; + if (!m_currentCommandBuffer.m_commited) + { + // Handled differently, since it seems like Metal doesn't always call the completion handler + //commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) { + // m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer); + //}); - m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(nullptr); + m_currentCommandBuffer.m_commandBuffer->commit(); + m_currentCommandBuffer.m_commandBuffer->release(); + m_currentCommandBuffer.m_commited = true; - // Debug - //m_commandQueue->insertDebugCaptureBoundary(); - } + m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(nullptr); + + // Debug + //m_commandQueue->insertDebugCaptureBoundary(); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 
baae288d4..3ffdaa9e4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -137,7 +137,7 @@ struct MetalState struct MetalCommandBuffer { - MTL::CommandBuffer* m_commandBuffer; + MTL::CommandBuffer* m_commandBuffer = nullptr; bool m_commited = false; }; @@ -280,14 +280,14 @@ class MetalRenderer : public Renderer bool IsCommandBufferActive() const { - return (m_commandBuffers.size() != 0); + return (m_currentCommandBuffer.m_commandBuffer && !m_currentCommandBuffer.m_commited); } MTL::CommandBuffer* GetCurrentCommandBuffer() { - cemu_assert_debug(m_commandBuffers.size() != 0); + cemu_assert_debug(m_currentCommandBuffer.m_commandBuffer); - return m_commandBuffers[m_commandBuffers.size() - 1].m_commandBuffer; + return m_currentCommandBuffer.m_commandBuffer; } void RequestSoonCommit() @@ -431,10 +431,7 @@ class MetalRenderer : public Renderer m_occlusionQuery.m_active = false; if (m_occlusionQuery.m_lastCommandBuffer) m_occlusionQuery.m_lastCommandBuffer->release(); - if (IsCommandBufferActive()) - m_occlusionQuery.m_lastCommandBuffer = GetCurrentCommandBuffer()->retain(); - else - m_occlusionQuery.m_lastCommandBuffer = nullptr; + m_occlusionQuery.m_lastCommandBuffer = GetCurrentCommandBuffer()->retain(); } private: @@ -490,7 +487,7 @@ class MetalRenderer : public Renderer } m_occlusionQuery; // Active objects - std::vector m_commandBuffers; + MetalCommandBuffer m_currentCommandBuffer{}; MetalEncoderType m_encoderType = MetalEncoderType::None; MTL::CommandEncoder* m_commandEncoder = nullptr; From fc1580f31ffce170dd2697810305320d9dacea80 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 5 Nov 2024 18:07:03 +0100 Subject: [PATCH 249/368] remove outdated todo notices --- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp | 1 - src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp | 6 ------ src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h | 2 +- 
src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 1 - 5 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 477213350..249614345 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -84,7 +84,6 @@ std::map MTL_DEPTH_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, MetalDataType::NONE, 4, {1, 1}}}, }; -// TODO: change the decoders when a format is not supported // TODO: R10_G10_B10_A2_UINT and R10_G10_B10_A2_SINT // TODO: A2_B10_G10_R10_UNORM and A2_B10_G10_R10_UINT void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp index 96375e0bf..427530c2c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp @@ -54,8 +54,6 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte uint32 stencilWriteMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); MTL::StencilDescriptor* frontStencil = MTL::StencilDescriptor::alloc()->init(); - // TODO: set reference - //depthStencilState.front.reference = stencilRefFront; frontStencil->setReadMask(stencilCompareMaskFront); frontStencil->setWriteMask(stencilWriteMaskFront); frontStencil->setStencilCompareFunction(GetMtlCompareFunc(frontStencilFunc)); @@ -67,8 +65,6 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte MTL::StencilDescriptor* backStencil = MTL::StencilDescriptor::alloc()->init(); if (backStencilEnable) { - // TODO: set reference - //depthStencilState.back.reference = stencilRefBack; backStencil->setReadMask(stencilCompareMaskBack); backStencil->setWriteMask(stencilWriteMaskBack); 
backStencil->setStencilCompareFunction(GetMtlCompareFunc(backStencilFunc)); @@ -78,8 +74,6 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte } else { - // TODO: set reference - //depthStencilState.back.reference = stencilRefFront; backStencil->setReadMask(stencilCompareMaskFront); backStencil->setWriteMask(stencilWriteMaskFront); backStencil->setStencilCompareFunction(GetMtlCompareFunc(frontStencilFunc)); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index d49060fbd..dcd957c8a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -27,7 +27,7 @@ static void compileThreadFunc(sint32 threadIndex) SetThreadName("compilePl"); // one thread runs at normal priority while the others run at lower priority - if(threadIndex != 0) + if (threadIndex != 0) ; // TODO: set thread priority while (true) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h index 7c9bc2cfa..3de0939a0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h @@ -23,6 +23,6 @@ class LatteQueryObjectMtl : public LatteQueryObject class MetalRenderer* m_mtlr; MetalQueryRange m_range = {INVALID_UINT32, INVALID_UINT32}; - // TODO: make this a list of command buffers + // TODO: make this a list of command buffers? 
MTL::CommandBuffer* m_commandBuffer = nullptr; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b2b230ba6..380a3f711 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1719,7 +1719,6 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE auto textureView = m_state.m_textures[hostTextureUnit]; if (!textureView) { - // TODO: don't bind if already bound if (textureDim == Latte::E_DIM::DIM_1D) SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding); else From 2c9f627f725668320e30871eda2b3c29e22d35fe Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 8 Nov 2024 08:33:39 +0100 Subject: [PATCH 250/368] remove MetalLogging log type --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 4 ++-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 ++++ src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 4 +--- src/Cemu/Logging/CemuLogging.cpp | 1 - src/Cemu/Logging/CemuLogging.h | 1 - src/gui/MainWindow.cpp | 2 -- 8 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index c6a5012bd..f1497d301 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -105,5 +105,5 @@ LatteTextureView* LatteTextureMtl::CreateView(Latte::E_DIM dim, Latte::E_GX2SURF void LatteTextureMtl::AllocateOnHost() { - cemuLog_log(LogType::MetalLogging, "not implemented"); + cemuLog_log(LogType::Force, "not implemented"); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 249614345..436a421b2 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -255,7 +255,7 @@ MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode) case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS: return MTL::PrimitiveTypeTriangle; // rects are emulated as 2 triangles default: - cemuLog_logDebug(LogType::Force, "Metal-Unsupported: Render pipeline with primitive mode {} created", primitiveMode); + cemuLog_log(LogType::Force, "Unsupported primitive mode {}", primitiveMode); cemu_assert_debug(false); return MTL::PrimitiveTypeTriangle; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 01cee8b9a..8b9ac89f5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -46,13 +46,13 @@ void MetalMemoryManager::InitBufferCache(size_t size) m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), m_hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); if (!m_bufferCache) { - cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer, using device shared mode instead"); + cemuLog_log(LogType::Force, "Failed to import host memory as a buffer, using device shared mode instead"); m_bufferCacheMode = BufferCacheMode::DeviceShared; } } else { - cemuLog_logDebug(LogType::Force, "Host buffer cache mode is only available on unified memory systems, using device shared mode instead"); + cemuLog_log(LogType::Force, "Host buffer cache mode is only available on unified memory systems, using device shared mode instead"); m_bufferCacheMode = BufferCacheMode::DeviceShared; } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 380a3f711..967e2266d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -245,6 +245,10 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) m_performanceMonitor.ResetPerFrameData(); } +void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padView) { + cemuLog_log(LogType::Force, "Screenshots are not yet supported on Metal"); +} + void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 3ffdaa9e4..162356117 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -186,9 +186,7 @@ class MetalRenderer : public Renderer void DrawEmptyFrame(bool mainWindow) override; void SwapBuffers(bool swapTV, bool swapDRC) override; - void HandleScreenshotRequest(LatteTextureView* texView, bool padView) override { - cemuLog_log(LogType::MetalLogging, "Screenshots are not yet supported on Metal"); - } + void HandleScreenshotRequest(LatteTextureView* texView, bool padView) override; void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, diff --git a/src/Cemu/Logging/CemuLogging.cpp b/src/Cemu/Logging/CemuLogging.cpp index 6b77b2265..d7e3bc3e6 100644 --- a/src/Cemu/Logging/CemuLogging.cpp +++ b/src/Cemu/Logging/CemuLogging.cpp @@ -59,7 +59,6 @@ const std::map g_logging_window_mapping {LogType::TextureReadback, "Texture readback"}, {LogType::OpenGLLogging, "OpenGL debug output"}, {LogType::VulkanValidation, "Vulkan validation layer"}, - {LogType::MetalLogging, "Metal debug output"}, }; bool cemuLog_advancedPPCLoggingEnabled() diff --git a/src/Cemu/Logging/CemuLogging.h b/src/Cemu/Logging/CemuLogging.h index 
337bfa910..fae134b47 100644 --- a/src/Cemu/Logging/CemuLogging.h +++ b/src/Cemu/Logging/CemuLogging.h @@ -20,7 +20,6 @@ enum class LogType : sint32 OpenGLLogging = 10, // OpenGL debug logging TextureCache = 11, // texture cache warnings and info VulkanValidation = 12, // Vulkan validation layer - MetalLogging = 13, // Metal debug logging Patches = 14, CoreinitMem = 8, // coreinit memory functions CoreinitMP = 15, diff --git a/src/gui/MainWindow.cpp b/src/gui/MainWindow.cpp index 2e44a4c76..b5452aa8d 100644 --- a/src/gui/MainWindow.cpp +++ b/src/gui/MainWindow.cpp @@ -2232,7 +2232,6 @@ void MainWindow::RecreateMenu() debugLoggingMenu->AppendSeparator(); debugLoggingMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::OpenGLLogging), _("&OpenGL debug output"), wxEmptyString)->Check(cemuLog_isLoggingEnabled(LogType::OpenGLLogging)); debugLoggingMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::VulkanValidation), _("&Vulkan validation layer (slow)"), wxEmptyString)->Check(cemuLog_isLoggingEnabled(LogType::VulkanValidation)); - debugLoggingMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::MetalLogging), _("&Metal debug output"), wxEmptyString)->Check(cemuLog_isLoggingEnabled(LogType::MetalLogging)); debugLoggingMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_ADVANCED_PPC_INFO, _("&Log PPC context for API"), wxEmptyString)->Check(cemuLog_advancedPPCLoggingEnabled()); m_loggingSubmenu = debugLoggingMenu; // debug->dump submenu @@ -2302,7 +2301,6 @@ void MainWindow::RecreateMenu() // these options cant be toggled after the renderer backend is initialized: m_loggingSubmenu->Enable(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::OpenGLLogging), false); m_loggingSubmenu->Enable(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::VulkanValidation), false); - m_loggingSubmenu->Enable(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + 
stdx::to_underlying(LogType::MetalLogging), false); UpdateNFCMenu(); } From f26495707ee3321e45b6820a88d0be4e70c1e6ec Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 8 Nov 2024 16:58:29 +0100 Subject: [PATCH 251/368] implement screenshots --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 74 ++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 967e2266d..341c78435 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -21,6 +21,7 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "config/CemuConfig.h" +#include "gui/guiWrapper.h" #define IMGUI_IMPL_METAL_CPP #include "imgui/imgui_extension.h" @@ -246,7 +247,78 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) } void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padView) { - cemuLog_log(LogType::Force, "Screenshots are not yet supported on Metal"); + const bool hasScreenshotRequest = gui_hasScreenshotRequest(); + if (!hasScreenshotRequest && m_screenshot_state == ScreenshotState::None) + return; + + if (m_mainLayer.GetDrawable()) + { + // we already took a pad view screenshow and want a main window screenshot + if (m_screenshot_state == ScreenshotState::Main && padView) + return; + + if (m_screenshot_state == ScreenshotState::Pad && !padView) + return; + + // remember which screenshot is left to take + if (m_screenshot_state == ScreenshotState::None) + m_screenshot_state = padView ? 
ScreenshotState::Main : ScreenshotState::Pad; + else + m_screenshot_state = ScreenshotState::None; + } + else + m_screenshot_state = ScreenshotState::None; + + auto texMtl = static_cast(texView->baseTexture); + + int width, height; + texMtl->GetEffectiveSize(width, height, 0); + + uint32 bytesPerRow = GetMtlTextureBytesPerRow(texMtl->format, texMtl->IsDepth(), width); + uint32 size = GetMtlTextureBytesPerImage(texMtl->format, texMtl->IsDepth(), height, bytesPerRow); + + // TODO: get a buffer from the memory manager + MTL::Buffer* buffer = m_device->newBuffer(size, MTL::ResourceStorageModeShared); + + auto blitCommandEncoder = GetBlitCommandEncoder(); + blitCommandEncoder->copyFromTexture(texMtl->GetTexture(), 0, 0, MTL::Origin(0, 0, 0), MTL::Size(width, height, 1), buffer, 0, bytesPerRow, 0); + + uint8* bufferPtr = (uint8*)buffer->contents(); + + bool formatValid = true; + std::vector rgb_data; + rgb_data.reserve(3 * width * height); + + auto pixelFormat = texMtl->GetTexture()->pixelFormat(); + // TODO: implement more formats + switch (pixelFormat) + { + case MTL::PixelFormatRGBA8Unorm: + for (auto ptr = bufferPtr; ptr < bufferPtr + size; ptr += 4) + { + rgb_data.emplace_back(*ptr); + rgb_data.emplace_back(*(ptr + 1)); + rgb_data.emplace_back(*(ptr + 2)); + } + break; + case MTL::PixelFormatRGBA8Unorm_sRGB: + for (auto ptr = bufferPtr; ptr < bufferPtr + size; ptr += 4) + { + rgb_data.emplace_back(SRGBComponentToRGB(*ptr)); + rgb_data.emplace_back(SRGBComponentToRGB(*(ptr + 1))); + rgb_data.emplace_back(SRGBComponentToRGB(*(ptr + 2))); + } + break; + default: + cemuLog_log(LogType::Force, "Unsupported screenshot texture pixel format {}", pixelFormat); + formatValid = false; + break; + } + + buffer->release(); + + if (formatValid) + SaveScreenshot(rgb_data, width, height, !padView); } void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, From 6897abd59b10213310c4049861875304d93f81ec Mon Sep 17 
00:00:00 2001 From: Samuliak Date: Fri, 8 Nov 2024 17:27:50 +0100 Subject: [PATCH 252/368] fix: a typo in the attribute decoder --- .../LatteDecompilerEmitMSLAttrDecoder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp index 585309a1e..9ee5c31f1 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp @@ -486,7 +486,7 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); - src->add("attrDecoder.xy = as_type(uint2(float(int(attrDecoder.x)), float(int(attrDecoder.y))));" _CRLF); + src->add("attrDecoder.xy = as_type(float2(float(int(attrDecoder.x)), float(int(attrDecoder.y))));" _CRLF); src->add("attrDecoder.zw = uint2(0);" _CRLF); } else if (attrib->format == FMT_16 && attrib->nfa == 1 && attrib->isSigned == 0) From f1db7d5ab97c15f0b1d30108fe76deab9c5167d1 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 9 Nov 2024 07:25:20 +0100 Subject: [PATCH 253/368] don't log texture allocation warnings --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index f1497d301..03cd9285d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -103,7 +103,8 @@ LatteTextureView* LatteTextureMtl::CreateView(Latte::E_DIM dim, Latte::E_GX2SURF return new LatteTextureViewMtl(m_mtlr, 
this, dim, format, firstMip, mipCount, firstSlice, sliceCount); } +// TODO: lazy allocation? void LatteTextureMtl::AllocateOnHost() { - cemuLog_log(LogType::Force, "not implemented"); + // The texture is already allocated } From 02c36d9125abcfad3ed2b62bbb0d86743cd2a297 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 9 Nov 2024 08:04:17 +0100 Subject: [PATCH 254/368] add: blit todo notice --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 341c78435..d1b8f268f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -614,6 +614,7 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s memcpy(allocation.data, pixelData, compressedImageSize); //buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); + // TODO: specify blit options when copying to a depth stencil texture? 
// Copy the data from the temporary buffer to the texture blitCommandEncoder->copyFromBuffer(buffer, allocation.offset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); //} From 97be576cd666e47a4f75bbe5ab18f3ffd2f5a23c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 10 Nov 2024 07:57:22 +0100 Subject: [PATCH 255/368] cast texture slice to uint --- .../Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index e81856c3d..74079a5f8 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2353,8 +2353,9 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(", "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add("), "); + src->add("), uint("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); src->addFmt(", {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); } @@ -2396,8 +2397,9 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(", "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add("), "); + src->add("), uint("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + 
src->add(")"); } else if(texDim == Latte::E_DIM::DIM_3D) { From ed32feb3d9750932cab575bbd0acba1c724b784c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 10 Nov 2024 15:18:45 +0100 Subject: [PATCH 256/368] rint texture slice & explicitly specify texture lod --- .../LatteDecompilerEmitMSL.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 74079a5f8..552fd48d9 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2240,6 +2240,8 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex bool unnormalizationHandled = false; bool useTexelCoordinates = false; + bool isRead = ((texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || + texOpcode == GPU7_TEX_INST_LD); // handle illegal combinations if (texOpcode == GPU7_TEX_INST_FETCH4 && (texDim == Latte::E_DIM::DIM_1D || texDim == Latte::E_DIM::DIM_1D_ARRAY)) @@ -2266,8 +2268,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if (!emulateCompare) { src->add("."); - if ((texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || - texOpcode == GPU7_TEX_INST_LD) + if (isRead) { if (hasOffset) cemu_assert_unimplemented(); @@ -2353,9 +2354,9 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(", "); 
_emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add("), uint("); + src->add("), uint(rint("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(")"); + src->add("))"); src->addFmt(", {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); } @@ -2397,9 +2398,9 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(", "); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add("), uint("); + src->add("), uint(rint("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(")"); + src->add("))"); } else if(texDim == Latte::E_DIM::DIM_3D) { @@ -2443,7 +2444,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // 1D textures don't support lod if (texDim != Latte::E_DIM::DIM_1D && texDim != Latte::E_DIM::DIM_1D_ARRAY) { - if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + if (texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) { src->add(", "); if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) @@ -2458,7 +2459,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->add(")"); } } - else if( texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) + else if (!isRead && !isGather/*texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ*/) { src->add(", level(0.0)"); } From a52095b40a5658896626d9d4482f2d8f95268d19 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 10 Nov 2024 
20:09:24 +0100 Subject: [PATCH 257/368] sync between command buffers --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 15 +++++++++++++++ src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index d1b8f268f..f28b79b82 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -27,6 +27,8 @@ #include "imgui/imgui_extension.h" #include "imgui/imgui_impl_metal.h" +#define EVENT_VALUE_WRAP 4096 + extern bool hasValidFramebufferAttached; float supportBufferData[512 * 4]; @@ -48,6 +50,9 @@ MetalRenderer::MetalRenderer() CheckForPixelFormatSupport(m_pixelFormatSupport); + // Synchronization resources + m_event = m_device->newEvent(); + // Resources MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); #ifdef CEMU_DEBUG_ASSERT @@ -161,6 +166,8 @@ MetalRenderer::~MetalRenderer() m_occlusionQuery.m_resultBuffer->release(); + m_event->release(); + m_commandQueue->release(); m_device->release(); } @@ -1509,6 +1516,10 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer(); m_currentCommandBuffer = {mtlCommandBuffer}; + // Wait for the previous command buffer + if (m_eventValue != -1) + mtlCommandBuffer->encodeWait(m_event, m_eventValue); + m_recordedDrawcalls = 0; m_commitTreshold = m_defaultCommitTreshlod; @@ -1682,6 +1693,10 @@ void MetalRenderer::CommitCommandBuffer() // m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer); //}); + // Signal event + m_eventValue = (m_eventValue + 1) % EVENT_VALUE_WRAP; + m_currentCommandBuffer.m_commandBuffer->encodeSignalEvent(m_event, m_eventValue); + m_currentCommandBuffer.m_commandBuffer->commit(); m_currentCommandBuffer.m_commandBuffer->release(); 
m_currentCommandBuffer.m_commited = true; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 162356117..c272c729b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -6,6 +6,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" +#include struct MetalBufferAllocation { @@ -459,6 +460,10 @@ class MetalRenderer : public Renderer // Void vertex pipelines class MetalVoidVertexPipeline* m_copyBufferToBufferPipeline; + // Synchronization resources + MTL::Event* m_event; + int32_t m_eventValue = -1; + // Resources MTL::SamplerState* m_nearestSampler; MTL::SamplerState* m_linearSampler; From 326d3442cd57a3b7804367be9c0842ca1628b6b4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 11 Nov 2024 15:45:13 +0100 Subject: [PATCH 258/368] cleanup the buffer allocator --- src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 9998ac894..d3a0d8467 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -144,6 +144,8 @@ class MetalBufferAllocator protected: class MetalRenderer* m_mtlr; + + // TODO: make these template arguments bool m_isCPUAccessible; MTL::ResourceOptions m_options; @@ -153,8 +155,7 @@ class MetalBufferAllocator { auto& buffer = m_buffers[bufferIndex]; buffer.m_freeRanges.clear(); - buffer.m_freeRanges.reserve(1); - buffer.m_freeRanges.push_back({0, m_buffers[bufferIndex].m_buffer->length()}); + buffer.m_freeRanges.push_back({0, buffer.m_buffer->length()}); } }; @@ -191,7 +192,6 @@ class MetalTemporaryBufferAllocator : 
public MetalBufferAllocator Date: Mon, 11 Nov 2024 17:21:16 +0100 Subject: [PATCH 259/368] check if verticesPerInstance uniform is used --- .../LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 6 ++++++ .../LatteDecompilerEmitMSLHeader.hpp | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index b5697d42e..3a45ade50 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -547,6 +547,12 @@ namespace LatteDecompiler { decompilerContext->hasUniformVarBlock = true; // uf_verticesPerInstance and uf_streamoutBufferBase* } + if (g_renderer->GetType() == RendererAPI::Metal) + { + // TODO: also check for rect primitive + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader) + decompilerContext->hasUniformVarBlock = true; // uf_verticesPerInstance + } } void _initUniformBindingPoints(LatteDecompilerShaderContext* decompilerContext) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 1e3091a6e..0ca4422a9 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -5,7 +5,7 @@ namespace LatteDecompiler { - static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext) + static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) { auto src = decompilerContext->shaderSource; @@ -85,7 +85,11 @@ namespace LatteDecompiler uniformCurrentOffset += 8; } // define verticesPerInstance + streamoutBufferBaseX - if (shader->shaderType == 
LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry) + if ((shader->shaderType == LatteConst::ShaderType::Vertex && + (decompilerContext->options->usesGeometryShader || isRectVertexShader)) || + (decompilerContext->analyzer.useSSBOForStreamout && + (shader->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || + (shader->shaderType == LatteConst::ShaderType::Geometry))) { src->add("int verticesPerInstance;" _CRLF); uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; @@ -392,7 +396,7 @@ namespace LatteDecompiler if(dump_shaders_enabled) decompilerContext->shaderSource->add("// start of shader inputs/outputs, predetermined by Cemu. Do not touch" _CRLF); // uniform variables - _emitUniformVariables(decompilerContext); + _emitUniformVariables(decompilerContext, isRectVertexShader); // uniform buffers _emitUniformBuffers(decompilerContext); // inputs and outputs From a1b91648311ff6edf2a7a41dc1b3e76ad8cc43ba Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 11 Nov 2024 17:33:12 +0100 Subject: [PATCH 260/368] don't mask ps inputs with 0x7F --- .../LatteDecompilerEmitMSL.cpp | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 552fd48d9..315354dd7 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -4308,27 +4308,12 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, continue; } - // TODO: is the if statement even needed? 
- if (usesGeometryShader) - { - // import from geometry shader - if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = bitCast(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); - else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId & 0x7F); - else - cemu_assert_unimplemented(); - } + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = bitCast(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); else - { - // import from vertex shader - if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = bitCast(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); - else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); - else - cemu_assert_unimplemented(); - } + cemu_assert_unimplemented(); } // front facing attribute if (frontFace_enabled) From 6ea6ad37d657dea4676f61beed8b553b89dd958f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 11 Nov 2024 17:40:58 +0100 Subject: [PATCH 261/368] use verticesPerInstance for rect vertex shaders --- .../Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index 3a45ade50..e0b39767f 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -549,8 +549,10 @@ namespace LatteDecompiler } if (g_renderer->GetType() == RendererAPI::Metal) { + bool isRectVertexShader = (static_cast(decompilerContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]) == LattePrimitiveMode::RECTS); + // TODO: also check for rect primitive - if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader) + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && (decompilerContext->options->usesGeometryShader || isRectVertexShader)) decompilerContext->hasUniformVarBlock = true; // uf_verticesPerInstance } } From 5b7a0fa3f6d3a305e932499e5feac3ac679ad3a4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 14 Nov 2024 17:10:42 +0100 Subject: [PATCH 262/368] warn about invalid primitive mode --- .../LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 0ca4422a9..d10854bf7 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -366,7 +366,7 @@ namespace LatteDecompiler src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 3" _CRLF); break; default: - cemu_assert_suspicious(); + cemuLog_log(LogType::Force, "Unknown vertex out primitive type {}", vsOutPrimType); break; } if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) @@ -386,7 +386,7 @@ namespace LatteDecompiler src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount - 2)" _CRLF); break; default: - 
cemu_assert_suspicious(); + cemuLog_log(LogType::Force, "Unknown geometry out primitive type {}", gsOutPrimType); break; } } From c9b18efc031e185cce3f888119222c7c7af53f9f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 14 Nov 2024 18:48:27 +0100 Subject: [PATCH 263/368] don't transform depth to 0...1 --- .../Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 315354dd7..7728e0078 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3609,7 +3609,7 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte // write point size if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); - src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); + //src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); src->add("mesh.set_vertex(vertexIndex, out);" _CRLF); src->add("vertexIndex++;" _CRLF); // increment transform feedback pointer @@ -4378,8 +4378,8 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (rasterizationEnabled) { - if (shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader) - src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); + //if (shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader) + // src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); // Return if (!usesGeometryShader || shader->shaderType == LatteConst::ShaderType::Pixel) From d7324165693d8eecef843d650bb8567337c459d8 Mon Sep 17 00:00:00 2001 
From: Samuliak Date: Thu, 14 Nov 2024 19:10:36 +0100 Subject: [PATCH 264/368] invalidate index buffers when a command buffer finishes --- src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index d3a0d8467..7a1525969 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -14,6 +14,8 @@ struct MetalBufferRange constexpr size_t BASE_ALLOCATION_SIZE = 8 * 1024 * 1024; // 8 MB constexpr size_t MAX_ALLOCATION_SIZE = 64 * 1024 * 1024; // 64 MB +void LatteIndices_invalidateAll(); + template class MetalBufferAllocator { @@ -261,8 +263,10 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorfirst)) @@ -280,6 +284,8 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorfirst->release(); it = m_executingCommandBuffers.erase(it); + + atLeastOneCompleted = true; } else { @@ -287,6 +293,9 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator Date: Fri, 15 Nov 2024 17:01:21 +0100 Subject: [PATCH 265/368] handle halfZ in the shader decompiler --- .../LatteDecompilerEmitMSL.cpp | 17 ++++++----------- .../LatteDecompilerEmitMSLHeader.hpp | 5 +++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 7728e0078..3ed15f73e 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3123,13 +3123,13 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); src->add(";" _CRLF); src->add("finalPos.xy = finalPos.xy * 
supportBuffer.windowSpaceToClipSpaceTransform - float2(1.0,1.0);"); - src->add("out.position = finalPos;"); + src->add("SET_POSITION(finalPos);"); } else { - src->add("out.position = "); + src->add("SET_POSITION("); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); - src->add(";" _CRLF); + src->add(");" _CRLF); } } else if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_POINT_SIZE ) @@ -3371,7 +3371,7 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La src->addFmt(" = "); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); src->add(";" _CRLF); - src->add("out.position = pos;" _CRLF); + src->add("SET_POSITION(pos);" _CRLF); src->add("}" _CRLF); } else if (parameterExportType == 2 && parameterExportBase < 16) @@ -3609,7 +3609,6 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte // write point size if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); - //src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); src->add("mesh.set_vertex(vertexIndex, out);" _CRLF); src->add("vertexIndex++;" _CRLF); // increment transform feedback pointer @@ -4376,14 +4375,10 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, } } - if (rasterizationEnabled) + if (rasterizationEnabled && (!usesGeometryShader || shader->shaderType == LatteConst::ShaderType::Pixel)) { - //if (shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader) - // src->add("out.position.z = (out.position.z + out.position.w) / 2.0;" _CRLF); - // Return - if (!usesGeometryShader || shader->shaderType == LatteConst::ShaderType::Pixel) - src->add("return out;" _CRLF); + src->add("return out;" _CRLF); } // end of shader main diff --git 
a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index d10854bf7..04e3410de 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -392,6 +392,11 @@ namespace LatteDecompiler } } + if (decompilerContext->contextRegistersNew->PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF()) + src->add("#define SET_POSITION(_v) out.position = _v" _CRLF); + else + src->add("#define SET_POSITION(_v) out.position = _v; out.position.z = (out.position.z + out.position.w) / 2.0" _CRLF); + const bool dump_shaders_enabled = ActiveSettings::DumpShadersEnabled(); if(dump_shaders_enabled) decompilerContext->shaderSource->add("// start of shader inputs/outputs, predetermined by Cemu. Do not touch" _CRLF); From a72136419c97174072f291c0a76946be920d89d9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 15 Nov 2024 17:32:29 +0100 Subject: [PATCH 266/368] log instead of printf --- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalLayerHandle.cpp | 2 +- .../Renderer/Metal/MetalPipelineCompiler.cpp | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 16 ++++++++-------- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 436a421b2..9de3bd1ce 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -240,7 +240,7 @@ MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode) case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_LOOP: return MTL::PrimitiveTypeLineStrip; // line loops are emulated as line strips with an extra connecting strip at the end case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_STRIP_ADJACENT: // Tropical Freeze level 3-6 - 
debug_printf("Metal doesn't support line strip adjacent primitive, using line strip instead\n"); + cemuLog_logOnce(LogType::Force, "Metal doesn't support line strip adjacent primitive, using line strip instead"); return MTL::PrimitiveTypeLineStrip; case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLES: return MTL::PrimitiveTypeTriangle; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp index ad16b89ae..1155c1528 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp @@ -32,7 +32,7 @@ bool MetalLayerHandle::AcquireDrawable() m_drawable = m_layer->nextDrawable(); if (!m_drawable) { - debug_printf("layer %p failed to acquire next drawable\n", this); + cemuLog_log(LogType::Force, "layer {} failed to acquire next drawable", (void*)this); return false; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 9d74e2d92..194498ecc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -468,7 +468,7 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha layout->setStepFunction(MTL::VertexStepFunctionPerInstance); else { - debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value()); + cemuLog_log(LogType::Force, "unimplemented vertex fetch type {}", (uint32)fetchType.value()); cemu_assert(false); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f28b79b82..ef1c0d711 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -123,7 +123,7 @@ MetalRenderer::MetalRenderer() MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(utilityShaderSource), nullptr, 
&error); if (error) { - debug_printf("failed to create utility library (error: %s)\n", error->localizedDescription()->utf8String()); + cemuLog_log(LogType::Force, "failed to create utility library (error: {})", error->localizedDescription()->utf8String()); error->release(); throw; return; @@ -454,7 +454,7 @@ void MetalRenderer::ImguiEnd() if (m_encoderType != MetalEncoderType::Render) { - debug_printf("no render command encoder, cannot draw ImGui\n"); + cemuLog_logOnce(LogType::Force, "no render command encoder, cannot draw ImGui"); return; } @@ -850,7 +850,7 @@ void MetalRenderer::draw_beginSequence() LatteSHRC_UpdateActiveShaders(); if (LatteGPUState.activeShaderHasError) { - debug_printf("Skipping drawcalls due to shader error\n"); + cemuLog_logOnce(LogType::Force, "Skipping drawcalls due to shader error\n"); m_state.m_skipDrawSequence = true; cemu_assert_debug(false); return; @@ -863,14 +863,14 @@ void MetalRenderer::draw_beginSequence() LatteGPUState.repeatTextureInitialization = false; if (!LatteMRT::UpdateCurrentFBO()) { - debug_printf("Rendertarget invalid\n"); + cemuLog_logOnce(LogType::Force, "Rendertarget invalid\n"); m_state.m_skipDrawSequence = true; return; // no render target } if (!hasValidFramebufferAttached && !streamoutEnable) { - debug_printf("Drawcall with no color buffer or depth buffer attached\n"); + cemuLog_logOnce(LogType::Force, "Drawcall with no color buffer or depth buffer attached\n"); m_state.m_skipDrawSequence = true; return; // no render target } @@ -1241,7 +1241,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 verticesPerPrimitive = 3; break; default: - debug_printf("invalid primitive mode %u\n", (uint32)primitiveMode); + cemuLog_log(LogType::Force, "unimplemented geometry shader primitive mode {}", (uint32)primitiveMode); break; } @@ -1804,7 +1804,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE uint32 binding = 
shader->resourceMapping.getTextureBaseBindingPoint() + i; if (binding >= MAX_MTL_TEXTURES) { - debug_printf("invalid texture binding %u\n", binding); + cemuLog_logOnce(LogType::Force, "invalid texture binding {}", binding); continue; } @@ -1952,7 +1952,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; if (binding >= MAX_MTL_BUFFERS) { - debug_printf("invalid buffer binding%u\n", binding); + cemuLog_logOnce(LogType::Force, "invalid buffer binding {}", binding); continue; } From a81ee7934ecf3a81c8d2deef8540eea3e67b25e4 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 15 Nov 2024 19:52:22 +0100 Subject: [PATCH 267/368] maximize concurrent shader compilation when loading shader cache --- src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index f8b5efe91..e81ee59df 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -88,16 +88,18 @@ class ShaderMtlThreadPool // TODO: find out if it would be possible to cache compiled Metal shaders void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) { - s_isLoadingShadersMtl = true; + // Maximize shader compilation speed + static_cast(g_renderer.get())->GetDevice()->setShouldMaximizeConcurrentCompilation(true); } void RendererShaderMtl::ShaderCacheLoading_end() { - s_isLoadingShadersMtl = false; + static_cast(g_renderer.get())->GetDevice()->setShouldMaximizeConcurrentCompilation(false); } void RendererShaderMtl::ShaderCacheLoading_Close() { + // Do nothing } void RendererShaderMtl::Initialize() From 2890819118f400f9687fede0622652d3f13ae724 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 16 Nov 2024 08:57:00 +0100 Subject: [PATCH 268/368] fix: 
triangle fan index count --- src/Cafe/HW/Latte/Core/LatteIndices.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index e92f0f732..5b6d8495d 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -91,6 +91,21 @@ uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, L cemu_assert_suspicious(); return 0; } + else if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN && g_renderer->GetType() == RendererAPI::Metal) + { + if (indexType == LatteIndexType::AUTO) + { + if (count <= 0xFFFF) + return count * sizeof(uint16); + return count * sizeof(uint32); + } + if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE) + return count * sizeof(uint16); + if (indexType == LatteIndexType::U32_BE || indexType == LatteIndexType::U32_LE) + return count * sizeof(uint32); + cemu_assert_suspicious(); + return 0; + } else if(indexType == LatteIndexType::AUTO) return 0; else if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE) From 8a8037377fb122522c6099194c634d42189b7b94 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 16 Nov 2024 15:28:23 +0100 Subject: [PATCH 269/368] rework the command buffer system --- .../Metal/LatteTextureReadbackMtl.cpp | 2 +- .../Renderer/Metal/MetalBufferAllocator.h | 42 ++++------------- .../HW/Latte/Renderer/Metal/MetalQuery.cpp | 12 ++--- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 47 +++++++++++++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 17 ++++++- 5 files changed, 66 insertions(+), 54 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index ca4e31a7f..05b579e70 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -1,7 +1,7 @@ 
#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" -#include "HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" LatteTextureReadbackInfoMtl::~LatteTextureReadbackInfoMtl() { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 7a1525969..209b1395c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -201,8 +201,6 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator{})); cemu_assert_debug(result.second); m_activeCommandBufferIt = result.first; - commandBuffer->retain(); } else { @@ -263,41 +260,20 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorsecond) { - if (CommandBufferCompleted(it->first)) - { - for (auto bufferIndex : it->second) - { - auto& buffer = m_buffers[bufferIndex]; - buffer.m_data.m_commandBufferCount--; - - // TODO: is this neccessary? - if (!buffer.m_data.IsLocked() && buffer.m_data.m_commandBufferCount == 0) - FreeBuffer(bufferIndex); - } - - it->first->release(); + auto& buffer = m_buffers[bufferIndex]; + buffer.m_data.m_commandBufferCount--; - it = m_executingCommandBuffers.erase(it); - - atLeastOneCompleted = true; - } - else - { - ++it; - } + // TODO: is this neccessary? 
+ if (!buffer.m_data.IsLocked() && buffer.m_data.m_commandBufferCount == 0) + FreeBuffer(bufferIndex); } - if (atLeastOneCompleted) - LatteIndices_invalidateAll(); - - //if (erase) - // m_commandBuffersFrames.erase(commandBuffer); + m_executingCommandBuffers.erase(it); } MTL::Buffer* GetBuffer(uint32 bufferIndex) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp index 5a60d4eaa..ee79f2dd8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -3,13 +3,7 @@ bool LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) { - if (!m_commandBuffer) - { - numSamplesPassed = 0; - return true; - } - - if (!CommandBufferCompleted(m_commandBuffer)) + if (m_commandBuffer && !CommandBufferCompleted(m_commandBuffer)) return false; uint64* resultPtr = m_mtlr->GetOcclusionQueryResultsPtr(); @@ -38,7 +32,7 @@ void LatteQueryObjectMtl::end() m_range.end = m_mtlr->GetOcclusionQueryIndex(); m_mtlr->EndOcclusionQuery(); - m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain(); - if (m_mtlr->IsCommandBufferActive()) + m_commandBuffer = m_mtlr->GetAndRetainCurrentCommandBufferIfNotCompleted(); + if (m_commandBuffer) m_mtlr->RequestSoonCommit(); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ef1c0d711..3f7c46da5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,6 +20,7 @@ #include "Cemu/Logging/CemuLogging.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" +#include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "config/CemuConfig.h" #include "gui/guiWrapper.h" @@ -398,12 +399,9 @@ void MetalRenderer::Flush(bool waitIdle) { if (m_recordedDrawcalls > 0 || waitIdle) CommitCommandBuffer(); - if (waitIdle) - { - cemu_assert_debug(m_currentCommandBuffer.m_commited); - 
m_currentCommandBuffer.m_commandBuffer->waitUntilCompleted(); - } + if (waitIdle && m_executingCommandBuffers.size() != 0) + m_executingCommandBuffers.back()->waitUntilCompleted(); } void MetalRenderer::NotifyLatteCommandProcessorIdle() @@ -1397,13 +1395,12 @@ void MetalRenderer::occlusionQuery_destroy(LatteQueryObject* queryObj) { } void MetalRenderer::occlusionQuery_flush() { - // TODO: wait for all command buffers with occlusion queries? if (m_occlusionQuery.m_lastCommandBuffer) m_occlusionQuery.m_lastCommandBuffer->waitUntilCompleted(); } void MetalRenderer::occlusionQuery_updateState() { - // TODO: implement + ProcessFinishedCommandBuffers(); } void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) @@ -1686,6 +1683,9 @@ void MetalRenderer::CommitCommandBuffer() EndEncoding(); + ProcessFinishedCommandBuffers(); + + // Commit the command buffer if (!m_currentCommandBuffer.m_commited) { // Handled differently, since it seems like Metal doesn't always call the completion handler @@ -1695,12 +1695,14 @@ void MetalRenderer::CommitCommandBuffer() // Signal event m_eventValue = (m_eventValue + 1) % EVENT_VALUE_WRAP; - m_currentCommandBuffer.m_commandBuffer->encodeSignalEvent(m_event, m_eventValue); + auto mtlCommandBuffer = m_currentCommandBuffer.m_commandBuffer; + mtlCommandBuffer->encodeSignalEvent(m_event, m_eventValue); - m_currentCommandBuffer.m_commandBuffer->commit(); - m_currentCommandBuffer.m_commandBuffer->release(); + mtlCommandBuffer->commit(); m_currentCommandBuffer.m_commited = true; + m_executingCommandBuffers.push_back(mtlCommandBuffer); + m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(nullptr); // Debug @@ -1708,6 +1710,31 @@ void MetalRenderer::CommitCommandBuffer() } } +void MetalRenderer::ProcessFinishedCommandBuffers() +{ + // Check for finished command buffers + bool atLeastOneCompleted = false; + for (auto it = 
m_executingCommandBuffers.begin(); it != m_executingCommandBuffers.end();) + { + auto commandBuffer = *it; + if (CommandBufferCompleted(commandBuffer)) + { + m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer); + commandBuffer->release(); + it = m_executingCommandBuffers.erase(it); + atLeastOneCompleted = true; + } + else + { + ++it; + } + } + + // Invalidate indices if at least one command buffer has completed + if (atLeastOneCompleted) + LatteIndices_invalidateAll(); +} + bool MetalRenderer::AcquireDrawable(bool mainWindow) { auto& layer = GetLayer(mainWindow); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index c272c729b..2f8514892 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -289,6 +289,15 @@ class MetalRenderer : public Renderer return m_currentCommandBuffer.m_commandBuffer; } + MTL::CommandBuffer* GetAndRetainCurrentCommandBufferIfNotCompleted() + { + // The command buffer has been commited and has finished execution + if (m_currentCommandBuffer.m_commited && m_executingCommandBuffers.size() == 0) + return nullptr; + + return GetCurrentCommandBuffer()->retain(); + } + void RequestSoonCommit() { m_commitTreshold = m_recordedDrawcalls + 8; @@ -337,6 +346,7 @@ class MetalRenderer : public Renderer MTL::BlitCommandEncoder* GetBlitCommandEncoder(); void EndEncoding(); void CommitCommandBuffer(); + void ProcessFinishedCommandBuffers(); bool AcquireDrawable(bool mainWindow); @@ -428,9 +438,13 @@ class MetalRenderer : public Renderer void EndOcclusionQuery() { m_occlusionQuery.m_active = false; + + // Release the old command buffer if (m_occlusionQuery.m_lastCommandBuffer) m_occlusionQuery.m_lastCommandBuffer->release(); - m_occlusionQuery.m_lastCommandBuffer = GetCurrentCommandBuffer()->retain(); + + // Get and retain the current command buffer + m_occlusionQuery.m_lastCommandBuffer = 
GetAndRetainCurrentCommandBufferIfNotCompleted(); } private: @@ -491,6 +505,7 @@ class MetalRenderer : public Renderer // Active objects MetalCommandBuffer m_currentCommandBuffer{}; + std::vector m_executingCommandBuffers; MetalEncoderType m_encoderType = MetalEncoderType::None; MTL::CommandEncoder* m_commandEncoder = nullptr; From b48656850ec63a771b252dc4b9b63e1dd315b7f7 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 19 Nov 2024 16:46:46 +0100 Subject: [PATCH 270/368] limit height to 1 for 1D textures --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 03cd9285d..d8c52d4d8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -27,15 +27,12 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM effectiveBaseHeight = std::max(1, effectiveBaseHeight); effectiveBaseDepth = std::max(1, effectiveBaseDepth); - desc->setWidth(effectiveBaseWidth); - desc->setHeight(effectiveBaseHeight); - desc->setMipmapLevelCount(mipLevels); - MTL::TextureType textureType; switch (dim) { case Latte::E_DIM::DIM_1D: textureType = MTL::TextureType1D; + effectiveBaseHeight = 1; break; case Latte::E_DIM::DIM_2D: case Latte::E_DIM::DIM_2D_MSAA: @@ -59,6 +56,10 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM } desc->setTextureType(textureType); + desc->setWidth(effectiveBaseWidth); + desc->setHeight(effectiveBaseHeight); + desc->setMipmapLevelCount(mipLevels); + if (textureType == MTL::TextureType3D) { desc->setDepth(effectiveBaseDepth); From b14098426469ce7595794f110bbc5b9b874ff407 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 19 Nov 2024 17:48:28 +0100 Subject: [PATCH 271/368] fix: texture sample gradient errors --- 
.../LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 3ed15f73e..a08a84784 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2240,8 +2240,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex bool unnormalizationHandled = false; bool useTexelCoordinates = false; - bool isRead = ((texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || - texOpcode == GPU7_TEX_INST_LD); + bool isRead = ((texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || texOpcode == GPU7_TEX_INST_LD); // handle illegal combinations if (texOpcode == GPU7_TEX_INST_FETCH4 && (texDim == Latte::E_DIM::DIM_1D || texDim == Latte::E_DIM::DIM_1D_ARRAY)) @@ -2459,7 +2458,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->add(")"); } } - else if (!isRead && !isGather/*texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ*/) + else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) { src->add(", level(0.0)"); } @@ -2469,9 +2468,9 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if (texOpcode == GPU7_TEX_INST_SAMPLE_G) { if (texDim == Latte::E_DIM::DIM_2D || - texDim == Latte::E_DIM::DIM_1D ) + texDim == Latte::E_DIM::DIM_1D) { - src->add(",gradH.xy,gradV.xy"); + src->add(", 
gradient2d(gradH.xy, gradV.xy)"); } else { From c5bef60dbb3d7867bfb8340a307f01f9c6c9d2a8 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 19 Nov 2024 18:27:02 +0100 Subject: [PATCH 272/368] save pipeline even if compilation failed --- .../Renderer/Metal/MetalPipelineCache.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index dcd957c8a..1c69b5312 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -416,7 +416,7 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) vertexShader = LatteSHRC_FindVertexShader(cachedPipeline->vsHash.baseHash, cachedPipeline->vsHash.auxHash); if (!vertexShader) { - cemuLog_logDebug(LogType::Force, "Vertex shader not found in cache"); + cemuLog_log(LogType::Force, "Vertex shader not found in cache"); return; } } @@ -426,7 +426,7 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) geometryShader = LatteSHRC_FindGeometryShader(cachedPipeline->gsHash.baseHash, cachedPipeline->gsHash.auxHash); if (!geometryShader) { - cemuLog_logDebug(LogType::Force, "Geometry shader not found in cache"); + cemuLog_log(LogType::Force, "Geometry shader not found in cache"); return; } } @@ -436,7 +436,7 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) pixelShader = LatteSHRC_FindPixelShader(cachedPipeline->psHash.baseHash, cachedPipeline->psHash.auxHash); if (!pixelShader) { - cemuLog_logDebug(LogType::Force, "Pixel shader not found in cache"); + cemuLog_log(LogType::Force, "Pixel shader not found in cache"); return; } } @@ -459,14 +459,11 @@ void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) // destroy pp early } - // on success, cache the pipeline - if (pipelineObject->m_pipeline) - { - uint64 pipelineStateHash = 
CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr); - m_pipelineCacheLock.lock(); - m_pipelineCache[pipelineStateHash] = pipelineObject; - m_pipelineCacheLock.unlock(); - } + // Cache the pipeline + uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr); + m_pipelineCacheLock.lock(); + m_pipelineCache[pipelineStateHash] = pipelineObject; + m_pipelineCacheLock.unlock(); // clean up s_spinlockSharedInternal.lock(); From 7f5040a6e08ca3a362b481777732e5f969611946 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 19 Nov 2024 19:13:15 +0100 Subject: [PATCH 273/368] don't release autoreleased objects --- src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp | 1 - src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp | 1 - src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 ++---- .../HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp | 1 - src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 1 - 5 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp index 8a69a442a..aec662bb7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp @@ -31,7 +31,6 @@ MTL::RenderPipelineState* MetalOutputShaderCache::GetPipeline(RendererOutputShad if (error) { cemuLog_log(LogType::Force, "error creating output render pipeline state: {}", error->localizedDescription()->utf8String()); - error->release(); } return renderPipelineState; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index 194498ecc..fb92727d9 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -389,7 +389,6 @@ bool MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool if (error) { cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); - error->release(); } if (showInOverlay) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 3f7c46da5..4a6c99539 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -125,8 +125,6 @@ MetalRenderer::MetalRenderer() if (error) { cemuLog_log(LogType::Force, "failed to create utility library (error: {})", error->localizedDescription()->utf8String()); - error->release(); - throw; return; } @@ -1666,7 +1664,7 @@ void MetalRenderer::EndEncoding() if (m_commandEncoder) { m_commandEncoder->endEncoding(); - m_commandEncoder->release(); + //m_commandEncoder->release(); m_commandEncoder = nullptr; m_encoderType = MetalEncoderType::None; @@ -1720,7 +1718,7 @@ void MetalRenderer::ProcessFinishedCommandBuffers() if (CommandBufferCompleted(commandBuffer)) { m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer); - commandBuffer->release(); + //commandBuffer->release(); it = m_executingCommandBuffers.erase(it); atLeastOneCompleted = true; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp index ded711f9f..6789505c3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp @@ -16,7 +16,6 @@ MetalVoidVertexPipeline::MetalVoidVertexPipeline(class MetalRenderer* mtlRendere if (error) { cemuLog_log(LogType::Force, "error creating hybrid render pipeline state: {}", 
error->localizedDescription()->utf8String()); - error->release(); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index e81ee59df..ab33e0716 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -185,7 +185,6 @@ void RendererShaderMtl::CompileInternal() if (error) { cemuLog_log(LogType::Force, "failed to create library: {} -> {}", error->localizedDescription()->utf8String(), m_mslCode.c_str()); - error->release(); FinishCompilation(); return; } From fda2f406402d2eb724407a08cac05e212bc35d06 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 19 Nov 2024 19:55:54 +0100 Subject: [PATCH 274/368] fix: output shader issues --- .../HW/Latte/Renderer/RendererOuputShader.cpp | 66 ++++++++++++------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp index bcb897433..55c97a3a1 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp @@ -1,5 +1,6 @@ #include "Cafe/HW/Latte/Renderer/RendererOuputShader.h" #include "Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h" +#include "HW/Latte/Renderer/Renderer.h" const std::string RendererOutputShader::s_copy_shader_source = R"( @@ -141,7 +142,7 @@ vec3 BicubicHermiteTexture(vec2 uv, vec4 texelSize) vec2 pixel = uv*texelSize.zw + 0.5; vec2 frac = fract(pixel); pixel = floor(pixel) / texelSize.zw - vec2(texelSize.xy/2.0); - + vec4 doubleSize = texelSize*2.0; vec3 C00 = texture(textureSrc, pixel + vec2(-texelSize.x ,-texelSize.y)).rgb; @@ -244,7 +245,11 @@ fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[t RendererOutputShader::RendererOutputShader(const std::string& vertex_source, const std::string& fragment_source) { - auto finalFragmentSrc = PrependFragmentPreamble(fragment_source); 
+ std::string finalFragmentSrc; + if (g_renderer->GetType() == RendererAPI::Metal) + finalFragmentSrc = fragment_source; + else + finalFragmentSrc = PrependFragmentPreamble(fragment_source); m_vertex_shader = g_renderer->shader_create(RendererShader::ShaderType::kVertex, 0, 0, vertex_source, false, false); m_fragment_shader = g_renderer->shader_create(RendererShader::ShaderType::kFragment, 0, 0, finalFragmentSrc, false, false); @@ -470,24 +475,41 @@ layout(location = 0) out vec4 colorOut0; } void RendererOutputShader::InitializeStatic() { - std::string vertex_source, vertex_source_ud; - // vertex shader - if (g_renderer->GetType() == RendererAPI::OpenGL) - { - vertex_source = GetOpenGlVertexSource(false); - vertex_source_ud = GetOpenGlVertexSource(true); - } - else if (g_renderer->GetType() == RendererAPI::Vulkan) - { - vertex_source = GetVulkanVertexSource(false); - vertex_source_ud = GetVulkanVertexSource(true); - } - s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source); - s_copy_shader_ud = new RendererOutputShader(vertex_source_ud, s_copy_shader_source); - - s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source); - s_bicubic_shader_ud = new RendererOutputShader(vertex_source_ud, s_bicubic_shader_source); - - s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source); - s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source); + if (g_renderer->GetType() == RendererAPI::Metal) + { + std::string vertex_source = GetMetalVertexSource(false); + std::string vertex_source_ud = GetMetalVertexSource(true); + + s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source_mtl); + s_copy_shader_ud = new RendererOutputShader(vertex_source_ud, s_copy_shader_source_mtl); + + s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source_mtl); + s_bicubic_shader_ud = new RendererOutputShader(vertex_source_ud, 
s_bicubic_shader_source_mtl); + + s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source_mtl); + s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source_mtl); + } + else + { + std::string vertex_source, vertex_source_ud; + // vertex shader + if (g_renderer->GetType() == RendererAPI::OpenGL) + { + vertex_source = GetOpenGlVertexSource(false); + vertex_source_ud = GetOpenGlVertexSource(true); + } + else if (g_renderer->GetType() == RendererAPI::Vulkan) + { + vertex_source = GetVulkanVertexSource(false); + vertex_source_ud = GetVulkanVertexSource(true); + } + s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source); + s_copy_shader_ud = new RendererOutputShader(vertex_source_ud, s_copy_shader_source); + + s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source); + s_bicubic_shader_ud = new RendererOutputShader(vertex_source_ud, s_bicubic_shader_source); + + s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source); + s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source); + } } From 9095035532bf69edd52613ab6210f5db5d161660 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 20 Nov 2024 19:30:30 +0100 Subject: [PATCH 275/368] only set array length for texture arrays --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index d8c52d4d8..da00d7ec8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -64,15 +64,11 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM { desc->setDepth(effectiveBaseDepth); } - else if (textureType == MTL::TextureTypeCube) - { - // Do nothing - } else if (textureType == MTL::TextureTypeCubeArray) { 
desc->setArrayLength(effectiveBaseDepth / 6); } - else + else if (textureType == MTL::TextureType2DArray) { desc->setArrayLength(effectiveBaseDepth); } From d9f857bcc4588e02b5dd7980299da9cceea288cf Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 20 Nov 2024 19:38:53 +0100 Subject: [PATCH 276/368] only set array length for texture view arrays --- .../Renderer/Metal/LatteTextureViewMtl.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 5374126ac..e77e47156 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -2,6 +2,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Metal/MTLTexture.hpp" uint32 LatteTextureMtl_AdjustTextureCompSel(Latte::E_GX2SURFFMT format, uint32 compSel) { @@ -158,21 +159,21 @@ MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) uint32 baseLevel = firstMip; uint32 levelCount = this->numMip; - uint32 baseLayer; - uint32 layerCount; - // TODO: check if base texture is 3D texture as well + uint32 baseLayer = 0; + uint32 layerCount = 1; + + // TODO: check if base texture is 3D texture as well? 
if (textureType == MTL::TextureType3D) { cemu_assert_debug(firstMip == 0); cemu_assert_debug(this->numSlice == baseTexture->depth); - baseLayer = 0; - layerCount = 1; } - else - { - baseLayer = firstSlice; + // Cube array needs to have layer count multiple of 6 as opposed to when creating a texture + else if (textureType == MTL::TextureTypeCubeArray || textureType == MTL::TextureType2DArray) + { + baseLayer = firstSlice; layerCount = this->numSlice; - } + } MTL::TextureSwizzleChannels swizzle; swizzle.red = GetMtlTextureSwizzle(compSelR); From 732e3be63daa7694579daa01c3371fa8d954b775 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 20 Nov 2024 19:44:04 +0100 Subject: [PATCH 277/368] explicitly make integer constants int --- .../HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index a08a84784..f6caf319b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -849,7 +849,7 @@ static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, L else if( GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[operandIndex].sel) ) { if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) - src->addFmt("0x{:x}", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + src->addFmt("int(0x{:x})", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) src->addFmt("uint(0x{:x})", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); else if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) From 66ad59db9227be3380b831cdfd959fd1d9a619b1 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 22 Nov 2024 19:44:49 +0100 
Subject: [PATCH 278/368] implement state 5 through a draw call --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 48 ++++++++++++--- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 4 ++ .../Renderer/Metal/UtilityShaderSource.h | 59 +++++++++---------- 3 files changed, 74 insertions(+), 37 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 4a6c99539..cfb63fc5a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -21,6 +21,9 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Metal/MTLPixelFormat.hpp" +#include "Metal/MTLRenderCommandEncoder.hpp" +#include "Metal/MTLRenderPipeline.hpp" #include "config/CemuConfig.h" #include "gui/guiWrapper.h" @@ -125,7 +128,26 @@ MetalRenderer::MetalRenderer() if (error) { cemuLog_log(LogType::Force, "failed to create utility library (error: {})", error->localizedDescription()->utf8String()); - return; + } + + // Pipelines + MTL::Function* vertexFullscreenFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); + MTL::Function* fragmentCopyDepthToColorFunction = utilityLibrary->newFunction(ToNSString("fragmentCopyDepthToColor")); + + MTL::RenderPipelineDescriptor* rpd = MTL::RenderPipelineDescriptor::alloc()->init(); + rpd->setVertexFunction(vertexFullscreenFunction); + rpd->setFragmentFunction(fragmentCopyDepthToColorFunction); + // TODO: don't hardcode the format + rpd->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatR16Unorm); + + vertexFullscreenFunction->release(); + fragmentCopyDepthToColorFunction->release(); + + error = nullptr; + m_copyDepthToColorPipeline = m_device->newRenderPipelineState(rpd, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create copy depth to color pipeline (error: {})", 
error->localizedDescription()->utf8String()); } // Void vertex pipelines @@ -142,8 +164,7 @@ MetalRenderer::~MetalRenderer() //delete m_copyTextureToTexturePipeline; //delete m_restrideBufferPipeline; - //m_presentPipelineLinear->release(); - //m_presentPipelineSRGB->release(); + m_copyDepthToColorPipeline->release(); delete m_outputShaderCache; delete m_pipelineCache; @@ -1348,14 +1369,27 @@ void MetalRenderer::draw_handleSpecialState5() LatteTextureView* colorBuffer = LatteMRT::GetColorAttachment(0); LatteTextureView* depthBuffer = LatteMRT::GetDepthAttachment(); + auto mtlDepthTexture = static_cast(depthBuffer)->GetRGBAView(); sint32 vpWidth, vpHeight; LatteMRT::GetVirtualViewportDimensions(vpWidth, vpHeight); - surfaceCopy_copySurfaceWithFormatConversion( - depthBuffer->baseTexture, depthBuffer->firstMip, depthBuffer->firstSlice, - colorBuffer->baseTexture, colorBuffer->firstMip, colorBuffer->firstSlice, - vpWidth, vpHeight); + // Sadly, we need to end encoding to ensure that the depth data is up-to-date + + // Copy depth to color + auto renderCommandEncoder = GetRenderCommandEncoder(); + + auto& encoderState = m_state.m_encoderState; + + renderCommandEncoder->setRenderPipelineState(m_copyDepthToColorPipeline); + // TODO: make a helper function for this + encoderState.m_renderPipelineState = m_copyDepthToColorPipeline; + SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_FRAGMENT, mtlDepthTexture, GET_HELPER_TEXTURE_BINDING(0)); + // TODO: make a helper function for this + renderCommandEncoder->setFragmentBytes(&vpWidth, sizeof(sint32), GET_HELPER_BUFFER_BINDING(0)); + encoderState.m_buffers[METAL_SHADER_TYPE_FRAGMENT][GET_HELPER_BUFFER_BINDING(0)] = {nullptr}; + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 2f8514892..6d5bea6c6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -6,6 +6,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" +#include "Metal/MTLRenderPipeline.hpp" #include struct MetalBufferAllocation @@ -471,6 +472,9 @@ class MetalRenderer : public Renderer class MetalDepthStencilCache* m_depthStencilCache; class MetalSamplerCache* m_samplerCache; + // Pipelines + MTL::RenderPipelineState* m_copyDepthToColorPipeline; + // Void vertex pipelines class MetalVoidVertexPipeline* m_copyBufferToBufferPipeline; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index 9fba19467..2041f4f88 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -8,24 +8,24 @@ using namespace metal; #define GET_BUFFER_BINDING(index) (28 + index) #define GET_TEXTURE_BINDING(index) (29 + index) -#define GET_SAMPLER_BINDING(index) (14 + index)\n +#define GET_SAMPLER_BINDING(index) (14 + index) + +constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; + +struct VertexOut { + float4 position [[position]]; + float2 texCoord; +}; + +vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { + VertexOut out; + out.position = float4(positions[vid], 0.0, 1.0); + out.texCoord = positions[vid] * 0.5 + 0.5; + out.texCoord.y = 1.0 - out.texCoord.y; + + return out; +} -//constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; -// -//struct VertexOut { -// float4 position [[position]]; -// float2 texCoord; -//}; -// -//vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { -// VertexOut out; -// out.position 
= float4(positions[vid], 0.0, 1.0); -// out.texCoord = positions[vid] * 0.5 + 0.5; -// out.texCoord.y = 1.0 - out.texCoord.y; -// -// return out; -//} -// //fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], //sampler samplr [[sampler(0)]]) { // return tex.sample(samplr, in.texCoord); //} @@ -34,19 +34,18 @@ vertex void vertexCopyBufferToBuffer(uint vid [[vertex_id]], device uint8_t* src dst[vid] = src[vid]; } -//vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d src [[texture(GET_TEXTURE_BINDING(0))]], texture2d dst [[texture(GET_TEXTURE_BINDING(1))]], constant uint32_t& width [[buffer(GET_BUFFER_BINDING(0))]]) { -// uint2 coord = uint2(vid % width, vid / width); -// return dst.write(float4(src.read(coord).r, 0.0, 0.0, 0.0), coord); -//} +fragment float4 fragmentCopyDepthToColor(VertexOut in [[stage_in]], texture2d src [[texture(GET_TEXTURE_BINDING(0))]]) { + return float4(src.read(uint2(in.position.xy)).r, 0.0, 0.0, 0.0); +} -struct RestrideParams { - uint oldStride; - uint newStride; -}; +//struct RestrideParams { +// uint oldStride; +// uint newStride; +//}; -vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]], constant RestrideParams& params [[buffer(GET_BUFFER_BINDING(2))]]) { - for (uint32_t i = 0; i < params.oldStride; i++) { - dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; - } -} +//vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer//(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]], constant //RestrideParams& params [[buffer(GET_BUFFER_BINDING(2))]]) { +// for (uint32_t i = 0; i < params.oldStride; i++) { +// dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; +// } +//} )"; From 00857b233b6c92dc3ecb5c4b0341d960f4a1665a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 22 Nov 2024 
20:03:41 +0100 Subject: [PATCH 279/368] support arbitrary pixel formats for state 5 --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 45 +++++++++++-------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 3 +- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index cfb63fc5a..a29a23568 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -134,22 +134,12 @@ MetalRenderer::MetalRenderer() MTL::Function* vertexFullscreenFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); MTL::Function* fragmentCopyDepthToColorFunction = utilityLibrary->newFunction(ToNSString("fragmentCopyDepthToColor")); - MTL::RenderPipelineDescriptor* rpd = MTL::RenderPipelineDescriptor::alloc()->init(); - rpd->setVertexFunction(vertexFullscreenFunction); - rpd->setFragmentFunction(fragmentCopyDepthToColorFunction); - // TODO: don't hardcode the format - rpd->colorAttachments()->object(0)->setPixelFormat(MTL::PixelFormatR16Unorm); - + m_copyDepthToColorDesc = MTL::RenderPipelineDescriptor::alloc()->init(); + m_copyDepthToColorDesc->setVertexFunction(vertexFullscreenFunction); + m_copyDepthToColorDesc->setFragmentFunction(fragmentCopyDepthToColorFunction); vertexFullscreenFunction->release(); fragmentCopyDepthToColorFunction->release(); - error = nullptr; - m_copyDepthToColorPipeline = m_device->newRenderPipelineState(rpd, &error); - if (error) - { - cemuLog_log(LogType::Force, "failed to create copy depth to color pipeline (error: {})", error->localizedDescription()->utf8String()); - } - // Void vertex pipelines if (m_isAppleGPU) m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); @@ -164,7 +154,9 @@ MetalRenderer::~MetalRenderer() //delete m_copyTextureToTexturePipeline; //delete m_restrideBufferPipeline; - 
m_copyDepthToColorPipeline->release(); + m_copyDepthToColorDesc->release(); + for (const auto [pixelFormat, pipeline] : m_copyDepthToColorPipelines) + pipeline->release(); delete m_outputShaderCache; delete m_pipelineCache; @@ -1369,22 +1361,39 @@ void MetalRenderer::draw_handleSpecialState5() LatteTextureView* colorBuffer = LatteMRT::GetColorAttachment(0); LatteTextureView* depthBuffer = LatteMRT::GetDepthAttachment(); - auto mtlDepthTexture = static_cast(depthBuffer)->GetRGBAView(); + auto colorTextureMtl = static_cast(colorBuffer); + auto depthTextureMtl = static_cast(depthBuffer); sint32 vpWidth, vpHeight; LatteMRT::GetVirtualViewportDimensions(vpWidth, vpHeight); + // Get the pipeline + MTL::PixelFormat colorPixelFormat = colorTextureMtl->GetRGBAView()->pixelFormat(); + auto& pipeline = m_copyDepthToColorPipelines[colorPixelFormat]; + if (!pipeline) + { + m_copyDepthToColorDesc->colorAttachments()->object(0)->setPixelFormat(colorPixelFormat); + + NS::Error* error = nullptr; + pipeline = m_device->newRenderPipelineState(m_copyDepthToColorDesc, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create copy depth to color pipeline (error: {})", error->localizedDescription()->utf8String()); + } + } + // Sadly, we need to end encoding to ensure that the depth data is up-to-date + EndEncoding(); // Copy depth to color auto renderCommandEncoder = GetRenderCommandEncoder(); auto& encoderState = m_state.m_encoderState; - renderCommandEncoder->setRenderPipelineState(m_copyDepthToColorPipeline); + renderCommandEncoder->setRenderPipelineState(pipeline); // TODO: make a helper function for this - encoderState.m_renderPipelineState = m_copyDepthToColorPipeline; - SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_FRAGMENT, mtlDepthTexture, GET_HELPER_TEXTURE_BINDING(0)); + encoderState.m_renderPipelineState = pipeline; + SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_FRAGMENT, depthTextureMtl->GetRGBAView(), GET_HELPER_TEXTURE_BINDING(0)); // TODO: make 
a helper function for this renderCommandEncoder->setFragmentBytes(&vpWidth, sizeof(sint32), GET_HELPER_BUFFER_BINDING(0)); encoderState.m_buffers[METAL_SHADER_TYPE_FRAGMENT][GET_HELPER_BUFFER_BINDING(0)] = {nullptr}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 6d5bea6c6..010f3f922 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -473,7 +473,8 @@ class MetalRenderer : public Renderer class MetalSamplerCache* m_samplerCache; // Pipelines - MTL::RenderPipelineState* m_copyDepthToColorPipeline; + MTL::RenderPipelineDescriptor* m_copyDepthToColorDesc; + std::map m_copyDepthToColorPipelines; // Void vertex pipelines class MetalVoidVertexPipeline* m_copyBufferToBufferPipeline; From 14258cdb284215ef1031b37507e6726a44ad24d6 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 23 Nov 2024 08:39:50 +0100 Subject: [PATCH 280/368] Revert "only set array length for texture view arrays" This reverts commit d9f857bcc4588e02b5dd7980299da9cceea288cf. 
--- .../Renderer/Metal/LatteTextureViewMtl.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index e77e47156..5374126ac 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -2,7 +2,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "Metal/MTLTexture.hpp" uint32 LatteTextureMtl_AdjustTextureCompSel(Latte::E_GX2SURFFMT format, uint32 compSel) { @@ -159,21 +158,21 @@ MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) uint32 baseLevel = firstMip; uint32 levelCount = this->numMip; - uint32 baseLayer = 0; - uint32 layerCount = 1; - - // TODO: check if base texture is 3D texture as well? + uint32 baseLayer; + uint32 layerCount; + // TODO: check if base texture is 3D texture as well if (textureType == MTL::TextureType3D) { cemu_assert_debug(firstMip == 0); cemu_assert_debug(this->numSlice == baseTexture->depth); + baseLayer = 0; + layerCount = 1; } - // Cube array needs to have layer count multiple of 6 as opposed to when creating a texture - else if (textureType == MTL::TextureTypeCubeArray || textureType == MTL::TextureType2DArray) - { - baseLayer = firstSlice; + else + { + baseLayer = firstSlice; layerCount = this->numSlice; - } + } MTL::TextureSwizzleChannels swizzle; swizzle.red = GetMtlTextureSwizzle(compSelR); From 009dab8a280441baada89999a241c36d4954df57 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 23 Nov 2024 08:41:00 +0100 Subject: [PATCH 281/368] remove useless includes --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 1 - 2 files changed, 4 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a29a23568..890295127 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -21,9 +21,6 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" -#include "Metal/MTLPixelFormat.hpp" -#include "Metal/MTLRenderCommandEncoder.hpp" -#include "Metal/MTLRenderPipeline.hpp" #include "config/CemuConfig.h" #include "gui/guiWrapper.h" diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 010f3f922..3f508ae8c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -6,7 +6,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" -#include "Metal/MTLRenderPipeline.hpp" #include struct MetalBufferAllocation From 05b603d652da5b78d8e5992169cc1788e17add63 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 23 Nov 2024 18:06:13 +0100 Subject: [PATCH 282/368] fix: incorrect texture read coord type --- .../Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 4 ++-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index f6caf319b..20b0e8459 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2316,7 +2316,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // handle integer coordinates for texelFetch if (texDim == 
Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) { - src->add("int2("); + src->add("uint2("); src->add("float2("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, texCoordDataType); src->addFmt(", "); @@ -2330,7 +2330,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex else if (texDim == Latte::E_DIM::DIM_1D) { // VC DS games forget to initialize textures and use texel fetch on an uninitialized texture (a dim of 0 maps to 1D) - src->add("int("); + src->add("uint("); src->add("float("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, (texOpcode == GPU7_TEX_INST_LD) ? LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT); src->addFmt(")*supportBuffer.tex{}Scale.x", texInstruction->textureFetch.textureIndex); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 890295127..82bf4d4b7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,7 +20,6 @@ #include "Cemu/Logging/CemuLogging.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" -#include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "config/CemuConfig.h" #include "gui/guiWrapper.h" From 18bd24b9c72b7e395dddc527bbf5dca7e438058c Mon Sep 17 00:00:00 2001 From: Samo Z Date: Sat, 30 Nov 2024 20:00:06 +0100 Subject: [PATCH 283/368] only set concurrent compilation if Metal 3 is available --- src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp | 1 - src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 7 ++++++- src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 9de3bd1ce..6c4a251e7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -2,7 +2,6 @@ #include "Cemu/Logging/CemuLogging.h" #include "HW/Latte/Core/LatteTextureLoader.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" -#include "Metal/MTLPixelFormat.hpp" std::map MTL_COLOR_FORMAT_TABLE = { {Latte::E_GX2SURFFMT::INVALID_FORMAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}}, diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 3f508ae8c..6a5db69b3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -6,7 +6,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" -#include struct MetalBufferAllocation { @@ -277,6 +276,12 @@ class MetalRenderer : public Renderer // Helpers MetalPerformanceMonitor& GetPerformanceMonitor() { return m_performanceMonitor; } + void SetShouldMaximizeConcurrentCompilation(bool shouldMaximizeConcurrentCompilation) + { + if (m_supportsMetal3) + m_device->setShouldMaximizeConcurrentCompilation(shouldMaximizeConcurrentCompilation); + } + bool IsCommandBufferActive() const { return (m_currentCommandBuffer.m_commandBuffer && !m_currentCommandBuffer.m_commited); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index ab33e0716..c77443c81 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -89,12 +89,12 @@ class ShaderMtlThreadPool void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) { // Maximize shader compilation speed - static_cast(g_renderer.get())->GetDevice()->setShouldMaximizeConcurrentCompilation(true); + static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(true); } void 
RendererShaderMtl::ShaderCacheLoading_end() { - static_cast(g_renderer.get())->GetDevice()->setShouldMaximizeConcurrentCompilation(false); + static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(false); } void RendererShaderMtl::ShaderCacheLoading_Close() From 1752126f5c1feed522e12dfdf23d5b4028f8ce99 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 1 Dec 2024 18:57:57 +0100 Subject: [PATCH 284/368] flush file streamer after each write --- .../HW/Latte/Renderer/Metal/MetalPipelineCache.cpp | 6 +++--- src/Cemu/FileCache/FileCache.cpp | 10 ++++++++-- src/Common/unix/FileStream_unix.cpp | 5 +++++ src/Common/unix/FileStream_unix.h | 2 ++ 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp index 1c69b5312..a922365b2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -1,17 +1,17 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Common/RegisterSerializer.h" #include "Cafe/HW/Latte/Core/LatteShaderCache.h" -#include "Cemu/FileCache/FileCache.h" -#include "Common/precompiled.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/ISA/LatteReg.h" -#include "HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" +#include "Cemu/FileCache/FileCache.h" +#include "Common/precompiled.h" #include "util/helpers/helpers.h" #include "config/ActiveSettings.h" diff --git a/src/Cemu/FileCache/FileCache.cpp b/src/Cemu/FileCache/FileCache.cpp index b284b66bd..820115d66 100644 --- a/src/Cemu/FileCache/FileCache.cpp +++ 
b/src/Cemu/FileCache/FileCache.cpp @@ -111,7 +111,7 @@ FileCache* FileCache::Create(const fs::path& path, uint32 extraVersion) fileCache->fileTableEntries[0].fileOffset = fileCache->fileTableOffset; fileCache->fileTableEntries[0].fileSize = fileCache->fileTableSize; // write header - + fs->writeU32(FILECACHE_MAGIC_V3); fs->writeU32(fileCache->extraVersion); fs->writeU64(fileCache->dataOffset); @@ -316,7 +316,7 @@ bool _uncompressFileData(const uint8* rawData, size_t rawSize, std::vectorSetPosition(this->dataOffset + currentStartOffset); fileStream->writeData(rawData, rawSize); +#ifdef __APPLE__ + fileStream->Flush(); +#endif // write file table entry fileStream->SetPosition(this->dataOffset + this->fileTableOffset + (uint64)(sizeof(FileTableEntry)*entryIndex)); fileStream->writeData(this->fileTableEntries + entryIndex, sizeof(FileTableEntry)); +#ifdef __APPLE__ + fileStream->Flush(); +#endif if (isCompressed) free(rawData); } diff --git a/src/Common/unix/FileStream_unix.cpp b/src/Common/unix/FileStream_unix.cpp index 4bc9b5263..0e9f11895 100644 --- a/src/Common/unix/FileStream_unix.cpp +++ b/src/Common/unix/FileStream_unix.cpp @@ -116,6 +116,11 @@ void FileStream::extract(std::vector& data) readData(data.data(), fileSize); } +void FileStream::Flush() +{ + m_fileStream.flush(); +} + uint32 FileStream::readData(void* data, uint32 length) { SyncReadWriteSeek(false); diff --git a/src/Common/unix/FileStream_unix.h b/src/Common/unix/FileStream_unix.h index 12c971d14..0a2fa7ed9 100644 --- a/src/Common/unix/FileStream_unix.h +++ b/src/Common/unix/FileStream_unix.h @@ -22,6 +22,8 @@ class FileStream bool SetEndOfFile(); void extract(std::vector& data); + void Flush(); + // reading uint32 readData(void* data, uint32 length); bool readU64(uint64& v); From ac96a1dd53e6abea980ffbd8493f2766727a98e9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 8 Dec 2024 15:00:48 +0100 Subject: [PATCH 285/368] output all fragment inputs in the vertex shader --- 
.../LatteDecompilerEmitMSLHeader.hpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 04e3410de..395b24213 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -2,6 +2,7 @@ #include "Common/precompiled.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "HW/Latte/Core/LatteShader.h" namespace LatteDecompiler { @@ -185,6 +186,7 @@ namespace LatteDecompiler LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); auto parameterMask = shaderContext->shader->outputParameterMask; + bool psInputsWritten[GPU7_PS_MAX_INPUTS] = {false}; for (uint32 i = 0; i < 32; i++) { if ((parameterMask&(1 << i)) == 0) @@ -205,6 +207,8 @@ namespace LatteDecompiler if (psInputIndex == -1) continue; // no ps input + psInputsWritten[psInputIndex] = true; + src->addFmt("float4 passParameterSem{}", psInputTable->import[psInputIndex].semanticId); if (!isRectVertexShader) { @@ -217,6 +221,19 @@ namespace LatteDecompiler src->addFmt(";" _CRLF); } + // TODO: handle this in the fragment shader instead? 
+ // Declare all PS inputs that are not written by the VS + for (uint32 i = 0; i < psInputTable->count; i++) + { + if (psInputsWritten[i]) + continue; + + if (psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + + src->addFmt("float4 unknown{} [[user(locn{})]];" _CRLF, psInputTable->import[i].semanticId, i); + } + src->add("};" _CRLF _CRLF); if (isRectVertexShader) From 434ffc982852c75b557ffbd053594c84de6d9214 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 10 Dec 2024 08:11:10 +0100 Subject: [PATCH 286/368] clamp texture mip count --- src/Cafe/HW/Latte/Core/LatteTexture.cpp | 34 ++++++++++++++++ src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp | 40 +++---------------- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 4 ++ .../Renderer/Metal/LatteTextureViewMtl.cpp | 4 ++ 4 files changed, 47 insertions(+), 35 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTexture.cpp b/src/Cafe/HW/Latte/Core/LatteTexture.cpp index 3c5610006..4445fb26b 100644 --- a/src/Cafe/HW/Latte/Core/LatteTexture.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTexture.cpp @@ -1308,6 +1308,40 @@ LatteTexture::LatteTexture(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddre { this->enableReadback = true; } + + // calculate number of potential mip levels (from effective size) + sint32 effectiveWidth = width; + sint32 effectiveHeight = height; + sint32 effectiveDepth = depth; + if (this->overwriteInfo.hasResolutionOverwrite) + { + effectiveWidth = this->overwriteInfo.width; + effectiveHeight = this->overwriteInfo.height; + effectiveDepth = this->overwriteInfo.depth; + } + this->maxPossibleMipLevels = 1; + if (dim != Latte::E_DIM::DIM_3D) + { + for (sint32 i = 0; i < 20; i++) + { + if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1) + { + this->maxPossibleMipLevels = i + 1; + break; + } + } + } + else + { + for (sint32 i = 0; i < 20; i++) + { + if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1 && (effectiveDepth >> i) <= 1) + { + 
this->maxPossibleMipLevels = i + 1; + break; + } + } + } } LatteTexture::~LatteTexture() diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp b/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp index 50aa4d876..25c9f54b3 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp @@ -13,7 +13,7 @@ struct TexScaleXY float xy[2]; }; -struct +struct { TexScaleXY perUnit[Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE]; // stores actualResolution/effectiveResolution ratio for each texture }LatteTextureScale[static_cast(LatteConst::ShaderType::TotalCount)] = { }; @@ -73,46 +73,16 @@ void LatteTexture_ReloadData(LatteTexture* tex) LatteTextureView* LatteTexture_CreateTexture(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) { const auto tex = g_renderer->texture_createTextureEx(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth); + // init slice/mip info array LatteTexture_InitSliceAndMipInfo(tex); LatteTexture_RegisterTextureMemoryOccupancy(tex); cemu_assert_debug(mipLevels != 0); - // calculate number of potential mip levels (from effective size) - sint32 effectiveWidth = width; - sint32 effectiveHeight = height; - sint32 effectiveDepth = depth; - if (tex->overwriteInfo.hasResolutionOverwrite) - { - effectiveWidth = tex->overwriteInfo.width; - effectiveHeight = tex->overwriteInfo.height; - effectiveDepth = tex->overwriteInfo.depth; - } - tex->maxPossibleMipLevels = 1; - if (dim != Latte::E_DIM::DIM_3D) - { - for (sint32 i = 0; i < 20; i++) - { - if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1) - { - tex->maxPossibleMipLevels = i + 1; - break; - } - } - } - else - { - for (sint32 i = 0; i < 20; i++) - { - if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1 && (effectiveDepth 
>> i) <= 1) - { - tex->maxPossibleMipLevels = i + 1; - break; - } - } - } + LatteTexture_ReloadData(tex); LatteTC_MarkTextureStillInUse(tex); LatteTC_RegisterTexture(tex); + // create initial view that maps to the whole texture tex->baseView = tex->GetOrCreateView(0, tex->mipLevels, 0, tex->depth); return tex->baseView; @@ -371,4 +341,4 @@ uint64 LatteTexture_getNextUpdateEventCounter() void LatteTexture_init() { -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index da00d7ec8..aedd5a3ec 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -56,6 +56,10 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM } desc->setTextureType(textureType); + // Clamp mip levels + mipLevels = std::min(mipLevels, (uint32)maxPossibleMipLevels); + mipLevels = std::max(mipLevels, (uint32)1); + desc->setWidth(effectiveBaseWidth); desc->setHeight(effectiveBaseHeight); desc->setMipmapLevelCount(mipLevels); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 5374126ac..3b157bf4e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -180,6 +180,10 @@ MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) swizzle.blue = GetMtlTextureSwizzle(compSelB); swizzle.alpha = GetMtlTextureSwizzle(compSelA); + // Clamp mip levels + levelCount = std::min(levelCount, m_baseTexture->maxPossibleMipLevels - baseLevel); + levelCount = std::max(levelCount, (uint32)1); + auto pixelFormat = GetMtlPixelFormat(format, m_baseTexture->IsDepth()); MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount), 
swizzle); From 74a1162a170f0e72aa204fbc541af86fd73df8cd Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 10 Dec 2024 16:44:31 +0100 Subject: [PATCH 287/368] only set array length for texture view arrays --- .../HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index 3b157bf4e..aa4481061 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -2,6 +2,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Metal/MTLTexture.hpp" uint32 LatteTextureMtl_AdjustTextureCompSel(Latte::E_GX2SURFFMT format, uint32 compSel) { @@ -158,20 +159,19 @@ MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) uint32 baseLevel = firstMip; uint32 levelCount = this->numMip; - uint32 baseLayer; - uint32 layerCount; + uint32 baseLayer = 0; + uint32 layerCount = 1; // TODO: check if base texture is 3D texture as well if (textureType == MTL::TextureType3D) { cemu_assert_debug(firstMip == 0); cemu_assert_debug(this->numSlice == baseTexture->depth); - baseLayer = 0; - layerCount = 1; } else { baseLayer = firstSlice; - layerCount = this->numSlice; + if (textureType == MTL::TextureTypeCubeArray || textureType == MTL::TextureType2DArray) + layerCount = this->numSlice; } MTL::TextureSwizzleChannels swizzle; From 137becb89471dcd81698cca61b1dcd4a09ffdba2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 14 Dec 2024 11:00:42 +0100 Subject: [PATCH 288/368] do safety checks before clearing --- .../LatteDecompilerAnalyzer.cpp | 1 - .../HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h | 5 +++++ 
.../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 17 ++++++++++++++--- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index e0b39767f..ad0f08177 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -551,7 +551,6 @@ namespace LatteDecompiler { bool isRectVertexShader = (static_cast(decompilerContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]) == LattePrimitiveMode::RECTS); - // TODO: also check for rect primitive if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && (decompilerContext->options->usesGeometryShader || isRectVertexShader)) decompilerContext->hasUniformVarBlock = true; // uf_verticesPerInstance } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index aedd5a3ec..a64a2f265 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -81,7 +81,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM desc->setPixelFormat(pixelFormat); MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsagePixelFormatView; - if (!Latte::IsCompressedFormat(format)) + if (FormatIsRenderable(format)) usage |= MTL::TextureUsageRenderTarget; desc->setUsage(usage); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 20fd6b9de..ba9ebc36b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -84,3 +84,8 @@ inline bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer) auto status = commandBuffer->status(); return (status == MTL::CommandBufferStatusCompleted || status == MTL::CommandBufferStatusError); 
} + +inline bool FormatIsRenderable(Latte::E_GX2SURFFMT format) +{ + return !Latte::IsCompressedFormat(format); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 82bf4d4b7..23b65e67e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -634,6 +634,12 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { + if (!FormatIsRenderable(hostTexture->format)) + { + cemuLog_logOnce(LogType::Force, "cannot clear texture with pixel format {}, because it's not renderable", hostTexture->format); + return; + } + auto mtlTexture = static_cast(hostTexture)->GetTexture(); ClearColorTextureInternal(mtlTexture, sliceIndex, mipIndex, r, g, b, a); @@ -641,6 +647,13 @@ void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sl void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) { + clearStencil = (clearStencil && GetMtlPixelFormatInfo(hostTexture->format, true).hasStencil); + if (!clearDepth && !clearStencil) + { + cemuLog_logOnce(LogType::Force, "skipping depth/stencil clear"); + return; + } + auto mtlTexture = static_cast(hostTexture)->GetTexture(); MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); @@ -654,7 +667,7 @@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl depthAttachment->setSlice(sliceIndex); depthAttachment->setLevel(mipIndex); } - if (clearStencil && GetMtlPixelFormatInfo(hostTexture->format, true).hasStencil) + if (clearStencil) { auto stencilAttachment = renderPassDescriptor->stencilAttachment(); 
stencilAttachment->setTexture(mtlTexture); @@ -2046,8 +2059,6 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s colorAttachment->setSlice(sliceIndex); colorAttachment->setLevel(mipIndex); - MTL::Texture* colorRenderTargets[8] = {nullptr}; - colorRenderTargets[0] = mtlTexture; GetTemporaryRenderCommandEncoder(renderPassDescriptor); renderPassDescriptor->release(); EndEncoding(); From ba9a9370fee645da84317c94393fb9b26467ff14 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 14 Dec 2024 11:21:35 +0100 Subject: [PATCH 289/368] make clear message more descriptive --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 23b65e67e..a578948ac 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -636,7 +636,7 @@ void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sl { if (!FormatIsRenderable(hostTexture->format)) { - cemuLog_logOnce(LogType::Force, "cannot clear texture with pixel format {}, because it's not renderable", hostTexture->format); + cemuLog_logOnce(LogType::Force, "cannot clear color texture with format {}, because it's not renderable", hostTexture->format); return; } From d64e0c9b6f688781ef3d48356d34e4e8d895db7c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 15 Dec 2024 13:18:24 +0100 Subject: [PATCH 290/368] rework gpu selection --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 54 ++++++++++++++++++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 4 +- .../Latte/Renderer/Vulkan/VulkanRenderer.cpp | 14 +++-- .../HW/Latte/Renderer/Vulkan/VulkanRenderer.h | 14 +---- src/config/CemuConfig.cpp | 4 +- src/config/CemuConfig.h | 2 +- src/gui/GeneralSettings2.cpp | 54 ++++++++++++++----- 7 files changed, 105 insertions(+), 41 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a578948ac..50c605237 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,6 +20,8 @@ #include "Cemu/Logging/CemuLogging.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Foundation/NSString.hpp" +#include "Metal/MTLDevice.hpp" #include "config/CemuConfig.h" #include "gui/guiWrapper.h" @@ -36,10 +38,53 @@ float supportBufferData[512 * 4]; // Defined in the OpenGL renderer void LatteDraw_handleSpecialState8_clearAsDepth(); +std::vector MetalRenderer::GetDevices() +{ + auto devices = MTL::CopyAllDevices(); + std::vector result; + result.reserve(devices->count()); + for (uint32 i = 0; i < devices->count(); i++) + { + MTL::Device* device = static_cast(devices->object(i)); + result.push_back(std::string(device->name()->utf8String())); + } + + return result; +} + MetalRenderer::MetalRenderer() { - m_device = MTL::CreateSystemDefaultDevice(); - m_commandQueue = m_device->newCommandQueue(); + // Pick a device + auto& config = GetConfig(); + const bool hasDeviceSet = !config.graphic_device_name.empty(); + + // If a device is set, try to find it + if (hasDeviceSet) + { + auto devices = MTL::CopyAllDevices(); + for (uint32 i = 0; i < devices->count(); i++) + { + MTL::Device* device = static_cast(devices->object(i)); + std::string name = std::string(device->name()->utf8String()); + if (name == config.graphic_device_name) + { + m_device = device; + break; + } + } + } + + if (!m_device) + { + if (hasDeviceSet) + { + cemuLog_log(LogType::Force, "The selected GPU ({}) could not be found. 
Using the system default device.", config.graphic_device_name); + config.graphic_device_name = ""; + } + + // Use the system default device + m_device = MTL::CreateSystemDefaultDevice(); + } // Feature support m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); @@ -50,6 +95,9 @@ MetalRenderer::MetalRenderer() CheckForPixelFormatSupport(m_pixelFormatSupport); + // Create command queue + m_commandQueue = m_device->newCommandQueue(); + // Synchronization resources m_event = m_device->newEvent(); @@ -523,6 +571,7 @@ void MetalRenderer::DeleteFontTextures() void MetalRenderer::AppendOverlayDebugInfo() { ImGui::Text("--- GPU info ---"); + ImGui::Text("GPU name %s", m_device->name()->utf8String()); ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no")); ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no")); ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? "yes" : "no")); @@ -636,6 +685,7 @@ void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sl { if (!FormatIsRenderable(hostTexture->format)) { + // TODO: handle this somehow? 
cemuLog_logOnce(LogType::Force, "cannot clear color texture with format {}, because it's not renderable", hostTexture->format); return; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 6a5db69b3..cf4fc29fa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -155,6 +155,8 @@ class MetalRenderer : public Renderer static constexpr uint32 OCCLUSION_QUERY_POOL_SIZE = 1024; static constexpr uint32 TEXTURE_READBACK_SIZE = 32 * 1024 * 1024; // 32 MB + static std::vector GetDevices(); + MetalRenderer(); ~MetalRenderer() override; @@ -459,7 +461,7 @@ class MetalRenderer : public Renderer MetalPerformanceMonitor m_performanceMonitor; // Metal objects - MTL::Device* m_device; + MTL::Device* m_device = nullptr; MTL::CommandQueue* m_commandQueue; // Feature support diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 998aac474..600d82de9 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -91,7 +91,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL DebugUtilsCallback(VkDebugUtilsMessageSeverityFla return VK_FALSE; } -std::vector VulkanRenderer::GetDevices() +std::vector VulkanRenderer::GetDevices() { if(!vkEnumerateInstanceVersion) { @@ -105,7 +105,7 @@ std::vector VulkanRenderer::GetDevices() apiVersion = VK_API_VERSION_1_1; } - std::vector result; + std::vector result; std::vector requiredExtensions; requiredExtensions.clear(); @@ -168,7 +168,7 @@ std::vector VulkanRenderer::GetDevices() physDeviceProps.pNext = &physDeviceIDProps; vkGetPhysicalDeviceProperties2(device, &physDeviceProps); - result.emplace_back(physDeviceProps.properties.deviceName, physDeviceIDProps.deviceUUID); + result.emplace_back(physDeviceProps.properties.deviceName); } } vkDestroySurfaceKHR(instance, surface, nullptr); @@ -181,7 +181,6 
@@ std::vector VulkanRenderer::GetDevices() vkDestroyInstance(instance, nullptr); return result; - } void VulkanRenderer::DetermineVendor() @@ -389,8 +388,7 @@ VulkanRenderer::VulkanRenderer() auto surface = CreateFramebufferSurface(m_instance, gui_getWindowInfo().window_main); auto& config = GetConfig(); - decltype(config.graphic_device_uuid) zero{}; - const bool has_device_set = config.graphic_device_uuid != zero; + const bool has_device_set = !config.graphic_device_name.empty(); VkPhysicalDevice fallbackDevice = VK_NULL_HANDLE; @@ -410,7 +408,7 @@ VulkanRenderer::VulkanRenderer() physDeviceProps.pNext = &physDeviceIDProps; vkGetPhysicalDeviceProperties2(device, &physDeviceProps); - if (memcmp(config.graphic_device_uuid.data(), physDeviceIDProps.deviceUUID, VK_UUID_SIZE) != 0) + if (config.graphic_device_name != physDeviceProps.properties.deviceName) continue; } @@ -423,7 +421,7 @@ VulkanRenderer::VulkanRenderer() { cemuLog_log(LogType::Force, "The selected GPU could not be found or is not suitable. 
Falling back to first available device instead"); m_physicalDevice = fallbackDevice; - config.graphic_device_uuid = {}; // resetting device selection + config.graphic_device_name = ""; // resetting device selection } else if (m_physicalDevice == VK_NULL_HANDLE) { diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index 867647a34..683094147 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -156,19 +156,7 @@ class VulkanRenderer : public Renderer sint32 texelCountY; }FormatInfoVK; - struct DeviceInfo - { - DeviceInfo(const std::string name, uint8* uuid) - : name(name) - { - std::copy(uuid, uuid + VK_UUID_SIZE, this->uuid.data()); - } - - std::string name; - std::array uuid; - }; - - static std::vector GetDevices(); + static std::vector GetDevices(); VulkanRenderer(); virtual ~VulkanRenderer(); diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index 0e39656b2..701e6be5b 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -212,7 +212,7 @@ void CemuConfig::Load(XMLConfigParser& parser) // graphics auto graphic = parser.get("Graphic"); graphic_api = graphic.get("api", kOpenGL); - graphic.get("device", graphic_device_uuid); + graphic.get("device", graphic_device_name); vsync = graphic.get("VSync", 0); gx2drawdone_sync = graphic.get("GX2DrawdoneSync", true); upscale_filter = graphic.get("UpscaleFilter", kBicubicHermiteFilter); @@ -468,7 +468,7 @@ void CemuConfig::Save(XMLConfigParser& parser) // graphics auto graphic = config.set("Graphic"); graphic.set("api", graphic_api); - graphic.set("device", graphic_device_uuid); + graphic.set("device", graphic_device_name); graphic.set("VSync", vsync); graphic.set("GX2DrawdoneSync", gx2drawdone_sync); //graphic.set("PrecompiledShaders", precompiled_shaders.GetValue()); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index d1e5e214a..0dc8cf2b8 
100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -462,7 +462,7 @@ struct CemuConfig // graphics ConfigValue graphic_api{ kVulkan }; - std::array graphic_device_uuid; + std::string graphic_device_name; ConfigValue vsync{ 0 }; // 0 = off, 1+ = on depending on render backend ConfigValue gx2drawdone_sync {true}; ConfigValue render_upside_down{ false }; diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 8b6e0ee15..7071c7e8f 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -1,3 +1,5 @@ +#include "Foundation/NSString.hpp" +#include "Metal/MTLDevice.hpp" #include "gui/wxgui.h" #include "gui/GeneralSettings2.h" #include "gui/CemuApp.h" @@ -27,6 +29,9 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#endif #include "Cafe/Account/Account.h" #include @@ -82,15 +87,15 @@ class wxInputDeviceDescription : public wxClientData IAudioInputAPI::DeviceDescriptionPtr m_description; }; -class wxVulkanUUID : public wxClientData +class wxGraphicsDevice : public wxClientData { public: - wxVulkanUUID(const VulkanRenderer::DeviceInfo& info) - : m_device_info(info) {} - const VulkanRenderer::DeviceInfo& GetDeviceInfo() const { return m_device_info; } + wxGraphicsDevice(const std::string& name) + : m_name(name) {} + const std::string& GetName() const { return m_name; } private: - VulkanRenderer::DeviceInfo m_device_info; + std::string m_name; }; class wxAccountData : public wxClientData @@ -1025,14 +1030,14 @@ void GeneralSettings2::StoreConfig() selection = m_graphic_device->GetSelection(); if(selection != wxNOT_FOUND) { - const auto* info = (wxVulkanUUID*)m_graphic_device->GetClientObject(selection); + const auto* info = (wxGraphicsDevice*)m_graphic_device->GetClientObject(selection); if(info) - config.graphic_device_uuid = info->GetDeviceInfo().uuid; + 
config.graphic_device_name = info->GetName(); else - config.graphic_device_uuid = {}; + config.graphic_device_name = ""; } else - config.graphic_device_uuid = {}; + config.graphic_device_name = ""; config.vsync = m_vsync->GetSelection(); @@ -1538,14 +1543,14 @@ void GeneralSettings2::HandleGraphicsApiSelection() { for(const auto& device : devices) { - m_graphic_device->Append(device.name, new wxVulkanUUID(device)); + m_graphic_device->Append(device, new wxGraphicsDevice(device)); } m_graphic_device->SetSelection(0); const auto& config = GetConfig(); for(size_t i = 0; i < devices.size(); ++i) { - if(config.graphic_device_uuid == devices[i].uuid) + if(config.graphic_device_name == devices[i]) { m_graphic_device->SetSelection(i); break; @@ -1566,9 +1571,30 @@ void GeneralSettings2::HandleGraphicsApiSelection() m_vsync->Select(selection); - // TODO: add an option to select the graphic device - m_graphic_device->Clear(); - m_graphic_device->Disable(); + m_graphic_device->Enable(); + m_graphic_device->Clear(); + +#if ENABLE_METAL + auto devices = MetalRenderer::GetDevices(); + if (!devices.empty()) + { + for (const auto& device : devices) + { + m_graphic_device->Append(device, new wxGraphicsDevice(device)); + } + m_graphic_device->SetSelection(0); + + const auto& config = GetConfig(); + for (size_t i = 0; i < devices.size(); ++i) + { + if (config.graphic_device_name == devices[i]) + { + m_graphic_device->SetSelection(i); + break; + } + } + } +#endif } } From c6e8b5c9332bdb9648d25dfcbb1c542cb511d4b3 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 16 Dec 2024 14:13:11 +0100 Subject: [PATCH 291/368] fix: depth stencil state issues --- src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp | 8 ---- .../LatteDecompilerEmitMSL.cpp | 2 +- .../Renderer/Metal/MetalDepthStencilCache.cpp | 39 +++++++++---------- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index 
712753a77..d654de2fc 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -450,14 +450,6 @@ bool LatteMRT::UpdateCurrentFBO() uint8 colorBufferMask = GetActiveColorBufferMask(pixelShader, LatteGPUState.contextNew); bool depthBufferMask = GetActiveDepthBufferMask(LatteGPUState.contextNew); - // if depth test is not used then detach the depth buffer - bool depthEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_ENABLE(); - bool stencilTestEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); - bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); - - if (!depthEnable && !stencilTestEnable && !backStencilEnable) - depthBufferMask = false; - bool hasResizedTexture = false; // set to true if any of the color buffers or the depth buffer reference a resized texture (via graphic pack texture rules) sLatteRenderTargetState.renderTargetIsResized = false; // real size diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 20b0e8459..264a0099a 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3121,7 +3121,7 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe src->add("float4 finalPos = "); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); src->add(";" _CRLF); - src->add("finalPos.xy = finalPos.xy * supportBuffer.windowSpaceToClipSpaceTransform - float2(1.0,1.0);"); + src->add("finalPos.xy = finalPos.xy * supportBuffer.windowSpaceToClipSpaceTransform - float2(1.0,1.0);" _CRLF); src->add("SET_POSITION(finalPos);"); } else diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp index 427530c2c..a1e4005b5 
100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp @@ -21,37 +21,36 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte return depthStencilState; // Depth stencil state - bool depthEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_ENABLE(); - auto depthFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_FUNC(); - bool depthWriteEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_WRITE_ENABLE(); + bool depthEnable = lcr.DB_DEPTH_CONTROL.get_Z_ENABLE(); + auto depthFunc = lcr.DB_DEPTH_CONTROL.get_Z_FUNC(); + bool depthWriteEnable = lcr.DB_DEPTH_CONTROL.get_Z_WRITE_ENABLE(); MTL::DepthStencilDescriptor* desc = MTL::DepthStencilDescriptor::alloc()->init(); - desc->setDepthWriteEnabled(depthWriteEnable); - if (depthEnable) { + desc->setDepthWriteEnabled(depthWriteEnable); desc->setDepthCompareFunction(GetMtlCompareFunc(depthFunc)); - } + } // Stencil state - bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); + bool stencilEnable = lcr.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); if (stencilEnable) { // get stencil control parameters - bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); - auto frontStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_F(); - auto frontStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_F(); - auto frontStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_F(); - auto frontStencilFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_F(); - auto backStencilFunc = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FUNC_B(); - auto backStencilZPass = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_B(); - auto backStencilZFail = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_B(); - auto backStencilFail = 
LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_FAIL_B(); + bool backStencilEnable = lcr.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); + auto frontStencilFunc = lcr.DB_DEPTH_CONTROL.get_STENCIL_FUNC_F(); + auto frontStencilZPass = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_F(); + auto frontStencilZFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_F(); + auto frontStencilFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_FAIL_F(); + auto backStencilFunc = lcr.DB_DEPTH_CONTROL.get_STENCIL_FUNC_B(); + auto backStencilZPass = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_B(); + auto backStencilZFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_B(); + auto backStencilFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_FAIL_B(); // get stencil control parameters - uint32 stencilCompareMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILMASK_F(); - uint32 stencilWriteMaskFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILWRITEMASK_F(); - uint32 stencilCompareMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); - uint32 stencilWriteMaskBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); + uint32 stencilCompareMaskFront = lcr.DB_STENCILREFMASK.get_STENCILMASK_F(); + uint32 stencilWriteMaskFront = lcr.DB_STENCILREFMASK.get_STENCILWRITEMASK_F(); + uint32 stencilCompareMaskBack = lcr.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); + uint32 stencilWriteMaskBack = lcr.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); MTL::StencilDescriptor* frontStencil = MTL::StencilDescriptor::alloc()->init(); frontStencil->setReadMask(stencilCompareMaskFront); From 4281f6e0c817e99f6db5bc00f61b2972fb28d1c6 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 16 Dec 2024 14:24:53 +0100 Subject: [PATCH 292/368] implement shader uint min max instructions --- .../LatteDecompilerEmitMSL.cpp | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 264a0099a..bc2becf8b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -1331,19 +1331,23 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte } else if( aluInstruction->opcode == ALU_OP2_INST_ADD_INT ) _emitALUOperationBinary(shaderContext, aluInstruction, " + "); - else if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MIN_INT ) + else if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MIN_INT || + aluInstruction->opcode == ALU_OP2_INST_MAX_UINT || aluInstruction->opcode == ALU_OP2_INST_MIN_UINT) { // not verified + bool isUnsigned = aluInstruction->opcode == ALU_OP2_INST_MAX_UINT || aluInstruction->opcode == ALU_OP2_INST_MIN_UINT; + auto opType = isUnsigned ? LATTE_DECOMPILER_DTYPE_UNSIGNED_INT : LATTE_DECOMPILER_DTYPE_SIGNED_INT; _emitInstructionOutputVariableName(shaderContext, aluInstruction); - if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT ) - src->add(" = max("); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, opType, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MAX_UINT ) + src->add("max("); else - src->add(" = min("); - _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); - _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("min("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, opType); src->add(", "); - _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); - _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 1, opType); + 
_emitTypeConversionSuffixMSL(shaderContext, opType, outputType); src->add(");" _CRLF); } else if( aluInstruction->opcode == ALU_OP2_INST_SUB_INT ) From fa004a33c6cbadeb4447b82f836e02239488406f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 18 Dec 2024 16:38:55 +0100 Subject: [PATCH 293/368] add an option to preserve position invariance --- src/Cafe/CafeSystem.cpp | 1 + src/Cafe/GameProfile/GameProfile.cpp | 4 ++++ src/Cafe/GameProfile/GameProfile.h | 4 +++- .../LatteDecompilerEmitMSLHeader.hpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 2 ++ src/gui/GameProfileWindow.cpp | 9 +++++++++ src/gui/GameProfileWindow.h | 1 + 7 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index f2e576f49..d1de472ed 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -259,6 +259,7 @@ void InfoLog_PrintActiveSettings() cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); cemuLog_log(LogType::Force, "Fast math: {}", g_current_game_profile->GetFastMath() ? "true" : "false"); cemuLog_log(LogType::Force, "Buffer cache type: {}", g_current_game_profile->GetBufferCacheMode()); + cemuLog_log(LogType::Force, "Position invariance: {}", g_current_game_profile->GetPositionInvariance() ? 
"true" : "false"); if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index f8e1305c7..d06a32dc4 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -228,6 +228,7 @@ bool GameProfile::Load(uint64_t title_id) gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); gameProfile_loadBooleanOption2(iniParser, "fastMath", m_fastMath); gameProfile_loadEnumOption(iniParser, "bufferCacheMode", m_bufferCacheMode); + gameProfile_loadBooleanOption2(iniParser, "positionInvariance", m_positionInvariance); // legacy support auto option_precompiledShaders = iniParser.FindOption("precompiledShaders"); @@ -312,6 +313,7 @@ void GameProfile::Save(uint64_t title_id) WRITE_ENTRY(accurateShaderMul); WRITE_ENTRY(fastMath); WRITE_ENTRY(bufferCacheMode); + WRITE_ENTRY(positionInvariance); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); fs->writeLine(""); @@ -343,6 +345,7 @@ void GameProfile::ResetOptional() m_accurateShaderMul = AccurateShaderMulOption::True; m_fastMath = true; m_bufferCacheMode = BufferCacheMode::DevicePrivate; + m_positionInvariance = false; // cpu settings m_threadQuantum = kThreadQuantumDefault; m_cpuMode.reset(); // CPUModeOption::kSingleCoreRecompiler; @@ -365,6 +368,7 @@ void GameProfile::Reset() m_accurateShaderMul = AccurateShaderMulOption::True; m_fastMath = true; m_bufferCacheMode = BufferCacheMode::DevicePrivate; + m_positionInvariance = false; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index 078a70a24..359e6a0ac 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ b/src/Cafe/GameProfile/GameProfile.h @@ -33,6 +33,7 @@ class GameProfile [[nodiscard]] const 
AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } [[nodiscard]] bool GetFastMath() const { return m_fastMath; } [[nodiscard]] BufferCacheMode GetBufferCacheMode() const { return m_bufferCacheMode; } + [[nodiscard]] bool GetPositionInvariance() const { return m_positionInvariance; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return m_precompiledShaders; } [[nodiscard]] uint32 GetThreadQuantum() const { return m_threadQuantum; } @@ -56,8 +57,9 @@ class GameProfile // graphic settings std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; - bool m_fastMath = false; + bool m_fastMath = true; BufferCacheMode m_bufferCacheMode = BufferCacheMode::DevicePrivate; + bool m_positionInvariance = false; std::optional m_precompiledShaders{}; // cpu settings uint32 m_threadQuantum = kThreadQuantumDefault; // values: 20000 45000 60000 80000 100000 diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 395b24213..1a2dfa3c1 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -180,7 +180,7 @@ namespace LatteDecompiler auto* src = shaderContext->shaderSource; src->add("struct VertexOut {" _CRLF); - src->add("float4 position [[position]];" _CRLF); + src->add("float4 position [[position]] [[invariant]];" _CRLF); if (shaderContext->analyzer.outputPointSize) src->add("float pointSize [[point_size]];" _CRLF); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index c77443c81..c4492e3c4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -178,6 +178,8 @@ void RendererShaderMtl::CompileInternal() 
// TODO: always disable fast math for problematic shaders if (g_current_game_profile->GetFastMath()) options->setFastMathEnabled(true); + if (g_current_game_profile->GetPositionInvariance()) + options->setPreserveInvariance(true); NS::Error* error = nullptr; MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(m_mslCode), options, &error); diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index 120dd5e9a..c46f0f254 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -142,6 +142,13 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) m_buffer_cache_mode->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: device private")); first_row->Add(m_buffer_cache_mode, 0, wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Position invariance")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString pos_values[] = { _("false"), _("true") }; + m_position_invariance = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(pos_values), pos_values); + m_position_invariance->SetToolTip(_("Disables most optimizations for vertex positions. 
May fix polygon cutouts in some games.\n\nMetal only\n\nRecommended: false")); + first_row->Add(m_position_invariance, 0, wxALL, 5); + /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString accuarcy_values[] = { _("high"), _("medium"), _("low") }; m_cache_accuracy = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(accuarcy_values), accuarcy_values); @@ -290,6 +297,7 @@ void GameProfileWindow::ApplyProfile() m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); m_fast_math->SetSelection((int)m_game_profile.m_fastMath); m_buffer_cache_mode->SetSelection((int)m_game_profile.m_bufferCacheMode); + m_position_invariance->SetSelection((int)m_game_profile.m_positionInvariance); //// audio //m_disable_audio->Set3StateValue(GetCheckboxState(m_game_profile.disableAudio)); @@ -351,6 +359,7 @@ void GameProfileWindow::SaveProfile() m_game_profile.m_accurateShaderMul = (AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); m_game_profile.m_fastMath = (bool)m_fast_math->GetSelection(); m_game_profile.m_bufferCacheMode = (BufferCacheMode)m_buffer_cache_mode->GetSelection(); + m_game_profile.m_positionInvariance = (bool)m_position_invariance->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value diff --git a/src/gui/GameProfileWindow.h b/src/gui/GameProfileWindow.h index 8bf0c91c8..ddd72c775 100644 --- a/src/gui/GameProfileWindow.h +++ b/src/gui/GameProfileWindow.h @@ -42,6 +42,7 @@ class GameProfileWindow : public wxFrame wxChoice* m_shader_mul_accuracy; wxChoice* m_fast_math; wxChoice* m_buffer_cache_mode; + wxChoice* m_position_invariance; //wxChoice* m_cache_accuracy; // audio From 770d6cfda799efea617fb94b6be7bee01e488ed8 Mon Sep 17 00:00:00 2001 
From: Samuliak Date: Wed, 18 Dec 2024 19:30:12 +0100 Subject: [PATCH 294/368] Revert "rework gpu selection" This reverts commit d64e0c9b6f688781ef3d48356d34e4e8d895db7c. --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 54 +------------------ .../HW/Latte/Renderer/Metal/MetalRenderer.h | 4 +- .../Latte/Renderer/Vulkan/VulkanRenderer.cpp | 14 ++--- .../HW/Latte/Renderer/Vulkan/VulkanRenderer.h | 14 ++++- src/config/CemuConfig.cpp | 4 +- src/config/CemuConfig.h | 2 +- src/gui/GeneralSettings2.cpp | 54 +++++-------------- 7 files changed, 41 insertions(+), 105 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 50c605237..a578948ac 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,8 +20,6 @@ #include "Cemu/Logging/CemuLogging.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" -#include "Foundation/NSString.hpp" -#include "Metal/MTLDevice.hpp" #include "config/CemuConfig.h" #include "gui/guiWrapper.h" @@ -38,53 +36,10 @@ float supportBufferData[512 * 4]; // Defined in the OpenGL renderer void LatteDraw_handleSpecialState8_clearAsDepth(); -std::vector MetalRenderer::GetDevices() -{ - auto devices = MTL::CopyAllDevices(); - std::vector result; - result.reserve(devices->count()); - for (uint32 i = 0; i < devices->count(); i++) - { - MTL::Device* device = static_cast(devices->object(i)); - result.push_back(std::string(device->name()->utf8String())); - } - - return result; -} - MetalRenderer::MetalRenderer() { - // Pick a device - auto& config = GetConfig(); - const bool hasDeviceSet = !config.graphic_device_name.empty(); - - // If a device is set, try to find it - if (hasDeviceSet) - { - auto devices = MTL::CopyAllDevices(); - for (uint32 i = 0; i < devices->count(); i++) - { - MTL::Device* device = static_cast(devices->object(i)); - std::string name = 
std::string(device->name()->utf8String()); - if (name == config.graphic_device_name) - { - m_device = device; - break; - } - } - } - - if (!m_device) - { - if (hasDeviceSet) - { - cemuLog_log(LogType::Force, "The selected GPU ({}) could not be found. Using the system default device.", config.graphic_device_name); - config.graphic_device_name = ""; - } - - // Use the system default device - m_device = MTL::CreateSystemDefaultDevice(); - } + m_device = MTL::CreateSystemDefaultDevice(); + m_commandQueue = m_device->newCommandQueue(); // Feature support m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); @@ -95,9 +50,6 @@ MetalRenderer::MetalRenderer() CheckForPixelFormatSupport(m_pixelFormatSupport); - // Create command queue - m_commandQueue = m_device->newCommandQueue(); - // Synchronization resources m_event = m_device->newEvent(); @@ -571,7 +523,6 @@ void MetalRenderer::DeleteFontTextures() void MetalRenderer::AppendOverlayDebugInfo() { ImGui::Text("--- GPU info ---"); - ImGui::Text("GPU name %s", m_device->name()->utf8String()); ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no")); ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no")); ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? "yes" : "no")); @@ -685,7 +636,6 @@ void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sl { if (!FormatIsRenderable(hostTexture->format)) { - // TODO: handle this somehow? 
cemuLog_logOnce(LogType::Force, "cannot clear color texture with format {}, because it's not renderable", hostTexture->format); return; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index cf4fc29fa..6a5db69b3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -155,8 +155,6 @@ class MetalRenderer : public Renderer static constexpr uint32 OCCLUSION_QUERY_POOL_SIZE = 1024; static constexpr uint32 TEXTURE_READBACK_SIZE = 32 * 1024 * 1024; // 32 MB - static std::vector GetDevices(); - MetalRenderer(); ~MetalRenderer() override; @@ -461,7 +459,7 @@ class MetalRenderer : public Renderer MetalPerformanceMonitor m_performanceMonitor; // Metal objects - MTL::Device* m_device = nullptr; + MTL::Device* m_device; MTL::CommandQueue* m_commandQueue; // Feature support diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index dea66bb49..a28eef4ed 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -91,7 +91,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL DebugUtilsCallback(VkDebugUtilsMessageSeverityFla return VK_FALSE; } -std::vector VulkanRenderer::GetDevices() +std::vector VulkanRenderer::GetDevices() { if(!vkEnumerateInstanceVersion) { @@ -105,7 +105,7 @@ std::vector VulkanRenderer::GetDevices() apiVersion = VK_API_VERSION_1_1; } - std::vector result; + std::vector result; std::vector requiredExtensions; requiredExtensions.clear(); @@ -168,7 +168,7 @@ std::vector VulkanRenderer::GetDevices() physDeviceProps.pNext = &physDeviceIDProps; vkGetPhysicalDeviceProperties2(device, &physDeviceProps); - result.emplace_back(physDeviceProps.properties.deviceName); + result.emplace_back(physDeviceProps.properties.deviceName, physDeviceIDProps.deviceUUID); } } vkDestroySurfaceKHR(instance, surface, nullptr); @@ -181,6 +181,7 
@@ std::vector VulkanRenderer::GetDevices() vkDestroyInstance(instance, nullptr); return result; + } void VulkanRenderer::DetermineVendor() @@ -388,7 +389,8 @@ VulkanRenderer::VulkanRenderer() auto surface = CreateFramebufferSurface(m_instance, gui_getWindowInfo().window_main); auto& config = GetConfig(); - const bool has_device_set = !config.graphic_device_name.empty(); + decltype(config.graphic_device_uuid) zero{}; + const bool has_device_set = config.graphic_device_uuid != zero; VkPhysicalDevice fallbackDevice = VK_NULL_HANDLE; @@ -408,7 +410,7 @@ VulkanRenderer::VulkanRenderer() physDeviceProps.pNext = &physDeviceIDProps; vkGetPhysicalDeviceProperties2(device, &physDeviceProps); - if (config.graphic_device_name != physDeviceProps.properties.deviceName) + if (memcmp(config.graphic_device_uuid.data(), physDeviceIDProps.deviceUUID, VK_UUID_SIZE) != 0) continue; } @@ -421,7 +423,7 @@ VulkanRenderer::VulkanRenderer() { cemuLog_log(LogType::Force, "The selected GPU could not be found or is not suitable. 
Falling back to first available device instead"); m_physicalDevice = fallbackDevice; - config.graphic_device_name = ""; // resetting device selection + config.graphic_device_uuid = {}; // resetting device selection } else if (m_physicalDevice == VK_NULL_HANDLE) { diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index 683094147..867647a34 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -156,7 +156,19 @@ class VulkanRenderer : public Renderer sint32 texelCountY; }FormatInfoVK; - static std::vector GetDevices(); + struct DeviceInfo + { + DeviceInfo(const std::string name, uint8* uuid) + : name(name) + { + std::copy(uuid, uuid + VK_UUID_SIZE, this->uuid.data()); + } + + std::string name; + std::array uuid; + }; + + static std::vector GetDevices(); VulkanRenderer(); virtual ~VulkanRenderer(); diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index 701e6be5b..0e39656b2 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -212,7 +212,7 @@ void CemuConfig::Load(XMLConfigParser& parser) // graphics auto graphic = parser.get("Graphic"); graphic_api = graphic.get("api", kOpenGL); - graphic.get("device", graphic_device_name); + graphic.get("device", graphic_device_uuid); vsync = graphic.get("VSync", 0); gx2drawdone_sync = graphic.get("GX2DrawdoneSync", true); upscale_filter = graphic.get("UpscaleFilter", kBicubicHermiteFilter); @@ -468,7 +468,7 @@ void CemuConfig::Save(XMLConfigParser& parser) // graphics auto graphic = config.set("Graphic"); graphic.set("api", graphic_api); - graphic.set("device", graphic_device_name); + graphic.set("device", graphic_device_uuid); graphic.set("VSync", vsync); graphic.set("GX2DrawdoneSync", gx2drawdone_sync); //graphic.set("PrecompiledShaders", precompiled_shaders.GetValue()); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 0dc8cf2b8..d1e5e214a 
100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -462,7 +462,7 @@ struct CemuConfig // graphics ConfigValue graphic_api{ kVulkan }; - std::string graphic_device_name; + std::array graphic_device_uuid; ConfigValue vsync{ 0 }; // 0 = off, 1+ = on depending on render backend ConfigValue gx2drawdone_sync {true}; ConfigValue render_upside_down{ false }; diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 7071c7e8f..8b6e0ee15 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -1,5 +1,3 @@ -#include "Foundation/NSString.hpp" -#include "Metal/MTLDevice.hpp" #include "gui/wxgui.h" #include "gui/GeneralSettings2.h" #include "gui/CemuApp.h" @@ -29,9 +27,6 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" -#if ENABLE_METAL -#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#endif #include "Cafe/Account/Account.h" #include @@ -87,15 +82,15 @@ class wxInputDeviceDescription : public wxClientData IAudioInputAPI::DeviceDescriptionPtr m_description; }; -class wxGraphicsDevice : public wxClientData +class wxVulkanUUID : public wxClientData { public: - wxGraphicsDevice(const std::string& name) - : m_name(name) {} - const std::string& GetName() const { return m_name; } + wxVulkanUUID(const VulkanRenderer::DeviceInfo& info) + : m_device_info(info) {} + const VulkanRenderer::DeviceInfo& GetDeviceInfo() const { return m_device_info; } private: - std::string m_name; + VulkanRenderer::DeviceInfo m_device_info; }; class wxAccountData : public wxClientData @@ -1030,14 +1025,14 @@ void GeneralSettings2::StoreConfig() selection = m_graphic_device->GetSelection(); if(selection != wxNOT_FOUND) { - const auto* info = (wxGraphicsDevice*)m_graphic_device->GetClientObject(selection); + const auto* info = (wxVulkanUUID*)m_graphic_device->GetClientObject(selection); if(info) - config.graphic_device_name = info->GetName(); + config.graphic_device_uuid = 
info->GetDeviceInfo().uuid; else - config.graphic_device_name = ""; + config.graphic_device_uuid = {}; } else - config.graphic_device_name = ""; + config.graphic_device_uuid = {}; config.vsync = m_vsync->GetSelection(); @@ -1543,14 +1538,14 @@ void GeneralSettings2::HandleGraphicsApiSelection() { for(const auto& device : devices) { - m_graphic_device->Append(device, new wxGraphicsDevice(device)); + m_graphic_device->Append(device.name, new wxVulkanUUID(device)); } m_graphic_device->SetSelection(0); const auto& config = GetConfig(); for(size_t i = 0; i < devices.size(); ++i) { - if(config.graphic_device_name == devices[i]) + if(config.graphic_device_uuid == devices[i].uuid) { m_graphic_device->SetSelection(i); break; @@ -1571,30 +1566,9 @@ void GeneralSettings2::HandleGraphicsApiSelection() m_vsync->Select(selection); - m_graphic_device->Enable(); - m_graphic_device->Clear(); - -#if ENABLE_METAL - auto devices = MetalRenderer::GetDevices(); - if (!devices.empty()) - { - for (const auto& device : devices) - { - m_graphic_device->Append(device, new wxGraphicsDevice(device)); - } - m_graphic_device->SetSelection(0); - - const auto& config = GetConfig(); - for (size_t i = 0; i < devices.size(); ++i) - { - if (config.graphic_device_name == devices[i]) - { - m_graphic_device->SetSelection(i); - break; - } - } - } -#endif + // TODO: add an option to select the graphic device + m_graphic_device->Clear(); + m_graphic_device->Disable(); } } From 79290eae3fadc27552e25aa82df155bfa98f7e75 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 18 Dec 2024 19:31:24 +0100 Subject: [PATCH 295/368] display gpu name --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a578948ac..86ce0956d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -523,6 +523,7 @@ 
void MetalRenderer::DeleteFontTextures() void MetalRenderer::AppendOverlayDebugInfo() { ImGui::Text("--- GPU info ---"); + ImGui::Text("GPU %s", m_device->name()->utf8String()); ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no")); ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no")); ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? "yes" : "no")); From 08ea28f56ec679d87154c7ef141ff8a592b3dbbd Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 18 Dec 2024 19:51:58 +0100 Subject: [PATCH 296/368] add an option to select gpu for metal --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 48 +++++++++++- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 8 ++ .../Latte/Renderer/Vulkan/VulkanRenderer.cpp | 8 +- src/config/CemuConfig.cpp | 6 +- src/config/CemuConfig.h | 5 +- src/gui/GeneralSettings2.cpp | 78 +++++++++++++++---- 6 files changed, 130 insertions(+), 23 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 86ce0956d..ee2fff895 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -36,10 +36,51 @@ float supportBufferData[512 * 4]; // Defined in the OpenGL renderer void LatteDraw_handleSpecialState8_clearAsDepth(); +std::vector MetalRenderer::GetDevices() +{ + auto devices = MTL::CopyAllDevices(); + std::vector result; + result.reserve(devices->count()); + for (uint32 i = 0; i < devices->count(); i++) + { + MTL::Device* device = static_cast(devices->object(i)); + result.emplace_back(std::string(device->name()->utf8String()), device->registryID()); + } + + return result; +} + MetalRenderer::MetalRenderer() { - m_device = MTL::CreateSystemDefaultDevice(); - m_commandQueue = m_device->newCommandQueue(); + // Pick a device + auto& config = GetConfig(); + const bool hasDeviceSet = config.mtl_graphic_device_uuid != 0; + + // If a device is set, try to find it + if (hasDeviceSet) + { + auto 
devices = MTL::CopyAllDevices(); + for (uint32 i = 0; i < devices->count(); i++) + { + MTL::Device* device = static_cast(devices->object(i)); + if (device->registryID() == config.mtl_graphic_device_uuid) + { + m_device = device; + break; + } + } + } + + if (!m_device) + { + if (hasDeviceSet) + { + cemuLog_log(LogType::Force, "The selected GPU ({}) could not be found. Using the system default device.", config.mtl_graphic_device_uuid); + config.mtl_graphic_device_uuid = 0; + } + // Use the system default device + m_device = MTL::CreateSystemDefaultDevice(); + } // Feature support m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); @@ -50,6 +91,9 @@ MetalRenderer::MetalRenderer() CheckForPixelFormatSupport(m_pixelFormatSupport); + // Command queue + m_commandQueue = m_device->newCommandQueue(); + // Synchronization resources m_event = m_device->newEvent(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 6a5db69b3..1deddd047 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -155,6 +155,14 @@ class MetalRenderer : public Renderer static constexpr uint32 OCCLUSION_QUERY_POOL_SIZE = 1024; static constexpr uint32 TEXTURE_READBACK_SIZE = 32 * 1024 * 1024; // 32 MB + struct DeviceInfo + { + std::string name; + uint64 uuid; + }; + + static std::vector GetDevices(); + MetalRenderer(); ~MetalRenderer() override; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index a28eef4ed..98959b2cb 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -389,8 +389,8 @@ VulkanRenderer::VulkanRenderer() auto surface = CreateFramebufferSurface(m_instance, gui_getWindowInfo().window_main); auto& config = GetConfig(); - decltype(config.graphic_device_uuid) zero{}; - const bool has_device_set = 
config.graphic_device_uuid != zero; + decltype(config.vk_graphic_device_uuid) zero{}; + const bool has_device_set = config.vk_graphic_device_uuid != zero; VkPhysicalDevice fallbackDevice = VK_NULL_HANDLE; @@ -410,7 +410,7 @@ VulkanRenderer::VulkanRenderer() physDeviceProps.pNext = &physDeviceIDProps; vkGetPhysicalDeviceProperties2(device, &physDeviceProps); - if (memcmp(config.graphic_device_uuid.data(), physDeviceIDProps.deviceUUID, VK_UUID_SIZE) != 0) + if (memcmp(config.vk_graphic_device_uuid.data(), physDeviceIDProps.deviceUUID, VK_UUID_SIZE) != 0) continue; } @@ -423,7 +423,7 @@ VulkanRenderer::VulkanRenderer() { cemuLog_log(LogType::Force, "The selected GPU could not be found or is not suitable. Falling back to first available device instead"); m_physicalDevice = fallbackDevice; - config.graphic_device_uuid = {}; // resetting device selection + config.vk_graphic_device_uuid = {}; // resetting device selection } else if (m_physicalDevice == VK_NULL_HANDLE) { diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index 0e39656b2..00e56d6d3 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -212,7 +212,8 @@ void CemuConfig::Load(XMLConfigParser& parser) // graphics auto graphic = parser.get("Graphic"); graphic_api = graphic.get("api", kOpenGL); - graphic.get("device", graphic_device_uuid); + graphic.get("vkDevice", vk_graphic_device_uuid); + graphic.get("mtlDevice", mtl_graphic_device_uuid); vsync = graphic.get("VSync", 0); gx2drawdone_sync = graphic.get("GX2DrawdoneSync", true); upscale_filter = graphic.get("UpscaleFilter", kBicubicHermiteFilter); @@ -468,7 +469,8 @@ void CemuConfig::Save(XMLConfigParser& parser) // graphics auto graphic = config.set("Graphic"); graphic.set("api", graphic_api); - graphic.set("device", graphic_device_uuid); + graphic.set("vkDevice", vk_graphic_device_uuid); + graphic.set("mtlDevice", mtl_graphic_device_uuid); graphic.set("VSync", vsync); graphic.set("GX2DrawdoneSync", gx2drawdone_sync); 
//graphic.set("PrecompiledShaders", precompiled_shaders.GetValue()); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index d1e5e214a..b3ff1999b 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -462,8 +462,9 @@ struct CemuConfig // graphics ConfigValue graphic_api{ kVulkan }; - std::array graphic_device_uuid; - ConfigValue vsync{ 0 }; // 0 = off, 1+ = on depending on render backend + std::array vk_graphic_device_uuid; + uint64 mtl_graphic_device_uuid{0}; + ConfigValue vsync{ 0 }; // 0 = off, 1+ = depending on render backend ConfigValue gx2drawdone_sync {true}; ConfigValue render_upside_down{ false }; ConfigValue async_compile{ true }; diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 8b6e0ee15..8663dbffa 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -27,6 +27,9 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#endif #include "Cafe/Account/Account.h" #include @@ -93,6 +96,19 @@ class wxVulkanUUID : public wxClientData VulkanRenderer::DeviceInfo m_device_info; }; +#if ENABLE_METAL +class wxMetalUUID : public wxClientData +{ +public: + wxMetalUUID(const MetalRenderer::DeviceInfo& info) + : m_device_info(info) {} + const MetalRenderer::DeviceInfo& GetDeviceInfo() const { return m_device_info; } + +private: + MetalRenderer::DeviceInfo m_device_info; +}; +#endif + class wxAccountData : public wxClientData { public: @@ -1023,16 +1039,32 @@ void GeneralSettings2::StoreConfig() config.graphic_api = (GraphicAPI)m_graphic_api->GetSelection(); selection = m_graphic_device->GetSelection(); - if(selection != wxNOT_FOUND) - { - const auto* info = (wxVulkanUUID*)m_graphic_device->GetClientObject(selection); - if(info) - config.graphic_device_uuid = info->GetDeviceInfo().uuid; - else - config.graphic_device_uuid = {}; + if (config.graphic_api == 
GraphicAPI::kVulkan) + { + if (selection != wxNOT_FOUND) + { + const auto* info = (wxVulkanUUID*)m_graphic_device->GetClientObject(selection); + if (info) + config.vk_graphic_device_uuid = info->GetDeviceInfo().uuid; + else + config.vk_graphic_device_uuid = {}; + } + else + config.vk_graphic_device_uuid = {}; + } + else if (config.graphic_api == GraphicAPI::kMetal) + { + if (selection != wxNOT_FOUND) + { + const auto* info = (wxMetalUUID*)m_graphic_device->GetClientObject(selection); + if (info) + config.mtl_graphic_device_uuid = info->GetDeviceInfo().uuid; + else + config.mtl_graphic_device_uuid = {}; + } + else + config.mtl_graphic_device_uuid = {}; } - else - config.graphic_device_uuid = {}; config.vsync = m_vsync->GetSelection(); @@ -1545,7 +1577,7 @@ void GeneralSettings2::HandleGraphicsApiSelection() const auto& config = GetConfig(); for(size_t i = 0; i < devices.size(); ++i) { - if(config.graphic_device_uuid == devices[i].uuid) + if(config.vk_graphic_device_uuid == devices[i].uuid) { m_graphic_device->SetSelection(i); break; @@ -1566,9 +1598,29 @@ void GeneralSettings2::HandleGraphicsApiSelection() m_vsync->Select(selection); - // TODO: add an option to select the graphic device - m_graphic_device->Clear(); - m_graphic_device->Disable(); + m_graphic_device->Enable(); + auto devices = MetalRenderer::GetDevices(); + m_graphic_device->Clear(); +#if ENABLE_METAL + if(!devices.empty()) + { + for (const auto& device : devices) + { + m_graphic_device->Append(device.name, new wxMetalUUID(device)); + } + m_graphic_device->SetSelection(0); + + const auto& config = GetConfig(); + for (size_t i = 0; i < devices.size(); ++i) + { + if (config.mtl_graphic_device_uuid == devices[i].uuid) + { + m_graphic_device->SetSelection(i); + break; + } + } + } +#endif } } From 967a74024a5a4856c61dda95b30d9202abff12f9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 21 Dec 2024 08:52:18 +0100 Subject: [PATCH 297/368] implement point coord properly --- 
.../Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 4 ++-- .../LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index bc2becf8b..caba5229f 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -4287,9 +4287,9 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, cemu_assert_debug((psInputTable->paramGen) == 1); // handle the other bits (the same set of coordinates with different perspective/projection settings?) uint32 paramGenGPRIndex = psInputTable->paramGenGPR; if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = in.position.xyxy;" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + src->addFmt("{} = pointCoord.xyxy;" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); else - src->addFmt("{} = bitCast(gl_PointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + src->addFmt("{} = bitCast(pointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); } for (sint32 i = 0; i < psInputTable->count; i++) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 1a2dfa3c1..2d871d99d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -516,6 +516,7 @@ namespace LatteDecompiler } else { + // TODO: only include these if needed? 
src->add("uint vid [[vertex_id]]"); src->add(", uint iid [[instance_id]]"); } @@ -532,6 +533,8 @@ namespace LatteDecompiler break; case LatteConst::ShaderType::Pixel: src->add("FragmentIn in [[stage_in]]"); + // TODO: only include these if needed? + src->add(", float2 pointCoord [[point_coord]]"); src->add(", bool frontFacing [[front_facing]]"); break; default: From 55d9436bf3aa8bc05d28b987d42d10aea9addd72 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 22 Dec 2024 10:15:13 +0100 Subject: [PATCH 298/368] fix typo in texture decoder --- src/Cafe/HW/Latte/Core/LatteTextureLoader.h | 2 +- .../HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h index 0f558945e..ad557bb3c 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h @@ -679,7 +679,7 @@ class TextureDecoder_R4G4_UNORM_To_RGBA8 : public TextureDecoder, public Singlet } }; -class TextureDecoder_R4G4_UNORM_To_RG8 : public TextureDecoder, public SingletonClass +class TextureDecoder_R4G4_UNORM_To_RG8 : public TextureDecoder, public SingletonClass { public: sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index caba5229f..b925f862c 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2481,6 +2481,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex cemu_assert_unimplemented(); } } + // offset if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ || texOpcode == GPU7_TEX_INST_SAMPLE || texOpcode == GPU7_TEX_INST_SAMPLE_C ) { From 
40dab1e2122f864f877ce3256ec8dc62ed28e297 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 4 Jan 2025 11:24:00 +0100 Subject: [PATCH 299/368] fallback to transparent black for custom sampler border color --- src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp index 4f987d83c..b7d5a2ecd 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp @@ -110,7 +110,8 @@ MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister else { // Metal doesn't support custom border color - samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueBlack); + cemuLog_logOnce(LogType::Force, "Custom border color is not supported in Metal, using transparent black instead"); + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorTransparentBlack); } samplerState = m_mtlr->GetDevice()->newSamplerState(samplerDescriptor); From 1d8806cf06ce0760a90a881eb0f1b5da7d9e7c47 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 4 Jan 2025 12:42:06 +0100 Subject: [PATCH 300/368] add an option to capture GPU frame --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 35 +++++++++++++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.h | 15 ++++++++ src/gui/MainWindow.cpp | 21 +++++++++++ 3 files changed, 71 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index ee2fff895..72b756e10 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -303,6 +303,17 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) // Debug m_performanceMonitor.ResetPerFrameData(); + + // GPU capture + if (m_capturing) + { + EndCapture(); + } + else if (m_captureFrame) + { + StartCapture(); + 
m_captureFrame = false; + } } void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padView) { @@ -2161,3 +2172,27 @@ void MetalRenderer::EnsureImGuiBackend() //ImGui_ImplMetal_CreateFontsTexture(m_device); } } + +void MetalRenderer::StartCapture() +{ + auto captureManager = MTL::CaptureManager::sharedCaptureManager(); + auto desc = MTL::CaptureDescriptor::alloc()->init(); + desc->setCaptureObject(m_device); + + NS::Error* error = nullptr; + captureManager->startCapture(desc, &error); + if (error) + { + cemuLog_log(LogType::Force, "Failed to start GPU capture: {}", error->localizedDescription()->utf8String()); + } + + m_capturing = true; +} + +void MetalRenderer::EndCapture() +{ + auto captureManager = MTL::CaptureManager::sharedCaptureManager(); + captureManager->stopCapture(); + + m_capturing = false; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 1deddd047..49aa40b10 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -460,6 +460,12 @@ class MetalRenderer : public Renderer m_occlusionQuery.m_lastCommandBuffer = GetAndRetainCurrentCommandBufferIfNotCompleted(); } + // GPU capture + void CaptureFrame() + { + m_captureFrame = true; + } + private: MetalLayerHandle m_mainLayer; MetalLayerHandle m_padLayer; @@ -533,6 +539,11 @@ class MetalRenderer : public Renderer // State MetalState m_state; + // GPU capture + bool m_captureFrame = false; + bool m_capturing = false; + + // Helpers MetalLayerHandle& GetLayer(bool mainWindow) { return (mainWindow ? 
m_mainLayer : m_padLayer); @@ -541,4 +552,8 @@ class MetalRenderer : public Renderer void SwapBuffer(bool mainWindow); void EnsureImGuiBackend(); + + // GPU capture + void StartCapture(); + void EndCapture(); }; diff --git a/src/gui/MainWindow.cpp b/src/gui/MainWindow.cpp index 740120c1c..fc5152c5a 100644 --- a/src/gui/MainWindow.cpp +++ b/src/gui/MainWindow.cpp @@ -1,3 +1,5 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" #include "gui/wxgui.h" #include "gui/MainWindow.h" #include "gui/guiWrapper.h" @@ -137,6 +139,7 @@ enum MAINFRAME_MENU_ID_DEBUG_VIEW_TEXTURE_RELATIONS, MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY, MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, + MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE, // debug->logging MAINFRAME_MENU_ID_DEBUG_LOGGING0 = 21500, @@ -212,6 +215,7 @@ EVT_MENU(MAINFRAME_MENU_ID_DEBUG_DUMP_CURL_REQUESTS, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_RENDER_UPSIDE_DOWN, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, MainWindow::OnDebugSetting) +EVT_MENU(MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_DUMP_RAM, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_DUMP_FST, MainWindow::OnDebugSetting) // debug -> View ... @@ -1007,6 +1011,20 @@ void MainWindow::OnDebugSetting(wxCommandEvent& event) if(!GetConfig().vk_accurate_barriers) wxMessageBox(_("Warning: Disabling the accurate barriers option will lead to flickering graphics but may improve performance. 
It is highly recommended to leave it turned on."), _("Accurate barriers are off"), wxOK); } + else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE) + { +#if ENABLE_METAL + if (g_renderer->GetType() == RendererAPI::Metal) + { + static_cast(g_renderer.get())->CaptureFrame(); + } + else + { + wxMessageBox(_("GPU capture is only supported on Metal."), _("Error"), wxOK | wxCENTRE | wxICON_ERROR); + return; + } +#endif + } else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY) ActiveSettings::EnableAudioOnlyAux(event.IsChecked()); else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_DUMP_RAM) @@ -2254,6 +2272,9 @@ void MainWindow::RecreateMenu() auto accurateBarriers = debugMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, _("&Accurate barriers (Vulkan)"), wxEmptyString); accurateBarriers->Check(GetConfig().vk_accurate_barriers); + auto gpuCapture = debugMenu->Append(MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, _("&GPU capture (Metal)"), wxEmptyString); + gpuCapture->Enable(m_game_launched && g_renderer->GetType() == RendererAPI::Metal); + debugMenu->AppendSeparator(); #ifdef CEMU_DEBUG_ASSERT From 9a61e81715245602602100200a5f6270e0a61ee2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 4 Jan 2025 13:54:07 +0100 Subject: [PATCH 301/368] support saving GPU captures to a file --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 12 +++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 35 +++++++++++++++++++ src/config/CemuConfig.cpp | 2 ++ src/config/CemuConfig.h | 1 + src/gui/GeneralSettings2.cpp | 18 ++++++++++ src/gui/GeneralSettings2.h | 1 + 6 files changed, 69 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index ba9ebc36b..952fd1de9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -67,6 +67,18 @@ inline NS::String* ToNSString(const std::string& str) return ToNSString(str.c_str()); } +// Cast 
from const char* to NS::URL* +inline NS::URL* ToNSURL(const char* str) +{ + return NS::URL::fileURLWithPath(ToNSString(str)); +} + +// Cast from std::string to NS::URL* +inline NS::URL* ToNSURL(const std::string& str) +{ + return ToNSURL(str.c_str()); +} + inline NS::String* GetLabel(const std::string& label, const void* identifier) { return ToNSString(label + " (" + std::to_string(reinterpret_cast(identifier)) + ")"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 72b756e10..19a4b55d8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -20,6 +20,9 @@ #include "Cemu/Logging/CemuLogging.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Common/precompiled.h" +#include "HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Metal/MTLCaptureManager.hpp" #include "config/CemuConfig.h" #include "gui/guiWrapper.h" @@ -2179,6 +2182,38 @@ void MetalRenderer::StartCapture() auto desc = MTL::CaptureDescriptor::alloc()->init(); desc->setCaptureObject(m_device); + // Check if a debugger with support for GPU capture is attached + if (captureManager->supportsDestination(MTL::CaptureDestinationDeveloperTools)) + { + desc->setDestination(MTL::CaptureDestinationDeveloperTools); + } + else + { + if (GetConfig().gpu_capture_dir.GetValue().empty()) + { + cemuLog_log(LogType::Force, "No GPU capture directory specified, cannot do a GPU capture"); + return; + } + + // Check if the GPU trace document destination is available + if (!captureManager->supportsDestination(MTL::CaptureDestinationGPUTraceDocument)) + { + cemuLog_log(LogType::Force, "GPU trace document destination is not available, cannot do a GPU capture"); + return; + } + + // Get current date and time as a string + auto now = std::chrono::system_clock::now(); + std::time_t now_time = std::chrono::system_clock::to_time_t(now); + 
std::ostringstream oss; + oss << std::put_time(std::localtime(&now_time), "%Y-%m-%d_%H-%M-%S"); + std::string now_str = oss.str(); + + std::string capturePath = fmt::format("{}/cemu_{}.gputrace", GetConfig().gpu_capture_dir.GetValue(), now_str); + desc->setDestination(MTL::CaptureDestinationGPUTraceDocument); + desc->setOutputURL(ToNSURL(capturePath)); + } + NS::Error* error = nullptr; captureManager->startCapture(desc, &error); if (error) diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index 00e56d6d3..cbea09cb8 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -336,6 +336,7 @@ void CemuConfig::Load(XMLConfigParser& parser) crash_dump = debug.get("CrashDumpUnix", crash_dump); #endif gdb_port = debug.get("GDBPort", 1337); + gpu_capture_dir = debug.get("GPUCaptureDir", ""); // input auto input = parser.get("Input"); @@ -537,6 +538,7 @@ void CemuConfig::Save(XMLConfigParser& parser) debug.set("CrashDumpUnix", crash_dump.GetValue()); #endif debug.set("GDBPort", gdb_port); + debug.set("GPUCaptureDir", gpu_capture_dir.GetValue()); // input auto input = config.set("Input"); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index b3ff1999b..08a7c9940 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -526,6 +526,7 @@ struct CemuConfig // debug ConfigValueBounds crash_dump{ CrashDump::Disabled }; ConfigValue gdb_port{ 1337 }; + ConfigValue gpu_capture_dir{}; void Load(XMLConfigParser& parser); void Save(XMLConfigParser& parser); diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 8663dbffa..68cd93edd 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -892,6 +893,21 @@ wxPanel* GeneralSettings2::AddDebugPage(wxNotebook* notebook) debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); } + { + auto* debug_row = new wxFlexGridSizer(0, 2, 0, 0); + 
debug_row->SetFlexibleDirection(wxBOTH); + debug_row->SetNonFlexibleGrowMode(wxFLEX_GROWMODE_SPECIFIED); + + debug_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU capture save directory"), wxDefaultPosition, wxDefaultSize, 0), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + m_gpu_capture_dir = new wxTextCtrl(panel, wxID_ANY, wxEmptyString, wxDefaultPosition, wxDefaultSize, wxTE_DONTWRAP); + m_gpu_capture_dir->SetMinSize(wxSize(150, -1)); + m_gpu_capture_dir->SetToolTip(_("Cemu will save the GPU captures done by selecting Debug -> GPU capture in the menu bar in this directory. If a debugger with support for GPU captures (like Xcode) is attached, the capture will be opened in that debugger instead.")); + + debug_row->Add(m_gpu_capture_dir, 0, wxALL | wxEXPAND, 5); + debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); + } + panel->SetSizerAndFit(debug_panel_sizer); return panel; @@ -1101,6 +1117,7 @@ void GeneralSettings2::StoreConfig() // debug config.crash_dump = (CrashDump)m_crash_dump->GetSelection(); config.gdb_port = m_gdb_port->GetValue(); + config.gpu_capture_dir = m_gpu_capture_dir->GetValue().utf8_string(); g_config.Save(); } @@ -1794,6 +1811,7 @@ void GeneralSettings2::ApplyConfig() // debug m_crash_dump->SetSelection((int)config.crash_dump.GetValue()); m_gdb_port->SetValue(config.gdb_port.GetValue()); + m_gpu_capture_dir->SetValue(wxHelper::FromUtf8(config.gpu_capture_dir.GetValue())); } void GeneralSettings2::OnAudioAPISelected(wxCommandEvent& event) diff --git a/src/gui/GeneralSettings2.h b/src/gui/GeneralSettings2.h index 83ede03b0..2551b2bdc 100644 --- a/src/gui/GeneralSettings2.h +++ b/src/gui/GeneralSettings2.h @@ -78,6 +78,7 @@ class GeneralSettings2 : public wxDialog // Debug wxChoice* m_crash_dump; wxSpinCtrl* m_gdb_port; + wxTextCtrl* m_gpu_capture_dir; void OnAccountCreate(wxCommandEvent& event); void OnAccountDelete(wxCommandEvent& event); From 813c52c23cb609d5803d5f292a441c3ef19d0f4f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 4 
Jan 2025 13:55:49 +0100 Subject: [PATCH 302/368] add gpu capture environment notice --- src/gui/GeneralSettings2.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 68cd93edd..9c6d85805 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -902,7 +902,7 @@ wxPanel* GeneralSettings2::AddDebugPage(wxNotebook* notebook) m_gpu_capture_dir = new wxTextCtrl(panel, wxID_ANY, wxEmptyString, wxDefaultPosition, wxDefaultSize, wxTE_DONTWRAP); m_gpu_capture_dir->SetMinSize(wxSize(150, -1)); - m_gpu_capture_dir->SetToolTip(_("Cemu will save the GPU captures done by selecting Debug -> GPU capture in the menu bar in this directory. If a debugger with support for GPU captures (like Xcode) is attached, the capture will be opened in that debugger instead.")); + m_gpu_capture_dir->SetToolTip(_("Cemu will save the GPU captures done by selecting Debug -> GPU capture in the menu bar in this directory. If a debugger with support for GPU captures (like Xcode) is attached, the capture will be opened in that debugger instead. 
If such debugger is not attached, METAL_CAPTURE_ENABLED must be set to 1 as an environment variable.")); debug_row->Add(m_gpu_capture_dir, 0, wxALL | wxEXPAND, 5); debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); From 337ec6b721accd855e286aa2134be932e6e12e72 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 4 Jan 2025 17:02:03 +0100 Subject: [PATCH 303/368] fix: GPU capture button not working --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 +++ src/gui/MainWindow.cpp | 14 ++++---------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 19a4b55d8..c5bdd335c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -188,6 +188,9 @@ MetalRenderer::MetalRenderer() m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); utilityLibrary->release(); + + // HACK: for some reason, this variable ends up being initialized to some garbage data, even though its declared as bool m_captureFrame = false; + m_captureFrame = false; } MetalRenderer::~MetalRenderer() diff --git a/src/gui/MainWindow.cpp b/src/gui/MainWindow.cpp index fc5152c5a..e7cccca8c 100644 --- a/src/gui/MainWindow.cpp +++ b/src/gui/MainWindow.cpp @@ -1013,16 +1013,10 @@ void MainWindow::OnDebugSetting(wxCommandEvent& event) } else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE) { + cemu_assert_debug(g_renderer->GetType() == RendererAPI::Metal); + #if ENABLE_METAL - if (g_renderer->GetType() == RendererAPI::Metal) - { - static_cast(g_renderer.get())->CaptureFrame(); - } - else - { - wxMessageBox(_("GPU capture is only supported on Metal."), _("Error"), wxOK | wxCENTRE | wxICON_ERROR); - return; - } + static_cast(g_renderer.get())->CaptureFrame(); #endif } else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY) @@ -2272,7 +2266,7 @@ void 
MainWindow::RecreateMenu() auto accurateBarriers = debugMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, _("&Accurate barriers (Vulkan)"), wxEmptyString); accurateBarriers->Check(GetConfig().vk_accurate_barriers); - auto gpuCapture = debugMenu->Append(MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, _("&GPU capture (Metal)"), wxEmptyString); + auto gpuCapture = debugMenu->Append(MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE, _("&GPU capture (Metal)")); gpuCapture->Enable(m_game_launched && g_renderer->GetType() == RendererAPI::Metal); debugMenu->AppendSeparator(); From e4068856bc962a68021da094ec4214ee89d7068c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 8 Jan 2025 16:09:15 +0100 Subject: [PATCH 304/368] implement framebuffer fetch --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 107 ++++++++++-------- .../LegacyShaderDecompiler/LatteDecompiler.h | 1 + .../LatteDecompilerAnalyzer.cpp | 36 ++++++ .../LatteDecompilerEmitMSL.cpp | 48 ++++++-- .../LatteDecompilerEmitMSLHeader.hpp | 78 +++++++------ src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h | 2 +- 6 files changed, 179 insertions(+), 93 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index f46c68d4a..7ad258840 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -504,11 +504,31 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, uint64 vsHash2 = 0; _calculateShaderProgramHash(vsProgramCode, vertexShaderSize, &hashCacheVS, &vsHash1, &vsHash2); uint64 vsHash = vsHash1 + vsHash2 + _activeFetchShader->key + _activePSImportTable.key + (usesGeometryShader ? 
0x1111ULL : 0ULL); + + uint32 tmp = LatteGPUState.contextNew.PA_CL_VTE_CNTL.getRawValue() ^ 0x43F; + vsHash += tmp; + + auto primitiveType = LatteGPUState.contextNew.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + // TODO: include always in the hash in case of geometry shader or rect shader + if (primitiveType == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS) + { + vsHash += 13ULL; + } + else if (primitiveType == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::POINTS) + { + // required for Vulkan since we have to write the pointsize in the shader + vsHash += 71ULL; + } + vsHash += (LatteGPUState.contextRegister[mmVGT_STRMOUT_EN] ? 21 : 0); + // halfZ + if (LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF()) + vsHash += 0x1537; + if (g_renderer->GetType() == RendererAPI::Metal) { if (usesGeometryShader || _activeFetchShader->mtlFetchVertexManually) { - for (sint32 g = 0; g < _activeFetchShader->bufferGroups.size(); g++) + for (sint32 g = 0; g < _activeFetchShader->bufferGroups.size(); g++) { LatteParsedFetchShaderBufferGroup_t& group = _activeFetchShader->bufferGroups[g]; uint32 bufferIndex = group.attributeBufferIndex; @@ -522,47 +542,28 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, if (!usesGeometryShader) { - // Rasterization - bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + // Rasterization + bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - // HACK - if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) - rasterizationEnabled = true; + // HACK + if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; - const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; - uint32 cullFront = polygonControlReg.get_CULL_FRONT(); - uint32 cullBack = polygonControlReg.get_CULL_BACK(); - if (cullFront && cullBack) - 
rasterizationEnabled = false; + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; - if (rasterizationEnabled) - vsHash += 51ULL; + if (rasterizationEnabled) + vsHash += 51ULL; - // Vertex fetch - if (_activeFetchShader->mtlFetchVertexManually) - vsHash += 349ULL; + // Vertex fetch + if (_activeFetchShader->mtlFetchVertexManually) + vsHash += 349ULL; } } - uint32 tmp = LatteGPUState.contextNew.PA_CL_VTE_CNTL.getRawValue() ^ 0x43F; - vsHash += tmp; - - auto primitiveType = LatteGPUState.contextNew.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); - // TODO: include always in the hash in case of geometry shader or rect shader - if (primitiveType == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS) - { - vsHash += 13ULL; - } - else if (primitiveType == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::POINTS) - { - // required for Vulkan since we have to write the pointsize in the shader - vsHash += 71ULL; - } - vsHash += (LatteGPUState.contextRegister[mmVGT_STRMOUT_EN] ? 21 : 0); - // halfZ - if (LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF()) - vsHash += 0x1537; - _shaderBaseHash_vs = vsHash; } @@ -589,19 +590,6 @@ void LatteSHRC_UpdatePSBaseHash(uint8* pixelShaderPtr, uint32 pixelShaderSize, b // get vertex shader uint64 psHash = psHash1 + psHash2 + _activePSImportTable.key + (usesGeometryShader ? 
hashCacheGS.prevHash1 : 0ULL); -#if ENABLE_METAL - if (g_renderer->GetType() == RendererAPI::Metal) - { - for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) - { - auto format = LatteMRT::GetColorBufferFormat(i, LatteGPUState.contextNew); - uint8 dataType = (uint8)GetMtlPixelFormatInfo(format, false).dataType; - psHash += (uint64)dataType; - psHash = std::rotl(psHash, 7); - } - } -#endif - _shaderBaseHash_ps = psHash; } @@ -635,6 +623,7 @@ uint64 LatteSHRC_CalcVSAuxHash(LatteDecompilerShader* vertexShader, uint32* cont auxHashTex += 0x333; } } + return auxHash + auxHashTex; } @@ -668,6 +657,28 @@ uint64 LatteSHRC_CalcPSAuxHash(LatteDecompilerShader* pixelShader, uint32* conte auxHash = (auxHash << 3) | (auxHash >> 61); auxHash += (uint64)dim; } + + // Textures as render targets + for (uint32 i = 0; i < pixelShader->textureUnitListCount; i++) + { + uint8 t = pixelShader->textureUnitList[i]; + auxHash = std::rotl(auxHash, 11); + auxHash += (uint64)pixelShader->textureRenderTargetIndex[t]; + } + +#if ENABLE_METAL + if (g_renderer->GetType() == RendererAPI::Metal) + { + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto format = LatteMRT::GetColorBufferFormat(i, LatteGPUState.contextNew); + uint8 dataType = (uint8)GetMtlPixelFormatInfo(format, false).dataType; + auxHash = std::rotl(auxHash, 7); + auxHash += (uint64)dataType; + } + } +#endif + return auxHash; } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 2812facc7..64aa1413d 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -183,6 +183,7 @@ struct LatteDecompilerShader std::bitset textureUnitMask2; uint16 textureUnitSamplerAssignment[LATTE_NUM_MAX_TEX_UNITS]{ 0 }; // LATTE_DECOMPILER_SAMPLER_NONE means undefined bool textureUsesDepthCompare[LATTE_NUM_MAX_TEX_UNITS]{}; + uint8 textureRenderTargetIndex[LATTE_NUM_MAX_TEX_UNITS] = 
{255}; // analyzer stage (pixel outputs) uint32 pixelColorOutputMask{ 0 }; // from LSB to MSB, 1 bit per written output. 1 if written (indices of color attachments) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index b9ca7b5d4..2d0c7f762 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -850,6 +850,42 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD shader->textureUnitList[shader->textureUnitListCount] = i; shader->textureUnitListCount++; } + shader->textureRenderTargetIndex[i] = 255; + } + // check if textures are used as render targets + if (shader->shaderType == LatteConst::ShaderType::Pixel) + { + for (sint32 i = 0; i < shader->textureUnitListCount; i++) + { + sint32 textureIndex = shader->textureUnitList[i]; + const auto& texRegister = texRegs[textureIndex]; + + // get physical address of texture data + MPTR physAddr = (texRegister.word2.get_BASE_ADDRESS() << 8); + if (physAddr == MPTR_NULL) + continue; // invalid data + + for (sint32 j = 0; j < LATTE_NUM_COLOR_TARGET; j++) + { + uint32* colorBufferRegBase = shaderContext->contextRegisters + (mmCB_COLOR0_BASE + j); + uint32 regColorBufferBase = colorBufferRegBase[mmCB_COLOR0_BASE - mmCB_COLOR0_BASE] & 0xFFFFFF00; // the low 8 bits are ignored? 
How to Survive seems to rely on this + uint32 regColorSize = colorBufferRegBase[mmCB_COLOR0_SIZE - mmCB_COLOR0_BASE]; + uint32 regColorInfo = colorBufferRegBase[mmCB_COLOR0_INFO - mmCB_COLOR0_BASE]; + uint32 regColorView = colorBufferRegBase[mmCB_COLOR0_VIEW - mmCB_COLOR0_BASE]; + // decode color buffer reg info + Latte::E_HWTILEMODE colorBufferTileMode = (Latte::E_HWTILEMODE)((regColorInfo >> 8) & 0xF); + uint32 numberType = (regColorInfo >> 12) & 7; + Latte::E_GX2SURFFMT colorBufferFormat = LatteMRT::GetColorBufferFormat(j, *shaderContext->contextRegistersNew); + + MPTR colorBufferPhysMem = regColorBufferBase; + + if (physAddr == colorBufferPhysMem) + { + shader->textureRenderTargetIndex[i] = j; + break; + } + } + } } // for geometry shaders check the copy shader for stream writes if (shader->shaderType == LatteConst::ShaderType::Geometry && shaderContext->parsedGSCopyShader->list_streamWrites.empty() == false) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index b925f862c..22c511ba1 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2260,6 +2260,22 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex return; } + // Do a framebuffer fetch if possible + // TODO: filter out more? 
+ uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex]; + if (renderTargetIndex != 255) + { + src->addFmt("col{}.", renderTargetIndex); + // TODO: clean up + std::string components[] = {"x", "y", "z", "w"}; + for (sint32 i = 0; i < numWrittenElements; i++) + { + src->addFmt("{}", components[i]); + } + src->add(");" _CRLF); + return; + } + if (emulateCompare) { cemu_assert_debug(!isGather); @@ -2630,20 +2646,28 @@ static void _emitTEXGetTextureResInfoCode(LatteDecompilerShaderContext* shaderCo // todo - mip index parameter? - auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; - - if (texDim == Latte::E_DIM::DIM_1D) - src->addFmt(" = int4(tex{}.get_width(), 1, 1, 1).", texInstruction->textureFetch.textureIndex); - else if (texDim == Latte::E_DIM::DIM_1D_ARRAY) - src->addFmt(" = int4(tex{}.get_width(), tex{}.get_array_size(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); - else if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) - src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); - else if (texDim == Latte::E_DIM::DIM_2D_ARRAY) - src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), tex{}.get_array_size(), 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + if (shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex] != 255) + { + // TODO: use the render target size + src->addFmt(" = int4(1920, 1080, 1, 1)."); + } else { - cemu_assert_debug(false); - src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + auto texDim = 
shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; + + if (texDim == Latte::E_DIM::DIM_1D) + src->addFmt(" = int4(tex{}.get_width(), 1, 1, 1).", texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_1D_ARRAY) + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_array_size(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_2D_ARRAY) + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), tex{}.get_array_size(), 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + else + { + cemu_assert_debug(false); + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + } } for(sint32 f=0; f<4; f++) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 2d871d99d..40e704554 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -448,6 +448,8 @@ namespace LatteDecompiler static void _emitTextureDefinitions(LatteDecompilerShaderContext* shaderContext) { + bool renderTargetIndexUsed[LATTE_NUM_COLOR_TARGET] = {false}; + auto src = shaderContext->shaderSource; // texture sampler definition for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) @@ -455,44 +457,56 @@ namespace LatteDecompiler if (!shaderContext->output->textureUnitMask[i]) continue; - src->add(", "); + uint8 renderTargetIndex = 
shaderContext->shader->textureRenderTargetIndex[i]; + if (renderTargetIndex == 255) + { + src->add(", "); - // Only 2D and 2D array textures can be used with comparison samplers - if (shaderContext->shader->textureUsesDepthCompare[i] && IsValidDepthTextureType(shaderContext->shader->textureUnitDim[i])) - src->add("depth"); - else - src->add("texture"); + // Only 2D and 2D array textures can be used with comparison samplers + if (shaderContext->shader->textureUsesDepthCompare[i] && IsValidDepthTextureType(shaderContext->shader->textureUnitDim[i])) + src->add("depth"); + else + src->add("texture"); - if (shaderContext->shader->textureIsIntegerFormat[i]) - { - // integer samplers - if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) - src->add("1d"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) - src->add("2d"); - else - cemu_assert_unimplemented(); + if (shaderContext->shader->textureIsIntegerFormat[i]) + { + // integer samplers + if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("2d"); + else + cemu_assert_unimplemented(); + } + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("2d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_ARRAY) + src->add("2d_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_CUBEMAP) + src->add("cube_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_3D) + src->add("3d"); + else + { + cemu_assert_unimplemented(); + } + + uint32 
binding = shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i]; + //uint32 textureBinding = shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i] % 31; + //uint32 samplerBinding = textureBinding % 16; + src->addFmt(" tex{} [[texture({})]]", i, binding); + src->addFmt(", sampler samplr{} [[sampler({})]]", i, binding); } - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) - src->add("2d"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) - src->add("1d"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_ARRAY) - src->add("2d_array"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_CUBEMAP) - src->add("cube_array"); - else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_3D) - src->add("3d"); else { - cemu_assert_unimplemented(); + if (!renderTargetIndexUsed[renderTargetIndex]) + { + src->addFmt(", {} col{} [[color({})]]", GetDataTypeStr(GetColorBufferDataType(renderTargetIndex, *shaderContext->contextRegistersNew)), renderTargetIndex, renderTargetIndex); + renderTargetIndexUsed[renderTargetIndex] = true; + } } - - uint32 binding = shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i]; - //uint32 textureBinding = shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i] % 31; - //uint32 samplerBinding = textureBinding % 16; - src->addFmt(" tex{} [[texture({})]]", i, binding); - src->addFmt(", sampler samplr{} [[sampler({})]]", i, binding); } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h index 7544ceed9..ef25ca5d5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -55,7 +55,7 @@ inline const char* GetDataTypeStr(MetalDataType dataType) return "float4"; default: cemu_assert_suspicious(); - 
return ""; + return "INVALID"; } } From 68d328b0947eb0c23dec734fc28f577958f24401 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 8 Jan 2025 16:44:54 +0100 Subject: [PATCH 305/368] mask out color attachments --- .../HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h | 2 +- .../LatteDecompilerAnalyzer.cpp | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 64aa1413d..21f6d2b2b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -183,7 +183,7 @@ struct LatteDecompilerShader std::bitset textureUnitMask2; uint16 textureUnitSamplerAssignment[LATTE_NUM_MAX_TEX_UNITS]{ 0 }; // LATTE_DECOMPILER_SAMPLER_NONE means undefined bool textureUsesDepthCompare[LATTE_NUM_MAX_TEX_UNITS]{}; - uint8 textureRenderTargetIndex[LATTE_NUM_MAX_TEX_UNITS] = {255}; + uint8 textureRenderTargetIndex[LATTE_NUM_MAX_TEX_UNITS]; // analyzer stage (pixel outputs) uint32 pixelColorOutputMask{ 0 }; // from LSB to MSB, 1 bit per written output. 
1 if written (indices of color attachments) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index 2d0c7f762..b1ede036a 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -855,6 +855,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD // check if textures are used as render targets if (shader->shaderType == LatteConst::ShaderType::Pixel) { + uint8 colorBufferMask = LatteMRT::GetActiveColorBufferMask(shader, *shaderContext->contextRegistersNew); for (sint32 i = 0; i < shader->textureUnitListCount; i++) { sint32 textureIndex = shader->textureUnitList[i]; @@ -867,15 +868,11 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD for (sint32 j = 0; j < LATTE_NUM_COLOR_TARGET; j++) { + if (((colorBufferMask) & (1 << j)) == 0) + continue; // color buffer not enabled + uint32* colorBufferRegBase = shaderContext->contextRegisters + (mmCB_COLOR0_BASE + j); uint32 regColorBufferBase = colorBufferRegBase[mmCB_COLOR0_BASE - mmCB_COLOR0_BASE] & 0xFFFFFF00; // the low 8 bits are ignored? 
How to Survive seems to rely on this - uint32 regColorSize = colorBufferRegBase[mmCB_COLOR0_SIZE - mmCB_COLOR0_BASE]; - uint32 regColorInfo = colorBufferRegBase[mmCB_COLOR0_INFO - mmCB_COLOR0_BASE]; - uint32 regColorView = colorBufferRegBase[mmCB_COLOR0_VIEW - mmCB_COLOR0_BASE]; - // decode color buffer reg info - Latte::E_HWTILEMODE colorBufferTileMode = (Latte::E_HWTILEMODE)((regColorInfo >> 8) & 0xF); - uint32 numberType = (regColorInfo >> 12) & 7; - Latte::E_GX2SURFFMT colorBufferFormat = LatteMRT::GetColorBufferFormat(j, *shaderContext->contextRegistersNew); MPTR colorBufferPhysMem = regColorBufferBase; From 27a31fedabf24d90b0c56b1fa770fe29ce729296 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 8 Jan 2025 16:58:06 +0100 Subject: [PATCH 306/368] clean up accurate barriers --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 23 ++++++++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 3 +-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index c5bdd335c..d9850f071 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1031,7 +1031,11 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader); if (endRenderPass) + { EndEncoding(); + // TODO: only log in debug? 
+ cemuLog_logOnce(LogType::Force, "Ending render pass due to render target self-dependency\n"); + } } // Primitive type @@ -1871,6 +1875,11 @@ bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader) const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); auto hostTextureUnit = relative_textureUnit; auto textureDim = shader->textureUnitDim[relative_textureUnit]; + + // Texture is accessed as a framebuffer fetch, therefore there is no need to flush it + if (shader->textureRenderTargetIndex[relative_textureUnit] != 255) + continue; + auto texUnitRegIndex = hostTextureUnit * 7; switch (shader->shaderType) { @@ -1895,15 +1904,13 @@ bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader) continue; LatteTexture* baseTexture = textureView->baseTexture; - if (!m_state.m_isFirstDrawInRenderPass) + + // If the texture is also used in the current render pass, we need to end the render pass to "flush" the texture + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) { - // If the texture is also used in the current render pass, we need to end the render pass to "flush" the texture - for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) - { - auto colorTarget = m_state.m_activeFBO.m_fbo->colorBuffer[i].texture; - if (colorTarget && colorTarget->baseTexture == baseTexture) - return true; - } + auto colorTarget = m_state.m_activeFBO.m_fbo->colorBuffer[i].texture; + if (colorTarget && colorTarget->baseTexture == baseTexture) + return true; } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 49aa40b10..c3898f3a6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -125,8 +125,7 @@ struct MetalState MetalActiveFBOState m_lastUsedFBO; size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS]; - // TODO: find out what is the max number of bound textures on the Wii U - class 
LatteTextureViewMtl* m_textures[64] = {nullptr}; + class LatteTextureViewMtl* m_textures[LATTE_NUM_MAX_TEX_UNITS] = {nullptr}; size_t m_uniformBufferOffsets[METAL_GENERAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; MTL::Viewport m_viewport; From 6d1d739de5713497fdf72d5de58fa04f1a6df08b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 9 Jan 2025 16:03:53 +0100 Subject: [PATCH 307/368] limit framebuffer fetch to 2D textures --- .../LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index b1ede036a..0f9f28c61 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -866,6 +866,13 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD if (physAddr == MPTR_NULL) continue; // invalid data + // Check for dimension + auto dim = shader->textureUnitDim[textureIndex]; + // TODO: 2D arrays could technically be supported as well + if (dim != Latte::E_DIM::DIM_2D) + continue; + + // Check if the texture is used as render target for (sint32 j = 0; j < LATTE_NUM_COLOR_TARGET; j++) { if (((colorBufferMask) & (1 << j)) == 0) @@ -876,6 +883,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD MPTR colorBufferPhysMem = regColorBufferBase; + // TODO: check if mip matches as well? 
if (physAddr == colorBufferPhysMem) { shader->textureRenderTargetIndex[i] = j; From 03ec23140b8de7ce8839aa84b3f186e58e8fc7f5 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 9 Jan 2025 16:09:02 +0100 Subject: [PATCH 308/368] don't bind textures when framebuffer fetched --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index d9850f071..176ae6df1 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1926,6 +1926,11 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); auto hostTextureUnit = relative_textureUnit; + + // Don't bind textures that are accessed with a framebuffer fetch + if (shader->textureRenderTargetIndex[relative_textureUnit] != 255) + continue; + auto textureDim = shader->textureUnitDim[relative_textureUnit]; auto texUnitRegIndex = hostTextureUnit * 7; switch (shader->shaderType) From 3fae686f21a69ca93044ebfa030b6862fd29b847 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 9 Jan 2025 16:11:52 +0100 Subject: [PATCH 309/368] remove the accurate barriers option --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 4 ++++ src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 176ae6df1..6bb7964cb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1008,6 +1008,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); const auto fetchShader = 
LatteSHRC_GetActiveFetchShader(); + /* bool neverSkipAccurateBarrier = false; // "Accurate barriers" is usually enabled globally but since the CPU cost is substantial we allow users to disable it (debug -> 'Accurate barriers' option) @@ -1037,6 +1038,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 cemuLog_logOnce(LogType::Force, "Ending render pass due to render target self-dependency\n"); } } + */ // Primitive type const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); @@ -1867,6 +1869,7 @@ bool MetalRenderer::AcquireDrawable(bool mainWindow) return layer.AcquireDrawable(); } +/* bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader) { sint32 textureCount = shader->resourceMapping.getTextureCount(); @@ -1916,6 +1919,7 @@ bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader) return false; } +*/ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index c3898f3a6..60fb8e034 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -362,7 +362,7 @@ class MetalRenderer : public Renderer bool AcquireDrawable(bool mainWindow); - bool CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader); + //bool CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader); void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader); void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); From a0239cb75648e96267f8df2a9352cae3d0e2d0f1 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 9 Jan 2025 16:27:45 +0100 Subject: [PATCH 310/368] check for 
framebuffer fetch support --- .../LatteDecompilerEmitMSL.cpp | 33 ++++++++++--------- .../LatteDecompilerEmitMSLHeader.hpp | 24 +++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 26 ++++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 6 ++++ 4 files changed, 50 insertions(+), 39 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 22c511ba1..a9e3184c9 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -10,7 +10,7 @@ #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" #include "config/ActiveSettings.h" #include "util/helpers/StringBuf.h" @@ -2261,19 +2261,22 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } // Do a framebuffer fetch if possible - // TODO: filter out more? - uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex]; - if (renderTargetIndex != 255) - { - src->addFmt("col{}.", renderTargetIndex); - // TODO: clean up - std::string components[] = {"x", "y", "z", "w"}; - for (sint32 i = 0; i < numWrittenElements; i++) - { - src->addFmt("{}", components[i]); - } - src->add(");" _CRLF); - return; + if (static_cast(g_renderer.get())->SupportsFramebufferFetch()) + { + // TODO: filter out more? 
+ uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex]; + if (renderTargetIndex != 255) + { + src->addFmt("col{}.", renderTargetIndex); + // TODO: clean up + std::string components[] = {"x", "y", "z", "w"}; + for (sint32 i = 0; i < numWrittenElements; i++) + { + src->addFmt("{}", components[i]); + } + src->add(");" _CRLF); + return; + } } if (emulateCompare) @@ -2646,7 +2649,7 @@ static void _emitTEXGetTextureResInfoCode(LatteDecompilerShaderContext* shaderCo // todo - mip index parameter? - if (shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex] != 255) + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex] != 255) { // TODO: use the render target size src->addFmt(" = int4(1920, 1080, 1, 1)."); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 40e704554..84722a24b 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -1,7 +1,7 @@ #pragma once #include "Common/precompiled.h" -#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "HW/Latte/Core/LatteShader.h" namespace LatteDecompiler @@ -458,11 +458,19 @@ namespace LatteDecompiler continue; uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[i]; - if (renderTargetIndex == 255) + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && renderTargetIndex != 255) { - src->add(", "); + if (!renderTargetIndexUsed[renderTargetIndex]) + { + src->addFmt(", {} col{} [[color({})]]", GetDataTypeStr(GetColorBufferDataType(renderTargetIndex, *shaderContext->contextRegistersNew)), renderTargetIndex, 
renderTargetIndex); + renderTargetIndexUsed[renderTargetIndex] = true; + } + } + else + { + src->add(", "); - // Only 2D and 2D array textures can be used with comparison samplers + // Only certain texture dimensions can be used with comparison samplers if (shaderContext->shader->textureUsesDepthCompare[i] && IsValidDepthTextureType(shaderContext->shader->textureUnitDim[i])) src->add("depth"); else @@ -499,14 +507,6 @@ namespace LatteDecompiler src->addFmt(" tex{} [[texture({})]]", i, binding); src->addFmt(", sampler samplr{} [[sampler({})]]", i, binding); } - else - { - if (!renderTargetIndexUsed[renderTargetIndex]) - { - src->addFmt(", {} col{} [[color({})]]", GetDataTypeStr(GetColorBufferDataType(renderTargetIndex, *shaderContext->contextRegistersNew)), renderTargetIndex, renderTargetIndex); - renderTargetIndexUsed[renderTargetIndex] = true; - } - } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 6bb7964cb..bd6f93158 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -87,6 +87,7 @@ MetalRenderer::MetalRenderer() // Feature support m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsFramebufferFetch = m_device->supportsFamily(MTL::GPUFamilyApple2); m_hasUnifiedMemory = m_device->hasUnifiedMemory(); m_supportsMetal3 = m_device->supportsFamily(MTL::GPUFamilyMetal3); m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize(); @@ -584,21 +585,22 @@ void MetalRenderer::DeleteFontTextures() void MetalRenderer::AppendOverlayDebugInfo() { ImGui::Text("--- GPU info ---"); - ImGui::Text("GPU %s", m_device->name()->utf8String()); - ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no")); - ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no")); - ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? 
"yes" : "no")); + ImGui::Text("GPU %s", m_device->name()->utf8String()); + ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no")); + ImGui::Text("Supports framebuffer fetch %s", (m_supportsFramebufferFetch ? "yes" : "no")); + ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no")); + ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? "yes" : "no")); ImGui::Text("--- Metal info ---"); - ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize()); - ImGui::Text("Buffer allocator memory %zuMB", m_performanceMonitor.m_bufferAllocatorMemory / 1024 / 1024); + ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize()); + ImGui::Text("Buffer allocator memory %zuMB", m_performanceMonitor.m_bufferAllocatorMemory / 1024 / 1024); ImGui::Text("--- Metal info (per frame) ---"); - ImGui::Text("Command buffers %u", m_performanceMonitor.m_commandBuffers); - ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); - ImGui::Text("Clears %u", m_performanceMonitor.m_clears); - ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws); - ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans); + ImGui::Text("Command buffers %u", m_performanceMonitor.m_commandBuffers); + ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); + ImGui::Text("Clears %u", m_performanceMonitor.m_clears); + ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws); + ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans); } void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) @@ -1932,7 +1934,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE auto hostTextureUnit = relative_textureUnit; // Don't bind textures 
that are accessed with a framebuffer fetch - if (shader->textureRenderTargetIndex[relative_textureUnit] != 255) + if (m_supportsFramebufferFetch && shader->textureRenderTargetIndex[relative_textureUnit] != 255) continue; auto textureDim = shader->textureUnitDim[relative_textureUnit]; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 60fb8e034..db79471d0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -375,6 +375,11 @@ class MetalRenderer : public Renderer return m_isAppleGPU; } + bool SupportsFramebufferFetch() const + { + return m_supportsFramebufferFetch; + } + bool HasUnifiedMemory() const { return m_hasUnifiedMemory; @@ -477,6 +482,7 @@ class MetalRenderer : public Renderer // Feature support bool m_isAppleGPU; + bool m_supportsFramebufferFetch; bool m_hasUnifiedMemory; bool m_supportsMetal3; uint32 m_recommendedMaxVRAMUsage; From d9bf99cb208beed8601675bcc3a56757cf995b5f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 9 Jan 2025 20:34:49 +0100 Subject: [PATCH 311/368] fix: a typo in texture index --- .../LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index 0f9f28c61..9a40d7436 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -872,6 +872,14 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD if (dim != Latte::E_DIM::DIM_2D) continue; + // Check for mip level + // TODO: uncomment? 
+ /* + auto lastMip = texRegister.word5.get_LAST_LEVEL(); + if (lastMip != 0) + continue; + */ + // Check if the texture is used as render target for (sint32 j = 0; j < LATTE_NUM_COLOR_TARGET; j++) { @@ -886,7 +894,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD // TODO: check if mip matches as well? if (physAddr == colorBufferPhysMem) { - shader->textureRenderTargetIndex[i] = j; + shader->textureRenderTargetIndex[textureIndex] = j; break; } } From 4479584eb644daa5f6fc3ad52c5269b46c9c77ca Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 10 Jan 2025 09:55:35 +0100 Subject: [PATCH 312/368] support texture lod query for framebuffer fetch --- .../LatteDecompilerEmitMSL.cpp | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index a9e3184c9..bcfe2d3d2 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2261,9 +2261,9 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } // Do a framebuffer fetch if possible + // TODO: support comparison samplers if (static_cast(g_renderer.get())->SupportsFramebufferFetch()) { - // TODO: filter out more? 
uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex]; if (renderTargetIndex != 255) { @@ -2720,24 +2720,31 @@ static void _emitTEXGetCompTexLodCode(LatteDecompilerShaderContext* shaderContex src->add(" = "); _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); - if( shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex] == Latte::E_DIM::DIM_CUBEMAP ) + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex] != 255) { - // 3 coordinates - if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); - else - src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, bitCast({}.{}{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + // We assume that textures accessed as framebuffer fetch are always sampled at pixel coordinates, therefore the lod would always be 0.0 + src->add("float4(0.0, 0.0, 0.0, 0.0)"); } else { - if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}),0.0,0.0)", texInstruction->textureFetch.textureIndex, 
texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); - else - src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, bitCast({}.{}{})),0.0,0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); - debugBreakpoint(); + if (shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex] == Latte::E_DIM::DIM_CUBEMAP) + { + // 3 coordinates + if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}{}), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + else + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, bitCast({}.{}{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + } + else + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), 
resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + else + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, bitCast({}.{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + debugBreakpoint(); + } } - _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); src->add("."); From ca3fe9610434a4fe4b61acca9cc5f8c4f3cf9db5 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 10 Jan 2025 15:40:19 +0100 Subject: [PATCH 313/368] fix: component indexing for framebuffer fetch --- .../LatteDecompilerEmitMSL.cpp | 482 +++++++++--------- 1 file changed, 237 insertions(+), 245 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index bcfe2d3d2..ba0180dc9 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2261,282 +2261,274 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } // Do a framebuffer fetch if possible - // TODO: support comparison samplers - if (static_cast(g_renderer.get())->SupportsFramebufferFetch()) + uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex]; + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && renderTargetIndex != 255) { - uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex]; - if (renderTargetIndex != 255) - { - src->addFmt("col{}.", renderTargetIndex); - // TODO: clean up - std::string components[] = {"x", 
"y", "z", "w"}; - for (sint32 i = 0; i < numWrittenElements; i++) - { - src->addFmt("{}", components[i]); - } - src->add(");" _CRLF); - return; - } + // TODO: support comparison samplers + src->addFmt("col{}", renderTargetIndex); } - - if (emulateCompare) + else { - cemu_assert_debug(!isGather); + if (emulateCompare) + { + cemu_assert_debug(!isGather); - src->add("sampleCompareEmulate("); - } + src->add("sampleCompareEmulate("); + } - src->addFmt("tex{}", texInstruction->textureFetch.textureIndex); - if (!emulateCompare) - { - src->add("."); - if (isRead) + src->addFmt("tex{}", texInstruction->textureFetch.textureIndex); + if (!emulateCompare) { - if (hasOffset) - cemu_assert_unimplemented(); - src->add("read("); - unnormalizationHandled = true; - useTexelCoordinates = true; + src->add("."); + if (isRead) + { + if (hasOffset) + cemu_assert_unimplemented(); + src->add("read("); + unnormalizationHandled = true; + useTexelCoordinates = true; + } + else + { + if (isGather) + src->add("gather"); + else + src->add("sample"); + if (isCompare) + src->add("_compare"); + src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); + } } else { - if (isGather) - src->add("gather"); - else - src->add("sample"); - if (isCompare) - src->add("_compare"); - src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); + src->addFmt(", samplr{}, ", texInstruction->textureFetch.textureIndex); } - } - else - { - src->addFmt(", samplr{}, ", texInstruction->textureFetch.textureIndex); - } - - // for textureGather() add shift (todo: depends on rounding mode set in sampler registers?) 
- if (texOpcode == GPU7_TEX_INST_FETCH4) - { - if (texDim == Latte::E_DIM::DIM_2D) - { - //src->addFmt2("(vec2(-0.1) / vec2(textureSize(tex{},0).xy)) + ", texInstruction->textureIndex); - // vec2(-0.00001) is minimum to break Nvidia - // vec2(0.0001) is minimum to fix shadows on Intel, also fixes it on AMD (Windows and Linux) + // for textureGather() add shift (todo: depends on rounding mode set in sampler registers?) + if (texOpcode == GPU7_TEX_INST_FETCH4) + { + if (texDim == Latte::E_DIM::DIM_2D) + { + //src->addFmt2("(vec2(-0.1) / vec2(textureSize(tex{},0).xy)) + ", texInstruction->textureIndex); - // todo - emulating coordinate rounding mode correctly is tricky - // GX2 supports two modes: Truncate or rounding according to DX9 rules - // Vulkan uses truncate mode when point sampling (min and mag is both nearest) otherwise it uses rounding + // vec2(-0.00001) is minimum to break Nvidia + // vec2(0.0001) is minimum to fix shadows on Intel, also fixes it on AMD (Windows and Linux) - // adding a small fixed bias is enough to avoid vendor-specific cases where small inaccuracies cause the number to get rounded down due to truncation - src->addFmt("float2(0.0001) + "); - } - } + // todo - emulating coordinate rounding mode correctly is tricky + // GX2 supports two modes: Truncate or rounding according to DX9 rules + // Vulkan uses truncate mode when point sampling (min and mag is both nearest) otherwise it uses rounding - const sint32 texCoordDataType = (texOpcode == GPU7_TEX_INST_LD) ? 
LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT; - if(useTexelCoordinates) - { - // handle integer coordinates for texelFetch - if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) - { - src->add("uint2("); - src->add("float2("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, texCoordDataType); - src->addFmt(", "); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, texCoordDataType); + // adding a small fixed bias is enough to avoid vendor-specific cases where small inaccuracies cause the number to get rounded down due to truncation + src->addFmt("float2(0.0001) + "); + } + } - src->addFmt(")*supportBuffer.tex{}Scale", texInstruction->textureFetch.textureIndex); // close float2 and scale + const sint32 texCoordDataType = (texOpcode == GPU7_TEX_INST_LD) ? LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT; + if(useTexelCoordinates) + { + // handle integer coordinates for texelFetch + if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) + { + src->add("uint2("); + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, texCoordDataType); + src->addFmt(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, texCoordDataType); - src->add("), 0"); // close int2 and lod param - // todo - lod - } - else if (texDim == Latte::E_DIM::DIM_1D) - { - // VC DS games forget to initialize textures and use texel fetch on an uninitialized texture (a dim of 0 maps to 1D) - src->add("uint("); - src->add("float("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, (texOpcode == GPU7_TEX_INST_LD) ? 
LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT); - src->addFmt(")*supportBuffer.tex{}Scale.x", texInstruction->textureFetch.textureIndex); - src->add("), 0"); - // todo - lod - } - else - cemu_assert_debug(false); - } - else /* useTexelCoordinates == false */ - { - // float coordinates - if ( (texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_L || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) ) - { - // shadow sampler - if (texDim == Latte::E_DIM::DIM_2D_ARRAY) - { - // 3 coords + compare value - src->add("float2("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(", "); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add("), uint(rint("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add("))"); + src->addFmt(")*supportBuffer.tex{}Scale", texInstruction->textureFetch.textureIndex); // close float2 and scale - src->addFmt(", {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); - } - else if (texDim == Latte::E_DIM::DIM_CUBEMAP) - { - // 2 coords + faceId - if (texInstruction->textureFetch.srcSel[0] >= 4 || texInstruction->textureFetch.srcSel[1] >= 4) - { - debugBreakpoint(); - } - src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); - src->addFmt(")"); - src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index - } - else if (texDim == Latte::E_DIM::DIM_1D) - { - // 1 coord + 1 unused coord (per spec) + compare value - if 
(texInstruction->textureFetch.srcSel[0] >= 4) - { - debugBreakpoint(); - } - src->addFmt("{}, {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); - } - else - { - // 2 coords + compare value (as float3) - if (texInstruction->textureFetch.srcSel[0] >= 4 && texInstruction->textureFetch.srcSel[1] >= 4) - { - debugBreakpoint(); - } - src->addFmt("float2({}), {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); - } - } - else if(texDim == Latte::E_DIM::DIM_2D_ARRAY) - { - // 3 coords - src->add("float2("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(", "); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add("), uint(rint("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add("))"); - } - else if(texDim == Latte::E_DIM::DIM_3D) - { - // 3 coords - src->add("float3("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(", "); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(", "); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(")"); - } - else if( texDim == Latte::E_DIM::DIM_CUBEMAP ) - { - // 2 coords + faceId - 
cemu_assert_debug(texInstruction->textureFetch.srcSel[0] < 4); - cemu_assert_debug(texInstruction->textureFetch.srcSel[1] < 4); - src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); - src->add(")"); - src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index - } - else if( texDim == Latte::E_DIM::DIM_1D ) - { - // 1 coord - src->add(_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0)); - } - else - { - // 2 coords - src->add("float2("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(","); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(")"); - // avoid truncate to effectively round downwards on texel edges - if (ActiveSettings::ForceSamplerRoundToPrecision()) - src->addFmt("+ float2(1.0)/float2(tex{}.get_width(), tex{}.get_height())/512.0", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); - } - // lod or lod bias parameter - // 1D textures don't support lod - if (texDim != Latte::E_DIM::DIM_1D && texDim != Latte::E_DIM::DIM_1D_ARRAY) - { - if (texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + src->add("), 0"); // close int2 and lod param + // todo - lod + } + else if (texDim == Latte::E_DIM::DIM_1D) + { + // VC DS games forget to initialize textures and use texel fetch on an uninitialized texture (a dim of 0 maps to 1D) + src->add("uint("); + src->add("float("); + _emitTEXSampleCoordInputComponent(shaderContext, 
texInstruction, 0, (texOpcode == GPU7_TEX_INST_LD) ? LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT); + src->addFmt(")*supportBuffer.tex{}Scale.x", texInstruction->textureFetch.textureIndex); + src->add("), 0"); + // todo - lod + } + else + cemu_assert_debug(false); + } + else /* useTexelCoordinates == false */ + { + // float coordinates + if ( (texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_L || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) ) { - src->add(", "); - if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) + // shadow sampler + if (texDim == Latte::E_DIM::DIM_2D_ARRAY) { - src->addFmt("bias({})", _FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + // 3 coords + compare value + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("), uint(rint("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("))"); + + src->addFmt(", {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); + } + else if (texDim == Latte::E_DIM::DIM_CUBEMAP) + { + // 2 coords + faceId + if (texInstruction->textureFetch.srcSel[0] >= 4 || texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->addFmt(")"); + src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + } + else if 
(texDim == Latte::E_DIM::DIM_1D) + { + // 1 coord + 1 unused coord (per spec) + compare value + if (texInstruction->textureFetch.srcSel[0] >= 4) + { + debugBreakpoint(); + } + src->addFmt("{}, {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); } else { - // TODO: is this correct? - src->add("level("); - _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(")"); + // 2 coords + compare value (as float3) + if (texInstruction->textureFetch.srcSel[0] >= 4 && texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->addFmt("float2({}), {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); } } - else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + else if(texDim == Latte::E_DIM::DIM_2D_ARRAY) { - src->add(", level(0.0)"); + // 3 coords + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("), uint(rint("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("))"); } - } - } - // gradient parameters - if (texOpcode == GPU7_TEX_INST_SAMPLE_G) - { - if (texDim == Latte::E_DIM::DIM_2D || - texDim == Latte::E_DIM::DIM_1D) - { - src->add(", gradient2d(gradH.xy, 
gradV.xy)"); - } - else - { - cemu_assert_unimplemented(); - } - } + else if(texDim == Latte::E_DIM::DIM_3D) + { + // 3 coords + src->add("float3("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else if( texDim == Latte::E_DIM::DIM_CUBEMAP ) + { + // 2 coords + faceId + cemu_assert_debug(texInstruction->textureFetch.srcSel[0] < 4); + cemu_assert_debug(texInstruction->textureFetch.srcSel[1] < 4); + src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + } + else if( texDim == Latte::E_DIM::DIM_1D ) + { + // 1 coord + src->add(_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0)); + } + else + { + // 2 coords + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + // avoid truncate to effectively round downwards on texel edges + if (ActiveSettings::ForceSamplerRoundToPrecision()) + src->addFmt("+ float2(1.0)/float2(tex{}.get_width(), tex{}.get_height())/512.0", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + } + // lod or lod 
bias parameter + // 1D textures don't support lod + if (texDim != Latte::E_DIM::DIM_1D && texDim != Latte::E_DIM::DIM_1D_ARRAY) + { + if (texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + { + src->add(", "); + if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) + { + src->addFmt("bias({})", _FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + } + else + { + // TODO: is this correct? + src->add("level("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + { + src->add(", level(0.0)"); + } + } + } + // gradient parameters + if (texOpcode == GPU7_TEX_INST_SAMPLE_G) + { + if (texDim == Latte::E_DIM::DIM_2D || + texDim == Latte::E_DIM::DIM_1D) + { + src->add(", gradient2d(gradH.xy, gradV.xy)"); + } + else + { + cemu_assert_unimplemented(); + } + } - // offset - if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ || texOpcode == GPU7_TEX_INST_SAMPLE || texOpcode == GPU7_TEX_INST_SAMPLE_C ) - { - if( hasOffset ) - { - uint8 offsetComponentCount = 0; - if( texDim == Latte::E_DIM::DIM_1D ) - offsetComponentCount = 1; - else if( texDim == Latte::E_DIM::DIM_2D ) - offsetComponentCount = 2; - else if( texDim == Latte::E_DIM::DIM_3D ) - offsetComponentCount = 3; - else if( texDim == Latte::E_DIM::DIM_2D_ARRAY ) - offsetComponentCount = 2; - else - cemu_assert_unimplemented(); + // offset + if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ || texOpcode == GPU7_TEX_INST_SAMPLE || texOpcode == GPU7_TEX_INST_SAMPLE_C ) + { + if( hasOffset ) + { + uint8 offsetComponentCount = 0; + if( texDim == Latte::E_DIM::DIM_1D ) + offsetComponentCount = 1; + else if( texDim == Latte::E_DIM::DIM_2D ) + 
offsetComponentCount = 2; + else if( texDim == Latte::E_DIM::DIM_3D ) + offsetComponentCount = 3; + else if( texDim == Latte::E_DIM::DIM_2D_ARRAY ) + offsetComponentCount = 2; + else + cemu_assert_unimplemented(); - if( (texInstruction->textureFetch.offsetX&1) ) - cemu_assert_unimplemented(); - if( (texInstruction->textureFetch.offsetY&1) ) - cemu_assert_unimplemented(); - if ((texInstruction->textureFetch.offsetZ & 1)) - cemu_assert_unimplemented(); + if( (texInstruction->textureFetch.offsetX&1) ) + cemu_assert_unimplemented(); + if( (texInstruction->textureFetch.offsetY&1) ) + cemu_assert_unimplemented(); + if ((texInstruction->textureFetch.offsetZ & 1)) + cemu_assert_unimplemented(); - if( offsetComponentCount == 1 ) - src->addFmt(",{}", texInstruction->textureFetch.offsetX/2); - else if( offsetComponentCount == 2 ) - src->addFmt(",int2({},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); - else if( offsetComponentCount == 3 ) - src->addFmt(",int3({},{},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); - } - } + if( offsetComponentCount == 1 ) + src->addFmt(",{}", texInstruction->textureFetch.offsetX/2); + else if( offsetComponentCount == 2 ) + src->addFmt(",int2({},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + else if( offsetComponentCount == 3 ) + src->addFmt(",int3({},{},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + } + } - // lod bias (TODO: wht?) + // lod bias (TODO: wht?) 
+ + src->add(")"); + } - src->add(")"); // sample_compare doesn't return a float if (!isCompare) { From 217e2edda3df57e4b5e9d328b201f2676c62f5d0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 11 Jan 2025 10:26:35 +0100 Subject: [PATCH 314/368] check if pixel formats match for framebuffer fetch --- .../LatteDecompilerAnalyzer.cpp | 9 ++++- .../LatteDecompilerEmitMSL.cpp | 36 +++++++++---------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index 9a40d7436..4d924e94e 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -9,6 +9,9 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +// Defined in LatteTextureLegacy.cpp +Latte::E_GX2SURFFMT LatteTexture_ReconstructGX2Format(const Latte::LATTE_SQ_TEX_RESOURCE_WORD1_N& texUnitWord1, const Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N& texUnitWord4); + /* * Return index of used color attachment based on shader pixel export index (0-7) */ @@ -876,10 +879,13 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD // TODO: uncomment? /* auto lastMip = texRegister.word5.get_LAST_LEVEL(); + // TODO: multiple mip levels could technically be supported as well if (lastMip != 0) continue; */ + Latte::E_GX2SURFFMT format = LatteTexture_ReconstructGX2Format(texRegister.word1, texRegister.word4); + // Check if the texture is used as render target for (sint32 j = 0; j < LATTE_NUM_COLOR_TARGET; j++) { @@ -890,9 +896,10 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD uint32 regColorBufferBase = colorBufferRegBase[mmCB_COLOR0_BASE - mmCB_COLOR0_BASE] & 0xFFFFFF00; // the low 8 bits are ignored? 
How to Survive seems to rely on this MPTR colorBufferPhysMem = regColorBufferBase; + Latte::E_GX2SURFFMT colorBufferFormat = LatteMRT::GetColorBufferFormat(j, *shaderContext->contextRegistersNew); // TODO: check if mip matches as well? - if (physAddr == colorBufferPhysMem) + if (physAddr == colorBufferPhysMem && format == colorBufferFormat) { shader->textureRenderTargetIndex[textureIndex] = j; break; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index ba0180dc9..488cc2d58 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2293,7 +2293,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if (isGather) src->add("gather"); else - src->add("sample"); + src->add("sample"); if (isCompare) src->add("_compare"); src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); @@ -2456,25 +2456,25 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // 1D textures don't support lod if (texDim != Latte::E_DIM::DIM_1D && texDim != Latte::E_DIM::DIM_1D_ARRAY) { - if (texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) - { - src->add(", "); - if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) - { - src->addFmt("bias({})", _FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); - } - else - { - // TODO: is this correct? - src->add("level("); + if (texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + { + src->add(", "); + if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) + { + src->addFmt("bias({})", _FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + } + else + { + // TODO: is this correct? 
+ src->add("level("); _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(")"); - } - } - else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) - { - src->add(", level(0.0)"); - } + } + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + { + src->add(", level(0.0)"); + } } } // gradient parameters From f4985c481efe0676f7a77d7f634aab941504e12a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 11 Jan 2025 10:59:28 +0100 Subject: [PATCH 315/368] add an option to disable framebuffer fetch --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- src/config/CemuConfig.cpp | 4 +++- src/config/CemuConfig.h | 3 ++- src/gui/GeneralSettings2.cpp | 14 ++++++++++++++ src/gui/GeneralSettings2.h | 1 + 5 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index bd6f93158..dc3b8ae04 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -87,7 +87,7 @@ MetalRenderer::MetalRenderer() // Feature support m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); - m_supportsFramebufferFetch = m_device->supportsFamily(MTL::GPUFamilyApple2); + m_supportsFramebufferFetch = GetConfig().framebuffer_fetch.GetValue() ? 
m_device->supportsFamily(MTL::GPUFamilyApple2) : false; m_hasUnifiedMemory = m_device->hasUnifiedMemory(); m_supportsMetal3 = m_device->supportsFamily(MTL::GPUFamilyMetal3); m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize(); diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index c22d71503..dc38647a3 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -338,6 +338,7 @@ void CemuConfig::Load(XMLConfigParser& parser) #endif gdb_port = debug.get("GDBPort", 1337); gpu_capture_dir = debug.get("GPUCaptureDir", ""); + framebuffer_fetch = debug.get("FramebufferFetch", true); // input auto input = parser.get("Input"); @@ -540,7 +541,8 @@ void CemuConfig::Save(XMLConfigParser& parser) debug.set("CrashDumpUnix", crash_dump.GetValue()); #endif debug.set("GDBPort", gdb_port); - debug.set("GPUCaptureDir", gpu_capture_dir.GetValue()); + debug.set("GPUCaptureDir", gpu_capture_dir); + debug.set("FramebufferFetch", framebuffer_fetch); // input auto input = config.set("Input"); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 56af04652..0990c6523 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -527,7 +527,8 @@ struct CemuConfig // debug ConfigValueBounds crash_dump{ CrashDump::Disabled }; ConfigValue gdb_port{ 1337 }; - ConfigValue gpu_capture_dir{}; + ConfigValue gpu_capture_dir{ "" }; + ConfigValue framebuffer_fetch{ true }; void Load(XMLConfigParser& parser); void Save(XMLConfigParser& parser); diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index c5bc974d7..31d164819 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -910,6 +910,18 @@ wxPanel* GeneralSettings2::AddDebugPage(wxNotebook* notebook) debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); } + { + auto* debug_row = new wxFlexGridSizer(0, 2, 0, 0); + debug_row->SetFlexibleDirection(wxBOTH); + debug_row->SetNonFlexibleGrowMode(wxFLEX_GROWMODE_SPECIFIED); + + 
m_framebuffer_fetch = new wxCheckBox(panel, wxID_ANY, _("Framebuffer fetch")); + m_framebuffer_fetch->SetToolTip(_("Enable framebuffer fetch for eligible textures on supported devices.")); + + debug_row->Add(m_framebuffer_fetch, 0, wxALL | wxEXPAND, 5); + debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); + } + panel->SetSizerAndFit(debug_panel_sizer); return panel; @@ -1121,6 +1133,7 @@ void GeneralSettings2::StoreConfig() config.crash_dump = (CrashDump)m_crash_dump->GetSelection(); config.gdb_port = m_gdb_port->GetValue(); config.gpu_capture_dir = m_gpu_capture_dir->GetValue().utf8_string(); + config.framebuffer_fetch = m_framebuffer_fetch->IsChecked(); g_config.Save(); } @@ -1816,6 +1829,7 @@ void GeneralSettings2::ApplyConfig() m_crash_dump->SetSelection((int)config.crash_dump.GetValue()); m_gdb_port->SetValue(config.gdb_port.GetValue()); m_gpu_capture_dir->SetValue(wxHelper::FromUtf8(config.gpu_capture_dir.GetValue())); + m_framebuffer_fetch->SetValue(config.framebuffer_fetch); } void GeneralSettings2::OnAudioAPISelected(wxCommandEvent& event) diff --git a/src/gui/GeneralSettings2.h b/src/gui/GeneralSettings2.h index 54a782544..58459e958 100644 --- a/src/gui/GeneralSettings2.h +++ b/src/gui/GeneralSettings2.h @@ -80,6 +80,7 @@ class GeneralSettings2 : public wxDialog wxChoice* m_crash_dump; wxSpinCtrl* m_gdb_port; wxTextCtrl* m_gpu_capture_dir; + wxCheckBox* m_framebuffer_fetch; void OnAccountCreate(wxCommandEvent& event); void OnAccountDelete(wxCommandEvent& event); From f686a7b77d231a4d987e597c2a25eb137d600653 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 12 Jan 2025 12:15:08 +0100 Subject: [PATCH 316/368] use as_type instead of bitCast in shaders --- .../LatteDecompilerEmitMSL.cpp | 128 ++++++++++-------- 1 file changed, 72 insertions(+), 56 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 488cc2d58..26b94cc74 100644 --- 
a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -792,7 +792,7 @@ static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, L if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) { // need to convert (not cast) from int bits to float - src->add("bitCast("); + src->add("as_type("); // TODO: correct? } else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_FLOAT ) { @@ -862,7 +862,7 @@ static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, L src->add(_FormatFloatAsConstant(*(float*)&constVal)); } else - src->addFmt("bitCast(0x{:08x})", constVal); + src->addFmt("as_type(0x{:08x})", constVal); } } else if( GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel) ) @@ -909,14 +909,14 @@ void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, s return; StringBuf* src = shaderContext->shaderSource; if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("bitCast("); + src->add("as_type("); else if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) - src->add("bitCast("); - else if( sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_FLOAT ) - src->add("bitCast("); - else if( sourceType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add("as_type("); + else if (sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add("as_type("); + else if (sourceType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) src->add("int("); - else if( sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + else if (sourceType == 
LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) src->add("uint("); else cemu_assert_debug(false); @@ -1016,7 +1016,7 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte src->add(" = "); if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT ) debugBreakpoint(); // todo - src->add("bitCast(tempResultf)"); + src->add("as_type(tempResultf)"); src->add(";" _CRLF); } else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_INT ) @@ -1113,9 +1113,9 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(");" _CRLF); // INF becomes 0.0 - src->add("if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); // -INF becomes -0.0 - src->add("else if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); // assign result to output _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); @@ -1135,14 +1135,14 @@ static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderConte if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED) { // note: if( -INF < 0.0 ) does not resolve to true - src->add("if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF); - src->add("else if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF); + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF); + src->add("else if( isinf(tempResultf) == true && 
(as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF); } else if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF) { // untested (BotW bombs) - src->add("if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); - src->add("else if( isinf(tempResultf) == true && (bitCast(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); } // assign result to output _emitInstructionOutputVariableName(shaderContext, aluInstruction); @@ -1995,7 +1995,7 @@ static void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, Latt { _emitInstructionOutputVariableName(shaderContext, &aluInstruction); src->add(" = "); - src->add("bitCast(bitCast("); + src->add("as_type(as_type("); // TODO: correct? _emitInstructionOutputVariableName(shaderContext, &aluInstruction); src->add(")"); if( aluInstruction.omod == 1 ) @@ -2092,9 +2092,9 @@ static void _emitTEXSampleCoordInputComponent(LatteDecompilerShaderContext* shad if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) { if( elementSel == 4 ) - src->add("bitCast(0.0)"); + src->add("as_type(0.0)"); else if( elementSel == 5 ) - src->add("bitCast(1.0)"); + src->add("as_type(1.0)"); } else if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_FLOAT ) { @@ -2109,15 +2109,20 @@ static const char* _texGprAccessElemTable[8] = {"x","y","z","w","_","_","_","_"} static char* _getTexGPRAccess(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, uint32 dataType, sint8 selX, sint8 selY, sint8 selZ, sint8 selW, char* tempBuffer) { - // bitCast(R{}i.w) + // as_type(R{}i.w) *tempBuffer = '\0'; - uint8 elemCount = (selX > 0 ? 1 : 0) + (selY > 0 ? 1 : 0) + (selZ > 0 ? 1 : 0) + (selW > 0 ? 1 : 0); + uint8 elemCount = (selX >= 0 ? 
1 : 0) + (selY >= 0 ? 1 : 0) + (selZ >= 0 ? 1 : 0) + (selW >= 0 ? 1 : 0); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) { if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) ; // no conversion else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) - strcat(tempBuffer, "bitCast("); + { + if (elemCount == 1) + strcat(tempBuffer, "as_type("); + else + strcat(tempBuffer, ("as_type(").c_str()); + } else cemu_assert_unimplemented(); strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); @@ -2221,19 +2226,29 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex // integer samplers if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) // uint to int { - if(numWrittenElements == 1) + if (numWrittenElements == 1) src->add(" = int("); else shaderContext->shaderSource->addFmt(" = int{}(", numWrittenElements); } else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->add(" = bitCast("); + { + if (numWrittenElements == 1) + src->add(" = as_type("); + else + shaderContext->shaderSource->addFmt(" = as_type(", numWrittenElements); + } } else { // float samplers if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add(" = bitCast("); + { + if (numWrittenElements == 1) + src->add(" = as_type("); + else + shaderContext->shaderSource->addFmt(" = as_type(", numWrittenElements); + } else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->add(" = ("); } @@ -2725,14 +2740,14 @@ static void _emitTEXGetCompTexLodCode(LatteDecompilerShaderContext* shaderContex if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}{}), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), 
resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); else - src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, bitCast({}.{}{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, as_type({}.{}{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); } else { if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); else - src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, bitCast({}.{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, as_type({}.{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, 
texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); debugBreakpoint(); } } @@ -2766,7 +2781,7 @@ static void _emitTEXSetCubemapIndexCode(LatteDecompilerShaderContext* shaderCont const char* resultElemTable[4] = {"x","y","z","w"}; if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt(" = bitCast(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); + src->addFmt(" = as_type(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt(" = R{}f.{};" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); else @@ -2935,30 +2950,41 @@ static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, Latt _writeDestMaskXYZW(shaderContext, texInstruction->dstSel); const char* resultElemTable[4] = {"x","y","z","w"}; + uint32 numWrittenElements = 0; + for (sint32 f=0; f<4; f++) + { + if (texInstruction->dstSel[f] < 4) + numWrittenElements++; + } src->add(" = "); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("bitCast("); + { + if (numWrittenElements == 1) + src->add("as_type("); + else + src->addFmt("as_type(", numWrittenElements); + } else src->add("("); src->addFmt("ubuff{}.d[", texInstruction->textureFetch.textureIndex - 0x80); - if( shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) src->addFmt("{}.{}", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); else - src->addFmt("bitCast({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), 
resultElemTable[texInstruction->textureFetch.srcSel[0]]); + src->addFmt("as_type({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); src->add("]."); - for(sint32 f=0; f<4; f++) + for (sint32 f=0; f<4; f++) { - if( texInstruction->dstSel[f] < 4 ) + if (texInstruction->dstSel[f] < 4) { src->add(resultElemTable[texInstruction->dstSel[f]]); } - else if( texInstruction->dstSel[f] == 7 ) + else if (texInstruction->dstSel[f] == 7) { // masked and not written } @@ -2980,7 +3006,12 @@ static void _emitTEXReadMemCode(LatteDecompilerShaderContext* shaderContext, Lat src->add(" = "); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("bitCast("); + { + if (count == 1) + src->add("as_type("); + else + src->addFmt("as_type(", count); + } else src->add("("); @@ -3008,6 +3039,7 @@ static void _emitTEXReadMemCode(LatteDecompilerShaderContext* shaderContext, Lat { cemu_assert_unimplemented(); } + if (count < readCount) { if (count == 1) @@ -3796,22 +3828,6 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon "return as_type(clamp(as_type(v), 0.0, 1.0));\r\n" "}\r\n"); - // Bit cast - - // Scalar - fCStr_shaderSource->add("" - "template\r\n" - "ResultT bitCast(T x) {\r\n" - "return as_type(x);\r\n" - "}\r\n"); - - // Vector - fCStr_shaderSource->add("" - "template\r\n" - "vec bitCast(vec x) {\r\n" - "return as_type>(x);\r\n" - "}\r\n"); - // mul non-ieee way (0*NaN/INF => 0.0) if (shaderContext->options->strictMul) { @@ -3819,7 +3835,7 @@ void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderCon //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(a*b,0.0,a==0.0||b==0.0); }" STR_LINEBREAK); //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(vec2(a*b,0.0),vec2(0.0,0.0),(equal(vec2(a),vec2(0.0,0.0))||equal(vec2(b),vec2(0.0,0.0)))).x; }" STR_LINEBREAK); 
//fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" STR_LINEBREAK); - //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = bitCast(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = intBitsToFloat(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works // for "min" it used to be: float mul_nonIEEE(float a, float b){ return min(a*b,min(abs(a)*3.40282347E+38F,abs(b)*3.40282347E+38F)); } @@ -4273,7 +4289,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) src->addFmt("{} = int4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("{} = float4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: bitCast(float4(vid, 0, 0, iid))? + src->addFmt("{} = float4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: as_type(float4(vid, 0, 0, iid))? 
else cemu_assert_unimplemented(); } @@ -4316,7 +4332,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("{} = pointCoord.xyxy;" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); else - src->addFmt("{} = bitCast(pointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + src->addFmt("{} = as_type(pointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); } for (sint32 i = 0; i < psInputTable->count; i++) @@ -4333,12 +4349,12 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("{} = GET_FRAGCOORD();" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); else - src->addFmt("{} = bitCast(GET_FRAGCOORD());" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + src->addFmt("{} = as_type(GET_FRAGCOORD());" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); continue; } if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{} = bitCast(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + src->addFmt("{} = as_type(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); else @@ -4352,7 +4368,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (frontFace_allBits) cemu_assert_debug(false); if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("{}.{} = bitCast(frontFacing?1.0:0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), 
_getElementStrByIndex(frontFace_chan)); + src->addFmt("{}.{} = as_type(frontFacing ? 1.0 : 0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) src->addFmt("{}.{} = frontFacing ? 1.0 : 0.0;" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); else From 82c2d14828aff6e8a1d708c6342379854fed3654 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 12 Jan 2025 13:08:30 +0100 Subject: [PATCH 317/368] fix: device preference not getting loaded --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 2 +- src/config/CemuConfig.cpp | 2 +- src/config/CemuConfig.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index db79471d0..2eefce718 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -477,7 +477,7 @@ class MetalRenderer : public Renderer MetalPerformanceMonitor m_performanceMonitor; // Metal objects - MTL::Device* m_device; + MTL::Device* m_device = nullptr; MTL::CommandQueue* m_commandQueue; // Feature support diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index dc38647a3..7542dc310 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -214,7 +214,7 @@ void CemuConfig::Load(XMLConfigParser& parser) auto graphic = parser.get("Graphic"); graphic_api = graphic.get("api", kOpenGL); graphic.get("vkDevice", vk_graphic_device_uuid); - graphic.get("mtlDevice", mtl_graphic_device_uuid); + mtl_graphic_device_uuid = graphic.get("mtlDevice", 0); vsync = graphic.get("VSync", 0); gx2drawdone_sync = graphic.get("GX2DrawdoneSync", true); upscale_filter = graphic.get("UpscaleFilter", kBicubicHermiteFilter); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 
0990c6523..991d9a89e 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -464,9 +464,9 @@ struct CemuConfig // graphics ConfigValue graphic_api{ kVulkan }; std::array vk_graphic_device_uuid; - uint64 mtl_graphic_device_uuid{0}; + uint64 mtl_graphic_device_uuid{ 0 }; ConfigValue vsync{ 0 }; // 0 = off, 1+ = depending on render backend - ConfigValue gx2drawdone_sync {true}; + ConfigValue gx2drawdone_sync { true }; ConfigValue render_upside_down{ false }; ConfigValue async_compile{ true }; From a28f9d65fc4dfe8934d856b368d4d6954be74892 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 12 Jan 2025 13:45:38 +0100 Subject: [PATCH 318/368] fix: incorrect texture view pointers --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 2eefce718..760ad6bc2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -125,7 +125,7 @@ struct MetalState MetalActiveFBOState m_lastUsedFBO; size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS]; - class LatteTextureViewMtl* m_textures[LATTE_NUM_MAX_TEX_UNITS] = {nullptr}; + class LatteTextureViewMtl* m_textures[LATTE_NUM_MAX_TEX_UNITS * 3] = {nullptr}; size_t m_uniformBufferOffsets[METAL_GENERAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; MTL::Viewport m_viewport; From 388bbe25865c258d624c2b93da9148845edb8211 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 12 Jan 2025 17:04:21 +0100 Subject: [PATCH 319/368] release command buffers --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index dc3b8ae04..f014fed21 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp 
@@ -1840,7 +1840,7 @@ void MetalRenderer::ProcessFinishedCommandBuffers() if (CommandBufferCompleted(commandBuffer)) { m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer); - //commandBuffer->release(); + commandBuffer->release(); it = m_executingCommandBuffers.erase(it); atLeastOneCompleted = true; } From aec7a182cefcacd1a9ea64e07b675635ad4ffc47 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 12 Jan 2025 18:02:36 +0100 Subject: [PATCH 320/368] remove useless texture member variables --- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h | 11 ----------- .../Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp | 4 ++-- .../HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp | 2 +- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 8 ++++---- 5 files changed, 8 insertions(+), 19 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index a64a2f265..f3bd14b95 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -8,7 +8,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) - : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer), m_format(format), m_isDepth(isDepth) + : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer) { MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); desc->setStorageMode(MTL::StorageModePrivate); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h 
b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h index 81942dfaf..884a5c5b0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h @@ -17,14 +17,6 @@ class LatteTextureMtl : public LatteTexture return m_texture; } - Latte::E_GX2SURFFMT GetFormat() const { - return m_format; - } - - bool IsDepth() const { - return m_isDepth; - } - void AllocateOnHost() override; protected: @@ -34,7 +26,4 @@ class LatteTextureMtl : public LatteTexture class MetalRenderer* m_mtlr; MTL::Texture* m_texture; - - Latte::E_GX2SURFFMT m_format; - bool m_isDepth; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index 05b579e70..405c49df4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -19,8 +19,8 @@ void LatteTextureReadbackInfoMtl::StartTransfer() cemu_assert_debug(m_textureView->firstMip == 0); cemu_assert_debug(m_textureView->baseTexture->dim != Latte::E_DIM::DIM_3D); - size_t bytesPerRow = GetMtlTextureBytesPerRow(baseTexture->format, baseTexture->IsDepth(), baseTexture->width); - size_t bytesPerImage = GetMtlTextureBytesPerImage(baseTexture->format, baseTexture->IsDepth(), baseTexture->height, bytesPerRow); + size_t bytesPerRow = GetMtlTextureBytesPerRow(baseTexture->format, baseTexture->isDepth, baseTexture->width); + size_t bytesPerImage = GetMtlTextureBytesPerImage(baseTexture->format, baseTexture->isDepth, baseTexture->height, bytesPerRow); auto blitCommandEncoder = m_mtlr->GetBlitCommandEncoder(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp index aa4481061..a06b11f02 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp @@ -184,7 +184,7 @@ MTL::Texture* 
LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle) levelCount = std::min(levelCount, m_baseTexture->maxPossibleMipLevels - baseLevel); levelCount = std::max(levelCount, (uint32)1); - auto pixelFormat = GetMtlPixelFormat(format, m_baseTexture->IsDepth()); + auto pixelFormat = GetMtlPixelFormat(format, m_baseTexture->isDepth); MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount), swizzle); return texture; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f014fed21..9e8b90a12 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -351,8 +351,8 @@ void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padV int width, height; texMtl->GetEffectiveSize(width, height, 0); - uint32 bytesPerRow = GetMtlTextureBytesPerRow(texMtl->format, texMtl->IsDepth(), width); - uint32 size = GetMtlTextureBytesPerImage(texMtl->format, texMtl->IsDepth(), height, bytesPerRow); + uint32 bytesPerRow = GetMtlTextureBytesPerRow(texMtl->format, texMtl->isDepth, width); + uint32 size = GetMtlTextureBytesPerImage(texMtl->format, texMtl->isDepth, height, bytesPerRow); // TODO: get a buffer from the memory manager MTL::Buffer* buffer = m_device->newBuffer(size, MTL::ResourceStorageModeShared); @@ -670,9 +670,9 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s sliceIndex = 0; } - size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->GetFormat(), textureMtl->IsDepth(), width); + size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->format, textureMtl->isDepth, width); // No need to set bytesPerImage for 3D textures, since we always load just one slice - //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->IsDepth(), height, 
bytesPerRow); + //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->isDepth, height, bytesPerRow); //if (m_isAppleGPU) //{ // textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); From 21bc5f247bdc3ce28fd9b4386e7e0a79a5ae1755 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 12 Jan 2025 18:05:24 +0100 Subject: [PATCH 321/368] add texture copy block size workaround --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 9e8b90a12..83c39b5ca 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -762,6 +762,18 @@ void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint3 void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth_) { + // Source size seems to apply to the destination texture as well, therefore we need to adjust it when block size doesn't match + Uvec2 srcBlockTexelSize = GetMtlPixelFormatInfo(src->format, src->isDepth).blockTexelSize; + Uvec2 dstBlockTexelSize = GetMtlPixelFormatInfo(dst->format, dst->isDepth).blockTexelSize; + if (srcBlockTexelSize.x != dstBlockTexelSize.x || srcBlockTexelSize.y != dstBlockTexelSize.y) + { + uint32 multX = (srcBlockTexelSize.x > dstBlockTexelSize.x ? srcBlockTexelSize.x / dstBlockTexelSize.x : dstBlockTexelSize.x / srcBlockTexelSize.x); + effectiveCopyWidth *= multX; + + uint32 multY = (srcBlockTexelSize.y > dstBlockTexelSize.y ? 
srcBlockTexelSize.y / dstBlockTexelSize.y : dstBlockTexelSize.y / srcBlockTexelSize.y); + effectiveCopyHeight *= multY; + } + auto blitCommandEncoder = GetBlitCommandEncoder(); auto mtlSrc = static_cast(src)->GetTexture(); From 1e3a3ef2989299fd0993316082f7d1b26a27dcfa Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 15 Jan 2025 14:27:38 +0100 Subject: [PATCH 322/368] prepare for AIR cache --- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 5 +- .../HW/Latte/Renderer/Metal/MetalCommon.h | 8 ++ .../Renderer/Metal/RendererShaderMtl.cpp | 92 ++++++++++++++++++- .../Latte/Renderer/Metal/RendererShaderMtl.h | 2 + 4 files changed, 101 insertions(+), 6 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index f3bd14b95..3c0005efc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -2,9 +2,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "Common/precompiled.h" -#include "Metal/MTLResource.hpp" -#include "Metal/MTLTexture.hpp" LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) @@ -12,7 +9,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM { MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); desc->setStorageMode(MTL::StorageModePrivate); - desc->setCpuCacheMode(MTL::CPUCacheModeWriteCombined); + //desc->setCpuCacheMode(MTL::CPUCacheModeWriteCombined); sint32 effectiveBaseWidth = width; sint32 effectiveBaseHeight = height; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 952fd1de9..f3dd17332 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -101,3 +101,11 @@ inline bool FormatIsRenderable(Latte::E_GX2SURFFMT format) { return !Latte::IsCompressedFormat(format); } + +template +inline void executeCommand(fmt::format_string fmt, T&&... args) { + std::string command = fmt::format(fmt, std::forward(args)...); + int res = system(command.c_str()); + if (res != 0) + cemuLog_log(LogType::Force, "command \"{}\" failed with exit code {}", command, res); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index c4492e3c4..dc1256a4b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -2,14 +2,21 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" -//#include "Cemu/FileCache/FileCache.h" -//#include "config/ActiveSettings.h" +#include "Cemu/FileCache/FileCache.h" +#include "config/ActiveSettings.h" #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" #include "GameProfile/GameProfile.h" #include "util/helpers/helpers.h" +#define METAL_AIR_CACHE_NAME "Cemu_AIR_cache" +#define METAL_AIR_CACHE_PATH "/Volumes/" METAL_AIR_CACHE_NAME +#define METAL_AIR_CACHE_SIZE (512 * 1024 * 1024) +#define METAL_AIR_CACHE_BLOCK_COUNT (METAL_AIR_CACHE_SIZE / 512) + static bool s_isLoadingShadersMtl{false}; +static std::atomic s_hasRAMFilesystem{false}; +class FileCache* s_airCache{nullptr}; extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; @@ -88,12 +95,44 @@ class ShaderMtlThreadPool // TODO: find out if it would be possible to cache compiled Metal shaders void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) { + s_isLoadingShadersMtl = true; + + // Open 
AIR cache + if (s_airCache) + { + delete s_airCache; + s_airCache = nullptr; + } + uint32 airCacheMagic = GeneratePrecompiledCacheId(); + const std::string cacheFilename = fmt::format("{:016x}_air.bin", cacheTitleId); + const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}", cacheFilename); + s_airCache = FileCache::Open(cachePath, true, airCacheMagic); + if (!s_airCache) + cemuLog_log(LogType::Force, "Unable to open AIR cache {}", cacheFilename); + // Maximize shader compilation speed static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(true); } void RendererShaderMtl::ShaderCacheLoading_end() { + s_isLoadingShadersMtl = false; + + // Close the AIR cache + if (s_airCache) + { + delete s_airCache; + s_airCache = nullptr; + } + + // Close RAM filesystem + if (s_hasRAMFilesystem) + { + executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); + s_hasRAMFilesystem = false; + } + + // Reset shader compilation speed static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(false); } @@ -174,6 +213,49 @@ bool RendererShaderMtl::ShouldCountCompilation() const void RendererShaderMtl::CompileInternal() { + // First, try to retrieve the compiled shader from the AIR cache + if (s_isLoadingShadersMtl && (m_isGameShader && !m_isGfxPackShader) && s_airCache) + { + cemu_assert_debug(m_baseHash != 0); + uint64 h1, h2; + GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); + std::vector cacheFileData; + if (s_airCache->GetFile({ h1, h2 }, cacheFileData)) + { + CompileFromAIR(std::span(cacheFileData.data(), cacheFileData.size())); + FinishCompilation(); + } + else + { + // Ensure that RAM filesystem exists + if (!s_hasRAMFilesystem) + { + s_hasRAMFilesystem = true; + executeCommand("diskutil erasevolume HFS+ {} $(hdiutil attach -nomount ram://{})", METAL_AIR_CACHE_NAME, METAL_AIR_CACHE_BLOCK_COUNT); + } + + // The shader is not in the cache, compile it + std::string filename = fmt::format("{}_{}", 
h1, h2); + // TODO: store the source + executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal", filename, filename); + executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", filename, filename); + // TODO: clean up + + // Load from the newly Generated AIR + // std::span airData = ; + //CompileFromAIR(std::span((uint8*)cacheFileData.data(), cacheFileData.size() / sizeof(uint8))); + FinishCompilation(); + + // Store in the cache + uint64 h1, h2; + GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); + //s_airCache->AddFile({ h1, h2 }, airData.data(), airData.size()); + } + + return; + } + + // Compile from source MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); // TODO: always disable fast math for problematic shaders if (g_current_game_profile->GetFastMath()) @@ -200,6 +282,12 @@ void RendererShaderMtl::CompileInternal() g_compiled_shaders_total++; } +void RendererShaderMtl::CompileFromAIR(std::span data) +{ + // TODO: implement this + printf("LOADING SHADER FROM AIR CACHE\n"); +} + void RendererShaderMtl::FinishCompilation() { m_mslCode.clear(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index 40d04c870..98d186875 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -69,5 +69,7 @@ class RendererShaderMtl : public RendererShader void CompileInternal(); + void CompileFromAIR(std::span data); + void FinishCompilation(); }; From 3d84b78362e5abce03ec5008f805cb031ddfcc26 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 15 Jan 2025 14:47:47 +0100 Subject: [PATCH 323/368] support multiple components in type conversion --- .../LatteDecompilerEmitMSL.cpp | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 26b94cc74..e14d494a3 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -33,7 +33,7 @@ void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext */ // local prototypes -void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType); +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType, sint32 componentCount = 1); void _emitTypeConversionSuffixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType); void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, bool isSubroutine); @@ -903,21 +903,32 @@ static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, L _emitTypeConversionSuffixMSL(shaderContext, requiredType, requiredTypeOut); } -void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType) +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType, sint32 componentCount) { if( sourceType == destinationType ) return; StringBuf* src = shaderContext->shaderSource; - if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("as_type("); - else if (sourceType == LATTE_DECOMPILER_DTYPE_FLOAT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) - src->add("as_type("); - else if (sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->add("as_type("); - else if (sourceType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - 
src->add("int("); - else if (sourceType == LATTE_DECOMPILER_DTYPE_SIGNED_INT && destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) - src->add("uint("); + if (destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (componentCount == 1) + src->add("as_type("); + else + src->addFmt("as_type(", componentCount); + } + else if (destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + { + if (componentCount == 1) + src->add("as_type("); + else + src->addFmt("as_type(", componentCount); + } + else if (destinationType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (componentCount == 1) + src->add("as_type("); + else + src->addFmt("as_type(", componentCount); + } else cemu_assert_debug(false); } @@ -2880,7 +2891,7 @@ static void _emitGSReadInputVFetchCode(LatteDecompilerShaderContext* shaderConte } src->add(" = "); - _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType, numWrittenElements); src->add("(objectPayload.vertexOut["); if (texInstruction->textureFetch.srcSel[0] >= 4) cemu_assert_unimplemented(); @@ -2895,7 +2906,6 @@ static void _emitGSReadInputVFetchCode(LatteDecompilerShaderContext* shaderConte if( texInstruction->dstSel[f] < 4 ) { src->add(resultElemTable[texInstruction->dstSel[f]]); - numWrittenElements++; } else if( texInstruction->dstSel[f] == 7 ) { From a8da524dd4232ab7399cf1d63b3fc37f4b22f2ff Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 15 Jan 2025 17:40:45 +0100 Subject: [PATCH 324/368] use precompiled shaders --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 50 ++++++++ .../Renderer/Metal/RendererShaderMtl.cpp | 121 ++++++++++++------ .../Latte/Renderer/Metal/RendererShaderMtl.h | 6 +- 3 files changed, 136 insertions(+), 41 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 
f3dd17332..28d922251 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -109,3 +109,53 @@ inline void executeCommand(fmt::format_string fmt, T&&... args) { if (res != 0) cemuLog_log(LogType::Force, "command \"{}\" failed with exit code {}", command, res); } + +class MemoryMappedFile +{ +public: + MemoryMappedFile(const std::string& filePath) + { + // Open the file + m_fd = open(filePath.c_str(), O_RDONLY); + if (m_fd == -1) { + cemuLog_log(LogType::Force, "failed to open file: {}", filePath); + return; + } + + // Get the file size + struct stat fileStat; + if (fstat(m_fd, &fileStat) == -1) + { + close(m_fd); + cemuLog_log(LogType::Force, "failed to get file size: {}", filePath); + return; + } + m_fileSize = fileStat.st_size; + + // Memory map the file + m_data = mmap(nullptr, m_fileSize, PROT_READ, MAP_PRIVATE, m_fd, 0); + if (m_data == MAP_FAILED) + { + close(m_fd); + cemuLog_log(LogType::Force, "failed to memory map file: {}", filePath); + return; + } + } + + ~MemoryMappedFile() + { + if (m_data && m_data != MAP_FAILED) + munmap(m_data, m_fileSize); + + if (m_fd != -1) + close(m_fd); + } + + uint8* data() const { return static_cast(m_data); } + size_t size() const { return m_fileSize; } + +private: + int m_fd = -1; + void* m_data = nullptr; + size_t m_fileSize = 0; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index dc1256a4b..e6aa0cabe 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -128,7 +128,7 @@ void RendererShaderMtl::ShaderCacheLoading_end() // Close RAM filesystem if (s_hasRAMFilesystem) { - executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); + //executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); s_hasRAMFilesystem = false; } @@ -211,8 +211,50 @@ bool RendererShaderMtl::ShouldCountCompilation() const return 
!s_isLoadingShadersMtl && m_isGameShader; } +MTL::Library* RendererShaderMtl::LibraryFromSource() +{ + // Compile from source + MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); + // TODO: always disable fast math for problematic shaders + if (g_current_game_profile->GetFastMath()) + options->setFastMathEnabled(true); + if (g_current_game_profile->GetPositionInvariance()) + options->setPreserveInvariance(true); + + NS::Error* error = nullptr; + MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(m_mslCode), options, &error); + options->release(); + FinishCompilation(); + if (error) + { + cemuLog_log(LogType::Force, "failed to create library from source: {} -> {}", error->localizedDescription()->utf8String(), m_mslCode.c_str()); + return nullptr; + } + + return library; +} + +MTL::Library* RendererShaderMtl::LibraryFromAIR(std::span data) +{ + dispatch_data_t dispatchData = dispatch_data_create(data.data(), data.size(), nullptr, DISPATCH_DATA_DESTRUCTOR_DEFAULT); + + NS::Error* error = nullptr; + MTL::Library* library = m_mtlr->GetDevice()->newLibrary(dispatchData, &error); + FinishCompilation(); + printf("AIR size: %zu\n", data.size()); + if (error) + { + cemuLog_log(LogType::Force, "failed to create library from AIR: {}", error->localizedDescription()->utf8String()); + return nullptr; + } + + return library; +} + void RendererShaderMtl::CompileInternal() { + MTL::Library* library; + // First, try to retrieve the compiled shader from the AIR cache if (s_isLoadingShadersMtl && (m_isGameShader && !m_isGfxPackShader) && s_airCache) { @@ -222,72 +264,73 @@ void RendererShaderMtl::CompileInternal() std::vector cacheFileData; if (s_airCache->GetFile({ h1, h2 }, cacheFileData)) { - CompileFromAIR(std::span(cacheFileData.data(), cacheFileData.size())); - FinishCompilation(); + library = LibraryFromAIR(std::span(cacheFileData.data(), cacheFileData.size())); } else { // Ensure that RAM filesystem exists + static std::atomic 
s_creatingRAMFilesystem{false}; if (!s_hasRAMFilesystem) { + if (s_creatingRAMFilesystem) + { + while (!s_hasRAMFilesystem) + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + else + { + s_creatingRAMFilesystem = true; + executeCommand("diskutil erasevolume HFS+ {} $(hdiutil attach -nomount ram://{})", METAL_AIR_CACHE_NAME, METAL_AIR_CACHE_BLOCK_COUNT); + s_creatingRAMFilesystem = false; + } s_hasRAMFilesystem = true; - executeCommand("diskutil erasevolume HFS+ {} $(hdiutil attach -nomount ram://{})", METAL_AIR_CACHE_NAME, METAL_AIR_CACHE_BLOCK_COUNT); } // The shader is not in the cache, compile it - std::string filename = fmt::format("{}_{}", h1, h2); - // TODO: store the source - executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal", filename, filename); - executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", filename, filename); - // TODO: clean up + std::string baseFilename = fmt::format("{}/{}_{}", METAL_AIR_CACHE_PATH, h1, h2); + + // Source + std::ofstream mslFile; + mslFile.open(fmt::format("{}.metal", baseFilename)); + mslFile << m_mslCode; + mslFile.close(); + + // Compile + executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal -Wno-unused-variable -Wno-sign-compare", baseFilename, baseFilename); + executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", baseFilename, baseFilename); + + // Clean up + executeCommand("rm {}.metal", baseFilename); + executeCommand("rm {}.ir", baseFilename); // Load from the newly Generated AIR - // std::span airData = ; - //CompileFromAIR(std::span((uint8*)cacheFileData.data(), cacheFileData.size() / sizeof(uint8))); - FinishCompilation(); + MemoryMappedFile airFile(fmt::format("{}.metallib", baseFilename)); + std::span airData = std::span(airFile.data(), airFile.size()); + library = LibraryFromAIR(std::span(cacheFileData.data(), cacheFileData.size())); // Store in the cache uint64 h1, h2; GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); - 
//s_airCache->AddFile({ h1, h2 }, airData.data(), airData.size()); + s_airCache->AddFile({ h1, h2 }, airData.data(), airData.size()); } - - return; } - - // Compile from source - MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); - // TODO: always disable fast math for problematic shaders - if (g_current_game_profile->GetFastMath()) - options->setFastMathEnabled(true); - if (g_current_game_profile->GetPositionInvariance()) - options->setPreserveInvariance(true); - - NS::Error* error = nullptr; - MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(m_mslCode), options, &error); - options->release(); - if (error) + else { - cemuLog_log(LogType::Force, "failed to create library: {} -> {}", error->localizedDescription()->utf8String(), m_mslCode.c_str()); - FinishCompilation(); - return; + // Compile from source + library = LibraryFromSource(); } + + if (!library) + return; + m_function = library->newFunction(ToNSString("main0")); library->release(); - FinishCompilation(); - // Count shader compilation if (ShouldCountCompilation()) g_compiled_shaders_total++; } -void RendererShaderMtl::CompileFromAIR(std::span data) -{ - // TODO: implement this - printf("LOADING SHADER FROM AIR CACHE\n"); -} - void RendererShaderMtl::FinishCompilation() { m_mslCode.clear(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index 98d186875..98973a0e0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -67,9 +67,11 @@ class RendererShaderMtl : public RendererShader bool ShouldCountCompilation() const; - void CompileInternal(); + MTL::Library* LibraryFromSource(); + + MTL::Library* LibraryFromAIR(std::span data); - void CompileFromAIR(std::span data); + void CompileInternal(); void FinishCompilation(); }; From 371c08992362e4831f941eab0c94b61f59171738 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 15 Jan 2025 
19:25:29 +0100 Subject: [PATCH 325/368] fix: AIR shaders not getting loaded properly --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 23 +++++++++++++++---- .../Renderer/Metal/RendererShaderMtl.cpp | 11 ++++----- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 28d922251..2543a9fc3 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -123,14 +123,27 @@ class MemoryMappedFile } // Get the file size + // Use a loop to handle the case where the file size is 0 (more of a safety net) struct stat fileStat; - if (fstat(m_fd, &fileStat) == -1) + while (true) { - close(m_fd); - cemuLog_log(LogType::Force, "failed to get file size: {}", filePath); - return; + if (fstat(m_fd, &fileStat) == -1) + { + close(m_fd); + cemuLog_log(LogType::Force, "failed to get file size: {}", filePath); + return; + } + m_fileSize = fileStat.st_size; + + if (m_fileSize == 0) + { + cemuLog_logOnce(LogType::Force, "file size is 0: {}", filePath); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + continue; + } + + break; } - m_fileSize = fileStat.st_size; // Memory map the file m_data = mmap(nullptr, m_fileSize, PROT_READ, MAP_PRIVATE, m_fd, 0); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index e6aa0cabe..303ba1e98 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -29,7 +29,7 @@ class ShaderMtlThreadPool if (m_threadsActive.exchange(true)) return; // create thread pool - const uint32 threadCount = 2; + const uint32 threadCount = 8; for (uint32 i = 0; i < threadCount; ++i) s_threads.emplace_back(&ShaderMtlThreadPool::CompilerThreadFunc, this); } @@ -128,7 +128,7 @@ void RendererShaderMtl::ShaderCacheLoading_end() // Close RAM filesystem if 
(s_hasRAMFilesystem) { - //executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); + executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); s_hasRAMFilesystem = false; } @@ -241,7 +241,6 @@ MTL::Library* RendererShaderMtl::LibraryFromAIR(std::span data) NS::Error* error = nullptr; MTL::Library* library = m_mtlr->GetDevice()->newLibrary(dispatchData, &error); FinishCompilation(); - printf("AIR size: %zu\n", data.size()); if (error) { cemuLog_log(LogType::Force, "failed to create library from AIR: {}", error->localizedDescription()->utf8String()); @@ -296,17 +295,17 @@ void RendererShaderMtl::CompileInternal() mslFile.close(); // Compile - executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal -Wno-unused-variable -Wno-sign-compare", baseFilename, baseFilename); + executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal -w", baseFilename, baseFilename); executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", baseFilename, baseFilename); // Clean up executeCommand("rm {}.metal", baseFilename); executeCommand("rm {}.ir", baseFilename); - // Load from the newly Generated AIR + // Load from the newly generated AIR MemoryMappedFile airFile(fmt::format("{}.metallib", baseFilename)); std::span airData = std::span(airFile.data(), airFile.size()); - library = LibraryFromAIR(std::span(cacheFileData.data(), cacheFileData.size())); + library = LibraryFromAIR(std::span(airData.data(), airData.size())); // Store in the cache uint64 h1, h2; From f5eb184969361be46f68dbcd9cd55a266d2ac7a8 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 15 Jan 2025 19:30:46 +0100 Subject: [PATCH 326/368] check if shaders compiled successfully --- src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h | 7 ++++++- src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 8 +++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index 2543a9fc3..a03e7cae2 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -103,11 +103,16 @@ inline bool FormatIsRenderable(Latte::E_GX2SURFFMT format) } template -inline void executeCommand(fmt::format_string fmt, T&&... args) { +inline bool executeCommand(fmt::format_string fmt, T&&... args) { std::string command = fmt::format(fmt, std::forward(args)...); int res = system(command.c_str()); if (res != 0) + { cemuLog_log(LogType::Force, "command \"{}\" failed with exit code {}", command, res); + return false; + } + + return true; } class MemoryMappedFile diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 303ba1e98..810c827b0 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -29,7 +29,7 @@ class ShaderMtlThreadPool if (m_threadsActive.exchange(true)) return; // create thread pool - const uint32 threadCount = 8; + const uint32 threadCount = 2; for (uint32 i = 0; i < threadCount; ++i) s_threads.emplace_back(&ShaderMtlThreadPool::CompilerThreadFunc, this); } @@ -295,8 +295,10 @@ void RendererShaderMtl::CompileInternal() mslFile.close(); // Compile - executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal -w", baseFilename, baseFilename); - executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", baseFilename, baseFilename); + if (!executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal -w", baseFilename, baseFilename)) + return; + if (!executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", baseFilename, baseFilename)) + return; // Clean up executeCommand("rm {}.metal", baseFilename); From 5af904b5e2630ac1b34885e4d51f4c994a62f497 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Wed, 15 Jan 2025 19:45:59 +0100 Subject: [PATCH 327/368] delete unused metallibs --- src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 810c827b0..0ef4c5b78 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -11,7 +11,7 @@ #define METAL_AIR_CACHE_NAME "Cemu_AIR_cache" #define METAL_AIR_CACHE_PATH "/Volumes/" METAL_AIR_CACHE_NAME -#define METAL_AIR_CACHE_SIZE (512 * 1024 * 1024) +#define METAL_AIR_CACHE_SIZE (16 * 1024 * 1024) #define METAL_AIR_CACHE_BLOCK_COUNT (METAL_AIR_CACHE_SIZE / 512) static bool s_isLoadingShadersMtl{false}; @@ -313,6 +313,9 @@ void RendererShaderMtl::CompileInternal() uint64 h1, h2; GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); s_airCache->AddFile({ h1, h2 }, airData.data(), airData.size()); + + // Clean up + executeCommand("rm {}.metallib", baseFilename); } } else From 0b1932c206c64adba37070aea9e4c9c10c4801e0 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 16 Jan 2025 15:53:51 +0100 Subject: [PATCH 328/368] compile shaders to AIR at runtime --- .../Renderer/Metal/RendererShaderMtl.cpp | 172 ++++++++++-------- .../Latte/Renderer/Metal/RendererShaderMtl.h | 2 + 2 files changed, 101 insertions(+), 73 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 0ef4c5b78..09a254998 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -15,7 +15,7 @@ #define METAL_AIR_CACHE_BLOCK_COUNT (METAL_AIR_CACHE_SIZE / 512) static bool s_isLoadingShadersMtl{false}; -static std::atomic s_hasRAMFilesystem{false}; +static bool s_hasRAMFilesystem{false}; class FileCache* s_airCache{nullptr}; extern std::atomic_int g_compiled_shaders_total; @@ -28,10 +28,14 @@ class ShaderMtlThreadPool { if (m_threadsActive.exchange(true)) return; - // create thread pool + + // 
Create thread pool const uint32 threadCount = 2; for (uint32 i = 0; i < threadCount; ++i) s_threads.emplace_back(&ShaderMtlThreadPool::CompilerThreadFunc, this); + + // Create AIR cache thread + s_airCacheThread = new std::thread(&ShaderMtlThreadPool::AIRCacheThreadFunc, this); } void StopThreads() @@ -43,6 +47,9 @@ class ShaderMtlThreadPool for (auto& it : s_threads) it.join(); s_threads.clear(); + + s_airCacheThread->join(); + delete s_airCacheThread; } ~ShaderMtlThreadPool() @@ -79,15 +86,48 @@ class ShaderMtlThreadPool } } + void AIRCacheThreadFunc() + { + SetThreadName("mtlAIRCache"); + while (m_threadsActive.load(std::memory_order::relaxed)) + { + s_airCacheQueueCount.decrementWithWait(); + s_airCacheQueueMutex.lock(); + if (s_airCacheQueue.empty()) + { + s_airCacheQueueMutex.unlock(); + continue; + } + + // Create RAM filesystem + if (!s_hasRAMFilesystem) + { + executeCommand("diskutil erasevolume HFS+ {} $(hdiutil attach -nomount ram://{})", METAL_AIR_CACHE_NAME, METAL_AIR_CACHE_BLOCK_COUNT); + s_hasRAMFilesystem = true; + } + + RendererShaderMtl* job = s_airCacheQueue.front(); + s_airCacheQueue.pop_front(); + s_airCacheQueueMutex.unlock(); + // compile + job->CompileToAIR(); + } + } + bool HasThreadsRunning() const { return m_threadsActive; } public: std::vector s_threads; + std::thread* s_airCacheThread{nullptr}; std::deque s_compilationQueue; CounterSemaphore s_compilationQueueCount; std::mutex s_compilationQueueMutex; + std::deque s_airCacheQueue; + CounterSemaphore s_airCacheQueueCount; + std::mutex s_airCacheQueueMutex; + private: std::atomic m_threadsActive; } shaderMtlThreadPool; @@ -118,6 +158,12 @@ void RendererShaderMtl::ShaderCacheLoading_end() { s_isLoadingShadersMtl = false; + // Reset shader compilation speed + static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(false); +} + +void RendererShaderMtl::ShaderCacheLoading_Close() +{ // Close the AIR cache if (s_airCache) { @@ -127,18 +173,7 @@ void 
RendererShaderMtl::ShaderCacheLoading_end() // Close RAM filesystem if (s_hasRAMFilesystem) - { executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); - s_hasRAMFilesystem = false; - } - - // Reset shader compilation speed - static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(false); -} - -void RendererShaderMtl::ShaderCacheLoading_Close() -{ - // Do nothing } void RendererShaderMtl::Initialize() @@ -215,7 +250,6 @@ MTL::Library* RendererShaderMtl::LibraryFromSource() { // Compile from source MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); - // TODO: always disable fast math for problematic shaders if (g_current_game_profile->GetFastMath()) options->setFastMathEnabled(true); if (g_current_game_profile->GetPositionInvariance()) @@ -224,7 +258,6 @@ MTL::Library* RendererShaderMtl::LibraryFromSource() NS::Error* error = nullptr; MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(m_mslCode), options, &error); options->release(); - FinishCompilation(); if (error) { cemuLog_log(LogType::Force, "failed to create library from source: {} -> {}", error->localizedDescription()->utf8String(), m_mslCode.c_str()); @@ -240,7 +273,6 @@ MTL::Library* RendererShaderMtl::LibraryFromAIR(std::span data) NS::Error* error = nullptr; MTL::Library* library = m_mtlr->GetDevice()->newLibrary(dispatchData, &error); - FinishCompilation(); if (error) { cemuLog_log(LogType::Force, "failed to create library from AIR: {}", error->localizedDescription()->utf8String()); @@ -252,7 +284,7 @@ MTL::Library* RendererShaderMtl::LibraryFromAIR(std::span data) void RendererShaderMtl::CompileInternal() { - MTL::Library* library; + MTL::Library* library = nullptr; // First, try to retrieve the compiled shader from the AIR cache if (s_isLoadingShadersMtl && (m_isGameShader && !m_isGfxPackShader) && s_airCache) @@ -264,69 +296,25 @@ void RendererShaderMtl::CompileInternal() if (s_airCache->GetFile({ h1, h2 }, cacheFileData)) { library = 
LibraryFromAIR(std::span(cacheFileData.data(), cacheFileData.size())); - } - else - { - // Ensure that RAM filesystem exists - static std::atomic s_creatingRAMFilesystem{false}; - if (!s_hasRAMFilesystem) - { - if (s_creatingRAMFilesystem) - { - while (!s_hasRAMFilesystem) - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } - else - { - s_creatingRAMFilesystem = true; - executeCommand("diskutil erasevolume HFS+ {} $(hdiutil attach -nomount ram://{})", METAL_AIR_CACHE_NAME, METAL_AIR_CACHE_BLOCK_COUNT); - s_creatingRAMFilesystem = false; - } - s_hasRAMFilesystem = true; - } - - // The shader is not in the cache, compile it - std::string baseFilename = fmt::format("{}/{}_{}", METAL_AIR_CACHE_PATH, h1, h2); - - // Source - std::ofstream mslFile; - mslFile.open(fmt::format("{}.metal", baseFilename)); - mslFile << m_mslCode; - mslFile.close(); - - // Compile - if (!executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal -w", baseFilename, baseFilename)) - return; - if (!executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", baseFilename, baseFilename)) - return; - - // Clean up - executeCommand("rm {}.metal", baseFilename); - executeCommand("rm {}.ir", baseFilename); - - // Load from the newly generated AIR - MemoryMappedFile airFile(fmt::format("{}.metallib", baseFilename)); - std::span airData = std::span(airFile.data(), airFile.size()); - library = LibraryFromAIR(std::span(airData.data(), airData.size())); - - // Store in the cache - uint64 h1, h2; - GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); - s_airCache->AddFile({ h1, h2 }, airData.data(), airData.size()); - - // Clean up - executeCommand("rm {}.metallib", baseFilename); + FinishCompilation(); } } - else + + // Not in the cache, compile from source + if (!library) { // Compile from source library = LibraryFromSource(); + if (!library) + return; + + // Store in the AIR cache + shaderMtlThreadPool.s_airCacheQueueMutex.lock(); + 
shaderMtlThreadPool.s_airCacheQueue.push_back(this); + shaderMtlThreadPool.s_airCacheQueueCount.increment(); + shaderMtlThreadPool.s_airCacheQueueMutex.unlock(); } - if (!library) - return; - m_function = library->newFunction(ToNSString("main0")); library->release(); @@ -335,6 +323,44 @@ void RendererShaderMtl::CompileInternal() g_compiled_shaders_total++; } +void RendererShaderMtl::CompileToAIR() +{ + uint64 h1, h2; + GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); + + // The shader is not in the cache, compile it + std::string baseFilename = fmt::format("{}/{}_{}", METAL_AIR_CACHE_PATH, h1, h2); + + // Source + std::ofstream mslFile; + mslFile.open(fmt::format("{}.metal", baseFilename)); + mslFile << m_mslCode; + mslFile.close(); + + // Compile + if (!executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal -w", baseFilename, baseFilename)) + return; + if (!executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", baseFilename, baseFilename)) + return; + + // Clean up + executeCommand("rm {}.metal", baseFilename); + executeCommand("rm {}.ir", baseFilename); + + // Load from the newly generated AIR + MemoryMappedFile airFile(fmt::format("{}.metallib", baseFilename)); + std::span airData = std::span(airFile.data(), airFile.size()); + //library = LibraryFromAIR(std::span(airData.data(), airData.size())); + + // Store in the cache + s_airCache->AddFile({ h1, h2 }, airData.data(), airData.size()); + + // Clean up + executeCommand("rm {}.metallib", baseFilename); + + FinishCompilation(); +} + void RendererShaderMtl::FinishCompilation() { m_mslCode.clear(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index 98973a0e0..a749253ec 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -73,5 +73,7 @@ class RendererShaderMtl : public RendererShader void CompileInternal(); + void 
CompileToAIR(); + void FinishCompilation(); }; From 40264302c22bf1bf2c0570230fe2601fa3789ffa Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 16 Jan 2025 16:36:02 +0100 Subject: [PATCH 329/368] use the ENABLE_METAL macro --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 32 ++++++++++++++------------ 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 7ad258840..82ae7990d 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -524,6 +524,7 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, if (LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF()) vsHash += 0x1537; +#if ENABLE_METAL if (g_renderer->GetType() == RendererAPI::Metal) { if (usesGeometryShader || _activeFetchShader->mtlFetchVertexManually) @@ -542,27 +543,28 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, if (!usesGeometryShader) { - // Rasterization - bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + // Rasterization + bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - // HACK - if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) - rasterizationEnabled = true; + // HACK + if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; - const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; - uint32 cullFront = polygonControlReg.get_CULL_FRONT(); - uint32 cullBack = polygonControlReg.get_CULL_BACK(); - if (cullFront && cullBack) - rasterizationEnabled = false; + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; - if 
(rasterizationEnabled) - vsHash += 51ULL; + if (rasterizationEnabled) + vsHash += 51ULL; - // Vertex fetch - if (_activeFetchShader->mtlFetchVertexManually) - vsHash += 349ULL; + // Vertex fetch + if (_activeFetchShader->mtlFetchVertexManually) + vsHash += 349ULL; } } +#endif _shaderBaseHash_vs = vsHash; } From ec2410222c248365e6d3ab74ead6156880fc3f9f Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 16 Jan 2025 18:31:09 +0100 Subject: [PATCH 330/368] support metal shaders in graphics packs --- src/Cafe/GraphicPack/GraphicPack2.cpp | 21 ++++++++++++------- src/Cafe/GraphicPack/GraphicPack2.h | 29 +++++++++++++------------- src/Cafe/HW/Latte/Core/LatteShader.cpp | 2 +- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/src/Cafe/GraphicPack/GraphicPack2.cpp b/src/Cafe/GraphicPack/GraphicPack2.cpp index edb4c9e1b..f2e6d487a 100644 --- a/src/Cafe/GraphicPack/GraphicPack2.cpp +++ b/src/Cafe/GraphicPack/GraphicPack2.cpp @@ -428,7 +428,7 @@ GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) for (const auto& presetEntry : m_presets) { tmp_map[presetEntry->category].emplace_back(presetEntry); - + for (auto& presetVar : presetEntry->variables) { const auto it = m_preset_vars.find(presetVar.first); @@ -683,12 +683,15 @@ void GraphicPack2::LoadShaders() wchar_t shader_type[256]{}; if (filename.size() < 256 && swscanf(filename.c_str(), L"%" SCNx64 "_%" SCNx64 "_%ls", &shader_base_hash, &shader_aux_hash, shader_type) == 3) { + bool isMetalShader = (shader_type[2] == '_' && shader_type[3] == 'm' && shader_type[4] == 's' && shader_type[5] == 'l'); + printf("IS METAL SHADER: %u\n", isMetalShader); + if (shader_type[0] == 'p' && shader_type[1] == 's') - m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::PIXEL)); + m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::PIXEL, isMetalShader)); else if (shader_type[0] == 'v' && shader_type[1] == 's') - 
m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::VERTEX)); + m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::VERTEX, isMetalShader)); else if (shader_type[0] == 'g' && shader_type[1] == 's') - m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::GEOMETRY)); + m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::GEOMETRY, isMetalShader)); } else if (filename == L"output.glsl") { @@ -1047,7 +1050,7 @@ bool GraphicPack2::Deactivate() return true; } -const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer) +const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer, bool isMetalRenderer) { for (const auto& gp : GraphicPack2::GetActiveGraphicPacks()) { @@ -1057,9 +1060,12 @@ const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, u if (it == gp->m_custom_shaders.end()) continue; - if(isVulkanRenderer && (*it).isPreVulkanShader) + if (isVulkanRenderer && (*it).isPreVulkanShader) continue; + if (isMetalRenderer != (*it).isMetalShader) + continue; + return &it->source; } return nullptr; @@ -1217,7 +1223,7 @@ void GraphicPack2::ApplyShaderPresets(std::string& shader_source) const } } -GraphicPack2::CustomShader GraphicPack2::LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type) const +GraphicPack2::CustomShader GraphicPack2::LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type, bool isMetalShader) const { CustomShader shader; @@ -1236,6 +1242,7 @@ GraphicPack2::CustomShader GraphicPack2::LoadShader(const fs::path& path, uint64 shader.shader_aux_hash = shader_aux_hash; shader.type = shader_type; 
shader.isPreVulkanShader = this->m_version <= 3; + shader.isMetalShader = isMetalShader; return shader; } diff --git a/src/Cafe/GraphicPack/GraphicPack2.h b/src/Cafe/GraphicPack/GraphicPack2.h index 9b6a86d4f..5fca2f441 100644 --- a/src/Cafe/GraphicPack/GraphicPack2.h +++ b/src/Cafe/GraphicPack/GraphicPack2.h @@ -57,7 +57,7 @@ class GraphicPack2 sint32 lod_bias = -1; // in 1/64th steps sint32 relative_lod_bias = -1; // in 1/64th steps sint32 anistropic_value = -1; // 1< vars) : name(name), variables(std::move(vars)) {} Preset(std::string_view category, std::string_view name, std::unordered_map vars) : category(category), name(name), variables(std::move(vars)) {} - + Preset(std::string_view category, std::string_view name, std::string_view condition, std::unordered_map vars) : category(category), name(name), condition(condition), variables(std::move(vars)) {} }; @@ -136,19 +137,19 @@ class GraphicPack2 bool SetActivePreset(std::string_view category, std::string_view name, bool update_visibility = true); bool SetActivePreset(std::string_view name); void UpdatePresetVisibility(); - + void AddConstantsForCurrentPreset(ExpressionParser& ep); bool ResolvePresetConstant(const std::string& varname, double& value) const; [[nodiscard]] const std::vector& GetPresets() const { return m_presets; } [[nodiscard]] std::unordered_map> GetCategorizedPresets(std::vector& order) const; - + // shaders void LoadShaders(); bool HasShaders() const; const std::vector& GetCustomShaders() const { return m_custom_shaders; } - static const std::string* FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer); + static const std::string* FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer, bool isMetalRenderer); const std::string& GetOutputShaderSource() const { return m_output_shader_source; } const std::string& GetDownscalingShaderSource() const { return 
m_downscaling_shader_source; } @@ -194,7 +195,7 @@ class GraphicPack2 { for (auto& var : preset->variables) parser.AddConstant(var.first, (TType)var.second.second); - } + } } for(const auto& preset : active_presets) { @@ -202,7 +203,7 @@ class GraphicPack2 { for (auto& var : preset->variables) parser.TryAddConstant(var.first, (TType)var.second.second); - } + } } for (auto& var : m_preset_vars) @@ -228,7 +229,7 @@ class GraphicPack2 bool m_activated = false; // set if the graphic pack is currently used by the running game std::vector m_title_ids; bool m_patchedFilesLoaded = false; // set to true once patched files are loaded - + sint32 m_vsync_frequency = -1; sint32 m_fs_priority = 100; @@ -241,12 +242,12 @@ class GraphicPack2 std::vector m_presets; // default preset vars std::unordered_map m_preset_vars; - + std::vector m_custom_shaders; std::vector m_texture_rules; std::string m_output_shader_source, m_upscaling_shader_source, m_downscaling_shader_source; std::unique_ptr m_output_shader, m_upscaling_shader, m_downscaling_shader, m_output_shader_ud, m_upscaling_shader_ud, m_downscaling_shader_ud; - + template bool ParseRule(const ExpressionParser& parser, IniParser& iniParser, const char* option_name, T* value_out) const; @@ -257,7 +258,7 @@ class GraphicPack2 std::vector ParseTitleIds(IniParser& rules, const char* option_name) const; - CustomShader LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type) const; + CustomShader LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type, bool isMetalShader) const; void ApplyShaderPresets(std::string& shader_source) const; void LoadReplacedFiles(); void _iterateReplacedFiles(const fs::path& currentPath, bool isAOC); @@ -330,6 +331,6 @@ std::vector GraphicPack2::ParseList(const ExpressionParser& parser, IniParser } catch (const std::invalid_argument&) {} } - + return result; -} \ No newline at end of file +} diff 
--git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 7ad258840..0ad42da6d 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -339,7 +339,7 @@ void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compil // check if a custom shader is present std::string shaderSrc; - const std::string* customShaderSrc = GraphicPack2::FindCustomShaderSource(shader->baseHash, shader->auxHash, gpShaderType, g_renderer->GetType() == RendererAPI::Vulkan); + const std::string* customShaderSrc = GraphicPack2::FindCustomShaderSource(shader->baseHash, shader->auxHash, gpShaderType, g_renderer->GetType() == RendererAPI::Vulkan, g_renderer->GetType() == RendererAPI::Metal); if (customShaderSrc) { shaderSrc.assign(*customShaderSrc); From 0bf245baf441835c16767524066b3a0def97f685 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 16 Jan 2025 19:37:14 +0100 Subject: [PATCH 331/368] remove printf --- src/Cafe/GraphicPack/GraphicPack2.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Cafe/GraphicPack/GraphicPack2.cpp b/src/Cafe/GraphicPack/GraphicPack2.cpp index f2e6d487a..77c89e78d 100644 --- a/src/Cafe/GraphicPack/GraphicPack2.cpp +++ b/src/Cafe/GraphicPack/GraphicPack2.cpp @@ -684,7 +684,6 @@ void GraphicPack2::LoadShaders() if (filename.size() < 256 && swscanf(filename.c_str(), L"%" SCNx64 "_%" SCNx64 "_%ls", &shader_base_hash, &shader_aux_hash, shader_type) == 3) { bool isMetalShader = (shader_type[2] == '_' && shader_type[3] == 'm' && shader_type[4] == 's' && shader_type[5] == 'l'); - printf("IS METAL SHADER: %u\n", isMetalShader); if (shader_type[0] == 'p' && shader_type[1] == 's') m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::PIXEL, isMetalShader)); From 770063506de62a842c1b1a0f2b359471a289d50d Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 17 Jan 2025 13:52:18 +0100 Subject: [PATCH 332/368] add AIR cache to cache paths --- 
src/gui/components/wxGameList.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gui/components/wxGameList.cpp b/src/gui/components/wxGameList.cpp index 509c46622..fb03843a2 100644 --- a/src/gui/components/wxGameList.cpp +++ b/src/gui/components/wxGameList.cpp @@ -69,6 +69,7 @@ std::list _getCachesPaths(const TitleId& titleId) ActiveSettings::GetCachePath(L"shaderCache/driver/vk/{:016x}.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_spirv.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_gl.bin", titleId), + ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_air.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_shaders.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_mtlshaders.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_vkpipeline.bin", titleId), From 2a218d418f0e16b02ca7ef674d0f97231465f15b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 17 Jan 2025 14:51:25 +0100 Subject: [PATCH 333/368] use depth mask when writing to depth --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 7 +++++++ src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h | 4 ++-- .../LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 7 ++++--- .../LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp | 3 +++ .../LatteDecompilerEmitMSLHeader.hpp | 4 +--- 5 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 2e1b69ed3..e01645842 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -678,6 +678,13 @@ uint64 LatteSHRC_CalcPSAuxHash(LatteDecompilerShader* pixelShader, uint32* conte auxHash = std::rotl(auxHash, 7); auxHash += (uint64)dataType; } + + bool hasDepthBuffer = LatteMRT::GetActiveDepthBufferMask(LatteGPUState.contextNew); + if (hasDepthBuffer) + { + auxHash = std::rotl(auxHash, 
5); + auxHash += 13u; + } } #endif diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 21f6d2b2b..475bacb0c 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -187,8 +187,8 @@ struct LatteDecompilerShader // analyzer stage (pixel outputs) uint32 pixelColorOutputMask{ 0 }; // from LSB to MSB, 1 bit per written output. 1 if written (indices of color attachments) - // analyzer stage (depth write) - bool depthWritten{ false }; + // analyzer stage (depth output) + bool depthMask{ false }; // analyzer stage (geometry shader parameters/inputs) uint32 ringParameterCount{ 0 }; uint32 ringParameterCountFromPrevStage{ 0 }; // used in geometry shader to hold VS ringParameterCount diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index 4d924e94e..cbb2b3ff8 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -387,7 +387,7 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader = shaderContext->shader; if( shader->shaderType == LatteConst::ShaderType::Pixel ) { - if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8 ) + if (cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8) { // remember color outputs that are written for(uint32 i=0; i<(cfInstruction->exportBurstCount+1); i++) @@ -396,9 +396,10 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, shader->pixelColorOutputMask |= (1<exportType == 0 && cfInstruction->exportArrayBase == 61 ) + else if (cfInstruction->exportType == 0 && cfInstruction->exportArrayBase == 61) { - shader->depthWritten = true; + if 
(LatteMRT::GetActiveDepthBufferMask(*shaderContext->contextRegistersNew)) + shader->depthMask = true; } else debugBreakpoint(); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index e14d494a3..d3f2e3e86 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3316,6 +3316,9 @@ static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDe cemu_assert_unimplemented(); // ukn } + if (!shaderContext->shader->depthMask) + return; + src->add("out.passDepth = "); _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); src->add(".x"); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 84722a24b..ab8906718 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -298,10 +298,8 @@ namespace LatteDecompiler } // generate depth output for pixel shader - if (decompilerContext->shader->depthWritten) - { + if (decompilerContext->shader->depthMask) src->add("float passDepth [[depth(any)]];" _CRLF); - } src->add("};" _CRLF _CRLF); } From 6cf7f437a7f66ceb0d9cb7c5d4615a6267e9f8eb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 17 Jan 2025 14:51:54 +0100 Subject: [PATCH 334/368] set AIR cache thread priority --- .../Renderer/Metal/RendererShaderMtl.cpp | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 09a254998..94e4e9286 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ 
-35,7 +35,18 @@ class ShaderMtlThreadPool s_threads.emplace_back(&ShaderMtlThreadPool::CompilerThreadFunc, this); // Create AIR cache thread - s_airCacheThread = new std::thread(&ShaderMtlThreadPool::AIRCacheThreadFunc, this); + s_airCacheThread = new std::thread(&ShaderMtlThreadPool::AIRCacheThreadFunc, this); + + // Set priority + sched_param schedParam; + schedParam.sched_priority = 20; + if (pthread_setschedparam(s_airCacheThread->native_handle(), SCHED_FIFO, &schedParam) != 0) { + cemuLog_log(LogType::Force, "failed to set FIFO thread priority"); + } + + if (pthread_setschedparam(s_airCacheThread->native_handle(), SCHED_RR, &schedParam) != 0) { + cemuLog_log(LogType::Force, "failed to set RR thread priority"); + } } void StopThreads() @@ -48,8 +59,12 @@ class ShaderMtlThreadPool it.join(); s_threads.clear(); - s_airCacheThread->join(); - delete s_airCacheThread; + if (s_airCacheThread) + { + s_airCacheQueueCount.increment(); + s_airCacheThread->join(); + delete s_airCacheThread; + } } ~ShaderMtlThreadPool() From 993421399851223c22903ec9a7bd0f259f93aae5 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 17 Jan 2025 14:56:59 +0100 Subject: [PATCH 335/368] disable AIR cache --- .../Renderer/Metal/RendererShaderMtl.cpp | 22 ++++++++++++++++++- .../Latte/Renderer/Metal/RendererShaderMtl.h | 4 ++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 94e4e9286..9e6d3b9c9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -35,6 +35,7 @@ class ShaderMtlThreadPool s_threads.emplace_back(&ShaderMtlThreadPool::CompilerThreadFunc, this); // Create AIR cache thread + /* s_airCacheThread = new std::thread(&ShaderMtlThreadPool::AIRCacheThreadFunc, this); // Set priority @@ -47,6 +48,7 @@ class ShaderMtlThreadPool if 
(pthread_setschedparam(s_airCacheThread->native_handle(), SCHED_RR, &schedParam) != 0) { cemuLog_log(LogType::Force, "failed to set RR thread priority"); } + */ } void StopThreads() @@ -59,12 +61,14 @@ class ShaderMtlThreadPool it.join(); s_threads.clear(); + /* if (s_airCacheThread) { s_airCacheQueueCount.increment(); s_airCacheThread->join(); delete s_airCacheThread; } + */ } ~ShaderMtlThreadPool() @@ -101,6 +105,7 @@ class ShaderMtlThreadPool } } + /* void AIRCacheThreadFunc() { SetThreadName("mtlAIRCache"); @@ -128,20 +133,23 @@ class ShaderMtlThreadPool job->CompileToAIR(); } } + */ bool HasThreadsRunning() const { return m_threadsActive; } public: std::vector s_threads; - std::thread* s_airCacheThread{nullptr}; + //std::thread* s_airCacheThread{nullptr}; std::deque s_compilationQueue; CounterSemaphore s_compilationQueueCount; std::mutex s_compilationQueueMutex; + /* std::deque s_airCacheQueue; CounterSemaphore s_airCacheQueueCount; std::mutex s_airCacheQueueMutex; + */ private: std::atomic m_threadsActive; @@ -153,6 +161,7 @@ void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) s_isLoadingShadersMtl = true; // Open AIR cache + /* if (s_airCache) { delete s_airCache; @@ -164,6 +173,7 @@ void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) s_airCache = FileCache::Open(cachePath, true, airCacheMagic); if (!s_airCache) cemuLog_log(LogType::Force, "Unable to open AIR cache {}", cacheFilename); + */ // Maximize shader compilation speed static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(true); @@ -187,8 +197,10 @@ void RendererShaderMtl::ShaderCacheLoading_Close() } // Close RAM filesystem + /* if (s_hasRAMFilesystem) executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); + */ } void RendererShaderMtl::Initialize() @@ -282,6 +294,7 @@ MTL::Library* RendererShaderMtl::LibraryFromSource() return library; } +/* MTL::Library* RendererShaderMtl::LibraryFromAIR(std::span data) { dispatch_data_t dispatchData = 
dispatch_data_create(data.data(), data.size(), nullptr, DISPATCH_DATA_DESTRUCTOR_DEFAULT); @@ -296,12 +309,14 @@ MTL::Library* RendererShaderMtl::LibraryFromAIR(std::span data) return library; } +*/ void RendererShaderMtl::CompileInternal() { MTL::Library* library = nullptr; // First, try to retrieve the compiled shader from the AIR cache + /* if (s_isLoadingShadersMtl && (m_isGameShader && !m_isGfxPackShader) && s_airCache) { cemu_assert_debug(m_baseHash != 0); @@ -314,6 +329,7 @@ void RendererShaderMtl::CompileInternal() FinishCompilation(); } } + */ // Not in the cache, compile from source if (!library) @@ -324,10 +340,12 @@ void RendererShaderMtl::CompileInternal() return; // Store in the AIR cache + /* shaderMtlThreadPool.s_airCacheQueueMutex.lock(); shaderMtlThreadPool.s_airCacheQueue.push_back(this); shaderMtlThreadPool.s_airCacheQueueCount.increment(); shaderMtlThreadPool.s_airCacheQueueMutex.unlock(); + */ } m_function = library->newFunction(ToNSString("main0")); @@ -338,6 +356,7 @@ void RendererShaderMtl::CompileInternal() g_compiled_shaders_total++; } +/* void RendererShaderMtl::CompileToAIR() { uint64 h1, h2; @@ -375,6 +394,7 @@ void RendererShaderMtl::CompileToAIR() FinishCompilation(); } +*/ void RendererShaderMtl::FinishCompilation() { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h index a749253ec..9953ba746 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -69,11 +69,11 @@ class RendererShaderMtl : public RendererShader MTL::Library* LibraryFromSource(); - MTL::Library* LibraryFromAIR(std::span data); + //MTL::Library* LibraryFromAIR(std::span data); void CompileInternal(); - void CompileToAIR(); + //void CompileToAIR(); void FinishCompilation(); }; From 24ff85b11f3dc8eec0b69f18d3ca7323c6896b3e Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 18 Jan 2025 09:40:31 +0100 Subject: [PATCH 336/368] 
implement new index cache --- .../Renderer/Metal/MetalBufferAllocator.h | 83 ++++++++++++------- .../Renderer/Metal/MetalMemoryManager.cpp | 6 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 51 +++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 5 +- .../Renderer/Metal/RendererShaderMtl.cpp | 6 +- 5 files changed, 87 insertions(+), 64 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 209b1395c..102ccdc90 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -1,8 +1,8 @@ #pragma once #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Common/precompiled.h" -#include "Metal/MTLResource.hpp" +#include "util/helpers/MemoryPool.h" + #include struct MetalBufferRange @@ -54,7 +54,7 @@ class MetalBufferAllocator return m_buffers[bufferIndex].m_buffer; } - MetalBufferAllocation GetBufferAllocation(size_t size) + MetalBufferAllocation GetAllocation(size_t size) { // Align the size size = Align(size, 128); @@ -121,29 +121,6 @@ class MetalBufferAllocator return allocation; } - void FreeAllocation(MetalBufferAllocation& allocation) - { - MetalBufferRange range; - range.offset = allocation.offset; - range.size = allocation.size; - - allocation.offset = INVALID_OFFSET; - - // Find the correct position to insert the free range - auto& buffer = m_buffers[allocation.bufferIndex]; - for (uint32 i = 0; i < buffer.m_freeRanges.size(); i++) - { - auto& freeRange = buffer.m_freeRanges[i]; - if (freeRange.offset + freeRange.size == range.offset) - { - freeRange.size += range.size; - return; - } - } - - buffer.m_freeRanges.push_back(range); - } - protected: class MetalRenderer* m_mtlr; @@ -276,7 +253,7 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocatorbufferIndex); + + return allocation; + } + + void FreeAllocation(MetalBufferAllocation& allocation) + { + // TODO + /* + 
MetalBufferRange range; + range.offset = allocation.offset; + range.size = allocation.size; + + allocation.offset = INVALID_OFFSET; + + // Find the correct position to insert the free range + auto& buffer = m_buffers[allocation.bufferIndex]; + for (uint32 i = 0; i < buffer.m_freeRanges.size(); i++) + { + auto& freeRange = buffer.m_freeRanges[i]; + if (freeRange.offset + freeRange.size == range.offset) + { + freeRange.size += range.size; + return; + } + } + + buffer.m_freeRanges.push_back(range); + */ + UnlockBuffer(allocation.bufferIndex); + } + + void FreeAllocation(MetalBufferAllocation* allocation) + { + FreeAllocation(*allocation); + m_poolAllocatorReservation.freeObj(allocation); + } + /* MetalBufferAllocation GetBufferAllocation(size_t size) { @@ -350,5 +375,7 @@ class MetalTemporaryBufferAllocator : public MetalBufferAllocator> m_executingCommandBuffers; std::map>::iterator m_activeCommandBufferIt; + MemoryPool m_poolAllocatorReservation{32}; + uint16 m_framesSinceBackBufferAccess = 0; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 8b9ac89f5..25d82d5f2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -73,7 +73,7 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si if (m_bufferCacheMode == BufferCacheMode::DevicePrivate) { - auto allocation = m_tempBufferAllocator.GetBufferAllocation(size); + auto allocation = m_tempBufferAllocator.GetAllocation(size); auto buffer = m_tempBufferAllocator.GetBufferOutsideOfCommandBuffer(allocation.bufferIndex); memcpy((uint8*)buffer->contents() + allocation.offset, data, size); @@ -82,8 +82,8 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); - // Make sure the 
buffer has the right command buffer - m_tempBufferAllocator.GetBuffer(allocation.bufferIndex); // TODO: make a helper function for this + // Mark buffer as used + m_tempBufferAllocator.MarkBufferAsUsed(allocation.bufferIndex); // We can now safely unlock the buffer m_tempBufferAllocator.UnlockBuffer(allocation.bufferIndex); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 83c39b5ca..a5d50c461 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -683,7 +683,7 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s // Allocate a temporary buffer auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - auto allocation = bufferAllocator.GetBufferAllocation(compressedImageSize); + auto allocation = bufferAllocator.GetAllocation(compressedImageSize); auto buffer = bufferAllocator.GetBuffer(allocation.bufferIndex); // Copy the data to the temporary buffer @@ -1067,9 +1067,9 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 hostIndexCount; uint32 indexMin = 0; uint32 indexMax = 0; - uint32 indexBufferOffset = 0; - uint32 indexBufferIndex = 0; - LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); + Renderer::IndexAllocation indexAllocation; + LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation); + MetalBufferAllocation* indexAllocationMtl = static_cast(indexAllocation.rendererInternal); // Buffer cache if (m_memoryManager->UseHostMemoryForCache()) @@ -1312,16 +1312,13 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if (hostIndexType != INDEX_TYPE::NONE) { 
auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - indexBuffer = bufferAllocator.GetBuffer(indexBufferIndex); - - // We have already retrieved the buffer, no need for it to be locked anymore - bufferAllocator.UnlockBuffer(indexBufferIndex); + indexBuffer = bufferAllocator.GetBuffer(indexAllocationMtl->bufferIndex); } if (usesGeometryShader) { if (indexBuffer) - SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexBuffer, indexBufferOffset, vertexShader->resourceMapping.indexBufferBinding); + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexBuffer, indexAllocationMtl->offset, vertexShader->resourceMapping.indexBufferBinding); uint8 hostIndexTypeU8 = (uint8)hostIndexType; renderCommandEncoder->setObjectBytes(&hostIndexTypeU8, sizeof(hostIndexTypeU8), vertexShader->resourceMapping.indexTypeBinding); @@ -1352,7 +1349,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 if (indexBuffer) { auto mtlIndexType = GetMtlIndexType(hostIndexType); - renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, indexBufferOffset, instanceCount, baseVertex, baseInstance); + renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, indexAllocationMtl->offset, instanceCount, baseVertex, baseInstance); } else { @@ -1492,29 +1489,27 @@ void MetalRenderer::draw_handleSpecialState5() renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } -void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) +Renderer::IndexAllocation MetalRenderer::indexData_reserveIndexMemory(uint32 size) { auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - auto allocation = bufferAllocator.GetBufferAllocation(size); - offset = allocation.offset; - bufferIndex = allocation.bufferIndex; + auto allocation = 
bufferAllocator.GetAllocationPtr(size); + + return {allocation->data, allocation}; +} - // Lock the buffer so that it doesn't get released - bufferAllocator.LockBuffer(allocation.bufferIndex); +void MetalRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation) +{ + auto allocationMtl = static_cast(allocation.rendererInternal); - return allocation.data; + auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); + bufferAllocator.FreeAllocation(allocationMtl); } -void MetalRenderer::indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) +void MetalRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation) { - // Do nothing - /* - if (!HasUnifiedMemory()) - { - auto buffer = m_memoryManager->GetTemporaryBufferAllocator().GetBufferOutsideOfCommandBuffer(bufferIndex); - buffer->didModifyRange(NS::Range(offset, size)); - } - */ + // TODO: uncomment + //auto& bufferAllocator = m_memoryManager->GetBufferAllocator(); + //bufferAllocator.FlushAllocation(static_cast(allocation.rendererInternal)); } LatteQueryObject* MetalRenderer::occlusionQuery_create() { @@ -2102,9 +2097,9 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE } } - auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); size_t size = shader->uniform.uniformRangeSize; - auto supportBuffer = bufferAllocator.GetBufferAllocation(size); + auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); + auto supportBuffer = bufferAllocator.GetAllocation(size); memcpy(supportBuffer.data, supportBufferData, size); auto buffer = bufferAllocator.GetBuffer(supportBuffer.bufferIndex); //if (!HasUnifiedMemory()) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 760ad6bc2..5a1dbbf52 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -271,8 +271,9 @@ class MetalRenderer : public 
Renderer void draw_handleSpecialState5(); // index - void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; - void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override; + IndexAllocation indexData_reserveIndexMemory(uint32 size) override; + void indexData_releaseIndexMemory(IndexAllocation& allocation) override; + void indexData_uploadIndexMemory(IndexAllocation& allocation) override; // occlusion queries LatteQueryObject* occlusionQuery_create() override; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 9e6d3b9c9..07073e08c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -15,8 +15,8 @@ #define METAL_AIR_CACHE_BLOCK_COUNT (METAL_AIR_CACHE_SIZE / 512) static bool s_isLoadingShadersMtl{false}; -static bool s_hasRAMFilesystem{false}; -class FileCache* s_airCache{nullptr}; +//static bool s_hasRAMFilesystem{false}; +//class FileCache* s_airCache{nullptr}; extern std::atomic_int g_compiled_shaders_total; extern std::atomic_int g_compiled_shaders_async; @@ -190,6 +190,7 @@ void RendererShaderMtl::ShaderCacheLoading_end() void RendererShaderMtl::ShaderCacheLoading_Close() { // Close the AIR cache + /* if (s_airCache) { delete s_airCache; @@ -197,7 +198,6 @@ void RendererShaderMtl::ShaderCacheLoading_Close() } // Close RAM filesystem - /* if (s_hasRAMFilesystem) executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); */ From 97b806f16f959779420231a76cb8088013c8ceb7 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 18 Jan 2025 14:42:57 +0100 Subject: [PATCH 337/368] rework buffer allocators --- src/Cafe/CMakeLists.txt | 1 + .../Renderer/Metal/MetalBufferAllocator.cpp | 233 +++++++++ .../Renderer/Metal/MetalBufferAllocator.h | 463 +++++------------- .../Renderer/Metal/MetalMemoryManager.cpp | 16 +- 
.../Latte/Renderer/Metal/MetalMemoryManager.h | 24 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 65 +-- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 17 +- 7 files changed, 384 insertions(+), 435 deletions(-) create mode 100644 src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index ced427660..881a6d6dd 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -555,6 +555,7 @@ if(ENABLE_METAL) HW/Latte/Renderer/Metal/CachedFBOMtl.h HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h + HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp HW/Latte/Renderer/Metal/MetalBufferAllocator.h HW/Latte/Renderer/Metal/MetalMemoryManager.cpp HW/Latte/Renderer/Metal/MetalMemoryManager.h diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp new file mode 100644 index 000000000..62d0c0939 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp @@ -0,0 +1,233 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h" + +MetalBufferChunkedHeap::~MetalBufferChunkedHeap() +{ + for (auto& chunk : m_chunkBuffers) + chunk->release(); +} + +uint32 MetalBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) +{ + size_t allocationSize = std::max(m_minimumBufferAllocationSize, minimumAllocationSize); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(allocationSize, MTL::ResourceStorageModeShared); + cemu_assert_debug(buffer); + cemu_assert_debug(m_chunkBuffers.size() == chunkIndex); + m_chunkBuffers.emplace_back(buffer); + + return allocationSize; +} + +void MetalSynchronizedRingAllocator::addUploadBufferSyncPoint(AllocatorBuffer_t& buffer, uint32 offset) +{ + auto commandBuffer = m_mtlr->GetCurrentCommandBuffer(); + if (commandBuffer == buffer.lastSyncpointCommandBuffer) + return; + buffer.lastSyncpointCommandBuffer = 
commandBuffer; + buffer.queue_syncPoints.emplace(commandBuffer, offset); +} + +void MetalSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeRequiredForAlloc) +{ + // calculate buffer size, should be a multiple of bufferAllocSize that is at least as large as sizeRequiredForAlloc + uint32 bufferAllocSize = m_minimumBufferAllocSize; + while (bufferAllocSize < sizeRequiredForAlloc) + bufferAllocSize += m_minimumBufferAllocSize; + + AllocatorBuffer_t newBuffer{}; + newBuffer.writeIndex = 0; + newBuffer.basePtr = nullptr; + newBuffer.mtlBuffer = m_mtlr->GetDevice()->newBuffer(bufferAllocSize, MTL::ResourceStorageModeShared); + newBuffer.basePtr = (uint8*)newBuffer.mtlBuffer->contents(); + newBuffer.size = bufferAllocSize; + newBuffer.index = (uint32)m_buffers.size(); + m_buffers.push_back(newBuffer); +} + +MetalSynchronizedRingAllocator::AllocatorReservation_t MetalSynchronizedRingAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) +{ + if (alignment < 128) + alignment = 128; + size = (size + 127) & ~127; + + for (auto& itr : m_buffers) + { + // align pointer + uint32 alignmentPadding = (alignment - (itr.writeIndex % alignment)) % alignment; + uint32 distanceToSyncPoint; + if (!itr.queue_syncPoints.empty()) + { + if (itr.queue_syncPoints.front().offset < itr.writeIndex) + distanceToSyncPoint = 0xFFFFFFFF; + else + distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex; + } + else + distanceToSyncPoint = 0xFFFFFFFF; + uint32 spaceNeeded = alignmentPadding + size; + if (spaceNeeded > distanceToSyncPoint) + continue; // not enough space in current buffer + if ((itr.writeIndex + spaceNeeded) > itr.size) + { + // wrap-around + spaceNeeded = size; + alignmentPadding = 0; + // check if there is enough space in current buffer after wrap-around + if (!itr.queue_syncPoints.empty()) + { + distanceToSyncPoint = itr.queue_syncPoints.front().offset - 0; + if (spaceNeeded > distanceToSyncPoint) + continue; + } + else if (spaceNeeded > 
itr.size) + continue; + itr.writeIndex = 0; + } + addUploadBufferSyncPoint(itr, itr.writeIndex); + itr.writeIndex += alignmentPadding; + uint32 offset = itr.writeIndex; + itr.writeIndex += size; + itr.cleanupCounter = 0; + MetalSynchronizedRingAllocator::AllocatorReservation_t res; + res.mtlBuffer = itr.mtlBuffer; + res.memPtr = itr.basePtr + offset; + res.bufferOffset = offset; + res.size = size; + res.bufferIndex = itr.index; + + return res; + } + + // allocate new buffer + allocateAdditionalUploadBuffer(size); + + return AllocateBufferMemory(size, alignment); +} + +void MetalSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation) +{ + /* + cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent + // todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant) + VkMappedMemoryRange flushedRange{}; + flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + flushedRange.memory = uploadReservation.vkMem; + flushedRange.offset = uploadReservation.bufferOffset; + flushedRange.size = uploadReservation.size; + vkFlushMappedMemoryRanges(m_vkr->GetLogicalDevice(), 1, &flushedRange); + */ +} + +void MetalSynchronizedRingAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer) +{ + for (auto& itr : m_buffers) + { + while (!itr.queue_syncPoints.empty() && latestFinishedCommandBuffer == itr.queue_syncPoints.front().commandBuffer) + { + itr.queue_syncPoints.pop(); + } + if (itr.queue_syncPoints.empty()) + itr.cleanupCounter++; + } + + // check if last buffer is available for deletion + if (m_buffers.size() >= 2) + { + auto& lastBuffer = m_buffers.back(); + if (lastBuffer.cleanupCounter >= 1000) + { + // release buffer + lastBuffer.mtlBuffer->release(); + m_buffers.pop_back(); + } + } +} + +MTL::Buffer* MetalSynchronizedRingAllocator::GetBufferByIndex(uint32 index) const +{ + return m_buffers[index].mtlBuffer; +} + +void 
MetalSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const +{ + numBuffers = (uint32)m_buffers.size(); + totalBufferSize = 0; + freeBufferSize = 0; + for (auto& itr : m_buffers) + { + totalBufferSize += itr.size; + // calculate free space in buffer + uint32 distanceToSyncPoint; + if (!itr.queue_syncPoints.empty()) + { + if (itr.queue_syncPoints.front().offset < itr.writeIndex) + distanceToSyncPoint = (itr.size - itr.writeIndex) + itr.queue_syncPoints.front().offset; // size with wrap-around + else + distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex; + } + else + distanceToSyncPoint = itr.size; + freeBufferSize += distanceToSyncPoint; + } +} + +/* MetalSynchronizedHeapAllocator */ + +MetalSynchronizedHeapAllocator::MetalSynchronizedHeapAllocator(class MetalRenderer* mtlRenderer, size_t minimumBufferAllocSize) + : m_mtlr(mtlRenderer), m_chunkedHeap(m_mtlr, minimumBufferAllocSize) {}; + +MetalSynchronizedHeapAllocator::AllocatorReservation* MetalSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) +{ + CHAddr addr = m_chunkedHeap.alloc(size, alignment); + m_activeAllocations.emplace_back(addr); + AllocatorReservation* res = m_poolAllocatorReservation.allocObj(); + res->bufferIndex = addr.chunkIndex; + res->bufferOffset = addr.offset; + res->size = size; + res->mtlBuffer = m_chunkedHeap.GetBufferByIndex(addr.chunkIndex); + res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset; + + return res; +} + +void MetalSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation) +{ + // put the allocation on a delayed release queue for the current command buffer + MTL::CommandBuffer* currentCommandBuffer = m_mtlr->GetCurrentCommandBuffer(); + auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == 
uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset; }); + cemu_assert_debug(it != m_activeAllocations.end()); + m_releaseQueue[currentCommandBuffer].emplace_back(it->allocation); + m_activeAllocations.erase(it); + m_poolAllocatorReservation.freeObj(uploadReservation); +} + +void MetalSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation) +{ + /* + if (m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex)) + { + VkMappedMemoryRange flushedRange{}; + flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + flushedRange.memory = uploadReservation->vkMem; + flushedRange.offset = uploadReservation->bufferOffset; + flushedRange.size = uploadReservation->size; + vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &flushedRange); + } + */ +} + +void MetalSynchronizedHeapAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer) +{ + auto it = m_releaseQueue.find(latestFinishedCommandBuffer); + if (it == m_releaseQueue.end()) + return; + + // release allocations + for(auto& addr : it->second) + m_chunkedHeap.free(addr); + it = m_releaseQueue.erase(it); +} + +void MetalSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const +{ + m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 102ccdc90..1db06527b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -1,381 +1,140 @@ #pragma once #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "util/ChunkedHeap/ChunkedHeap.h" #include "util/helpers/MemoryPool.h" #include -struct MetalBufferRange +class MetalBufferChunkedHeap : private ChunkedHeap<> { - size_t offset; - size_t size; -}; + public: + 
MetalBufferChunkedHeap(const class MetalRenderer* mtlRenderer, size_t minimumBufferAllocationSize) : m_mtlr(mtlRenderer), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { }; + ~MetalBufferChunkedHeap(); -constexpr size_t BASE_ALLOCATION_SIZE = 8 * 1024 * 1024; // 8 MB -constexpr size_t MAX_ALLOCATION_SIZE = 64 * 1024 * 1024; // 64 MB + using ChunkedHeap::alloc; + using ChunkedHeap::free; -void LatteIndices_invalidateAll(); + uint8* GetChunkPtr(uint32 index) const + { + if (index >= m_chunkBuffers.size()) + return nullptr; -template -class MetalBufferAllocator -{ -public: - struct Buffer - { - MTL::Buffer* m_buffer; - std::vector m_freeRanges; - BufferT m_data; - }; - - MetalBufferAllocator(class MetalRenderer* metalRenderer, MTL::ResourceOptions storageMode) : m_mtlr{metalRenderer} { - m_isCPUAccessible = (storageMode == MTL::ResourceStorageModeShared) || (storageMode == MTL::ResourceStorageModeManaged); - - m_options = storageMode; - if (m_isCPUAccessible) - m_options |= MTL::ResourceCPUCacheModeWriteCombined; - } + return (uint8*)m_chunkBuffers[index]->contents(); + } - ~MetalBufferAllocator() + MTL::Buffer* GetBufferByIndex(uint32 index) const { - for (auto buffer : m_buffers) - { - buffer.m_buffer->release(); - } - } - - void ResetAllocations() - { - for (uint32 i = 0; i < m_buffers.size(); i++) - FreeBuffer(i); - } + cemu_assert_debug(index < m_chunkBuffers.size()); - MTL::Buffer* GetBuffer(uint32 bufferIndex) - { - return m_buffers[bufferIndex].m_buffer; + return m_chunkBuffers[index]; } - MetalBufferAllocation GetAllocation(size_t size) - { - // Align the size - size = Align(size, 128); - - // First, try to find a free range - for (uint32 i = 0; i < m_buffers.size(); i++) - { - auto& buffer = m_buffers[i]; - for (uint32 j = 0; j < buffer.m_freeRanges.size(); j++) - { - auto& range = buffer.m_freeRanges[j]; - if (size <= range.size) - { - MetalBufferAllocation allocation; - allocation.bufferIndex = i; - allocation.offset = range.offset; - 
allocation.size = size; - allocation.data = (m_isCPUAccessible ? (uint8*)buffer.m_buffer->contents() + range.offset : nullptr); - - range.offset += size; - range.size -= size; - - if (range.size == 0) - { - buffer.m_freeRanges.erase(buffer.m_freeRanges.begin() + j); - } - - return allocation; - } - } - } - - // If no free range was found, allocate a new buffer - size_t allocationSize = BASE_ALLOCATION_SIZE * (1u << m_buffers.size()); - allocationSize = std::min(allocationSize, MAX_ALLOCATION_SIZE); // Limit the allocation size - allocationSize = std::max(allocationSize, size); - MTL::Buffer* mtlBuffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options); - #ifdef CEMU_DEBUG_ASSERT - mtlBuffer->setLabel(GetLabel("Buffer from buffer allocator", mtlBuffer)); - #endif - - MetalBufferAllocation allocation; - allocation.bufferIndex = m_buffers.size(); - allocation.offset = 0; - allocation.size = size; - allocation.data = (m_isCPUAccessible ? mtlBuffer->contents() : nullptr); - - m_buffers.push_back({mtlBuffer}); - auto& buffer = m_buffers.back(); - - // If the buffer is larger than the requested size, add the remaining space to the free buffer ranges - if (size < allocationSize) - { - MetalBufferRange range; - range.offset = size; - range.size = allocationSize - size; - - buffer.m_freeRanges.push_back(range); - } - - // Debug - m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory += allocationSize; - - return allocation; - } + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const + { + numBuffers = m_chunkBuffers.size(); + totalBufferSize = m_numHeapBytes; + freeBufferSize = m_numHeapBytes - m_numAllocatedBytes; + } -protected: - class MetalRenderer* m_mtlr; + private: + uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override; - // TODO: make these template arguments - bool m_isCPUAccessible; - MTL::ResourceOptions m_options; + const class MetalRenderer* m_mtlr; - std::vector m_buffers; - - void 
FreeBuffer(uint32 bufferIndex) - { - auto& buffer = m_buffers[bufferIndex]; - buffer.m_freeRanges.clear(); - buffer.m_freeRanges.push_back({0, buffer.m_buffer->length()}); - } -}; - -struct Empty {}; -typedef MetalBufferAllocator MetalDefaultBufferAllocator; - -struct MetalSyncedBuffer -{ - uint32 m_commandBufferCount = 0; - MTL::CommandBuffer* m_lastCommandBuffer = nullptr; - uint32 m_lock = 0; - - bool IsLocked() const - { - return (m_lock != 0); - } + std::vector m_chunkBuffers; + size_t m_minimumBufferAllocationSize; }; -constexpr uint16 BUFFER_RELEASE_FRAME_TRESHOLD = 1024; - -class MetalTemporaryBufferAllocator : public MetalBufferAllocator +// a circular ring-buffer which tracks and releases memory per command-buffer +class MetalSynchronizedRingAllocator { public: - MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator(metalRenderer, MTL::ResourceStorageModeShared) {} - - void LockBuffer(uint32 bufferIndex) - { - m_buffers[bufferIndex].m_data.m_lock++; - } - - void UnlockBuffer(uint32 bufferIndex) - { - auto& buffer = m_buffers[bufferIndex]; - - buffer.m_data.m_lock--; - - // Release the buffer if it wasn't released due to the lock - if (!buffer.m_data.IsLocked() && buffer.m_data.m_commandBufferCount == 0) - FreeBuffer(bufferIndex); - } - - void EndFrame() - { - // Unlock all buffers - for (uint32_t i = 0; i < m_buffers.size(); i++) - { - auto& buffer = m_buffers[i]; - - if (buffer.m_data.IsLocked()) - { - if (buffer.m_data.m_commandBufferCount == 0) - FreeBuffer(i); - - buffer.m_data.m_lock = 0; - } - } - - // TODO: do this for other buffer allocators as well? 
- // Track how many frames have passed since the last access to the back buffer - if (!m_buffers.empty()) - { - auto& backBuffer = m_buffers.back(); - if (backBuffer.m_data.m_commandBufferCount == 0) - { - // Release the back buffer if it hasn't been accessed for a while - if (m_framesSinceBackBufferAccess >= BUFFER_RELEASE_FRAME_TRESHOLD) - { - // Debug - m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory -= backBuffer.m_buffer->length(); - - backBuffer.m_buffer->release(); - m_buffers.pop_back(); - - m_framesSinceBackBufferAccess = 0; - } - else - { - m_framesSinceBackBufferAccess++; - } - } - else - { - m_framesSinceBackBufferAccess = 0; - } - } - } - - void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) - { - m_activeCommandBuffer = commandBuffer; - if (commandBuffer) - { - auto result = m_executingCommandBuffers.emplace(std::make_pair(m_activeCommandBuffer, std::vector{})); - cemu_assert_debug(result.second); - m_activeCommandBufferIt = result.first; - } - else - { - m_activeCommandBufferIt = m_executingCommandBuffers.end(); - } - } - - void CommandBufferFinished(MTL::CommandBuffer* commandBuffer) - { - auto it = m_executingCommandBuffers.find(commandBuffer); - for (auto bufferIndex : it->second) - { - auto& buffer = m_buffers[bufferIndex]; - buffer.m_data.m_commandBufferCount--; - - // TODO: is this neccessary? 
- if (!buffer.m_data.IsLocked() && buffer.m_data.m_commandBufferCount == 0) - FreeBuffer(bufferIndex); - } - - m_executingCommandBuffers.erase(it); - } - - void MarkBufferAsUsed(uint32 bufferIndex) - { - cemu_assert_debug(m_activeCommandBuffer); - - auto& buffer = m_buffers[bufferIndex]; - if (buffer.m_data.m_commandBufferCount == 0 || buffer.m_data.m_lastCommandBuffer != m_activeCommandBuffer) - { - m_activeCommandBufferIt->second.push_back(bufferIndex); - buffer.m_data.m_commandBufferCount++; - buffer.m_data.m_lastCommandBuffer = m_activeCommandBuffer; - } - } - - MTL::Buffer* GetBuffer(uint32 bufferIndex) - { - MarkBufferAsUsed(bufferIndex); - - return m_buffers[bufferIndex].m_buffer; - } - - MTL::Buffer* GetBufferOutsideOfCommandBuffer(uint32 bufferIndex) - { - return m_buffers[bufferIndex].m_buffer; - } - - MetalBufferAllocation* GetAllocationPtr(size_t size) - { - MetalBufferAllocation* allocation = m_poolAllocatorReservation.allocObj(); - *allocation = GetAllocation(size); - - LockBuffer(allocation->bufferIndex); - - return allocation; - } - - void FreeAllocation(MetalBufferAllocation& allocation) - { - // TODO - /* - MetalBufferRange range; - range.offset = allocation.offset; - range.size = allocation.size; - - allocation.offset = INVALID_OFFSET; - - // Find the correct position to insert the free range - auto& buffer = m_buffers[allocation.bufferIndex]; - for (uint32 i = 0; i < buffer.m_freeRanges.size(); i++) - { - auto& freeRange = buffer.m_freeRanges[i]; - if (freeRange.offset + freeRange.size == range.offset) - { - freeRange.size += range.size; - return; - } - } - - buffer.m_freeRanges.push_back(range); - */ - UnlockBuffer(allocation.bufferIndex); - } - - void FreeAllocation(MetalBufferAllocation* allocation) - { - FreeAllocation(*allocation); - m_poolAllocatorReservation.freeObj(allocation); - } - - /* - MetalBufferAllocation GetBufferAllocation(size_t size) - { - if (!m_activeCommandBuffer) - throw std::runtime_error("No active command buffer when 
allocating a buffer!"); - - auto allocation = MetalBufferAllocator::GetBufferAllocation(size); - - auto& buffer = m_buffers[allocation.bufferIndex]; - if (buffer.m_commandBuffers.empty() || buffer.m_commandBuffers.back() != m_activeCommandBuffer) - buffer.m_commandBuffers.push_back(m_activeCommandBuffer); - - return allocation; - } - */ - - // For debugging - /* - void LogInfo() - { - debug_printf("BUFFERS:\n"); - for (auto& buffer : m_buffers) - { - debug_printf(" %p -> size: %lu, command buffers: %zu\n", buffer.m_buffer, buffer.m_buffer->length(), buffer.m_data.m_commandBuffers.size()); - uint32 same = 0; - uint32 completed = 0; - for (uint32 i = 0; i < buffer.m_data.m_commandBuffers.size(); i++) - { - if (m_mtlr->CommandBufferCompleted(buffer.m_data.m_commandBuffers[i])) - completed++; - for (uint32 j = 0; j < buffer.m_data.m_commandBuffers.size(); j++) - { - if (i != j && buffer.m_data.m_commandBuffers[i] == buffer.m_data.m_commandBuffers[j]) - same++; - } - } - debug_printf(" same: %u\n", same); - debug_printf(" completed: %u\n", completed); - - debug_printf(" FREE RANGES:\n"); - for (auto& range : buffer.m_freeRanges) - { - debug_printf(" offset: %zu, size: %zu\n", range.offset, range.size); - } - } - } - */ + MetalSynchronizedRingAllocator(class MetalRenderer* mtlRenderer, uint32 minimumBufferAllocSize) : m_mtlr(mtlRenderer), m_minimumBufferAllocSize(minimumBufferAllocSize) {}; + MetalSynchronizedRingAllocator(const MetalSynchronizedRingAllocator&) = delete; // disallow copy + + struct BufferSyncPoint_t + { + // todo - modularize sync point + MTL::CommandBuffer* commandBuffer; + uint32 offset; + + BufferSyncPoint_t(MTL::CommandBuffer* _commandBuffer, uint32 _offset) : commandBuffer(_commandBuffer), offset(_offset) {}; + }; + + struct AllocatorBuffer_t + { + MTL::Buffer* mtlBuffer; + uint8* basePtr; + uint32 size; + uint32 writeIndex; + std::queue queue_syncPoints; + MTL::CommandBuffer* lastSyncpointCommandBuffer{ nullptr }; + uint32 index; + uint32 
cleanupCounter{ 0 }; // increased by one every time CleanupBuffer() is called if there is no sync point. If it reaches 300 then the buffer is released + }; + + struct AllocatorReservation_t + { + MTL::Buffer* mtlBuffer; + uint8* memPtr; + uint32 bufferOffset; + uint32 size; + uint32 bufferIndex; + }; + + AllocatorReservation_t AllocateBufferMemory(uint32 size, uint32 alignment); + void FlushReservation(AllocatorReservation_t& uploadReservation); + void CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer); + MTL::Buffer* GetBufferByIndex(uint32 index) const; + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const; private: - MTL::CommandBuffer* m_activeCommandBuffer = nullptr; + void allocateAdditionalUploadBuffer(uint32 sizeRequiredForAlloc); + void addUploadBufferSyncPoint(AllocatorBuffer_t& buffer, uint32 offset); - std::map> m_executingCommandBuffers; - std::map>::iterator m_activeCommandBufferIt; + const class MetalRenderer* m_mtlr; + const uint32 m_minimumBufferAllocSize; - MemoryPool m_poolAllocatorReservation{32}; + std::vector m_buffers; +}; - uint16 m_framesSinceBackBufferAccess = 0; +// heap style allocator with released memory being freed after the current command buffer finishes +class MetalSynchronizedHeapAllocator +{ + struct TrackedAllocation + { + TrackedAllocation(CHAddr allocation) : allocation(allocation) {}; + CHAddr allocation; + }; + + public: + MetalSynchronizedHeapAllocator(class MetalRenderer* mtlRenderer, size_t minimumBufferAllocSize); + MetalSynchronizedHeapAllocator(const MetalSynchronizedHeapAllocator&) = delete; // disallow copy + + struct AllocatorReservation + { + MTL::Buffer* mtlBuffer; + uint8* memPtr; + uint32 bufferOffset; + uint32 size; + uint32 bufferIndex; + }; + + AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment); + void FreeReservation(AllocatorReservation* uploadReservation); + void FlushReservation(AllocatorReservation* uploadReservation); + + void 
CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer); + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const; + private: + const class MetalRenderer* m_mtlr; + MetalBufferChunkedHeap m_chunkedHeap; + // allocations + std::vector m_activeAllocations; + MemoryPool m_poolAllocatorReservation{32}; + // release queue + std::unordered_map> m_releaseQueue; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 25d82d5f2..45a06139e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -73,20 +73,14 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si if (m_bufferCacheMode == BufferCacheMode::DevicePrivate) { - auto allocation = m_tempBufferAllocator.GetAllocation(size); - auto buffer = m_tempBufferAllocator.GetBufferOutsideOfCommandBuffer(allocation.bufferIndex); - memcpy((uint8*)buffer->contents() + allocation.offset, data, size); + auto blitCommandEncoder = m_mtlr->GetBlitCommandEncoder(); - // Lock the buffer to make sure it's not deallocated before the copy is done - m_tempBufferAllocator.LockBuffer(allocation.bufferIndex); + auto allocation = m_stagingAllocator.AllocateBufferMemory(size, 1); + memcpy(allocation.memPtr, data, size); - m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, m_bufferCache, offset, size); - // Mark buffer as used - m_tempBufferAllocator.MarkBufferAsUsed(allocation.bufferIndex); - - // We can now safely unlock the buffer - m_tempBufferAllocator.UnlockBuffer(allocation.bufferIndex); + //m_mtlr->CopyBufferToBuffer(allocation.mtlBuffer, allocation.bufferOffset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); } else { diff 
--git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 3d70e0db6..4f0403374 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -7,27 +7,28 @@ class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_stagingAllocator(m_mtlr/*, m_mtlr->GetOptimalBufferStorageMode()*/, 32u * 1024 * 1024), m_indexAllocator(m_mtlr/*, m_mtlr->GetOptimalBufferStorageMode()*/, 4u * 1024 * 1024) {} ~MetalMemoryManager(); - MetalDefaultBufferAllocator& GetBufferAllocator() + MetalSynchronizedRingAllocator& GetStagingAllocator() { - return m_bufferAllocator; + return m_stagingAllocator; } - MetalDefaultBufferAllocator& GetFramePersistentBufferAllocator() + MetalSynchronizedHeapAllocator& GetIndexAllocator() { - return m_framePersistentBufferAllocator; + return m_indexAllocator; } - MetalTemporaryBufferAllocator& GetTemporaryBufferAllocator() + MTL::Buffer* GetBufferCache() { - return m_tempBufferAllocator; + return m_bufferCache; } - MTL::Buffer* GetBufferCache() + void CleanupBuffers(MTL::CommandBuffer* latestFinishedCommandBuffer) { - return m_bufferCache; + m_stagingAllocator.CleanupBuffer(latestFinishedCommandBuffer); + m_indexAllocator.CleanupBuffer(latestFinishedCommandBuffer); } // Texture upload buffer @@ -65,9 +66,8 @@ class MetalMemoryManager std::vector m_textureUploadBuffer; - MetalDefaultBufferAllocator m_bufferAllocator; - MetalDefaultBufferAllocator m_framePersistentBufferAllocator; - MetalTemporaryBufferAllocator m_tempBufferAllocator; + MetalSynchronizedRingAllocator m_stagingAllocator; + 
MetalSynchronizedHeapAllocator m_indexAllocator; MTL::Buffer* m_bufferCache = nullptr; BufferCacheMode m_bufferCacheMode; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a5d50c461..17a0d86df 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -21,8 +21,8 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Common/precompiled.h" +#include "HW/Latte/Renderer/Metal/MetalBufferAllocator.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" -#include "Metal/MTLCaptureManager.hpp" #include "config/CemuConfig.h" #include "gui/guiWrapper.h" @@ -191,6 +191,7 @@ MetalRenderer::MetalRenderer() utilityLibrary->release(); // HACK: for some reason, this variable ends up being initialized to some garbage data, even though its declared as bool m_captureFrame = false; + m_occlusionQuery.m_lastCommandBuffer = nullptr; m_captureFrame = false; } @@ -302,12 +303,6 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) // Reset the command buffers (they are released by TemporaryBufferAllocator) CommitCommandBuffer(); - // Release frame persistent buffers - m_memoryManager->GetFramePersistentBufferAllocator().ResetAllocations(); - - // Unlock all temporary buffers - m_memoryManager->GetTemporaryBufferAllocator().EndFrame(); - // Debug m_performanceMonitor.ResetPerFrameData(); @@ -682,17 +677,16 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s auto blitCommandEncoder = GetBlitCommandEncoder(); // Allocate a temporary buffer - auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - auto allocation = bufferAllocator.GetAllocation(compressedImageSize); - auto buffer = bufferAllocator.GetBuffer(allocation.bufferIndex); + auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); + auto allocation = 
bufferAllocator.AllocateBufferMemory(compressedImageSize, 1); // Copy the data to the temporary buffer - memcpy(allocation.data, pixelData, compressedImageSize); + memcpy(allocation.memPtr, pixelData, compressedImageSize); //buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); // TODO: specify blit options when copying to a depth stencil texture? // Copy the data from the temporary buffer to the texture - blitCommandEncoder->copyFromBuffer(buffer, allocation.offset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); + blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); //} } @@ -1069,7 +1063,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 indexMax = 0; Renderer::IndexAllocation indexAllocation; LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation); - MetalBufferAllocation* indexAllocationMtl = static_cast(indexAllocation.rendererInternal); + auto indexAllocationMtl = static_cast(indexAllocation.rendererInternal); // Buffer cache if (m_memoryManager->UseHostMemoryForCache()) @@ -1308,17 +1302,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 BindStageResources(renderCommandEncoder, pixelShader, usesGeometryShader); // Draw - MTL::Buffer* indexBuffer = nullptr; - if (hostIndexType != INDEX_TYPE::NONE) - { - auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - indexBuffer = bufferAllocator.GetBuffer(indexAllocationMtl->bufferIndex); - } - if (usesGeometryShader) { - if (indexBuffer) - SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexBuffer, indexAllocationMtl->offset, 
vertexShader->resourceMapping.indexBufferBinding); + if (hostIndexType != INDEX_TYPE::NONE) + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, vertexShader->resourceMapping.indexBufferBinding); uint8 hostIndexTypeU8 = (uint8)hostIndexType; renderCommandEncoder->setObjectBytes(&hostIndexTypeU8, sizeof(hostIndexTypeU8), vertexShader->resourceMapping.indexTypeBinding); @@ -1346,10 +1333,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } else { - if (indexBuffer) + if (hostIndexType != INDEX_TYPE::NONE) { auto mtlIndexType = GetMtlIndexType(hostIndexType); - renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, indexAllocationMtl->offset, instanceCount, baseVertex, baseInstance); + renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, instanceCount, baseVertex, baseInstance); } else { @@ -1491,25 +1478,19 @@ void MetalRenderer::draw_handleSpecialState5() Renderer::IndexAllocation MetalRenderer::indexData_reserveIndexMemory(uint32 size) { - auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - auto allocation = bufferAllocator.GetAllocationPtr(size); + auto allocation = m_memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 128); - return {allocation->data, allocation}; + return {allocation->memPtr, allocation}; } void MetalRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation) { - auto allocationMtl = static_cast(allocation.rendererInternal); - - auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - bufferAllocator.FreeAllocation(allocationMtl); + m_memoryManager->GetIndexAllocator().FreeReservation(static_cast(allocation.rendererInternal)); } void MetalRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation) { - // TODO: uncomment - 
//auto& bufferAllocator = m_memoryManager->GetBufferAllocator(); - //bufferAllocator.FlushAllocation(static_cast(allocation.rendererInternal)); + m_memoryManager->GetIndexAllocator().FlushReservation(static_cast(allocation.rendererInternal)); } LatteQueryObject* MetalRenderer::occlusionQuery_create() { @@ -1647,9 +1628,6 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() m_recordedDrawcalls = 0; m_commitTreshold = m_defaultCommitTreshlod; - // Notify memory manager about the new command buffer - m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer); - // Debug m_performanceMonitor.m_commandBuffers++; @@ -1830,8 +1808,6 @@ void MetalRenderer::CommitCommandBuffer() m_executingCommandBuffers.push_back(mtlCommandBuffer); - m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(nullptr); - // Debug //m_commandQueue->insertDebugCaptureBoundary(); } @@ -1846,7 +1822,7 @@ void MetalRenderer::ProcessFinishedCommandBuffers() auto commandBuffer = *it; if (CommandBufferCompleted(commandBuffer)) { - m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer); + m_memoryManager->CleanupBuffers(commandBuffer); commandBuffer->release(); it = m_executingCommandBuffers.erase(it); atLeastOneCompleted = true; @@ -2098,14 +2074,13 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE } size_t size = shader->uniform.uniformRangeSize; - auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); - auto supportBuffer = bufferAllocator.GetAllocation(size); - memcpy(supportBuffer.data, supportBufferData, size); - auto buffer = bufferAllocator.GetBuffer(supportBuffer.bufferIndex); + auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); + auto supportBuffer = bufferAllocator.AllocateBufferMemory(size, 1); + memcpy(supportBuffer.memPtr, supportBufferData, size); //if (!HasUnifiedMemory()) // buffer->didModifyRange(NS::Range(supportBuffer.offset, size)); - 
SetBuffer(renderCommandEncoder, mtlShaderType, buffer, supportBuffer.offset, shader->resourceMapping.uniformVarsBufferBindingPoint); + SetBuffer(renderCommandEncoder, mtlShaderType, supportBuffer.mtlBuffer, supportBuffer.bufferOffset, shader->resourceMapping.uniformVarsBufferBindingPoint); } // Uniform buffers diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 5a1dbbf52..04c63be82 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -7,19 +7,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" -struct MetalBufferAllocation -{ - void* data; - uint32 bufferIndex; - size_t offset = INVALID_OFFSET; - size_t size; - - bool IsValid() const - { - return offset != INVALID_OFFSET; - } -}; - enum MetalGeneralShaderType { METAL_GENERAL_SHADER_TYPE_VERTEX, @@ -295,14 +282,14 @@ class MetalRenderer : public Renderer return (m_currentCommandBuffer.m_commandBuffer && !m_currentCommandBuffer.m_commited); } - MTL::CommandBuffer* GetCurrentCommandBuffer() + MTL::CommandBuffer* GetCurrentCommandBuffer() const { cemu_assert_debug(m_currentCommandBuffer.m_commandBuffer); return m_currentCommandBuffer.m_commandBuffer; } - MTL::CommandBuffer* GetAndRetainCurrentCommandBufferIfNotCompleted() + MTL::CommandBuffer* GetAndRetainCurrentCommandBufferIfNotCompleted() const { // The command buffer has been commited and has finished execution if (m_currentCommandBuffer.m_commited && m_executingCommandBuffers.size() == 0) From d086eb3db506b9d595f39400b0f7c83545a41bfa Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 18 Jan 2025 17:22:28 +0100 Subject: [PATCH 338/368] fix: index buffer crashes --- src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp | 4 ++-- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp index 62d0c0939..44d11a3b7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp @@ -222,9 +222,9 @@ void MetalSynchronizedHeapAllocator::CleanupBuffer(MTL::CommandBuffer* latestFin return; // release allocations - for(auto& addr : it->second) + for (auto& addr : it->second) m_chunkedHeap.free(addr); - it = m_releaseQueue.erase(it); + m_releaseQueue.erase(it); } void MetalSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 17a0d86df..19afbf06e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1816,7 +1816,6 @@ void MetalRenderer::CommitCommandBuffer() void MetalRenderer::ProcessFinishedCommandBuffers() { // Check for finished command buffers - bool atLeastOneCompleted = false; for (auto it = m_executingCommandBuffers.begin(); it != m_executingCommandBuffers.end();) { auto commandBuffer = *it; @@ -1825,17 +1824,12 @@ void MetalRenderer::ProcessFinishedCommandBuffers() m_memoryManager->CleanupBuffers(commandBuffer); commandBuffer->release(); it = m_executingCommandBuffers.erase(it); - atLeastOneCompleted = true; } else { ++it; } } - - // Invalidate indices if at least one command buffer has completed - if (atLeastOneCompleted) - LatteIndices_invalidateAll(); } bool MetalRenderer::AcquireDrawable(bool mainWindow) From 6d6c04ae3c5eeccf29ad2e1eb4d960e40588ba1b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 18 Jan 2025 17:27:51 +0100 Subject: [PATCH 339/368] update debug overlay --- .../Renderer/Metal/MetalPerformanceMonitor.h | 2 -- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 29 +++++++++++++++++-- 2 files 
changed, 26 insertions(+), 5 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h index cb65162e0..bdbaa84b9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h @@ -3,8 +3,6 @@ class MetalPerformanceMonitor { public: - size_t m_bufferAllocatorMemory = 0; - // Per frame data uint32 m_commandBuffers = 0; uint32 m_renderPasses = 0; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 19afbf06e..45bc967cb 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,11 +16,10 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" -#include "Cemu/Logging/CemuDebugLogging.h" +#include "Cafe/HW/Latte/Core/LatteBufferCache.h" #include "Cemu/Logging/CemuLogging.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" -#include "Common/precompiled.h" #include "HW/Latte/Renderer/Metal/MetalBufferAllocator.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "config/CemuConfig.h" @@ -588,7 +587,6 @@ void MetalRenderer::AppendOverlayDebugInfo() ImGui::Text("--- Metal info ---"); ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize()); - ImGui::Text("Buffer allocator memory %zuMB", m_performanceMonitor.m_bufferAllocatorMemory / 1024 / 1024); ImGui::Text("--- Metal info (per frame) ---"); ImGui::Text("Command buffers %u", m_performanceMonitor.m_commandBuffers); @@ -596,6 +594,31 @@ void MetalRenderer::AppendOverlayDebugInfo() ImGui::Text("Clears %u", m_performanceMonitor.m_clears); ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws); ImGui::Text("Triangle fans %u", 
m_performanceMonitor.m_triangleFans); + + ImGui::Text("--- Cache debug info ---"); + + uint32 bufferCacheHeapSize = 0; + uint32 bufferCacheAllocationSize = 0; + uint32 bufferCacheNumAllocations = 0; + + LatteBufferCache_getStats(bufferCacheHeapSize, bufferCacheAllocationSize, bufferCacheNumAllocations); + + ImGui::Text("Buffer"); + ImGui::SameLine(60.0f); + ImGui::Text("%06uKB / %06uKB Allocs: %u", (uint32)(bufferCacheAllocationSize + 1023) / 1024, ((uint32)bufferCacheHeapSize + 1023) / 1024, (uint32)bufferCacheNumAllocations); + + uint32 numBuffers; + size_t totalSize, freeSize; + + m_memoryManager->GetStagingAllocator().GetStats(numBuffers, totalSize, freeSize); + ImGui::Text("Staging"); + ImGui::SameLine(60.0f); + ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); + + m_memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize); + ImGui::Text("Index"); + ImGui::SameLine(60.0f); + ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); } void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) From bf93f907398e38fd4f5fe82156fc266dad787667 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 18 Jan 2025 18:01:40 +0100 Subject: [PATCH 340/368] flush uploaded buffers --- .../Renderer/Metal/MetalBufferAllocator.cpp | 32 +++++-------------- .../Renderer/Metal/MetalBufferAllocator.h | 31 +++++++++++++++--- .../Renderer/Metal/MetalMemoryManager.cpp | 1 + .../Latte/Renderer/Metal/MetalMemoryManager.h | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 10 +++--- 5 files changed, 42 insertions(+), 34 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp index 44d11a3b7..05d169b30 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp @@ -9,7 +9,7 @@ MetalBufferChunkedHeap::~MetalBufferChunkedHeap() uint32 MetalBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) { size_t allocationSize = std::max(m_minimumBufferAllocationSize, minimumAllocationSize); - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(allocationSize, MTL::ResourceStorageModeShared); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options); cemu_assert_debug(buffer); cemu_assert_debug(m_chunkBuffers.size() == chunkIndex); m_chunkBuffers.emplace_back(buffer); @@ -36,7 +36,7 @@ void MetalSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeR AllocatorBuffer_t newBuffer{}; newBuffer.writeIndex = 0; newBuffer.basePtr = nullptr; - newBuffer.mtlBuffer = m_mtlr->GetDevice()->newBuffer(bufferAllocSize, MTL::ResourceStorageModeShared); + newBuffer.mtlBuffer = m_mtlr->GetDevice()->newBuffer(bufferAllocSize, m_options); newBuffer.basePtr = (uint8*)newBuffer.mtlBuffer->contents(); newBuffer.size = bufferAllocSize; newBuffer.index = (uint32)m_buffers.size(); @@ -105,16 +105,10 @@ MetalSynchronizedRingAllocator::AllocatorReservation_t MetalSynchronizedRingAllo void MetalSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation) { - /* - cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent - // todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant) - VkMappedMemoryRange flushedRange{}; - flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - flushedRange.memory = uploadReservation.vkMem; - flushedRange.offset = uploadReservation.bufferOffset; - flushedRange.size = uploadReservation.size; - vkFlushMappedMemoryRanges(m_vkr->GetLogicalDevice(), 1, &flushedRange); - */ + if (RequiresFlush()) + { + 
uploadReservation.mtlBuffer->didModifyRange(NS::Range(uploadReservation.bufferOffset, uploadReservation.size)); + } } void MetalSynchronizedRingAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer) @@ -172,9 +166,6 @@ void MetalSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalB /* MetalSynchronizedHeapAllocator */ -MetalSynchronizedHeapAllocator::MetalSynchronizedHeapAllocator(class MetalRenderer* mtlRenderer, size_t minimumBufferAllocSize) - : m_mtlr(mtlRenderer), m_chunkedHeap(m_mtlr, minimumBufferAllocSize) {}; - MetalSynchronizedHeapAllocator::AllocatorReservation* MetalSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) { CHAddr addr = m_chunkedHeap.alloc(size, alignment); @@ -202,17 +193,10 @@ void MetalSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploa void MetalSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation) { - /* - if (m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex)) + if (m_chunkedHeap.RequiresFlush()) { - VkMappedMemoryRange flushedRange{}; - flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - flushedRange.memory = uploadReservation->vkMem; - flushedRange.offset = uploadReservation->bufferOffset; - flushedRange.size = uploadReservation->size; - vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &flushedRange); + uploadReservation->mtlBuffer->didModifyRange(NS::Range(uploadReservation->bufferOffset, uploadReservation->size)); } - */ } void MetalSynchronizedHeapAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 1db06527b..2a62de19c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -1,15 +1,24 @@ #pragma once #include 
"Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLResource.hpp" #include "util/ChunkedHeap/ChunkedHeap.h" #include "util/helpers/MemoryPool.h" #include +inline MTL::ResourceOptions GetResourceOptions(MTL::ResourceOptions options) +{ + if (options & MTL::ResourceStorageModeShared || options & MTL::ResourceStorageModeManaged) + options |= MTL::ResourceCPUCacheModeWriteCombined; + + return options; +} + class MetalBufferChunkedHeap : private ChunkedHeap<> { public: - MetalBufferChunkedHeap(const class MetalRenderer* mtlRenderer, size_t minimumBufferAllocationSize) : m_mtlr(mtlRenderer), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { }; + MetalBufferChunkedHeap(const class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, size_t minimumBufferAllocationSize) : m_mtlr(mtlRenderer), m_options(GetResourceOptions(options)), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { }; ~MetalBufferChunkedHeap(); using ChunkedHeap::alloc; @@ -30,6 +39,11 @@ class MetalBufferChunkedHeap : private ChunkedHeap<> return m_chunkBuffers[index]; } + bool RequiresFlush() const + { + return m_options & MTL::ResourceStorageModeManaged; + } + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const { numBuffers = m_chunkBuffers.size(); @@ -42,15 +56,17 @@ class MetalBufferChunkedHeap : private ChunkedHeap<> const class MetalRenderer* m_mtlr; - std::vector m_chunkBuffers; + MTL::ResourceOptions m_options; size_t m_minimumBufferAllocationSize; + + std::vector m_chunkBuffers; }; // a circular ring-buffer which tracks and releases memory per command-buffer class MetalSynchronizedRingAllocator { public: - MetalSynchronizedRingAllocator(class MetalRenderer* mtlRenderer, uint32 minimumBufferAllocSize) : m_mtlr(mtlRenderer), m_minimumBufferAllocSize(minimumBufferAllocSize) {}; + MetalSynchronizedRingAllocator(class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, uint32 minimumBufferAllocSize) : 
m_mtlr(mtlRenderer), m_options(GetResourceOptions(options)), m_minimumBufferAllocSize(minimumBufferAllocSize) {}; MetalSynchronizedRingAllocator(const MetalSynchronizedRingAllocator&) = delete; // disallow copy struct BufferSyncPoint_t @@ -88,6 +104,11 @@ class MetalSynchronizedRingAllocator void CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer); MTL::Buffer* GetBufferByIndex(uint32 index) const; + bool RequiresFlush() const + { + return m_options & MTL::ResourceStorageModeManaged; + } + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const; private: @@ -95,6 +116,8 @@ class MetalSynchronizedRingAllocator void addUploadBufferSyncPoint(AllocatorBuffer_t& buffer, uint32 offset); const class MetalRenderer* m_mtlr; + + MTL::ResourceOptions m_options; const uint32 m_minimumBufferAllocSize; std::vector m_buffers; @@ -110,7 +133,7 @@ class MetalSynchronizedHeapAllocator }; public: - MetalSynchronizedHeapAllocator(class MetalRenderer* mtlRenderer, size_t minimumBufferAllocSize); + MetalSynchronizedHeapAllocator(class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, size_t minimumBufferAllocSize) : m_mtlr(mtlRenderer), m_chunkedHeap(m_mtlr, options, minimumBufferAllocSize) {} MetalSynchronizedHeapAllocator(const MetalSynchronizedHeapAllocator&) = delete; // disallow copy struct AllocatorReservation diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 45a06139e..7b1dd53fc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -77,6 +77,7 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si auto allocation = m_stagingAllocator.AllocateBufferMemory(size, 1); memcpy(allocation.memPtr, data, size); + m_stagingAllocator.FlushReservation(allocation); blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, 
m_bufferCache, offset, size); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 4f0403374..4e55fa6f5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -7,7 +7,7 @@ class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_stagingAllocator(m_mtlr/*, m_mtlr->GetOptimalBufferStorageMode()*/, 32u * 1024 * 1024), m_indexAllocator(m_mtlr/*, m_mtlr->GetOptimalBufferStorageMode()*/, 4u * 1024 * 1024) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_stagingAllocator(m_mtlr, m_mtlr->GetOptimalBufferStorageMode(), 32u * 1024 * 1024), m_indexAllocator(m_mtlr, m_mtlr->GetOptimalBufferStorageMode(), 4u * 1024 * 1024) {} ~MetalMemoryManager(); MetalSynchronizedRingAllocator& GetStagingAllocator() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 45bc967cb..61e5c94a7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -702,6 +702,7 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s // Allocate a temporary buffer auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); auto allocation = bufferAllocator.AllocateBufferMemory(compressedImageSize, 1); + bufferAllocator.FlushReservation(allocation); // Copy the data to the temporary buffer memcpy(allocation.memPtr, pixelData, compressedImageSize); @@ -2092,12 +2093,11 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE size_t size = shader->uniform.uniformRangeSize; auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); - auto supportBuffer = bufferAllocator.AllocateBufferMemory(size, 1); - memcpy(supportBuffer.memPtr, supportBufferData, size); - //if 
(!HasUnifiedMemory()) - // buffer->didModifyRange(NS::Range(supportBuffer.offset, size)); + auto allocation = bufferAllocator.AllocateBufferMemory(size, 1); + memcpy(allocation.memPtr, supportBufferData, size); + bufferAllocator.FlushReservation(allocation); - SetBuffer(renderCommandEncoder, mtlShaderType, supportBuffer.mtlBuffer, supportBuffer.bufferOffset, shader->resourceMapping.uniformVarsBufferBindingPoint); + SetBuffer(renderCommandEncoder, mtlShaderType, allocation.mtlBuffer, allocation.bufferOffset, shader->resourceMapping.uniformVarsBufferBindingPoint); } // Uniform buffers From 86f364889afe8ef6372bda85c0e3c6b8df9f5576 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 18 Jan 2025 19:40:22 +0100 Subject: [PATCH 341/368] fix: sample compare component count --- .../LatteDecompilerEmitMSL.cpp | 152 +++++++++--------- 1 file changed, 77 insertions(+), 75 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index d3f2e3e86..023aefc02 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2295,6 +2295,10 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex } else { + // sample_compare returns a float, need to convert to float4 + if (isCompare) + src->addFmt("float4("); + if (emulateCompare) { cemu_assert_debug(!isGather); @@ -2306,24 +2310,24 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if (!emulateCompare) { src->add("."); - if (isRead) - { - if (hasOffset) - cemu_assert_unimplemented(); - src->add("read("); - unnormalizationHandled = true; - useTexelCoordinates = true; - } - else - { - if (isGather) - src->add("gather"); - else - src->add("sample"); - if (isCompare) - src->add("_compare"); - src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); - } + if (isRead) + 
{ + if (hasOffset) + cemu_assert_unimplemented(); + src->add("read("); + unnormalizationHandled = true; + useTexelCoordinates = true; + } + else + { + if (isGather) + src->add("gather"); + else + src->add("sample"); + if (isCompare) + src->add("_compare"); + src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); + } } else { @@ -2555,65 +2559,63 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex src->add(")"); } - // sample_compare doesn't return a float - if (!isCompare) - { - if( texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ ) - { - src->add("."); + if (isCompare) + src->add(")"); - if (numWrittenElements > 1) - { - // result is copied into multiple channels - for (sint32 f = 0; f < numWrittenElements; f++) - { - cemu_assert_debug(texInstruction->dstSel[f] == 0); // only x component is defined - src->add("x"); - } - } - else - { - src->add("x"); - } - } - else - { - src->add("."); - for (sint32 f = 0; f < 4; f++) - { - if( texInstruction->dstSel[f] < 4 ) - { - uint8 elemIndex = texInstruction->dstSel[f]; - if (isGather) + if (texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + { + src->add("."); + + if (numWrittenElements > 1) + { + // result is copied into multiple channels + for (sint32 f = 0; f < numWrittenElements; f++) + { + cemu_assert_debug(texInstruction->dstSel[f] == 0); // only x component is defined + src->add("x"); + } + } + else + { + src->add("x"); + } + } + else + { + src->add("."); + for (sint32 f = 0; f < 4; f++) + { + if (texInstruction->dstSel[f] < 4) + { + uint8 elemIndex = texInstruction->dstSel[f]; + if (isGather) + { + // 's textureGather() and GPU7's FETCH4 instruction have a different order of elements + // xyzw: top-left, top-right, bottom-right, bottom-left + // textureGather xyzw + // fetch4 yzxw + // translate index from fetch4 to textureGather order + static uint8 fetchToGather[4] = { - // 's textureGather() and GPU7's FETCH4 
instruction have a different order of elements - // xyzw: top-left, top-right, bottom-right, bottom-left - // textureGather xyzw - // fetch4 yzxw - // translate index from fetch4 to textureGather order - static uint8 fetchToGather[4] = - { - 2, // x -> z - 0, // y -> x - 1, // z -> y - 3, // w -> w - }; - elemIndex = fetchToGather[elemIndex]; - } - src->add(resultElemTable[elemIndex]); - numWrittenElements++; - } - else if( texInstruction->dstSel[f] == 7 ) - { - // masked and not written - } - else - { - cemu_assert_unimplemented(); - } - } - } - } + 2, // x -> z + 0, // y -> x + 1, // z -> y + 3, // w -> w + }; + elemIndex = fetchToGather[elemIndex]; + } + src->add(resultElemTable[elemIndex]); + } + else if (texInstruction->dstSel[f] == 7) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + } src->add(");"); // debug From 225d11c43e512d03236d8b5cae8d7be0094528c1 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 18 Jan 2025 19:50:35 +0100 Subject: [PATCH 342/368] fix dfdx and dfdy component count --- .../LatteDecompilerEmitMSL.cpp | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 023aefc02..93b662526 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -450,9 +450,23 @@ static void _emitRegisterAccessCode(LatteDecompilerShaderContext* shaderContext, StringBuf* src = shaderContext->shaderSource; sint32 registerElementDataType = shaderContext->typeTracker.defaultDataType; cemu_assert_debug(gprIndex >= 0 && gprIndex <= 127); + + sint32 channelArray[4]; + channelArray[0] = channel0; + channelArray[1] = channel1; + channelArray[2] = channel2; + channelArray[3] = channel3; + + sint32 numComponents = 0; + for (sint32 i = 0; i < 4; i++) + { + if 
(channelArray[i] >= 0 && channelArray[i] <= 3) + numComponents++; + } + if (dataType >= 0) { - _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType); + _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType, numComponents); } if (shaderContext->typeTracker.useArrayGPRs) src->add("R"); @@ -464,12 +478,6 @@ static void _emitRegisterAccessCode(LatteDecompilerShaderContext* shaderContext, src->add("."); - sint32 channelArray[4]; - channelArray[0] = channel0; - channelArray[1] = channel1; - channelArray[2] = channel2; - channelArray[3] = channel3; - for (sint32 i = 0; i < 4; i++) { if (channelArray[i] >= 0 && channelArray[i] <= 3) @@ -2807,7 +2815,7 @@ static void _emitTEXGetGradientsHV(LatteDecompilerShaderContext* shaderContext, sint32 componentCount = 0; for (sint32 i = 0; i < 4; i++) { - if(texInstruction->dstSel[i] == 7) + if (texInstruction->dstSel[i] == 7) continue; componentCount++; } @@ -2840,10 +2848,10 @@ static void _emitTEXGetGradientsHV(LatteDecompilerShaderContext* shaderContext, src->add(" = "); - _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType, componentCount); src->addFmt("{}(", funcName); - _emitRegisterAccessCode(shaderContext, texInstruction->srcGpr, (componentCount >= 1) ? texInstruction->textureFetch.srcSel[0] : -1, (componentCount >= 2) ? texInstruction->textureFetch.srcSel[1] : -1, (componentCount >= 3) ? texInstruction->textureFetch.srcSel[2] : -1, (componentCount >= 4)?texInstruction->textureFetch.srcSel[3]:-1, LATTE_DECOMPILER_DTYPE_FLOAT); + _emitRegisterAccessCode(shaderContext, texInstruction->srcGpr, (componentCount >= 1) ? texInstruction->textureFetch.srcSel[0] : -1, (componentCount >= 2) ? texInstruction->textureFetch.srcSel[1] : -1, (componentCount >= 3) ? 
texInstruction->textureFetch.srcSel[2] : -1, (componentCount >= 4) ? texInstruction->textureFetch.srcSel[3] : -1, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(")"); From 9e944510933a2b9068e5db9fad9be3b7fe365d69 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 19 Jan 2025 10:15:20 +0100 Subject: [PATCH 343/368] check for texture and color tile mode --- .../LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index cbb2b3ff8..37e228d50 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -870,6 +870,10 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD if (physAddr == MPTR_NULL) continue; // invalid data + auto tileMode = texRegister.word0.get_TILE_MODE(); + if (Latte::TM_IsMacroTiled(tileMode)) + physAddr &= 0x700; + // Check for dimension auto dim = shader->textureUnitDim[textureIndex]; // TODO: 2D arrays could technically be supported as well @@ -896,7 +900,13 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD uint32* colorBufferRegBase = shaderContext->contextRegisters + (mmCB_COLOR0_BASE + j); uint32 regColorBufferBase = colorBufferRegBase[mmCB_COLOR0_BASE - mmCB_COLOR0_BASE] & 0xFFFFFF00; // the low 8 bits are ignored? 
How to Survive seems to rely on this + uint32 regColorInfo = colorBufferRegBase[mmCB_COLOR0_INFO - mmCB_COLOR0_BASE]; + MPTR colorBufferPhysMem = regColorBufferBase; + Latte::E_HWTILEMODE colorBufferTileMode = (Latte::E_HWTILEMODE)((regColorInfo >> 8) & 0xF); + if (Latte::TM_IsMacroTiled(colorBufferTileMode)) + colorBufferPhysMem &= ~0x700; + Latte::E_GX2SURFFMT colorBufferFormat = LatteMRT::GetColorBufferFormat(j, *shaderContext->contextRegistersNew); // TODO: check if mip matches as well? From db8c7de23672e21e53b4886270cd1db37752023c Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 19 Jan 2025 10:25:28 +0100 Subject: [PATCH 344/368] speed up framebuffer fetch lookup --- .../LatteDecompilerAnalyzer.cpp | 54 ++++++++++++------- .../LatteDecompilerEmitMSL.cpp | 1 + 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index 37e228d50..f9611ff17 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -8,6 +8,8 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Common/MemPtr.h" +#include "HW/Latte/ISA/LatteReg.h" // Defined in LatteTextureLegacy.cpp Latte::E_GX2SURFFMT LatteTexture_ReconstructGX2Format(const Latte::LATTE_SQ_TEX_RESOURCE_WORD1_N& texUnitWord1, const Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N& texUnitWord4); @@ -859,7 +861,36 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD // check if textures are used as render targets if (shader->shaderType == LatteConst::ShaderType::Pixel) { - uint8 colorBufferMask = LatteMRT::GetActiveColorBufferMask(shader, *shaderContext->contextRegistersNew); + struct { + sint32 index; + MPTR physAddr; + Latte::E_GX2SURFFMT format; + } 
colorBuffers[LATTE_NUM_COLOR_TARGET]{}; + + uint8 colorBufferMask = LatteMRT::GetActiveColorBufferMask(shader, *shaderContext->contextRegistersNew); + sint32 colorBufferCount = 0; + for (sint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto& colorBuffer = colorBuffers[colorBufferCount]; + if (((colorBufferMask) & (1 << i)) == 0) + continue; // color buffer not enabled + + uint32* colorBufferRegBase = shaderContext->contextRegisters + (mmCB_COLOR0_BASE + i); + uint32 regColorBufferBase = colorBufferRegBase[mmCB_COLOR0_BASE - mmCB_COLOR0_BASE] & 0xFFFFFF00; // the low 8 bits are ignored? How to Survive seems to rely on this + + uint32 regColorInfo = colorBufferRegBase[mmCB_COLOR0_INFO - mmCB_COLOR0_BASE]; + + MPTR colorBufferPhysMem = regColorBufferBase; + Latte::E_HWTILEMODE colorBufferTileMode = (Latte::E_HWTILEMODE)((regColorInfo >> 8) & 0xF); + if (Latte::TM_IsMacroTiled(colorBufferTileMode)) + colorBufferPhysMem &= ~0x700; + + Latte::E_GX2SURFFMT colorBufferFormat = LatteMRT::GetColorBufferFormat(i, *shaderContext->contextRegistersNew); + + colorBuffer = {i, colorBufferPhysMem, colorBufferFormat}; + colorBufferCount++; + } + for (sint32 i = 0; i < shader->textureUnitListCount; i++) { sint32 textureIndex = shader->textureUnitList[i]; @@ -892,27 +923,14 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD Latte::E_GX2SURFFMT format = LatteTexture_ReconstructGX2Format(texRegister.word1, texRegister.word4); // Check if the texture is used as render target - for (sint32 j = 0; j < LATTE_NUM_COLOR_TARGET; j++) + for (sint32 j = 0; j < colorBufferCount; j++) { - if (((colorBufferMask) & (1 << j)) == 0) - continue; // color buffer not enabled - - uint32* colorBufferRegBase = shaderContext->contextRegisters + (mmCB_COLOR0_BASE + j); - uint32 regColorBufferBase = colorBufferRegBase[mmCB_COLOR0_BASE - mmCB_COLOR0_BASE] & 0xFFFFFF00; // the low 8 bits are ignored? 
How to Survive seems to rely on this - - uint32 regColorInfo = colorBufferRegBase[mmCB_COLOR0_INFO - mmCB_COLOR0_BASE]; - - MPTR colorBufferPhysMem = regColorBufferBase; - Latte::E_HWTILEMODE colorBufferTileMode = (Latte::E_HWTILEMODE)((regColorInfo >> 8) & 0xF); - if (Latte::TM_IsMacroTiled(colorBufferTileMode)) - colorBufferPhysMem &= ~0x700; - - Latte::E_GX2SURFFMT colorBufferFormat = LatteMRT::GetColorBufferFormat(j, *shaderContext->contextRegistersNew); + const auto& colorBuffer = colorBuffers[j]; // TODO: check if mip matches as well? - if (physAddr == colorBufferPhysMem && format == colorBufferFormat) + if (physAddr == colorBuffer.physAddr && format == colorBuffer.format) { - shader->textureRenderTargetIndex[textureIndex] = j; + shader->textureRenderTargetIndex[textureIndex] = colorBuffer.index; break; } } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 93b662526..c4b50db12 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2299,6 +2299,7 @@ static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContex if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && renderTargetIndex != 255) { // TODO: support comparison samplers + // TODO: support swizzling src->addFmt("col{}", renderTargetIndex); } else From 29cd6989c19d86bb66a930e543dfedc8f41c22ab Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 19 Jan 2025 11:23:00 +0100 Subject: [PATCH 345/368] skip bindings for framebuffer fetched textures --- .../HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index f9611ff17..01aa3579d 100644 --- 
a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -511,7 +511,7 @@ namespace LatteDecompiler // for Vulkan we use consecutive indices for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) { - if (!decompilerContext->output->textureUnitMask[i]) + if (!decompilerContext->output->textureUnitMask[i] || decompilerContext->shader->textureRenderTargetIndex[i] != 255) continue; decompilerContext->output->resourceMappingMTL.textureUnitToBindingPoint[i] = decompilerContext->currentTextureBindingPointMTL; decompilerContext->currentTextureBindingPointMTL++; From 709f24bf416eee332279384fba9ba8828858062a Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 19 Jan 2025 15:43:08 +0100 Subject: [PATCH 346/368] fix typo in framebuffer fetch --- .../HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index 01aa3579d..aeb90ac68 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -903,7 +903,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD auto tileMode = texRegister.word0.get_TILE_MODE(); if (Latte::TM_IsMacroTiled(tileMode)) - physAddr &= 0x700; + physAddr &= ~0x700; // Check for dimension auto dim = shader->textureUnitDim[textureIndex]; From 29a48352780489d65e62cb8c908bbc6b85e42ccb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 20 Jan 2025 13:47:59 +0100 Subject: [PATCH 347/368] fix incorrect ABGR4 texture decoder --- src/Cafe/HW/Latte/Core/LatteTextureLoader.h | 44 ------------------- .../HW/Latte/Renderer/Metal/LatteToMtl.cpp | 2 +- 2 files changed, 1 insertion(+), 45 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h index ad557bb3c..7b2c109b3 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h @@ -768,50 +768,6 @@ class TextureDecoder_R4_G4_B4_A4_UNORM : public TextureDecoder, public Singleton } }; -class TextureDecoder_R4_G4_B4_A4_UNORM_To_ABGR4 : public TextureDecoder, public SingletonClass -{ -public: - sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override - { - return 2; - } - - void decode(LatteTextureLoaderCtx* textureLoader, uint8* outputData) override - { - for (sint32 y = 0; y < textureLoader->height; y += textureLoader->stepY) - { - sint32 yc = y; - for (sint32 x = 0; x < textureLoader->width; x += textureLoader->stepX) - { - uint8* blockData = LatteTextureLoader_GetInput(textureLoader, x, y); - sint32 pixelOffset = (x + yc * textureLoader->width) * 2; - uint8 v0 = (*(uint8*)(blockData + 0)); - uint8 v1 = (*(uint8*)(blockData + 1)); - *(uint8*)(outputData + pixelOffset + 0) = v0; // todo: Verify - *(uint8*)(outputData + pixelOffset + 1) = v1; // todo: Verify - } - } - } - - void decodePixelToRGBA(uint8* blockData, uint8* outputPixel, uint8 blockOffsetX, uint8 blockOffsetY) override - { - uint8 v0 = *(blockData + 0); - uint8 v1 = *(blockData + 1); - uint8 c0 = (v0 & 0xF); - uint8 c1 = (v0 >> 4) & 0xF; - uint8 c2 = (v1 & 0xF); - uint8 c3 = (v1 >> 4) & 0xF; - c0 = (c0 << 4) | c0; - c1 = (c1 << 4) | c1; - c2 = (c2 << 4) | c2; - c3 = (c3 << 4) | c3; - *(outputPixel + 0) = c0; - *(outputPixel + 1) = c1; - *(outputPixel + 2) = c2; - *(outputPixel + 3) = c3; - } -}; - class TextureDecoder_R4G4B4A4_UNORM_To_RGBA8 : public TextureDecoder, public SingletonClass { public: diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp index 6c4a251e7..7bf295df8 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp @@ -121,7 +121,7 @@ void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support) MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].textureDecoder = TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].textureDecoder = TextureDecoder_A1_B5_G5_R5_UNORM::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT].textureDecoder = TextureDecoder_R11_G11_B10_FLOAT::getInstance(); - MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4_G4_B4_A4_UNORM_To_ABGR4::getInstance(); + MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4_G4_B4_A4_UNORM::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM].textureDecoder = TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM].textureDecoder = TextureDecoder_R10_G10_B10_A2_SNORM_To_RGBA16::getInstance(); MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB].textureDecoder = TextureDecoder_R10_G10_B10_A2_UNORM::getInstance(); From 7ad4d480cbe18fa43cb7dade9d344e5c7b93851b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 20 Jan 2025 17:02:32 +0100 Subject: [PATCH 348/368] filter framebuffer fetch more --- .../LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index aeb90ac68..ce3203166 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -865,6 +865,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD sint32 index; MPTR physAddr; 
Latte::E_GX2SURFFMT format; + Latte::E_HWTILEMODE tileMode; } colorBuffers[LATTE_NUM_COLOR_TARGET]{}; uint8 colorBufferMask = LatteMRT::GetActiveColorBufferMask(shader, *shaderContext->contextRegistersNew); @@ -882,12 +883,10 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD MPTR colorBufferPhysMem = regColorBufferBase; Latte::E_HWTILEMODE colorBufferTileMode = (Latte::E_HWTILEMODE)((regColorInfo >> 8) & 0xF); - if (Latte::TM_IsMacroTiled(colorBufferTileMode)) - colorBufferPhysMem &= ~0x700; Latte::E_GX2SURFFMT colorBufferFormat = LatteMRT::GetColorBufferFormat(i, *shaderContext->contextRegistersNew); - colorBuffer = {i, colorBufferPhysMem, colorBufferFormat}; + colorBuffer = {i, colorBufferPhysMem, colorBufferFormat, colorBufferTileMode}; colorBufferCount++; } @@ -902,8 +901,6 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD continue; // invalid data auto tileMode = texRegister.word0.get_TILE_MODE(); - if (Latte::TM_IsMacroTiled(tileMode)) - physAddr &= ~0x700; // Check for dimension auto dim = shader->textureUnitDim[textureIndex]; @@ -927,8 +924,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD { const auto& colorBuffer = colorBuffers[j]; - // TODO: check if mip matches as well? 
- if (physAddr == colorBuffer.physAddr && format == colorBuffer.format) + if (physAddr == colorBuffer.physAddr && format == colorBuffer.format && tileMode == colorBuffer.tileMode) { shader->textureRenderTargetIndex[textureIndex] = colorBuffer.index; break; From bc6fb816daf37ae1dd7c94efe4f1c60ba28309ff Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 23 Jan 2025 14:50:14 +0100 Subject: [PATCH 349/368] add auto option for position invariance --- src/Cafe/CafeSystem.cpp | 2 +- src/Cafe/GameProfile/GameProfile.cpp | 6 +-- src/Cafe/GameProfile/GameProfile.h | 4 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 54 +++++++++++++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.h | 8 +++ .../Renderer/Metal/RendererShaderMtl.cpp | 10 ++-- src/config/CemuConfig.h | 23 ++++++++ src/gui/GameProfileWindow.cpp | 10 ++-- 8 files changed, 103 insertions(+), 14 deletions(-) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 6a440b69e..090201304 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -260,7 +260,7 @@ void InfoLog_PrintActiveSettings() cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); cemuLog_log(LogType::Force, "Fast math: {}", g_current_game_profile->GetFastMath() ? "true" : "false"); cemuLog_log(LogType::Force, "Buffer cache type: {}", g_current_game_profile->GetBufferCacheMode()); - cemuLog_log(LogType::Force, "Position invariance: {}", g_current_game_profile->GetPositionInvariance() ? 
"true" : "false"); + cemuLog_log(LogType::Force, "Position invariance: {}", g_current_game_profile->GetPositionInvariance()); if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index ff3978605..49bac2ae2 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -228,7 +228,7 @@ bool GameProfile::Load(uint64_t title_id) gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); gameProfile_loadBooleanOption2(iniParser, "fastMath", m_fastMath); gameProfile_loadEnumOption(iniParser, "bufferCacheMode", m_bufferCacheMode); - gameProfile_loadBooleanOption2(iniParser, "positionInvariance", m_positionInvariance); + gameProfile_loadEnumOption(iniParser, "positionInvariance2", m_positionInvariance); // legacy support auto option_precompiledShaders = iniParser.FindOption("precompiledShaders"); @@ -345,7 +345,7 @@ void GameProfile::ResetOptional() m_accurateShaderMul = AccurateShaderMulOption::True; m_fastMath = true; m_bufferCacheMode = BufferCacheMode::DevicePrivate; - m_positionInvariance = false; + m_positionInvariance = PositionInvariance::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; m_cpuMode.reset(); // CPUModeOption::kSingleCoreRecompiler; @@ -368,7 +368,7 @@ void GameProfile::Reset() m_accurateShaderMul = AccurateShaderMulOption::True; m_fastMath = true; m_bufferCacheMode = BufferCacheMode::DevicePrivate; - m_positionInvariance = false; + m_positionInvariance = PositionInvariance::Auto; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index 359e6a0ac..9e40b3785 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ b/src/Cafe/GameProfile/GameProfile.h @@ -33,7 +33,7 @@ class GameProfile 
[[nodiscard]] const AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } [[nodiscard]] bool GetFastMath() const { return m_fastMath; } [[nodiscard]] BufferCacheMode GetBufferCacheMode() const { return m_bufferCacheMode; } - [[nodiscard]] bool GetPositionInvariance() const { return m_positionInvariance; } + [[nodiscard]] PositionInvariance GetPositionInvariance() const { return m_positionInvariance; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return m_precompiledShaders; } [[nodiscard]] uint32 GetThreadQuantum() const { return m_threadQuantum; } @@ -59,7 +59,7 @@ class GameProfile AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; bool m_fastMath = true; BufferCacheMode m_bufferCacheMode = BufferCacheMode::DevicePrivate; - bool m_positionInvariance = false; + PositionInvariance m_positionInvariance = PositionInvariance::Auto; std::optional m_precompiledShaders{}; // cpu settings uint32 m_threadQuantum = kThreadQuantumDefault; // values: 20000 45000 60000 80000 100000 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 61e5c94a7..d91dd2660 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -17,6 +17,7 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Core/LatteIndices.h" #include "Cafe/HW/Latte/Core/LatteBufferCache.h" +#include "CafeSystem.h" #include "Cemu/Logging/CemuLogging.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" @@ -54,6 +55,59 @@ std::vector MetalRenderer::GetDevices() MetalRenderer::MetalRenderer() { + // Options + + // Position invariance + switch (g_current_game_profile->GetPositionInvariance()) + { + case PositionInvariance::Auto: + switch (CafeSystem::GetForegroundTitleId()) + { + // Minecraft: Story Mode + case 0x000500001020A300: // EUR + case 
0x00050000101E0100: // USA + //case 0x000500001020a200: // USA + // Resident Evil: Revelations + case 0x000500001012B400: // EUR + case 0x000500001012CF00: // USA + // The Legend of Zelda: Breath of the Wild + case 0x00050000101C9500: // EUR + case 0x00050000101C9400: // USA + case 0x00050000101C9300: // JPN + // Ninja Gaiden 3: Razor's Edge + case 0x0005000010110B00: // EUR + case 0x0005000010110A00: // USA + case 0x0005000010110900: // JPN + case 0x0005000010139B00: // EUR (TODO: check) + // Bayonetta 2 + case 0x0005000010172700: // EUR + case 0x0005000010172600: // USA + // LEGO STAR WARS: The Force Awakens + case 0x00050000101DAA00: // EUR + case 0x00050000101DAB00: // USA + // Bayonetta + case 0x0005000010157F00: // EUR + case 0x0005000010157E00: // USA + case 0x000500001014DB00: // JPN + // Disney Planes + case 0x0005000010136900: // EUR + case 0x0005000010136A00: // EUR + case 0x0005000010136B00: // EUR + case 0x000500001011C500: // USA (TODO: check) + m_positionInvariance = true; + break; + default: + m_positionInvariance = false; + break; + } + case PositionInvariance::False: + m_positionInvariance = false; + break; + case PositionInvariance::True: + m_positionInvariance = true; + break; + } + // Pick a device auto& config = GetConfig(); const bool hasDeviceSet = config.mtl_graphic_device_uuid != 0; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 04c63be82..2aa68973c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -358,6 +358,11 @@ class MetalRenderer : public Renderer void CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before); // Getters + bool GetPositionInvariance() const + { + return m_positionInvariance; + } + bool IsAppleGPU() const { return m_isAppleGPU; @@ -464,6 +469,9 @@ class MetalRenderer : public 
Renderer MetalPerformanceMonitor m_performanceMonitor; + // Options + bool m_positionInvariance; + // Metal objects MTL::Device* m_device = nullptr; MTL::CommandQueue* m_commandQueue; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 07073e08c..b0ba48a4e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -2,8 +2,8 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" -#include "Cemu/FileCache/FileCache.h" -#include "config/ActiveSettings.h" +//#include "Cemu/FileCache/FileCache.h" +//#include "config/ActiveSettings.h" #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" #include "GameProfile/GameProfile.h" @@ -279,8 +279,12 @@ MTL::Library* RendererShaderMtl::LibraryFromSource() MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); if (g_current_game_profile->GetFastMath()) options->setFastMathEnabled(true); - if (g_current_game_profile->GetPositionInvariance()) + + if (m_mtlr->GetPositionInvariance()) + { + // TODO: filter out based on GPU state options->setPreserveInvariance(true); + } NS::Error* error = nullptr; MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(m_mslCode), options, &error); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 991d9a89e..6233b7811 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -132,6 +132,14 @@ enum class BufferCacheMode }; ENABLE_ENUM_ITERATORS(BufferCacheMode, BufferCacheMode::DevicePrivate, BufferCacheMode::Host); +enum class PositionInvariance +{ + Auto, + False, + True, +}; +ENABLE_ENUM_ITERATORS(PositionInvariance, PositionInvariance::False, PositionInvariance::True); + enum class CPUMode { SinglecoreInterpreter = 0, @@ -245,6 +253,21 @@ struct fmt::formatter : formatter { } }; template <> +struct fmt::formatter : 
formatter { + template + auto format(const PositionInvariance c, FormatContext &ctx) const { + string_view name; + switch (c) + { + case PositionInvariance::Auto: name = "auto"; break; + case PositionInvariance::False: name = "false"; break; + case PositionInvariance::True: name = "true"; break; + default: name = "unknown"; break; + } + return formatter::format(name, ctx); + } +}; +template <> struct fmt::formatter : formatter { template auto format(const CPUMode c, FormatContext &ctx) const { diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index c46f0f254..2500887a0 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -144,9 +144,9 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) first_row->Add(new wxStaticText(panel, wxID_ANY, _("Position invariance")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); - wxString pos_values[] = { _("false"), _("true") }; + wxString pos_values[] = { _("auto"), _("false"), _("true") }; m_position_invariance = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(pos_values), pos_values); - m_position_invariance->SetToolTip(_("Disables most optimizations for vertex positions. May fix polygon cutouts in some games.\n\nMetal only\n\nRecommended: false")); + m_position_invariance->SetToolTip(_("Disables most optimizations for vertex positions. 
May fix polygon cutouts or flickering in some games.\n\nMetal only\n\nRecommended: auto")); first_row->Add(m_position_invariance, 0, wxALL, 5); /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); @@ -357,11 +357,11 @@ void GameProfileWindow::SaveProfile() // gpu m_game_profile.m_accurateShaderMul = (AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); - m_game_profile.m_fastMath = (bool)m_fast_math->GetSelection(); - m_game_profile.m_bufferCacheMode = (BufferCacheMode)m_buffer_cache_mode->GetSelection(); - m_game_profile.m_positionInvariance = (bool)m_position_invariance->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value + m_game_profile.m_fastMath = (bool)m_fast_math->GetSelection(); + m_game_profile.m_bufferCacheMode = (BufferCacheMode)m_buffer_cache_mode->GetSelection(); + m_game_profile.m_positionInvariance = (PositionInvariance)m_position_invariance->GetSelection(); if (m_graphic_api->GetSelection() == 0) m_game_profile.m_graphics_api = {}; From d912837c9c963d80088ff2fe08eb3e442e8cd0be Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 23 Jan 2025 15:03:48 +0100 Subject: [PATCH 350/368] add auto option for buffer cache mode --- src/Cafe/GameProfile/GameProfile.cpp | 14 +++++----- src/Cafe/GameProfile/GameProfile.h | 2 +- .../Renderer/Metal/MetalMemoryManager.cpp | 27 +++++++++++++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- src/config/CemuConfig.h | 4 ++- 5 files changed, 38 insertions(+), 11 deletions(-) diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index 49bac2ae2..73a6e9274 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -227,7 +227,7 @@ bool GameProfile::Load(uint64_t 
title_id) gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); gameProfile_loadBooleanOption2(iniParser, "fastMath", m_fastMath); - gameProfile_loadEnumOption(iniParser, "bufferCacheMode", m_bufferCacheMode); + gameProfile_loadEnumOption(iniParser, "bufferCacheMode2", m_bufferCacheMode); gameProfile_loadEnumOption(iniParser, "positionInvariance2", m_positionInvariance); // legacy support @@ -295,25 +295,23 @@ void GameProfile::Save(uint64_t title_id) #define WRITE_OPTIONAL_ENTRY(__NAME) if (m_##__NAME) fs->writeLine(fmt::format("{} = {}", #__NAME, m_##__NAME.value()).c_str()); #define WRITE_ENTRY(__NAME) fs->writeLine(fmt::format("{} = {}", #__NAME, m_##__NAME).c_str()); +#define WRITE_ENTRY_NUMBERED(__NAME, __NUM) fs->writeLine(fmt::format("{} = {}", #__NAME #__NUM, m_##__NAME).c_str()); fs->writeLine("[General]"); WRITE_OPTIONAL_ENTRY(loadSharedLibraries); WRITE_ENTRY(startWithPadView); - fs->writeLine(""); - fs->writeLine("[CPU]"); WRITE_OPTIONAL_ENTRY(cpuMode); WRITE_ENTRY(threadQuantum); - fs->writeLine(""); fs->writeLine("[Graphics]"); WRITE_ENTRY(accurateShaderMul); WRITE_ENTRY(fastMath); - WRITE_ENTRY(bufferCacheMode); - WRITE_ENTRY(positionInvariance); + WRITE_ENTRY_NUMBERED(bufferCacheMode, 2); + WRITE_ENTRY_NUMBERED(positionInvariance, 2); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); fs->writeLine(""); @@ -344,7 +342,7 @@ void GameProfile::ResetOptional() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; m_fastMath = true; - m_bufferCacheMode = BufferCacheMode::DevicePrivate; + m_bufferCacheMode = BufferCacheMode::Auto; m_positionInvariance = PositionInvariance::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; @@ -367,7 +365,7 @@ void GameProfile::Reset() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; m_fastMath = true; - m_bufferCacheMode = BufferCacheMode::DevicePrivate; + m_bufferCacheMode = BufferCacheMode::Auto; 
m_positionInvariance = PositionInvariance::Auto; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu settings diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index 9e40b3785..58fd099b2 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ b/src/Cafe/GameProfile/GameProfile.h @@ -58,7 +58,7 @@ class GameProfile std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; bool m_fastMath = true; - BufferCacheMode m_bufferCacheMode = BufferCacheMode::DevicePrivate; + BufferCacheMode m_bufferCacheMode = BufferCacheMode::Auto; PositionInvariance m_positionInvariance = PositionInvariance::Auto; std::optional m_precompiledShaders{}; // cpu settings diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 7b1dd53fc..11afd8920 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -2,9 +2,11 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" +#include "CafeSystem.h" #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" #include "HW/MMU/MMU.h" +#include "config/CemuConfig.h" MetalMemoryManager::~MetalMemoryManager() { @@ -36,6 +38,31 @@ void MetalMemoryManager::InitBufferCache(size_t size) m_bufferCacheMode = g_current_game_profile->GetBufferCacheMode(); + if (m_bufferCacheMode == BufferCacheMode::Auto) + { + // TODO: do this for all unified memory systems? + if (m_mtlr->IsAppleGPU()) + { + switch (CafeSystem::GetForegroundTitleId()) + { + // The Legend of Zelda: Wind Waker HD + case 0x0005000010143600: // EUR + case 0x0005000010143500: // USA + case 0x0005000010143400: // JPN + // TODO: use host instead? 
+ m_bufferCacheMode = BufferCacheMode::DeviceShared; + break; + default: + m_bufferCacheMode = BufferCacheMode::DevicePrivate; + break; + } + } + else + { + m_bufferCacheMode = BufferCacheMode::DevicePrivate; + } + } + // First, try to import the host memory as a buffer if (m_bufferCacheMode == BufferCacheMode::Host) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index d91dd2660..d2bd89b0a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -76,9 +76,9 @@ MetalRenderer::MetalRenderer() case 0x00050000101C9300: // JPN // Ninja Gaiden 3: Razor's Edge case 0x0005000010110B00: // EUR + case 0x0005000010139B00: // EUR (TODO: check) case 0x0005000010110A00: // USA case 0x0005000010110900: // JPN - case 0x0005000010139B00: // EUR (TODO: check) // Bayonetta 2 case 0x0005000010172700: // EUR case 0x0005000010172600: // USA diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 6233b7811..17f224abb 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -126,11 +126,12 @@ ENABLE_ENUM_ITERATORS(AccurateShaderMulOption, AccurateShaderMulOption::False, A enum class BufferCacheMode { + Auto, DevicePrivate, DeviceShared, Host, }; -ENABLE_ENUM_ITERATORS(BufferCacheMode, BufferCacheMode::DevicePrivate, BufferCacheMode::Host); +ENABLE_ENUM_ITERATORS(BufferCacheMode, BufferCacheMode::Auto, BufferCacheMode::Host); enum class PositionInvariance { @@ -244,6 +245,7 @@ struct fmt::formatter : formatter { string_view name; switch (c) { + case BufferCacheMode::Auto: name = "auto"; break; case BufferCacheMode::DevicePrivate: name = "device private"; break; case BufferCacheMode::DeviceShared: name = "device shared"; break; case BufferCacheMode::Host: name = "host"; break; From 3c3f254e6d7d67d609c2edc758df88c12376ceef Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 23 Jan 2025 15:44:12 +0100 Subject: [PATCH 351/368] fix 
auto buffer cache mode not showing --- src/Cafe/GameProfile/GameProfile.cpp | 1 + src/gui/GameProfileWindow.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index 73a6e9274..16812d1c2 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -327,6 +327,7 @@ void GameProfile::Save(uint64_t title_id) #undef WRITE_OPTIONAL_ENTRY #undef WRITE_ENTRY +#undef WRITE_ENTRY_NUMBERED delete fs; } diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index 2500887a0..aae450d8d 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -137,7 +137,7 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) first_row->Add(new wxStaticText(panel, wxID_ANY, _("Buffer cache mode")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); - wxString cache_values[] = { _("device private"), _("device shared"), _("host") }; + wxString cache_values[] = { _("auto"), _("device private"), _("device shared"), _("host") }; m_buffer_cache_mode = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(cache_values), cache_values); m_buffer_cache_mode->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: device private")); first_row->Add(m_buffer_cache_mode, 0, wxALL, 5); From 9db0e920acbd5068fc7232797306722fcd2a0f21 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 23 Jan 2025 15:47:25 +0100 Subject: [PATCH 352/368] change descriptions for some expert options --- src/gui/GameProfileWindow.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index aae450d8d..3ff718a83 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -132,21 +132,21 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) wxString 
math_values[] = { _("false"), _("true") }; m_fast_math = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(math_values), math_values); - m_fast_math->SetToolTip(_("Enables fast math for all shaders. May (rarely) cause graphical bugs.\n\nMetal only\n\nRecommended: true")); + m_fast_math->SetToolTip(_("EXPERT OPTION\nEnables fast math for all shaders. May (rarely) cause graphical bugs.\n\nMetal only\n\nRecommended: true")); first_row->Add(m_fast_math, 0, wxALL, 5); first_row->Add(new wxStaticText(panel, wxID_ANY, _("Buffer cache mode")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString cache_values[] = { _("auto"), _("device private"), _("device shared"), _("host") }; m_buffer_cache_mode = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(cache_values), cache_values); - m_buffer_cache_mode->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: device private")); + m_buffer_cache_mode->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: auto")); first_row->Add(m_buffer_cache_mode, 0, wxALL, 5); first_row->Add(new wxStaticText(panel, wxID_ANY, _("Position invariance")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString pos_values[] = { _("auto"), _("false"), _("true") }; m_position_invariance = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(pos_values), pos_values); - m_position_invariance->SetToolTip(_("Disables most optimizations for vertex positions. May fix polygon cutouts or flickering in some games.\n\nMetal only\n\nRecommended: auto")); + m_position_invariance->SetToolTip(_("EXPERT OPTION\nDisables most optimizations for vertex positions. 
May fix polygon cutouts or flickering in some games.\n\nMetal only\n\nRecommended: auto")); first_row->Add(m_position_invariance, 0, wxALL, 5); /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); From a74c6eef4937a67071d8d60ba758f91162f37f20 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 23 Jan 2025 19:26:12 +0100 Subject: [PATCH 353/368] apply position invariance in Wonderful 101 --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index d2bd89b0a..20b7af256 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -94,6 +94,11 @@ MetalRenderer::MetalRenderer() case 0x0005000010136A00: // EUR case 0x0005000010136B00: // EUR case 0x000500001011C500: // USA (TODO: check) + // Wonderful 101 + case 0x0005000010135300: // EUR + case 0x000500001012DC00: // USA + case 0x0005000010116300: // JPN + case 0x0005000010185600: // JPN m_positionInvariance = true; break; default: From 28bcaf07d91cfe6c825684ff4756ecbec94aa6bb Mon Sep 17 00:00:00 2001 From: Samuliak Date: Thu, 23 Jan 2025 19:54:17 +0100 Subject: [PATCH 354/368] fix some memory leaks --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 ++ src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 20b7af256..58ca83c73 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -49,6 +49,7 @@ std::vector MetalRenderer::GetDevices() MTL::Device* device = static_cast(devices->object(i)); result.emplace_back(std::string(device->name()->utf8String()), device->registryID()); } + devices->release(); 
return result; } @@ -130,6 +131,7 @@ MetalRenderer::MetalRenderer() break; } } + devices->release(); } if (!m_device) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index b0ba48a4e..88f436db4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -340,6 +340,7 @@ void RendererShaderMtl::CompileInternal() { // Compile from source library = LibraryFromSource(); + FinishCompilation(); if (!library) return; From 4f0bc724a8d978e29374f5b8712eb17572025baa Mon Sep 17 00:00:00 2001 From: Samuliak Date: Fri, 24 Jan 2025 16:19:42 +0100 Subject: [PATCH 355/368] fix memory leaks --- src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp | 1 + src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp index aec662bb7..2f4295d40 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp @@ -28,6 +28,7 @@ MTL::RenderPipelineState* MetalOutputShaderCache::GetPipeline(RendererOutputShad NS::Error* error = nullptr; renderPipelineState = m_mtlr->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); + renderPipelineDescriptor->release(); if (error) { cemuLog_log(LogType::Force, "error creating output render pipeline state: {}", error->localizedDescription()->utf8String()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 58ca83c73..6dce01fff 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1856,7 +1856,7 @@ void MetalRenderer::EndEncoding() if (m_commandEncoder) { m_commandEncoder->endEncoding(); - 
//m_commandEncoder->release(); + m_commandEncoder->release(); m_commandEncoder = nullptr; m_encoderType = MetalEncoderType::None; From b50b9135a027d262e36d0ac1db503a15ba7b08b8 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 25 Jan 2025 08:04:45 +0100 Subject: [PATCH 356/368] turn position invariance on in Mario Kart 8 --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 6dce01fff..78c17695c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -64,6 +64,11 @@ MetalRenderer::MetalRenderer() case PositionInvariance::Auto: switch (CafeSystem::GetForegroundTitleId()) { + // Mario Kart 8 + case 0x000500001010ED00: // EUR + case 0x000500001010EC00: // USA + case 0x000500001010EB00: // JPN + case 0x0005000010183A00: // JPN (TODO: check) // Minecraft: Story Mode case 0x000500001020A300: // EUR case 0x00050000101E0100: // USA From 14f42fc653a206f6dc4e2e0f311f0abbd368e377 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 25 Jan 2025 08:59:48 +0100 Subject: [PATCH 357/368] create stack scoped helper macro --- .../Latte/Renderer/Metal/LatteTextureMtl.cpp | 3 +- .../HW/Latte/Renderer/Metal/MetalCommon.h | 10 ++-- .../Renderer/Metal/MetalDepthStencilCache.cpp | 10 +--- .../Renderer/Metal/MetalOutputShaderCache.cpp | 3 +- .../Renderer/Metal/MetalPipelineCompiler.cpp | 3 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 58 ++++++------------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 4 ++ .../Renderer/Metal/MetalSamplerCache.cpp | 3 +- .../Metal/MetalVoidVertexPipeline.cpp | 6 +- .../Renderer/Metal/RendererShaderMtl.cpp | 3 +- 10 files changed, 39 insertions(+), 64 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp index 3c0005efc..c5d1f5406 100644 ---
a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -7,7 +7,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM Latte::E_HWTILEMODE tileMode, bool isDepth) : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer) { - MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); desc->setStorageMode(MTL::StorageModePrivate); //desc->setCpuCacheMode(MTL::CPUCacheModeWriteCombined); @@ -83,7 +83,6 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM desc->setUsage(usage); m_texture = mtlRenderer->GetDevice()->newTexture(desc); - desc->release(); } LatteTextureMtl::~LatteTextureMtl() diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index a03e7cae2..e858baf0a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -50,10 +50,12 @@ inline size_t Align(size_t size, size_t alignment) return (size + alignment - 1) & ~(alignment - 1); } -//inline std::string GetColorAttachmentTypeStr(uint32 index) -//{ -// return "COLOR_ATTACHMENT" + std::to_string(index) + "_TYPE"; -//} +__attribute__((unused)) static inline void ETStackAutoRelease(void* object) +{ + (*(NS::Object**)object)->release(); +} + +#define NS_STACK_SCOPED __attribute__((cleanup(ETStackAutoRelease))) __attribute__((unused)) // Cast from const char* to NS::String* inline NS::String* ToNSString(const char* str) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp index a1e4005b5..1fe680bb4 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp @@ -25,7 +25,7 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte auto depthFunc = lcr.DB_DEPTH_CONTROL.get_Z_FUNC(); bool depthWriteEnable = lcr.DB_DEPTH_CONTROL.get_Z_WRITE_ENABLE(); - MTL::DepthStencilDescriptor* desc = MTL::DepthStencilDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::DepthStencilDescriptor* desc = MTL::DepthStencilDescriptor::alloc()->init(); if (depthEnable) { desc->setDepthWriteEnabled(depthWriteEnable); @@ -52,7 +52,7 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte uint32 stencilCompareMaskBack = lcr.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); uint32 stencilWriteMaskBack = lcr.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); - MTL::StencilDescriptor* frontStencil = MTL::StencilDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::StencilDescriptor* frontStencil = MTL::StencilDescriptor::alloc()->init(); frontStencil->setReadMask(stencilCompareMaskFront); frontStencil->setWriteMask(stencilWriteMaskFront); frontStencil->setStencilCompareFunction(GetMtlCompareFunc(frontStencilFunc)); @@ -61,7 +61,7 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte frontStencil->setDepthStencilPassOperation(GetMtlStencilOp(frontStencilZPass)); desc->setFrontFaceStencil(frontStencil); - MTL::StencilDescriptor* backStencil = MTL::StencilDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::StencilDescriptor* backStencil = MTL::StencilDescriptor::alloc()->init(); if (backStencilEnable) { backStencil->setReadMask(stencilCompareMaskBack); @@ -81,13 +81,9 @@ MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const Latte backStencil->setDepthStencilPassOperation(GetMtlStencilOp(frontStencilZPass)); } desc->setBackFaceStencil(backStencil); - - frontStencil->release(); - backStencil->release(); } depthStencilState = m_mtlr->GetDevice()->newDepthStencilState(desc); - desc->release(); return 
depthStencilState; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp index 2f4295d40..48cca54fa 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp @@ -21,14 +21,13 @@ MTL::RenderPipelineState* MetalOutputShaderCache::GetPipeline(RendererOutputShad auto vertexShaderMtl = static_cast(shader->GetVertexShader())->GetFunction(); auto fragmentShaderMtl = static_cast(shader->GetFragmentShader())->GetFunction(); - auto renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + NS_STACK_SCOPED auto renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); renderPipelineDescriptor->setVertexFunction(vertexShaderMtl); renderPipelineDescriptor->setFragmentFunction(fragmentShaderMtl); renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(usesSRGB ? MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm); NS::Error* error = nullptr; renderPipelineState = m_mtlr->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); - renderPipelineDescriptor->release(); if (error) { cemuLog_log(LogType::Force, "error creating output render pipeline state: {}", error->localizedDescription()->utf8String()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index fb92727d9..afd63f8b6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -413,7 +413,7 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha // Vertex descriptor if (!fetchShader->mtlFetchVertexManually) { - MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); for 
(auto& bufferGroup : fetchShader->bufferGroups) { std::optional fetchType; @@ -476,7 +476,6 @@ void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchSha } desc->setVertexDescriptor(vertexDescriptor); - vertexDescriptor->release(); } SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 78c17695c..fe595b82b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -21,7 +21,6 @@ #include "Cemu/Logging/CemuLogging.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" -#include "HW/Latte/Renderer/Metal/MetalBufferAllocator.h" #include "HW/Latte/Renderer/Metal/MetalCommon.h" #include "config/CemuConfig.h" #include "gui/guiWrapper.h" @@ -41,7 +40,7 @@ void LatteDraw_handleSpecialState8_clearAsDepth(); std::vector MetalRenderer::GetDevices() { - auto devices = MTL::CopyAllDevices(); + NS_STACK_SCOPED auto devices = MTL::CopyAllDevices(); std::vector result; result.reserve(devices->count()); for (uint32 i = 0; i < devices->count(); i++) @@ -49,7 +48,6 @@ std::vector MetalRenderer::GetDevices() MTL::Device* device = static_cast(devices->object(i)); result.emplace_back(std::string(device->name()->utf8String()), device->registryID()); } - devices->release(); return result; } @@ -126,7 +124,7 @@ MetalRenderer::MetalRenderer() // If a device is set, try to find it if (hasDeviceSet) { - auto devices = MTL::CopyAllDevices(); + NS_STACK_SCOPED auto devices = MTL::CopyAllDevices(); for (uint32 i = 0; i < devices->count(); i++) { MTL::Device* device = static_cast(devices->object(i)); @@ -136,7 +134,6 @@ MetalRenderer::MetalRenderer() break; } } - devices->release(); } if (!m_device) @@ -167,7 +164,7 @@ MetalRenderer::MetalRenderer() m_event = m_device->newEvent(); // Resources - 
MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); #ifdef CEMU_DEBUG_ASSERT samplerDescriptor->setLabel(GetLabel("Nearest sampler state", samplerDescriptor)); #endif @@ -179,10 +176,9 @@ MetalRenderer::MetalRenderer() samplerDescriptor->setLabel(GetLabel("Linear sampler state", samplerDescriptor)); #endif m_linearSampler = m_device->newSamplerState(samplerDescriptor); - samplerDescriptor->release(); // Null resources - MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init(); textureDescriptor->setTextureType(MTL::TextureType1D); textureDescriptor->setWidth(1); textureDescriptor->setUsage(MTL::TextureUsageShaderRead); @@ -198,7 +194,6 @@ MetalRenderer::MetalRenderer() #ifdef CEMU_DEBUG_ASSERT m_nullTexture2D->setLabel(GetLabel("Null texture 2D", m_nullTexture2D)); #endif - textureDescriptor->release(); m_memoryManager = new MetalMemoryManager(this); m_outputShaderCache = new MetalOutputShaderCache(this); @@ -233,28 +228,24 @@ MetalRenderer::MetalRenderer() // Create the library NS::Error* error = nullptr; - MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(utilityShaderSource), nullptr, &error); + NS_STACK_SCOPED MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(utilityShaderSource), nullptr, &error); if (error) { cemuLog_log(LogType::Force, "failed to create utility library (error: {})", error->localizedDescription()->utf8String()); } // Pipelines - MTL::Function* vertexFullscreenFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); - MTL::Function* fragmentCopyDepthToColorFunction = utilityLibrary->newFunction(ToNSString("fragmentCopyDepthToColor")); + NS_STACK_SCOPED MTL::Function* vertexFullscreenFunction = 
utilityLibrary->newFunction(ToNSString("vertexFullscreen")); + NS_STACK_SCOPED MTL::Function* fragmentCopyDepthToColorFunction = utilityLibrary->newFunction(ToNSString("fragmentCopyDepthToColor")); m_copyDepthToColorDesc = MTL::RenderPipelineDescriptor::alloc()->init(); m_copyDepthToColorDesc->setVertexFunction(vertexFullscreenFunction); m_copyDepthToColorDesc->setFragmentFunction(fragmentCopyDepthToColorFunction); - vertexFullscreenFunction->release(); - fragmentCopyDepthToColorFunction->release(); // Void vertex pipelines if (m_isAppleGPU) m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); - utilityLibrary->release(); - // HACK: for some reason, this variable ends up being initialized to some garbage data, even though its declared as bool m_captureFrame = false; m_occlusionQuery.m_lastCommandBuffer = nullptr; m_captureFrame = false; @@ -414,13 +405,12 @@ void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padV uint32 bytesPerRow = GetMtlTextureBytesPerRow(texMtl->format, texMtl->isDepth, width); uint32 size = GetMtlTextureBytesPerImage(texMtl->format, texMtl->isDepth, height, bytesPerRow); - // TODO: get a buffer from the memory manager - MTL::Buffer* buffer = m_device->newBuffer(size, MTL::ResourceStorageModeShared); - auto blitCommandEncoder = GetBlitCommandEncoder(); - blitCommandEncoder->copyFromTexture(texMtl->GetTexture(), 0, 0, MTL::Origin(0, 0, 0), MTL::Size(width, height, 1), buffer, 0, bytesPerRow, 0); - uint8* bufferPtr = (uint8*)buffer->contents(); + auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); + auto buffer = bufferAllocator.AllocateBufferMemory(size, 1); + + blitCommandEncoder->copyFromTexture(texMtl->GetTexture(), 0, 0, MTL::Origin(0, 0, 0), MTL::Size(width, height, 1), buffer.mtlBuffer, buffer.bufferOffset, bytesPerRow, 0); bool formatValid = true; std::vector rgb_data; @@ -431,7 +421,7 @@ void 
MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padV switch (pixelFormat) { case MTL::PixelFormatRGBA8Unorm: - for (auto ptr = bufferPtr; ptr < bufferPtr + size; ptr += 4) + for (auto ptr = buffer.memPtr; ptr < buffer.memPtr + size; ptr += 4) { rgb_data.emplace_back(*ptr); rgb_data.emplace_back(*(ptr + 1)); @@ -439,7 +429,7 @@ void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padV } break; case MTL::PixelFormatRGBA8Unorm_sRGB: - for (auto ptr = bufferPtr; ptr < bufferPtr + size; ptr += 4) + for (auto ptr = buffer.memPtr; ptr < buffer.memPtr + size; ptr += 4) { rgb_data.emplace_back(SRGBComponentToRGB(*ptr)); rgb_data.emplace_back(SRGBComponentToRGB(*(ptr + 1))); @@ -452,8 +442,6 @@ void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padV break; } - buffer->release(); - if (formatValid) SaveScreenshot(rgb_data, width, height, !padView); } @@ -470,14 +458,13 @@ void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutput // Create render pass auto& layer = GetLayer(!padView); - MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); colorAttachment->setTexture(layer.GetDrawable()->texture()); colorAttachment->setLoadAction(clearBackground ? 
MTL::LoadActionClear : MTL::LoadActionLoad); colorAttachment->setStoreAction(MTL::StoreActionStore); auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor); - renderPassDescriptor->release(); // Get a render pipeline @@ -557,7 +544,7 @@ bool MetalRenderer::ImguiBegin(bool mainWindow) auto& layer = GetLayer(mainWindow); // Render pass descriptor - MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); colorAttachment->setTexture(layer.GetDrawable()->texture()); colorAttachment->setLoadAction(MTL::LoadActionLoad); @@ -570,7 +557,6 @@ bool MetalRenderer::ImguiBegin(bool mainWindow) if (m_encoderType != MetalEncoderType::Render) GetTemporaryRenderCommandEncoder(renderPassDescriptor); - renderPassDescriptor->release(); return true; } @@ -605,7 +591,7 @@ ImTextureID MetalRenderer::GenerateTexture(const std::vector& data, const tmp[(i * 4) + 3] = 0xFF; } - MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); desc->setTextureType(MTL::TextureType2D); desc->setPixelFormat(MTL::PixelFormatRGBA8Unorm); desc->setWidth(size.x); @@ -614,7 +600,6 @@ ImTextureID MetalRenderer::GenerateTexture(const std::vector& data, const desc->setUsage(MTL::TextureUsageShaderRead); MTL::Texture* texture = m_device->newTexture(desc); - desc->release(); // TODO: do a GPU copy? 
texture->replaceRegion(MTL::Region(0, 0, size.x, size.y), 0, 0, tmp.data(), size.x * 4, 0); @@ -768,11 +753,8 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s // Allocate a temporary buffer auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); auto allocation = bufferAllocator.AllocateBufferMemory(compressedImageSize, 1); - bufferAllocator.FlushReservation(allocation); - - // Copy the data to the temporary buffer memcpy(allocation.memPtr, pixelData, compressedImageSize); - //buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); + bufferAllocator.FlushReservation(allocation); // TODO: specify blit options when copying to a depth stencil texture? // Copy the data from the temporary buffer to the texture @@ -804,7 +786,7 @@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl auto mtlTexture = static_cast(hostTexture)->GetTexture(); - MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); if (clearDepth) { auto depthAttachment = renderPassDescriptor->depthAttachment(); @@ -827,7 +809,6 @@ void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sl } GetTemporaryRenderCommandEncoder(renderPassDescriptor); - renderPassDescriptor->release(); EndEncoding(); // Debug @@ -2195,7 +2176,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { - MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); 
colorAttachment->setTexture(mtlTexture); colorAttachment->setClearColor(MTL::ClearColor(r, g, b, a)); @@ -2205,7 +2186,6 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s colorAttachment->setLevel(mipIndex); GetTemporaryRenderCommandEncoder(renderPassDescriptor); - renderPassDescriptor->release(); EndEncoding(); // Debug diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 2aa68973c..9a2b168c5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -6,6 +6,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" +#include "Foundation/NSAutoreleasePool.hpp" enum MetalGeneralShaderType { @@ -527,6 +528,9 @@ class MetalRenderer : public Renderer MTL::CommandBuffer* m_lastCommandBuffer = nullptr; } m_occlusionQuery; + // Autorelease pool + NS::AutoreleasePool* m_autoreleasePool; + // Active objects MetalCommandBuffer m_currentCommandBuffer{}; std::vector m_executingCommandBuffers; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp index b7d5a2ecd..a4d734eec 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp @@ -21,7 +21,7 @@ MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister // Sampler state const _LatteRegisterSetSampler* samplerWords = lcr.SQ_TEX_SAMPLER + samplerIndex; - MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); // lod uint32 iMinLOD = samplerWords->WORD1.get_MIN_LOD(); @@ -115,7 +115,6 @@ MTL::SamplerState* 
MetalSamplerCache::GetSamplerState(const LatteContextRegister } samplerState = m_mtlr->GetDevice()->newSamplerState(samplerDescriptor); - samplerDescriptor->release(); return samplerState; } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp index 6789505c3..7e810e67b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp @@ -3,16 +3,14 @@ MetalVoidVertexPipeline::MetalVoidVertexPipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName) { // Render pipeline state - MTL::Function* vertexFunction = library->newFunction(ToNSString(vertexFunctionName)); + NS_STACK_SCOPED MTL::Function* vertexFunction = library->newFunction(ToNSString(vertexFunctionName)); - MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + NS_STACK_SCOPED MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); renderPipelineDescriptor->setVertexFunction(vertexFunction); renderPipelineDescriptor->setRasterizationEnabled(false); NS::Error* error = nullptr; m_renderPipelineState = mtlRenderer->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); - renderPipelineDescriptor->release(); - vertexFunction->release(); if (error) { cemuLog_log(LogType::Force, "error creating hybrid render pipeline state: {}", error->localizedDescription()->utf8String()); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp index 88f436db4..7fd38b5ed 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -276,7 +276,7 @@ bool RendererShaderMtl::ShouldCountCompilation() const MTL::Library* RendererShaderMtl::LibraryFromSource() { // Compile from 
source - MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); + NS_STACK_SCOPED MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); if (g_current_game_profile->GetFastMath()) options->setFastMathEnabled(true); @@ -288,7 +288,6 @@ MTL::Library* RendererShaderMtl::LibraryFromSource() NS::Error* error = nullptr; MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(m_mslCode), options, &error); - options->release(); if (error) { cemuLog_log(LogType::Force, "failed to create library from source: {} -> {}", error->localizedDescription()->utf8String(), m_mslCode.c_str()); From af08521676f32a79d7ec220eecdb44659892b2b5 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 25 Jan 2025 09:57:03 +0100 Subject: [PATCH 358/368] cleanup auto position invariance --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index fe595b82b..7bae23e99 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -62,11 +62,6 @@ MetalRenderer::MetalRenderer() case PositionInvariance::Auto: switch (CafeSystem::GetForegroundTitleId()) { - // Mario Kart 9 - case 0x000500001010ED00: // EUR - case 0x000500001010EC00: // USA - case 0x000500001010EB00: // JPN - case 0x0005000010183A00: // JPN (TODO: check) // Minecraft: Story Mode case 0x000500001020A300: // EUR case 0x00050000101E0100: // USA @@ -83,6 +78,11 @@ MetalRenderer::MetalRenderer() case 0x0005000010139B00: // EUR (TODO: check) case 0x0005000010110A00: // USA case 0x0005000010110900: // JPN + // Mario Kart 8 + case 0x000500001010ED00: // EUR + case 0x000500001010EC00: // USA + case 0x000500001010EB00: // JPN + case 0x0005000010183A00: // JPN (TODO: check) // Bayonetta 2 case 0x0005000010172700: // EUR case 0x0005000010172600: // USA @@ -95,14 +95,14 @@ 
MetalRenderer::MetalRenderer() case 0x000500001014DB00: // JPN // Disney Planes case 0x0005000010136900: // EUR - case 0x0005000010136A00: // EUR - case 0x0005000010136B00: // EUR + case 0x0005000010136A00: // EUR (TODO: check) + case 0x0005000010136B00: // EUR (TODO: check) case 0x000500001011C500: // USA (TODO: check) // Wonderful 101 case 0x0005000010135300: // EUR case 0x000500001012DC00: // USA case 0x0005000010116300: // JPN - case 0x0005000010185600: // JPN + case 0x0005000010185600: // JPN (TODO: check) m_positionInvariance = true; break; default: From 0c216e40e067bbd2c934e2a6aa68fbf58fa7fe02 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 25 Jan 2025 20:23:22 +0100 Subject: [PATCH 359/368] speed up render pass change check --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 30 ++++++++++++------- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 1 + 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 7bae23e99..eccab5aba 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -698,6 +698,7 @@ void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo) void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) { m_state.m_activeFBO = {(CachedFBOMtl*)cfbo, MetalAttachmentsInfo((CachedFBOMtl*)cfbo)}; + m_state.m_fboChanged = true; } void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) @@ -1732,6 +1733,9 @@ MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL:: // Some render passes clear the attachments, forceRecreate is supposed to be used in those cases MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecreate) { + bool fboChanged = m_state.m_fboChanged; + m_state.m_fboChanged = false; + // Check if we need to begin a new render pass if (m_commandEncoder) { @@ -1739,24 
+1743,28 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr { if (m_encoderType == MetalEncoderType::Render) { - bool needsNewRenderPass = (m_state.m_lastUsedFBO.m_fbo == nullptr); - if (!needsNewRenderPass) + bool needsNewRenderPass = false; + if (fboChanged) { - for (uint8 i = 0; i < 8; i++) + needsNewRenderPass = (m_state.m_lastUsedFBO.m_fbo == nullptr); + if (!needsNewRenderPass) { - if (m_state.m_activeFBO.m_fbo->colorBuffer[i].texture && m_state.m_activeFBO.m_fbo->colorBuffer[i].texture != m_state.m_lastUsedFBO.m_fbo->colorBuffer[i].texture) + for (uint8 i = 0; i < 8; i++) { - needsNewRenderPass = true; - break; + if (m_state.m_activeFBO.m_fbo->colorBuffer[i].texture && m_state.m_activeFBO.m_fbo->colorBuffer[i].texture != m_state.m_lastUsedFBO.m_fbo->colorBuffer[i].texture) + { + needsNewRenderPass = true; + break; + } } } - } - if (!needsNewRenderPass) - { - if (m_state.m_activeFBO.m_fbo->depthBuffer.texture && (m_state.m_activeFBO.m_fbo->depthBuffer.texture != m_state.m_lastUsedFBO.m_fbo->depthBuffer.texture || ( m_state.m_activeFBO.m_fbo->depthBuffer.hasStencil && !m_state.m_lastUsedFBO.m_fbo->depthBuffer.hasStencil))) + if (!needsNewRenderPass) { - needsNewRenderPass = true; + if (m_state.m_activeFBO.m_fbo->depthBuffer.texture && (m_state.m_activeFBO.m_fbo->depthBuffer.texture != m_state.m_lastUsedFBO.m_fbo->depthBuffer.texture || ( m_state.m_activeFBO.m_fbo->depthBuffer.hasStencil && !m_state.m_lastUsedFBO.m_fbo->depthBuffer.hasStencil))) + { + needsNewRenderPass = true; + } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 9a2b168c5..6dc780d82 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -111,6 +111,7 @@ struct MetalState MetalActiveFBOState m_activeFBO; // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change 
MetalActiveFBOState m_lastUsedFBO; + bool m_fboChanged = false; size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS]; class LatteTextureViewMtl* m_textures[LATTE_NUM_MAX_TEX_UNITS * 3] = {nullptr}; From 24e1bba31cf7aa1e9f4066b960e9ca7230d2ffc9 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 26 Jan 2025 10:23:07 +0100 Subject: [PATCH 360/368] choose the closest matching sampler border color --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 3 +- .../Renderer/Metal/MetalSamplerCache.cpp | 73 +++++++++++++++---- .../Latte/Renderer/Metal/MetalSamplerCache.h | 5 +- 3 files changed, 63 insertions(+), 18 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index eccab5aba..93f330c0d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -2049,8 +2049,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE MTL::SamplerState* sampler; if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) { - uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shader->shaderType); - sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, samplerIndex); + sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, shader->shaderType, stageSamplerIndex); } else { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp index a4d734eec..abc3d19ca 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp @@ -1,6 +1,21 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Metal/MTLSampler.hpp" + +inline const 
char* BorderColorToStr(MTL::SamplerBorderColor borderColor) +{ + switch (borderColor) + { + case MTL::SamplerBorderColorTransparentBlack: + return "transparent black"; + case MTL::SamplerBorderColorOpaqueBlack: + return "opaque black"; + case MTL::SamplerBorderColorOpaqueWhite: + return "opaque white"; + } +} MetalSamplerCache::~MetalSamplerCache() { @@ -11,9 +26,11 @@ MetalSamplerCache::~MetalSamplerCache() m_samplerCache.clear(); } -MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister& lcr, uint32 samplerIndex) +MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex) { - uint64 stateHash = CalculateSamplerHash(lcr, samplerIndex); + uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shaderType); + + uint64 stateHash = CalculateSamplerHash(lcr, shaderType, stageSamplerIndex, samplerIndex); auto& samplerState = m_samplerCache[stateHash]; if (samplerState) return samplerState; @@ -77,9 +94,9 @@ MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister auto clampY = samplerWords->WORD0.get_CLAMP_Y(); auto clampZ = samplerWords->WORD0.get_CLAMP_Z(); - samplerDescriptor->setSAddressMode(GetMtlSamplerAddressMode(clampX)); - samplerDescriptor->setTAddressMode(GetMtlSamplerAddressMode(clampY)); - samplerDescriptor->setRAddressMode(GetMtlSamplerAddressMode(clampZ)); + samplerDescriptor->setSAddressMode(GetMtlSamplerAddressMode(clampX)); + samplerDescriptor->setTAddressMode(GetMtlSamplerAddressMode(clampY)); + samplerDescriptor->setRAddressMode(GetMtlSamplerAddressMode(clampZ)); auto maxAniso = samplerWords->WORD0.get_MAX_ANISO_RATIO(); @@ -101,25 +118,53 @@ MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister // border auto borderType = samplerWords->WORD0.get_BORDER_COLOR_TYPE(); + MTL::SamplerBorderColor borderColor; if (borderType == 
Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::TRANSPARENT_BLACK) - samplerDescriptor->setBorderColor(MTL::SamplerBorderColorTransparentBlack); + borderColor = MTL::SamplerBorderColorTransparentBlack; else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_BLACK) - samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueBlack); + borderColor = MTL::SamplerBorderColorOpaqueBlack; else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_WHITE) - samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueWhite); - else + borderColor = MTL::SamplerBorderColorOpaqueWhite; + else [[unlikely]] { - // Metal doesn't support custom border color - cemuLog_logOnce(LogType::Force, "Custom border color is not supported in Metal, using transparent black instead"); - samplerDescriptor->setBorderColor(MTL::SamplerBorderColorTransparentBlack); + _LatteRegisterSetSamplerBorderColor* borderColorReg; + if (shaderType == LatteConst::ShaderType::Vertex) + borderColorReg = LatteGPUState.contextNew.TD_VS_SAMPLER_BORDER_COLOR + stageSamplerIndex; + else if (shaderType == LatteConst::ShaderType::Pixel) + borderColorReg = LatteGPUState.contextNew.TD_PS_SAMPLER_BORDER_COLOR + stageSamplerIndex; + else // geometry + borderColorReg = LatteGPUState.contextNew.TD_GS_SAMPLER_BORDER_COLOR + stageSamplerIndex; + float r = borderColorReg->red.get_channelValue(); + float g = borderColorReg->green.get_channelValue(); + float b = borderColorReg->blue.get_channelValue(); + float a = borderColorReg->alpha.get_channelValue(); + + // Metal doesn't support custom border color + // Let's find the best match + bool opaque = (a == 1.0f); + bool white = (r == 1.0f); + if (opaque) + { + if (white) + borderColor = MTL::SamplerBorderColorOpaqueWhite; + else + borderColor = MTL::SamplerBorderColorOpaqueBlack; + } + else + { + borderColor = MTL::SamplerBorderColorTransparentBlack; + } + + cemuLog_log(LogType::Force, "Custom border 
color ({}, {}, {}, {}) is not supported on Metal, using {} instead", r, g, b, a, BorderColorToStr(borderColor)); } + samplerDescriptor->setBorderColor(borderColor); samplerState = m_mtlr->GetDevice()->newSamplerState(samplerDescriptor); return samplerState; } -uint64 MetalSamplerCache::CalculateSamplerHash(const LatteContextRegister& lcr, uint32 samplerIndex) +uint64 MetalSamplerCache::CalculateSamplerHash(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, uint32 samplerIndex) { const _LatteRegisterSetSampler* samplerWords = lcr.SQ_TEX_SAMPLER + samplerIndex; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h index 891d7e035..17857f0ee 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h @@ -2,6 +2,7 @@ #include +#include "HW/Latte/Core/LatteConst.h" #include "HW/Latte/ISA/LatteReg.h" class MetalSamplerCache @@ -10,12 +11,12 @@ class MetalSamplerCache MetalSamplerCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalSamplerCache(); - MTL::SamplerState* GetSamplerState(const LatteContextRegister& lcr, uint32 samplerIndex); + MTL::SamplerState* GetSamplerState(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex); private: class MetalRenderer* m_mtlr; std::map m_samplerCache; - uint64 CalculateSamplerHash(const LatteContextRegister& lcr, uint32 samplerIndex); + uint64 CalculateSamplerHash(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, uint32 samplerIndex); }; From 2e6eafde2e4cdd2467191d4077fb4106f43ef0f8 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 26 Jan 2025 10:39:39 +0100 Subject: [PATCH 361/368] make sampler hash more accurate --- .../Renderer/Metal/MetalSamplerCache.cpp | 108 +++++++++++------- 1 file changed, 64 insertions(+), 44 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp index abc3d19ca..79d9b22ab 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp @@ -2,7 +2,6 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -#include "Metal/MTLSampler.hpp" inline const char* BorderColorToStr(MTL::SamplerBorderColor borderColor) { @@ -17,6 +16,54 @@ inline const char* BorderColorToStr(MTL::SamplerBorderColor borderColor) } } +MTL::SamplerBorderColor GetBorderColor(LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords, bool logWorkaround) +{ + auto borderType = samplerWords->WORD0.get_BORDER_COLOR_TYPE(); + + MTL::SamplerBorderColor borderColor; + if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::TRANSPARENT_BLACK) + borderColor = MTL::SamplerBorderColorTransparentBlack; + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_BLACK) + borderColor = MTL::SamplerBorderColorOpaqueBlack; + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_WHITE) + borderColor = MTL::SamplerBorderColorOpaqueWhite; + else [[unlikely]] + { + _LatteRegisterSetSamplerBorderColor* borderColorReg; + if (shaderType == LatteConst::ShaderType::Vertex) + borderColorReg = LatteGPUState.contextNew.TD_VS_SAMPLER_BORDER_COLOR + stageSamplerIndex; + else if (shaderType == LatteConst::ShaderType::Pixel) + borderColorReg = LatteGPUState.contextNew.TD_PS_SAMPLER_BORDER_COLOR + stageSamplerIndex; + else // geometry + borderColorReg = LatteGPUState.contextNew.TD_GS_SAMPLER_BORDER_COLOR + stageSamplerIndex; + float r = borderColorReg->red.get_channelValue(); + float g = borderColorReg->green.get_channelValue(); + float b = 
borderColorReg->blue.get_channelValue(); + float a = borderColorReg->alpha.get_channelValue(); + + // Metal doesn't support custom border color + // Let's find the best match + bool opaque = (a == 1.0f); + bool white = (r == 1.0f); + if (opaque) + { + if (white) + borderColor = MTL::SamplerBorderColorOpaqueWhite; + else + borderColor = MTL::SamplerBorderColorOpaqueBlack; + } + else + { + borderColor = MTL::SamplerBorderColorTransparentBlack; + } + + if (logWorkaround) + cemuLog_log(LogType::Force, "Custom border color ({}, {}, {}, {}) is not supported on Metal, using {} instead", r, g, b, a, BorderColorToStr(borderColor)); + } + + return borderColor; +} + MetalSamplerCache::~MetalSamplerCache() { for (auto& pair : m_samplerCache) @@ -115,48 +162,8 @@ MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister // TODO: is it okay to just cast? samplerDescriptor->setCompareFunction(GetMtlCompareFunc((Latte::E_COMPAREFUNC)samplerWords->WORD0.get_DEPTH_COMPARE_FUNCTION())); - // border - auto borderType = samplerWords->WORD0.get_BORDER_COLOR_TYPE(); - - MTL::SamplerBorderColor borderColor; - if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::TRANSPARENT_BLACK) - borderColor = MTL::SamplerBorderColorTransparentBlack; - else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_BLACK) - borderColor = MTL::SamplerBorderColorOpaqueBlack; - else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_WHITE) - borderColor = MTL::SamplerBorderColorOpaqueWhite; - else [[unlikely]] - { - _LatteRegisterSetSamplerBorderColor* borderColorReg; - if (shaderType == LatteConst::ShaderType::Vertex) - borderColorReg = LatteGPUState.contextNew.TD_VS_SAMPLER_BORDER_COLOR + stageSamplerIndex; - else if (shaderType == LatteConst::ShaderType::Pixel) - borderColorReg = LatteGPUState.contextNew.TD_PS_SAMPLER_BORDER_COLOR + stageSamplerIndex; - else // geometry - borderColorReg = 
LatteGPUState.contextNew.TD_GS_SAMPLER_BORDER_COLOR + stageSamplerIndex; - float r = borderColorReg->red.get_channelValue(); - float g = borderColorReg->green.get_channelValue(); - float b = borderColorReg->blue.get_channelValue(); - float a = borderColorReg->alpha.get_channelValue(); - - // Metal doesn't support custom border color - // Let's find the best match - bool opaque = (a == 1.0f); - bool white = (r == 1.0f); - if (opaque) - { - if (white) - borderColor = MTL::SamplerBorderColorOpaqueWhite; - else - borderColor = MTL::SamplerBorderColorOpaqueBlack; - } - else - { - borderColor = MTL::SamplerBorderColorTransparentBlack; - } - - cemuLog_log(LogType::Force, "Custom border color ({}, {}, {}, {}) is not supported on Metal, using {} instead", r, g, b, a, BorderColorToStr(borderColor)); - } + // Border color + auto borderColor = GetBorderColor(shaderType, stageSamplerIndex, samplerWords, true); samplerDescriptor->setBorderColor(borderColor); samplerState = m_mtlr->GetDevice()->newSamplerState(samplerDescriptor); @@ -168,6 +175,19 @@ uint64 MetalSamplerCache::CalculateSamplerHash(const LatteContextRegister& lcr, { const _LatteRegisterSetSampler* samplerWords = lcr.SQ_TEX_SAMPLER + samplerIndex; + uint64 hash = 0; + hash = std::rotl(hash, 17); + hash += (uint64)samplerWords->WORD0.getRawValue(); + hash = std::rotl(hash, 17); + hash += (uint64)samplerWords->WORD1.getRawValue(); + hash = std::rotl(hash, 17); + hash += (uint64)samplerWords->WORD2.getRawValue(); + + auto borderColor = GetBorderColor(shaderType, stageSamplerIndex, samplerWords, true); + + hash = std::rotl(hash, 5); + hash += (uint64)borderColor; + // TODO: check this - return *((uint64*)samplerWords); + return hash; } From 5d17b1e525ab47e356f01f67be0e6684f6d847b2 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 26 Jan 2025 10:46:18 +0100 Subject: [PATCH 362/368] improve sampler border color message --- .../Renderer/Metal/MetalSamplerCache.cpp | 45 ++++++++++++------- 1 file changed, 29 insertions(+), 
16 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp index 79d9b22ab..8dab80b6d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp @@ -3,20 +3,7 @@ #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" -inline const char* BorderColorToStr(MTL::SamplerBorderColor borderColor) -{ - switch (borderColor) - { - case MTL::SamplerBorderColorTransparentBlack: - return "transparent black"; - case MTL::SamplerBorderColorOpaqueBlack: - return "opaque black"; - case MTL::SamplerBorderColorOpaqueWhite: - return "opaque white"; - } -} - -MTL::SamplerBorderColor GetBorderColor(LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords, bool logWorkaround) +MTL::SamplerBorderColor GetBorderColor(LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords, bool logWorkaround = false) { auto borderType = samplerWords->WORD0.get_BORDER_COLOR_TYPE(); @@ -58,7 +45,33 @@ MTL::SamplerBorderColor GetBorderColor(LatteConst::ShaderType shaderType, uint32 } if (logWorkaround) - cemuLog_log(LogType::Force, "Custom border color ({}, {}, {}, {}) is not supported on Metal, using {} instead", r, g, b, a, BorderColorToStr(borderColor)); + { + float newR, newG, newB, newA; + switch (borderColor) + { + case MTL::SamplerBorderColorTransparentBlack: + newR = 0.0f; + newG = 0.0f; + newB = 0.0f; + newA = 0.0f; + break; + case MTL::SamplerBorderColorOpaqueBlack: + newR = 0.0f; + newG = 0.0f; + newB = 0.0f; + newA = 1.0f; + break; + case MTL::SamplerBorderColorOpaqueWhite: + newR = 1.0f; + newG = 1.0f; + newB = 1.0f; + newA = 1.0f; + break; + } + + if (r != newR || g != newG || b != newB || a != newA) + cemuLog_log(LogType::Force, "Custom border color ({}, {}, {}, {}) is not supported on Metal, using ({}, {}, 
{}, {}) instead", r, g, b, a, newR, newG, newB, newA); + } } return borderColor; @@ -183,7 +196,7 @@ uint64 MetalSamplerCache::CalculateSamplerHash(const LatteContextRegister& lcr, hash = std::rotl(hash, 17); hash += (uint64)samplerWords->WORD2.getRawValue(); - auto borderColor = GetBorderColor(shaderType, stageSamplerIndex, samplerWords, true); + auto borderColor = GetBorderColor(shaderType, stageSamplerIndex, samplerWords); hash = std::rotl(hash, 5); hash += (uint64)borderColor; From 8df01528f45cd196d4b404bbb35c41c208629476 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 26 Jan 2025 11:12:28 +0100 Subject: [PATCH 363/368] fix auto position invariance not working --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 93f330c0d..f2a42d236 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -109,6 +109,7 @@ MetalRenderer::MetalRenderer() m_positionInvariance = false; break; } + break; case PositionInvariance::False: m_positionInvariance = false; break; From 15e5e4487ef80d3af262809570f5fafac4868e47 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 26 Jan 2025 13:16:06 +0100 Subject: [PATCH 364/368] determine GPU vendor --- src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index f2a42d236..329e7ee3f 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -148,6 +148,19 @@ MetalRenderer::MetalRenderer() m_device = MTL::CreateSystemDefaultDevice(); } + // Vendor + const char* deviceName = m_device->name()->utf8String(); + if (memcmp(deviceName, "Apple", 5) == 0) + m_vendor = GfxVendor::Apple; + else if 
(memcmp(deviceName, "AMD", 3) == 0) + m_vendor = GfxVendor::AMD; + else if (memcmp(deviceName, "Intel", 5) == 0) + m_vendor = GfxVendor::Intel; + else if (memcmp(deviceName, "NVIDIA", 6) == 0) + m_vendor = GfxVendor::Nvidia; + else + m_vendor = GfxVendor::Generic; + // Feature support m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); m_supportsFramebufferFetch = GetConfig().framebuffer_fetch.GetValue() ? m_device->supportsFamily(MTL::GPUFamilyApple2) : false; From 58a8b708f4317d8a22e90734f4b08ca018a17f05 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Mon, 27 Jan 2025 20:36:34 +0100 Subject: [PATCH 365/368] turn position invariance on for Star Fox Zero --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 55 ++++++++++--------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 329e7ee3f..a3bf1e32b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -62,42 +62,47 @@ MetalRenderer::MetalRenderer() case PositionInvariance::Auto: switch (CafeSystem::GetForegroundTitleId()) { + // Bayonetta + case 0x0005000010157F00: // EUR + case 0x0005000010157E00: // USA + case 0x000500001014DB00: // JPN + // Bayonetta 2 + case 0x0005000010172700: // EUR + case 0x0005000010172600: // USA + // Disney Planes + case 0x0005000010136900: // EUR + case 0x0005000010136A00: // EUR (TODO: check) + case 0x0005000010136B00: // EUR (TODO: check) + case 0x000500001011C500: // USA (TODO: check) + // LEGO STAR WARS: The Force Awakens + case 0x00050000101DAA00: // EUR + case 0x00050000101DAB00: // USA + // Mario Kart 8 + case 0x000500001010ED00: // EUR + case 0x000500001010EC00: // USA + case 0x000500001010EB00: // JPN + case 0x0005000010183A00: // JPN (TODO: check) // Minecraft: Story Mode case 0x000500001020A300: // EUR case 0x00050000101E0100: // USA //case 0x000500001020a200: // USA + // Ninja 
Gaiden 3: Razor's Edge + case 0x0005000010110B00: // EUR + case 0x0005000010139B00: // EUR (TODO: check) + case 0x0005000010110A00: // USA + case 0x0005000010110900: // JPN // Resident Evil: Revelations case 0x000500001012B400: // EUR case 0x000500001012CF00: // USA + // Star Fox Zero + case 0x00050000101B0500: // EUR + case 0x0005000010201C00: // EUR (TODO: check) + case 0x00050000101B0400: // USA + case 0x0005000010201B00: // USA (TODO: check) // The Legend of Zelda: Breath of the Wild case 0x00050000101C9500: // EUR case 0x00050000101C9400: // USA case 0x00050000101C9300: // JPN - // Ninja Gaiden 3: Razor's Edge - case 0x0005000010110B00: // EUR - case 0x0005000010139B00: // EUR (TODO: check) - case 0x0005000010110A00: // USA - case 0x0005000010110900: // JPN - // Mario Kart 8 - case 0x000500001010ED00: // EUR - case 0x000500001010EC00: // USA - case 0x000500001010EB00: // JPN - case 0x0005000010183A00: // JPN (TODO: check) - // Bayonetta 2 - case 0x0005000010172700: // EUR - case 0x0005000010172600: // USA - // LEGO STAR WARS: The Force Awakens - case 0x00050000101DAA00: // EUR - case 0x00050000101DAB00: // USA - // Bayonetta - case 0x0005000010157F00: // EUR - case 0x0005000010157E00: // USA - case 0x000500001014DB00: // JPN - // Disney Planes - case 0x0005000010136900: // EUR - case 0x0005000010136A00: // EUR (TODO: check) - case 0x0005000010136B00: // EUR (TODO: check) - case 0x000500001011C500: // USA (TODO: check) // Wonderful 101 case 0x0005000010135300: // EUR case 0x000500001012DC00: // USA From 05518c01fb578ae68e00c315996a339181ef68fc Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 28 Jan 2025 07:12:10 +0100 Subject: [PATCH 366/368] support max anisotropy overwrite --- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 17 +++++++++++- .../Renderer/Metal/MetalSamplerCache.cpp | 26 ++++--------------- .../Latte/Renderer/Metal/MetalSamplerCache.h | 4 +-- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index a3bf1e32b..fb284b80c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -2068,7 +2068,22 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE MTL::SamplerState* sampler; if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) { - sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, shader->shaderType, stageSamplerIndex); + uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shader->shaderType); + _LatteRegisterSetSampler* samplerWords = LatteGPUState.contextNew.SQ_TEX_SAMPLER + samplerIndex; + + // Overwriting + + // Lod bias + //if (baseTexture->overwriteInfo.hasLodBias) + // samplerWords->WORD1.set_LOD_BIAS(baseTexture->overwriteInfo.lodBias); + //else if (baseTexture->overwriteInfo.hasRelativeLodBias) + // samplerWords->WORD1.set_LOD_BIAS(samplerWords->WORD1.get_LOD_BIAS() + baseTexture->overwriteInfo.relativeLodBias); + + // Max anisotropy + if (baseTexture->overwriteInfo.anisotropicLevel >= 0) + samplerWords->WORD0.set_MAX_ANISO_RATIO(baseTexture->overwriteInfo.anisotropicLevel); + + sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, shader->shaderType, stageSamplerIndex, samplerWords); } else { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp index 8dab80b6d..3a1371a51 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp @@ -86,32 +86,22 @@ MetalSamplerCache::~MetalSamplerCache() m_samplerCache.clear(); } -MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex) +MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister& 
lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords) { - uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shaderType); - - uint64 stateHash = CalculateSamplerHash(lcr, shaderType, stageSamplerIndex, samplerIndex); + uint64 stateHash = CalculateSamplerHash(lcr, shaderType, stageSamplerIndex, samplerWords); auto& samplerState = m_samplerCache[stateHash]; if (samplerState) return samplerState; // Sampler state - const _LatteRegisterSetSampler* samplerWords = lcr.SQ_TEX_SAMPLER + samplerIndex; + NS_STACK_SCOPED MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); // lod uint32 iMinLOD = samplerWords->WORD1.get_MIN_LOD(); uint32 iMaxLOD = samplerWords->WORD1.get_MAX_LOD(); - sint32 iLodBias = samplerWords->WORD1.get_LOD_BIAS(); - - // TODO: uncomment - // apply relative lod bias from graphic pack - //if (baseTexture->overwriteInfo.hasRelativeLodBias) - // iLodBias += baseTexture->overwriteInfo.relativeLodBias; - // apply absolute lod bias from graphic pack - //if (baseTexture->overwriteInfo.hasLodBias) - // iLodBias = baseTexture->overwriteInfo.lodBias; + //sint32 iLodBias = samplerWords->WORD1.get_LOD_BIAS(); auto filterMip = samplerWords->WORD0.get_MIP_FILTER(); if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::NONE) @@ -160,10 +150,6 @@ MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister auto maxAniso = samplerWords->WORD0.get_MAX_ANISO_RATIO(); - // TODO: uncomment - //if (baseTexture->overwriteInfo.anisotropicLevel >= 0) - // maxAniso = baseTexture->overwriteInfo.anisotropicLevel; - if (maxAniso > 0) samplerDescriptor->setMaxAnisotropy(1 << maxAniso); @@ -184,10 +170,8 @@ MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister return samplerState; } -uint64 MetalSamplerCache::CalculateSamplerHash(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 
stageSamplerIndex, uint32 samplerIndex) +uint64 MetalSamplerCache::CalculateSamplerHash(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords) { - const _LatteRegisterSetSampler* samplerWords = lcr.SQ_TEX_SAMPLER + samplerIndex; - uint64 hash = 0; hash = std::rotl(hash, 17); hash += (uint64)samplerWords->WORD0.getRawValue(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h index 17857f0ee..cbb02cf3b 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h @@ -11,12 +11,12 @@ class MetalSamplerCache MetalSamplerCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalSamplerCache(); - MTL::SamplerState* GetSamplerState(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex); + MTL::SamplerState* GetSamplerState(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords); private: class MetalRenderer* m_mtlr; std::map m_samplerCache; - uint64 CalculateSamplerHash(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, uint32 samplerIndex); + uint64 CalculateSamplerHash(const LatteContextRegister& lcr, LatteConst::ShaderType shaderType, uint32 stageSamplerIndex, const _LatteRegisterSetSampler* samplerWords); }; From 1fb9cfd78326c2f95a483d9a627a7f7995aa4437 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 28 Jan 2025 16:41:30 +0100 Subject: [PATCH 367/368] create helper function for rasterization kill --- src/Cafe/HW/Latte/Core/LatteShader.cpp | 16 +------ src/Cafe/HW/Latte/ISA/LatteReg.h | 46 +++++++++++++------ .../LatteDecompilerEmitMSL.cpp | 16 +------ .../Renderer/Metal/MetalPipelineCompiler.cpp | 14 +----- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 10 +--- 5 files 
changed, 37 insertions(+), 65 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index e01645842..3091e079d 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -9,6 +9,7 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency #include "Cafe/GraphicPack/GraphicPack2.h" +#include "HW/Latte/Core/Latte.h" #include "HW/Latte/Renderer/Renderer.h" #include "util/helpers/StringParser.h" #include "config/ActiveSettings.h" @@ -543,20 +544,7 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, if (!usesGeometryShader) { - // Rasterization - bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - - // HACK - if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) - rasterizationEnabled = true; - - const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; - uint32 cullFront = polygonControlReg.get_CULL_FRONT(); - uint32 cullBack = polygonControlReg.get_CULL_BACK(); - if (cullFront && cullBack) - rasterizationEnabled = false; - - if (rasterizationEnabled) + if (LatteGPUState.contextNew.IsRasterizationEnabled()) vsHash += 51ULL; // Vertex fetch diff --git a/src/Cafe/HW/Latte/ISA/LatteReg.h b/src/Cafe/HW/Latte/ISA/LatteReg.h index d1a2a0289..121d4e892 100644 --- a/src/Cafe/HW/Latte/ISA/LatteReg.h +++ b/src/Cafe/HW/Latte/ISA/LatteReg.h @@ -52,20 +52,20 @@ namespace Latte { // same as E_TILEMODE but contains additional options with special meaning TM_LINEAR_GENERAL = 0, - TM_LINEAR_ALIGNED = 1, + TM_LINEAR_ALIGNED = 1, // micro-tiled - TM_1D_TILED_THIN1 = 2, - TM_1D_TILED_THICK = 3, + TM_1D_TILED_THIN1 = 2, + TM_1D_TILED_THICK = 3, // macro-tiled - TM_2D_TILED_THIN1 = 4, - TM_2D_TILED_THIN2 = 5, - TM_2D_TILED_THIN4 = 6, - TM_2D_TILED_THICK = 7, + TM_2D_TILED_THIN1 = 4, + TM_2D_TILED_THIN2 = 5, + 
TM_2D_TILED_THIN4 = 6, + TM_2D_TILED_THICK = 7, - TM_2B_TILED_THIN1 = 8, - TM_2B_TILED_THIN2 = 9, + TM_2B_TILED_THIN1 = 8, + TM_2B_TILED_THIN2 = 9, TM_2B_TILED_THIN4 = 10, TM_2B_TILED_THICK = 11, @@ -179,7 +179,7 @@ namespace Latte HWFMT_4_4_4_4 = 0xB, HWFMT_5_5_5_1 = 0xC, HWFMT_32 = 0xD, - HWFMT_32_FLOAT = 0xE, + HWFMT_32_FLOAT = 0xE, HWFMT_16_16 = 0xF, HWFMT_16_16_FLOAT = 0x10, HWFMT_8_24 = 0x11, @@ -284,7 +284,7 @@ namespace Latte R32_G32_B32_A32_UINT = (HWFMT_32_32_32_32 | FMT_BIT_INT), R32_G32_B32_A32_SINT = (HWFMT_32_32_32_32 | FMT_BIT_INT | FMT_BIT_SIGNED), R32_G32_B32_A32_FLOAT = (HWFMT_32_32_32_32_FLOAT | FMT_BIT_FLOAT), - + // depth D24_S8_UNORM = (HWFMT_8_24), D24_S8_FLOAT = (HWFMT_8_24 | FMT_BIT_FLOAT), @@ -353,7 +353,7 @@ namespace Latte enum GPU_LIMITS { NUM_VERTEX_BUFFERS = 16, - NUM_TEXTURES_PER_STAGE = 18, + NUM_TEXTURES_PER_STAGE = 18, NUM_SAMPLERS_PER_STAGE = 18, // is this 16 or 18? NUM_COLOR_ATTACHMENTS = 8, }; @@ -1579,7 +1579,7 @@ struct LatteContextRegister /* +0x3A4C0 */ _LatteRegisterSetTextureUnit SQ_TEX_START_GS[Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE]; uint8 padding_3A6B8[0x3C000 - 0x3A6B8]; - + /* +0x3C000 */ _LatteRegisterSetSampler SQ_TEX_SAMPLER[18 * 3]; /* +0x3C288 */ @@ -1598,6 +1598,24 @@ struct LatteContextRegister { return (uint32*)hleSpecialState; } + + bool IsRasterizationEnabled() const + { + bool rasterizationEnabled = !PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // GX2SetSpecialState(0, true) enables DX_RASTERIZATION_KILL, but still expects depth writes to happen? 
-> Research which stages are disabled by DX_RASTERIZATION_KILL exactly + // for now we use a workaround: + if (!PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; + + // Culling both front and back faces effectively disables rasterization + uint32 cullFront = PA_SU_SC_MODE_CNTL.get_CULL_FRONT(); + uint32 cullBack = PA_SU_SC_MODE_CNTL.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; + + return rasterizationEnabled; + } }; static_assert(sizeof(LatteContextRegister) == 0x10000 * 4 + 9 * 4); @@ -1664,4 +1682,4 @@ static_assert(offsetof(LatteContextRegister, SQ_PGM_RESOURCES_ES) == Latte::REGA static_assert(offsetof(LatteContextRegister, SQ_PGM_START_GS) == Latte::REGADDR::SQ_PGM_START_GS * 4); static_assert(offsetof(LatteContextRegister, SQ_PGM_RESOURCES_GS) == Latte::REGADDR::SQ_PGM_RESOURCES_GS * 4); static_assert(offsetof(LatteContextRegister, SPI_VS_OUT_CONFIG) == Latte::REGADDR::SPI_VS_OUT_CONFIG * 4); -static_assert(offsetof(LatteContextRegister, LATTE_SPI_VS_OUT_ID_N) == Latte::REGADDR::SPI_VS_OUT_ID_0 * 4); \ No newline at end of file +static_assert(offsetof(LatteContextRegister, LATTE_SPI_VS_OUT_ID_N) == Latte::REGADDR::SPI_VS_OUT_ID_0 * 4); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index c4b50db12..1c7311270 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -3938,21 +3938,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, bool fetchVertexManually = (usesGeometryShader || (shaderContext->fetchShader && shaderContext->fetchShader->mtlFetchVertexManually)); // Rasterization - rasterizationEnabled = true; - if (shader->shaderType == LatteConst::ShaderType::Vertex && !usesGeometryShader) - { - rasterizationEnabled = 
!shaderContext->contextRegistersNew->PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - - // HACK - if (!shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) - rasterizationEnabled = true; - - const auto& polygonControlReg = shaderContext->contextRegistersNew->PA_SU_SC_MODE_CNTL; - uint32 cullFront = polygonControlReg.get_CULL_FRONT(); - uint32 cullBack = polygonControlReg.get_CULL_BACK(); - if (cullFront && cullBack) - rasterizationEnabled = false; - } + rasterizationEnabled = shaderContext->contextRegistersNew->IsRasterizationEnabled(); StringBuf* src = new StringBuf(1024*1024*12); // reserve 12MB for generated source (we resize-to-fit at the end) shaderContext->shaderSource = src; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp index afd63f8b6..1892c257e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -295,19 +295,7 @@ void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, c m_usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); // Rasterization - m_rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - - // HACK - // TODO: include this in the hash? 
- if (!lcr.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) - m_rasterizationEnabled = true; - - // Culling both front and back faces effectively disables rasterization - const auto& polygonControlReg = lcr.PA_SU_SC_MODE_CNTL; - uint32 cullFront = polygonControlReg.get_CULL_FRONT(); - uint32 cullBack = polygonControlReg.get_CULL_BACK(); - if (cullFront && cullBack) - m_rasterizationEnabled = false; + m_rasterizationEnabled = lcr.IsRasterizationEnabled(); // Shaders m_vertexShaderMtl = static_cast(vertexShader->shader); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index fb284b80c..b9e84db56 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1065,15 +1065,7 @@ void MetalRenderer::draw_beginSequence() LatteRenderTarget_updateViewport(); LatteRenderTarget_updateScissorBox(); - // check for conditions which would turn the drawcalls into no-ops - bool rasterizerEnable = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); - - // GX2SetSpecialState(0, true) enables DX_RASTERIZATION_KILL, but still expects depth writes to happen? 
-> Research which stages are disabled by DX_RASTERIZATION_KILL exactly - // for now we use a workaround: - if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) - rasterizerEnable = true; - - if (!rasterizerEnable && !streamoutEnable) + if (!LatteGPUState.contextNew.IsRasterizationEnabled() && !streamoutEnable) m_state.m_skipDrawSequence = true; } From 3fececc3baaab17a440a7be4f9d5a624df7abb43 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 28 Jan 2025 19:23:47 +0100 Subject: [PATCH 368/368] support line strip as vertex output with geometry shaders --- .../LatteDecompilerEmitMSL.cpp | 6 +++- .../LatteDecompilerEmitMSLHeader.hpp | 25 +++------------ .../HW/Latte/Renderer/Metal/MetalCommon.h | 31 +++++++++++++++++++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 27 +++++----------- 4 files changed, 48 insertions(+), 41 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 1c7311270..a2b0bc2d9 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -4140,7 +4140,11 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, if (usesGeometryShader) { // Calculate the imaginary vertex id - src->add("uint vid = tig * VERTICES_PER_VERTEX_PRIMITIVE + tid;" _CRLF); + LattePrimitiveMode vsOutPrimType = shaderContext->contextRegistersNew->VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + if (PrimitiveRequiresConnection(vsOutPrimType)) + src->add("uint vid = tig + tid;" _CRLF); + else + src->add("uint vid = tig * VERTICES_PER_VERTEX_PRIMITIVE + tid;" _CRLF); src->add("uint iid = vid / supportBuffer.verticesPerInstance;" _CRLF); src->add("vid %= supportBuffer.verticesPerInstance;" _CRLF); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp 
b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index ab8906718..967976994 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -2,7 +2,7 @@ #include "Common/precompiled.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" namespace LatteDecompiler { @@ -363,27 +363,10 @@ namespace LatteDecompiler if ((decompilerContext->options->usesGeometryShader || isRectVertexShader) && (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) { - LattePrimitiveMode vsOutPrimType = static_cast(decompilerContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]); - uint32 gsOutPrimType = decompilerContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; + LattePrimitiveMode vsOutPrimType = decompilerContext->contextRegistersNew->VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + src->addFmt("#define VERTICES_PER_VERTEX_PRIMITIVE {}" _CRLF, GetVerticesPerPrimitive(vsOutPrimType)); - switch (vsOutPrimType) - { - case LattePrimitiveMode::POINTS: - src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 1" _CRLF); - break; - case LattePrimitiveMode::LINES: - src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 2" _CRLF); - break; - case LattePrimitiveMode::TRIANGLES: - src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 3" _CRLF); - break; - case LattePrimitiveMode::RECTS: - src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 3" _CRLF); - break; - default: - cemuLog_log(LogType::Force, "Unknown vertex out primitive type {}", vsOutPrimType); - break; - } + uint32 gsOutPrimType = decompilerContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) { switch (gsOutPrimType) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index e858baf0a..b46978ec2 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -117,6 +117,7 @@ inline bool executeCommand(fmt::format_string fmt, T&&... args) { return true; } +/* class MemoryMappedFile { public: @@ -179,3 +180,33 @@ class MemoryMappedFile void* m_data = nullptr; size_t m_fileSize = 0; }; +*/ + +inline uint32 GetVerticesPerPrimitive(LattePrimitiveMode primitiveMode) +{ + switch (primitiveMode) + { + case LattePrimitiveMode::POINTS: + return 1; + case LattePrimitiveMode::LINES: + return 2; + case LattePrimitiveMode::LINE_STRIP: + // Same as line, but requires connection + return 2; + case LattePrimitiveMode::TRIANGLES: + return 3; + case LattePrimitiveMode::RECTS: + return 3; + default: + cemuLog_log(LogType::Force, "Unimplemented primitive type {}", primitiveMode); + return 0; + } +} + +inline bool PrimitiveRequiresConnection(LattePrimitiveMode primitiveMode) +{ + if (primitiveMode == LattePrimitiveMode::LINE_STRIP) + return true; + else + return false; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b9e84db56..02bf91f8e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -1132,7 +1132,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 */ // Primitive type - const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + const LattePrimitiveMode primitiveMode = LatteGPUState.contextNew.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); @@ -1394,25 +1394,14 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 
renderCommandEncoder->setObjectBytes(&hostIndexTypeU8, sizeof(hostIndexTypeU8), vertexShader->resourceMapping.indexTypeBinding); encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.indexTypeBinding] = {nullptr}; - uint32 verticesPerPrimitive = 0; - switch (primitiveMode) - { - case LattePrimitiveMode::POINTS: - verticesPerPrimitive = 1; - break; - case LattePrimitiveMode::LINES: - verticesPerPrimitive = 2; - break; - case LattePrimitiveMode::TRIANGLES: - case LattePrimitiveMode::RECTS: - verticesPerPrimitive = 3; - break; - default: - cemuLog_log(LogType::Force, "unimplemented geometry shader primitive mode {}", (uint32)primitiveMode); - break; - } + uint32 verticesPerPrimitive = GetVerticesPerPrimitive(primitiveMode); + uint32 threadgroupCount = count * instanceCount; + if (PrimitiveRequiresConnection(primitiveMode)) + threadgroupCount -= verticesPerPrimitive - 1; + else + threadgroupCount /= verticesPerPrimitive; - renderCommandEncoder->drawMeshThreadgroups(MTL::Size(count * instanceCount / verticesPerPrimitive, 1, 1), MTL::Size(verticesPerPrimitive, 1, 1), MTL::Size(1, 1, 1)); + renderCommandEncoder->drawMeshThreadgroups(MTL::Size(threadgroupCount, 1, 1), MTL::Size(verticesPerPrimitive, 1, 1), MTL::Size(1, 1, 1)); } else {