Skip to content

Commit 90c56b7

Browse files
authored
Latte: Optimizations and tweaks (#706)
1 parent 323bdfa commit 90c56b7

10 files changed

+742
-402
lines changed

src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp

+672-385
Large diffs are not rendered by default.

src/Cafe/HW/Latte/Core/LatteOverlay.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ struct OverlayStats
2626

2727
double fps{};
2828
uint32 draw_calls_per_frame{};
29+
uint32 fast_draw_calls_per_frame{};
2930
float cpu_usage{}; // cemu cpu usage in %
3031
std::vector<float> cpu_per_core; // global cpu usage in % per core
3132
uint32 ram_usage{}; // ram usage in MB
@@ -86,7 +87,7 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
8687
ImGui::Text("FPS: %.2lf", g_state.fps);
8788

8889
if (config.overlay.drawcalls)
89-
ImGui::Text("Draws/f: %d", g_state.draw_calls_per_frame);
90+
ImGui::Text("Draws/f: %d (fast: %d)", g_state.draw_calls_per_frame, g_state.fast_draw_calls_per_frame);
9091

9192
if (config.overlay.cpu_usage)
9293
ImGui::Text("CPU: %.2lf%%", g_state.cpu_usage);
@@ -588,13 +589,14 @@ static void UpdateStats_CpuPerCore()
588589
}
589590
}
590591

591-
void LatteOverlay_updateStats(double fps, sint32 drawcalls)
592+
void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls)
592593
{
593594
if (GetConfig().overlay.position == ScreenPosition::kDisabled)
594595
return;
595596

596597
g_state.fps = fps;
597598
g_state.draw_calls_per_frame = drawcalls;
599+
g_state.fast_draw_calls_per_frame = fastDrawcalls;
598600
UpdateStats_CemuCpu();
599601
UpdateStats_CpuPerCore();
600602

src/Cafe/HW/Latte/Core/LatteOverlay.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22

33
void LatteOverlay_init();
44
void LatteOverlay_render(bool pad_view);
5-
void LatteOverlay_updateStats(double fps, sint32 drawcalls);
5+
void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls);
66

77
void LatteOverlay_pushNotification(const std::string& text, sint32 duration);

src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp

+5-3
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ void LattePerformanceMonitor_frameEnd()
3838
uint64 indexDataCached = 0;
3939
uint32 frameCounter = 0;
4040
uint32 drawCallCounter = 0;
41+
uint32 fastDrawCallCounter = 0;
4142
uint32 shaderBindCounter = 0;
4243
uint32 recompilerLeaveCount = 0;
4344
uint32 threadLeaveCount = 0;
@@ -53,6 +54,7 @@ void LattePerformanceMonitor_frameEnd()
5354
indexDataCached += performanceMonitor.cycle[i].indexDataCached;
5455
frameCounter += performanceMonitor.cycle[i].frameCounter;
5556
drawCallCounter += performanceMonitor.cycle[i].drawCallCounter;
57+
fastDrawCallCounter += performanceMonitor.cycle[i].fastDrawCallCounter;
5658
shaderBindCounter += performanceMonitor.cycle[i].shaderBindCount;
5759
recompilerLeaveCount += performanceMonitor.cycle[i].recompilerLeaveCount;
5860
threadLeaveCount += performanceMonitor.cycle[i].threadLeaveCount;
@@ -75,7 +77,6 @@ void LattePerformanceMonitor_frameEnd()
7577
indexDataUploadPerFrame /= 1024ULL;
7678

7779
double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
78-
uint32 drawCallsPerFrame = drawCallCounter / elapsedFrames;
7980
uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
8081
passedCycles = passedCycles * 1000ULL / totalElapsedTime;
8182
uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
@@ -85,6 +86,7 @@ void LattePerformanceMonitor_frameEnd()
8586
// next counter cycle
8687
sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
8788
performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;
89+
performanceMonitor.cycle[nextCycleIndex].fastDrawCallCounter = 0;
8890
performanceMonitor.cycle[nextCycleIndex].frameCounter = 0;
8991
performanceMonitor.cycle[nextCycleIndex].shaderBindCount = 0;
9092
performanceMonitor.cycle[nextCycleIndex].lastCycleCount = PPCInterpreter_getMainCoreCycleCounter();
@@ -104,12 +106,12 @@ void LattePerformanceMonitor_frameEnd()
104106

105107
if (isFirstUpdate)
106108
{
107-
LatteOverlay_updateStats(0.0, 0);
109+
LatteOverlay_updateStats(0.0, 0, 0);
108110
gui_updateWindowTitles(false, false, 0.0);
109111
}
110112
else
111113
{
112-
LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames);
114+
LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames, fastDrawCallCounter / elapsedFrames);
113115
gui_updateWindowTitles(false, false, fps);
114116
}
115117
}

src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h

+1
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ typedef struct
8484
uint32 lastUpdate;
8585
uint32 frameCounter;
8686
uint32 drawCallCounter;
87+
uint32 fastDrawCallCounter;
8788
uint32 shaderBindCount;
8889
uint64 vertexDataUploaded; // amount of vertex data uploaded to GPU (bytes)
8990
uint64 vertexDataCached; // amount of vertex data reused from GPU cache (bytes)

src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp

+28
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,34 @@ LatteTextureView* LatteMRT::GetColorAttachmentTexture(uint32 index, bool createN
295295
uint32 colorBufferHeight = pitchHeight / colorBufferPitch;
296296
uint32 colorBufferWidth = colorBufferPitch;
297297

298+
// colorbuffer width/height has to be padded to 8/32 alignment but the actual resolution might be smaller
299+
// use the scissor box as a clue to figure out the original resolution if possible
300+
#if 0
301+
uint32 scissorBoxWidth = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_X();
302+
uint32 scissorBoxHeight = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_Y();
303+
if (((scissorBoxWidth + 7) & ~7) == colorBufferWidth)
304+
colorBufferWidth = scissorBoxWidth;
305+
if (((colorBufferHeight + 31) & ~31) == colorBufferHeight)
306+
colorBufferHeight = scissorBoxHeight;
307+
#endif
308+
309+
// log resolution changes if the above heuristic takes effect
310+
// this is useful to find resolutions which need to be updated in gfx pack texture rules
311+
#if 0
312+
uint32 colorBufferHeight2 = pitchHeight / colorBufferPitch;
313+
static std::unordered_set<uint64> s_foundColorBufferResMappings;
314+
if (colorBufferPitch != colorBufferWidth || colorBufferHeight != colorBufferHeight2)
315+
{
316+
// only log each unique source/dest resolution pair once. Encode into a key with 16 bits per component
317+
uint64 resHash = (uint64)colorBufferWidth | ((uint64)colorBufferHeight << 16) | ((uint64)colorBufferPitch << 32) | ((uint64)colorBufferHeight2 << 48);
318+
if( !s_foundColorBufferResMappings.contains(resHash) )
319+
{
320+
s_foundColorBufferResMappings.insert(resHash);
321+
cemuLog_log(LogType::Force, "[COLORBUFFER-DBG] Using res {}x{} instead of {}x{}", colorBufferWidth, colorBufferHeight, colorBufferPitch, colorBufferHeight2);
322+
}
323+
}
324+
#endif
325+
298326
bool colorBufferWasFound = false;
299327
sint32 viewFirstMip = 0; // todo
300328

src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp

+28-9
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@
88
#include "Cafe/HW/Latte/Core/LatteTexture.h"
99
#include "Cafe/HW/Latte/Renderer/OpenGL/LatteTextureViewGL.h"
1010

11-
// #define LOG_READBACK_TIME
11+
//#define LOG_READBACK_TIME
1212

1313
struct LatteTextureReadbackQueueEntry
1414
{
15+
HRTick initiateTime;
1516
uint32 lastUpdateDrawcallIndex;
1617
LatteTextureView* textureView;
1718
};
@@ -22,12 +23,12 @@ std::queue<LatteTextureReadbackInfo*> sTextureActiveReadbackQueue; // readbacks
2223
void LatteTextureReadback_StartTransfer(LatteTextureView* textureView)
2324
{
2425
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Start] PhysAddr {:08x} Res {}x{} Fmt {} Slice {} Mip {}", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->format, textureView->firstSlice, textureView->firstMip);
26+
HRTick currentTick = HighResolutionTimer().now().getTick();
2527
// create info entry and store in ordered linked list
2628
LatteTextureReadbackInfo* readbackInfo = g_renderer->texture_createReadback(textureView);
2729
sTextureActiveReadbackQueue.push(readbackInfo);
2830
readbackInfo->StartTransfer();
29-
//debug_printf("[Tex-Readback] %08x %dx%d TM %d FMT %04x\n", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->tileMode, textureView->baseTexture->format);
30-
readbackInfo->transferStartTime = HighResolutionTimer().now().getTick();
31+
readbackInfo->transferStartTime = currentTick;
3132
}
3233

3334
/*
@@ -41,9 +42,15 @@ bool LatteTextureReadback_Update(bool forceStart)
4142
for (size_t i = 0; i < sTextureScheduledReadbacks.size(); i++)
4243
{
4344
LatteTextureReadbackQueueEntry& entry = sTextureScheduledReadbacks[i];
44-
uint32 numPassedDrawcalls = LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex;
45-
if (forceStart || numPassedDrawcalls >= 5)
45+
uint32 numElapsedDrawcalls = LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex;
46+
if (forceStart || numElapsedDrawcalls >= 5)
4647
{
48+
#ifdef LOG_READBACK_TIME
49+
double elapsedSecondsSinceInitiate = HighResolutionTimer::getTimeDiff(entry.initiateTime, HighResolutionTimer().now().getTick());
50+
char initiateElapsedTimeStr[32];
51+
sprintf(initiateElapsedTimeStr, "%.4lfms", elapsedSecondsSinceInitiate);
52+
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, initiateElapsedTimeStr, forceStart?"yes":"no");
53+
#endif
4754
LatteTextureReadback_StartTransfer(entry.textureView);
4855
// remove element
4956
vectorRemoveByIndex(sTextureScheduledReadbacks, i);
@@ -91,6 +98,7 @@ void LatteTextureReadback_Initate(LatteTextureView* textureView)
9198
}
9299
// queue
93100
LatteTextureReadbackQueueEntry queueEntry;
101+
queueEntry.initiateTime = HighResolutionTimer().now().getTick();
94102
queueEntry.textureView = textureView;
95103
queueEntry.lastUpdateDrawcallIndex = LatteGPUState.drawCallCounter;
96104
sTextureScheduledReadbacks.emplace_back(queueEntry);
@@ -112,6 +120,14 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
112120
if (!readbackInfo->IsFinished())
113121
{
114122
readbackInfo->waitStartTime = HighResolutionTimer().now().getTick();
123+
#ifdef LOG_READBACK_TIME
124+
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
125+
{
126+
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, HighResolutionTimer().now().getTick());
127+
forceLog_printf("[Texture-Readback] Force-finish: %08x Res %4d/%4d TM %d FMT %04x Transfer time so far: %.4lfms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0);
128+
}
129+
#endif
130+
readbackInfo->forceFinish = true;
115131
readbackInfo->ForceFinish();
116132
// rerun logic since ->ForceFinish() can recursively call this function and thus modify the queue
117133
continue;
@@ -125,10 +141,13 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
125141
}
126142
// performance testing
127143
#ifdef LOG_READBACK_TIME
128-
HRTick currentTick = HighResolutionTimer().now().getTick();
129-
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
130-
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
131-
cemuLog_log(LogType::Force, "[Texture-Readback] {:08x} Res {:4}/{:4} TM {} FMT {:04x} ReadbackLatency: {:6.3}ms WaitTime: {:6.3}ms ForcedWait {}", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, forceFinish?"yes":"no");
144+
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
145+
{
146+
HRTick currentTick = HighResolutionTimer().now().getTick();
147+
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
148+
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
149+
forceLog_printf("[Texture-Readback] %08x Res %4d/%4d TM %d FMT %04x ReadbackLatency: %6.3lfms WaitTime: %6.3lfms ForcedWait %s", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no");
150+
}
132151
#endif
133152
uint8* pixelData = readbackInfo->GetData();
134153
LatteTextureLoader_writeReadbackTextureToMemory(&readbackInfo->hostTextureCopy, 0, 0, pixelData);

src/Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class LatteTextureReadbackInfo
2121

2222
HRTick transferStartTime;
2323
HRTick waitStartTime;
24+
bool forceFinish{ false }; // set to true if not finished in time for dependent operation
2425
// texture info
2526
LatteTextureDefinition hostTextureCopy{};
2627

src/Cafe/HW/Latte/ISA/LatteReg.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,7 @@ namespace Latte
484484
SQ_TEX_RESOURCE_WORD0_N_GS = 0xE930,
485485
SQ_TEX_RESOURCE_WORD_FIRST = SQ_TEX_RESOURCE_WORD0_N_PS,
486486
SQ_TEX_RESOURCE_WORD_LAST = (SQ_TEX_RESOURCE_WORD0_N_GS + GPU_LIMITS::NUM_TEXTURES_PER_STAGE * 7 - 1),
487-
// there are 54 samplers with 3 registers each. 18 per stage. For stage indices see SAMPLER_BASE_INDEX_*
487+
// there are 54 samplers with 3 registers each. 18 (actually only 16?) per stage. For stage indices see SAMPLER_BASE_INDEX_*
488488
SQ_TEX_SAMPLER_WORD0_0 = 0xF000,
489489
SQ_TEX_SAMPLER_WORD1_0 = 0xF001,
490490
SQ_TEX_SAMPLER_WORD2_0 = 0xF002,

src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -2002,7 +2002,7 @@ void VulkanRenderer::SubmitCommandBuffer(VkSemaphore signalSemaphore, VkSemaphor
20022002
occlusionQuery_notifyBeginCommandBuffer();
20032003

20042004
m_recordedDrawcalls = 0;
2005-
m_submitThreshold = 500; // this used to be 750 before 1.25.5, but more frequent submission is actually better for latency
2005+
m_submitThreshold = 300;
20062006
m_submitOnIdle = false;
20072007
}
20082008

0 commit comments

Comments
 (0)