Description
I was updating an old 32-bit project (VS2017) to the latest GLM and noticed a significant performance drop.
Going back through the versions, 0.9.7.6 was the last one that did not have the issue.
I isolated the issue with this example:
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
//#define GLM_FORCE_INLINE
#include "glm/glm.hpp"
#include "glm/gtc/matrix_transform.hpp"
// Short local aliases for the GLM vector and matrix types used below.
using vec4 = glm::vec4;
using mat4 = glm::mat4;
// Benchmark kernel: runs 100000 iterations, each performing eight
// mat4 * vec4 products and accumulating the results.
// Returns the accumulated vector from the final iteration.
vec4 function()
{
    // Magnitude of the x/y/z components of every input vector.
    const float e = 1.01f;

    vec4 total = vec4(0.0f);
    mat4 xform(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);

    for (int iter = 0; iter < 100000; ++iter)
    {
        // Feed the previous result back into the matrix so that every
        // iteration depends on the one before it.
        xform[0] = total;

        // One product per sign combination of (x, y, z), with w fixed at 1.
        vec4 corners[8] =
        {
            xform * vec4(-e, -e, -e, 1.0f), // 0
            xform * vec4(-e, -e, +e, 1.0f), // 1
            xform * vec4(+e, -e, +e, 1.0f), // 2
            xform * vec4(+e, -e, -e, 1.0f), // 3
            xform * vec4(-e, +e, -e, 1.0f), // 4
            xform * vec4(-e, +e, +e, 1.0f), // 5
            xform * vec4(+e, +e, +e, 1.0f), // 6
            xform * vec4(+e, +e, -e, 1.0f)  // 7
        };

        // Summed strictly left to right, in index order.
        total = corners[0] + corners[1] + corners[2] + corners[3]
              + corners[4] + corners[5] + corners[6] + corners[7];
    }

    return total;
}
// Times a single call to function() and prints the elapsed time in
// microseconds followed by the x, y and z components of the result.
// Waits for input on stdin before returning 0.
int main()
{
    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by system clock adjustments; high_resolution_clock gives no such
    // guarantee.
    const std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
    vec4 ret = function();
    const std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
    std::cout << duration << "us (" << ret.x << " " << ret.y << " " << ret.z << ")";

    // Block until a character is available on stdin. cin.get() is provided by
    // <iostream>, whereas getchar() requires <cstdio>, which this file does
    // not include.
    std::cin.get();
    return 0;
}
Using the latest Visual Studio in a Release build with GLM 0.9.9, I get these durations (in microseconds):
GLM 0.9.9.0 - 32 bit - 12000
GLM 0.9.7.6 - 32 bit - 2300
64-bit builds seem unaffected. Comparing the assembly between the two versions, it seems that mat4 * vec4 is no longer being inlined, so every multiplication has to make a function call.
Defining GLM_FORCE_INLINE does make this go away in this example, but using it on an entire application makes plenty of other places perform worse.
I have fiddled with the release settings but found nothing that fixes this. I have not tested any other compilers.
You may not care about this, but mat4 * vec4 is such a common operation that it should be fast in the default case if possible.