-
Notifications
You must be signed in to change notification settings - Fork 310
Speed comparison of vec3 vs vec4 arithmetic
The code under comparison is based on galaxy.cpp (the point-to-quad conversion in the render function).
Four modes were tested:
4 → 4 | Both Blob::position and GalaxyVertex::position are vec4, all calculations done using vec4 |
3 → 3 | Both Blob::position and GalaxyVertex::position are vec3, calculations done using vec3 |
4 → 3 | Blob::position is vec4, GalaxyVertex::position is vec3, calculations done using vec4, results then stored to vec3 |
3 → 4 → 3 | Both stored as vec3 but for calculation converted to vec4 |
The test program was compiled in 3 modes for ARM64 (Oracle cloud VM):
- -O2
- -O3
- -O3 -march=native
And in 5 modes for Intel (i5 mobile 8th gen):
- -O2
- -O2 -msse
- -O3
- -O3 -march=native -msse4
- -O3 -march=native -mavx
Results (ms):
CPU | Optimization | 4 → 4 | 4 → 3 | 3 → 3 | 3 → 4 → 3 |
ARM | -O2 | 15050.80 | 10115.40 | 9513.18 | 10185.00 |
ARM | -O3 | 7760.55 | 8150.00 | 8403.77 | 8846.66 |
ARM | -O3 -march=native | 7768.69 | 8082.59 | 8366.02 | 8818.15 |
Intel | -O2 | 10093.30 | 10476.60 | 10367.00 | 10387.40 |
Intel | -O2 -msse | 9992.38 | 10496.80 | 10416.40 | 10426.10 |
Intel | -O3 | 9711.28 | 11479.80 | 11213.50 | 11108.90 |
Intel | -O3 -march=native -msse4 | 9522.09 | 10475.90 | 11250.10 | 11687.20 |
Intel | -O3 -march=native -mavx | 9515.35 | 10484.40 | 11100.40 | 11623.70 |
Conclusion:
- For code targeting new hardware with heavy optimizations and vector extensions enabled prefer vec4 and then convert to vec3 if required. Generic ARM64 prefers vec3.
- For code targeting old hardware, or built without vector extensions and with medium optimizations, vec3 is sufficient (the performance drop is less than 3%) and behaves better than vec4 math followed by conversion to vec3.
- Conversion from vec3 to vec4 in simple calculations doesn’t give any speedup.
Test code:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <Eigen/Core>
#ifndef USEVEC3OUT
#error “Define USEVEC3OUT 0/1”
#endif
#ifndef USEVEC3IN
#error “Define USEVEC3IN 0/1”
#endif
using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::chrono::milliseconds;
// Input element of the benchmark: a position plus two byte-sized attributes.
// The position is a 3-component vector when USEVEC3IN is non-zero and a
// 4-component vector otherwise.
struct Blob
{
#if USEVEC3IN
    Eigen::Vector3f position;
#else
    Eigen::Vector4f position;
#endif
    std::uint8_t colorIndex;
    std::uint8_t brightness;
};
// Output element of the benchmark: one quad vertex.
// The position is a 3-component vector when USEVEC3OUT is non-zero and a
// 4-component vector otherwise.
struct GalaxyVertex
{
#if USEVEC3OUT
    Eigen::Vector3f position;
#else
    Eigen::Vector4f position;
#endif
    // Four packed bytes:
    //   texCoord.x = x, texCoord.y = y, texCoord.z = color index, texCoord.w = alpha
    Eigen::Matrix<std::uint8_t, 4, 1> texCoord;
};
// Expands every blob into a quad of four vertices.
//
//   m          4x4 transform applied to each blob position
//   viewMat    3x3 matrix applied to the four corner offsets
//   blobs      input array of N blobs (only position and colorIndex are read)
//   size       scale factor applied to the corner offsets
//   alpha      byte written into texCoord.w of every produced vertex
//   g_vertices output array; elements [0, 4 * N) are overwritten
//   N          number of blobs
//
// Vertex k of a quad gets the transformed blob position plus corner offset k,
// and texture bytes (0,0), (255,0), (255,255), (0,255) for k = 0..3.
void fn(const Eigen::Matrix4f &m, const Eigen::Matrix3f &viewMat, Blob blobs[], float size, uint8_t alpha, GalaxyVertex g_vertices[], std::size_t N)
{
    // Corner offsets depend only on viewMat and size, so they are computed once.
#if USEVEC3IN
    Eigen::Vector3f corner0 = viewMat * Eigen::Vector3f(-1, -1, 0) * size;
    Eigen::Vector3f corner1 = viewMat * Eigen::Vector3f( 1, -1, 0) * size;
    Eigen::Vector3f corner2 = viewMat * Eigen::Vector3f( 1, 1, 0) * size;
    Eigen::Vector3f corner3 = viewMat * Eigen::Vector3f(-1, 1, 0) * size;
#else
    // vec4 arithmetic: the offsets live in the first three components, w stays 0.
    Eigen::Vector4f corner0 = Eigen::Vector4f::Zero();
    Eigen::Vector4f corner1 = Eigen::Vector4f::Zero();
    Eigen::Vector4f corner2 = Eigen::Vector4f::Zero();
    Eigen::Vector4f corner3 = Eigen::Vector4f::Zero();
    corner0.head(3) = viewMat * Eigen::Vector3f(-1, -1, 0) * size;
    corner1.head(3) = viewMat * Eigen::Vector3f( 1, -1, 0) * size;
    corner2.head(3) = viewMat * Eigen::Vector3f( 1, 1, 0) * size;
    corner3.head(3) = viewMat * Eigen::Vector3f(-1, 1, 0) * size;
#endif

    for (std::size_t i = 0; i < N; ++i)
    {
        Blob &blob = blobs[i];
        // Blob i owns output vertices 4*i .. 4*i + 3.
        GalaxyVertex *quad = g_vertices + 4 * i;

#if USEVEC3IN
        // vec3 storage: widen to vec4 for the 4x4 transform, then drop w again.
        Eigen::Vector3f center = (m * Eigen::Vector4f(blob.position.x(), blob.position.y(), blob.position.z(), 1.0f)).head(3);
#else
        Eigen::Vector4f center = m * blob.position;
#endif

#if USEVEC3OUT && !USEVEC3IN
        // vec4 arithmetic, vec3 storage: keep only the first three components.
        quad[0] = { (center + corner0).head(3), { std::uint8_t(0), std::uint8_t(0), blob.colorIndex, alpha } };
        quad[1] = { (center + corner1).head(3), { std::uint8_t(255), std::uint8_t(0), blob.colorIndex, alpha } };
        quad[2] = { (center + corner2).head(3), { std::uint8_t(255), std::uint8_t(255), blob.colorIndex, alpha } };
        quad[3] = { (center + corner3).head(3), { std::uint8_t(0), std::uint8_t(255), blob.colorIndex, alpha } };
#else
        // Arithmetic and storage use the same vector width: store the sum directly.
        quad[0] = { center + corner0, { std::uint8_t(0), std::uint8_t(0), blob.colorIndex, alpha } };
        quad[1] = { center + corner1, { std::uint8_t(255), std::uint8_t(0), blob.colorIndex, alpha } };
        quad[2] = { center + corner2, { std::uint8_t(255), std::uint8_t(255), blob.colorIndex, alpha } };
        quad[3] = { center + corner3, { std::uint8_t(0), std::uint8_t(255), blob.colorIndex, alpha } };
#endif
    }
}
int main()
{
Eigen::Matrix4f m(Eigen::Matrix4f::Identity());
Eigen::Matrix3f viewMat(Eigen::Matrix3f::Identity());
constexpr std::size_t N = 1024 * 10;
Blob *blobs = new Blob[N];
for (std::size_t i = 0; i < N; i++)
{
#if USEVEC3IN
Eigen::Vector3f position(i / 3.0, i / 2.0f, i / 3.0f);
#else
Eigen::Vector4f position(i / 5.0f, i / 4.0f, i / 3.0f, 1.0f);
#endif
blobs[i] = {position, i%15, i%17};
}
GalaxyVertex g_vertices = new GalaxyVertex[N4];
#if USEVEC3OUT
#if USEVEC3IN
std::cout << “vec3 → vec3\n”;
#else
std::cout << “vec4 → vec3\n”;
#endif
#else
std::cout << “vec4 → vec4\n”;
#endif
auto t1 = high_resolution_clock::now();
for (int i = 0; i < 100000; i++)
fn(m, viewMat, blobs, 10.1f, 16, g_vertices, N);
auto t2 = high_resolution_clock::now();
/* Getting number of milliseconds as a double. */
duration<double, std::milli> ms_double = t2 – t1;
std::cout << ms_double.count() << “\n”;
return 0;
}
To compile:
g++ -I /usr/include/eigen3 -o galaxy-test-4-3-sse -O2 -msse galaxy-test.cc -DUSEVEC3OUT=1 -DUSEVEC3IN=0