[jak2] faster startup (#2738)

Trying to make up for some of the startup speed lost in the SDL transition. This saves about 1s from start (from ~3s), and about 500 MB of RAM. - Faster TIE unpack by merging matrix groups, more efficient vertex transforms, and skipping normal transforms on groups with no normals. - Refactor generic merc and merc to use a single renderer with multiple interfaces, rather than many renderers. Removed "LightningRenderer" as a special thing, but Warp is still special - Add more profiling stuff to startup and the loader. - Remove `SDL_INIT_HAPTIC` - this turned out to be needed for force-feedback steering wheels, and not needed for controller vibration - Switched `vag-player` to use quicksort instead of the default GOAL sort (very slow)
2026-07-04 21:35:47 -04:00 · 2023-06-17 17:23:25 -04:00
parent 1512c6bd9c
commit 6e779d1f1c
27 changed files with 540 additions and 354 deletions
@@ -3,6 +3,8 @@
 #include <algorithm>
 #include <functional>

+#include "xmmintrin.h"
+
 #include "common/util/Assert.h"

 namespace tfrag3 {
@@ -166,6 +168,7 @@ std::array<math::Vector3f, 3> tie_normal_transform_v2(const std::array<math::Vec
  // vmadday.xyzw acc, vf25, vf18
  // vmaddz.xyzw vf12, vf26, vf18
  result[2] = vf18;
+
  return result;
  //
  // sqc2 vf10, -112(t8)
@@ -199,6 +202,22 @@ u32 unpack_tie_normal(const std::array<math::Vector3f, 3>& mat, s8 nx, s8 ny, s8
  return pack_to_gl_normal(as_int.x(), as_int.y(), as_int.z());
 }

+/*
+void tie_normal_v3(__m128* out, const std::array<math::Vector4f, 4>& in) {
+  math::Vector3f x_row = in[0].xyz();
+  math::Vector3f y_row = in[1].xyz();
+  math::Vector3f z_row = in[2].xyz();
+
+  x_row.normalize();
+  y_row = x_row.cross(y_row.cross(x_row)).normalized();
+  z_row = x_row.cross(y_row);
+
+  out[0] = _mm_setr_ps(x_row[0], x_row[1], x_row[2], 0);
+  out[1] = _mm_setr_ps(y_row[0], y_row[1], y_row[2], 0);
+  out[2] = _mm_setr_ps(z_row[0], z_row[1], z_row[2], 0);
+}
+ */
+
 void TieTree::unpack() {
  unpacked.vertices.resize(packed_vertices.color_indices.size());
  size_t i = 0;
@@ -222,24 +241,53 @@ void TieTree::unpack() {
      }
    } else {
      const auto& mat = packed_vertices.matrices[grp.matrix_idx];
-      auto nmat = tie_normal_transform_v2(mat);

-      for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
-        auto& vtx = unpacked.vertices[i];
-        vtx.color_index = packed_vertices.color_indices[i];
-        const auto& proto_vtx = packed_vertices.vertices[src_idx];
-        auto temp = mat[0] * proto_vtx.x + mat[1] * proto_vtx.y + mat[2] * proto_vtx.z + mat[3];
-        vtx.x = temp.x();
-        vtx.y = temp.y();
-        vtx.z = temp.z();
-        vtx.s = proto_vtx.s;
-        vtx.t = proto_vtx.t;
-        vtx.nor = unpack_tie_normal(nmat, proto_vtx.nx, proto_vtx.ny, proto_vtx.nz);
-        vtx.r = proto_vtx.r;
-        vtx.g = proto_vtx.g;
-        vtx.b = proto_vtx.b;
-        vtx.a = proto_vtx.a;
-        i++;
+      __m128 mat0 = _mm_loadu_ps(mat[0].data());
+      __m128 mat1 = _mm_loadu_ps(mat[1].data());
+      __m128 mat2 = _mm_loadu_ps(mat[2].data());
+      __m128 mat3 = _mm_loadu_ps(mat[3].data());
+
+      if (grp.has_normals) {
+        auto nmat = tie_normal_transform_v2(mat);
+        for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
+          auto& vtx = unpacked.vertices[i];
+          vtx.color_index = packed_vertices.color_indices[i];
+          const auto& proto_vtx = packed_vertices.vertices[src_idx];
+          // auto temp = mat[0] * proto_vtx.x + mat[1] * proto_vtx.y + mat[2] * proto_vtx.z +
+          // mat[3];
+          __m128 transformed = mat3;
+          transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.x), mat0));
+          transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.y), mat1));
+          transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.z), mat2));
+          _mm_storeu_ps(&vtx.x, transformed);
+          vtx.s = proto_vtx.s;
+          vtx.t = proto_vtx.t;
+          vtx.nor = unpack_tie_normal(nmat, proto_vtx.nx, proto_vtx.ny, proto_vtx.nz);
+          vtx.r = proto_vtx.r;
+          vtx.g = proto_vtx.g;
+          vtx.b = proto_vtx.b;
+          vtx.a = proto_vtx.a;
+          i++;
+        }
+      } else {
+        for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
+          auto& vtx = unpacked.vertices[i];
+          vtx.color_index = packed_vertices.color_indices[i];
+          const auto& proto_vtx = packed_vertices.vertices[src_idx];
+          __m128 transformed = mat3;
+          transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.x), mat0));
+          transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.y), mat1));
+          transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.z), mat2));
+          _mm_storeu_ps(&vtx.x, transformed);
+          vtx.s = proto_vtx.s;
+          vtx.t = proto_vtx.t;
+          vtx.nor = 0;
+          vtx.r = proto_vtx.r;
+          vtx.g = proto_vtx.g;
+          vtx.b = proto_vtx.b;
+          vtx.a = proto_vtx.a;
+          i++;
+        }
      }
    }
  }