[jak2] faster startup (#2738)

Trying to make up for some of the startup speed lost in the SDL
transition. This saves about 1s from start (from ~3s), and about 500 MB
of RAM.

- Faster TIE unpack by merging matrix groups, more efficient vertex
transforms, and skipping normal transforms on groups with no normals.
- Refactor generic merc and merc to use a single renderer with multiple
interfaces, rather than many renderers. Removed "LightningRenderer" as a
special thing, but Warp is still special
- Add more profiling stuff to startup and the loader.
- Remove `SDL_INIT_HAPTIC` - this turned out to be needed for
force-feedback steering wheels, and not needed for controller vibration
- Switched `vag-player` to use quicksort instead of the default GOAL
sort (very slow)
This commit is contained in:
water111
2023-06-17 17:23:25 -04:00
committed by GitHub
parent 1512c6bd9c
commit 6e779d1f1c
27 changed files with 540 additions and 354 deletions
+65 -17
View File
@@ -3,6 +3,8 @@
#include <algorithm>
#include <functional>
#include "xmmintrin.h"
#include "common/util/Assert.h"
namespace tfrag3 {
@@ -166,6 +168,7 @@ std::array<math::Vector3f, 3> tie_normal_transform_v2(const std::array<math::Vec
// vmadday.xyzw acc, vf25, vf18
// vmaddz.xyzw vf12, vf26, vf18
result[2] = vf18;
return result;
//
// sqc2 vf10, -112(t8)
@@ -199,6 +202,22 @@ u32 unpack_tie_normal(const std::array<math::Vector3f, 3>& mat, s8 nx, s8 ny, s8
return pack_to_gl_normal(as_int.x(), as_int.y(), as_int.z());
}
/*
void tie_normal_v3(__m128* out, const std::array<math::Vector4f, 4>& in) {
math::Vector3f x_row = in[0].xyz();
math::Vector3f y_row = in[1].xyz();
math::Vector3f z_row = in[2].xyz();
x_row.normalize();
y_row = x_row.cross(y_row.cross(x_row)).normalized();
z_row = x_row.cross(y_row);
out[0] = _mm_setr_ps(x_row[0], x_row[1], x_row[2], 0);
out[1] = _mm_setr_ps(y_row[0], y_row[1], y_row[2], 0);
out[2] = _mm_setr_ps(z_row[0], z_row[1], z_row[2], 0);
}
*/
void TieTree::unpack() {
unpacked.vertices.resize(packed_vertices.color_indices.size());
size_t i = 0;
@@ -222,24 +241,53 @@ void TieTree::unpack() {
}
} else {
const auto& mat = packed_vertices.matrices[grp.matrix_idx];
auto nmat = tie_normal_transform_v2(mat);
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
auto& vtx = unpacked.vertices[i];
vtx.color_index = packed_vertices.color_indices[i];
const auto& proto_vtx = packed_vertices.vertices[src_idx];
auto temp = mat[0] * proto_vtx.x + mat[1] * proto_vtx.y + mat[2] * proto_vtx.z + mat[3];
vtx.x = temp.x();
vtx.y = temp.y();
vtx.z = temp.z();
vtx.s = proto_vtx.s;
vtx.t = proto_vtx.t;
vtx.nor = unpack_tie_normal(nmat, proto_vtx.nx, proto_vtx.ny, proto_vtx.nz);
vtx.r = proto_vtx.r;
vtx.g = proto_vtx.g;
vtx.b = proto_vtx.b;
vtx.a = proto_vtx.a;
i++;
__m128 mat0 = _mm_loadu_ps(mat[0].data());
__m128 mat1 = _mm_loadu_ps(mat[1].data());
__m128 mat2 = _mm_loadu_ps(mat[2].data());
__m128 mat3 = _mm_loadu_ps(mat[3].data());
if (grp.has_normals) {
auto nmat = tie_normal_transform_v2(mat);
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
auto& vtx = unpacked.vertices[i];
vtx.color_index = packed_vertices.color_indices[i];
const auto& proto_vtx = packed_vertices.vertices[src_idx];
// auto temp = mat[0] * proto_vtx.x + mat[1] * proto_vtx.y + mat[2] * proto_vtx.z +
// mat[3];
__m128 transformed = mat3;
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.x), mat0));
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.y), mat1));
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.z), mat2));
_mm_storeu_ps(&vtx.x, transformed);
vtx.s = proto_vtx.s;
vtx.t = proto_vtx.t;
vtx.nor = unpack_tie_normal(nmat, proto_vtx.nx, proto_vtx.ny, proto_vtx.nz);
vtx.r = proto_vtx.r;
vtx.g = proto_vtx.g;
vtx.b = proto_vtx.b;
vtx.a = proto_vtx.a;
i++;
}
} else {
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
auto& vtx = unpacked.vertices[i];
vtx.color_index = packed_vertices.color_indices[i];
const auto& proto_vtx = packed_vertices.vertices[src_idx];
__m128 transformed = mat3;
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.x), mat0));
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.y), mat1));
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.z), mat2));
_mm_storeu_ps(&vtx.x, transformed);
vtx.s = proto_vtx.s;
vtx.t = proto_vtx.t;
vtx.nor = 0;
vtx.r = proto_vtx.r;
vtx.g = proto_vtx.g;
vtx.b = proto_vtx.b;
vtx.a = proto_vtx.a;
i++;
}
}
}
}