mirror of
https://github.com/open-goal/jak-project
synced 2026-07-04 21:35:47 -04:00
[jak2] faster startup (#2738)
Trying to make up for some of the startup speed lost in the SDL transition. This saves about 1s from start (from ~3s), and about 500 MB of RAM. - Faster TIE unpack by merging matrix groups, more efficient vertex transforms, and skipping normal transforms on groups with no normals. - Refactor generic merc and merc to use a single renderer with multiple interfaces, rather than many renderers. Removed "LightningRenderer" as a special thing, but Warp is still special - Add more profiling stuff to startup and the loader. - Remove `SDL_INIT_HAPTIC` - this turned out to be needed for force-feedback steering wheels, and not needed for controller vibration - Switched `vag-player` to use quicksort instead of the default GOAL sort (very slow)
This commit is contained in:
@@ -3,6 +3,8 @@
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
|
||||
#include "xmmintrin.h"
|
||||
|
||||
#include "common/util/Assert.h"
|
||||
|
||||
namespace tfrag3 {
|
||||
@@ -166,6 +168,7 @@ std::array<math::Vector3f, 3> tie_normal_transform_v2(const std::array<math::Vec
|
||||
// vmadday.xyzw acc, vf25, vf18
|
||||
// vmaddz.xyzw vf12, vf26, vf18
|
||||
result[2] = vf18;
|
||||
|
||||
return result;
|
||||
//
|
||||
// sqc2 vf10, -112(t8)
|
||||
@@ -199,6 +202,22 @@ u32 unpack_tie_normal(const std::array<math::Vector3f, 3>& mat, s8 nx, s8 ny, s8
|
||||
return pack_to_gl_normal(as_int.x(), as_int.y(), as_int.z());
|
||||
}
|
||||
|
||||
/*
|
||||
void tie_normal_v3(__m128* out, const std::array<math::Vector4f, 4>& in) {
|
||||
math::Vector3f x_row = in[0].xyz();
|
||||
math::Vector3f y_row = in[1].xyz();
|
||||
math::Vector3f z_row = in[2].xyz();
|
||||
|
||||
x_row.normalize();
|
||||
y_row = x_row.cross(y_row.cross(x_row)).normalized();
|
||||
z_row = x_row.cross(y_row);
|
||||
|
||||
out[0] = _mm_setr_ps(x_row[0], x_row[1], x_row[2], 0);
|
||||
out[1] = _mm_setr_ps(y_row[0], y_row[1], y_row[2], 0);
|
||||
out[2] = _mm_setr_ps(z_row[0], z_row[1], z_row[2], 0);
|
||||
}
|
||||
*/
|
||||
|
||||
void TieTree::unpack() {
|
||||
unpacked.vertices.resize(packed_vertices.color_indices.size());
|
||||
size_t i = 0;
|
||||
@@ -222,24 +241,53 @@ void TieTree::unpack() {
|
||||
}
|
||||
} else {
|
||||
const auto& mat = packed_vertices.matrices[grp.matrix_idx];
|
||||
auto nmat = tie_normal_transform_v2(mat);
|
||||
|
||||
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
|
||||
auto& vtx = unpacked.vertices[i];
|
||||
vtx.color_index = packed_vertices.color_indices[i];
|
||||
const auto& proto_vtx = packed_vertices.vertices[src_idx];
|
||||
auto temp = mat[0] * proto_vtx.x + mat[1] * proto_vtx.y + mat[2] * proto_vtx.z + mat[3];
|
||||
vtx.x = temp.x();
|
||||
vtx.y = temp.y();
|
||||
vtx.z = temp.z();
|
||||
vtx.s = proto_vtx.s;
|
||||
vtx.t = proto_vtx.t;
|
||||
vtx.nor = unpack_tie_normal(nmat, proto_vtx.nx, proto_vtx.ny, proto_vtx.nz);
|
||||
vtx.r = proto_vtx.r;
|
||||
vtx.g = proto_vtx.g;
|
||||
vtx.b = proto_vtx.b;
|
||||
vtx.a = proto_vtx.a;
|
||||
i++;
|
||||
__m128 mat0 = _mm_loadu_ps(mat[0].data());
|
||||
__m128 mat1 = _mm_loadu_ps(mat[1].data());
|
||||
__m128 mat2 = _mm_loadu_ps(mat[2].data());
|
||||
__m128 mat3 = _mm_loadu_ps(mat[3].data());
|
||||
|
||||
if (grp.has_normals) {
|
||||
auto nmat = tie_normal_transform_v2(mat);
|
||||
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
|
||||
auto& vtx = unpacked.vertices[i];
|
||||
vtx.color_index = packed_vertices.color_indices[i];
|
||||
const auto& proto_vtx = packed_vertices.vertices[src_idx];
|
||||
// auto temp = mat[0] * proto_vtx.x + mat[1] * proto_vtx.y + mat[2] * proto_vtx.z +
|
||||
// mat[3];
|
||||
__m128 transformed = mat3;
|
||||
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.x), mat0));
|
||||
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.y), mat1));
|
||||
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.z), mat2));
|
||||
_mm_storeu_ps(&vtx.x, transformed);
|
||||
vtx.s = proto_vtx.s;
|
||||
vtx.t = proto_vtx.t;
|
||||
vtx.nor = unpack_tie_normal(nmat, proto_vtx.nx, proto_vtx.ny, proto_vtx.nz);
|
||||
vtx.r = proto_vtx.r;
|
||||
vtx.g = proto_vtx.g;
|
||||
vtx.b = proto_vtx.b;
|
||||
vtx.a = proto_vtx.a;
|
||||
i++;
|
||||
}
|
||||
} else {
|
||||
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
|
||||
auto& vtx = unpacked.vertices[i];
|
||||
vtx.color_index = packed_vertices.color_indices[i];
|
||||
const auto& proto_vtx = packed_vertices.vertices[src_idx];
|
||||
__m128 transformed = mat3;
|
||||
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.x), mat0));
|
||||
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.y), mat1));
|
||||
transformed = _mm_add_ps(transformed, _mm_mul_ps(_mm_set1_ps(proto_vtx.z), mat2));
|
||||
_mm_storeu_ps(&vtx.x, transformed);
|
||||
vtx.s = proto_vtx.s;
|
||||
vtx.t = proto_vtx.t;
|
||||
vtx.nor = 0;
|
||||
vtx.r = proto_vtx.r;
|
||||
vtx.g = proto_vtx.g;
|
||||
vtx.b = proto_vtx.b;
|
||||
vtx.a = proto_vtx.a;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user