Update aurora & flower/grass draw batching

2026-06-16 05:55:37 -04:00 · 2026-06-13 10:40:57 -06:00
parent 7c5ed6a0e1
commit 00707024bb
10 changed files with 909 additions and 296 deletions
@@ -7,265 +7,11 @@
 #include "JSystem/JKernel/JKRHeap.h"

 #if TARGET_PC
-#include <algorithm>
+#include <aurora/dl.hpp>
 #include <tracy/Tracy.hpp>
-#include <vector>
-#include "dusk/logging.h"

 namespace {

-u16 read_be16(const u8* data) {
-    return (u16(data[0]) << 8) | data[1];
-}
-
-void append_be16(std::vector<u8>& out, u16 value) {
-    out.push_back(value >> 8);
-    out.push_back(value & 0xFF);
-}
-
-void append_bytes(std::vector<u8>& out, const u8* data, u32 size) {
-    out.insert(out.end(), data, data + size);
-}
-
-bool is_matrix_idx_attr(GXAttr attr) {
-    return attr >= GX_VA_PNMTXIDX && attr <= GX_VA_TEX7MTXIDX;
-}
-
-bool is_draw_opcode(u8 opcode) {
-    return opcode == GX_QUADS || opcode == GX_TRIANGLES || opcode == GX_TRIANGLESTRIP ||
-           opcode == GX_TRIANGLEFAN || opcode == GX_LINES || opcode == GX_LINESTRIP ||
-           opcode == GX_POINTS;
-}
-
-bool is_mergeable_draw_opcode(u8 opcode) {
-    return opcode == GX_QUADS || opcode == GX_TRIANGLES || opcode == GX_TRIANGLESTRIP ||
-           opcode == GX_TRIANGLEFAN;
-}
-
-bool calc_vtx_stride(const GXVtxDescList* vtxDesc, u32& stride) {
-    stride = 0;
-    for (; vtxDesc->attr != GX_VA_NULL; vtxDesc++) {
-        switch (vtxDesc->type) {
-        case GX_NONE:
-            break;
-        case GX_DIRECT:
-            if (!is_matrix_idx_attr(vtxDesc->attr)) {
-                return false;
-            }
-            stride += 1;
-            break;
-        case GX_INDEX8:
-            stride += 1;
-            break;
-        case GX_INDEX16:
-            stride += 2;
-            break;
-        default:
-            return false;
-        }
-    }
-    return stride != 0;
-}
-
-bool get_command_size(const u8* dlStart, u32 dlSize, u32 offset, u32 stride, u32& cmdSize) {
-    if (offset >= dlSize) {
-        return false;
-    }
-
-    const u8 cmd = dlStart[offset];
-    const u8 opcode = cmd & GX_OPCODE_MASK;
-    switch (opcode) {
-    case GX_NOP:
-    case GX_CMD_INVL_VC:
-        cmdSize = 1;
-        return true;
-    case (GX_LOAD_BP_REG & GX_OPCODE_MASK):
-        cmdSize = 5;
-        return offset + cmdSize <= dlSize;
-    case GX_LOAD_CP_REG:
-        cmdSize = 6;
-        return offset + cmdSize <= dlSize;
-    case GX_LOAD_XF_REG: {
-        if (offset + 5 > dlSize) {
-            return false;
-        }
-        const u16 count = read_be16(dlStart + offset + 1) + 1;
-        cmdSize = 5 + count * 4;
-        return offset + cmdSize <= dlSize;
-    }
-    case GX_LOAD_INDX_A:
-    case GX_LOAD_INDX_B:
-    case GX_LOAD_INDX_C:
-    case GX_LOAD_INDX_D:
-        cmdSize = 5;
-        return offset + cmdSize <= dlSize;
-    case GX_CMD_CALL_DL:
-        cmdSize = 9;
-        return offset + cmdSize <= dlSize;
-    default:
-        if (is_draw_opcode(opcode)) {
-            if (offset + 3 > dlSize) {
-                return false;
-            }
-            const u16 vtxCount = read_be16(dlStart + offset + 1);
-            cmdSize = 3 + vtxCount * stride;
-            return offset + cmdSize <= dlSize;
-        }
-        return false;
-    }
-}
-
-struct MergeRun {
-    u8 cmd = 0;
-    u16 vtxCount = 0;
-    std::vector<u8> vertices;
-};
-
-void flush_merge_run(std::vector<u8>& out, MergeRun& run) {
-    if (run.vtxCount == 0) {
-        return;
-    }
-
-    out.push_back(run.cmd);
-    append_be16(out, run.vtxCount);
-    append_bytes(out, run.vertices.data(), run.vertices.size());
-    run.vertices.clear();
-    run.vtxCount = 0;
-}
-
-void append_vertex(std::vector<u8>& out, const u8* vertices, u32 stride, u16 idx) {
-    append_bytes(out, vertices + idx * stride, stride);
-}
-
-bool triangulate_draw(
-    std::vector<u8>& out, u8 opcode, const u8* vertices, u32 stride, u16 vtxCount) {
-    switch (opcode) {
-    case GX_TRIANGLES:
-        append_bytes(out, vertices, vtxCount * stride);
-        return true;
-    case GX_TRIANGLEFAN:
-        if (vtxCount < 3) {
-            return false;
-        }
-        for (u16 v = 2; v < vtxCount; v++) {
-            append_vertex(out, vertices, stride, 0);
-            append_vertex(out, vertices, stride, v - 1);
-            append_vertex(out, vertices, stride, v);
-        }
-        return true;
-    case GX_TRIANGLESTRIP:
-        if (vtxCount < 3) {
-            return false;
-        }
-        for (u16 v = 2; v < vtxCount; v++) {
-            if ((v & 1) == 0) {
-                append_vertex(out, vertices, stride, v - 2);
-                append_vertex(out, vertices, stride, v - 1);
-            } else {
-                append_vertex(out, vertices, stride, v - 1);
-                append_vertex(out, vertices, stride, v - 2);
-            }
-            append_vertex(out, vertices, stride, v);
-        }
-        return true;
-    case GX_QUADS:
-        if ((vtxCount & 3) != 0) {
-            return false;
-        }
-        for (u16 v = 0; v < vtxCount; v += 4) {
-            append_vertex(out, vertices, stride, v);
-            append_vertex(out, vertices, stride, v + 1);
-            append_vertex(out, vertices, stride, v + 2);
-            append_vertex(out, vertices, stride, v + 2);
-            append_vertex(out, vertices, stride, v + 3);
-            append_vertex(out, vertices, stride, v);
-        }
-        return true;
-    default:
-        return false;
-    }
-}
-
-void append_triangles_to_run(
-    std::vector<u8>& out, MergeRun& run, u8 cmd, const std::vector<u8>& vertices, u32 stride) {
-    u32 offset = 0;
-    u32 remaining = vertices.size() / stride;
-    while (remaining != 0) {
-        if (run.vtxCount != 0 && run.cmd != cmd) {
-            flush_merge_run(out, run);
-        }
-
-        if (run.vtxCount == 0) {
-            run.cmd = cmd;
-        }
-
-        u32 available = 0xFFFF - run.vtxCount;
-        if (available == 0) {
-            flush_merge_run(out, run);
-            continue;
-        }
-
-        u32 toCopy = std::min(remaining, available);
-        append_bytes(run.vertices, vertices.data() + offset * stride, toCopy * stride);
-        run.vtxCount += toCopy;
-        offset += toCopy;
-        remaining -= toCopy;
-
-        if (run.vtxCount == 0xFFFF) {
-            flush_merge_run(out, run);
-        }
-    }
-}
-
-bool optimize_display_list(const u8* dlStart, u32 dlSize, u32 stride, std::vector<u8>& out) {
-    MergeRun run;
-    out.reserve(dlSize);
-
-    for (u32 offset = 0; offset < dlSize;) {
-        u32 cmdSize = 0;
-        if (!get_command_size(dlStart, dlSize, offset, stride, cmdSize)) {
-            return false;
-        }
-
-        const u8 cmd = dlStart[offset];
-        const u8 opcode = cmd & GX_OPCODE_MASK;
-        if (opcode == GX_NOP) {
-            offset += cmdSize;
-            continue;
-        }
-
-        if (!is_draw_opcode(opcode)) {
-            flush_merge_run(out, run);
-            append_bytes(out, dlStart + offset, cmdSize);
-            offset += cmdSize;
-            continue;
-        }
-
-        if (!is_mergeable_draw_opcode(opcode)) {
-            flush_merge_run(out, run);
-            append_bytes(out, dlStart + offset, cmdSize);
-            offset += cmdSize;
-            continue;
-        }
-
-        const u16 vtxCount = read_be16(dlStart + offset + 1);
-        const u8* vertices = dlStart + offset + 3;
-        std::vector<u8> triangles;
-        if (!triangulate_draw(triangles, opcode, vertices, stride, vtxCount)) {
-            flush_merge_run(out, run);
-            append_bytes(out, dlStart + offset, cmdSize);
-            offset += cmdSize;
-            continue;
-        }
-
-        append_triangles_to_run(out, run, (GX_TRIANGLES | (cmd & GX_VAT_MASK)), triangles, stride);
-        offset += cmdSize;
-    }
-
-    flush_merge_run(out, run);
-    return true;
-}
-
 void set_display_list_copy(void*& displayList, u32& displayListSize, const u8* data, u32 size) {
    const u32 alignedSize = ALIGN_NEXT(size, 0x20);
    u8* newDL = JKR_NEW_ARRAY_ARGS(u8, alignedSize, 0x20);
@@ -289,20 +35,11 @@ u32 J3DShapeDraw::countVertex(u32 stride) {
    u8* dlStart = (u8*)getDisplayList();

 #if TARGET_PC
-    for (u32 offset = 0; offset < getDisplayListSize();) {
-        u8 cmd = dlStart[offset];
-        u8 opcode = cmd & GX_OPCODE_MASK;
-        u32 cmdSize = 0;
-        if (!get_command_size(dlStart, getDisplayListSize(), offset, stride, cmdSize)) {
-            break;
+    aurora::gx::dl::Reader reader{dlStart, getDisplayListSize(), static_cast<u8>(stride)};
+    while (const auto cmd = reader.next()) {
+        if (cmd->kind != aurora::gx::dl::Command::Kind::Passthrough) {
+            count += cmd->draw.vtxCount;
        }
-        if (!is_draw_opcode(opcode)) {
-            offset += cmdSize;
-            continue;
-        }
-        int vtxNum = be16(*reinterpret_cast<u16*>(dlStart + offset + 1));
-        count += vtxNum;
-        offset += 3 + stride * vtxNum;
    }
 #else
    for (u8* dl = dlStart; (dl - dlStart) < getDisplayListSize();) {
@@ -320,6 +57,53 @@ u32 J3DShapeDraw::countVertex(u32 stride) {
    return count;
 }

+#if TARGET_PC
+void J3DShapeDraw::addTexMtxIndexInDL(u32 stride, u32 attrOffs, u32 valueBase) {  // PC path: rebuilds the DL with one extra byte inserted at attrOffs in every vertex
+    u32 byteNum = countVertex(stride);  // total vertex count == number of bytes to add (one per vertex)
+    u32 oldSize = mDisplayListSize;
+    u32 newSize = ALIGN_NEXT(oldSize + byteNum, 0x20);  // capacity for the grown list, aligned with ALIGN_NEXT(..., 0x20)
+    u8* newDLStart = JKR_NEW_ARRAY_ARGS(u8, newSize, 0x20);
+    u8* oldDLStart = (u8*)mDisplayList;  // NOTE(review): the old buffer is never released in this function - confirm who owns it
+    u8* newDL = newDLStart;  // write cursor into the new buffer
+    // NOTE(review): stride is narrowed to u8 for the reader; a stride above 255 would be truncated.
+    aurora::gx::dl::Reader reader{oldDLStart, mDisplayListSize, static_cast<u8>(stride)};
+    while (const auto cmd = reader.next()) {
+        if (cmd->kind == aurora::gx::dl::Command::Kind::Passthrough) {  // not a draw command: copy its bytes verbatim
+            std::memcpy(newDL, cmd->data, cmd->size);
+            newDL += cmd->size;
+            continue;
+        }
+        // Draw command: copy the bytes that precede the vertex data, then widen each vertex.
+        const auto& draw = cmd->draw;
+        const u32 headerSize = draw.vertices - cmd->data;  // distance from command start to first vertex
+        std::memcpy(newDL, cmd->data, headerSize);
+        newDL += headerSize;
+
+        for (u32 i = 0; i < draw.vtxCount; i++) {
+            const u8* oldVtx = draw.vertices + stride * i;
+            u8 pnmtxidx = oldVtx[0];  // presumably the position-matrix index is the vertex's first byte - confirm
+            std::memcpy(newDL, oldVtx, attrOffs);  // bytes before the insertion point
+            newDL += attrOffs;
+            *newDL++ = valueBase + pnmtxidx;  // the inserted byte; the u32 sum is narrowed to u8 on store
+            std::memcpy(newDL, oldVtx + attrOffs, stride - attrOffs);  // remainder of the original vertex
+            newDL += stride - attrOffs;
+        }
+    }
+    if (reader.failed()) {
+        // Reader reported failure: copy the bytes from reader.pos() to the end unmodified (no bytes are inserted there).
+        std::memcpy(newDL, oldDLStart + reader.pos(), mDisplayListSize - reader.pos());
+        newDL += mDisplayListSize - reader.pos();
+    }
+    // Record the written length aligned via ALIGN_NEXT(..., 0x20), then zero the bytes from the cursor up to newSize.
+    u32 realSize = ALIGN_NEXT((uintptr_t)newDL - (uintptr_t)newDLStart, 0x20);
+    for (; (newDL - newDLStart) < newSize; newDL++)
+        *newDL = 0;
+
+    mDisplayListSize = realSize;
+    mDisplayList = newDLStart;  // the object now points at the rebuilt list
+    DCStoreRange(newDLStart, mDisplayListSize);
+}
+#else
 void J3DShapeDraw::addTexMtxIndexInDL(u32 stride, u32 attrOffs, u32 valueBase) {
    u32 byteNum = countVertex(stride);
    u32 oldSize = mDisplayListSize;
@@ -330,32 +114,13 @@ void J3DShapeDraw::addTexMtxIndexInDL(u32 stride, u32 attrOffs, u32 valueBase) {
    u8* newDL = newDLStart;

    for (; (oldDL - oldDLStart) < mDisplayListSize;) {
-#if TARGET_PC
-        u32 oldOffset = oldDL - oldDLStart;
-        u32 cmdSize = 0;
-        if (!get_command_size(oldDLStart, mDisplayListSize, oldOffset, stride, cmdSize)) {
-            memcpy(newDL, oldDL, mDisplayListSize - oldOffset);
-            newDL += mDisplayListSize - oldOffset;
-            break;
-        }
-#endif
        // Copy command
        u8 cmd = *(u8*)oldDL;
        oldDL++;
        *newDL++ = cmd;

-#if TARGET_PC
-        u8 opcode = cmd & GX_OPCODE_MASK;
-        if (!is_draw_opcode(opcode)) {
-            memcpy(newDL, oldDL, cmdSize - 1);
-            oldDL += cmdSize - 1;
-            newDL += cmdSize - 1;
-            continue;
-        }
-#else
        if (cmd != GX_TRIANGLEFAN && cmd != GX_TRIANGLESTRIP)
            break;
-#endif

        // Copy count
        int vtxNum = *(u16*)oldDL;
@@ -384,6 +149,7 @@ void J3DShapeDraw::addTexMtxIndexInDL(u32 stride, u32 attrOffs, u32 valueBase) {
    mDisplayList = newDLStart;
    DCStoreRange(newDLStart, mDisplayListSize);
 }
+#endif

 J3DShapeDraw::J3DShapeDraw(const u8* displayList, u32 displayListSize) {
 #if TARGET_PC
@@ -397,12 +163,8 @@ J3DShapeDraw::J3DShapeDraw(const u8* displayList, u32 displayListSize) {
 #if TARGET_PC
 J3DShapeDraw::J3DShapeDraw(
    const u8* displayList, u32 displayListSize, const GXVtxDescList* vtxDesc) {
-    u32 stride = 0;
-    std::vector<u8> optimized;
-    if (calc_vtx_stride(vtxDesc, stride) &&
-        optimize_display_list(displayList, displayListSize, stride, optimized))
-    {
-        set_display_list_copy(mDisplayList, mDisplayListSize, optimized.data(), optimized.size());
+    if (const auto optimized = aurora::gx::dl::optimize(displayList, displayListSize, vtxDesc)) {
+        set_display_list_copy(mDisplayList, mDisplayListSize, optimized->data(), optimized->size());
    } else {
        set_display_list_copy(mDisplayList, mDisplayListSize, displayList, displayListSize);
    }