From 82fb2cc26a52a15fcdf01e6b589336bd3ee48223 Mon Sep 17 00:00:00 2001 From: water111 <48171810+water111@users.noreply.github.com> Date: Fri, 15 Mar 2024 20:31:11 -0400 Subject: [PATCH] Port `bones.gc` math to GOAL (#3425) Reverse engineer the skinning matrix calculation and port to GOAL. This is about 3x faster than the MIPS2c version. As usual, there is a `*use-new-bones*` flag to go back to the old version. Fix for a bug in the compiler's `.div.vf` implementation (only happens if src/dst are the same), and fix for a typo in the register allocator that would sometimes cause it not to consider xmm8-xmm15. --- docs/progress-notes/bones.md | 500 +++++++++++++++++++ goal_src/jak1/engine/gfx/foreground/bones.gc | 156 +++++- goal_src/jak2/engine/gfx/foreground/bones.gc | 253 +++++++++- goalc/compiler/compilation/Asm.cpp | 12 +- goalc/regalloc/Allocator_v2.cpp | 8 +- goalc/regalloc/allocator_interface.cpp | 2 +- 6 files changed, 892 insertions(+), 39 deletions(-) create mode 100644 docs/progress-notes/bones.md diff --git a/docs/progress-notes/bones.md b/docs/progress-notes/bones.md new file mode 100644 index 0000000000..6b2a4fa0bf --- /dev/null +++ b/docs/progress-notes/bones.md @@ -0,0 +1,500 @@ +The `bones.gc` file computes skinning matrices for foreground rendering. + +Arguments: + - `a0`: output matrix area, each matrix is 7x quadwords contaning the vertex and normal transformation matrices. + - `a1`: input joint array. The joints contain the inverse bind pose. + - `a2`: input bones array. The world space bone transforms + - `a3`: num bones + + - `vf28, vf29, vf30, vf31` the camera matrix + - `vf25, vf26, vf27` the camera matrix again + +```asm + daddiu sp, sp, -96 + sd ra, 0(sp) + sq s2, 16(sp) + sq s3, 32(sp) + sq s4, 48(sp) + sq s5, 64(sp) + sq gp, 80(sp) + + lui v1, 4096 + lui t0, 4096 + ori v1, v1, 54272 ;; v1 = DMA reg addr + ori t0, t0, 53248 ;; t0 = DMA reg addr + lui t2, 32767 ;; 0x7fff.... + daddiu t1, a3, -16 ;; bone count - 16 (maybe we do 16 bones at a time?) + ori t2, t2, 65535 ;; 0x7fff'ffff + lui at, 28672 ;; scratchpad addr + addiu t4, r0, 64 ;; t4 = 64 (= 16 bones * 4) + addiu t5, r0, 1280 ;; t5 = 1280 (= 16 bones * 80) + bgez t1, L17 ;; more than 16 bones? + addiu t3, r0, 16 ;; t3 = 16 + + ;; if first run is under 16 bones, adjust counts +B1: + or t3, a3, r0 ;; t3 = num bones + sll r0, r0, 0 + dsll t4, t3, 2 ;; t4 = num bones * 4 + dsll a3, t3, 4 ;; a3 = num bones * 16 + dsll t1, t3, 6 ;; t1 = num bones * 64 + sll r0, r0, 0 + daddu t5, t1, a3 ;; t5 = num bones * 80 + addiu t1, r0, 0 ;; t1 = 0 (remaining bones count) +B2: +L17: + addiu a3, r0, 0 ;; a3 = 0 + addiu t6, r0, 1 ;; t6 = 1 + and a1, a1, t2 ;; mask off upper bits of address (not sure why, but they do this sometimes) + sll r0, r0, 0 + daddiu a1, a1, 12 ;; adjustment of joint pointer for the strided dma stuff. + or a0, a0, r0 + daddiu a1, a1, -80 + sll r0, r0, 0 + + ;; wait for DMA to be free... + + +B5: +L19: + addiu t6, r0, 80 + addiu t7, r0, 264 + sw t6, 128(v1) ;; addr in spad = 80 for joints + sw a1, 16(v1) + sw t4, 32(v1) ;; size: num bones * 4 + sw t7, 0(v1) + daddu a1, a1, t5 + ;; wait for dma to complete + + +B8: +L21: + and a2, a2, t2 ;; clean up bones addr + sll r0, r0, 0 + dsll t2, t3, 2 ;; t2 = bones * 4 + addiu t4, r0, 256 ;; t4 = 256 + daddu t2, t2, t3 ;; t2 = bones * 5 (size of the bone) + addiu t6, r0, 1104 ;; addr in spad = 1104 for bones. + dsll t5, t2, 4 + sw t6, 128(v1) + addiu t8, r0, 0 + sw a2, 16(v1) + daddu a2, a2, t5 + sw t2, 32(v1) + addiu t2, r0, 1 + sw t4, 0(v1) +;; wait for dma +;; + +B11: +L23: + dsll t5, t8, 2 ;; ?? not sure what this is, but always zero? + daddu t9, t5, at ;; ptr to bone-work + sll r0, r0, 0 + lwu t5, 0(t9) ;; t5 = (-> bone-layout joint) + or t6, t3, r0 + lwu t7, 8(t9) ;; t7 = (-> bone-layout bone) + or ra, t3, r0 + lwu t3, 16(t9) ;; t3 = (-> bone-layout output) + sll r0, r0, 0 + sw ra, 44(at) ;; stash sp-size + beq ra, r0, L36 + sw t8, 48(at) ;; stash sp-bufnum + +B12: + daddiu t1, t1, -16 ;; decrement bones count + addiu t9, r0, 1280 ;; next DMA math stuff + bgez t1, L24 ;; check if partial bone buffer + addiu t8, r0, 16 ;; .... + +B13: + daddiu t8, t1, 16 + addiu t1, r0, 0 + dsll t9, t8, 4 + dsll ra, t8, 6 + beq t8, r0, L25 + daddu t9, ra, t9 + +B14: +L24: + dsll t4, t8, 2 + dsll ra, t2, 2 + daddu gp, ra, at + sw a1, 16(v1) + addiu ra, r0, 264 + lwu gp, 0(gp) + andi gp, gp, 16383 + sw t4, 32(v1) + daddu a1, a1, t9 + sw gp, 128(v1) + addiu t4, r0, 0 + sw ra, 0(v1) + +;; and now, for the actual bones. +B15: +L25: + sll r0, r0, 0 + sw t8, 40(at) ;; in-count + sll r0, r0, 0 + lqc2 vf1, 0(t5) ;; vf1, vf2, vf3, vf4 = inverse bind pose + sll r0, r0, 0 + lqc2 vf2, 16(t5) + sll r0, r0, 0 + lqc2 vf3, 32(t5) + sll r0, r0, 0 + lqc2 vf4, 48(t5) + sll r0, r0, 0 + lqc2 vf5, 0(t7) ;; vf5, vf6, vf7, vf8 = input bone matrix. + sll r0, r0, 0 + lqc2 vf6, 16(t7) + sll r0, r0, 0 + lqc2 vf7, 32(t7) + sll r0, r0, 0 + lqc2 vf8, 48(t7) + vcallms 0 ;; run bone program + sll r0, r0, 0 +B16: +L26: + sll r0, r0, 0 + sll r0, r0, 0 + daddiu t5, t5, 64 ;; advance joint + sll r0, r0, 0 + daddiu t7, t7, 80 ;; advance bone. + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + lq t8, 0(t5) ;; load next joint + sll r0, r0, 0 + lq t9, 16(t5) + sll r0, r0, 0 + lq ra, 32(t5) + sll r0, r0, 0 + lq gp, 48(t5) + sll r0, r0, 0 + lq s5, 0(t7) ;; load next bone + sll r0, r0, 0 + lq s4, 16(t7) + sll r0, r0, 0 + lq s3, 32(t7) + sll r0, r0, 0 + lq s2, 48(t7) + sll r0, r0, 0 + qmtc2.ni vf1, t8 ;; swap in new inputs + sll r0, r0, 0 + qmtc2.ni vf2, t9 + sll r0, r0, 0 + qmtc2.ni vf3, ra + sll r0, r0, 0 + qmtc2.ni vf4, gp + sll r0, r0, 0 + qmtc2.ni vf5, s5 + sll r0, r0, 0 + qmtc2.ni vf6, s4 + sll r0, r0, 0 + qmtc2.ni vf7, s3 + sll r0, r0, 0 + qmtc2.ni vf8, s2 + sll r0, r0, 0 + qmfc2.i t8, vf13 ;; swap out result in (vf13, vf14, vf15, vf16) and (vf9, vf10, vf11) + sll r0, r0, 0 + qmfc2.ni t9, vf14 + sll r0, r0, 0 + qmfc2.ni ra, vf15 + sll r0, r0, 0 + qmfc2.ni gp, vf16 + sll r0, r0, 0 + qmfc2.ni s5, vf9 + sll r0, r0, 0 + qmfc2.ni s4, vf10 + sll r0, r0, 0 + qmfc2.ni s3, vf11 + vcallms 0 + sll r0, r0, 0 + sll r0, r0, 0 + sq t8, 0(t3) + sll r0, r0, 0 + sq t9, 16(t3) + sll r0, r0, 0 + sq ra, 32(t3) + sll r0, r0, 0 + sq gp, 48(t3) + sll r0, r0, 0 + sq s5, 64(t3) + sll r0, r0, 0 + sq s4, 80(t3) + sll r0, r0, 0 + sq s3, 96(t3) + sll r0, r0, 0 + sq r0, 112(t3) + daddiu t3, t3, 128 + daddiu t6, t6, -1 + bgtz t6, L26 + sll r0, r0, 0 + +B17: + sll r0, r0, 0 + lw t3, 40(at) + beq t3, r0, L29 + sll r0, r0, 0 + +B18: +L27: + lw t4, 0(v1) + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + andi t4, t4, 256 + sll r0, r0, 0 + beq t4, r0, L28 + sll r0, r0, 0 + +B19: + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + beq r0, r0, L27 + sll r0, r0, 0 + +B20: +L28: + dsll t5, t2, 2 + sll r0, r0, 0 + addiu t4, r0, 1 + daddu t5, t5, at + sll r0, r0, 0 + lwu t6, 8(t5) + dsll t5, t3, 2 + andi t6, t6, 16383 + daddu t5, t5, t3 + sw t6, 128(v1) + dsll t6, t5, 4 + sw a2, 16(v1) + addiu t7, r0, 256 + sw t5, 32(v1) + daddu a2, a2, t6 + sw t7, 0(v1) +B21: +L29: + sll r0, r0, 0 + lw t5, 48(at) + sll r0, r0, 0 + lw t6, 44(at) +B22: +L30: + lw t7, 0(t0) + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + andi t7, t7, 256 + sll r0, r0, 0 + beq t7, r0, L31 + sll r0, r0, 0 + +B23: + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + beq r0, r0, L30 + sll r0, r0, 0 + +B24: +L31: + beq t6, r0, L32 + sll r0, r0, 0 + +B25: + dsll t7, t5, 2 + lui t8, 28672 + daddu t7, t7, t8 + lwu t7, 16(t7) + andi t7, t7, 16383 + sw t7, 128(t0) + sw a0, 16(t0) + dsll t7, t6, 3 + sw t7, 32(t0) + addiu t7, r0, 256 + sw t7, 0(t0) + dsll t6, t6, 7 + daddu a0, a0, t6 +B26: +L32: + beq t3, r0, L35 + sll r0, r0, 0 + +B27: + bne t4, r0, L35 + sll r0, r0, 0 + +B28: +L33: + lw t6, 0(v1) + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + andi t6, t6, 256 + sll r0, r0, 0 + beq t6, r0, L34 + sll r0, r0, 0 + +B29: + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + sll r0, r0, 0 + beq r0, r0, L33 + sll r0, r0, 0 + +B30: +L34: + dsll t6, t2, 2 + lui t7, 28672 + daddu t6, t6, t7 + lwu t6, 8(t6) + andi t6, t6, 16383 + sw t6, 128(v1) + sw a2, 16(v1) + addiu t6, r0, 5 + mult3 t6, t6, t3 + sw t6, 32(v1) + addiu t6, r0, 256 + sw t6, 0(v1) + addiu t6, r0, 80 + mult3 t6, t6, t3 + daddu a2, a2, t6 +B31: +L35: + or t8, t2, r0 + bne t1, r0, L22 + or t2, t5, r0 + +B32: + beq a3, r0, L22 + addiu a3, r0, 1 + +B33: +L36: + or v0, r0, r0 + ld ra, 0(sp) + lq gp, 80(sp) + lq s5, 64(sp) + lq s4, 48(sp) + lq s3, 32(sp) + lq s2, 16(sp) + jr ra + daddiu sp, sp, 96 +``` + + +# VU0 micoprogram + +- vf1, vf2, vf3, vf4 = inverse bind pose +- vf5, vf6, vf7, vf8 = input bone matrix. +- `vf28, vf29, vf30, vf31` the camera matrix +- `vf25, vf26, vf27` the camera matrix again +- (vf13, vf14, vf15, vf16) output point transformation +- (vf09, vf10, vf11) output normal transformation + +``` +First: multiply bone and bind pose: (vf13, vf14, vf15, vf16) = (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4). +This is doing a true matrix multiplication. + nop | mulax.xyzw ACC, vf05, vf01 + nop | madday.xyzw ACC, vf06, vf01 + nop | maddaz.xyzw ACC, vf07, vf01 + nop | maddw.xyzw vf13, vf08, vf01 + nop | mulax.xyzw ACC, vf05, vf02 + nop | madday.xyzw ACC, vf06, vf02 + nop | maddaz.xyzw ACC, vf07, vf02 + nop | maddw.xyzw vf14, vf08, vf02 + nop | mulax.xyzw ACC, vf05, vf03 + nop | madday.xyzw ACC, vf06, vf03 + nop | maddaz.xyzw ACC, vf07, vf03 + nop | maddw.xyzw vf15, vf08, vf03 + nop | mulax.xyzw ACC, vf05, vf04 + nop | madday.xyzw ACC, vf06, vf04 + nop | maddaz.xyzw ACC, vf07, vf04 + nop | maddw.xyzw vf16, vf08, vf04 + +;; vf09 = cross(y, z) + nop | opmula.xyz ACC, vf14, vf15 + nop | opmsub.xyz vf09, vf15, vf14 + +;; vf10 = cross(z, x) + nop | opmula.xyz ACC, vf15, vf13 + nop | opmsub.xyz vf10, vf13, vf15 + +;; vf11 = cross(x, y) + nop | opmula.xyz ACC, vf13, vf14 + nop | opmsub.xyz vf11, vf14, vf13 + + ;; vf12 = cross (y, z) * x + nop | mul.xyz vf12, vf13, vf09 + + ;; second multiply: doing (vf13....) = cam * (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4) + nop | mulax.xyzw ACC, vf28, vf13 + nop | madday.xyzw ACC, vf29, vf13 + nop | maddaz.xyzw ACC, vf30, vf13 + nop | maddw.xyzw vf13, vf31, vf13 + + nop | mulax.w ACC, vf00, vf12 + nop | madday.w ACC, vf00, vf12 + nop | maddz.w vf12, vf00, vf12 + vf12.w = dot (cross(y, z), x) [before the second multiply] + + nop | mulax.xyzw ACC, vf28, vf14 + nop | madday.xyzw ACC, vf29, vf14 + nop | maddaz.xyzw ACC, vf30, vf14 + div Q, vf00.w, vf12.w | maddw.xyzw vf14, vf31, vf14 ;; divide + nop | mulax.xyzw ACC, vf28, vf15 + nop | madday.xyzw ACC, vf29, vf15 + nop | maddaz.xyzw ACC, vf30, vf15 + nop | maddw.xyzw vf15, vf31, vf15 + nop | mulax.xyzw ACC, vf28, vf16 + nop | madday.xyzw ACC, vf29, vf16 + nop | maddaz.xyzw ACC, vf30, vf16 + nop | maddw.xyzw vf16, vf31, vf16 + +;; normal scale + nop | mul.xyzw vf09, vf09, Q + nop | mul.xyzw vf10, vf10, Q + nop | mul.xyzw vf11, vf11, Q + +;; apply cam to normal matrix too + nop | mulax.xyzw ACC, vf25, vf09 + nop | madday.xyzw ACC, vf26, vf09 + nop | maddz.xyzw vf09, vf27, vf09 + nop | mulax.xyzw ACC, vf25, vf10 + nop | madday.xyzw ACC, vf26, vf10 + nop | maddz.xyzw vf10, vf27, vf10 + nop | mulax.xyzw ACC, vf25, vf11 + nop | madday.xyzw ACC, vf26, vf11 :e + nop | maddz.xyzw vf11, vf27, vf11 +``` \ No newline at end of file diff --git a/goal_src/jak1/engine/gfx/foreground/bones.gc b/goal_src/jak1/engine/gfx/foreground/bones.gc index 8fac0ff0e1..6c36dc9f08 100644 --- a/goal_src/jak1/engine/gfx/foreground/bones.gc +++ b/goal_src/jak1/engine/gfx/foreground/bones.gc @@ -422,6 +422,141 @@ (def-mips2c bones-mtx-calc (function int pointer pointer int object none)) +(defmacro .cross.vf (out a b) + `(begin + (.outer.product.a.vf acc ,a ,b) + (.outer.product.b.vf ,out ,b ,a acc) + ) + ) + +(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int)) + "Compute skinning matrices." + ;; (declare (print-asm)) + (dotimes (i (- count 1)) + (let ((b (-> bones (+ i 1) transform)) + (j (-> joints i bind-pose)) + (out (-> output (+ i 1))) + ) + (rlet ( + (tmat0 :class vf) + (tmat1 :class vf) + (tmat2 :class vf) + (tmat3 :class vf) + (nmat0 :class vf) + (nmat1 :class vf) + (nmat2 :class vf) + (nmat3 :class vf) + (acc :class vf ) + (vf0 :class vf ) + (cam0 :class vf ) + (cam1 :class vf ) + (cam2 :class vf ) + (cam3 :class vf ) + ) + + (init-vf0-vector) + + ;; load bind-pose to tmat: + (.lvf tmat0 (&-> j quad 0)) + (.lvf tmat1 (&-> j quad 1)) + (.lvf tmat2 (&-> j quad 2)) + (.lvf tmat3 (&-> j quad 3)) + + ;; load bone to nmat + (.lvf nmat0 (&-> b quad 0)) + (.lvf nmat1 (&-> b quad 1)) + (.lvf nmat2 (&-> b quad 2)) + (.lvf nmat3 (&-> b quad 3)) + + ;; multiply bone and bind pose, store in tmat + (.mul.x.vf acc nmat0 tmat0) + (.add.mul.y.vf acc nmat1 tmat0 acc) + (.add.mul.z.vf acc nmat2 tmat0 acc) + (.add.mul.w.vf tmat0 nmat3 tmat0 acc) + (.mul.x.vf acc nmat0 tmat1) + (.add.mul.y.vf acc nmat1 tmat1 acc) + (.add.mul.z.vf acc nmat2 tmat1 acc) + (.add.mul.w.vf tmat1 nmat3 tmat1 acc) + (.mul.x.vf acc nmat0 tmat2) + (.add.mul.y.vf acc nmat1 tmat2 acc) + (.add.mul.z.vf acc nmat2 tmat2 acc) + (.add.mul.w.vf tmat2 nmat3 tmat2 acc) + (.mul.x.vf acc nmat0 tmat3) + (.add.mul.y.vf acc nmat1 tmat3 acc) + (.add.mul.z.vf acc nmat2 tmat3 acc) + (.add.mul.w.vf tmat3 nmat3 tmat3 acc) + + ;; compute inverse transpose, storing in nmat. + (.cross.vf nmat0 tmat1 tmat2) + (.cross.vf nmat1 tmat2 tmat0) + (.cross.vf nmat2 tmat0 tmat1) + + ;; dot nmat0 and tmat0 + (.mul.vf acc nmat0 tmat0) + (.add.y.vf acc acc acc :mask #b1) + (.add.z.vf acc acc acc :mask #b1) + + ;; divide! + (.div.vf acc vf0 acc :fsf #b11 :ftf #b0) + + ;; scale nmat: + (.mul.x.vf nmat0 nmat0 acc) + (.mul.x.vf nmat1 nmat1 acc) + (.mul.x.vf nmat2 nmat2 acc) + + ;; load camera + (.lvf cam0 (&-> cam quad 0)) + (.lvf cam1 (&-> cam quad 1)) + (.lvf cam2 (&-> cam quad 2)) + (.lvf cam3 (&-> cam quad 3)) + + ;; multiply tmat by camera + (.mul.x.vf acc cam0 tmat0) + (.add.mul.y.vf acc cam1 tmat0 acc) + (.add.mul.z.vf acc cam2 tmat0 acc) + (.add.mul.w.vf tmat0 cam3 tmat0 acc) + (.mul.x.vf acc cam0 tmat1) + (.add.mul.y.vf acc cam1 tmat1 acc) + (.add.mul.z.vf acc cam2 tmat1 acc) + (.add.mul.w.vf tmat1 cam3 tmat1 acc) + (.mul.x.vf acc cam0 tmat2) + (.add.mul.y.vf acc cam1 tmat2 acc) + (.add.mul.z.vf acc cam2 tmat2 acc) + (.add.mul.w.vf tmat2 cam3 tmat2 acc) + (.mul.x.vf acc cam0 tmat3) + (.add.mul.y.vf acc cam1 tmat3 acc) + (.add.mul.z.vf acc cam2 tmat3 acc) + (.add.mul.w.vf tmat3 cam3 tmat3 acc) + + ;; store tmat + (.svf (&-> out t-mtx quad 0) tmat0) + (.svf (&-> out t-mtx quad 1) tmat1) + (.svf (&-> out t-mtx quad 2) tmat2) + (.svf (&-> out t-mtx quad 3) tmat3) + + ;; multiply nmat + (.mul.x.vf acc cam0 nmat0) + (.add.mul.y.vf acc cam1 nmat0 acc) + (.add.mul.z.vf nmat0 cam2 nmat0 acc) + (.mul.x.vf acc cam0 nmat1) + (.add.mul.y.vf acc cam1 nmat1 acc) + (.add.mul.z.vf nmat1 cam2 nmat1 acc) + (.mul.x.vf acc cam0 nmat2) + (.add.mul.y.vf acc cam1 nmat2 acc) + (.add.mul.z.vf nmat2 cam2 nmat2 acc) + + ;; store nmat + (.svf (&-> out n-mtx quad 0) nmat0) + (.svf (&-> out n-mtx quad 1) nmat1) + (.svf (&-> out n-mtx quad 2) nmat2) + ) + ) + ) + (none) + ) + +(define *use-new-bones* #t) + (defun bones-mtx-calc-execute () "Do all pending bone calculations" (local-vars (v1-14 float)) @@ -481,13 +616,20 @@ (.mov v1-14 vf27) ;; hack?? - - (bones-mtx-calc - (the-as int (-> s4-0 matrix-area)) - (the-as pointer (-> s4-0 joints)) - (the-as pointer (-> s4-0 bones)) - (the-as int (-> s4-0 num-bones)) - v1-13 ;; hack, added + (if *use-new-bones* + (new-bones-mtx-calc-asm + (the (inline-array pris-mtx) (-> s4-0 matrix-area)) + (-> s4-0 joints) + (-> s4-0 bones) + v1-13 + (the int (-> s4-0 num-bones))) + (bones-mtx-calc + (the-as int (-> s4-0 matrix-area)) + (the-as pointer (-> s4-0 joints)) + (the-as pointer (-> s4-0 bones)) + (the-as int (-> s4-0 num-bones)) + v1-13 ;; hack, added + ) ) ) (when (logtest? (-> s4-0 flags) (bone-calc-flags bncfl00)) diff --git a/goal_src/jak2/engine/gfx/foreground/bones.gc b/goal_src/jak2/engine/gfx/foreground/bones.gc index 07bbd56cf7..b4092f48af 100644 --- a/goal_src/jak2/engine/gfx/foreground/bones.gc +++ b/goal_src/jak2/engine/gfx/foreground/bones.gc @@ -120,16 +120,224 @@ this is done by a linked list of "bone calculations", which is stashed in the dm ;; see the C++ code for more details. (def-mips2c bones-mtx-calc (function (inline-array pris-mtx) (inline-array joint) (inline-array bone) uint object none)) +(defun matrix-*float! ((output matrix3) (input matrix3) (x float)) + (dotimes (i 12) + (set! (-> output data i) (* x (-> input data i))) + ) + output + ) + +(defun matrix*!-first-three ((arg0 matrix3) (arg1 matrix3) (arg2 matrix)) + "Set dst = src1 * src2. It is okay for any arguments to be the same data. + This is a moderately efficient implementation." + (rlet ((acc :class vf) + (vf10 :class vf) + (vf11 :class vf) + (vf12 :class vf) + (vf14 :class vf) + (vf15 :class vf) + (vf16 :class vf) + (vf18 :class vf) + (vf19 :class vf) + (vf20 :class vf) + ) + (.lvf vf10 (&-> arg1 quad 0)) + (.lvf vf14 (&-> arg2 quad 0)) + (.lvf vf15 (&-> arg2 quad 1)) + (.lvf vf16 (&-> arg2 quad 2)) + (.lvf vf11 (&-> arg1 quad 1)) + (.lvf vf12 (&-> arg1 quad 2)) + (.mul.x.vf acc vf14 vf10) + (.add.mul.y.vf acc vf15 vf10 acc) + (.add.mul.z.vf vf18 vf16 vf10 acc) + (.mul.x.vf acc vf14 vf11) + (.add.mul.y.vf acc vf15 vf11 acc) + (.add.mul.z.vf vf19 vf16 vf11 acc) + (.mul.x.vf acc vf14 vf12) + (.add.mul.y.vf acc vf15 vf12 acc) + (.add.mul.z.vf vf20 vf16 vf12 acc) + (.svf (&-> arg0 quad 0) vf18) + (.svf (&-> arg0 quad 1) vf19) + (.svf (&-> arg0 quad 2) vf20) + arg0 + ) + ) + +(defun new-bones-mtx-calc ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int)) + (dotimes (i (- count 1)) + (let ((b (-> bones (+ i 1) transform)) + (j (-> joints i bind-pose)) + (out (-> output (+ i 1))) + ) + + ;; multiply by bind pose + ;; mult swaps the args + (matrix*! (-> out t-mtx) j b) + + ;; clever way to compute inverse transpose of a 3x3: + (vector-cross! (-> out n-mtx vector 0) + (-> out t-mtx vector 1) + (-> out t-mtx vector 2) + ) + (vector-cross! (-> out n-mtx vector 1) + (-> out t-mtx vector 2) + (-> out t-mtx vector 0) + ) + (vector-cross! (-> out n-mtx vector 2) + (-> out t-mtx vector 0) + (-> out t-mtx vector 1) + ) + (let ((scale (/ 1. (vector-dot (-> out n-mtx vector 0) (-> out t-mtx vector 0))))) + (matrix-*float! (-> out n-mtx) (-> out n-mtx) scale) + ) + + ;; multiply by camera + (matrix*! (-> out t-mtx) (-> out t-mtx) cam) + (matrix*!-first-three (-> out n-mtx) (-> out n-mtx) cam) ;; WRONG!! + ) + ) + ) + +(defmacro .cross.vf (out a b) + `(begin + (.outer.product.a.vf acc ,a ,b) + (.outer.product.b.vf ,out ,b ,a acc) + ) + ) + +(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int)) + ;; (declare (print-asm)) + (dotimes (i (- count 1)) + (let ((b (-> bones (+ i 1) transform)) + (j (-> joints i bind-pose)) + (out (-> output (+ i 1))) + ) + (rlet ( + (tmat0 :class vf) + (tmat1 :class vf) + (tmat2 :class vf) + (tmat3 :class vf) + (nmat0 :class vf) + (nmat1 :class vf) + (nmat2 :class vf) + (nmat3 :class vf) + (acc :class vf ) + (vf0 :class vf ) + (cam0 :class vf ) + (cam1 :class vf ) + (cam2 :class vf ) + (cam3 :class vf ) + ) + + (init-vf0-vector) + + ;; load bind-pose to tmat: + (.lvf tmat0 (&-> j quad 0)) + (.lvf tmat1 (&-> j quad 1)) + (.lvf tmat2 (&-> j quad 2)) + (.lvf tmat3 (&-> j quad 3)) + + ;; load bone to nmat + (.lvf nmat0 (&-> b quad 0)) + (.lvf nmat1 (&-> b quad 1)) + (.lvf nmat2 (&-> b quad 2)) + (.lvf nmat3 (&-> b quad 3)) + + ;; multiply, store in tmat + (.mul.x.vf acc nmat0 tmat0) + (.add.mul.y.vf acc nmat1 tmat0 acc) + (.add.mul.z.vf acc nmat2 tmat0 acc) + (.add.mul.w.vf tmat0 nmat3 tmat0 acc) + (.mul.x.vf acc nmat0 tmat1) + (.add.mul.y.vf acc nmat1 tmat1 acc) + (.add.mul.z.vf acc nmat2 tmat1 acc) + (.add.mul.w.vf tmat1 nmat3 tmat1 acc) + (.mul.x.vf acc nmat0 tmat2) + (.add.mul.y.vf acc nmat1 tmat2 acc) + (.add.mul.z.vf acc nmat2 tmat2 acc) + (.add.mul.w.vf tmat2 nmat3 tmat2 acc) + (.mul.x.vf acc nmat0 tmat3) + (.add.mul.y.vf acc nmat1 tmat3 acc) + (.add.mul.z.vf acc nmat2 tmat3 acc) + (.add.mul.w.vf tmat3 nmat3 tmat3 acc) + + ;; compute inverse transpose, storing in nmat + (.cross.vf nmat0 tmat1 tmat2) + (.cross.vf nmat1 tmat2 tmat0) + (.cross.vf nmat2 tmat0 tmat1) + + ;; dot nmat0 and tmat0 + (.mul.vf acc nmat0 tmat0) + (.add.y.vf acc acc acc :mask #b1) + (.add.z.vf acc acc acc :mask #b1) + + ;; divide! + (.div.vf acc vf0 acc :fsf #b11 :ftf #b0) + + ;; scale nmat: + (.mul.x.vf nmat0 nmat0 acc) + (.mul.x.vf nmat1 nmat1 acc) + (.mul.x.vf nmat2 nmat2 acc) + + ;; load camera + (.lvf cam0 (&-> cam quad 0)) + (.lvf cam1 (&-> cam quad 1)) + (.lvf cam2 (&-> cam quad 2)) + (.lvf cam3 (&-> cam quad 3)) + + ;; multiply tmat by camera + (.mul.x.vf acc cam0 tmat0) + (.add.mul.y.vf acc cam1 tmat0 acc) + (.add.mul.z.vf acc cam2 tmat0 acc) + (.add.mul.w.vf tmat0 cam3 tmat0 acc) + (.mul.x.vf acc cam0 tmat1) + (.add.mul.y.vf acc cam1 tmat1 acc) + (.add.mul.z.vf acc cam2 tmat1 acc) + (.add.mul.w.vf tmat1 cam3 tmat1 acc) + (.mul.x.vf acc cam0 tmat2) + (.add.mul.y.vf acc cam1 tmat2 acc) + (.add.mul.z.vf acc cam2 tmat2 acc) + (.add.mul.w.vf tmat2 cam3 tmat2 acc) + (.mul.x.vf acc cam0 tmat3) + (.add.mul.y.vf acc cam1 tmat3 acc) + (.add.mul.z.vf acc cam2 tmat3 acc) + (.add.mul.w.vf tmat3 cam3 tmat3 acc) + + ;; store tmat + (.svf (&-> out t-mtx quad 0) tmat0) + (.svf (&-> out t-mtx quad 1) tmat1) + (.svf (&-> out t-mtx quad 2) tmat2) + (.svf (&-> out t-mtx quad 3) tmat3) + + ;; multiply nmat + (.mul.x.vf acc cam0 nmat0) + (.add.mul.y.vf acc cam1 nmat0 acc) + (.add.mul.z.vf nmat0 cam2 nmat0 acc) + (.mul.x.vf acc cam0 nmat1) + (.add.mul.y.vf acc cam1 nmat1 acc) + (.add.mul.z.vf nmat1 cam2 nmat1 acc) + (.mul.x.vf acc cam0 nmat2) + (.add.mul.y.vf acc cam1 nmat2 acc) + (.add.mul.z.vf nmat2 cam2 nmat2 acc) + + ;; store nmat + (.svf (&-> out n-mtx quad 0) nmat0) + (.svf (&-> out n-mtx quad 1) nmat1) + (.svf (&-> out n-mtx quad 2) nmat2) + ) + ) + ) + (none) + ) + + +(define *use-new-bones* #t) +(define *display-bone-stats* #f) +(define *num-bones* 0) + (defun bones-mtx-calc-execute () "Execute all bone matrix calculations." (rlet ((vf1 :class vf) - (vf25 :class vf) - (vf26 :class vf) - (vf27 :class vf) - (vf28 :class vf) - (vf29 :class vf) - (vf30 :class vf) - (vf31 :class vf) (vf4 :class vf) (vf5 :class vf) (vf6 :class vf) @@ -157,6 +365,7 @@ this is done by a linked list of "bone calculations", which is stashed in the dm (s5-1 (-> *math-camera* camera-rot)) (s4-1 (-> v1-26 first)) ) + (set! *num-bones* 0) (while (nonzero? s4-1) ;; loop ;; pick correct matrix. The pris-mtx includes camera rotation, so all that's needed in the final @@ -167,23 +376,19 @@ this is done by a linked list of "bone calculations", which is stashed in the dm ) ) ) - (.lvf vf28 (&-> v1-29 quad 0)) - (.lvf vf29 (&-> v1-29 quad 1)) - (.lvf vf30 (&-> v1-29 quad 2)) - (.lvf vf31 (&-> v1-29 trans quad)) - (.lvf vf25 (&-> v1-29 quad 0)) - (.lvf vf26 (&-> v1-29 quad 1)) - (.lvf vf27 (&-> v1-29 quad 2)) - ;; calculate! - (bones-mtx-calc - (-> s4-1 matrix-area) - (-> s4-1 joints) - (-> s4-1 bones) - (-> s4-1 num-bones) - v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way. - ) + (if *use-new-bones* + (new-bones-mtx-calc-asm (-> s4-1 matrix-area) (-> s4-1 joints) (-> s4-1 bones) v1-29 (the int (-> s4-1 num-bones))) + (bones-mtx-calc + (-> s4-1 matrix-area) + (-> s4-1 joints) + (-> s4-1 bones) + (-> s4-1 num-bones) + v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way. + ) + ) ) + (+! *num-bones* (-> s4-1 num-bones)) ;; there is an optional post-processing step for ripple. (when (logtest? (-> s4-1 flags) (bone-calc-flags write-ripple-data)) @@ -228,6 +433,10 @@ this is done by a linked list of "bone calculations", which is stashed in the dm ) ) + (when *display-bone-stats* + (format *stdcon* "num-bones: ~D~%" *num-bones*) + ) + ;; reset sqwc ;; (set! (-> (the-as dma-bank-control #x1000e000) sqwc) (new 'static 'dma-sqwc :sqwc #x1 :tqwc #x1)) diff --git a/goalc/compiler/compilation/Asm.cpp b/goalc/compiler/compilation/Asm.cpp index 95f8822742..da831286b6 100644 --- a/goalc/compiler/compilation/Asm.cpp +++ b/goalc/compiler/compilation/Asm.cpp @@ -1233,17 +1233,17 @@ Val* Compiler::compile_asm_div_vf(const goos::Object& form, const goos::Object& // Why do we even bother using VDIVPS instead of FDIV? Because otherwise in x86, you have to use // the FPU stack Registers are nicer. - // Save one temp reg, use the destination as one - auto temp_reg = env->make_vfr(dest->type()); + auto temp_reg1 = env->make_vfr(dest->type()); + auto temp_reg2 = env->make_vfr(dest->type()); - // Splat src1's value into the dest reg, keep it simple, this way no matter which vector component + // Splat src1's value into a temp reg, keep it simple, this way no matter which vector component // is accessed from the final result will be the correct answer - env->emit_ir(form, color, dest, src1, ftf_fsf_to_vector_element(fsf)); + env->emit_ir(form, color, temp_reg1, src1, ftf_fsf_to_vector_element(fsf)); // Splat src1's value into the the temp reg - env->emit_ir(form, color, temp_reg, src2, ftf_fsf_to_vector_element(ftf)); + env->emit_ir(form, color, temp_reg2, src2, ftf_fsf_to_vector_element(ftf)); // Perform the Division - env->emit_ir(form, color, dest, dest, temp_reg, IR_VFMath3Asm::Kind::DIV); + env->emit_ir(form, color, dest, temp_reg1, temp_reg2, IR_VFMath3Asm::Kind::DIV); return get_none(); } diff --git a/goalc/regalloc/Allocator_v2.cpp b/goalc/regalloc/Allocator_v2.cpp index 5c178e82ee..178037f106 100644 --- a/goalc/regalloc/Allocator_v2.cpp +++ b/goalc/regalloc/Allocator_v2.cpp @@ -332,7 +332,7 @@ const std::vector& get_alloc_order(int var_idx, if (is_gpr) { return REG_temp_first_order.gprs; } else { - return REG_temp_only_order.xmms; + return REG_temp_first_order.xmms; } } } @@ -1060,8 +1060,10 @@ bool run_assignment_on_var(const AllocationInput& input, for (auto& reg : assign_order) { bool worked = check_register_assign(input, *cache, var_idx, reg); if (trace) { - lg::print("m2 trying var {} in {}: {}\n", cache->iregs.at(var_idx).to_string(), - reg.print(), worked); + const auto& this_var = cache->vars.at(var_idx); + lg::print("m2 trying var {} in {} (live {} to {}): {}\n", + cache->iregs.at(var_idx).to_string(), reg.print(), this_var.first_live(), + this_var.last_live(), worked); } if (worked) { var.assign_to_register(reg); diff --git a/goalc/regalloc/allocator_interface.cpp b/goalc/regalloc/allocator_interface.cpp index 5ad94884c9..c24cb5c2c8 100644 --- a/goalc/regalloc/allocator_interface.cpp +++ b/goalc/regalloc/allocator_interface.cpp @@ -24,7 +24,7 @@ void print_allocate_input(const AllocationInput& in) { } } else { for (const auto& instruction : in.instructions) { - lg::print(" [{:3d}] {}\n", instruction.print()); + lg::print(" {}\n", instruction.print()); } } lg::print("[RegAlloc] Debug Input Constraints:\n");