From 82fb2cc26a52a15fcdf01e6b589336bd3ee48223 Mon Sep 17 00:00:00 2001
From: water111 <48171810+water111@users.noreply.github.com>
Date: Fri, 15 Mar 2024 20:31:11 -0400
Subject: [PATCH] Port `bones.gc` math to GOAL (#3425)

Reverse engineer the skinning matrix calculation and port to GOAL. This
is about 3x faster than the MIPS2c version.

As usual, there is a `*use-new-bones*` flag to go back to the old
version.

Fix for a bug in the compiler's `.div.vf` implementation (only happens
if src/dst are the same), and fix for a typo in the register allocator
that would sometimes cause it not to consider xmm8-xmm15.
---
 docs/progress-notes/bones.md                 | 500 +++++++++++++++++++
 goal_src/jak1/engine/gfx/foreground/bones.gc | 156 +++++-
 goal_src/jak2/engine/gfx/foreground/bones.gc | 253 +++++++++-
 goalc/compiler/compilation/Asm.cpp           |  12 +-
 goalc/regalloc/Allocator_v2.cpp              |   8 +-
 goalc/regalloc/allocator_interface.cpp       |   2 +-
 6 files changed, 892 insertions(+), 39 deletions(-)
 create mode 100644 docs/progress-notes/bones.md
diff --git a/docs/progress-notes/bones.md b/docs/progress-notes/bones.md
new file mode 100644
index 0000000000..6b2a4fa0bf
--- /dev/null
+++ b/docs/progress-notes/bones.md
@@ -0,0 +1,500 @@
+The `bones.gc` file computes skinning matrices for foreground rendering.
+
+Arguments:
+ - `a0`: output matrix area, each matrix is 7x quadwords contaning the vertex and normal transformation matrices.
+ - `a1`: input joint array. The joints contain the inverse bind pose.
+ - `a2`: input bones array. The world space bone transforms
+ - `a3`: num bones
+
+ - `vf28, vf29, vf30, vf31` the camera matrix
+ - `vf25, vf26, vf27` the camera matrix again
+
+```asm
+    daddiu sp, sp, -96
+    sd ra, 0(sp)
+    sq s2, 16(sp)
+    sq s3, 32(sp)
+    sq s4, 48(sp)
+    sq s5, 64(sp)
+    sq gp, 80(sp)
+
+    lui v1, 4096
+    lui t0, 4096
+    ori v1, v1, 54272   ;; v1 = DMA reg addr
+    ori t0, t0, 53248   ;; t0 = DMA reg addr
+    lui t2, 32767       ;; 0x7fff....
+    daddiu t1, a3, -16  ;; bone count - 16 (maybe we do 16 bones at a time?)
+    ori t2, t2, 65535   ;; 0x7fff'ffff
+    lui at, 28672       ;; scratchpad addr
+    addiu t4, r0, 64    ;; t4 = 64 (= 16 bones * 4)
+    addiu t5, r0, 1280  ;; t5 = 1280 (= 16 bones * 80)
+    bgez t1, L17        ;; more than 16 bones?
+    addiu t3, r0, 16    ;; t3 = 16
+
+   ;; if first run is under 16 bones, adjust counts
+B1:
+    or t3, a3, r0       ;; t3 = num bones
+    sll r0, r0, 0
+    dsll t4, t3, 2      ;; t4 = num bones * 4
+    dsll a3, t3, 4      ;; a3 = num bones * 16
+    dsll t1, t3, 6      ;; t1 = num bones * 64
+    sll r0, r0, 0
+    daddu t5, t1, a3    ;; t5 = num bones * 80
+    addiu t1, r0, 0     ;; t1 = 0 (remaining bones count)
+B2:
+L17:
+    addiu a3, r0, 0     ;; a3 = 0
+    addiu t6, r0, 1     ;; t6 = 1
+    and a1, a1, t2      ;; mask off upper bits of address (not sure why, but they do this sometimes)
+    sll r0, r0, 0
+    daddiu a1, a1, 12   ;; adjustment of joint pointer for the strided dma stuff.
+    or a0, a0, r0
+    daddiu a1, a1, -80
+    sll r0, r0, 0
+
+    ;; wait for DMA to be free...
+    <snip>
+
+B5:
+L19:
+    addiu t6, r0, 80
+    addiu t7, r0, 264
+    sw t6, 128(v1)   ;; addr in spad = 80 for joints
+    sw a1, 16(v1)
+    sw t4, 32(v1)    ;; size: num bones * 4
+    sw t7, 0(v1)
+    daddu a1, a1, t5
+    ;; wait for dma to complete
+    <snip>
+
+B8:
+L21:
+    and a2, a2, t2  ;; clean up bones addr
+    sll r0, r0, 0
+    dsll t2, t3, 2    ;; t2 = bones * 4
+    addiu t4, r0, 256 ;; t4 = 256
+    daddu t2, t2, t3  ;; t2 = bones * 5 (size of the bone)
+    addiu t6, r0, 1104  ;; addr in spad = 1104 for bones.
+    dsll t5, t2, 4
+    sw t6, 128(v1)
+    addiu t8, r0, 0
+    sw a2, 16(v1)
+    daddu a2, a2, t5
+    sw t2, 32(v1)
+    addiu t2, r0, 1
+    sw t4, 0(v1)
+;; wait for dma
+;; <snip>
+
+B11:
+L23:
+    dsll t5, t8, 2     ;; ?? not sure what this is, but always zero?
+    daddu t9, t5, at   ;; ptr to bone-work
+    sll r0, r0, 0
+    lwu t5, 0(t9)      ;; t5 = (-> bone-layout joint)
+    or t6, t3, r0
+    lwu t7, 8(t9)      ;; t7 = (-> bone-layout bone)
+    or ra, t3, r0
+    lwu t3, 16(t9)     ;; t3 = (-> bone-layout output)
+    sll r0, r0, 0
+    sw ra, 44(at)      ;; stash sp-size
+    beq ra, r0, L36
+    sw t8, 48(at)      ;; stash sp-bufnum
+
+B12:
+    daddiu t1, t1, -16   ;; decrement bones count
+    addiu t9, r0, 1280   ;; next DMA math stuff
+    bgez t1, L24         ;; check if partial bone buffer
+    addiu t8, r0, 16     ;; ....
+
+B13:
+    daddiu t8, t1, 16
+    addiu t1, r0, 0
+    dsll t9, t8, 4
+    dsll ra, t8, 6
+    beq t8, r0, L25
+    daddu t9, ra, t9
+
+B14:
+L24:
+    dsll t4, t8, 2
+    dsll ra, t2, 2
+    daddu gp, ra, at
+    sw a1, 16(v1)
+    addiu ra, r0, 264
+    lwu gp, 0(gp)
+    andi gp, gp, 16383
+    sw t4, 32(v1)
+    daddu a1, a1, t9
+    sw gp, 128(v1)
+    addiu t4, r0, 0
+    sw ra, 0(v1)
+
+;; and now, for the actual bones.
+B15:
+L25:
+    sll r0, r0, 0
+    sw t8, 40(at)   ;; in-count
+    sll r0, r0, 0
+    lqc2 vf1, 0(t5)  ;; vf1, vf2, vf3, vf4 = inverse bind pose
+    sll r0, r0, 0
+    lqc2 vf2, 16(t5)
+    sll r0, r0, 0
+    lqc2 vf3, 32(t5)
+    sll r0, r0, 0
+    lqc2 vf4, 48(t5)
+    sll r0, r0, 0
+    lqc2 vf5, 0(t7)  ;; vf5, vf6, vf7, vf8 = input bone matrix.
+    sll r0, r0, 0
+    lqc2 vf6, 16(t7)
+    sll r0, r0, 0
+    lqc2 vf7, 32(t7)
+    sll r0, r0, 0
+    lqc2 vf8, 48(t7)
+    vcallms 0         ;; run bone program
+    sll r0, r0, 0
+B16:
+L26:
+    sll r0, r0, 0
+    sll r0, r0, 0
+    daddiu t5, t5, 64  ;; advance joint
+    sll r0, r0, 0
+    daddiu t7, t7, 80  ;; advance bone.
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    lq t8, 0(t5)   ;; load next joint
+    sll r0, r0, 0
+    lq t9, 16(t5)
+    sll r0, r0, 0
+    lq ra, 32(t5)
+    sll r0, r0, 0
+    lq gp, 48(t5)
+    sll r0, r0, 0
+    lq s5, 0(t7)   ;; load next bone
+    sll r0, r0, 0
+    lq s4, 16(t7)
+    sll r0, r0, 0
+    lq s3, 32(t7)
+    sll r0, r0, 0
+    lq s2, 48(t7)
+    sll r0, r0, 0
+    qmtc2.ni vf1, t8 ;; swap in new inputs
+    sll r0, r0, 0
+    qmtc2.ni vf2, t9
+    sll r0, r0, 0
+    qmtc2.ni vf3, ra
+    sll r0, r0, 0
+    qmtc2.ni vf4, gp
+    sll r0, r0, 0
+    qmtc2.ni vf5, s5
+    sll r0, r0, 0
+    qmtc2.ni vf6, s4
+    sll r0, r0, 0
+    qmtc2.ni vf7, s3
+    sll r0, r0, 0
+    qmtc2.ni vf8, s2
+    sll r0, r0, 0
+    qmfc2.i t8, vf13   ;; swap out result in (vf13, vf14, vf15, vf16) and (vf9, vf10, vf11)
+    sll r0, r0, 0
+    qmfc2.ni t9, vf14
+    sll r0, r0, 0
+    qmfc2.ni ra, vf15
+    sll r0, r0, 0
+    qmfc2.ni gp, vf16
+    sll r0, r0, 0
+    qmfc2.ni s5, vf9
+    sll r0, r0, 0
+    qmfc2.ni s4, vf10
+    sll r0, r0, 0
+    qmfc2.ni s3, vf11
+    vcallms 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sq t8, 0(t3)
+    sll r0, r0, 0
+    sq t9, 16(t3)
+    sll r0, r0, 0
+    sq ra, 32(t3)
+    sll r0, r0, 0
+    sq gp, 48(t3)
+    sll r0, r0, 0
+    sq s5, 64(t3)
+    sll r0, r0, 0
+    sq s4, 80(t3)
+    sll r0, r0, 0
+    sq s3, 96(t3)
+    sll r0, r0, 0
+    sq r0, 112(t3)
+    daddiu t3, t3, 128
+    daddiu t6, t6, -1
+    bgtz t6, L26
+    sll r0, r0, 0
+
+B17:
+    sll r0, r0, 0
+    lw t3, 40(at)
+    beq t3, r0, L29
+    sll r0, r0, 0
+
+B18:
+L27:
+    lw t4, 0(v1)
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    andi t4, t4, 256
+    sll r0, r0, 0
+    beq t4, r0, L28
+    sll r0, r0, 0
+
+B19:
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    beq r0, r0, L27
+    sll r0, r0, 0
+
+B20:
+L28:
+    dsll t5, t2, 2
+    sll r0, r0, 0
+    addiu t4, r0, 1
+    daddu t5, t5, at
+    sll r0, r0, 0
+    lwu t6, 8(t5)
+    dsll t5, t3, 2
+    andi t6, t6, 16383
+    daddu t5, t5, t3
+    sw t6, 128(v1)
+    dsll t6, t5, 4
+    sw a2, 16(v1)
+    addiu t7, r0, 256
+    sw t5, 32(v1)
+    daddu a2, a2, t6
+    sw t7, 0(v1)
+B21:
+L29:
+    sll r0, r0, 0
+    lw t5, 48(at)
+    sll r0, r0, 0
+    lw t6, 44(at)
+B22:
+L30:
+    lw t7, 0(t0)
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    andi t7, t7, 256
+    sll r0, r0, 0
+    beq t7, r0, L31
+    sll r0, r0, 0
+
+B23:
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    beq r0, r0, L30
+    sll r0, r0, 0
+
+B24:
+L31:
+    beq t6, r0, L32
+    sll r0, r0, 0
+
+B25:
+    dsll t7, t5, 2
+    lui t8, 28672
+    daddu t7, t7, t8
+    lwu t7, 16(t7)
+    andi t7, t7, 16383
+    sw t7, 128(t0)
+    sw a0, 16(t0)
+    dsll t7, t6, 3
+    sw t7, 32(t0)
+    addiu t7, r0, 256
+    sw t7, 0(t0)
+    dsll t6, t6, 7
+    daddu a0, a0, t6
+B26:
+L32:
+    beq t3, r0, L35
+    sll r0, r0, 0
+
+B27:
+    bne t4, r0, L35
+    sll r0, r0, 0
+
+B28:
+L33:
+    lw t6, 0(v1)
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    andi t6, t6, 256
+    sll r0, r0, 0
+    beq t6, r0, L34
+    sll r0, r0, 0
+
+B29:
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    sll r0, r0, 0
+    beq r0, r0, L33
+    sll r0, r0, 0
+
+B30:
+L34:
+    dsll t6, t2, 2
+    lui t7, 28672
+    daddu t6, t6, t7
+    lwu t6, 8(t6)
+    andi t6, t6, 16383
+    sw t6, 128(v1)
+    sw a2, 16(v1)
+    addiu t6, r0, 5
+    mult3 t6, t6, t3
+    sw t6, 32(v1)
+    addiu t6, r0, 256
+    sw t6, 0(v1)
+    addiu t6, r0, 80
+    mult3 t6, t6, t3
+    daddu a2, a2, t6
+B31:
+L35:
+    or t8, t2, r0
+    bne t1, r0, L22
+    or t2, t5, r0
+
+B32:
+    beq a3, r0, L22
+    addiu a3, r0, 1
+
+B33:
+L36:
+    or v0, r0, r0
+    ld ra, 0(sp)
+    lq gp, 80(sp)
+    lq s5, 64(sp)
+    lq s4, 48(sp)
+    lq s3, 32(sp)
+    lq s2, 16(sp)
+    jr ra
+    daddiu sp, sp, 96
+```
+
+
+# VU0 micoprogram
+
+- vf1, vf2, vf3, vf4 = inverse bind pose
+- vf5, vf6, vf7, vf8 = input bone matrix.
+- `vf28, vf29, vf30, vf31` the camera matrix
+- `vf25, vf26, vf27` the camera matrix again
+- (vf13, vf14, vf15, vf16) output point transformation
+- (vf09, vf10, vf11) output normal transformation
+
+```
+First: multiply bone and bind pose: (vf13, vf14, vf15, vf16) = (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4).
+This is doing a true matrix multiplication.
+  nop                        |  mulax.xyzw ACC, vf05, vf01
+  nop                        |  madday.xyzw ACC, vf06, vf01
+  nop                        |  maddaz.xyzw ACC, vf07, vf01
+  nop                        |  maddw.xyzw vf13, vf08, vf01
+  nop                        |  mulax.xyzw ACC, vf05, vf02
+  nop                        |  madday.xyzw ACC, vf06, vf02
+  nop                        |  maddaz.xyzw ACC, vf07, vf02
+  nop                        |  maddw.xyzw vf14, vf08, vf02
+  nop                        |  mulax.xyzw ACC, vf05, vf03
+  nop                        |  madday.xyzw ACC, vf06, vf03
+  nop                        |  maddaz.xyzw ACC, vf07, vf03
+  nop                        |  maddw.xyzw vf15, vf08, vf03
+  nop                        |  mulax.xyzw ACC, vf05, vf04
+  nop                        |  madday.xyzw ACC, vf06, vf04
+  nop                        |  maddaz.xyzw ACC, vf07, vf04
+  nop                        |  maddw.xyzw vf16, vf08, vf04
+
+;; vf09 = cross(y, z)
+  nop                        |  opmula.xyz ACC, vf14, vf15
+  nop                        |  opmsub.xyz vf09, vf15, vf14
+
+;; vf10 = cross(z, x)
+  nop                        |  opmula.xyz ACC, vf15, vf13
+  nop                        |  opmsub.xyz vf10, vf13, vf15
+
+;; vf11 = cross(x, y)
+  nop                        |  opmula.xyz ACC, vf13, vf14
+  nop                        |  opmsub.xyz vf11, vf14, vf13
+
+ ;; vf12 = cross (y, z) * x
+  nop                        |  mul.xyz vf12, vf13, vf09
+
+ ;; second multiply: doing (vf13....) = cam * (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4)
+  nop                        |  mulax.xyzw ACC, vf28, vf13
+  nop                        |  madday.xyzw ACC, vf29, vf13
+  nop                        |  maddaz.xyzw ACC, vf30, vf13
+  nop                        |  maddw.xyzw vf13, vf31, vf13
+
+  nop                        |  mulax.w ACC, vf00, vf12
+  nop                        |  madday.w ACC, vf00, vf12
+  nop                        |  maddz.w vf12, vf00, vf12
+  vf12.w = dot (cross(y, z), x)  [before the second multiply]
+
+  nop                        |  mulax.xyzw ACC, vf28, vf14
+  nop                        |  madday.xyzw ACC, vf29, vf14
+  nop                        |  maddaz.xyzw ACC, vf30, vf14
+  div Q, vf00.w, vf12.w      |  maddw.xyzw vf14, vf31, vf14  ;; divide
+  nop                        |  mulax.xyzw ACC, vf28, vf15
+  nop                        |  madday.xyzw ACC, vf29, vf15
+  nop                        |  maddaz.xyzw ACC, vf30, vf15
+  nop                        |  maddw.xyzw vf15, vf31, vf15
+  nop                        |  mulax.xyzw ACC, vf28, vf16
+  nop                        |  madday.xyzw ACC, vf29, vf16
+  nop                        |  maddaz.xyzw ACC, vf30, vf16
+  nop                        |  maddw.xyzw vf16, vf31, vf16
+
+;; normal scale
+  nop                        |  mul.xyzw vf09, vf09, Q
+  nop                        |  mul.xyzw vf10, vf10, Q
+  nop                        |  mul.xyzw vf11, vf11, Q
+
+;; apply cam to normal matrix too
+  nop                        |  mulax.xyzw ACC, vf25, vf09
+  nop                        |  madday.xyzw ACC, vf26, vf09
+  nop                        |  maddz.xyzw vf09, vf27, vf09
+  nop                        |  mulax.xyzw ACC, vf25, vf10
+  nop                        |  madday.xyzw ACC, vf26, vf10
+  nop                        |  maddz.xyzw vf10, vf27, vf10
+  nop                        |  mulax.xyzw ACC, vf25, vf11
+  nop                        |  madday.xyzw ACC, vf26, vf11 :e
+  nop                        |  maddz.xyzw vf11, vf27, vf11
+```
\ No newline at end of file
diff --git a/goal_src/jak1/engine/gfx/foreground/bones.gc b/goal_src/jak1/engine/gfx/foreground/bones.gc
index 8fac0ff0e1..6c36dc9f08 100644
--- a/goal_src/jak1/engine/gfx/foreground/bones.gc
+++ b/goal_src/jak1/engine/gfx/foreground/bones.gc
@@ -422,6 +422,141 @@
 
 (def-mips2c bones-mtx-calc (function int pointer pointer int object none))
 
+(defmacro .cross.vf (out a b)
+  `(begin
+     (.outer.product.a.vf acc ,a ,b)
+     (.outer.product.b.vf ,out ,b ,a acc)
+     )
+  )
+
+(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
+  "Compute skinning matrices."
+  ;; (declare (print-asm))
+  (dotimes (i (- count 1))
+    (let ((b (-> bones (+ i 1) transform))
+          (j (-> joints i bind-pose))
+          (out (-> output (+ i 1)))
+          )
+      (rlet (
+             (tmat0 :class vf)
+             (tmat1 :class vf)
+             (tmat2 :class vf)
+             (tmat3 :class vf)
+             (nmat0 :class vf)
+             (nmat1 :class vf)
+             (nmat2 :class vf)
+             (nmat3 :class vf)
+             (acc :class vf  )
+             (vf0 :class vf  )
+             (cam0 :class vf )
+             (cam1 :class vf )
+             (cam2 :class vf )
+             (cam3 :class vf )
+             )
+
+            (init-vf0-vector)
+
+            ;; load bind-pose to tmat:
+            (.lvf tmat0 (&-> j quad 0))
+            (.lvf tmat1 (&-> j quad 1))
+            (.lvf tmat2 (&-> j quad 2))
+            (.lvf tmat3 (&-> j quad 3))
+
+            ;; load bone to nmat
+            (.lvf nmat0 (&-> b quad 0))
+            (.lvf nmat1 (&-> b quad 1))
+            (.lvf nmat2 (&-> b quad 2))
+            (.lvf nmat3 (&-> b quad 3))
+
+            ;; multiply bone and bind pose, store in tmat
+            (.mul.x.vf acc nmat0 tmat0)
+            (.add.mul.y.vf acc nmat1 tmat0 acc)
+            (.add.mul.z.vf acc nmat2 tmat0 acc)
+            (.add.mul.w.vf tmat0 nmat3 tmat0 acc)
+            (.mul.x.vf acc nmat0 tmat1)
+            (.add.mul.y.vf acc nmat1 tmat1 acc)
+            (.add.mul.z.vf acc nmat2 tmat1 acc)
+            (.add.mul.w.vf tmat1 nmat3 tmat1 acc)
+            (.mul.x.vf acc nmat0 tmat2)
+            (.add.mul.y.vf acc nmat1 tmat2 acc)
+            (.add.mul.z.vf acc nmat2 tmat2 acc)
+            (.add.mul.w.vf tmat2 nmat3 tmat2 acc)
+            (.mul.x.vf acc nmat0 tmat3)
+            (.add.mul.y.vf acc nmat1 tmat3 acc)
+            (.add.mul.z.vf acc nmat2 tmat3 acc)
+            (.add.mul.w.vf tmat3 nmat3 tmat3 acc)
+
+            ;; compute inverse transpose, storing in nmat.
+            (.cross.vf nmat0 tmat1 tmat2)
+            (.cross.vf nmat1 tmat2 tmat0)
+            (.cross.vf nmat2 tmat0 tmat1)
+
+            ;; dot nmat0 and tmat0
+            (.mul.vf acc nmat0 tmat0)
+            (.add.y.vf acc acc acc :mask #b1)
+            (.add.z.vf acc acc acc :mask #b1)
+
+            ;; divide!
+            (.div.vf acc vf0 acc :fsf #b11 :ftf #b0)
+
+            ;; scale nmat:
+            (.mul.x.vf nmat0 nmat0 acc)
+            (.mul.x.vf nmat1 nmat1 acc)
+            (.mul.x.vf nmat2 nmat2 acc)
+
+            ;; load camera
+            (.lvf cam0 (&-> cam quad 0))
+            (.lvf cam1 (&-> cam quad 1))
+            (.lvf cam2 (&-> cam quad 2))
+            (.lvf cam3 (&-> cam quad 3))
+
+            ;; multiply tmat by camera
+            (.mul.x.vf acc cam0 tmat0)
+            (.add.mul.y.vf acc cam1 tmat0 acc)
+            (.add.mul.z.vf acc cam2 tmat0 acc)
+            (.add.mul.w.vf tmat0 cam3 tmat0 acc)
+            (.mul.x.vf acc cam0 tmat1)
+            (.add.mul.y.vf acc cam1 tmat1 acc)
+            (.add.mul.z.vf acc cam2 tmat1 acc)
+            (.add.mul.w.vf tmat1 cam3 tmat1 acc)
+            (.mul.x.vf acc cam0 tmat2)
+            (.add.mul.y.vf acc cam1 tmat2 acc)
+            (.add.mul.z.vf acc cam2 tmat2 acc)
+            (.add.mul.w.vf tmat2 cam3 tmat2 acc)
+            (.mul.x.vf acc cam0 tmat3)
+            (.add.mul.y.vf acc cam1 tmat3 acc)
+            (.add.mul.z.vf acc cam2 tmat3 acc)
+            (.add.mul.w.vf tmat3 cam3 tmat3 acc)
+
+            ;; store tmat
+            (.svf (&-> out t-mtx quad 0) tmat0)
+            (.svf (&-> out t-mtx quad 1) tmat1)
+            (.svf (&-> out t-mtx quad 2) tmat2)
+            (.svf (&-> out t-mtx quad 3) tmat3)
+
+            ;; multiply nmat
+            (.mul.x.vf acc cam0 nmat0)
+            (.add.mul.y.vf acc cam1 nmat0 acc)
+            (.add.mul.z.vf nmat0 cam2 nmat0 acc)
+            (.mul.x.vf acc cam0 nmat1)
+            (.add.mul.y.vf acc cam1 nmat1 acc)
+            (.add.mul.z.vf nmat1 cam2 nmat1 acc)
+            (.mul.x.vf acc cam0 nmat2)
+            (.add.mul.y.vf acc cam1 nmat2 acc)
+            (.add.mul.z.vf nmat2 cam2 nmat2 acc)
+
+            ;; store nmat
+            (.svf (&-> out n-mtx quad 0) nmat0)
+            (.svf (&-> out n-mtx quad 1) nmat1)
+            (.svf (&-> out n-mtx quad 2) nmat2)
+            )
+      )
+    )
+  (none)
+  )
+
+(define *use-new-bones* #t)
+
 (defun bones-mtx-calc-execute ()
   "Do all pending bone calculations"
   (local-vars (v1-14 float))
@@ -481,13 +616,20 @@
         (.mov v1-14 vf27)
         ;; hack??
 
-
-        (bones-mtx-calc
-          (the-as int (-> s4-0 matrix-area))
-          (the-as pointer (-> s4-0 joints))
-          (the-as pointer (-> s4-0 bones))
-          (the-as int (-> s4-0 num-bones))
-          v1-13 ;; hack, added
+        (if *use-new-bones*
+          (new-bones-mtx-calc-asm
+            (the (inline-array pris-mtx) (-> s4-0 matrix-area))
+            (-> s4-0 joints)
+            (-> s4-0 bones)
+            v1-13
+            (the int (-> s4-0 num-bones)))
+          (bones-mtx-calc
+            (the-as int (-> s4-0 matrix-area))
+            (the-as pointer (-> s4-0 joints))
+            (the-as pointer (-> s4-0 bones))
+            (the-as int (-> s4-0 num-bones))
+            v1-13 ;; hack, added
+            )
           )
         )
         (when (logtest? (-> s4-0 flags) (bone-calc-flags bncfl00))
diff --git a/goal_src/jak2/engine/gfx/foreground/bones.gc b/goal_src/jak2/engine/gfx/foreground/bones.gc
index 07bbd56cf7..b4092f48af 100644
--- a/goal_src/jak2/engine/gfx/foreground/bones.gc
+++ b/goal_src/jak2/engine/gfx/foreground/bones.gc
@@ -120,16 +120,224 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
 ;; see the C++ code for more details.
 (def-mips2c bones-mtx-calc (function (inline-array pris-mtx) (inline-array joint) (inline-array bone) uint object none))
 
+(defun matrix-*float! ((output matrix3) (input matrix3) (x float))
+  (dotimes (i 12)
+    (set! (-> output data i) (* x (-> input data i)))
+    )
+  output
+  )
+
+(defun matrix*!-first-three ((arg0 matrix3) (arg1 matrix3) (arg2 matrix))
+  "Set dst = src1 * src2. It is okay for any arguments to be the same data.
+   This is a moderately efficient implementation."
+  (rlet ((acc :class vf)
+         (vf10 :class vf)
+         (vf11 :class vf)
+         (vf12 :class vf)
+         (vf14 :class vf)
+         (vf15 :class vf)
+         (vf16 :class vf)
+         (vf18 :class vf)
+         (vf19 :class vf)
+         (vf20 :class vf)
+         )
+    (.lvf vf10 (&-> arg1 quad 0))
+    (.lvf vf14 (&-> arg2 quad 0))
+    (.lvf vf15 (&-> arg2 quad 1))
+    (.lvf vf16 (&-> arg2 quad 2))
+    (.lvf vf11 (&-> arg1 quad 1))
+    (.lvf vf12 (&-> arg1 quad 2))
+    (.mul.x.vf acc vf14 vf10)
+    (.add.mul.y.vf acc vf15 vf10 acc)
+    (.add.mul.z.vf vf18 vf16 vf10 acc)
+    (.mul.x.vf acc vf14 vf11)
+    (.add.mul.y.vf acc vf15 vf11 acc)
+    (.add.mul.z.vf vf19 vf16 vf11 acc)
+    (.mul.x.vf acc vf14 vf12)
+    (.add.mul.y.vf acc vf15 vf12 acc)
+    (.add.mul.z.vf vf20 vf16 vf12 acc)
+    (.svf (&-> arg0 quad 0) vf18)
+    (.svf (&-> arg0 quad 1) vf19)
+    (.svf (&-> arg0 quad 2) vf20)
+    arg0
+    )
+  )
+
+(defun new-bones-mtx-calc ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
+  (dotimes (i (- count 1))
+    (let ((b (-> bones (+ i 1) transform))
+          (j (-> joints i bind-pose))
+          (out (-> output (+ i 1)))
+          )
+
+      ;; multiply by bind pose
+      ;; mult swaps the args
+      (matrix*! (-> out t-mtx) j b)
+
+      ;; clever way to compute inverse transpose of a 3x3:
+      (vector-cross! (-> out n-mtx vector 0)
+                     (-> out t-mtx vector 1)
+                     (-> out t-mtx vector 2)
+                     )
+      (vector-cross! (-> out n-mtx vector 1)
+                     (-> out t-mtx vector 2)
+                     (-> out t-mtx vector 0)
+                     )
+      (vector-cross! (-> out n-mtx vector 2)
+                     (-> out t-mtx vector 0)
+                     (-> out t-mtx vector 1)
+                     )
+      (let ((scale (/ 1. (vector-dot (-> out n-mtx vector 0) (-> out t-mtx vector 0)))))
+        (matrix-*float! (-> out n-mtx) (-> out n-mtx) scale)
+        )
+
+      ;; multiply by camera
+      (matrix*! (-> out t-mtx) (-> out t-mtx) cam)
+      (matrix*!-first-three (-> out n-mtx) (-> out n-mtx) cam) ;; WRONG!!
+      )
+    )
+  )
+
+(defmacro .cross.vf (out a b)
+  `(begin
+     (.outer.product.a.vf acc ,a ,b)
+     (.outer.product.b.vf ,out ,b ,a acc)
+     )
+  )
+
+(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
+  ;; (declare (print-asm))
+  (dotimes (i (- count 1))
+    (let ((b (-> bones (+ i 1) transform))
+          (j (-> joints i bind-pose))
+          (out (-> output (+ i 1)))
+          )
+      (rlet (
+             (tmat0 :class vf)
+             (tmat1 :class vf)
+             (tmat2 :class vf)
+             (tmat3 :class vf)
+             (nmat0 :class vf)
+             (nmat1 :class vf)
+             (nmat2 :class vf)
+             (nmat3 :class vf)
+             (acc :class vf  )
+             (vf0 :class vf  )
+             (cam0 :class vf )
+             (cam1 :class vf )
+             (cam2 :class vf )
+             (cam3 :class vf )
+             )
+
+            (init-vf0-vector)
+
+            ;; load bind-pose to tmat:
+            (.lvf tmat0 (&-> j quad 0))
+            (.lvf tmat1 (&-> j quad 1))
+            (.lvf tmat2 (&-> j quad 2))
+            (.lvf tmat3 (&-> j quad 3))
+
+            ;; load bone to nmat
+            (.lvf nmat0 (&-> b quad 0))
+            (.lvf nmat1 (&-> b quad 1))
+            (.lvf nmat2 (&-> b quad 2))
+            (.lvf nmat3 (&-> b quad 3))
+
+            ;; multiply, store in tmat
+            (.mul.x.vf acc nmat0 tmat0)
+            (.add.mul.y.vf acc nmat1 tmat0 acc)
+            (.add.mul.z.vf acc nmat2 tmat0 acc)
+            (.add.mul.w.vf tmat0 nmat3 tmat0 acc)
+            (.mul.x.vf acc nmat0 tmat1)
+            (.add.mul.y.vf acc nmat1 tmat1 acc)
+            (.add.mul.z.vf acc nmat2 tmat1 acc)
+            (.add.mul.w.vf tmat1 nmat3 tmat1 acc)
+            (.mul.x.vf acc nmat0 tmat2)
+            (.add.mul.y.vf acc nmat1 tmat2 acc)
+            (.add.mul.z.vf acc nmat2 tmat2 acc)
+            (.add.mul.w.vf tmat2 nmat3 tmat2 acc)
+            (.mul.x.vf acc nmat0 tmat3)
+            (.add.mul.y.vf acc nmat1 tmat3 acc)
+            (.add.mul.z.vf acc nmat2 tmat3 acc)
+            (.add.mul.w.vf tmat3 nmat3 tmat3 acc)
+
+            ;; compute inverse transpose, storing in nmat
+            (.cross.vf nmat0 tmat1 tmat2)
+            (.cross.vf nmat1 tmat2 tmat0)
+            (.cross.vf nmat2 tmat0 tmat1)
+
+            ;; dot nmat0 and tmat0
+            (.mul.vf acc nmat0 tmat0)
+            (.add.y.vf acc acc acc :mask #b1)
+            (.add.z.vf acc acc acc :mask #b1)
+
+            ;; divide!
+            (.div.vf acc vf0 acc :fsf #b11 :ftf #b0)
+
+            ;; scale nmat:
+            (.mul.x.vf nmat0 nmat0 acc)
+            (.mul.x.vf nmat1 nmat1 acc)
+            (.mul.x.vf nmat2 nmat2 acc)
+
+            ;; load camera
+            (.lvf cam0 (&-> cam quad 0))
+            (.lvf cam1 (&-> cam quad 1))
+            (.lvf cam2 (&-> cam quad 2))
+            (.lvf cam3 (&-> cam quad 3))
+
+            ;; multiply tmat by camera
+            (.mul.x.vf acc cam0 tmat0)
+            (.add.mul.y.vf acc cam1 tmat0 acc)
+            (.add.mul.z.vf acc cam2 tmat0 acc)
+            (.add.mul.w.vf tmat0 cam3 tmat0 acc)
+            (.mul.x.vf acc cam0 tmat1)
+            (.add.mul.y.vf acc cam1 tmat1 acc)
+            (.add.mul.z.vf acc cam2 tmat1 acc)
+            (.add.mul.w.vf tmat1 cam3 tmat1 acc)
+            (.mul.x.vf acc cam0 tmat2)
+            (.add.mul.y.vf acc cam1 tmat2 acc)
+            (.add.mul.z.vf acc cam2 tmat2 acc)
+            (.add.mul.w.vf tmat2 cam3 tmat2 acc)
+            (.mul.x.vf acc cam0 tmat3)
+            (.add.mul.y.vf acc cam1 tmat3 acc)
+            (.add.mul.z.vf acc cam2 tmat3 acc)
+            (.add.mul.w.vf tmat3 cam3 tmat3 acc)
+
+            ;; store tmat
+            (.svf (&-> out t-mtx quad 0) tmat0)
+            (.svf (&-> out t-mtx quad 1) tmat1)
+            (.svf (&-> out t-mtx quad 2) tmat2)
+            (.svf (&-> out t-mtx quad 3) tmat3)
+
+            ;; multiply nmat
+            (.mul.x.vf acc cam0 nmat0)
+            (.add.mul.y.vf acc cam1 nmat0 acc)
+            (.add.mul.z.vf nmat0 cam2 nmat0 acc)
+            (.mul.x.vf acc cam0 nmat1)
+            (.add.mul.y.vf acc cam1 nmat1 acc)
+            (.add.mul.z.vf nmat1 cam2 nmat1 acc)
+            (.mul.x.vf acc cam0 nmat2)
+            (.add.mul.y.vf acc cam1 nmat2 acc)
+            (.add.mul.z.vf nmat2 cam2 nmat2 acc)
+
+            ;; store nmat
+            (.svf (&-> out n-mtx quad 0) nmat0)
+            (.svf (&-> out n-mtx quad 1) nmat1)
+            (.svf (&-> out n-mtx quad 2) nmat2)
+            )
+      )
+    )
+  (none)
+  )
+
+
+(define *use-new-bones* #t)
+(define *display-bone-stats* #f)
+(define *num-bones* 0)
+
 (defun bones-mtx-calc-execute ()
   "Execute all bone matrix calculations."
   (rlet ((vf1 :class vf)
-         (vf25 :class vf)
-         (vf26 :class vf)
-         (vf27 :class vf)
-         (vf28 :class vf)
-         (vf29 :class vf)
-         (vf30 :class vf)
-         (vf31 :class vf)
          (vf4 :class vf)
          (vf5 :class vf)
          (vf6 :class vf)
@@ -157,6 +365,7 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
                  (s5-1 (-> *math-camera* camera-rot))
                  (s4-1 (-> v1-26 first))
                  )
+            (set! *num-bones* 0)
             (while (nonzero? s4-1) ;; loop
 
               ;; pick correct matrix. The pris-mtx includes camera rotation, so all that's needed in the final
@@ -167,23 +376,19 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
                                )
                            )
                     )
-                (.lvf vf28 (&-> v1-29 quad 0))
-                (.lvf vf29 (&-> v1-29 quad 1))
-                (.lvf vf30 (&-> v1-29 quad 2))
-                (.lvf vf31 (&-> v1-29 trans quad))
-                (.lvf vf25 (&-> v1-29 quad 0))
-                (.lvf vf26 (&-> v1-29 quad 1))
-                (.lvf vf27 (&-> v1-29 quad 2))
-
                 ;; calculate!
-                (bones-mtx-calc
-                  (-> s4-1 matrix-area)
-                  (-> s4-1 joints)
-                  (-> s4-1 bones)
-                  (-> s4-1 num-bones)
-                  v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way.
-                  )
+                (if *use-new-bones*
+                    (new-bones-mtx-calc-asm (-> s4-1 matrix-area) (-> s4-1 joints) (-> s4-1 bones) v1-29 (the int (-> s4-1 num-bones)))
+                    (bones-mtx-calc
+                      (-> s4-1 matrix-area)
+                      (-> s4-1 joints)
+                      (-> s4-1 bones)
+                      (-> s4-1 num-bones)
+                      v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way.
+                      )
+                    )
                 )
+              (+! *num-bones* (-> s4-1 num-bones))
 
               ;; there is an optional post-processing step for ripple.
               (when (logtest? (-> s4-1 flags) (bone-calc-flags write-ripple-data))
@@ -228,6 +433,10 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
               )
             )
 
+          (when *display-bone-stats*
+            (format *stdcon* "num-bones: ~D~%" *num-bones*)
+            )
+
           ;; reset sqwc
           ;; (set! (-> (the-as dma-bank-control #x1000e000) sqwc) (new 'static 'dma-sqwc :sqwc #x1 :tqwc #x1))
 
diff --git a/goalc/compiler/compilation/Asm.cpp b/goalc/compiler/compilation/Asm.cpp
index 95f8822742..da831286b6 100644
--- a/goalc/compiler/compilation/Asm.cpp
+++ b/goalc/compiler/compilation/Asm.cpp
@@ -1233,17 +1233,17 @@ Val* Compiler::compile_asm_div_vf(const goos::Object& form, const goos::Object&
   // Why do we even bother using VDIVPS instead of FDIV? Because otherwise in x86, you have to use
   // the FPU stack Registers are nicer.
 
-  // Save one temp reg, use the destination as one
-  auto temp_reg = env->make_vfr(dest->type());
+  auto temp_reg1 = env->make_vfr(dest->type());
+  auto temp_reg2 = env->make_vfr(dest->type());
 
-  // Splat src1's value into the dest reg, keep it simple, this way no matter which vector component
+  // Splat src1's value into a temp reg, keep it simple, this way no matter which vector component
   // is accessed from the final result will be the correct answer
-  env->emit_ir<IR_SplatVF>(form, color, dest, src1, ftf_fsf_to_vector_element(fsf));
+  env->emit_ir<IR_SplatVF>(form, color, temp_reg1, src1, ftf_fsf_to_vector_element(fsf));
   // Splat src1's value into the the temp reg
-  env->emit_ir<IR_SplatVF>(form, color, temp_reg, src2, ftf_fsf_to_vector_element(ftf));
+  env->emit_ir<IR_SplatVF>(form, color, temp_reg2, src2, ftf_fsf_to_vector_element(ftf));
 
   // Perform the Division
-  env->emit_ir<IR_VFMath3Asm>(form, color, dest, dest, temp_reg, IR_VFMath3Asm::Kind::DIV);
+  env->emit_ir<IR_VFMath3Asm>(form, color, dest, temp_reg1, temp_reg2, IR_VFMath3Asm::Kind::DIV);
   return get_none();
 }
 
diff --git a/goalc/regalloc/Allocator_v2.cpp b/goalc/regalloc/Allocator_v2.cpp
index 5c178e82ee..178037f106 100644
--- a/goalc/regalloc/Allocator_v2.cpp
+++ b/goalc/regalloc/Allocator_v2.cpp
@@ -332,7 +332,7 @@ const std::vector<emitter::Register>& get_alloc_order(int var_idx,
       if (is_gpr) {
         return REG_temp_first_order.gprs;
       } else {
-        return REG_temp_only_order.xmms;
+        return REG_temp_first_order.xmms;
       }
     }
   }
@@ -1060,8 +1060,10 @@ bool run_assignment_on_var(const AllocationInput& input,
       for (auto& reg : assign_order) {
         bool worked = check_register_assign(input, *cache, var_idx, reg);
         if (trace) {
-          lg::print("m2 trying var {} in {}: {}\n", cache->iregs.at(var_idx).to_string(),
-                    reg.print(), worked);
+          const auto& this_var = cache->vars.at(var_idx);
+          lg::print("m2 trying var {} in {} (live {} to {}): {}\n",
+                    cache->iregs.at(var_idx).to_string(), reg.print(), this_var.first_live(),
+                    this_var.last_live(), worked);
         }
         if (worked) {
           var.assign_to_register(reg);
diff --git a/goalc/regalloc/allocator_interface.cpp b/goalc/regalloc/allocator_interface.cpp
index 5ad94884c9..c24cb5c2c8 100644
--- a/goalc/regalloc/allocator_interface.cpp
+++ b/goalc/regalloc/allocator_interface.cpp
@@ -24,7 +24,7 @@ void print_allocate_input(const AllocationInput& in) {
     }
   } else {
     for (const auto& instruction : in.instructions) {
-      lg::print(" [{:3d}] {}\n", instruction.print());
+      lg::print(" {}\n", instruction.print());
     }
   }
   lg::print("[RegAlloc] Debug Input Constraints:\n");