Dreamcast: Optimise vertex TnL

not completely accurate, but in flycast ~150,000 vertices went from 29 to 44 FPS
2025-05-31 22:07:55 +10:00 · 2025-05-31 22:07:55 +10:00 · 4286c2d0c1
parent f082d17ee4
commit 4286c2d0c1
4 changed files with 54 additions and 50 deletions
--- a/misc/dreamcast/VertexClip2.S
+++ b/misc/dreamcast/VertexClip2.S
@ -52,10 +52,11 @@
 .global _ClipEdge
 .align 4
 _ClipEdge:
-	add      #12, IN1 ! EX, IN1  = &v1->z
+	fschg             ! FE (swap to 32 bit FPU loads/stores)
+	add      #28, IN1 ! EX, IN1  = &v1->z
 	fldi0    fr4      ! LS, fr4  = 0
 	fmov.s  @IN1, fr2 ! LS, fr2  = v1->z
-	add      #12, IN2 ! EX, IN   = &v2->z
+	add      #28, IN2 ! EX, IN2  = &v2->z
 	fldi0    fr5      ! LS, fr5  = 0
 	fmov.s  @IN2,fr11 ! LS, fr11 = v2->z
 	fsub     fr2,fr11 ! FE, fr11 = v2->z - v1->z
@ -70,9 +71,9 @@ _ClipEdge:
 	fabs     fr2      ! LS, fr2  = abs(v1->z)
 	mov.l  TYP,@OUT   ! LS, dst->cmd = TYPE
 	fmul     fr2,fr11 ! FE, fr11 = abs(v1->Z) / abs(v2->z - v1->z)  --> t
-	add      #-8, IN1 ! EX, IN1  = &v1->x
+	add     #-24, IN1 ! EX, IN1  = &v1->x
 	fldi1   fr10      ! LS, fr10 = 1
-	add      #-8, IN2 ! EX, IN2  = &v2->x
+	add     #-24, IN2 ! EX, IN2  = &v2->x
 	add       #4, OUT ! EX, OUT  = &dst->x
 	fsub    fr11,fr10 ! FE, invT = 1.0 - t  --> invT

@ -200,6 +201,7 @@ _ClipEdge:
 1:
 	mov.l  CLO,@OUT   ! LS, OUT->color = OUTCOLOR
 	add   #-24, OUT   ! EX, OUT -= 24
+	fschg             ! FE (swap to 64 bit FPU loads/stores)
 	rts               ! CO, return after executing instruction in delay slot
 	pref   @OUT       ! LS, trigger store queue flush
 .size _ClipEdge, .-_ClipEdge
--- a/misc/dreamcast/VertexTransform.S
+++ b/misc/dreamcast/VertexTransform.S
@ -85,26 +85,26 @@
 ! To take advantage of SH4 dual instruction processing, 
 !  clipflag calculation and vertex output are interleaved
 .macro ProcessVertex1
-    fmov.s  F_W,@-DST ! LS, dst->w = W
+    fmov.s  F_Z,@-DST ! LS, dst->z = Z
    fmov.s  F_C,@-DST ! LS, dst->c = C
    fmov.s  F_V,@-DST ! LS, dst->v = V
    fcmp/gt ZERO, F_Z ! FE, T = Z > 0
    fmov.s  F_U,@-DST ! LS, dst->u = U
    movt    FLG       ! EX, CLIPFLAGS = T
-    fmov.s  F_Z,@-DST ! LS, dst->z = Z
+    fmov.s  F_W,@-DST ! LS, dst->w = W
    fmov.s  F_Y,@-DST ! LS, dst->y = Y
    fmov.s  F_X,@-DST ! LS, dst->x = X
    mov.l   VTX,@-DST ! LS, dst->flags = PVR_CMD_VERTEX
 .endm

 .macro ProcessVertex2
-    fmov.s  F_W,@-DST ! LS, dst->w = W
+    fmov.s  F_Z,@-DST ! LS, dst->z = Z
    fmov.s  F_C,@-DST ! LS, dst->c = C
    fmov.s  F_V,@-DST ! LS, dst->v = V
    fcmp/gt ZERO,F_Z  ! FE, T = Z > 0
    fmov.s  F_U,@-DST ! LS, dst->u = U
    movt    TMP       ! EX, tmp = T
-    fmov.s  F_Z,@-DST ! LS, dst->z = Z
+    fmov.s  F_W,@-DST ! LS, dst->w = W
    add     TMP,TMP   ! EX, tmp = tmp + tmp
    fmov.s  F_Y,@-DST ! LS, dst->y = Y
    or      TMP,FLG   ! EX, CLIPFLAGS |= tmp (T << 1)
@ -113,13 +113,13 @@
 .endm

 .macro ProcessVertex3
-    fmov.s  F_W,@-DST ! LS, dst->w = W
+    fmov.s  F_Z,@-DST ! LS, dst->z = Z
    fmov.s  F_C,@-DST ! LS, dst->c = C
    fmov.s  F_V,@-DST ! LS, dst->v = V
    fcmp/gt ZERO, F_Z ! FE, T = Z > 0
    fmov.s  F_U,@-DST ! LS, dst->u = U
    movt    TMP       ! EX, tmp = T
-    fmov.s  F_Z,@-DST ! LS, dst->z = Z
+    fmov.s  F_W,@-DST ! LS, dst->w = W
    fmov.s  F_Y,@-DST ! LS, dst->y = Y
    shll2   TMP       ! EX, tmp = tmp << 2
    fmov.s  F_X,@-DST ! LS, dst->x = X
@ -128,14 +128,14 @@
 .endm

 .macro ProcessVertex4
-    fmov.s  F_W,@-DST ! LS, dst->w = W
+    fmov.s  F_Z,@-DST ! LS, dst->z = Z
    or      EOS,FLG   ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL
    fmov.s  F_C,@-DST ! LS, dst->c = C
    fmov.s  F_V,@-DST ! LS, dst->v = V
    fcmp/gt ZERO, F_Z ! FE, T = Z > 0
    fmov.s  F_U,@-DST ! LS, dst->u = U
    movt    TMP       ! EX, tmp = T
-    fmov.s  F_Z,@-DST ! LS, dst->z = Z
+    fmov.s  F_W,@-DST ! LS, dst->w = W
    shll2   TMP       ! EX, tmp = tmp << 2
    fmov.s  F_Y,@-DST ! LS, dst->y = Y
    add     TMP,TMP   ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3)
--- a/third_party/gldc/gldc.h
+++ b/third_party/gldc/gldc.h
@ -8,10 +8,10 @@
 typedef struct {
    /* Same 32 byte layout as pvr_vertex_t */
    uint32_t flags;
-    float x, y, z;
+    float x, y, w;
    uint32_t u, v; // really floats, but stored as uint for better load/store codegen
    uint32_t bgra;
-    float w; // actually oargb, but repurposed since unused
+    float z; // actually oargb, but repurposed since unused
 } __attribute__ ((aligned (32))) Vertex;

 typedef struct {
--- a/third_party/gldc/sh4.c
+++ b/third_party/gldc/sh4.c
@ -2,46 +2,46 @@
 #include <dc/pvr.h>
 #include "gldc.h"

-// calculates 1/sqrt(x)
-static GLDC_FORCE_INLINE float sh4_fsrra(float x) {
-  asm volatile ("fsrra %[value]\n"
-  : [value] "+f" (x) // outputs (r/w to FPU register)
-  : // no inputs
-  : // no clobbers
-  );
-  return x;
-}
-
 static GLDC_FORCE_INLINE void PushVertex(Vertex* v, volatile Vertex* dst) {
-	float ww   = v->w * v->w;
-	dst->flags = v->flags;
-	float f	= sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w
-	// Convert to NDC (viewport already applied)
-	float x	= v->x * f;
-	float y	= v->y * f;
-
-	dst->x	 = x;
-	dst->y	 = y;
-	dst->z	 = f;
-	dst->u	 = v->u;
-	dst->v	 = v->v;
-	dst->bgra  = v->bgra;
-	__asm__("pref @%0" : : "r"(dst));
+	asm volatile (
+"fmov.d    @%0+, dr0\n" // LS, FX  = *src, src += 8
+"fmov.d    @%0+, dr2\n" // LS, YW  = *src, src += 8
+"add        #32, %1 \n" // EX, dst += 32
+"fmul       fr3, fr3\n" // FE, W   = W * W 
+"fmov.d    @%0+, dr4\n" // LS, UV  = *src, src += 8
+"fmov.d    @%0+, dr6\n" // LS, C?  = *src, src += 8
+"fsrra           fr3\n" // FE, W   = 1/sqrt(W*W) ~ 1/W
+"fmov.d    dr6, @-%1\n" // LS, dst -= 8, *dst = C?
+"fmov.d    dr4, @-%1\n" // LS, dst -= 8, *dst = UV
+"fmul      fr3,  fr2\n" // FE, Y = W * Y
+"add      #-32,  %0 \n" // EX, src -= 32
+"fmov.d    dr2, @-%1\n" // LS, dst -= 8, *dst = YW
+"fmul      fr3,  fr1\n" // FE, X = W * X
+"fmov.d    dr0, @-%1\n" // LS, dst -= 8, *dst = FX
+"pref      @%1      \n" // LS, flush store queue
+  :
+  : "r" (v), "r" (dst)
+  : "memory", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7"
+  );
 }

 static inline void PushCommand(Vertex* v, volatile Vertex* dst)  {
-	uint32_t* s = (uint32_t*)v;
-	volatile uint32_t* sq = (volatile uint32_t*)dst;
-
-	sq[0] = *(s++);
-	sq[1] = *(s++);
-	sq[2] = *(s++);
-	sq[3] = *(s++);
-	sq[4] = *(s++);
-	sq[5] = *(s++);
-	sq[6] = *(s++);
-	sq[7] = *(s++);
-	__asm__("pref @%0" : : "r"(sq));
+	asm volatile (
+"add       #32, %1  \n" // EX, dst += 32
+"fmov.d    @%0+, dr0\n" // LS, fr0_fr1 = *src, src += 8
+"fmov.d    @%0+, dr2\n" // LS, fr2_fr3 = *src, src += 8
+"fmov.d    @%0+, dr4\n" // LS, fr4_fr5 = *src, src += 8
+"fmov.d    @%0+, dr6\n" // LS, fr6_fr7 = *src, src += 8
+"fmov.d    dr6, @-%1\n" // LS, dst -= 8, *dst = fr6_fr7
+"fmov.d    dr4, @-%1\n" // LS, dst -= 8, *dst = fr4_fr5
+"fmov.d    dr2, @-%1\n" // LS, dst -= 8, *dst = fr2_fr3
+"fmov.d    dr0, @-%1\n" // LS, dst -= 8, *dst = fr0_fr1
+"pref      @%1      \n" // LS, flush store queue
+"add       #-32, %0 \n" // EX, src -= 32
+  :
+  : "r" (v), "r" (dst)
+  : "memory", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7"
+  );
 }

 extern void ClipEdge(Vertex* const v1, Vertex* const v2, volatile Vertex* vout, char type);
@ -58,6 +58,7 @@ extern void ProcessVertexList(Vertex* v3, int n, void* sq_addr);

 void SceneListSubmit(Vertex* v3, int n) {
 	volatile Vertex* dst = (volatile Vertex*)MEM_AREA_SQ_BASE;
+	asm volatile ("fschg"); // swap to 64 bit loads/stores

 	for (int i = 0; i < n; i++, v3++) 
 	{
@ -256,4 +257,5 @@ void SceneListSubmit(Vertex* v3, int n) {
 			break;
 		}
 	}
+	asm volatile ("fschg"); // swap back to 32 bit loads/stores
 }