Dreamcast: Optimise vertex TnL

Not completely accurate, but in Flycast, ~150,000 vertices went from 29 to 44 FPS.
This commit is contained in:
UnknownShadow200 2025-05-31 22:07:55 +10:00
parent f082d17ee4
commit 4286c2d0c1
4 changed files with 54 additions and 50 deletions

View File

@ -52,10 +52,11 @@
.global _ClipEdge
.align 4
_ClipEdge:
add #12, IN1 ! EX, IN1 = &v1->z
fschg ! FE (swap to 32 bit FPU loads/stores)
add #28, IN1 ! EX, IN1 = &v1->z
fldi0 fr4 ! LS, fr4 = 0
fmov.s @IN1, fr2 ! LS, fr2 = v1->z
add #12, IN2 ! EX, IN2 = &v2->z
add #28, IN2 ! EX, IN2 = &v2->z
fldi0 fr5 ! LS, fr5 = 0
fmov.s @IN2,fr11 ! LS, fr11 = v2->z
fsub fr2,fr11 ! FE, fr11 = v2->z - v1->z
@ -70,9 +71,9 @@ _ClipEdge:
fabs fr2 ! LS, fr2 = abs(v1->z)
mov.l TYP,@OUT ! LS, dst->cmd = TYPE
fmul fr2,fr11 ! FE, fr11 = abs(v1->Z) / abs(v2->z - v1->z) --> t
add #-8, IN1 ! EX, IN1 = &v1->x
add #-24, IN1 ! EX, IN1 = &v1->x
fldi1 fr10 ! LS, fr10 = 1
add #-8, IN2 ! EX, IN2 = &v2->x
add #-24, IN2 ! EX, IN2 = &v2->x
add #4, OUT ! EX, OUT = &dst->x
fsub fr11,fr10 ! FE, invT = 1.0 - t --> invT
@ -200,6 +201,7 @@ _ClipEdge:
1:
mov.l CLO,@OUT ! LS, OUT->color = OUTCOLOR
add #-24, OUT ! EX, OUT -= 24
fschg ! FE (swap to 64 bit FPU loads/stores)
rts ! CO, return after executing instruction in delay slot
pref @OUT ! LS, trigger store queue flush
.size _ClipEdge, .-_ClipEdge

View File

@ -85,26 +85,26 @@
! To take advantage of SH4 dual instruction processing,
! clipflag calculation and vertex output are interleaved
.macro ProcessVertex1
! Stores the vertex fields through pre-decremented DST (so fields are written last-to-first)
! and sets FLG to the result of the Z > 0 comparison (1 or 0).
fmov.s F_W,@-DST ! LS, dst->w = W
fmov.s F_Z,@-DST ! LS, dst->z = Z
fmov.s F_C,@-DST ! LS, dst->c = C
fmov.s F_V,@-DST ! LS, dst->v = V
fcmp/gt ZERO, F_Z ! FE, T = Z > 0
fmov.s F_U,@-DST ! LS, dst->u = U
movt FLG ! EX, CLIPFLAGS = T
fmov.s F_Z,@-DST ! LS, dst->z = Z
fmov.s F_W,@-DST ! LS, dst->w = W
fmov.s F_Y,@-DST ! LS, dst->y = Y
fmov.s F_X,@-DST ! LS, dst->x = X
mov.l VTX,@-DST ! LS, dst->flags = PVR_CMD_VERTEX
.endm
.macro ProcessVertex2
fmov.s F_W,@-DST ! LS, dst->w = W
fmov.s F_Z,@-DST ! LS, dst->z = Z
fmov.s F_C,@-DST ! LS, dst->c = C
fmov.s F_V,@-DST ! LS, dst->v = V
fcmp/gt ZERO,F_Z ! FE, T = Z > 0
fmov.s F_U,@-DST ! LS, dst->u = U
movt TMP ! EX, tmp = T
fmov.s F_Z,@-DST ! LS, dst->z = Z
fmov.s F_W,@-DST ! LS, dst->w = W
add TMP,TMP ! EX, tmp = tmp + tmp
fmov.s F_Y,@-DST ! LS, dst->y = Y
or TMP,FLG ! EX, CLIPFLAGS |= tmp (T << 1)
@ -113,13 +113,13 @@
.endm
.macro ProcessVertex3
fmov.s F_W,@-DST ! LS, dst->w = W
fmov.s F_Z,@-DST ! LS, dst->z = Z
fmov.s F_C,@-DST ! LS, dst->c = C
fmov.s F_V,@-DST ! LS, dst->v = V
fcmp/gt ZERO, F_Z ! FE, T = Z > 0
fmov.s F_U,@-DST ! LS, dst->u = U
movt TMP ! EX, tmp = T
fmov.s F_Z,@-DST ! LS, dst->z = Z
fmov.s F_W,@-DST ! LS, dst->w = W
fmov.s F_Y,@-DST ! LS, dst->y = Y
shll2 TMP ! EX, tmp = tmp << 2
fmov.s F_X,@-DST ! LS, dst->x = X
@ -128,14 +128,14 @@
.endm
.macro ProcessVertex4
fmov.s F_W,@-DST ! LS, dst->w = W
fmov.s F_Z,@-DST ! LS, dst->z = Z
or EOS,FLG ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL
fmov.s F_C,@-DST ! LS, dst->c = C
fmov.s F_V,@-DST ! LS, dst->v = V
fcmp/gt ZERO, F_Z ! FE, T = Z > 0
fmov.s F_U,@-DST ! LS, dst->u = U
movt TMP ! EX, tmp = T
fmov.s F_Z,@-DST ! LS, dst->z = Z
fmov.s F_W,@-DST ! LS, dst->w = W
shll2 TMP ! EX, tmp = tmp << 2
fmov.s F_Y,@-DST ! LS, dst->y = Y
add TMP,TMP ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3)

View File

@ -8,10 +8,10 @@
/* Vertex record; the aligned(32) attribute places every instance on a 32-byte boundary. */
typedef struct {
/* Same 32 byte layout as pvr_vertex_t */
uint32_t flags;
float x, y, z;
float x, y, w;
uint32_t u, v; // really floats, but stored as uint for better load/store codegen
uint32_t bgra;
float w; // actually oargb, but repurposed since unused
float z; // actually oargb, but repurposed since unused
} __attribute__ ((aligned (32))) Vertex;
typedef struct {

View File

@ -2,46 +2,46 @@
#include <dc/pvr.h>
#include "gldc.h"
// Calculates 1/sqrt(x) with the SH4 fsrra instruction.
static GLDC_FORCE_INLINE float sh4_fsrra(float x) {
    float result = x;
    // "+f": the operand sits in an FPU register that fsrra reads and overwrites in place
    __asm__ volatile ("fsrra %0\n" : "+f" (result));
    return result;
}
// Scales a vertex's x/y by 1/sqrt(w*w) and writes the resulting 32-byte record to dst,
// issuing "pref" on dst afterwards.
//   v   - source vertex (read only)
//   dst - destination record; written field by field, then prefetched
// The C statements compute f = sh4_fsrra(w*w), scale x and y by f and store
// flags/x/y/f/u/v/bgra. The asm block then loads the source as four register pairs,
// multiplies X and Y by 1/sqrt(W*W), and stores the four pairs to dst in reverse order.
// NOTE(review): a C version and an asm version of the transform appear back to back here;
// confirm that only one of them is meant to be present.
// NOTE(review): the asm modifies %0 and %1 although both are declared as input-only operands.
// Both end at their starting values (%0: four +8 post-increments, then -32;
// %1: +32, then four -8 pre-decrements), but the compiler is not told about the changes.
// NOTE(review): the drN forms of fmov.d presumably rely on the 64 bit load/store mode that
// SceneListSubmit selects with fschg - confirm before calling from elsewhere.
static GLDC_FORCE_INLINE void PushVertex(Vertex* v, volatile Vertex* dst) {
float ww = v->w * v->w;
dst->flags = v->flags;
float f = sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w
// Convert to NDC (viewport already applied)
float x = v->x * f;
float y = v->y * f;
dst->x = x;
dst->y = y;
dst->z = f;
dst->u = v->u;
dst->v = v->v;
dst->bgra = v->bgra;
__asm__("pref @%0" : : "r"(dst));
asm volatile (
"fmov.d @%0+, dr0\n" // LS, FX = *src, src += 8
"fmov.d @%0+, dr2\n" // LS, YW = *src, src += 8
"add #32, %1 \n" // EX, dst += 32
"fmul fr3, fr3\n" // FE, W = W * W
"fmov.d @%0+, dr4\n" // LS, UV = *src, src += 8
"fmov.d @%0+, dr6\n" // LS, C? = *src, src += 8
"fsrra fr3\n" // FE, W = 1/sqrt(W*W) ~ 1/W
"fmov.d dr6, @-%1\n" // LS, dst -= 8, *dst = C?
"fmov.d dr4, @-%1\n" // LS, dst -= 8, *dst = UV
"fmul fr3, fr2\n" // FE, Y = W * Y
"add #-32, %0 \n" // EX, src -= 32
"fmov.d dr2, @-%1\n" // LS, dst -= 8, *dst = YW
"fmul fr3, fr1\n" // FE, X = W * X
"fmov.d dr0, @-%1\n" // LS, dst -= 8, *dst = FX
"pref @%1 \n" // LS, flush store queue
:
: "r" (v), "r" (dst)
: "memory", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7"
);
}
// Copies one 32-byte record from v to dst unchanged and issues "pref" on dst.
//   v   - source record (read only)
//   dst - destination record
// The C statements copy eight 32-bit words in ascending order; the asm block copies the
// same 32 bytes as four 8-byte register pairs, storing them to dst in descending order.
// NOTE(review): a C copy and an asm copy appear back to back here; confirm that only one
// of them is meant to be present.
// NOTE(review): the asm modifies %0 and %1 although both are declared as input-only operands.
// Both end at their starting values (%0: four +8 post-increments, then -32;
// %1: +32, then four -8 pre-decrements), but the compiler is not told about the changes.
static inline void PushCommand(Vertex* v, volatile Vertex* dst) {
uint32_t* s = (uint32_t*)v;
volatile uint32_t* sq = (volatile uint32_t*)dst;
sq[0] = *(s++);
sq[1] = *(s++);
sq[2] = *(s++);
sq[3] = *(s++);
sq[4] = *(s++);
sq[5] = *(s++);
sq[6] = *(s++);
sq[7] = *(s++);
__asm__("pref @%0" : : "r"(sq));
asm volatile (
"add #32, %1 \n" // EX, dst += 32
"fmov.d @%0+, dr0\n" // LS, fr0_fr1 = *src, src += 8
"fmov.d @%0+, dr2\n" // LS, fr2_fr3 = *src, src += 8
"fmov.d @%0+, dr4\n" // LS, fr4_fr5 = *src, src += 8
"fmov.d @%0+, dr6\n" // LS, fr6_fr7 = *src, src += 8
"fmov.d dr6, @-%1\n" // LS, dst -= 8, *dst = fr6_fr7
"fmov.d dr4, @-%1\n" // LS, dst -= 8, *dst = fr4_fr5
"fmov.d dr2, @-%1\n" // LS, dst -= 8, *dst = fr2_fr3
"fmov.d dr0, @-%1\n" // LS, dst -= 8, *dst = fr0_fr1
"pref @%1 \n" // LS, flush store queue
"add #-32, %0 \n" // EX, src -= 32
:
: "r" (v), "r" (dst)
: "memory", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7"
);
}
extern void ClipEdge(Vertex* const v1, Vertex* const v2, volatile Vertex* vout, char type);
@ -58,6 +58,7 @@ extern void ProcessVertexList(Vertex* v3, int n, void* sq_addr);
void SceneListSubmit(Vertex* v3, int n) {
volatile Vertex* dst = (volatile Vertex*)MEM_AREA_SQ_BASE;
asm volatile ("fschg"); // swap to 64 bit loads/stores
for (int i = 0; i < n; i++, v3++)
{
@ -256,4 +257,5 @@ void SceneListSubmit(Vertex* v3, int n) {
break;
}
}
asm volatile ("fschg"); // swap back to 32 bit loads/stores
}