mirror of https://github.com/ClassiCube/ClassiCube
Dreamcast: Optimise vertex TnL
not completely accurate, but in flycast ~150,000 vertices went from 29 to 44 FPS
This commit is contained in:
parent
f082d17ee4
commit
4286c2d0c1
|
|
@ -52,10 +52,11 @@
|
|||
.global _ClipEdge
|
||||
.align 4
|
||||
_ClipEdge:
|
||||
add #12, IN1 ! EX, IN1 = &v1->z
|
||||
fschg ! FE (swap to 32 bit FPU loads/stores)
|
||||
add #28, IN1 ! EX, IN1 = &v1->z
|
||||
fldi0 fr4 ! LS, fr4 = 0
|
||||
fmov.s @IN1, fr2 ! LS, fr2 = v1->z
|
||||
add #12, IN2 ! EX, IN2 = &v2->z
|
||||
add #28, IN2 ! EX, IN2 = &v2->z
|
||||
fldi0 fr5 ! LS, fr5 = 0
|
||||
fmov.s @IN2,fr11 ! LS, fr11 = v2->z
|
||||
fsub fr2,fr11 ! FE, fr11 = v2->z - v1->z
|
||||
|
|
@ -70,9 +71,9 @@ _ClipEdge:
|
|||
fabs fr2 ! LS, fr2 = abs(v1->z)
|
||||
mov.l TYP,@OUT ! LS, dst->cmd = TYPE
|
||||
fmul fr2,fr11 ! FE, fr11 = abs(v1->Z) / abs(v2->z - v1->z) --> t
|
||||
add #-8, IN1 ! EX, IN1 = &v1->x
|
||||
add #-24, IN1 ! EX, IN1 = &v1->x
|
||||
fldi1 fr10 ! LS, fr10 = 1
|
||||
add #-8, IN2 ! EX, IN2 = &v2->x
|
||||
add #-24, IN2 ! EX, IN2 = &v2->x
|
||||
add #4, OUT ! EX, OUT = &dst->x
|
||||
fsub fr11,fr10 ! FE, invT = 1.0 - t --> invT
|
||||
|
||||
|
|
@ -200,6 +201,7 @@ _ClipEdge:
|
|||
1:
|
||||
mov.l CLO,@OUT ! LS, OUT->color = OUTCOLOR
|
||||
add #-24, OUT ! EX, OUT -= 24
|
||||
fschg ! FE (swap to 64 bit FPU loads/stores)
|
||||
rts ! CO, return after executing instruction in delay slot
|
||||
pref @OUT ! LS, trigger store queue flush
|
||||
.size _ClipEdge, .-_ClipEdge
|
||||
|
|
|
|||
|
|
@ -85,26 +85,26 @@
|
|||
! To take advantage of SH4 dual instruction processing,
|
||||
! clipflag calculation and vertex output are interleaved
|
||||
.macro ProcessVertex1
|
||||
fmov.s F_W,@-DST ! LS, dst->w = W
|
||||
fmov.s F_Z,@-DST ! LS, dst->z = Z
|
||||
fmov.s F_C,@-DST ! LS, dst->c = C
|
||||
fmov.s F_V,@-DST ! LS, dst->v = V
|
||||
fcmp/gt ZERO, F_Z ! FE, T = Z > 0
|
||||
fmov.s F_U,@-DST ! LS, dst->u = U
|
||||
movt FLG ! EX, CLIPFLAGS = T
|
||||
fmov.s F_Z,@-DST ! LS, dst->z = Z
|
||||
fmov.s F_W,@-DST ! LS, dst->w = W
|
||||
fmov.s F_Y,@-DST ! LS, dst->y = Y
|
||||
fmov.s F_X,@-DST ! LS, dst->x = X
|
||||
mov.l VTX,@-DST ! LS, dst->flags = PVR_CMD_VERTEX
|
||||
.endm
|
||||
|
||||
.macro ProcessVertex2
|
||||
fmov.s F_W,@-DST ! LS, dst->w = W
|
||||
fmov.s F_Z,@-DST ! LS, dst->z = Z
|
||||
fmov.s F_C,@-DST ! LS, dst->c = C
|
||||
fmov.s F_V,@-DST ! LS, dst->v = V
|
||||
fcmp/gt ZERO,F_Z ! FE, T = Z > 0
|
||||
fmov.s F_U,@-DST ! LS, dst->u = U
|
||||
movt TMP ! EX, tmp = T
|
||||
fmov.s F_Z,@-DST ! LS, dst->z = Z
|
||||
fmov.s F_W,@-DST ! LS, dst->w = W
|
||||
add TMP,TMP ! EX, tmp = tmp + tmp
|
||||
fmov.s F_Y,@-DST ! LS, dst->y = Y
|
||||
or TMP,FLG ! EX, CLIPFLAGS |= tmp (T << 1)
|
||||
|
|
@ -113,13 +113,13 @@
|
|||
.endm
|
||||
|
||||
.macro ProcessVertex3
|
||||
fmov.s F_W,@-DST ! LS, dst->w = W
|
||||
fmov.s F_Z,@-DST ! LS, dst->z = Z
|
||||
fmov.s F_C,@-DST ! LS, dst->c = C
|
||||
fmov.s F_V,@-DST ! LS, dst->v = V
|
||||
fcmp/gt ZERO, F_Z ! FE, T = Z > 0
|
||||
fmov.s F_U,@-DST ! LS, dst->u = U
|
||||
movt TMP ! EX, tmp = T
|
||||
fmov.s F_Z,@-DST ! LS, dst->z = Z
|
||||
fmov.s F_W,@-DST ! LS, dst->w = W
|
||||
fmov.s F_Y,@-DST ! LS, dst->y = Y
|
||||
shll2 TMP ! EX, tmp = tmp << 2
|
||||
fmov.s F_X,@-DST ! LS, dst->x = X
|
||||
|
|
@ -128,14 +128,14 @@
|
|||
.endm
|
||||
|
||||
.macro ProcessVertex4
|
||||
fmov.s F_W,@-DST ! LS, dst->w = W
|
||||
fmov.s F_Z,@-DST ! LS, dst->z = Z
|
||||
or EOS,FLG ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL
|
||||
fmov.s F_C,@-DST ! LS, dst->c = C
|
||||
fmov.s F_V,@-DST ! LS, dst->v = V
|
||||
fcmp/gt ZERO, F_Z ! FE, T = Z > 0
|
||||
fmov.s F_U,@-DST ! LS, dst->u = U
|
||||
movt TMP ! EX, tmp = T
|
||||
fmov.s F_Z,@-DST ! LS, dst->z = Z
|
||||
fmov.s F_W,@-DST ! LS, dst->w = W
|
||||
shll2 TMP ! EX, tmp = tmp << 2
|
||||
fmov.s F_Y,@-DST ! LS, dst->y = Y
|
||||
add TMP,TMP ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3)
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@
|
|||
typedef struct {
|
||||
/* Same 32 byte layout as pvr_vertex_t */
|
||||
uint32_t flags;
|
||||
float x, y, z;
|
||||
float x, y, w;
|
||||
uint32_t u, v; // really floats, but stored as uint for better load/store codegen
|
||||
uint32_t bgra;
|
||||
float w; // actually oargb, but repurposed since unused
|
||||
float z; // actually oargb, but repurposed since unused
|
||||
} __attribute__ ((aligned (32))) Vertex;
|
||||
|
||||
typedef struct {
|
||||
|
|
|
|||
|
|
@ -2,46 +2,46 @@
|
|||
#include <dc/pvr.h>
|
||||
#include "gldc.h"
|
||||
|
||||
// calculates 1/sqrt(x)
|
||||
static GLDC_FORCE_INLINE float sh4_fsrra(float x) {
|
||||
asm volatile ("fsrra %[value]\n"
|
||||
: [value] "+f" (x) // outputs (r/w to FPU register)
|
||||
: // no inputs
|
||||
: // no clobbers
|
||||
);
|
||||
return x;
|
||||
}
|
||||
|
||||
static GLDC_FORCE_INLINE void PushVertex(Vertex* v, volatile Vertex* dst) {
|
||||
float ww = v->w * v->w;
|
||||
dst->flags = v->flags;
|
||||
float f = sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w
|
||||
// Convert to NDC (viewport already applied)
|
||||
float x = v->x * f;
|
||||
float y = v->y * f;
|
||||
|
||||
dst->x = x;
|
||||
dst->y = y;
|
||||
dst->z = f;
|
||||
dst->u = v->u;
|
||||
dst->v = v->v;
|
||||
dst->bgra = v->bgra;
|
||||
__asm__("pref @%0" : : "r"(dst));
|
||||
asm volatile (
|
||||
"fmov.d @%0+, dr0\n" // LS, FX = *src, src += 8
|
||||
"fmov.d @%0+, dr2\n" // LS, YW = *src, src += 8
|
||||
"add #32, %1 \n" // EX, dst += 32
|
||||
"fmul fr3, fr3\n" // FE, W = W * W
|
||||
"fmov.d @%0+, dr4\n" // LS, UV = *src, src += 8
|
||||
"fmov.d @%0+, dr6\n" // LS, C? = *src, src += 8
|
||||
"fsrra fr3\n" // FE, W = 1/sqrt(W*W) ~ 1/W
|
||||
"fmov.d dr6, @-%1\n" // LS, dst -= 8, *dst = C?
|
||||
"fmov.d dr4, @-%1\n" // LS, dst -= 8, *dst = UV
|
||||
"fmul fr3, fr2\n" // FE, Y = W * Y
|
||||
"add #-32, %0 \n" // EX, src -= 32
|
||||
"fmov.d dr2, @-%1\n" // LS, dst -= 8, *dst = YW
|
||||
"fmul fr3, fr1\n" // FE, X = W * X
|
||||
"fmov.d dr0, @-%1\n" // LS, dst -= 8, *dst = FX
|
||||
"pref @%1 \n" // LS, flush store queue
|
||||
:
|
||||
: "r" (v), "r" (dst)
|
||||
: "memory", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7"
|
||||
);
|
||||
}
|
||||
|
||||
static inline void PushCommand(Vertex* v, volatile Vertex* dst) {
|
||||
uint32_t* s = (uint32_t*)v;
|
||||
volatile uint32_t* sq = (volatile uint32_t*)dst;
|
||||
|
||||
sq[0] = *(s++);
|
||||
sq[1] = *(s++);
|
||||
sq[2] = *(s++);
|
||||
sq[3] = *(s++);
|
||||
sq[4] = *(s++);
|
||||
sq[5] = *(s++);
|
||||
sq[6] = *(s++);
|
||||
sq[7] = *(s++);
|
||||
__asm__("pref @%0" : : "r"(sq));
|
||||
asm volatile (
|
||||
"add #32, %1 \n" // EX, dst += 32
|
||||
"fmov.d @%0+, dr0\n" // LS, fr0_fr1 = *src, src += 8
|
||||
"fmov.d @%0+, dr2\n" // LS, fr2_fr3 = *src, src += 8
|
||||
"fmov.d @%0+, dr4\n" // LS, fr4_fr5 = *src, src += 8
|
||||
"fmov.d @%0+, dr6\n" // LS, fr6_fr7 = *src, src += 8
|
||||
"fmov.d dr6, @-%1\n" // LS, dst -= 8, *dst = fr6_fr7
|
||||
"fmov.d dr4, @-%1\n" // LS, dst -= 8, *dst = fr4_fr5
|
||||
"fmov.d dr2, @-%1\n" // LS, dst -= 8, *dst = fr2_fr3
|
||||
"fmov.d dr0, @-%1\n" // LS, dst -= 8, *dst = fr0_fr1
|
||||
"pref @%1 \n" // LS, flush store queue
|
||||
"add #-32, %0 \n" // EX, src -= 32
|
||||
:
|
||||
: "r" (v), "r" (dst)
|
||||
: "memory", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7"
|
||||
);
|
||||
}
|
||||
|
||||
extern void ClipEdge(Vertex* const v1, Vertex* const v2, volatile Vertex* vout, char type);
|
||||
|
|
@ -58,6 +58,7 @@ extern void ProcessVertexList(Vertex* v3, int n, void* sq_addr);
|
|||
|
||||
void SceneListSubmit(Vertex* v3, int n) {
|
||||
volatile Vertex* dst = (volatile Vertex*)MEM_AREA_SQ_BASE;
|
||||
asm volatile ("fschg"); // swap to 64 bit loads/stores
|
||||
|
||||
for (int i = 0; i < n; i++, v3++)
|
||||
{
|
||||
|
|
@ -256,4 +257,5 @@ void SceneListSubmit(Vertex* v3, int n) {
|
|||
break;
|
||||
}
|
||||
}
|
||||
asm volatile ("fschg"); // swap back to 32 bit loads/stores
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue