N64: Optimise clipping path (test world back to ~11.0 ms again)

This commit is contained in:
UnknownShadow200 2025-07-18 23:32:38 +10:00
parent 9b4e0ebd80
commit 952824356e
2 changed files with 19 additions and 21 deletions

View File

@ -543,21 +543,14 @@ GPUCmd_DrawQuad:
#undef v2_cflags
#undef v3_cflags
// ########################
// Clipped triangle path
// ########################
// If not, go with slow clipping path
# now do the actual drawing
// ###########################
// Slow clipped triangle path
// ###########################
li a1, %lo(VERTEX_CACHE) + V0_OFFSET
li a2, %lo(VERTEX_CACHE) + V1_OFFSET
jal DrawClippedTriangle
li a3, %lo(VERTEX_CACHE) + V2_OFFSET
li a1, %lo(VERTEX_CACHE) + V0_OFFSET
li a2, %lo(VERTEX_CACHE) + V2_OFFSET
jal DrawClippedTriangle
li a3, %lo(VERTEX_CACHE) + V3_OFFSET
jal DrawClippedQuad
li a0, %lo(VERTEX_CACHE) + V3_OFFSET
j RSPQ_Loop
nop
@ -566,15 +559,17 @@ GPUCmd_DrawQuad:
################################################################
# DrawClippedQuad - Breaks a quad into one or more clipped tris
################################################################
.func DrawClippedTriangle
DrawClippedTriangle:
.func DrawClippedQuad
DrawClippedQuad:
sw ra, %lo(DRAW_TRI_RA) // TODO find a register for this
lbu t0, SCREEN_VTX_CLIP_CODE(vtx1)
lbu t1, SCREEN_VTX_CLIP_CODE(vtx2)
lbu t2, SCREEN_VTX_CLIP_CODE(vtx3)
lbu t3, SCREEN_VTX_CLIP_CODE(vtx4)
or t5, t0, t1
or t5, t2
or t5, t3
move s1, zero
jal GL_ClipTriangle

View File

@ -1,5 +1,5 @@
#define CLIPPING_PLANE_COUNT 6
#define CLIPPING_CACHE_SIZE 9
#define CLIPPING_CACHE_SIZE 10
#define CLIPPING_PLANE_SIZE 8
.section .data.gl_clipping
@ -14,7 +14,7 @@ CLIP_PLANES:
.half 0, 0, 1, -1
.align 4
CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18
CACHE_OFFSETS: .half 2,4,6,8, 10,12,14,16, 18,20
.section .bss.gl_clipping
@ -33,8 +33,8 @@ CLIP_LISTS:
# Clip a quad (four input vertices) against the view frustum using the Sutherland-Hodgman algorithm
# https://en.wikipedia.org/wiki/Sutherland%E2%80%93Hodgman_algorithm
# Args:
# a1-a3 = Vertices
# t5 = OR'd clip flags of the triangle's vertices
# a1-a3,a0 = Vertices
# t5 = OR'd clip flags of all four input vertices
# Returns:
# s1 = Pointer to list of output vertices
# s2 = Pointer to end of list
@ -59,6 +59,7 @@ GL_ClipTriangle:
#define vtx1 a1
#define vtx2 a2
#define vtx3 a3
#define vtx4 a0
#define vplane $v01
#define vint_f $v02
@ -85,13 +86,14 @@ GL_ClipTriangle:
li in_list, %lo(CLIP_LIST0)
move in_count, zero
# Put three original vertices in the out_list
# Put four original vertices in the out_list
# (So after the initial swap they will be in the in_list)
li out_list, %lo(CLIP_LIST1)
sh vtx1, 0(out_list)
sh vtx2, 2(out_list)
sh vtx3, 4(out_list)
li out_count, 3*2
sh vtx4, 6(out_list)
li out_count, 4*2
li plane, %lo(CLIP_PLANES)
li plane_flag, 1
@ -100,7 +102,7 @@ GL_ClipTriangle:
li t0, %lo(CACHE_OFFSETS)
vxor voff1, voff1
lqv voff0, 0,t0
lsv voff1, 16,t0
ldv voff1, 16,t0
# Temporarily use the RDP staging area as a map of which cache slots are used
# Init to zero
@ -201,6 +203,7 @@ gl_clip_no_swap:
vlt vcache0, vcache0.h2
vlt vcache0, vcache0.e4
vlt vcache0, vcache1.e0
vlt vcache0, vcache1.e1
mfc2 t0, vcache0.e0