N64: Optimise clipping path (test world back to ~11.0 ms again)

2025-07-18 23:32:38 +10:00 · 2025-07-18 23:32:38 +10:00 · 952824356e
parent 9b4e0ebd80
commit 952824356e
2 changed files with 19 additions and 21 deletions
--- a/misc/n64/rsp_gpu.S
+++ b/misc/n64/rsp_gpu.S
@ -543,21 +543,14 @@ GPUCmd_DrawQuad:
 	#undef v2_cflags
 	#undef v3_cflags

-// ########################
-// Clipped triangle path
-// ########################
-	// If not, go with slow clipping path
-	
-	# now do the actual drawing
+// #######################
+// Slow clipped quad path
+// #######################
 	li a1, %lo(VERTEX_CACHE) + V0_OFFSET
 	li a2, %lo(VERTEX_CACHE) + V1_OFFSET
-	jal DrawClippedTriangle
 	li a3, %lo(VERTEX_CACHE) + V2_OFFSET
-
-	li a1, %lo(VERTEX_CACHE) + V0_OFFSET
-	li a2, %lo(VERTEX_CACHE) + V2_OFFSET
-	jal DrawClippedTriangle
-	li a3, %lo(VERTEX_CACHE) + V3_OFFSET
+	jal DrawClippedQuad
+	li a0, %lo(VERTEX_CACHE) + V3_OFFSET

    j RSPQ_Loop
    nop
@ -566,15 +559,17 @@ GPUCmd_DrawQuad:
    ################################################################
    # DrawClippedTriangle - Breaks a triangle into one or more clipped tris
    ################################################################
-.func DrawClippedTriangle
-DrawClippedTriangle:
+.func DrawClippedQuad
+DrawClippedQuad:
    sw ra, %lo(DRAW_TRI_RA) // TODO find a register for this

    lbu t0, SCREEN_VTX_CLIP_CODE(vtx1)
    lbu t1, SCREEN_VTX_CLIP_CODE(vtx2)
    lbu t2, SCREEN_VTX_CLIP_CODE(vtx3)
+    lbu t3, SCREEN_VTX_CLIP_CODE(vtx4)
    or t5, t0, t1
    or t5, t2
+    or t5, t3

    move s1, zero
    jal GL_ClipTriangle
--- a/misc/n64/rsp_gpu_clipping.inc
+++ b/misc/n64/rsp_gpu_clipping.inc
@ -1,5 +1,5 @@
 #define CLIPPING_PLANE_COUNT  6
-#define CLIPPING_CACHE_SIZE   9
+#define CLIPPING_CACHE_SIZE   10
 #define CLIPPING_PLANE_SIZE   8

    .section .data.gl_clipping
@ -14,7 +14,7 @@ CLIP_PLANES:
    .half 0, 0, 1, -1

    .align 4
-CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18
+CACHE_OFFSETS: .half 2,4,6,8, 10,12,14,16, 18,20

    .section .bss.gl_clipping

@ -33,8 +33,8 @@ CLIP_LISTS:
    #   Clip a triangle against the view-frustum by using the Sutherland-Hodgman algorithm
    #   https://en.wikipedia.org/wiki/Sutherland%E2%80%93Hodgman_algorithm
    # Args:
-    #   a1-a3 = Vertices
-    #   t5    = OR'd clip flags of the triangle's vertices
+    #   a1-a3,a0 = Vertices
+    #   t5       = OR'd clip flags of the four input vertices
    # Returns:
    #   s1    = Pointer to list of output vertices
    #   s2    = Pointer to end of list
@ -59,6 +59,7 @@ GL_ClipTriangle:
    #define vtx1            a1
    #define vtx2            a2
    #define vtx3            a3
+    #define vtx4            a0

    #define vplane          $v01
    #define vint_f          $v02
@ -85,13 +86,14 @@ GL_ClipTriangle:
    li in_list, %lo(CLIP_LIST0)
    move in_count, zero

-    # Put three original vertices in the out_list
+    # Put four original vertices in the out_list
    # (So after the initial swap they will be in the in_list)
    li out_list, %lo(CLIP_LIST1)
    sh vtx1, 0(out_list)
    sh vtx2, 2(out_list)
    sh vtx3, 4(out_list)
-    li out_count, 3*2
+    sh vtx4, 6(out_list)
+    li out_count, 4*2

    li plane, %lo(CLIP_PLANES)
    li plane_flag, 1
@ -100,7 +102,7 @@ GL_ClipTriangle:
    li t0, %lo(CACHE_OFFSETS)
    vxor voff1, voff1
    lqv voff0,  0,t0
-    lsv voff1, 16,t0
+    ldv voff1, 16,t0

    # Temporarily use the RDP staging area as a map of which cache slots are used
    # Init to zero
@ -201,6 +203,7 @@ gl_clip_no_swap:
    vlt vcache0, vcache0.h2
    vlt vcache0, vcache0.e4
    vlt vcache0, vcache1.e0
+    vlt vcache0, vcache1.e1

    mfc2 t0, vcache0.e0