From 8a2a1eb41a6a9d3b9b398fc9f763695cf37eccfe Mon Sep 17 00:00:00 2001 From: Cuyler36 Date: Wed, 18 Jun 2025 07:43:31 -0400 Subject: [PATCH] link dolphin ar, ai, & mtx. Finish Dolphin SDK --- configure.py | 27 +- include/dolphin/ar.h | 2 +- include/dolphin/mtx.h | 63 ++-- src/static/dolphin/ai/ai.c | 348 ++++++++++++++++++++ src/static/dolphin/ar/__ar.h | 13 + src/static/dolphin/ar/ar.c | 314 ++++++++++++++++++ src/static/dolphin/ar/arq.c | 150 +++++++++ src/static/dolphin/mtx/mtx.c | 563 ++++++++++++++++++++++++++++++++ src/static/dolphin/mtx/mtx44.c | 88 +++++ src/static/dolphin/mtx/mtxvec.c | 103 ++++++ src/static/dolphin/mtx/vec.c | 171 ++++++++++ 11 files changed, 1810 insertions(+), 32 deletions(-) create mode 100644 src/static/dolphin/ai/ai.c create mode 100644 src/static/dolphin/ar/__ar.h create mode 100644 src/static/dolphin/ar/ar.c create mode 100644 src/static/dolphin/ar/arq.c create mode 100644 src/static/dolphin/mtx/mtx.c create mode 100644 src/static/dolphin/mtx/mtx44.c create mode 100644 src/static/dolphin/mtx/mtxvec.c create mode 100644 src/static/dolphin/mtx/vec.c diff --git a/configure.py b/configure.py index 754938e8..993f253d 100644 --- a/configure.py +++ b/configure.py @@ -276,6 +276,17 @@ def DolphinLib(lib_name: str, objects: List[Object]) -> Dict[str, Any]: } +def DolphinLibMtx(lib_name: str, objects: List[Object]) -> Dict[str, Any]: + return { + "lib": lib_name, + "mw_version": "GC/1.2.5", + "cflags": [*cflags_runtime, "-char signed"], + "progress_category": "sdk", + "src_dir": "src/static", + "objects": objects, + } + + def JSystemLib(lib_name: str, objects: List[Object]) -> Dict[str, Any]: return { "lib": lib_name, @@ -435,14 +446,14 @@ config.libs = [ DolphinLib( "ai", [ - Object(NonMatching, "dolphin/ai/ai.c"), + Object(Matching, "dolphin/ai/ai.c"), ], ), DolphinLib( "ar", [ - Object(NonMatching, "dolphin/ar/ar.c"), - Object(NonMatching, "dolphin/ar/arq.c"), + Object(Matching, "dolphin/ar/ar.c"), + Object(Matching, "dolphin/ar/arq.c"), 
], ), DolphinLib( @@ -539,13 +550,13 @@ config.libs = [ Object(Matching, "dolphin/gx/GXTransform.c"), ], ), - DolphinLib( + DolphinLibMtx( "mtx", [ - Object(NonMatching, "dolphin/mtx/mtx.c"), - Object(NonMatching, "dolphin/mtx/mtx44.c"), - Object(NonMatching, "dolphin/mtx/mtxvec.c"), - Object(NonMatching, "dolphin/mtx/vec.c"), + Object(Matching, "dolphin/mtx/mtx.c"), + Object(Matching, "dolphin/mtx/mtx44.c"), + Object(Matching, "dolphin/mtx/mtxvec.c"), + Object(Matching, "dolphin/mtx/vec.c"), ], ), DolphinLib( diff --git a/include/dolphin/ar.h b/include/dolphin/ar.h index 9c53e70f..b73dffc7 100644 --- a/include/dolphin/ar.h +++ b/include/dolphin/ar.h @@ -35,7 +35,7 @@ void ARQInit(); void ARQPostRequest(ARQRequest* task, u32 owner, u32 type, u32 priority, u32 source, u32 dest, u32 length, ARQCallback callback); // AR functions. -ARCallback ARRegisterDMACallback(ARCallback callback); +ARQCallback ARRegisterDMACallback(ARQCallback callback); u32 ARGetDMAStatus(); void ARStartDMA(u32 type, u32 mainmem_addr, u32 aram_addr, u32 length); u32 ARAlloc(u32 length); diff --git a/include/dolphin/mtx.h b/include/dolphin/mtx.h index b18ae3b2..102f9d4e 100644 --- a/include/dolphin/mtx.h +++ b/include/dolphin/mtx.h @@ -13,10 +13,8 @@ extern "C" { #define MTXDegToRad(a) ((a) * 0.01745329252f) typedef struct { - f32 x; - f32 y; - f32 z; -} Vec; + f32 x, y, z; +} Vec, *VecPtr, Point3d, *Point3dPtr; typedef f32 Mtx34[3][4]; typedef f32 Mtx23[2][3]; @@ -86,26 +84,45 @@ static inline void MTXSetPosition(GC_Mtx mtx, const Vec* pos) { //////////////////////////////////////////// #ifdef MTX_PS -#define MTXIdentity PSMTXIdentity -#define MTXCopy PSMTXCopy -#define MTXConcat PSMTXConcat -#define MTXConcatArray PSMTXConcatArray -#define MTXTranspose PSMTXTranspose -#define MTXInverse PSMTXInverse -#define MTXInvXpose PSMTXInvXpose - -#define MTXScale PSMTXScale -#define MTXTrans PSMTXTrans - -#define MTXMultVec PSMTXMultVec +#define VECSquareMag PSVECSquareMag +#define VECNormalize 
PSVECNormalize +#define VECDistance PSVECDistance +#define VECMag PSVECMag +#define VECAdd PSVECAdd +#define VECDotProduct PSVECDotProduct +#define VECSquareDistance PSVECSquareDistance +#define VECCrossProduct PSVECCrossProduct +#define MTXMultVec PSMTXMultVec +#define MTXMultVecArray PSMTXMultVecArray +#define MTXMultVecSR PSMTXMultVecSR +#define MTXScale PSMTXScale +#define MTXCopy PSMTXCopy +#define MTXConcat PSMTXConcat +#define MTXInverse PSMTXInverse +#define MTXTranspose PSMTXTranspose +#define MTXIdentity PSMTXIdentity +#define MTXRotRad PSMTXRotRad +#define MTXTrans PSMTXTrans #else -#define MTXIdentity C_MTXIdentity -#define MTXCopy C_MTXCopy -#define MTXConcat C_MTXConcat -#define MTXConcatArray C_MTXConcatArray -#define MTXTranspose C_MTXTranspose -#define MTXInverse C_MTXInverse -#define MTXInvXpose C_MTXInvXpose +#define VECSquareMag C_VECSquareMag +#define VECNormalize C_VECNormalize +#define VECDistance C_VECDistance +#define VECMag C_VECMag +#define VECAdd C_VECAdd +#define VECDotProduct C_VECDotProduct +#define VECSquareDistance C_VECSquareDistance +#define VECCrossProduct C_VECCrossProduct +#define MTXMultVec C_MTXMultVec +#define MTXMultVecArray C_MTXMultVecArray +#define MTXMultVecSR C_MTXMultVecSR +#define MTXScale C_MTXScale +#define MTXCopy C_MTXCopy +#define MTXConcat C_MTXConcat +#define MTXInverse C_MTXInverse +#define MTXTranspose C_MTXTranspose +#define MTXIdentity C_MTXIdentity +#define MTXRotRad C_MTXRotRad +#define MTXTrans C_MTXTrans #endif #ifdef __cplusplus diff --git a/src/static/dolphin/ai/ai.c b/src/static/dolphin/ai/ai.c new file mode 100644 index 00000000..b0aede97 --- /dev/null +++ b/src/static/dolphin/ai/ai.c @@ -0,0 +1,348 @@ +#include +#include +#include +#include +#include + +#include "gx/__gx.h" + +static AISCallback __AIS_Callback; +static AIDCallback __AID_Callback; +static u8* __CallbackStack; +static u8* __OldStack; +static BOOL __AI_init_flag; +static OSTime bound_32KHz; +static OSTime bound_48KHz; +static OSTime 
min_wait; +static OSTime max_wait; +static OSTime buffer; + +struct STRUCT_TIMELOG { + OSTime t_start; + OSTime t1; + OSTime t2; + OSTime t3; + OSTime t4; + OSTime t_end; +}; + +#if DEBUG +struct STRUCT_TIMELOG profile; +#endif + +static void __AI_set_stream_sample_rate(u32 rate); +static void __AIDHandler(__OSInterrupt interrupt, OSContext* context); +static void __AISHandler(__OSInterrupt interrupt, OSContext* context); +static void __AICallbackStackSwitch(void* cb); +static void __AI_SRC_INIT(void); + +/* Installs a new AI DMA callback and returns the previously installed one. */ AIDCallback AIRegisterDMACallback(AIDCallback callback) +{ + AIDCallback old_callback; + BOOL old; + + old_callback = __AID_Callback; + old = OSDisableInterrupts(); + __AID_Callback = callback; + OSRestoreInterrupts(old); + return old_callback; +} + +/* Programs the DMA start address (split across __DSPRegs[24]/[25]) and the length in 32-byte units (__DSPRegs[27]) with interrupts disabled. */ void AIInitDMA(u32 start_addr, u32 length) +{ + BOOL old; + + old = OSDisableInterrupts(); + __DSPRegs[24] = (__DSPRegs[24] & 0xFFFFFC00) | (start_addr >> 16); + __DSPRegs[25] = (__DSPRegs[25] & 0xFFFF001F) | (start_addr & 0xFFFF); + ASSERTMSGLINE(0x12E, (length & 0x1F) == 0, + "AIStartDMA: length must be multiple of 32 bytes"); + __DSPRegs[27] = (__DSPRegs[27] & 0xFFFF8000) | ((length >> 5) & 0xFFFF); + OSRestoreInterrupts(old); +} + +void AIStartDMA(void) { __DSPRegs[27] = __DSPRegs[27] | 0x8000; } + +u32 AIGetStreamSampleCount(void) { + return __AIRegs[2]; +} + +void AIResetStreamSampleCount(void) +{ + __AIRegs[0] = (__AIRegs[0] & ~0x20) | 0x20; +} + +inline void AISetStreamTrigger(u32 trigger) { __AIRegs[3] = trigger; } + +u32 AIGetStreamTrigger(void) { + return __AIRegs[3]; +} + +/* Changes the stream play state; when starting with stream sample-rate field 0 it first mutes, runs __AI_SRC_INIT, then restores the volumes. */ void AISetStreamPlayState(u32 state) +{ + BOOL old; + u8 vol_left; + u8 vol_right; + + if (state != AIGetStreamPlayState()) { + if (AIGetStreamSampleRate() == 0 && state == AI_STREAM_START) { + /* NOTE(review): vol_left is read from the Right getter and vol_right from the Left getter, but they are written back below through the same-named setters, so this path swaps the left/right stream volumes. Presumably kept to match the original binary -- confirm before changing. */ vol_left = AIGetStreamVolRight(); + vol_right = AIGetStreamVolLeft(); + AISetStreamVolRight(0); + AISetStreamVolLeft(0); + old = OSDisableInterrupts(); + __AI_SRC_INIT(); + SET_REG_FIELD(0, __AIRegs[0], 1, 5, 1); + SET_REG_FIELD(0, __AIRegs[0], 1, 
0, AI_STREAM_START); + OSRestoreInterrupts(old); + AISetStreamVolLeft(vol_left); + AISetStreamVolRight(vol_right); + return; + } + SET_REG_FIELD(0x27F, __AIRegs[0], 1, 0, state); + } +} + +u32 AIGetStreamPlayState(void) { return __AIRegs[0] & 1; } + +void AISetDSPSampleRate(u32 rate) +{ + BOOL old; + u32 play_state; + u32 afr_state; + u8 vol_left; + u8 vol_right; + + if (rate != AIGetDSPSampleRate()) { + __AIRegs[0] = (__AIRegs[0] & 0xFFFFFFBF); + if (rate == AI_SAMPLERATE_32KHZ) { + vol_left = AIGetStreamVolLeft(); + vol_right = AIGetStreamVolRight(); + play_state = AIGetStreamPlayState(); + afr_state = AIGetStreamSampleRate(); + AISetStreamVolLeft(0U); + AISetStreamVolRight(0U); + old = OSDisableInterrupts(); + __AI_SRC_INIT(); + SET_REG_FIELD(0x2D8, __AIRegs[0], 1, 5, 1); + SET_REG_FIELD(0x2D9, __AIRegs[0], 1, 1, afr_state); + SET_REG_FIELD(0x2DA, __AIRegs[0], 1, 0, play_state); + __AIRegs[0] |= 0x40; + OSRestoreInterrupts(old); + AISetStreamVolLeft(vol_left); + AISetStreamVolRight(vol_right); + } + } +} + +u32 AIGetDSPSampleRate(void) { return GET_REG_FIELD(__AIRegs[0], 1, 6) ^ 1; } + +void AISetStreamSampleRate(u32 rate) +{ + if (rate == AI_SAMPLERATE_48KHZ) { + __AI_set_stream_sample_rate(rate); + return; + } +#if DEBUG + OSReport("AISetStreamSampleRate(): OBSOLETED. 
Only 48KHz streaming from " + "disk is supported!\n"); +#endif +} + +static void __AI_set_stream_sample_rate(u32 rate) +{ + BOOL old; + u32 play_state; + u8 vol_left; + u8 vol_right; + u32 dsp_src_state; + + if (rate != AIGetStreamSampleRate()) { + play_state = AIGetStreamPlayState(); + vol_left = AIGetStreamVolLeft(); + vol_right = AIGetStreamVolRight(); + AISetStreamVolRight(0); + AISetStreamVolLeft(0); + dsp_src_state = __AIRegs[0] & 0x40; + SET_REG_FIELD(0, __AIRegs[0], 1, 6, 0); + old = OSDisableInterrupts(); + __AI_SRC_INIT(); + __AIRegs[0] |= dsp_src_state; + SET_REG_FIELD(0x368, __AIRegs[0], 1, 5, 1); + SET_REG_FIELD(0x369, __AIRegs[0], 1, 1, rate); + OSRestoreInterrupts(old); + AISetStreamPlayState(play_state); + AISetStreamVolLeft(vol_left); + AISetStreamVolRight(vol_right); + } +} + +u32 AIGetStreamSampleRate(void) { return GET_REG_FIELD(__AIRegs[0], 1, 1); } + +void AISetStreamVolLeft(u8 vol) +{ + SET_REG_FIELD(0x3A3, __AIRegs[1], 8, 0, vol); +} + +u8 AIGetStreamVolLeft(void) { return GET_REG_FIELD(__AIRegs[1], 8, 0); } + +void AISetStreamVolRight(u8 vol) +{ + SET_REG_FIELD(0x3CC, __AIRegs[1], 8, 8, vol); +} + +u8 AIGetStreamVolRight(void) { return (__AIRegs[1] & (0xFF << 8)) >> 8; } + +/* One-time AI setup, guarded by __AI_init_flag: computes the timing bounds, mutes the stream, clears trigger and sample count, selects 48KHz stream / 32KHz DSP rates, clears both callbacks, records the optional callback stack and installs the interrupt handlers. `stack` may be NULL; when given it must be 8-byte aligned. */ void AIInit(u8* stack) +{ + if (__AI_init_flag != TRUE) { + bound_32KHz = OSNanosecondsToTicks(31524); + bound_48KHz = OSNanosecondsToTicks(42024); + min_wait = OSNanosecondsToTicks(42000); + max_wait = OSNanosecondsToTicks(63000); + buffer = OSNanosecondsToTicks(3000); + AISetStreamVolRight(0); + AISetStreamVolLeft(0); + AISetStreamTrigger(0); + AIResetStreamSampleCount(); + __AI_set_stream_sample_rate(AI_SAMPLERATE_48KHZ); + AISetDSPSampleRate(AI_SAMPLERATE_32KHZ); +#if DEBUG + OSReport("AIInit(): DSP is 32KHz\n"); +#endif + __AIS_Callback = NULL; + __AID_Callback = NULL; + __CallbackStack = stack; + if (stack) { + /* The condition must be true for a valid (aligned) stack, like the other asserts in this patch; it was inverted (!= 0). */ ASSERTMSGLINE(0x444, ((u32)stack & 7) == 0, + "AIInit: stack must be 8-byte aligned"); + } + __OSSetInterruptHandler(5, __AIDHandler); + 
__OSUnmaskInterrupts(0x04000000); + __OSSetInterruptHandler(8, __AISHandler); + __OSUnmaskInterrupts(0x800000); + __AI_init_flag = TRUE; + } +} + +static void __AISHandler(__OSInterrupt interrupt, OSContext* context) +{ + OSContext exceptionContext; + + __AIRegs[0] |= 8; + OSClearContext(&exceptionContext); + OSSetCurrentContext(&exceptionContext); + if (__AIS_Callback) { + __AIS_Callback(__AIRegs[2]); + } + OSClearContext(&exceptionContext); + OSSetCurrentContext(context); +} + +static void __AIDHandler(__OSInterrupt interrupt, OSContext* context) +{ + OSContext exceptionContext; + u16 tmp; + + tmp = __DSPRegs[5]; + tmp = (tmp & ~0xA0) | 8; + __DSPRegs[5] = tmp; + OSClearContext(&exceptionContext); + OSSetCurrentContext(&exceptionContext); + if (__AID_Callback) { + if (__CallbackStack) { + __AICallbackStackSwitch(__AID_Callback); + } else { + __AID_Callback(); + } + } + OSClearContext(&exceptionContext); + OSSetCurrentContext(context); +} + +static asm void __AICallbackStackSwitch(register void* cb) +{ +#ifdef __MWERKS__ // clang-format off + nofralloc + mflr r0 + stw r0, 0x4(r1) + stwu r1, -0x18(r1) + stw r31, 0x14(r1) + mr r31, r3 + lis r5, __OldStack@ha + addi r5, r5, __OldStack@l + stw r1, 0x0(r5) + lis r5, __CallbackStack@ha + addi r5, r5, __CallbackStack@l + lwz r1, 0x0(r5) + subi r1, r1, 0x8 + mtlr r31 + blrl + lis r5, __OldStack@ha + addi r5, r5, __OldStack@l + lwz r1, 0x0(r5) + lwz r0, 0x1c(r1) + lwz r31, 0x14(r1) + addi r1, r1, 0x18 + mtlr r0 + blr +#endif // clang-format on +} + +void __AI_SRC_INIT(void) +{ + OSTime rising_32khz = 0; + OSTime rising_48khz = 0; + OSTime diff = 0; + OSTime t1 = 0; + OSTime temp; + u32 temp0; + u32 temp1; + u32 done = 0; + u32 volume = 0; + u32 Init_Cnt = 0; + u32 walking = 0; + + walking = 0; + Init_Cnt = 0; + temp = 0; + +#if DEBUG + profile.t_start = OSGetTime(); +#endif + + while (!done) { + SET_REG_FIELD(0, __AIRegs[0], 1, 5, 1); + SET_REG_FIELD(0, __AIRegs[0], 1, 1, 0); + SET_REG_FIELD(0, __AIRegs[0], 1, 0, 
AI_STREAM_START); + temp0 = __AIRegs[2]; + while (temp0 == __AIRegs[2]) { } + rising_32khz = OSGetTime(); + SET_REG_FIELD(0, __AIRegs[0], 1, 1, 1); + SET_REG_FIELD(0, __AIRegs[0], 1, 0, AI_STREAM_START); + temp1 = __AIRegs[2]; + while (temp1 == __AIRegs[2]) { } + rising_48khz = OSGetTime(); + diff = rising_48khz - rising_32khz; + SET_REG_FIELD(0, __AIRegs[0], 1, 1, 0); + SET_REG_FIELD(0, __AIRegs[0], 1, 0, AI_STREAM_STOP); + if (diff < bound_32KHz - buffer) { + temp = min_wait; + done = 1; + Init_Cnt++; + } else if (diff >= bound_32KHz + buffer + && diff < bound_48KHz - buffer) { + temp = max_wait; + done = 1; + Init_Cnt++; + } else { + done = 0; + walking = 1; + Init_Cnt++; + } + } + while (rising_48khz + temp > OSGetTime()) { } +#if DEBUG + profile.t_end = OSGetTime(); +#endif +} diff --git a/src/static/dolphin/ar/__ar.h b/src/static/dolphin/ar/__ar.h new file mode 100644 index 00000000..f7dd6a18 --- /dev/null +++ b/src/static/dolphin/ar/__ar.h @@ -0,0 +1,13 @@ +#ifndef _DOLPHIN_AR_INTERNAL_H_ +#define _DOLPHIN_AR_INTERNAL_H_ + +#include + +void __ARQPopTaskQueueHi(void); +void __ARQServiceQueueLo(void); +void __ARQCallbackHack(u32 pointerToARQRequest); +void __ARQInterruptServiceRoutine(void); +void __ARQInitTempQueue(void); +void __ARQPushTempQueue(struct ARQRequest* task); + +#endif // _DOLPHIN_AR_INTERNAL_H_ diff --git a/src/static/dolphin/ar/ar.c b/src/static/dolphin/ar/ar.c new file mode 100644 index 00000000..ecafff65 --- /dev/null +++ b/src/static/dolphin/ar/ar.c @@ -0,0 +1,314 @@ +#include +// #include "fake_tgmath.h" +#include +#include +#include +#include + +#include "ar/__ar.h" + +static void (*__AR_Callback)(); +static u32 __AR_Size; +static u32 __AR_InternalSize; +static u32 __AR_ExpansionSize; +static u32 __AR_StackPointer; +static u32 __AR_FreeBlocks; +static u32* __AR_BlockLength; +static int __AR_init_flag; + +// functions +static void __ARHandler(__OSInterrupt exception, struct OSContext* context); +static void __ARWaitForDMA(void); +static 
void __ARWriteDMA(u32 mmem_addr, u32 aram_addr, u32 length); +static void __ARReadDMA(u32 mmem_addr, u32 aram_addr, u32 length); +static void __ARChecksize(void); + +ARQCallback ARRegisterDMACallback(ARQCallback callback) +{ + ARQCallback old_callback; + int old; + + old_callback = __AR_Callback; + old = OSDisableInterrupts(); + __AR_Callback = callback; + OSRestoreInterrupts(old); + return old_callback; +} + +void ARStartDMA(u32 type, u32 mainmem_addr, u32 aram_addr, u32 length) +{ + int old; + + old = OSDisableInterrupts(); + + __DSPRegs[DSP_ARAM_DMA_MM_HI] + = (__DSPRegs[DSP_ARAM_DMA_MM_HI] & 0xFFFFFC00 | (mainmem_addr >> 0x10)); + __DSPRegs[DSP_ARAM_DMA_MM_LO] + = (__DSPRegs[DSP_ARAM_DMA_MM_LO] & 0xFFFF001F | ((u16)mainmem_addr)); + __DSPRegs[DSP_ARAM_DMA_ARAM_HI] + = (__DSPRegs[DSP_ARAM_DMA_ARAM_HI] & 0xFFFFFC00 | (aram_addr >> 0x10)); + __DSPRegs[DSP_ARAM_DMA_ARAM_LO] + = (__DSPRegs[DSP_ARAM_DMA_ARAM_LO] & 0xFFFF001F | ((u16)aram_addr)); + __DSPRegs[DSP_ARAM_DMA_SIZE_HI] + = __DSPRegs[DSP_ARAM_DMA_SIZE_HI] & ~0x8000 | ((type << 0xF) & ~0x7FFF); + __DSPRegs[DSP_ARAM_DMA_SIZE_HI] + = (__DSPRegs[DSP_ARAM_DMA_SIZE_HI] & 0xFFFFFC00) | (length >> 0x10); + __DSPRegs[DSP_ARAM_DMA_SIZE_LO] + = (__DSPRegs[DSP_ARAM_DMA_SIZE_LO] & 0xFFFF001F) + | (length & 0x0000FFFF); + OSRestoreInterrupts(old); +} + +u32 ARAlloc(u32 length) +{ + u32 tmp; + int old; + + old = OSDisableInterrupts(); + ASSERTMSGLINE(0x17E, !(length & 0x1F), + "ARAlloc(): length is not multiple of 32bytes!"); + ASSERTMSGLINE(0x182, length <= (__AR_Size - __AR_StackPointer), + "ARAlloc(): Out of ARAM!"); + ASSERTMSGLINE(0x183, __AR_FreeBlocks, "ARAlloc(): No more free blocks!"); + tmp = __AR_StackPointer; + __AR_StackPointer += length; + *__AR_BlockLength = length; + __AR_BlockLength += 1; + __AR_FreeBlocks -= 1; + OSRestoreInterrupts(old); + return tmp; +} + +u32 ARInit(u32* stack_index_addr, u32 num_entries) +{ + BOOL enabled; + + if (__AR_init_flag == 1) { + return 0x4000; + } + enabled = 
OSDisableInterrupts(); + __AR_Callback = NULL; + __OSSetInterruptHandler(6, __ARHandler); + __OSUnmaskInterrupts(0x02000000); + __AR_StackPointer = 0x4000; + __AR_FreeBlocks = num_entries; + __AR_BlockLength = stack_index_addr; + + // WHY? + __DSPRegs[DSP_ARAM_REFRESH] = __DSPRegs[DSP_ARAM_REFRESH] & 0xff + | __DSPRegs[DSP_ARAM_REFRESH] & ~0xff; + __ARChecksize(); + __AR_init_flag = 1; + OSRestoreInterrupts(enabled); + + return __AR_StackPointer; +} + +u32 ARGetBaseAddress(void) { return 0x4000; } + +u32 ARGetSize(void) { return __AR_Size; } + +static void __ARHandler(__OSInterrupt exception, struct OSContext* context) +{ + struct OSContext exceptionContext; + u16 tmp; + + tmp = __DSPRegs[DSP_CONTROL_STATUS]; + tmp = (tmp & ~0x88) | 0x20; + __DSPRegs[DSP_CONTROL_STATUS] = (tmp); + OSClearContext(&exceptionContext); + OSSetCurrentContext(&exceptionContext); + if (__AR_Callback) { + __AR_Callback(); + } + OSClearContext(&exceptionContext); + OSSetCurrentContext(context); +} + +static void __ARClearInterrupt(void) +{ + u16 tmp; + tmp = __DSPRegs[DSP_CONTROL_STATUS]; + tmp = tmp & ~0x88 | 0x20; + __DSPRegs[DSP_CONTROL_STATUS] = tmp; +} + +static void __ARWaitForDMA(void) +{ + while (__DSPRegs[DSP_CONTROL_STATUS] & 0x200) + ; +} + +static void __ARWriteDMA(u32 mmem_addr, u32 aram_addr, u32 length) +{ + // Main mem address + __DSPRegs[DSP_ARAM_DMA_MM_HI] + = (u16)((__DSPRegs[DSP_ARAM_DMA_MM_HI] & ~0x03ff) + | (u16)(mmem_addr >> 16)); + __DSPRegs[DSP_ARAM_DMA_MM_LO] + = (u16)((__DSPRegs[DSP_ARAM_DMA_MM_LO] & ~0xffe0) + | (u16)(mmem_addr & 0xffff)); + + // ARAM address + __DSPRegs[DSP_ARAM_DMA_ARAM_HI] + = (u16)((__DSPRegs[DSP_ARAM_DMA_ARAM_HI] & ~0x03ff) + | (u16)(aram_addr >> 16)); + __DSPRegs[DSP_ARAM_DMA_ARAM_LO] + = (u16)((__DSPRegs[DSP_ARAM_DMA_ARAM_LO] & ~0xffe0) + | (u16)(aram_addr & 0xffff)); + + // DMA buffer size + __DSPRegs[DSP_ARAM_DMA_SIZE_HI] + = (u16)(__DSPRegs[DSP_ARAM_DMA_SIZE_HI] & ~0x8000); + + __DSPRegs[DSP_ARAM_DMA_SIZE_HI] + = 
(u16)((__DSPRegs[DSP_ARAM_DMA_SIZE_HI] & ~0x03ff) + | (u16)(length >> 16)); + __DSPRegs[DSP_ARAM_DMA_SIZE_LO] + = (u16)((__DSPRegs[DSP_ARAM_DMA_SIZE_LO] & ~0xffe0) + | (u16)(length & 0xffff)); + + __ARWaitForDMA(); + __ARClearInterrupt(); +} + +static void __ARReadDMA(u32 mmem_addr, u32 aram_addr, u32 length) +{ + // Main mem address + __DSPRegs[DSP_ARAM_DMA_MM_HI] + = (u16)((__DSPRegs[DSP_ARAM_DMA_MM_HI] & ~0x03ff) + | (u16)(mmem_addr >> 16)); + __DSPRegs[DSP_ARAM_DMA_MM_LO] + = (u16)((__DSPRegs[DSP_ARAM_DMA_MM_LO] & ~0xffe0) + | (u16)(mmem_addr & 0xffff)); + + // ARAM address + __DSPRegs[DSP_ARAM_DMA_ARAM_HI] + = (u16)((__DSPRegs[DSP_ARAM_DMA_ARAM_HI] & ~0x03ff) + | (u16)(aram_addr >> 16)); + __DSPRegs[DSP_ARAM_DMA_ARAM_LO] + = (u16)((__DSPRegs[DSP_ARAM_DMA_ARAM_LO] & ~0xffe0) + | (u16)(aram_addr & 0xffff)); + + // DMA buffer size + __DSPRegs[DSP_ARAM_DMA_SIZE_HI] + = (u16)(__DSPRegs[DSP_ARAM_DMA_SIZE_HI] | 0x8000); + + __DSPRegs[DSP_ARAM_DMA_SIZE_HI] + = (u16)((__DSPRegs[DSP_ARAM_DMA_SIZE_HI] & ~0x03ff) + | (u16)(length >> 16)); + __DSPRegs[DSP_ARAM_DMA_SIZE_LO] + = (u16)((__DSPRegs[DSP_ARAM_DMA_SIZE_LO] & ~0xffe0) + | (u16)(length & 0xffff)); + + __ARWaitForDMA(); + __ARClearInterrupt(); +} + +static void __ARChecksize(void) +{ + u8 test_data_pad[63]; + u8 dummy_data_pad[63]; + u8 buffer_pad[63]; + u32* test_data; + u32* dummy_data; + u32* buffer; + u16 ARAM_mode; + u32 ARAM_size; + u32 i; + + do { + } while (!(__DSPRegs[DSP_ARAM_MODE] & 1)); + + ARAM_mode = 3; + ARAM_size = __AR_InternalSize = 0x1000000; + + __DSPRegs[DSP_ARAM_SIZE] + = ((__DSPRegs[DSP_ARAM_SIZE] & 0xFFFFFFC0) | ARAM_mode) | 0x20; + + test_data = (void*)ALIGN_NEXT((u32)test_data_pad, 0x20); + dummy_data = (void*)ALIGN_NEXT((u32)dummy_data_pad, 0x20); + buffer = (void*)ALIGN_NEXT((u32)buffer_pad, 0x20); + for (i = 0; i < 8; i++) { + test_data[i] = 0xDEADBEEF; + dummy_data[i] = 0xBAD0BAD0; + } + + DCFlushRange(test_data, 0x20); + DCFlushRange(dummy_data, 0x20); + + __AR_ExpansionSize = 0; + + 
__ARWriteDMA((u32)dummy_data, ARAM_size + 0x0, 0x20U); + __ARWriteDMA((u32)dummy_data, ARAM_size + 0x200000, 0x20U); + __ARWriteDMA((u32)dummy_data, ARAM_size + 0x1000000, 0x20U); + __ARWriteDMA((u32)dummy_data, ARAM_size + 0x200, 0x20U); + __ARWriteDMA((u32)dummy_data, ARAM_size + 0x400000, 0x20U); + + memset(buffer, 0, 0x20); + DCFlushRange(buffer, 0x20); + + __ARWriteDMA((u32)test_data, ARAM_size, 0x20U); + DCInvalidateRange(buffer, 0x20); + + __ARReadDMA((u32)buffer, ARAM_size, 0x20U); + PPCSync(); + + if (*buffer == *test_data) { + memset(buffer, 0, 0x20); + DCFlushRange(buffer, 0x20); + + __ARReadDMA((u32)buffer, ARAM_size + 0x200000, 0x20U); + PPCSync(); + + if (*buffer == *test_data) { + ARAM_size += 0x200000; + __AR_ExpansionSize = 0x200000; + } else { + memset(buffer, 0, 0x20); + DCFlushRange(buffer, 0x20); + + __ARReadDMA((u32)buffer, ARAM_size + 0x01000000, 0x20U); + PPCSync(); + + if (*buffer == *test_data) { + ARAM_mode |= 8; + ARAM_size += 0x400000; + __AR_ExpansionSize = 0x400000; + } else { + memset(buffer, 0, 0x20); + DCFlushRange(buffer, 0x20); + + __ARReadDMA((u32)buffer, ARAM_size + 0x200, 0x20U); + PPCSync(); + + if (*buffer == *test_data) { + ARAM_mode |= 0x10; + ARAM_size += 0x800000; + __AR_ExpansionSize = 0x800000; + } else { + memset(buffer, 0, 0x20); + DCFlushRange(buffer, 0x20); + + __ARReadDMA((u32)buffer, ARAM_size + 0x400000, 0x20U); + PPCSync(); + + if (*buffer == *test_data) { + ARAM_mode |= 0x18; + ARAM_size += 0x01000000; + __AR_ExpansionSize = 0x1000000; + } else { + ARAM_mode |= 0x20; + ARAM_size += 0x02000000; + __AR_ExpansionSize = 0x2000000; + } + } + } + } + + __DSPRegs[DSP_ARAM_SIZE] + = ((u16)(__DSPRegs[DSP_ARAM_SIZE] & 0xFFFFFFC0) | ARAM_mode); + } + *(u32*)OSPhysicalToUncached(0xD0) = ARAM_size; + __AR_Size = ARAM_size; +} diff --git a/src/static/dolphin/ar/arq.c b/src/static/dolphin/ar/arq.c new file mode 100644 index 00000000..6c521b29 --- /dev/null +++ b/src/static/dolphin/ar/arq.c @@ -0,0 +1,150 @@ +#include 
+#include +#include + +#include "ar/__ar.h" + +static struct ARQRequest* __ARQRequestQueueHi; +static struct ARQRequest* __ARQRequestTailHi; +static struct ARQRequest* __ARQRequestQueueLo; +static struct ARQRequest* __ARQRequestTailLo; +static struct ARQRequest* __ARQRequestPendingHi; +static struct ARQRequest* __ARQRequestPendingLo; +static ARQCallback __ARQCallbackHi; +static ARQCallback __ARQCallbackLo; +static u32 __ARQChunkSize; +static int __ARQ_init_flag; + +inline void __ARQPopTaskQueueHi(void) +{ + if (__ARQRequestQueueHi) { + if (__ARQRequestQueueHi->type == 0) { + ARStartDMA(__ARQRequestQueueHi->type, __ARQRequestQueueHi->source, + __ARQRequestQueueHi->dest, __ARQRequestQueueHi->length); + } else { + ARStartDMA(__ARQRequestQueueHi->type, __ARQRequestQueueHi->dest, + __ARQRequestQueueHi->source, + __ARQRequestQueueHi->length); + } + __ARQCallbackHi = __ARQRequestQueueHi->callback; + __ARQRequestPendingHi = __ARQRequestQueueHi; + __ARQRequestQueueHi = __ARQRequestQueueHi->next; + } +} + +void __ARQServiceQueueLo(void) +{ + if (__ARQRequestPendingLo == 0 && __ARQRequestQueueLo) { + __ARQRequestPendingLo = __ARQRequestQueueLo; + __ARQRequestQueueLo = __ARQRequestQueueLo->next; + } + if (__ARQRequestPendingLo) { + if (__ARQRequestPendingLo->length <= __ARQChunkSize) { + if (__ARQRequestPendingLo->type == 0) { + ARStartDMA( + __ARQRequestPendingLo->type, __ARQRequestPendingLo->source, + __ARQRequestPendingLo->dest, __ARQRequestPendingLo->length); + } else { + ARStartDMA(__ARQRequestPendingLo->type, + __ARQRequestPendingLo->dest, + __ARQRequestPendingLo->source, + __ARQRequestPendingLo->length); + } + __ARQCallbackLo = __ARQRequestPendingLo->callback; + } else if (__ARQRequestPendingLo->type == 0) { + ARStartDMA(__ARQRequestPendingLo->type, + __ARQRequestPendingLo->source, + __ARQRequestPendingLo->dest, __ARQChunkSize); + } else { + ARStartDMA(__ARQRequestPendingLo->type, __ARQRequestPendingLo->dest, + __ARQRequestPendingLo->source, __ARQChunkSize); + } + 
__ARQRequestPendingLo->length -= __ARQChunkSize; + __ARQRequestPendingLo->source += __ARQChunkSize; + __ARQRequestPendingLo->dest += __ARQChunkSize; + } +} + +void __ARQCallbackHack(u32 unused) { } + +void __ARQInterruptServiceRoutine() +{ + if (__ARQCallbackHi) { + __ARQCallbackHi((u32)__ARQRequestPendingHi); + __ARQRequestPendingHi = NULL; + __ARQCallbackHi = NULL; + } else if (__ARQCallbackLo) { + __ARQCallbackLo((u32)__ARQRequestPendingLo); + __ARQRequestPendingLo = NULL; + __ARQCallbackLo = NULL; + } + __ARQPopTaskQueueHi(); + if (__ARQRequestPendingHi == 0) { + __ARQServiceQueueLo(); + } +} + +void ARQInit(void) +{ + if (__ARQ_init_flag != 1) { + __ARQRequestQueueHi = __ARQRequestQueueLo = NULL; + __ARQChunkSize = 0x1000; + ARRegisterDMACallback(__ARQInterruptServiceRoutine); + __ARQRequestPendingHi = NULL; + __ARQRequestPendingLo = NULL; + __ARQCallbackHi = NULL; + __ARQCallbackLo = NULL; + __ARQ_init_flag = 1; + } +} + +void ARQPostRequest(struct ARQRequest* request, u32 owner, u32 type, + u32 priority, u32 source, u32 dest, u32 length, + ARQCallback callback) +{ + int level; + + ASSERTLINE(0x1A9, request); + ASSERTLINE(0x1AA, (type == ARQ_TYPE_MRAM_TO_ARAM) + || (type == ARQ_TYPE_ARAM_TO_MRAM)); + ASSERTLINE(0x1AB, (priority == ARQ_PRIORITY_LOW) + || (priority == ARQ_PRIORITY_HIGH)); + ASSERTLINE(0x1AE, (length % ARQ_DMA_ALIGNMENT) == 0); + request->next = NULL; + request->owner = owner; + request->type = type; + request->source = source; + request->dest = dest; + request->length = length; + if (callback) { + request->callback = callback; + } else { + request->callback = __ARQCallbackHack; + } + level = OSDisableInterrupts(); + switch (priority) { + case ARQ_PRIORITY_LOW: + if (__ARQRequestQueueLo) { + __ARQRequestTailLo->next = request; + } else { + __ARQRequestQueueLo = request; + } + __ARQRequestTailLo = request; + break; + case ARQ_PRIORITY_HIGH: + if (__ARQRequestQueueHi) { + __ARQRequestTailHi->next = request; + } else { + __ARQRequestQueueHi = 
request; + } + __ARQRequestTailHi = request; + break; + } + if ((__ARQRequestPendingHi == 0) && (__ARQRequestPendingLo == 0)) { + __ARQPopTaskQueueHi(); + if (__ARQRequestPendingHi == 0) { + __ARQServiceQueueLo(); + } + } + OSRestoreInterrupts(level); +} diff --git a/src/static/dolphin/mtx/mtx.c b/src/static/dolphin/mtx/mtx.c new file mode 100644 index 00000000..6e900dcb --- /dev/null +++ b/src/static/dolphin/mtx/mtx.c @@ -0,0 +1,563 @@ +#include +#include +#include + +#define qr0 0 + +// unsorted externs +extern f32 sinf(f32); +extern f32 cosf(f32); +extern f32 tanf(f32); + +// {0, 1} pair loaded by PSMTXConcat via psq_l; initialized data, so presumably .sdata rather than .sbss -- TODO confirm against the map +static float Unit01[2] = { 0.0f, 1.0f }; + +// MEME: if this function is not here, 0.0f and 1.0f have wrong order in .sdata2. NOTE(review): only the 3x3 part is written; column 3 (mtx[i][3]) is left untouched, yet mtx.h maps MTXIdentity to this function when MTX_PS is undefined -- confirm no caller relies on it +void C_MTXIdentity(GC_Mtx mtx) +{ + mtx[0][0] = 1.0f; + mtx[0][1] = 0.0f; + mtx[0][2] = 0.0f; + mtx[1][0] = 0.0f; + mtx[1][1] = 1.0f; + mtx[1][2] = 0.0f; + mtx[2][0] = 0.0f; + mtx[2][1] = 0.0f; + mtx[2][2] = 1.0f; +} + +void PSMTXIdentity(register GC_Mtx m) +{ + register f32 c_zero = 0.0f; + register f32 c_one = 1.0f; + register f32 c_01; + register f32 c_10; + +#ifdef __MWERKS__ // clang-format off + asm { + psq_st c_zero, 8(m), 0, qr0 + ps_merge01 c_01, c_zero, c_one + psq_st c_zero, 24(m), 0, qr0 + ps_merge10 c_10, c_one, c_zero + psq_st c_zero, 32(m), 0, qr0 + psq_st c_01, 16(m), 0, qr0 + psq_st c_10, 0(m), 0, qr0 + psq_st c_10, 40(m), 0, qr0 + } +#endif // clang-format on +} + +asm void PSMTXCopy(const register GC_Mtx src, register GC_Mtx dst) +{ +#ifdef __MWERKS__ // clang-format off + psq_l f0, 0(src), 0, qr0 + psq_st f0, 0(dst), 0, qr0 + psq_l f1, 8(src), 0, qr0 + psq_st f1, 8(dst), 0, qr0 + psq_l f2, 16(src), 0, qr0 + psq_st f2, 16(dst), 0, qr0 + psq_l f3, 24(src), 0, qr0 + psq_st f3, 24(dst), 0, qr0 + psq_l f4, 32(src), 0, qr0 + psq_st f4, 32(dst), 0, qr0 + psq_l f5, 40(src), 0, qr0 + psq_st f5, 40(dst), 0, qr0 +#endif // clang-format on +} + +asm void PSMTXConcat(const register GC_Mtx mA, const register GC_Mtx mB, register GC_Mtx mAB) +{ +#ifdef 
__MWERKS__ // clang-format off + nofralloc + stwu r1, -64(r1) + psq_l f0, 0(mA), 0, qr0 + stfd f14, 8(r1) + psq_l f6, 0(mB), 0, qr0 + lis r6, Unit01@ha + psq_l f7, 8(mB), 0, qr0 + stfd f15, 16(r1) + addi r6, r6, Unit01@l + stfd f31, 40(r1) + psq_l f8, 16(mB), 0, qr0 + ps_muls0 f12, f6, f0 + psq_l f2, 16(mA), 0, qr0 + ps_muls0 f13, f7, f0 + psq_l f31, 0(r6), 0, qr0 + ps_muls0 f14, f6, f2 + psq_l f9, 24(mB), 0, qr0 + ps_muls0 f15, f7, f2 + psq_l f1, 8(mA), 0, qr0 + ps_madds1 f12, f8, f0, f12 + psq_l f3, 24(mA), 0, qr0 + ps_madds1 f14, f8, f2, f14 + psq_l f10, 32(mB), 0, qr0 + ps_madds1 f13, f9, f0, f13 + psq_l f11, 40(mB), 0, qr0 + ps_madds1 f15, f9, f2, f15 + psq_l f4, 32(mA), 0, qr0 + psq_l f5, 40(mA), 0, qr0 + ps_madds0 f12, f10, f1, f12 + ps_madds0 f13, f11, f1, f13 + ps_madds0 f14, f10, f3, f14 + ps_madds0 f15, f11, f3, f15 + psq_st f12, 0(mAB), 0, qr0 + ps_muls0 f2, f6, f4 + ps_madds1 f13, f31, f1, f13 + ps_muls0 f0, f7, f4 + psq_st f14, 16(mAB), 0, qr0 + ps_madds1 f15, f31, f3, f15 + psq_st f13, 8(mAB), 0, qr0 + ps_madds1 f2, f8, f4, f2 + ps_madds1 f0, f9, f4, f0 + ps_madds0 f2, f10, f5, f2 + lfd f14, 8(r1) + psq_st f15, 24(mAB), 0, qr0 + ps_madds0 f0, f11, f5, f0 + psq_st f2, 32(mAB), 0, qr0 + ps_madds1 f0, f31, f5, f0 + lfd f15, 16(r1) + psq_st f0, 40(mAB), 0, qr0 + lfd f31, 40(r1) + addi r1, r1, 64 + blr +#endif // clang-format on +} + +asm u32 PSMTXInverse(const register GC_Mtx src, register GC_Mtx inv) +{ +#ifdef __MWERKS__ // clang-format off + psq_l f0, 0(src), 1, qr0 + psq_l f1, 4(src), 0, qr0 + psq_l f2, 16(src), 1, qr0 + ps_merge10 f6, f1, f0 + psq_l f3, 20(src), 0, qr0 + psq_l f4, 32(src), 1, qr0 + ps_merge10 f7, f3, f2 + psq_l f5, 36(src), 0, qr0 + ps_mul f11, f3, f6 + ps_mul f13, f5, f7 + ps_merge10 f8, f5, f4 + ps_msub f11, f1, f7, f11 + ps_mul f12, f1, f8 + ps_msub f13, f3, f8, f13 + ps_mul f10, f3, f4 + ps_msub f12, f5, f6, f12 + ps_mul f9, f0, f5 + ps_mul f8, f1, f2 + ps_sub f6, f6, f6 + ps_msub f10, f2, f5, f10 + ps_mul f7, f0, f13 + ps_msub 
f9, f1, f4, f9 + ps_madd f7, f2, f12, f7 + ps_msub f8, f0, f3, f8 + ps_madd f7, f4, f11, f7 + ps_cmpo0 cr0, f7, f6 + bne skip_return + li r3, 0 + blr +skip_return: + fres f0, f7 + ps_add f6, f0, f0 + ps_mul f5, f0, f0 + ps_nmsub f0, f7, f5, f6 + lfs f1, 12(src) + ps_muls0 f13, f13, f0 + lfs f2, 28(src) + ps_muls0 f12, f12, f0 + lfs f3, 44(src) + ps_muls0 f11, f11, f0 + ps_merge00 f5, f13, f12 + ps_muls0 f10, f10, f0 + ps_merge11 f4, f13, f12 + ps_muls0 f9, f9, f0 + psq_st f5, 0(inv), 0, qr0 + ps_mul f6, f13, f1 + psq_st f4, 16(inv), 0, qr0 + ps_muls0 f8, f8, f0 + ps_madd f6, f12, f2, f6 + psq_st f10, 32(inv), 1, qr0 + ps_nmadd f6, f11, f3, f6 + psq_st f9, 36(inv), 1, qr0 + ps_mul f7, f10, f1 + ps_merge00 f5, f11, f6 + psq_st f8, 40(inv), 1, qr0 + ps_merge11 f4, f11, f6 + psq_st f5, 8(inv), 0, qr0 + ps_madd f7, f9, f2, f7 + psq_st f4, 24(inv), 0, qr0 + ps_nmadd f7, f8, f3, f7 + li r3, 1 + psq_st f7, 44(inv), 1, qr0 +#endif // clang-format on +} + +void PSMTXRotRad(GC_Mtx m, char axis, f32 rad) +{ + f32 s = sinf(rad); + f32 c = cosf(rad); + + PSMTXRotTrig(m, axis, s, c); +} + +void PSMTXRotTrig(register GC_Mtx m, register char axis, register f32 sinA, + register f32 cosA) +{ + register f32 fc0; + register f32 fc1; + register f32 nsinA; + register f32 fw0, fw1, fw2, fw3; + + fc0 = 0.0f; + fc1 = 1.0f; + +#ifdef __MWERKS__ // clang-format off + asm { + ori axis, axis, 0x20 + ps_neg nsinA, sinA + cmplwi axis, 'x' + beq _case_x + cmplwi axis, 'y' + beq _case_y + cmplwi axis, 'z' + beq _case_z + b _end + +_case_x: + psq_st fc1, 0(m), 1, qr0 + psq_st fc0, 4(m), 0, qr0 + ps_merge00 fw0, sinA, cosA + psq_st fc0, 12(m), 0, qr0 + ps_merge00 fw1, cosA, nsinA + psq_st fc0, 28(m), 0, qr0 + psq_st fc0, 44(m), 1, qr0 + psq_st fw0, 36(m), 0, qr0 + psq_st fw1, 20(m), 0, qr0 + b _end; + +_case_y: + ps_merge00 fw0, cosA, fc0 + ps_merge00 fw1, fc0, fc1 + psq_st fc0, 24(m), 0, qr0 + psq_st fw0, 0(m), 0, qr0 + ps_merge00 fw2, nsinA, fc0 + ps_merge00 fw3, sinA, fc0 + psq_st fw0, 40(m), 0, 
qr0
+        psq_st fw1, 16(m), 0, qr0  // (0, 1) -> offsets 16, 20
+        psq_st fw3, 8(m), 0, qr0  // (sinA, 0) -> offsets 8, 12
+        psq_st fw2, 32(m), 0, qr0  // (-sinA, 0) -> offsets 32, 36
+        b _end;
+
+_case_z:
+        psq_st fc0, 8(m), 0, qr0  // fc0 pair (presumably 0, 0) -> offsets 8, 12
+        ps_merge00 fw0, sinA, cosA  // (sinA, cosA)
+        ps_merge00 fw2, cosA, nsinA  // (cosA, -sinA)
+        psq_st fc0, 24(m), 0, qr0
+        psq_st fc0, 32(m), 0, qr0
+        ps_merge00 fw1, fc1, fc0  // (1, 0)
+        psq_st fw0, 16(m), 0, qr0  // (sinA, cosA) -> offsets 16, 20
+        psq_st fw2, 0(m), 0, qr0  // (cosA, -sinA) -> offsets 0, 4
+        psq_st fw1, 40(m), 0, qr0  // (1, 0) -> offsets 40, 44
+
+_end:
+    }
+#endif // clang-format on
+}
+
+static inline void __PSMTXRotAxisRadInternal(register GC_Mtx m,
+                                             const register Vec* axis,
+                                             register f32 sT, register f32 cT)
+{ // Empty body; not called by any code visible in this section.
+}
+
+void PSMTXRotAxisRad(register GC_Mtx m, const Vec* axis, register f32 rad)
+{ // Fills `m` for a rotation of `rad` radians about `axis`; the axis is normalized into a local copy first.
+    register f32 tmp0, tmp1, tmp2, tmp3, tmp4;
+    register f32 tmp5, tmp6, tmp7, tmp8, tmp9;
+
+    register f32 sT;
+    register f32 cT;
+    register f32 oneMinusCosT;
+    register f32 zero;
+    Vec axisNormalized;
+    register Vec* axisNormalizedPtr;
+
+    zero = 0.0f;
+    axisNormalizedPtr = &axisNormalized;
+    sT = sinf(rad);
+    cT = cosf(rad);
+    oneMinusCosT = 1.0f - cT;
+
+    PSVECNormalize(axis, axisNormalizedPtr);
+
+    // NOTE: `rad` is reused below as a scratch register; the asm first loads the normalized-axis pair into it.
+    // This might originally have been an inline helper, as in prime.
+#ifdef __MWERKS__ // clang-format off
+    asm {
+        psq_l rad, 0x0(axisNormalizedPtr), 0, qr0  // rad := float pair at offset 0 of the normalized axis (the angle was already consumed by sinf/cosf)
+        lfs tmp1, 0x8(axisNormalizedPtr)  // float at offset 8 of the normalized axis
+        ps_merge00 tmp0, cT, cT  // (cT, cT)
+        ps_muls0 tmp4, rad, oneMinusCosT  // axis pair * (1 - cT)
+        ps_muls0 tmp5, tmp1, oneMinusCosT
+        ps_muls1 tmp3, tmp4, rad
+        ps_muls0 tmp2, tmp4, rad
+        ps_muls0 rad, rad, sT  // axis pair * sT
+        ps_muls0 tmp4, tmp4, tmp1
+        fnmsubs tmp6, tmp1, sT, tmp3
+        fmadds tmp7, tmp1, sT, tmp3
+        ps_neg tmp9, rad
+        ps_sum0 tmp8, tmp4, zero, rad
+        ps_sum0 tmp2, tmp2, tmp6, tmp0
+        ps_sum1 tmp3, tmp0, tmp7, tmp3
+        ps_sum0 tmp6, tmp9, zero, tmp4
+        ps_sum0 tmp9, tmp4, tmp4, tmp9
+        psq_st tmp8, 0x8(m), 0, qr0  // the six pair stores in this block cover byte offsets 0x00-0x2F of m
+        ps_muls0 tmp5, tmp5, tmp1
+        psq_st tmp2, 0x0(m), 0, qr0
+        ps_sum1 tmp4, rad, tmp9, tmp4
+        psq_st tmp3, 0x10(m), 0, qr0
+        ps_sum0 tmp5, tmp5, zero, tmp0
+        psq_st tmp6, 0x18(m), 0, qr0
+        psq_st tmp4, 0x20(m), 0, qr0
+        psq_st tmp5, 0x28(m), 0, qr0
+    }
+#endif // clang-format on
+}
+
+void PSMTXTrans(register GC_Mtx m, register f32 xT, register f32 yT,
+                register f32 zT)
+{ // Writes all twelve floats of `m`: 1.0 at offsets 0/20/40, (xT, yT, zT) at offsets 12/28/44, c0 everywhere else.
+    register f32 c0 = 0.0F;
+    register f32 c1 = 1.0F;
+#ifdef __MWERKS__ // clang-format off
+    asm {
+        stfs xT, 12(m)
+        stfs yT, 28(m)
+        psq_st c0, 4(m), 0, qr0  // c0 pair (presumably 0, 0) -> offsets 4, 8
+        psq_st c0, 32(m), 0, qr0  // c0 pair -> offsets 32, 36
+        stfs c0, 16(m)
+        stfs c1, 20(m)
+        stfs c0, 24(m)
+        stfs c1, 40(m)
+        stfs zT, 44(m)
+        stfs c1, 0(m)
+    }
+#endif // clang-format on
+}
+
+asm void PSMTXTransApply(const register GC_Mtx src, register GC_Mtx dst, register f32 xT,
+                         register f32 yT, register f32 zT)
+{ // Copies src to dst with xT/yT/zT added to the floats at offsets 12/28/44; all other floats are stored unchanged.
+#ifdef __MWERKS__ // clang-format off
+    nofralloc
+    psq_l fp4, 0(src), 0, qr0
+    psq_l fp5, 8(src), 0, qr0
+    psq_l fp7, 24(src), 0, qr0
+    psq_l fp8, 40(src), 0, qr0
+    ps_sum1 fp5, xT, fp5, fp5  // slot 0 kept, slot 1 (offset 12) += xT
+    psq_l fp6, 16(src), 0, qr0
+    ps_sum1 fp7, yT, fp7, fp7  // slot 1 (offset 28) += yT
+    psq_l fp9, 32(src), 0, qr0
+    ps_sum1 fp8, zT, fp8, fp8  // slot 1 (offset 44) += zT
+
+    psq_st fp4, 0(dst), 0, qr0
+    psq_st fp5, 8(dst), 0, qr0
+    psq_st fp6, 16(dst), 0, qr0
+    psq_st fp7, 24(dst), 0, qr0
+    psq_st fp9, 32(dst), 0, qr0
+    psq_st fp8, 40(dst), 0, qr0
+    blr  // explicit return: nofralloc suppresses the compiler-generated prologue/epilogue
+#endif // clang-format on
+}
+
+void PSMTXScale(register GC_Mtx m,
register f32 xS, register f32 yS,
+                register f32 zS)
+{ // Writes all twelve floats of `m`: xS/yS/zS at offsets 0/20/40 and c0 everywhere else.
+    register f32 c0 = 0.0F;
+#ifdef __MWERKS__ // clang-format off
+    asm {
+        stfs xS, 0(m)
+        psq_st c0, 4(m), 0, 0  // c0 pair (presumably 0, 0) -> offsets 4, 8
+        psq_st c0, 12(m), 0, 0  // offsets 12, 16
+        stfs yS, 20(m)
+        psq_st c0, 24(m), 0, 0  // offsets 24, 28
+        psq_st c0, 32(m), 0, 0  // offsets 32, 36
+        stfs zS, 40(m)
+        stfs c0, 44(m)
+    }
+#endif // clang-format on
+}
+
+asm void PSMTXScaleApply(const register GC_Mtx src, register GC_Mtx dst, register f32 xS,
+                         register f32 yS, register f32 zS)
+{ // dst = src with the floats at byte offsets 0-15 multiplied by xS, 16-31 by yS and 32-47 by zS.
+#ifdef __MWERKS__ // clang-format off
+    nofralloc
+    psq_l fp4, 0(src), 0, 0
+    psq_l fp5, 8(src), 0, 0
+    ps_muls0 fp4, fp4, xS  // both slots * xS
+    psq_l fp6, 16(src), 0, 0
+    ps_muls0 fp5, fp5, xS
+    psq_l fp7, 24(src), 0, 0
+    ps_muls0 fp6, fp6, yS
+    psq_l fp8, 32(src), 0, 0
+    psq_st fp4, 0(dst), 0, 0
+    ps_muls0 fp7, fp7, yS
+    psq_l fp2, 40(src), 0, 0
+    psq_st fp5, 8(dst), 0, 0
+    ps_muls0 fp8, fp8, zS
+    psq_st fp6, 16(dst), 0, 0
+    ps_muls0 fp2, fp2, zS
+    psq_st fp7, 24(dst), 0, 0
+    psq_st fp8, 32(dst), 0, 0
+    psq_st fp2, 40(dst), 0, 0
+    blr  // explicit return: nofralloc suppresses the compiler-generated prologue/epilogue
+#endif // clang-format on
+}
+
+void PSMTXQuat(register GC_Mtx m, const register PSQuaternion* q)
+{ // Fills `m` from the four floats of `q` (presumably the rotation matrix of the quaternion); offsets 12, 28 and 44 receive zero.
+    register f32 c_zero, c_one, c_two, scale;
+    register f32 tmp0, tmp1, tmp2, tmp3, tmp4;
+    register f32 tmp5, tmp6, tmp7, tmp8, tmp9;
+
+    c_one = 1.0F;
+#ifdef __MWERKS__ // clang-format off
+    asm {
+        psq_l tmp0, 0(q), 0, 0  // float pair at offsets 0, 4 of q
+        psq_l tmp1, 8(q), 0, 0  // float pair at offsets 8, 12 of q
+        fsubs c_zero, c_one, c_one  // 0.0 = 1 - 1
+        fadds c_two, c_one, c_one  // 2.0 = 1 + 1
+        ps_mul tmp2, tmp0, tmp0
+        ps_merge10 tmp5, tmp0, tmp0
+        ps_madd tmp4, tmp1, tmp1, tmp2
+        ps_mul tmp3, tmp1, tmp1
+        ps_sum0 scale, tmp4, tmp4, tmp4  // slot 0 = n, the sum of squares of all four floats of q
+        ps_muls1 tmp7, tmp5, tmp1
+        fres tmp9, scale  // reciprocal estimate e of n
+        ps_sum1 tmp4, tmp3, tmp4, tmp2
+        ps_nmsub scale, scale, tmp9, c_two  // 2 - n*e
+        ps_muls1 tmp6, tmp1, tmp1
+        ps_mul scale, tmp9, scale  // e*(2 - n*e): refined 1/n
+        ps_sum0 tmp2, tmp2, tmp2, tmp2
+        fmuls scale, scale, c_two  // 2/n
+        ps_madd tmp8, tmp0, tmp5, tmp6
+        ps_msub tmp6, tmp0, tmp5, tmp6
+        psq_st c_zero, 12(m), 1, 0  // 0 -> offset 12
+        ps_nmsub tmp2, tmp2, scale, c_one
+        ps_nmsub tmp4, tmp4, scale, c_one
+        psq_st c_zero, 44(m), 1, 0  // 0 -> offset 44
+        ps_mul tmp8, tmp8,
scale
+        ps_mul tmp6, tmp6, scale
+        psq_st tmp2, 40(m), 1, 0  // single float -> offset 40
+        ps_madds0 tmp5, tmp0, tmp1, tmp7
+        ps_merge00 tmp1, tmp8, tmp4
+        ps_nmsub tmp7, tmp7, c_two, tmp5
+        ps_merge10 tmp0, tmp4, tmp6
+        psq_st tmp1, 16(m), 0, 0  // pair -> offsets 16, 20
+        ps_mul tmp5, tmp5, scale
+        ps_mul tmp7, tmp7, scale
+        psq_st tmp0, 0(m), 0, 0  // pair -> offsets 0, 4
+        psq_st tmp5, 8(m), 1, 0  // single float -> offset 8
+        ps_merge10 tmp3, tmp7, c_zero  // (slot 1 of tmp7, 0)
+        ps_merge01 tmp9, tmp7, tmp5
+        psq_st tmp3, 24(m), 0, 0  // pair -> offsets 24, 28 (offset 28 receives 0)
+        psq_st tmp9, 32(m), 0, 0  // pair -> offsets 32, 36
+    }
+#endif // clang-format on
+}
+
+void C_MTXLookAt(GC_Mtx m, const Vec* camPos, const Vec* camUp, const Vec* target) { // Fills `m` with rows vRight, vUp, vLook and, in column 3, minus the dot product of camPos with each row.
+    Vec vLook;
+    Vec vRight;
+    Vec vUp;
+
+    ASSERTMSGLINE(2105, m, "MTXLookAt(): NULL MtxPtr 'm' ");
+    ASSERTMSGLINE(2106, camPos, "MTXLookAt(): NULL VecPtr 'camPos' ");
+    ASSERTMSGLINE(2107, camUp, "MTXLookAt(): NULL VecPtr 'camUp' ");
+    ASSERTMSGLINE(2108, target, "MTXLookAt(): NULL Point3dPtr 'target' ");
+
+    vLook.x = camPos->x - target->x; // vLook runs from target towards the camera
+    vLook.y = camPos->y - target->y;
+    vLook.z = camPos->z - target->z;
+    VECNormalize(&vLook, &vLook);
+    VECCrossProduct(camUp, &vLook, &vRight); // presumably vRight = camUp x vLook -- VECCrossProduct is defined elsewhere
+    VECNormalize(&vRight, &vRight);
+    VECCrossProduct(&vLook, &vRight, &vUp); // presumably vUp = vLook x vRight; not normalized again here
+    m[0][0] = vRight.x;
+    m[0][1] = vRight.y;
+    m[0][2] = vRight.z;
+    m[0][3] = -((camPos->z * vRight.z) + ((camPos->x * vRight.x) + (camPos->y * vRight.y))); // -(camPos . vRight)
+    m[1][0] = vUp.x;
+    m[1][1] = vUp.y;
+    m[1][2] = vUp.z;
+    m[1][3] = -((camPos->z * vUp.z) + ((camPos->x * vUp.x) + (camPos->y * vUp.y))); // -(camPos . vUp)
+    m[2][0] = vLook.x;
+    m[2][1] = vLook.y;
+    m[2][2] = vLook.z;
+    m[2][3] = -((camPos->z * vLook.z) + ((camPos->x * vLook.x) + (camPos->y * vLook.y))); // -(camPos . vLook)
+}
+
+void C_MTXLightFrustum(GC_Mtx m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 scaleS,
+                       f32 scaleT, f32 transS, f32 transT)
+{ // Fills rows 0-2 of `m` from the planes t/b/l/r at distance n; scaleS/scaleT multiply the terms and transS/transT are subtracted in column 2.
+    f32 _tmp;
+
+    _tmp = 1 / (r - l); // no guard against r == l
+    m[0][0] = (scaleS * (2 * n * _tmp));
+    m[0][1] = 0;
+    m[0][2] = (scaleS * (_tmp * (r + l))) - transS;
+    m[0][3] = 0;
+    _tmp = 1 / (t - b); // no guard against t == b
+    m[1][0] = 0;
+    m[1][1] = (scaleT * (2 * n * _tmp));
+    m[1][2] = (scaleT * (_tmp * (t + b))) - transT;
+    m[1][3] = 0;
+
m[2][0] = 0;
+    m[2][1] = 0;
+    m[2][2] = -1;
+    m[2][3] = 0;
+}
+
+void C_MTXLightPerspective(GC_Mtx m, f32 fovY, f32 aspect, f32 scaleS, f32 scaleT,
+                           f32 transS, f32 transT)
+{ // Fills rows 0-2 of `m` from a vertical field of view `fovY` (degrees, see the conversion below) and `aspect`.
+    f32 angle;
+    f32 cot;
+
+    angle = (0.5f * fovY); // half of the field of view
+    angle = angle * 0.017453293f; // degrees -> radians (pi / 180)
+    cot = 1 / tanf(angle);
+    m[0][0] = (scaleS * (cot / aspect));
+    m[0][1] = 0;
+    m[0][2] = -transS;
+    m[0][3] = 0;
+    m[1][0] = 0;
+    m[1][1] = (cot * scaleT);
+    m[1][2] = -transT;
+    m[1][3] = 0;
+    m[2][0] = 0;
+    m[2][1] = 0;
+    m[2][2] = -1;
+    m[2][3] = 0;
+}
+
+void C_MTXLightOrtho(GC_Mtx m, f32 t, f32 b, f32 l, f32 r, f32 scaleS, f32 scaleT,
+                     f32 transS, f32 transT)
+{ // Fills rows 0-2 of `m` from the planes t/b/l/r: scaled terms in [0][0]/[1][1], offsets in column 3, row 2 = (0, 0, 0, 1).
+    f32 _tmp;
+
+    _tmp = 1 / (r - l); // no guard against r == l
+    m[0][0] = (2 * _tmp * scaleS);
+    m[0][1] = 0;
+    m[0][2] = 0;
+    m[0][3] = (transS + (scaleS * (_tmp * -(r + l))));
+    _tmp = 1 / (t - b); // no guard against t == b
+    m[1][0] = 0;
+    m[1][1] = (2 * _tmp * scaleT);
+    m[1][2] = 0;
+    m[1][3] = (transT + (scaleT * (_tmp * -(t + b))));
+    m[2][0] = 0;
+    m[2][1] = 0;
+    m[2][2] = 0;
+    m[2][3] = 1;
+}
diff --git a/src/static/dolphin/mtx/mtx44.c b/src/static/dolphin/mtx/mtx44.c
new file mode 100644
index 00000000..910d6576
--- /dev/null
+++ b/src/static/dolphin/mtx/mtx44.c
@@ -0,0 +1,88 @@
+#include
+#include
+#include
+
+extern f32 tanf(f32);
+
+// NOTE: this is not present in SMS but needed for .sdata2 to match
+// stolen from prime
+void C_MTXFrustum(Mtx44 m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f)
+{ // Fills all four rows of `m` from the planes t/b/l/r and the distances n/f; row 3 is (0, 0, -1, 0).
+    f32 tmp;
+
+    tmp = 1.0f / (r - l);
+    m[0][0] = (2 * n) * tmp;
+    m[0][1] = 0.0f;
+    m[0][2] = (r + l) * tmp;
+    m[0][3] = 0.0f;
+
+    tmp = 1.0f / (t - b);
+    m[1][0] = 0.0f;
+    m[1][1] = (2 * n) * tmp;
+    m[1][2] = (t + b) * tmp;
+    m[1][3] = 0.0f;
+
+    m[2][0] = 0.0f;
+    m[2][1] = 0.0f;
+
+    tmp = 1.0f / (f - n);
+    m[2][2] = -(n)*tmp;
+    m[2][3] = -(f * n) * tmp;
+    m[3][0] = 0.0f;
+    m[3][1] = 0.0f;
+    m[3][2] = -1.0f;
+    m[3][3] = 0.0f;
+}
+
+void C_MTXPerspective(Mtx44 m, f32 fovY, f32 aspect, f32 n, f32 f)
+{ // Fills all four rows of `m` from `fovY` (degrees), `aspect` and the distances n/f. Takes Mtx44, like C_MTXFrustum, because row 3 is written.
+    f32 angle;
+    f32 cot;
+    f32 tmp;
+
+    angle = (0.5f * fovY); // half of the field of view
+    angle = angle * 0.017453293f; // degrees -> radians (pi / 180)
+    cot = 1 / tanf(angle);
+    m[0][0] = (cot / aspect);
+    m[0][1] = 0;
+    m[0][2] = 0;
+    m[0][3] = 0;
+    m[1][0] = 0;
+    m[1][1] = (cot);
+    m[1][2] = 0;
+    m[1][3] = 0;
+    m[2][0] = 0;
+    m[2][1] = 0;
+    tmp = 1 / (f - n); // no guard against f == n
+    m[2][2] = (-n * tmp);
+    m[2][3] = (tmp * -(f * n));
+    m[3][0] = 0; // fourth row: only valid storage when `m` really has four rows
+    m[3][1] = 0;
+    m[3][2] = -1;
+    m[3][3] = 0;
+}
+
+void C_MTXOrtho(Mtx44 m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f)
+{ // Fills all four rows of `m` from the planes t/b/l/r and the distances n/f. Takes Mtx44, like C_MTXFrustum, because row 3 is written.
+    f32 tmp;
+
+    tmp = 1 / (r - l); // no guard against r == l
+    m[0][0] = 2 * tmp;
+    m[0][1] = 0;
+    m[0][2] = 0;
+    m[0][3] = (tmp * -(r + l));
+    tmp = 1 / (t - b); // no guard against t == b
+    m[1][0] = 0;
+    m[1][1] = 2 * tmp;
+    m[1][2] = 0;
+    m[1][3] = (tmp * -(t + b));
+    m[2][0] = 0;
+    m[2][1] = 0;
+    tmp = 1 / (f - n); // no guard against f == n
+    m[2][2] = (-1 * tmp);
+    m[2][3] = (-f * tmp);
+    m[3][0] = 0;
+    m[3][1] = 0;
+    m[3][2] = 0;
+    m[3][3] = 1;
+}
diff --git a/src/static/dolphin/mtx/mtxvec.c b/src/static/dolphin/mtx/mtxvec.c
new file mode 100644
index 00000000..325609da
--- /dev/null
+++ b/src/static/dolphin/mtx/mtxvec.c
@@ -0,0 +1,103 @@
+#include
+#include
+
+#define qr0 0 // quantization-register selector passed as the last psq_l/psq_st operand
+
+asm void PSMTXMultVec(const register GC_Mtx m, const register Vec* src, register Vec* dst)
+{ // dst = m * (src, 1): each output is a row's first three floats dotted with src, plus the row's fourth float. Only byte offsets 0-47 of m are read.
+#ifdef __MWERKS__ // clang-format off
+    nofralloc
+    psq_l f0, Vec.x(src), 0, qr0  // pair starting at src->x
+    psq_l f2, 0(m), 0, qr0
+    psq_l f1, Vec.z(src), 1, qr0  // W=1 load: (src->z, 1.0)
+    ps_mul f4, f2, f0
+    psq_l f3, 8(m), 0, qr0
+    ps_madd f5, f3, f1, f4
+    psq_l f8, 16(m), 0, qr0
+    ps_sum0 f6, f5, f6, f5  // slot 0 = sum of both slots of f5; slot 1 is not stored
+    psq_l f9, 24(m), 0, qr0
+    ps_mul f10, f8, f0
+    psq_st f6, Vec.x(dst), 1, qr0
+    ps_madd f11, f9, f1, f10
+    psq_l f2, 32(m), 0, qr0
+    ps_sum0 f12, f11, f12, f11
+    psq_l f3, 40(m), 0, qr0
+    ps_mul f4, f2, f0
+    psq_st f12, Vec.y(dst), 1, qr0
+    ps_madd f5, f3, f1, f4
+    ps_sum0 f6, f5, f6, f5
+    psq_st f6, Vec.z(dst), 1, qr0
+    blr
+#endif // clang-format on
+}
+
+asm void PSMTXMultVecArray(const register GC_Mtx m, const register Vec* srcBase,
+                           register Vec* dstBase, register u32 count)
+{ // Transforms Vecs from srcBase into dstBase by m. NOTE(review): count - 1 is moved to CTR ahead of a bdnz loop, so count presumably must be >= 2 -- confirm.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f13, 0x0(m), 0, qr0
+    psq_l f12, 0x10(m), 0, qr0
+    subi count, count, 0x1  // becomes the CTR value for the loop
+    psq_l f11,
0x8(m), 0, qr0
+    ps_merge00 f0, f13, f12
+    subi dstBase, dstBase, 0x4  // pre-bias so the first psq_stu at 0x4(dstBase) lands on the original dstBase
+    psq_l f10, 0x18(m), 0, qr0
+    ps_merge11 f1, f13, f12
+
+    mtctr count  // count was already decremented by one above
+    psq_l f4, 0x20(m), 0, qr0
+    ps_merge00 f2, f11, f10
+    psq_l f5, 0x28(m), 0, qr0
+    ps_merge11 f3, f11, f10
+    psq_l f6, Vec.x(srcBase), 0, qr0
+    psq_lu f7, Vec.z(srcBase), 1, qr0  // update form: srcBase advances to the loaded address
+    ps_madds0 f8, f0, f6, f3
+    ps_mul f9, f4, f6
+    ps_madds1 f8, f1, f6, f8
+    ps_madd f10, f5, f7, f9
+
+loop:
+    psq_lu f6, Vec.y(srcBase), 0, qr0  // load the next input while the previous result is finished
+    ps_madds0 f12, f2, f7, f8
+    psq_lu f7, Vec.z(srcBase), 1, qr0
+    ps_sum0 f13, f10, f9, f10
+    ps_madds0 f8, f0, f6, f3
+    ps_mul f9, f4, f6
+    psq_stu f12, 0x4(dstBase), 0, qr0  // pair store, dstBase += 4
+    ps_madds1 f8, f1, f6, f8
+    psq_stu f13, 0x8(dstBase), 1, qr0  // single store, dstBase += 8
+    ps_madd f10, f5, f7, f9
+    bdnz loop  // decrements CTR first; NOTE(review): a CTR of 0 (count == 1) would wrap instead of exiting -- confirm callers pass count >= 2
+
+    ps_madds0 f12, f2, f7, f8
+    ps_sum0 f13, f10, f9, f10
+    psq_stu f12, 0x4(dstBase), 0, qr0
+    psq_stu f13, 0x8(dstBase), 1, qr0
+#endif // clang-format on
+}
+
+asm void PSMTXMultVecSR(const register GC_Mtx m, const register Vec* src, register Vec* dst)
+{ // dst = first three floats of each row of m dotted with src; the floats at offsets 12/28/44 do not reach the stored result. Only byte offsets 0-47 of m are read.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f0, 0x0(m), 0, qr0
+    psq_l f6, Vec.x(src), 0, qr0  // pair starting at src->x
+    psq_l f2, 0x10(m), 0, qr0
+    ps_mul f8, f0, f6
+    psq_l f4, 0x20(m), 0, qr0
+    ps_mul f10, f2, f6
+    psq_l f7, Vec.z(src), 1, qr0  // W=1 load: (src->z, 1.0)
+    ps_mul f12, f4, f6
+    psq_l f3, 0x18(m), 0, qr0
+    ps_sum0 f8, f8, f8, f8  // slot 0 = sum of both product slots
+    psq_l f5, 0x28(m), 0, qr0
+    ps_sum0 f10, f10, f10, f10
+    psq_l f1, 0x8(m), 0, qr0
+    ps_sum0 f12, f12, f12, f12
+    ps_madd f9, f1, f7, f8
+    psq_st f9, Vec.x(dst), 1, qr0  // single store: only slot 0 is written
+    ps_madd f11, f3, f7, f10
+    psq_st f11, Vec.y(dst), 1, qr0
+    ps_madd f13, f5, f7, f12
+    psq_st f13, Vec.z(dst), 1, qr0
+#endif // clang-format on
+}
diff --git a/src/static/dolphin/mtx/vec.c b/src/static/dolphin/mtx/vec.c
new file mode 100644
index 00000000..995438aa
--- /dev/null
+++ b/src/static/dolphin/mtx/vec.c
@@ -0,0 +1,171 @@
+#include
+#include
+
+#define qr0 0 // quantization-register selector passed as the last psq_l/psq_st operand
+
+asm void PSVECAdd(register Vec* a, register Vec* b, register Vec* c)
+{ // c = a + b, component-wise.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f2, Vec.x(a), 0, qr0
+    psq_l f4,
Vec.x(b), 0, qr0
+    ps_add f6, f2, f4
+    psq_st f6, Vec.x(c), 0, qr0  // pair starting at c->x
+    psq_l f3, Vec.z(a), 1, qr0
+    psq_l f5, Vec.z(b), 1, qr0
+    ps_add f7, f3, f5
+    psq_st f7, Vec.z(c), 1, qr0  // single float: c->z
+#endif // clang-format on
+}
+
+asm void PSVECSubtract(register Vec* a, register Vec* b, register Vec* c)
+{ // c = a - b, component-wise.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f2, Vec.x(a), 0, qr0
+    psq_l f4, Vec.x(b), 0, qr0
+    ps_sub f6, f2, f4
+    psq_st f6, Vec.x(c), 0, qr0  // pair starting at c->x
+    psq_l f3, Vec.z(a), 1, qr0
+    psq_l f5, Vec.z(b), 1, qr0
+    ps_sub f7, f3, f5
+    psq_st f7, Vec.z(c), 1, qr0  // single float: c->z
+#endif // clang-format on
+}
+
+asm void PSVECScale(register Vec* src, register Vec* dst, register f32 mult)
+{ // dst = src scaled by the factor in f1; the asm never names `mult`, so this presumably relies on the ABI passing it in f1 -- confirm.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f0, Vec.x(src), 0, qr0
+    psq_l f2, Vec.z(src), 1, qr0
+    ps_muls0 f0, f0, f1
+    psq_st f0, Vec.x(dst), 0, qr0
+    ps_muls0 f0, f2, f1
+    psq_st f0, Vec.z(dst), 1, qr0
+#endif // clang-format on
+}
+
+void PSVECNormalize(const register Vec* vec1, register Vec* dst)
+{ // dst = vec1 times an approximation of 1/sqrt(x*x + y*y + z*z) (frsqrte plus one refinement step); a zero-length input is not special-cased.
+    register float c_half = 0.5f;
+    register float c_three = 3.0f;
+    register float v1_xy;
+    register float v1_z;
+    register float xx_zz;
+    register float xx_yy;
+    register float sqsum;
+    register float rsqrt;
+    register float nwork0;
+    register float nwork1;
+
+#ifdef __MWERKS__ // clang-format off
+    asm
+    {
+        psq_l v1_xy, Vec.x(vec1), 0, qr0  // pair starting at vec1->x
+        ps_mul xx_yy, v1_xy, v1_xy  // (x*x, y*y)
+        psq_l v1_z, Vec.z(vec1), 1, qr0  // W=1 load: (z, 1.0)
+        ps_madd xx_zz, v1_z, v1_z, xx_yy  // slot 0 = z*z + x*x
+        ps_sum0 sqsum, xx_zz, v1_z, xx_yy  // slot 0 = (z*z + x*x) + y*y
+        frsqrte rsqrt, sqsum  // estimate r of 1/sqrt(sqsum)
+        fmuls nwork0, rsqrt, rsqrt  // r*r
+        fmuls nwork1, rsqrt, c_half  // r/2
+        fnmsubs nwork0, nwork0, sqsum, c_three  // 3 - r*r*sqsum
+        fmuls rsqrt, nwork0, nwork1  // (r/2)*(3 - r*r*sqsum): one Newton-Raphson refinement
+        ps_muls0 v1_xy, v1_xy, rsqrt
+        psq_st v1_xy, Vec.x(dst), 0, qr0
+        ps_muls0 v1_z, v1_z, rsqrt
+        psq_st v1_z, Vec.z(dst), 1, qr0
+    }
+#endif // clang-format on
+}
+
+asm float PSVECMag(register Vec* v)
+{ // Leaves in f1 the squared length of v multiplied by a refined reciprocal-square-root estimate, i.e. an approximate length.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f0, Vec.x(v), 0, qr0
+    ps_mul f0, f0, f0  // (x*x, y*y)
+    lfs f1, Vec.z(v)
+    ps_madd f1, f1, f1, f0  // slot 0 = z*z + x*x
+    lfs f4, 0.5f
+    ps_sum0 f1, f1, f0, f0  // slot 0 = (z*z + x*x) + y*y
+    frsqrte f0, f1  // estimate r of 1/sqrt(f1)
+    lfs f3,
3.0f
+    fmuls f2, f0, f0  // r*r
+    fmuls f0, f0, f4  // r/2
+    fnmsubs f2, f2, f1, f3  // 3 - r*r*f1
+    fmuls f0, f2, f0  // refined estimate of 1/sqrt(f1)
+    fsel f0, f0, f0, f1  // keep f0 when f0 >= 0, otherwise fall back to f1 (e.g. when the refinement produced NaN)
+    fmuls f1, f1, f0  // result in f1
+#endif // clang-format on
+}
+
+asm f32 PSVECDotProduct(register Vec* vec1, register Vec* vec2)
+{ // Leaves the dot product of vec1 and vec2 in f1.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f2, Vec.y(vec1), 0, qr0  // pair starting at vec1->y
+    psq_l f3, Vec.y(vec2), 0, qr0
+    ps_mul f2, f2, f3  // (y1*y2, z1*z2)
+    psq_l f5, Vec.x(vec1), 0, qr0
+    psq_l f4, Vec.x(vec2), 0, qr0
+    ps_madd f3, f5, f4, f2  // slot 0 = x1*x2 + y1*y2
+    ps_sum0 f1, f3, f2, f2  // slot 0 = (x1*x2 + y1*y2) + z1*z2
+#endif // clang-format on
+}
+
+asm void PSVECCrossProduct(register Vec* vec1, register Vec* vec2,
+                           register Vec* dst)
+{ // dst = vec1 x vec2; x is stored as a single float, then y and z as a pair.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f1, Vec.x(vec2), 0, qr0
+    lfs f2, Vec.z(vec1)
+    psq_l f0, Vec.x(vec1), 0, qr0
+    ps_merge10 f6, f1, f1  // vec2 pair with its slots swapped
+    lfs f3, Vec.z(vec2)
+    ps_mul f4, f1, f2
+    ps_muls0 f7, f1, f0
+    ps_msub f5, f0, f3, f4
+    ps_msub f8, f0, f6, f7
+    ps_merge11 f9, f5, f5
+    ps_merge01 f10, f5, f8
+    psq_st f9, Vec.x(dst), 1, qr0
+    ps_neg f10, f10  // the (y, z) pair was computed with the opposite sign
+    psq_st f10, Vec.y(dst), 0, qr0
+#endif // clang-format on
+}
+
+asm f32 PSVECSquareDistance(register Vec* vec1, register Vec* vec2)
+{ // Leaves in f1 the sum of squared component differences between vec1 and vec2.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f0, Vec.y(vec1), 0, qr0
+    psq_l f1, Vec.y(vec2), 0, qr0
+    ps_sub f2, f0, f1  // (dy, dz)
+    psq_l f0, Vec.x(vec1), 0, qr0
+    psq_l f1, Vec.x(vec2), 0, qr0
+    ps_mul f2, f2, f2  // (dy*dy, dz*dz)
+    ps_sub f0, f0, f1  // slot 0 = dx
+    ps_madd f1, f0, f0, f2  // slot 0 = dx*dx + dy*dy
+    ps_sum0 f1, f1, f2, f2  // slot 0 = (dx*dx + dy*dy) + dz*dz
+#endif // clang-format on
+}
+
+asm f32 PSVECDistance(register Vec* vec1, register Vec* vec2)
+{ // Leaves in f1 the squared distance multiplied by a refined reciprocal-square-root estimate, i.e. an approximate distance.
+#ifdef __MWERKS__ // clang-format off
+    psq_l f0, Vec.y(vec1), 0, qr0
+    psq_l f1, Vec.y(vec2), 0, qr0
+    ps_sub f2, f0, f1  // (dy, dz)
+    psq_l f0, Vec.x(vec1), 0, qr0
+    psq_l f1, Vec.x(vec2), 0, qr0
+    ps_mul f2, f2, f2  // (dy*dy, dz*dz)
+    ps_sub f0, f0, f1  // slot 0 = dx
+    lfs f3, 0.5f
+    ps_madd f0, f0, f0, f2  // slot 0 = dx*dx + dy*dy
+    ps_sum0 f0, f0, f2, f2  // slot 0 = squared distance
+    lfs f4, 3.0f
+    frsqrte f1, f0  // estimate r of 1/sqrt(f0)
+    fmuls f2, f1, f1  // r*r
+    fmuls f1, f1, f3  // r/2
+    fnmsubs f2, f2, f0, f4  // 3 - r*r*f0
+    fmuls f1, f2, f1  // refined estimate of 1/sqrt(f0)
+    fsel f1, f1, f1, f0  // keep f1 when f1 >= 0, otherwise fall back to f0 (e.g. when the refinement produced NaN)
+    fmuls f1, f0, f1  // result in f1
+#endif // clang-format on
+}