rsx/cfg/fp: Add delay-slot detection to remove unnecessary barriers

- Reduces the number of emitted barriers by roughly 99%
This commit is contained in:
kd-11 2025-12-07 23:59:48 +03:00 committed by kd-11
parent 93f89b8a74
commit 1e6fe1f4ab
4 changed files with 77 additions and 5 deletions

View File

@ -1,6 +1,7 @@
#include "stdafx.h"
#include "RegisterAnnotationPass.h"
#include "Emu/RSX/Program/Assembler/FPOpcodes.h"
#include "Emu/RSX/Program/RSXFragmentProgram.h"
#include <span>
#include <unordered_map>
@ -13,6 +14,38 @@ namespace rsx::assembler::FP
static constexpr char content_float16 = 'H';
static constexpr char content_dual = 'D';
bool is_delay_slot(const Instruction& instruction)
{
OPDEST dst{ .HEX = instruction.bytecode[0] };
SRC0 src0{ .HEX = instruction.bytecode[1] };
SRC1 src1{ .HEX = instruction.bytecode[2] };
if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV
dst.no_dest || // Must have a sink
src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg
dst.dest_reg != src0.tmp_reg_index || // Must be a write-to-self
dst.fp16 || // Always full lane. We need to collect more data on this but it won't matter
dst.saturate || // Precision modifier
(dst.prec != RSX_FP_PRECISION_REAL &&
dst.prec != RSX_FP_PRECISION_UNKNOWN)) // Cannot have precision modifiers
{
return false;
}
// Check if we have precision modifiers on the source
if (src0.abs || src0.neg || src1.scale)
{
return false;
}
if (dst.mask_x && src0.swizzle_x != 0) return false;
if (dst.mask_y && src0.swizzle_y != 1) return false;
if (dst.mask_z && src0.swizzle_z != 2) return false;
if (dst.mask_w && src0.swizzle_w != 3) return false;
return true;
}
std::vector<RegisterRef> compile_register_file(const std::array<char, 48 * 8>& file)
{
std::vector<RegisterRef> results;
@ -90,10 +123,15 @@ namespace rsx::assembler::FP
}
// Decay instructions into register references
void annotate_instructions(BasicBlock* block, const RSXFragmentProgram& prog)
void annotate_instructions(BasicBlock* block, const RSXFragmentProgram& prog, bool skip_delay_slots)
{
for (auto& instruction : block->instructions)
{
if (skip_delay_slots && is_delay_slot(instruction))
{
continue;
}
const u32 operand_count = get_operand_count(static_cast<FP_opcode>(instruction.opcode));
for (u32 i = 0; i < operand_count; i++)
{
@ -178,7 +216,7 @@ namespace rsx::assembler::FP
{
for (auto& block : graph.blocks)
{
annotate_instructions(&block, m_prog);
annotate_instructions(&block, m_prog, m_config.skip_delay_slots);
annotate_block_io(&block);
}
}

View File

@ -6,6 +6,11 @@ struct RSXFragmentProgram;
namespace rsx::assembler::FP
{
// Configuration knobs for the register annotation pass.
// Kept as a plain aggregate so that it can be brace-initialized by callers.
struct RegisterAnnotationPassOptions
{
	// Off by default. When switched on, instructions recognised as delay slots
	// are detected and left without annotations.
	bool skip_delay_slots{ false };
};
// The annotation pass annotates each basic block with 2 pieces of information:
// 1. The "input" register list for a block.
// 2. The "output" register list for a block (clobber list).
@ -14,13 +19,16 @@ namespace rsx::assembler::FP
// CFG pass that annotates a flow graph; configured through RegisterAnnotationPassOptions.
// NOTE(review): this span comes from a diff view and appears to show both the previous
// single-argument constructor head and its two-argument replacement - confirm against the real header.
class RegisterAnnotationPass : public CFGPass
{
public:
RegisterAnnotationPass(const RSXFragmentProgram& prog)
: m_prog(prog)
// Binds a reference to `prog` (not a copy) and stores a copy of `options`.
// `options` defaults to a value-initialized RegisterAnnotationPassOptions.
RegisterAnnotationPass(
const RSXFragmentProgram& prog,
const RegisterAnnotationPassOptions& options = {})
: m_prog(prog), m_config(options)
{}
// Entry point of the pass (overrides CFGPass::run); operates on `graph`.
void run(FlowGraph& graph) override;
private:
// Held by reference: the program passed to the constructor must outlive this pass object.
const RSXFragmentProgram& m_prog;
// Options captured by value at construction time.
RegisterAnnotationPassOptions m_config;
};
}

View File

@ -1294,7 +1294,7 @@ std::string FragmentProgramDecompiler::Decompile()
const auto rop_inputs = get_fragment_program_output_set(m_prog.ctrl, m_prog.mrt_buffers_count);
rop_block->input_list.insert(rop_block->input_list.end(), rop_inputs.begin(), rop_inputs.end());
FP::RegisterAnnotationPass annotation_pass{ m_prog };
FP::RegisterAnnotationPass annotation_pass{ m_prog, { .skip_delay_slots = true } };
FP::RegisterDependencyPass dependency_pass{};
annotation_pass.run(graph);

View File

@ -568,4 +568,30 @@ namespace rsx::assembler
EXPECT_EQ(src1.fp16, 1);
EXPECT_EQ(src1.swizzle_x, 1);
}
// Checks that, with skip_delay_slots enabled, running the annotation pass followed by the
// dependency pass leaves the instruction count of the block unchanged.
TEST(TestFPIR, RegisterDependencyPass_SkipDelaySlots)
{
	// Instruction 2 clobbers H1 which in turn clobbers R0.
	// Instruction 3 reads from R0 but is a delay slot that does nothing and can be NOPed.
	auto graph = CFG_from_source(R"(
ADD R1, R0, R1;
MOV H1, R1;
MOV R0, R0;
)");

	// Sanity-check the parsed program before running any pass.
	ASSERT_EQ(graph.blocks.size(), 1);
	ASSERT_EQ(graph.blocks.front().instructions.size(), 3);

	auto& block = graph.blocks.front();

	RSXFragmentProgram prog{};
	FP::RegisterAnnotationPass annotation_pass{ prog, { .skip_delay_slots = true } };
	FP::RegisterDependencyPass deps_pass{};

	annotation_pass.run(graph);
	deps_pass.run(graph);

	// Delay slot detection will cause no dependency injection,
	// so the block must still hold exactly the three original instructions.
	ASSERT_EQ(block.instructions.size(), 3);
}
}