From 64bcd8c03081ad00a6a48f00f4c584bf6646b07a Mon Sep 17 00:00:00 2001 From: Tyler Wilding Date: Mon, 30 Mar 2026 20:20:47 -0400 Subject: [PATCH] goalc: Get CodeTester tests passing on Arm64 (only targeting macOS atm) (#3290) This PR does the following: - Designs a mechanism by which arm64 instructions can be encoded and emitted - Dispatch our higher-level instruction emitting calls to either x86 or arm64 instructions depending on what the compiler is set to (defaults to x86) - Bare minimum scaffolding to get the arm64 instructions successfully executing at least on apple silicon - Implement enough instructions to get the codetester test suite passing on arm --- .github/workflows/build-matrix.yaml | 2 +- .github/workflows/release-pipeline.yaml | 4 +- .vscode/launch.json | 24 +- CMakeLists.txt | 16 +- CMakePresets.json | 18 +- Taskfile.yml | 3 + common/custom_data/TFrag3Data.cpp | 7 +- common/util/crc32.h | 8 +- common/util/os.cpp | 1 + common/util/simd_util.h | 7 + decompiler/extractor/main.cpp | 2 +- game/CMakeLists.txt | 3 +- game/common/vu.h | 7 +- game/graphics/opengl_renderer/SkyBlendCPU.cpp | 5 +- .../opengl_renderer/background/Tie3.cpp | 1 - .../opengl_renderer/background/Tie3.h | 1 - .../background/background_common.cpp | 11 +- .../opengl_renderer/foreground/Merc2.cpp | 7 +- game/runtime.cpp | 13 +- game/sound/989snd/sndplay.cpp | 1 - game/system/hid/input_bindings.h | 2 +- goalc/CMakeLists.txt | 5 + goalc/compiler/CodeGenerator.cpp | 112 +- goalc/compiler/CodeGenerator.h | 11 +- goalc/compiler/Compiler.cpp | 8 +- goalc/compiler/Compiler.h | 3 + goalc/compiler/IR.cpp | 888 +- goalc/compiler/IR.h | 380 +- goalc/compiler/compilation/Function.cpp | 12 +- goalc/compiler/compilation/Type.cpp | 8 +- goalc/debugger/disassemble.cpp | 5 +- goalc/debugger/disassemble.h | 8 +- goalc/emitter/CodeTester.cpp | 128 +- goalc/emitter/CodeTester.h | 27 +- goalc/emitter/IGen.cpp | 1089 +++ goalc/emitter/IGen.h | 3707 ++------ goalc/emitter/IGenARM64.cpp | 1203 +++ 
goalc/emitter/IGenARM64.h | 803 ++ goalc/emitter/IGenX86.cpp | 2450 ++++++ goalc/emitter/IGenX86.h | 803 ++ goalc/emitter/Instruction.h | 169 +- goalc/emitter/InstructionSet.h | 5 + goalc/emitter/ObjectGenerator.cpp | 10 +- goalc/emitter/ObjectGenerator.h | 5 + goalc/emitter/Register.h | 65 +- goalc/main.cpp | 6 +- goalc/simple_main.cpp | 10 +- lsp/state/workspace.cpp | 5 +- test/CMakeLists.txt | 2 - test/goalc/CMakeLists.txt | 54 +- test/goalc/test_CodeTester.cpp | 378 + test/goalc/test_arithmetic.cpp | 4 +- test/goalc/test_collections.cpp | 2 +- test/goalc/test_compiler.cpp | 4 +- test/goalc/test_control_statements.cpp | 4 +- test/goalc/test_debugger.cpp | 12 +- test/goalc/test_game_no_debug.cpp | 2 +- test/goalc/test_goal_kernel.cpp | 2 +- test/goalc/test_goal_kernel2.cpp | 2 +- test/goalc/test_goal_kernel3.cpp | 2 +- test/goalc/test_jak2_compiler.cpp | 2 +- test/goalc/test_type_consistency.cpp | 16 +- test/goalc/test_variables.cpp | 2 +- test/goalc/test_vector_float.cpp | 2 +- test/goalc/test_with_game.cpp | 2 +- test/offline/config/jak1/config.jsonc | 64 +- test/offline/framework/execution.cpp | 2 +- test/test_CodeTester.cpp | 233 - test/test_emitter.cpp | 7804 +++++++++-------- test/test_emitter_avx.cpp | 666 +- third-party/sse2neon/sse2neon.h | 5268 +++++++---- vendor.yaml | 2 +- 72 files changed, 17001 insertions(+), 9598 deletions(-) create mode 100644 common/util/simd_util.h create mode 100644 goalc/emitter/IGen.cpp create mode 100644 goalc/emitter/IGenARM64.cpp create mode 100644 goalc/emitter/IGenARM64.h create mode 100644 goalc/emitter/IGenX86.cpp create mode 100644 goalc/emitter/IGenX86.h create mode 100644 goalc/emitter/InstructionSet.h create mode 100644 test/goalc/test_CodeTester.cpp delete mode 100644 test/test_CodeTester.cpp diff --git a/.github/workflows/build-matrix.yaml b/.github/workflows/build-matrix.yaml index 2f6cf5a2f4..2afb74f7fb 100644 --- a/.github/workflows/build-matrix.yaml +++ b/.github/workflows/build-matrix.yaml @@ -57,6 +57,6 @@ 
jobs: name: "🍎 MacOS" uses: ./.github/workflows/macos-build-arm.yaml with: - cmakePreset: "Release-macos-x86_64-clang" + cmakePreset: "Release-macos-arm64-clang" cachePrefix: "" secrets: inherit diff --git a/.github/workflows/release-pipeline.yaml b/.github/workflows/release-pipeline.yaml index 9eb4269909..29189cbac2 100644 --- a/.github/workflows/release-pipeline.yaml +++ b/.github/workflows/release-pipeline.yaml @@ -79,7 +79,7 @@ jobs: uploadArtifacts: true secrets: inherit - build_macos_arm: + build_macos_arm_rosetta: needs: - cut_release name: "🍎 MacOS" @@ -98,7 +98,7 @@ jobs: - build_windows_clang - build_linux_clang - build_macos_intel - - build_macos_arm + - build_macos_arm_rosetta name: "Upload Artifacts" runs-on: ubuntu-latest steps: diff --git a/.vscode/launch.json b/.vscode/launch.json index 0c44e384f5..ab489aa64e 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,13 +5,31 @@ "version": "0.2.0", "configurations": [ { - "name": "run python script", + "name": "Run C++ Tests LLDB", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/build/goalc-test", + "args": [ + "--gtest_brief=0", + "--gtest_filter=*CodeTester*", + "--gtest_break_on_failure" + ], + "stopAtEntry": false, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "lldb" + }, + { + "name": "Append File Docs", "type": "python", "request": "launch", "program": "${workspaceFolder}/scripts/ci/lint-characters.py", "console": "integratedTerminal", "cwd": "${workspaceFolder}", - "args": ["--fix"] + "args": [ + "--fix" + ] }, ] -} +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d39eaa652..aeceb3ebca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,13 @@ endif() # a more recent issue - https://github.com/libsdl-org/SDL/issues/12078 if (APPLE) enable_language(OBJC) + execute_process( + COMMAND xcrun --show-sdk-path + OUTPUT_VARIABLE MACOSX_SYSROOT + OUTPUT_STRIP_TRAILING_WHITESPACE + 
) + # Tell CMake to use it + set(CMAKE_OSX_SYSROOT "${MACOSX_SYSROOT}" CACHE PATH "macOS SDK path" FORCE) endif() # Setup compiler flags @@ -127,11 +134,12 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") -Wsign-promo \ -fdiagnostics-color=always" ) - - # pin to AVX for macOS, hopefully all macOS runners have atleast this architecture - # technically speaking, SSE4 is the cutoff for Apple Silicon so...only a matter of time! - if(NOT CMAKE_CXX_COMPILER_TARGET STREQUAL "arm64-apple-darwin") + + # TODO - make a proper flag for arm compiling + if (CMAKE_APPLE_SILICON_PROCESSOR STREQUAL "x86_64") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mcrc") endif() # additional c++ flags for release mode for our projects diff --git a/CMakePresets.json b/CMakePresets.json index 9dd278d59b..f5e6a75eb3 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -62,12 +62,22 @@ "name": "base-linux-debug", "hidden": true, "inherits": "base", - "binaryDir": "${sourceDir}/build/Release/bin", + "binaryDir": "${sourceDir}/build/Debug/bin", "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "CMAKE_INSTALL_PREFIX": "${sourceDir}/build/install/${presetName}" } }, + { + "name": "base-macos-debug", + "hidden": true, + "inherits": "base", + "binaryDir": "${sourceDir}/build/Debug/bin", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}" + } + }, { "name": "base-macos-release", "hidden": true, @@ -174,6 +184,12 @@ "description": "Build with Clang as Release without Debug Symbols", "inherits": ["base-linux-release", "base-clang"] }, + { + "name": "Debug-macos-arm64-clang", + "displayName": "MacOS ARM64 Debug (clang)", + "description": "Build for ARM64 with Clang as Debug", + "inherits": ["base-macos-arm64", "base-macos-debug", "base-clang"] + }, { "name": "Release-macos-arm64-clang", "displayName": "MacOS ARM64 Release (clang)", diff --git a/Taskfile.yml 
b/Taskfile.yml index 7fe511f955..4e80bf997f 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -237,3 +237,6 @@ tasks: type-test: cmds: - cmd: '{{.GOALCTEST_BIN_RELEASE_DIR}}/goalc-test --gtest_brief=0 --gtest_filter="*{{.TYPE_CONSISTENCY_TEST_FILTER}}*" --gtest_break_on_failure' + tests-filtered: + cmds: + - cmd: '{{.GOALCTEST_BIN_RELEASE_DIR}}/goalc-test --gtest_brief=0 --gtest_filter="*{{.FILTER}}*" --gtest_break_on_failure' diff --git a/common/custom_data/TFrag3Data.cpp b/common/custom_data/TFrag3Data.cpp index 12cb53a3dc..4ef1fa09dd 100644 --- a/common/custom_data/TFrag3Data.cpp +++ b/common/custom_data/TFrag3Data.cpp @@ -3,14 +3,9 @@ #include #include -#ifndef __aarch64__ -#include "xmmintrin.h" -#else -#include "third-party/sse2neon/sse2neon.h" -#endif - #include "common/log/log.h" #include "common/util/Assert.h" +#include "common/util/simd_util.h" namespace tfrag3 { diff --git a/common/util/crc32.h b/common/util/crc32.h index 4ce43c7cba..aae05f5121 100644 --- a/common/util/crc32.h +++ b/common/util/crc32.h @@ -10,17 +10,16 @@ u32 crc32(const u8* data, size_t size); #ifdef __aarch64__ #include +// Computes CRC32C inline u32 crc32(const u8* data, size_t size) { u32 result = 0xffffffff; while (size >= 4) { - u32 x; - memcpy(&x, data, 4); + result = __crc32cw(result, *reinterpret_cast(data)); data += 4; size -= 4; - result = __crc32w(result, x); } while (size) { - result = __crc32b(result, *data); + result = __crc32cb(result, *data); data++; size--; } @@ -28,6 +27,7 @@ inline u32 crc32(const u8* data, size_t size) { } #else #include +// Computes CRC32C inline u32 crc32(const u8* data, size_t size) { u32 result = 0xffffffff; while (size >= 4) { diff --git a/common/util/os.cpp b/common/util/os.cpp index e928bc5cf7..45506835e7 100644 --- a/common/util/os.cpp +++ b/common/util/os.cpp @@ -47,6 +47,7 @@ void __cpuidex(int result[4], int eax, int ecx) { : "0"(eax), "2"(ecx)); } #else +// TODO ARM - implement ARM64 detection, check for NEON instead of AVX // for now, just 
return 0's. void __cpuidex(int result[4], int eax, int ecx) { lg::warn("cpuid not implemented on this platform"); diff --git a/common/util/simd_util.h b/common/util/simd_util.h new file mode 100644 index 0000000000..db76336033 --- /dev/null +++ b/common/util/simd_util.h @@ -0,0 +1,7 @@ +#pragma once + +#ifndef __aarch64__ +#include +#else +#include "third-party/sse2neon/sse2neon.h" +#endif \ No newline at end of file diff --git a/decompiler/extractor/main.cpp b/decompiler/extractor/main.cpp index 3d203811b5..3f015ea647 100644 --- a/decompiler/extractor/main.cpp +++ b/decompiler/extractor/main.cpp @@ -129,7 +129,7 @@ ExtractorErrorCode compile(const fs::path& iso_data_path, const std::string& dat // Determine which config to use from the database const auto version_info = get_version_info_or_default(iso_data_path); - Compiler compiler(game_name_to_version(version_info.game_name)); + Compiler compiler(game_name_to_version(version_info.game_name), emitter::InstructionSet::X86); compiler.make_system().set_constant("*iso-data*", absolute(iso_data_path).string()); compiler.make_system().set_constant("*use-iso-data-path*", true); file_util::set_iso_data_dir(absolute(iso_data_path)); diff --git a/game/CMakeLists.txt b/game/CMakeLists.txt index 60365627ce..1677eca0e5 100644 --- a/game/CMakeLists.txt +++ b/game/CMakeLists.txt @@ -12,8 +12,7 @@ if(ARM64_ARCH) set(OG_ASM_FUNCS_FILE kernel/asm_funcs_arm64.s) enable_language(ASM) set(CMAKE_ASM_SOURCE_FILE_EXTENSIONS ${CMAKE_ASM_SOURCE_FILE_EXTENSIONS} s) - # set(CMAKE_ASM_COMPILE_OBJECT "${CMAKE_ASM_COMPILER} -o ") - set_source_files_properties(${OG_ASM_FUNCS_FILE} PROPERTIES COMPILE_FLAGS "-g") + set_source_files_properties(${OG_ASM_FUNCS_FILE} PROPERTIES COMPILE_FLAGS "-arch arm64 -g") else() set(OG_ASM_FUNCS_FILE kernel/asm_funcs_x86_64.asm) enable_language(ASM_NASM) diff --git a/game/common/vu.h b/game/common/vu.h index 2d47eb30a8..c85eda4b7e 100644 --- a/game/common/vu.h +++ b/game/common/vu.h @@ -1,15 +1,10 @@ #pragma once 
#include -#ifdef __aarch64__ -#include "third-party/sse2neon/sse2neon.h" -#else -#include -#endif - #include "common/common_types.h" #include "common/math/Vector.h" #include "common/util/Assert.h" +#include "common/util/simd_util.h" enum class Mask { NONE = 0, diff --git a/game/graphics/opengl_renderer/SkyBlendCPU.cpp b/game/graphics/opengl_renderer/SkyBlendCPU.cpp index 655eea8f5c..c70d47d32a 100644 --- a/game/graphics/opengl_renderer/SkyBlendCPU.cpp +++ b/game/graphics/opengl_renderer/SkyBlendCPU.cpp @@ -1,10 +1,7 @@ #include "SkyBlendCPU.h" -#ifndef __aarch64__ -#include -#endif - #include "common/util/os.h" +#include "common/util/simd_util.h" #include "game/graphics/opengl_renderer/AdgifHandler.h" diff --git a/game/graphics/opengl_renderer/background/Tie3.cpp b/game/graphics/opengl_renderer/background/Tie3.cpp index a32eab6c06..b484368322 100644 --- a/game/graphics/opengl_renderer/background/Tie3.cpp +++ b/game/graphics/opengl_renderer/background/Tie3.cpp @@ -711,7 +711,6 @@ void Tie3::envmap_second_pass_draw(const Tree& tree, void Tie3::draw_debug_window() { ImGui::Checkbox("envmap 2nd draw", &m_draw_envmap_second_draw); ImGui::SliderFloat("envmap str", &m_envmap_strength, 0, 2); - ImGui::Checkbox("Fast ToD", &m_use_fast_time_of_day); ImGui::SameLine(); ImGui::Checkbox("All Visible", &m_debug_all_visible); ImGui::Checkbox("Hide Wind", &m_hide_wind); diff --git a/game/graphics/opengl_renderer/background/Tie3.h b/game/graphics/opengl_renderer/background/Tie3.h index cd1f49793d..37082c55d0 100644 --- a/game/graphics/opengl_renderer/background/Tie3.h +++ b/game/graphics/opengl_renderer/background/Tie3.h @@ -150,7 +150,6 @@ class Tie3 : public BucketRenderer { static constexpr int TIME_OF_DAY_COLOR_COUNT = 8192; bool m_has_level = false; - bool m_use_fast_time_of_day = true; bool m_debug_all_visible = false; bool m_hide_wind = false; bool m_draw_envmap_second_draw = true; diff --git a/game/graphics/opengl_renderer/background/background_common.cpp 
b/game/graphics/opengl_renderer/background/background_common.cpp index b3f9f6ca56..df6d9c0b14 100644 --- a/game/graphics/opengl_renderer/background/background_common.cpp +++ b/game/graphics/opengl_renderer/background/background_common.cpp @@ -2,13 +2,8 @@ #include "background_common.h" -#ifdef __aarch64__ -#include "third-party/sse2neon/sse2neon.h" -#else -#include -#endif - #include "common/util/os.h" +#include "common/util/simd_util.h" #include "game/graphics/opengl_renderer/BucketRenderer.h" #include "game/graphics/pipelines/opengl.h" @@ -329,9 +324,6 @@ void interp_time_of_day_slow(const math::Vector itimes[4], void interp_time_of_day(const math::Vector itimes[4], const tfrag3::PackedTimeOfDay& packed_colors, math::Vector* out) { -#ifdef __aarch64__ - interp_time_of_day_slow(itimes, packed_colors, out); -#else math::Vector weights[8]; for (int component = 0; component < 8; component++) { int quad_idx = component / 2; @@ -483,7 +475,6 @@ void interp_time_of_day(const math::Vector itimes[4], _mm_storel_epi64((__m128i*)(&out[color_quad * 4 + 2]), result); } } -#endif } bool sphere_in_view_ref(const math::Vector4f& sphere, const math::Vector4f* planes) { diff --git a/game/graphics/opengl_renderer/foreground/Merc2.cpp b/game/graphics/opengl_renderer/foreground/Merc2.cpp index 07c7204d84..68bebd86b2 100644 --- a/game/graphics/opengl_renderer/foreground/Merc2.cpp +++ b/game/graphics/opengl_renderer/foreground/Merc2.cpp @@ -1,13 +1,8 @@ #include "Merc2.h" -#ifdef __aarch64__ -#include "third-party/sse2neon/sse2neon.h" -#else -#include -#endif - #include "common/global_profiler/GlobalProfiler.h" #include "common/util/fnv.h" +#include "common/util/simd_util.h" #include "game/graphics/opengl_renderer/EyeRenderer.h" #include "game/graphics/opengl_renderer/background/background_common.h" diff --git a/game/runtime.cpp b/game/runtime.cpp index 586466aa9a..8a5a634a29 100644 --- a/game/runtime.cpp +++ b/game/runtime.cpp @@ -27,7 +27,6 @@ #include 
"common/global_profiler/GlobalProfiler.h" #include "common/goal_constants.h" #include "common/log/log.h" -#include "common/util/FileUtil.h" #include "common/versions/versions.h" #include "game/external/discord.h" @@ -55,9 +54,6 @@ #include "game/kernel/jak3/klisten.h" #include "game/kernel/jak3/kscheme.h" #include "game/kernel/jakx/kboot.h" -#include "game/kernel/jakx/kdgo.h" -#include "game/kernel/jakx/klisten.h" -#include "game/kernel/jakx/kscheme.h" #include "game/overlord/common/fake_iso.h" #include "game/overlord/common/iso.h" #include "game/overlord/common/sbank.h" @@ -70,7 +66,6 @@ #include "game/overlord/jak1/overlord.h" #include "game/overlord/jak1/ramdisk.h" #include "game/overlord/jak1/srpc.h" -#include "game/overlord/jak1/ssound.h" #include "game/overlord/jak1/stream.h" #include "game/overlord/jak2/dma.h" #include "game/overlord/jak2/iso_cd.h" @@ -82,7 +77,6 @@ #include "game/overlord/jak2/stream.h" #include "game/overlord/jak2/streamlist.h" #include "game/overlord/jak2/vag.h" -#include "game/overlord/jak3/init.h" #include "game/overlord/jak3/overlord.h" #include "game/system/Deci2Server.h" #include "game/system/iop_thread.h" @@ -155,6 +149,13 @@ void deci2_runner(SystemThreadInterface& iface) { void ee_runner(SystemThreadInterface& iface) { prof().root_event(); // Allocate Main RAM. Must have execute enabled. 
+ // TODO Apple Silicon - You cannot make a page be RWX, + // or more specifically it can't be both writable and executable at the same time + // + // https://github.com/zherczeg/sljit/issues/99 + // + // The solution to this is to flip-flop between permissions, or perhaps have two threads + // one that has writing permission, and another with executable permission if (EE_MEM_LOW_MAP) { g_ee_main_mem = (u8*)mmap((void*)0x10000000, EE_MAIN_MEM_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE, diff --git a/game/sound/989snd/sndplay.cpp b/game/sound/989snd/sndplay.cpp index ba934983f5..8e6a587509 100644 --- a/game/sound/989snd/sndplay.cpp +++ b/game/sound/989snd/sndplay.cpp @@ -1,4 +1,3 @@ -#include #include #include diff --git a/game/system/hid/input_bindings.h b/game/system/hid/input_bindings.h index 810f26a25c..a38cd043d9 100644 --- a/game/system/hid/input_bindings.h +++ b/game/system/hid/input_bindings.h @@ -391,7 +391,7 @@ extern const InputBindingGroups DEFAULT_MOUSE_BINDS; // So there are some potential solutions but this doesn't feel high priority and this was always an // issue. 
struct CommandBinding { - enum Source { CONTROLLER, KEYBOARD, MOUSE }; + enum class Source { CONTROLLER, KEYBOARD, MOUSE }; u32 host_key; InputModifiers modifiers; diff --git a/goalc/CMakeLists.txt b/goalc/CMakeLists.txt index 20c40403e5..89b5d8a96c 100644 --- a/goalc/CMakeLists.txt +++ b/goalc/CMakeLists.txt @@ -4,6 +4,9 @@ add_library(compiler emitter/ObjectFileData.cpp emitter/ObjectGenerator.cpp emitter/Register.cpp + emitter/IGen.cpp + emitter/IGenARM64.cpp + emitter/IGenX86.cpp debugger/disassemble.cpp build_level/common/build_level.cpp build_actor/common/animation_processing.cpp @@ -66,6 +69,8 @@ add_library(compiler build_actor/jak3/build_actor.cpp debugger/Debugger.cpp debugger/DebugInfo.cpp + emitter/IGenX86.cpp + emitter/IGenARM64.cpp listener/Listener.cpp listener/MemoryMap.cpp make/MakeSystem.cpp diff --git a/goalc/compiler/CodeGenerator.cpp b/goalc/compiler/CodeGenerator.cpp index b6430987b7..cdb4260928 100644 --- a/goalc/compiler/CodeGenerator.cpp +++ b/goalc/compiler/CodeGenerator.cpp @@ -7,6 +7,7 @@ #include "CodeGenerator.h" +#include #include #include "IR.h" @@ -18,8 +19,11 @@ using namespace emitter; -CodeGenerator::CodeGenerator(FileEnv* env, DebugInfo* debug_info, GameVersion version) - : m_gen(version), m_fe(env), m_debug_info(debug_info) {} +CodeGenerator::CodeGenerator(FileEnv* env, + DebugInfo* debug_info, + GameVersion version, + InstructionSet instruction_set) + : m_gen(version, instruction_set), m_fe(env), m_debug_info(debug_info) {} /*! * Generate an object file. 
@@ -62,9 +66,21 @@ std::vector CodeGenerator::run(const TypeSystem* ts) { void CodeGenerator::do_function(FunctionEnv* env, int f_idx) { if (env->is_asm_func) { - do_asm_function(env, f_idx, env->asm_func_saved_regs); + if (m_gen.instr_set() == InstructionSet::X86) { + do_asm_function_x86(env, f_idx, env->asm_func_saved_regs); + } else if (m_gen.instr_set() == InstructionSet::ARM64) { + do_asm_function_arm64(env, f_idx, env->asm_func_saved_regs); + } else { + throw std::runtime_error("CodeGenerator::do_function, instruction set not supported"); + } } else { - do_goal_function(env, f_idx); + if (m_gen.instr_set() == InstructionSet::X86) { + do_goal_function_x86(env, f_idx); + } else if (m_gen.instr_set() == InstructionSet::ARM64) { + do_goal_function_arm64(env, f_idx); + } else { + throw std::runtime_error("CodeGenerator::do_function, instruction set not supported"); + } } } @@ -72,7 +88,7 @@ void CodeGenerator::do_function(FunctionEnv* env, int f_idx) { * Add instructions to the function, specified by index. * Generates prologues / epilogues. 
*/ -void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { +void CodeGenerator::do_goal_function_x86(FunctionEnv* env, int f_idx) { bool use_new_xmms = true; auto* debug = &m_debug_info->function_by_name(env->name()); @@ -88,7 +104,7 @@ void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { // count how many xmm's we have to backup int n_xmm_backups = 0; for (auto& saved_reg : allocs.used_saved_regs) { - if (saved_reg.is_xmm()) { + if (saved_reg.is_xmm(m_gen.instr_set())) { n_xmm_backups++; } } @@ -100,14 +116,15 @@ void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { if (n_xmm_backups > 0) { // offset the stack stack_offset += xmm_backup_stack_offset; - m_gen.add_instr_no_ir(f_rec, IGen::sub_gpr64_imm(RSP, xmm_backup_stack_offset), + m_gen.add_instr_no_ir(f_rec, IGen::sub_gpr64_imm(m_gen, RSP, xmm_backup_stack_offset), InstructionInfo::Kind::PROLOGUE); // back up xmms int i = 0; for (auto& saved_reg : allocs.used_saved_regs) { - if (saved_reg.is_xmm()) { + if (saved_reg.is_xmm(m_gen.instr_set())) { int offset = i * XMM_SIZE; - m_gen.add_instr_no_ir(f_rec, IGen::store128_xmm128_reg_offset(RSP, saved_reg, offset), + m_gen.add_instr_no_ir(f_rec, + IGen::store128_xmm128_reg_offset(m_gen, RSP, saved_reg, offset), InstructionInfo::Kind::PROLOGUE); i++; } @@ -116,10 +133,10 @@ void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { } else { // back up xmms (currently not aligned) for (auto& saved_reg : allocs.used_saved_regs) { - if (saved_reg.is_xmm()) { - m_gen.add_instr_no_ir(f_rec, IGen::sub_gpr64_imm8s(RSP, XMM_SIZE), + if (saved_reg.is_xmm(m_gen.instr_set())) { + m_gen.add_instr_no_ir(f_rec, IGen::sub_gpr64_imm8s(m_gen, RSP, XMM_SIZE), InstructionInfo::Kind::PROLOGUE); - m_gen.add_instr_no_ir(f_rec, IGen::store128_gpr64_xmm128(RSP, saved_reg), + m_gen.add_instr_no_ir(f_rec, IGen::store128_gpr64_simd128(m_gen, RSP, saved_reg), InstructionInfo::Kind::PROLOGUE); stack_offset += XMM_SIZE; } @@ -128,8 +145,9 @@ void 
CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { // back up gprs for (auto& saved_reg : allocs.used_saved_regs) { - if (saved_reg.is_gpr()) { - m_gen.add_instr_no_ir(f_rec, IGen::push_gpr64(saved_reg), InstructionInfo::Kind::PROLOGUE); + if (saved_reg.is_gpr(m_gen.instr_set())) { + m_gen.add_instr_no_ir(f_rec, IGen::push_gpr64(m_gen, saved_reg), + InstructionInfo::Kind::PROLOGUE); stack_offset += GPR_SIZE; } } @@ -152,7 +170,7 @@ void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { } else { // otherwise to an extra push, and remember so we can do an extra pop later on. bonus_push = true; - m_gen.add_instr_no_ir(f_rec, IGen::push_gpr64(ri.get_saved_gpr(0)), + m_gen.add_instr_no_ir(f_rec, IGen::push_gpr64(m_gen, ri.get_saved_gpr(0)), InstructionInfo::Kind::PROLOGUE); } stack_offset += 8; @@ -162,7 +180,7 @@ void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { // do manual stack offset. if (manually_added_stack_offset) { - m_gen.add_instr_no_ir(f_rec, IGen::sub_gpr64_imm(RSP, manually_added_stack_offset), + m_gen.add_instr_no_ir(f_rec, IGen::sub_gpr64_imm(m_gen, RSP, manually_added_stack_offset), InstructionInfo::Kind::PROLOGUE); } } @@ -178,20 +196,20 @@ void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { auto& bonus = allocs.stack_ops.at(ir_idx); for (auto& op : bonus.ops) { if (op.load) { - if (op.reg.is_gpr() && op.reg_class == RegClass::GPR_64) { + if (op.reg.is_gpr(m_gen.instr_set()) && op.reg_class == RegClass::GPR_64) { // todo, s8 or 0 offset if possible? 
m_gen.add_instr(IGen::load64_gpr64_plus_s32( - op.reg, allocs.get_slot_for_spill(op.slot) * GPR_SIZE, RSP), + m_gen, op.reg, allocs.get_slot_for_spill(op.slot) * GPR_SIZE, RSP), i_rec); - } else if (op.reg.is_xmm() && op.reg_class == RegClass::FLOAT) { + } else if (op.reg.is_xmm(m_gen.instr_set()) && op.reg_class == RegClass::FLOAT) { // load xmm32 off of the stack m_gen.add_instr(IGen::load_reg_offset_xmm32( - op.reg, RSP, allocs.get_slot_for_spill(op.slot) * GPR_SIZE), + m_gen, op.reg, RSP, allocs.get_slot_for_spill(op.slot) * GPR_SIZE), i_rec); - } else if (op.reg.is_xmm() && + } else if (op.reg.is_xmm(m_gen.instr_set()) && (op.reg_class == RegClass::VECTOR_FLOAT || op.reg_class == RegClass::INT_128)) { m_gen.add_instr(IGen::load128_xmm128_reg_offset( - op.reg, RSP, allocs.get_slot_for_spill(op.slot) * GPR_SIZE), + m_gen, op.reg, RSP, allocs.get_slot_for_spill(op.slot) * GPR_SIZE), i_rec); } else { ASSERT(false); @@ -200,25 +218,25 @@ void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { } // do the actual op - ir->do_codegen(&m_gen, allocs, i_rec); + ir->do_codegen_x86(&m_gen, allocs, i_rec); // store things back on the stack if needed. for (auto& op : bonus.ops) { if (op.store) { - if (op.reg.is_gpr() && op.reg_class == RegClass::GPR_64) { + if (op.reg.is_gpr(m_gen.instr_set()) && op.reg_class == RegClass::GPR_64) { // todo, s8 or 0 offset if possible? 
m_gen.add_instr(IGen::store64_gpr64_plus_s32( - RSP, allocs.get_slot_for_spill(op.slot) * GPR_SIZE, op.reg), + m_gen, RSP, allocs.get_slot_for_spill(op.slot) * GPR_SIZE, op.reg), i_rec); - } else if (op.reg.is_xmm() && op.reg_class == RegClass::FLOAT) { + } else if (op.reg.is_xmm(m_gen.instr_set()) && op.reg_class == RegClass::FLOAT) { // store xmm32 on the stack m_gen.add_instr(IGen::store_reg_offset_xmm32( - RSP, op.reg, allocs.get_slot_for_spill(op.slot) * GPR_SIZE), + m_gen, RSP, op.reg, allocs.get_slot_for_spill(op.slot) * GPR_SIZE), i_rec); - } else if (op.reg.is_xmm() && + } else if (op.reg.is_xmm(m_gen.instr_set()) && (op.reg_class == RegClass::VECTOR_FLOAT || op.reg_class == RegClass::INT_128)) { m_gen.add_instr(IGen::store128_xmm128_reg_offset( - RSP, op.reg, allocs.get_slot_for_spill(op.slot) * GPR_SIZE), + m_gen, RSP, op.reg, allocs.get_slot_for_spill(op.slot) * GPR_SIZE), i_rec); } else { ASSERT(false); @@ -231,21 +249,22 @@ void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { if (manually_added_stack_offset || allocs.needs_aligned_stack_for_spills || env->needs_aligned_stack()) { if (manually_added_stack_offset) { - m_gen.add_instr_no_ir(f_rec, IGen::add_gpr64_imm(RSP, manually_added_stack_offset), + m_gen.add_instr_no_ir(f_rec, IGen::add_gpr64_imm(m_gen, RSP, manually_added_stack_offset), InstructionInfo::Kind::EPILOGUE); } if (bonus_push) { ASSERT(!manually_added_stack_offset); - m_gen.add_instr_no_ir(f_rec, IGen::pop_gpr64(ri.get_saved_gpr(0)), + m_gen.add_instr_no_ir(f_rec, IGen::pop_gpr64(m_gen, ri.get_saved_gpr(0)), InstructionInfo::Kind::EPILOGUE); } } for (int i = int(allocs.used_saved_regs.size()); i-- > 0;) { auto& saved_reg = allocs.used_saved_regs.at(i); - if (saved_reg.is_gpr()) { - m_gen.add_instr_no_ir(f_rec, IGen::pop_gpr64(saved_reg), InstructionInfo::Kind::EPILOGUE); + if (saved_reg.is_gpr(m_gen.instr_set())) { + m_gen.add_instr_no_ir(f_rec, IGen::pop_gpr64(m_gen, saved_reg), + InstructionInfo::Kind::EPILOGUE); } } @@ 
-254,33 +273,38 @@ void CodeGenerator::do_goal_function(FunctionEnv* env, int f_idx) { int j = n_xmm_backups; for (int i = int(allocs.used_saved_regs.size()); i-- > 0;) { auto& saved_reg = allocs.used_saved_regs.at(i); - if (saved_reg.is_xmm()) { + if (saved_reg.is_xmm(m_gen.instr_set())) { j--; int offset = j * XMM_SIZE; - m_gen.add_instr_no_ir(f_rec, IGen::load128_xmm128_reg_offset(saved_reg, RSP, offset), + m_gen.add_instr_no_ir(f_rec, + IGen::load128_xmm128_reg_offset(m_gen, saved_reg, RSP, offset), InstructionInfo::Kind::EPILOGUE); } } ASSERT(j == 0); - m_gen.add_instr_no_ir(f_rec, IGen::add_gpr64_imm(RSP, xmm_backup_stack_offset), + m_gen.add_instr_no_ir(f_rec, IGen::add_gpr64_imm(m_gen, RSP, xmm_backup_stack_offset), InstructionInfo::Kind::EPILOGUE); } } else { for (int i = int(allocs.used_saved_regs.size()); i-- > 0;) { auto& saved_reg = allocs.used_saved_regs.at(i); - if (saved_reg.is_xmm()) { - m_gen.add_instr_no_ir(f_rec, IGen::load128_xmm128_gpr64(saved_reg, RSP), + if (saved_reg.is_xmm(m_gen.instr_set())) { + m_gen.add_instr_no_ir(f_rec, IGen::load128_simd128_gpr64(m_gen, saved_reg, RSP), InstructionInfo::Kind::EPILOGUE); - m_gen.add_instr_no_ir(f_rec, IGen::add_gpr64_imm8s(RSP, XMM_SIZE), + m_gen.add_instr_no_ir(f_rec, IGen::add_gpr64_imm8s(m_gen, RSP, XMM_SIZE), InstructionInfo::Kind::EPILOGUE); } } } - m_gen.add_instr_no_ir(f_rec, IGen::ret(), InstructionInfo::Kind::EPILOGUE); + m_gen.add_instr_no_ir(f_rec, IGen::ret(m_gen), InstructionInfo::Kind::EPILOGUE); } -void CodeGenerator::do_asm_function(FunctionEnv* env, int f_idx, bool allow_saved_regs) { +void CodeGenerator::do_goal_function_arm64(FunctionEnv* env, int f_idx) { + throw std::runtime_error("NYI - CodeGenerator::do_goal_function_arm64"); +} + +void CodeGenerator::do_asm_function_x86(FunctionEnv* env, int f_idx, bool allow_saved_regs) { auto f_rec = m_gen.get_existing_function_record(f_idx); const auto& allocs = env->alloc_result(); @@ -316,6 +340,10 @@ void 
CodeGenerator::do_asm_function(FunctionEnv* env, int f_idx, bool allow_save } // do the actual op - ir->do_codegen(&m_gen, allocs, i_rec); + ir->do_codegen_x86(&m_gen, allocs, i_rec); } } + +void CodeGenerator::do_asm_function_arm64(FunctionEnv* env, int f_idx, bool allow_saved_regs) { + throw std::runtime_error("NYI - CodeGenerator::do_asm_function"); +} diff --git a/goalc/compiler/CodeGenerator.h b/goalc/compiler/CodeGenerator.h index 24f380688f..f7e06690af 100644 --- a/goalc/compiler/CodeGenerator.h +++ b/goalc/compiler/CodeGenerator.h @@ -18,14 +18,19 @@ class TypeSystem; class CodeGenerator { public: - CodeGenerator(FileEnv* env, DebugInfo* debug_info, GameVersion version); + CodeGenerator(FileEnv* env, + DebugInfo* debug_info, + GameVersion version, + emitter::InstructionSet instruction_set); std::vector run(const TypeSystem* ts); emitter::ObjectGeneratorStats get_obj_stats() const { return m_gen.get_stats(); } private: void do_function(FunctionEnv* env, int f_idx); - void do_goal_function(FunctionEnv* env, int f_idx); - void do_asm_function(FunctionEnv* env, int f_idx, bool allow_saved_regs); + void do_goal_function_x86(FunctionEnv* env, int f_idx); + void do_goal_function_arm64(FunctionEnv* env, int f_idx); + void do_asm_function_x86(FunctionEnv* env, int f_idx, bool allow_saved_regs); + void do_asm_function_arm64(FunctionEnv* env, int f_idx, bool allow_saved_regs); emitter::ObjectGenerator m_gen; FileEnv* m_fe = nullptr; DebugInfo* m_debug_info = nullptr; diff --git a/goalc/compiler/Compiler.cpp b/goalc/compiler/Compiler.cpp index 88587a34a6..e61c302f7e 100644 --- a/goalc/compiler/Compiler.cpp +++ b/goalc/compiler/Compiler.cpp @@ -10,6 +10,8 @@ #include "common/link_types.h" #include "common/util/FileUtil.h" +#include "goalc/compiler/CodeGenerator.h" +#include "goalc/emitter/InstructionSet.h" #include "goalc/make/Tools.h" #include "goalc/regalloc/Allocator.h" #include "goalc/regalloc/Allocator_v2.h" @@ -19,10 +21,12 @@ using namespace goos; 
Compiler::Compiler(GameVersion version, + emitter::InstructionSet instr_set, const std::optional repl_config, const std::string& user_profile, std::unique_ptr repl) : m_version(version), + m_instr_set(instr_set), m_goos(user_profile), m_debugger(&m_listener, &m_goos.reader, version), m_make(repl_config, user_profile), @@ -307,7 +311,7 @@ std::vector Compiler::codegen_object_file(FileEnv* env) { try { auto debug_info = &m_debugger.get_debug_info_for_object(env->name()); debug_info->clear(); - CodeGenerator gen(env, debug_info, m_version); + CodeGenerator gen(env, debug_info, m_version, m_instr_set); bool ok = true; auto result = gen.run(&m_ts); for (auto& f : env->functions()) { @@ -331,7 +335,7 @@ bool Compiler::codegen_and_disassemble_object_file(FileEnv* env, bool omit_ir) { auto debug_info = &m_debugger.get_debug_info_for_object(env->name()); debug_info->clear(); - CodeGenerator gen(env, debug_info, m_version); + CodeGenerator gen(env, debug_info, m_version, m_instr_set); *data_out = gen.run(&m_ts); bool ok = true; *asm_out = debug_info->disassemble_all_functions(&ok, &m_goos.reader, omit_ir); diff --git a/goalc/compiler/Compiler.h b/goalc/compiler/Compiler.h index 859a74e110..06a8e6475c 100644 --- a/goalc/compiler/Compiler.h +++ b/goalc/compiler/Compiler.h @@ -16,6 +16,7 @@ #include "goalc/compiler/symbol_info.h" #include "goalc/data_compiler/game_text_common.h" #include "goalc/debugger/Debugger.h" +#include "goalc/emitter/InstructionSet.h" #include "goalc/emitter/Register.h" #include "goalc/listener/Listener.h" #include "goalc/make/MakeSystem.h" @@ -46,6 +47,7 @@ struct GlobalConstantInfo { class Compiler { public: Compiler(GameVersion version, + emitter::InstructionSet instr_set, const std::optional repl_config = {}, const std::string& user_profile = "#f", std::unique_ptr repl = nullptr); @@ -118,6 +120,7 @@ class Compiler { private: GameVersion m_version; + emitter::InstructionSet m_instr_set; TypeSystem m_ts; std::unique_ptr m_global_env = nullptr; 
std::unique_ptr m_none = nullptr; diff --git a/goalc/compiler/IR.cpp b/goalc/compiler/IR.cpp index e83abfc15f..c0d9cf38ac 100644 --- a/goalc/compiler/IR.cpp +++ b/goalc/compiler/IR.cpp @@ -4,10 +4,14 @@ #include "common/symbols.h" +#include "goalc/compiler/Env.h" #include "goalc/emitter/IGen.h" #include "fmt/format.h" +// TODO ARM64 - just silencing errors while things are not implemented obviously +#pragma GCC diagnostic ignored "-Wunused-parameter" + using namespace emitter; namespace { Register get_reg(const RegVal* rv, const AllocationResult& allocs, emitter::IR_Record irec) { @@ -71,20 +75,20 @@ void load_constant(u64 value, Register dest_reg) { s64 svalue = value; if (svalue == 0) { - gen->add_instr(IGen::xor_gpr64_gpr64(dest_reg, dest_reg), irec); + gen->add_instr(IGen::xor_gpr64_gpr64(*gen, dest_reg, dest_reg), irec); } else if (svalue > 0) { if (svalue < UINT32_MAX) { - gen->add_instr(IGen::mov_gpr64_u32(dest_reg, value), irec); + gen->add_instr(IGen::mov_gpr64_u32(*gen, dest_reg, value), irec); } else { // need a real 64 bit load - gen->add_instr(IGen::mov_gpr64_u64(dest_reg, value), irec); + gen->add_instr(IGen::mov_gpr64_u64(*gen, dest_reg, value), irec); } } else { if (svalue >= INT32_MIN) { - gen->add_instr(IGen::mov_gpr64_s32(dest_reg, svalue), irec); + gen->add_instr(IGen::mov_gpr64_s32(*gen, dest_reg, svalue), irec); } else { // need a real 64 bit load - gen->add_instr(IGen::mov_gpr64_u64(dest_reg, value), irec); + gen->add_instr(IGen::mov_gpr64_u64(*gen, dest_reg, value), irec); } } } @@ -107,42 +111,42 @@ void regset_common(emitter::ObjectGenerator* gen, if (src_reg == dst_reg) { // eliminate move gen->count_eliminated_move(); - gen->add_instr(IGen::null(), irec); + gen->add_instr(IGen::null(*gen), irec); } else { - gen->add_instr(IGen::mov_gpr64_gpr64(dst_reg, src_reg), irec); + gen->add_instr(IGen::mov_gpr64_gpr64(*gen, dst_reg, src_reg), irec); } } else if (src_class == RegClass::FLOAT && dst_class == RegClass::FLOAT) { if (src_reg == dst_reg) 
{ // eliminate move gen->count_eliminated_move(); - gen->add_instr(IGen::null(), irec); + gen->add_instr(IGen::null(*gen), irec); } else { - gen->add_instr(IGen::mov_xmm32_xmm32(dst_reg, src_reg), irec); + gen->add_instr(IGen::mov_xmm32_xmm32(*gen, dst_reg, src_reg), irec); } } else if (src_is_xmm128 && dst_is_xmm128) { if (src_reg == dst_reg) { // eliminate move gen->count_eliminated_move(); - gen->add_instr(IGen::null(), irec); + gen->add_instr(IGen::null(*gen), irec); } else { - gen->add_instr(IGen::mov_vf_vf(dst_reg, src_reg), irec); + gen->add_instr(IGen::mov_vf_vf(*gen, dst_reg, src_reg), irec); } } else if (src_class == RegClass::FLOAT && dst_class == RegClass::GPR_64) { // xmm 1x -> gpr - gen->add_instr(IGen::movd_gpr32_xmm32(dst_reg, src_reg), irec); + gen->add_instr(IGen::movd_gpr32_xmm32(*gen, dst_reg, src_reg), irec); // don't forget to sign extend - gen->add_instr(IGen::movsx_r64_r32(dst_reg, dst_reg), irec); + gen->add_instr(IGen::movsx_r64_r32(*gen, dst_reg, dst_reg), irec); } else if (src_class == RegClass::GPR_64 && dst_class == RegClass::FLOAT) { // gpr -> xmm 1x - gen->add_instr(IGen::movd_xmm32_gpr32(dst_reg, src_reg), irec); + gen->add_instr(IGen::movd_xmm32_gpr32(*gen, dst_reg, src_reg), irec); } else if (src_is_xmm128 && dst_class == RegClass::FLOAT) { - gen->add_instr(IGen::mov_xmm32_xmm32(dst_reg, src_reg), irec); + gen->add_instr(IGen::mov_xmm32_xmm32(*gen, dst_reg, src_reg), irec); } else if (src_class == RegClass::FLOAT && dst_is_xmm128) { - gen->add_instr(IGen::mov_xmm32_xmm32(dst_reg, src_reg), irec); + gen->add_instr(IGen::mov_xmm32_xmm32(*gen, dst_reg, src_reg), irec); } else if (src_class == RegClass::GPR_64 && dst_is_xmm128) { - gen->add_instr(IGen::movq_xmm64_gpr64(dst_reg, src_reg), irec); + gen->add_instr(IGen::movq_xmm64_gpr64(*gen, dst_reg, src_reg), irec); } else if (src_is_xmm128 && dst_class == RegClass::GPR_64) { - gen->add_instr(IGen::movq_gpr64_xmm64(dst_reg, src_reg), irec); + gen->add_instr(IGen::movq_gpr64_xmm64(*gen, 
dst_reg, src_reg), irec); } else { ASSERT(false); // unhandled move. } @@ -180,20 +184,26 @@ void IR_Return::add_constraints(std::vector* constraints, int my constraints->push_back(c); } -void IR_Return::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_Return::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto val_reg = get_reg(m_value, allocs, irec); auto dest_reg = get_reg(m_return_reg, allocs, irec); if (val_reg == dest_reg) { - gen->add_instr(IGen::null(), irec); + gen->add_instr(IGen::null(*gen), irec); } else { regset_common(gen, allocs, irec, m_return_reg, m_value, true); // gen->add_instr(IGen::mov_gpr64_gpr64(dest_reg, val_reg), irec); } } +void IR_Return::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_Return::do_codegen_arm64"); +} + ///////////////////// // LoadConstant64 ///////////////////// @@ -210,13 +220,19 @@ RegAllocInstr IR_LoadConstant64::to_rai() { return rai; } -void IR_LoadConstant64::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_LoadConstant64::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dest_reg = get_reg(m_dest, allocs, irec); load_constant(m_value, gen, irec, dest_reg); } +void IR_LoadConstant64::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_LoadConstant64::do_codegen_arm64"); +} + ///////////////////// // LoadSymbolPointer ///////////////////// @@ -233,32 +249,38 @@ RegAllocInstr IR_LoadSymbolPointer::to_rai() { return rai; } -void IR_LoadSymbolPointer::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void 
IR_LoadSymbolPointer::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dest_reg = get_reg(m_dest, allocs, irec); if (m_name == "#f") { static_assert(false_symbol_offset() == 0, "false symbol location"); - if (dest_reg.is_xmm()) { - gen->add_instr(IGen::movq_xmm64_gpr64(dest_reg, gRegInfo.get_st_reg()), irec); + if (dest_reg.is_xmm(gen->instr_set())) { + gen->add_instr(IGen::movq_xmm64_gpr64(*gen, dest_reg, gRegInfo.get_st_reg()), irec); } else { - gen->add_instr(IGen::mov_gpr64_gpr64(dest_reg, gRegInfo.get_st_reg()), irec); + gen->add_instr(IGen::mov_gpr64_gpr64(*gen, dest_reg, gRegInfo.get_st_reg()), irec); } } else if (m_name == "#t") { - gen->add_instr(IGen::lea_reg_plus_off8(dest_reg, gRegInfo.get_st_reg(), + gen->add_instr(IGen::lea_reg_plus_off8(*gen, dest_reg, gRegInfo.get_st_reg(), true_symbol_offset(gen->version())), irec); } else if (m_name == "_empty_") { - gen->add_instr(IGen::lea_reg_plus_off8(dest_reg, gRegInfo.get_st_reg(), + gen->add_instr(IGen::lea_reg_plus_off8(*gen, dest_reg, gRegInfo.get_st_reg(), empty_pair_offset_from_s7(gen->version())), irec); } else { - auto instr = - gen->add_instr(IGen::lea_reg_plus_off32(dest_reg, gRegInfo.get_st_reg(), 0x0afecafe), irec); + auto instr = gen->add_instr( + IGen::lea_reg_plus_off32(*gen, dest_reg, gRegInfo.get_st_reg(), 0x0afecafe), irec); gen->link_instruction_symbol_ptr(instr, m_name); } } +void IR_LoadSymbolPointer::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_LoadSymbolPointer::do_codegen_arm64"); +} + ///////////////////// // SetSymbolValue ///////////////////// @@ -276,17 +298,23 @@ RegAllocInstr IR_SetSymbolValue::to_rai() { return rai; } -void IR_SetSymbolValue::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_SetSymbolValue::do_codegen_x86(emitter::ObjectGenerator* 
gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto src_reg = get_reg(m_src, allocs, irec); auto instr = gen->add_instr( IGen::store32_gpr64_gpr64_plus_gpr64_plus_s32( - gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), src_reg, LINK_SYM_NO_OFFSET_FLAG), + *gen, gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), src_reg, LINK_SYM_NO_OFFSET_FLAG), irec); gen->link_instruction_symbol_mem(instr, m_dest->name()); } +void IR_SetSymbolValue::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_SetSymbolValue::do_codegen_arm64"); +} + ///////////////////// // GetSymbolValue ///////////////////// @@ -304,25 +332,31 @@ RegAllocInstr IR_GetSymbolValue::to_rai() { return rai; } -void IR_GetSymbolValue::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_GetSymbolValue::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst_reg = get_reg(m_dest, allocs, irec); if (m_sext) { - auto instr = gen->add_instr( - IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32( - dst_reg, gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), LINK_SYM_NO_OFFSET_FLAG), - irec); + auto instr = gen->add_instr(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32( + *gen, dst_reg, gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), + LINK_SYM_NO_OFFSET_FLAG), + irec); gen->link_instruction_symbol_mem(instr, m_src->name()); } else { - auto instr = gen->add_instr( - IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32( - dst_reg, gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), LINK_SYM_NO_OFFSET_FLAG), - irec); + auto instr = gen->add_instr(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32( + *gen, dst_reg, gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), + LINK_SYM_NO_OFFSET_FLAG), + irec); gen->link_instruction_symbol_mem(instr, m_src->name()); } } +void 
IR_GetSymbolValue::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_GetSymbolValue::do_codegen_arm64"); +} + ///////////////////// // RegSet ///////////////////// @@ -339,12 +373,18 @@ RegAllocInstr IR_RegSet::to_rai() { return rai; } -void IR_RegSet::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_RegSet::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { regset_common(gen, allocs, irec, m_dest, m_src, true); } +void IR_RegSet::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_RegSet::do_codegen_arm64"); +} + std::string IR_RegSet::print() { return fmt::format("mov {}, {}", m_dest->print(), m_src->print()); } @@ -373,14 +413,20 @@ RegAllocInstr IR_GotoLabel::to_rai() { return rai; } -void IR_GotoLabel::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_GotoLabel::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { (void)allocs; - auto instr = gen->add_instr(IGen::jmp_32(), irec); + auto instr = gen->add_instr(IGen::jmp_32(*gen), irec); gen->link_instruction_jump(instr, gen->get_future_ir_record_in_same_func(irec, m_dest->idx)); } +void IR_GotoLabel::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_GotoLabel::do_codegen_arm64"); +} + void IR_GotoLabel::resolve(const Label* dest) { ASSERT(!m_resolved); m_dest = dest; @@ -449,15 +495,21 @@ void IR_FunctionCall::add_constraints(std::vector* constraints, } } -void IR_FunctionCall::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void 
IR_FunctionCall::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto freg = get_reg(m_func, allocs, irec); - gen->add_instr(IGen::add_gpr64_gpr64(freg, emitter::gRegInfo.get_offset_reg()), irec); - gen->add_instr(IGen::call_r64(freg), irec); + gen->add_instr(IGen::add_gpr64_gpr64(*gen, freg, emitter::gRegInfo.get_offset_reg()), irec); + gen->add_instr(IGen::call_r64(*gen, freg), irec); // todo, can we do a sub to undo the modification to the register? does that actually work? } +void IR_FunctionCall::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_FunctionCall::do_codegen_arm64"); +} + ///////////////////// // RegValAddr ///////////////////// @@ -475,15 +527,21 @@ RegAllocInstr IR_RegValAddr::to_rai() { return rai; } -void IR_RegValAddr::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_RegValAddr::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { int stack_offset = get_stack_offset(m_src, allocs); auto dst = get_reg(m_dest, allocs, irec); // x86 pointer to var - gen->add_instr(IGen::lea_reg_plus_off(dst, RSP, stack_offset), irec); + gen->add_instr(IGen::lea_reg_plus_off(*gen, dst, RSP, stack_offset), irec); // x86 -> GOAL pointer - gen->add_instr(IGen::sub_gpr64_gpr64(dst, emitter::gRegInfo.get_offset_reg()), irec); + gen->add_instr(IGen::sub_gpr64_gpr64(*gen, dst, emitter::gRegInfo.get_offset_reg()), irec); +} + +void IR_RegValAddr::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_RegValAddr::do_codegen_arm64"); } ///////////////////// @@ -503,13 +561,19 @@ RegAllocInstr IR_StaticVarAddr::to_rai() { return rai; } -void IR_StaticVarAddr::do_codegen(emitter::ObjectGenerator* gen, - const 
AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_StaticVarAddr::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dr = get_reg(m_dest, allocs, irec); - auto instr = gen->add_instr(IGen::static_addr(dr, 0), irec); + auto instr = gen->add_instr(IGen::static_addr(*gen, dr, 0), irec); gen->link_instruction_static(instr, m_src->rec, m_src->get_addr_offset()); - gen->add_instr(IGen::sub_gpr64_gpr64(dr, emitter::gRegInfo.get_offset_reg()), irec); + gen->add_instr(IGen::sub_gpr64_gpr64(*gen, dr, emitter::gRegInfo.get_offset_reg()), irec); +} + +void IR_StaticVarAddr::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_StaticVarAddr::do_codegen_arm64"); } ///////////////////// @@ -528,13 +592,19 @@ RegAllocInstr IR_FunctionAddr::to_rai() { return rai; } -void IR_FunctionAddr::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_FunctionAddr::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dr = get_reg(m_dest, allocs, irec); - auto instr = gen->add_instr(IGen::static_addr(dr, 0), irec); + auto instr = gen->add_instr(IGen::static_addr(*gen, dr, 0), irec); gen->link_instruction_to_function(instr, gen->get_existing_function_record(m_src->idx_in_file)); - gen->add_instr(IGen::sub_gpr64_gpr64(dr, emitter::gRegInfo.get_offset_reg()), irec); + gen->add_instr(IGen::sub_gpr64_gpr64(*gen, dr, emitter::gRegInfo.get_offset_reg()), irec); +} + +void IR_FunctionAddr::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_FunctionAddr::do_codegen_arm64"); } ///////////////////// @@ -607,97 +677,108 @@ RegAllocInstr IR_IntegerMath::to_rai() { return rai; } -void 
IR_IntegerMath::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_IntegerMath::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { switch (m_kind) { case IntegerMathKind::ADD_64: gen->add_instr( - IGen::add_gpr64_gpr64(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::add_gpr64_gpr64(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case IntegerMathKind::SUB_64: gen->add_instr( - IGen::sub_gpr64_gpr64(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::sub_gpr64_gpr64(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case IntegerMathKind::AND_64: gen->add_instr( - IGen::and_gpr64_gpr64(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::and_gpr64_gpr64(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case IntegerMathKind::OR_64: gen->add_instr( - IGen::or_gpr64_gpr64(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::or_gpr64_gpr64(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case IntegerMathKind::XOR_64: gen->add_instr( - IGen::xor_gpr64_gpr64(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::xor_gpr64_gpr64(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case IntegerMathKind::NOT_64: - gen->add_instr(IGen::not_gpr64(get_reg(m_dest, allocs, irec)), irec); + gen->add_instr(IGen::not_gpr64(*gen, get_reg(m_dest, allocs, irec)), irec); ASSERT(!m_arg); break; case IntegerMathKind::SHLV_64: - gen->add_instr(IGen::shl_gpr64_cl(get_reg(m_dest, allocs, irec)), irec); + gen->add_instr(IGen::shl_gpr64_cl(*gen, get_reg(m_dest, allocs, irec)), irec); ASSERT(get_reg(m_arg, allocs, irec) == emitter::RCX); break; case IntegerMathKind::SHRV_64: - 
gen->add_instr(IGen::shr_gpr64_cl(get_reg(m_dest, allocs, irec)), irec); + gen->add_instr(IGen::shr_gpr64_cl(*gen, get_reg(m_dest, allocs, irec)), irec); ASSERT(get_reg(m_arg, allocs, irec) == emitter::RCX); break; case IntegerMathKind::SARV_64: - gen->add_instr(IGen::sar_gpr64_cl(get_reg(m_dest, allocs, irec)), irec); + gen->add_instr(IGen::sar_gpr64_cl(*gen, get_reg(m_dest, allocs, irec)), irec); ASSERT(get_reg(m_arg, allocs, irec) == emitter::RCX); break; case IntegerMathKind::SHL_64: - gen->add_instr(IGen::shl_gpr64_u8(get_reg(m_dest, allocs, irec), m_shift_amount), irec); + gen->add_instr(IGen::shl_gpr64_u8(*gen, get_reg(m_dest, allocs, irec), m_shift_amount), irec); break; case IntegerMathKind::SHR_64: - gen->add_instr(IGen::shr_gpr64_u8(get_reg(m_dest, allocs, irec), m_shift_amount), irec); + gen->add_instr(IGen::shr_gpr64_u8(*gen, get_reg(m_dest, allocs, irec), m_shift_amount), irec); break; case IntegerMathKind::SAR_64: - gen->add_instr(IGen::sar_gpr64_u8(get_reg(m_dest, allocs, irec), m_shift_amount), irec); + gen->add_instr(IGen::sar_gpr64_u8(*gen, get_reg(m_dest, allocs, irec), m_shift_amount), irec); break; case IntegerMathKind::IMUL_32: { // just a 32-bit multiply, signed/unsigned doesn't affect lower 32 bits of result. auto dr = get_reg(m_dest, allocs, irec); - gen->add_instr(IGen::imul_gpr32_gpr32(dr, get_reg(m_arg, allocs, irec)), irec); + gen->add_instr(IGen::imul_gpr32_gpr32(*gen, dr, get_reg(m_arg, allocs, irec)), irec); // the PS2 sign extends the result even if we used multu. We replicate this here. 
- gen->add_instr(IGen::movsx_r64_r32(dr, dr), irec); + gen->add_instr(IGen::movsx_r64_r32(*gen, dr, dr), irec); } break; case IntegerMathKind::IMUL_64: { auto dr = get_reg(m_dest, allocs, irec); - gen->add_instr(IGen::imul_gpr64_gpr64(dr, get_reg(m_arg, allocs, irec)), irec); + gen->add_instr(IGen::imul_gpr64_gpr64(*gen, dr, get_reg(m_arg, allocs, irec)), irec); } break; case IntegerMathKind::IDIV_32: { - gen->add_instr(IGen::cdq(), irec); - gen->add_instr(IGen::idiv_gpr32(get_reg(m_arg, allocs, irec)), irec); - gen->add_instr(IGen::movsx_r64_r32(get_reg(m_dest, allocs, irec), emitter::RAX), irec); + gen->add_instr(IGen::cdq(*gen), irec); + gen->add_instr(IGen::idiv_gpr32(*gen, get_reg(m_arg, allocs, irec)), irec); + gen->add_instr(IGen::movsx_r64_r32(*gen, get_reg(m_dest, allocs, irec), emitter::RAX), irec); } break; case IntegerMathKind::UDIV_32: { // zero extend, not sign extend to avoid overflow - gen->add_instr(IGen::xor_gpr64_gpr64(Register(RDX), Register(RDX)), irec); - gen->add_instr(IGen::unsigned_div_gpr32(get_reg(m_arg, allocs, irec)), irec); + gen->add_instr(IGen::xor_gpr64_gpr64(*gen, Register(RDX), Register(RDX)), irec); + gen->add_instr(IGen::unsigned_div_gpr32(*gen, get_reg(m_arg, allocs, irec)), irec); // note: this probably needs hardware testing to know for sure if the PS2 actually sign // extends here or not. Nothing seems to break either way, and PCSX2/Dobie interpreters both // sign extend, so that seems like the safest option. 
- gen->add_instr(IGen::movsx_r64_r32(get_reg(m_dest, allocs, irec), emitter::RAX), irec); + gen->add_instr(IGen::movsx_r64_r32(*gen, get_reg(m_dest, allocs, irec), emitter::RAX), irec); } break; case IntegerMathKind::IMOD_32: { - gen->add_instr(IGen::cdq(), irec); - gen->add_instr(IGen::idiv_gpr32(get_reg(m_arg, allocs, irec)), irec); - gen->add_instr(IGen::movsx_r64_r32(get_reg(m_dest, allocs, irec), emitter::RDX), irec); + gen->add_instr(IGen::cdq(*gen), irec); + gen->add_instr(IGen::idiv_gpr32(*gen, get_reg(m_arg, allocs, irec)), irec); + gen->add_instr(IGen::movsx_r64_r32(*gen, get_reg(m_dest, allocs, irec), emitter::RDX), irec); } break; case IntegerMathKind::UMOD_32: { // zero extend, not sign extend to avoid overflow - gen->add_instr(IGen::xor_gpr64_gpr64(Register(RDX), Register(RDX)), irec); - gen->add_instr(IGen::unsigned_div_gpr32(get_reg(m_arg, allocs, irec)), irec); + gen->add_instr(IGen::xor_gpr64_gpr64(*gen, Register(RDX), Register(RDX)), irec); + gen->add_instr(IGen::unsigned_div_gpr32(*gen, get_reg(m_arg, allocs, irec)), irec); // see note on udiv, same applies here. 
- gen->add_instr(IGen::movsx_r64_r32(get_reg(m_dest, allocs, irec), emitter::RDX), irec); + gen->add_instr(IGen::movsx_r64_r32(*gen, get_reg(m_dest, allocs, irec), emitter::RDX), irec); } break; default: ASSERT(false); } } +void IR_IntegerMath::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_IntegerMath::do_codegen_arm64"); +} + ///////////////////// // FloatMath ///////////////////// @@ -736,43 +817,55 @@ RegAllocInstr IR_FloatMath::to_rai() { return rai; } -void IR_FloatMath::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_FloatMath::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { switch (m_kind) { case FloatMathKind::DIV_SS: gen->add_instr( - IGen::divss_xmm_xmm(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::divss_xmm_xmm(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case FloatMathKind::MUL_SS: gen->add_instr( - IGen::mulss_xmm_xmm(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::mulss_xmm_xmm(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case FloatMathKind::ADD_SS: gen->add_instr( - IGen::addss_xmm_xmm(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::addss_xmm_xmm(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case FloatMathKind::SUB_SS: gen->add_instr( - IGen::subss_xmm_xmm(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::subss_xmm_xmm(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case FloatMathKind::MAX_SS: gen->add_instr( - IGen::maxss_xmm_xmm(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::maxss_xmm_xmm(*gen, get_reg(m_dest, allocs, irec), 
get_reg(m_arg, allocs, irec)), + irec); break; case FloatMathKind::MIN_SS: gen->add_instr( - IGen::minss_xmm_xmm(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); + IGen::minss_xmm_xmm(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), + irec); break; case FloatMathKind::SQRT_SS: - gen->add_instr(IGen::sqrts_xmm(get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), - irec); + gen->add_instr( + IGen::sqrts_xmm(*gen, get_reg(m_dest, allocs, irec), get_reg(m_arg, allocs, irec)), irec); break; default: ASSERT(false); } } +void IR_FloatMath::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_FloatMath::do_codegen_arm64"); +} + ///////////////////// // StaticVarLoad ///////////////////// @@ -790,9 +883,9 @@ RegAllocInstr IR_StaticVarLoad::to_rai() { return rai; } -void IR_StaticVarLoad::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_StaticVarLoad::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto load_info = m_src->get_load_info(); ASSERT(m_src->get_addr_offset() == 0); @@ -801,18 +894,26 @@ void IR_StaticVarLoad::do_codegen(emitter::ObjectGenerator* gen, ASSERT(load_info.load_size == 4); ASSERT(load_info.requires_load == true); - auto instr = gen->add_instr(IGen::static_load_xmm32(get_reg(m_dest, allocs, irec), 0), irec); + auto instr = + gen->add_instr(IGen::static_load_xmm32(*gen, get_reg(m_dest, allocs, irec), 0), irec); gen->link_instruction_static(instr, m_src->rec, 0); } else if (m_dest->ireg().reg_class == RegClass::VECTOR_FLOAT) { // we don't check the load info intentionally because we want to allow loading an entire // vector structure. 
- auto instr = gen->add_instr(IGen::loadvf_rip_plus_s32(get_reg(m_dest, allocs, irec), 0), irec); + auto instr = + gen->add_instr(IGen::loadvf_rip_plus_s32(*gen, get_reg(m_dest, allocs, irec), 0), irec); gen->link_instruction_static(instr, m_src->rec, 0); } else { ASSERT(false); } } +void IR_StaticVarLoad::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_StaticVarLoad::do_codegen_arm64"); +} + ///////////////////// // ConditionalBranch ///////////////////// @@ -858,45 +959,45 @@ RegAllocInstr IR_ConditionalBranch::to_rai() { return rai; } -void IR_ConditionalBranch::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { - Instruction jump_instr(0); +void IR_ConditionalBranch::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + Instruction jump_instr = InstructionX86(0); ASSERT(m_resolved); switch (condition.kind) { case ConditionKind::EQUAL: - jump_instr = IGen::je_32(); + jump_instr = IGen::je_32(*gen); break; case ConditionKind::NOT_EQUAL: - jump_instr = IGen::jne_32(); + jump_instr = IGen::jne_32(*gen); break; case ConditionKind::LEQ: if (condition.is_signed) { - jump_instr = IGen::jle_32(); + jump_instr = IGen::jle_32(*gen); } else { - jump_instr = IGen::jbe_32(); + jump_instr = IGen::jbe_32(*gen); } break; case ConditionKind::GEQ: if (condition.is_signed) { - jump_instr = IGen::jge_32(); + jump_instr = IGen::jge_32(*gen); } else { - jump_instr = IGen::jae_32(); + jump_instr = IGen::jae_32(*gen); } break; case ConditionKind::LT: if (condition.is_signed) { - jump_instr = IGen::jl_32(); + jump_instr = IGen::jl_32(*gen); } else { - jump_instr = IGen::jb_32(); + jump_instr = IGen::jb_32(*gen); } break; case ConditionKind::GT: if (condition.is_signed) { - jump_instr = IGen::jg_32(); + jump_instr = IGen::jg_32(*gen); } else { - jump_instr = IGen::ja_32(); + 
jump_instr = IGen::ja_32(*gen); } break; default: @@ -904,11 +1005,11 @@ void IR_ConditionalBranch::do_codegen(emitter::ObjectGenerator* gen, } if (condition.is_float) { - gen->add_instr( - IGen::cmp_flt_flt(get_reg(condition.a, allocs, irec), get_reg(condition.b, allocs, irec)), - irec); + gen->add_instr(IGen::cmp_flt_flt(*gen, get_reg(condition.a, allocs, irec), + get_reg(condition.b, allocs, irec)), + irec); } else { - gen->add_instr(IGen::cmp_gpr64_gpr64(get_reg(condition.a, allocs, irec), + gen->add_instr(IGen::cmp_gpr64_gpr64(*gen, get_reg(condition.a, allocs, irec), get_reg(condition.b, allocs, irec)), irec); } @@ -917,6 +1018,12 @@ void IR_ConditionalBranch::do_codegen(emitter::ObjectGenerator* gen, gen->link_instruction_jump(jump_rec, gen->get_future_ir_record_in_same_func(irec, label.idx)); } +void IR_ConditionalBranch::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_ConditionalBranch::do_codegen_arm64"); +} + ///////////////////// // LoadConstantOffset ///////////////////// @@ -939,33 +1046,39 @@ RegAllocInstr IR_LoadConstOffset::to_rai() { return rai; } -void IR_LoadConstOffset::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_LoadConstOffset::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dest_reg = m_use_coloring ? get_reg(m_dest, allocs, irec) : get_no_color_reg(m_dest); auto base_reg = m_use_coloring ? 
get_reg(m_base, allocs, irec) : get_no_color_reg(m_base); if (m_dest->ireg().reg_class == RegClass::GPR_64) { - gen->add_instr(IGen::load_goal_gpr(dest_reg, base_reg, emitter::gRegInfo.get_offset_reg(), + gen->add_instr(IGen::load_goal_gpr(*gen, dest_reg, base_reg, emitter::gRegInfo.get_offset_reg(), m_offset, m_info.size, m_info.sign_extend), irec); } else if (m_dest->ireg().reg_class == RegClass::FLOAT && m_info.size == 4 && m_info.sign_extend == false && m_info.reg == RegClass::FLOAT) { - gen->add_instr( - IGen::load_goal_xmm32(dest_reg, base_reg, emitter::gRegInfo.get_offset_reg(), m_offset), - irec); + gen->add_instr(IGen::load_goal_xmm32(*gen, dest_reg, base_reg, + emitter::gRegInfo.get_offset_reg(), m_offset), + irec); } else if ((m_dest->ireg().reg_class == RegClass::VECTOR_FLOAT || m_dest->ireg().reg_class == RegClass::INT_128) && m_info.size == 16 && m_info.sign_extend == false && m_info.reg == m_dest->ireg().reg_class) { - gen->add_instr( - IGen::load_goal_xmm128(dest_reg, base_reg, emitter::gRegInfo.get_offset_reg(), m_offset), - irec); + gen->add_instr(IGen::load_goal_xmm128(*gen, dest_reg, base_reg, + emitter::gRegInfo.get_offset_reg(), m_offset), + irec); } else { - throw std::runtime_error("IR_LoadConstOffset::do_codegen not supported"); + throw std::runtime_error("IR_LoadConstOffset::do_codegen_x86 not supported"); } } +void IR_LoadConstOffset::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_LoadConstOffset::do_codegen_arm64"); +} + /////////////////////// // StoreConstantOffset /////////////////////// @@ -987,33 +1100,39 @@ RegAllocInstr IR_StoreConstOffset::to_rai() { return rai; } -void IR_StoreConstOffset::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_StoreConstOffset::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { 
auto base_reg = m_use_coloring ? get_reg(m_base, allocs, irec) : get_no_color_reg(m_base); auto value_reg = m_use_coloring ? get_reg(m_value, allocs, irec) : get_no_color_reg(m_value); if (m_value->ireg().reg_class == RegClass::GPR_64) { - gen->add_instr(IGen::store_goal_gpr(base_reg, value_reg, emitter::gRegInfo.get_offset_reg(), - m_offset, m_size), + gen->add_instr(IGen::store_goal_gpr(*gen, base_reg, value_reg, + emitter::gRegInfo.get_offset_reg(), m_offset, m_size), irec); } else if (m_value->ireg().reg_class == RegClass::FLOAT && m_size == 4) { - gen->add_instr( - IGen::store_goal_xmm32(base_reg, value_reg, emitter::gRegInfo.get_offset_reg(), m_offset), - irec); + gen->add_instr(IGen::store_goal_xmm32(*gen, base_reg, value_reg, + emitter::gRegInfo.get_offset_reg(), m_offset), + irec); } else if ((m_value->ireg().reg_class == RegClass::VECTOR_FLOAT || m_value->ireg().reg_class == RegClass::INT_128) && m_size == 16) { - gen->add_instr( - IGen::store_goal_vf(base_reg, value_reg, emitter::gRegInfo.get_offset_reg(), m_offset), - irec); + gen->add_instr(IGen::store_goal_vf(*gen, base_reg, value_reg, + emitter::gRegInfo.get_offset_reg(), m_offset), + irec); } else { throw std::runtime_error( - fmt::format("IR_StoreConstOffset::do_codegen can't handle this (c {} sz {})", + fmt::format("IR_StoreConstOffset::do_codegen_x86 can't handle this (c {} sz {})", fmt::underlying(m_value->ireg().reg_class), m_size)); } } +void IR_StoreConstOffset::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_StoreConstOffset::do_codegen_arm64"); +} + /////////////////////// // Null /////////////////////// @@ -1025,14 +1144,20 @@ RegAllocInstr IR_Null::to_rai() { return {}; } -void IR_Null::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_Null::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + 
emitter::IR_Record irec) { (void)gen; (void)allocs; (void)irec; } +void IR_Null::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_Null::do_codegen_arm64"); +} + /////////////////////// // ValueReset /////////////////////// @@ -1050,14 +1175,20 @@ RegAllocInstr IR_ValueReset::to_rai() { return rai; } -void IR_ValueReset::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_ValueReset::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { (void)gen; (void)allocs; (void)irec; } +void IR_ValueReset::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_ValueReset::do_codegen_arm64"); +} + /////////////////////// // FloatToInt /////////////////////// @@ -1075,13 +1206,21 @@ RegAllocInstr IR_FloatToInt::to_rai() { return rai; } -void IR_FloatToInt::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { - gen->add_instr(IGen::float_to_int32(get_reg(m_dest, allocs, irec), get_reg(m_src, allocs, irec)), - irec); - gen->add_instr(IGen::movsx_r64_r32(get_reg(m_dest, allocs, irec), get_reg(m_dest, allocs, irec)), - irec); +void IR_FloatToInt::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + gen->add_instr( + IGen::float_to_int32(*gen, get_reg(m_dest, allocs, irec), get_reg(m_src, allocs, irec)), + irec); + gen->add_instr( + IGen::movsx_r64_r32(*gen, get_reg(m_dest, allocs, irec), get_reg(m_dest, allocs, irec)), + irec); +} + +void IR_FloatToInt::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_FloatToInt::do_codegen_arm64"); } /////////////////////// @@ -1101,11 
+1240,18 @@ RegAllocInstr IR_IntToFloat::to_rai() { return rai; } -void IR_IntToFloat::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { - gen->add_instr(IGen::int32_to_float(get_reg(m_dest, allocs, irec), get_reg(m_src, allocs, irec)), - irec); +void IR_IntToFloat::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + gen->add_instr( + IGen::int32_to_float(*gen, get_reg(m_dest, allocs, irec), get_reg(m_src, allocs, irec)), + irec); +} + +void IR_IntToFloat::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_IntToFloat::do_codegen_arm64"); } /////////////////////// @@ -1124,23 +1270,29 @@ RegAllocInstr IR_GetStackAddr::to_rai() { return rai; } -void IR_GetStackAddr::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_GetStackAddr::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dest_reg = get_reg(m_dest, allocs, irec); int offset = GPR_SIZE * allocs.get_slot_for_var(m_slot); if (offset == 0) { - gen->add_instr(IGen::mov_gpr64_gpr64(dest_reg, RSP), irec); - gen->add_instr(IGen::sub_gpr64_gpr64(dest_reg, gRegInfo.get_offset_reg()), irec); + gen->add_instr(IGen::mov_gpr64_gpr64(*gen, dest_reg, RSP), irec); + gen->add_instr(IGen::sub_gpr64_gpr64(*gen, dest_reg, gRegInfo.get_offset_reg()), irec); } else { // dest = offset + RSP - gen->add_instr(IGen::lea_reg_plus_off(dest_reg, RSP, offset), irec); + gen->add_instr(IGen::lea_reg_plus_off(*gen, dest_reg, RSP, offset), irec); // dest = offset + RSP - offset - gen->add_instr(IGen::sub_gpr64_gpr64(dest_reg, gRegInfo.get_offset_reg()), irec); + gen->add_instr(IGen::sub_gpr64_gpr64(*gen, dest_reg, gRegInfo.get_offset_reg()), irec); } } +void 
IR_GetStackAddr::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_GetStackAddr::do_codegen_arm64"); +} + /////////////////////// // Nop /////////////////////// @@ -1155,10 +1307,16 @@ RegAllocInstr IR_Nop::to_rai() { return {}; } -void IR_Nop::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult&, - emitter::IR_Record irec) { - gen->add_instr(IGen::nop(), irec); +void IR_Nop::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult&, + emitter::IR_Record irec) { + gen->add_instr(IGen::nop(*gen), irec); +} + +void IR_Nop::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_Nop::do_codegen_arm64"); } /////////////////////// @@ -1189,11 +1347,17 @@ RegAllocInstr IR_AsmRet::to_rai() { return {}; } -void IR_AsmRet::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_AsmRet::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { (void)allocs; - gen->add_instr(IGen::ret(), irec); + gen->add_instr(IGen::ret(*gen), irec); +} + +void IR_AsmRet::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_AsmRet::do_codegen_arm64"); } /////////////////////// @@ -1210,11 +1374,17 @@ RegAllocInstr IR_AsmFNop::to_rai() { return {}; } -void IR_AsmFNop::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_AsmFNop::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { (void)allocs; - gen->add_instr(IGen::nop_vf(), irec); + gen->add_instr(IGen::nop_vf(*gen), irec); +} + +void IR_AsmFNop::do_codegen_arm64(emitter::ObjectGenerator* gen, + const 
AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_AsmFNop::do_codegen_arm64"); } /////////////////////// @@ -1231,11 +1401,17 @@ RegAllocInstr IR_AsmFWait::to_rai() { return {}; } -void IR_AsmFWait::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_AsmFWait::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { (void)allocs; - gen->add_instr(IGen::wait_vf(), irec); + gen->add_instr(IGen::wait_vf(*gen), irec); +} + +void IR_AsmFWait::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_AsmFWait::do_codegen_arm64"); } /////////////////////// @@ -1256,16 +1432,22 @@ RegAllocInstr IR_AsmPush::to_rai() { return rai; } -void IR_AsmPush::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_AsmPush::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { if (m_use_coloring) { - gen->add_instr(IGen::push_gpr64(get_reg(m_src, allocs, irec)), irec); + gen->add_instr(IGen::push_gpr64(*gen, get_reg(m_src, allocs, irec)), irec); } else { - gen->add_instr(IGen::push_gpr64(get_no_color_reg(m_src)), irec); + gen->add_instr(IGen::push_gpr64(*gen, get_no_color_reg(m_src)), irec); } } +void IR_AsmPush::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_AsmPush::do_codegen_arm64"); +} + /////////////////////// // AsmPop /////////////////////// @@ -1284,16 +1466,22 @@ RegAllocInstr IR_AsmPop::to_rai() { return rai; } -void IR_AsmPop::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_AsmPop::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& 
allocs, + emitter::IR_Record irec) { if (m_use_coloring) { - gen->add_instr(IGen::pop_gpr64(get_reg(m_dst, allocs, irec)), irec); + gen->add_instr(IGen::pop_gpr64(*gen, get_reg(m_dst, allocs, irec)), irec); } else { - gen->add_instr(IGen::pop_gpr64(get_no_color_reg(m_dst)), irec); + gen->add_instr(IGen::pop_gpr64(*gen, get_no_color_reg(m_dst)), irec); } } +void IR_AsmPop::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_AsmPop::do_codegen_arm64"); +} + /////////////////////// // AsmSub /////////////////////// @@ -1315,17 +1503,25 @@ RegAllocInstr IR_AsmSub::to_rai() { return rai; } -void IR_AsmSub::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_AsmSub::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { if (m_use_coloring) { gen->add_instr( - IGen::sub_gpr64_gpr64(get_reg(m_dst, allocs, irec), get_reg(m_src, allocs, irec)), irec); + IGen::sub_gpr64_gpr64(*gen, get_reg(m_dst, allocs, irec), get_reg(m_src, allocs, irec)), + irec); } else { - gen->add_instr(IGen::sub_gpr64_gpr64(get_no_color_reg(m_dst), get_no_color_reg(m_src)), irec); + gen->add_instr(IGen::sub_gpr64_gpr64(*gen, get_no_color_reg(m_dst), get_no_color_reg(m_src)), + irec); } } +void IR_AsmSub::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_AsmSub::do_codegen_arm64"); +} + /////////////////////// // AsmAdd /////////////////////// @@ -1347,17 +1543,25 @@ RegAllocInstr IR_AsmAdd::to_rai() { return rai; } -void IR_AsmAdd::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_AsmAdd::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { if (m_use_coloring) { gen->add_instr( - 
IGen::add_gpr64_gpr64(get_reg(m_dst, allocs, irec), get_reg(m_src, allocs, irec)), irec); + IGen::add_gpr64_gpr64(*gen, get_reg(m_dst, allocs, irec), get_reg(m_src, allocs, irec)), + irec); } else { - gen->add_instr(IGen::add_gpr64_gpr64(get_no_color_reg(m_dst), get_no_color_reg(m_src)), irec); + gen->add_instr(IGen::add_gpr64_gpr64(*gen, get_no_color_reg(m_dst), get_no_color_reg(m_src)), + irec); } } +void IR_AsmAdd::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_AsmAdd::do_codegen_arm64"); +} + /////////////////////// // AsmGetSymbolValue /////////////////////// @@ -1380,25 +1584,31 @@ RegAllocInstr IR_GetSymbolValueAsm::to_rai() { return rai; } -void IR_GetSymbolValueAsm::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_GetSymbolValueAsm::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst_reg = m_use_coloring ? 
get_reg(m_dest, allocs, irec) : get_no_color_reg(m_dest); if (m_sext) { - auto instr = gen->add_instr( - IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32( - dst_reg, gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), LINK_SYM_NO_OFFSET_FLAG), - irec); + auto instr = gen->add_instr(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32( + *gen, dst_reg, gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), + LINK_SYM_NO_OFFSET_FLAG), + irec); gen->link_instruction_symbol_mem(instr, m_sym_name); } else { - auto instr = gen->add_instr( - IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32( - dst_reg, gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), LINK_SYM_NO_OFFSET_FLAG), - irec); + auto instr = gen->add_instr(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32( + *gen, dst_reg, gRegInfo.get_st_reg(), gRegInfo.get_offset_reg(), + LINK_SYM_NO_OFFSET_FLAG), + irec); gen->link_instruction_symbol_mem(instr, m_sym_name); } } +void IR_GetSymbolValueAsm::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_GetSymbolValueAsm::do_codegen_arm64"); +} + /////////////////////// // AsmJumpReg /////////////////////// @@ -1417,11 +1627,17 @@ RegAllocInstr IR_JumpReg::to_rai() { return rai; } -void IR_JumpReg::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_JumpReg::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto src_reg = m_use_coloring ? 
get_reg(m_src, allocs, irec) : get_no_color_reg(m_src); - gen->add_instr(IGen::jmp_r64(src_reg), irec); + gen->add_instr(IGen::jmp_r64(*gen, src_reg), irec); +} + +void IR_JumpReg::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_JumpReg::do_codegen_arm64"); } /////////////////////// @@ -1444,12 +1660,18 @@ RegAllocInstr IR_RegSetAsm::to_rai() { return rai; } -void IR_RegSetAsm::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_RegSetAsm::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { regset_common(gen, allocs, irec, m_dst, m_src, m_use_coloring); } +void IR_RegSetAsm::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_RegSetAsm::do_codegen_arm64"); +} + /////////////////////// // AsmVF3 /////////////////////// @@ -1502,40 +1724,46 @@ RegAllocInstr IR_VFMath3Asm::to_rai() { return rai; } -void IR_VFMath3Asm::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_VFMath3Asm::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst = get_reg_asm(m_dst, allocs, irec, m_use_coloring); auto src1 = get_reg_asm(m_src1, allocs, irec, m_use_coloring); auto src2 = get_reg_asm(m_src2, allocs, irec, m_use_coloring); switch (m_kind) { case Kind::XOR: - gen->add_instr(IGen::xor_vf(dst, src1, src2), irec); + gen->add_instr(IGen::xor_vf(*gen, dst, src1, src2), irec); break; case Kind::SUB: - gen->add_instr(IGen::sub_vf(dst, src1, src2), irec); + gen->add_instr(IGen::sub_vf(*gen, dst, src1, src2), irec); break; case Kind::ADD: - gen->add_instr(IGen::add_vf(dst, src1, src2), irec); + gen->add_instr(IGen::add_vf(*gen, dst, src1, src2), irec); break; 
case Kind::MUL: - gen->add_instr(IGen::mul_vf(dst, src1, src2), irec); + gen->add_instr(IGen::mul_vf(*gen, dst, src1, src2), irec); break; case Kind::MAX: - gen->add_instr(IGen::max_vf(dst, src1, src2), irec); + gen->add_instr(IGen::max_vf(*gen, dst, src1, src2), irec); break; case Kind::MIN: - gen->add_instr(IGen::min_vf(dst, src1, src2), irec); + gen->add_instr(IGen::min_vf(*gen, dst, src1, src2), irec); break; case Kind::DIV: - gen->add_instr(IGen::div_vf(dst, src1, src2), irec); + gen->add_instr(IGen::div_vf(*gen, dst, src1, src2), irec); break; default: ASSERT(false); } } +void IR_VFMath3Asm::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_VFMath3Asm::do_codegen_arm64"); +} + /////////////////////// // IR_Int128Math3Asm /////////////////////// @@ -1627,9 +1855,9 @@ RegAllocInstr IR_Int128Math3Asm::to_rai() { return rai; } -void IR_Int128Math3Asm::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_Int128Math3Asm::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst = get_reg_asm(m_dst, allocs, irec, m_use_coloring); auto src1 = get_reg_asm(m_src1, allocs, irec, m_use_coloring); auto src2 = get_reg_asm(m_src2, allocs, irec, m_use_coloring); @@ -1637,77 +1865,83 @@ void IR_Int128Math3Asm::do_codegen(emitter::ObjectGenerator* gen, switch (m_kind) { case Kind::PEXTUB: // NOTE: this is intentionally swapped because x86 and PS2 do this opposite ways. - gen->add_instr(IGen::pextub_swapped(dst, src2, src1), irec); + gen->add_instr(IGen::pextub_swapped(*gen, dst, src2, src1), irec); break; case Kind::PEXTUH: // NOTE: this is intentionally swapped because x86 and PS2 do this opposite ways. 
- gen->add_instr(IGen::pextuh_swapped(dst, src2, src1), irec); + gen->add_instr(IGen::pextuh_swapped(*gen, dst, src2, src1), irec); break; case Kind::PEXTUW: // NOTE: this is intentionally swapped because x86 and PS2 do this opposite ways. - gen->add_instr(IGen::pextuw_swapped(dst, src2, src1), irec); + gen->add_instr(IGen::pextuw_swapped(*gen, dst, src2, src1), irec); break; case Kind::PEXTLB: // NOTE: this is intentionally swapped because x86 and PS2 do this opposite ways. - gen->add_instr(IGen::pextlb_swapped(dst, src2, src1), irec); + gen->add_instr(IGen::pextlb_swapped(*gen, dst, src2, src1), irec); break; case Kind::PEXTLH: // NOTE: this is intentionally swapped because x86 and PS2 do this opposite ways. - gen->add_instr(IGen::pextlh_swapped(dst, src2, src1), irec); + gen->add_instr(IGen::pextlh_swapped(*gen, dst, src2, src1), irec); break; case Kind::PEXTLW: // NOTE: this is intentionally swapped because x86 and PS2 do this opposite ways. - gen->add_instr(IGen::pextlw_swapped(dst, src2, src1), irec); + gen->add_instr(IGen::pextlw_swapped(*gen, dst, src2, src1), irec); break; case Kind::PCPYLD: // NOTE: this is intentionally swapped because x86 and PS2 do this opposite ways. 
- gen->add_instr(IGen::pcpyld_swapped(dst, src2, src1), irec); + gen->add_instr(IGen::pcpyld_swapped(*gen, dst, src2, src1), irec); break; case Kind::PCPYUD: - gen->add_instr(IGen::pcpyud(dst, src1, src2), irec); + gen->add_instr(IGen::pcpyud(*gen, dst, src1, src2), irec); break; case Kind::PCEQB: - gen->add_instr(IGen::parallel_compare_e_b(dst, src2, src1), irec); + gen->add_instr(IGen::parallel_compare_e_b(*gen, dst, src2, src1), irec); break; case Kind::PCEQH: - gen->add_instr(IGen::parallel_compare_e_h(dst, src2, src1), irec); + gen->add_instr(IGen::parallel_compare_e_h(*gen, dst, src2, src1), irec); break; case Kind::PCEQW: - gen->add_instr(IGen::parallel_compare_e_w(dst, src2, src1), irec); + gen->add_instr(IGen::parallel_compare_e_w(*gen, dst, src2, src1), irec); break; case Kind::PCGTB: - gen->add_instr(IGen::parallel_compare_gt_b(dst, src1, src2), irec); + gen->add_instr(IGen::parallel_compare_gt_b(*gen, dst, src1, src2), irec); break; case Kind::PCGTH: - gen->add_instr(IGen::parallel_compare_gt_h(dst, src1, src2), irec); + gen->add_instr(IGen::parallel_compare_gt_h(*gen, dst, src1, src2), irec); break; case Kind::PCGTW: - gen->add_instr(IGen::parallel_compare_gt_w(dst, src1, src2), irec); + gen->add_instr(IGen::parallel_compare_gt_w(*gen, dst, src1, src2), irec); break; case Kind::PSUBW: // psubW on mips is psubD on x86... 
- gen->add_instr(IGen::vpsubd(dst, src1, src2), irec); + gen->add_instr(IGen::vpsubd(*gen, dst, src1, src2), irec); break; case Kind::POR: - gen->add_instr(IGen::parallel_bitwise_or(dst, src2, src1), irec); + gen->add_instr(IGen::parallel_bitwise_or(*gen, dst, src2, src1), irec); break; case Kind::PXOR: - gen->add_instr(IGen::parallel_bitwise_xor(dst, src2, src1), irec); + gen->add_instr(IGen::parallel_bitwise_xor(*gen, dst, src2, src1), irec); break; case Kind::PAND: - gen->add_instr(IGen::parallel_bitwise_and(dst, src2, src1), irec); + gen->add_instr(IGen::parallel_bitwise_and(*gen, dst, src2, src1), irec); break; case Kind::PACKUSWB: - gen->add_instr(IGen::vpackuswb(dst, src1, src2), irec); + gen->add_instr(IGen::vpackuswb(*gen, dst, src1, src2), irec); break; case Kind::PADDB: - gen->add_instr(IGen::parallel_add_byte(dst, src1, src2), irec); + gen->add_instr(IGen::parallel_add_byte(*gen, dst, src1, src2), irec); break; default: ASSERT(false); } } +void IR_Int128Math3Asm::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_Int128Math3Asm::do_codegen_arm64"); +} + /////////////////////// // AsmVF2 /////////////////////// @@ -1741,24 +1975,30 @@ RegAllocInstr IR_VFMath2Asm::to_rai() { return rai; } -void IR_VFMath2Asm::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_VFMath2Asm::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst = get_reg_asm(m_dst, allocs, irec, m_use_coloring); auto src = get_reg_asm(m_src, allocs, irec, m_use_coloring); switch (m_kind) { case Kind::ITOF: - gen->add_instr(IGen::itof_vf(dst, src), irec); + gen->add_instr(IGen::itof_vf(*gen, dst, src), irec); break; case Kind::FTOI: - gen->add_instr(IGen::ftoi_vf(dst, src), irec); + gen->add_instr(IGen::ftoi_vf(*gen, dst, src), irec); break; default: ASSERT(false); } } 
+void IR_VFMath2Asm::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_VFMath2Asm::do_codegen_arm64"); +} + /////////////////////// // AsmInt128-2 /////////////////////// @@ -1833,9 +2073,9 @@ RegAllocInstr IR_Int128Math2Asm::to_rai() { return rai; } -void IR_Int128Math2Asm::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_Int128Math2Asm::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst = get_reg_asm(m_dst, allocs, irec, m_use_coloring); auto src = get_reg_asm(m_src, allocs, irec, m_use_coloring); @@ -1845,62 +2085,68 @@ void IR_Int128Math2Asm::do_codegen(emitter::ObjectGenerator* gen, ASSERT(m_imm.has_value()); ASSERT(*m_imm >= 0); ASSERT(*m_imm <= 255); - gen->add_instr(IGen::pw_sll(dst, src, *m_imm), irec); + gen->add_instr(IGen::pw_sll(*gen, dst, src, *m_imm), irec); break; case Kind::PW_SRL: ASSERT(m_imm.has_value()); ASSERT(*m_imm >= 0); ASSERT(*m_imm <= 255); - gen->add_instr(IGen::pw_srl(dst, src, *m_imm), irec); + gen->add_instr(IGen::pw_srl(*gen, dst, src, *m_imm), irec); break; case Kind::PH_SLL: // you are technically allowed to put values > 32 in here. 
ASSERT(m_imm.has_value()); ASSERT(*m_imm >= 0); ASSERT(*m_imm <= 255); - gen->add_instr(IGen::ph_sll(dst, src, *m_imm), irec); + gen->add_instr(IGen::ph_sll(*gen, dst, src, *m_imm), irec); break; case Kind::PH_SRL: ASSERT(m_imm.has_value()); ASSERT(*m_imm >= 0); ASSERT(*m_imm <= 255); - gen->add_instr(IGen::ph_srl(dst, src, *m_imm), irec); + gen->add_instr(IGen::ph_srl(*gen, dst, src, *m_imm), irec); break; case Kind::PW_SRA: ASSERT(m_imm.has_value()); ASSERT(*m_imm >= 0); ASSERT(*m_imm <= 255); - gen->add_instr(IGen::pw_sra(dst, src, *m_imm), irec); + gen->add_instr(IGen::pw_sra(*gen, dst, src, *m_imm), irec); break; case Kind::VPSRLDQ: ASSERT(m_imm.has_value()); ASSERT(*m_imm >= 0); ASSERT(*m_imm <= 255); - gen->add_instr(IGen::vpsrldq(dst, src, *m_imm), irec); + gen->add_instr(IGen::vpsrldq(*gen, dst, src, *m_imm), irec); break; case Kind::VPSLLDQ: ASSERT(m_imm.has_value()); ASSERT(*m_imm >= 0); ASSERT(*m_imm <= 255); - gen->add_instr(IGen::vpslldq(dst, src, *m_imm), irec); + gen->add_instr(IGen::vpslldq(*gen, dst, src, *m_imm), irec); break; case Kind::VPSHUFLW: ASSERT(m_imm.has_value()); ASSERT(*m_imm >= 0); ASSERT(*m_imm <= 255); - gen->add_instr(IGen::vpshuflw(dst, src, *m_imm), irec); + gen->add_instr(IGen::vpshuflw(*gen, dst, src, *m_imm), irec); break; case Kind::VPSHUFHW: ASSERT(m_imm.has_value()); ASSERT(*m_imm >= 0); ASSERT(*m_imm <= 255); - gen->add_instr(IGen::vpshufhw(dst, src, *m_imm), irec); + gen->add_instr(IGen::vpshufhw(*gen, dst, src, *m_imm), irec); break; default: ASSERT(false); } } +void IR_Int128Math2Asm::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_Int128Math2Asm::do_codegen_arm64"); +} + // ---- Blend VF IR_BlendVF::IR_BlendVF(bool use_color, @@ -1925,13 +2171,19 @@ RegAllocInstr IR_BlendVF::to_rai() { return rai; } -void IR_BlendVF::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { 
+void IR_BlendVF::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst = get_reg_asm(m_dst, allocs, irec, m_use_coloring); auto src1 = get_reg_asm(m_src1, allocs, irec, m_use_coloring); auto src2 = get_reg_asm(m_src2, allocs, irec, m_use_coloring); - gen->add_instr(IGen::blend_vf(dst, src1, src2, m_mask), irec); + gen->add_instr(IGen::blend_vf(*gen, dst, src1, src2, m_mask), irec); +} + +void IR_BlendVF::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_BlendVF::do_codegen_arm64"); } // ----- Splat VF @@ -1956,12 +2208,18 @@ RegAllocInstr IR_SplatVF::to_rai() { return rai; } -void IR_SplatVF::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_SplatVF::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst = get_reg_asm(m_dst, allocs, irec, m_use_coloring); auto src = get_reg_asm(m_src, allocs, irec, m_use_coloring); - gen->add_instr(IGen::splat_vf(dst, src, m_element), irec); + gen->add_instr(IGen::splat_vf(*gen, dst, src, m_element), irec); +} + +void IR_SplatVF::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_SplatVF::do_codegen_arm64"); } // ---- Swizzle VF @@ -1986,12 +2244,18 @@ RegAllocInstr IR_SwizzleVF::to_rai() { return rai; } -void IR_SwizzleVF::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_SwizzleVF::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst = get_reg_asm(m_dst, allocs, irec, m_use_coloring); auto src = get_reg_asm(m_src, allocs, irec, m_use_coloring); - gen->add_instr(IGen::swizzle_vf(dst, src, m_controlBytes), irec); + 
gen->add_instr(IGen::swizzle_vf(*gen, dst, src, m_controlBytes), irec); +} + +void IR_SwizzleVF::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_SwizzleVF::do_codegen_arm64"); } // ---- Square Root VF @@ -2013,10 +2277,16 @@ RegAllocInstr IR_SqrtVF::to_rai() { return rai; } -void IR_SqrtVF::do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) { +void IR_SqrtVF::do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { auto dst = get_reg_asm(m_dst, allocs, irec, m_use_coloring); auto src = get_reg_asm(m_src, allocs, irec, m_use_coloring); - gen->add_instr(IGen::sqrt_vf(dst, src), irec); + gen->add_instr(IGen::sqrt_vf(*gen, dst, src), irec); +} + +void IR_SqrtVF::do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) { + throw std::runtime_error("NYI - IR_SqrtVF::do_codegen_arm64"); } diff --git a/goalc/compiler/IR.h b/goalc/compiler/IR.h index 5691bb729a..9a098864e9 100644 --- a/goalc/compiler/IR.h +++ b/goalc/compiler/IR.h @@ -2,9 +2,9 @@ #include -#include "CodeGenerator.h" #include "Val.h" +#include "goalc/compiler/Label.h" #include "goalc/emitter/ObjectGenerator.h" #include "goalc/emitter/Register.h" #include "goalc/regalloc/allocator_interface.h" @@ -13,9 +13,12 @@ class IR { public: virtual std::string print() = 0; virtual RegAllocInstr to_rai() = 0; - virtual void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) = 0; + virtual void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) = 0; + virtual void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) = 0; virtual void add_constraints(std::vector* constraints, int my_id) { (void)constraints; 
(void)my_id; @@ -29,9 +32,12 @@ class IR_Return : public IR { std::string print() override; RegAllocInstr to_rai() override; void add_constraints(std::vector* constraints, int my_id) override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; const RegVal* value() { return m_value; } protected: @@ -45,9 +51,12 @@ class IR_LoadConstant64 : public IR { IR_LoadConstant64(const RegVal* dest, u64 value); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dest = nullptr; @@ -59,9 +68,12 @@ class IR_LoadSymbolPointer : public IR { IR_LoadSymbolPointer(const RegVal* dest, std::string name); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dest = nullptr; @@ -73,9 +85,12 @@ class IR_SetSymbolValue : public IR { IR_SetSymbolValue(const SymbolVal* dest, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void 
do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const SymbolVal* m_dest = nullptr; @@ -87,9 +102,12 @@ class IR_GetSymbolValue : public IR { IR_GetSymbolValue(const RegVal* dest, const SymbolVal* src, bool sext); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dest = nullptr; @@ -102,9 +120,12 @@ class IR_RegSet : public IR { IR_RegSet(const RegVal* dest, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dest = nullptr; @@ -120,9 +141,12 @@ class IR_FunctionCall : public IR { std::optional ret_reg); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void 
do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; void add_constraints(std::vector* constraints, int my_id) override; protected: @@ -138,9 +162,12 @@ class IR_RegValAddr : public IR { IR_RegValAddr(const RegVal* dest, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dest = nullptr; @@ -152,9 +179,12 @@ class IR_StaticVarAddr : public IR { IR_StaticVarAddr(const RegVal* dest, const StaticObject* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dest = nullptr; @@ -166,9 +196,12 @@ class IR_StaticVarLoad : public IR { IR_StaticVarLoad(const RegVal* dest, const StaticObject* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dest = nullptr; @@ -180,9 +213,12 @@ 
class IR_FunctionAddr : public IR { IR_FunctionAddr(const RegVal* dest, FunctionEnv* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dest = nullptr; @@ -216,9 +252,12 @@ class IR_IntegerMath : public IR { IR_IntegerMath(IntegerMathKind kind, RegVal* dest, u8 shift_amount); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; IntegerMathKind get_kind() const { return m_kind; } protected: @@ -235,9 +274,12 @@ class IR_FloatMath : public IR { IR_FloatMath(FloatMathKind kind, RegVal* dest, RegVal* arg); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; FloatMathKind get_kind() const { return m_kind; } protected: @@ -265,9 +307,12 @@ class IR_GotoLabel : public IR { explicit IR_GotoLabel(const Label* dest); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* 
gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const Label* m_dest = nullptr; @@ -279,9 +324,12 @@ class IR_ConditionalBranch : public IR { IR_ConditionalBranch(const Condition& condition, Label _label); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; void mark_as_resolved() { m_resolved = true; } Condition condition; @@ -296,9 +344,12 @@ class IR_Null : public IR { IR_Null() = default; std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; }; class IR_ValueReset : public IR { @@ -306,9 +357,12 @@ class IR_ValueReset : public IR { IR_ValueReset(std::vector args); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const 
AllocationResult& allocs, + emitter::IR_Record irec) override; private: std::vector m_args; @@ -319,9 +373,12 @@ class IR_FloatToInt : public IR { IR_FloatToInt(const RegVal* dest, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; private: const RegVal* m_dest = nullptr; @@ -333,9 +390,12 @@ class IR_IntToFloat : public IR { IR_IntToFloat(const RegVal* dest, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; private: const RegVal* m_dest = nullptr; @@ -347,9 +407,12 @@ class IR_GetStackAddr : public IR { IR_GetStackAddr(const RegVal* dest, int slot); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; private: const RegVal* m_dest = nullptr; @@ -361,9 +424,12 @@ class IR_Nop : public IR { IR_Nop(); std::string print() override; RegAllocInstr to_rai() override; - void 
do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; }; class IR_Asm : public IR { @@ -384,9 +450,12 @@ class IR_LoadConstOffset : public IR_Asm { bool use_coloring = true); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; private: const RegVal* m_dest = nullptr; @@ -404,9 +473,12 @@ class IR_StoreConstOffset : public IR_Asm { bool use_coloring = true); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; private: const RegVal* m_value = nullptr; @@ -420,9 +492,12 @@ class IR_AsmRet : public IR_Asm { IR_AsmRet(bool use_coloring); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& 
allocs, + emitter::IR_Record irec) override; }; class IR_AsmPush : public IR_Asm { @@ -430,9 +505,12 @@ class IR_AsmPush : public IR_Asm { IR_AsmPush(bool use_coloring, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; private: const RegVal* m_src = nullptr; @@ -443,9 +521,12 @@ class IR_AsmPop : public IR_Asm { IR_AsmPop(bool use_coloring, const RegVal* dst); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; private: const RegVal* m_dst = nullptr; @@ -456,9 +537,12 @@ class IR_AsmSub : public IR_Asm { IR_AsmSub(bool use_coloring, const RegVal* dst, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; private: const RegVal* m_dst = nullptr; @@ -470,9 +554,12 @@ class IR_AsmAdd : public IR_Asm { IR_AsmAdd(bool use_coloring, const RegVal* dst, const RegVal* src); std::string print() override; 
RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; private: const RegVal* m_dst = nullptr; @@ -484,9 +571,12 @@ class IR_AsmFNop : public IR_Asm { IR_AsmFNop(); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; }; class IR_AsmFWait : public IR_Asm { @@ -494,9 +584,12 @@ class IR_AsmFWait : public IR_Asm { IR_AsmFWait(); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; }; class IR_GetSymbolValueAsm : public IR_Asm { @@ -504,9 +597,12 @@ class IR_GetSymbolValueAsm : public IR_Asm { IR_GetSymbolValueAsm(bool use_coloring, const RegVal* dest, std::string sym_name, bool sext); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) 
override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dest = nullptr; @@ -519,9 +615,12 @@ class IR_JumpReg : public IR_Asm { IR_JumpReg(bool use_coloring, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_src = nullptr; @@ -532,9 +631,12 @@ class IR_RegSetAsm : public IR_Asm { IR_RegSetAsm(bool use_color, const RegVal* dst, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dst = nullptr; @@ -551,9 +653,12 @@ class IR_VFMath3Asm : public IR_Asm { Kind kind); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dst = nullptr; @@ -594,9 +699,12 @@ class IR_Int128Math3Asm : public IR_Asm { Kind kind); std::string 
print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dst = nullptr; @@ -615,9 +723,12 @@ class IR_Int128Math2Asm : public IR_Asm { std::optional = std::nullopt); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dst = nullptr; @@ -632,9 +743,12 @@ class IR_VFMath2Asm : public IR_Asm { IR_VFMath2Asm(bool use_color, const RegVal* dst, const RegVal* src, Kind kind); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dst = nullptr; @@ -647,9 +761,12 @@ class IR_BlendVF : public IR_Asm { IR_BlendVF(bool use_color, const RegVal* dst, const RegVal* src1, const RegVal* src2, u8 mask); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void 
do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dst = nullptr; @@ -666,9 +783,12 @@ class IR_SplatVF : public IR_Asm { const emitter::Register::VF_ELEMENT element); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dst = nullptr; @@ -681,9 +801,12 @@ class IR_SwizzleVF : public IR_Asm { IR_SwizzleVF(bool use_color, const RegVal* dst, const RegVal* src, const u8 m_controlBytes); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dst = nullptr; @@ -696,9 +819,12 @@ class IR_SqrtVF : public IR_Asm { IR_SqrtVF(bool use_color, const RegVal* dst, const RegVal* src); std::string print() override; RegAllocInstr to_rai() override; - void do_codegen(emitter::ObjectGenerator* gen, - const AllocationResult& allocs, - emitter::IR_Record irec) override; + void do_codegen_x86(emitter::ObjectGenerator* gen, + const AllocationResult& allocs, + emitter::IR_Record irec) override; + void do_codegen_arm64(emitter::ObjectGenerator* gen, + const AllocationResult& 
allocs, + emitter::IR_Record irec) override; protected: const RegVal* m_dst = nullptr; diff --git a/goalc/compiler/compilation/Function.cpp b/goalc/compiler/compilation/Function.cpp index bca1aecebe..b3c703c2ab 100644 --- a/goalc/compiler/compilation/Function.cpp +++ b/goalc/compiler/compilation/Function.cpp @@ -192,7 +192,8 @@ Val* Compiler::compile_lambda(const goos::Object& form, const goos::Object& rest IRegConstraint constr; constr.instr_idx = 0; // constraint at function start auto ireg_arg = new_func_env->make_ireg( - lambda.params.at(i).type, arg_regs.at(i).is_gpr() ? RegClass::GPR_64 : RegClass::INT_128); + lambda.params.at(i).type, + arg_regs.at(i).is_gpr(m_instr_set) ? RegClass::GPR_64 : RegClass::INT_128); ireg_arg->mark_as_settable(); constr.ireg = ireg_arg->ireg(); constr.desired_register = arg_regs.at(i); @@ -230,7 +231,8 @@ Val* Compiler::compile_lambda(const goos::Object& form, const goos::Object& rest for (u32 i = 0; i < lambda.params.size(); i++) { auto ireg = new_func_env->make_ireg( - lambda.params.at(i).type, arg_regs.at(i).is_gpr() ? RegClass::GPR_64 : RegClass::INT_128); + lambda.params.at(i).type, + arg_regs.at(i).is_gpr(m_instr_set) ? RegClass::GPR_64 : RegClass::INT_128); ireg->mark_as_settable(); if (!new_func_env->params.insert({m_goos.intern_ptr(lambda.params.at(i).name), ireg}) .second) { @@ -608,7 +610,7 @@ Val* Compiler::compile_real_function_call(const goos::Object& form, auto cc = get_function_calling_convention(function->type(), m_ts); RegClass ret_reg_class = RegClass::GPR_64; - if (cc.return_reg && cc.return_reg->is_xmm()) { + if (cc.return_reg && cc.return_reg->is_128bit_simd(m_instr_set)) { ret_reg_class = RegClass::INT_128; } @@ -641,8 +643,8 @@ Val* Compiler::compile_real_function_call(const goos::Object& form, for (int i = 0; i < (int)args.size(); i++) { const auto& arg = args.at(i); auto reg = cc.arg_regs.at(i); - arg_outs.push_back( - env->make_ireg(arg->type(), reg.is_xmm() ? 
RegClass::INT_128 : RegClass::GPR_64)); + arg_outs.push_back(env->make_ireg( + arg->type(), reg.is_128bit_simd(m_instr_set) ? RegClass::INT_128 : RegClass::GPR_64)); arg_outs.back()->mark_as_settable(); env->emit_ir(form, arg_outs.back(), arg); } diff --git a/goalc/compiler/compilation/Type.cpp b/goalc/compiler/compilation/Type.cpp index 946ca06405..4d4e7d6510 100644 --- a/goalc/compiler/compilation/Type.cpp +++ b/goalc/compiler/compilation/Type.cpp @@ -572,7 +572,8 @@ Val* Compiler::compile_defmethod(const goos::Object& form, const goos::Object& _ IRegConstraint constr; constr.instr_idx = 0; // constraint at function start auto ireg_arg = new_func_env->make_ireg( - lambda.params.at(i).type, arg_regs.at(i).is_gpr() ? RegClass::GPR_64 : RegClass::INT_128); + lambda.params.at(i).type, + arg_regs.at(i).is_gpr(m_instr_set) ? RegClass::GPR_64 : RegClass::INT_128); ireg_arg->mark_as_settable(); constr.ireg = ireg_arg->ireg(); constr.desired_register = arg_regs.at(i); @@ -609,8 +610,9 @@ Val* Compiler::compile_defmethod(const goos::Object& form, const goos::Object& _ func_block_env->emit_ir(form, reset_args_for_coloring); for (u32 i = 0; i < lambda.params.size(); i++) { - auto ireg = new_func_env->make_ireg( - lambda.params.at(i).type, arg_regs.at(i).is_gpr() ? RegClass::GPR_64 : RegClass::INT_128); + auto ireg = new_func_env->make_ireg(lambda.params.at(i).type, arg_regs.at(i).is_gpr(m_instr_set) + ? 
RegClass::GPR_64 + : RegClass::INT_128); ireg->mark_as_settable(); if (!new_func_env->params.insert({m_goos.intern_ptr(lambda.params.at(i).name), ireg}).second) { throw_compiler_error(form, "defmethod has multiple arguments named {}", diff --git a/goalc/debugger/disassemble.cpp b/goalc/debugger/disassemble.cpp index 3d567ce1a3..f328ef8cdc 100644 --- a/goalc/debugger/disassemble.cpp +++ b/goalc/debugger/disassemble.cpp @@ -2,9 +2,8 @@ #include "common/goos/Reader.h" -#include "Zydis/Zydis.h" -#include "goalc/compiler/Env.h" -#include "goalc/compiler/IR.h" +#include "Zydis/Decoder.h" +#include "Zydis/Formatter.h" #include "fmt/color.h" #include "fmt/format.h" diff --git a/goalc/debugger/disassemble.h b/goalc/debugger/disassemble.h index c3679d56f0..a54102ba77 100644 --- a/goalc/debugger/disassemble.h +++ b/goalc/debugger/disassemble.h @@ -2,6 +2,7 @@ #include #include +#include #include #include "common/common_types.h" @@ -17,14 +18,13 @@ class HeapObject; } // namespace goos struct InstructionInfo { - emitter::Instruction instruction; //! 
the actual x86 instruction + emitter::Instruction instruction; enum class Kind { PROLOGUE, IR, EPILOGUE } kind; int ir_idx = -1; int offset = -1; InstructionInfo(const emitter::Instruction& _instruction, Kind _kind) : instruction(_instruction), kind(_kind) {} - InstructionInfo(const emitter::Instruction& _instruction, Kind _kind, int _ir_idx) : instruction(_instruction), kind(_kind), ir_idx(_ir_idx) {} }; @@ -43,4 +43,6 @@ std::string disassemble_x86_function( const std::vector& ir_strings, bool* had_failure, bool print_whole_function, - bool omit_ir); \ No newline at end of file + bool omit_ir); + +// TODO ARM64 - disassemble arm64 functions as well \ No newline at end of file diff --git a/goalc/emitter/CodeTester.cpp b/goalc/emitter/CodeTester.cpp index fece3b8c69..1cefe8eb15 100644 --- a/goalc/emitter/CodeTester.cpp +++ b/goalc/emitter/CodeTester.cpp @@ -6,7 +6,12 @@ * The CodeTester can't be used for tests requiring the full GOAL language/linking. */ +#include + #include "common/common_types.h" + +#include "goalc/emitter/Instruction.h" +#include "goalc/emitter/Register.h" #ifdef OS_POSIX #include #elif _WIN32 @@ -18,11 +23,12 @@ #include "CodeTester.h" #include "IGen.h" -#include "fmt/format.h" - namespace emitter { -CodeTester::CodeTester() : m_info(RegisterInfo::make_register_info()) {} +CodeTester::CodeTester() : m_info(RegisterInfo::make_register_info()), m_gen(GameVersion::Jak1) {} + +CodeTester::CodeTester(InstructionSet instruction_set) + : m_info(RegisterInfo::make_register_info()), m_gen(GameVersion::Jak1, instruction_set) {} /*! * Convert to a string for comparison against an assembler or tests. @@ -50,27 +56,37 @@ std::string CodeTester::dump_to_hex_string(bool nospace) { /*! * Add an instruction to the buffer. 
*/ -void CodeTester::emit(const Instruction& instr) { - code_buffer_size += instr.emit(code_buffer + code_buffer_size); +void CodeTester::emit(const emitter::Instruction& instr) { + u8* start = code_buffer + code_buffer_size; + code_buffer_size += instr.emit(start); ASSERT(code_buffer_size <= code_buffer_capacity); } - /*! * Add a return instruction to the buffer. */ void CodeTester::emit_return() { - emit(IGen::ret()); + emit(IGen::ret(m_gen)); } /*! * Pop all GPRs off of the stack. Optionally exclude rax. * Pops RSP always, which is weird, but doesn't cause issues. */ -void CodeTester::emit_pop_all_gprs(bool exclude_rax) { - for (int i = 16; i-- > 0;) { - if (i != RAX || !exclude_rax) { - emit(IGen::pop_gpr64(i)); +void CodeTester::emit_pop_all_gprs(bool exclude_return_register) { + if (m_gen.instr_set() == InstructionSet::X86) { + for (int i = 16; i-- > 0;) { + if (i != RAX || !exclude_return_register) { + emit(IGen::pop_gpr64(m_gen, i)); + } } + } else if (m_gen.instr_set() == InstructionSet::ARM64) { + for (int i = 31; i-- > 0;) { + if (i != X0 || !exclude_return_register) { + emit(IGen::pop_gpr64(m_gen, i)); + } + } + } else { + throw std::runtime_error("CodeTester::emit_pop_all_gprs unhandled instruction set"); } } @@ -78,34 +94,62 @@ void CodeTester::emit_pop_all_gprs(bool exclude_rax) { * Push all GPRs onto the stack. Optionally exclude RAX. * Pushes RSP always, which is weird, but doesn't cause issues. 
*/ -void CodeTester::emit_push_all_gprs(bool exclude_rax) { - for (int i = 0; i < 16; i++) { - if (i != RAX || !exclude_rax) { - emit(IGen::push_gpr64(i)); +void CodeTester::emit_push_all_gprs(bool exclude_return_register) { + if (m_gen.instr_set() == InstructionSet::X86) { + for (int i = 0; i < 16; i++) { + if (i != RAX || !exclude_return_register) { + emit(IGen::push_gpr64(m_gen, i)); + } } + } else if (m_gen.instr_set() == InstructionSet::ARM64) { + for (int i = 0; i < 31; i++) { + if (i != X0 || !exclude_return_register) { + emit(IGen::push_gpr64(m_gen, i)); + } + } + } else { + throw std::runtime_error("CodeTester::emit_push_all_gprs unhandled instruction set"); } } /*! * Push all xmm registers (all 128-bits) to the stack. */ -void CodeTester::emit_push_all_xmms() { - emit(IGen::sub_gpr64_imm8s(RSP, 8)); - for (int i = 0; i < 16; i++) { - emit(IGen::sub_gpr64_imm8s(RSP, 16)); - emit(IGen::store128_gpr64_xmm128(RSP, XMM0 + i)); +void CodeTester::emit_push_all_simd() { + if (m_gen.instr_set() == InstructionSet::X86) { + emit(IGen::sub_gpr64_imm8s(m_gen, RSP, 8)); + for (int i = 0; i < 16; i++) { + emit(IGen::sub_gpr64_imm8s(m_gen, RSP, 16)); + emit(IGen::store128_gpr64_simd128(m_gen, RSP, XMM0 + i)); + } + } else if (m_gen.instr_set() == InstructionSet::ARM64) { + for (int i = 0; i < 16; i++) { + emit(IGen::sub_gpr64_imm8s(m_gen, SP, 16)); + emit(IGen::store128_gpr64_simd128(m_gen, SP, Q0 + i)); + } + } else { + throw std::runtime_error("CodeTester::emit_push_all_simd unhandled instruction set"); } } /*! 
* Pop all xmm registers (all 128-bits) from the stack */ -void CodeTester::emit_pop_all_xmms() { - for (int i = 0; i < 16; i++) { - emit(IGen::load128_xmm128_gpr64(XMM0 + i, RSP)); - emit(IGen::add_gpr64_imm8s(RSP, 16)); +void CodeTester::emit_pop_all_simd() { + if (m_gen.instr_set() == InstructionSet::X86) { + for (int i = 0; i < 16; i++) { + emit(IGen::load128_simd128_gpr64(m_gen, XMM0 + i, RSP)); + emit(IGen::add_gpr64_imm8s(m_gen, RSP, 16)); + } + emit(IGen::add_gpr64_imm8s(m_gen, RSP, 8)); + } else if (m_gen.instr_set() == InstructionSet::ARM64) { + for (int i = 0; i < 16; i++) { + emit(IGen::load128_simd128_gpr64(m_gen, Q0 + i, SP)); + emit(IGen::add_gpr64_imm8s(m_gen, SP, 16)); + } + } else { + throw std::runtime_error("CodeTester::emit_pop_all_simd unhandled instruction set"); } - emit(IGen::add_gpr64_imm8s(RSP, 8)); } /*! @@ -119,8 +163,23 @@ void CodeTester::clear() { * Execute the buffered code with no arguments, return the value of RAX. */ u64 CodeTester::execute() { +#if defined(__aarch64__) + // allegedly needed because ARM requires flushing after writing new instructions + // on x86 it does nothing + __builtin___clear_cache((char*)code_buffer, (char*)code_buffer + code_buffer_size); +#endif // clang-format off +#if defined(__APPLE__) && defined(__aarch64__) + // TODO - we may need to switch to using pthread_jit_write_protect_np + // there may also be issues if multiple threads are involved + // but this seems to work so keep it simple until something proves otherwise.
+ mprotect(code_buffer, code_buffer_capacity, PROT_EXEC | PROT_READ); + auto ret = ((u64(*)())code_buffer)(); + mprotect(code_buffer, code_buffer_capacity, PROT_WRITE | PROT_READ); + return ret; +#else return ((u64(*)())code_buffer)(); +#endif // clang-format on } @@ -130,7 +189,14 @@ u64 CodeTester::execute() { */ u64 CodeTester::execute(u64 in0, u64 in1, u64 in2, u64 in3) { // clang-format off +#if defined(__APPLE__) && defined(__aarch64__) + mprotect(code_buffer, code_buffer_capacity, PROT_EXEC | PROT_READ); + auto ret = ((u64(*)(u64, u64, u64, u64))code_buffer)(in0, in1, in2, in3); + mprotect(code_buffer, code_buffer_capacity, PROT_WRITE | PROT_READ); + return ret; +#else return ((u64(*)(u64, u64, u64, u64))code_buffer)(in0, in1, in2, in3); +#endif // clang-format on } @@ -138,8 +204,20 @@ u64 CodeTester::execute(u64 in0, u64 in1, u64 in2, u64 in3) { * Allocate a code buffer of the given size. */ void CodeTester::init_code_buffer(int capacity) { +// TODO Apple Silicon - You cannot make a page be RWX, +// or more specifically it can't be both writable and executable at the same time +// +// https://github.com/zherczeg/sljit/issues/99 +// +// The solution to this is to flip-flop between permissions, or perhaps have two threads +// one that has writing permission, and another with executable permission +#if defined(__APPLE__) && defined(__aarch64__) + code_buffer = (u8*)mmap(nullptr, capacity, PROT_WRITE | PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_JIT, 0, 0); +#else code_buffer = (u8*)mmap(nullptr, capacity, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); +#endif if (code_buffer == (u8*)(-1)) { ASSERT_MSG(false, "[CodeTester] Failed to map memory!"); } diff --git a/goalc/emitter/CodeTester.h b/goalc/emitter/CodeTester.h index a25abef5f0..7db9c3d716 100644 --- a/goalc/emitter/CodeTester.h +++ b/goalc/emitter/CodeTester.h @@ -8,9 +8,6 @@ * The CodeTester can't be used for tests requiring the full GOAL language/linking. 
*/ -#ifndef JAK_CODETESTER_H -#define JAK_CODETESTER_H - #include #include #include @@ -20,16 +17,28 @@ #include "common/common_types.h" +#include "goalc/emitter/InstructionSet.h" +#include "goalc/emitter/ObjectGenerator.h" + namespace emitter { class CodeTester { + private: + int code_buffer_size = 0; + int code_buffer_capacity = 0; + u8* code_buffer = nullptr; + RegisterInfo m_info; + ObjectGenerator m_gen; + public: CodeTester(); + CodeTester(InstructionSet instruction_set); std::string dump_to_hex_string(bool nospace = false); + ObjectGenerator generator() const { return m_gen; } void init_code_buffer(int capacity); void emit_push_all_gprs(bool exclude_rax = false); void emit_pop_all_gprs(bool exclude_rax = false); - void emit_push_all_xmms(); - void emit_pop_all_xmms(); + void emit_push_all_simd(); + void emit_pop_all_simd(); void emit_return(); void emit(const Instruction& instr); u64 execute(); @@ -64,6 +73,7 @@ class CodeTester { * Should allow emitter tests which run code to do the right thing on windows. */ Register get_c_abi_arg_reg(int i) { + // TODO ARM64 - x86 specific #ifdef _WIN32 switch (i) { case 0: @@ -128,12 +138,5 @@ class CodeTester { void clear(); ~CodeTester(); - - private: - int code_buffer_size = 0; - int code_buffer_capacity = 0; - u8* code_buffer = nullptr; - RegisterInfo m_info; }; } // namespace emitter -#endif // JAK_CODETESTER_H diff --git a/goalc/emitter/IGen.cpp b/goalc/emitter/IGen.cpp new file mode 100644 index 0000000000..a621ce6b9c --- /dev/null +++ b/goalc/emitter/IGen.cpp @@ -0,0 +1,1089 @@ +#include "IGen.h" + +#include "IGenARM64.h" +#include "IGenX86.h" +#include "goalc/emitter/ObjectGenerator.h" + +#define IGEN_DISPATCH(name, ...) 
\ + switch (gen.instr_set()) { \ + case InstructionSet::X86: \ + return X86::name(__VA_ARGS__); \ + case InstructionSet::ARM64: \ + return ARM64::name(__VA_ARGS__); \ + } + +namespace emitter { +namespace IGen { + +Instruction mov_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(mov_gpr64_gpr64, dst, src); +} + +Instruction mov_gpr64_u64(const ObjectGenerator& gen, Register dst, uint64_t val) { + IGEN_DISPATCH(mov_gpr64_u64, dst, val); +} + +Instruction mov_gpr64_u32(const ObjectGenerator& gen, Register dst, uint64_t val) { + IGEN_DISPATCH(mov_gpr64_u32, dst, val); +} + +Instruction mov_gpr64_s32(const ObjectGenerator& gen, Register dst, int64_t val) { + IGEN_DISPATCH(mov_gpr64_s32, dst, val); +} + +Instruction movd_gpr32_xmm32(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(movd_gpr32_xmm32, dst, src); +} + +Instruction movd_xmm32_gpr32(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(movd_xmm32_gpr32, dst, src); +} + +Instruction movq_gpr64_xmm64(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(movq_gpr64_xmm64, dst, src); +} + +Instruction movq_xmm64_gpr64(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(movq_xmm64_gpr64, dst, src); +} + +Instruction mov_xmm32_xmm32(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(mov_xmm32_xmm32, dst, src); +} + +Instruction load8s_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2) { + IGEN_DISPATCH(load8s_gpr64_gpr64_plus_gpr64, dst, addr1, addr2); +} + +Instruction store8_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value) { + IGEN_DISPATCH(store8_gpr64_gpr64_plus_gpr64, addr1, addr2, value); +} + +Instruction load8s_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + 
IGEN_DISPATCH(load8s_gpr64_gpr64_plus_gpr64_plus_s8, dst, addr1, addr2, offset); +} + +Instruction store8_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset) { + IGEN_DISPATCH(store8_gpr64_gpr64_plus_gpr64_plus_s8, addr1, addr2, value, offset); +} + +Instruction load8s_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load8s_gpr64_gpr64_plus_gpr64_plus_s32, dst, addr1, addr2, offset); +} + +Instruction store8_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset) { + IGEN_DISPATCH(store8_gpr64_gpr64_plus_gpr64_plus_s32, addr1, addr2, value, offset); +} + +Instruction load8u_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2) { + IGEN_DISPATCH(load8u_gpr64_gpr64_plus_gpr64, dst, addr1, addr2); +} + +Instruction load8u_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load8u_gpr64_gpr64_plus_gpr64_plus_s8, dst, addr1, addr2, offset); +} + +Instruction load8u_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load8u_gpr64_gpr64_plus_gpr64_plus_s32, dst, addr1, addr2, offset); +} + +Instruction load16s_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2) { + IGEN_DISPATCH(load16s_gpr64_gpr64_plus_gpr64, dst, addr1, addr2); +} + +Instruction store16_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value) { + IGEN_DISPATCH(store16_gpr64_gpr64_plus_gpr64, addr1, addr2, value); +} + +Instruction store16_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + 
Register value, + s64 offset) { + IGEN_DISPATCH(store16_gpr64_gpr64_plus_gpr64_plus_s8, addr1, addr2, value, offset); +} + +Instruction store16_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset) { + IGEN_DISPATCH(store16_gpr64_gpr64_plus_gpr64_plus_s32, addr1, addr2, value, offset); +} + +Instruction load16s_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load16s_gpr64_gpr64_plus_gpr64_plus_s8, dst, addr1, addr2, offset); +} + +Instruction load16s_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load16s_gpr64_gpr64_plus_gpr64_plus_s32, dst, addr1, addr2, offset); +} + +Instruction load16u_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2) { + IGEN_DISPATCH(load16u_gpr64_gpr64_plus_gpr64, dst, addr1, addr2); +} + +Instruction load16u_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load16u_gpr64_gpr64_plus_gpr64_plus_s8, dst, addr1, addr2, offset); +} + +Instruction load16u_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load16u_gpr64_gpr64_plus_gpr64_plus_s32, dst, addr1, addr2, offset); +} + +Instruction load32s_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2) { + IGEN_DISPATCH(load32s_gpr64_gpr64_plus_gpr64, dst, addr1, addr2); +} + +Instruction store32_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value) { + IGEN_DISPATCH(store32_gpr64_gpr64_plus_gpr64, addr1, addr2, value); +} + +Instruction load32s_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, 
+ Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load32s_gpr64_gpr64_plus_gpr64_plus_s8, dst, addr1, addr2, offset); +} + +Instruction store32_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset) { + IGEN_DISPATCH(store32_gpr64_gpr64_plus_gpr64_plus_s8, addr1, addr2, value, offset); +} + +Instruction load32s_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load32s_gpr64_gpr64_plus_gpr64_plus_s32, dst, addr1, addr2, offset); +} + +Instruction store32_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset) { + IGEN_DISPATCH(store32_gpr64_gpr64_plus_gpr64_plus_s32, addr1, addr2, value, offset); +} + +Instruction load32u_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2) { + IGEN_DISPATCH(load32u_gpr64_gpr64_plus_gpr64, dst, addr1, addr2); +} + +Instruction load32u_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load32u_gpr64_gpr64_plus_gpr64_plus_s8, dst, addr1, addr2, offset); +} + +Instruction load32u_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load32u_gpr64_gpr64_plus_gpr64_plus_s32, dst, addr1, addr2, offset); +} + +Instruction load64_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2) { + IGEN_DISPATCH(load64_gpr64_gpr64_plus_gpr64, dst, addr1, addr2); +} + +Instruction store64_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value) { + IGEN_DISPATCH(store64_gpr64_gpr64_plus_gpr64, addr1, addr2, value); +} + +Instruction 
load64_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load64_gpr64_gpr64_plus_gpr64_plus_s8, dst, addr1, addr2, offset); +} + +Instruction store64_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset) { + IGEN_DISPATCH(store64_gpr64_gpr64_plus_gpr64_plus_s8, addr1, addr2, value, offset); +} + +Instruction load64_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load64_gpr64_gpr64_plus_gpr64_plus_s32, dst, addr1, addr2, offset); +} + +Instruction store64_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset) { + IGEN_DISPATCH(store64_gpr64_gpr64_plus_gpr64_plus_s32, addr1, addr2, value, offset); +} + +Instruction store_goal_vf(const ObjectGenerator& gen, + Register addr, + Register value, + Register off, + s64 offset) { + IGEN_DISPATCH(store_goal_vf, addr, value, off, offset); +} + +Instruction store_goal_gpr(const ObjectGenerator& gen, + Register addr, + Register value, + Register off, + int offset, + int size) { + IGEN_DISPATCH(store_goal_gpr, addr, value, off, offset, size); +} + +Instruction load_goal_xmm128(const ObjectGenerator& gen, + Register dst, + Register addr, + Register off, + int offset) { + IGEN_DISPATCH(load_goal_xmm128, dst, addr, off, offset); +} + +Instruction load_goal_gpr(const ObjectGenerator& gen, + Register dst, + Register addr, + Register off, + int offset, + int size, + bool sign_extend) { + IGEN_DISPATCH(load_goal_gpr, dst, addr, off, offset, size, sign_extend); +} + +Instruction store32_xmm32_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register xmm_value) { + IGEN_DISPATCH(store32_xmm32_gpr64_plus_gpr64, addr1, addr2, xmm_value); +} + +Instruction 
load32_xmm32_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register simd_dest, + Register addr1, + Register addr2) { + IGEN_DISPATCH(load32_xmm32_gpr64_plus_gpr64, simd_dest, addr1, addr2); +} + +Instruction store32_xmm32_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register xmm_value, + s64 offset) { + IGEN_DISPATCH(store32_xmm32_gpr64_plus_gpr64_plus_s8, addr1, addr2, xmm_value, offset); +} + +Instruction load32_xmm32_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register simd_dest, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(load32_xmm32_gpr64_plus_gpr64_plus_s8, simd_dest, addr1, addr2, offset); +} + +Instruction store32_xmm32_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register xmm_value, + s64 offset) { + IGEN_DISPATCH(store32_xmm32_gpr64_plus_gpr64_plus_s32, addr1, addr2, xmm_value, offset); +} + +Instruction lea_reg_plus_off32(const ObjectGenerator& gen, + Register dest, + Register base, + s64 offset) { + IGEN_DISPATCH(lea_reg_plus_off32, dest, base, offset); +} + +Instruction lea_reg_plus_off8(const ObjectGenerator& gen, + Register dest, + Register base, + s64 offset) { + IGEN_DISPATCH(lea_reg_plus_off8, dest, base, offset); +} + +Instruction lea_reg_plus_off(const ObjectGenerator& gen, Register dest, Register base, s64 offset) { + IGEN_DISPATCH(lea_reg_plus_off, dest, base, offset); +} + +Instruction store32_xmm32_gpr64_plus_s32(const ObjectGenerator& gen, + Register base, + Register xmm_value, + s64 offset) { + IGEN_DISPATCH(store32_xmm32_gpr64_plus_s32, base, xmm_value, offset); +} + +Instruction store32_xmm32_gpr64_plus_s8(const ObjectGenerator& gen, + Register base, + Register xmm_value, + s64 offset) { + IGEN_DISPATCH(store32_xmm32_gpr64_plus_s8, base, xmm_value, offset); +} + +Instruction load32_xmm32_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register simd_dest, + Register addr1, + Register addr2, + s64 
offset) { + IGEN_DISPATCH(load32_xmm32_gpr64_plus_gpr64_plus_s32, simd_dest, addr1, addr2, offset); +} + +Instruction load32_xmm32_gpr64_plus_s32(const ObjectGenerator& gen, + Register simd_dest, + Register base, + s64 offset) { + IGEN_DISPATCH(load32_xmm32_gpr64_plus_s32, simd_dest, base, offset); +} + +Instruction load32_xmm32_gpr64_plus_s8(const ObjectGenerator& gen, + Register simd_dest, + Register base, + s64 offset) { + IGEN_DISPATCH(load32_xmm32_gpr64_plus_s8, simd_dest, base, offset); +} + +Instruction load_goal_xmm32(const ObjectGenerator& gen, + Register simd_dest, + Register addr, + Register off, + s64 offset) { + IGEN_DISPATCH(load_goal_xmm32, simd_dest, addr, off, offset); +} + +Instruction store_goal_xmm32(const ObjectGenerator& gen, + Register addr, + Register xmm_value, + Register off, + s64 offset) { + IGEN_DISPATCH(store_goal_xmm32, addr, xmm_value, off, offset); +} + +Instruction store_reg_offset_xmm32(const ObjectGenerator& gen, + Register base, + Register xmm_value, + s64 offset) { + IGEN_DISPATCH(store_reg_offset_xmm32, base, xmm_value, offset); +} + +Instruction load_reg_offset_xmm32(const ObjectGenerator& gen, + Register simd_dest, + Register base, + s64 offset) { + IGEN_DISPATCH(load_reg_offset_xmm32, simd_dest, base, offset); +} + +Instruction store128_gpr64_simd128(const ObjectGenerator& gen, + Register gpr_addr, + Register xmm_value) { + IGEN_DISPATCH(store128_gpr64_simd128, gpr_addr, xmm_value); +} + +Instruction store128_gpr64_simd128_s32(const ObjectGenerator& gen, + Register gpr_addr, + Register xmm_value, + s64 offset) { + IGEN_DISPATCH(store128_gpr64_simd128_s32, gpr_addr, xmm_value, offset); +} + +Instruction store128_gpr64_simd128_s8(const ObjectGenerator& gen, + Register gpr_addr, + Register xmm_value, + s64 offset) { + IGEN_DISPATCH(store128_gpr64_simd128_s8, gpr_addr, xmm_value, offset); +} + +Instruction load128_simd128_gpr64(const ObjectGenerator& gen, + Register simd_dest, + Register gpr_addr) { + 
IGEN_DISPATCH(load128_simd128_gpr64, simd_dest, gpr_addr); +} + +Instruction load128_simd128_gpr64_s32(const ObjectGenerator& gen, + Register simd_dest, + Register gpr_addr, + s64 offset) { + IGEN_DISPATCH(load128_simd128_gpr64_s32, simd_dest, gpr_addr, offset); +} + +Instruction load128_simd128_gpr64_s8(const ObjectGenerator& gen, + Register simd_dest, + Register gpr_addr, + s64 offset) { + IGEN_DISPATCH(load128_simd128_gpr64_s8, simd_dest, gpr_addr, offset); +} + +Instruction load128_xmm128_reg_offset(const ObjectGenerator& gen, + Register simd_dest, + Register base, + s64 offset) { + IGEN_DISPATCH(load128_xmm128_reg_offset, simd_dest, base, offset); +} + +Instruction store128_xmm128_reg_offset(const ObjectGenerator& gen, + Register base, + Register xmm_val, + s64 offset) { + IGEN_DISPATCH(store128_xmm128_reg_offset, base, xmm_val, offset); +} + +Instruction load64_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset) { + IGEN_DISPATCH(load64_rip_s32, dest, offset); +} + +Instruction load32s_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset) { + IGEN_DISPATCH(load32s_rip_s32, dest, offset); +} + +Instruction load32u_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset) { + IGEN_DISPATCH(load32u_rip_s32, dest, offset); +} + +Instruction load16u_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset) { + IGEN_DISPATCH(load16u_rip_s32, dest, offset); +} + +Instruction load16s_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset) { + IGEN_DISPATCH(load16s_rip_s32, dest, offset); +} + +Instruction load8u_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset) { + IGEN_DISPATCH(load8u_rip_s32, dest, offset); +} + +Instruction load8s_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset) { + IGEN_DISPATCH(load8s_rip_s32, dest, offset); +} + +Instruction static_load(const ObjectGenerator& gen, + Register dest, + s64 offset, + int size, + bool sign_extend) { + IGEN_DISPATCH(static_load, dest, offset, size, 
sign_extend); +} + +Instruction store64_rip_s32(const ObjectGenerator& gen, Register src, s64 offset) { + IGEN_DISPATCH(store64_rip_s32, src, offset); +} + +Instruction store32_rip_s32(const ObjectGenerator& gen, Register src, s64 offset) { + IGEN_DISPATCH(store32_rip_s32, src, offset); +} + +Instruction store16_rip_s32(const ObjectGenerator& gen, Register src, s64 offset) { + IGEN_DISPATCH(store16_rip_s32, src, offset); +} + +Instruction store8_rip_s32(const ObjectGenerator& gen, Register src, s64 offset) { + IGEN_DISPATCH(store8_rip_s32, src, offset); +} + +Instruction static_store(const ObjectGenerator& gen, Register value, s64 offset, int size) { + IGEN_DISPATCH(static_store, value, offset, size); +} + +Instruction static_addr(const ObjectGenerator& gen, Register dst, s64 offset) { + IGEN_DISPATCH(static_addr, dst, offset); +} + +Instruction static_load_xmm32(const ObjectGenerator& gen, Register simd_dest, s64 offset) { + IGEN_DISPATCH(static_load_xmm32, simd_dest, offset); +} + +Instruction static_store_xmm32(const ObjectGenerator& gen, Register xmm_value, s64 offset) { + IGEN_DISPATCH(static_store_xmm32, xmm_value, offset); +} + +Instruction load64_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst_reg, + int32_t offset, + Register src_reg) { + IGEN_DISPATCH(load64_gpr64_plus_s32, dst_reg, offset, src_reg); +} + +Instruction store64_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr, + int32_t offset, + Register value) { + IGEN_DISPATCH(store64_gpr64_plus_s32, addr, offset, value); +} + +Instruction ret(const ObjectGenerator& gen) { + IGEN_DISPATCH(ret); +} + +Instruction push_gpr64(const ObjectGenerator& gen, Register reg) { + IGEN_DISPATCH(push_gpr64, reg); +} + +Instruction pop_gpr64(const ObjectGenerator& gen, Register reg) { + IGEN_DISPATCH(pop_gpr64, reg); +} + +Instruction call_r64(const ObjectGenerator& gen, Register reg_) { + IGEN_DISPATCH(call_r64, reg_); +} + +Instruction jmp_r64(const ObjectGenerator& gen, Register reg_) { + 
IGEN_DISPATCH(jmp_r64, reg_); +} + +Instruction sub_gpr64_imm8s(const ObjectGenerator& gen, Register reg, int64_t imm) { + IGEN_DISPATCH(sub_gpr64_imm8s, reg, imm); +} + +Instruction sub_gpr64_imm32s(const ObjectGenerator& gen, Register reg, int64_t imm) { + IGEN_DISPATCH(sub_gpr64_imm32s, reg, imm); +} + +Instruction add_gpr64_imm8s(const ObjectGenerator& gen, Register reg, int64_t v) { + IGEN_DISPATCH(add_gpr64_imm8s, reg, v); +} + +Instruction add_gpr64_imm32s(const ObjectGenerator& gen, Register reg, int64_t v) { + IGEN_DISPATCH(add_gpr64_imm32s, reg, v); +} + +Instruction add_gpr64_imm(const ObjectGenerator& gen, Register reg, int64_t imm) { + IGEN_DISPATCH(add_gpr64_imm, reg, imm); +} + +Instruction sub_gpr64_imm(const ObjectGenerator& gen, Register reg, int64_t imm) { + IGEN_DISPATCH(sub_gpr64_imm, reg, imm); +} + +Instruction add_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(add_gpr64_gpr64, dst, src); +} + +Instruction sub_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(sub_gpr64_gpr64, dst, src); +} + +Instruction imul_gpr32_gpr32(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(imul_gpr32_gpr32, dst, src); +} + +Instruction imul_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(imul_gpr64_gpr64, dst, src); +} + +Instruction idiv_gpr32(const ObjectGenerator& gen, Register reg) { + IGEN_DISPATCH(idiv_gpr32, reg); +} + +Instruction unsigned_div_gpr32(const ObjectGenerator& gen, Register reg) { + IGEN_DISPATCH(unsigned_div_gpr32, reg); +} + +Instruction cdq(const ObjectGenerator& gen) { + IGEN_DISPATCH(cdq); +} + +Instruction movsx_r64_r32(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(movsx_r64_r32, dst, src); +} + +Instruction cmp_gpr64_gpr64(const ObjectGenerator& gen, Register a, Register b) { + IGEN_DISPATCH(cmp_gpr64_gpr64, a, b); +} + +Instruction or_gpr64_gpr64(const ObjectGenerator& 
gen, Register dst, Register src) { + IGEN_DISPATCH(or_gpr64_gpr64, dst, src); +} + +Instruction and_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(and_gpr64_gpr64, dst, src); +} + +Instruction xor_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(xor_gpr64_gpr64, dst, src); +} + +Instruction not_gpr64(const ObjectGenerator& gen, Register reg) { + IGEN_DISPATCH(not_gpr64, reg); +} + +Instruction shl_gpr64_cl(const ObjectGenerator& gen, Register reg) { + IGEN_DISPATCH(shl_gpr64_cl, reg); +} + +Instruction shr_gpr64_cl(const ObjectGenerator& gen, Register reg) { + IGEN_DISPATCH(shr_gpr64_cl, reg); +} + +Instruction sar_gpr64_cl(const ObjectGenerator& gen, Register reg) { + IGEN_DISPATCH(sar_gpr64_cl, reg); +} + +Instruction shl_gpr64_u8(const ObjectGenerator& gen, Register reg, uint8_t sa) { + IGEN_DISPATCH(shl_gpr64_u8, reg, sa); +} + +Instruction shr_gpr64_u8(const ObjectGenerator& gen, Register reg, uint8_t sa) { + IGEN_DISPATCH(shr_gpr64_u8, reg, sa); +} + +Instruction sar_gpr64_u8(const ObjectGenerator& gen, Register reg, uint8_t sa) { + IGEN_DISPATCH(sar_gpr64_u8, reg, sa); +} + +Instruction jmp_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(jmp_32); +} + +Instruction je_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(je_32); +} + +Instruction jne_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(jne_32); +} + +Instruction jle_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(jle_32); +} + +Instruction jge_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(jge_32); +} + +Instruction jl_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(jl_32); +} + +Instruction jg_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(jg_32); +} + +Instruction jbe_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(jbe_32); +} + +Instruction jae_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(jae_32); +} + +Instruction jb_32(const ObjectGenerator& gen) { + IGEN_DISPATCH(jb_32); +} + +Instruction ja_32(const 
ObjectGenerator& gen) { + IGEN_DISPATCH(ja_32); +} + +Instruction cmp_flt_flt(const ObjectGenerator& gen, Register a, Register b) { + IGEN_DISPATCH(cmp_flt_flt, a, b); +} + +Instruction sqrts_xmm(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(sqrts_xmm, dst, src); +} + +Instruction mulss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(mulss_xmm_xmm, dst, src); +} + +Instruction divss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(divss_xmm_xmm, dst, src); +} + +Instruction subss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(subss_xmm_xmm, dst, src); +} + +Instruction addss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(addss_xmm_xmm, dst, src); +} + +Instruction minss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(minss_xmm_xmm, dst, src); +} + +Instruction maxss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(maxss_xmm_xmm, dst, src); +} + +Instruction int32_to_float(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(int32_to_float, dst, src); +} + +Instruction float_to_int32(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(float_to_int32, dst, src); +} + +Instruction nop(const ObjectGenerator& gen) { + IGEN_DISPATCH(nop); +} + +Instruction null(const ObjectGenerator& gen) { + IGEN_DISPATCH(null); +} + +Instruction nop_vf(const ObjectGenerator& gen) { + IGEN_DISPATCH(nop_vf); +} + +Instruction wait_vf(const ObjectGenerator& gen) { + IGEN_DISPATCH(wait_vf); +} + +Instruction mov_vf_vf(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(mov_vf_vf, dst, src); +} + +Instruction loadvf_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2) { + IGEN_DISPATCH(loadvf_gpr64_plus_gpr64, dst, addr1, addr2); +} + +Instruction 
loadvf_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(loadvf_gpr64_plus_gpr64_plus_s8, dst, addr1, addr2, offset); +} + +Instruction loadvf_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(loadvf_gpr64_plus_gpr64_plus_s32, dst, addr1, addr2, offset); +} + +Instruction storevf_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register value, + Register addr1, + Register addr2) { + IGEN_DISPATCH(storevf_gpr64_plus_gpr64, value, addr1, addr2); +} + +Instruction storevf_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register value, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(storevf_gpr64_plus_gpr64_plus_s8, value, addr1, addr2, offset); +} + +Instruction storevf_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register value, + Register addr1, + Register addr2, + s64 offset) { + IGEN_DISPATCH(storevf_gpr64_plus_gpr64_plus_s32, value, addr1, addr2, offset); +} + +Instruction loadvf_rip_plus_s32(const ObjectGenerator& gen, Register dest, s64 offset) { + IGEN_DISPATCH(loadvf_rip_plus_s32, dest, offset); +} + +Instruction blend_vf(const ObjectGenerator& gen, + Register dst, + Register src1, + Register src2, + u8 mask) { + IGEN_DISPATCH(blend_vf, dst, src1, src2, mask); +} + +Instruction +shuffle_vf(const ObjectGenerator& gen, Register dst, Register src, u8 dx, u8 dy, u8 dz, u8 dw) { + IGEN_DISPATCH(shuffle_vf, dst, src, dx, dy, dz, dw); +} + +Instruction swizzle_vf(const ObjectGenerator& gen, Register dst, Register src, u8 controlBytes) { + IGEN_DISPATCH(swizzle_vf, dst, src, controlBytes); +} + +Instruction splat_vf(const ObjectGenerator& gen, + Register dst, + Register src, + Register::VF_ELEMENT element) { + IGEN_DISPATCH(splat_vf, dst, src, element); +} + +Instruction xor_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2) { + 
IGEN_DISPATCH(xor_vf, dst, src1, src2); +} + +Instruction sub_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2) { + IGEN_DISPATCH(sub_vf, dst, src1, src2); +} + +Instruction add_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2) { + IGEN_DISPATCH(add_vf, dst, src1, src2); +} + +Instruction mul_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2) { + IGEN_DISPATCH(mul_vf, dst, src1, src2); +} + +Instruction max_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2) { + IGEN_DISPATCH(max_vf, dst, src1, src2); +} + +Instruction min_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2) { + IGEN_DISPATCH(min_vf, dst, src1, src2); +} + +Instruction div_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2) { + IGEN_DISPATCH(div_vf, dst, src1, src2); +} + +Instruction sqrt_vf(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(sqrt_vf, dst, src); +} + +Instruction itof_vf(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(itof_vf, dst, src); +} + +Instruction ftoi_vf(const ObjectGenerator& gen, Register dst, Register src) { + IGEN_DISPATCH(ftoi_vf, dst, src); +} + +Instruction pw_sra(const ObjectGenerator& gen, Register dst, Register src, u8 imm) { + IGEN_DISPATCH(pw_sra, dst, src, imm); +} + +Instruction pw_srl(const ObjectGenerator& gen, Register dst, Register src, u8 imm) { + IGEN_DISPATCH(pw_srl, dst, src, imm); +} + +Instruction ph_srl(const ObjectGenerator& gen, Register dst, Register src, u8 imm) { + IGEN_DISPATCH(ph_srl, dst, src, imm); +} + +Instruction pw_sll(const ObjectGenerator& gen, Register dst, Register src, u8 imm) { + IGEN_DISPATCH(pw_sll, dst, src, imm); +} + +Instruction ph_sll(const ObjectGenerator& gen, Register dst, Register src, u8 imm) { + IGEN_DISPATCH(ph_sll, dst, src, imm); +} + +Instruction parallel_add_byte(const ObjectGenerator& gen, + Register dst, + Register src0, + 
Register src1) { + IGEN_DISPATCH(parallel_add_byte, dst, src0, src1); +} + +Instruction parallel_bitwise_or(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1) { + IGEN_DISPATCH(parallel_bitwise_or, dst, src0, src1); +} + +Instruction parallel_bitwise_xor(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1) { + IGEN_DISPATCH(parallel_bitwise_xor, dst, src0, src1); +} + +Instruction parallel_bitwise_and(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1) { + IGEN_DISPATCH(parallel_bitwise_and, dst, src0, src1); +} + +Instruction pextub_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(pextub_swapped, dst, src0, src1); +} + +Instruction pextuh_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(pextuh_swapped, dst, src0, src1); +} + +Instruction pextuw_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(pextuw_swapped, dst, src0, src1); +} + +Instruction pextlb_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(pextlb_swapped, dst, src0, src1); +} + +Instruction pextlh_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(pextlh_swapped, dst, src0, src1); +} + +Instruction pextlw_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(pextlw_swapped, dst, src0, src1); +} + +Instruction parallel_compare_e_b(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1) { + IGEN_DISPATCH(parallel_compare_e_b, dst, src0, src1); +} + +Instruction parallel_compare_e_h(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1) { + IGEN_DISPATCH(parallel_compare_e_h, dst, src0, src1); +} + +Instruction parallel_compare_e_w(const ObjectGenerator& gen, + Register dst, + Register src0, + Register 
src1) { + IGEN_DISPATCH(parallel_compare_e_w, dst, src0, src1); +} + +Instruction parallel_compare_gt_b(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1) { + IGEN_DISPATCH(parallel_compare_gt_b, dst, src0, src1); +} + +Instruction parallel_compare_gt_h(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1) { + IGEN_DISPATCH(parallel_compare_gt_h, dst, src0, src1); +} + +Instruction parallel_compare_gt_w(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1) { + IGEN_DISPATCH(parallel_compare_gt_w, dst, src0, src1); +} + +Instruction vpunpcklqdq(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(vpunpcklqdq, dst, src0, src1); +} + +Instruction pcpyld_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(pcpyld_swapped, dst, src0, src1); +} + +Instruction pcpyud(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(pcpyud, dst, src0, src1); +} + +Instruction vpsubd(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(vpsubd, dst, src0, src1); +} + +Instruction vpsrldq(const ObjectGenerator& gen, Register dst, Register src, u8 imm) { + IGEN_DISPATCH(vpsrldq, dst, src, imm); +} + +Instruction vpslldq(const ObjectGenerator& gen, Register dst, Register src, u8 imm) { + IGEN_DISPATCH(vpslldq, dst, src, imm); +} + +Instruction vpshuflw(const ObjectGenerator& gen, Register dst, Register src, u8 imm) { + IGEN_DISPATCH(vpshuflw, dst, src, imm); +} + +Instruction vpshufhw(const ObjectGenerator& gen, Register dst, Register src, u8 imm) { + IGEN_DISPATCH(vpshufhw, dst, src, imm); +} + +Instruction vpackuswb(const ObjectGenerator& gen, Register dst, Register src0, Register src1) { + IGEN_DISPATCH(vpackuswb, dst, src0, src1); +} + +}; // namespace IGen +}; // namespace emitter diff --git a/goalc/emitter/IGen.h b/goalc/emitter/IGen.h index 
334666c62e..cfb985ea2e 100644 --- a/goalc/emitter/IGen.h +++ b/goalc/emitter/IGen.h @@ -1,2764 +1,991 @@ #pragma once -#include - #include "Instruction.h" #include "Register.h" -#include "common/util/Assert.h" +#include "goalc/emitter/ObjectGenerator.h" namespace emitter { -class IGen { - public: - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // MOVES - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - /*! - * Move data from src to dst. Moves all 64-bits of the GPR. - */ - static Instruction mov_gpr64_gpr64(Register dst, Register src) { - ASSERT(dst.is_gpr()); - ASSERT(src.is_gpr()); - Instruction instr(0x89); - instr.set_modrm_and_rex(src.hw_id(), dst.hw_id(), 3, true); - return instr; - } +namespace IGen { +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// MOVES +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +/*! + * Move data from src to dst. Moves all 64-bits of the GPR. + */ +Instruction mov_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src); - /*! - * Move a 64-bit constant into a register. - */ - static Instruction mov_gpr64_u64(Register dst, uint64_t val) { - ASSERT(dst.is_gpr()); - bool rex_b = false; - auto dst_hw_id = dst.hw_id(); - if (dst_hw_id >= 8) { - dst_hw_id -= 8; - rex_b = true; - } - Instruction instr(0xb8 + dst_hw_id); - instr.set(REX(true, false, false, rex_b)); - instr.set(Imm(8, val)); - return instr; - } +/*! + * Move a 64-bit constant into a register. + */ +Instruction mov_gpr64_u64(const ObjectGenerator& gen, Register dst, uint64_t val); - /*! - * Move a 32-bit constant into a register. Zeros the upper 32 bits. - */ - static Instruction mov_gpr64_u32(Register dst, uint64_t val) { - ASSERT(val <= UINT32_MAX); - ASSERT(dst.is_gpr()); - auto dst_hw_id = dst.hw_id(); - bool rex_b = false; - if (dst_hw_id >= 8) { - dst_hw_id -= 8; - rex_b = true; - } +/*! + * Move a 32-bit constant into a register. Zeros the upper 32 bits. 
+ */ +Instruction mov_gpr64_u32(const ObjectGenerator& gen, Register dst, uint64_t val); - Instruction instr(0xb8 + dst_hw_id); - if (rex_b) { - instr.set(REX(false, false, false, rex_b)); - } - instr.set(Imm(4, val)); - return instr; - } +/*! + * Move a signed 32-bit constant into a register. Sign extends for the upper 32 bits. + * When possible prefer mov_gpr64_u32. (use this only for negative values...) + * This is always bigger than mov_gpr64_u32, but smaller than a mov_gpr_u64. + */ +Instruction mov_gpr64_s32(const ObjectGenerator& gen, Register dst, int64_t val); - /*! - * Move a signed 32-bit constant into a register. Sign extends for the upper 32 bits. - * When possible prefer mov_gpr64_u32. (use this only for negative values...) - * This is always bigger than mov_gpr64_u32, but smaller than a mov_gpr_u64. - */ - static Instruction mov_gpr64_s32(Register dst, int64_t val) { - ASSERT(val >= INT32_MIN && val <= INT32_MAX); - ASSERT(dst.is_gpr()); - Instruction instr(0xc7); - instr.set_modrm_and_rex(0, dst.hw_id(), 3, true); - instr.set(Imm(4, val)); - return instr; - } +/*! + * Move 32-bits of xmm to 32 bits of gpr (no sign extension). + */ +Instruction movd_gpr32_xmm32(const ObjectGenerator& gen, Register dst, Register src); - /*! - * Move 32-bits of xmm to 32 bits of gpr (no sign extension). - */ - static Instruction movd_gpr32_xmm32(Register dst, Register src) { - ASSERT(dst.is_gpr()); - ASSERT(src.is_xmm()); - Instruction instr(0x66); - instr.set_op2(0x0f); - instr.set_op3(0x7e); - instr.set_modrm_and_rex(src.hw_id(), dst.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } +/*! + * Move 32-bits of gpr to 32-bits of xmm (no sign extension) + */ +Instruction movd_xmm32_gpr32(const ObjectGenerator& gen, Register dst, Register src); - /*! 
- * Move 32-bits of gpr to 32-bits of xmm (no sign extension) - */ - static Instruction movd_xmm32_gpr32(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_gpr()); - Instruction instr(0x66); - instr.set_op2(0x0f); - instr.set_op3(0x6e); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } +/*! + * Move 64-bits of xmm to 64 bits of gpr (no sign extension). + */ +Instruction movq_gpr64_xmm64(const ObjectGenerator& gen, Register dst, Register src); - /*! - * Move 64-bits of xmm to 64 bits of gpr (no sign extension). - */ - static Instruction movq_gpr64_xmm64(Register dst, Register src) { - ASSERT(dst.is_gpr()); - ASSERT(src.is_xmm()); - Instruction instr(0x66); - instr.set_op2(0x0f); - instr.set_op3(0x7e); - instr.set_modrm_and_rex(src.hw_id(), dst.hw_id(), 3, true); - instr.swap_op0_rex(); - return instr; - } +/*! + * Move 64-bits of gpr to 64-bits of xmm (no sign extension) + */ +Instruction movq_xmm64_gpr64(const ObjectGenerator& gen, Register dst, Register src); - /*! - * Move 64-bits of gpr to 64-bits of xmm (no sign extension) - */ - static Instruction movq_xmm64_gpr64(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_gpr()); - Instruction instr(0x66); - instr.set_op2(0x0f); - instr.set_op3(0x6e); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, true); - instr.swap_op0_rex(); - return instr; - } +/*! + * Move 32-bits between xmm's + */ +Instruction mov_xmm32_xmm32(const ObjectGenerator& gen, Register dst, Register src); - /*! 
- * Move 32-bits between xmm's - */ - static Instruction mov_xmm32_xmm32(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x10); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } +// todo - GPR64 -> XMM64 (zext) +// todo - XMM -> GPR64 - // todo - GPR64 -> XMM64 (zext) - // todo - XMM -> GPR64 +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// GOAL Loads and Stores +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // GOAL Loads and Stores - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +/*! + * movsx dst, BYTE PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +Instruction load8s_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2); - /*! - * movsx dst, BYTE PTR [addr1 + addr2] - * addr1 and addr2 have to be different registers. - * Cannot use rsp. 
- */ - static Instruction load8s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0xf); - instr.set_op2(0xbe); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), true, - false); - return instr; - } +Instruction store8_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value); - static Instruction store8_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0x88); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(), addr1.hw_id(), addr2.hw_id()); - if (value.id() > RBX) { - instr.add_rex(); - } - return instr; - } +Instruction load8s_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); - static Instruction load8s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0xf); - instr.set_op2(0xbe); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } +Instruction store8_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset); - static Instruction store8_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, - Register addr2, - Register value, - s64 offset) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - 
ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x88); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, false); - if (value.id() > RBX) { - instr.add_rex(); - } - return instr; - } - - static Instruction load8s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf); - instr.set_op2(0xbe); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction store8_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, - Register addr2, - Register value, - s64 offset) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x88); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, false); - if (value.id() > RBX) { - instr.add_rex(); - } - return instr; - } - - /*! - * movzx dst, BYTE PTR [addr1 + addr2] - * addr1 and addr2 have to be different registers. - * Cannot use rsp. 
- */ - static Instruction load8u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0xf); - instr.set_op2(0xb6); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), true, - false); - return instr; - } - - static Instruction load8u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0xf); - instr.set_op2(0xb6); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction load8u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf); - instr.set_op2(0xb6); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - /*! - * movsx dst, WORD PTR [addr1 + addr2] - * addr1 and addr2 have to be different registers. - * Cannot use rsp. 
- */ - static Instruction load16s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0xf); - instr.set_op2(0xbf); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), true, - false); - return instr; - } - - static Instruction store16_gpr64_gpr64_plus_gpr64(Register addr1, - Register addr2, - Register value) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0x66); - instr.set_op2(0x89); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(), addr1.hw_id(), addr2.hw_id()); - instr.swap_op0_rex(); // why????? - return instr; - } - - static Instruction store16_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, - Register addr2, - Register value, - s64 offset) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x66); - instr.set_op2(0x89); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, false); - instr.swap_op0_rex(); // why????? - return instr; - } - - static Instruction store16_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, - Register addr2, - Register value, - s64 offset) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x66); - instr.set_op2(0x89); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, false); - instr.swap_op0_rex(); // why????? 
- return instr; - } - - static Instruction load16s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0xf); - instr.set_op2(0xbf); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction load16s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf); - instr.set_op2(0xbf); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - /*! - * movzx dst, WORD PTR [addr1 + addr2] - * addr1 and addr2 have to be different registers. - * Cannot use rsp. 
- */ - static Instruction load16u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0xf); - instr.set_op2(0xb7); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), true, - false); - return instr; - } - - static Instruction load16u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0xf); - instr.set_op2(0xb7); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction load16u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf); - instr.set_op2(0xb7); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - /*! - * movsxd dst, DWORD PTR [addr1 + addr2] - * addr1 and addr2 have to be different registers. - * Cannot use rsp. 
- */ - static Instruction load32s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0x63); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), true); - return instr; - } - - static Instruction store32_gpr64_gpr64_plus_gpr64(Register addr1, - Register addr2, - Register value) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0x89); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(), addr1.hw_id(), addr2.hw_id()); - return instr; - } - - static Instruction load32s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x63); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction store32_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, - Register addr2, - Register value, - s64 offset) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x89); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, false); - return instr; - } - - static Instruction load32s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - 
ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x63); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction store32_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, - Register addr2, - Register value, - s64 offset) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x89); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, false); - return instr; - } - - /*! - * movzxd dst, DWORD PTR [addr1 + addr2] - * addr1 and addr2 have to be different registers. - * Cannot use rsp. - */ - static Instruction load32u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0x8b); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(), addr1.hw_id(), addr2.hw_id()); - return instr; - } - - static Instruction load32u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x8b); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, false); - return instr; - } - - static Instruction load32u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - 
ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x8b); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, false); - return instr; - } - - /*! - * mov dst, QWORD PTR [addr1 + addr2] - * addr1 and addr2 have to be different registers. - * Cannot use rsp. - */ - static Instruction load64_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0x8b); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), true); - return instr; - } - - static Instruction store64_gpr64_gpr64_plus_gpr64(Register addr1, - Register addr2, - Register value) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0x89); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - true); - return instr; - } - - static Instruction load64_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x8b); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction store64_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, - Register addr2, - Register value, - s64 offset) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset 
>= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x89); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction load64_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x8b); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction store64_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, - Register addr2, - Register value, - s64 offset) { - ASSERT(value.is_gpr()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x89); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, true); - return instr; - } - - static Instruction store_goal_vf(Register addr, Register value, Register off, s64 offset) { - if (offset == 0) { - return storevf_gpr64_plus_gpr64(value, addr, off); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return storevf_gpr64_plus_gpr64_plus_s8(value, addr, off, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return storevf_gpr64_plus_gpr64_plus_s32(value, addr, off, offset); - } - ASSERT(false); - return {0}; - } - - static Instruction store_goal_gpr(Register addr, - Register value, - Register off, - int offset, - int size) { - switch (size) { - case 1: - if (offset == 0) { - return store8_gpr64_gpr64_plus_gpr64(addr, off, value); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return store8_gpr64_gpr64_plus_gpr64_plus_s8(addr, off, value, 
offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return store8_gpr64_gpr64_plus_gpr64_plus_s32(addr, off, value, offset); - } else { - ASSERT(false); - } - case 2: - if (offset == 0) { - return store16_gpr64_gpr64_plus_gpr64(addr, off, value); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return store16_gpr64_gpr64_plus_gpr64_plus_s8(addr, off, value, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return store16_gpr64_gpr64_plus_gpr64_plus_s32(addr, off, value, offset); - } else { - ASSERT(false); - } - case 4: - if (offset == 0) { - return store32_gpr64_gpr64_plus_gpr64(addr, off, value); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return store32_gpr64_gpr64_plus_gpr64_plus_s8(addr, off, value, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return store32_gpr64_gpr64_plus_gpr64_plus_s32(addr, off, value, offset); - } else { - ASSERT(false); - } - case 8: - if (offset == 0) { - return store64_gpr64_gpr64_plus_gpr64(addr, off, value); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return store64_gpr64_gpr64_plus_gpr64_plus_s8(addr, off, value, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return store64_gpr64_gpr64_plus_gpr64_plus_s32(addr, off, value, offset); - } else { - ASSERT(false); - } - default: - ASSERT(false); - return {0}; - } - } - - static Instruction load_goal_xmm128(Register dst, Register addr, Register off, int offset) { - if (offset == 0) { - return loadvf_gpr64_plus_gpr64(dst, addr, off); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return loadvf_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return loadvf_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); - } else { - ASSERT(false); - return {0}; - } - } - - /*! - * Load memory at addr + offset, where addr is a GOAL pointer and off is the offset register. 
- * This will pick the appropriate fancy addressing mode instruction. - */ - static Instruction load_goal_gpr(Register dst, - Register addr, - Register off, - int offset, - int size, - bool sign_extend) { - switch (size) { - case 1: - if (offset == 0) { - if (sign_extend) { - return load8s_gpr64_gpr64_plus_gpr64(dst, addr, off); - } else { - return load8u_gpr64_gpr64_plus_gpr64(dst, addr, off); - } - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - if (sign_extend) { - return load8s_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); - } else { - return load8u_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); - } - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - if (sign_extend) { - return load8s_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); - } else { - return load8u_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); - } - } else { - ASSERT(false); - } - case 2: - if (offset == 0) { - if (sign_extend) { - return load16s_gpr64_gpr64_plus_gpr64(dst, addr, off); - } else { - return load16u_gpr64_gpr64_plus_gpr64(dst, addr, off); - } - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - if (sign_extend) { - return load16s_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); - } else { - return load16u_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); - } - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - if (sign_extend) { - return load16s_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); - } else { - return load16u_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); - } - } else { - ASSERT(false); - } - case 4: - if (offset == 0) { - if (sign_extend) { - return load32s_gpr64_gpr64_plus_gpr64(dst, addr, off); - } else { - return load32u_gpr64_gpr64_plus_gpr64(dst, addr, off); - } - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - if (sign_extend) { - return load32s_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); - } else { - return 
load32u_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); - } - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - if (sign_extend) { - return load32s_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); - } else { - return load32u_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); - } - } else { - ASSERT(false); - } - case 8: - if (offset == 0) { - return load64_gpr64_gpr64_plus_gpr64(dst, addr, off); - - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return load64_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); - - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return load64_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); - - } else { - ASSERT(false); - } - default: - ASSERT(false); - return {0}; - } - } - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // LOADS n' STORES - XMM32 - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - static Instruction store32_xmm32_gpr64_plus_gpr64(Register addr1, - Register addr2, - Register xmm_value) { - ASSERT(xmm_value.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x11); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(xmm_value.hw_id(), addr1.hw_id(), addr2.hw_id()); - - instr.swap_op0_rex(); - return instr; - } - - static Instruction load32_xmm32_gpr64_plus_gpr64(Register xmm_dest, +Instruction load8s_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, Register addr1, - Register addr2) { - ASSERT(xmm_dest.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x10); - instr.set_modrm_and_rex_for_reg_plus_reg_addr(xmm_dest.hw_id(), addr1.hw_id(), addr2.hw_id()); - - instr.swap_op0_rex(); - return instr; - } - - static Instruction store32_xmm32_gpr64_plus_gpr64_plus_s8(Register addr1, - Register addr2, - Register xmm_value, - s64 offset) { - ASSERT(xmm_value.is_xmm()); - ASSERT(addr1.is_gpr()); - 
ASSERT(addr2.is_gpr()); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x11); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(xmm_value.hw_id(), addr1.hw_id(), - addr2.hw_id(), offset, false); - - instr.swap_op0_rex(); - return instr; - } - - static Instruction load32_xmm32_gpr64_plus_gpr64_plus_s8(Register xmm_dest, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(xmm_dest.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x10); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(xmm_dest.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, false); - - instr.swap_op0_rex(); - return instr; - } - - static Instruction store32_xmm32_gpr64_plus_gpr64_plus_s32(Register addr1, - Register addr2, - Register xmm_value, - s64 offset) { - ASSERT(xmm_value.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x11); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(xmm_value.hw_id(), addr1.hw_id(), - addr2.hw_id(), offset, false); - - instr.swap_op0_rex(); - return instr; - } - - static Instruction lea_reg_plus_off32(Register dest, Register base, s64 offset) { - ASSERT(dest.is_gpr()); - ASSERT(base.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x8d); - instr.set_modrm_rex_sib_for_reg_reg_disp(dest.hw_id(), 2, base.hw_id(), true); - instr.set(Imm(4, offset)); - return instr; - } - - static Instruction lea_reg_plus_off8(Register dest, Register base, s64 offset) { - ASSERT(dest.is_gpr()); - ASSERT(base.is_gpr()); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x8d); - instr.set_modrm_rex_sib_for_reg_reg_disp(dest.hw_id(), 1, base.hw_id(), true); - instr.set(Imm(1, offset)); 
- return instr; - } - - static Instruction lea_reg_plus_off(Register dest, Register base, s64 offset) { - if (offset >= INT8_MIN && offset <= INT8_MAX) { - return lea_reg_plus_off8(dest, base, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return lea_reg_plus_off32(dest, base, offset); - } else { - ASSERT(false); - return {0}; - } - } - - static Instruction store32_xmm32_gpr64_plus_s32(Register base, Register xmm_value, s64 offset) { - ASSERT(xmm_value.is_xmm()); - ASSERT(base.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x11); - instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_value.hw_id(), 2, base.hw_id(), false); - instr.set(Imm(4, offset)); - instr.swap_op0_rex(); - return instr; - } - - static Instruction store32_xmm32_gpr64_plus_s8(Register base, Register xmm_value, s64 offset) { - ASSERT(xmm_value.is_xmm()); - ASSERT(base.is_gpr()); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x11); - instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_value.hw_id(), 1, base.hw_id(), false); - instr.set(Imm(1, offset)); - instr.swap_op0_rex(); - return instr; - } - - static Instruction load32_xmm32_gpr64_plus_gpr64_plus_s32(Register xmm_dest, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(xmm_dest.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x10); - instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(xmm_dest.hw_id(), addr1.hw_id(), - addr2.hw_id(), offset, false); - - instr.swap_op0_rex(); - return instr; - } - - static Instruction load32_xmm32_gpr64_plus_s32(Register xmm_dest, Register base, s64 offset) { - ASSERT(xmm_dest.is_xmm()); - ASSERT(base.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf3); - 
instr.set_op2(0x0f); - instr.set_op3(0x10); - instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_dest.hw_id(), 2, base.hw_id(), false); - instr.set(Imm(4, offset)); - instr.swap_op0_rex(); - return instr; - } - - static Instruction load32_xmm32_gpr64_plus_s8(Register xmm_dest, Register base, s64 offset) { - ASSERT(xmm_dest.is_xmm()); - ASSERT(base.is_gpr()); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x10); - instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_dest.hw_id(), 1, base.hw_id(), false); - instr.set(Imm(1, offset)); - instr.swap_op0_rex(); - return instr; - } - - static Instruction load_goal_xmm32(Register xmm_dest, Register addr, Register off, s64 offset) { - if (offset == 0) { - return load32_xmm32_gpr64_plus_gpr64(xmm_dest, addr, off); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return load32_xmm32_gpr64_plus_gpr64_plus_s8(xmm_dest, addr, off, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return load32_xmm32_gpr64_plus_gpr64_plus_s32(xmm_dest, addr, off, offset); - } else { - ASSERT(false); - return {0}; - } - } - - static Instruction store_goal_xmm32(Register addr, Register xmm_value, Register off, s64 offset) { - if (offset == 0) { - return store32_xmm32_gpr64_plus_gpr64(addr, off, xmm_value); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return store32_xmm32_gpr64_plus_gpr64_plus_s8(addr, off, xmm_value, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return store32_xmm32_gpr64_plus_gpr64_plus_s32(addr, off, xmm_value, offset); - } else { - ASSERT(false); - return {0}; - } - } - - static Instruction store_reg_offset_xmm32(Register base, Register xmm_value, s64 offset) { - ASSERT(base.is_gpr()); - ASSERT(xmm_value.is_xmm()); - if (offset >= INT8_MIN && offset <= INT8_MAX) { - return store32_xmm32_gpr64_plus_s8(base, xmm_value, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return 
store32_xmm32_gpr64_plus_s32(base, xmm_value, offset); - } else { - ASSERT(false); - return {0}; - } - } - - static Instruction load_reg_offset_xmm32(Register xmm_dest, Register base, s64 offset) { - ASSERT(base.is_gpr()); - ASSERT(xmm_dest.is_xmm()); - if (offset >= INT8_MIN && offset <= INT8_MAX) { - return load32_xmm32_gpr64_plus_s8(xmm_dest, base, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return load32_xmm32_gpr64_plus_s32(xmm_dest, base, offset); - } else { - ASSERT(false); - return {0}; - } - } - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // LOADS n' STORES - XMM128 - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - /*! - * Store a 128-bit xmm into an address stored in a register, no offset - */ - static Instruction store128_gpr64_xmm128(Register gpr_addr, Register xmm_value) { - ASSERT(gpr_addr.is_gpr()); - ASSERT(xmm_value.is_xmm()); - Instruction instr(0x66); - // Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x7f); - instr.set_modrm_and_rex_for_reg_addr(xmm_value.hw_id(), gpr_addr.hw_id(), false); - instr.swap_op0_rex(); - return instr; - } - - static Instruction store128_gpr64_xmm128_s32(Register gpr_addr, Register xmm_value, s64 offset) { - ASSERT(gpr_addr.is_gpr()); - ASSERT(xmm_value.is_xmm()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x66); - // Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x7f); - instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_value.hw_id(), 2, gpr_addr.hw_id(), false); - instr.set(Imm(4, offset)); - instr.swap_op0_rex(); - return instr; - } - - static Instruction store128_gpr64_xmm128_s8(Register gpr_addr, Register xmm_value, s64 offset) { - ASSERT(gpr_addr.is_gpr()); - ASSERT(xmm_value.is_xmm()); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x66); - // Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x7f); - instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_value.hw_id(), 1, gpr_addr.hw_id(), false); - 
instr.set(Imm(1, offset)); - instr.swap_op0_rex(); - return instr; - } - - static Instruction load128_xmm128_gpr64(Register xmm_dest, Register gpr_addr) { - ASSERT(gpr_addr.is_gpr()); - ASSERT(xmm_dest.is_xmm()); - Instruction instr(0x66); - // Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x6f); - instr.set_modrm_and_rex_for_reg_addr(xmm_dest.hw_id(), gpr_addr.hw_id(), false); - instr.swap_op0_rex(); - return instr; - } - - static Instruction load128_xmm128_gpr64_s32(Register xmm_dest, Register gpr_addr, s64 offset) { - ASSERT(gpr_addr.is_gpr()); - ASSERT(xmm_dest.is_xmm()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x66); - // Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x6f); - instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_dest.hw_id(), 2, gpr_addr.hw_id(), false); - instr.set(Imm(4, offset)); - instr.swap_op0_rex(); - return instr; - } - - static Instruction load128_xmm128_gpr64_s8(Register xmm_dest, Register gpr_addr, s64 offset) { - ASSERT(gpr_addr.is_gpr()); - ASSERT(xmm_dest.is_xmm()); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x66); - // Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x6f); - instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_dest.hw_id(), 1, gpr_addr.hw_id(), false); - instr.set(Imm(1, offset)); - instr.swap_op0_rex(); - return instr; - } - - static Instruction load128_xmm128_reg_offset(Register xmm_dest, Register base, s64 offset) { - if (offset == 0) { - return load128_xmm128_gpr64(xmm_dest, base); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return load128_xmm128_gpr64_s8(xmm_dest, base, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return load128_xmm128_gpr64_s32(xmm_dest, base, offset); - } else { - ASSERT(false); - return {0}; - } - } - - static Instruction store128_xmm128_reg_offset(Register base, Register xmm_val, s64 offset) { - if (offset == 0) { - return store128_gpr64_xmm128(base, 
xmm_val); - } else if (offset >= INT8_MIN && offset <= INT8_MAX) { - return store128_gpr64_xmm128_s8(base, xmm_val, offset); - } else if (offset >= INT32_MIN && offset <= INT32_MAX) { - return store128_gpr64_xmm128_s32(base, xmm_val, offset); - } else { - ASSERT(false); - return {0}; - } - } - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // RIP loads and stores - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - static Instruction load64_rip_s32(Register dest, s64 offset) { - ASSERT(dest.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x8b); - instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(), offset, true); - return instr; - } - - static Instruction load32s_rip_s32(Register dest, s64 offset) { - ASSERT(dest.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x63); - instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(), offset, true); - return instr; - } - - static Instruction load32u_rip_s32(Register dest, s64 offset) { - ASSERT(dest.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x8b); - instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(), offset, false); - return instr; - } - - static Instruction load16u_rip_s32(Register dest, s64 offset) { - ASSERT(dest.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf); - instr.set_op2(0xb7); - instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(), offset, true); - return instr; - } - - static Instruction load16s_rip_s32(Register dest, s64 offset) { - ASSERT(dest.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf); - instr.set_op2(0xbf); - instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(), offset, true); - return instr; - } - - static Instruction load8u_rip_s32(Register dest, s64 offset) { - ASSERT(dest.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf); - instr.set_op2(0xb6); - 
instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(), offset, true); - return instr; - } - - static Instruction load8s_rip_s32(Register dest, s64 offset) { - ASSERT(dest.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0xf); - instr.set_op2(0xbe); - instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(), offset, true); - return instr; - } - - static Instruction static_load(Register dest, s64 offset, int size, bool sign_extend) { - switch (size) { - case 1: - if (sign_extend) { - return load8s_rip_s32(dest, offset); - } else { - return load8u_rip_s32(dest, offset); - } - break; - case 2: - if (sign_extend) { - return load16s_rip_s32(dest, offset); - } else { - return load16u_rip_s32(dest, offset); - } - break; - case 4: - if (sign_extend) { - return load32s_rip_s32(dest, offset); - } else { - return load32u_rip_s32(dest, offset); - } - break; - case 8: - return load64_rip_s32(dest, offset); - default: - ASSERT(false); - } - } - - static Instruction store64_rip_s32(Register src, s64 offset) { - ASSERT(src.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x89); - instr.set_modrm_and_rex_for_rip_plus_s32(src.hw_id(), offset, true); - return instr; - } - - static Instruction store32_rip_s32(Register src, s64 offset) { - ASSERT(src.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x89); - instr.set_modrm_and_rex_for_rip_plus_s32(src.hw_id(), offset, false); - return instr; - } - - static Instruction store16_rip_s32(Register src, s64 offset) { - ASSERT(src.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x66); - instr.set_op2(0x89); - instr.set_modrm_and_rex_for_rip_plus_s32(src.hw_id(), offset, false); - instr.swap_op0_rex(); - return instr; - } - - static Instruction 
store8_rip_s32(Register src, s64 offset) { - ASSERT(src.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x88); - instr.set_modrm_and_rex_for_rip_plus_s32(src.hw_id(), offset, false); - if (src.id() > RBX) { - instr.add_rex(); - } - return instr; - } - - static Instruction static_store(Register value, s64 offset, int size) { - switch (size) { - case 1: - return store8_rip_s32(value, offset); - case 2: - return store16_rip_s32(value, offset); - case 4: - return store32_rip_s32(value, offset); - case 8: - return store64_rip_s32(value, offset); - default: - ASSERT(false); - } - } - - static Instruction static_addr(Register dst, s64 offset) { - ASSERT(dst.is_gpr()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x8d); - instr.set_modrm_and_rex_for_rip_plus_s32(dst.hw_id(), offset, true); - return instr; - } - - static Instruction static_load_xmm32(Register xmm_dest, s64 offset) { - ASSERT(xmm_dest.is_xmm()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x10); - instr.set_modrm_and_rex_for_rip_plus_s32(xmm_dest.hw_id(), offset, false); - - instr.swap_op0_rex(); - return instr; - } - - static Instruction static_store_xmm32(Register xmm_value, s64 offset) { - ASSERT(xmm_value.is_xmm()); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x11); - instr.set_modrm_and_rex_for_rip_plus_s32(xmm_value.hw_id(), offset, false); - - instr.swap_op0_rex(); - return instr; - } - - // TODO, special load/stores of 128 bit values. - - // TODO, consider specialized stack loads and stores? 
- static Instruction load64_gpr64_plus_s32(Register dst_reg, int32_t offset, Register src_reg) { - ASSERT(dst_reg.is_gpr()); - ASSERT(src_reg.is_gpr()); - Instruction instr(0x8b); - instr.set_modrm_rex_sib_for_reg_reg_disp(dst_reg.hw_id(), 2, src_reg.hw_id(), true); - instr.set_disp(Imm(4, offset)); - return instr; - } - - /*! - * Store 64-bits from gpr into memory located at 64-bit reg + 32-bit signed offset. - */ - static Instruction store64_gpr64_plus_s32(Register addr, int32_t offset, Register value) { - ASSERT(addr.is_gpr()); - ASSERT(value.is_gpr()); - Instruction instr(0x89); - instr.set_modrm_rex_sib_for_reg_reg_disp(value.hw_id(), 2, addr.hw_id(), true); - instr.set_disp(Imm(4, offset)); - return instr; - } - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // FUNCTION STUFF - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - /*! - * Function return. Pops the 64-bit return address (real) off the stack and jumps to it. - */ - static Instruction ret() { return Instruction(0xc3); } - - /*! - * Instruction to push gpr (64-bits) onto the stack - */ - static Instruction push_gpr64(Register reg) { - ASSERT(reg.is_gpr()); - if (reg.hw_id() >= 8) { - auto i = Instruction(0x50 + reg.hw_id() - 8); - i.set(REX(false, false, false, true)); - return i; - } - return Instruction(0x50 + reg.hw_id()); - } - - /*! - * Instruction to pop 64 bit gpr from the stack - */ - static Instruction pop_gpr64(Register reg) { - ASSERT(reg.is_gpr()); - if (reg.hw_id() >= 8) { - auto i = Instruction(0x58 + reg.hw_id() - 8); - i.set(REX(false, false, false, true)); - return i; - } - return Instruction(0x58 + reg.hw_id()); - } - - /*! 
- * Call a function stored in a 64-bit gpr - */ - static Instruction call_r64(Register reg_) { - ASSERT(reg_.is_gpr()); - auto reg = reg_.hw_id(); - Instruction instr(0xff); - if (reg >= 8) { - instr.set(REX(false, false, false, true)); - reg -= 8; - } - ASSERT(reg < 8); - ModRM mrm; - mrm.rm = reg; - mrm.reg_op = 2; - mrm.mod = 3; - instr.set(mrm); - return instr; - } - - /*! - * Jump to an x86-64 address stored in a 64-bit gpr. - */ - static Instruction jmp_r64(Register reg_) { - ASSERT(reg_.is_gpr()); - auto reg = reg_.hw_id(); - Instruction instr(0xff); - if (reg >= 8) { - instr.set(REX(false, false, false, true)); - reg -= 8; - } - ASSERT(reg < 8); - ModRM mrm; - mrm.rm = reg; - mrm.reg_op = 4; - mrm.mod = 3; - instr.set(mrm); - return instr; - } - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // INTEGER MATH - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - static Instruction sub_gpr64_imm8s(Register reg, int64_t imm) { - ASSERT(reg.is_gpr()); - ASSERT(imm >= INT8_MIN && imm <= INT8_MAX); - // SUB r/m64, imm8 : REX.W + 83 /5 ib - Instruction instr(0x83); - instr.set_modrm_and_rex(5, reg.hw_id(), 3, true); - instr.set(Imm(1, imm)); - return instr; - } - - static Instruction sub_gpr64_imm32s(Register reg, int64_t imm) { - ASSERT(reg.is_gpr()); - ASSERT(imm >= INT32_MIN && imm <= INT32_MAX); - Instruction instr(0x81); - instr.set_modrm_and_rex(5, reg.hw_id(), 3, true); - instr.set(Imm(4, imm)); - return instr; - } - - static Instruction add_gpr64_imm8s(Register reg, int64_t v) { - ASSERT(v >= INT8_MIN && v <= INT8_MAX); - Instruction instr(0x83); - instr.set_modrm_and_rex(0, reg.hw_id(), 3, true); - instr.set(Imm(1, v)); - return instr; - } - - static Instruction add_gpr64_imm32s(Register reg, int64_t v) { - ASSERT(v >= INT32_MIN && v <= INT32_MAX); - Instruction instr(0x81); - instr.set_modrm_and_rex(0, reg.hw_id(), 3, true); - instr.set(Imm(4, v)); - return instr; - } - - static Instruction add_gpr64_imm(Register reg, int64_t imm) { - if (imm >= INT8_MIN && imm <= 
INT8_MAX) { - return add_gpr64_imm8s(reg, imm); - } else if (imm >= INT32_MIN && imm <= INT32_MAX) { - return add_gpr64_imm32s(reg, imm); - } else { - throw std::runtime_error("Invalid `add` with reg[" + reg.print() + "]/imm[" + - std::to_string(imm) + "]"); - } - } - - static Instruction sub_gpr64_imm(Register reg, int64_t imm) { - if (imm >= INT8_MIN && imm <= INT8_MAX) { - return sub_gpr64_imm8s(reg, imm); - } else if (imm >= INT32_MIN && imm <= INT32_MAX) { - return sub_gpr64_imm32s(reg, imm); - } else { - throw std::runtime_error("Invalid `sub` with reg[" + reg.print() + "]/imm[" + - std::to_string(imm) + "]"); - } - } - - static Instruction add_gpr64_gpr64(Register dst, Register src) { - Instruction instr(0x01); - ASSERT(dst.is_gpr()); - ASSERT(src.is_gpr()); - instr.set_modrm_and_rex(src.hw_id(), dst.hw_id(), 3, true); - return instr; - } - - static Instruction sub_gpr64_gpr64(Register dst, Register src) { - Instruction instr(0x29); - ASSERT(dst.is_gpr()); - ASSERT(src.is_gpr()); - instr.set_modrm_and_rex(src.hw_id(), dst.hw_id(), 3, true); - return instr; - } - - /*! - * Multiply gprs (32-bit, signed). - * (Note - probably worth doing imul on gpr64's to implement the EE's unsigned multiply) - */ - static Instruction imul_gpr32_gpr32(Register dst, Register src) { - Instruction instr(0xf); - instr.set_op2(0xaf); - ASSERT(dst.is_gpr()); - ASSERT(src.is_gpr()); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - return instr; - } - - /*! - * Multiply gprs (64-bit, signed). - * DANGER - this treats all operands as 64-bit. This is not like the EE. - */ - static Instruction imul_gpr64_gpr64(Register dst, Register src) { - Instruction instr(0xf); - instr.set_op2(0xaf); - ASSERT(dst.is_gpr()); - ASSERT(src.is_gpr()); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, true); - return instr; - } - - /*! 
- * Divide (idiv, 32 bit) - */ - static Instruction idiv_gpr32(Register reg) { - Instruction instr(0xf7); - ASSERT(reg.is_gpr()); - instr.set_modrm_and_rex(7, reg.hw_id(), 3, false); - return instr; - } - - static Instruction unsigned_div_gpr32(Register reg) { - Instruction instr(0xf7); - ASSERT(reg.is_gpr()); - instr.set_modrm_and_rex(6, reg.hw_id(), 3, false); - return instr; - } - - /*! - * Convert doubleword to quadword for division. - */ - static Instruction cdq() { - Instruction instr(0x99); - return instr; - } - - /*! - * Move from gpr32 to gpr64, with sign extension. - * Needed for multiplication/divsion madness. - */ - static Instruction movsx_r64_r32(Register dst, Register src) { - Instruction instr(0x63); - ASSERT(dst.is_gpr()); - ASSERT(src.is_gpr()); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, true); - return instr; - } - - /*! - * Compare gpr64. This sets the flags for the jumps. - * todo UNTESTED - */ - static Instruction cmp_gpr64_gpr64(Register a, Register b) { - Instruction instr(0x3b); - ASSERT(a.is_gpr()); - ASSERT(b.is_gpr()); - instr.set_modrm_and_rex(a.hw_id(), b.hw_id(), 3, true); - return instr; - } - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // BIT STUFF - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - /*! - * Or of two gprs - */ - static Instruction or_gpr64_gpr64(Register dst, Register src) { - Instruction instr(0x0b); - ASSERT(dst.is_gpr()); - ASSERT(src.is_gpr()); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, true); - return instr; - } - - /*! - * And of two gprs - */ - static Instruction and_gpr64_gpr64(Register dst, Register src) { - Instruction instr(0x23); - ASSERT(dst.is_gpr()); - ASSERT(src.is_gpr()); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, true); - return instr; - } - - /*! 
- * Xor of two gprs - */ - static Instruction xor_gpr64_gpr64(Register dst, Register src) { - Instruction instr(0x33); - ASSERT(dst.is_gpr()); - ASSERT(src.is_gpr()); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, true); - return instr; - } - - /*! - * Bitwise not a gpr - */ - static Instruction not_gpr64(Register reg) { - Instruction instr(0xf7); - ASSERT(reg.is_gpr()); - instr.set_modrm_and_rex(2, reg.hw_id(), 3, true); - return instr; - } - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // SHIFTS - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - /*! - * Shift 64-bit gpr left by CL register - */ - static Instruction shl_gpr64_cl(Register reg) { - ASSERT(reg.is_gpr()); - Instruction instr(0xd3); - instr.set_modrm_and_rex(4, reg.hw_id(), 3, true); - return instr; - } - - /*! - * Shift 64-bit gpr right (logical) by CL register - */ - static Instruction shr_gpr64_cl(Register reg) { - ASSERT(reg.is_gpr()); - Instruction instr(0xd3); - instr.set_modrm_and_rex(5, reg.hw_id(), 3, true); - return instr; - } - - /*! - * Shift 64-bit gpr right (arithmetic) by CL register - */ - static Instruction sar_gpr64_cl(Register reg) { - ASSERT(reg.is_gpr()); - Instruction instr(0xd3); - instr.set_modrm_and_rex(7, reg.hw_id(), 3, true); - return instr; - } - - /*! - * Shift 64-ptr left (logical) by the constant shift amount "sa". - */ - static Instruction shl_gpr64_u8(Register reg, uint8_t sa) { - ASSERT(reg.is_gpr()); - Instruction instr(0xc1); - instr.set_modrm_and_rex(4, reg.hw_id(), 3, true); - instr.set(Imm(1, sa)); - return instr; - } - - /*! - * Shift 64-ptr right (logical) by the constant shift amount "sa". - */ - static Instruction shr_gpr64_u8(Register reg, uint8_t sa) { - ASSERT(reg.is_gpr()); - Instruction instr(0xc1); - instr.set_modrm_and_rex(5, reg.hw_id(), 3, true); - instr.set(Imm(1, sa)); - return instr; - } - - /*! - * Shift 64-ptr right (arithmetic) by the constant shift amount "sa". 
- */ - static Instruction sar_gpr64_u8(Register reg, uint8_t sa) { - ASSERT(reg.is_gpr()); - Instruction instr(0xc1); - instr.set_modrm_and_rex(7, reg.hw_id(), 3, true); - instr.set(Imm(1, sa)); - return instr; - } - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // CONTROL FLOW - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - /*! - * Jump, 32-bit constant offset. The offset is by default 0 and must be patched later. - */ - static Instruction jmp_32() { - Instruction instr(0xe9); - instr.set(Imm(4, 0)); - return instr; - } - - /*! - * Jump if equal. - */ - static Instruction je_32() { - Instruction instr(0x0f); - instr.set_op2(0x84); - instr.set(Imm(4, 0)); - return instr; - } - - /*! - * Jump not equal. - */ - static Instruction jne_32() { - Instruction instr(0x0f); - instr.set_op2(0x85); - instr.set(Imm(4, 0)); - return instr; - } - - /*! - * Jump less than or equal. - */ - static Instruction jle_32() { - Instruction instr(0x0f); - instr.set_op2(0x8e); - instr.set(Imm(4, 0)); - return instr; - } - - /*! - * Jump greater than or equal. - */ - static Instruction jge_32() { - Instruction instr(0x0f); - instr.set_op2(0x8d); - instr.set(Imm(4, 0)); - return instr; - } - - /*! - * Jump less than - */ - static Instruction jl_32() { - Instruction instr(0x0f); - instr.set_op2(0x8c); - instr.set(Imm(4, 0)); - return instr; - } - - /*! - * Jump greater than - */ - static Instruction jg_32() { - Instruction instr(0x0f); - instr.set_op2(0x8f); - instr.set(Imm(4, 0)); - return instr; - } - - /*! - * Jump below or equal - */ - static Instruction jbe_32() { - Instruction instr(0x0f); - instr.set_op2(0x86); - instr.set(Imm(4, 0)); - return instr; - } - - /*! - * Jump above or equal - */ - static Instruction jae_32() { - Instruction instr(0x0f); - instr.set_op2(0x83); - instr.set(Imm(4, 0)); - return instr; - } - - /*! - * Jump below - */ - static Instruction jb_32() { - Instruction instr(0x0f); - instr.set_op2(0x82); - instr.set(Imm(4, 0)); - return instr; - } - - /*! 
- * Jump above - */ - static Instruction ja_32() { - Instruction instr(0x0f); - instr.set_op2(0x87); - instr.set(Imm(4, 0)); - return instr; - } - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // FLOAT MATH - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - /*! - * Compare two floats and set flag register for jump (ucomiss) - */ - static Instruction cmp_flt_flt(Register a, Register b) { - ASSERT(a.is_xmm()); - ASSERT(b.is_xmm()); - Instruction instr(0x0f); - instr.set_op2(0x2e); - instr.set_modrm_and_rex(a.hw_id(), b.hw_id(), 3, false); - return instr; - } - - static Instruction sqrts_xmm(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x51); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } - - /*! - * Multiply two floats in xmm's - */ - static Instruction mulss_xmm_xmm(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x59); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } - - /*! - * Divide two floats in xmm's - */ - static Instruction divss_xmm_xmm(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x5e); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } - - /*! - * Subtract two floats in xmm's - */ - static Instruction subss_xmm_xmm(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x5c); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } - - /*! 
- * Add two floats in xmm's - */ - static Instruction addss_xmm_xmm(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x58); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } - - /*! - * Floating point minimum. - */ - static Instruction minss_xmm_xmm(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x5d); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } - - /*! - * Floating point maximum. - */ - static Instruction maxss_xmm_xmm(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x5f); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } - - /*! - * Convert GPR int32 to XMM float (single precision) - */ - static Instruction int32_to_float(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_gpr()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x2a); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } - - /*! - * Convert XMM float to GPR int32(single precision) (truncate) - */ - static Instruction float_to_int32(Register dst, Register src) { - ASSERT(dst.is_gpr()); - ASSERT(src.is_xmm()); - Instruction instr(0xf3); - instr.set_op2(0x0f); - instr.set_op3(0x2c); - instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - instr.swap_op0_rex(); - return instr; - } - - static Instruction nop() { - // NOP - Instruction instr(0x90); - return instr; - } - - // TODO - rsqrt / abs / sqrt - - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - // UTILITIES - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - /*! - * A "null" instruction. 
This instruction does not generate any bytes - * but can be referred to by a label. Useful to insert in place of a real instruction - * if the real instruction has been optimized out. - */ - static Instruction null() { - Instruction i(0); - i.m_flags |= Instruction::kIsNull; - return i; - } - - ///////////////////////////// - // AVX (VF - Vector Float) // - ///////////////////////////// - - static Instruction nop_vf() { - Instruction instr(0xd9); // FNOP - instr.set_op2(0xd0); - return instr; - } - - static Instruction wait_vf() { - Instruction instr(0x9B); // FWAIT / WAIT - return instr; - } - - static Instruction mov_vf_vf(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - - if (src.hw_id() >= 8 && dst.hw_id() < 8) { - // in this case, we can use the 0x29 encoding, which swaps src and dst, in order to use the - // 2 byte VEX prefix, where the 0x28 encoding would require an extra byte. - // compilers/assemblers seem to prefer 0x28, unless 0x29 would save you a byte. 
- Instruction instr(0x29); - instr.set_vex_modrm_and_rex(src.hw_id(), dst.hw_id(), 3, VEX3::LeadingBytes::P_0F, false); - return instr; - } else { - Instruction instr(0x28); - instr.set_vex_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, VEX3::LeadingBytes::P_0F, false); - return instr; - } - } - - static Instruction loadvf_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { - ASSERT(dst.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - Instruction instr(0x28); - instr.set_vex_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - VEX3::LeadingBytes::P_0F, false); - return instr; - } - - static Instruction loadvf_gpr64_plus_gpr64_plus_s8(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x28); - instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, VEX3::LeadingBytes::P_0F, false); - return instr; - } - - static Instruction loadvf_gpr64_plus_gpr64_plus_s32(Register dst, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(dst.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x28); - instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(), addr1.hw_id(), addr2.hw_id(), - offset, VEX3::LeadingBytes::P_0F, false); - return instr; - } - - static Instruction storevf_gpr64_plus_gpr64(Register value, Register addr1, Register addr2) { - ASSERT(value.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - 
Instruction instr(0x29); - instr.set_vex_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(), addr1.hw_id(), addr2.hw_id(), - VEX3::LeadingBytes::P_0F, false); - return instr; - } - - static Instruction storevf_gpr64_plus_gpr64_plus_s8(Register value, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(value.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); - Instruction instr(0x29); - instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s8( - value.hw_id(), addr1.hw_id(), addr2.hw_id(), offset, VEX3::LeadingBytes::P_0F, false); - return instr; - } - - static Instruction storevf_gpr64_plus_gpr64_plus_s32(Register value, - Register addr1, - Register addr2, - s64 offset) { - ASSERT(value.is_xmm()); - ASSERT(addr1.is_gpr()); - ASSERT(addr2.is_gpr()); - ASSERT(addr1 != addr2); - ASSERT(addr1 != RSP); - ASSERT(addr2 != RSP); - ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); - Instruction instr(0x29); - instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s32( - value.hw_id(), addr1.hw_id(), addr2.hw_id(), offset, VEX3::LeadingBytes::P_0F, false); - return instr; - } - - static Instruction loadvf_rip_plus_s32(Register dest, s64 offset) { - ASSERT(dest.is_xmm()); - ASSERT(offset >= INT32_MIN); - ASSERT(offset <= INT32_MAX); - Instruction instr(0x28); - instr.set_vex_modrm_and_rex_for_rip_plus_s32(dest.hw_id(), offset); - return instr; - } - - // TODO - rip relative loads and stores. 
- - static Instruction blend_vf(Register dst, Register src1, Register src2, u8 mask) { - ASSERT(!(mask & 0b11110000)); - ASSERT(dst.is_xmm()); - ASSERT(src1.is_xmm()); - ASSERT(src2.is_xmm()); - Instruction instr(0x0c); // VBLENDPS - instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F_3A, - src1.hw_id(), false, VexPrefix::P_66); - instr.set(Imm(1, mask)); - return instr; - } - - static Instruction shuffle_vf(Register dst, Register src, u8 dx, u8 dy, u8 dz, u8 dw) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - ASSERT(dx < 4); - ASSERT(dy < 4); - ASSERT(dz < 4); - ASSERT(dw < 4); - u8 imm = dx + (dy << 2) + (dz << 4) + (dw << 6); - return swizzle_vf(dst, src, imm); - - // SSE encoding version: - // Instruction instr(0x0f); - // instr.set_op2(0xc6); - // instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false); - // instr.set(Imm(1, imm)); - // return instr; - } - - /* - Generic Swizzle (re-arrangment of packed FPs) operation, the control bytes are quite involved. - Here's a brief run-down: - - 8-bits / 4 groups of 2 bits - - Right-to-left, each group is used to determine which element in `src` gets copied into - `dst`'s element (W->X). - - GROUP OPTIONS - - 00b - Copy the least-significant element (X) - - 01b - Copy the second element (from the right) (Y) - - 10b - Copy the third element (from the right) (Z) - - 11b - Copy the most significant element (W) - Examples - ; xmm1 = (1.5, 2.5, 3.5, 4.5) (W,Z,Y,X in x86 land) - SHUFPS xmm1, xmm1, 0xff ; Copy the most significant element to all positions - > (1.5, 1.5, 1.5, 1.5) - SHUFPS xmm1, xmm1, 0x39 ; Rotate right - > (4.5, 1.5, 2.5, 3.5) - */ - static Instruction swizzle_vf(Register dst, Register src, u8 controlBytes) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0xC6); // VSHUFPS - - // we use the AVX "VEX" encoding here. This is a three-operand form, - // but we just set both source - // to the same register. 
It seems like this is one byte longer but is faster maybe? - instr.set_vex_modrm_and_rex(dst.hw_id(), src.hw_id(), VEX3::LeadingBytes::P_0F, src.hw_id()); - instr.set(Imm(1, controlBytes)); - return instr; - } - - /* - Splats a single element in 'src' to all elements in 'dst' - For example (pseudocode): - xmm1 = (1.5, 2.5, 3.5, 4.5) - xmm2 = (1, 2, 3, 4) - splat_vf(xmm1, xmm2, XMM_ELEMENT::X); - xmm1 = (4, 4, 4, 4) - */ - static Instruction splat_vf(Register dst, Register src, Register::VF_ELEMENT element) { - switch (element) { - case Register::VF_ELEMENT::X: // Least significant element - return swizzle_vf(dst, src, 0b00000000); - break; - case Register::VF_ELEMENT::Y: - return swizzle_vf(dst, src, 0b01010101); - break; - case Register::VF_ELEMENT::Z: - return swizzle_vf(dst, src, 0b10101010); - break; - case Register::VF_ELEMENT::W: // Most significant element - return swizzle_vf(dst, src, 0b11111111); - break; - default: - ASSERT(false); - return {0}; - } - } - - static Instruction xor_vf(Register dst, Register src1, Register src2) { - ASSERT(dst.is_xmm()); - ASSERT(src1.is_xmm()); - ASSERT(src2.is_xmm()); - Instruction instr(0x57); // VXORPS - instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id()); - return instr; - } - - static Instruction sub_vf(Register dst, Register src1, Register src2) { - ASSERT(dst.is_xmm()); - ASSERT(src1.is_xmm()); - ASSERT(src2.is_xmm()); - Instruction instr(0x5c); // VSUBPS - instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id()); - return instr; - } - - static Instruction add_vf(Register dst, Register src1, Register src2) { - ASSERT(dst.is_xmm()); - ASSERT(src1.is_xmm()); - ASSERT(src2.is_xmm()); - Instruction instr(0x58); // VADDPS - instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id()); - return instr; - } - - static Instruction mul_vf(Register dst, Register src1, Register src2) { - ASSERT(dst.is_xmm()); - 
ASSERT(src1.is_xmm()); - ASSERT(src2.is_xmm()); - Instruction instr(0x59); // VMULPS - instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id()); - return instr; - } - - static Instruction max_vf(Register dst, Register src1, Register src2) { - ASSERT(dst.is_xmm()); - ASSERT(src1.is_xmm()); - ASSERT(src2.is_xmm()); - Instruction instr(0x5F); // VMAXPS - instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id()); - return instr; - } - - static Instruction min_vf(Register dst, Register src1, Register src2) { - ASSERT(dst.is_xmm()); - ASSERT(src1.is_xmm()); - ASSERT(src2.is_xmm()); - Instruction instr(0x5D); // VMINPS - instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id()); - return instr; - } - - static Instruction div_vf(Register dst, Register src1, Register src2) { - ASSERT(dst.is_xmm()); - ASSERT(src1.is_xmm()); - ASSERT(src2.is_xmm()); - Instruction instr(0x5E); // VDIVPS - instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id()); - return instr; - } - - static Instruction sqrt_vf(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0x51); // VSQRTPS - instr.set_vex_modrm_and_rex(dst.hw_id(), src.hw_id(), VEX3::LeadingBytes::P_0F, 0b0); - return instr; - } - - static Instruction itof_vf(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - Instruction instr(0x5b); // VCVTDQ2PS - instr.set_vex_modrm_and_rex(dst.hw_id(), src.hw_id(), VEX3::LeadingBytes::P_0F, 0); - return instr; - } - - static Instruction ftoi_vf(Register dst, Register src) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.F3.0F.WIG 5B /r VCVTTPS2DQ xmm1, xmm2/m128 - Instruction instr(0x5b); // VCVTTPS2DQ - instr.set_vex_modrm_and_rex(dst.hw_id(), src.hw_id(), VEX3::LeadingBytes::P_0F, 0, false, - VexPrefix::P_F3); - return instr; - } - - static Instruction 
pw_sra(Register dst, Register src, u8 imm) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.66.0F.WIG 72 /4 ib VPSRAD xmm1, xmm2, imm8 - Instruction instr(0x72); - instr.set_vex_modrm_and_rex(4, src.hw_id(), VEX3::LeadingBytes::P_0F, dst.hw_id(), false, - VexPrefix::P_66); - instr.set(Imm(1, imm)); - return instr; - } - - static Instruction pw_srl(Register dst, Register src, u8 imm) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.66.0F.WIG 72 /2 ib VPSRLD xmm1, xmm2, imm8 - Instruction instr(0x72); - instr.set_vex_modrm_and_rex(2, src.hw_id(), VEX3::LeadingBytes::P_0F, dst.hw_id(), false, - VexPrefix::P_66); - instr.set(Imm(1, imm)); - return instr; - } - - static Instruction ph_srl(Register dst, Register src, u8 imm) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.66.0F.WIG 71 /2 ib VPSRLW - Instruction instr(0x71); - instr.set_vex_modrm_and_rex(2, src.hw_id(), VEX3::LeadingBytes::P_0F, dst.hw_id(), false, - VexPrefix::P_66); - instr.set(Imm(1, imm)); - return instr; - } - - static Instruction pw_sll(Register dst, Register src, u8 imm) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.66.0F.WIG 72 /6 ib VPSLLD xmm1, xmm2, imm8 - Instruction instr(0x72); - instr.set_vex_modrm_and_rex(6, src.hw_id(), VEX3::LeadingBytes::P_0F, dst.hw_id(), false, - VexPrefix::P_66); - instr.set(Imm(1, imm)); - return instr; - } - static Instruction ph_sll(Register dst, Register src, u8 imm) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.66.0F.WIG 71 /6 ib VPSLLW xmm1, xmm2, imm8 - Instruction instr(0x71); - instr.set_vex_modrm_and_rex(6, src.hw_id(), VEX3::LeadingBytes::P_0F, dst.hw_id(), false, - VexPrefix::P_66); - instr.set(Imm(1, imm)); - return instr; - } - - static Instruction parallel_add_byte(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG FC /r VPADDB xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction 
instr(0xFC); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction parallel_bitwise_or(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG EB /r VPOR xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0xEB); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction parallel_bitwise_xor(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG EF /r VPXOR xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0xEF); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction parallel_bitwise_and(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG DB /r VPAND xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0xDB); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - // Reminder - a word in MIPS = 32bits = a DWORD in x86 - // MIPS || x86 - // ----------------------- - // byte || byte - // halfword || word - // word || dword - // doubleword || quadword - - // -- Unpack High Data Instructions - static Instruction pextub_swapped(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 68/r VPUNPCKHBW xmm1,xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x68); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, 
VexPrefix::P_66); - return instr; - } - - static Instruction pextuh_swapped(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 69/r VPUNPCKHWD xmm1,xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x69); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction pextuw_swapped(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 6A/r VPUNPCKHDQ xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x6a); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - // -- Unpack Low Data Instructions - static Instruction pextlb_swapped(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 60/r VPUNPCKLBW xmm1,xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x60); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction pextlh_swapped(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 61/r VPUNPCKLWD xmm1,xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x61); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction pextlw_swapped(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 62/r VPUNPCKLDQ xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x62); - 
instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - // Equal to than comparison as 16 bytes (8 bits) - static Instruction parallel_compare_e_b(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 74 /r VPCMPEQB xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x74); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - // Equal to than comparison as 8 halfwords (16 bits) - static Instruction parallel_compare_e_h(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 75 /r VPCMPEQW xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x75); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - // Equal to than comparison as 4 words (32 bits) - static Instruction parallel_compare_e_w(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 76 /r VPCMPEQD xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x76); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - // Greater than comparison as 16 bytes (8 bits) - static Instruction parallel_compare_gt_b(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 64 /r VPCMPGTB xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x64); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return 
instr; - } - - // Greater than comparison as 8 halfwords (16 bits) - static Instruction parallel_compare_gt_h(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 65 /r VPCMPGTW xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x65); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - // Greater than comparison as 4 words (32 bits) - static Instruction parallel_compare_gt_w(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 66 /r VPCMPGTD xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x66); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction vpunpcklqdq(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 6C/r VPUNPCKLQDQ xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x6c); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction pcpyld_swapped(Register dst, Register src0, Register src1) { - return vpunpcklqdq(dst, src0, src1); - } - - static Instruction pcpyud(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 6D/r VPUNPCKHQDQ xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - Instruction instr(0x6d); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction vpsubd(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - 
ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG FA /r VPSUBD xmm1, xmm2, xmm3/m128 - // reg, vec, r/m - Instruction instr(0xfa); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } - - static Instruction vpsrldq(Register dst, Register src, u8 imm) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.66.0F.WIG 73 /3 ib VPSRLDQ xmm1, xmm2, imm8 - Instruction instr(0x73); - instr.set_vex_modrm_and_rex(3, src.hw_id(), VEX3::LeadingBytes::P_0F, dst.hw_id(), false, - VexPrefix::P_66); - instr.set(Imm(1, imm)); - return instr; - } - - static Instruction vpslldq(Register dst, Register src, u8 imm) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.66.0F.WIG 73 /7 ib VPSLLDQ xmm1, xmm2, imm8 - Instruction instr(0x73); - instr.set_vex_modrm_and_rex(7, src.hw_id(), VEX3::LeadingBytes::P_0F, dst.hw_id(), false, - VexPrefix::P_66); - instr.set(Imm(1, imm)); - return instr; - } - - static Instruction vpshuflw(Register dst, Register src, u8 imm) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.F2.0F.WIG 70 /r ib VPSHUFLW xmm1, xmm2/m128, imm8 - Instruction instr(0x70); - instr.set_vex_modrm_and_rex(dst.hw_id(), src.hw_id(), VEX3::LeadingBytes::P_0F, 0, false, - VexPrefix::P_F2); - instr.set(Imm(1, imm)); - return instr; - } - - static Instruction vpshufhw(Register dst, Register src, u8 imm) { - ASSERT(dst.is_xmm()); - ASSERT(src.is_xmm()); - // VEX.128.F3.0F.WIG 70 /r ib VPSHUFHW xmm1, xmm2/m128, imm8 - Instruction instr(0x70); - instr.set_vex_modrm_and_rex(dst.hw_id(), src.hw_id(), VEX3::LeadingBytes::P_0F, 0, false, - VexPrefix::P_F3); - instr.set(Imm(1, imm)); - return instr; - } - - static Instruction vpackuswb(Register dst, Register src0, Register src1) { - ASSERT(dst.is_xmm()); - ASSERT(src0.is_xmm()); - ASSERT(src1.is_xmm()); - // VEX.128.66.0F.WIG 67 /r VPACKUSWB xmm1, xmm2, xmm3/m128 - // reg, vex, r/m - - Instruction 
instr(0x67); - instr.set_vex_modrm_and_rex(dst.hw_id(), src1.hw_id(), VEX3::LeadingBytes::P_0F, src0.hw_id(), - false, VexPrefix::P_66); - return instr; - } -}; + Register addr2, + s64 offset); + +Instruction store8_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset); + +/*! + * movzx dst, BYTE PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +Instruction load8u_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2); + +Instruction load8u_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction load8u_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * movsx dst, WORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +Instruction load16s_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2); + +Instruction store16_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value); + +Instruction store16_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset); + +Instruction store16_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset); + +Instruction load16s_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction load16s_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * movzx dst, WORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. 
+ */ +Instruction load16u_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2); + +Instruction load16u_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction load16u_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * movsxd dst, DWORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +Instruction load32s_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2); + +Instruction store32_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value); + +Instruction load32s_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction store32_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset); + +Instruction load32s_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction store32_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset); + +/*! + * movzxd dst, DWORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +Instruction load32u_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2); + +Instruction load32u_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction load32u_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! 
+ * mov dst, QWORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +Instruction load64_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2); + +Instruction store64_gpr64_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value); + +Instruction load64_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction store64_gpr64_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset); + +Instruction load64_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction store64_gpr64_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register value, + s64 offset); + +Instruction store_goal_vf(const ObjectGenerator& gen, + Register addr, + Register value, + Register off, + s64 offset); + +Instruction store_goal_gpr(const ObjectGenerator& gen, + Register addr, + Register value, + Register off, + int offset, + int size); + +Instruction load_goal_xmm128(const ObjectGenerator& gen, + Register dst, + Register addr, + Register off, + int offset); + +/*! + * Load memory at addr + offset, where addr is a GOAL pointer and off is the offset register. + * This will pick the appropriate fancy addressing mode instruction. 
+ */ +Instruction load_goal_gpr(const ObjectGenerator& gen, + Register dst, + Register addr, + Register off, + int offset, + int size, + bool sign_extend); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// LOADS n' STORES - XMM32 +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +Instruction store32_xmm32_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register xmm_value); + +Instruction load32_xmm32_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register simd_dest, + Register addr1, + Register addr2); + +Instruction store32_xmm32_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register xmm_value, + s64 offset); + +Instruction load32_xmm32_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register simd_dest, + Register addr1, + Register addr2, + s64 offset); + +Instruction store32_xmm32_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr1, + Register addr2, + Register xmm_value, + s64 offset); + +Instruction lea_reg_plus_off32(const ObjectGenerator& gen, + Register dest, + Register base, + s64 offset); + +Instruction lea_reg_plus_off8(const ObjectGenerator& gen, Register dest, Register base, s64 offset); + +Instruction lea_reg_plus_off(const ObjectGenerator& gen, Register dest, Register base, s64 offset); + +Instruction store32_xmm32_gpr64_plus_s32(const ObjectGenerator& gen, + Register base, + Register xmm_value, + s64 offset); + +Instruction store32_xmm32_gpr64_plus_s8(const ObjectGenerator& gen, + Register base, + Register xmm_value, + s64 offset); + +Instruction load32_xmm32_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register simd_dest, + Register addr1, + Register addr2, + s64 offset); + +Instruction load32_xmm32_gpr64_plus_s32(const ObjectGenerator& gen, + Register simd_dest, + Register base, + s64 offset); + +Instruction load32_xmm32_gpr64_plus_s8(const ObjectGenerator& gen, + Register simd_dest, + Register base, + s64 offset); + +Instruction 
load_goal_xmm32(const ObjectGenerator& gen, + Register simd_dest, + Register addr, + Register off, + s64 offset); + +Instruction store_goal_xmm32(const ObjectGenerator& gen, + Register addr, + Register xmm_value, + Register off, + s64 offset); + +Instruction store_reg_offset_xmm32(const ObjectGenerator& gen, + Register base, + Register xmm_value, + s64 offset); + +Instruction load_reg_offset_xmm32(const ObjectGenerator& gen, + Register simd_dest, + Register base, + s64 offset); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// LOADS n' STORES - XMM128 +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Store a 128-bit xmm into an address stored in a register, no offset + */ +Instruction store128_gpr64_simd128(const ObjectGenerator& gen, + Register gpr_addr, + Register xmm_value); + +Instruction store128_gpr64_simd128_s32(const ObjectGenerator& gen, + Register gpr_addr, + Register xmm_value, + s64 offset); + +Instruction store128_gpr64_simd128_s8(const ObjectGenerator& gen, + Register gpr_addr, + Register xmm_value, + s64 offset); + +Instruction load128_simd128_gpr64(const ObjectGenerator& gen, + Register simd_dest, + Register gpr_addr); + +Instruction load128_simd128_gpr64_s32(const ObjectGenerator& gen, + Register simd_dest, + Register gpr_addr, + s64 offset); + +Instruction load128_simd128_gpr64_s8(const ObjectGenerator& gen, + Register simd_dest, + Register gpr_addr, + s64 offset); + +Instruction load128_xmm128_reg_offset(const ObjectGenerator& gen, + Register simd_dest, + Register base, + s64 offset); + +Instruction store128_xmm128_reg_offset(const ObjectGenerator& gen, + Register base, + Register xmm_val, + s64 offset); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// RIP loads and stores +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +Instruction load64_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset); + +Instruction load32s_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset); + +Instruction load32u_rip_s32(const ObjectGenerator& gen, Register 
dest, s64 offset); + +Instruction load16u_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset); + +Instruction load16s_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset); + +Instruction load8u_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset); + +Instruction load8s_rip_s32(const ObjectGenerator& gen, Register dest, s64 offset); + +Instruction static_load(const ObjectGenerator& gen, + Register dest, + s64 offset, + int size, + bool sign_extend); + +Instruction store64_rip_s32(const ObjectGenerator& gen, Register src, s64 offset); + +Instruction store32_rip_s32(const ObjectGenerator& gen, Register src, s64 offset); + +Instruction store16_rip_s32(const ObjectGenerator& gen, Register src, s64 offset); + +Instruction store8_rip_s32(const ObjectGenerator& gen, Register src, s64 offset); + +Instruction static_store(const ObjectGenerator& gen, Register value, s64 offset, int size); + +Instruction static_addr(const ObjectGenerator& gen, Register dst, s64 offset); + +Instruction static_load_xmm32(const ObjectGenerator& gen, Register simd_dest, s64 offset); + +Instruction static_store_xmm32(const ObjectGenerator& gen, Register xmm_value, s64 offset); + +// TODO, special load/stores of 128 bit values. + +// TODO, consider specialized stack loads and stores? +Instruction load64_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst_reg, + int32_t offset, + Register src_reg); + +/*! + * Store 64-bits from gpr into memory located at 64-bit reg + 32-bit signed offset. + */ +Instruction store64_gpr64_plus_s32(const ObjectGenerator& gen, + Register addr, + int32_t offset, + Register value); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// FUNCTION STUFF +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +/*! + * Function return. Pops the 64-bit return address (real) off the stack and jumps to it. + */ +Instruction ret(const ObjectGenerator& gen); + +/*! 
+ * Instruction to push gpr (64-bits) onto the stack + */ +Instruction push_gpr64(const ObjectGenerator& gen, Register reg); + +/*! + * Instruction to pop 64 bit gpr from the stack + */ +Instruction pop_gpr64(const ObjectGenerator& gen, Register reg); + +/*! + * Call a function stored in a 64-bit gpr + */ +Instruction call_r64(const ObjectGenerator& gen, Register reg_); + +/*! + * Jump to an x86-64 address stored in a 64-bit gpr. + */ +Instruction jmp_r64(const ObjectGenerator& gen, Register reg_); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// INTEGER MATH +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +Instruction sub_gpr64_imm8s(const ObjectGenerator& gen, Register reg, int64_t imm); + +Instruction sub_gpr64_imm32s(const ObjectGenerator& gen, Register reg, int64_t imm); + +Instruction add_gpr64_imm8s(const ObjectGenerator& gen, Register reg, int64_t v); + +Instruction add_gpr64_imm32s(const ObjectGenerator& gen, Register reg, int64_t v); + +Instruction add_gpr64_imm(const ObjectGenerator& gen, Register reg, int64_t imm); + +Instruction sub_gpr64_imm(const ObjectGenerator& gen, Register reg, int64_t imm); + +Instruction add_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src); + +Instruction sub_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Multiply gprs (32-bit, signed). + * (Note - probably worth doing imul on gpr64's to implement the EE's unsigned multiply) + */ +Instruction imul_gpr32_gpr32(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Multiply gprs (64-bit, signed). + * DANGER - this treats all operands as 64-bit. This is not like the EE. + */ +Instruction imul_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Divide (idiv, 32 bit) + */ +Instruction idiv_gpr32(const ObjectGenerator& gen, Register reg); + +Instruction unsigned_div_gpr32(const ObjectGenerator& gen, Register reg); + +/*! + * Convert doubleword to quadword for division. 
+ */ +Instruction cdq(const ObjectGenerator& gen); + +/*! + * Move from gpr32 to gpr64, with sign extension. + * Needed for multiplication/divsion madness. + */ +Instruction movsx_r64_r32(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Compare gpr64. This sets the flags for the jumps. + * todo UNTESTED + */ +Instruction cmp_gpr64_gpr64(const ObjectGenerator& gen, Register a, Register b); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// BIT STUFF +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Or of two gprs + */ +Instruction or_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * And of two gprs + */ +Instruction and_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Xor of two gprs + */ +Instruction xor_gpr64_gpr64(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Bitwise not a gpr + */ +Instruction not_gpr64(const ObjectGenerator& gen, Register reg); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// SHIFTS +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Shift 64-bit gpr left by CL register + */ +Instruction shl_gpr64_cl(const ObjectGenerator& gen, Register reg); + +/*! + * Shift 64-bit gpr right (logical) by CL register + */ +Instruction shr_gpr64_cl(const ObjectGenerator& gen, Register reg); + +/*! + * Shift 64-bit gpr right (arithmetic) by CL register + */ +Instruction sar_gpr64_cl(const ObjectGenerator& gen, Register reg); + +/*! + * Shift 64-ptr left (logical) by the constant shift amount "sa". + */ +Instruction shl_gpr64_u8(const ObjectGenerator& gen, Register reg, uint8_t sa); + +/*! + * Shift 64-ptr right (logical) by the constant shift amount "sa". + */ +Instruction shr_gpr64_u8(const ObjectGenerator& gen, Register reg, uint8_t sa); + +/*! + * Shift 64-ptr right (arithmetic) by the constant shift amount "sa". 
+ */ +Instruction sar_gpr64_u8(const ObjectGenerator& gen, Register reg, uint8_t sa); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// CONTROL FLOW +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Jump, 32-bit constant offset. The offset is by default 0 and must be patched later. + */ +Instruction jmp_32(const ObjectGenerator& gen); + +/*! + * Jump if equal. + */ +Instruction je_32(const ObjectGenerator& gen); + +/*! + * Jump not equal. + */ +Instruction jne_32(const ObjectGenerator& gen); + +/*! + * Jump less than or equal. + */ +Instruction jle_32(const ObjectGenerator& gen); + +/*! + * Jump greater than or equal. + */ +Instruction jge_32(const ObjectGenerator& gen); + +/*! + * Jump less than + */ +Instruction jl_32(const ObjectGenerator& gen); + +/*! + * Jump greater than + */ +Instruction jg_32(const ObjectGenerator& gen); + +/*! + * Jump below or equal + */ +Instruction jbe_32(const ObjectGenerator& gen); + +/*! + * Jump above or equal + */ +Instruction jae_32(const ObjectGenerator& gen); + +/*! + * Jump below + */ +Instruction jb_32(const ObjectGenerator& gen); + +/*! + * Jump above + */ +Instruction ja_32(const ObjectGenerator& gen); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// FLOAT MATH +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Compare two floats and set flag register for jump (ucomiss) + */ +Instruction cmp_flt_flt(const ObjectGenerator& gen, Register a, Register b); + +Instruction sqrts_xmm(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Multiply two floats in xmm's + */ +Instruction mulss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Divide two floats in xmm's + */ +Instruction divss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Subtract two floats in xmm's + */ +Instruction subss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src); + +/*! 
+ * Add two floats in xmm's + */ +Instruction addss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Floating point minimum. + */ +Instruction minss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Floating point maximum. + */ +Instruction maxss_xmm_xmm(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Convert GPR int32 to XMM float (single precision) + */ +Instruction int32_to_float(const ObjectGenerator& gen, Register dst, Register src); + +/*! + * Convert XMM float to GPR int32(single precision) (truncate) + */ +Instruction float_to_int32(const ObjectGenerator& gen, Register dst, Register src); + +Instruction nop(const ObjectGenerator& gen); + +// TODO - rsqrt / abs / sqrt + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// UTILITIES +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * A "null" instruction. This instruction does not generate any bytes + * but can be referred to by a label. Useful to insert in place of a real instruction + * if the real instruction has been optimized out. 
+ */ +Instruction null(const ObjectGenerator& gen); + +///////////////////////////// +// AVX (VF - Vector Float) // +///////////////////////////// + +Instruction nop_vf(const ObjectGenerator& gen); + +Instruction wait_vf(const ObjectGenerator& gen); + +Instruction mov_vf_vf(const ObjectGenerator& gen, Register dst, Register src); + +Instruction loadvf_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2); + +Instruction loadvf_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction loadvf_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register dst, + Register addr1, + Register addr2, + s64 offset); + +Instruction storevf_gpr64_plus_gpr64(const ObjectGenerator& gen, + Register value, + Register addr1, + Register addr2); + +Instruction storevf_gpr64_plus_gpr64_plus_s8(const ObjectGenerator& gen, + Register value, + Register addr1, + Register addr2, + s64 offset); + +Instruction storevf_gpr64_plus_gpr64_plus_s32(const ObjectGenerator& gen, + Register value, + Register addr1, + Register addr2, + s64 offset); + +Instruction loadvf_rip_plus_s32(const ObjectGenerator& gen, Register dest, s64 offset); + +// TODO - rip relative loads and stores. + +Instruction blend_vf(const ObjectGenerator& gen, + Register dst, + Register src1, + Register src2, + u8 mask); + +Instruction +shuffle_vf(const ObjectGenerator& gen, Register dst, Register src, u8 dx, u8 dy, u8 dz, u8 dw); + +/* + Generic Swizzle (re-arrangement of packed FPs) operation, the control bytes are quite involved. + Here's a brief run-down: + - 8-bits / 4 groups of 2 bits + - Right-to-left, each group is used to determine which element in `src` gets copied into + `dst`'s element (W->X). 
+ - GROUP OPTIONS + - 00b - Copy the least-significant element (X) + - 01b - Copy the second element (from the right) (Y) + - 10b - Copy the third element (from the right) (Z) + - 11b - Copy the most significant element (W) + Examples + ; xmm1 = (1.5, 2.5, 3.5, 4.5) (W,Z,Y,X in x86 land) + SHUFPS xmm1, xmm1, 0xff ; Copy the most significant element to all positions + > (1.5, 1.5, 1.5, 1.5) + SHUFPS xmm1, xmm1, 0x39 ; Rotate right + > (4.5, 1.5, 2.5, 3.5) + */ +Instruction swizzle_vf(const ObjectGenerator& gen, Register dst, Register src, u8 controlBytes); + +/* + Splats a single element in 'src' to all elements in 'dst' + For example (pseudocode): + xmm1 = (1.5, 2.5, 3.5, 4.5) + xmm2 = (1, 2, 3, 4) + splat_vf(xmm1, xmm2, XMM_ELEMENT::X); + xmm1 = (4, 4, 4, 4) + */ +Instruction splat_vf(const ObjectGenerator& gen, + Register dst, + Register src, + Register::VF_ELEMENT element); + +Instruction xor_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2); + +Instruction sub_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2); + +Instruction add_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2); + +Instruction mul_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2); + +Instruction max_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2); + +Instruction min_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2); + +Instruction div_vf(const ObjectGenerator& gen, Register dst, Register src1, Register src2); + +Instruction sqrt_vf(const ObjectGenerator& gen, Register dst, Register src); + +Instruction itof_vf(const ObjectGenerator& gen, Register dst, Register src); + +Instruction ftoi_vf(const ObjectGenerator& gen, Register dst, Register src); + +Instruction pw_sra(const ObjectGenerator& gen, Register dst, Register src, u8 imm); + +Instruction pw_srl(const ObjectGenerator& gen, Register dst, Register src, u8 imm); + +Instruction ph_srl(const 
ObjectGenerator& gen, Register dst, Register src, u8 imm); + +Instruction pw_sll(const ObjectGenerator& gen, Register dst, Register src, u8 imm); + +Instruction ph_sll(const ObjectGenerator& gen, Register dst, Register src, u8 imm); + +Instruction parallel_add_byte(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +Instruction parallel_bitwise_or(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +Instruction parallel_bitwise_xor(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +Instruction parallel_bitwise_and(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +// Reminder - a word in MIPS = 32bits = a DWORD in x86 +// MIPS || x86 +// ----------------------- +// byte || byte +// halfword || word +// word || dword +// doubleword || quadword + +// -- Unpack High Data Instructions +Instruction pextub_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +Instruction pextuh_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +Instruction pextuw_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +// -- Unpack Low Data Instructions +Instruction pextlb_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +Instruction pextlh_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +Instruction pextlw_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +// Equal-to comparison as 16 bytes (8 bits) +Instruction parallel_compare_e_b(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +// Equal-to comparison as 8 halfwords (16 bits) +Instruction parallel_compare_e_h(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +// Equal-to comparison as 4 words (32 bits) +Instruction parallel_compare_e_w(const 
ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +// Greater than comparison as 16 bytes (8 bits) +Instruction parallel_compare_gt_b(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +// Greater than comparison as 8 halfwords (16 bits) +Instruction parallel_compare_gt_h(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +// Greater than comparison as 4 words (32 bits) +Instruction parallel_compare_gt_w(const ObjectGenerator& gen, + Register dst, + Register src0, + Register src1); + +Instruction vpunpcklqdq(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +Instruction pcpyld_swapped(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +Instruction pcpyud(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +Instruction vpsubd(const ObjectGenerator& gen, Register dst, Register src0, Register src1); + +Instruction vpsrldq(const ObjectGenerator& gen, Register dst, Register src, u8 imm); + +Instruction vpslldq(const ObjectGenerator& gen, Register dst, Register src, u8 imm); + +Instruction vpshuflw(const ObjectGenerator& gen, Register dst, Register src, u8 imm); + +Instruction vpshufhw(const ObjectGenerator& gen, Register dst, Register src, u8 imm); + +Instruction vpackuswb(const ObjectGenerator& gen, Register dst, Register src0, Register src1); +}; // namespace IGen } // namespace emitter diff --git a/goalc/emitter/IGenARM64.cpp b/goalc/emitter/IGenARM64.cpp new file mode 100644 index 0000000000..089faeae28 --- /dev/null +++ b/goalc/emitter/IGenARM64.cpp @@ -0,0 +1,1203 @@ + +#include "IGenARM64.h" + +#include "goalc/emitter/Instruction.h" +#include "goalc/emitter/InstructionSet.h" +#include "goalc/emitter/Register.h" + +// https://armconverter.com/?code=ret +// https://developer.arm.com/documentation/ddi0487/latest + +// TODO ARM64 - just silencing errors while things are not implemented obviously +#pragma GCC 
diagnostic ignored "-Wunused-parameter" + +namespace emitter { +namespace IGen { +namespace ARM64 { +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// MOVES +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +const auto instr_set = emitter::InstructionSet::ARM64; +using namespace emitter::ARM64; + +InstructionARM64 mov_gpr64_gpr64(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 mov_gpr64_u64(Register dst, uint64_t val) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 mov_gpr64_u32(Register dst, uint64_t val) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 mov_gpr64_s32(Register dst, int64_t val) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 movd_gpr32_xmm32(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 movd_xmm32_gpr32(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 movq_gpr64_xmm64(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 movq_xmm64_gpr64(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 mov_xmm32_xmm32(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +// todo - GPR64 -> XMM64 (zext) +// todo - XMM -> GPR64 + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// GOAL Loads and Stores +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 load8s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store8_gpr64_gpr64_plus_gpr64(Register addr1, 
Register addr2, Register value) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load8s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store8_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load8s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store8_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load8u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load8u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load8u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load16s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store16_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store16_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset) { 
+ ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store16_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load16s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load16s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load16u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load16u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load16u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store32_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store32_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT_MSG(false, "not yet 
implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store32_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load64_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store64_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load64_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store64_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load64_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return 
InstructionARM64(0b0); +} + +InstructionARM64 store64_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store_goal_vf(Register addr, Register value, Register off, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store_goal_gpr(Register addr, Register value, Register off, int offset, int size) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load_goal_xmm128(Register dst, Register addr, Register off, int offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load_goal_gpr(Register dst, + Register addr, + Register off, + int offset, + int size, + bool sign_extend) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// LOADS n' STORES - XMM32 +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +InstructionARM64 store32_xmm32_gpr64_plus_gpr64(Register addr1, + Register addr2, + Register xmm_value) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32_xmm32_gpr64_plus_gpr64(Register simd_dest, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store32_xmm32_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register xmm_value, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32_xmm32_gpr64_plus_gpr64_plus_s8(Register simd_dest, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store32_xmm32_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register xmm_value, + s64 offset) { 
+ ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 lea_reg_plus_off32(Register dest, Register base, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 lea_reg_plus_off8(Register dest, Register base, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 lea_reg_plus_off(Register dest, Register base, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store32_xmm32_gpr64_plus_s32(Register base, Register xmm_value, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store32_xmm32_gpr64_plus_s8(Register base, Register xmm_value, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32_xmm32_gpr64_plus_gpr64_plus_s32(Register simd_dest, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32_xmm32_gpr64_plus_s32(Register simd_dest, Register base, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32_xmm32_gpr64_plus_s8(Register simd_dest, Register base, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load_goal_xmm32(Register simd_dest, Register addr, Register off, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store_goal_xmm32(Register addr, Register xmm_value, Register off, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store_reg_offset_xmm32(Register base, Register xmm_value, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return 
InstructionARM64(0b0); +} + +InstructionARM64 load_reg_offset_xmm32(Register simd_dest, Register base, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// LOADS n' STORES - SIMD (128-bit, QWORDS) +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 store128_gpr64_simd128(Register gpr_addr, Register simd_reg) { + // https://www.scs.stanford.edu/~zyedidia/arm64/str_imm_fpsimd.html + // - STR Qn, [Xn] (unsigned offset) + ASSERT(gpr_addr.is_gpr(instr_set)); + ASSERT( + simd_reg.is_128bit_simd(instr_set)); // TODO ARM64 - this assertion isn't as useful for ARM + // since Q registers are not unique in terms of their id + return InstructionARM64(Base(0b0011110110, 10), Rn(gpr_addr.id()), Rt(simd_reg.id()), Imm12(0)); +} + +InstructionARM64 store128_gpr64_simd128_s32(Register gpr_addr, Register xmm_value, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store128_gpr64_simd128_s8(Register gpr_addr, Register xmm_value, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load128_simd128_gpr64(Register simd_dest, Register gpr_addr) { + // https://www.scs.stanford.edu/~zyedidia/arm64/ldr_imm_fpsimd.html + // - LDR Qt, [Xn|SP{, #pimm}] (unsigned offset, pimm = 0 here) + ASSERT(gpr_addr.is_gpr(instr_set)); + ASSERT(simd_dest.is_128bit_simd( + instr_set)); // TODO ARM64 - this assertion isn't as useful for ARM + // since Q registers are not unique in terms of their id + return InstructionARM64(Base(0b0011110111, 10), Rn(gpr_addr.id()), Rt(simd_dest.id()), Imm12(0)); +} + +InstructionARM64 load128_simd128_gpr64_s32(Register simd_dest, Register gpr_addr, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load128_simd128_gpr64_s8(Register simd_dest, Register gpr_addr, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return 
InstructionARM64(0b0); +} + +InstructionARM64 load128_xmm128_reg_offset(Register simd_dest, Register base, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store128_xmm128_reg_offset(Register base, Register xmm_val, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// RIP loads and stores +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 load64_rip_s32(Register dest, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32s_rip_s32(Register dest, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load32u_rip_s32(Register dest, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load16u_rip_s32(Register dest, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load16s_rip_s32(Register dest, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load8u_rip_s32(Register dest, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 load8s_rip_s32(Register dest, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 static_load(Register dest, s64 offset, int size, bool sign_extend) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store64_rip_s32(Register src, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store32_rip_s32(Register src, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store16_rip_s32(Register src, s64 
offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store8_rip_s32(Register src, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 static_store(Register value, s64 offset, int size) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 static_addr(Register dst, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 static_load_xmm32(Register simd_dest, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 static_store_xmm32(Register xmm_value, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +// TODO, special load/stores of 128 bit values. + +// TODO, consider specialized stack loads and stores? +InstructionARM64 load64_gpr64_plus_s32(Register dst_reg, int32_t offset, Register src_reg) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 store64_gpr64_plus_s32(Register addr, int32_t offset, Register value) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// FUNCTION STUFF +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 ret() { + // https://www.scs.stanford.edu/~zyedidia/arm64/ret.html + // - defaults to using X30 if Rn is absent + return InstructionARM64(Base(0b1101011001011111000000, 22), Rn(30)); +} + +InstructionARM64 push_gpr64(Register reg) { + // ARM64 stack grows down, so we subtract 16 from SP and store the register + // Equivalent assembly: STR reg, [SP, #-16]! + // - https://www.scs.stanford.edu/~zyedidia/arm64/str_imm_gen.html + // We use 16 because in ARM, the stack must be 16-byte aligned. 
+ // This does mean we are inefficiently using the stack, there are a few better options: + // - Push in pairs, two registers at a time + // - Preallocate stack-space + // But we can't do either of these at this level, this is an optimization that has to come from + // higher in the stack. Here we are concerned with just satisfying the need to push a GPR + ASSERT(reg.is_gpr(instr_set)); + return InstructionARM64(Base(0b1111100000000000000011, 22), Imm9(-16), Rn(ARM64_REG::SP), + Rt(reg.id())); +} + +InstructionARM64 pop_gpr64(Register reg) { + // ldr reg, [sp], #16 + // - https://www.scs.stanford.edu/~zyedidia/arm64/ldr_imm_gen.html + ASSERT(reg.is_gpr(instr_set)); + return InstructionARM64(Base(0b1111100001000000000001, 22), Imm9(16), Rn(ARM64_REG::SP), + Rt(reg.id())); +} + +InstructionARM64 call_r64(Register reg_) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 jmp_r64(Register reg_) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// INTEGER MATH +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +// NOTE: ARM can actually handle 12-bit immediate values, so if it's actually worth it, we +// could leverage these instructions for more than just 8-bit values +InstructionARM64 sub_gpr64_imm8s(Register reg, int64_t imm) { + // You cannot subtract or add with a negative immediate in ARM + // therefore depending on the value of the immediate, we use a different instruction + ASSERT(reg.is_gpr(instr_set)); + if (imm < 0) { + return add_gpr64_imm8s(reg, std::abs(imm)); + } + // https://www.scs.stanford.edu/~zyedidia/arm64/sub_addsub_imm.html + // - SUB Xd|SP, Xn|SP, #imm12 {, LSL #12} + // - using a shift of 0 here (last bit in the base) + return InstructionARM64(Base(0b1101000100, 10), Imm12(imm), Rn(reg.id()), Rd(reg.id())); +} + +// NOTE: ARM can actually handle 12-bit immediate values, so if it's actually worth it, we +// could leverage these instructions for 
more than just 8-bit values +InstructionARM64 add_gpr64_imm8s(Register reg, int64_t imm) { + // You cannot subtract or add with a negative immediate in ARM + // therefore depending on the value of the immediate, we use a different instruction + ASSERT(reg.is_gpr(instr_set)); + if (imm < 0) { + return sub_gpr64_imm8s(reg, std::abs(imm)); + } + // https://www.scs.stanford.edu/~zyedidia/arm64/add_addsub_imm.html + // ADD Xd|SP, Xn|SP, #imm12 {, LSL #12} - using a shift of 0 here (last bit in the base) + return InstructionARM64(Base(0b1001000100, 10), Imm12(imm), Rn(reg.id()), Rd(reg.id())); +} + +InstructionARM64 sub_gpr64_imm32s(Register reg, int64_t imm) { + // ARM64 does not support this kind of single-instruction + ASSERT_MSG(false, "sub_gpr64_imm32s not supported on ARM64"); + return InstructionARM64(0b0); +} + +InstructionARM64 add_gpr64_imm32s(Register reg, int64_t imm) { + // ARM64 does not support this kind of single-instruction + ASSERT_MSG(false, "add_gpr64_imm32s not supported on ARM64"); + return InstructionARM64(0b0); +} + +InstructionARM64 add_gpr64_imm(Register reg, int64_t imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 sub_gpr64_imm(Register reg, int64_t imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 add_gpr64_gpr64(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 sub_gpr64_gpr64(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 imul_gpr32_gpr32(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 imul_gpr64_gpr64(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 idiv_gpr32(Register reg) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 
unsigned_div_gpr32(Register reg) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 cdq() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 movsx_r64_r32(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 cmp_gpr64_gpr64(Register a, Register b) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// BIT STUFF +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 or_gpr64_gpr64(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 and_gpr64_gpr64(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 xor_gpr64_gpr64(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 not_gpr64(Register reg) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// SHIFTS +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 shl_gpr64_cl(Register reg) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 shr_gpr64_cl(Register reg) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 sar_gpr64_cl(Register reg) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 shl_gpr64_u8(Register reg, uint8_t sa) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 shr_gpr64_u8(Register reg, uint8_t sa) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 sar_gpr64_u8(Register reg, uint8_t sa) { + ASSERT_MSG(false, 
"not yet implemented"); + return InstructionARM64(0b0); +} + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// CONTROL FLOW +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 jmp_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 je_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 jne_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 jle_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 jge_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 jl_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 jg_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 jbe_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 jae_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 jb_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 ja_32() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// FLOAT MATH +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 cmp_flt_flt(Register a, Register b) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 sqrts_xmm(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 mulss_xmm_xmm(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 divss_xmm_xmm(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return 
InstructionARM64(0b0); +} + +InstructionARM64 subss_xmm_xmm(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 addss_xmm_xmm(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 minss_xmm_xmm(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 maxss_xmm_xmm(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 int32_to_float(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 float_to_int32(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 nop() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +// TODO - rsqrt / abs / sqrt + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// UTILITIES +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 null() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +///////////////////////////// +// AVX (VF - Vector Float) // +///////////////////////////// + +InstructionARM64 nop_vf() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 wait_vf() { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 mov_vf_vf(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 loadvf_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 loadvf_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not 
yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 loadvf_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 storevf_gpr64_plus_gpr64(Register value, Register addr1, Register addr2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 storevf_gpr64_plus_gpr64_plus_s8(Register value, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 storevf_gpr64_plus_gpr64_plus_s32(Register value, + Register addr1, + Register addr2, + s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 loadvf_rip_plus_s32(Register dest, s64 offset) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +// TODO - rip relative loads and stores. 
+ +InstructionARM64 blend_vf(Register dst, Register src1, Register src2, u8 mask) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 shuffle_vf(Register dst, Register src, u8 dx, u8 dy, u8 dz, u8 dw) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 swizzle_vf(Register dst, Register src, u8 controlBytes) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 splat_vf(Register dst, Register src, Register::VF_ELEMENT element) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 xor_vf(Register dst, Register src1, Register src2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 sub_vf(Register dst, Register src1, Register src2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 add_vf(Register dst, Register src1, Register src2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 mul_vf(Register dst, Register src1, Register src2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 max_vf(Register dst, Register src1, Register src2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 min_vf(Register dst, Register src1, Register src2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 div_vf(Register dst, Register src1, Register src2) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 sqrt_vf(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 itof_vf(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + 
+InstructionARM64 ftoi_vf(Register dst, Register src) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pw_sra(Register dst, Register src, u8 imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pw_srl(Register dst, Register src, u8 imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 ph_srl(Register dst, Register src, u8 imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pw_sll(Register dst, Register src, u8 imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} +InstructionARM64 ph_sll(Register dst, Register src, u8 imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_add_byte(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_bitwise_or(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_bitwise_xor(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_bitwise_and(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pextub_swapped(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pextuh_swapped(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pextuw_swapped(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + 
+InstructionARM64 pextlb_swapped(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pextlh_swapped(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pextlw_swapped(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_compare_e_b(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_compare_e_h(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_compare_e_w(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_compare_gt_b(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_compare_gt_h(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 parallel_compare_gt_w(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 vpunpcklqdq(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pcpyld_swapped(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 pcpyud(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 vpsubd(Register dst, Register 
src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 vpsrldq(Register dst, Register src, u8 imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 vpslldq(Register dst, Register src, u8 imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 vpshuflw(Register dst, Register src, u8 imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 vpshufhw(Register dst, Register src, u8 imm) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} + +InstructionARM64 vpackuswb(Register dst, Register src0, Register src1) { + ASSERT_MSG(false, "not yet implemented"); + return InstructionARM64(0b0); +} +} // namespace ARM64 +} // namespace IGen +} // namespace emitter \ No newline at end of file diff --git a/goalc/emitter/IGenARM64.h b/goalc/emitter/IGenARM64.h new file mode 100644 index 0000000000..d69500a99a --- /dev/null +++ b/goalc/emitter/IGenARM64.h @@ -0,0 +1,803 @@ +#pragma once + +#include "goalc/emitter/Instruction.h" +#include "goalc/emitter/Register.h" + +namespace emitter { +namespace IGen { +namespace ARM64 { +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// MOVES +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +/*! + * Move data from src to dst. Moves all 64-bits of the GPR. + */ +InstructionARM64 mov_gpr64_gpr64(Register dst, Register src); + +/*! + * Move a 64-bit constant into a register. + */ +InstructionARM64 mov_gpr64_u64(Register dst, uint64_t val); + +/*! + * Move a 32-bit constant into a register. Zeros the upper 32 bits. + */ +InstructionARM64 mov_gpr64_u32(Register dst, uint64_t val); + +/*! + * Move a signed 32-bit constant into a register. Sign extends for the upper 32 bits. + * When possible prefer mov_gpr64_u32. (use this only for negative values...) 
+ * This is always bigger than mov_gpr64_u32, but smaller than a mov_gpr64_u64. + */ +InstructionARM64 mov_gpr64_s32(Register dst, int64_t val); + +/*! + * Move 32-bits of xmm to 32 bits of gpr (no sign extension). + */ +InstructionARM64 movd_gpr32_xmm32(Register dst, Register src); + +/*! + * Move 32-bits of gpr to 32-bits of xmm (no sign extension) + */ +InstructionARM64 movd_xmm32_gpr32(Register dst, Register src); + +/*! + * Move 64-bits of xmm to 64 bits of gpr (no sign extension). + */ +InstructionARM64 movq_gpr64_xmm64(Register dst, Register src); + +/*! + * Move 64-bits of gpr to 64-bits of xmm (no sign extension) + */ +InstructionARM64 movq_xmm64_gpr64(Register dst, Register src); + +/*! + * Move 32-bits between xmm's + */ +InstructionARM64 mov_xmm32_xmm32(Register dst, Register src); + +// todo - GPR64 -> XMM64 (zext) +// todo - XMM -> GPR64 + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// GOAL Loads and Stores +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * movsx dst, BYTE PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +InstructionARM64 load8s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionARM64 store8_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value); + +InstructionARM64 load8s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 store8_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionARM64 load8s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 store8_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset); + +/*! + * movzx dst, BYTE PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp.
+ */ +InstructionARM64 load8u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionARM64 load8u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 load8u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * movsx dst, WORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +InstructionARM64 load16s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionARM64 store16_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value); + +InstructionARM64 store16_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionARM64 store16_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionARM64 load16s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 load16s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * movzx dst, WORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +InstructionARM64 load16u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionARM64 load16u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 load16u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * movsxd dst, DWORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. 
+ */ +InstructionARM64 load32s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionARM64 store32_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value); + +InstructionARM64 load32s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 store32_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionARM64 load32s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 store32_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset); + +/*! + * movzxd dst, DWORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +InstructionARM64 load32u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionARM64 load32u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 load32u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * mov dst, QWORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. 
+ */ +InstructionARM64 load64_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionARM64 store64_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value); + +InstructionARM64 load64_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 store64_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionARM64 load64_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 store64_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionARM64 store_goal_vf(Register addr, Register value, Register off, s64 offset); + +InstructionARM64 store_goal_gpr(Register addr, Register value, Register off, int offset, int size); + +InstructionARM64 load_goal_xmm128(Register dst, Register addr, Register off, int offset); + +/*! + * Load memory at addr + offset, where addr is a GOAL pointer and off is the offset register. + * This will pick the appropriate fancy addressing mode instruction. 
+ */ +InstructionARM64 load_goal_gpr(Register dst, + Register addr, + Register off, + int offset, + int size, + bool sign_extend); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// LOADS n' STORES - XMM32 +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +InstructionARM64 store32_xmm32_gpr64_plus_gpr64(Register addr1, Register addr2, Register xmm_value); + +InstructionARM64 load32_xmm32_gpr64_plus_gpr64(Register simd_dest, Register addr1, Register addr2); + +InstructionARM64 store32_xmm32_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register xmm_value, + s64 offset); + +InstructionARM64 load32_xmm32_gpr64_plus_gpr64_plus_s8(Register simd_dest, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 store32_xmm32_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register xmm_value, + s64 offset); + +InstructionARM64 lea_reg_plus_off32(Register dest, Register base, s64 offset); + +InstructionARM64 lea_reg_plus_off8(Register dest, Register base, s64 offset); + +InstructionARM64 lea_reg_plus_off(Register dest, Register base, s64 offset); + +InstructionARM64 store32_xmm32_gpr64_plus_s32(Register base, Register xmm_value, s64 offset); + +InstructionARM64 store32_xmm32_gpr64_plus_s8(Register base, Register xmm_value, s64 offset); + +InstructionARM64 load32_xmm32_gpr64_plus_gpr64_plus_s32(Register simd_dest, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 load32_xmm32_gpr64_plus_s32(Register simd_dest, Register base, s64 offset); + +InstructionARM64 load32_xmm32_gpr64_plus_s8(Register simd_dest, Register base, s64 offset); + +InstructionARM64 load_goal_xmm32(Register simd_dest, Register addr, Register off, s64 offset); + +InstructionARM64 store_goal_xmm32(Register addr, Register xmm_value, Register off, s64 offset); + +InstructionARM64 store_reg_offset_xmm32(Register base, Register xmm_value, s64 offset); + +InstructionARM64 load_reg_offset_xmm32(Register simd_dest, Register base, s64 offset); + 
+//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// LOADS n' STORES - XMM128 +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Store a 128-bit xmm into an address stored in a register, no offset + */ +InstructionARM64 store128_gpr64_simd128(Register gpr_addr, Register xmm_value); + +InstructionARM64 store128_gpr64_simd128_s32(Register gpr_addr, Register xmm_value, s64 offset); + +InstructionARM64 store128_gpr64_simd128_s8(Register gpr_addr, Register xmm_value, s64 offset); + +InstructionARM64 load128_simd128_gpr64(Register simd_dest, Register gpr_addr); + +InstructionARM64 load128_simd128_gpr64_s32(Register simd_dest, Register gpr_addr, s64 offset); + +InstructionARM64 load128_simd128_gpr64_s8(Register simd_dest, Register gpr_addr, s64 offset); + +InstructionARM64 load128_xmm128_reg_offset(Register simd_dest, Register base, s64 offset); + +InstructionARM64 store128_xmm128_reg_offset(Register base, Register xmm_val, s64 offset); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// RIP loads and stores +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionARM64 load64_rip_s32(Register dest, s64 offset); + +InstructionARM64 load32s_rip_s32(Register dest, s64 offset); + +InstructionARM64 load32u_rip_s32(Register dest, s64 offset); + +InstructionARM64 load16u_rip_s32(Register dest, s64 offset); + +InstructionARM64 load16s_rip_s32(Register dest, s64 offset); + +InstructionARM64 load8u_rip_s32(Register dest, s64 offset); + +InstructionARM64 load8s_rip_s32(Register dest, s64 offset); + +InstructionARM64 static_load(Register dest, s64 offset, int size, bool sign_extend); + +InstructionARM64 store64_rip_s32(Register src, s64 offset); + +InstructionARM64 store32_rip_s32(Register src, s64 offset); + +InstructionARM64 store16_rip_s32(Register src, s64 offset); + +InstructionARM64 store8_rip_s32(Register src, s64 offset); + +InstructionARM64 static_store(Register value, s64 offset, int size); + +InstructionARM64 static_addr(Register dst, s64 offset); + +InstructionARM64 
static_load_xmm32(Register simd_dest, s64 offset); + +InstructionARM64 static_store_xmm32(Register xmm_value, s64 offset); + +// TODO, special load/stores of 128 bit values. + +// TODO, consider specialized stack loads and stores? +InstructionARM64 load64_gpr64_plus_s32(Register dst_reg, int32_t offset, Register src_reg); + +/*! + * Store 64-bits from gpr into memory located at 64-bit reg + 32-bit signed offset. + */ +InstructionARM64 store64_gpr64_plus_s32(Register addr, int32_t offset, Register value); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// FUNCTION STUFF +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +/*! + * Function return. Pops the 64-bit return address (real) off the stack and jumps to it. + */ +InstructionARM64 ret(); + +/*! + * Instruction to push gpr (64-bits) onto the stack + */ +InstructionARM64 push_gpr64(Register reg); + +/*! + * Instruction to pop 64 bit gpr from the stack + */ +InstructionARM64 pop_gpr64(Register reg); + +/*! + * Call a function stored in a 64-bit gpr + */ +InstructionARM64 call_r64(Register reg_); + +/*! + * Jump to an x86-64 address stored in a 64-bit gpr. + */ +InstructionARM64 jmp_r64(Register reg_); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// INTEGER MATH +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +InstructionARM64 sub_gpr64_imm8s(Register reg, int64_t imm); + +InstructionARM64 sub_gpr64_imm32s(Register reg, int64_t imm); + +InstructionARM64 add_gpr64_imm8s(Register reg, int64_t v); + +InstructionARM64 add_gpr64_imm32s(Register reg, int64_t v); + +InstructionARM64 add_gpr64_imm(Register reg, int64_t imm); + +InstructionARM64 sub_gpr64_imm(Register reg, int64_t imm); + +InstructionARM64 add_gpr64_gpr64(Register dst, Register src); + +InstructionARM64 sub_gpr64_gpr64(Register dst, Register src); + +/*! + * Multiply gprs (32-bit, signed). + * (Note - probably worth doing imul on gpr64's to implement the EE's unsigned multiply) + */ +InstructionARM64 imul_gpr32_gpr32(Register dst, Register src); + +/*! 
+ * Multiply gprs (64-bit, signed). + * DANGER - this treats all operands as 64-bit. This is not like the EE. + */ +InstructionARM64 imul_gpr64_gpr64(Register dst, Register src); + +/*! + * Divide (idiv, 32 bit) + */ +InstructionARM64 idiv_gpr32(Register reg); + +InstructionARM64 unsigned_div_gpr32(Register reg); + +/*! + * Convert doubleword to quadword for division. + */ +InstructionARM64 cdq(); + +/*! + * Move from gpr32 to gpr64, with sign extension. + * Needed for multiplication/division madness. + */ +InstructionARM64 movsx_r64_r32(Register dst, Register src); + +/*! + * Compare gpr64. This sets the flags for the jumps. + * todo UNTESTED + */ +InstructionARM64 cmp_gpr64_gpr64(Register a, Register b); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// BIT STUFF +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Or of two gprs + */ +InstructionARM64 or_gpr64_gpr64(Register dst, Register src); + +/*! + * And of two gprs + */ +InstructionARM64 and_gpr64_gpr64(Register dst, Register src); + +/*! + * Xor of two gprs + */ +InstructionARM64 xor_gpr64_gpr64(Register dst, Register src); + +/*! + * Bitwise not a gpr + */ +InstructionARM64 not_gpr64(Register reg); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// SHIFTS +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Shift 64-bit gpr left by CL register + */ +InstructionARM64 shl_gpr64_cl(Register reg); + +/*! + * Shift 64-bit gpr right (logical) by CL register + */ +InstructionARM64 shr_gpr64_cl(Register reg); + +/*! + * Shift 64-bit gpr right (arithmetic) by CL register + */ +InstructionARM64 sar_gpr64_cl(Register reg); + +/*! + * Shift 64-bit gpr left (logical) by the constant shift amount "sa". + */ +InstructionARM64 shl_gpr64_u8(Register reg, uint8_t sa); + +/*! + * Shift 64-bit gpr right (logical) by the constant shift amount "sa". + */ +InstructionARM64 shr_gpr64_u8(Register reg, uint8_t sa); + +/*! + * Shift 64-bit gpr right (arithmetic) by the constant shift amount "sa".
+ */ +InstructionARM64 sar_gpr64_u8(Register reg, uint8_t sa); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// CONTROL FLOW +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Jump, 32-bit constant offset. The offset is by default 0 and must be patched later. + */ +InstructionARM64 jmp_32(); + +/*! + * Jump if equal. + */ +InstructionARM64 je_32(); + +/*! + * Jump not equal. + */ +InstructionARM64 jne_32(); + +/*! + * Jump less than or equal. + */ +InstructionARM64 jle_32(); + +/*! + * Jump greater than or equal. + */ +InstructionARM64 jge_32(); + +/*! + * Jump less than + */ +InstructionARM64 jl_32(); + +/*! + * Jump greater than + */ +InstructionARM64 jg_32(); + +/*! + * Jump below or equal + */ +InstructionARM64 jbe_32(); + +/*! + * Jump above or equal + */ +InstructionARM64 jae_32(); + +/*! + * Jump below + */ +InstructionARM64 jb_32(); + +/*! + * Jump above + */ +InstructionARM64 ja_32(); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// FLOAT MATH +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Compare two floats and set flag register for jump (ucomiss) + */ +InstructionARM64 cmp_flt_flt(Register a, Register b); + +InstructionARM64 sqrts_xmm(Register dst, Register src); + +/*! + * Multiply two floats in xmm's + */ +InstructionARM64 mulss_xmm_xmm(Register dst, Register src); + +/*! + * Divide two floats in xmm's + */ +InstructionARM64 divss_xmm_xmm(Register dst, Register src); + +/*! + * Subtract two floats in xmm's + */ +InstructionARM64 subss_xmm_xmm(Register dst, Register src); + +/*! + * Add two floats in xmm's + */ +InstructionARM64 addss_xmm_xmm(Register dst, Register src); + +/*! + * Floating point minimum. + */ +InstructionARM64 minss_xmm_xmm(Register dst, Register src); + +/*! + * Floating point maximum. + */ +InstructionARM64 maxss_xmm_xmm(Register dst, Register src); + +/*! + * Convert GPR int32 to XMM float (single precision) + */ +InstructionARM64 int32_to_float(Register dst, Register src); + +/*! 
+ * Convert XMM float to GPR int32 (single precision) (truncate) + */ +InstructionARM64 float_to_int32(Register dst, Register src); + +InstructionARM64 nop(); + +// TODO - rsqrt / abs / sqrt + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// UTILITIES +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * A "null" instruction. This instruction does not generate any bytes + * but can be referred to by a label. Useful to insert in place of a real instruction + * if the real instruction has been optimized out. + */ +InstructionARM64 null(); + +///////////////////////////// +// AVX (VF - Vector Float) // +///////////////////////////// + +InstructionARM64 nop_vf(); + +InstructionARM64 wait_vf(); + +InstructionARM64 mov_vf_vf(Register dst, Register src); + +InstructionARM64 loadvf_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionARM64 loadvf_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 loadvf_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 storevf_gpr64_plus_gpr64(Register value, Register addr1, Register addr2); + +InstructionARM64 storevf_gpr64_plus_gpr64_plus_s8(Register value, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 storevf_gpr64_plus_gpr64_plus_s32(Register value, + Register addr1, + Register addr2, + s64 offset); + +InstructionARM64 loadvf_rip_plus_s32(Register dest, s64 offset); + +// TODO - rip relative loads and stores. + +InstructionARM64 blend_vf(Register dst, Register src1, Register src2, u8 mask); + +InstructionARM64 shuffle_vf(Register dst, Register src, u8 dx, u8 dy, u8 dz, u8 dw); + +/* + Generic Swizzle (re-arrangement of packed FPs) operation, the control bytes are quite involved. + Here's a brief run-down: + - 8-bits / 4 groups of 2 bits + - Right-to-left, each group is used to determine which element in `src` gets copied into + `dst`'s element (W->X).
+ - GROUP OPTIONS + - 00b - Copy the least-significant element (X) + - 01b - Copy the second element (from the right) (Y) + - 10b - Copy the third element (from the right) (Z) + - 11b - Copy the most significant element (W) + Examples + ; xmm1 = (1.5, 2.5, 3.5, 4.5) (W,Z,Y,X in x86 land) + SHUFPS xmm1, xmm1, 0xff ; Copy the most significant element to all positions + > (1.5, 1.5, 1.5, 1.5) + SHUFPS xmm1, xmm1, 0x39 ; Rotate right + > (4.5, 1.5, 2.5, 3.5) + */ +InstructionARM64 swizzle_vf(Register dst, Register src, u8 controlBytes); + +/* + Splats a single element in 'src' to all elements in 'dst' + For example (pseudocode): + xmm1 = (1.5, 2.5, 3.5, 4.5) + xmm2 = (1, 2, 3, 4) + splat_vf(xmm1, xmm2, XMM_ELEMENT::X); + xmm1 = (4, 4, 4, 4) + */ +InstructionARM64 splat_vf(Register dst, Register src, Register::VF_ELEMENT element); + +InstructionARM64 xor_vf(Register dst, Register src1, Register src2); + +InstructionARM64 sub_vf(Register dst, Register src1, Register src2); + +InstructionARM64 add_vf(Register dst, Register src1, Register src2); + +InstructionARM64 mul_vf(Register dst, Register src1, Register src2); + +InstructionARM64 max_vf(Register dst, Register src1, Register src2); + +InstructionARM64 min_vf(Register dst, Register src1, Register src2); + +InstructionARM64 div_vf(Register dst, Register src1, Register src2); + +InstructionARM64 sqrt_vf(Register dst, Register src); + +InstructionARM64 itof_vf(Register dst, Register src); + +InstructionARM64 ftoi_vf(Register dst, Register src); + +InstructionARM64 pw_sra(Register dst, Register src, u8 imm); + +InstructionARM64 pw_srl(Register dst, Register src, u8 imm); + +InstructionARM64 ph_srl(Register dst, Register src, u8 imm); + +InstructionARM64 pw_sll(Register dst, Register src, u8 imm); + +InstructionARM64 ph_sll(Register dst, Register src, u8 imm); + +InstructionARM64 parallel_add_byte(Register dst, Register src0, Register src1); + +InstructionARM64 parallel_bitwise_or(Register dst, Register src0, Register 
src1); + +InstructionARM64 parallel_bitwise_xor(Register dst, Register src0, Register src1); + +InstructionARM64 parallel_bitwise_and(Register dst, Register src0, Register src1); + +// Reminder - a word in MIPS = 32bits = a DWORD in x86 +// MIPS || x86 +// ----------------------- +// byte || byte +// halfword || word +// word || dword +// doubleword || quadword + +// -- Unpack High Data Instructions +InstructionARM64 pextub_swapped(Register dst, Register src0, Register src1); + +InstructionARM64 pextuh_swapped(Register dst, Register src0, Register src1); + +InstructionARM64 pextuw_swapped(Register dst, Register src0, Register src1); + +// -- Unpack Low Data Instructions +InstructionARM64 pextlb_swapped(Register dst, Register src0, Register src1); + +InstructionARM64 pextlh_swapped(Register dst, Register src0, Register src1); + +InstructionARM64 pextlw_swapped(Register dst, Register src0, Register src1); + +// Equal-to comparison as 16 bytes (8 bits) +InstructionARM64 parallel_compare_e_b(Register dst, Register src0, Register src1); + +// Equal-to comparison as 8 halfwords (16 bits) +InstructionARM64 parallel_compare_e_h(Register dst, Register src0, Register src1); + +// Equal-to comparison as 4 words (32 bits) +InstructionARM64 parallel_compare_e_w(Register dst, Register src0, Register src1); + +// Greater than comparison as 16 bytes (8 bits) +InstructionARM64 parallel_compare_gt_b(Register dst, Register src0, Register src1); + +// Greater than comparison as 8 halfwords (16 bits) +InstructionARM64 parallel_compare_gt_h(Register dst, Register src0, Register src1); + +// Greater than comparison as 4 words (32 bits) +InstructionARM64 parallel_compare_gt_w(Register dst, Register src0, Register src1); + +InstructionARM64 vpunpcklqdq(Register dst, Register src0, Register src1); + +InstructionARM64 pcpyld_swapped(Register dst, Register src0, Register src1); + +InstructionARM64 pcpyud(Register dst, Register src0, Register src1); + +InstructionARM64
vpsubd(Register dst, Register src0, Register src1); + +InstructionARM64 vpsrldq(Register dst, Register src, u8 imm); + +InstructionARM64 vpslldq(Register dst, Register src, u8 imm); + +InstructionARM64 vpshuflw(Register dst, Register src, u8 imm); + +InstructionARM64 vpshufhw(Register dst, Register src, u8 imm); + +InstructionARM64 vpackuswb(Register dst, Register src0, Register src1); +} // namespace ARM64 +} // namespace IGen +} // namespace emitter \ No newline at end of file diff --git a/goalc/emitter/IGenX86.cpp b/goalc/emitter/IGenX86.cpp new file mode 100644 index 0000000000..17bff85849 --- /dev/null +++ b/goalc/emitter/IGenX86.cpp @@ -0,0 +1,2450 @@ +#include "IGenX86.h" + +#include + +#include "goalc/emitter/InstructionSet.h" + +namespace emitter { +namespace IGen { +namespace X86 { + +const auto instr_set = emitter::InstructionSet::X86; + +InstructionX86 mov_gpr64_gpr64(Register dst, Register src) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_gpr(instr_set)); + InstructionX86 instr(0x89); + instr.set_modrm_and_rex(src.hw_id(instr_set), dst.hw_id(instr_set), 3, true); + return instr; +} + +InstructionX86 mov_gpr64_u64(Register dst, uint64_t val) { + ASSERT(dst.is_gpr(instr_set)); + bool rex_b = false; + auto dst_hw_id = dst.hw_id(instr_set); + if (dst_hw_id >= 8) { + dst_hw_id -= 8; + rex_b = true; + } + InstructionX86 instr(0xb8 + dst_hw_id); + instr.set(REX(true, false, false, rex_b)); + instr.set(Imm(8, val)); + return instr; +} + +InstructionX86 mov_gpr64_u32(Register dst, uint64_t val) { + ASSERT(val <= UINT32_MAX); + ASSERT(dst.is_gpr(instr_set)); + auto dst_hw_id = dst.hw_id(instr_set); + bool rex_b = false; + if (dst_hw_id >= 8) { + dst_hw_id -= 8; + rex_b = true; + } + + InstructionX86 instr(0xb8 + dst_hw_id); + if (rex_b) { + instr.set(REX(false, false, false, rex_b)); + } + instr.set(Imm(4, val)); + return instr; +} + +InstructionX86 mov_gpr64_s32(Register dst, int64_t val) { + ASSERT(val >= INT32_MIN && val <= INT32_MAX); + 
ASSERT(dst.is_gpr(instr_set)); + InstructionX86 instr(0xc7); + instr.set_modrm_and_rex(0, dst.hw_id(instr_set), 3, true); + instr.set(Imm(4, val)); + return instr; +} + +InstructionX86 movd_gpr32_xmm32(Register dst, Register src) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0x66); + instr.set_op2(0x0f); + instr.set_op3(0x7e); + instr.set_modrm_and_rex(src.hw_id(instr_set), dst.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 movd_xmm32_gpr32(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_gpr(instr_set)); + InstructionX86 instr(0x66); + instr.set_op2(0x0f); + instr.set_op3(0x6e); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 movq_gpr64_xmm64(Register dst, Register src) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0x66); + instr.set_op2(0x0f); + instr.set_op3(0x7e); + instr.set_modrm_and_rex(src.hw_id(instr_set), dst.hw_id(instr_set), 3, true); + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 movq_xmm64_gpr64(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_gpr(instr_set)); + InstructionX86 instr(0x66); + instr.set_op2(0x0f); + instr.set_op3(0x6e); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, true); + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 mov_xmm32_xmm32(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x10); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 load8s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + 
ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0xf); + instr.set_op2(0xbe); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), true, false); + return instr; +} + +InstructionX86 store8_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0x88); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set)); + if (value.id() > RBX) { + instr.add_rex(); + } + return instr; +} + +InstructionX86 load8s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xbe); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 store8_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x88); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, false); + if (value.id() > RBX) { + instr.add_rex(); + } + return instr; +} + +InstructionX86 
load8s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xbe); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 store8_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x88); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, false); + if (value.id() > RBX) { + instr.add_rex(); + } + return instr; +} + +InstructionX86 load8u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0xf); + instr.set_op2(0xb6); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), true, false); + return instr; +} + +InstructionX86 load8u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 
instr(0xf); + instr.set_op2(0xb6); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 load8u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xb6); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 load16s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0xf); + instr.set_op2(0xbf); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), true, false); + return instr; +} + +InstructionX86 store16_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0x66); + instr.set_op2(0x89); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set)); + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 store16_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + 
ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x66); + instr.set_op2(0x89); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, false); + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 store16_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x66); + instr.set_op2(0x89); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, false); + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 load16s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xbf); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 load16s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xbf); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(instr_set), addr1.hw_id(instr_set), + 
addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 load16u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0xf); + instr.set_op2(0xb7); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), true, false); + return instr; +} + +InstructionX86 load16u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xb7); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 load16u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xb7); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 load32s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0x63); + 
instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), true); + return instr; +} + +InstructionX86 store32_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0x89); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set)); + return instr; +} + +InstructionX86 load32s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x63); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 store32_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x89); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, false); + return instr; +} + +InstructionX86 load32s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= 
INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x63); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 store32_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x89); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, false); + return instr; +} + +InstructionX86 load32u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0x8b); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set)); + return instr; +} + +InstructionX86 load32u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x8b); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, false); + return instr; +} + +InstructionX86 load32u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 
!= addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x8b); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, false); + return instr; +} + +InstructionX86 load64_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0x8b); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), true); + return instr; +} + +InstructionX86 store64_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0x89); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), true); + return instr; +} + +InstructionX86 load64_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x8b); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 store64_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != 
addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x89); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 load64_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x8b); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 store64_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset) { + ASSERT(value.is_gpr(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x89); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, true); + return instr; +} + +InstructionX86 store_goal_gpr(Register addr, Register value, Register off, int offset, int size) { + switch (size) { + case 1: + if (offset == 0) { + return store8_gpr64_gpr64_plus_gpr64(addr, off, value); + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return store8_gpr64_gpr64_plus_gpr64_plus_s8(addr, off, value, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return store8_gpr64_gpr64_plus_gpr64_plus_s32(addr, off, value, offset); + } else { + ASSERT(false); + } + case 2: + if (offset == 0) { + return store16_gpr64_gpr64_plus_gpr64(addr, off, value); + } else if 
(offset >= INT8_MIN && offset <= INT8_MAX) { + return store16_gpr64_gpr64_plus_gpr64_plus_s8(addr, off, value, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return store16_gpr64_gpr64_plus_gpr64_plus_s32(addr, off, value, offset); + } else { + ASSERT(false); + } + case 4: + if (offset == 0) { + return store32_gpr64_gpr64_plus_gpr64(addr, off, value); + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return store32_gpr64_gpr64_plus_gpr64_plus_s8(addr, off, value, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return store32_gpr64_gpr64_plus_gpr64_plus_s32(addr, off, value, offset); + } else { + ASSERT(false); + } + case 8: + if (offset == 0) { + return store64_gpr64_gpr64_plus_gpr64(addr, off, value); + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return store64_gpr64_gpr64_plus_gpr64_plus_s8(addr, off, value, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return store64_gpr64_gpr64_plus_gpr64_plus_s32(addr, off, value, offset); + } else { + ASSERT(false); + } + default: + ASSERT(false); + return {0}; + } +} + +InstructionX86 load_goal_gpr(Register dst, + Register addr, + Register off, + int offset, + int size, + bool sign_extend) { + switch (size) { + case 1: + if (offset == 0) { + if (sign_extend) { + return load8s_gpr64_gpr64_plus_gpr64(dst, addr, off); + } else { + return load8u_gpr64_gpr64_plus_gpr64(dst, addr, off); + } + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + if (sign_extend) { + return load8s_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); + } else { + return load8u_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); + } + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + if (sign_extend) { + return load8s_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); + } else { + return load8u_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); + } + } else { + ASSERT(false); + } + case 2: + if (offset == 0) { + if (sign_extend) { 
+ return load16s_gpr64_gpr64_plus_gpr64(dst, addr, off); + } else { + return load16u_gpr64_gpr64_plus_gpr64(dst, addr, off); + } + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + if (sign_extend) { + return load16s_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); + } else { + return load16u_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); + } + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + if (sign_extend) { + return load16s_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); + } else { + return load16u_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); + } + } else { + ASSERT(false); + } + case 4: + if (offset == 0) { + if (sign_extend) { + return load32s_gpr64_gpr64_plus_gpr64(dst, addr, off); + } else { + return load32u_gpr64_gpr64_plus_gpr64(dst, addr, off); + } + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + if (sign_extend) { + return load32s_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); + } else { + return load32u_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); + } + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + if (sign_extend) { + return load32s_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); + } else { + return load32u_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); + } + } else { + ASSERT(false); + } + case 8: + if (offset == 0) { + return load64_gpr64_gpr64_plus_gpr64(dst, addr, off); + + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return load64_gpr64_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); + + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return load64_gpr64_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); + + } else { + ASSERT(false); + } + default: + ASSERT(false); + return {0}; + } +} + +InstructionX86 store32_xmm32_gpr64_plus_gpr64(Register addr1, Register addr2, Register xmm_value) { + ASSERT(xmm_value.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + + 
InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x11); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(xmm_value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set)); + + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 load32_xmm32_gpr64_plus_gpr64(Register simd_dest, Register addr1, Register addr2) { + ASSERT(simd_dest.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x10); + instr.set_modrm_and_rex_for_reg_plus_reg_addr(simd_dest.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set)); + + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 store32_xmm32_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register xmm_value, + s64 offset) { + ASSERT(xmm_value.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x11); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8( + xmm_value.hw_id(instr_set), addr1.hw_id(instr_set), addr2.hw_id(instr_set), offset, false); + + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 load32_xmm32_gpr64_plus_gpr64_plus_s8(Register simd_dest, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(simd_dest.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x10); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s8( + simd_dest.hw_id(instr_set), addr1.hw_id(instr_set), addr2.hw_id(instr_set), offset, false); + + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 store32_xmm32_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register xmm_value, + s64 offset) { + ASSERT(xmm_value.is_xmm(instr_set)); + 
ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x11); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32( + xmm_value.hw_id(instr_set), addr1.hw_id(instr_set), addr2.hw_id(instr_set), offset, false); + + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 lea_reg_plus_off32(Register dest, Register base, s64 offset) { + ASSERT(dest.is_gpr(instr_set)); + ASSERT(base.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x8d); + instr.set_modrm_rex_sib_for_reg_reg_disp(dest.hw_id(instr_set), 2, base.hw_id(instr_set), true); + instr.set(Imm(4, offset)); + return instr; +} + +InstructionX86 lea_reg_plus_off8(Register dest, Register base, s64 offset) { + ASSERT(dest.is_gpr(instr_set)); + ASSERT(base.is_gpr(instr_set)); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x8d); + instr.set_modrm_rex_sib_for_reg_reg_disp(dest.hw_id(instr_set), 1, base.hw_id(instr_set), true); + instr.set(Imm(1, offset)); + return instr; +} + +InstructionX86 lea_reg_plus_off(Register dest, Register base, s64 offset) { + if (offset >= INT8_MIN && offset <= INT8_MAX) { + return lea_reg_plus_off8(dest, base, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return lea_reg_plus_off32(dest, base, offset); + } else { + ASSERT(false); + return {0}; + } +} + +InstructionX86 store32_xmm32_gpr64_plus_s32(Register base, Register xmm_value, s64 offset) { + ASSERT(xmm_value.is_xmm(instr_set)); + ASSERT(base.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x11); + instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_value.hw_id(instr_set), 2, base.hw_id(instr_set), + false); + instr.set(Imm(4, offset)); + instr.swap_op0_rex(); + return instr; +} + +InstructionX86 
store32_xmm32_gpr64_plus_s8(Register base, Register xmm_value, s64 offset) { + ASSERT(xmm_value.is_xmm(instr_set)); + ASSERT(base.is_gpr(instr_set)); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x11); + instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_value.hw_id(instr_set), 1, base.hw_id(instr_set), + false); + instr.set(Imm(1, offset)); + instr.swap_op0_rex(); + return instr; +} +/* 32-bit xmm load (opcode bytes f3 0f 10) addressed by two gprs and a displacement asserted to fit in a signed 32-bit value. */ +InstructionX86 load32_xmm32_gpr64_plus_gpr64_plus_s32(Register simd_dest, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(simd_dest.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x10); + instr.set_modrm_and_rex_for_reg_plus_reg_plus_s32( + simd_dest.hw_id(instr_set), addr1.hw_id(instr_set), addr2.hw_id(instr_set), offset, false); + + instr.swap_op0_rex(); + return instr; +} +/* 32-bit xmm load from a single gpr base with a 4-byte displacement. */ +InstructionX86 load32_xmm32_gpr64_plus_s32(Register simd_dest, Register base, s64 offset) { + ASSERT(simd_dest.is_xmm(instr_set)); + ASSERT(base.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x10); + instr.set_modrm_rex_sib_for_reg_reg_disp(simd_dest.hw_id(instr_set), 2, base.hw_id(instr_set), + false); + instr.set(Imm(4, offset)); + instr.swap_op0_rex(); + return instr; +} +/* 32-bit xmm load from a single gpr base with a 1-byte displacement. */ +InstructionX86 load32_xmm32_gpr64_plus_s8(Register simd_dest, Register base, s64 offset) { + ASSERT(simd_dest.is_xmm(instr_set)); + ASSERT(base.is_gpr(instr_set)); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x10); + instr.set_modrm_rex_sib_for_reg_reg_disp(simd_dest.hw_id(instr_set), 1, base.hw_id(instr_set), + false); + instr.set(Imm(1, offset)); + instr.swap_op0_rex(); + return instr; +} +/* Dispatcher for 32-bit xmm loads addressed by two gprs: picks the no-displacement, s8 or s32 encoder from the value of offset. */ +InstructionX86 
load_goal_xmm32(Register simd_dest, Register addr, Register off, s64 offset) { + if (offset == 0) { + return load32_xmm32_gpr64_plus_gpr64(simd_dest, addr, off); + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return load32_xmm32_gpr64_plus_gpr64_plus_s8(simd_dest, addr, off, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return load32_xmm32_gpr64_plus_gpr64_plus_s32(simd_dest, addr, off, offset); + } else { + ASSERT(false); + return {0}; + } +} +/* Store counterpart of load_goal_xmm32: zero offset uses the two-register form, otherwise the s8 or s32 form; offsets outside int32 hit ASSERT(false). Note the argument order passed on is (addr, off, xmm_value). */ +InstructionX86 store_goal_xmm32(Register addr, Register xmm_value, Register off, s64 offset) { + if (offset == 0) { + return store32_xmm32_gpr64_plus_gpr64(addr, off, xmm_value); + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return store32_xmm32_gpr64_plus_gpr64_plus_s8(addr, off, xmm_value, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return store32_xmm32_gpr64_plus_gpr64_plus_s32(addr, off, xmm_value, offset); + } else { + ASSERT(false); + return {0}; + } +} +/* 32-bit xmm store to a single gpr base plus offset; chooses the s8 or s32 encoder by offset range. There is no dedicated zero-offset form here, so offset 0 takes the s8 path. */ +InstructionX86 store_reg_offset_xmm32(Register base, Register xmm_value, s64 offset) { + ASSERT(base.is_gpr(instr_set)); + ASSERT(xmm_value.is_xmm(instr_set)); + if (offset >= INT8_MIN && offset <= INT8_MAX) { + return store32_xmm32_gpr64_plus_s8(base, xmm_value, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return store32_xmm32_gpr64_plus_s32(base, xmm_value, offset); + } else { + ASSERT(false); + return {0}; + } +} +/* 32-bit xmm load from a single gpr base plus offset; chooses the s8 or s32 encoder by offset range. */ +InstructionX86 load_reg_offset_xmm32(Register simd_dest, Register base, s64 offset) { + ASSERT(base.is_gpr(instr_set)); + ASSERT(simd_dest.is_xmm(instr_set)); + if (offset >= INT8_MIN && offset <= INT8_MAX) { + return load32_xmm32_gpr64_plus_s8(simd_dest, base, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return load32_xmm32_gpr64_plus_s32(simd_dest, base, offset); + } else { + ASSERT(false); + return {0}; + } +} +/* 128-bit xmm store to the address held in a gpr, no displacement: opcode bytes 66 0f 7f. */ +InstructionX86 store128_gpr64_simd128(Register gpr_addr, Register xmm_value) { + 
ASSERT(gpr_addr.is_gpr(instr_set)); + ASSERT(xmm_value.is_xmm(instr_set)); + InstructionX86 instr(0x66); + + instr.set_op2(0x0f); + instr.set_op3(0x7f); + instr.set_modrm_and_rex_for_reg_addr(xmm_value.hw_id(instr_set), gpr_addr.hw_id(instr_set), + false); + instr.swap_op0_rex(); + return instr; +} +/* 128-bit xmm store (opcode bytes 66 0f 7f) to a gpr base with a 4-byte displacement; offset is asserted to fit in int32. */ +InstructionX86 store128_gpr64_simd128_s32(Register gpr_addr, Register xmm_value, s64 offset) { + ASSERT(gpr_addr.is_gpr(instr_set)); + ASSERT(xmm_value.is_xmm(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x66); + + instr.set_op2(0x0f); + instr.set_op3(0x7f); + instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_value.hw_id(instr_set), 2, gpr_addr.hw_id(instr_set), + false); + instr.set(Imm(4, offset)); + instr.swap_op0_rex(); + return instr; +} +/* 128-bit xmm store to a gpr base with a 1-byte displacement; offset is asserted to fit in int8. */ +InstructionX86 store128_gpr64_simd128_s8(Register gpr_addr, Register xmm_value, s64 offset) { + ASSERT(gpr_addr.is_gpr(instr_set)); + ASSERT(xmm_value.is_xmm(instr_set)); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x66); + + instr.set_op2(0x0f); + instr.set_op3(0x7f); + instr.set_modrm_rex_sib_for_reg_reg_disp(xmm_value.hw_id(instr_set), 1, gpr_addr.hw_id(instr_set), + false); + instr.set(Imm(1, offset)); + instr.swap_op0_rex(); + return instr; +} +/* 128-bit xmm load from the address held in a gpr, no displacement: opcode bytes 66 0f 6f. */ +InstructionX86 load128_simd128_gpr64(Register simd_dest, Register gpr_addr) { + ASSERT(gpr_addr.is_gpr(instr_set)); + ASSERT(simd_dest.is_xmm(instr_set)); + InstructionX86 instr(0x66); + + instr.set_op2(0x0f); + instr.set_op3(0x6f); + instr.set_modrm_and_rex_for_reg_addr(simd_dest.hw_id(instr_set), gpr_addr.hw_id(instr_set), + false); + instr.swap_op0_rex(); + return instr; +} +/* 128-bit xmm load from a gpr base with a 4-byte displacement; offset is asserted to fit in int32. */ +InstructionX86 load128_simd128_gpr64_s32(Register simd_dest, Register gpr_addr, s64 offset) { + ASSERT(gpr_addr.is_gpr(instr_set)); + ASSERT(simd_dest.is_xmm(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x66); + + instr.set_op2(0x0f); + instr.set_op3(0x6f); + 
instr.set_modrm_rex_sib_for_reg_reg_disp(simd_dest.hw_id(instr_set), 2, gpr_addr.hw_id(instr_set), + false); + instr.set(Imm(4, offset)); + instr.swap_op0_rex(); + return instr; +} +/* 128-bit xmm load (opcode bytes 66 0f 6f) from a gpr base with a 1-byte displacement; offset is asserted to fit in int8. */ +InstructionX86 load128_simd128_gpr64_s8(Register simd_dest, Register gpr_addr, s64 offset) { + ASSERT(gpr_addr.is_gpr(instr_set)); + ASSERT(simd_dest.is_xmm(instr_set)); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x66); + + instr.set_op2(0x0f); + instr.set_op3(0x6f); + instr.set_modrm_rex_sib_for_reg_reg_disp(simd_dest.hw_id(instr_set), 1, gpr_addr.hw_id(instr_set), + false); + instr.set(Imm(1, offset)); + instr.swap_op0_rex(); + return instr; +} +/* Dispatcher for 128-bit xmm loads from base plus offset: no-displacement form for offset 0, then s8, then s32; anything wider hits ASSERT(false). */ +InstructionX86 load128_xmm128_reg_offset(Register simd_dest, Register base, s64 offset) { + if (offset == 0) { + return load128_simd128_gpr64(simd_dest, base); + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return load128_simd128_gpr64_s8(simd_dest, base, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return load128_simd128_gpr64_s32(simd_dest, base, offset); + } else { + ASSERT(false); + return {0}; + } +} +/* Dispatcher for 128-bit xmm stores to base plus offset; same selection rule as load128_xmm128_reg_offset. */ +InstructionX86 store128_xmm128_reg_offset(Register base, Register xmm_val, s64 offset) { + if (offset == 0) { + return store128_gpr64_simd128(base, xmm_val); + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return store128_gpr64_simd128_s8(base, xmm_val, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return store128_gpr64_simd128_s32(base, xmm_val, offset); + } else { + ASSERT(false); + return {0}; + } +} +/* 64-bit gpr load from a rip-relative address: opcode 0x8b with the last helper argument true (NOTE(review): presumably the REX.W flag; confirm in set_modrm_and_rex_for_rip_plus_s32). */ +InstructionX86 load64_rip_s32(Register dest, s64 offset) { + ASSERT(dest.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x8b); + instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(instr_set), offset, true); + return instr; +} +/* Sign-extending 32-bit rip-relative load into a gpr: opcode 0x63. */ +InstructionX86 load32s_rip_s32(Register dest, s64 offset) { + ASSERT(dest.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + 
InstructionX86 instr(0x63); + instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(instr_set), offset, true); + return instr; +} +/* Zero-extending 32-bit rip-relative load: opcode 0x8b with the last helper argument false. */ +InstructionX86 load32u_rip_s32(Register dest, s64 offset) { + ASSERT(dest.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x8b); + instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(instr_set), offset, false); + return instr; +} +/* Unsigned 16-bit rip-relative load: opcode bytes 0f b7. */ +InstructionX86 load16u_rip_s32(Register dest, s64 offset) { + ASSERT(dest.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xb7); + instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(instr_set), offset, true); + return instr; +} +/* Signed 16-bit rip-relative load: opcode bytes 0f bf. */ +InstructionX86 load16s_rip_s32(Register dest, s64 offset) { + ASSERT(dest.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xbf); + instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(instr_set), offset, true); + return instr; +} +/* Unsigned 8-bit rip-relative load: opcode bytes 0f b6. */ +InstructionX86 load8u_rip_s32(Register dest, s64 offset) { + ASSERT(dest.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xb6); + instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(instr_set), offset, true); + return instr; +} +/* Signed 8-bit rip-relative load: opcode bytes 0f be. */ +InstructionX86 load8s_rip_s32(Register dest, s64 offset) { + ASSERT(dest.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0xf); + instr.set_op2(0xbe); + instr.set_modrm_and_rex_for_rip_plus_s32(dest.hw_id(instr_set), offset, true); + return instr; +} +/* Dispatcher for rip-relative gpr loads. size selects 1, 2, 4 or 8 bytes and sign_extend picks the signed or unsigned encoder (ignored for size 8). Any other size hits ASSERT(false) and then returns the {0} placeholder, matching the other dispatchers in this file, so control never falls off the end of the function. */ +InstructionX86 static_load(Register dest, s64 offset, int size, bool sign_extend) { + switch (size) { + case 1: + if (sign_extend) { + return load8s_rip_s32(dest, offset); + } else { + return load8u_rip_s32(dest, offset); + } + break; + case 2: + if (sign_extend) { + return load16s_rip_s32(dest, offset); + } else { + return load16u_rip_s32(dest, offset); + } + break; + case 
4: + if (sign_extend) { + return load32s_rip_s32(dest, offset); + } else { + return load32u_rip_s32(dest, offset); + } + break; + case 8: + return load64_rip_s32(dest, offset); + default: + ASSERT(false); + return {0}; + } +} +/* 64-bit gpr store to a rip-relative address: opcode 0x89 with the last helper argument true. */ +InstructionX86 store64_rip_s32(Register src, s64 offset) { + ASSERT(src.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x89); + instr.set_modrm_and_rex_for_rip_plus_s32(src.hw_id(instr_set), offset, true); + return instr; +} +/* 32-bit gpr store to a rip-relative address: opcode 0x89 with the last helper argument false. */ +InstructionX86 store32_rip_s32(Register src, s64 offset) { + ASSERT(src.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x89); + instr.set_modrm_and_rex_for_rip_plus_s32(src.hw_id(instr_set), offset, false); + return instr; +} +/* 16-bit gpr store to a rip-relative address: leading byte 0x66 followed by opcode 0x89; swap_op0_rex() runs after the ModRM/REX setup. */ +InstructionX86 store16_rip_s32(Register src, s64 offset) { + ASSERT(src.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x66); + instr.set_op2(0x89); + instr.set_modrm_and_rex_for_rip_plus_s32(src.hw_id(instr_set), offset, false); + instr.swap_op0_rex(); + return instr; +} +/* 8-bit gpr store to a rip-relative address: opcode 0x88. add_rex() is called when src.id() is greater than RBX (NOTE(review): presumably to force a REX prefix for the low-byte registers beyond the first four; confirm against the Register numbering). */ +InstructionX86 store8_rip_s32(Register src, s64 offset) { + ASSERT(src.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x88); + instr.set_modrm_and_rex_for_rip_plus_s32(src.hw_id(instr_set), offset, false); + if (src.id() > RBX) { + instr.add_rex(); + } + return instr; +} +/* Dispatcher for rip-relative gpr stores by size (1, 2, 4 or 8 bytes). Any other size hits ASSERT(false) and then returns the {0} placeholder so the function always returns a value. */ +InstructionX86 static_store(Register value, s64 offset, int size) { + switch (size) { + case 1: + return store8_rip_s32(value, offset); + case 2: + return store16_rip_s32(value, offset); + case 4: + return store32_rip_s32(value, offset); + case 8: + return store64_rip_s32(value, offset); + default: + ASSERT(false); + return {0}; + } +} + +InstructionX86 
static_addr(Register dst, s64 offset) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x8d); + instr.set_modrm_and_rex_for_rip_plus_s32(dst.hw_id(instr_set), offset, true); + return instr; +} +/* 32-bit xmm load from a rip-relative address: opcode bytes f3 0f 10. */ +InstructionX86 static_load_xmm32(Register simd_dest, s64 offset) { + ASSERT(simd_dest.is_xmm(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x10); + instr.set_modrm_and_rex_for_rip_plus_s32(simd_dest.hw_id(instr_set), offset, false); + + instr.swap_op0_rex(); + return instr; +} +/* 32-bit xmm store to a rip-relative address: opcode bytes f3 0f 11. */ +InstructionX86 static_store_xmm32(Register xmm_value, s64 offset) { + ASSERT(xmm_value.is_xmm(instr_set)); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x11); + instr.set_modrm_and_rex_for_rip_plus_s32(xmm_value.hw_id(instr_set), offset, false); + + instr.swap_op0_rex(); + return instr; +} +/* 64-bit gpr load from src_reg plus a 32-bit displacement: opcode 0x8b. Unlike the xmm variants this records the offset with set_disp() rather than set(Imm). */ +InstructionX86 load64_gpr64_plus_s32(Register dst_reg, int32_t offset, Register src_reg) { + ASSERT(dst_reg.is_gpr(instr_set)); + ASSERT(src_reg.is_gpr(instr_set)); + InstructionX86 instr(0x8b); + instr.set_modrm_rex_sib_for_reg_reg_disp(dst_reg.hw_id(instr_set), 2, src_reg.hw_id(instr_set), + true); + instr.set_disp(Imm(4, offset)); + return instr; +} +/* 64-bit gpr store to addr plus a 32-bit displacement: opcode 0x89, offset recorded with set_disp(). */ +InstructionX86 store64_gpr64_plus_s32(Register addr, int32_t offset, Register value) { + ASSERT(addr.is_gpr(instr_set)); + ASSERT(value.is_gpr(instr_set)); + InstructionX86 instr(0x89); + instr.set_modrm_rex_sib_for_reg_reg_disp(value.hw_id(instr_set), 2, addr.hw_id(instr_set), true); + instr.set_disp(Imm(4, offset)); + return instr; +} +/* Single-byte instruction 0xc3. */ +InstructionX86 ret() { + return InstructionX86(0xc3); +} +/* Push a gpr: opcode 0x50 plus the low three bits of the hardware id; ids 8 and above subtract 8 and set a REX whose last flag is true. */ +InstructionX86 push_gpr64(Register reg) { + ASSERT(reg.is_gpr(instr_set)); + if (reg.hw_id(instr_set) >= 8) { + auto i = InstructionX86(0x50 + reg.hw_id(instr_set) - 8); + i.set(REX(false, false, false, true)); + return i; + } + 
return InstructionX86(0x50 + reg.hw_id(instr_set)); +} +/* Pop into a gpr: opcode 0x58 plus the low three bits of the hardware id; ids 8 and above subtract 8 and set a REX whose last flag is true. */ +InstructionX86 pop_gpr64(Register reg) { + ASSERT(reg.is_gpr(instr_set)); + if (reg.hw_id(instr_set) >= 8) { + auto i = InstructionX86(0x58 + reg.hw_id(instr_set) - 8); + i.set(REX(false, false, false, true)); + return i; + } + return InstructionX86(0x58 + reg.hw_id(instr_set)); +} +/* Indirect call through a gpr: opcode 0xff with ModRM mod 3 and reg_op 2. Hardware ids 8 and above are reduced by 8 and flagged in REX. */ +InstructionX86 call_r64(Register reg_) { + ASSERT(reg_.is_gpr(instr_set)); + auto reg = reg_.hw_id(instr_set); + InstructionX86 instr(0xff); + if (reg >= 8) { + instr.set(REX(false, false, false, true)); + reg -= 8; + } + ASSERT(reg < 8); + ModRM mrm; + mrm.rm = reg; + mrm.reg_op = 2; + mrm.mod = 3; + instr.set(mrm); + return instr; +} +/* Indirect jump through a gpr: same construction as call_r64 but with reg_op 4. */ +InstructionX86 jmp_r64(Register reg_) { + ASSERT(reg_.is_gpr(instr_set)); + auto reg = reg_.hw_id(instr_set); + InstructionX86 instr(0xff); + if (reg >= 8) { + instr.set(REX(false, false, false, true)); + reg -= 8; + } + ASSERT(reg < 8); + ModRM mrm; + mrm.rm = reg; + mrm.reg_op = 4; + mrm.mod = 3; + instr.set(mrm); + return instr; +} +/* Subtract a signed 8-bit immediate from a 64-bit gpr: opcode 0x83 with reg_op 5. */ +InstructionX86 sub_gpr64_imm8s(Register reg, int64_t imm) { + ASSERT(reg.is_gpr(instr_set)); + ASSERT(imm >= INT8_MIN && imm <= INT8_MAX); + + InstructionX86 instr(0x83); + instr.set_modrm_and_rex(5, reg.hw_id(instr_set), 3, true); + instr.set(Imm(1, imm)); + return instr; +} +/* Subtract a signed 32-bit immediate from a 64-bit gpr: opcode 0x81 with reg_op 5. */ +InstructionX86 sub_gpr64_imm32s(Register reg, int64_t imm) { + ASSERT(reg.is_gpr(instr_set)); + ASSERT(imm >= INT32_MIN && imm <= INT32_MAX); + InstructionX86 instr(0x81); + instr.set_modrm_and_rex(5, reg.hw_id(instr_set), 3, true); + instr.set(Imm(4, imm)); + return instr; +} +/* Add a signed 8-bit immediate to a 64-bit gpr: opcode 0x83 with reg_op 0. The register kind is asserted, as in sub_gpr64_imm8s. */ +InstructionX86 add_gpr64_imm8s(Register reg, int64_t v) { + ASSERT(reg.is_gpr(instr_set)); + ASSERT(v >= INT8_MIN && v <= INT8_MAX); + InstructionX86 instr(0x83); + instr.set_modrm_and_rex(0, reg.hw_id(instr_set), 3, true); + instr.set(Imm(1, v)); + return instr; +} +/* Add a signed 32-bit immediate to a 64-bit gpr: opcode 0x81 with reg_op 0. The register kind is asserted, as in sub_gpr64_imm32s. */ +InstructionX86 add_gpr64_imm32s(Register reg, int64_t v) { + ASSERT(reg.is_gpr(instr_set)); + ASSERT(v >= INT32_MIN && v <= INT32_MAX); + InstructionX86 instr(0x81); + instr.set_modrm_and_rex(0, 
reg.hw_id(instr_set), 3, true); + instr.set(Imm(4, v)); + return instr; +} +/* Dispatcher: 8-bit immediate form when imm fits in int8, else the 32-bit form. Unlike the ASSERT-based dispatchers in this file, an immediate outside int32 throws std::runtime_error. */ +InstructionX86 add_gpr64_imm(Register reg, int64_t imm) { + if (imm >= INT8_MIN && imm <= INT8_MAX) { + return add_gpr64_imm8s(reg, imm); + } else if (imm >= INT32_MIN && imm <= INT32_MAX) { + return add_gpr64_imm32s(reg, imm); + } else { + throw std::runtime_error("Invalid `add` with reg[" + reg.print() + "]/imm[" + + std::to_string(imm) + "]"); + } +} +/* Subtract counterpart of add_gpr64_imm, with the same selection rule and the same exception on an out-of-range immediate. */ +InstructionX86 sub_gpr64_imm(Register reg, int64_t imm) { + if (imm >= INT8_MIN && imm <= INT8_MAX) { + return sub_gpr64_imm8s(reg, imm); + } else if (imm >= INT32_MIN && imm <= INT32_MAX) { + return sub_gpr64_imm32s(reg, imm); + } else { + throw std::runtime_error("Invalid `sub` with reg[" + reg.print() + "]/imm[" + + std::to_string(imm) + "]"); + } +} +/* 64-bit register add, opcode 0x01. Note that src is passed as the first (reg) argument and dst as the second, the reverse of the imul/or/and/xor helpers. */ +InstructionX86 add_gpr64_gpr64(Register dst, Register src) { + InstructionX86 instr(0x01); + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_gpr(instr_set)); + instr.set_modrm_and_rex(src.hw_id(instr_set), dst.hw_id(instr_set), 3, true); + return instr; +} +/* 64-bit register subtract, opcode 0x29, with the same src-first operand order as add_gpr64_gpr64. */ +InstructionX86 sub_gpr64_gpr64(Register dst, Register src) { + InstructionX86 instr(0x29); + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_gpr(instr_set)); + instr.set_modrm_and_rex(src.hw_id(instr_set), dst.hw_id(instr_set), 3, true); + return instr; +} +/* 32-bit register multiply: opcode bytes 0f af, last helper argument false. */ +InstructionX86 imul_gpr32_gpr32(Register dst, Register src) { + InstructionX86 instr(0xf); + instr.set_op2(0xaf); + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_gpr(instr_set)); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + return instr; +} +/* 64-bit register multiply: opcode bytes 0f af, last helper argument true. */ +InstructionX86 imul_gpr64_gpr64(Register dst, Register src) { + InstructionX86 instr(0xf); + instr.set_op2(0xaf); + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_gpr(instr_set)); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, true); + return instr; +} +/* 32-bit signed divide by reg: opcode 0xf7 with reg_op 7. */ +InstructionX86 idiv_gpr32(Register reg) { + InstructionX86 instr(0xf7); + ASSERT(reg.is_gpr(instr_set)); + 
instr.set_modrm_and_rex(7, reg.hw_id(instr_set), 3, false); + return instr; +} +/* 32-bit unsigned divide by reg: opcode 0xf7 with reg_op 6. */ +InstructionX86 unsigned_div_gpr32(Register reg) { + InstructionX86 instr(0xf7); + ASSERT(reg.is_gpr(instr_set)); + instr.set_modrm_and_rex(6, reg.hw_id(instr_set), 3, false); + return instr; +} +/* Single-byte instruction 0x99. */ +InstructionX86 cdq() { + InstructionX86 instr(0x99); + return instr; +} +/* Sign-extend a 32-bit gpr into a 64-bit gpr: opcode 0x63, register-direct (mod 3). */ +InstructionX86 movsx_r64_r32(Register dst, Register src) { + InstructionX86 instr(0x63); + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_gpr(instr_set)); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, true); + return instr; +} +/* 64-bit compare of two gprs: opcode 0x3b with a in the reg field and b in the rm field. */ +InstructionX86 cmp_gpr64_gpr64(Register a, Register b) { + InstructionX86 instr(0x3b); + ASSERT(a.is_gpr(instr_set)); + ASSERT(b.is_gpr(instr_set)); + instr.set_modrm_and_rex(a.hw_id(instr_set), b.hw_id(instr_set), 3, true); + return instr; +} +/* 64-bit bitwise or: opcode 0x0b, dst in the reg field. */ +InstructionX86 or_gpr64_gpr64(Register dst, Register src) { + InstructionX86 instr(0x0b); + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_gpr(instr_set)); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, true); + return instr; +} +/* 64-bit bitwise and: opcode 0x23, dst in the reg field. */ +InstructionX86 and_gpr64_gpr64(Register dst, Register src) { + InstructionX86 instr(0x23); + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_gpr(instr_set)); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, true); + return instr; +} +/* 64-bit bitwise xor: opcode 0x33, dst in the reg field. */ +InstructionX86 xor_gpr64_gpr64(Register dst, Register src) { + InstructionX86 instr(0x33); + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_gpr(instr_set)); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, true); + return instr; +} +/* 64-bit bitwise not: opcode 0xf7 with reg_op 2. */ +InstructionX86 not_gpr64(Register reg) { + InstructionX86 instr(0xf7); + ASSERT(reg.is_gpr(instr_set)); + instr.set_modrm_and_rex(2, reg.hw_id(instr_set), 3, true); + return instr; +} +/* 64-bit left shift with no immediate (the name indicates the count comes from cl): opcode 0xd3 with reg_op 4. */ +InstructionX86 shl_gpr64_cl(Register reg) { + ASSERT(reg.is_gpr(instr_set)); + InstructionX86 instr(0xd3); + instr.set_modrm_and_rex(4, reg.hw_id(instr_set), 3, true); + return 
instr; +} +/* Opcode 0xd3 with reg_op 5: the shr variant of the cl-count shifts. */ +InstructionX86 shr_gpr64_cl(Register reg) { + ASSERT(reg.is_gpr(instr_set)); + InstructionX86 instr(0xd3); + instr.set_modrm_and_rex(5, reg.hw_id(instr_set), 3, true); + return instr; +} +/* Opcode 0xd3 with reg_op 7: the sar variant of the cl-count shifts. */ +InstructionX86 sar_gpr64_cl(Register reg) { + ASSERT(reg.is_gpr(instr_set)); + InstructionX86 instr(0xd3); + instr.set_modrm_and_rex(7, reg.hw_id(instr_set), 3, true); + return instr; +} +/* Shift by an immediate: opcode 0xc1 with reg_op 4 and the shift amount as a 1-byte immediate. */ +InstructionX86 shl_gpr64_u8(Register reg, uint8_t sa) { + ASSERT(reg.is_gpr(instr_set)); + InstructionX86 instr(0xc1); + instr.set_modrm_and_rex(4, reg.hw_id(instr_set), 3, true); + instr.set(Imm(1, sa)); + return instr; +} +/* Opcode 0xc1 with reg_op 5 and a 1-byte shift amount. */ +InstructionX86 shr_gpr64_u8(Register reg, uint8_t sa) { + ASSERT(reg.is_gpr(instr_set)); + InstructionX86 instr(0xc1); + instr.set_modrm_and_rex(5, reg.hw_id(instr_set), 3, true); + instr.set(Imm(1, sa)); + return instr; +} +/* Opcode 0xc1 with reg_op 7 and a 1-byte shift amount. */ +InstructionX86 sar_gpr64_u8(Register reg, uint8_t sa) { + ASSERT(reg.is_gpr(instr_set)); + InstructionX86 instr(0xc1); + instr.set_modrm_and_rex(7, reg.hw_id(instr_set), 3, true); + instr.set(Imm(1, sa)); + return instr; +} +/* The jump builders from here on all emit a 4-byte immediate of 0 (NOTE(review): presumably a placeholder patched later with the real target; the patching code is not in this chunk). jmp_32 uses the single opcode byte 0xe9; the conditional forms use 0x0f plus a second opcode byte. */ +InstructionX86 jmp_32() { + InstructionX86 instr(0xe9); + instr.set(Imm(4, 0)); + return instr; +} + +InstructionX86 je_32() { + InstructionX86 instr(0x0f); + instr.set_op2(0x84); + instr.set(Imm(4, 0)); + return instr; +} + +InstructionX86 jne_32() { + InstructionX86 instr(0x0f); + instr.set_op2(0x85); + instr.set(Imm(4, 0)); + return instr; +} + +InstructionX86 jle_32() { + InstructionX86 instr(0x0f); + instr.set_op2(0x8e); + instr.set(Imm(4, 0)); + return instr; +} + +InstructionX86 jge_32() { + InstructionX86 instr(0x0f); + instr.set_op2(0x8d); + instr.set(Imm(4, 0)); + return instr; +} + +InstructionX86 jl_32() { + InstructionX86 instr(0x0f); + instr.set_op2(0x8c); + instr.set(Imm(4, 0)); + return instr; +} + +InstructionX86 jg_32() { + InstructionX86 instr(0x0f); + instr.set_op2(0x8f); + instr.set(Imm(4, 0)); + return instr; +} + +InstructionX86 jbe_32() { + InstructionX86 instr(0x0f); + 
instr.set_op2(0x86); + instr.set(Imm(4, 0)); + return instr; +} +/* Conditional jump builders: 0x0f plus a second opcode byte and a 4-byte immediate of 0. */ +InstructionX86 jae_32() { + InstructionX86 instr(0x0f); + instr.set_op2(0x83); + instr.set(Imm(4, 0)); + return instr; +} + +InstructionX86 jb_32() { + InstructionX86 instr(0x0f); + instr.set_op2(0x82); + instr.set(Imm(4, 0)); + return instr; +} + +InstructionX86 ja_32() { + InstructionX86 instr(0x0f); + instr.set_op2(0x87); + instr.set(Imm(4, 0)); + return instr; +} +/* Compare two xmm registers: opcode bytes 0f 2e, register-direct. No leading prefix byte, so no swap_op0_rex() call is needed here. */ +InstructionX86 cmp_flt_flt(Register a, Register b) { + ASSERT(a.is_xmm(instr_set)); + ASSERT(b.is_xmm(instr_set)); + InstructionX86 instr(0x0f); + instr.set_op2(0x2e); + instr.set_modrm_and_rex(a.hw_id(instr_set), b.hw_id(instr_set), 3, false); + return instr; +} +/* The xmm arithmetic builders from here on share one shape: bytes f3 0f plus a third opcode byte, register-direct ModRM with dst in the reg field, then swap_op0_rex(). Third byte 0x51 here. */ +InstructionX86 sqrts_xmm(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x51); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} +/* Third opcode byte 0x59. */ +InstructionX86 mulss_xmm_xmm(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x59); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} +/* Third opcode byte 0x5e. */ +InstructionX86 divss_xmm_xmm(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x5e); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} +/* Third opcode byte 0x5c. */ +InstructionX86 subss_xmm_xmm(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x5c); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + 
return instr; +} +/* Scalar xmm add: bytes f3 0f 58, register-direct, dst in the reg field. */ +InstructionX86 addss_xmm_xmm(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x58); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} +/* Scalar xmm min: bytes f3 0f 5d. */ +InstructionX86 minss_xmm_xmm(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x5d); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} +/* Scalar xmm max: bytes f3 0f 5f. */ +InstructionX86 maxss_xmm_xmm(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x5f); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} +/* Convert a gpr source into an xmm destination: bytes f3 0f 2a. The asserts require dst to be xmm and src to be a gpr. */ +InstructionX86 int32_to_float(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_gpr(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x2a); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} +/* Convert an xmm source into a gpr destination: bytes f3 0f 2c. The asserts require dst to be a gpr and src to be xmm. */ +InstructionX86 float_to_int32(Register dst, Register src) { + ASSERT(dst.is_gpr(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xf3); + instr.set_op2(0x0f); + instr.set_op3(0x2c); + instr.set_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, false); + instr.swap_op0_rex(); + return instr; +} +/* Single-byte instruction 0x90. */ +InstructionX86 nop() { + InstructionX86 instr(0x90); + return instr; +} +/* Returns an instruction built from opcode 0 with the kIsNull flag set in m_flags (NOTE(review): how consumers treat kIsNull is not visible in this chunk). */ +InstructionX86 null() { + InstructionX86 i(0); + i.m_flags |= InstructionX86::kIsNull; + return i; +} +/* Two-byte instruction d9 d0, taking no operands. */ +InstructionX86 nop_vf() { + InstructionX86 instr(0xd9); + instr.set_op2(0xd0); + return instr; +} +/* Single-byte instruction 0x9b, taking no operands. */ +InstructionX86 wait_vf() { + InstructionX86 instr(0x9B); 
+ return instr; +} +/* VEX-encoded xmm to xmm move. When src has a hardware id of 8 or more and dst is below 8, it emits opcode 0x29 with the operands swapped in the ModRM fields; otherwise opcode 0x28 with dst in the reg field. NOTE(review): presumably this avoids needing the longer VEX form; confirm against set_vex_modrm_and_rex. */ +InstructionX86 mov_vf_vf(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + if (src.hw_id(instr_set) >= 8 && dst.hw_id(instr_set) < 8) { + InstructionX86 instr(0x29); + instr.set_vex_modrm_and_rex(src.hw_id(instr_set), dst.hw_id(instr_set), 3, + VEX3::LeadingBytes::P_0F, false); + return instr; + } else { + InstructionX86 instr(0x28); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 3, + VEX3::LeadingBytes::P_0F, false); + return instr; + } +} +/* VEX-encoded 128-bit load (opcode 0x28) addressed by two gprs, no displacement. The address registers must differ and neither may be RSP (asserted). */ +InstructionX86 loadvf_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0x28); + instr.set_vex_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), + VEX3::LeadingBytes::P_0F, false); + return instr; +} +/* Same load with a displacement asserted to fit in a signed 8-bit value. */ +InstructionX86 loadvf_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x28); + instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), offset, + VEX3::LeadingBytes::P_0F, false); + return instr; +} +/* Same load with a displacement asserted to fit in a signed 32-bit value. */ +InstructionX86 loadvf_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x28); + 
instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s32( + dst.hw_id(instr_set), addr1.hw_id(instr_set), addr2.hw_id(instr_set), offset, + VEX3::LeadingBytes::P_0F, false); + return instr; +} +/* Dispatcher for VEX 128-bit loads addressed by two gprs: picks the no-displacement, s8 or s32 encoder. offset is declared int here (the sibling dispatchers take s64), so on platforms where int is 32 bits the final else branch cannot be reached. */ +InstructionX86 load_goal_xmm128(Register dst, Register addr, Register off, int offset) { + if (offset == 0) { + return loadvf_gpr64_plus_gpr64(dst, addr, off); + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return loadvf_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return loadvf_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset); + } else { + ASSERT(false); + return {0}; + } +} +/* VEX-encoded 128-bit store (opcode 0x29) addressed by two gprs, no displacement. The address registers must differ and neither may be RSP (asserted). */ +InstructionX86 storevf_gpr64_plus_gpr64(Register value, Register addr1, Register addr2) { + ASSERT(value.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + InstructionX86 instr(0x29); + instr.set_vex_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(instr_set), addr1.hw_id(instr_set), + addr2.hw_id(instr_set), + VEX3::LeadingBytes::P_0F, false); + return instr; +} +/* Same store with a displacement asserted to fit in a signed 8-bit value. */ +InstructionX86 storevf_gpr64_plus_gpr64_plus_s8(Register value, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(value.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != RSP); + ASSERT(offset >= INT8_MIN && offset <= INT8_MAX); + InstructionX86 instr(0x29); + instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s8( + value.hw_id(instr_set), addr1.hw_id(instr_set), addr2.hw_id(instr_set), offset, + VEX3::LeadingBytes::P_0F, false); + return instr; +} +/* Same store with a displacement asserted to fit in a signed 32-bit value. */ +InstructionX86 storevf_gpr64_plus_gpr64_plus_s32(Register value, + Register addr1, + Register addr2, + s64 offset) { + ASSERT(value.is_xmm(instr_set)); + ASSERT(addr1.is_gpr(instr_set)); + ASSERT(addr2.is_gpr(instr_set)); + ASSERT(addr1 != addr2); + ASSERT(addr1 != RSP); + ASSERT(addr2 != 
RSP); + ASSERT(offset >= INT32_MIN && offset <= INT32_MAX); + InstructionX86 instr(0x29); + instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s32( + value.hw_id(instr_set), addr1.hw_id(instr_set), addr2.hw_id(instr_set), offset, + VEX3::LeadingBytes::P_0F, false); + return instr; +} +/* Dispatcher for VEX 128-bit stores addressed by two gprs: picks the no-displacement, s8 or s32 encoder; an offset outside int32 falls through to ASSERT(false) and the {0} placeholder. */ +InstructionX86 store_goal_vf(Register addr, Register value, Register off, s64 offset) { + if (offset == 0) { + return storevf_gpr64_plus_gpr64(value, addr, off); + } else if (offset >= INT8_MIN && offset <= INT8_MAX) { + return storevf_gpr64_plus_gpr64_plus_s8(value, addr, off, offset); + } else if (offset >= INT32_MIN && offset <= INT32_MAX) { + return storevf_gpr64_plus_gpr64_plus_s32(value, addr, off, offset); + } + ASSERT(false); + return {0}; +} +/* VEX 128-bit load (opcode 0x28) from a rip-relative address; offset is asserted to fit in int32. */ +InstructionX86 loadvf_rip_plus_s32(Register dest, s64 offset) { + ASSERT(dest.is_xmm(instr_set)); + ASSERT(offset >= INT32_MIN); + ASSERT(offset <= INT32_MAX); + InstructionX86 instr(0x28); + instr.set_vex_modrm_and_rex_for_rip_plus_s32(dest.hw_id(instr_set), offset); + return instr; +} +/* VEX blend of src1 and src2 into dst: opcode 0x0c in the P_0F_3A map with the P_66 prefix and a 1-byte mask immediate. Only the low four bits of mask may be set (asserted). */ +InstructionX86 blend_vf(Register dst, Register src1, Register src2, u8 mask) { + ASSERT(!(mask & 0b11110000)); + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + ASSERT(src2.is_xmm(instr_set)); + InstructionX86 instr(0x0c); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src2.hw_id(instr_set), + VEX3::LeadingBytes::P_0F_3A, src1.hw_id(instr_set), false, + VexPrefix::P_66); + instr.set(Imm(1, mask)); + return instr; +} +/* VEX shuffle, opcode 0xc6, with src supplied as both source operands and controlBytes as the 1-byte immediate. */ +InstructionX86 swizzle_vf(Register dst, Register src, u8 controlBytes) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0xC6); + + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src.hw_id(instr_set)); + instr.set(Imm(1, controlBytes)); + return instr; +} +/* Builds the swizzle immediate from four 2-bit selectors (each asserted to be below 4): dx in bits 0-1, dy in bits 2-3, dz in bits 4-5, dw in bits 6-7. */ +InstructionX86 shuffle_vf(Register dst, Register src, u8 dx, u8 dy, u8 dz, u8 dw) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + ASSERT(dx < 
4); + ASSERT(dy < 4); + ASSERT(dz < 4); + ASSERT(dw < 4); + u8 imm = dx + (dy << 2) + (dz << 4) + (dw << 6); + return swizzle_vf(dst, src, imm); +} +/* Broadcasts one element of src to every lane of dst by calling swizzle_vf with the same 2-bit selector repeated four times (X is 0, Y is 1, Z is 2, W is 3). */ +InstructionX86 splat_vf(Register dst, Register src, Register::VF_ELEMENT element) { + switch (element) { + case Register::VF_ELEMENT::X: + return swizzle_vf(dst, src, 0b00000000); + break; + case Register::VF_ELEMENT::Y: + return swizzle_vf(dst, src, 0b01010101); + break; + case Register::VF_ELEMENT::Z: + return swizzle_vf(dst, src, 0b10101010); + break; + case Register::VF_ELEMENT::W: + return swizzle_vf(dst, src, 0b11111111); + break; + default: + ASSERT(false); + return {0}; + } +} +/* The three-operand VEX builders from here on share one shape: dst in the reg field, src2 in the rm field, src1 as the fourth helper argument (NOTE(review): presumably the VEX.vvvv operand; confirm in set_vex_modrm_and_rex). Opcode 0x57 here. */ +InstructionX86 xor_vf(Register dst, Register src1, Register src2) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + ASSERT(src2.is_xmm(instr_set)); + InstructionX86 instr(0x57); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src2.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src1.hw_id(instr_set)); + return instr; +} +/* Opcode 0x5c. */ +InstructionX86 sub_vf(Register dst, Register src1, Register src2) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + ASSERT(src2.is_xmm(instr_set)); + InstructionX86 instr(0x5c); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src2.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src1.hw_id(instr_set)); + return instr; +} +/* Opcode 0x58. */ +InstructionX86 add_vf(Register dst, Register src1, Register src2) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + ASSERT(src2.is_xmm(instr_set)); + InstructionX86 instr(0x58); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src2.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src1.hw_id(instr_set)); + return instr; +} +/* Opcode 0x59. */ +InstructionX86 mul_vf(Register dst, Register src1, Register src2) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + ASSERT(src2.is_xmm(instr_set)); + InstructionX86 instr(0x59); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src2.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + 
src1.hw_id(instr_set)); + return instr; +} + +InstructionX86 max_vf(Register dst, Register src1, Register src2) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + ASSERT(src2.is_xmm(instr_set)); + InstructionX86 instr(0x5F); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src2.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src1.hw_id(instr_set)); + return instr; +} + +InstructionX86 min_vf(Register dst, Register src1, Register src2) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + ASSERT(src2.is_xmm(instr_set)); + InstructionX86 instr(0x5D); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src2.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src1.hw_id(instr_set)); + return instr; +} + +InstructionX86 div_vf(Register dst, Register src1, Register src2) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + ASSERT(src2.is_xmm(instr_set)); + InstructionX86 instr(0x5E); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src2.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src1.hw_id(instr_set)); + return instr; +} + +InstructionX86 sqrt_vf(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0x51); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + 0b0); + return instr; +} + +InstructionX86 itof_vf(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + InstructionX86 instr(0x5b); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + 0); + return instr; +} + +InstructionX86 ftoi_vf(Register dst, Register src) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x5b); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + 0, false, VexPrefix::P_F3); + return instr; +} + +InstructionX86 pw_sra(Register dst, Register src, 
u8 imm) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x72); + instr.set_vex_modrm_and_rex(4, src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + dst.hw_id(instr_set), false, VexPrefix::P_66); + instr.set(Imm(1, imm)); + return instr; +} + +InstructionX86 pw_srl(Register dst, Register src, u8 imm) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x72); + instr.set_vex_modrm_and_rex(2, src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + dst.hw_id(instr_set), false, VexPrefix::P_66); + instr.set(Imm(1, imm)); + return instr; +} + +InstructionX86 ph_srl(Register dst, Register src, u8 imm) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x71); + instr.set_vex_modrm_and_rex(2, src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + dst.hw_id(instr_set), false, VexPrefix::P_66); + instr.set(Imm(1, imm)); + return instr; +} + +InstructionX86 pw_sll(Register dst, Register src, u8 imm) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x72); + instr.set_vex_modrm_and_rex(6, src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + dst.hw_id(instr_set), false, VexPrefix::P_66); + instr.set(Imm(1, imm)); + return instr; +} +InstructionX86 ph_sll(Register dst, Register src, u8 imm) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x71); + instr.set_vex_modrm_and_rex(6, src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + dst.hw_id(instr_set), false, VexPrefix::P_66); + instr.set(Imm(1, imm)); + return instr; +} + +InstructionX86 parallel_add_byte(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0xFC); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return 
instr; +} + +InstructionX86 parallel_bitwise_or(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0xEB); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 parallel_bitwise_xor(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0xEF); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 parallel_bitwise_and(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0xDB); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 pextub_swapped(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x68); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 pextuh_swapped(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x69); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 pextuw_swapped(Register dst, Register src0, 
Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x6a); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 pextlb_swapped(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x60); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 pextlh_swapped(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x61); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 pextlw_swapped(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x62); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 parallel_compare_e_b(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x74); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 parallel_compare_e_h(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + 
ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x75); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 parallel_compare_e_w(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x76); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 parallel_compare_gt_b(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x64); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 parallel_compare_gt_h(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x65); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 parallel_compare_gt_w(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x66); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 vpunpcklqdq(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); 
+ + InstructionX86 instr(0x6c); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 pcpyld_swapped(Register dst, Register src0, Register src1) { + return vpunpcklqdq(dst, src0, src1); +} + +InstructionX86 pcpyud(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x6d); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 vpsubd(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0xfa); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} + +InstructionX86 vpsrldq(Register dst, Register src, u8 imm) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x73); + instr.set_vex_modrm_and_rex(3, src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + dst.hw_id(instr_set), false, VexPrefix::P_66); + instr.set(Imm(1, imm)); + return instr; +} + +InstructionX86 vpslldq(Register dst, Register src, u8 imm) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x73); + instr.set_vex_modrm_and_rex(7, src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + dst.hw_id(instr_set), false, VexPrefix::P_66); + instr.set(Imm(1, imm)); + return instr; +} + +InstructionX86 vpshuflw(Register dst, Register src, u8 imm) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x70); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), 
VEX3::LeadingBytes::P_0F, + 0, false, VexPrefix::P_F2); + instr.set(Imm(1, imm)); + return instr; +} + +InstructionX86 vpshufhw(Register dst, Register src, u8 imm) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src.is_xmm(instr_set)); + + InstructionX86 instr(0x70); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + 0, false, VexPrefix::P_F3); + instr.set(Imm(1, imm)); + return instr; +} + +InstructionX86 vpackuswb(Register dst, Register src0, Register src1) { + ASSERT(dst.is_xmm(instr_set)); + ASSERT(src0.is_xmm(instr_set)); + ASSERT(src1.is_xmm(instr_set)); + + InstructionX86 instr(0x67); + instr.set_vex_modrm_and_rex(dst.hw_id(instr_set), src1.hw_id(instr_set), VEX3::LeadingBytes::P_0F, + src0.hw_id(instr_set), false, VexPrefix::P_66); + return instr; +} +} // namespace X86 +} // namespace IGen +} // namespace emitter \ No newline at end of file diff --git a/goalc/emitter/IGenX86.h b/goalc/emitter/IGenX86.h new file mode 100644 index 0000000000..eae830043f --- /dev/null +++ b/goalc/emitter/IGenX86.h @@ -0,0 +1,803 @@ +#pragma once + +#include "goalc/emitter/Instruction.h" +#include "goalc/emitter/Register.h" + +namespace emitter { +namespace IGen { +namespace X86 { +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// MOVES +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +/*! + * Move data from src to dst. Moves all 64-bits of the GPR. + */ +InstructionX86 mov_gpr64_gpr64(Register dst, Register src); + +/*! + * Move a 64-bit constant into a register. + */ +InstructionX86 mov_gpr64_u64(Register dst, uint64_t val); + +/*! + * Move a 32-bit constant into a register. Zeros the upper 32 bits. + */ +InstructionX86 mov_gpr64_u32(Register dst, uint64_t val); + +/*! + * Move a signed 32-bit constant into a register. Sign extends for the upper 32 bits. + * When possible prefer mov_gpr64_u32. (use this only for negative values...) + * This is always bigger than mov_gpr64_u32, but smaller than a mov_gpr_u64. 
+ */ +InstructionX86 mov_gpr64_s32(Register dst, int64_t val); + +/*! + * Move 32-bits of xmm to 32 bits of gpr (no sign extension). + */ +InstructionX86 movd_gpr32_xmm32(Register dst, Register src); + +/*! + * Move 32-bits of gpr to 32-bits of xmm (no sign extension) + */ +InstructionX86 movd_xmm32_gpr32(Register dst, Register src); + +/*! + * Move 64-bits of xmm to 64 bits of gpr (no sign extension). + */ +InstructionX86 movq_gpr64_xmm64(Register dst, Register src); + +/*! + * Move 64-bits of gpr to 64-bits of xmm (no sign extension) + */ +InstructionX86 movq_xmm64_gpr64(Register dst, Register src); + +/*! + * Move 32-bits between xmm's + */ +InstructionX86 mov_xmm32_xmm32(Register dst, Register src); + +// todo - GPR64 -> XMM64 (zext) +// todo - XMM -> GPR64 + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// GOAL Loads and Stores +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * movsx dst, BYTE PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +InstructionX86 load8s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionX86 store8_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value); + +InstructionX86 load8s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 store8_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionX86 load8s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 store8_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset); + +/*! + * movzx dst, BYTE PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. 
+ */ +InstructionX86 load8u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionX86 load8u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 load8u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * movsx dst, WORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +InstructionX86 load16s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionX86 store16_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value); + +InstructionX86 store16_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionX86 store16_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionX86 load16s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 load16s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * movzx dst, WORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +InstructionX86 load16u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionX86 load16u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 load16u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * movsxd dst, DWORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. 
+ */ +InstructionX86 load32s_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionX86 store32_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value); + +InstructionX86 load32s_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 store32_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionX86 load32s_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 store32_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset); + +/*! + * movzxd dst, DWORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. + */ +InstructionX86 load32u_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionX86 load32u_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 load32u_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +/*! + * mov dst, QWORD PTR [addr1 + addr2] + * addr1 and addr2 have to be different registers. + * Cannot use rsp. 
+ */ +InstructionX86 load64_gpr64_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionX86 store64_gpr64_gpr64_plus_gpr64(Register addr1, Register addr2, Register value); + +InstructionX86 load64_gpr64_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 store64_gpr64_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionX86 load64_gpr64_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 store64_gpr64_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register value, + s64 offset); + +InstructionX86 store_goal_vf(Register addr, Register value, Register off, s64 offset); + +InstructionX86 store_goal_gpr(Register addr, Register value, Register off, int offset, int size); + +InstructionX86 load_goal_xmm128(Register dst, Register addr, Register off, int offset); + +/*! + * Load memory at addr + offset, where addr is a GOAL pointer and off is the offset register. + * This will pick the appropriate fancy addressing mode instruction. 
+ */ +InstructionX86 load_goal_gpr(Register dst, + Register addr, + Register off, + int offset, + int size, + bool sign_extend); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// LOADS n' STORES - XMM32 +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +InstructionX86 store32_xmm32_gpr64_plus_gpr64(Register addr1, Register addr2, Register xmm_value); + +InstructionX86 load32_xmm32_gpr64_plus_gpr64(Register simd_dest, Register addr1, Register addr2); + +InstructionX86 store32_xmm32_gpr64_plus_gpr64_plus_s8(Register addr1, + Register addr2, + Register xmm_value, + s64 offset); + +InstructionX86 load32_xmm32_gpr64_plus_gpr64_plus_s8(Register simd_dest, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 store32_xmm32_gpr64_plus_gpr64_plus_s32(Register addr1, + Register addr2, + Register xmm_value, + s64 offset); + +InstructionX86 lea_reg_plus_off32(Register dest, Register base, s64 offset); + +InstructionX86 lea_reg_plus_off8(Register dest, Register base, s64 offset); + +InstructionX86 lea_reg_plus_off(Register dest, Register base, s64 offset); + +InstructionX86 store32_xmm32_gpr64_plus_s32(Register base, Register xmm_value, s64 offset); + +InstructionX86 store32_xmm32_gpr64_plus_s8(Register base, Register xmm_value, s64 offset); + +InstructionX86 load32_xmm32_gpr64_plus_gpr64_plus_s32(Register simd_dest, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 load32_xmm32_gpr64_plus_s32(Register simd_dest, Register base, s64 offset); + +InstructionX86 load32_xmm32_gpr64_plus_s8(Register simd_dest, Register base, s64 offset); + +InstructionX86 load_goal_xmm32(Register simd_dest, Register addr, Register off, s64 offset); + +InstructionX86 store_goal_xmm32(Register addr, Register xmm_value, Register off, s64 offset); + +InstructionX86 store_reg_offset_xmm32(Register base, Register xmm_value, s64 offset); + +InstructionX86 load_reg_offset_xmm32(Register simd_dest, Register base, s64 offset); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// LOADS n' STORES - 
XMM128 +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Store a 128-bit xmm into an address stored in a register, no offset + */ +InstructionX86 store128_gpr64_simd128(Register gpr_addr, Register xmm_value); + +InstructionX86 store128_gpr64_simd128_s32(Register gpr_addr, Register xmm_value, s64 offset); + +InstructionX86 store128_gpr64_simd128_s8(Register gpr_addr, Register xmm_value, s64 offset); + +InstructionX86 load128_simd128_gpr64(Register simd_dest, Register gpr_addr); + +InstructionX86 load128_simd128_gpr64_s32(Register simd_dest, Register gpr_addr, s64 offset); + +InstructionX86 load128_simd128_gpr64_s8(Register simd_dest, Register gpr_addr, s64 offset); + +InstructionX86 load128_xmm128_reg_offset(Register simd_dest, Register base, s64 offset); + +InstructionX86 store128_xmm128_reg_offset(Register base, Register xmm_val, s64 offset); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// RIP loads and stores +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +InstructionX86 load64_rip_s32(Register dest, s64 offset); + +InstructionX86 load32s_rip_s32(Register dest, s64 offset); + +InstructionX86 load32u_rip_s32(Register dest, s64 offset); + +InstructionX86 load16u_rip_s32(Register dest, s64 offset); + +InstructionX86 load16s_rip_s32(Register dest, s64 offset); + +InstructionX86 load8u_rip_s32(Register dest, s64 offset); + +InstructionX86 load8s_rip_s32(Register dest, s64 offset); + +InstructionX86 static_load(Register dest, s64 offset, int size, bool sign_extend); + +InstructionX86 store64_rip_s32(Register src, s64 offset); + +InstructionX86 store32_rip_s32(Register src, s64 offset); + +InstructionX86 store16_rip_s32(Register src, s64 offset); + +InstructionX86 store8_rip_s32(Register src, s64 offset); + +InstructionX86 static_store(Register value, s64 offset, int size); + +InstructionX86 static_addr(Register dst, s64 offset); + +InstructionX86 static_load_xmm32(Register simd_dest, s64 offset); + +InstructionX86 static_store_xmm32(Register xmm_value, s64 offset); + +// 
TODO, special load/stores of 128 bit values. + +// TODO, consider specialized stack loads and stores? +InstructionX86 load64_gpr64_plus_s32(Register dst_reg, int32_t offset, Register src_reg); + +/*! + * Store 64-bits from gpr into memory located at 64-bit reg + 32-bit signed offset. + */ +InstructionX86 store64_gpr64_plus_s32(Register addr, int32_t offset, Register value); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// FUNCTION STUFF +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +/*! + * Function return. Pops the 64-bit return address (real) off the stack and jumps to it. + */ +InstructionX86 ret(); + +/*! + * Instruction to push gpr (64-bits) onto the stack + */ +InstructionX86 push_gpr64(Register reg); + +/*! + * Instruction to pop 64 bit gpr from the stack + */ +InstructionX86 pop_gpr64(Register reg); + +/*! + * Call a function stored in a 64-bit gpr + */ +InstructionX86 call_r64(Register reg_); + +/*! + * Jump to an x86-64 address stored in a 64-bit gpr. + */ +InstructionX86 jmp_r64(Register reg_); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// INTEGER MATH +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +InstructionX86 sub_gpr64_imm8s(Register reg, int64_t imm); + +InstructionX86 sub_gpr64_imm32s(Register reg, int64_t imm); + +InstructionX86 add_gpr64_imm8s(Register reg, int64_t v); + +InstructionX86 add_gpr64_imm32s(Register reg, int64_t v); + +InstructionX86 add_gpr64_imm(Register reg, int64_t imm); + +InstructionX86 sub_gpr64_imm(Register reg, int64_t imm); + +InstructionX86 add_gpr64_gpr64(Register dst, Register src); + +InstructionX86 sub_gpr64_gpr64(Register dst, Register src); + +/*! + * Multiply gprs (32-bit, signed). + * (Note - probably worth doing imul on gpr64's to implement the EE's unsigned multiply) + */ +InstructionX86 imul_gpr32_gpr32(Register dst, Register src); + +/*! + * Multiply gprs (64-bit, signed). + * DANGER - this treats all operands as 64-bit. This is not like the EE. + */ +InstructionX86 imul_gpr64_gpr64(Register dst, Register src); + +/*! 
+ * Divide (idiv, 32 bit) + */ +InstructionX86 idiv_gpr32(Register reg); + +InstructionX86 unsigned_div_gpr32(Register reg); + +/*! + * Convert doubleword to quadword for division. + */ +InstructionX86 cdq(); + +/*! + * Move from gpr32 to gpr64, with sign extension. + * Needed for multiplication/division madness. + */ +InstructionX86 movsx_r64_r32(Register dst, Register src); + +/*! + * Compare gpr64. This sets the flags for the jumps. + * todo UNTESTED + */ +InstructionX86 cmp_gpr64_gpr64(Register a, Register b); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// BIT STUFF +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Or of two gprs + */ +InstructionX86 or_gpr64_gpr64(Register dst, Register src); + +/*! + * And of two gprs + */ +InstructionX86 and_gpr64_gpr64(Register dst, Register src); + +/*! + * Xor of two gprs + */ +InstructionX86 xor_gpr64_gpr64(Register dst, Register src); + +/*! + * Bitwise not a gpr + */ +InstructionX86 not_gpr64(Register reg); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// SHIFTS +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Shift 64-bit gpr left by CL register + */ +InstructionX86 shl_gpr64_cl(Register reg); + +/*! + * Shift 64-bit gpr right (logical) by CL register + */ +InstructionX86 shr_gpr64_cl(Register reg); + +/*! + * Shift 64-bit gpr right (arithmetic) by CL register + */ +InstructionX86 sar_gpr64_cl(Register reg); + +/*! + * Shift 64-bit gpr left (logical) by the constant shift amount "sa". + */ +InstructionX86 shl_gpr64_u8(Register reg, uint8_t sa); + +/*! + * Shift 64-bit gpr right (logical) by the constant shift amount "sa". + */ +InstructionX86 shr_gpr64_u8(Register reg, uint8_t sa); + +/*! + * Shift 64-bit gpr right (arithmetic) by the constant shift amount "sa". + */ +InstructionX86 sar_gpr64_u8(Register reg, uint8_t sa); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// CONTROL FLOW +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Jump, 32-bit constant offset. The offset is by default 0 and must be patched later. 
+ */ +InstructionX86 jmp_32(); + +/*! + * Jump if equal. + */ +InstructionX86 je_32(); + +/*! + * Jump not equal. + */ +InstructionX86 jne_32(); + +/*! + * Jump less than or equal. + */ +InstructionX86 jle_32(); + +/*! + * Jump greater than or equal. + */ +InstructionX86 jge_32(); + +/*! + * Jump less than + */ +InstructionX86 jl_32(); + +/*! + * Jump greater than + */ +InstructionX86 jg_32(); + +/*! + * Jump below or equal + */ +InstructionX86 jbe_32(); + +/*! + * Jump above or equal + */ +InstructionX86 jae_32(); + +/*! + * Jump below + */ +InstructionX86 jb_32(); + +/*! + * Jump above + */ +InstructionX86 ja_32(); + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// FLOAT MATH +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * Compare two floats and set flag register for jump (ucomiss) + */ +InstructionX86 cmp_flt_flt(Register a, Register b); + +InstructionX86 sqrts_xmm(Register dst, Register src); + +/*! + * Multiply two floats in xmm's + */ +InstructionX86 mulss_xmm_xmm(Register dst, Register src); + +/*! + * Divide two floats in xmm's + */ +InstructionX86 divss_xmm_xmm(Register dst, Register src); + +/*! + * Subtract two floats in xmm's + */ +InstructionX86 subss_xmm_xmm(Register dst, Register src); + +/*! + * Add two floats in xmm's + */ +InstructionX86 addss_xmm_xmm(Register dst, Register src); + +/*! + * Floating point minimum. + */ +InstructionX86 minss_xmm_xmm(Register dst, Register src); + +/*! + * Floating point maximum. + */ +InstructionX86 maxss_xmm_xmm(Register dst, Register src); + +/*! + * Convert GPR int32 to XMM float (single precision) + */ +InstructionX86 int32_to_float(Register dst, Register src); + +/*! + * Convert XMM float to GPR int32(single precision) (truncate) + */ +InstructionX86 float_to_int32(Register dst, Register src); + +InstructionX86 nop(); + +// TODO - rsqrt / abs / sqrt + +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +// UTILITIES +//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +/*! + * A "null" instruction. 
This instruction does not generate any bytes + * but can be referred to by a label. Useful to insert in place of a real instruction + * if the real instruction has been optimized out. + */ +InstructionX86 null(); + +///////////////////////////// +// AVX (VF - Vector Float) // +///////////////////////////// + +InstructionX86 nop_vf(); + +InstructionX86 wait_vf(); + +InstructionX86 mov_vf_vf(Register dst, Register src); + +InstructionX86 loadvf_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2); + +InstructionX86 loadvf_gpr64_plus_gpr64_plus_s8(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 loadvf_gpr64_plus_gpr64_plus_s32(Register dst, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 storevf_gpr64_plus_gpr64(Register value, Register addr1, Register addr2); + +InstructionX86 storevf_gpr64_plus_gpr64_plus_s8(Register value, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 storevf_gpr64_plus_gpr64_plus_s32(Register value, + Register addr1, + Register addr2, + s64 offset); + +InstructionX86 loadvf_rip_plus_s32(Register dest, s64 offset); + +// TODO - rip relative loads and stores. + +InstructionX86 blend_vf(Register dst, Register src1, Register src2, u8 mask); + +InstructionX86 shuffle_vf(Register dst, Register src, u8 dx, u8 dy, u8 dz, u8 dw); + +/* + Generic Swizzle (rearrangement of packed FPs) operation, the control bytes are quite involved. + Here's a brief run-down: + - 8-bits / 4 groups of 2 bits + - Right-to-left, each group is used to determine which element in `src` gets copied into + `dst`'s element (W->X). 
+ - GROUP OPTIONS + - 00b - Copy the least-significant element (X) + - 01b - Copy the second element (from the right) (Y) + - 10b - Copy the third element (from the right) (Z) + - 11b - Copy the most significant element (W) + Examples + ; xmm1 = (1.5, 2.5, 3.5, 4.5) (W,Z,Y,X in x86 land) + SHUFPS xmm1, xmm1, 0xff ; Copy the most significant element to all positions + > (1.5, 1.5, 1.5, 1.5) + SHUFPS xmm1, xmm1, 0x39 ; Rotate right + > (4.5, 1.5, 2.5, 3.5) + */ +InstructionX86 swizzle_vf(Register dst, Register src, u8 controlBytes); + +/* + Splats a single element in 'src' to all elements in 'dst' + For example (pseudocode): + xmm1 = (1.5, 2.5, 3.5, 4.5) + xmm2 = (1, 2, 3, 4) + splat_vf(xmm1, xmm2, XMM_ELEMENT::X); + xmm1 = (4, 4, 4, 4) + */ +InstructionX86 splat_vf(Register dst, Register src, Register::VF_ELEMENT element); + +InstructionX86 xor_vf(Register dst, Register src1, Register src2); + +InstructionX86 sub_vf(Register dst, Register src1, Register src2); + +InstructionX86 add_vf(Register dst, Register src1, Register src2); + +InstructionX86 mul_vf(Register dst, Register src1, Register src2); + +InstructionX86 max_vf(Register dst, Register src1, Register src2); + +InstructionX86 min_vf(Register dst, Register src1, Register src2); + +InstructionX86 div_vf(Register dst, Register src1, Register src2); + +InstructionX86 sqrt_vf(Register dst, Register src); + +InstructionX86 itof_vf(Register dst, Register src); + +InstructionX86 ftoi_vf(Register dst, Register src); + +InstructionX86 pw_sra(Register dst, Register src, u8 imm); + +InstructionX86 pw_srl(Register dst, Register src, u8 imm); + +InstructionX86 ph_srl(Register dst, Register src, u8 imm); + +InstructionX86 pw_sll(Register dst, Register src, u8 imm); + +InstructionX86 ph_sll(Register dst, Register src, u8 imm); + +InstructionX86 parallel_add_byte(Register dst, Register src0, Register src1); + +InstructionX86 parallel_bitwise_or(Register dst, Register src0, Register src1); + +InstructionX86 
parallel_bitwise_xor(Register dst, Register src0, Register src1); + +InstructionX86 parallel_bitwise_and(Register dst, Register src0, Register src1); + +// Reminder - a word in MIPS = 32 bits = a DWORD in x86 +// MIPS || x86 +// ----------------------- +// byte || byte +// halfword || word +// word || dword +// doubleword || quadword + +// -- Unpack High Data Instructions +InstructionX86 pextub_swapped(Register dst, Register src0, Register src1); + +InstructionX86 pextuh_swapped(Register dst, Register src0, Register src1); + +InstructionX86 pextuw_swapped(Register dst, Register src0, Register src1); + +// -- Unpack Low Data Instructions +InstructionX86 pextlb_swapped(Register dst, Register src0, Register src1); + +InstructionX86 pextlh_swapped(Register dst, Register src0, Register src1); + +InstructionX86 pextlw_swapped(Register dst, Register src0, Register src1); + +// Equal-to comparison as 16 bytes (8 bits) +InstructionX86 parallel_compare_e_b(Register dst, Register src0, Register src1); + +// Equal-to comparison as 8 halfwords (16 bits) +InstructionX86 parallel_compare_e_h(Register dst, Register src0, Register src1); + +// Equal-to comparison as 4 words (32 bits) +InstructionX86 parallel_compare_e_w(Register dst, Register src0, Register src1); + +// Greater than comparison as 16 bytes (8 bits) +InstructionX86 parallel_compare_gt_b(Register dst, Register src0, Register src1); + +// Greater than comparison as 8 halfwords (16 bits) +InstructionX86 parallel_compare_gt_h(Register dst, Register src0, Register src1); + +// Greater than comparison as 4 words (32 bits) +InstructionX86 parallel_compare_gt_w(Register dst, Register src0, Register src1); + +InstructionX86 vpunpcklqdq(Register dst, Register src0, Register src1); + +InstructionX86 pcpyld_swapped(Register dst, Register src0, Register src1); + +InstructionX86 pcpyud(Register dst, Register src0, Register src1); + +InstructionX86 vpsubd(Register dst, Register src0, Register src1); + +InstructionX86 
vpsrldq(Register dst, Register src, u8 imm); + +InstructionX86 vpslldq(Register dst, Register src, u8 imm); + +InstructionX86 vpshuflw(Register dst, Register src, u8 imm); + +InstructionX86 vpshufhw(Register dst, Register src, u8 imm); + +InstructionX86 vpackuswb(Register dst, Register src0, Register src1); +} // namespace X86 +} // namespace IGen +} // namespace emitter \ No newline at end of file diff --git a/goalc/emitter/Instruction.h b/goalc/emitter/Instruction.h index b2bd0357ab..8bda3f995b 100644 --- a/goalc/emitter/Instruction.h +++ b/goalc/emitter/Instruction.h @@ -1,12 +1,111 @@ #pragma once -#ifndef JAK_INSTRUCTION_H -#define JAK_INSTRUCTION_H +#include +#include #include "common/common_types.h" #include "common/util/Assert.h" namespace emitter { + +/*! + * A high-level description of an opcode. It can emit itself. + */ +template +struct InstructionImpl { + /*! + * Emit into a buffer and return how many bytes written (can be zero) + */ + u8 emit(u8* buffer) const { return static_cast(this)->emit(buffer); } + + // TODO - the below might only be relevant for X86, in which case + // they can eventually leave this parent type + // and at that point, things can likely be simplified + // + // For now, just trying to make things compile / work + u8 length() const { return static_cast(this)->length(); } + + int get_imm_size() const { return static_cast(this)->get_imm_size(); } + + int get_disp_size() const { return static_cast(this)->get_disp_size(); } + + int offset_of_imm() const { return static_cast(this)->offset_of_imm(); } + + int offset_of_disp() const { return static_cast(this)->offset_of_disp(); } +}; + +namespace ARM64 { +struct Field { + u32 bits; + constexpr explicit Field(u32 v) : bits(v) {} +}; + +constexpr u32 Base(u32 value, u32 width) { + return value << (32 - width); +} + +constexpr Field Rd(u32 x) { + return Field{(x & 31) << 0}; +} + +constexpr Field Rt(u32 x) { + return Field{(x & 31) << 0}; +} + +constexpr Field Rn(u32 x) { + return Field{(x 
& 31) << 5}; +} + +constexpr Field Rm(u32 x) { + return Field{(x & 31) << 16}; +} + +constexpr Field Imm6(u32 x) { + return Field{(x & 0b111111) << 10}; +} + +constexpr Field Imm9(s32 x) { + return Field{(static_cast(x) & 0b111111111) << 12}; +} + +constexpr Field Imm12(u32 x) { + ASSERT(x >= 0 && x <= 4095); + return Field{(static_cast(x) & 0b111111111111) << 10}; +} +} // namespace ARM64 + +struct InstructionARM64 : InstructionImpl { + // The ARM instruction stream is a sequence of word-aligned words. Each ARM instruction is a + // single 32-bit word in that stream. + // Info: + // - https://yurichev.com/mirrors/ARMv8-A_Architecture_Reference_Manual_(Issue_A.a).pdf + // - https://www.scs.stanford.edu/~zyedidia/arm64/ + // - https://armconverter.com/?lock=arm64&code=STR+X0,+[SP,+%23-8]! + u32 encoding; + + InstructionARM64() = delete; + template + constexpr InstructionARM64(uint32_t base, Fs... fields) : encoding((base | ... | fields.bits)) { + static_assert((std::is_same_v && ...), + "All operands must be Field types"); + } + + uint8_t emit(uint8_t* buffer) const { + memcpy(buffer, &encoding, 4); + return 4; + } + + uint8_t length() const { return 4; } + + int get_imm_size() const { return 0; } + + int offset_of_imm() const { return 0; } + + int offset_of_disp() const { return 0; } + + int get_disp_size() const { return 0; } +}; + /*! * The ModRM byte */ @@ -133,13 +232,7 @@ struct VEX2 { : R(r), reg_id(_reg_id), prefix(_prefix), L(l) {} }; -/*! - * A high-level description of an x86-64 opcode. It can emit itself. 
- */ -struct Instruction { - Instruction(uint8_t opcode) : op(opcode) {} - uint8_t op; - +struct InstructionX86 : InstructionImpl { enum Flags { kOp2Set = (1 << 0), kOp3Set = (1 << 1), @@ -151,23 +244,27 @@ struct Instruction { kSetImm = (1 << 7), }; + InstructionX86(u8 opcode) : op(opcode) {} + + u8 op; + u8 m_flags = 0; - uint8_t op2; + u8 op2; - uint8_t op3; + u8 op3; u8 n_vex = 0; - uint8_t vex[3] = {0, 0, 0}; + u8 vex[3] = {0, 0, 0}; // the rex byte - uint8_t m_rex = 0; + u8 m_rex = 0; // the modrm byte - uint8_t m_modrm = 0; + u8 m_modrm = 0; // the sib byte - uint8_t m_sib = 0; + u8 m_sib = 0; // the displacement Imm disp; @@ -924,9 +1021,6 @@ struct Instruction { return offset; } - /*! - * Emit into a buffer and return how many bytes written (can be zero) - */ uint8_t emit(uint8_t* buffer) const { if (m_flags & kIsNull) return 0; @@ -1015,6 +1109,41 @@ struct Instruction { return count; } }; -} // namespace emitter -#endif // JAK_INSTRUCTION_H +class Instruction { + public: + using Variant = std::variant; + + Variant instr; + + Instruction() = delete; + + template + Instruction(T v) : instr(std::move(v)) {} + + u8 emit(u8* buffer) const { + return std::visit([&](auto const& i) { return i.emit(buffer); }, instr); + } + + u8 length() const { + return std::visit([](auto const& i) { return i.length(); }, instr); + } + + int get_imm_size() const { + return std::visit([](auto const& i) { return i.get_imm_size(); }, instr); + } + + int get_disp_size() const { + return std::visit([](auto const& i) { return i.get_disp_size(); }, instr); + } + + int offset_of_imm() const { + return std::visit([](auto const& i) { return i.offset_of_imm(); }, instr); + } + + int offset_of_disp() const { + return std::visit([](auto const& i) { return i.offset_of_disp(); }, instr); + } +}; + +} // namespace emitter diff --git a/goalc/emitter/InstructionSet.h b/goalc/emitter/InstructionSet.h new file mode 100644 index 0000000000..c408b530fc --- /dev/null +++ 
b/goalc/emitter/InstructionSet.h @@ -0,0 +1,5 @@ +#pragma once + +namespace emitter { +enum class InstructionSet { X86, ARM64 }; +}; \ No newline at end of file diff --git a/goalc/emitter/ObjectGenerator.cpp b/goalc/emitter/ObjectGenerator.cpp index a2e11a0df5..b77f37f2ea 100644 --- a/goalc/emitter/ObjectGenerator.cpp +++ b/goalc/emitter/ObjectGenerator.cpp @@ -1,6 +1,6 @@ /*! * @file ObjectGenerator.cpp - * Tool to build GOAL object files. Will eventually support v3 and v4. + * Tool to build GOAL object files. * * There are 5 steps: * 1. The user adds static data / instructions and specifies links. @@ -21,11 +21,13 @@ #include "goalc/debugger/DebugInfo.h" -#include "fmt/format.h" - namespace emitter { -ObjectGenerator::ObjectGenerator(GameVersion version) : m_version(version) {} +ObjectGenerator::ObjectGenerator(GameVersion version) + : m_version(version), m_instruction_set(InstructionSet::X86) {} + +ObjectGenerator::ObjectGenerator(GameVersion version, InstructionSet instr_set) + : m_version(version), m_instruction_set(instr_set) {} /*! * Build an object file with the v3 format. 
diff --git a/goalc/emitter/ObjectGenerator.h b/goalc/emitter/ObjectGenerator.h index 8cfaf2c607..7638c9e775 100644 --- a/goalc/emitter/ObjectGenerator.h +++ b/goalc/emitter/ObjectGenerator.h @@ -15,6 +15,7 @@ #include "common/versions/versions.h" #include "goalc/debugger/DebugInfo.h" +#include "goalc/emitter/InstructionSet.h" struct FunctionDebugInfo; class TypeSystem; @@ -64,6 +65,7 @@ struct ObjectGeneratorStats { class ObjectGenerator { public: ObjectGenerator(GameVersion version); + ObjectGenerator(GameVersion version, InstructionSet instr_set); ObjectFileData generate_data_v3(const TypeSystem* ts); FunctionRecord add_function_to_seg(int seg, FunctionDebugInfo* debug, @@ -99,6 +101,8 @@ class ObjectGenerator { GameVersion version() const { return m_version; } + InstructionSet instr_set() const { return m_instruction_set; } + private: void handle_temp_static_type_links(int seg); void handle_temp_jump_links(int seg); @@ -209,6 +213,7 @@ class ObjectGenerator { template using seg_map = std::array>, N_SEG>; GameVersion m_version; + InstructionSet m_instruction_set; // final data seg_vector m_data_by_seg; diff --git a/goalc/emitter/Register.h b/goalc/emitter/Register.h index 44ff8df332..9dfe5b1be5 100644 --- a/goalc/emitter/Register.h +++ b/goalc/emitter/Register.h @@ -13,6 +13,8 @@ #include "common/goal_constants.h" #include "common/util/Assert.h" +#include "goalc/emitter/InstructionSet.h" + namespace emitter { enum class HWRegKind : u8 { GPR, XMM, INVALID }; @@ -60,9 +62,6 @@ enum X86_REG : s8 { XMM15, // saved }; -// TODO - i think it'll be better to make some sort of abstraction -// mapping between x86 and arm, but just using this enum as a place to prototype -// the registers to use. 
enum ARM64_REG : s8 { X0, // arg 0, caller-saved RDI X1, // arg 1, caller-saved RSI @@ -104,7 +103,7 @@ enum ARM64_REG : s8 { // quadword registers, equivalent to XMMs // the convention in arm64 is the callee preserves all Q values // at the same time though, the caller should not depend on this convention! - Q0, + Q0 = 0, Q1, Q2, Q3, @@ -119,23 +118,7 @@ enum ARM64_REG : s8 { Q12, Q13, Q14, - Q15, - Q16, - Q17, - Q18, - Q19, - Q20, - Q21, - Q22, - Q23, - Q24, - Q25, - Q26, - Q27, - Q28, - Q29, - Q30, - Q31 + Q15 }; class Register { @@ -145,14 +128,44 @@ class Register { // intentionally not explicit so we can use X86_REGs in place of Registers Register(int id) : m_id(id) {} - bool is_xmm() const { return m_id >= XMM0 && m_id <= XMM15; } + bool is_128bit_simd(emitter::InstructionSet instr_set) const { + if (instr_set == emitter::InstructionSet::X86) { + return m_id >= XMM0 && m_id <= XMM15; + } else if (instr_set == emitter::InstructionSet::ARM64) { + return m_id >= Q0 && m_id <= Q15; + } else { + ASSERT_MSG(false, "is_128bit_simd: instruction set not supported"); + } + } - bool is_gpr() const { return m_id >= RAX && m_id <= R15; } + bool is_xmm(emitter::InstructionSet instr_set) const { + if (instr_set == emitter::InstructionSet::X86) { + return m_id >= XMM0 && m_id <= XMM15; + } else if (instr_set == emitter::InstructionSet::ARM64) { + return false; + } else { + ASSERT_MSG(false, "is_xmm: instruction set not supported"); + } + } - int hw_id() const { - if (is_xmm()) { + bool is_gpr(emitter::InstructionSet instr_set) const { + if (instr_set == emitter::InstructionSet::X86) { + return m_id >= RAX && m_id <= R15; + } else if (instr_set == emitter::InstructionSet::ARM64) { + return (m_id >= X0 && m_id <= X30) || m_id == SP; + } else { + ASSERT_MSG(false, "is_gpr: instruction set not supported"); + } + } + + int hw_id(emitter::InstructionSet instr_set) const { + // ARM64 does not require the concept of a hw_id + if (instr_set != emitter::InstructionSet::X86) { + 
ASSERT_MSG(false, "hw_id is only applicable for x86"); + } + if (is_xmm(instr_set)) { return m_id - XMM0; - } else if (is_gpr()) { + } else if (is_gpr(instr_set)) { return m_id - RAX; } else { ASSERT(false); diff --git a/goalc/main.cpp b/goalc/main.cpp index c7e5952551..0c50d802a7 100644 --- a/goalc/main.cpp +++ b/goalc/main.cpp @@ -103,7 +103,7 @@ int main(int argc, char** argv) { // if a command is provided on the command line, no REPL just run the compiler on it try { if (!cmd.empty()) { - compiler = std::make_unique(game_version); + compiler = std::make_unique(game_version, emitter::InstructionSet::X86); compiler->run_front_end_on_string(cmd); return 0; } @@ -130,7 +130,7 @@ int main(int argc, char** argv) { // the compiler may throw an exception if it fails to load its standard library. try { compiler = std::make_unique( - game_version, std::make_optional(repl_config), username, + game_version, emitter::InstructionSet::X86, std::make_optional(repl_config), username, std::make_unique(username, repl_config, startup_file, nrepl_server_ok)); // Start nREPL Server if it spun up successfully if (nrepl_server_ok) { @@ -158,7 +158,7 @@ int main(int argc, char** argv) { compiler->save_repl_history(); } compiler = std::make_unique( - game_version, std::make_optional(repl_config), username, + game_version, emitter::InstructionSet::X86, std::make_optional(repl_config), username, std::make_unique(username, repl_config, startup_file, nrepl_server_ok)); status = ReplStatus::OK; } diff --git a/goalc/simple_main.cpp b/goalc/simple_main.cpp index fe14bbb1de..f5b031f1ab 100644 --- a/goalc/simple_main.cpp +++ b/goalc/simple_main.cpp @@ -3,6 +3,7 @@ #include "common/versions/versions.h" #include "goalc/compiler/Compiler.h" +#include "goalc/emitter/InstructionSet.h" int main(int argc, char** argv) { // logging @@ -27,16 +28,17 @@ int main(int argc, char** argv) { std::unique_ptr compiler; ReplStatus status = ReplStatus::OK; try { - compiler = std::make_unique(game_version, 
std::nullopt, "", - std::make_unique(game_version)); + compiler = std::make_unique(game_version, emitter::InstructionSet::X86, std::nullopt, + "", std::make_unique(game_version)); while (status != ReplStatus::WANT_EXIT) { if (status == ReplStatus::WANT_RELOAD) { lg::info("Reloading compiler..."); if (compiler) { compiler->save_repl_history(); } - compiler = std::make_unique(game_version, std::nullopt, "", - std::make_unique(game_version)); + compiler = + std::make_unique(game_version, emitter::InstructionSet::X86, std::nullopt, "", + std::make_unique(game_version)); status = ReplStatus::OK; } std::string input_from_stdin = compiler->get_repl_input(); diff --git a/lsp/state/workspace.cpp b/lsp/state/workspace.cpp index f957d0a11f..9781ede4d3 100644 --- a/lsp/state/workspace.cpp +++ b/lsp/state/workspace.cpp @@ -298,8 +298,9 @@ void Workspace::start_tracking_file(const LSPSpec::DocumentUri& file_uri, const std::string progress_title = fmt::format("Compiling {}", version_to_game_name_external(game_version.value())); m_requester.send_progress_create_request(progress_title, "compiling project", -1); - m_compiler_instances.emplace(game_version.value(), - std::make_unique(game_version.value())); + m_compiler_instances.emplace( + game_version.value(), + std::make_unique(game_version.value(), emitter::InstructionSet::X86)); try { // TODO - this should happen on a separate thread so the LSP is not blocking during this // lengthy step diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1eaf63f62c..484e79fd4a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -9,7 +9,6 @@ add_executable(goalc-test ${CMAKE_CURRENT_LIST_DIR}/test_kernel_jak1.cpp ${CMAKE_CURRENT_LIST_DIR}/all_jak1_symbols.cpp ${CMAKE_CURRENT_LIST_DIR}/test_type_system.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_CodeTester.cpp ${CMAKE_CURRENT_LIST_DIR}/test_emitter.cpp ${CMAKE_CURRENT_LIST_DIR}/test_emitter_avx.cpp ${CMAKE_CURRENT_LIST_DIR}/test_common_util.cpp @@ -17,7 +16,6 @@ 
add_executable(goalc-test ${CMAKE_CURRENT_LIST_DIR}/test_math.cpp ${CMAKE_CURRENT_LIST_DIR}/test_zstd.cpp ${CMAKE_CURRENT_LIST_DIR}/test_zydis.cpp - ${CMAKE_CURRENT_LIST_DIR}/goalc/test_goal_kernel.cpp ${CMAKE_CURRENT_LIST_DIR}/decompiler/FormRegressionTest.cpp ${CMAKE_CURRENT_LIST_DIR}/decompiler/test_AtomicOpBuilder.cpp ${CMAKE_CURRENT_LIST_DIR}/decompiler/test_FormBeforeExpressions.cpp diff --git a/test/goalc/CMakeLists.txt b/test/goalc/CMakeLists.txt index b4b0e498a3..ef59df4a54 100644 --- a/test/goalc/CMakeLists.txt +++ b/test/goalc/CMakeLists.txt @@ -1,19 +1,43 @@ -set(GOALC_TEST_CASES - ${CMAKE_CURRENT_LIST_DIR}/test_arithmetic.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_collections.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_compiler.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_control_statements.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_debugger.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_game_no_debug.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_goal_kernel.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_goal_kernel2.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_goal_kernel3.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_jak2_compiler.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_variables.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_with_game.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_type_consistency.cpp - ${CMAKE_CURRENT_LIST_DIR}/test_vector_float.cpp +# TODO - order matters, unfortunately, the kernel tests write to the filesystem and +# other tests depend on that, solve that someday, maybe + +if(CMAKE_APPLE_SILICON_PROCESSOR STREQUAL "arm64") + set(GOALC_TEST_CASES + # ${CMAKE_CURRENT_LIST_DIR}/test_goal_kernel.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_goal_kernel2.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_goal_kernel3.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_CodeTester.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_arithmetic.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_collections.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_compiler.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_control_statements.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_debugger.cpp + # 
${CMAKE_CURRENT_LIST_DIR}/test_game_no_debug.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_jak2_compiler.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_variables.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_with_game.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_type_consistency.cpp + # ${CMAKE_CURRENT_LIST_DIR}/test_vector_float.cpp ) +else() + set(GOALC_TEST_CASES + ${CMAKE_CURRENT_LIST_DIR}/test_goal_kernel.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_goal_kernel2.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_goal_kernel3.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_CodeTester.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_arithmetic.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_collections.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_compiler.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_control_statements.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_debugger.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_game_no_debug.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_jak2_compiler.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_variables.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_with_game.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_type_consistency.cpp + ${CMAKE_CURRENT_LIST_DIR}/test_vector_float.cpp + ) +endif() set(GOALC_TEST_FRAMEWORK_SOURCES ${CMAKE_CURRENT_LIST_DIR}/framework/test_runner.cpp diff --git a/test/goalc/test_CodeTester.cpp b/test/goalc/test_CodeTester.cpp new file mode 100644 index 0000000000..d1ffe56d63 --- /dev/null +++ b/test/goalc/test_CodeTester.cpp @@ -0,0 +1,378 @@ +/*! + * @file test_CodeTester.cpp + * Tests for the CodeTester, a tool for testing the emitter by emitting code and running it + * from within the test application. + * + * These tests should just make sure the basic functionality of CodeTester works, and that it + * can generate prologues/epilogues, and execute them without crashing. 
+ */ + +#include "goalc/emitter/CodeTester.h" +#include "goalc/emitter/IGen.h" +#include "goalc/emitter/InstructionSet.h" +#include "goalc/emitter/Register.h" +#include "gtest/gtest.h" + +using namespace emitter; + +TEST(CodeTester, prologue_x86) { + CodeTester tester; + tester.init_code_buffer(256); + tester.emit_push_all_gprs(); + // check we generate the right code for pushing all gpr's + EXPECT_EQ(tester.dump_to_hex_string(), + "50 51 52 53 54 55 56 57 41 50 41 51 41 52 41 53 41 54 41 55 41 56 41 57"); +} + +TEST(CodeTester, prologue_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(256); + // tester.emit(IGen::push_gpr64(tester.generator(), ARM64_REG::X0)); + // EXPECT_EQ(tester.dump_to_hex_string(), "e0 8f 1f f8"); + tester.emit_push_all_gprs(); + // check we generate the right code for pushing all gpr's + EXPECT_EQ(tester.dump_to_hex_string(), + "e0 0f 1f f8 e1 0f 1f f8 e2 0f 1f f8 e3 0f 1f f8 e4 0f 1f f8 e5 0f 1f f8 e6 0f 1f f8 " + "e7 0f 1f f8 e8 0f 1f f8 e9 0f 1f f8 ea 0f 1f f8 eb 0f 1f f8 ec 0f 1f f8 ed 0f 1f f8 " + "ee 0f 1f f8 ef 0f 1f f8 f0 0f 1f f8 f1 0f 1f f8 f2 0f 1f f8 f3 0f 1f f8 f4 0f 1f f8 " + "f5 0f 1f f8 f6 0f 1f f8 f7 0f 1f f8 f8 0f 1f f8 f9 0f 1f f8 fa 0f 1f f8 fb 0f 1f f8 " + "fc 0f 1f f8 fd 0f 1f f8 fe 0f 1f f8"); +} + +TEST(CodeTester, epilogue_x86) { + CodeTester tester; + tester.init_code_buffer(256); + tester.emit_pop_all_gprs(); + // check we generate the right code for popping all gpr's + EXPECT_EQ(tester.dump_to_hex_string(), + "41 5f 41 5e 41 5d 41 5c 41 5b 41 5a 41 59 41 58 5f 5e 5d 5c 5b 5a 59 58"); +} + +TEST(CodeTester, epilogue_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(256); + tester.emit_pop_all_gprs(); + // check we generate the right code for popping all gpr's + EXPECT_EQ(tester.dump_to_hex_string(), + "fe 07 41 f8 fd 07 41 f8 fc 07 41 f8 fb 07 41 f8 fa 07 41 f8 f9 07 41 f8 f8 07 41 f8 " + "f7 07 41 f8 f6 07 41 f8 f5 07 41 f8 f4 07 41 f8 f3 
07 41 f8 f2 07 41 f8 f1 07 41 f8 " + "f0 07 41 f8 ef 07 41 f8 ee 07 41 f8 ed 07 41 f8 ec 07 41 f8 eb 07 41 f8 ea 07 41 f8 " + "e9 07 41 f8 e8 07 41 f8 e7 07 41 f8 e6 07 41 f8 e5 07 41 f8 e4 07 41 f8 e3 07 41 f8 " + "e2 07 41 f8 e1 07 41 f8 e0 07 41 f8"); +} + +TEST(CodeTester, sub_gpr64_imm8_x86) { + CodeTester tester; + tester.init_code_buffer(256); + for (int i = 0; i < 16; i++) { + tester.emit(IGen::sub_gpr64_imm8s(tester.generator(), i, -1)); + } + EXPECT_EQ(tester.dump_to_hex_string(true), + "4883E8FF4883E9FF4883EAFF4883EBFF4883ECFF4883EDFF4883EEFF4883EFFF4983E8FF4983E9FF4983EA" + "FF4983EBFF4983ECFF4983EDFF4983EEFF4983EFFF"); +} + +TEST(CodeTester, sub_gpr64_imm8_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(256); + for (int i = 0; i < 31; i++) { + tester.emit(IGen::sub_gpr64_imm8s(tester.generator(), i, -1)); + } + EXPECT_EQ(tester.dump_to_hex_string(true), + "0004009121040091420400916304009184040091A5040091C6040091E704009108050091290500914A0500" + "916B0500918C050091AD050091CE050091EF0500911006009131060091520600917306009194060091B506" + "0091D6060091F706009118070091390700915A0700917B0700919C070091BD070091DE070091"); +} + +TEST(CodeTester, add_gpr64_imm8_x86) { + CodeTester tester; + tester.init_code_buffer(256); + for (int i = 0; i < 16; i++) { + tester.emit(IGen::add_gpr64_imm8s(tester.generator(), i, -1)); + } + EXPECT_EQ(tester.dump_to_hex_string(true), + "4883C0FF4883C1FF4883C2FF4883C3FF4883C4FF4883C5FF4883C6FF4883C7FF4983C0FF4983C1FF4983C2" + "FF4983C3FF4983C4FF4983C5FF4983C6FF4983C7FF"); +} + +TEST(CodeTester, add_gpr64_imm8_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(256); + for (int i = 0; i < 31; i++) { + tester.emit(IGen::add_gpr64_imm8s(tester.generator(), i, -1)); + } + EXPECT_EQ(tester.dump_to_hex_string(true), + "000400D1210400D1420400D1630400D1840400D1A50400D1C60400D1E70400D1080500D1290500D14A0500" + 
"D16B0500D18C0500D1AD0500D1CE0500D1EF0500D1100600D1310600D1520600D1730600D1940600D1B506" + "00D1D60600D1F70600D1180700D1390700D15A0700D17B0700D19C0700D1BD0700D1DE0700D1"); +} + +TEST(CodeTester, simd_store_128_x86) { + CodeTester tester; + tester.init_code_buffer(256); + // movdqa [rbx], xmm3 + // movdqa [r14], xmm3 + // movdqa [rbx], xmm14 + // movdqa [r14], xmm13 + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), RBX, XMM3)); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), R14, XMM3)); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), RBX, XMM14)); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), R14, XMM13)); + EXPECT_EQ(tester.dump_to_hex_string(), + "66 0f 7f 1b 66 41 0f 7f 1e 66 44 0f 7f 33 66 45 0f 7f 2e"); + + tester.clear(); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), RSP, XMM1)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 0f 7f 0c 24"); // requires SIB byte. + + tester.clear(); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), R12, XMM13)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 45 0f 7f 2c 24"); // requires SIB byte and REX byte + + tester.clear(); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), RBP, XMM1)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 0f 7f 4d 00"); + + tester.clear(); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), RBP, XMM11)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 44 0f 7f 5d 00"); + + tester.clear(); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), R13, XMM2)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 41 0f 7f 55 00"); + + tester.clear(); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), R13, XMM12)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 45 0f 7f 65 00"); +} + +TEST(CodeTester, simd_store_128_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(256); + + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), X2, Q3)); + 
tester.emit(IGen::store128_gpr64_simd128(tester.generator(), X14, Q3)); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), X2, Q14)); + tester.emit(IGen::store128_gpr64_simd128(tester.generator(), X14, Q13)); + EXPECT_EQ(tester.dump_to_hex_string(), "43 00 80 3d c3 01 80 3d 4e 00 80 3d cd 01 80 3d"); +} + +TEST(CodeTester, xmm_load_128_x86) { + CodeTester tester; + tester.init_code_buffer(256); + + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM3, RBX)); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM3, R14)); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM14, RBX)); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM13, R14)); + EXPECT_EQ(tester.dump_to_hex_string(), + "66 0f 6f 1b 66 41 0f 6f 1e 66 44 0f 6f 33 66 45 0f 6f 2e"); + + tester.clear(); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM1, RSP)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 0f 6f 0c 24"); // requires SIB byte. + + tester.clear(); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM13, R12)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 45 0f 6f 2c 24"); // requires SIB byte and REX byte + + tester.clear(); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM1, RBP)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 0f 6f 4d 00"); + + tester.clear(); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM11, RBP)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 44 0f 6f 5d 00"); + + tester.clear(); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM2, R13)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 41 0f 6f 55 00"); + + tester.clear(); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), XMM12, R13)); + EXPECT_EQ(tester.dump_to_hex_string(), "66 45 0f 6f 65 00"); +} + +TEST(CodeTester, xmm_load_128_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(256); + + 
tester.emit(IGen::load128_simd128_gpr64(tester.generator(), Q3, X1)); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), Q3, X14)); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), Q14, X1)); + tester.emit(IGen::load128_simd128_gpr64(tester.generator(), Q13, X14)); + EXPECT_EQ(tester.dump_to_hex_string(), "23 00 c0 3d c3 01 c0 3d 2e 00 c0 3d cd 01 c0 3d"); +} + +// These tests actually execute the code; you cannot execute arm64 code on x86 and vice versa, +// so these tests have to be conditional based on the platform unfortunately. +TEST(CodeTester, execute_push_pop_simd_x86) { + CodeTester tester; + tester.init_code_buffer(512); + tester.emit_push_all_simd(); + tester.emit_pop_all_simd(); + tester.emit_return(); + EXPECT_EQ( + tester.dump_to_hex_string(), + "48 83 ec 08 48 83 ec 10 66 0f 7f 04 24 48 83 ec 10 66 0f 7f 0c 24 48 83 ec 10 66 0f 7f 14 " + "24 48 83 ec 10 66 0f 7f 1c 24 48 83 ec 10 66 0f 7f 24 24 48 83 ec 10 66 0f 7f 2c 24 48 83 " + "ec 10 66 0f 7f 34 24 48 83 ec 10 66 0f 7f 3c 24 48 83 ec 10 66 44 0f 7f 04 24 48 83 ec 10 " + "66 44 0f 7f 0c 24 48 83 ec 10 66 44 0f 7f 14 24 48 83 ec 10 66 44 0f 7f 1c 24 48 83 ec 10 " + "66 44 0f 7f 24 24 48 83 ec 10 66 44 0f 7f 2c 24 48 83 ec 10 66 44 0f 7f 34 24 48 83 ec 10 " + "66 44 0f 7f 3c 24 66 0f 6f 04 24 48 83 c4 10 66 0f 6f 0c 24 48 83 c4 10 66 0f 6f 14 24 48 " + "83 c4 10 66 0f 6f 1c 24 48 83 c4 10 66 0f 6f 24 24 48 83 c4 10 66 0f 6f 2c 24 48 83 c4 10 " + "66 0f 6f 34 24 48 83 c4 10 66 0f 6f 3c 24 48 83 c4 10 66 44 0f 6f 04 24 48 83 c4 10 66 44 " + "0f 6f 0c 24 48 83 c4 10 66 44 0f 6f 14 24 48 83 c4 10 66 44 0f 6f 1c 24 48 83 c4 10 66 44 " + "0f 6f 24 24 48 83 c4 10 66 44 0f 6f 2c 24 48 83 c4 10 66 44 0f 6f 34 24 48 83 c4 10 66 44 " + "0f 6f 3c 24 48 83 c4 10 48 83 c4 08 c3"); +#ifndef __aarch64__ + tester.execute(); +#endif +} + +TEST(CodeTester, execute_push_pop_simd_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(512); + 
tester.emit_push_all_simd(); + tester.emit_pop_all_simd(); + tester.emit_return(); + EXPECT_EQ( + tester.dump_to_hex_string(), + "ff 43 00 d1 e0 03 80 3d ff 43 00 d1 e1 03 80 3d ff 43 00 d1 e2 03 80 3d ff 43 00 d1 e3 03 " + "80 3d ff 43 00 d1 e4 03 80 3d ff 43 00 d1 e5 03 80 3d ff 43 00 d1 e6 03 80 3d ff 43 00 d1 " + "e7 03 80 3d ff 43 00 d1 e8 03 80 3d ff 43 00 d1 e9 03 80 3d ff 43 00 d1 ea 03 80 3d ff 43 " + "00 d1 eb 03 80 3d ff 43 00 d1 ec 03 80 3d ff 43 00 d1 ed 03 80 3d ff 43 00 d1 ee 03 80 3d " + "ff 43 00 d1 ef 03 80 3d e0 03 c0 3d ff 43 00 91 e1 03 c0 3d ff 43 00 91 e2 03 c0 3d ff 43 " + "00 91 e3 03 c0 3d ff 43 00 91 e4 03 c0 3d ff 43 00 91 e5 03 c0 3d ff 43 00 91 e6 03 c0 3d " + "ff 43 00 91 e7 03 c0 3d ff 43 00 91 e8 03 c0 3d ff 43 00 91 e9 03 c0 3d ff 43 00 91 ea 03 " + "c0 3d ff 43 00 91 eb 03 c0 3d ff 43 00 91 ec 03 c0 3d ff 43 00 91 ed 03 c0 3d ff 43 00 91 " + "ee 03 c0 3d ff 43 00 91 ef 03 c0 3d ff 43 00 91 c0 03 5f d6"); +#ifdef __aarch64__ + tester.execute(); +#endif +} + +TEST(CodeTester, execute_push_pop_all_the_things_x86) { + CodeTester tester; + tester.init_code_buffer(512); + tester.emit_push_all_simd(); + tester.emit_push_all_gprs(); + + // ... 
+ tester.emit_pop_all_gprs(); + tester.emit_pop_all_simd(); + tester.emit_return(); + EXPECT_EQ(tester.dump_to_hex_string(), + "48 83 ec 08 48 83 ec 10 66 0f 7f 04 24 48 83 ec 10 66 0f 7f 0c 24 48 83 ec 10 66 0f " + "7f 14 24 48 83 ec 10 66 0f 7f 1c 24 48 83 ec 10 66 0f 7f 24 24 48 83 ec 10 66 0f 7f " + "2c 24 48 83 ec 10 66 0f 7f 34 24 48 83 ec 10 66 0f 7f 3c 24 48 83 ec 10 66 44 0f 7f " + "04 24 48 83 ec 10 66 44 0f 7f 0c 24 48 83 ec 10 66 44 0f 7f 14 24 48 83 ec 10 66 44 " + "0f 7f 1c 24 48 83 ec 10 66 44 0f 7f 24 24 48 83 ec 10 66 44 0f 7f 2c 24 48 83 ec 10 " + "66 44 0f 7f 34 24 48 83 ec 10 66 44 0f 7f 3c 24 50 51 52 53 54 55 56 57 41 50 41 51 " + "41 52 41 53 41 54 41 55 41 56 41 57 41 5f 41 5e 41 5d 41 5c 41 5b 41 5a 41 59 41 58 " + "5f 5e 5d 5c 5b 5a 59 58 66 0f 6f 04 24 48 83 c4 10 66 0f 6f 0c 24 48 83 c4 10 66 0f " + "6f 14 24 48 83 c4 10 66 0f 6f 1c 24 48 83 c4 10 66 0f 6f 24 24 48 83 c4 10 66 0f 6f " + "2c 24 48 83 c4 10 66 0f 6f 34 24 48 83 c4 10 66 0f 6f 3c 24 48 83 c4 10 66 44 0f 6f " + "04 24 48 83 c4 10 66 44 0f 6f 0c 24 48 83 c4 10 66 44 0f 6f 14 24 48 83 c4 10 66 44 " + "0f 6f 1c 24 48 83 c4 10 66 44 0f 6f 24 24 48 83 c4 10 66 44 0f 6f 2c 24 48 83 c4 10 " + "66 44 0f 6f 34 24 48 83 c4 10 66 44 0f 6f 3c 24 48 83 c4 10 48 83 c4 08 c3"); +#ifndef __aarch64__ + tester.execute(); +#endif +} + +TEST(CodeTester, execute_push_pop_all_the_things_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(512); + tester.emit_push_all_simd(); + tester.emit_push_all_gprs(); + + // ... 
+ tester.emit_pop_all_gprs(); + tester.emit_pop_all_simd(); + tester.emit_return(); + EXPECT_EQ( + tester.dump_to_hex_string(), + "ff 43 00 d1 e0 03 80 3d ff 43 00 d1 e1 03 80 3d ff 43 00 d1 e2 03 80 3d ff 43 00 d1 e3 03 " + "80 3d ff 43 00 d1 e4 03 80 3d ff 43 00 d1 e5 03 80 3d ff 43 00 d1 e6 03 80 3d ff 43 00 d1 " + "e7 03 80 3d ff 43 00 d1 e8 03 80 3d ff 43 00 d1 e9 03 80 3d ff 43 00 d1 ea 03 80 3d ff 43 " + "00 d1 eb 03 80 3d ff 43 00 d1 ec 03 80 3d ff 43 00 d1 ed 03 80 3d ff 43 00 d1 ee 03 80 3d " + "ff 43 00 d1 ef 03 80 3d e0 0f 1f f8 e1 0f 1f f8 e2 0f 1f f8 e3 0f 1f f8 e4 0f 1f f8 e5 0f " + "1f f8 e6 0f 1f f8 e7 0f 1f f8 e8 0f 1f f8 e9 0f 1f f8 ea 0f 1f f8 eb 0f 1f f8 ec 0f 1f f8 " + "ed 0f 1f f8 ee 0f 1f f8 ef 0f 1f f8 f0 0f 1f f8 f1 0f 1f f8 f2 0f 1f f8 f3 0f 1f f8 f4 0f " + "1f f8 f5 0f 1f f8 f6 0f 1f f8 f7 0f 1f f8 f8 0f 1f f8 f9 0f 1f f8 fa 0f 1f f8 fb 0f 1f f8 " + "fc 0f 1f f8 fd 0f 1f f8 fe 0f 1f f8 fe 07 41 f8 fd 07 41 f8 fc 07 41 f8 fb 07 41 f8 fa 07 " + "41 f8 f9 07 41 f8 f8 07 41 f8 f7 07 41 f8 f6 07 41 f8 f5 07 41 f8 f4 07 41 f8 f3 07 41 f8 " + "f2 07 41 f8 f1 07 41 f8 f0 07 41 f8 ef 07 41 f8 ee 07 41 f8 ed 07 41 f8 ec 07 41 f8 eb 07 " + "41 f8 ea 07 41 f8 e9 07 41 f8 e8 07 41 f8 e7 07 41 f8 e6 07 41 f8 e5 07 41 f8 e4 07 41 f8 " + "e3 07 41 f8 e2 07 41 f8 e1 07 41 f8 e0 07 41 f8 e0 03 c0 3d ff 43 00 91 e1 03 c0 3d ff 43 " + "00 91 e2 03 c0 3d ff 43 00 91 e3 03 c0 3d ff 43 00 91 e4 03 c0 3d ff 43 00 91 e5 03 c0 3d " + "ff 43 00 91 e6 03 c0 3d ff 43 00 91 e7 03 c0 3d ff 43 00 91 e8 03 c0 3d ff 43 00 91 e9 03 " + "c0 3d ff 43 00 91 ea 03 c0 3d ff 43 00 91 eb 03 c0 3d ff 43 00 91 ec 03 c0 3d ff 43 00 91 " + "ed 03 c0 3d ff 43 00 91 ee 03 c0 3d ff 43 00 91 ef 03 c0 3d ff 43 00 91 c0 03 5f d6"); +#ifdef __aarch64__ + tester.execute(); +#endif +} + +TEST(CodeTester, execute_return_x86) { + CodeTester tester; + tester.init_code_buffer(256); + // test creating a function which simply returns + tester.emit_return(); + EXPECT_EQ(tester.dump_to_hex_string(), 
"c3"); + // and execute it! +#ifndef __aarch64__ + tester.execute(); +#endif +} + +TEST(CodeTester, execute_return_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(256); + // test creating a function which simply returns + tester.emit(IGen::add_gpr64_imm8s(tester.generator(), ARM64_REG::X0, 1)); + tester.emit(IGen::ret(tester.generator())); + EXPECT_EQ(tester.dump_to_hex_string(), "00 04 00 91 c0 03 5f d6"); + // and execute it! +#ifdef __aarch64__ + tester.execute(); +#endif +} + +TEST(CodeTester, execute_push_pop_gprs_x86) { + CodeTester tester; + tester.init_code_buffer(256); + // test we can push/pop gprs without crashing. + tester.emit_push_all_gprs(); + tester.emit_pop_all_gprs(); + tester.emit_return(); + EXPECT_EQ(tester.dump_to_hex_string(), + "50 51 52 53 54 55 56 57 41 50 41 51 41 52 41 53 41 54 41 55 41 56 41 57 41 5f 41 5e " + "41 5d 41 5c 41 5b 41 5a 41 59 41 58 5f 5e 5d 5c 5b 5a 59 58 c3"); +#ifndef __aarch64__ + tester.execute(); +#endif +} + +TEST(CodeTester, execute_push_pop_gprs_arm64) { + CodeTester tester(emitter::InstructionSet::ARM64); + tester.init_code_buffer(256); + // test we can push/pop gprs without crashing. 
+ tester.emit_push_all_gprs(); + tester.emit_pop_all_gprs(); + tester.emit_return(); + EXPECT_EQ(tester.dump_to_hex_string(), + "e0 0f 1f f8 e1 0f 1f f8 e2 0f 1f f8 e3 0f 1f f8 e4 0f 1f f8 e5 0f 1f f8 e6 0f 1f f8 " + "e7 0f 1f f8 e8 0f 1f f8 e9 0f 1f f8 ea 0f 1f f8 eb 0f 1f f8 ec 0f 1f f8 ed 0f 1f f8 " + "ee 0f 1f f8 ef 0f 1f f8 f0 0f 1f f8 f1 0f 1f f8 f2 0f 1f f8 f3 0f 1f f8 f4 0f 1f f8 " + "f5 0f 1f f8 f6 0f 1f f8 f7 0f 1f f8 f8 0f 1f f8 f9 0f 1f f8 fa 0f 1f f8 fb 0f 1f f8 " + "fc 0f 1f f8 fd 0f 1f f8 fe 0f 1f f8 fe 07 41 f8 fd 07 41 f8 fc 07 41 f8 fb 07 41 f8 " + "fa 07 41 f8 f9 07 41 f8 f8 07 41 f8 f7 07 41 f8 f6 07 41 f8 f5 07 41 f8 f4 07 41 f8 " + "f3 07 41 f8 f2 07 41 f8 f1 07 41 f8 f0 07 41 f8 ef 07 41 f8 ee 07 41 f8 ed 07 41 f8 " + "ec 07 41 f8 eb 07 41 f8 ea 07 41 f8 e9 07 41 f8 e8 07 41 f8 e7 07 41 f8 e6 07 41 f8 " + "e5 07 41 f8 e4 07 41 f8 e3 07 41 f8 e2 07 41 f8 e1 07 41 f8 e0 07 41 f8 c0 03 5f d6"); +#ifdef __aarch64__ + tester.execute(); +#endif +} \ No newline at end of file diff --git a/test/goalc/test_arithmetic.cpp b/test/goalc/test_arithmetic.cpp index 80143c833b..c736a4a28a 100644 --- a/test/goalc/test_arithmetic.cpp +++ b/test/goalc/test_arithmetic.cpp @@ -1,5 +1,3 @@ -// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md#value-parameterized-tests - #include #include #include @@ -119,7 +117,7 @@ class ArithmeticTests : public testing::TestWithParam { // Called before the first test in this test suite. 
static void SetUpTestSuite() { runtime_thread = std::make_unique(std::thread(GoalTest::runtime_no_kernel_jak1)); - compiler = std::make_unique(GameVersion::Jak1); + compiler = std::make_unique(GameVersion::Jak1, emitter::InstructionSet::X86); runner = std::make_unique(); runner->c = compiler.get(); } diff --git a/test/goalc/test_collections.cpp b/test/goalc/test_collections.cpp index dca4e3cdc9..528a15967c 100644 --- a/test/goalc/test_collections.cpp +++ b/test/goalc/test_collections.cpp @@ -14,7 +14,7 @@ class CollectionTests : public testing::TestWithParam { public: static void SetUpTestSuite() { runtime_thread = std::make_unique(std::thread(GoalTest::runtime_no_kernel_jak1)); - compiler = std::make_unique(GameVersion::Jak1); + compiler = std::make_unique(GameVersion::Jak1, emitter::InstructionSet::X86); runner = std::make_unique(); runner->c = compiler.get(); } diff --git a/test/goalc/test_compiler.cpp b/test/goalc/test_compiler.cpp index de23e731ef..2ca0223112 100644 --- a/test/goalc/test_compiler.cpp +++ b/test/goalc/test_compiler.cpp @@ -2,6 +2,6 @@ #include "gtest/gtest.h" TEST(CompilerAndRuntime, ConstructCompiler) { - Compiler compiler1(GameVersion::Jak1); - Compiler compiler2(GameVersion::Jak2); + Compiler compiler1(GameVersion::Jak1, emitter::InstructionSet::X86); + Compiler compiler2(GameVersion::Jak2, emitter::InstructionSet::X86); } diff --git a/test/goalc/test_control_statements.cpp b/test/goalc/test_control_statements.cpp index 6c44357ccc..95aa0a1677 100644 --- a/test/goalc/test_control_statements.cpp +++ b/test/goalc/test_control_statements.cpp @@ -1,8 +1,8 @@ #include #include -#include "game/runtime.h" #include "goalc/compiler/Compiler.h" +#include "goalc/emitter/InstructionSet.h" #include "gtest/gtest.h" #include "test/goalc/framework/test_runner.h" @@ -14,7 +14,7 @@ class ControlStatementTests : public testing::TestWithParam(std::thread(GoalTest::runtime_no_kernel_jak1)); - compiler = std::make_unique(GameVersion::Jak1); + compiler = 
std::make_unique(GameVersion::Jak1, emitter::InstructionSet::X86); runner = std::make_unique(); runner->c = compiler.get(); } diff --git a/test/goalc/test_debugger.cpp b/test/goalc/test_debugger.cpp index 63a96e77ce..4bcd58d29d 100644 --- a/test/goalc/test_debugger.cpp +++ b/test/goalc/test_debugger.cpp @@ -34,7 +34,7 @@ void connect_compiler_and_debugger(Compiler& compiler, bool do_break) { } } // namespace TEST(Jak1Debugger, DebuggerBasicConnect) { - Compiler compiler(GameVersion::Jak1); + Compiler compiler(GameVersion::Jak1, emitter::InstructionSet::X86); // evidently you can't ptrace threads in your own process, so we need to run the runtime in a // separate process. if (!fork()) { @@ -51,7 +51,7 @@ TEST(Jak1Debugger, DebuggerBasicConnect) { } TEST(Jak1Debugger, DebuggerBreakAndContinue) { - Compiler compiler(GameVersion::Jak1); + Compiler compiler(GameVersion::Jak1, emitter::InstructionSet::X86); // evidently you can't ptrace threads in your own process, so we need to run the runtime in a // separate process. if (!fork()) { @@ -73,7 +73,7 @@ TEST(Jak1Debugger, DebuggerBreakAndContinue) { } TEST(Jak1Debugger, DebuggerReadMemory) { - Compiler compiler(GameVersion::Jak1); + Compiler compiler(GameVersion::Jak1, emitter::InstructionSet::X86); // evidently you can't ptrace threads in your own process, so we need to run the runtime in a // separate process. if (!fork()) { @@ -97,7 +97,7 @@ TEST(Jak1Debugger, DebuggerReadMemory) { } TEST(Jak1Debugger, DebuggerWriteMemory) { - Compiler compiler(GameVersion::Jak1); + Compiler compiler(GameVersion::Jak1, emitter::InstructionSet::X86); // evidently you can't ptrace threads in your own process, so we need to run the runtime in a // separate process. 
if (!fork()) { @@ -128,7 +128,7 @@ TEST(Jak1Debugger, DebuggerWriteMemory) { } TEST(Jak1Debugger, Symbol) { - Compiler compiler(GameVersion::Jak1); + Compiler compiler(GameVersion::Jak1, emitter::InstructionSet::X86); // evidently you can't ptrace threads in your own process, so we need to run the runtime in a // separate process. if (!fork()) { @@ -160,7 +160,7 @@ TEST(Jak1Debugger, Symbol) { TEST(Jak1Debugger, SimpleBreakpoint) { try { - Compiler compiler(GameVersion::Jak1); + Compiler compiler(GameVersion::Jak1, emitter::InstructionSet::X86); if (!fork()) { GoalTest::runtime_no_kernel_jak1(); diff --git a/test/goalc/test_game_no_debug.cpp b/test/goalc/test_game_no_debug.cpp index 2a81df8c0a..cfb9ea892b 100644 --- a/test/goalc/test_game_no_debug.cpp +++ b/test/goalc/test_game_no_debug.cpp @@ -5,7 +5,7 @@ #include "test/goalc/framework/test_runner.h" TEST(Jak1NoDebugSegment, Init) { - Compiler compiler(GameVersion::Jak1); + Compiler compiler(GameVersion::Jak1, emitter::InstructionSet::X86); compiler.run_front_end_on_string("(build-kernel)"); std::thread runtime_thread = std::thread(GoalTest::runtime_with_kernel_no_debug_segment); diff --git a/test/goalc/test_goal_kernel.cpp b/test/goalc/test_goal_kernel.cpp index 2a02f8b827..e7231e2d26 100644 --- a/test/goalc/test_goal_kernel.cpp +++ b/test/goalc/test_goal_kernel.cpp @@ -37,7 +37,7 @@ class Jak1KernelTest : public testing::Test { void TearDown() {} struct SharedCompiler { - SharedCompiler(GameVersion v) : compiler(v) {} + SharedCompiler(GameVersion v) : compiler(v, emitter::InstructionSet::X86) {} std::thread runtime_thread; Compiler compiler; GoalTest::CompilerTestRunner runner; diff --git a/test/goalc/test_goal_kernel2.cpp b/test/goalc/test_goal_kernel2.cpp index fd237d2b73..94dbcf33cf 100644 --- a/test/goalc/test_goal_kernel2.cpp +++ b/test/goalc/test_goal_kernel2.cpp @@ -38,7 +38,7 @@ class Jak2KernelTest : public testing::Test { void TearDown() {} struct SharedCompiler { - SharedCompiler(GameVersion v) : 
compiler(v) {} + SharedCompiler(GameVersion v) : compiler(v, emitter::InstructionSet::X86) {} std::thread runtime_thread; Compiler compiler; GoalTest::CompilerTestRunner runner; diff --git a/test/goalc/test_goal_kernel3.cpp b/test/goalc/test_goal_kernel3.cpp index fe458dd937..e25db287f2 100644 --- a/test/goalc/test_goal_kernel3.cpp +++ b/test/goalc/test_goal_kernel3.cpp @@ -38,7 +38,7 @@ class Jak3KernelTest : public testing::Test { void TearDown() {} struct SharedCompiler { - SharedCompiler(GameVersion v) : compiler(v) {} + SharedCompiler(GameVersion v) : compiler(v, emitter::InstructionSet::X86) {} std::thread runtime_thread; Compiler compiler; GoalTest::CompilerTestRunner runner; diff --git a/test/goalc/test_jak2_compiler.cpp b/test/goalc/test_jak2_compiler.cpp index 6572c4a1c7..4bc80a53fb 100644 --- a/test/goalc/test_jak2_compiler.cpp +++ b/test/goalc/test_jak2_compiler.cpp @@ -11,7 +11,7 @@ class Jak2GoalcTests : public testing::TestWithParam { public: static void SetUpTestSuite() { runtime_thread = std::make_unique(std::thread(GoalTest::runtime_no_kernel_jak2)); - compiler = std::make_unique(GameVersion::Jak2); + compiler = std::make_unique(GameVersion::Jak2, emitter::InstructionSet::X86); runner = std::make_unique(); runner->c = compiler.get(); } diff --git a/test/goalc/test_type_consistency.cpp b/test/goalc/test_type_consistency.cpp index 66c8df1168..8fdb59b39f 100644 --- a/test/goalc/test_type_consistency.cpp +++ b/test/goalc/test_type_consistency.cpp @@ -22,7 +22,7 @@ void add_jak3_expected_type_mismatches(Compiler& /*c*/) {} void add_jakx_expected_type_mismatches(Compiler& /*c*/) {} TEST(Jak1TypeConsistency, MANUAL_TEST_TypeConsistencyWithBuildFirst) { - Compiler compiler(GameVersion::Jak1); + Compiler compiler(GameVersion::Jak1, emitter::InstructionSet::X86); compiler.enable_throw_on_redefines(); add_common_expected_type_mismatches(compiler); add_jak1_expected_type_mismatches(compiler); @@ -31,7 +31,7 @@ TEST(Jak1TypeConsistency, 
MANUAL_TEST_TypeConsistencyWithBuildFirst) { } TEST(Jak2TypeConsistency, MANUAL_TEST_TypeConsistencyWithBuildFirst) { - Compiler compiler(GameVersion::Jak2); + Compiler compiler(GameVersion::Jak2, emitter::InstructionSet::X86); compiler.enable_throw_on_redefines(); add_common_expected_type_mismatches(compiler); add_jak2_expected_type_mismatches(compiler); @@ -40,7 +40,7 @@ TEST(Jak2TypeConsistency, MANUAL_TEST_TypeConsistencyWithBuildFirst) { } TEST(Jak3TypeConsistency, TypeConsistencyWithBuildFirst) { - Compiler compiler(GameVersion::Jak3); + Compiler compiler(GameVersion::Jak3, emitter::InstructionSet::X86); compiler.enable_throw_on_redefines(); add_common_expected_type_mismatches(compiler); add_jak3_expected_type_mismatches(compiler); @@ -49,7 +49,7 @@ TEST(Jak3TypeConsistency, TypeConsistencyWithBuildFirst) { } // TEST(JakXTypeConsistency, TypeConsistencyWithBuildFirst) { -// Compiler compiler(GameVersion::JakX); +// Compiler compiler(GameVersion::JakX, emitter::InstructionSet::X86); // compiler.enable_throw_on_redefines(); // add_common_expected_type_mismatches(compiler); // add_jakx_expected_type_mismatches(compiler); @@ -58,7 +58,7 @@ TEST(Jak3TypeConsistency, TypeConsistencyWithBuildFirst) { // } TEST(Jak1TypeConsistency, TypeConsistency) { - Compiler compiler(GameVersion::Jak1); + Compiler compiler(GameVersion::Jak1, emitter::InstructionSet::X86); compiler.enable_throw_on_redefines(); add_common_expected_type_mismatches(compiler); add_jak1_expected_type_mismatches(compiler); @@ -67,7 +67,7 @@ TEST(Jak1TypeConsistency, TypeConsistency) { } TEST(Jak2TypeConsistency, TypeConsistency) { - Compiler compiler(GameVersion::Jak2); + Compiler compiler(GameVersion::Jak2, emitter::InstructionSet::X86); compiler.enable_throw_on_redefines(); add_common_expected_type_mismatches(compiler); add_jak2_expected_type_mismatches(compiler); @@ -76,7 +76,7 @@ TEST(Jak2TypeConsistency, TypeConsistency) { } TEST(Jak3TypeConsistency, TypeConsistency) { - Compiler 
compiler(GameVersion::Jak3); + Compiler compiler(GameVersion::Jak3, emitter::InstructionSet::X86); compiler.enable_throw_on_redefines(); add_common_expected_type_mismatches(compiler); add_jak3_expected_type_mismatches(compiler); @@ -85,7 +85,7 @@ TEST(Jak3TypeConsistency, TypeConsistency) { } // TEST(JakXTypeConsistency, TypeConsistency) { -// Compiler compiler(GameVersion::JakX); +// Compiler compiler(GameVersion::JakX, emitter::InstructionSet::X86); // compiler.enable_throw_on_redefines(); // add_common_expected_type_mismatches(compiler); // add_jakx_expected_type_mismatches(compiler); diff --git a/test/goalc/test_variables.cpp b/test/goalc/test_variables.cpp index 554a498776..c725d1ce6a 100644 --- a/test/goalc/test_variables.cpp +++ b/test/goalc/test_variables.cpp @@ -32,7 +32,7 @@ class VariableTests : public testing::TestWithParam { void TearDown() {} struct SharedCompiler { - SharedCompiler(GameVersion version) : compiler(version) {} + SharedCompiler(GameVersion version) : compiler(version, emitter::InstructionSet::X86) {} std::thread runtime_thread; Compiler compiler; GoalTest::CompilerTestRunner runner; diff --git a/test/goalc/test_vector_float.cpp b/test/goalc/test_vector_float.cpp index 842d89a355..a937614ff3 100644 --- a/test/goalc/test_vector_float.cpp +++ b/test/goalc/test_vector_float.cpp @@ -51,7 +51,7 @@ class WithMinimalGameTests : public ::testing::Test { void TearDown() {} struct SharedCompiler { - SharedCompiler(GameVersion v) : compiler(v) {} + SharedCompiler(GameVersion v) : compiler(v, emitter::InstructionSet::X86) {} std::thread runtime_thread; Compiler compiler; GoalTest::CompilerTestRunner runner; diff --git a/test/goalc/test_with_game.cpp b/test/goalc/test_with_game.cpp index d5ee5985dc..2b16a7e2a7 100644 --- a/test/goalc/test_with_game.cpp +++ b/test/goalc/test_with_game.cpp @@ -49,7 +49,7 @@ class WithGameTests : public ::testing::Test { void TearDown() {} struct SharedCompiler { - SharedCompiler(GameVersion v) : compiler(v) {} + 
SharedCompiler(GameVersion v) : compiler(v, emitter::InstructionSet::X86) {} std::thread runtime_thread; Compiler compiler; GoalTest::CompilerTestRunner runner; diff --git a/test/offline/config/jak1/config.jsonc b/test/offline/config/jak1/config.jsonc index 224a8abc4e..1eb0548be1 100644 --- a/test/offline/config/jak1/config.jsonc +++ b/test/offline/config/jak1/config.jsonc @@ -28,13 +28,11 @@ "DGO/DAR.DGO", "DGO/TIT.DGO" ], - "skip_compile_files": [ "timer", // accessing timer regs "display", // interrupt handlers "target-snowball" // screwed up labels, likely cut content ], - "skip_compile_functions": [ /// GCOMMON // these functions are not implemented by the compiler in OpenGOAL, but are in GOAL. @@ -47,212 +45,154 @@ "breakpoint-range-set!", // inline assembly "valid?", - /// GKERNEL // asm "(method 10 process)", "(method 14 dead-pool)", - /// GSTATE "enter-state", // stack pointer asm - /// MATH "rand-vu-init", "rand-vu", "rand-vu-nostep", // random hardware - // trig "sin-rad", // fpu acc "cos-rad", // fpu acc "atan-series-rad", // fpu acc - /// VECTOR-H "(method 3 vector)", // this function appears twice, which confuses the compiler. "vector4-dot", // fpu acc - "(method 3 profile-frame)", // double definition. - // dma-disasm "disasm-dma-list", // missing a single cast :( - // math camera "transform-point-vector!", "transform-point-qword!", "transform-point-vector-scale!", - // display-h "put-draw-env", - // geometry "calculate-basis-functions-vector!", // asm requiring manual rewrite "curve-evaluate!", // asm requiring manual rewrite "point-in-triangle-cross", // logior on floats manual fixup - // texture "(method 9 texture-page-dir)", // multiplication on pointers "adgif-shader<-texture-with-update!", // misrecognized bitfield stuff. - // asm "invalidate-cache-line", - // stats-h "(method 11 perf-stat)", "(method 12 perf-stat)", - // sprite-distorter "sprite-draw-distorters", // uses clipping flag. 
- // sync-info "(method 15 sync-info)", // needs display stuff first "(method 15 sync-info-eased)", // needs display stuff first "(method 15 sync-info-paused)", // needs display stuff first - // sparticle "lookup-part-group-pointer-by-name", // address of element in array issue - // ripple - calls an asm function "ripple-execute", - "get-task-status", - "print-game-text-scaled", // float/int, looks like a bug in original code? - // aligner - return-from-thread, currently not supported "(method 9 align-control)", - // stat collection "start-perf-stat-collection", "end-perf-stat-collection", - // double definition "(method 3 game-save)", - // new stack boxed array "update-time-of-day", - // weird asm, was rewritten "close-sky-buffer", - // float to int "(method 10 bsp-header)", - // multiply defined. "(method 3 sprite-aux-list)", - // camera "slave-set-rotation!", "v-slrp2!", "v-slrp3!", // vector-dot involving the stack - // function returning float with a weird cast. "debug-menu-item-var-make-float", - // decompiler BUG "level-hint-task-process", "(method 26 level)", "(method 9 level)", "(method 10 level)", // asm - // cam-states "cam-los-collide", // vector-dot involving the stack - // cam-layout "cam-layout-save-cam-trans", // temporary, im sure this can be fixed - // anim-tester "(method 3 anim-tester)", "anim-tester-save-object-seqs", // anim-tester -- new basic on the stack - // default-menu "all-texture-tweak-adjust", // dynamic-field access placeholder case TODO "debug-menu-make-instance-menu", // also disabled - // joint "(method 9 art-mesh-geo)", // PLACEHOLDER array access "flatten-joint-control-to-spr", "make-joint-jump-tables", "(method 5 art-joint-anim)", // defined identically twice in the same file...probably a bug? 
- // process-drawable "fill-skeleton-cache", // cache dxwbin "execute-math-engine", // handle casts -- was fixed manually - // ambient - "ambient-type-music", // IR_StoreConstOffset::do_codegen can't handle this (c {} sz {}) - + "ambient-type-music", // IR_StoreConstOffset::do_codegen_x86 can't handle this (c {} sz {}) // main "display-loop", "on", - // target-handler "target-generic-event-handler", // return type forced to none - // shadow-cpu-h "(method 3 shadow-edge)", // defined twice in the same file, one is wrong and old - // sky - these are skipped and not used "sky-draw", "sky-upload", "sky-add-frame-data", - // drawable "vis-cull", // unsupported asm "draw-instance-info", // skipped for now, debug only "foreground-engine-execute", "real-main-draw-hook", // dma handling not complete - // generic-obs "command-get-process", // handle casts - // navigate "end-collect-nav", "start-collect-nav", - // appears twice "(method 9 drawable-tree-instance-tie)", "(method 11 drawable-tree-instance-tie)", "(method 12 drawable-tree-instance-tie)", "(method 13 drawable-tree-instance-tie)", - "ray-triangle-intersect", // requires SLL implementation - "(method 51 snow-bunny)", // bitfield problem - "ice-cube-default-event-handler", // return casted to none issue "(method 51 ice-cube)", // bitfield problem - "(method 13 collide-mesh)", // scratchpad sadness "(method 10 collide-mesh)", // collide-mesh-cache-tri handling - // not in use in PC port "tie-near-init-engine", "tie-near-end-buffer", - "(method 19 process-drawable)", - "curve-evaluate!", - "generic-reset-buffers", "generic-merc-execute-all", - /// COLLIDE-EDGE-GRAB "(method 9 edge-grab-info)", // asm - /// COLLIDE-SHAPE-RIDER // type mess "(method 22 collide-shape-prim-mesh)", - /// COLLIDE-REACTION-TARGET "poly-find-nearest-edge", - /// GLIST // i dont even want to know "glst-find-node-by-name", "glst-length-of-longest-name", - "race-time-save" ], - "skip_compile_states": { "cam-master-active": [ "event" @@ -267,4 
+207,4 @@ "code" // dead code not analyzed properly after a loop ] } -} +} \ No newline at end of file diff --git a/test/offline/framework/execution.cpp b/test/offline/framework/execution.cpp index ac19781559..36407a373a 100644 --- a/test/offline/framework/execution.cpp +++ b/test/offline/framework/execution.cpp @@ -104,7 +104,7 @@ OfflineTestCompileResult compile(OfflineTestDecompiler& dc, const OfflineTestWorkGroup& work_group, const OfflineTestConfig& config) { OfflineTestCompileResult result; - Compiler compiler(game_name_to_version(config.game_name)); + Compiler compiler(game_name_to_version(config.game_name), emitter::InstructionSet::X86); compiler.run_front_end_on_file( {"decompiler", "config", game_name_to_all_types[config.game_name]}); diff --git a/test/test_CodeTester.cpp b/test/test_CodeTester.cpp deleted file mode 100644 index a70c1e77eb..0000000000 --- a/test/test_CodeTester.cpp +++ /dev/null @@ -1,233 +0,0 @@ -/*! - * @file test_CodeTester.cpp - * Tests for the CodeTester, a tool for testing the emitter by emitting code and running it - * from within the test application. - * - * These tests should just make sure the basic functionality of CodeTester works, and that it - * can generate prologues/epilogues, and execute them without crashing. 
- */ - -#include "goalc/emitter/CodeTester.h" -#include "goalc/emitter/IGen.h" -#include "gtest/gtest.h" - -using namespace emitter; - -TEST(CodeTester, prologue) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit_push_all_gprs(); - // check we generate the right code for pushing all gpr's - EXPECT_EQ(tester.dump_to_hex_string(), - "50 51 52 53 54 55 56 57 41 50 41 51 41 52 41 53 41 54 41 55 41 56 41 57"); -} - -TEST(CodeTester, epilogue) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit_pop_all_gprs(); - // check we generate the right code for popping all gpr's - EXPECT_EQ(tester.dump_to_hex_string(), - "41 5f 41 5e 41 5d 41 5c 41 5b 41 5a 41 59 41 58 5f 5e 5d 5c 5b 5a 59 58"); -} - -TEST(CodeTester, execute_return) { - CodeTester tester; - tester.init_code_buffer(256); - // test creating a function which simply returns - tester.emit_return(); - // and execute it! - tester.execute(); -} - -TEST(CodeTester, execute_push_pop_gprs) { - CodeTester tester; - tester.init_code_buffer(256); - // test we can push/pop gprs without crashing. - tester.emit_push_all_gprs(); - tester.emit_pop_all_gprs(); - tester.emit_return(); - tester.execute(); -} - -TEST(CodeTester, xmm_store_128) { - CodeTester tester; - tester.init_code_buffer(256); - // movdqa [rbx], xmm3 - // movdqa [r14], xmm3 - // movdqa [rbx], xmm14 - // movdqa [r14], xmm13 - tester.emit(IGen::store128_gpr64_xmm128(RBX, XMM3)); - tester.emit(IGen::store128_gpr64_xmm128(R14, XMM3)); - tester.emit(IGen::store128_gpr64_xmm128(RBX, XMM14)); - tester.emit(IGen::store128_gpr64_xmm128(R14, XMM13)); - EXPECT_EQ(tester.dump_to_hex_string(), - "66 0f 7f 1b 66 41 0f 7f 1e 66 44 0f 7f 33 66 45 0f 7f 2e"); - - tester.clear(); - tester.emit(IGen::store128_gpr64_xmm128(RSP, XMM1)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 0f 7f 0c 24"); // requires SIB byte. 
- - tester.clear(); - tester.emit(IGen::store128_gpr64_xmm128(R12, XMM13)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 45 0f 7f 2c 24"); // requires SIB byte and REX byte - - tester.clear(); - tester.emit(IGen::store128_gpr64_xmm128(RBP, XMM1)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 0f 7f 4d 00"); - - tester.clear(); - tester.emit(IGen::store128_gpr64_xmm128(RBP, XMM11)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 44 0f 7f 5d 00"); - - tester.clear(); - tester.emit(IGen::store128_gpr64_xmm128(R13, XMM2)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 41 0f 7f 55 00"); - - tester.clear(); - tester.emit(IGen::store128_gpr64_xmm128(R13, XMM12)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 45 0f 7f 65 00"); - - // tester.emit(IGen::store128_gpr64_xmm128(RBX, XMM3)); - // tester.emit(IGen::store128_gpr64_xmm128(R14, XMM3)); - // tester.emit(IGen::store128_gpr64_xmm128(RBX, XMM14)); - // tester.emit(IGen::store128_gpr64_xmm128(R14, XMM13)); - // EXPECT_EQ(tester.dump_to_hex_string(), - // "f3 0f 7f 1b f3 41 0f 7f 1e f3 44 0f 7f 33 f3 45 0f 7f 2e"); - // - // tester.clear(); - // tester.emit(IGen::store128_gpr64_xmm128(RSP, XMM1)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 7f 0c 24"); // requires SIB byte. 
- // - // tester.clear(); - // tester.emit(IGen::store128_gpr64_xmm128(R12, XMM13)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 45 0f 7f 2c 24"); // requires SIB byte and REX - // byte - // - // tester.clear(); - // tester.emit(IGen::store128_gpr64_xmm128(RBP, XMM1)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 7f 4d 00"); - // - // tester.clear(); - // tester.emit(IGen::store128_gpr64_xmm128(RBP, XMM11)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 44 0f 7f 5d 00"); - // - // tester.clear(); - // tester.emit(IGen::store128_gpr64_xmm128(R13, XMM2)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 41 0f 7f 55 00"); - // - // tester.clear(); - // tester.emit(IGen::store128_gpr64_xmm128(R13, XMM12)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 45 0f 7f 65 00"); -} - -TEST(CodeTester, sub_gpr64_imm8) { - CodeTester tester; - tester.init_code_buffer(256); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::sub_gpr64_imm8s(i, -1)); - } - EXPECT_EQ(tester.dump_to_hex_string(true), - "4883E8FF4883E9FF4883EAFF4883EBFF4883ECFF4883EDFF4883EEFF4883EFFF4983E8FF4983E9FF4983EA" - "FF4983EBFF4983ECFF4983EDFF4983EEFF4983EFFF"); -} - -TEST(CodeTester, add_gpr64_imm8) { - CodeTester tester; - tester.init_code_buffer(256); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::add_gpr64_imm8s(i, -1)); - } - EXPECT_EQ(tester.dump_to_hex_string(true), - "4883C0FF4883C1FF4883C2FF4883C3FF4883C4FF4883C5FF4883C6FF4883C7FF4983C0FF4983C1FF4983C2" - "FF4983C3FF4983C4FF4983C5FF4983C6FF4983C7FF"); -} - -TEST(CodeTester, xmm_load_128) { - CodeTester tester; - tester.init_code_buffer(256); - - // tester.emit(IGen::load128_xmm128_gpr64(XMM3, RBX)); - // tester.emit(IGen::load128_xmm128_gpr64(XMM3, R14)); - // tester.emit(IGen::load128_xmm128_gpr64(XMM14, RBX)); - // tester.emit(IGen::load128_xmm128_gpr64(XMM13, R14)); - // EXPECT_EQ(tester.dump_to_hex_string(), - // "f3 0f 6f 1b f3 41 0f 6f 1e f3 44 0f 6f 33 f3 45 0f 6f 2e"); - // - // tester.clear(); - // 
tester.emit(IGen::load128_xmm128_gpr64(XMM1, RSP)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 6f 0c 24"); // requires SIB byte. - // - // tester.clear(); - // tester.emit(IGen::load128_xmm128_gpr64(XMM13, R12)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 45 0f 6f 2c 24"); // requires SIB byte and REX - // byte - // - // tester.clear(); - // tester.emit(IGen::load128_xmm128_gpr64(XMM1, RBP)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 6f 4d 00"); - // - // tester.clear(); - // tester.emit(IGen::load128_xmm128_gpr64(XMM11, RBP)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 44 0f 6f 5d 00"); - // - // tester.clear(); - // tester.emit(IGen::load128_xmm128_gpr64(XMM2, R13)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 41 0f 6f 55 00"); - // - // tester.clear(); - // tester.emit(IGen::load128_xmm128_gpr64(XMM12, R13)); - // EXPECT_EQ(tester.dump_to_hex_string(), "f3 45 0f 6f 65 00"); - tester.emit(IGen::load128_xmm128_gpr64(XMM3, RBX)); - tester.emit(IGen::load128_xmm128_gpr64(XMM3, R14)); - tester.emit(IGen::load128_xmm128_gpr64(XMM14, RBX)); - tester.emit(IGen::load128_xmm128_gpr64(XMM13, R14)); - EXPECT_EQ(tester.dump_to_hex_string(), - "66 0f 6f 1b 66 41 0f 6f 1e 66 44 0f 6f 33 66 45 0f 6f 2e"); - - tester.clear(); - tester.emit(IGen::load128_xmm128_gpr64(XMM1, RSP)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 0f 6f 0c 24"); // requires SIB byte. 
- - tester.clear(); - tester.emit(IGen::load128_xmm128_gpr64(XMM13, R12)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 45 0f 6f 2c 24"); // requires SIB byte and REX byte - - tester.clear(); - tester.emit(IGen::load128_xmm128_gpr64(XMM1, RBP)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 0f 6f 4d 00"); - - tester.clear(); - tester.emit(IGen::load128_xmm128_gpr64(XMM11, RBP)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 44 0f 6f 5d 00"); - - tester.clear(); - tester.emit(IGen::load128_xmm128_gpr64(XMM2, R13)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 41 0f 6f 55 00"); - - tester.clear(); - tester.emit(IGen::load128_xmm128_gpr64(XMM12, R13)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 45 0f 6f 65 00"); -} - -TEST(CodeTester, push_pop_xmms) { - CodeTester tester; - tester.init_code_buffer(512); - tester.emit_push_all_xmms(); - tester.emit_pop_all_xmms(); - tester.emit_return(); - tester.execute(); -} - -TEST(CodeTester, push_pop_all_the_things) { - CodeTester tester; - tester.init_code_buffer(512); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(); - - // ... 
- tester.emit_pop_all_gprs(); - tester.emit_pop_all_xmms(); - tester.emit_return(); - tester.execute(); -} diff --git a/test/test_emitter.cpp b/test/test_emitter.cpp index b39ce889a7..4538b1a7c1 100644 --- a/test/test_emitter.cpp +++ b/test/test_emitter.cpp @@ -1,3901 +1,3903 @@ -#include "goalc/emitter/CodeTester.h" -#include "goalc/emitter/IGen.h" -#include "gtest/gtest.h" - -using namespace emitter; - -TEST(EmitterIntegerMath, add_gpr64_imm8s) { - CodeTester tester; - tester.init_code_buffer(256); - - std::vector vals = {0, 1, -1, INT32_MIN, INT32_MAX, INT64_MIN, INT64_MAX}; - std::vector imms = {0, 1, -1, INT8_MIN, INT8_MAX}; - - // test the ones that aren't rsp - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (auto val : vals) { - for (auto imm : imms) { - auto expected = val + imm; - - tester.clear(); - tester.emit_push_all_gprs(true); - - // move initial value to register - tester.emit(IGen::mov_gpr64_gpr64(i, tester.get_c_abi_arg_reg(0))); - // do the add - tester.emit(IGen::add_gpr64_imm8s(i, imm)); - // move for return - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - auto result = tester.execute_ret(val, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - - tester.clear(); - tester.emit(IGen::add_gpr64_imm8s(RSP, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 83 c4 0c"); -} - -TEST(EmitterIntegerMath, add_gpr64_imm32s) { - CodeTester tester; - tester.init_code_buffer(256); - - std::vector vals = {0, 1, -1, INT32_MIN, INT32_MAX, INT64_MIN, INT64_MAX}; - std::vector imms = {0, 1, -1, INT8_MIN, INT8_MAX, INT32_MIN, INT32_MAX}; - - // test the ones that aren't rsp - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (auto val : vals) { - for (auto imm : imms) { - auto expected = val + imm; - - tester.clear(); - tester.emit_push_all_gprs(true); - - // move initial value to register - tester.emit(IGen::mov_gpr64_gpr64(i, 
tester.get_c_abi_arg_reg(0))); - // do the add - tester.emit(IGen::add_gpr64_imm32s(i, imm)); - // move for return - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - auto result = tester.execute_ret(val, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - - tester.clear(); - tester.emit(IGen::add_gpr64_imm32s(RSP, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 81 c4 0c 00 00 00"); -} - -TEST(EmitterIntegerMath, sub_gpr64_imm8s) { - CodeTester tester; - tester.init_code_buffer(256); - - std::vector vals = {0, 1, -1, INT32_MIN, INT32_MAX, INT64_MIN, INT64_MAX}; - std::vector imms = {0, 1, -1, INT8_MIN, INT8_MAX}; - - // test the ones that aren't rsp - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (auto val : vals) { - for (auto imm : imms) { - auto expected = val - imm; - - tester.clear(); - tester.emit_push_all_gprs(true); - - // move initial value to register - tester.emit(IGen::mov_gpr64_gpr64(i, tester.get_c_abi_arg_reg(0))); - // do the add - tester.emit(IGen::sub_gpr64_imm8s(i, imm)); - // move for return - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - auto result = tester.execute_ret(val, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - - tester.clear(); - tester.emit(IGen::sub_gpr64_imm8s(RSP, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 83 ec 0c"); -} - -TEST(EmitterIntegerMath, sub_gpr64_imm32s) { - CodeTester tester; - tester.init_code_buffer(256); - - std::vector vals = {0, 1, -1, INT32_MIN, INT32_MAX, INT64_MIN, INT64_MAX}; - std::vector imms = {0, 1, -1, INT8_MIN, INT8_MAX, INT32_MIN, INT32_MAX}; - - // test the ones that aren't rsp - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (auto val : vals) { - for (auto imm : imms) { - auto expected = val - imm; - - tester.clear(); - tester.emit_push_all_gprs(true); - - // move initial value to register - 
tester.emit(IGen::mov_gpr64_gpr64(i, tester.get_c_abi_arg_reg(0))); - // do the add - tester.emit(IGen::sub_gpr64_imm32s(i, imm)); - // move for return - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - auto result = tester.execute_ret(val, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - - tester.clear(); - tester.emit(IGen::sub_gpr64_imm32s(RSP, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 81 ec 0c 00 00 00"); -} - -TEST(EmitterIntegerMath, add_gpr64_gpr64) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (auto v1 : vals) { - for (auto v2 : vals) { - auto expected = v1 + v2; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v1)); - tester.emit(IGen::mov_gpr64_u64(j, v2)); - tester.emit(IGen::add_gpr64_gpr64(i, j)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - } -} - -TEST(EmitterIntegerMath, sub_gpr64_gpr64) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (auto v1 : vals) { - for (auto v2 : vals) { - auto expected = v1 - v2; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v1)); - tester.emit(IGen::mov_gpr64_u64(j, v2)); - tester.emit(IGen::sub_gpr64_gpr64(i, j)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - 
tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - } -} - -TEST(EmitterIntegerMath, mul_gpr32_gpr32) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = { - 0, 1, -2, -20, 123123, INT32_MIN, INT32_MAX, INT32_MIN + 1, INT32_MAX - 1}; - - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (auto v1 : vals) { - for (auto v2 : vals) { - // this is kind of weird behavior, but it's what the PS2 CPU does, I think. - // the lower 32-bits of the result are sign extended, even if this sign doesn't match - // the sign of the real product. This is true for both signed and unsigned multiply. - auto expected = ((s64(v1) * s64(v2)) << 32) >> 32; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, (s64)v1)); - tester.emit(IGen::mov_gpr64_u64(j, (s64)v2)); - tester.emit(IGen::imul_gpr32_gpr32(i, j)); - tester.emit(IGen::movsx_r64_r32(RAX, i)); // weird PS2 sign extend. 
- tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - } -} - -TEST(EmitterIntegerMath, or_gpr64_gpr64) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (auto v1 : vals) { - for (auto v2 : vals) { - auto expected = v1 | v2; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v1)); - tester.emit(IGen::mov_gpr64_u64(j, v2)); - tester.emit(IGen::or_gpr64_gpr64(i, j)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - } -} - -TEST(EmitterIntegerMath, and_gpr64_gpr64) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (auto v1 : vals) { - for (auto v2 : vals) { - auto expected = v1 & v2; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v1)); - tester.emit(IGen::mov_gpr64_u64(j, v2)); - tester.emit(IGen::and_gpr64_gpr64(i, j)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - } -} - -TEST(EmitterIntegerMath, xor_gpr64_gpr64) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - for (int i 
= 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (auto v1 : vals) { - for (auto v2 : vals) { - auto expected = v1 ^ v2; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v1)); - tester.emit(IGen::mov_gpr64_u64(j, v2)); - tester.emit(IGen::xor_gpr64_gpr64(i, j)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } - } -} - -TEST(EmitterIntegerMath, not_gpr64) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (auto v1 : vals) { - auto expected = ~v1; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v1)); - tester.emit(IGen::not_gpr64(i)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } -} - -TEST(EmitterIntegerMath, shl_gpr64_cl) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - std::vector sas = {0, 1, 23, 53, 64}; - - for (int i = 0; i < 16; i++) { - if (i == RSP || i == RCX) { - continue; - } - for (auto v : vals) { - for (auto sa : sas) { - auto expected = v << sa; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v)); - tester.emit(IGen::mov_gpr64_u64(RCX, sa)); - tester.emit(IGen::shl_gpr64_cl(i)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } 
- } - } -} - -TEST(EmitterIntegerMath, shr_gpr64_cl) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, u64(-2), u64(INT32_MIN), INT32_MAX, u64(INT64_MIN), - INT64_MAX, 117, 32, u64(-348473), 83747382}; - std::vector sas = {0, 1, 23, 53, 64}; - - for (int i = 0; i < 16; i++) { - if (i == RSP || i == RCX) { - continue; - } - for (auto v : vals) { - for (auto sa : sas) { - auto expected = v >> sa; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v)); - tester.emit(IGen::mov_gpr64_u64(RCX, sa)); - tester.emit(IGen::shr_gpr64_cl(i)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } -} - -TEST(EmitterIntegerMath, sar_gpr64_cl) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - std::vector sas = {0, 1, 23, 53, 64}; - - for (int i = 0; i < 16; i++) { - if (i == RSP || i == RCX) { - continue; - } - for (auto v : vals) { - for (auto sa : sas) { - auto expected = v >> sa; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v)); - tester.emit(IGen::mov_gpr64_u64(RCX, sa)); - tester.emit(IGen::sar_gpr64_cl(i)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } -} - -TEST(EmitterIntegerMath, shl_gpr64_u8) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - std::vector sas = {0, 1, 23, 53, 64}; - - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (auto v : vals) { - for (auto sa : sas) { - auto expected = v << sa; - tester.clear(); - 
tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v)); - tester.emit(IGen::shl_gpr64_u8(i, sa)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } -} - -TEST(EmitterIntegerMath, shr_gpr64_u8) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, u64(-2), u64(INT32_MIN), INT32_MAX, u64(INT64_MIN), - INT64_MAX, 117, 32, u64(-348473), 83747382}; - std::vector sas = {0, 1, 23, 53, 64}; - - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (auto v : vals) { - for (auto sa : sas) { - auto expected = v >> sa; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v)); - tester.emit(IGen::shr_gpr64_u8(i, sa)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } -} - -TEST(EmitterIntegerMath, sar_gpr64_u8) { - CodeTester tester; - tester.init_code_buffer(256); - std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, - INT64_MAX, 117, 32, -348473, 83747382}; - std::vector sas = {0, 1, 23, 53, 64}; - - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (auto v : vals) { - for (auto sa : sas) { - auto expected = v >> sa; - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, v)); - tester.emit(IGen::sar_gpr64_u8(i, sa)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } -} - -TEST(EmitterIntegerMath, jumps) { - CodeTester tester; - tester.init_code_buffer(256); - - std::vector reads; - - auto x = IGen::jmp_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - 
x = IGen::je_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - x = IGen::jne_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - x = IGen::jle_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - x = IGen::jge_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - x = IGen::jl_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - x = IGen::jg_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - x = IGen::jbe_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - x = IGen::jae_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - x = IGen::jb_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - x = IGen::ja_32(); - reads.push_back(tester.size() + x.offset_of_imm()); - tester.emit(x); - - for (auto off : reads) { - EXPECT_EQ(0, tester.read(off)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "E9000000000F84000000000F85000000000F8E000000000F8D000000000F8C000000000F8F000000000F86" - "000000000F83000000000F82000000000F8700000000"); -} - -TEST(EmitterIntegerMath, null) { - auto instr = IGen::null(); - EXPECT_EQ(0, instr.emit(nullptr)); -} - -TEST(EmitterLoadsAndStores, load_constant_64_and_move_gpr_gpr_64) { - std::vector u64_constants = {0, UINT64_MAX, INT64_MAX, 7, 12}; - - // test we can load a 64-bit constant into all gprs, move it to any other gpr, and return it. 
- // rsp is skipping because that's the stack pointer and would prevent us from popping gprs after - - CodeTester tester; - tester.init_code_buffer(256); - - for (auto constant : u64_constants) { - for (int r1 = 0; r1 < 16; r1++) { - if (r1 == RSP) { - continue; - } - - for (int r2 = 0; r2 < 16; r2++) { - if (r2 == RSP) { - continue; - } - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(r1, constant)); - tester.emit(IGen::mov_gpr64_gpr64(r2, r1)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, r2)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - EXPECT_EQ(tester.execute(), constant); - } - } - } -} - -TEST(EmitterLoadsAndStores, load_constant_32_unsigned) { - std::vector u64_constants = {0, UINT32_MAX, INT32_MAX, 7, 12}; - - // test loading 32-bit constants, with all upper 32-bits zero. - // this uses a different opcode than 64-bit loads. - CodeTester tester; - tester.init_code_buffer(256); - - for (auto constant : u64_constants) { - for (int r1 = 0; r1 < 16; r1++) { - if (r1 == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(r1, UINT64_MAX)); - tester.emit(IGen::mov_gpr64_u32(r1, constant)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, r1)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - EXPECT_EQ(tester.execute(), constant); - } - } -} - -TEST(EmitterLoadsAndStores, load_constant_32_signed) { - std::vector s32_constants = {0, 1, INT32_MAX, INT32_MIN, 12, -1}; - - // test loading signed 32-bit constants. for values < 0 this will sign extend. 
- CodeTester tester; - tester.init_code_buffer(256); - - for (auto constant : s32_constants) { - for (int r1 = 0; r1 < 16; r1++) { - if (r1 == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_s32(r1, constant)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, r1)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - EXPECT_EQ(tester.execute(), constant); - } - } -} - -TEST(EmitterLoadsAndStores, load8s_gpr64_goal_ptr_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f be 04 1e"); - - tester.clear(); - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(R12, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "4c 0f be 24 1e"); - - tester.clear(); - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(R12, R15, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "4e 0f be 24 3e"); - - tester.clear(); - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(R12, R15, R14)); - EXPECT_EQ(tester.dump_to_hex_string(), "4f 0f be 24 3e"); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 
0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(k, i, j)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; - - // run! - EXPECT_EQ(s64(tester.execute((u64)memory, 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 2, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 4, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 5, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load8s_gpr64_gpr64_gpr64_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f be 44 1e fd"); - - auto instr = IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! 
- tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; - - // run! - EXPECT_EQ(s64(tester.execute((u64)memory, 3 + 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 2 + 3, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 5 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load8s_gpr64_gpr64_gpr64_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f be 84 1e fd ff ff ff"); - - auto instr = IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 3 + 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 2 + 3, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 5 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load8u_gpr64_goal_ptr_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b6 04 1e"); - - tester.clear(); - tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64(R12, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "4c 0f b6 24 1e"); - - tester.clear(); - tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64(R12, R15, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "4e 0f b6 24 3e"); - - tester.clear(); - tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64(R12, R15, R14)); - EXPECT_EQ(tester.dump_to_hex_string(), "4f 0f b6 24 3e"); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(k, i, j)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! 
- tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; - - // run! - EXPECT_EQ(s64(tester.execute((u64)memory, 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 2, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 4, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 5, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load8u_gpr64_gpr64_gpr64_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b6 44 1e fd"); - - auto instr = IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 3 + 3, 0, 0)), 0xfe); - EXPECT_EQ(s64(tester.execute((u64)memory, 2 + 3, 0, 0)), 0xfd); - EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), 0xff); - EXPECT_EQ(s64(tester.execute((u64)memory, 5 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load8u_gpr64_gpr64_gpr64_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b6 84 1e fd ff ff ff"); - - auto instr = IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 3 + 3, 0, 0)), 0xfe); - EXPECT_EQ(s64(tester.execute((u64)memory, 2 + 3, 0, 0)), 0xfd); - EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), 0xff); - EXPECT_EQ(s64(tester.execute((u64)memory, 5 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load16s_gpr64_goal_ptr_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f bf 04 1e"); - - tester.clear(); - tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(R12, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "4c 0f bf 24 1e"); - - tester.clear(); - tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(R12, R15, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "4e 0f bf 24 3e"); - - tester.clear(); - tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(R12, R15, R14)); - EXPECT_EQ(tester.dump_to_hex_string(), "4f 0f bf 24 3e"); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(k, i, j)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! 
- tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s16 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; - - // run! - EXPECT_EQ(s64(tester.execute((u64)memory, 6, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 4, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 8, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 10, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load16s_gpr64_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f bf 44 1e fd"); - - auto instr = IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u16 memory[8] = {0, 0, 0xfffd, 0xfffe, 0xffff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 6 + 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 10 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load16s_gpr64_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f bf 84 1e fd ff ff ff"); - - auto instr = IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u16 memory[8] = {0, 0, 0xfffd, 0xfffe, 0xffff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 6 + 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 10 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load16u_gpr64_goal_ptr_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b7 04 1e"); - - tester.clear(); - tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(R12, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "4c 0f b7 24 1e"); - - tester.clear(); - tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(R12, R15, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "4e 0f b7 24 3e"); - - tester.clear(); - tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(R12, R15, R14)); - EXPECT_EQ(tester.dump_to_hex_string(), "4f 0f b7 24 3e"); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(k, i, j)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! 
- tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s16 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; - - // run! - EXPECT_EQ(s64(tester.execute((u64)memory, 6, 0, 0)), 0xfffe); - EXPECT_EQ(s64(tester.execute((u64)memory, 4, 0, 0)), 0xfffd); - EXPECT_EQ(s64(tester.execute((u64)memory, 8, 0, 0)), 0xffff); - EXPECT_EQ(s64(tester.execute((u64)memory, 10, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load16u_gpr64_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b7 44 1e fd"); - - auto instr = IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u16 memory[8] = {0, 0, 0xfffd, 0xfffe, 0xffff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 6 + 3, 0, 0)), 0xfffe); - EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), 0xfffd); - EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), 0xffff); - EXPECT_EQ(s64(tester.execute((u64)memory, 10 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load16u_gpr64_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b7 84 1e fd ff ff ff"); - - auto instr = IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u16 memory[8] = {0, 0, 0xfffd, 0xfffe, 0xffff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 6 + 3, 0, 0)), 0xfffe); - EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), 0xfffd); - EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), 0xffff); - EXPECT_EQ(s64(tester.execute((u64)memory, 10 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load32s_gpr64_goal_ptr_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 63 04 1e"); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64(k, i, j)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s32 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 12, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 8, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 16, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 20, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load32s_gpr64_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 63 44 1e fd"); - - auto instr = IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u32 memory[8] = {0, 0, 0xfffffffd, 0xfffffffe, 0xffffffff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 12 + 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 20 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load32s_gpr64_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 63 84 1e fd ff ff ff"); - - auto instr = IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u32 memory[8] = {0, 0, 0xfffffffd, 0xfffffffe, 0xffffffff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 12 + 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 20 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load32u_gpr64_goal_ptr_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "8b 04 1e"); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64(k, i, j)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s32 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 12, 0, 0)), 0xfffffffe); - EXPECT_EQ(s64(tester.execute((u64)memory, 8, 0, 0)), 0xfffffffd); - EXPECT_EQ(s64(tester.execute((u64)memory, 16, 0, 0)), 0xffffffff); - EXPECT_EQ(s64(tester.execute((u64)memory, 20, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load32u_gpr64_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "8b 44 1e fd"); - - auto instr = IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s32 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 12 + 3, 0, 0)), 0xfffffffe); - EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), 0xfffffffd); - EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), 0xffffffff); - EXPECT_EQ(s64(tester.execute((u64)memory, 20 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load32u_gpr64_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "8b 84 1e fd ff ff ff"); - - auto instr = IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - u32 memory[8] = {0, 0, 0xfffffffd, 0xfffffffe, 0xffffffff, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 12 + 3, 0, 0)), 0xfffffffe); - EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), 0xfffffffd); - EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), 0xffffffff); - EXPECT_EQ(s64(tester.execute((u64)memory, 20 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load64_gpr64_goal_ptr_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 8b 04 1e"); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64(k, i, j)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s64 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 24, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 16, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 32, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 40, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load64_gpr64_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 8b 44 1e fd"); - - auto instr = IGen::load64_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s64 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 24 + 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 32 + 3, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 40 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load64_gpr64_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); - - EXPECT_EQ(tester.dump_to_hex_string(), "48 8b 84 1e fd ff ff ff"); - - auto instr = IGen::load64_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // fill k with junk - if (k != i && k != j) { - tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); - } - - // load into k - tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); - - // move k to return register - tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s64 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; - - // run! 
- EXPECT_EQ(s64(tester.execute((u64)memory, 24 + 3, 0, 0)), -2); - EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), -3); - EXPECT_EQ(s64(tester.execute((u64)memory, 32 + 3, 0, 0)), -1); - EXPECT_EQ(s64(tester.execute((u64)memory, 40 + 3, 0, 0)), 0); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store8_gpr64_gpr64_plus_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64(RAX, RCX, RDX)); - EXPECT_EQ(tester.dump_to_hex_string(), "88 14 01"); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store! - tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64(i, j, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s8 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 3, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], 7); - EXPECT_EQ(memory[4], 1); - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store8_gpr64_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "88 54 01 0c"); - - auto instr = IGen::store8_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store - tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64_plus_s8(i, j, k, -3)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s8 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 6, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], 7); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store8_gpr64_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "88 94 01 0c 00 00 00"); - - auto instr = IGen::store8_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store - tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64_plus_s32(i, j, k, -3)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s8 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 6, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], 7); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store16_gpr64_gpr64_plus_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64(RCX, RAX, R8)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 44 89 04 08"); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store! - tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64(i, j, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s16 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 6, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], s16(0xff07)); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store16_gpr64_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, R8, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 44 89 44 01 0c"); - - auto instr = IGen::store16_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store - tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64_plus_s8(i, j, k, -3)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s16 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 6 + 3, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], s16(0xff07)); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store16_gpr64_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, R8, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 44 89 84 01 0c 00 00 00"); - - auto instr = IGen::store16_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store - tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64_plus_s32(i, j, k, -3)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s16 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 6 + 3, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], s16(0xff07)); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store32_gpr64_gpr64_plus_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64(RCX, RAX, R8)); - EXPECT_EQ(tester.dump_to_hex_string(), "44 89 04 08"); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store! - tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64(i, j, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s32 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 12, 0xffffffff12341234, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], 0x12341234); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store32_gpr64_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, R8, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "44 89 44 01 0c"); - - auto instr = IGen::store32_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store - tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64_plus_s8(i, j, k, -3)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s32 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 12 + 3, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], s32(0xffffff07)); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store32_gpr64_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, R8, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "44 89 84 01 0c 00 00 00"); - - auto instr = IGen::store32_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store - tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64_plus_s32(i, j, k, -3)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s32 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 12 + 3, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], s32(0xffffff07)); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store64_gpr64_gpr64_plus_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64(RCX, RAX, R8)); - EXPECT_EQ(tester.dump_to_hex_string(), "4c 89 04 08"); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store! - tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64(i, j, k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s64 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 24, 0xffffffff12341234, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], 0xffffffff12341234); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store64_gpr64_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, R8, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "4c 89 44 01 0c"); - - auto instr = IGen::store64_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store - tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64_plus_s8(i, j, k, -3)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s64 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 24 + 3, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], s64(0xffffffffffffff07)); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, store64_gpr64_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - - tester.clear(); - tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, R8, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "4c 89 84 01 0c 00 00 00"); - - auto instr = IGen::store64_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); - - [[maybe_unused]] int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - if (k == RSP || k == j || k == i) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. - - // store - tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64_plus_s32(i, j, k, -3)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - // prepare the memory: - s64 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 24 + 3, 0xffffffffffffff07, 0); - EXPECT_EQ(memory[2], 3); - EXPECT_EQ(memory[3], s64(0xffffffffffffff07)); - EXPECT_EQ(memory[4], 1); - - iter++; - } - } - } -} - -TEST(EmitterLoadsAndStores, load64_rip) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::load64_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 8b 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::load64_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "488B050C000000488B0D0C000000488B150C000000488B1D0C000000488B250C000000488B2D0C00000048" - "8B350C000000488B3D0C0000004C8B050C0000004C8B0D0C0000004C8B150C0000004C8B1D0C0000004C8B" - "250C0000004C8B2D0C0000004C8B350C0000004C8B3D0C000000"); -} - -TEST(EmitterLoadsAndStores, load32s_rip) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::load32s_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 63 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::load32s_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "4863050C00000048630D0C0000004863150C00000048631D0C0000004863250C00000048632D0C00000048" - "63350C00000048633D0C0000004C63050C0000004C630D0C0000004C63150C0000004C631D0C0000004C63" - "250C0000004C632D0C0000004C63350C0000004C633D0C000000"); -} - -TEST(EmitterLoadsAndStores, load32u_rip) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::load32u_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "8b 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::load32u_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "8B050C0000008B0D0C0000008B150C0000008B1D0C0000008B250C0000008B2D0C0000008B350C0000008B" - "3D0C000000448B050C000000448B0D0C000000448B150C000000448B1D0C000000448B250C000000448B2D" - "0C000000448B350C000000448B3D0C000000"); -} - 
-TEST(EmitterLoadsAndStores, load16u_rip) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::load16u_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b7 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::load16u_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "480FB7050C000000480FB70D0C000000480FB7150C000000480FB71D0C000000480FB7250C000000480FB7" - "2D0C000000480FB7350C000000480FB73D0C0000004C0FB7050C0000004C0FB70D0C0000004C0FB7150C00" - "00004C0FB71D0C0000004C0FB7250C0000004C0FB72D0C0000004C0FB7350C0000004C0FB73D0C000000"); -} - -TEST(EmitterLoadsAndStores, load16s_rip) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::load16s_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f bf 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::load16s_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "480FBF050C000000480FBF0D0C000000480FBF150C000000480FBF1D0C000000480FBF250C000000480FBF" - "2D0C000000480FBF350C000000480FBF3D0C0000004C0FBF050C0000004C0FBF0D0C0000004C0FBF150C00" - "00004C0FBF1D0C0000004C0FBF250C0000004C0FBF2D0C0000004C0FBF350C0000004C0FBF3D0C000000"); -} - -TEST(EmitterLoadsAndStores, load8s_rip) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::load8s_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f be 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::load8s_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "480FBE050C000000480FBE0D0C000000480FBE150C000000480FBE1D0C000000480FBE250C000000480FBE" - "2D0C000000480FBE350C000000480FBE3D0C0000004C0FBE050C0000004C0FBE0D0C0000004C0FBE150C00" - "00004C0FBE1D0C0000004C0FBE250C0000004C0FBE2D0C0000004C0FBE350C0000004C0FBE3D0C000000"); -} - -TEST(EmitterLoadsAndStores, load8u_rip) { - CodeTester tester; - 
tester.init_code_buffer(256); - tester.emit(IGen::load8u_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b6 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::load8u_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "480FB6050C000000480FB60D0C000000480FB6150C000000480FB61D0C000000480FB6250C000000480FB6" - "2D0C000000480FB6350C000000480FB63D0C0000004C0FB6050C0000004C0FB60D0C0000004C0FB6150C00" - "00004C0FB61D0C0000004C0FB6250C0000004C0FB62D0C0000004C0FB6350C0000004C0FB63D0C000000"); -} - -TEST(EmitterLoadsAndStores, store64_rip_s32) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::store64_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "48 89 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::store64_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "4889050C00000048890D0C0000004889150C00000048891D0C0000004889250C00000048892D0C00000048" - "89350C00000048893D0C0000004C89050C0000004C890D0C0000004C89150C0000004C891D0C0000004C89" - "250C0000004C892D0C0000004C89350C0000004C893D0C000000"); -} - -TEST(EmitterLoadsAndStores, store32_rip_s32) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::store32_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "89 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::store32_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "89050C000000890D0C00000089150C000000891D0C00000089250C000000892D0C00000089350C00000089" - "3D0C0000004489050C00000044890D0C0000004489150C00000044891D0C0000004489250C00000044892D" - "0C0000004489350C00000044893D0C000000"); -} - -TEST(EmitterLoadsAndStores, store16_rip_s32) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::store16_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "66 89 05 0c 00 00 00"); - - 
tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::store16_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "6689050C00000066890D0C0000006689150C00000066891D0C0000006689250C00000066892D0C00000066" - "89350C00000066893D0C000000664489050C0000006644890D0C000000664489150C0000006644891D0C00" - "0000664489250C0000006644892D0C000000664489350C0000006644893D0C000000"); -} - -TEST(EmitterLoadsAndStores, store8_rip_s32) { - CodeTester tester; - tester.init_code_buffer(256); - tester.emit(IGen::store8_rip_s32(RAX, 12)); - EXPECT_EQ(tester.dump_to_hex_string(), "88 05 0c 00 00 00"); - - tester.clear(); - for (int i = 0; i < 16; i++) { - tester.emit(IGen::store8_rip_s32(i, 12)); - } - - EXPECT_EQ(tester.dump_to_hex_string(true), - "88050C000000880D0C00000088150C000000881D0C0000004088250C00000040882D0C0000004088350C00" - "000040883D0C0000004488050C00000044880D0C0000004488150C00000044881D0C0000004488250C0000" - "0044882D0C0000004488350C00000044883D0C000000"); -} - -TEST(EmitterLoadsAndStores, static_addr) { - CodeTester tester; - tester.init_code_buffer(512); - - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - tester.clear(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(i, 12345)); // load test reg with junk - int start_of_lea = tester.size(); - auto lea_instr = IGen::static_addr(i, INT32_MAX); - tester.emit(lea_instr); - // patch instruction to lea the start of this code + 1. 
- tester.write(-start_of_lea - lea_instr.length() + 1, - start_of_lea + lea_instr.offset_of_disp()); - tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); - tester.emit_pop_all_gprs(true); - tester.emit_return(); - - auto result = tester.execute(); - EXPECT_EQ(result, (u64)(tester.data()) + 1); - } -} - -#ifdef __linux__ -TEST(EmitterXmm32, load32_xmm32_gpr64_plus_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64(XMM3, RAX, RBX)); - EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 10 1c 03"); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (int k = 0; k < 16; k++) { - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // fill k with junk - tester.emit(IGen::mov_gpr64_u64(i, (iter & 1) ? 0 : UINT64_MAX)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // load into k - tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64(XMM0 + k, i, j)); - // move to return - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - - // prepare the memory: - float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; - - // run! 
- EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 3 * sizeof(float), 0, 0), 3.45f); - EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 2 * sizeof(float), 0, 0), 1.23f); - EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 4 * sizeof(float), 0, 0), 5.67f); - EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 5 * sizeof(float), 0, 0), 0); - - iter++; - } - } - } -} - -TEST(EmitterXmm32, load32_xmm32_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64_plus_s8(XMM3, RAX, RBX, -1)); - EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 10 5c 03 ff"); - - auto instr = IGen::load32_xmm32_gpr64_plus_gpr64_plus_s8(XMM3, RBX, RSI, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (int k = 0; k < 16; k++) { - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // fill k with junk - tester.emit(IGen::mov_gpr64_u64(i, (iter & 1) ? 0 : UINT64_MAX)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - // load into k - tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64_plus_s8(XMM0 + k, i, j, -3)); - // move to return - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - - // prepare the memory: - float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; - - // run! 
- EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 3 * sizeof(float) + 3, 0, 0), 3.45f); - EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 2 * sizeof(float) + 3, 0, 0), 1.23f); - EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 4 * sizeof(float) + 3, 0, 0), 5.67f); - EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 5 * sizeof(float) + 3, 0, 0), 0); - - iter++; - } - } - } -} - -TEST(EmitterXmm32, load32_xmm32_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64_plus_s32(XMM3, RAX, RBX, -1)); - EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 10 9c 03 ff ff ff ff"); - - auto instr = IGen::load32_xmm32_gpr64_plus_gpr64_plus_s32(XMM3, RBX, RSI, -1234); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -1234); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (int k = 0; k < 16; k++) { - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); - - // fill k with junk - tester.emit(IGen::mov_gpr64_u64(i, (iter & 1) ? 0 : UINT64_MAX)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); - - // pop args into appropriate register - tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 - tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 - - s64 offset = (iter & 1) ? INT32_MAX : INT32_MIN; - - // load into k - tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64_plus_s32(XMM0 + k, i, j, offset)); - // move to return - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - - // prepare the memory: - float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; - - // run! 
- EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 3 * sizeof(float) - offset, 0, 0), - 3.45f); - EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 2 * sizeof(float) - offset, 0, 0), - 1.23f); - EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 4 * sizeof(float) - offset, 0, 0), - 5.67f); - EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 5 * sizeof(float) - offset, 0, 0), - 0); - iter++; - } - } - } -} - -namespace { -template -float as_float(T x) { - float result; - memcpy(&result, &x, sizeof(float)); - return result; -} - -u32 as_u32(float x) { - u32 result; - memcpy(&result, &x, 4); - return result; -} -} // namespace - -TEST(EmitterXmm32, store32_xmm32_gpr64_plus_gpr64) { - CodeTester tester; - tester.init_code_buffer(512); - tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64(RAX, RBX, XMM7)); - EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 11 3c 03"); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - - for (int k = 0; k < 16; k++) { - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - // push args to the stack - - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); // addr2 - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); // addr1 - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); // value - - // pop value into addr1 GPR - tester.emit(IGen::pop_gpr64(i)); - // move to XMM - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); - - // pop addrs - tester.emit(IGen::pop_gpr64(i)); - tester.emit(IGen::pop_gpr64(j)); - - // store - tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64(i, j, XMM0 + k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - - // prepare the memory: - float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 12, as_u32(1.234f), 0); - EXPECT_FLOAT_EQ(memory[2], 1.23f); - EXPECT_FLOAT_EQ(memory[3], 1.234f); - EXPECT_FLOAT_EQ(memory[4], 5.67f); - - iter++; - } - } - } -} - -TEST(EmitterXmm32, store32_xmm32_gpr64_plus_gpr64_plus_s8) { - CodeTester tester; - tester.init_code_buffer(512); - tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64_plus_s8(RAX, RBX, XMM3, -1)); - EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 11 5c 03 ff"); - - auto instr = IGen::store32_xmm32_gpr64_plus_gpr64_plus_s8(RBX, RSI, XMM3, -3); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (int k = 0; k < 16; k++) { - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); // addr2 - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); // addr1 - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); // value - - // pop value into addr1 GPR - tester.emit(IGen::pop_gpr64(i)); - // move to XMM - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); - - // pop addrs - tester.emit(IGen::pop_gpr64(i)); - tester.emit(IGen::pop_gpr64(j)); - - s64 offset = (iter & 1) ? INT8_MAX : INT8_MIN; - - // load into k - tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64_plus_s8(i, j, XMM0 + k, offset)); - - // move to return - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - - // prepare the memory: - float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 12 - offset, as_u32(1.234f), 0); - EXPECT_FLOAT_EQ(memory[2], 1.23f); - EXPECT_FLOAT_EQ(memory[3], 1.234f); - EXPECT_FLOAT_EQ(memory[4], 5.67f); - - iter++; - } - } - } -} - -TEST(EmitterXmm32, store32_xmm32_gpr64_plus_gpr64_plus_s32) { - CodeTester tester; - tester.init_code_buffer(512); - tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64_plus_s32(RAX, RBX, XMM3, -1)); - EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 11 9c 03 ff ff ff ff"); - - auto instr = IGen::store32_xmm32_gpr64_plus_gpr64_plus_s32(RBX, RSI, XMM3, -1234); - u8 buff[256]; - instr.emit(buff); - EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -1234); - - int iter = 0; - for (int i = 0; i < 16; i++) { - if (i == RSP) { - continue; - } - for (int j = 0; j < 16; j++) { - if (j == RSP || j == i) { - continue; - } - for (int k = 0; k < 16; k++) { - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - // push args to the stack - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); // addr2 - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); // addr1 - tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); // value - - // pop value into addr1 GPR - tester.emit(IGen::pop_gpr64(i)); - // move to XMM - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); - - // pop addrs - tester.emit(IGen::pop_gpr64(i)); - tester.emit(IGen::pop_gpr64(j)); - - s64 offset = (iter & 1) ? INT32_MAX : INT32_MIN; - - // load into k - tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64_plus_s32(i, j, XMM0 + k, offset)); - - // move to return - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); - - // return! - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - - // prepare the memory: - float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; - - // run! 
- tester.execute((u64)memory, 12 - offset, as_u32(1.234f), 0); - EXPECT_FLOAT_EQ(memory[2], 1.23f); - EXPECT_FLOAT_EQ(memory[3], 1.234f); - EXPECT_FLOAT_EQ(memory[4], 5.67f); - - iter++; - } - } - } -} - -TEST(EmitterXmm32, static_load_xmm32) { - CodeTester tester; - tester.init_code_buffer(512); - for (int i = 0; i < 16; i++) { - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - - auto loc_of_load = tester.size(); - auto load_instr = IGen::static_load_xmm32(XMM0 + i, INT32_MAX); - - tester.emit(load_instr); - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + i)); - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - auto loc_of_float = tester.emit_data(float(1.2345f)); - - // patch offset - tester.write(loc_of_float - loc_of_load - load_instr.length(), - loc_of_load + load_instr.offset_of_disp()); - - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_FLOAT_EQ(result, 1.2345f); - } -} - -TEST(EmitterXmm32, static_store_xmm32) { - CodeTester tester; - tester.init_code_buffer(512); - for (int i = 0; i < 16; i++) { - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, tester.get_c_abi_arg_reg(0))); - - auto loc_of_store = tester.size(); - auto store_instr = IGen::static_store_xmm32(XMM0 + i, INT32_MAX); - - tester.emit(store_instr); - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - auto loc_of_float = tester.emit_data(float(1.2345f)); - - tester.write(loc_of_float - loc_of_store - store_instr.length(), - loc_of_store + store_instr.offset_of_disp()); - tester.execute(as_u32(-44.567f), 0, 0, 0); - EXPECT_FLOAT_EQ(-44.567f, tester.read(loc_of_float)); - } -} - -TEST(EmitterXmm32, ucomiss) { - CodeTester tester; - tester.init_code_buffer(512); - tester.emit(IGen::cmp_flt_flt(XMM13, XMM14)); - EXPECT_EQ("45 0f 2e ee", tester.dump_to_hex_string()); -} - -TEST(EmitterXmm32, mul) { - 
CodeTester tester; - tester.init_code_buffer(512); - - std::vector vals = {0.f, 1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, 7.545f}; - - for (auto f : vals) { - for (auto g : vals) { - for (int i = 0; i < 16; i++) { - for (int j = 0; j < 16; j++) { - if (i == j) { - continue; - } - auto expected = f * g; - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - u64 val = 0; - memcpy(&val, &f, sizeof(float)); - tester.emit(IGen::mov_gpr64_u64(RAX, val)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); - memcpy(&val, &g, sizeof(float)); - tester.emit(IGen::mov_gpr64_u64(RAX, val)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + j, RAX)); - tester.emit(IGen::mulss_xmm_xmm(XMM0 + j, XMM0 + i)); - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + j)); - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_FLOAT_EQ(result, expected); - } - } - } - } -} - -TEST(EmitterXmm32, div) { - CodeTester tester; - tester.init_code_buffer(512); - - std::vector vals = {1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, 7.545f}; - - for (auto f : vals) { - for (auto g : vals) { - for (int i = 0; i < 16; i++) { - for (int j = 0; j < 16; j++) { - if (i == j) { - continue; - } - auto expected = g / f; - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - u64 val = 0; - memcpy(&val, &f, sizeof(float)); - tester.emit(IGen::mov_gpr64_u64(RAX, val)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); - memcpy(&val, &g, sizeof(float)); - tester.emit(IGen::mov_gpr64_u64(RAX, val)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + j, RAX)); - tester.emit(IGen::divss_xmm_xmm(XMM0 + j, XMM0 + i)); - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + j)); - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_FLOAT_EQ(result, expected); - } - } - } - } -} - -TEST(EmitterXmm32, 
add) { - CodeTester tester; - tester.init_code_buffer(512); - - std::vector vals = {0.f, 1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, 7.545f}; - for (auto f : vals) { - for (auto g : vals) { - for (int i = 0; i < 16; i++) { - for (int j = 0; j < 16; j++) { - if (i == j) { - continue; - } - auto expected = g + f; - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - u64 val = 0; - memcpy(&val, &f, sizeof(float)); - tester.emit(IGen::mov_gpr64_u64(RAX, val)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); - memcpy(&val, &g, sizeof(float)); - tester.emit(IGen::mov_gpr64_u64(RAX, val)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + j, RAX)); - tester.emit(IGen::addss_xmm_xmm(XMM0 + j, XMM0 + i)); - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + j)); - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_FLOAT_EQ(result, expected); - } - } - } - } -} - -TEST(EmitterXmm32, sub) { - CodeTester tester; - tester.init_code_buffer(512); - - std::vector vals = {0.f, 1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, 7.545f}; - - for (auto f : vals) { - for (auto g : vals) { - for (int i = 0; i < 16; i++) { - for (int j = 0; j < 16; j++) { - if (i == j) { - continue; - } - auto expected = g - f; - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - u64 val = 0; - memcpy(&val, &f, sizeof(float)); - tester.emit(IGen::mov_gpr64_u64(RAX, val)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); - memcpy(&val, &g, sizeof(float)); - tester.emit(IGen::mov_gpr64_u64(RAX, val)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + j, RAX)); - tester.emit(IGen::subss_xmm_xmm(XMM0 + j, XMM0 + i)); - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + j)); - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_FLOAT_EQ(result, expected); - } - } - } - } -} - 
-TEST(EmitterXmm32, float_to_int) { - CodeTester tester; - tester.init_code_buffer(512); - - std::vector vals = {0.f, 1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, - 7.545f, 0.1f, 0.9f, -0.1f, -0.9f}; - - for (auto g : vals) { - for (int i = 0; i < 16; i++) { - for (int j = 0; j < 16; j++) { - if (j == RSP) { - continue; - } - s32 expected = g; - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - u64 val = 0; - memcpy(&val, &g, sizeof(float)); - tester.emit(IGen::mov_gpr64_u64(RAX, val)); - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); - tester.emit(IGen::float_to_int32(j, XMM0 + i)); - tester.emit(IGen::mov_gpr64_gpr64(RAX, j)); - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } -} - -TEST(EmitterXmm32, int_to_float) { - CodeTester tester; - tester.init_code_buffer(512); - - std::vector vals = {0, 1, -1, INT32_MAX, -3457343, 7, INT32_MIN}; - - for (auto g : vals) { - for (int i = 0; i < 16; i++) { - for (int j = 0; j < 16; j++) { - if (j == RSP) { - continue; - } - float expected = g; - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - tester.emit(IGen::mov_gpr64_u64(j, g)); - tester.emit(IGen::int32_to_float(XMM0 + i, j)); - tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + i)); - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - auto result = tester.execute_ret(0, 0, 0, 0); - EXPECT_EQ(result, expected); - } - } - } -} - -TEST(EmitterSlow, xmm32_move) { - std::vector u32_constants = {0, INT32_MAX, UINT32_MAX, 17}; - - // test moving between xmms (32-bit) and gprs. 
- CodeTester tester; - tester.init_code_buffer(512); - - for (auto constant : u32_constants) { - for (int r1 = 0; r1 < 16; r1++) { - if (r1 == RSP) { - continue; - } - for (int r2 = 0; r2 < 16; r2++) { - if (r2 == RSP) { - continue; - } - for (int r3 = 0; r3 < 16; r3++) { - for (int r4 = 0; r4 < 16; r4++) { - tester.clear(); - tester.emit_push_all_xmms(); - tester.emit_push_all_gprs(true); - // move constant to gpr - tester.emit(IGen::mov_gpr64_u32(r1, constant)); - // move gpr to xmm - tester.emit(IGen::movd_xmm32_gpr32(XMM0 + r3, r1)); - // move xmm to xmm - tester.emit(IGen::mov_xmm32_xmm32(XMM0 + r4, XMM0 + r3)); - // move xmm to gpr - tester.emit(IGen::movd_gpr32_xmm32(r2, XMM0 + r4)); - // return! - tester.emit(IGen::mov_gpr64_gpr64(RAX, r2)); - tester.emit_pop_all_gprs(true); - tester.emit_pop_all_xmms(); - tester.emit_return(); - } - } - } - } - } - // todo - finish this test -} -#endif - -TEST(Emitter, LEA) { - CodeTester tester; - tester.init_code_buffer(1024); - tester.emit(IGen::lea_reg_plus_off(RDI, RSP, -3)); - tester.emit(IGen::lea_reg_plus_off(RDI, R12, -3)); - tester.emit(IGen::lea_reg_plus_off(R13, RSP, -3)); - tester.emit(IGen::lea_reg_plus_off(R13, R12, -3)); - tester.emit(IGen::lea_reg_plus_off(RDI, RSP, -300)); - tester.emit(IGen::lea_reg_plus_off(RDI, R12, -300)); - tester.emit(IGen::lea_reg_plus_off(R13, RSP, -300)); - tester.emit(IGen::lea_reg_plus_off(R13, R12, -300)); - EXPECT_EQ(tester.dump_to_hex_string(true), - "488D7C24FD498D7C24FD4C8D6C24FD4D8D6C24FD488DBC24D4FEFFFF498DBC24D4FEFFFF4C8DAC24D4FEFF" - "FF4D8DAC24D4FEFFFF"); -} - -TEST(EmitterXMM, StackLoad32) { - CodeTester tester; - tester.init_code_buffer(1024); - tester.emit(IGen::load32_xmm32_gpr64_plus_s32(XMM0 + 3, RSP, -1234)); - tester.emit(IGen::load32_xmm32_gpr64_plus_s32(XMM0 + 13, RSP, -1234)); - EXPECT_EQ(tester.dump_to_hex_string(true), "F30F109C242EFBFFFFF3440F10AC242EFBFFFF"); -} - -TEST(EmitterXMM, StackLoad8) { - CodeTester tester; - tester.init_code_buffer(1024); - 
tester.emit(IGen::load32_xmm32_gpr64_plus_s8(XMM0 + 3, RSP, -12)); - tester.emit(IGen::load32_xmm32_gpr64_plus_s8(XMM0 + 13, RSP, -12)); - EXPECT_EQ(tester.dump_to_hex_string(true), "F30F105C24F4F3440F106C24F4"); -} - -TEST(EmitterXMM, StackLoadFull32) { - CodeTester tester; - tester.init_code_buffer(1024); - tester.emit(IGen::load128_xmm128_gpr64_s32(XMM0 + 3, RSP, -1234)); - tester.emit(IGen::load128_xmm128_gpr64_s32(XMM0 + 13, RSP, -1234)); - EXPECT_EQ(tester.dump_to_hex_string(true), "660F6F9C242EFBFFFF66440F6FAC242EFBFFFF"); -} - -TEST(EmitterXMM, StackLoadFull8) { - CodeTester tester; - tester.init_code_buffer(1024); - tester.emit(IGen::load128_xmm128_gpr64_s8(XMM0 + 3, RSP, -12)); - tester.emit(IGen::load128_xmm128_gpr64_s8(XMM0 + 13, RSP, -12)); - EXPECT_EQ(tester.dump_to_hex_string(true), "660F6F5C24F466440F6F6C24F4"); -} - -TEST(EmitterXMM, StackStore32) { - CodeTester tester; - tester.init_code_buffer(1024); - tester.emit(IGen::store32_xmm32_gpr64_plus_s32(RSP, XMM0 + 3, -1234)); - tester.emit(IGen::store32_xmm32_gpr64_plus_s32(RSP, XMM0 + 13, -1234)); - EXPECT_EQ(tester.dump_to_hex_string(true), "F30F119C242EFBFFFFF3440F11AC242EFBFFFF"); -} - -TEST(EmitterXMM, StackStore8) { - CodeTester tester; - tester.init_code_buffer(1024); - tester.emit(IGen::store32_xmm32_gpr64_plus_s8(RSP, XMM0 + 3, -12)); - tester.emit(IGen::store32_xmm32_gpr64_plus_s8(RSP, XMM0 + 13, -12)); - EXPECT_EQ(tester.dump_to_hex_string(true), "F30F115C24F4F3440F116C24F4"); -} - -TEST(EmitterXMM, StackStoreFull32) { - CodeTester tester; - tester.init_code_buffer(1024); - tester.emit(IGen::store128_gpr64_xmm128_s32(RSP, XMM0 + 3, -1234)); - tester.emit(IGen::store128_gpr64_xmm128_s32(RSP, XMM0 + 13, -1234)); - EXPECT_EQ(tester.dump_to_hex_string(true), "660F7F9C242EFBFFFF66440F7FAC242EFBFFFF"); -} - -TEST(EmitterXMM, StackStoreFull8) { - CodeTester tester; - tester.init_code_buffer(1024); - tester.emit(IGen::store128_gpr64_xmm128_s8(RSP, XMM0 + 3, -12)); - 
tester.emit(IGen::store128_gpr64_xmm128_s8(RSP, XMM0 + 13, -12)); - EXPECT_EQ(tester.dump_to_hex_string(true), "660F7F5C24F466440F7F6C24F4"); -} - -TEST(EmitterXMM, SqrtS) { - CodeTester tester; - tester.init_code_buffer(1024); - tester.emit(IGen::sqrts_xmm(XMM0 + 1, XMM0 + 2)); - tester.emit(IGen::sqrts_xmm(XMM0 + 11, XMM0 + 2)); - tester.emit(IGen::sqrts_xmm(XMM0 + 1, XMM0 + 12)); - tester.emit(IGen::sqrts_xmm(XMM0 + 11, XMM0 + 12)); - EXPECT_EQ(tester.dump_to_hex_string(true), "F30F51CAF3440F51DAF3410F51CCF3450F51DC"); -} +// #include "goalc/emitter/CodeTester.h" +// #include "goalc/emitter/IGen.h" +// #include "gtest/gtest.h" + +// using namespace emitter; + +// TEST(EmitterIntegerMath, add_gpr64_imm8s) { +// CodeTester tester; +// tester.init_code_buffer(256); + +// std::vector vals = {0, 1, -1, INT32_MIN, INT32_MAX, INT64_MIN, INT64_MAX}; +// std::vector imms = {0, 1, -1, INT8_MIN, INT8_MAX}; + +// // test the ones that aren't rsp +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (auto val : vals) { +// for (auto imm : imms) { +// auto expected = val + imm; + +// tester.clear(); +// tester.emit_push_all_gprs(true); + +// // move initial value to register +// tester.emit(IGen::mov_gpr64_gpr64(i, tester.get_c_abi_arg_reg(0))); +// // do the add +// tester.emit(IGen::add_gpr64_imm8s(i, imm)); +// // move for return +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); + +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// auto result = tester.execute_ret(val, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } + +// tester.clear(); +// tester.emit(IGen::add_gpr64_imm8s(RSP, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 83 c4 0c"); +// } + +// TEST(EmitterIntegerMath, add_gpr64_imm32s) { +// CodeTester tester; +// tester.init_code_buffer(256); + +// std::vector vals = {0, 1, -1, INT32_MIN, INT32_MAX, INT64_MIN, INT64_MAX}; +// std::vector imms = {0, 1, -1, INT8_MIN, INT8_MAX, INT32_MIN, INT32_MAX}; + +// 
// test the ones that aren't rsp +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (auto val : vals) { +// for (auto imm : imms) { +// auto expected = val + imm; + +// tester.clear(); +// tester.emit_push_all_gprs(true); + +// // move initial value to register +// tester.emit(IGen::mov_gpr64_gpr64(i, tester.get_c_abi_arg_reg(0))); +// // do the add +// tester.emit(IGen::add_gpr64_imm32s(i, imm)); +// // move for return +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); + +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// auto result = tester.execute_ret(val, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } + +// tester.clear(); +// tester.emit(IGen::add_gpr64_imm32s(RSP, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 81 c4 0c 00 00 00"); +// } + +// TEST(EmitterIntegerMath, sub_gpr64_imm8s) { +// CodeTester tester; +// tester.init_code_buffer(256); + +// std::vector vals = {0, 1, -1, INT32_MIN, INT32_MAX, INT64_MIN, INT64_MAX}; +// std::vector imms = {0, 1, -1, INT8_MIN, INT8_MAX}; + +// // test the ones that aren't rsp +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (auto val : vals) { +// for (auto imm : imms) { +// auto expected = val - imm; + +// tester.clear(); +// tester.emit_push_all_gprs(true); + +// // move initial value to register +// tester.emit(IGen::mov_gpr64_gpr64(i, tester.get_c_abi_arg_reg(0))); +// // do the add +// tester.emit(IGen::sub_gpr64_imm8s(i, imm)); +// // move for return +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); + +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// auto result = tester.execute_ret(val, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } + +// tester.clear(); +// tester.emit(IGen::sub_gpr64_imm8s(RSP, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 83 ec 0c"); +// } + +// TEST(EmitterIntegerMath, sub_gpr64_imm32s) { +// CodeTester tester; +// tester.init_code_buffer(256); + +// 
std::vector vals = {0, 1, -1, INT32_MIN, INT32_MAX, INT64_MIN, INT64_MAX}; +// std::vector imms = {0, 1, -1, INT8_MIN, INT8_MAX, INT32_MIN, INT32_MAX}; + +// // test the ones that aren't rsp +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (auto val : vals) { +// for (auto imm : imms) { +// auto expected = val - imm; + +// tester.clear(); +// tester.emit_push_all_gprs(true); + +// // move initial value to register +// tester.emit(IGen::mov_gpr64_gpr64(i, tester.get_c_abi_arg_reg(0))); +// // do the add +// tester.emit(IGen::sub_gpr64_imm32s(i, imm)); +// // move for return +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); + +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// auto result = tester.execute_ret(val, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } + +// tester.clear(); +// tester.emit(IGen::sub_gpr64_imm32s(RSP, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 81 ec 0c 00 00 00"); +// } + +// TEST(EmitterIntegerMath, add_gpr64_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (auto v1 : vals) { +// for (auto v2 : vals) { +// auto expected = v1 + v2; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v1)); +// tester.emit(IGen::mov_gpr64_u64(j, v2)); +// tester.emit(IGen::add_gpr64_gpr64(i, j)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, sub_gpr64_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, 
INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (auto v1 : vals) { +// for (auto v2 : vals) { +// auto expected = v1 - v2; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v1)); +// tester.emit(IGen::mov_gpr64_u64(j, v2)); +// tester.emit(IGen::sub_gpr64_gpr64(i, j)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, mul_gpr32_gpr32) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = { +// 0, 1, -2, -20, 123123, INT32_MIN, INT32_MAX, INT32_MIN + 1, INT32_MAX - 1}; + +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (auto v1 : vals) { +// for (auto v2 : vals) { +// // this is kind of weird behavior, but it's what the PS2 CPU does, I think. +// // the lower 32-bits of the result are sign extended, even if this sign doesn't match +// // the sign of the real product. This is true for both signed and unsigned multiply. +// auto expected = ((s64(v1) * s64(v2)) << 32) >> 32; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, (s64)v1)); +// tester.emit(IGen::mov_gpr64_u64(j, (s64)v2)); +// tester.emit(IGen::imul_gpr32_gpr32(i, j)); +// tester.emit(IGen::movsx_r64_r32(RAX, i)); // weird PS2 sign extend. 
+// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, or_gpr64_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (auto v1 : vals) { +// for (auto v2 : vals) { +// auto expected = v1 | v2; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v1)); +// tester.emit(IGen::mov_gpr64_u64(j, v2)); +// tester.emit(IGen::or_gpr64_gpr64(i, j)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, and_gpr64_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (auto v1 : vals) { +// for (auto v2 : vals) { +// auto expected = v1 & v2; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v1)); +// tester.emit(IGen::mov_gpr64_u64(j, v2)); +// tester.emit(IGen::and_gpr64_gpr64(i, j)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, xor_gpr64_gpr64) { +// CodeTester tester; 
+// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (auto v1 : vals) { +// for (auto v2 : vals) { +// auto expected = v1 ^ v2; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v1)); +// tester.emit(IGen::mov_gpr64_u64(j, v2)); +// tester.emit(IGen::xor_gpr64_gpr64(i, j)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, not_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (auto v1 : vals) { +// auto expected = ~v1; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v1)); +// tester.emit(IGen::not_gpr64(i)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } + +// TEST(EmitterIntegerMath, shl_gpr64_cl) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// std::vector sas = {0, 1, 23, 53, 64}; + +// for (int i = 0; i < 16; i++) { +// if (i == RSP || i == RCX) { +// continue; +// } +// for (auto v : vals) { +// for (auto sa : sas) { +// auto expected = v << sa; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// 
tester.emit(IGen::mov_gpr64_u64(i, v)); +// tester.emit(IGen::mov_gpr64_u64(RCX, sa)); +// tester.emit(IGen::shl_gpr64_cl(i)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, shr_gpr64_cl) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, u64(-2), u64(INT32_MIN), INT32_MAX, u64(INT64_MIN), +// INT64_MAX, 117, 32, u64(-348473), 83747382}; +// std::vector sas = {0, 1, 23, 53, 64}; + +// for (int i = 0; i < 16; i++) { +// if (i == RSP || i == RCX) { +// continue; +// } +// for (auto v : vals) { +// for (auto sa : sas) { +// auto expected = v >> sa; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v)); +// tester.emit(IGen::mov_gpr64_u64(RCX, sa)); +// tester.emit(IGen::shr_gpr64_cl(i)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, sar_gpr64_cl) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// std::vector sas = {0, 1, 23, 53, 64}; + +// for (int i = 0; i < 16; i++) { +// if (i == RSP || i == RCX) { +// continue; +// } +// for (auto v : vals) { +// for (auto sa : sas) { +// auto expected = v >> sa; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v)); +// tester.emit(IGen::mov_gpr64_u64(RCX, sa)); +// tester.emit(IGen::sar_gpr64_cl(i)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, 
expected); +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, shl_gpr64_u8) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// std::vector sas = {0, 1, 23, 53, 64}; + +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (auto v : vals) { +// for (auto sa : sas) { +// auto expected = v << sa; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v)); +// tester.emit(IGen::shl_gpr64_u8(i, sa)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, shr_gpr64_u8) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, u64(-2), u64(INT32_MIN), INT32_MAX, u64(INT64_MIN), +// INT64_MAX, 117, 32, u64(-348473), 83747382}; +// std::vector sas = {0, 1, 23, 53, 64}; + +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (auto v : vals) { +// for (auto sa : sas) { +// auto expected = v >> sa; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v)); +// tester.emit(IGen::shr_gpr64_u8(i, sa)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, sar_gpr64_u8) { +// CodeTester tester; +// tester.init_code_buffer(256); +// std::vector vals = {0, 1, -2, INT32_MIN, INT32_MAX, INT64_MIN, +// INT64_MAX, 117, 32, -348473, 83747382}; +// std::vector sas = {0, 1, 23, 53, 64}; + +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (auto v : vals) { +// for (auto sa : sas) { +// 
auto expected = v >> sa; +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, v)); +// tester.emit(IGen::sar_gpr64_u8(i, sa)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } + +// TEST(EmitterIntegerMath, jumps) { +// CodeTester tester; +// tester.init_code_buffer(256); + +// std::vector reads; + +// auto x = IGen::jmp_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::je_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::jne_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::jle_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::jge_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::jl_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::jg_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::jbe_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::jae_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::jb_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// x = IGen::ja_32(); +// reads.push_back(tester.size() + x.offset_of_imm()); +// tester.emit(x); + +// for (auto off : reads) { +// EXPECT_EQ(0, tester.read(off)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "E9000000000F84000000000F85000000000F8E000000000F8D000000000F8C000000000F8F000000000F86" +// "000000000F83000000000F82000000000F8700000000"); +// } + +// TEST(EmitterIntegerMath, null) { +// auto instr = IGen::null(); +// EXPECT_EQ(0, 
instr.emit(nullptr)); +// } + +// TEST(EmitterLoadsAndStores, load_constant_64_and_move_gpr_gpr_64) { +// std::vector u64_constants = {0, UINT64_MAX, INT64_MAX, 7, 12}; + +// // test we can load a 64-bit constant into all gprs, move it to any other gpr, and return it. +// // rsp is skipping because that's the stack pointer and would prevent us from popping gprs +// after + +// CodeTester tester; +// tester.init_code_buffer(256); + +// for (auto constant : u64_constants) { +// for (int r1 = 0; r1 < 16; r1++) { +// if (r1 == RSP) { +// continue; +// } + +// for (int r2 = 0; r2 < 16; r2++) { +// if (r2 == RSP) { +// continue; +// } +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(r1, constant)); +// tester.emit(IGen::mov_gpr64_gpr64(r2, r1)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, r2)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// EXPECT_EQ(tester.execute(), constant); +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load_constant_32_unsigned) { +// std::vector u64_constants = {0, UINT32_MAX, INT32_MAX, 7, 12}; + +// // test loading 32-bit constants, with all upper 32-bits zero. +// // this uses a different opcode than 64-bit loads. +// CodeTester tester; +// tester.init_code_buffer(256); + +// for (auto constant : u64_constants) { +// for (int r1 = 0; r1 < 16; r1++) { +// if (r1 == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(r1, UINT64_MAX)); +// tester.emit(IGen::mov_gpr64_u32(r1, constant)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, r1)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// EXPECT_EQ(tester.execute(), constant); +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load_constant_32_signed) { +// std::vector s32_constants = {0, 1, INT32_MAX, INT32_MIN, 12, -1}; + +// // test loading signed 32-bit constants. for values < 0 this will sign extend. 
+// CodeTester tester; +// tester.init_code_buffer(256); + +// for (auto constant : s32_constants) { +// for (int r1 = 0; r1 < 16; r1++) { +// if (r1 == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_s32(r1, constant)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, r1)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); +// EXPECT_EQ(tester.execute(), constant); +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load8s_gpr64_goal_ptr_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f be 04 1e"); + +// tester.clear(); +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(R12, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4c 0f be 24 1e"); + +// tester.clear(); +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(R12, R15, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4e 0f be 24 3e"); + +// tester.clear(); +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(R12, R15, R14)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4f 0f be 24 3e"); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 
0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(k, i, j)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 2, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 5, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load8s_gpr64_gpr64_gpr64_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f be 44 1e fd"); + +// auto instr = IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 
0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 3 + 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 2 + 3, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 5 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load8s_gpr64_gpr64_gpr64_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f be 84 1e fd ff ff ff"); + +// auto instr = IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 
0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 3 + 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 2 + 3, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 5 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load8u_gpr64_goal_ptr_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b6 04 1e"); + +// tester.clear(); +// tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64(R12, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4c 0f b6 24 1e"); + +// tester.clear(); +// tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64(R12, R15, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4e 0f b6 24 3e"); + +// tester.clear(); +// tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64(R12, R15, R14)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4f 0f b6 24 3e"); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// 
tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load8s_gpr64_gpr64_plus_gpr64(k, i, j)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 2, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 5, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load8u_gpr64_gpr64_gpr64_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b6 44 1e fd"); + +// auto instr = IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// 
tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 3 + 3, 0, 0)), 0xfe); +// EXPECT_EQ(s64(tester.execute((u64)memory, 2 + 3, 0, 0)), 0xfd); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), 0xff); +// EXPECT_EQ(s64(tester.execute((u64)memory, 5 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load8u_gpr64_gpr64_gpr64_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b6 84 1e fd ff ff ff"); + +// auto instr = IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 
0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load8u_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u8 memory[8] = {0, 0, 0xfd, 0xfe, 0xff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 3 + 3, 0, 0)), 0xfe); +// EXPECT_EQ(s64(tester.execute((u64)memory, 2 + 3, 0, 0)), 0xfd); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), 0xff); +// EXPECT_EQ(s64(tester.execute((u64)memory, 5 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load16s_gpr64_goal_ptr_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f bf 04 1e"); + +// tester.clear(); +// tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(R12, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4c 0f bf 24 1e"); + +// tester.clear(); +// tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(R12, R15, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4e 0f bf 24 3e"); + +// tester.clear(); +// tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(R12, R15, R14)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4f 0f bf 24 3e"); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have 
offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64(k, i, j)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s16 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 6, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 10, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load16s_gpr64_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f bf 44 1e fd"); + +// auto instr = IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { 
+// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u16 memory[8] = {0, 0, 0xfffd, 0xfffe, 0xffff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 6 + 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 10 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load16s_gpr64_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f bf 84 1e fd ff ff ff"); + +// auto instr = IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 
0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load16s_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u16 memory[8] = {0, 0, 0xfffd, 0xfffe, 0xffff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 6 + 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 10 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load16u_gpr64_goal_ptr_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b7 04 1e"); + +// tester.clear(); +// tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(R12, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4c 0f b7 24 1e"); + +// tester.clear(); +// tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(R12, R15, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4e 0f b7 24 3e"); + +// tester.clear(); +// tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(R12, R15, R14)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4f 0f b7 24 3e"); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have 
offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64(k, i, j)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s16 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 6, 0, 0)), 0xfffe); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4, 0, 0)), 0xfffd); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8, 0, 0)), 0xffff); +// EXPECT_EQ(s64(tester.execute((u64)memory, 10, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load16u_gpr64_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b7 44 1e fd"); + +// auto instr = IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i 
&& k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u16 memory[8] = {0, 0, 0xfffd, 0xfffe, 0xffff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 6 + 3, 0, 0)), 0xfffe); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), 0xfffd); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), 0xffff); +// EXPECT_EQ(s64(tester.execute((u64)memory, 10 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load16u_gpr64_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b7 84 1e fd ff ff ff"); + +// auto instr = IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, 
(iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load16u_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u16 memory[8] = {0, 0, 0xfffd, 0xfffe, 0xffff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 6 + 3, 0, 0)), 0xfffe); +// EXPECT_EQ(s64(tester.execute((u64)memory, 4 + 3, 0, 0)), 0xfffd); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), 0xffff); +// EXPECT_EQ(s64(tester.execute((u64)memory, 10 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load32s_gpr64_goal_ptr_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 63 04 1e"); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64(k, i, j)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! 
+// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s32 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 12, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 16, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 20, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load32s_gpr64_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 63 44 1e fd"); + +// auto instr = IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! 
+// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u32 memory[8] = {0, 0, 0xfffffffd, 0xfffffffe, 0xffffffff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 12 + 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 20 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load32s_gpr64_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 63 84 1e fd ff ff ff"); + +// auto instr = IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load32s_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! 
+// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u32 memory[8] = {0, 0, 0xfffffffd, 0xfffffffe, 0xffffffff, 0, 0, 0}; + +// // run! +// EXPECT_EQ(s64(tester.execute((u64)memory, 12 + 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 20 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load32u_gpr64_goal_ptr_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "8b 04 1e"); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64(k, i, j)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s32 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; + +// // run! 
+// EXPECT_EQ(s64(tester.execute((u64)memory, 12, 0, 0)), 0xfffffffe); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8, 0, 0)), 0xfffffffd); +// EXPECT_EQ(s64(tester.execute((u64)memory, 16, 0, 0)), 0xffffffff); +// EXPECT_EQ(s64(tester.execute((u64)memory, 20, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load32u_gpr64_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "8b 44 1e fd"); + +// auto instr = IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s32 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; + +// // run! 
+// EXPECT_EQ(s64(tester.execute((u64)memory, 12 + 3, 0, 0)), 0xfffffffe); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), 0xfffffffd); +// EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), 0xffffffff); +// EXPECT_EQ(s64(tester.execute((u64)memory, 20 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load32u_gpr64_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "8b 84 1e fd ff ff ff"); + +// auto instr = IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load32u_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// u32 memory[8] = {0, 0, 0xfffffffd, 0xfffffffe, 0xffffffff, 0, 0, 0}; + +// // run! 
+// EXPECT_EQ(s64(tester.execute((u64)memory, 12 + 3, 0, 0)), 0xfffffffe); +// EXPECT_EQ(s64(tester.execute((u64)memory, 8 + 3, 0, 0)), 0xfffffffd); +// EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), 0xffffffff); +// EXPECT_EQ(s64(tester.execute((u64)memory, 20 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load64_gpr64_goal_ptr_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64(RAX, RBX, RSI)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 8b 04 1e"); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64(k, i, j)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s64 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; + +// // run! 
+// EXPECT_EQ(s64(tester.execute((u64)memory, 24, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 16, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 32, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 40, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load64_gpr64_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 8b 44 1e fd"); + +// auto instr = IGen::load64_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64_plus_s8(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s64 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; + +// // run! 
+// EXPECT_EQ(s64(tester.execute((u64)memory, 24 + 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 32 + 3, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 40 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load64_gpr64_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3)); + +// EXPECT_EQ(tester.dump_to_hex_string(), "48 8b 84 1e fd ff ff ff"); + +// auto instr = IGen::load64_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // fill k with junk +// if (k != i && k != j) { +// tester.emit(IGen::mov_gpr64_u64(k, (iter & 1) ? 0 : UINT64_MAX)); +// } + +// // load into k +// tester.emit(IGen::load64_gpr64_gpr64_plus_gpr64_plus_s32(k, i, j, -3)); + +// // move k to return register +// tester.emit(IGen::mov_gpr64_gpr64(RAX, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s64 memory[8] = {0, 0, -3, -2, -1, 0, 0, 0}; + +// // run! 
+// EXPECT_EQ(s64(tester.execute((u64)memory, 24 + 3, 0, 0)), -2); +// EXPECT_EQ(s64(tester.execute((u64)memory, 16 + 3, 0, 0)), -3); +// EXPECT_EQ(s64(tester.execute((u64)memory, 32 + 3, 0, 0)), -1); +// EXPECT_EQ(s64(tester.execute((u64)memory, 40 + 3, 0, 0)), 0); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store8_gpr64_gpr64_plus_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64(RAX, RCX, RDX)); +// EXPECT_EQ(tester.dump_to_hex_string(), "88 14 01"); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store! +// tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64(i, j, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s8 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 3, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], 7); +// EXPECT_EQ(memory[4], 1); +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store8_gpr64_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "88 54 01 0c"); + +// auto instr = IGen::store8_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store +// tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64_plus_s8(i, j, k, -3)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s8 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 6, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], 7); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store8_gpr64_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "88 94 01 0c 00 00 00"); + +// auto instr = IGen::store8_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -3); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store +// tester.emit(IGen::store8_gpr64_gpr64_plus_gpr64_plus_s32(i, j, k, -3)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s8 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 6, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], 7); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store16_gpr64_gpr64_plus_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64(RCX, RAX, R8)); +// EXPECT_EQ(tester.dump_to_hex_string(), "66 44 89 04 08"); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store! +// tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64(i, j, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s16 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 6, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], s16(0xff07)); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store16_gpr64_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, R8, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "66 44 89 44 01 0c"); + +// auto instr = IGen::store16_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store +// tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64_plus_s8(i, j, k, -3)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s16 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 6 + 3, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], s16(0xff07)); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store16_gpr64_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, R8, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "66 44 89 84 01 0c 00 00 00"); + +// auto instr = IGen::store16_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store +// tester.emit(IGen::store16_gpr64_gpr64_plus_gpr64_plus_s32(i, j, k, -3)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s16 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 6 + 3, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], s16(0xff07)); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store32_gpr64_gpr64_plus_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64(RCX, RAX, R8)); +// EXPECT_EQ(tester.dump_to_hex_string(), "44 89 04 08"); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store! +// tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64(i, j, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s32 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 12, 0xffffffff12341234, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], 0x12341234); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store32_gpr64_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, R8, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "44 89 44 01 0c"); + +// auto instr = IGen::store32_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store +// tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64_plus_s8(i, j, k, -3)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s32 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 12 + 3, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], s32(0xffffff07)); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store32_gpr64_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, R8, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "44 89 84 01 0c 00 00 00"); + +// auto instr = IGen::store32_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store +// tester.emit(IGen::store32_gpr64_gpr64_plus_gpr64_plus_s32(i, j, k, -3)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s32 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 12 + 3, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], s32(0xffffff07)); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store64_gpr64_gpr64_plus_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64(RCX, RAX, R8)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4c 89 04 08"); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store! +// tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64(i, j, k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s64 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 24, 0xffffffff12341234, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], 0xffffffff12341234); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store64_gpr64_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, R8, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4c 89 44 01 0c"); + +// auto instr = IGen::store64_gpr64_gpr64_plus_gpr64_plus_s8(RAX, RCX, RDX, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store +// tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64_plus_s8(i, j, k, -3)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s64 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 24 + 3, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], s64(0xffffffffffffff07)); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, store64_gpr64_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// tester.clear(); +// tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, R8, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "4c 89 84 01 0c 00 00 00"); + +// auto instr = IGen::store64_gpr64_gpr64_plus_gpr64_plus_s32(RAX, RCX, RDX, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s8*)(buff + instr.offset_of_disp()), -3); + +// [[maybe_unused]] int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// if (k == RSP || k == j || k == i) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 +// tester.emit(IGen::pop_gpr64(k)); // k will have the value to store. + +// // store +// tester.emit(IGen::store64_gpr64_gpr64_plus_gpr64_plus_s32(i, j, k, -3)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// // prepare the memory: +// s64 memory[8] = {0, 0, 3, -2, 1, 0, 0, 0}; + +// // run! 
+// tester.execute((u64)memory, 24 + 3, 0xffffffffffffff07, 0); +// EXPECT_EQ(memory[2], 3); +// EXPECT_EQ(memory[3], s64(0xffffffffffffff07)); +// EXPECT_EQ(memory[4], 1); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterLoadsAndStores, load64_rip) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::load64_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 8b 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::load64_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "488B050C000000488B0D0C000000488B150C000000488B1D0C000000488B250C000000488B2D0C00000048" +// "8B350C000000488B3D0C0000004C8B050C0000004C8B0D0C0000004C8B150C0000004C8B1D0C0000004C8B" +// "250C0000004C8B2D0C0000004C8B350C0000004C8B3D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, load32s_rip) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::load32s_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 63 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::load32s_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "4863050C00000048630D0C0000004863150C00000048631D0C0000004863250C00000048632D0C00000048" +// "63350C00000048633D0C0000004C63050C0000004C630D0C0000004C63150C0000004C631D0C0000004C63" +// "250C0000004C632D0C0000004C63350C0000004C633D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, load32u_rip) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::load32u_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "8b 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::load32u_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "8B050C0000008B0D0C0000008B150C0000008B1D0C0000008B250C0000008B2D0C0000008B350C0000008B" +// 
"3D0C000000448B050C000000448B0D0C000000448B150C000000448B1D0C000000448B250C000000448B2D" +// "0C000000448B350C000000448B3D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, load16u_rip) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::load16u_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b7 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::load16u_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "480FB7050C000000480FB70D0C000000480FB7150C000000480FB71D0C000000480FB7250C000000480FB7" +// "2D0C000000480FB7350C000000480FB73D0C0000004C0FB7050C0000004C0FB70D0C0000004C0FB7150C00" +// "00004C0FB71D0C0000004C0FB7250C0000004C0FB72D0C0000004C0FB7350C0000004C0FB73D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, load16s_rip) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::load16s_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f bf 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::load16s_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "480FBF050C000000480FBF0D0C000000480FBF150C000000480FBF1D0C000000480FBF250C000000480FBF" +// "2D0C000000480FBF350C000000480FBF3D0C0000004C0FBF050C0000004C0FBF0D0C0000004C0FBF150C00" +// "00004C0FBF1D0C0000004C0FBF250C0000004C0FBF2D0C0000004C0FBF350C0000004C0FBF3D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, load8s_rip) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::load8s_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f be 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::load8s_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "480FBE050C000000480FBE0D0C000000480FBE150C000000480FBE1D0C000000480FBE250C000000480FBE" +// 
"2D0C000000480FBE350C000000480FBE3D0C0000004C0FBE050C0000004C0FBE0D0C0000004C0FBE150C00" +// "00004C0FBE1D0C0000004C0FBE250C0000004C0FBE2D0C0000004C0FBE350C0000004C0FBE3D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, load8u_rip) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::load8u_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 0f b6 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::load8u_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "480FB6050C000000480FB60D0C000000480FB6150C000000480FB61D0C000000480FB6250C000000480FB6" +// "2D0C000000480FB6350C000000480FB63D0C0000004C0FB6050C0000004C0FB60D0C0000004C0FB6150C00" +// "00004C0FB61D0C0000004C0FB6250C0000004C0FB62D0C0000004C0FB6350C0000004C0FB63D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, store64_rip_s32) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::store64_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "48 89 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::store64_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "4889050C00000048890D0C0000004889150C00000048891D0C0000004889250C00000048892D0C00000048" +// "89350C00000048893D0C0000004C89050C0000004C890D0C0000004C89150C0000004C891D0C0000004C89" +// "250C0000004C892D0C0000004C89350C0000004C893D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, store32_rip_s32) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::store32_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "89 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::store32_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "89050C000000890D0C00000089150C000000891D0C00000089250C000000892D0C00000089350C00000089" +// 
"3D0C0000004489050C00000044890D0C0000004489150C00000044891D0C0000004489250C00000044892D" +// "0C0000004489350C00000044893D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, store16_rip_s32) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::store16_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "66 89 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::store16_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "6689050C00000066890D0C0000006689150C00000066891D0C0000006689250C00000066892D0C00000066" +// "89350C00000066893D0C000000664489050C0000006644890D0C000000664489150C0000006644891D0C00" +// "0000664489250C0000006644892D0C000000664489350C0000006644893D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, store8_rip_s32) { +// CodeTester tester; +// tester.init_code_buffer(256); +// tester.emit(IGen::store8_rip_s32(RAX, 12)); +// EXPECT_EQ(tester.dump_to_hex_string(), "88 05 0c 00 00 00"); + +// tester.clear(); +// for (int i = 0; i < 16; i++) { +// tester.emit(IGen::store8_rip_s32(i, 12)); +// } + +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "88050C000000880D0C00000088150C000000881D0C0000004088250C00000040882D0C0000004088350C00" +// "000040883D0C0000004488050C00000044880D0C0000004488150C00000044881D0C0000004488250C0000" +// "0044882D0C0000004488350C00000044883D0C000000"); +// } + +// TEST(EmitterLoadsAndStores, static_addr) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// tester.clear(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(i, 12345)); // load test reg with junk +// int start_of_lea = tester.size(); +// auto lea_instr = IGen::static_addr(i, INT32_MAX); +// tester.emit(lea_instr); +// // patch instruction to lea the start of this code + 1. 
+// tester.write(-start_of_lea - lea_instr.length() + 1, +// start_of_lea + lea_instr.offset_of_disp()); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_return(); + +// auto result = tester.execute(); +// EXPECT_EQ(result, (u64)(tester.data()) + 1); +// } +// } + +// #ifdef __linux__ +// TEST(EmitterXmm32, load32_xmm32_gpr64_plus_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); +// tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64(XMM3, RAX, RBX)); +// EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 10 1c 03"); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (int k = 0; k < 16; k++) { +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // fill k with junk +// tester.emit(IGen::mov_gpr64_u64(i, (iter & 1) ? 0 : UINT64_MAX)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // load into k +// tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64(XMM0 + k, i, j)); +// // move to return +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); + +// // prepare the memory: +// float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; + +// // run! 
+// EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 3 * sizeof(float), 0, 0), 3.45f); +// EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 2 * sizeof(float), 0, 0), 1.23f); +// EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 4 * sizeof(float), 0, 0), 5.67f); +// EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 5 * sizeof(float), 0, 0), 0); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterXmm32, load32_xmm32_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); +// tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64_plus_s8(XMM3, RAX, RBX, -1)); +// EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 10 5c 03 ff"); + +// auto instr = IGen::load32_xmm32_gpr64_plus_gpr64_plus_s8(XMM3, RBX, RSI, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (int k = 0; k < 16; k++) { +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // fill k with junk +// tester.emit(IGen::mov_gpr64_u64(i, (iter & 1) ? 0 : UINT64_MAX)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// // load into k +// tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64_plus_s8(XMM0 + k, i, j, -3)); +// // move to return +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); + +// // prepare the memory: +// float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; + +// // run! 
+// EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 3 * sizeof(float) + 3, 0, +// 0), 3.45f); EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 2 * sizeof(float) + 3, +// 0, 0), 1.23f); EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 4 * sizeof(float) + +// 3, 0, 0), 5.67f); EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 5 * +// sizeof(float) + 3, 0, 0), 0); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterXmm32, load32_xmm32_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); +// tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64_plus_s32(XMM3, RAX, RBX, -1)); +// EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 10 9c 03 ff ff ff ff"); + +// auto instr = IGen::load32_xmm32_gpr64_plus_gpr64_plus_s32(XMM3, RBX, RSI, -1234); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -1234); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (int k = 0; k < 16; k++) { +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); + +// // fill k with junk +// tester.emit(IGen::mov_gpr64_u64(i, (iter & 1) ? 0 : UINT64_MAX)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); + +// // pop args into appropriate register +// tester.emit(IGen::pop_gpr64(i)); // i will have offset 0 +// tester.emit(IGen::pop_gpr64(j)); // j will have offset 1 + +// s64 offset = (iter & 1) ? INT32_MAX : INT32_MIN; + +// // load into k +// tester.emit(IGen::load32_xmm32_gpr64_plus_gpr64_plus_s32(XMM0 + k, i, j, offset)); +// // move to return +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); + +// // return! 
+// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); + +// // prepare the memory: +// float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; + +// // run! +// EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 3 * sizeof(float) - offset, 0, 0), +// 3.45f); +// EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 2 * sizeof(float) - offset, 0, 0), +// 1.23f); +// EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 4 * sizeof(float) - offset, 0, 0), +// 5.67f); +// EXPECT_FLOAT_EQ(tester.execute_ret((u64)memory, 5 * sizeof(float) - offset, 0, 0), +// 0); +// iter++; +// } +// } +// } +// } + +// namespace { +// template +// float as_float(T x) { +// float result; +// memcpy(&result, &x, sizeof(float)); +// return result; +// } + +// u32 as_u32(float x) { +// u32 result; +// memcpy(&result, &x, 4); +// return result; +// } +// } // namespace + +// TEST(EmitterXmm32, store32_xmm32_gpr64_plus_gpr64) { +// CodeTester tester; +// tester.init_code_buffer(512); +// tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64(RAX, RBX, XMM7)); +// EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 11 3c 03"); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } + +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } + +// for (int k = 0; k < 16; k++) { +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// // push args to the stack + +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); // addr2 +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); // addr1 +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); // value + +// // pop value into addr1 GPR +// tester.emit(IGen::pop_gpr64(i)); +// // move to XMM +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); + +// // pop addrs +// tester.emit(IGen::pop_gpr64(i)); +// tester.emit(IGen::pop_gpr64(j)); + +// // store +// tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64(i, 
j, XMM0 + k)); + +// // return! +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); + +// // prepare the memory: +// float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; + +// // run! +// tester.execute((u64)memory, 12, as_u32(1.234f), 0); +// EXPECT_FLOAT_EQ(memory[2], 1.23f); +// EXPECT_FLOAT_EQ(memory[3], 1.234f); +// EXPECT_FLOAT_EQ(memory[4], 5.67f); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterXmm32, store32_xmm32_gpr64_plus_gpr64_plus_s8) { +// CodeTester tester; +// tester.init_code_buffer(512); +// tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64_plus_s8(RAX, RBX, XMM3, -1)); +// EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 11 5c 03 ff"); + +// auto instr = IGen::store32_xmm32_gpr64_plus_gpr64_plus_s8(RBX, RSI, XMM3, -3); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(s8(buff[instr.offset_of_disp()]), -3); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (int k = 0; k < 16; k++) { +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); // addr2 +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); // addr1 +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); // value + +// // pop value into addr1 GPR +// tester.emit(IGen::pop_gpr64(i)); +// // move to XMM +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); + +// // pop addrs +// tester.emit(IGen::pop_gpr64(i)); +// tester.emit(IGen::pop_gpr64(j)); + +// s64 offset = (iter & 1) ? INT8_MAX : INT8_MIN; + +// // load into k +// tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64_plus_s8(i, j, XMM0 + k, offset)); + +// // move to return +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); + +// // return! 
+// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); + +// // prepare the memory: +// float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; + +// // run! +// tester.execute((u64)memory, 12 - offset, as_u32(1.234f), 0); +// EXPECT_FLOAT_EQ(memory[2], 1.23f); +// EXPECT_FLOAT_EQ(memory[3], 1.234f); +// EXPECT_FLOAT_EQ(memory[4], 5.67f); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterXmm32, store32_xmm32_gpr64_plus_gpr64_plus_s32) { +// CodeTester tester; +// tester.init_code_buffer(512); +// tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64_plus_s32(RAX, RBX, XMM3, -1)); +// EXPECT_EQ(tester.dump_to_hex_string(), "f3 0f 11 9c 03 ff ff ff ff"); + +// auto instr = IGen::store32_xmm32_gpr64_plus_gpr64_plus_s32(RBX, RSI, XMM3, -1234); +// u8 buff[256]; +// instr.emit(buff); +// EXPECT_EQ(*(s32*)(buff + instr.offset_of_disp()), -1234); + +// int iter = 0; +// for (int i = 0; i < 16; i++) { +// if (i == RSP) { +// continue; +// } +// for (int j = 0; j < 16; j++) { +// if (j == RSP || j == i) { +// continue; +// } +// for (int k = 0; k < 16; k++) { +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// // push args to the stack +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(1))); // addr2 +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(0))); // addr1 +// tester.emit(IGen::push_gpr64(tester.get_c_abi_arg_reg(2))); // value + +// // pop value into addr1 GPR +// tester.emit(IGen::pop_gpr64(i)); +// // move to XMM +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + k, i)); + +// // pop addrs +// tester.emit(IGen::pop_gpr64(i)); +// tester.emit(IGen::pop_gpr64(j)); + +// s64 offset = (iter & 1) ? INT32_MAX : INT32_MIN; + +// // load into k +// tester.emit(IGen::store32_xmm32_gpr64_plus_gpr64_plus_s32(i, j, XMM0 + k, offset)); + +// // move to return +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + k)); + +// // return! 
+// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); + +// // prepare the memory: +// float memory[8] = {0, 0, 1.23f, 3.45f, 5.67f, 0, 0, 0}; + +// // run! +// tester.execute((u64)memory, 12 - offset, as_u32(1.234f), 0); +// EXPECT_FLOAT_EQ(memory[2], 1.23f); +// EXPECT_FLOAT_EQ(memory[3], 1.234f); +// EXPECT_FLOAT_EQ(memory[4], 5.67f); + +// iter++; +// } +// } +// } +// } + +// TEST(EmitterXmm32, static_load_xmm32) { +// CodeTester tester; +// tester.init_code_buffer(512); +// for (int i = 0; i < 16; i++) { +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); + +// auto loc_of_load = tester.size(); +// auto load_instr = IGen::static_load_xmm32(XMM0 + i, INT32_MAX); + +// tester.emit(load_instr); +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); +// auto loc_of_float = tester.emit_data(float(1.2345f)); + +// // patch offset +// tester.write(loc_of_float - loc_of_load - load_instr.length(), +// loc_of_load + load_instr.offset_of_disp()); + +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_FLOAT_EQ(result, 1.2345f); +// } +// } + +// TEST(EmitterXmm32, static_store_xmm32) { +// CodeTester tester; +// tester.init_code_buffer(512); +// for (int i = 0; i < 16; i++) { +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, tester.get_c_abi_arg_reg(0))); + +// auto loc_of_store = tester.size(); +// auto store_instr = IGen::static_store_xmm32(XMM0 + i, INT32_MAX); + +// tester.emit(store_instr); +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); +// auto loc_of_float = tester.emit_data(float(1.2345f)); + +// tester.write(loc_of_float - loc_of_store - store_instr.length(), +// loc_of_store + store_instr.offset_of_disp()); +// tester.execute(as_u32(-44.567f), 0, 0, 
0); +// EXPECT_FLOAT_EQ(-44.567f, tester.read(loc_of_float)); +// } +// } + +// TEST(EmitterXmm32, ucomiss) { +// CodeTester tester; +// tester.init_code_buffer(512); +// tester.emit(IGen::cmp_flt_flt(XMM13, XMM14)); +// EXPECT_EQ("45 0f 2e ee", tester.dump_to_hex_string()); +// } + +// TEST(EmitterXmm32, mul) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// std::vector vals = {0.f, 1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, 7.545f}; + +// for (auto f : vals) { +// for (auto g : vals) { +// for (int i = 0; i < 16; i++) { +// for (int j = 0; j < 16; j++) { +// if (i == j) { +// continue; +// } +// auto expected = f * g; +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// u64 val = 0; +// memcpy(&val, &f, sizeof(float)); +// tester.emit(IGen::mov_gpr64_u64(RAX, val)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); +// memcpy(&val, &g, sizeof(float)); +// tester.emit(IGen::mov_gpr64_u64(RAX, val)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + j, RAX)); +// tester.emit(IGen::mulss_xmm_xmm(XMM0 + j, XMM0 + i)); +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + j)); +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_FLOAT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterXmm32, div) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// std::vector vals = {1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, 7.545f}; + +// for (auto f : vals) { +// for (auto g : vals) { +// for (int i = 0; i < 16; i++) { +// for (int j = 0; j < 16; j++) { +// if (i == j) { +// continue; +// } +// auto expected = g / f; +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// u64 val = 0; +// memcpy(&val, &f, sizeof(float)); +// tester.emit(IGen::mov_gpr64_u64(RAX, val)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); +// memcpy(&val, &g, sizeof(float)); 
+// tester.emit(IGen::mov_gpr64_u64(RAX, val)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + j, RAX)); +// tester.emit(IGen::divss_xmm_xmm(XMM0 + j, XMM0 + i)); +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + j)); +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_FLOAT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterXmm32, add) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// std::vector vals = {0.f, 1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, 7.545f}; +// for (auto f : vals) { +// for (auto g : vals) { +// for (int i = 0; i < 16; i++) { +// for (int j = 0; j < 16; j++) { +// if (i == j) { +// continue; +// } +// auto expected = g + f; +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// u64 val = 0; +// memcpy(&val, &f, sizeof(float)); +// tester.emit(IGen::mov_gpr64_u64(RAX, val)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); +// memcpy(&val, &g, sizeof(float)); +// tester.emit(IGen::mov_gpr64_u64(RAX, val)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + j, RAX)); +// tester.emit(IGen::addss_xmm_xmm(XMM0 + j, XMM0 + i)); +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + j)); +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_FLOAT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterXmm32, sub) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// std::vector vals = {0.f, 1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, 7.545f}; + +// for (auto f : vals) { +// for (auto g : vals) { +// for (int i = 0; i < 16; i++) { +// for (int j = 0; j < 16; j++) { +// if (i == j) { +// continue; +// } +// auto expected = g - f; +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// u64 val = 0; +// memcpy(&val, &f, 
sizeof(float)); +// tester.emit(IGen::mov_gpr64_u64(RAX, val)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); +// memcpy(&val, &g, sizeof(float)); +// tester.emit(IGen::mov_gpr64_u64(RAX, val)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + j, RAX)); +// tester.emit(IGen::subss_xmm_xmm(XMM0 + j, XMM0 + i)); +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + j)); +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_FLOAT_EQ(result, expected); +// } +// } +// } +// } +// } + +// TEST(EmitterXmm32, float_to_int) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// std::vector vals = {0.f, 1.f, 0.2f, -1.f, 1235423.2f, -3457343.3f, +// 7.545f, 0.1f, 0.9f, -0.1f, -0.9f}; + +// for (auto g : vals) { +// for (int i = 0; i < 16; i++) { +// for (int j = 0; j < 16; j++) { +// if (j == RSP) { +// continue; +// } +// s32 expected = g; +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// u64 val = 0; +// memcpy(&val, &g, sizeof(float)); +// tester.emit(IGen::mov_gpr64_u64(RAX, val)); +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + i, RAX)); +// tester.emit(IGen::float_to_int32(j, XMM0 + i)); +// tester.emit(IGen::mov_gpr64_gpr64(RAX, j)); +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } + +// TEST(EmitterXmm32, int_to_float) { +// CodeTester tester; +// tester.init_code_buffer(512); + +// std::vector vals = {0, 1, -1, INT32_MAX, -3457343, 7, INT32_MIN}; + +// for (auto g : vals) { +// for (int i = 0; i < 16; i++) { +// for (int j = 0; j < 16; j++) { +// if (j == RSP) { +// continue; +// } +// float expected = g; +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// tester.emit(IGen::mov_gpr64_u64(j, g)); +// 
tester.emit(IGen::int32_to_float(XMM0 + i, j)); +// tester.emit(IGen::movd_gpr32_xmm32(RAX, XMM0 + i)); +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); +// auto result = tester.execute_ret(0, 0, 0, 0); +// EXPECT_EQ(result, expected); +// } +// } +// } +// } + +// TEST(EmitterSlow, xmm32_move) { +// std::vector u32_constants = {0, INT32_MAX, UINT32_MAX, 17}; + +// // test moving between xmms (32-bit) and gprs. +// CodeTester tester; +// tester.init_code_buffer(512); + +// for (auto constant : u32_constants) { +// for (int r1 = 0; r1 < 16; r1++) { +// if (r1 == RSP) { +// continue; +// } +// for (int r2 = 0; r2 < 16; r2++) { +// if (r2 == RSP) { +// continue; +// } +// for (int r3 = 0; r3 < 16; r3++) { +// for (int r4 = 0; r4 < 16; r4++) { +// tester.clear(); +// tester.emit_push_all_xmms(); +// tester.emit_push_all_gprs(true); +// // move constant to gpr +// tester.emit(IGen::mov_gpr64_u32(r1, constant)); +// // move gpr to xmm +// tester.emit(IGen::movd_xmm32_gpr32(XMM0 + r3, r1)); +// // move xmm to xmm +// tester.emit(IGen::mov_xmm32_xmm32(XMM0 + r4, XMM0 + r3)); +// // move xmm to gpr +// tester.emit(IGen::movd_gpr32_xmm32(r2, XMM0 + r4)); +// // return! 
+// tester.emit(IGen::mov_gpr64_gpr64(RAX, r2)); +// tester.emit_pop_all_gprs(true); +// tester.emit_pop_all_xmms(); +// tester.emit_return(); +// } +// } +// } +// } +// } +// // todo - finish this test +// } +// #endif + +// TEST(Emitter, LEA) { +// CodeTester tester; +// tester.init_code_buffer(1024); +// tester.emit(IGen::lea_reg_plus_off(RDI, RSP, -3)); +// tester.emit(IGen::lea_reg_plus_off(RDI, R12, -3)); +// tester.emit(IGen::lea_reg_plus_off(R13, RSP, -3)); +// tester.emit(IGen::lea_reg_plus_off(R13, R12, -3)); +// tester.emit(IGen::lea_reg_plus_off(RDI, RSP, -300)); +// tester.emit(IGen::lea_reg_plus_off(RDI, R12, -300)); +// tester.emit(IGen::lea_reg_plus_off(R13, RSP, -300)); +// tester.emit(IGen::lea_reg_plus_off(R13, R12, -300)); +// EXPECT_EQ(tester.dump_to_hex_string(true), +// "488D7C24FD498D7C24FD4C8D6C24FD4D8D6C24FD488DBC24D4FEFFFF498DBC24D4FEFFFF4C8DAC24D4FEFF" +// "FF4D8DAC24D4FEFFFF"); +// } + +// TEST(EmitterXMM, StackLoad32) { +// CodeTester tester; +// tester.init_code_buffer(1024); +// tester.emit(IGen::load32_xmm32_gpr64_plus_s32(XMM0 + 3, RSP, -1234)); +// tester.emit(IGen::load32_xmm32_gpr64_plus_s32(XMM0 + 13, RSP, -1234)); +// EXPECT_EQ(tester.dump_to_hex_string(true), "F30F109C242EFBFFFFF3440F10AC242EFBFFFF"); +// } + +// TEST(EmitterXMM, StackLoad8) { +// CodeTester tester; +// tester.init_code_buffer(1024); +// tester.emit(IGen::load32_xmm32_gpr64_plus_s8(XMM0 + 3, RSP, -12)); +// tester.emit(IGen::load32_xmm32_gpr64_plus_s8(XMM0 + 13, RSP, -12)); +// EXPECT_EQ(tester.dump_to_hex_string(true), "F30F105C24F4F3440F106C24F4"); +// } + +// TEST(EmitterXMM, StackLoadFull32) { +// CodeTester tester; +// tester.init_code_buffer(1024); +// tester.emit(IGen::load128_simd128_gpr64_s32(XMM0 + 3, RSP, -1234)); +// tester.emit(IGen::load128_simd128_gpr64_s32(XMM0 + 13, RSP, -1234)); +// EXPECT_EQ(tester.dump_to_hex_string(true), "660F6F9C242EFBFFFF66440F6FAC242EFBFFFF"); +// } + +// TEST(EmitterXMM, StackLoadFull8) { +// CodeTester tester; +// 
tester.init_code_buffer(1024); +// tester.emit(IGen::load128_simd128_gpr64_s8(XMM0 + 3, RSP, -12)); +// tester.emit(IGen::load128_simd128_gpr64_s8(XMM0 + 13, RSP, -12)); +// EXPECT_EQ(tester.dump_to_hex_string(true), "660F6F5C24F466440F6F6C24F4"); +// } + +// TEST(EmitterXMM, StackStore32) { +// CodeTester tester; +// tester.init_code_buffer(1024); +// tester.emit(IGen::store32_xmm32_gpr64_plus_s32(RSP, XMM0 + 3, -1234)); +// tester.emit(IGen::store32_xmm32_gpr64_plus_s32(RSP, XMM0 + 13, -1234)); +// EXPECT_EQ(tester.dump_to_hex_string(true), "F30F119C242EFBFFFFF3440F11AC242EFBFFFF"); +// } + +// TEST(EmitterXMM, StackStore8) { +// CodeTester tester; +// tester.init_code_buffer(1024); +// tester.emit(IGen::store32_xmm32_gpr64_plus_s8(RSP, XMM0 + 3, -12)); +// tester.emit(IGen::store32_xmm32_gpr64_plus_s8(RSP, XMM0 + 13, -12)); +// EXPECT_EQ(tester.dump_to_hex_string(true), "F30F115C24F4F3440F116C24F4"); +// } + +// TEST(EmitterXMM, StackStoreFull32) { +// CodeTester tester; +// tester.init_code_buffer(1024); +// tester.emit(IGen::store128_gpr64_simd128_s32(RSP, XMM0 + 3, -1234)); +// tester.emit(IGen::store128_gpr64_simd128_s32(RSP, XMM0 + 13, -1234)); +// EXPECT_EQ(tester.dump_to_hex_string(true), "660F7F9C242EFBFFFF66440F7FAC242EFBFFFF"); +// } + +// TEST(EmitterXMM, StackStoreFull8) { +// CodeTester tester; +// tester.init_code_buffer(1024); +// tester.emit(IGen::store128_gpr64_simd128_s8(RSP, XMM0 + 3, -12)); +// tester.emit(IGen::store128_gpr64_simd128_s8(RSP, XMM0 + 13, -12)); +// EXPECT_EQ(tester.dump_to_hex_string(true), "660F7F5C24F466440F7F6C24F4"); +// } + +// TEST(EmitterXMM, SqrtS) { +// CodeTester tester; +// tester.init_code_buffer(1024); +// tester.emit(IGen::sqrts_xmm(XMM0 + 1, XMM0 + 2)); +// tester.emit(IGen::sqrts_xmm(XMM0 + 11, XMM0 + 2)); +// tester.emit(IGen::sqrts_xmm(XMM0 + 1, XMM0 + 12)); +// tester.emit(IGen::sqrts_xmm(XMM0 + 11, XMM0 + 12)); +// EXPECT_EQ(tester.dump_to_hex_string(true), "F30F51CAF3440F51DAF3410F51CCF3450F51DC"); +// } 
diff --git a/test/test_emitter_avx.cpp b/test/test_emitter_avx.cpp index 393ce9ed05..4558dc85e8 100644 --- a/test/test_emitter_avx.cpp +++ b/test/test_emitter_avx.cpp @@ -7,14 +7,14 @@ using namespace emitter; TEST(EmitterAVX, VF_NOP) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::nop_vf()); + tester.emit(IGen::nop_vf(tester.generator())); EXPECT_EQ(tester.dump_to_hex_string(true), "D9D0"); } TEST(EmitterAVX, WAIT_VF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::wait_vf()); + tester.emit(IGen::wait_vf(tester.generator())); EXPECT_EQ(tester.dump_to_hex_string(true), "9B"); } @@ -23,7 +23,7 @@ TEST(EmitterAVX, MOV_VF) { tester.init_code_buffer(10000); for (int i = 0; i < 16; i++) { for (int j = 0; j < 16; j++) { - tester.emit(IGen::mov_vf_vf(XMM0 + i, XMM0 + j)); + tester.emit(IGen::mov_vf_vf(tester.generator(), XMM0 + i, XMM0 + j)); } } @@ -58,10 +58,10 @@ TEST(EmitterAVX, MOV_VF) { TEST(EmitterAVX, LoadVF_Reg) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::loadvf_gpr64_plus_gpr64(XMM0 + 3, RSI, R15)); - tester.emit(IGen::loadvf_gpr64_plus_gpr64(XMM0 + 3, R12, R15)); - tester.emit(IGen::loadvf_gpr64_plus_gpr64(XMM0 + 13, RSI, R15)); - tester.emit(IGen::loadvf_gpr64_plus_gpr64(XMM0 + 13, R12, R15)); + tester.emit(IGen::loadvf_gpr64_plus_gpr64(tester.generator(), XMM0 + 3, RSI, R15)); + tester.emit(IGen::loadvf_gpr64_plus_gpr64(tester.generator(), XMM0 + 3, R12, R15)); + tester.emit(IGen::loadvf_gpr64_plus_gpr64(tester.generator(), XMM0 + 13, RSI, R15)); + tester.emit(IGen::loadvf_gpr64_plus_gpr64(tester.generator(), XMM0 + 13, R12, R15)); EXPECT_EQ(tester.dump_to_hex_string(true), "C4C178281C37C48178281C3CC44178282C37C40178282C3C"); } @@ -69,10 +69,10 @@ TEST(EmitterAVX, LoadVF_Reg) { TEST(EmitterAVX, LoadVF_RegS8) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(XMM0 + 3, RSI, R15, -3)); - 
tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(XMM0 + 3, R12, R15, -3)); - tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(XMM0 + 13, RSI, R15, -3)); - tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(XMM0 + 13, R12, R15, -3)); + tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(tester.generator(), XMM0 + 3, RSI, R15, -3)); + tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(tester.generator(), XMM0 + 3, R12, R15, -3)); + tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(tester.generator(), XMM0 + 13, RSI, R15, -3)); + tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(tester.generator(), XMM0 + 13, R12, R15, -3)); EXPECT_EQ(tester.dump_to_hex_string(true), "C4C178285C37FDC48178285C3CFDC44178286C37FDC40178286C3CFD"); @@ -81,10 +81,14 @@ TEST(EmitterAVX, LoadVF_RegS8) { TEST(EmitterAVX, LoadVF_RegS32) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s32(XMM0 + 3, RSI, R15, -0x100)); - tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s32(XMM0 + 3, R12, R15, -0x100)); - tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s32(XMM0 + 13, RSI, R15, -0x100)); - tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s32(XMM0 + 13, R12, R15, -0x100)); + tester.emit( + IGen::loadvf_gpr64_plus_gpr64_plus_s32(tester.generator(), XMM0 + 3, RSI, R15, -0x100)); + tester.emit( + IGen::loadvf_gpr64_plus_gpr64_plus_s32(tester.generator(), XMM0 + 3, R12, R15, -0x100)); + tester.emit( + IGen::loadvf_gpr64_plus_gpr64_plus_s32(tester.generator(), XMM0 + 13, RSI, R15, -0x100)); + tester.emit( + IGen::loadvf_gpr64_plus_gpr64_plus_s32(tester.generator(), XMM0 + 13, R12, R15, -0x100)); EXPECT_EQ(tester.dump_to_hex_string(true), "C4C178289C3700FFFFFFC48178289C3C00FFFFFFC4417828AC3700FFFFFFC4017828AC3C00FFFFFF"); @@ -93,10 +97,10 @@ TEST(EmitterAVX, LoadVF_RegS32) { TEST(EmitterAVX, StoreVF_Reg) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::storevf_gpr64_plus_gpr64(XMM0 + 3, RSI, R15)); - 
tester.emit(IGen::storevf_gpr64_plus_gpr64(XMM0 + 3, R12, R15)); - tester.emit(IGen::storevf_gpr64_plus_gpr64(XMM0 + 13, RSI, R15)); - tester.emit(IGen::storevf_gpr64_plus_gpr64(XMM0 + 13, R12, R15)); + tester.emit(IGen::storevf_gpr64_plus_gpr64(tester.generator(), XMM0 + 3, RSI, R15)); + tester.emit(IGen::storevf_gpr64_plus_gpr64(tester.generator(), XMM0 + 3, R12, R15)); + tester.emit(IGen::storevf_gpr64_plus_gpr64(tester.generator(), XMM0 + 13, RSI, R15)); + tester.emit(IGen::storevf_gpr64_plus_gpr64(tester.generator(), XMM0 + 13, R12, R15)); EXPECT_EQ(tester.dump_to_hex_string(true), "C4C178291C37C48178291C3CC44178292C37C40178292C3C"); } @@ -104,10 +108,10 @@ TEST(EmitterAVX, StoreVF_Reg) { TEST(EmitterAVX, StoreVF_RegS8) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(XMM0 + 3, RSI, R15, -3)); - tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(XMM0 + 3, R12, R15, -3)); - tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(XMM0 + 13, RSI, R15, -3)); - tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(XMM0 + 13, R12, R15, -3)); + tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(tester.generator(), XMM0 + 3, RSI, R15, -3)); + tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(tester.generator(), XMM0 + 3, R12, R15, -3)); + tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(tester.generator(), XMM0 + 13, RSI, R15, -3)); + tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(tester.generator(), XMM0 + 13, R12, R15, -3)); EXPECT_EQ(tester.dump_to_hex_string(true), "C4C178295C37FDC48178295C3CFDC44178296C37FDC40178296C3CFD"); @@ -116,10 +120,14 @@ TEST(EmitterAVX, StoreVF_RegS8) { TEST(EmitterAVX, StoreVF_RegS32) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s32(XMM0 + 3, RSI, R15, -0x100)); - tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s32(XMM0 + 3, R12, R15, -0x100)); - tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s32(XMM0 + 13, RSI, R15, 
-0x100)); - tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s32(XMM0 + 13, R12, R15, -0x100)); + tester.emit( + IGen::storevf_gpr64_plus_gpr64_plus_s32(tester.generator(), XMM0 + 3, RSI, R15, -0x100)); + tester.emit( + IGen::storevf_gpr64_plus_gpr64_plus_s32(tester.generator(), XMM0 + 3, R12, R15, -0x100)); + tester.emit( + IGen::storevf_gpr64_plus_gpr64_plus_s32(tester.generator(), XMM0 + 13, RSI, R15, -0x100)); + tester.emit( + IGen::storevf_gpr64_plus_gpr64_plus_s32(tester.generator(), XMM0 + 13, R12, R15, -0x100)); EXPECT_EQ(tester.dump_to_hex_string(true), "C4C178299C3700FFFFFFC48178299C3C00FFFFFFC4417829AC3700FFFFFFC4017829AC3C00FFFFFF"); @@ -128,14 +136,14 @@ TEST(EmitterAVX, StoreVF_RegS32) { TEST(EmitterAVX, MulVF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::mul_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::mul_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::mul_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::mul_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::mul_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::mul_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::mul_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::mul_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::mul_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::mul_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::mul_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::mul_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::mul_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::mul_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::mul_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::mul_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), 
"C5E059DBC4C16059DDC59059DBC4C11059DDC56059EBC4416059EDC51059EBC4411059ED"); @@ -144,64 +152,64 @@ TEST(EmitterAVX, MulVF) { TEST(EmitterAVX, ShuffleVF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::shuffle_vf(XMM0 + 3, XMM0 + 4, 1, 3, 2, 1)); - tester.emit(IGen::shuffle_vf(XMM0 + 3, XMM0 + 14, 1, 3, 2, 1)); - tester.emit(IGen::shuffle_vf(XMM0 + 13, XMM0 + 4, 1, 3, 2, 1)); - tester.emit(IGen::shuffle_vf(XMM0 + 13, XMM0 + 14, 1, 3, 2, 1)); + tester.emit(IGen::shuffle_vf(tester.generator(), XMM0 + 3, XMM0 + 4, 1, 3, 2, 1)); + tester.emit(IGen::shuffle_vf(tester.generator(), XMM0 + 3, XMM0 + 14, 1, 3, 2, 1)); + tester.emit(IGen::shuffle_vf(tester.generator(), XMM0 + 13, XMM0 + 4, 1, 3, 2, 1)); + tester.emit(IGen::shuffle_vf(tester.generator(), XMM0 + 13, XMM0 + 14, 1, 3, 2, 1)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5D8C6DC6DC4C108C6DE6DC558C6EC6DC44108C6EE6D"); } TEST(EmitterAVX, SplatVF_X) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::splat_vf(XMM0 + 3, XMM0 + 4, Register::VF_ELEMENT::X)); - tester.emit(IGen::splat_vf(XMM0 + 3, XMM0 + 14, Register::VF_ELEMENT::X)); - tester.emit(IGen::splat_vf(XMM0 + 13, XMM0 + 4, Register::VF_ELEMENT::X)); - tester.emit(IGen::splat_vf(XMM0 + 13, XMM0 + 14, Register::VF_ELEMENT::X)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 3, XMM0 + 4, Register::VF_ELEMENT::X)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 3, XMM0 + 14, Register::VF_ELEMENT::X)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 13, XMM0 + 4, Register::VF_ELEMENT::X)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 13, XMM0 + 14, Register::VF_ELEMENT::X)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5D8C6DC00C4C108C6DE00C558C6EC00C44108C6EE00"); } TEST(EmitterAVX, SplatVF_Y) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::splat_vf(XMM0 + 3, XMM0 + 4, Register::VF_ELEMENT::Y)); - tester.emit(IGen::splat_vf(XMM0 + 3, XMM0 + 14, 
Register::VF_ELEMENT::Y)); - tester.emit(IGen::splat_vf(XMM0 + 13, XMM0 + 4, Register::VF_ELEMENT::Y)); - tester.emit(IGen::splat_vf(XMM0 + 13, XMM0 + 14, Register::VF_ELEMENT::Y)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 3, XMM0 + 4, Register::VF_ELEMENT::Y)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 3, XMM0 + 14, Register::VF_ELEMENT::Y)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 13, XMM0 + 4, Register::VF_ELEMENT::Y)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 13, XMM0 + 14, Register::VF_ELEMENT::Y)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5D8C6DC55C4C108C6DE55C558C6EC55C44108C6EE55"); } TEST(EmitterAVX, SplatVF_Z) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::splat_vf(XMM0 + 3, XMM0 + 4, Register::VF_ELEMENT::Z)); - tester.emit(IGen::splat_vf(XMM0 + 3, XMM0 + 14, Register::VF_ELEMENT::Z)); - tester.emit(IGen::splat_vf(XMM0 + 13, XMM0 + 4, Register::VF_ELEMENT::Z)); - tester.emit(IGen::splat_vf(XMM0 + 13, XMM0 + 14, Register::VF_ELEMENT::Z)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 3, XMM0 + 4, Register::VF_ELEMENT::Z)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 3, XMM0 + 14, Register::VF_ELEMENT::Z)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 13, XMM0 + 4, Register::VF_ELEMENT::Z)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 13, XMM0 + 14, Register::VF_ELEMENT::Z)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5D8C6DCAAC4C108C6DEAAC558C6ECAAC44108C6EEAA"); } TEST(EmitterAVX, SplatVF_W) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::splat_vf(XMM0 + 3, XMM0 + 4, Register::VF_ELEMENT::W)); - tester.emit(IGen::splat_vf(XMM0 + 3, XMM0 + 14, Register::VF_ELEMENT::W)); - tester.emit(IGen::splat_vf(XMM0 + 13, XMM0 + 4, Register::VF_ELEMENT::W)); - tester.emit(IGen::splat_vf(XMM0 + 13, XMM0 + 14, Register::VF_ELEMENT::W)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 3, XMM0 + 4, 
Register::VF_ELEMENT::W)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 3, XMM0 + 14, Register::VF_ELEMENT::W)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 13, XMM0 + 4, Register::VF_ELEMENT::W)); + tester.emit(IGen::splat_vf(tester.generator(), XMM0 + 13, XMM0 + 14, Register::VF_ELEMENT::W)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5D8C6DCFFC4C108C6DEFFC558C6ECFFC44108C6EEFF"); } TEST(EmitterAVX, XorVF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::xor_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::xor_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::xor_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::xor_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::xor_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::xor_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::xor_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::xor_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::xor_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::xor_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::xor_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::xor_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::xor_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::xor_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::xor_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::xor_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E057DBC4C16057DDC59057DBC4C11057DDC56057EBC4416057EDC51057EBC4411057ED"); @@ -210,14 +218,14 @@ TEST(EmitterAVX, XorVF) { TEST(EmitterAVX, SubVF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::sub_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::sub_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - 
tester.emit(IGen::sub_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::sub_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::sub_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::sub_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::sub_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::sub_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::sub_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::sub_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::sub_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::sub_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::sub_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::sub_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::sub_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::sub_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E05CDBC4C1605CDDC5905CDBC4C1105CDDC5605CEBC441605CEDC5105CEBC441105CED"); @@ -226,14 +234,14 @@ TEST(EmitterAVX, SubVF) { TEST(EmitterAVX, AddVF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::add_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::add_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::add_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::add_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::add_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::add_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::add_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::add_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::add_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::add_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::add_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + 
tester.emit(IGen::add_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::add_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::add_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::add_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::add_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E058DBC4C16058DDC59058DBC4C11058DDC56058EBC4416058EDC51058EBC4411058ED"); @@ -242,14 +250,14 @@ TEST(EmitterAVX, AddVF) { TEST(EmitterAVX, MaxVF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::max_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::max_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::max_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::max_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::max_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::max_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::max_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::max_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::max_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::max_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::max_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::max_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::max_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::max_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::max_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::max_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E05FDBC4C1605FDDC5905FDBC4C1105FDDC5605FEBC441605FEDC5105FEBC441105FED"); @@ -258,14 +266,14 @@ TEST(EmitterAVX, MaxVF) { TEST(EmitterAVX, MinVF) { CodeTester tester; 
tester.init_code_buffer(1024); - tester.emit(IGen::min_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::min_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::min_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::min_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::min_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::min_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::min_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::min_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::min_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::min_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::min_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::min_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::min_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::min_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::min_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::min_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E05DDBC4C1605DDDC5905DDBC4C1105DDDC5605DEBC441605DEDC5105DEBC441105DED"); @@ -274,14 +282,14 @@ TEST(EmitterAVX, MinVF) { TEST(EmitterAVX, BlendVF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::blend_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3, 3)); - tester.emit(IGen::blend_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13, 3)); - tester.emit(IGen::blend_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3, 3)); - tester.emit(IGen::blend_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13, 3)); - tester.emit(IGen::blend_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3, 3)); - tester.emit(IGen::blend_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13, 3)); - tester.emit(IGen::blend_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3, 3)); - tester.emit(IGen::blend_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13, 3)); + tester.emit(IGen::blend_vf(tester.generator(), XMM0 + 3, XMM0 + 3, 
XMM0 + 3, 3)); + tester.emit(IGen::blend_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13, 3)); + tester.emit(IGen::blend_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3, 3)); + tester.emit(IGen::blend_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13, 3)); + tester.emit(IGen::blend_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3, 3)); + tester.emit(IGen::blend_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13, 3)); + tester.emit(IGen::blend_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3, 3)); + tester.emit(IGen::blend_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13, 3)); EXPECT_EQ(tester.dump_to_hex_string(true), "C4E3610CDB03C4C3610CDD03C4E3110CDB03C4C3110CDD03C463610CEB03C443610CED03C463110CEB03C4" @@ -291,14 +299,14 @@ TEST(EmitterAVX, BlendVF) { TEST(EmitterAVX, DivVF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::div_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::div_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::div_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::div_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::div_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::div_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::div_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::div_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::div_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::div_vf(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::div_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::div_vf(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::div_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::div_vf(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::div_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::div_vf(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 
13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E05EDBC4C1605EDDC5905EDBC4C1105EDDC5605EEBC441605EEDC5105EEBC441105EED"); @@ -307,10 +315,10 @@ TEST(EmitterAVX, DivVF) { TEST(EmitterAVX, SqrtVF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::sqrt_vf(XMM0 + 3, XMM0 + 4)); - tester.emit(IGen::sqrt_vf(XMM0 + 3, XMM0 + 14)); - tester.emit(IGen::sqrt_vf(XMM0 + 13, XMM0 + 4)); - tester.emit(IGen::sqrt_vf(XMM0 + 13, XMM0 + 14)); + tester.emit(IGen::sqrt_vf(tester.generator(), XMM0 + 3, XMM0 + 4)); + tester.emit(IGen::sqrt_vf(tester.generator(), XMM0 + 3, XMM0 + 14)); + tester.emit(IGen::sqrt_vf(tester.generator(), XMM0 + 13, XMM0 + 4)); + tester.emit(IGen::sqrt_vf(tester.generator(), XMM0 + 13, XMM0 + 14)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5F851DCC4C17851DEC57851ECC4417851EE"); } @@ -318,72 +326,72 @@ TEST(EmitterAVX, SqrtVF) { TEST(EmitterAVX, RIP) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::loadvf_rip_plus_s32(XMM0 + 3, -123)); - tester.emit(IGen::loadvf_rip_plus_s32(XMM0 + 13, -123)); + tester.emit(IGen::loadvf_rip_plus_s32(tester.generator(), XMM0 + 3, -123)); + tester.emit(IGen::loadvf_rip_plus_s32(tester.generator(), XMM0 + 13, -123)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5F8281D85FFFFFFC578282D85FFFFFF"); } TEST(EmitterAVX, ITOF) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::itof_vf(XMM0 + 3, XMM0 + 4)); - tester.emit(IGen::itof_vf(XMM0 + 3, XMM0 + 14)); - tester.emit(IGen::itof_vf(XMM0 + 13, XMM0 + 4)); - tester.emit(IGen::itof_vf(XMM0 + 13, XMM0 + 14)); + tester.emit(IGen::itof_vf(tester.generator(), XMM0 + 3, XMM0 + 4)); + tester.emit(IGen::itof_vf(tester.generator(), XMM0 + 3, XMM0 + 14)); + tester.emit(IGen::itof_vf(tester.generator(), XMM0 + 13, XMM0 + 4)); + tester.emit(IGen::itof_vf(tester.generator(), XMM0 + 13, XMM0 + 14)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5F85BDCC4C1785BDEC5785BECC441785BEE"); } TEST(EmitterAVX, FTOI) { CodeTester tester; 
tester.init_code_buffer(1024); - tester.emit(IGen::ftoi_vf(XMM0 + 3, XMM0 + 4)); - tester.emit(IGen::ftoi_vf(XMM0 + 3, XMM0 + 14)); - tester.emit(IGen::ftoi_vf(XMM0 + 13, XMM0 + 4)); - tester.emit(IGen::ftoi_vf(XMM0 + 13, XMM0 + 14)); + tester.emit(IGen::ftoi_vf(tester.generator(), XMM0 + 3, XMM0 + 4)); + tester.emit(IGen::ftoi_vf(tester.generator(), XMM0 + 3, XMM0 + 14)); + tester.emit(IGen::ftoi_vf(tester.generator(), XMM0 + 13, XMM0 + 4)); + tester.emit(IGen::ftoi_vf(tester.generator(), XMM0 + 13, XMM0 + 14)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5FA5BDCC4C17A5BDEC57A5BECC4417A5BEE"); } TEST(EmitterAVX, VPSRAD) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::pw_sra(XMM0 + 3, XMM0 + 4, 3)); - tester.emit(IGen::pw_sra(XMM0 + 3, XMM0 + 14, 4)); - tester.emit(IGen::pw_sra(XMM0 + 13, XMM0 + 4, 5)); - tester.emit(IGen::pw_sra(XMM0 + 13, XMM0 + 14, 6)); + tester.emit(IGen::pw_sra(tester.generator(), XMM0 + 3, XMM0 + 4, 3)); + tester.emit(IGen::pw_sra(tester.generator(), XMM0 + 3, XMM0 + 14, 4)); + tester.emit(IGen::pw_sra(tester.generator(), XMM0 + 13, XMM0 + 4, 5)); + tester.emit(IGen::pw_sra(tester.generator(), XMM0 + 13, XMM0 + 14, 6)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E172E403C4C16172E604C59172E405C4C11172E606"); } TEST(EmitterAVX, VPSRLD) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::pw_srl(XMM0 + 3, XMM0 + 4, 3)); - tester.emit(IGen::pw_srl(XMM0 + 3, XMM0 + 14, 4)); - tester.emit(IGen::pw_srl(XMM0 + 13, XMM0 + 4, 5)); - tester.emit(IGen::pw_srl(XMM0 + 13, XMM0 + 14, 6)); + tester.emit(IGen::pw_srl(tester.generator(), XMM0 + 3, XMM0 + 4, 3)); + tester.emit(IGen::pw_srl(tester.generator(), XMM0 + 3, XMM0 + 14, 4)); + tester.emit(IGen::pw_srl(tester.generator(), XMM0 + 13, XMM0 + 4, 5)); + tester.emit(IGen::pw_srl(tester.generator(), XMM0 + 13, XMM0 + 14, 6)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E172D403C4C16172D604C59172D405C4C11172D606"); } TEST(EmitterAVX, VPSLLD) { CodeTester 
tester; tester.init_code_buffer(1024); - tester.emit(IGen::pw_sll(XMM0 + 3, XMM0 + 4, 3)); - tester.emit(IGen::pw_sll(XMM0 + 3, XMM0 + 14, 4)); - tester.emit(IGen::pw_sll(XMM0 + 13, XMM0 + 4, 5)); - tester.emit(IGen::pw_sll(XMM0 + 13, XMM0 + 14, 6)); + tester.emit(IGen::pw_sll(tester.generator(), XMM0 + 3, XMM0 + 4, 3)); + tester.emit(IGen::pw_sll(tester.generator(), XMM0 + 3, XMM0 + 14, 4)); + tester.emit(IGen::pw_sll(tester.generator(), XMM0 + 13, XMM0 + 4, 5)); + tester.emit(IGen::pw_sll(tester.generator(), XMM0 + 13, XMM0 + 14, 6)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E172F403C4C16172F604C59172F405C4C11172F606"); } TEST(EmitterAVX, VPCMPEQB) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_compare_e_b(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_b(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_e_b(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_b(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_compare_e_b(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_b(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_e_b(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_b(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_b(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_b(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_b(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_b(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_b(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_b(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_b(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + 
tester.emit(IGen::parallel_compare_e_b(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E174DBC4C16174DDC59174DBC4C11174DDC56174EBC4416174EDC51174EBC4411174ED"); } @@ -391,14 +399,14 @@ TEST(EmitterAVX, VPCMPEQB) { TEST(EmitterAVX, VPCMPEQW) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_compare_e_h(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_h(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_e_h(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_h(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_compare_e_h(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_h(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_e_h(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_h(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_h(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_h(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_h(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_h(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_h(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_h(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_h(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_h(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E175DBC4C16175DDC59175DBC4C11175DDC56175EBC4416175EDC51175EBC4411175ED"); } @@ -406,14 +414,14 @@ TEST(EmitterAVX, VPCMPEQW) { TEST(EmitterAVX, VPCMPEQD) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_compare_e_w(XMM0 + 
3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_w(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_e_w(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_w(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_compare_e_w(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_w(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_e_w(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_e_w(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_w(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_w(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_w(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_w(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_w(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_w(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_e_w(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_e_w(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E176DBC4C16176DDC59176DBC4C11176DDC56176EBC4416176EDC51176EBC4411176ED"); } @@ -421,14 +429,14 @@ TEST(EmitterAVX, VPCMPEQD) { TEST(EmitterAVX, VPCMPGTB) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_compare_gt_b(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_b(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_gt_b(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_b(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_compare_gt_b(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_b(XMM0 + 13, XMM0 + 3, XMM0 
+ 13)); - tester.emit(IGen::parallel_compare_gt_b(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_b(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_b(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_b(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_b(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_b(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_b(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_b(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_b(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_b(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E164DBC4C16164DDC59164DBC4C11164DDC56164EBC4416164EDC51164EBC4411164ED"); } @@ -436,14 +444,14 @@ TEST(EmitterAVX, VPCMPGTB) { TEST(EmitterAVX, VPCMPGTW) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_compare_gt_h(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_h(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_gt_h(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_h(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_compare_gt_h(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_h(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_gt_h(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_h(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_h(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_h(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + 
tester.emit(IGen::parallel_compare_gt_h(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_h(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_h(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_h(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_h(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_h(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E165DBC4C16165DDC59165DBC4C11165DDC56165EBC4416165EDC51165EBC4411165ED"); } @@ -451,14 +459,14 @@ TEST(EmitterAVX, VPCMPGTW) { TEST(EmitterAVX, VPCMPGTD) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_compare_gt_w(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_w(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_gt_w(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_w(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_compare_gt_w(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_w(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_compare_gt_w(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_compare_gt_w(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_w(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_w(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_w(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_w(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_compare_gt_w(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_w(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); 
+ tester.emit(IGen::parallel_compare_gt_w(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_compare_gt_w(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E166DBC4C16166DDC59166DBC4C11166DDC56166EBC4416166EDC51166EBC4411166ED"); } @@ -466,14 +474,14 @@ TEST(EmitterAVX, VPCMPGTD) { TEST(EmitterAVX, VPUNPCKLBW) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::pextlb_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextlb_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextlb_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextlb_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::pextlb_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextlb_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextlb_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextlb_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextlb_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextlb_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextlb_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextlb_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextlb_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextlb_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextlb_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextlb_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E160DBC4C16160DDC59160DBC4C11160DDC56160EBC4416160EDC51160EBC4411160ED"); } @@ -481,14 +489,14 @@ TEST(EmitterAVX, VPUNPCKLBW) { TEST(EmitterAVX, VPUNPCKLWD) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::pextlh_swapped(XMM0 
+ 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextlh_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextlh_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextlh_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::pextlh_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextlh_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextlh_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextlh_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextlh_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextlh_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextlh_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextlh_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextlh_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextlh_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextlh_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextlh_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E161DBC4C16161DDC59161DBC4C11161DDC56161EBC4416161EDC51161EBC4411161ED"); } @@ -496,14 +504,14 @@ TEST(EmitterAVX, VPUNPCKLWD) { TEST(EmitterAVX, VPUNPCKLDQ) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::pextlw_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextlw_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextlw_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextlw_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::pextlw_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextlw_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextlw_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextlw_swapped(XMM0 + 13, 
XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextlw_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextlw_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextlw_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextlw_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextlw_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextlw_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextlw_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextlw_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E162DBC4C16162DDC59162DBC4C11162DDC56162EBC4416162EDC51162EBC4411162ED"); } @@ -511,14 +519,14 @@ TEST(EmitterAVX, VPUNPCKLDQ) { TEST(EmitterAVX, VPUNPCKHBW) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::pextub_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextub_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextub_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextub_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::pextub_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextub_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextub_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextub_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextub_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextub_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextub_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextub_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextub_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + 
tester.emit(IGen::pextub_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextub_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextub_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E168DBC4C16168DDC59168DBC4C11168DDC56168EBC4416168EDC51168EBC4411168ED"); } @@ -526,14 +534,14 @@ TEST(EmitterAVX, VPUNPCKHBW) { TEST(EmitterAVX, VPUNPCKHWD) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::pextuh_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextuh_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextuh_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextuh_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::pextuh_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextuh_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextuh_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextuh_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextuh_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextuh_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextuh_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextuh_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextuh_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextuh_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextuh_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextuh_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E169DBC4C16169DDC59169DBC4C11169DDC56169EBC4416169EDC51169EBC4411169ED"); } @@ -541,14 +549,14 @@ TEST(EmitterAVX, VPUNPCKHWD) { TEST(EmitterAVX, VPUNPCKHDQ) { CodeTester 
tester; tester.init_code_buffer(1024); - tester.emit(IGen::pextuw_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextuw_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextuw_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextuw_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::pextuw_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pextuw_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pextuw_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pextuw_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextuw_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextuw_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextuw_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextuw_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pextuw_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pextuw_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pextuw_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pextuw_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E16ADBC4C1616ADDC5916ADBC4C1116ADDC5616AEBC441616AEDC5116AEBC441116AED"); } @@ -556,14 +564,14 @@ TEST(EmitterAVX, VPUNPCKHDQ) { TEST(EmitterAVX, VPUNPCKLQDQ) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::pcpyld_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pcpyld_swapped(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pcpyld_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pcpyld_swapped(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::pcpyld_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pcpyld_swapped(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - 
tester.emit(IGen::pcpyld_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pcpyld_swapped(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pcpyld_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pcpyld_swapped(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pcpyld_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pcpyld_swapped(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pcpyld_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pcpyld_swapped(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pcpyld_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pcpyld_swapped(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E16CDBC4C1616CDDC5916CDBC4C1116CDDC5616CEBC441616CEDC5116CEBC441116CED"); } @@ -571,14 +579,14 @@ TEST(EmitterAVX, VPUNPCKLQDQ) { TEST(EmitterAVX, VPUNPCKHQDQ) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::pcpyud(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pcpyud(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pcpyud(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pcpyud(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::pcpyud(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::pcpyud(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::pcpyud(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::pcpyud(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pcpyud(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::pcpyud(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pcpyud(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pcpyud(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::pcpyud(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + 
tester.emit(IGen::pcpyud(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::pcpyud(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::pcpyud(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E16DDBC4C1616DDDC5916DDBC4C1116DDDC5616DEBC441616DEDC5116DEBC441116DED"); } @@ -586,74 +594,74 @@ TEST(EmitterAVX, VPUNPCKHQDQ) { TEST(EmitterAVX, VPSRLDQ) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::vpsrldq(XMM0 + 3, XMM0 + 4, 3)); - tester.emit(IGen::vpsrldq(XMM0 + 3, XMM0 + 14, 4)); - tester.emit(IGen::vpsrldq(XMM0 + 13, XMM0 + 4, 5)); - tester.emit(IGen::vpsrldq(XMM0 + 13, XMM0 + 14, 6)); + tester.emit(IGen::vpsrldq(tester.generator(), XMM0 + 3, XMM0 + 4, 3)); + tester.emit(IGen::vpsrldq(tester.generator(), XMM0 + 3, XMM0 + 14, 4)); + tester.emit(IGen::vpsrldq(tester.generator(), XMM0 + 13, XMM0 + 4, 5)); + tester.emit(IGen::vpsrldq(tester.generator(), XMM0 + 13, XMM0 + 14, 6)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E173DC03C4C16173DE04C59173DC05C4C11173DE06"); } TEST(EmitterAVX, VPSLLDQ) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::vpslldq(XMM0 + 3, XMM0 + 4, 3)); - tester.emit(IGen::vpslldq(XMM0 + 3, XMM0 + 14, 4)); - tester.emit(IGen::vpslldq(XMM0 + 13, XMM0 + 4, 5)); - tester.emit(IGen::vpslldq(XMM0 + 13, XMM0 + 14, 6)); + tester.emit(IGen::vpslldq(tester.generator(), XMM0 + 3, XMM0 + 4, 3)); + tester.emit(IGen::vpslldq(tester.generator(), XMM0 + 3, XMM0 + 14, 4)); + tester.emit(IGen::vpslldq(tester.generator(), XMM0 + 13, XMM0 + 4, 5)); + tester.emit(IGen::vpslldq(tester.generator(), XMM0 + 13, XMM0 + 14, 6)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E173FC03C4C16173FE04C59173FC05C4C11173FE06"); } TEST(EmitterAVX, VPSHUFLW) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::vpshuflw(XMM0 + 3, XMM0 + 4, 3)); - tester.emit(IGen::vpshuflw(XMM0 + 3, XMM0 + 14, 4)); - 
tester.emit(IGen::vpshuflw(XMM0 + 13, XMM0 + 4, 5)); - tester.emit(IGen::vpshuflw(XMM0 + 13, XMM0 + 14, 6)); + tester.emit(IGen::vpshuflw(tester.generator(), XMM0 + 3, XMM0 + 4, 3)); + tester.emit(IGen::vpshuflw(tester.generator(), XMM0 + 3, XMM0 + 14, 4)); + tester.emit(IGen::vpshuflw(tester.generator(), XMM0 + 13, XMM0 + 4, 5)); + tester.emit(IGen::vpshuflw(tester.generator(), XMM0 + 13, XMM0 + 14, 6)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5FB70DC03C4C17B70DE04C57B70EC05C4417B70EE06"); } TEST(EmitterAVX, VPSHUFHW) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::vpshufhw(XMM0 + 3, XMM0 + 4, 3)); - tester.emit(IGen::vpshufhw(XMM0 + 3, XMM0 + 14, 4)); - tester.emit(IGen::vpshufhw(XMM0 + 13, XMM0 + 4, 5)); - tester.emit(IGen::vpshufhw(XMM0 + 13, XMM0 + 14, 6)); + tester.emit(IGen::vpshufhw(tester.generator(), XMM0 + 3, XMM0 + 4, 3)); + tester.emit(IGen::vpshufhw(tester.generator(), XMM0 + 3, XMM0 + 14, 4)); + tester.emit(IGen::vpshufhw(tester.generator(), XMM0 + 13, XMM0 + 4, 5)); + tester.emit(IGen::vpshufhw(tester.generator(), XMM0 + 13, XMM0 + 14, 6)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5FA70DC03C4C17A70DE04C57A70EC05C4417A70EE06"); } TEST(EmitterAVX, movq_to_gpr_from_xmm) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::movq_gpr64_xmm64(RSP, XMM0 + 3)); - tester.emit(IGen::movq_gpr64_xmm64(RSP, XMM0 + 13)); - tester.emit(IGen::movq_gpr64_xmm64(R12, XMM0 + 3)); - tester.emit(IGen::movq_gpr64_xmm64(R12, XMM0 + 13)); + tester.emit(IGen::movq_gpr64_xmm64(tester.generator(), RSP, XMM0 + 3)); + tester.emit(IGen::movq_gpr64_xmm64(tester.generator(), RSP, XMM0 + 13)); + tester.emit(IGen::movq_gpr64_xmm64(tester.generator(), R12, XMM0 + 3)); + tester.emit(IGen::movq_gpr64_xmm64(tester.generator(), R12, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "66480F7EDC664C0F7EEC66490F7EDC664D0F7EEC"); } TEST(EmitterAVX, movq_to_xmm_from_gpr) { CodeTester tester; tester.init_code_buffer(1024); - 
tester.emit(IGen::movq_xmm64_gpr64(XMM0 + 3, RSP)); - tester.emit(IGen::movq_xmm64_gpr64(XMM0 + 13, RSP)); - tester.emit(IGen::movq_xmm64_gpr64(XMM0 + 3, R12)); - tester.emit(IGen::movq_xmm64_gpr64(XMM0 + 13, R12)); + tester.emit(IGen::movq_xmm64_gpr64(tester.generator(), XMM0 + 3, RSP)); + tester.emit(IGen::movq_xmm64_gpr64(tester.generator(), XMM0 + 13, RSP)); + tester.emit(IGen::movq_xmm64_gpr64(tester.generator(), XMM0 + 3, R12)); + tester.emit(IGen::movq_xmm64_gpr64(tester.generator(), XMM0 + 13, R12)); EXPECT_EQ(tester.dump_to_hex_string(true), "66480F6EDC664C0F6EEC66490F6EDC664D0F6EEC"); } TEST(EmitterAVX, VPSUBD) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::vpsubd(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::vpsubd(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::vpsubd(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::vpsubd(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::vpsubd(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::vpsubd(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::vpsubd(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::vpsubd(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::vpsubd(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::vpsubd(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::vpsubd(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::vpsubd(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::vpsubd(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::vpsubd(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::vpsubd(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::vpsubd(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E1FADBC4C161FADDC591FADBC4C111FADDC561FAEBC44161FAEDC511FAEBC44111FAED"); } @@ -661,14 +669,14 @@ TEST(EmitterAVX, VPSUBD) { TEST(EmitterAVX, 
VPOR) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_bitwise_or(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_or(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_bitwise_or(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_or(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_bitwise_or(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_or(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_bitwise_or(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_or(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_or(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_or(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_or(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_or(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_or(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_or(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_or(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_or(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E1EBDBC4C161EBDDC591EBDBC4C111EBDDC561EBEBC44161EBEDC511EBEBC44111EBED"); } @@ -676,14 +684,14 @@ TEST(EmitterAVX, VPOR) { TEST(EmitterAVX, VPADDB) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_add_byte(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_add_byte(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_add_byte(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_add_byte(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_add_byte(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - 
tester.emit(IGen::parallel_add_byte(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_add_byte(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_add_byte(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_add_byte(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_add_byte(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_add_byte(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_add_byte(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_add_byte(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_add_byte(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_add_byte(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_add_byte(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E1FCDBC4C161FCDDC591FCDBC4C111FCDDC561FCEBC44161FCEDC511FCEBC44111FCED"); } @@ -691,14 +699,14 @@ TEST(EmitterAVX, VPADDB) { TEST(EmitterAVX, VPXOR) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_bitwise_xor(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_xor(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_bitwise_xor(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_xor(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_bitwise_xor(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_xor(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_bitwise_xor(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_xor(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_xor(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_xor(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + 
tester.emit(IGen::parallel_bitwise_xor(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_xor(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_xor(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_xor(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_xor(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_xor(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E1EFDBC4C161EFDDC591EFDBC4C111EFDDC561EFEBC44161EFEDC511EFEBC44111EFED"); } @@ -706,14 +714,14 @@ TEST(EmitterAVX, VPXOR) { TEST(EmitterAVX, VPAND) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::parallel_bitwise_and(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_and(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_bitwise_and(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_and(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::parallel_bitwise_and(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_and(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::parallel_bitwise_and(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::parallel_bitwise_and(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_and(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_and(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_and(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_and(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::parallel_bitwise_and(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_and(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + 
tester.emit(IGen::parallel_bitwise_and(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::parallel_bitwise_and(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E1DBDBC4C161DBDDC591DBDBC4C111DBDDC561DBEBC44161DBEDC511DBEBC44111DBED"); } @@ -721,14 +729,14 @@ TEST(EmitterAVX, VPAND) { TEST(EmitterAVX, VPACKUSWB) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::vpackuswb(XMM0 + 3, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::vpackuswb(XMM0 + 3, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::vpackuswb(XMM0 + 3, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::vpackuswb(XMM0 + 3, XMM0 + 13, XMM0 + 13)); - tester.emit(IGen::vpackuswb(XMM0 + 13, XMM0 + 3, XMM0 + 3)); - tester.emit(IGen::vpackuswb(XMM0 + 13, XMM0 + 3, XMM0 + 13)); - tester.emit(IGen::vpackuswb(XMM0 + 13, XMM0 + 13, XMM0 + 3)); - tester.emit(IGen::vpackuswb(XMM0 + 13, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::vpackuswb(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::vpackuswb(tester.generator(), XMM0 + 3, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::vpackuswb(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::vpackuswb(tester.generator(), XMM0 + 3, XMM0 + 13, XMM0 + 13)); + tester.emit(IGen::vpackuswb(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 3)); + tester.emit(IGen::vpackuswb(tester.generator(), XMM0 + 13, XMM0 + 3, XMM0 + 13)); + tester.emit(IGen::vpackuswb(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 3)); + tester.emit(IGen::vpackuswb(tester.generator(), XMM0 + 13, XMM0 + 13, XMM0 + 13)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E167DBC4C16167DDC59167DBC4C11167DDC56167EBC4416167EDC51167EBC4411167ED"); } @@ -736,19 +744,19 @@ TEST(EmitterAVX, VPACKUSWB) { TEST(EmitterAVX, VPSRLW) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::ph_srl(XMM0 + 3, XMM0 + 4, 3)); - tester.emit(IGen::ph_srl(XMM0 + 3, XMM0 + 14, 4)); - 
tester.emit(IGen::ph_srl(XMM0 + 13, XMM0 + 4, 5)); - tester.emit(IGen::ph_srl(XMM0 + 13, XMM0 + 14, 6)); + tester.emit(IGen::ph_srl(tester.generator(), XMM0 + 3, XMM0 + 4, 3)); + tester.emit(IGen::ph_srl(tester.generator(), XMM0 + 3, XMM0 + 14, 4)); + tester.emit(IGen::ph_srl(tester.generator(), XMM0 + 13, XMM0 + 4, 5)); + tester.emit(IGen::ph_srl(tester.generator(), XMM0 + 13, XMM0 + 14, 6)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E171D403C4C16171D604C59171D405C4C11171D606"); } TEST(EmitterAVX, VPSLLW) { CodeTester tester; tester.init_code_buffer(1024); - tester.emit(IGen::ph_sll(XMM0 + 3, XMM0 + 4, 3)); - tester.emit(IGen::ph_sll(XMM0 + 3, XMM0 + 14, 4)); - tester.emit(IGen::ph_sll(XMM0 + 13, XMM0 + 4, 5)); - tester.emit(IGen::ph_sll(XMM0 + 13, XMM0 + 14, 6)); + tester.emit(IGen::ph_sll(tester.generator(), XMM0 + 3, XMM0 + 4, 3)); + tester.emit(IGen::ph_sll(tester.generator(), XMM0 + 3, XMM0 + 14, 4)); + tester.emit(IGen::ph_sll(tester.generator(), XMM0 + 13, XMM0 + 4, 5)); + tester.emit(IGen::ph_sll(tester.generator(), XMM0 + 13, XMM0 + 14, 6)); EXPECT_EQ(tester.dump_to_hex_string(true), "C5E171F403C4C16171F604C59171F405C4C11171F606"); } \ No newline at end of file diff --git a/third-party/sse2neon/sse2neon.h b/third-party/sse2neon/sse2neon.h index d1a0f0c511..16b6e8341a 100644 --- a/third-party/sse2neon/sse2neon.h +++ b/third-party/sse2neon/sse2neon.h @@ -1,6 +1,30 @@ #ifndef SSE2NEON_H #define SSE2NEON_H +/* + * sse2neon is freely redistributable under the MIT License. + * + * Copyright (c) 2015-2026 SSE2NEON Contributors. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + // This header file provides a simple API translation layer // between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions // @@ -27,52 +51,170 @@ // Cuda Chen // Aymen Qader // Anthony Roberts - -/* - * sse2neon is freely redistributable under the MIT License. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +// Sean Luchen +// Marcin Serwin +// Ben Niu +// Even Rouault +// Marcus Buretorp /* Tunable configurations */ -/* Enable precise implementation of math operations - * This would slow down the computation a bit, but gives consistent result with - * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) +/* PRECISION FLAGS + * + * These flags control the precision/performance trade-off for operations where + * NEON behavior diverges from x86 SSE. Default is 0 (performance over + * precision). Set to 1 before including this header for x86-compatible + * behavior. + * + * Example: + * #define SSE2NEON_PRECISE_MINMAX 1 // Enable before include + * #include "sse2neon.h" + * + * Recommended configurations: + * - Performance: No flags (default) + * - Balanced: SSE2NEON_PRECISE_MINMAX=1, SSE2NEON_PRECISE_SQRT=1 + * (ARMv7: also consider SSE2NEON_PRECISE_DIV=1 for division) + * - Exact: All flags set to 1 + */ + +/* SSE2NEON_PRECISE_MINMAX + * Affects: _mm_min_ps, _mm_max_ps, _mm_min_ss, _mm_max_ss, + * _mm_min_pd, _mm_max_pd, _mm_min_sd, _mm_max_sd + * + * Issue: NEON fmin/fmax propagate NaN differently than SSE. When one operand + * is NaN, SSE returns the second operand while NEON may return NaN. 
+ * + * Default (0): Fast NEON min/max, potential NaN divergence + * Enabled (1): Additional comparison to match x86 NaN handling + * + * Symptoms when disabled: NaN "holes" in rendered images, unexpected NaN + * propagation in signal processing */ -/* _mm_min|max_ps|ss|pd|sd */ #ifndef SSE2NEON_PRECISE_MINMAX #define SSE2NEON_PRECISE_MINMAX (0) #endif -/* _mm_rcp_ps and _mm_div_ps */ + +/* SSE2NEON_PRECISE_DIV + * Affects: _mm_rcp_ps, _mm_rcp_ss (all architectures) + * _mm_div_ps, _mm_div_ss (ARMv7 only, ARMv8 uses native vdivq_f32) + * + * Issue: NEON reciprocal estimate (vrecpe) has ~11-bit precision. SSE's rcpps + * provides ~12-bit precision. For division on ARMv7, we use reciprocal + * approximation since there's no native divide instruction. + * + * Default (0): Single Newton-Raphson refinement (~12-bit precision) + * Enabled (1): Two N-R refinements (~24-bit precision) + * + * Note on reciprocals: Enabling this flag makes _mm_rcp_ps MORE accurate than + * SSE's specified ~12-bit precision. This improves ARMv7 division accuracy but + * may differ from code expecting SSE's coarser reciprocal approximation. + * + * WARNING: This flag improves numerical precision only. It does NOT fix + * IEEE-754 corner-case divergence (NaN propagation, signed zero, infinity + * handling). ARMv7 division behavior will still differ from x86 SSE for these + * edge cases. + * + * Symptoms when disabled: Slight precision differences in division-heavy code + */ #ifndef SSE2NEON_PRECISE_DIV #define SSE2NEON_PRECISE_DIV (0) #endif -/* _mm_sqrt_ps and _mm_rsqrt_ps */ + +/* SSE2NEON_PRECISE_SQRT + * Affects: _mm_sqrt_ps, _mm_sqrt_ss, _mm_rsqrt_ps, _mm_rsqrt_ss + * + * Issue: NEON reciprocal square root estimate (vrsqrte) has lower precision + * than x86 SSE's rsqrtps/sqrtps. 
+ * + * Default (0): Single Newton-Raphson refinement + * Enabled (1): Two N-R refinements for improved precision + * + * Symptoms when disabled: Precision loss in physics simulations, graphics + * normalization, or iterative algorithms + */ #ifndef SSE2NEON_PRECISE_SQRT #define SSE2NEON_PRECISE_SQRT (0) #endif -/* _mm_dp_pd */ + +/* SSE2NEON_PRECISE_DP + * Affects: _mm_dp_ps, _mm_dp_pd + * + * Issue: The dot product mask parameter controls which elements participate. + * When an element is masked out, x86 multiplies by 0.0 while NEON + * skips the multiply entirely. + * + * Default (0): Skip masked elements (faster, but 0.0 * NaN = NaN divergence) + * Enabled (1): Multiply masked elements by 0.0 (matches x86 NaN propagation) + * + * Symptoms when disabled: Different results when dot product inputs contain + * NaN in masked-out lanes + */ #ifndef SSE2NEON_PRECISE_DP #define SSE2NEON_PRECISE_DP (0) #endif +/* SSE2NEON_UNDEFINED_ZERO + * Affects: _mm_undefined_ps, _mm_undefined_si128, _mm_undefined_pd + * + * Issue: These intrinsics return vectors with "undefined" contents per Intel + * spec. On x86, this means truly uninitialized memory (garbage values). + * + * MSVC Semantic Drift: MSVC on ARM forces zero-initialization for these + * intrinsics, which differs from x86 behavior where garbage is returned. + * GCC/Clang on ARM match x86 by returning uninitialized memory. + * + * This macro provides explicit control over the behavior: + * Default (0): Compiler-dependent (MSVC=zero, GCC/Clang=undefined) + * Enabled (1): Force zero-initialization on all compilers (safer, portable) + * + * When to enable: + * - Deterministic behavior across compilers is required + * - Debugging memory-related issues where undefined values cause problems + * - Security-sensitive code where uninitialized memory is a concern + * + * Note: Using undefined values without first writing to them is undefined + * behavior. Well-formed code should not depend on either behavior. 
+ */ +#ifndef SSE2NEON_UNDEFINED_ZERO +#define SSE2NEON_UNDEFINED_ZERO (0) +#endif + +/* SSE2NEON_MWAIT_POLICY + * Affects: _mm_mwait + * + * Issue: x86 MONITOR/MWAIT allows a thread to sleep until a write occurs to a + * monitored address range. ARM has no userspace equivalent for address- + * range monitoring. _mm_monitor is a no-op; _mm_mwait can only provide + * low-power wait hints without true "wake on store" semantics. + * + * Note: The x86 extensions/hints parameters (C-state hints) are ignored on ARM + * as there is no architectural equivalent. No memory ordering is provided + * beyond what the hint instruction itself offers. + * + * WARNING: Policies 1 and 2 (WFE/WFI) may cause issues: + * - WFE: May sleep until event/interrupt; can wake spuriously. Always check + * your condition in a loop. May trap in EL0 (SCTLR_EL1.nTWE). + * - WFI: May trap (SIGILL) in EL0 on Linux, iOS, macOS (SCTLR_EL1.nTWI). + * - Neither provides "wake on address write" semantics. + * + * Policy values: + * 0 (default): yield - Safe everywhere, never blocks, just a hint + * 1: wfe - Event wait, may sleep until event/interrupt + * 2: wfi - Interrupt wait, may trap in EL0 on many platforms + * + * Recommended usage: + * - Policy 0: General-purpose code, spin-wait loops (safe default) + * - Policy 1: Only if you control both reader/writer and use SEV/SEVL + * - Policy 2: Only for bare-metal or kernel code with known OS support + * + * Migration note: Code relying on x86 MONITOR/MWAIT for lock-free waiting + * should migrate to proper atomics + OS wait primitives (futex, condition + * variables) for correct cross-platform behavior. + */ +#ifndef SSE2NEON_MWAIT_POLICY +#define SSE2NEON_MWAIT_POLICY (0) +#endif + /* Enable inclusion of windows.h on MSVC platforms * This makes _mm_clflush functional on windows, as there is no builtin. 
*/ @@ -80,15 +222,145 @@ #define SSE2NEON_INCLUDE_WINDOWS_H (0) #endif +/* Consolidated Platform Detection + * + * These macros simplify platform-specific code throughout the header by + * providing single-point definitions for architecture and compiler detection. + * This reduces the 147+ verbose architecture checks to simple macro usage. + * + * Architecture: + * SSE2NEON_ARCH_AARCH64 - 64-bit ARM (AArch64, including Apple Silicon) + * Encompasses: __aarch64__, __arm64__, _M_ARM64, _M_ARM64EC + * + * Compiler: + * SSE2NEON_COMPILER_GCC_COMPAT - GCC or Clang (supports GNU extensions) + * SSE2NEON_COMPILER_MSVC - Microsoft Visual C++ + * SSE2NEON_COMPILER_CLANG - Clang specifically (subset of GCC_COMPAT) + */ + +/* Compiler detection + * + * Check Clang first: it defines __GNUC__ for compatibility. + * Clang-CL also defines _MSC_VER for MSVC ABI compatibility. + * + * Compiler matrix: + * Compiler | GCC_COMPAT | CLANG | MSVC + * -----------+------------+-------+------ + * GCC | 1 | 0 | 0 + * Clang | 1 | 1 | 0 + * Clang-CL | 1 | 1 | 1 + * MSVC | 0 | 0 | 1 + */ +#if defined(__clang__) +/* Clang compiler detected (including Apple Clang) */ +#define SSE2NEON_COMPILER_CLANG 1 +#define SSE2NEON_COMPILER_GCC_COMPAT 1 /* Clang supports GCC extensions */ +#if defined(_MSC_VER) +#define SSE2NEON_COMPILER_MSVC 1 /* Clang-CL: Clang with MSVC on Windows */ +#else +#define SSE2NEON_COMPILER_MSVC 0 +#endif +/* Clang < 11 has known NEON codegen bugs (issue #622) */ +#if __clang_major__ < 11 +#error "Clang versions earlier than 11 are not supported." +#endif + +#elif defined(__GNUC__) +/* GCC compiler (only reached if not Clang, since Clang also defines __GNUC__) + */ +#define SSE2NEON_COMPILER_CLANG 0 +#define SSE2NEON_COMPILER_GCC_COMPAT 1 +#define SSE2NEON_COMPILER_MSVC 0 +/* GCC < 10 has incomplete ARM intrinsics support */ +#if __GNUC__ < 10 +#error "GCC versions earlier than 10 are not supported." 
+#endif + +#elif defined(_MSC_VER) +/* Microsoft Visual C++ (native, not Clang-CL) */ +#define SSE2NEON_COMPILER_CLANG 0 +#define SSE2NEON_COMPILER_GCC_COMPAT 0 /* No GCC extensions available */ +#define SSE2NEON_COMPILER_MSVC 1 + +#else +#error "Unsupported compiler. SSE2NEON requires GCC 10+, Clang 11+, or MSVC." +#endif + +/* Architecture detection */ +#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || \ + defined(_M_ARM64EC) +#define SSE2NEON_ARCH_AARCH64 1 +#else +#define SSE2NEON_ARCH_AARCH64 0 +#endif + +/* ARM64EC Support - EXPERIMENTAL with known limitations + * + * ARM64EC is Microsoft's hybrid ABI bridging x64 and ARM64 within a single + * Windows process, enabling incremental migration of x64 applications to ARM64. + * Compiler support remains incomplete (limited LLVM/GCC coverage). + * + * Compiler behavior: + * - MSVC defines both _M_AMD64 and _M_ARM64EC (but NOT _M_ARM64) + * - Requires arm64_neon.h instead of arm_neon.h + * + * Known limitations: + * 1. Windows headers: SSE2NEON_INCLUDE_WINDOWS_H must be 0 (default). + * Include sse2neon.h BEFORE any Windows headers to avoid type conflicts. + * 2. Include order: sse2neon.h must be included BEFORE or any C++ + * standard headers that pull it in (e.g., , ). + * 3. ABI boundary: __m128/SSE types must NOT cross x64/ARM64EC module + * boundaries (exports/imports) as layouts differ between ABIs. + * Users needing cross-ABI SIMD interop should use MSVC's softintrin. + * 4. CRC32 hardware intrinsics are disabled; software fallback is used. + * + * SSE2NEON_ARM64EC is 1 when compiling for ARM64EC with MSVC, 0 otherwise. + * Note: clang-cl ARM64EC builds are not currently detected by this macro. + * + * Recommendation: Use native ARM64 compilation when possible. + */ +#if SSE2NEON_COMPILER_MSVC && defined(_M_ARM64EC) +#define SSE2NEON_ARM64EC 1 +#else +#define SSE2NEON_ARM64EC 0 +#endif + +/* Early ARM64EC + SSE2NEON_INCLUDE_WINDOWS_H check. 
+ * This must come BEFORE any standard includes because and other + * headers can trigger winnt.h, which fails with "Must define a target + * architecture" on ARM64EC before we could emit our own error. + */ +#if SSE2NEON_ARM64EC && SSE2NEON_INCLUDE_WINDOWS_H +#error \ + "SSE2NEON_INCLUDE_WINDOWS_H=1 is not supported on ARM64EC. " \ + "Include separately AFTER sse2neon.h instead." +#endif + +/* Endianness check + * + * SSE2NEON assumes little-endian byte ordering for lane-to-memory mappings. + * Big-endian ARM targets would produce silently incorrect results because + * SSE intrinsics define lane ordering relative to little-endian memory layout. + * + * GCC/Clang define __BYTE_ORDER__. For compilers that don't (e.g., MSVC), + * we check for explicit big-endian ARM macros. MSVC only targets little-endian + * ARM, so no additional check is needed there. + */ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__) +#error "sse2neon requires little-endian target; big-endian is not supported" +#elif defined(__ARMEB__) || defined(__AARCH64EB__) || defined(__BIG_ENDIAN__) +#error "sse2neon requires little-endian target; big-endian is not supported" +#endif + /* compiler specific definitions */ -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma push_macro("FORCE_INLINE") #pragma push_macro("ALIGN_STRUCT") #define FORCE_INLINE static inline __attribute__((always_inline)) #define ALIGN_STRUCT(x) __attribute__((aligned(x))) #define _sse2neon_likely(x) __builtin_expect(!!(x), 1) #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) -#elif defined(_MSC_VER) +#elif SSE2NEON_COMPILER_MSVC #if _MSVC_TRADITIONAL #error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. 
#endif @@ -100,8 +372,6 @@ #endif #define _sse2neon_likely(x) (x) #define _sse2neon_unlikely(x) (x) -#else -#pragma message("Macro name collisions may happen with unsupported compilers.") #endif /* C language does not allow initializing a variable with a function call. */ @@ -111,49 +381,152 @@ #define _sse2neon_const const #endif +#if defined(__cplusplus) +#define _sse2neon_reinterpret_cast(t, e) reinterpret_cast(e) +#define _sse2neon_static_cast(t, e) static_cast(e) +#define _sse2neon_const_cast(t, e) const_cast(e) +#else +#define _sse2neon_reinterpret_cast(t, e) ((t) (e)) +#define _sse2neon_static_cast(t, e) ((t) (e)) +#define _sse2neon_const_cast(t, e) ((t) (e)) +#endif + +/* ARM64EC winnt.h workaround: define architecture macros before any headers + * that might include winnt.h. Windows SDK 10.0.26100.0+ requires _ARM64EC_ or + * _ARM64_ but MSVC 17.x only defines _M_ARM64EC. + */ +#if SSE2NEON_ARM64EC +/* Warn if winnt.h was already included - the workaround won't help */ +#ifdef _WINNT_ +#pragma message( \ + "warning: sse2neon.h included after winnt.h; ARM64EC workaround may fail") +#endif +/* Define _ARM64EC_ for winnt.h architecture check (kept for user detection) */ +#if !defined(_ARM64EC_) +#define _ARM64EC_ 1 +#define _SSE2NEON_DEFINED_ARM64EC_ +#endif +/* Define _M_ARM64 temporarily for headers that derive _ARM64_ from it */ +#if !defined(_M_ARM64) +#define _M_ARM64 1 +#define _SSE2NEON_DEFINED_M_ARM64 +#endif +#endif /* SSE2NEON_ARM64EC */ + +#include #include #include +#include -#if defined(_WIN32) -/* Definitions for _mm_{malloc,free} are provided by - * from both MinGW-w64 and MSVC. 
+FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val) +{ + double tmp; + memcpy(&tmp, &val, sizeof(uint64_t)); + return tmp; +} + +FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val) +{ + int64_t tmp; + memcpy(&tmp, &val, sizeof(uint64_t)); + return tmp; +} + +/* MSVC provides _mm_{malloc,free} in ; MinGW needs our definitions + * but still uses _aligned_malloc/_aligned_free from . */ +#if SSE2NEON_COMPILER_MSVC #define SSE2NEON_ALLOC_DEFINED #endif /* If using MSVC */ -#ifdef _MSC_VER +#if SSE2NEON_COMPILER_MSVC + +/* ARM64EC SSE header blocking: pre-define include guards to prevent MSVC SSE + * headers (mmintrin.h, xmmintrin.h, etc.) and Windows SDK softintrin.h from + * loading, as their __m128 union types conflict with sse2neon's NEON types. + */ +#if SSE2NEON_ARM64EC || defined(_M_ARM64EC) +/* Detect if was already included - SSE types may have leaked. + * Check both _INTRIN_H_ and _INTRIN_H to cover different MSVC versions. */ +#if defined(_INTRIN_H_) || defined(_INTRIN_H) +#error \ + "sse2neon.h must be included BEFORE or C++ headers on ARM64EC. " \ + "SSE type definitions from conflict with sse2neon's NEON types." +#endif +#define _INCLUDED_MM2 +#define _MMINTRIN_H_INCLUDED +#define _XMMINTRIN_H_INCLUDED +#define _EMMINTRIN_H_INCLUDED +#define _PMMINTRIN_H_INCLUDED +#define _TMMINTRIN_H_INCLUDED +#define _SMMINTRIN_H_INCLUDED +#define _NMMINTRIN_H_INCLUDED +#define _WMMINTRIN_H_INCLUDED +#define _IMMINTRIN_H_INCLUDED +#define _ZMMINTRIN_H_INCLUDED +#define _AMMINTRIN_H_INCLUDED +/* Block Windows SDK softintrin */ +#define _SOFTINTRIN_H_ +#define _DISABLE_SOFTINTRIN_ 1 +#endif /* SSE2NEON_ARM64EC */ #include + +/* Windows headers inclusion. + * ARM64EC case is blocked by early check near SSE2NEON_ARM64EC definition. + */ #if SSE2NEON_INCLUDE_WINDOWS_H #include #include #endif +/* Clean up _M_ARM64 (could mislead into pure ARM64 paths). Keep _ARM64EC_. 
*/ +#ifdef _SSE2NEON_DEFINED_ARM64EC_ +#undef _SSE2NEON_DEFINED_ARM64EC_ +#endif +#ifdef _SSE2NEON_DEFINED_M_ARM64 +#undef _M_ARM64 +#undef _SSE2NEON_DEFINED_M_ARM64 +#endif + #if !defined(__cplusplus) -#error sse2neon only supports C++ compilation with this compiler +#error "SSE2NEON only supports C++ compilation with this compiler" #endif #ifdef SSE2NEON_ALLOC_DEFINED #include #endif -#if (defined(_M_AMD64) || defined(__x86_64__)) || \ - (defined(_M_ARM64) || defined(__arm64__)) +/* 64-bit bit scanning available on x64 and AArch64 (including ARM64EC) */ +#if (defined(_M_AMD64) || defined(__x86_64__)) || SSE2NEON_ARCH_AARCH64 #define SSE2NEON_HAS_BITSCAN64 #endif + +#endif /* SSE2NEON_COMPILER_MSVC */ + +/* MinGW uses _aligned_malloc/_aligned_free from */ +#if defined(__MINGW32__) +#include #endif -#if defined(__GNUC__) || defined(__clang__) +/* Statement expression helpers for macro-based intrinsics. + * + * For GCC/Clang: Uses __extension__({}) statement expressions which have + * natural access to all surrounding variables. + * + * For MSVC: Uses immediately-invoked lambdas. The distinction between + * _sse2neon_define0 ([=] capture) and _sse2neon_define1 ([] no capture) + * exists for lambda capture semantics, though in practice both work the + * same since 'imm' parameters are compile-time constants that get + * substituted before the lambda is created. + */ +#if SSE2NEON_COMPILER_GCC_COMPAT #define _sse2neon_define0(type, s, body) \ __extension__({ \ type _a = (s); \ body \ }) -#define _sse2neon_define1(type, s, body) \ - __extension__({ \ - type _a = (s); \ - body \ - }) +#define _sse2neon_define1(type, s, body) _sse2neon_define0(type, s, body) #define _sse2neon_define2(type, a, b, body) \ __extension__({ \ type _a = (a), _b = (b); \ @@ -168,13 +541,10 @@ #define _sse2neon_return(ret) return ret #endif -#define _sse2neon_init(...) \ - { \ - __VA_ARGS__ \ - } +#define _sse2neon_init(...) 
{__VA_ARGS__} /* Compiler barrier */ -#if defined(_MSC_VER) +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG #define SSE2NEON_BARRIER() _ReadWriteBarrier() #else #define SSE2NEON_BARRIER() \ @@ -199,55 +569,62 @@ FORCE_INLINE void _sse2neon_smp_mb(void) #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ !defined(__STDC_NO_ATOMICS__) atomic_thread_fence(memory_order_seq_cst); -#elif defined(__GNUC__) || defined(__clang__) +#elif SSE2NEON_COMPILER_GCC_COMPAT __atomic_thread_fence(__ATOMIC_SEQ_CST); #else /* MSVC */ __dmb(_ARM64_BARRIER_ISH); #endif } -/* Architecture-specific build options */ -/* FIXME: #pragma GCC push_options is only available on GCC */ -#if defined(__GNUC__) -#if defined(__arm__) && __ARM_ARCH == 7 -/* According to ARM C Language Extensions Architecture specification, - * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) - * architecture supported. +/* Architecture-specific build options. + * #pragma GCC push_options/target are GCC-specific; Clang ignores these. + * MSVC on ARM always has NEON/SIMD available. */ +#if SSE2NEON_COMPILER_GCC_COMPAT +#if defined(__arm__) +/* 32-bit ARM: ARMv7-A or ARMv8-A in AArch32 mode */ #if !defined(__ARM_NEON) || !defined(__ARM_NEON__) #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." #endif -#if !defined(__clang__) +#if !SSE2NEON_COMPILER_CLANG #pragma GCC push_options +#if __ARM_ARCH >= 8 +#pragma GCC target("fpu=neon-fp-armv8") +#else #pragma GCC target("fpu=neon") #endif -#elif defined(__aarch64__) || defined(_M_ARM64) -#if !defined(__clang__) && !defined(_MSC_VER) +#endif +#elif SSE2NEON_ARCH_AARCH64 +#if !SSE2NEON_COMPILER_CLANG #pragma GCC push_options #pragma GCC target("+simd") #endif -#elif __ARM_ARCH == 8 -#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) -#error \ - "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON." 
-#endif -#if !defined(__clang__) && !defined(_MSC_VER) -#pragma GCC push_options -#endif #else -#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." +#error "Unsupported target. Must be ARMv7-A+NEON, ARMv8-A, or AArch64." #endif #endif +/* ARM64EC: use arm64_neon.h (arm_neon.h guards with _M_ARM||_M_ARM64) */ +#if SSE2NEON_ARM64EC || defined(_M_ARM64EC) +#include +#else #include -#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8) +#endif + +/* Include ACLE for CRC32 and other intrinsics on ARMv8+ */ +#if SSE2NEON_ARCH_AARCH64 || __ARM_ARCH >= 8 #if defined __has_include && __has_include() #include +#define SSE2NEON_HAS_ACLE 1 +#else +#define SSE2NEON_HAS_ACLE 0 #endif +#else +#define SSE2NEON_HAS_ACLE 0 #endif /* Apple Silicon cache lines are double of what is commonly used by Intel, AMD - * and other Arm microarchtectures use. + * and other Arm microarchitectures use. * From sysctl -a on Apple M1: * hw.cachelinesize: 128 */ @@ -257,41 +634,30 @@ FORCE_INLINE void _sse2neon_smp_mb(void) #define SSE2NEON_CACHELINE_SIZE 64 #endif -/* Rounding functions require either Aarch64 instructions or libm failback */ -#if !defined(__aarch64__) && !defined(_M_ARM64) +/* Rounding functions require either Aarch64 instructions or libm fallback */ +#if !SSE2NEON_ARCH_AARCH64 #include #endif -/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only - * or even not accessible in user mode. - * To write or access to these registers in user mode, - * we have to perform syscall instead. +/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only or + * even not accessible in user mode. + * To write or access to these registers in user mode, we have to perform + * syscall instead. */ -#if (!defined(__aarch64__) && !defined(_M_ARM64)) +#if !SSE2NEON_ARCH_AARCH64 #include #endif /* "__has_builtin" can be used to query support for built-in functions * provided by gcc/clang and other compilers that support it. 
+ * GCC 10+ and Clang 11+ have native __has_builtin support. + * MSVC does not provide these GCC/Clang builtins. */ -#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ -/* Compatibility with gcc <= 9 */ -#if defined(__GNUC__) && (__GNUC__ <= 9) -#define __has_builtin(x) HAS##x -#define HAS__builtin_popcount 1 -#define HAS__builtin_popcountll 1 - -// __builtin_shuffle introduced in GCC 4.7.0 -#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)) -#define HAS__builtin_shuffle 1 -#else -#define HAS__builtin_shuffle 0 -#endif - -#define HAS__builtin_shufflevector 0 -#define HAS__builtin_nontemporal_store 0 -#else +#ifndef __has_builtin +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG #define __has_builtin(x) 0 +#else +#error "Unsupported compiler: __has_builtin not available" #endif #endif @@ -303,8 +669,21 @@ FORCE_INLINE void _sse2neon_smp_mb(void) * argument "a" of mm_shuffle_ps that will be places in fp1 of result. * fp0 is the same for fp0 of result. */ +#ifndef _MM_SHUFFLE #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) +#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_pd(). + * Argument fp1 is a digit[01] that represents the fp from argument "b" + * of mm_shuffle_pd that will be placed in fp1 of result. + * fp0 is a digit[01] that represents the fp from argument "a" of mm_shuffle_pd + * that will be placed in fp0 of result. + */ +#ifndef _MM_SHUFFLE2 +#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0)) +#endif #if __has_builtin(__builtin_shufflevector) #define _sse2neon_shuffle(type, a, b, ...) 
\ @@ -334,27 +713,215 @@ FORCE_INLINE void _sse2neon_smp_mb(void) #define _MM_FROUND_CUR_DIRECTION 0x04 #define _MM_FROUND_NO_EXC 0x08 #define _MM_FROUND_RAISE_EXC 0x00 +#ifndef _MM_FROUND_NINT #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_FLOOR #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_CEIL #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_TRUNC #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_RINT #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_NEARBYINT #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) +#endif +#ifndef _MM_ROUND_NEAREST #define _MM_ROUND_NEAREST 0x0000 +#endif +#ifndef _MM_ROUND_DOWN #define _MM_ROUND_DOWN 0x2000 +#endif +#ifndef _MM_ROUND_UP #define _MM_ROUND_UP 0x4000 +#endif +#ifndef _MM_ROUND_TOWARD_ZERO #define _MM_ROUND_TOWARD_ZERO 0x6000 -/* Flush zero mode macros. */ +#endif +#ifndef _MM_ROUND_MASK +#define _MM_ROUND_MASK 0x6000 +#endif +/* Flush-to-zero (FTZ) mode macros. + * On x86, FTZ (MXCSR bit 15) flushes denormal outputs to zero. + * On ARM, FPCR/FPSCR bit 24 provides unified FZ+DAZ behavior. + * ARMv7 NEON: Per ARM ARM, Advanced SIMD has "Flush-to-zero mode always + * enabled" - denormals flush regardless of FPSCR.FZ (some impls may vary). + * ARMv8: FPCR.FZ correctly controls denormal handling for NEON ops. + */ +#ifndef _MM_FLUSH_ZERO_MASK #define _MM_FLUSH_ZERO_MASK 0x8000 +#endif +#ifndef _MM_FLUSH_ZERO_ON #define _MM_FLUSH_ZERO_ON 0x8000 +#endif +#ifndef _MM_FLUSH_ZERO_OFF #define _MM_FLUSH_ZERO_OFF 0x0000 -/* Denormals are zeros mode macros. */ +#endif +/* Denormals-are-zero (DAZ) mode macros. + * On x86, DAZ (MXCSR bit 6) treats denormal inputs as zero. 
+ * On ARM, setting DAZ enables the same FPCR/FPSCR bit 24 as FTZ, + * providing unified handling for both input and output denormals. + */ +#ifndef _MM_DENORMALS_ZERO_MASK #define _MM_DENORMALS_ZERO_MASK 0x0040 +#endif +#ifndef _MM_DENORMALS_ZERO_ON #define _MM_DENORMALS_ZERO_ON 0x0040 +#endif +#ifndef _MM_DENORMALS_ZERO_OFF #define _MM_DENORMALS_ZERO_OFF 0x0000 +#endif -/* indicate immediate constant argument in a given range */ -#define __constrange(a, b) const +/* MXCSR Exception Flags - NOT EMULATED + * + * SSE provides floating-point exception flags in the MXCSR register (bits 0-5) + * that are NOT emulated on ARM NEON. Code relying on _mm_getcsr() to detect + * floating-point exceptions will silently fail to detect them. + * + * MXCSR Exception Flag Layout (x86): + * Bit 0 (IE): Invalid Operation Exception - NOT EMULATED + * Bit 1 (DE): Denormal Exception - NOT EMULATED + * Bit 2 (ZE): Divide-by-Zero Exception - NOT EMULATED + * Bit 3 (OE): Overflow Exception - NOT EMULATED + * Bit 4 (UE): Underflow Exception - NOT EMULATED + * Bit 5 (PE): Precision Exception - NOT EMULATED + * + * MXCSR Exception Mask Layout (x86): + * Bits 7-12: Exception masks (mask = suppress exception) - NOT EMULATED + * + * Why Not Emulated: + * - ARM NEON does not set sticky exception flags like x86 SSE + * - ARM FPSR (Floating-Point Status Register) has different semantics + * - Emulating per-operation exception tracking would require wrapping every + * floating-point intrinsic with software checks, severely impacting + * performance + * - Thread-local exception state tracking would add significant complexity + * + * Impact: + * - Scientific computing code checking for overflow/underflow will miss events + * - Financial applications validating precision will not detect precision loss + * - Numerical code checking for invalid operations (NaN generation) won't + * detect them + * + * Workarounds: + * - Use explicit NaN/Inf checks after critical operations: isnan(), isinf() + * - 
Implement application-level range validation for overflow detection + * - Use higher precision arithmetic where precision loss is critical + * + * The macros below are defined for API compatibility but provide no + * functionality. + */ + +/* Exception flag macros (MXCSR bits 0-5) - defined for API compatibility only + */ +#ifndef _MM_EXCEPT_INVALID +#define _MM_EXCEPT_INVALID 0x0001 +#endif +#ifndef _MM_EXCEPT_DENORM +#define _MM_EXCEPT_DENORM 0x0002 +#endif +#ifndef _MM_EXCEPT_DIV_ZERO +#define _MM_EXCEPT_DIV_ZERO 0x0004 +#endif +#ifndef _MM_EXCEPT_OVERFLOW +#define _MM_EXCEPT_OVERFLOW 0x0008 +#endif +#ifndef _MM_EXCEPT_UNDERFLOW +#define _MM_EXCEPT_UNDERFLOW 0x0010 +#endif +#ifndef _MM_EXCEPT_INEXACT +#define _MM_EXCEPT_INEXACT 0x0020 +#endif +#ifndef _MM_EXCEPT_MASK +#define _MM_EXCEPT_MASK \ + (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | _MM_EXCEPT_DIV_ZERO | \ + _MM_EXCEPT_OVERFLOW | _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT) +#endif + +/* Exception mask macros (MXCSR bits 7-12) - defined for API compatibility only + */ +#ifndef _MM_MASK_INVALID +#define _MM_MASK_INVALID 0x0080 +#endif +#ifndef _MM_MASK_DENORM +#define _MM_MASK_DENORM 0x0100 +#endif +#ifndef _MM_MASK_DIV_ZERO +#define _MM_MASK_DIV_ZERO 0x0200 +#endif +#ifndef _MM_MASK_OVERFLOW +#define _MM_MASK_OVERFLOW 0x0400 +#endif +#ifndef _MM_MASK_UNDERFLOW +#define _MM_MASK_UNDERFLOW 0x0800 +#endif +#ifndef _MM_MASK_INEXACT +#define _MM_MASK_INEXACT 0x1000 +#endif +#ifndef _MM_MASK_MASK +#define _MM_MASK_MASK \ + (_MM_MASK_INVALID | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO | \ + _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT) +#endif + +/* Exception state accessor macros - silent stubs for API compatibility. + * These macros exist for API compatibility but provide NO functionality. + * On ARM, exception flags are never set by sse2neon intrinsics. 
+ * + * _MM_GET_EXCEPTION_STATE() - Always returns 0 (no exceptions detected) + * _MM_SET_EXCEPTION_STATE() - Silently ignored (cannot clear nonexistent flags) + * _MM_GET_EXCEPTION_MASK() - Always returns all-masked (0x1F80) + * _MM_SET_EXCEPTION_MASK() - Silently ignored (no effect on ARM) + */ +#ifndef _MM_GET_EXCEPTION_STATE +#define _MM_GET_EXCEPTION_STATE() (0) +#endif +#ifndef _MM_SET_EXCEPTION_STATE +#define _MM_SET_EXCEPTION_STATE(x) ((void) (x)) +#endif +#ifndef _MM_GET_EXCEPTION_MASK +#define _MM_GET_EXCEPTION_MASK() (_MM_MASK_MASK) +#endif +#ifndef _MM_SET_EXCEPTION_MASK +#define _MM_SET_EXCEPTION_MASK(x) ((void) (x)) +#endif + +/* Compile-time validation for immediate constant arguments. + * This macro validates that: + * 1. The argument is a compile-time constant (via __builtin_constant_p) + * 2. The argument is within the specified range [min, max] + * + * When validation fails, __builtin_unreachable() is called to trigger + * compiler diagnostics. This pattern follows SIMDe's approach but adapted + * for use within macro bodies rather than as function attributes. + * + * Usage: Place at the beginning of macro bodies that require immediate + * constant arguments. The macro expands to a statement, so use a semicolon: + * SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); + */ +#if defined(__has_builtin) +#if __has_builtin(__builtin_constant_p) && __has_builtin(__builtin_unreachable) +#define SSE2NEON_REQUIRE_CONST_RANGE(arg, min, max) \ + (void) ((__builtin_constant_p(arg) && ((arg) < (min) || (arg) > (max))) \ + ? 
(__builtin_unreachable(), 0) \ + : 0) +#endif +#endif +#if !defined(SSE2NEON_REQUIRE_CONST_RANGE) +/* Fallback: no compile-time validation */ +#define SSE2NEON_REQUIRE_CONST_RANGE(arg, min, max) ((void) 0) +#endif + +/* Allow users to disable constant validation if needed for testing */ +#ifdef SSE2NEON_DISABLE_CONSTANT_VALIDATION +#undef SSE2NEON_REQUIRE_CONST_RANGE +#define SSE2NEON_REQUIRE_CONST_RANGE(arg, min, max) ((void) 0) +#endif /* A few intrinsics accept traditional data types like ints or floats, but * most operate on data types that are specific to SSE. @@ -367,13 +934,18 @@ typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ // On ARM 32-bit architecture, the float64x2_t is not supported. // The data type __m128d should be represented in a different way for related // intrinsic conversion. -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ #else typedef float32x4_t __m128d; #endif typedef int64x2_t __m128i; /* 128-bit vector containing integers */ +// Some intrinsics operate on unaligned data types. 
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t; +typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t; +typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t; + // __int64 is defined in the Intrinsics Guide which maps to different datatype // in different data model #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) @@ -463,7 +1035,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */ #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) @@ -532,9 +1104,127 @@ typedef union ALIGN_STRUCT(16) SIMDVec { } SIMDVec; // casting using SIMDVec -#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) -#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) -#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) +#define vreinterpretq_nth_u64_m128i(x, n) \ + (_sse2neon_reinterpret_cast(SIMDVec *, &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) \ + (_sse2neon_reinterpret_cast(SIMDVec *, &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) \ + (_sse2neon_reinterpret_cast(SIMDVec *, &x)->m128_u8[n]) + +/* Portable infinity check using IEEE 754 bit representation. + * Infinity has all exponent bits set and zero mantissa bits. + * This avoids dependency on math.h INFINITY macro or compiler builtins. + */ +FORCE_INLINE int _sse2neon_isinf_f32(float v) +{ + union { + float f; + uint32_t u; + } u = {v}; + /* Mask out sign bit, check if remaining bits equal infinity pattern */ + return (u.u & 0x7FFFFFFF) == 0x7F800000; +} + +FORCE_INLINE int _sse2neon_isinf_f64(double v) +{ + union { + double d; + uint64_t u; + } u = {v}; + return (u.u & 0x7FFFFFFFFFFFFFFFULL) == 0x7FF0000000000000ULL; +} + +/* Safe helper to load double[2] as float32x4_t without strict aliasing + * violation. 
Used in ARMv7 fallback paths where float64x2_t is not natively + * supported. + */ +FORCE_INLINE float32x4_t sse2neon_vld1q_f32_from_f64pair(const double *p) +{ + float32x4_t tmp; + memcpy(&tmp, p, sizeof(tmp)); + return tmp; +} + +/* Safe float/double to integer conversion with x86 SSE semantics. + * x86 SSE returns the "integer indefinite" value (0x80000000 for int32, + * 0x8000000000000000 for int64) for all out-of-range conversions including + * NaN, infinity, and values exceeding the representable range. + * ARM NEON differs by saturating to INT_MAX/INT_MIN for overflows and + * returning 0 for NaN, so we need these helpers to ensure x86 compatibility. + */ +FORCE_INLINE int32_t _sse2neon_cvtd_s32(double v) +{ + /* Check for NaN or infinity first */ + if (v != v || _sse2neon_isinf_f64(v)) + return INT32_MIN; + /* INT32_MAX is exactly representable as double (2147483647.0) */ + if (v >= _sse2neon_static_cast(double, INT32_MAX) + 1.0) + return INT32_MIN; + if (v < _sse2neon_static_cast(double, INT32_MIN)) + return INT32_MIN; + return _sse2neon_static_cast(int32_t, v); +} + +FORCE_INLINE int32_t _sse2neon_cvtf_s32(float v) +{ + if (v != v || _sse2neon_isinf_f32(v)) + return INT32_MIN; + /* (float)INT32_MAX rounds up to 2147483648.0f, which is out of range. + * Use the double representation for accurate comparison. + */ + if (v >= _sse2neon_static_cast(double, INT32_MAX) + 1.0) + return INT32_MIN; + if (v < _sse2neon_static_cast(double, INT32_MIN)) + return INT32_MIN; + return _sse2neon_static_cast(int32_t, v); +} + +FORCE_INLINE int64_t _sse2neon_cvtd_s64(double v) +{ + if (v != v || _sse2neon_isinf_f64(v)) + return INT64_MIN; + /* (double)INT64_MAX rounds up to 2^63 which is out of range. + * Any double >= 2^63 is out of range for int64. 
+ */ + if (v >= _sse2neon_static_cast(double, INT64_MAX)) + return INT64_MIN; + if (v < _sse2neon_static_cast(double, INT64_MIN)) + return INT64_MIN; + return _sse2neon_static_cast(int64_t, v); +} + +FORCE_INLINE int64_t _sse2neon_cvtf_s64(float v) +{ + if (v != v || _sse2neon_isinf_f32(v)) + return INT64_MIN; + /* (float)INT64_MAX rounds up significantly beyond INT64_MAX */ + if (v >= _sse2neon_static_cast(float, INT64_MAX)) + return INT64_MIN; + if (v < _sse2neon_static_cast(float, INT64_MIN)) + return INT64_MIN; + return _sse2neon_static_cast(int64_t, v); +} + +/* Vectorized helper: apply x86 saturation semantics to NEON conversion result. + * ARM returns 0 for NaN and INT32_MAX for positive overflow, but x86 returns + * INT32_MIN ("integer indefinite") for both. This function fixes up the result. + */ +FORCE_INLINE int32x4_t _sse2neon_cvtps_epi32_fixup(float32x4_t f, int32x4_t cvt) +{ + /* Detect values >= 2147483648.0f (out of INT32 range) */ + float32x4_t max_f = vdupq_n_f32(2147483648.0f); + uint32x4_t overflow = vcgeq_f32(f, max_f); + + /* Detect NaN: x != x for NaN values */ + uint32x4_t is_nan = vmvnq_u32(vceqq_f32(f, f)); + + /* Combine: any overflow or NaN should produce INT32_MIN */ + uint32x4_t need_indefinite = vorrq_u32(overflow, is_nan); + + /* Blend: select INT32_MIN where needed */ + int32x4_t indefinite = vdupq_n_s32(INT32_MIN); + return vbslq_s32(need_indefinite, indefinite, cvt); +} /* SSE macros */ #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode @@ -544,7 +1234,9 @@ typedef union ALIGN_STRUCT(16) SIMDVec { // Function declaration // SSE -FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(); +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void); +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int); FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); FORCE_INLINE __m128 _mm_set_ps1(float); @@ -560,7 
+1252,7 @@ FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); FORCE_INLINE __m128d _mm_set_pd(double, double); FORCE_INLINE __m128i _mm_set1_epi32(int); -FORCE_INLINE __m128i _mm_setzero_si128(); +FORCE_INLINE __m128i _mm_setzero_si128(void); // SSE4.1 FORCE_INLINE __m128d _mm_ceil_pd(__m128d); FORCE_INLINE __m128 _mm_ceil_ps(__m128); @@ -574,10 +1266,9 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); /* Backwards compatibility for compilers with lack of specific type support */ // Older gcc does not define vld1q_u8_x4 type -#if defined(__GNUC__) && !defined(__clang__) && \ - ((__GNUC__ <= 12 && defined(__arm__)) || \ - (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ - (__GNUC__ <= 9 && defined(__aarch64__))) +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ <= 13 && defined(__arm__)) || \ + (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__))) FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { uint8x16x4_t ret; @@ -595,7 +1286,39 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) } #endif -#if !defined(__aarch64__) && !defined(_M_ARM64) +/* Wrapper for vcreate_u64 to handle Apple iOS toolchain variations. + * On iOS, vcreate_u64 may be defined as a macro in arm_neon.h, which can + * cause parsing issues in complex macro expansions. + * This wrapper provides a function-call interface using vdup_n_u64(), which + * is bit-exact and avoids macro expansion pitfalls. + * + * Other AArch64 platforms (Linux, macOS, Android) use native vcreate_u64. + * + * User override: Define SSE2NEON_IOS_COMPAT=1 to enable, + * or SSE2NEON_IOS_COMPAT=0 to disable. 
+ */ +#if defined(__APPLE__) && SSE2NEON_ARCH_AARCH64 +#include +#endif + +#ifndef SSE2NEON_IOS_COMPAT +#if defined(__APPLE__) && SSE2NEON_ARCH_AARCH64 && TARGET_OS_IOS +#define SSE2NEON_IOS_COMPAT 1 +#else +#define SSE2NEON_IOS_COMPAT 0 +#endif +#endif + +#if SSE2NEON_IOS_COMPAT +FORCE_INLINE uint64x1_t _sse2neon_vcreate_u64(uint64_t a) +{ + return vdup_n_u64(a); +} +#else +#define _sse2neon_vcreate_u64(a) vcreate_u64(a) +#endif + +#if !SSE2NEON_ARCH_AARCH64 /* emulate vaddv u8 variant */ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) { @@ -610,7 +1333,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) } #endif -#if !defined(__aarch64__) && !defined(_M_ARM64) +#if !SSE2NEON_ARCH_AARCH64 /* emulate vaddvq u8 variant */ FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) { @@ -628,7 +1351,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) } #endif -#if !defined(__aarch64__) && !defined(_M_ARM64) +#if !SSE2NEON_ARCH_AARCH64 /* emulate vaddvq u16 variant */ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) { @@ -636,7 +1359,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) uint64x2_t n = vpaddlq_u32(m); uint64x1_t o = vget_low_u64(n) + vget_high_u64(n); - return vget_lane_u32((uint32x2_t) o, 0); + return vget_lane_u32(vreinterpret_u32_u64(o), 0); } #else // Wraps vaddvq_u16 @@ -646,6 +1369,33 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) } #endif +/* Fast "any nonzero" check for horizontal reduction in PCMPXSTR operations. + * These helpers are optimized for the "any match" test pattern common in + * string comparison intrinsics. On ARMv7, OR-based reduction is used instead + * of max-based reduction for slightly better performance on some cores. 
+ * + * For NEON comparison results (0x00 or 0xFF per lane), OR-based reduction + * correctly detects any nonzero element because: max(a,b) > 0 IFF OR(a,b) != 0 + */ +#if !SSE2NEON_ARCH_AARCH64 +/* ARMv7: OR-based reduction - 3 ops vs 4 ops for vpmax cascade */ +FORCE_INLINE uint32_t _sse2neon_any_nonzero_u8x16(uint8x16_t v) +{ + uint32x4_t as_u32 = vreinterpretq_u32_u8(v); + uint32x2_t or_half = vorr_u32(vget_low_u32(as_u32), vget_high_u32(as_u32)); + uint32x2_t or_final = vorr_u32(or_half, vrev64_u32(or_half)); + return vget_lane_u32(or_final, 0); +} + +FORCE_INLINE uint32_t _sse2neon_any_nonzero_u16x8(uint16x8_t v) +{ + uint32x4_t as_u32 = vreinterpretq_u32_u16(v); + uint32x2_t or_half = vorr_u32(vget_low_u32(as_u32), vget_high_u32(as_u32)); + uint32x2_t or_final = vorr_u32(or_half, vrev64_u32(or_half)); + return vget_lane_u32(or_final, 0); +} +#endif + /* Function Naming Conventions * The naming convention of SSE intrinsics is straightforward. A generic SSE * intrinsic function is given as follows: @@ -658,7 +1408,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) * This last part, , is a little complicated. It identifies the * content of the input values, and can be set to any of the following values: * + ps - vectors contain floats (ps stands for packed single-precision) - * + pd - vectors cantain doubles (pd stands for packed double-precision) + * + pd - vectors contain doubles (pd stands for packed double-precision) * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit * signed integers * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit @@ -683,6 +1433,13 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) */ /* Constants for use with _mm_prefetch. 
*/ +#if SSE2NEON_ARM64EC +/* winnt.h defines these as macros; undef to allow our enum definition */ +#undef _MM_HINT_NTA +#undef _MM_HINT_T0 +#undef _MM_HINT_T1 +#undef _MM_HINT_T2 +#endif enum _mm_hint { _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ @@ -698,7 +1455,7 @@ typedef struct { uint8_t bit23 : 1; uint8_t bit24 : 1; uint8_t res2 : 7; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 uint32_t res3; #endif } fpcr_bitfield; @@ -795,11 +1552,16 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) { - float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32x2_t a22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + /* vtrn interleaves elements: trn1({a[2],a[3]}, {a[0],a[1]}) = {a[2], a[0]} + */ +#if SSE2NEON_ARCH_AARCH64 + float32x2_t a02 = vtrn1_f32(vget_high_f32(_a), vget_low_f32(_a)); +#else + float32x2_t a02 = vtrn_f32(vget_high_f32(_a), vget_low_f32(_a)).val[0]; +#endif + float32x2_t b32 = vget_high_f32(_b); return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); } @@ -842,15 +1604,15 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) // supported by WoA has crypto extensions. 
If this changes in the future, // this can be verified via the runtime-only method of: // IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) -#if (defined(_M_ARM64) && !defined(__clang__)) || \ - (defined(__ARM_FEATURE_CRYPTO) && \ +#if ((defined(_M_ARM64) || SSE2NEON_ARM64EC) && !defined(__clang__)) || \ + (defined(__ARM_FEATURE_CRYPTO) && \ (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))) // Wraps vmull_p64 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); -#if defined(_MSC_VER) +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG __n64 a1 = {a}, b1 = {b}; return vreinterpretq_u64_p128(vmull_p64(a1, b1)); #else @@ -906,7 +1668,7 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL // instructions. -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint8x16_t lm_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t lm_p1 = vreinterpretq_u8_u64( @@ -934,7 +1696,7 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); // De-interleave -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint8x16_t t0 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t1 = vreinterpretq_u8_u64( @@ -965,11 +1727,11 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) #endif // ARMv7 polyfill // C equivalent: -// __m128i _mm_shuffle_epi32_default(__m128i a, -// __constrange(0, 255) int imm) { +// __m128i _mm_shuffle_epi32_default(__m128i a, const int imm) { +// // imm must be a compile-time constant in range [0, 255] // __m128i ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = 
a[(imm >> 6) & 0x03]; +// ret[0] = a[(imm) & 0x3]; ret[1] = a[((imm) >> 2) & 0x3]; +// ret[2] = a[((imm) >> 4) & 0x03]; ret[3] = a[((imm) >> 6) & 0x03]; // return ret; // } #define _mm_shuffle_epi32_default(a, imm) \ @@ -1067,7 +1829,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); } -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 #define _mm_shuffle_epi32_splat(a, imm) \ vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))) #else @@ -1081,11 +1843,11 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) // control in imm8, and store the results in dst. // // C equivalent: -// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, -// __constrange(0, 255) int imm) { +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, const int imm) { +// // imm must be a compile-time constant in range [0, 255] // __m128 ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// ret[0] = a[(imm) & 0x3]; ret[1] = a[((imm) >> 2) & 0x3]; +// ret[2] = b[((imm) >> 4) & 0x03]; ret[3] = b[((imm) >> 6) & 0x03]; // return ret; // } // @@ -1105,7 +1867,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) // Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. // Store the results in the low 64 bits of dst, with the high 64 bits being -// copied from from a to dst. +// copied from a to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16 #define _mm_shufflelo_epi16_function(a, imm) \ _sse2neon_define1( \ @@ -1122,7 +1884,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) // Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. // Store the results in the high 64 bits of dst, with the low 64 bits being -// copied from from a to dst. +// copied from a to dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16 #define _mm_shufflehi_epi16_function(a, imm) \ _sse2neon_define1( \ @@ -1507,8 +2269,7 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) { -#if (defined(__aarch64__) || defined(_M_ARM64)) || \ - defined(__ARM_FEATURE_DIRECTED_ROUNDING) +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpret_m64_s32( vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); #else @@ -1523,8 +2284,8 @@ FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) { - return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); + return vreinterpretq_m128_f32(vsetq_lane_f32( + _sse2neon_static_cast(float, b), vreinterpretq_f32_m128(a), 0)); } // Convert the lower single-precision (32-bit) floating-point element in a to a @@ -1532,14 +2293,13 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si FORCE_INLINE int _mm_cvt_ss2si(__m128 a) { -#if (defined(__aarch64__) || defined(_M_ARM64)) || \ - defined(__ARM_FEATURE_DIRECTED_ROUNDING) +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), 0); #else float32_t data = vgetq_lane_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); - return (int32_t) data; + return _sse2neon_static_cast(int32_t, data); #endif } @@ -1642,8 +2402,8 @@ FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss FORCE_INLINE __m128 
_mm_cvtsi64_ss(__m128 a, int64_t b) { - return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); + return vreinterpretq_m128_f32(vsetq_lane_f32( + _sse2neon_static_cast(float, b), vreinterpretq_f32_m128(a), 0)); } // Copy the lower single-precision (32-bit) floating-point element of a to dst. @@ -1663,13 +2423,13 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) { -#if (defined(__aarch64__) || defined(_M_ARM64)) || \ - defined(__ARM_FEATURE_DIRECTED_ROUNDING) - return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return _sse2neon_static_cast( + int64_t, vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0)); #else float32_t data = vgetq_lane_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); - return (int64_t) data; + return _sse2neon_static_cast(int64_t, data); #endif } @@ -1678,16 +2438,19 @@ FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) { - return vreinterpret_m64_s32( - vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); + float32x4_t f = vreinterpretq_f32_m128(a); + int32x4_t cvt = vcvtq_s32_f32(f); + int32x4_t result = _sse2neon_cvtps_epi32_fixup(f, cvt); + return vreinterpret_m64_s32(vget_low_s32(result)); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. +// x86 returns INT32_MIN for NaN and out-of-range values. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) { - return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); + return _sse2neon_cvtf_s32(vgetq_lane_f32(vreinterpretq_f32_m128(a), 0)); } // Convert packed single-precision (32-bit) floating-point elements in a to @@ -1702,29 +2465,35 @@ FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. +// x86 returns INT64_MIN for NaN and out-of-range values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) { - return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + return _sse2neon_cvtf_s64(vgetq_lane_f32(vreinterpretq_f32_m128(a), 0)); } // Divide packed single-precision (32-bit) floating-point elements in a by // packed elements in b, and store the results in dst. // Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement // division by multiplying a by b's reciprocal before using the Newton-Raphson -// method to approximate the results. +// method to approximate the results. Use SSE2NEON_PRECISE_DIV for improved +// precision on ARMv7. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else - float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); - recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); - // Additional Netwon-Raphson iteration for accuracy - recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); - return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + float32x4_t recip = vrecpeq_f32(_b); + recip = vmulq_f32(recip, vrecpsq_f32(recip, _b)); +#if SSE2NEON_PRECISE_DIV + // Additional Newton-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, _b)); +#endif + return vreinterpretq_m128_f32(vmulq_f32(_a, recip)); #endif } @@ -1746,22 +2515,32 @@ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16 -#define _mm_extract_pi16(a, imm) \ - (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) +// imm must be a compile-time constant in range [0, 3] +#define _mm_extract_pi16(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \ + _sse2neon_static_cast(int32_t, \ + vget_lane_u16(vreinterpret_u16_m64(a), (imm)))) // Free aligned memory that was allocated with _mm_malloc. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free +// +// WARNING: Only use on pointers from _mm_malloc(). On Windows, passing memory +// from malloc/calloc/new corrupts the heap. See _mm_malloc() for details. 
#if !defined(SSE2NEON_ALLOC_DEFINED) FORCE_INLINE void _mm_free(void *addr) { +#if defined(_WIN32) + _aligned_free(addr); +#else free(addr); +#endif } #endif -FORCE_INLINE uint64_t _sse2neon_get_fpcr() +FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) { uint64_t value; -#if defined(_MSC_VER) +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG value = _ReadStatusReg(ARM64_FPCR); #else __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ @@ -1771,10 +2550,10 @@ FORCE_INLINE uint64_t _sse2neon_get_fpcr() FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) { -#if defined(_MSC_VER) +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG _WriteStatusReg(ARM64_FPCR, value); #else - __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ + __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ #endif } @@ -1782,18 +2561,18 @@ FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or // _MM_FLUSH_ZERO_OFF // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE -FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode() +FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) { union { fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ @@ -1806,35 +2585,33 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode() // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE -FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE() +FORCE_INLINE unsigned int 
_MM_GET_ROUNDING_MODE(void) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2neon_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - if (r.field.bit22) { - return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; - } else { - return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; + const int mask = FE_TONEAREST | FE_DOWNWARD | FE_UPWARD | FE_TOWARDZERO; + switch (fegetround() & mask) { + case FE_TONEAREST: + return _MM_ROUND_NEAREST; + case FE_DOWNWARD: + return _MM_ROUND_DOWN; + case FE_UPWARD: + return _MM_ROUND_UP; + case FE_TOWARDZERO: + return _MM_ROUND_TOWARD_ZERO; + default: + // fegetround() must return _MM_ROUND_NEAREST, _MM_ROUND_DOWN, + // _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO on success. all the other error + // cases we treat them as FE_TOWARDZERO (truncate). + return _MM_ROUND_TOWARD_ZERO; } } // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16 -#define _mm_insert_pi16(a, b, imm) \ - vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))) +// imm must be a compile-time constant in range [0, 3] +#define _mm_insert_pi16(a, b, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \ + vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))) // Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point // elements) from memory into dst. 
mem_addr must be aligned on a 16-byte @@ -1879,8 +2656,9 @@ FORCE_INLINE __m128 _mm_load1_ps(const float *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) { - return vreinterpretq_m128_f32( - vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); + return vreinterpretq_m128_f32(vcombine_f32( + vget_low_f32(a), + vld1_f32(_sse2neon_reinterpret_cast(const float32_t *, p)))); } // Load 2 single-precision (32-bit) floating-point elements from memory into the @@ -1890,7 +2668,8 @@ FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( - vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); + vcombine_f32(vld1_f32(_sse2neon_reinterpret_cast(const float32_t *, p)), + vget_high_f32(a))); } // Load 4 single-precision (32-bit) floating-point elements from memory into dst @@ -1918,25 +2697,45 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 FORCE_INLINE __m128i _mm_loadu_si16(const void *p) { - return vreinterpretq_m128i_s16( - vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); + return vreinterpretq_m128i_s16(vsetq_lane_s16( + *_sse2neon_reinterpret_cast(const unaligned_int16_t *, p), + vdupq_n_s16(0), 0)); } // Load unaligned 64-bit integer from memory into the first element of dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 FORCE_INLINE __m128i _mm_loadu_si64(const void *p) { - return vreinterpretq_m128i_s64( - vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); + return vreinterpretq_m128i_s64(vsetq_lane_s64( + *_sse2neon_reinterpret_cast(const unaligned_int64_t *, p), + vdupq_n_s64(0), 0)); } // Allocate size bytes of memory, aligned to the alignment specified in align, // and return a pointer to the allocated memory. _mm_free should be used to free // memory that is allocated with _mm_malloc. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +// +// Memory allocated by this function MUST be freed with _mm_free(), NOT with +// standard free() or delete. Mixing allocators: +// - Windows: CORRUPTS HEAP (free on _aligned_malloc memory is invalid) +// - Other platforms: Works (maps to free), but pair for Windows portability +// +// Incorrect usage (causes memory corruption on Windows): +// void *ptr = _mm_malloc(1024, 16); +// free(ptr); // WRONG - use _mm_free() instead +// +// Implementation notes: +// - Windows: Uses _aligned_malloc() +// - Other platforms: Uses posix_memalign() or malloc() for small alignments +// +// See also: _mm_free() for deallocation requirements. 
#if !defined(SSE2NEON_ALLOC_DEFINED) FORCE_INLINE void *_mm_malloc(size_t size, size_t align) { +#if defined(_WIN32) + return _aligned_malloc(size, align); +#else void *ptr; if (align == 1) return malloc(size); @@ -1945,6 +2744,7 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align) if (!posix_memalign(&ptr, align, size)) return ptr; return NULL; +#endif } #endif @@ -1955,11 +2755,11 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align) FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) { int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); - __m128 b = _mm_load_ps((const float *) mem_addr); + __m128 b = _mm_load_ps(_sse2neon_reinterpret_cast(const float *, mem_addr)); int8x8_t masked = vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); - vst1_s8((int8_t *) mem_addr, masked); + vst1_s8(_sse2neon_reinterpret_cast(int8_t *, mem_addr), masked); } // Conditionally store 8-bit integer elements from a into memory using mask @@ -2081,7 +2881,7 @@ FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) { -#if defined(aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_u64( vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a))); #else @@ -2108,18 +2908,18 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) FORCE_INLINE int _mm_movemask_pi8(__m64 a) { uint8x8_t input = vreinterpret_u8_m64(a); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7}; uint8x8_t tmp = vshr_n_u8(input, 7); return vaddv_u8(vshl_u8(tmp, vld1_s8(shift))); #else - // Refer the implementation of `_mm_movemask_epi8` - uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7)); - uint32x2_t paired16 = - 
vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7)); - uint8x8_t paired32 = - vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14)); - return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4); + // Note: Uses the same method as _mm_movemask_epi8. + uint8x8_t msbs = vshr_n_u8(input, 7); + uint32x2_t bits = vreinterpret_u32_u8(msbs); + bits = vsra_n_u32(bits, bits, 7); + bits = vsra_n_u32(bits, bits, 14); + uint8x8_t output = vreinterpret_u8_u32(bits); + return (vget_lane_u8(output, 4) << 4) | vget_lane_u8(output, 0); #endif } @@ -2129,20 +2929,18 @@ FORCE_INLINE int _mm_movemask_pi8(__m64 a) FORCE_INLINE int _mm_movemask_ps(__m128 a) { uint32x4_t input = vreinterpretq_u32_m128(a); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 static const int32_t shift[4] = {0, 1, 2, 3}; uint32x4_t tmp = vshrq_n_u32(input, 31); - return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift))); + return _sse2neon_static_cast(int, + vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)))); #else - // Uses the exact same method as _mm_movemask_epi8, see that for details. - // Shift out everything but the sign bits with a 32-bit unsigned shift - // right. - uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); - // Merge the two pairs together with a 64-bit unsigned shift right + add. - uint8x16_t paired = - vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); - // Extract the result. - return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); + // Note: Uses the same method as _mm_movemask_epi8. 
+ uint32x4_t msbs = vshrq_n_u32(input, 31); + uint64x2_t bits = vreinterpretq_u64_u32(msbs); + bits = vsraq_n_u64(bits, bits, 31); + uint8x16_t output = vreinterpretq_u8_u64(bits); + return (vgetq_lane_u8(output, 8) << 2) | vgetq_lane_u8(output, 0); #endif } @@ -2240,7 +3038,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) FORCE_INLINE void _mm_prefetch(char const *p, int i) { (void) i; -#if defined(_MSC_VER) +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG switch (i) { case _MM_HINT_NTA: __prefetch2(p, 1); @@ -2291,8 +3089,13 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) { - float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); - recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); + float32x4_t _in = vreinterpretq_f32_m128(in); + float32x4_t recip = vrecpeq_f32(_in); + recip = vmulq_f32(recip, vrecpsq_f32(recip, _in)); +#if SSE2NEON_PRECISE_DIV + // Additional Newton-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, _in)); +#endif return vreinterpretq_m128_f32(recip); } @@ -2312,7 +3115,8 @@ FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) { - float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + float32x4_t _in = vreinterpretq_f32_m128(in); + float32x4_t out = vrsqrteq_f32(_in); // Generate masks for detecting whether input has any 0.0f/-0.0f // (which becomes positive/negative infinity by IEEE-754 arithmetic rules). 
@@ -2323,13 +3127,16 @@ FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) const uint32x4_t has_neg_zero = vceqq_u32(neg_inf, vreinterpretq_u32_f32(out)); - out = vmulq_f32( - out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); + out = vmulq_f32(out, vrsqrtsq_f32(vmulq_f32(_in, out), out)); +#if SSE2NEON_PRECISE_SQRT + // Additional Newton-Raphson iteration for accuracy + out = vmulq_f32(out, vrsqrtsq_f32(vmulq_f32(_in, out), out)); +#endif // Set output vector element to infinity/negative-infinity if // the corresponding input vector element is 0.0f/-0.0f. - out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out); - out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out); + out = vbslq_f32(has_pos_zero, vreinterpretq_f32_u32(pos_inf), out); + out = vbslq_f32(has_neg_zero, vreinterpretq_f32_u32(neg_inf), out); return vreinterpretq_m128_f32(out); } @@ -2354,7 +3161,8 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) uint64x1_t t = vpaddl_u32(vpaddl_u16( vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); return vreinterpret_m64_u16( - vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0)); + vset_lane_u16(_sse2neon_static_cast(uint16_t, vget_lane_u64(t, 0)), + vdup_n_u16(0), 0)); } // Macro: Set the flush zero bits of the MXCSR control and status register to @@ -2367,14 +3175,14 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) // regardless of the value of the FZ bit. 
union { fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ @@ -2382,10 +3190,10 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 _sse2neon_set_fpcr(r.value); #else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } @@ -2413,44 +3221,26 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2neon_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - switch (rounding) { - case _MM_ROUND_TOWARD_ZERO: - r.field.bit22 = 1; - r.field.bit23 = 1; + case _MM_ROUND_NEAREST: + rounding = FE_TONEAREST; break; case _MM_ROUND_DOWN: - r.field.bit22 = 0; - r.field.bit23 = 1; + rounding = FE_DOWNWARD; break; case _MM_ROUND_UP: - r.field.bit22 = 1; - r.field.bit23 = 0; + rounding = FE_UPWARD; break; - default: //_MM_ROUND_NEAREST - r.field.bit22 = 0; - r.field.bit23 = 0; + case _MM_ROUND_TOWARD_ZERO: + rounding = FE_TOWARDZERO; + break; + default: + // rounding must be _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, + // _MM_ROUND_TOWARD_ZERO. all the other invalid values we treat them as + // FE_TOWARDZERO (truncate). 
+ rounding = FE_TOWARDZERO; } - -#if defined(__aarch64__) || defined(_M_ARM64) - _sse2neon_set_fpcr(r.value); -#else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ -#endif + fesetround(rounding); } // Copy single-precision (32-bit) floating-point element a to the lower element @@ -2472,18 +3262,53 @@ FORCE_INLINE __m128 _mm_set1_ps(float _w) // Set the MXCSR control and status register with the value in unsigned 32-bit // integer a. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr -// FIXME: _mm_setcsr() implementation supports changing the rounding mode only. +// +// Supported MXCSR fields: +// - Bits 13-14: Rounding mode (RM) - SUPPORTED via ARM FPCR/FPSCR +// - Bit 15 (FZ): Flush-to-zero mode - SUPPORTED via ARM FPCR/FPSCR bit 24 +// - Bit 6 (DAZ): Denormals-are-zero mode - SUPPORTED (unified with FZ on ARM) +// +// Unsupported MXCSR fields (silently ignored): +// - Bits 0-5: Exception flags (IE, DE, ZE, OE, UE, PE) - NOT EMULATED +// - Bits 7-12: Exception masks - NOT EMULATED +// See "MXCSR Exception Flags - NOT EMULATED" documentation block for details. +// +// ARM Platform Behavior: +// - ARM FPCR/FPSCR bit 24 provides unified FZ+DAZ behavior. Setting either +// _MM_FLUSH_ZERO_ON or _MM_DENORMALS_ZERO_ON enables the same ARM bit. +// - ARMv7 NEON: "Flush-to-zero mode always enabled" per ARM ARM (impl may vary) +// - ARMv8: FPCR.FZ correctly controls denormal handling for NEON operations FORCE_INLINE void _mm_setcsr(unsigned int a) { - _MM_SET_ROUNDING_MODE(a); + _MM_SET_ROUNDING_MODE(a & _MM_ROUND_MASK); + // ARM FPCR.bit24 handles both FZ and DAZ - set if either is requested + _MM_SET_FLUSH_ZERO_MODE( + (a & _MM_FLUSH_ZERO_MASK) | + ((a & _MM_DENORMALS_ZERO_MASK) ? _MM_FLUSH_ZERO_ON : 0)); } // Get the unsigned 32-bit value of the MXCSR control and status register. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr -// FIXME: _mm_getcsr() implementation supports reading the rounding mode only. -FORCE_INLINE unsigned int _mm_getcsr() +// +// Returned MXCSR fields: +// - Bits 13-14: Rounding mode (RM) - Reflects current ARM FPCR/FPSCR setting +// - Bit 15 (FZ): Flush-to-zero mode - Reflects ARM FPCR/FPSCR bit 24 +// - Bit 6 (DAZ): Denormals-are-zero mode - Mirrors FZ (unified on ARM) +// +// Fields always returned as zero (NOT EMULATED): +// - Bits 0-5: Exception flags - ALWAYS 0 (exceptions not tracked) +// - Bits 7-12: Exception masks - ALWAYS 0 (use _MM_GET_EXCEPTION_MASK() +// instead) See "MXCSR Exception Flags - NOT EMULATED" documentation block for +// details. +// +// ARM Platform Behavior: +// - When ARM FPCR/FPSCR bit 24 is enabled, both FZ and DAZ bits are reported +// as set (the original setting cannot be distinguished). +// - ARMv7 NEON: Returned bits reflect FPSCR, but NEON always flushes denormals +FORCE_INLINE unsigned int _mm_getcsr(void) { - return _MM_GET_ROUNDING_MODE(); + return _MM_GET_ROUNDING_MODE() | _MM_GET_FLUSH_ZERO_MODE() | + _MM_GET_DENORMALS_ZERO_MODE(); } // Set packed single-precision (32-bit) floating-point elements in dst with the @@ -2505,15 +3330,20 @@ FORCE_INLINE __m128 _mm_setzero_ps(void) // Shuffle 16-bit integers in a using the control in imm8, and store the results // in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +// imm must be a compile-time constant in range [0, 255] #ifdef _sse2neon_shuffle -#define _mm_shuffle_pi16(a, imm) \ - vreinterpret_m64_s16(vshuffle_s16( \ - vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ - ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))) +#define _mm_shuffle_pi16(a, imm) \ + __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + vreinterpret_m64_s16( \ + vshuffle_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), \ + ((imm) & 0x3), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3))); \ + }) #else #define _mm_shuffle_pi16(a, imm) \ _sse2neon_define1( \ - __m64, a, int16x4_t ret; \ + __m64, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); int16x4_t ret; \ ret = vmov_n_s16( \ vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \ ret = vset_lane_s16( \ @@ -2559,11 +3389,12 @@ FORCE_INLINE void _mm_lfence(void) _sse2neon_smp_mb(); } -// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) -// int imm) +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int imm) +// imm must be a compile-time constant in range [0, 255] #ifdef _sse2neon_shuffle #define _mm_shuffle_ps(a, b, imm) \ __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ float32x4_t _input1 = vreinterpretq_f32_m128(a); \ float32x4_t _input2 = vreinterpretq_f32_m128(b); \ float32x4_t _shuf = \ @@ -2572,63 +3403,64 @@ FORCE_INLINE void _mm_lfence(void) vreinterpretq_m128_f32(_shuf); \ }) #else // generic -#define _mm_shuffle_ps(a, b, imm) \ - _sse2neon_define2( \ - __m128, a, b, __m128 ret; switch (imm) { \ - case _MM_SHUFFLE(1, 0, 3, 2): \ - ret = _mm_shuffle_ps_1032(_a, _b); \ - break; \ - case _MM_SHUFFLE(2, 3, 0, 1): \ - ret = _mm_shuffle_ps_2301(_a, _b); \ - break; \ - case _MM_SHUFFLE(0, 3, 2, 1): \ - ret = _mm_shuffle_ps_0321(_a, _b); \ - break; \ - case _MM_SHUFFLE(2, 1, 0, 3): 
\ - ret = _mm_shuffle_ps_2103(_a, _b); \ - break; \ - case _MM_SHUFFLE(1, 0, 1, 0): \ - ret = _mm_movelh_ps(_a, _b); \ - break; \ - case _MM_SHUFFLE(1, 0, 0, 1): \ - ret = _mm_shuffle_ps_1001(_a, _b); \ - break; \ - case _MM_SHUFFLE(0, 1, 0, 1): \ - ret = _mm_shuffle_ps_0101(_a, _b); \ - break; \ - case _MM_SHUFFLE(3, 2, 1, 0): \ - ret = _mm_shuffle_ps_3210(_a, _b); \ - break; \ - case _MM_SHUFFLE(0, 0, 1, 1): \ - ret = _mm_shuffle_ps_0011(_a, _b); \ - break; \ - case _MM_SHUFFLE(0, 0, 2, 2): \ - ret = _mm_shuffle_ps_0022(_a, _b); \ - break; \ - case _MM_SHUFFLE(2, 2, 0, 0): \ - ret = _mm_shuffle_ps_2200(_a, _b); \ - break; \ - case _MM_SHUFFLE(3, 2, 0, 2): \ - ret = _mm_shuffle_ps_3202(_a, _b); \ - break; \ - case _MM_SHUFFLE(3, 2, 3, 2): \ - ret = _mm_movehl_ps(_b, _a); \ - break; \ - case _MM_SHUFFLE(1, 1, 3, 3): \ - ret = _mm_shuffle_ps_1133(_a, _b); \ - break; \ - case _MM_SHUFFLE(2, 0, 1, 0): \ - ret = _mm_shuffle_ps_2010(_a, _b); \ - break; \ - case _MM_SHUFFLE(2, 0, 0, 1): \ - ret = _mm_shuffle_ps_2001(_a, _b); \ - break; \ - case _MM_SHUFFLE(2, 0, 3, 2): \ - ret = _mm_shuffle_ps_2032(_a, _b); \ - break; \ - default: \ - ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ - break; \ +#define _mm_shuffle_ps(a, b, imm) \ + _sse2neon_define2( \ + __m128, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128 ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = 
_mm_shuffle_ps_3210(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps(_b, _a); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032(_a, _b); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ + break; \ } _sse2neon_return(ret);) #endif @@ -2640,29 +3472,29 @@ FORCE_INLINE void _mm_lfence(void) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 && !SSE2NEON_PRECISE_SQRT return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); #else - float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + float32x4_t _in = vreinterpretq_f32_m128(in); + float32x4_t recip = vrsqrteq_f32(_in); - // Test for vrsqrteq_f32(0) -> positive infinity case. - // Change to zero, so that s * 1/sqrt(s) result is zero too. + // Test for vrsqrteq_f32(0) -> infinity case (both +Inf and -Inf). 
+ // vrsqrteq_f32(+0) = +Inf, vrsqrteq_f32(-0) = -Inf + // Change recip to zero so that s * 1/sqrt(s) preserves signed zero: + // +0 * 0 = +0, -0 * 0 = -0 (IEEE-754 sign rule) + const uint32x4_t abs_mask = vdupq_n_u32(0x7FFFFFFF); const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); const uint32x4_t div_by_zero = - vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); + vceqq_u32(pos_inf, vandq_u32(abs_mask, vreinterpretq_u32_f32(recip))); recip = vreinterpretq_f32_u32( vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); - recip = vmulq_f32( - vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), - recip); - // Additional Netwon-Raphson iteration for accuracy - recip = vmulq_f32( - vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), - recip); + recip = vmulq_f32(vrsqrtsq_f32(vmulq_f32(recip, recip), _in), recip); + // Additional Newton-Raphson iteration for accuracy + recip = vmulq_f32(vrsqrtsq_f32(vmulq_f32(recip, recip), _in), recip); // sqrt(s) = s * 1/sqrt(s) - return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); + return vreinterpretq_m128_f32(vmulq_f32(_in, recip)); #endif } @@ -2751,31 +3583,43 @@ FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) { - vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); + vst1q_lane_s16(_sse2neon_reinterpret_cast(int16_t *, p), + vreinterpretq_s16_m128i(a), 0); } // Stores 64-bits of integer data a at the address p. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) { - vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); + vst1q_lane_s64(_sse2neon_reinterpret_cast(int64_t *, p), + vreinterpretq_s64_m128i(a), 0); } // Store 64-bits of integer data from a into memory using a non-temporal memory // hint. +// Note: ARM lacks direct non-temporal store for single 64-bit value. STNP +// requires pair stores; __builtin_nontemporal_store may generate regular store +// on AArch64 for sub-128-bit types. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) { - vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1_s64(_sse2neon_reinterpret_cast(int64_t *, p), vreinterpret_s64_m64(a)); +#endif } // Store 128-bits (composed of 4 packed single-precision (32-bit) floating- // point elements) from a into memory using a non-temporal memory hint. +// Note: On AArch64, __builtin_nontemporal_store generates STNP (Store +// Non-temporal Pair), providing true non-temporal hint for 128-bit stores. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) { #if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, (float32x4_t *) p); + __builtin_nontemporal_store(a, + _sse2neon_reinterpret_cast(float32x4_t *, p)); #else vst1q_f32(p, vreinterpretq_f32_m128(a)); #endif @@ -2805,6 +3649,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the // transposed matrix in these vectors (row0 now contains column 0, etc.). 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS +#ifndef _MM_TRANSPOSE4_PS #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ do { \ float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ @@ -2818,6 +3663,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ vget_high_f32(ROW23.val[1])); \ } while (0) +#endif // according to the documentation, these intrinsics behave the same as the // non-'u' versions. We'll just alias them here. @@ -2829,39 +3675,47 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) #define _mm_ucomineq_ss _mm_comineq_ss // Return vector of type __m128i with undefined elements. +// Note: MSVC forces zero-initialization while GCC/Clang return truly undefined +// memory. Use SSE2NEON_UNDEFINED_ZERO=1 to force zero on all compilers. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 FORCE_INLINE __m128i _mm_undefined_si128(void) { -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_UNDEFINED_ZERO || \ + (SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG) + return _mm_setzero_si128(); +#else +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128i a; -#if defined(_MSC_VER) - a = _mm_setzero_si128(); -#endif return a; -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic pop #endif +#endif } // Return vector of type __m128 with undefined elements. +// Note: MSVC forces zero-initialization while GCC/Clang return truly undefined +// memory. Use SSE2NEON_UNDEFINED_ZERO=1 to force zero on all compilers. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps FORCE_INLINE __m128 _mm_undefined_ps(void) { -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_UNDEFINED_ZERO || \ + (SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG) + return _mm_setzero_ps(); +#else +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128 a; -#if defined(_MSC_VER) - a = _mm_setzero_ps(); -#endif return a; -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic pop #endif +#endif } // Unpack and interleave single-precision (32-bit) floating-point elements from @@ -2869,7 +3723,7 @@ FORCE_INLINE __m128 _mm_undefined_ps(void) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else @@ -2885,7 +3739,7 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else @@ -2944,16 +3798,22 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + 
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1] + db[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 + b0; + c[1] = a1 + b1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } @@ -2963,15 +3823,17 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_add_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 + b0; + c[1] = a1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } @@ -3062,8 +3924,8 @@ FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) { - return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), - vreinterpretq_u16_m128i(b)); + return vreinterpretq_m128i_u16( + vrhaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Average packed unsigned 8-bit integers in a and b, and store the results in @@ -3085,6 +3947,16 @@ FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) +/* Cast Intrinsics - Zero-Cost Type Reinterpretation + * + * The _mm_cast* intrinsics reinterpret vector types (__m128, __m128d, __m128i) + * without generating any instructions. These are pure type annotations that + * perform bitwise reinterpretation, NOT value conversion. + * + * Maps to ARM NEON vreinterpret_* / vreinterpretq_* (also zero-cost bitcasts). + * https://developer.arm.com/architectures/instruction-sets/intrinsics/#q=vreinterpret + */ + // Cast vector of type __m128d to type __m128. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps @@ -3122,7 +3994,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); #else return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); @@ -3152,12 +4024,14 @@ FORCE_INLINE void _mm_clflush(void const *p) * compilation is successful. 
*/ #if defined(__APPLE__) - sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE); -#elif defined(__GNUC__) || defined(__clang__) - uintptr_t ptr = (uintptr_t) p; - __builtin___clear_cache((char *) ptr, - (char *) ptr + SSE2NEON_CACHELINE_SIZE); -#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H + sys_icache_invalidate(_sse2neon_const_cast(void *, p), + SSE2NEON_CACHELINE_SIZE); +#elif SSE2NEON_COMPILER_GCC_COMPAT + uintptr_t ptr = _sse2neon_reinterpret_cast(uintptr_t, p); + __builtin___clear_cache( + _sse2neon_reinterpret_cast(char *, ptr), + _sse2neon_reinterpret_cast(char *, ptr) + SSE2NEON_CACHELINE_SIZE); +#elif SSE2NEON_COMPILER_MSVC && SSE2NEON_INCLUDE_WINDOWS_H FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE); #endif } @@ -3194,15 +4068,22 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) - uint32x4_t cmp = - vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); - uint32x4_t swapped = vrev64q_u32(cmp); - return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 == b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 == b1 ? 
~UINT64_C(0) : UINT64_C(0); + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } @@ -3220,17 +4101,21 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 >= b1 ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3242,15 +4127,16 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmpge_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 >= b0 ? 
~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3289,17 +4175,21 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 > b1 ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3311,15 +4201,16 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 > b0 ? 
~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3331,17 +4222,21 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 <= b1 ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3353,15 +4248,16 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmple_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 <= b0 ? 
~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3403,17 +4299,21 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 < b1 ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3425,14 +4325,15 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3444,15 +4345,22 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); #else - // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) - uint32x4_t cmp = - vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); - uint32x4_t swapped = vrev64q_u32(cmp); - return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + 
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 != b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 != b1 ? ~UINT64_C(0) : UINT64_C(0); + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } @@ -3470,20 +4378,22 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64(veorq_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 >= b1) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3503,20 +4413,22 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64(veorq_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 > b1) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3536,20 +4448,22 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64(veorq_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 <= b1) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3569,20 +4483,22 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64(veorq_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3602,7 +4518,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 // Excluding NaNs, any two floating point numbers can be compared. 
uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); @@ -3610,19 +4526,17 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? ~UINT64_C(0) - : UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) - ? ~UINT64_C(0) - : UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3634,17 +4548,15 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmpord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? ~UINT64_C(0) - : UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3656,7 +4568,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 // Two NaNs are not equal in comparison operation. 
uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); @@ -3665,19 +4577,17 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) return vreinterpretq_m128d_s32( vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? UINT64_C(0) - : ~UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) - ? UINT64_C(0) - : ~UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? 
UINT64_C(0) : ~UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3689,17 +4599,15 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? UINT64_C(0) - : ~UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? 
UINT64_C(0) : ~UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3711,13 +4619,13 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - - return (*(double *) &a0 >= *(double *) &b0); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 >= b0; #endif } @@ -3726,13 +4634,14 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 > *(double *) &b0); + return a0 > b0; #endif } @@ -3741,13 +4650,14 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) 
vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 <= *(double *) &b0); + return a0 <= b0; #endif } @@ -3756,13 +4666,14 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 < *(double *) &b0); + return a0 < b0; #endif } @@ -3771,19 +4682,14 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; #else - uint32x4_t a_not_nan = - vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); - uint32x4_t b_not_nan = - vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_eq_b = - vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); - uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), - vreinterpretq_u64_u32(a_eq_b)); - return vgetq_lane_u64(and_results, 0) & 0x1; + double a0 = + 
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 == b0 ? 1 : 0; #endif } @@ -3800,12 +4706,14 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); #else - double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); - double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); + double a0 = _sse2neon_static_cast( + double, vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0)); + double a1 = _sse2neon_static_cast( + double, vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1)); return _mm_set_pd(a1, a0); #endif } @@ -3823,18 +4731,13 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) { -// vrnd32xq_f64 not supported on clang -#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) - float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a)); - int64x2_t integers = vcvtq_s64_f64(rounded); - return vreinterpretq_m128i_s32( - vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); -#else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; - return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); -#endif + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + return _mm_set_epi32(0, 0, _sse2neon_cvtd_s32(d1), _sse2neon_cvtd_s32(d0)); } // Convert packed double-precision (64-bit) 
floating-point elements in a to @@ -3843,9 +4746,15 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; - int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + int32_t ALIGN_STRUCT(16) data[2] = { + _sse2neon_cvtd_s32(d0), + _sse2neon_cvtd_s32(d1), + }; return vreinterpret_m64_s32(vld1_s32(data)); } @@ -3855,13 +4764,15 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else - float a0 = (float) ((double *) &a)[0]; - float a1 = (float) ((double *) &a)[1]; - return _mm_set_ps(0, 0, a1, a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_ps(0, 0, _sse2neon_static_cast(float, a1), + _sse2neon_static_cast(float, a0)); #endif } @@ -3870,69 +4781,82 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); #else - double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); - double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); + 
double a0 = _sse2neon_static_cast( + double, vget_lane_s32(vreinterpret_s32_m64(a), 0)); + double a1 = _sse2neon_static_cast( + double, vget_lane_s32(vreinterpret_s32_m64(a), 1)); return _mm_set_pd(a1, a0); #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. +// x86 returns INT32_MIN ("integer indefinite") for NaN and out-of-range values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A // does not support! It is supported on ARMv8-A however. FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) { #if defined(__ARM_FEATURE_FRINT) - return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a))); -#elif (defined(__aarch64__) || defined(_M_ARM64)) || \ - defined(__ARM_FEATURE_DIRECTED_ROUNDING) + float32x4_t f = vreinterpretq_f32_m128(a); + int32x4_t cvt = vcvtq_s32_f32(vrnd32xq_f32(f)); + return vreinterpretq_m128i_s32(_sse2neon_cvtps_epi32_fixup(f, cvt)); +#elif SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) + float32x4_t f = vreinterpretq_f32_m128(a); + int32x4_t cvt; switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: - return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + cvt = vcvtnq_s32_f32(f); + break; case _MM_ROUND_DOWN: - return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + cvt = vcvtmq_s32_f32(f); + break; case _MM_ROUND_UP: - return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + cvt = vcvtpq_s32_f32(f); + break; default: // _MM_ROUND_TOWARD_ZERO - return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + cvt = vcvtq_s32_f32(f); + break; } + return vreinterpretq_m128i_s32(_sse2neon_cvtps_epi32_fixup(f, cvt)); #else - float *f = (float *) &a; + float *f = _sse2neon_reinterpret_cast(float *, &a); switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: { + float32x4_t fv = vreinterpretq_f32_m128(a); uint32x4_t signmask = 
vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), - vdupq_n_f32(0.5f)); /* +/- 0.5 */ - int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( - vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ - int32x4_t r_trunc = vcvtq_s32_f32( - vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + float32x4_t half = + vbslq_f32(signmask, fv, vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = + vcvtq_s32_f32(vaddq_f32(fv, half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32(fv); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( - vreinterpretq_f32_m128(a), - vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + fv, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return vreinterpretq_m128i_s32( - vbslq_s32(is_delta_half, r_even, r_normal)); + int32x4_t result = vbslq_s32(is_delta_half, r_even, r_normal); + return vreinterpretq_m128i_s32(_sse2neon_cvtps_epi32_fixup(fv, result)); } case _MM_ROUND_DOWN: - return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), - floorf(f[0])); + return _mm_set_epi32( + _sse2neon_cvtf_s32(floorf(f[3])), _sse2neon_cvtf_s32(floorf(f[2])), + _sse2neon_cvtf_s32(floorf(f[1])), _sse2neon_cvtf_s32(floorf(f[0]))); case _MM_ROUND_UP: - return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), - ceilf(f[0])); + return _mm_set_epi32( + _sse2neon_cvtf_s32(ceilf(f[3])), _sse2neon_cvtf_s32(ceilf(f[2])), + _sse2neon_cvtf_s32(ceilf(f[1])), _sse2neon_cvtf_s32(ceilf(f[0]))); default: // _MM_ROUND_TOWARD_ZERO - return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], - (int32_t) f[0]); + return 
_mm_set_epi32(_sse2neon_cvtf_s32(f[3]), _sse2neon_cvtf_s32(f[2]), + _sse2neon_cvtf_s32(f[1]), + _sse2neon_cvtf_s32(f[0])); } #endif } @@ -3943,12 +4867,14 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); #else - double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + double a0 = _sse2neon_static_cast( + double, vgetq_lane_f32(vreinterpretq_f32_m128(a), 0)); + double a1 = _sse2neon_static_cast( + double, vgetq_lane_f32(vreinterpretq_f32_m128(a), 1)); return _mm_set_pd(a1, a0); #endif } @@ -3957,10 +4883,13 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 FORCE_INLINE double _mm_cvtsd_f64(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) - return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#if SSE2NEON_ARCH_AARCH64 + return _sse2neon_static_cast(double, + vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0)); #else - return ((double *) &a)[0]; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _a; #endif } @@ -3969,13 +4898,10 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) - return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); -#else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; - return (int32_t) ret; -#endif + double ret = sse2neon_recast_u64_f64( + 
vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return _sse2neon_cvtd_s32(ret); } // Convert the lower double-precision (64-bit) floating-point element in a to a @@ -3983,13 +4909,10 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) - return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); -#else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; - return (int64_t) ret; -#endif + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return _sse2neon_cvtd_s64(ret); } // Convert the lower double-precision (64-bit) floating-point element in a to a @@ -4004,13 +4927,15 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32(vsetq_lane_f32( vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), vreinterpretq_f32_m128(a), 0)); #else - return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], - vreinterpretq_f32_m128(a), 0)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return vreinterpretq_m128_f32(vsetq_lane_f32( + _sse2neon_static_cast(float, b0), vreinterpretq_f32_m128(a), 0)); #endif } @@ -4038,13 +4963,13 @@ FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) { -#if defined(__aarch64__) || defined(_M_ARM64) - return vreinterpretq_m128d_f64( - vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#if 
SSE2NEON_ARCH_AARCH64 + return vreinterpretq_m128d_f64(vsetq_lane_f64( + _sse2neon_static_cast(double, b), vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + int64_t _b = sse2neon_recast_f64_s64(_sse2neon_static_cast(double, b)); return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4066,13 +4991,13 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) { -#if defined(__aarch64__) || defined(_M_ARM64) - return vreinterpretq_m128d_f64( - vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#if SSE2NEON_ARCH_AARCH64 + return vreinterpretq_m128d_f64(vsetq_lane_f64( + _sse2neon_static_cast(double, b), vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + int64_t _b = sse2neon_recast_f64_s64(_sse2neon_static_cast(double, b)); return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4102,13 +5027,14 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { - double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); -#if defined(__aarch64__) || defined(_M_ARM64) + double d = _sse2neon_static_cast( + double, vgetq_lane_f32(vreinterpretq_f32_m128(b), 0)); +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else - return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); + return vreinterpretq_m128d_s64(vsetq_lane_s64( + sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4117,9 +5043,10 @@ FORCE_INLINE 
__m128d _mm_cvtss_sd(__m128d a, __m128 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) { - double a0 = ((double *) &a)[0]; - double a1 = ((double *) &a)[1]; - return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_epi32(0, 0, _sse2neon_cvtd_s32(a1), _sse2neon_cvtd_s32(a0)); } // Convert packed double-precision (64-bit) floating-point elements in a to @@ -4127,18 +5054,57 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) { - double a0 = ((double *) &a)[0]; - double a1 = ((double *) &a)[1]; - int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; +#if SSE2NEON_ARCH_AARCH64 + /* Vectorized AArch64 path - branchless, no memory round-trip */ + float64x2_t f = vreinterpretq_f64_m128d(a); + + /* Convert f64 to i64 with truncation toward zero. + * Out-of-range values produce undefined results, but we mask them below. + */ + int64x2_t i64 = vcvtq_s64_f64(f); + + /* Detect values outside INT32 range: >= 2147483648.0 or < -2147483648.0 + * x86 returns INT32_MIN (0x80000000) for these cases. + */ + float64x2_t max_f = vdupq_n_f64(2147483648.0); /* INT32_MAX + 1 */ + float64x2_t min_f = vdupq_n_f64(-2147483648.0); + uint64x2_t overflow = vorrq_u64(vcgeq_f64(f, max_f), vcltq_f64(f, min_f)); + + /* Detect NaN: a value is NaN if it's not equal to itself. + * Use XOR with all-ones since vmvnq_u64 doesn't exist. 
*/ + uint64x2_t eq_self = vceqq_f64(f, f); + uint64x2_t is_nan = veorq_u64(eq_self, vdupq_n_u64(UINT64_MAX)); + + /* Combine: any overflow or NaN should produce INT32_MIN */ + uint64x2_t need_indefinite = vorrq_u64(overflow, is_nan); + + /* Narrow i64 to i32 (simple truncation of upper 32 bits) */ + int32x2_t i32 = vmovn_s64(i64); + + /* Blend: select INT32_MIN where needed, otherwise use converted value */ + uint32x2_t mask32 = vmovn_u64(need_indefinite); + int32x2_t indefinite = vdup_n_s32(INT32_MIN); + return vreinterpret_m64_s32(vbsl_s32(mask32, indefinite, i32)); +#else + /* Scalar fallback for ARMv7 (no f64 SIMD support) */ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {_sse2neon_cvtd_s32(a0), + _sse2neon_cvtd_s32(a1)}; return vreinterpret_m64_s32(vld1_s32(data)); +#endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. +// x86 returns INT32_MIN ("integer indefinite") for NaN and out-of-range values. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) { - return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); + float32x4_t f = vreinterpretq_f32_m128(a); + int32x4_t cvt = vcvtq_s32_f32(f); + return vreinterpretq_m128i_s32(_sse2neon_cvtps_epi32_fixup(f, cvt)); } // Convert the lower double-precision (64-bit) floating-point element in a to a @@ -4146,8 +5112,9 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) { - double ret = *((double *) &a); - return (int32_t) ret; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _sse2neon_cvtd_s32(_a); } // Convert the lower double-precision (64-bit) floating-point element in a to a @@ -4155,12 +5122,9 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) - return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); -#else - double ret = *((double *) &a); - return (int64_t) ret; -#endif + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _sse2neon_cvtd_s64(_a); } // Convert the lower double-precision (64-bit) floating-point element in a to a @@ -4173,16 +5137,22 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double 
*) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] / db[0]; - c[1] = da[1] / db[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 / b0; + c[1] = a1 / b1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } @@ -4193,7 +5163,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 float64x2_t tmp = vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_f64( @@ -4206,18 +5176,21 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 -// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) -#define _mm_extract_epi16(a, imm) \ - vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) +// FORCE_INLINE int _mm_extract_epi16(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 7] +#define _mm_extract_epi16(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 7), \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))) // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16 -// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, -// __constrange(0,8) int imm) -#define _mm_insert_epi16(a, b, imm) \ - vreinterpretq_m128i_s16( \ - vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))) +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, const int imm) +// imm must be a compile-time constant in range [0, 7] +#define _mm_insert_epi16(a, b, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 7), \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))) // Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from memory into dst. mem_addr must be aligned on a 16-byte @@ -4225,10 +5198,10 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd FORCE_INLINE __m128d _mm_load_pd(const double *p) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vld1q_f64(p)); #else - const float *fp = (const float *) p; + const float *fp = _sse2neon_reinterpret_cast(const float *, p); float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif @@ -4245,10 +5218,10 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd FORCE_INLINE __m128d _mm_load_sd(const double *p) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); #else - const float *fp = (const float *) p; + const float *fp = _sse2neon_reinterpret_cast(const float *, p); float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif @@ -4259,7 +5232,8 @@ FORCE_INLINE __m128d _mm_load_sd(const double 
*p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) { - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); + return vreinterpretq_m128i_s32( + vld1q_s32(_sse2neon_reinterpret_cast(const int32_t *, p))); } // Load a double-precision (64-bit) floating-point element from memory into both @@ -4267,10 +5241,11 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd FORCE_INLINE __m128d _mm_load1_pd(const double *p) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); #else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); + return vreinterpretq_m128d_s64( + vdupq_n_s64(*_sse2neon_reinterpret_cast(const int64_t *, p))); #endif } @@ -4280,12 +5255,13 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); #else - return vreinterpretq_m128d_f32(vcombine_f32( - vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(a)), + vld1_f32(_sse2neon_reinterpret_cast(const float *, p)))); #endif } @@ -4297,7 +5273,8 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) * lower 64 bits of the result, zeroing the upper 64 bits of the result. 
*/ return vreinterpretq_m128i_s32( - vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); + vcombine_s32(vld1_s32(_sse2neon_reinterpret_cast(int32_t const *, p)), + vcreate_s32(0))); } // Load a double-precision (64-bit) floating-point element from memory into the @@ -4306,12 +5283,12 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); #else return vreinterpretq_m128d_f32( - vcombine_f32(vld1_f32((const float *) p), + vcombine_f32(vld1_f32(_sse2neon_reinterpret_cast(const float *, p)), vget_high_f32(vreinterpretq_f32_m128d(a)))); #endif } @@ -4322,11 +5299,11 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd FORCE_INLINE __m128d _mm_loadr_pd(const double *p) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 float64x2_t v = vld1q_f64(p); return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); #else - int64x2_t v = vld1q_s64((const int64_t *) p); + int64x2_t v = vld1q_s64(_sse2neon_reinterpret_cast(const int64_t *, p)); return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); #endif } @@ -4343,15 +5320,17 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) { - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); + return vreinterpretq_m128i_s32( + vld1q_s32(_sse2neon_reinterpret_cast(const unaligned_int32_t *, p))); } // Load unaligned 32-bit integer from memory into the first element of dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32 FORCE_INLINE __m128i _mm_loadu_si32(const void *p) { - return vreinterpretq_m128i_s32( - vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); + return vreinterpretq_m128i_s32(vsetq_lane_s32( + *_sse2neon_reinterpret_cast(const unaligned_int32_t *, p), + vdupq_n_s32(0), 0)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate @@ -4362,7 +5341,7 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) { int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 int32x4_t high = vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); @@ -4386,11 +5365,11 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) { int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7); - __m128 b = _mm_load_ps((const float *) mem_addr); + __m128 b = _mm_load_ps(_sse2neon_reinterpret_cast(const float *, mem_addr)); int8x16_t masked = vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128(b)); - vst1q_s8((int8_t *) mem_addr, masked); + vst1q_s8(_sse2neon_reinterpret_cast(int8_t *, mem_addr), masked); } // Compare packed signed 16-bit integers in a and b, and store packed maximum @@ -4416,7 +5395,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); @@ -4426,15 +5405,19 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) 
vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); - uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; - d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 > b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); - return vreinterpretq_m128d_u64(vld1q_u64(d)); + return vreinterpretq_m128d_s64(vld1q_s64(d)); #endif } @@ -4444,13 +5427,15 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_max_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; - return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 > b0 ? 
a0 : b0, a1}; + return vreinterpretq_m128d_f32(sse2neon_vld1q_f32_from_f64pair(c)); #endif } @@ -4477,7 +5462,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); @@ -4487,14 +5472,18 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); - uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; - d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; - return vreinterpretq_m128d_u64(vld1q_u64(d)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 < b1 ? 
sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + return vreinterpretq_m128d_s64(vld1q_s64(d)); #endif } @@ -4504,13 +5493,15 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_min_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; - return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 < b0 ? a0 : b0, a1}; + return vreinterpretq_m128d_f32(sse2neon_vld1q_f32_from_f64pair(c)); #endif } @@ -4537,84 +5528,78 @@ FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) // Create mask from the most significant bit of each 8-bit element in a, and // store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +// +// Input (__m128i): 16 bytes, extract bit 7 (MSB) of each +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| byte index +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | ... | +// MSB MSB +// v v v v v v v v v v v v v v v +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| bit position in result +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |<-- low byte ->|<-- high byte->| +// +// Output (int): 16-bit mask where bit[i] = MSB of input byte[i] FORCE_INLINE int _mm_movemask_epi8(__m128i a) { - // Use increasingly wide shifts+adds to collect the sign bits - // together. 
- // Since the widening shifts would be rather confusing to follow in little - // endian, everything will be illustrated in big endian order instead. This - // has a different result - the bits would actually be reversed on a big - // endian machine. - - // Starting input (only half the elements are shown): - // 89 ff 1d c0 00 10 99 33 uint8x16_t input = vreinterpretq_u8_m128i(a); - // Shift out everything but the sign bits with an unsigned shift right. +#if SSE2NEON_ARCH_AARCH64 + // AArch64: Variable shift + horizontal add (vaddv). // - // Bytes of the vector:: - // 89 ff 1d c0 00 10 99 33 - // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) - // | | | | | | | | - // 01 01 00 01 00 00 01 00 - // - // Bits of first important lane(s): - // 10001001 (89) - // \______ - // | - // 00000001 (01) - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + // Step 1: Extract MSB of each byte (vshr #7: 0x80->1, 0x7F->0) + uint8x16_t msbs = vshrq_n_u8(input, 7); - // Merge the even lanes together with a 16-bit unsigned shift right + add. - // 'xx' represents garbage data which will be ignored in the final result. - // In the important bytes, the add functions like a binary OR. + // Step 2: Shift each byte left by its bit position (0-7 per half) // - // 01 01 00 01 00 00 01 00 - // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) - // \| \| \| \| - // xx 03 xx 01 xx 00 xx 02 + // msbs: [ 1 ][ 0 ][ 1 ][ 1 ][ 0 ][ 1 ][ 0 ][ 1 ] (example) + // shifts: [ 0 ][ 1 ][ 2 ][ 3 ][ 4 ][ 5 ][ 6 ][ 7 ] + // | | | | | | | | + // <<0 <<1 <<2 <<3 <<4 <<5 <<6 <<7 + // v v v v v v v v + // result: [0x01][0x00][0x04][0x08][0x00][0x20][0x00][0x80] // - // 00000001 00000001 (01 01) - // \_______ | - // \| - // xxxxxxxx xxxxxx11 (xx 03) - uint32x4_t paired16 = - vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + // Horizontal sum: 0x01+0x04+0x08+0x20+0x80 = 0xAD = 0b10101101 + // Each bit in sum corresponds to one input byte's MSB. 
+ static const int8_t shift_table[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7}; + int8x16_t shifts = vld1q_s8(shift_table); + uint8x16_t positioned = vshlq_u8(msbs, shifts); - // Repeat with a wider 32-bit shift + add. - // xx 03 xx 01 xx 00 xx 02 - // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> - // 14)) - // \| \| - // xx xx xx 0d xx xx xx 02 + // Step 3: Sum each half -> bits [7:0] and [15:8] + return vaddv_u8(vget_low_u8(positioned)) | + (vaddv_u8(vget_high_u8(positioned)) << 8); +#else + // ARMv7: Shift-right-accumulate (no vaddv). // - // 00000011 00000001 (03 01) - // \\_____ || - // '----.\|| - // xxxxxxxx xxxx1101 (xx 0d) - uint64x2_t paired32 = - vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + // Step 1: Extract MSB of each byte + uint8x16_t msbs = vshrq_n_u8(input, 7); + uint64x2_t bits = vreinterpretq_u64_u8(msbs); - // Last, an even wider 64-bit shift + add to get our result in the low 8 bit - // lanes. xx xx xx 0d xx xx xx 02 - // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> - // 28)) - // \| - // xx xx xx xx xx xx xx d2 + // Step 2: Parallel bit collection via shift-right-accumulate // - // 00001101 00000010 (0d 02) - // \ \___ | | - // '---. \| | - // xxxxxxxx 11010010 (xx d2) - uint8x16_t paired64 = - vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + // Initial (8 bytes shown): + // byte: [ 0 ][ 1 ][ 2 ][ 3 ][ 4 ][ 5 ][ 6 ][ 7 ] + // value: [ 01 ][ 00 ][ 01 ][ 01 ][ 00 ][ 01 ][ 00 ][ 01 ] + // + // vsra(..., 7): add original + (original >> 7) + // byte 1 gets: orig[1] + orig[0] = b1|b0 in bits [1:0] + // byte 3 gets: orig[3] + orig[2] = b3|b2 in bits [1:0] + // ... 
+ // Result: pairs combined into odd bytes + // + // vsra(..., 14): combine pairs -> 4 bits in bytes 3,7 + // vsra(..., 28): combine all -> 8 bits in byte 7 (actually byte 0) + bits = vsraq_n_u64(bits, bits, 7); + bits = vsraq_n_u64(bits, bits, 14); + bits = vsraq_n_u64(bits, bits, 28); - // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. - // xx xx xx xx xx xx xx d2 - // || return paired64[0] - // d2 - // Note: Little endian would return the correct value 4b (01001011) instead. - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); + // Step 3: Extract packed result from byte 0 of each half + uint8x16_t output = vreinterpretq_u8_u64(bits); + return vgetq_lane_u8(output, 0) | (vgetq_lane_u8(output, 8) << 8); +#endif } // Set each bit of mask dst based on the most significant bit of the @@ -4624,8 +5609,8 @@ FORCE_INLINE int _mm_movemask_pd(__m128d a) { uint64x2_t input = vreinterpretq_u64_m128d(a); uint64x2_t high_bits = vshrq_n_u64(input, 63); - return (int) (vgetq_lane_u64(high_bits, 0) | - (vgetq_lane_u64(high_bits, 1) << 1)); + return _sse2neon_static_cast(int, vgetq_lane_u64(high_bits, 0) | + (vgetq_lane_u64(high_bits, 1) << 1)); } // Copy the lower 64-bit integer in a to dst. 
@@ -4660,16 +5645,22 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] * db[0]; - c[1] = da[1] * db[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 * b0; + c[1] = a1 * b1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } @@ -4697,10 +5688,9 @@ FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) { - /* FIXME: issue with large values because of result saturation */ - // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), - // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return - // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + // vmull_s16 is used instead of vqdmulhq_s16 to avoid saturation issues + // with large values (e.g., -32768 * -32768). vmull_s16 produces full 32-bit + // products without saturation. 
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ @@ -4721,7 +5711,7 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab3210 = vmull_u16(a3210, b3210); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), @@ -4800,9 +5790,9 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) // Arm cores. Experience with several databases has shown has shown an 'isb' is // a reasonable approximation. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause -FORCE_INLINE void _mm_pause() +FORCE_INLINE void _mm_pause(void) { -#if defined(_MSC_VER) +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG __isb(_ARM64_BARRIER_SY); #else __asm__ __volatile__("isb\n"); @@ -4816,7 +5806,8 @@ FORCE_INLINE void _mm_pause() // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) { - uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + uint16x8_t t = vpaddlq_u8( + vabdq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t))); } @@ -4877,12 +5868,16 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b1, signed char b0) { - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) 
vld1q_s8(data); + int8_t ALIGN_STRUCT(16) data[16] = { + _sse2neon_static_cast(int8_t, b0), _sse2neon_static_cast(int8_t, b1), + _sse2neon_static_cast(int8_t, b2), _sse2neon_static_cast(int8_t, b3), + _sse2neon_static_cast(int8_t, b4), _sse2neon_static_cast(int8_t, b5), + _sse2neon_static_cast(int8_t, b6), _sse2neon_static_cast(int8_t, b7), + _sse2neon_static_cast(int8_t, b8), _sse2neon_static_cast(int8_t, b9), + _sse2neon_static_cast(int8_t, b10), _sse2neon_static_cast(int8_t, b11), + _sse2neon_static_cast(int8_t, b12), _sse2neon_static_cast(int8_t, b13), + _sse2neon_static_cast(int8_t, b14), _sse2neon_static_cast(int8_t, b15)}; + return vreinterpretq_m128i_s8(vld1q_s8(data)); } // Set packed double-precision (64-bit) floating-point elements in dst with the @@ -4891,10 +5886,11 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15, FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) { double ALIGN_STRUCT(16) data[2] = {e0, e1}; -#if defined(__aarch64__) || defined(_M_ARM64) - return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#if SSE2NEON_ARCH_AARCH64 + return vreinterpretq_m128d_f64( + vld1q_f64(_sse2neon_reinterpret_cast(float64_t *, data))); #else - return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); + return vreinterpretq_m128d_f32(sse2neon_vld1q_f32_from_f64pair(data)); #endif } @@ -4908,14 +5904,14 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd FORCE_INLINE __m128d _mm_set_sd(double a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0)); #else return _mm_set_pd(0, a); #endif } -// Broadcast 16-bit integer a to all all elements of dst. +// Broadcast 16-bit integer a to all elements of dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 FORCE_INLINE __m128i _mm_set1_epi16(short w) { @@ -4955,10 +5951,11 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd FORCE_INLINE __m128d _mm_set1_pd(double d) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); + int64_t _d = sse2neon_recast_f64_s64(d); + return vreinterpretq_m128d_s64(vdupq_n_s64(_d)); #endif } @@ -4974,7 +5971,8 @@ FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w7) { int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; - return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); + return vreinterpretq_m128i_s16( + vld1q_s16(_sse2neon_reinterpret_cast(int16_t *, data))); } // Set packed 32-bit integers in dst with the supplied values in reverse order. 
@@ -5011,12 +6009,16 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b14, signed char b15) { - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) vld1q_s8(data); + int8_t ALIGN_STRUCT(16) data[16] = { + _sse2neon_static_cast(int8_t, b0), _sse2neon_static_cast(int8_t, b1), + _sse2neon_static_cast(int8_t, b2), _sse2neon_static_cast(int8_t, b3), + _sse2neon_static_cast(int8_t, b4), _sse2neon_static_cast(int8_t, b5), + _sse2neon_static_cast(int8_t, b6), _sse2neon_static_cast(int8_t, b7), + _sse2neon_static_cast(int8_t, b8), _sse2neon_static_cast(int8_t, b9), + _sse2neon_static_cast(int8_t, b10), _sse2neon_static_cast(int8_t, b11), + _sse2neon_static_cast(int8_t, b12), _sse2neon_static_cast(int8_t, b13), + _sse2neon_static_cast(int8_t, b14), _sse2neon_static_cast(int8_t, b15)}; + return vreinterpretq_m128i_s8(vld1q_s8(data)); } // Set packed double-precision (64-bit) floating-point elements in dst with the @@ -5031,7 +6033,7 @@ FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd FORCE_INLINE __m128d _mm_setzero_pd(void) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vdupq_n_f64(0)); #else return vreinterpretq_m128d_f32(vdupq_n_f32(0)); @@ -5048,11 +6050,12 @@ FORCE_INLINE __m128i _mm_setzero_si128(void) // Shuffle 32-bit integers in a using the control in imm8, and store the results // in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 -// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, -// __constrange(0,255) int imm) +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] #if defined(_sse2neon_shuffle) #define _mm_shuffle_epi32(a, imm) \ __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ int32x4_t _input = vreinterpretq_s32_m128i(a); \ int32x4_t _shuf = \ vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ @@ -5060,77 +6063,84 @@ FORCE_INLINE __m128i _mm_setzero_si128(void) vreinterpretq_m128i_s32(_shuf); \ }) #else // generic -#define _mm_shuffle_epi32(a, imm) \ - _sse2neon_define1( \ - __m128i, a, __m128i ret; switch (imm) { \ - case _MM_SHUFFLE(1, 0, 3, 2): \ - ret = _mm_shuffle_epi_1032(_a); \ - break; \ - case _MM_SHUFFLE(2, 3, 0, 1): \ - ret = _mm_shuffle_epi_2301(_a); \ - break; \ - case _MM_SHUFFLE(0, 3, 2, 1): \ - ret = _mm_shuffle_epi_0321(_a); \ - break; \ - case _MM_SHUFFLE(2, 1, 0, 3): \ - ret = _mm_shuffle_epi_2103(_a); \ - break; \ - case _MM_SHUFFLE(1, 0, 1, 0): \ - ret = _mm_shuffle_epi_1010(_a); \ - break; \ - case _MM_SHUFFLE(1, 0, 0, 1): \ - ret = _mm_shuffle_epi_1001(_a); \ - break; \ - case _MM_SHUFFLE(0, 1, 0, 1): \ - ret = _mm_shuffle_epi_0101(_a); \ - break; \ - case _MM_SHUFFLE(2, 2, 1, 1): \ - ret = _mm_shuffle_epi_2211(_a); \ - break; \ - case _MM_SHUFFLE(0, 1, 2, 2): \ - ret = _mm_shuffle_epi_0122(_a); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 2): \ - ret = _mm_shuffle_epi_3332(_a); \ - break; \ - case _MM_SHUFFLE(0, 0, 0, 0): \ - ret = _mm_shuffle_epi32_splat(_a, 0); \ - break; \ - case _MM_SHUFFLE(1, 1, 1, 1): \ - ret = _mm_shuffle_epi32_splat(_a, 1); \ - break; \ - case _MM_SHUFFLE(2, 2, 2, 2): \ - ret = _mm_shuffle_epi32_splat(_a, 2); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 3): \ - ret = _mm_shuffle_epi32_splat(_a, 3); \ - break; \ - default: \ - ret = 
_mm_shuffle_epi32_default(_a, (imm)); \ - break; \ +#define _mm_shuffle_epi32(a, imm) \ + _sse2neon_define1( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032(_a); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301(_a); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321(_a); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101(_a); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122(_a); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332(_a); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat(_a, 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat(_a, 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat(_a, 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat(_a, 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default(_a, (imm)); \ + break; \ } _sse2neon_return(ret);) #endif // Shuffle double-precision (64-bit) floating-point elements using the control // in imm8, and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +// imm8 must be a compile-time constant in range [0, 3] #ifdef _sse2neon_shuffle -#define _mm_shuffle_pd(a, b, imm8) \ - vreinterpretq_m128d_s64( \ - vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ - imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2)) +#define _mm_shuffle_pd(a, b, imm8) \ + __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm8, 0, 3); \ + vreinterpretq_m128d_s64(vshuffleq_s64( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ + (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2)); \ + }) #else -#define _mm_shuffle_pd(a, b, imm8) \ - _mm_castsi128_pd(_mm_set_epi64x( \ - vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ - vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#define _mm_shuffle_pd(a, b, imm8) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm8, 0, 3), \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))) #endif -// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, -// __constrange(0,255) int imm) +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] #if defined(_sse2neon_shuffle) #define _mm_shufflehi_epi16(a, imm) \ __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = \ vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ @@ -5138,23 +6148,28 @@ FORCE_INLINE __m128i _mm_setzero_si128(void) (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128i_s16(_shuf); \ }) -#else // generic -#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#else +#define _mm_shufflehi_epi16(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255), \ + _mm_shufflehi_epi16_function((a), (imm))) #endif -// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i 
a, -// __constrange(0,255) int imm) +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] #if defined(_sse2neon_shuffle) #define _mm_shufflelo_epi16(a, imm) \ __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = vshuffleq_s16( \ _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ vreinterpretq_m128i_s16(_shuf); \ }) -#else // generic -#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#else +#define _mm_shufflelo_epi16(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255), \ + _mm_shufflelo_epi16_function((a), (imm))) #endif // Shift packed 16-bit integers in a left by count while shifting in zeros, and @@ -5163,10 +6178,10 @@ FORCE_INLINE __m128i _mm_setzero_si128(void) FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~15)) + if (_sse2neon_unlikely(c > 15)) return _mm_setzero_si128(); - int16x8_t vc = vdupq_n_s16((int16_t) c); + int16x8_t vc = vdupq_n_s16(_sse2neon_static_cast(int16_t, c)); return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); } @@ -5176,10 +6191,10 @@ FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~31)) + if (_sse2neon_unlikely(c > 31)) return _mm_setzero_si128(); - int32x4_t vc = vdupq_n_s32((int32_t) c); + int32x4_t vc = vdupq_n_s32(_sse2neon_static_cast(int32_t, c)); return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); } @@ -5189,10 +6204,10 @@ FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) { uint64_t c = 
vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~63)) + if (_sse2neon_unlikely(c > 63)) return _mm_setzero_si128(); - int64x2_t vc = vdupq_n_s64((int64_t) c); + int64x2_t vc = vdupq_n_s64(_sse2neon_static_cast(int64_t, c)); return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); } @@ -5204,7 +6219,8 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) if (_sse2neon_unlikely(imm & ~15)) return _mm_setzero_si128(); return vreinterpretq_m128i_s16( - vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); + vshlq_s16(vreinterpretq_s16_m128i(a), + vdupq_n_s16(_sse2neon_static_cast(int16_t, imm)))); } // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and @@ -5232,13 +6248,14 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) // Shift a left by imm8 bytes while shifting in zeros, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 -#define _mm_slli_si128(a, imm) \ - _sse2neon_define1( \ - __m128i, a, int8x16_t ret; \ - if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \ - else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ - else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ - ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \ +// imm must be a compile-time constant in range [0, 255] +#define _mm_slli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); int8x16_t ret; \ + if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \ + else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ + (((imm) <= 0 || (imm) > 15) ? 
0 : (16 - (imm)))); \ _sse2neon_return(vreinterpretq_m128i_s8(ret));) // Compute the square root of packed double-precision (64-bit) floating-point @@ -5246,12 +6263,15 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); #else - double a0 = sqrt(((double *) &a)[0]); - double a1 = sqrt(((double *) &a)[1]); - return _mm_set_pd(a1, a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double _a0 = sqrt(a0); + double _a1 = sqrt(a1); + return _mm_set_pd(_a1, _a0); #endif } @@ -5261,10 +6281,13 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_sqrt_pd(b)); #else - return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0])); + double _a, _b; + _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return _mm_set_pd(_a, sqrt(_b)); #endif } @@ -5273,11 +6296,12 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) { - int64_t c = vgetq_lane_s64(count, 0); - if (_sse2neon_unlikely(c & ~15)) + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c > 15)) return _mm_cmplt_epi16(a, _mm_setzero_si128()); return vreinterpretq_m128i_s16( - 
vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c))); + vshlq_s16(vreinterpretq_s16_m128i(a), + vdupq_n_s16(-_sse2neon_static_cast(int16_t, c)))); } // Shift packed 32-bit integers in a right by count while shifting in sign bits, @@ -5285,11 +6309,12 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) { - int64_t c = vgetq_lane_s64(count, 0); - if (_sse2neon_unlikely(c & ~31)) + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c > 31)) return _mm_cmplt_epi32(a, _mm_setzero_si128()); return vreinterpretq_m128i_s32( - vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c))); + vshlq_s32(vreinterpretq_s32_m128i(a), + vdupq_n_s32(-_sse2neon_static_cast(int32_t, c)))); } // Shift packed 16-bit integers in a right by imm8 while shifting in sign @@ -5297,17 +6322,21 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) { - const int count = (imm & ~15) ? 15 : imm; - return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); + const int16_t count = + (imm & ~15) ? 15 : _sse2neon_static_cast(int16_t, imm); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(-count))); } // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, // and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 -// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] #define _mm_srai_epi32(a, imm) \ _sse2neon_define0( \ - __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + if (_sse2neon_unlikely((imm) == 0)) { \ ret = _a; \ } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ ret = vreinterpretq_m128i_s32( \ @@ -5323,10 +6352,10 @@ FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~15)) + if (_sse2neon_unlikely(c > 15)) return _mm_setzero_si128(); - int16x8_t vc = vdupq_n_s16(-(int16_t) c); + int16x8_t vc = vdupq_n_s16(-_sse2neon_static_cast(int16_t, c)); return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); } @@ -5336,10 +6365,10 @@ FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~31)) + if (_sse2neon_unlikely(c > 31)) return _mm_setzero_si128(); - int32x4_t vc = vdupq_n_s32(-(int32_t) c); + int32x4_t vc = vdupq_n_s32(-_sse2neon_static_cast(int32_t, c)); return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); } @@ -5349,32 +6378,37 @@ FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~63)) + if (_sse2neon_unlikely(c > 63)) return _mm_setzero_si128(); - int64x2_t vc = vdupq_n_s64(-(int64_t) c); + int64x2_t vc = 
vdupq_n_s64(-_sse2neon_static_cast(int64_t, c)); return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); } // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 -#define _mm_srli_epi16(a, imm) \ - _sse2neon_define0( \ - __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_u16( \ - vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \ +// imm must be a compile-time constant in range [0, 255] +#define _mm_srli_epi16(a, imm) \ + _sse2neon_define0( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16(vshlq_u16( \ + vreinterpretq_u16_m128i(_a), \ + vdupq_n_s16(_sse2neon_static_cast(int16_t, -(imm))))); \ } _sse2neon_return(ret);) // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 -// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] #define _mm_srli_epi32(a, imm) \ _sse2neon_define0( \ - __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u32( \ @@ -5384,9 +6418,11 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 +// imm must be a compile-time constant in range [0, 255] #define _mm_srli_epi64(a, imm) \ _sse2neon_define0( \ - __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~63)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u64( \ @@ -5396,12 +6432,13 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) // Shift a right by imm8 bytes while shifting in zeros, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 -#define _mm_srli_si128(a, imm) \ - _sse2neon_define1( \ - __m128i, a, int8x16_t ret; \ - if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ - else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ - (imm > 15 ? 0 : imm)); \ +// imm must be a compile-time constant in range [0, 255] +#define _mm_srli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); int8x16_t ret; \ + if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + ((imm) > 15 ? 
0 : (imm))); \ _sse2neon_return(vreinterpretq_m128i_s8(ret));) // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point @@ -5410,10 +6447,12 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) - vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#if SSE2NEON_ARCH_AARCH64 + vst1q_f64(_sse2neon_reinterpret_cast(float64_t *, mem_addr), + vreinterpretq_f64_m128d(a)); #else - vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); + vst1q_f32(_sse2neon_reinterpret_cast(float32_t *, mem_addr), + vreinterpretq_f32_m128d(a)); #endif } @@ -5423,13 +6462,13 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); - vst1q_f64((float64_t *) mem_addr, + vst1q_f64(_sse2neon_reinterpret_cast(float64_t *, mem_addr), vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); #else float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); - vst1q_f32((float32_t *) mem_addr, + vst1q_f32(_sse2neon_reinterpret_cast(float32_t *, mem_addr), vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); #endif } @@ -5439,10 +6478,12 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) - vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#if SSE2NEON_ARCH_AARCH64 + vst1_f64(_sse2neon_reinterpret_cast(float64_t *, 
mem_addr), + vget_low_f64(vreinterpretq_f64_m128d(a))); #else - vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); + vst1_u64(_sse2neon_reinterpret_cast(uint64_t *, mem_addr), + vget_low_u64(vreinterpretq_u64_m128d(a))); #endif } @@ -5451,7 +6492,8 @@ FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) { - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); + vst1q_s32(_sse2neon_reinterpret_cast(int32_t *, p), + vreinterpretq_s32_m128i(a)); } // Store the lower double-precision (64-bit) floating-point element from a into @@ -5465,10 +6507,12 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) - vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#if SSE2NEON_ARCH_AARCH64 + vst1_f64(_sse2neon_reinterpret_cast(float64_t *, mem_addr), + vget_high_f64(vreinterpretq_f64_m128d(a))); #else - vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); + vst1_f32(_sse2neon_reinterpret_cast(float32_t *, mem_addr), + vget_high_f32(vreinterpretq_f32_m128d(a))); #endif } @@ -5476,7 +6520,8 @@ FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) { - vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); + vst1_u64(_sse2neon_reinterpret_cast(uint64_t *, a), + vget_low_u64(vreinterpretq_u64_m128i(b))); } // Store the lower double-precision (64-bit) floating-point element from a into @@ -5484,10 +6529,12 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) - vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#if SSE2NEON_ARCH_AARCH64 + vst1_f64(_sse2neon_reinterpret_cast(float64_t *, mem_addr), + vget_low_f64(vreinterpretq_f64_m128d(a))); #else - vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); + vst1_f32(_sse2neon_reinterpret_cast(float32_t *, mem_addr), + vget_low_f32(vreinterpretq_f32_m128d(a))); #endif } @@ -5515,7 +6562,8 @@ FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) { - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); + vst1q_s32(_sse2neon_reinterpret_cast(int32_t *, p), + vreinterpretq_s32_m128i(a)); } // Store 32-bit integer from the first element of a into memory. mem_addr does @@ -5523,54 +6571,75 @@ FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) { - vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); + vst1q_lane_s32(_sse2neon_reinterpret_cast(int32_t *, p), + vreinterpretq_s32_m128i(a), 0); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory using a non-temporal memory hint. mem_addr must // be aligned on a 16-byte boundary or a general-protection exception may be // generated. +// Note: On AArch64, __builtin_nontemporal_store generates STNP (Store +// Non-temporal Pair), providing true non-temporal hint for 128-bit stores. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) { #if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, (float32x4_t *) p); -#elif defined(__aarch64__) || defined(_M_ARM64) + __builtin_nontemporal_store(a, _sse2neon_reinterpret_cast(__m128d *, p)); +#elif SSE2NEON_ARCH_AARCH64 vst1q_f64(p, vreinterpretq_f64_m128d(a)); #else - vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); + vst1q_s64(_sse2neon_reinterpret_cast(int64_t *, p), + vreinterpretq_s64_m128d(a)); #endif } // Store 128-bits of integer data from a into memory using a non-temporal memory // hint. mem_addr must be aligned on a 16-byte boundary or a general-protection // exception may be generated. +// Note: On AArch64, __builtin_nontemporal_store generates STNP (Store +// Non-temporal Pair), providing true non-temporal hint for 128-bit stores. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, p); #else - vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); + vst1q_s64(_sse2neon_reinterpret_cast(int64_t *, p), + vreinterpretq_s64_m128i(a)); #endif } // Store 32-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. +// Note: ARM lacks non-temporal store for 32-bit scalar. STNP requires pair +// stores; __builtin_nontemporal_store may generate regular store on AArch64. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 FORCE_INLINE void _mm_stream_si32(int *p, int a) { - vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_lane_s32(_sse2neon_reinterpret_cast(int32_t *, p), vdupq_n_s32(a), 0); +#endif } // Store 64-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. +// Note: ARM lacks direct non-temporal store for single 64-bit value. STNP +// requires pair stores; __builtin_nontemporal_store may generate regular store +// on AArch64. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) { - vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1_s64(_sse2neon_reinterpret_cast(int64_t *, p), + vdup_n_s64(_sse2neon_static_cast(int64_t, a))); +#endif } // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and @@ -5615,16 +6684,22 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + 
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] - db[0]; - c[1] = da[1] - db[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 - b0; + c[1] = a1 - b1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } @@ -5690,21 +6765,25 @@ FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) #define _mm_ucomineq_sd _mm_comineq_sd // Return vector of type __m128d with undefined elements. +// Note: MSVC forces zero-initialization while GCC/Clang return truly undefined +// memory. Use SSE2NEON_UNDEFINED_ZERO=1 to force zero on all compilers. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd FORCE_INLINE __m128d _mm_undefined_pd(void) { -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_UNDEFINED_ZERO || \ + (SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG) + return _mm_setzero_pd(); +#else +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128d a; -#if defined(_MSC_VER) - a = _mm_setzero_pd(); -#endif return a; -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic pop #endif +#endif } // Unpack and interleave 16-bit integers from the high half of a and b, and @@ -5712,7 +6791,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16( vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else @@ -5728,7 +6807,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { -#if defined(__aarch64__) || 
defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s32( vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else @@ -5744,7 +6823,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s64( vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else @@ -5759,7 +6838,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s8( vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else @@ -5777,7 +6856,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else @@ -5792,7 +6871,7 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16( vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else @@ -5808,7 +6887,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s32( vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else @@ -5824,7 +6903,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s64( vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else @@ -5839,7 +6918,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s8( vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else @@ -5855,7 +6934,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else @@ -5885,6 +6964,13 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) /* SSE3 */ +// Rounding mode note: The single-precision horizontal operations +// (_mm_addsub_ps, _mm_hadd_ps, _mm_hsub_ps) are sensitive to rounding mode +// on ARM. On x86, these intrinsics produce consistent results regardless of +// MXCSR rounding mode. 
On ARM NEON, the current FPCR/FPSCR rounding mode +// affects intermediate results. For consistent cross-platform behavior, call +// _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST) before using these intrinsics. + // Alternatively add and subtract packed double-precision (64-bit) // floating-point elements in a to/from packed elements in b, and store the // results in dst. @@ -5892,7 +6978,7 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) { _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(mask))); @@ -5903,13 +6989,12 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) // Alternatively add and subtract packed single-precision (32-bit) // floating-point elements in a to/from packed elements in b, and store the -// results in dst. +// results in dst. See SSE3 rounding mode note above. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f); -#if (defined(__aarch64__) || defined(_M_ARM64)) || \ - defined(__ARM_FEATURE_FMA) /* VFPv4+ */ +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_FMA) /* VFPv4+ */ return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(mask), vreinterpretq_f32_m128(b))); @@ -5923,23 +7008,31 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[] = {da[0] + da[1], db[0] + db[1]}; - return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 + a1, b0 + b1}; + return vreinterpretq_m128d_u64( + vld1q_u64(_sse2neon_reinterpret_cast(uint64_t *, c))); #endif } // Horizontally add adjacent pairs of single-precision (32-bit) floating-point // elements in a and b, and pack the results in dst. +// See SSE3 rounding mode note above. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else @@ -5955,29 +7048,37 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) // Horizontally subtract adjacent pairs of double-precision (64-bit) // floating-point elements in a and b, and pack the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd -FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) || defined(_M_ARM64) - float64x2_t a = vreinterpretq_f64_m128d(_a); - float64x2_t b = vreinterpretq_f64_m128d(_b); +#if SSE2NEON_ARCH_AARCH64 + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64( - vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b))); + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); #else - double *da = (double *) &_a; - double *db = (double *) &_b; - double c[] = {da[0] - da[1], db[0] - db[1]}; - return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 - a1, b0 - b1}; + return vreinterpretq_m128d_u64( + vld1q_u64(_sse2neon_reinterpret_cast(uint64_t *, c))); #endif } // Horizontally subtract adjacent pairs of single-precision (32-bit) // floating-point elements in a and b, and pack the results in dst. +// See SSE3 rounding mode note above. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) { float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); #else @@ -5997,12 +7098,32 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd #define _mm_loaddup_pd _mm_load1_pd +// Sets up a linear address range to be monitored by hardware and activates the +// monitor. The address range should be a write-back memory caching type. +// +// ARM implementation notes: +// - This is a NO-OP. ARM has no userspace equivalent for "monitor a cacheline +// and wake on store". There is no "armed" address after calling this. +// - The extensions and hints parameters are ignored (no architectural +// equivalent for x86 C-state hints on ARM). +// - _mm_mwait provides only a low-power hint, not a monitor-armed wait. +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_monitor +FORCE_INLINE void _mm_monitor(void const *p, + unsigned int extensions, + unsigned int hints) +{ + (void) p; + (void) extensions; + (void) hints; +} + // Duplicate the low double-precision (64-bit) floating-point element from a, // and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); #else @@ -6016,7 +7137,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); #elif defined(_sse2neon_shuffle) @@ -6035,7 +7156,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); #elif defined(_sse2neon_shuffle) @@ -6049,6 +7170,64 @@ FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) #endif } +// Provides a hint that allows the processor to enter an implementation- +// dependent optimized state while waiting for a memory write to the monitored +// address range set up by _mm_monitor. +// +// ARM implementation notes: +// - This is only a LOW-POWER HINT, not a monitor-armed wait. Since _mm_monitor +// is a no-op on ARM, there is no "armed" address range to wake on. +// - The extensions and hints parameters are ignored (no architectural +// equivalent for x86 C-state hints on ARM). +// - No memory ordering is guaranteed beyond what the hint instruction provides. +// - WFI/WFE in EL0 may trap depending on OS configuration (Linux can trap +// EL0 WFI/WFE via SCTLR_EL1; iOS/macOS may also restrict these). 
+// +// Behavior controlled by SSE2NEON_MWAIT_POLICY (see top of file for details). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mwait +FORCE_INLINE void _mm_mwait(unsigned int extensions, unsigned int hints) +{ + (void) extensions; + (void) hints; + + // ARM implementation: low-power hint via yield/wfe/wfi. + // x86: no-op for compilation (MONITOR/MWAIT require CPL0, trap in + // userspace). +#if SSE2NEON_ARCH_AARCH64 || defined(__arm__) || defined(_M_ARM) || \ + defined(_M_ARM64) + // Use MSVC intrinsics on Windows ARM, inline asm on GCC/Clang. + // Note: GCC's arm_acle.h may not define __yield/__wfe/__wfi on all + // versions. +#if SSE2NEON_MWAIT_POLICY == 0 + // Policy 0: yield - safe everywhere, never blocks +#if SSE2NEON_COMPILER_MSVC + __yield(); +#else + __asm__ __volatile__("yield" ::: "memory"); +#endif + +#elif SSE2NEON_MWAIT_POLICY == 1 + // Policy 1: wfe - event wait, requires SEV/SEVL, may block +#if SSE2NEON_COMPILER_MSVC + __wfe(); +#else + __asm__ __volatile__("wfe" ::: "memory"); +#endif + +#elif SSE2NEON_MWAIT_POLICY == 2 + // Policy 2: wfi - interrupt wait, may trap in EL0 +#if SSE2NEON_COMPILER_MSVC + __wfi(); +#else + __asm__ __volatile__("wfi" ::: "memory"); +#endif + +#else +#error "Invalid SSE2NEON_MWAIT_POLICY value (must be 0, 1, or 2)" +#endif +#endif /* ARM architecture */ +} + /* SSSE3 */ // Compute the absolute value of packed signed 16-bit integers in a, and store @@ -6102,33 +7281,59 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift // the result right by imm8 bytes, and store the low 16 bytes in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 +// imm must be a compile-time constant in range [0, 255] #if defined(__GNUC__) && !defined(__clang__) -#define _mm_alignr_epi8(a, b, imm) \ - __extension__({ \ - uint8x16_t _a = vreinterpretq_u8_m128i(a); \ - uint8x16_t _b = vreinterpretq_u8_m128i(b); \ - __m128i ret; \ - if (_sse2neon_unlikely((imm) & ~31)) \ - ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ - else if (imm >= 16) \ - ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \ - else \ - ret = \ - vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \ - ret; \ +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + __m128i _a_m128i = (a); \ + uint8x16_t _a = vreinterpretq_u8_m128i(_a_m128i); \ + uint8x16_t _b = vreinterpretq_u8_m128i(b); \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) \ + ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if ((imm) >= 16) \ + ret = vreinterpretq_m128i_s8( \ + vextq_s8(vreinterpretq_s8_m128i(_a_m128i), vdupq_n_s8(0), \ + ((imm) >= 16 && (imm) < 32) ? (imm) - 16 : 0)); \ + else \ + ret = vreinterpretq_m128i_u8( \ + vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0)); \ + ret; \ }) +// Clang path: inline _mm_srli_si128 logic to avoid both: +// 1. Variable shadowing: _mm_srli_si128(_a, ...) creates __m128i _a = (_a) +// 2. Double evaluation: _mm_srli_si128((a), ...) re-evaluates macro arg +#elif SSE2NEON_COMPILER_CLANG +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if ((imm) >= 16) ret = vreinterpretq_m128i_s8( \ + vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + ((imm) >= 16 && (imm) < 32) ? 
(imm) - 16 : 0)); \ + else ret = vreinterpretq_m128i_u8( \ + vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0)); \ + _sse2neon_return(ret);) + +// MSVC path: use _a (lambda parameter) since lambda [] cannot capture (a). +// No shadowing issue because lambda parameters shadow captures properly. #else -#define _mm_alignr_epi8(a, b, imm) \ - _sse2neon_define2( \ - __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ - uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ - if (_sse2neon_unlikely((imm) & ~31)) ret = \ - vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ - else if (imm >= 16) ret = \ - _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \ - else ret = \ - vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \ +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if ((imm) >= 16) ret = \ + _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0); \ + else ret = vreinterpretq_m128i_u8( \ + vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0)); \ _sse2neon_return(ret);) #endif @@ -6136,26 +7341,41 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift // the result right by imm8 bytes, and store the low 8 bytes in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +// imm must be a compile-time constant in range [0, 255] +#if defined(__GNUC__) && !defined(__clang__) #define _mm_alignr_pi8(a, b, imm) \ - _sse2neon_define2( \ - __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \ + __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + __m64 _a = (a), _b = (b); \ + __m64 ret; \ + if (_sse2neon_unlikely((imm) >= 16)) { \ ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else if ((imm) >= 8) { \ + ret = vreinterpret_m64_u8( \ + vext_u8(vreinterpret_u8_m64(_a), vdup_n_u8(0), (imm) - 8)); \ } else { \ - uint8x8_t tmp_low; \ - uint8x8_t tmp_high; \ - if ((imm) >= 8) { \ - const int idx = (imm) -8; \ - tmp_low = vreinterpret_u8_m64(_a); \ - tmp_high = vdup_n_u8(0); \ - ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ - } else { \ - const int idx = (imm); \ - tmp_low = vreinterpret_u8_m64(_b); \ - tmp_high = vreinterpret_u8_m64(_a); \ - ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ - } \ + ret = vreinterpret_m64_u8(vext_u8( \ + vreinterpret_u8_m64(_b), vreinterpret_u8_m64(_a), (imm))); \ + } \ + ret; \ + }) + +#else +#define _mm_alignr_pi8(a, b, imm) \ + _sse2neon_define2( \ + __m64, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m64 ret; \ + if (_sse2neon_unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else if ((imm) >= 8) { \ + ret = vreinterpret_m64_u8(vext_u8(vreinterpret_u8_m64(_a), \ + vdup_n_u8(0), ((imm) - 8) & 7)); \ + } else { \ + ret = vreinterpret_m64_u8(vext_u8( \ + vreinterpret_u8_m64(_b), vreinterpret_u8_m64(_a), (imm) & 7)); \ } _sse2neon_return(ret);) +#endif + // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the // signed 16-bit results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 @@ -6163,7 +7383,7 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); #else return vreinterpretq_m128i_s16( @@ -6179,7 +7399,7 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s32(vpaddq_s32(a, b)); #else return vreinterpretq_m128i_s32( @@ -6211,7 +7431,7 @@ FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); return vreinterpretq_s64_s16( @@ -6236,7 +7456,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t res = vuzp_s16(a, b); @@ -6251,7 +7471,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16( vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else @@ -6267,7 +7487,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = 
vreinterpretq_s32_m128i(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s32( vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); #else @@ -6283,7 +7503,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); @@ -6298,7 +7518,7 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) { int32x2_t a = vreinterpret_s32_m64(_a); int32x2_t b = vreinterpret_s32_m64(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); #else int32x2x2_t c = vuzp_s32(a, b); @@ -6313,7 +7533,7 @@ FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16( vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else @@ -6329,7 +7549,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); @@ -6344,7 +7564,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 uint8x16_t a = vreinterpretq_u8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); int16x8_t tl = 
vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), @@ -6448,7 +7668,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); #elif defined(__GNUC__) int8x16_t ret; @@ -6475,7 +7695,8 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) { const int8x8_t controlMask = - vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07))); + vand_s8(vreinterpret_s8_m64(b), + vdup_n_s8(_sse2neon_static_cast(int8_t, 0x1 << 7 | 0x07))); int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); return vreinterpret_m64_s8(res); } @@ -6494,7 +7715,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) // (b < 0) ? 0xFFFF : 0 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); #else int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); @@ -6523,7 +7744,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); #else int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); @@ -6552,7 +7773,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); // (b == 0) ? 
0xFF : 0 -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); #else int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); @@ -6581,7 +7802,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); #else int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); @@ -6610,7 +7831,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); #else int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); @@ -6639,7 +7860,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); // (b == 0) ? 0xFF : 0 -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); #else int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); @@ -6659,31 +7880,32 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) // Blend packed 16-bit integers from a and b using control mask imm8, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16 -// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, -// __constrange(0,255) int imm) -#define _mm_blend_epi16(a, b, imm) \ - _sse2neon_define2( \ - __m128i, a, b, \ - const uint16_t _mask[8] = \ - _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 3)) ? 
(uint16_t) -1 : 0x0, \ - ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ - ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \ - uint16x8_t _mask_vec = vld1q_u16(_mask); \ - uint16x8_t __a = vreinterpretq_u16_m128i(_a); \ - uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \ +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, const int imm) +// imm must be a compile-time constant in range [0, 255] +#define _mm_blend_epi16(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + const uint16_t _mask[8] = _sse2neon_init( \ + ((imm) & (1 << 0)) ? _sse2neon_static_cast(uint16_t, -1) : 0x0, \ + ((imm) & (1 << 1)) ? _sse2neon_static_cast(uint16_t, -1) : 0x0, \ + ((imm) & (1 << 2)) ? _sse2neon_static_cast(uint16_t, -1) : 0x0, \ + ((imm) & (1 << 3)) ? _sse2neon_static_cast(uint16_t, -1) : 0x0, \ + ((imm) & (1 << 4)) ? _sse2neon_static_cast(uint16_t, -1) : 0x0, \ + ((imm) & (1 << 5)) ? _sse2neon_static_cast(uint16_t, -1) : 0x0, \ + ((imm) & (1 << 6)) ? _sse2neon_static_cast(uint16_t, -1) : 0x0, \ + ((imm) & (1 << 7)) ? _sse2neon_static_cast(uint16_t, -1) : 0x0); \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t __a = vreinterpretq_u16_m128i(_a); \ + uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \ vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));) // Blend packed double-precision (64-bit) floating-point elements from a and b // using control mask imm8, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd +// imm must be a compile-time constant in range [0, 3] #define _mm_blend_pd(a, b, imm) \ _sse2neon_define2( \ - __m128d, a, b, \ + __m128d, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3); \ const uint64_t _mask[2] = \ _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ ((imm) & (1 << 1)) ? 
~UINT64_C(0) : UINT64_C(0)); \ @@ -6695,18 +7917,19 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps -FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) -{ - const uint32_t ALIGN_STRUCT(16) - data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, - ((imm8) & (1 << 1)) ? UINT32_MAX : 0, - ((imm8) & (1 << 2)) ? UINT32_MAX : 0, - ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; - uint32x4_t mask = vld1q_u32(data); - float32x4_t a = vreinterpretq_f32_m128(_a); - float32x4_t b = vreinterpretq_f32_m128(_b); - return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); -} +// imm8 must be a compile-time constant in range [0, 15] +#define _mm_blend_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm8, 0, 15); \ + const uint32_t _mask[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \ + uint32x4_t _mask_vec = vld1q_u32(_mask); \ + float32x4_t __a = vreinterpretq_f32_m128(_a); \ + float32x4_t __b = vreinterpretq_f32_m128(_b); _sse2neon_return( \ + vreinterpretq_m128_f32(vbslq_f32(_mask_vec, __b, __a)));) // Blend packed 8-bit integers from a and b using mask, and store the results in // dst. 
@@ -6728,7 +7951,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) { uint64x2_t mask = vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); @@ -6758,11 +7981,13 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(ceil(f[1]), ceil(f[0])); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(ceil(a1), ceil(a0)); #endif } @@ -6772,11 +7997,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { -#if (defined(__aarch64__) || defined(_M_ARM64)) || \ - defined(__ARM_FEATURE_DIRECTED_ROUNDING) +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); #else - float *f = (float *) &a; + float *f = _sse2neon_reinterpret_cast(float *, &a); return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); #endif } @@ -6805,7 +8029,7 @@ FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) // in dst FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_u64( vceqq_u64(vreinterpretq_u64_m128i(a), 
vreinterpretq_u64_m128i(b))); #else @@ -6930,7 +8154,7 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) return vreinterpretq_m128i_u32(u32x4); } -// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed // 64-bit integers, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) @@ -6949,11 +8173,11 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) { // Generate mask value from constant immediate bit value - const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; - const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; + const int64_t bit0Mask = imm & 0x01 ? INT64_C(-1) : 0; + const int64_t bit1Mask = imm & 0x02 ? INT64_C(-1) : 0; #if !SSE2NEON_PRECISE_DP - const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; - const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; + const int64_t bit4Mask = imm & 0x10 ? INT64_C(-1) : 0; + const int64_t bit5Mask = imm & 0x20 ? INT64_C(-1) : 0; #endif // Conditional multiplication #if !SSE2NEON_PRECISE_DP @@ -6962,7 +8186,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); __m128d tmp = _mm_and_pd(mul, mulMask); #else -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) : 0; @@ -6970,16 +8194,28 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) : 0; #else - double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; - double d1 = (imm & 0x20) ? 
((double *) &a)[1] * ((double *) &b)[1] : 0; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double d0 = (imm & 0x10) ? a0 * b0 : 0; + double d1 = (imm & 0x20) ? a1 * b1 : 0; #endif __m128d tmp = _mm_set_pd(d1, d0); #endif // Sum the products -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); #else - double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); + double _tmp0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0)); + double _tmp1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1)); + double sum = _tmp0 + _tmp1; #endif // Conditionally store the sum const __m128d sumMask = @@ -6994,38 +8230,55 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) { - float32x4_t elementwise_prod = _mm_mul_ps(a, b); + /* Early exit: no input selected or no output lanes */ + if ((imm & 0xF0) == 0 || (imm & 0x0F) == 0) + return _mm_setzero_ps(); -#if defined(__aarch64__) || defined(_M_ARM64) - /* shortcuts */ - if (imm == 0xFF) { - return _mm_set1_ps(vaddvq_f32(elementwise_prod)); + float32x4_t prod = vreinterpretq_f32_m128(_mm_mul_ps(a, b)); + +#if SSE2NEON_ARCH_AARCH64 + /* Fast path: all elements, broadcast to all lanes */ + if (imm == 0xFF) + return _mm_set1_ps(vaddvq_f32(prod)); + + /* Fast path: 3-element dot product (x,y,z), broadcast to all lanes */ + if (imm == 0x7F) { + prod = vsetq_lane_f32(0.0f, prod, 3); + return _mm_set1_ps(vaddvq_f32(prod)); } - if ((imm & 0x0F) 
== 0x0F) { - if (!(imm & (1 << 4))) - elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0); - if (!(imm & (1 << 5))) - elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1); - if (!(imm & (1 << 6))) - elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2); - if (!(imm & (1 << 7))) - elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3); + /* Vectorized generic path: apply input mask, sum, apply output mask */ + const uint32_t input_mask[4] = { + (imm & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0), + (imm & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0), + (imm & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0), + (imm & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0), + }; + prod = vreinterpretq_f32_u32( + vandq_u32(vreinterpretq_u32_f32(prod), vld1q_u32(input_mask))); - return _mm_set1_ps(vaddvq_f32(elementwise_prod)); - } -#endif + float32x4_t sum = vdupq_n_f32(vaddvq_f32(prod)); + const uint32_t output_mask[4] = { + (imm & 0x1) ? ~UINT32_C(0) : UINT32_C(0), + (imm & 0x2) ? ~UINT32_C(0) : UINT32_C(0), + (imm & 0x4) ? ~UINT32_C(0) : UINT32_C(0), + (imm & 0x8) ? ~UINT32_C(0) : UINT32_C(0), + }; + return vreinterpretq_m128_f32(vreinterpretq_f32_u32( + vandq_u32(vreinterpretq_u32_f32(sum), vld1q_u32(output_mask)))); +#else + /* ARMv7: scalar fallback (no vaddvq_f32) */ float s = 0.0f; if (imm & (1 << 4)) - s += vgetq_lane_f32(elementwise_prod, 0); + s += vgetq_lane_f32(prod, 0); if (imm & (1 << 5)) - s += vgetq_lane_f32(elementwise_prod, 1); + s += vgetq_lane_f32(prod, 1); if (imm & (1 << 6)) - s += vgetq_lane_f32(elementwise_prod, 2); + s += vgetq_lane_f32(prod, 2); if (imm & (1 << 7)) - s += vgetq_lane_f32(elementwise_prod, 3); + s += vgetq_lane_f32(prod, 3); const float32_t res[4] = { (imm & 0x1) ? s : 0.0f, @@ -7034,31 +8287,42 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) (imm & 0x8) ? s : 0.0f, }; return vreinterpretq_m128_f32(vld1q_f32(res)); +#endif } // Extract a 32-bit integer from a, selected with imm8, and store the result in // dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32 -// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) -#define _mm_extract_epi32(a, imm) \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) +// FORCE_INLINE int _mm_extract_epi32(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 3] +#define _mm_extract_epi32(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))) // Extract a 64-bit integer from a, selected with imm8, and store the result in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64 -// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) -#define _mm_extract_epi64(a, imm) \ - vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 1] +#define _mm_extract_epi64(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))) // Extract an 8-bit integer from a, selected with imm8, and store the result in -// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a, -// __constrange(0,16) int imm) +// the lower element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8 -#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) +// FORCE_INLINE int _mm_extract_epi8(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 15] +#define _mm_extract_epi8(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 15), \ + vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))) // Extracts the selected single-precision (32-bit) floating-point from a. 
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) -#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) +// FORCE_INLINE int _mm_extract_ps(__m128 a, const int imm) +// imm must be a compile-time constant in range [0, 3] +#define _mm_extract_ps(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \ + vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))) // Round the packed double-precision (64-bit) floating-point elements in a down // to an integer value, and store the results as packed double-precision @@ -7066,11 +8330,13 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd FORCE_INLINE __m128d _mm_floor_pd(__m128d a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(floor(f[1]), floor(f[0])); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(floor(a1), floor(a0)); #endif } @@ -7080,11 +8346,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { -#if (defined(__aarch64__) || defined(_M_ARM64)) || \ - defined(__ARM_FEATURE_DIRECTED_ROUNDING) +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); #else - float *f = (float *) &a; + float *f = _sse2neon_reinterpret_cast(float *, &a); return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); #endif } @@ -7112,51 +8377,56 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) // Copy a to dst, and insert the 32-bit integer i into dst at 
the location // specified by imm8. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 -// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, -// __constrange(0,4) int imm) -#define _mm_insert_epi32(a, b, imm) \ - vreinterpretq_m128i_s32( \ - vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))) +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, const int imm) +// imm must be a compile-time constant in range [0, 3] +#define _mm_insert_epi32(a, b, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))) // Copy a to dst, and insert the 64-bit integer i into dst at the location // specified by imm8. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 -// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, -// __constrange(0,2) int imm) -#define _mm_insert_epi64(a, b, imm) \ - vreinterpretq_m128i_s64( \ - vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))) +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, const int imm) +// imm must be a compile-time constant in range [0, 1] +#define _mm_insert_epi64(a, b, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 1), \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))) // Copy a to dst, and insert the lower 8-bit integer from i into dst at the // location specified by imm8. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 -// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, -// __constrange(0,16) int imm) -#define _mm_insert_epi8(a, b, imm) \ - vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))) +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, const int imm) +// imm must be a compile-time constant in range [0, 15] +#define _mm_insert_epi8(a, b, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 15), \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))) // Copy a to tmp, then insert a single-precision (32-bit) floating-point // element from b into tmp using the control in imm8. Store tmp to dst using // the mask in imm8 (elements are zeroed out when the corresponding bit is set). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps -#define _mm_insert_ps(a, b, imm8) \ - _sse2neon_define2( \ - __m128, a, b, \ - float32x4_t tmp1 = \ - vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \ - vreinterpretq_f32_m128(_a), 0); \ - float32x4_t tmp2 = \ - vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ - vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \ - const uint32_t data[4] = \ - _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ - ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ - ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ - ((imm8) & (1 << 3)) ? 
UINT32_MAX : 0); \ - uint32x4_t mask = vld1q_u32(data); \ - float32x4_t all_zeros = vdupq_n_f32(0); \ - \ - _sse2neon_return(vreinterpretq_m128_f32( \ +// imm8 must be a compile-time constant in range [0, 255] +#define _mm_insert_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm8, 0, 255); \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3), \ + vreinterpretq_f32_m128(_a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ + vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \ + const uint32_t data[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \ + uint32x4_t mask = vld1q_u32(data); \ + float32x4_t all_zeros = vdupq_n_f32(0); \ + \ + _sse2neon_return(vreinterpretq_m128_f32( \ vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));) // Compare packed signed 32-bit integers in a and b, and store packed maximum @@ -7236,45 +8506,40 @@ FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) { - __m128i dst; uint16_t min, idx = 0; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 + uint16x8_t _a = vreinterpretq_u16_m128i(a); // Find the minimum value - min = vminvq_u16(vreinterpretq_u16_m128i(a)); + min = vminvq_u16(_a); // Get the index of the minimum value static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7}; uint16x8_t minv = vdupq_n_u16(min); - uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a)); + uint16x8_t cmeq = vceqq_u16(minv, _a); idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq)); #else + uint16x8_t _a = vreinterpretq_u16_m128i(a); // Find the minimum value - __m64 tmp; - tmp = vreinterpret_m64_u16( - 
vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), - vget_high_u16(vreinterpretq_u16_m128i(a)))); - tmp = vreinterpret_m64_u16( - vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); - tmp = vreinterpret_m64_u16( - vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); - min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); + uint16x4_t tmp = vmin_u16(vget_low_u16(_a), vget_high_u16(_a)); + tmp = vpmin_u16(tmp, tmp); + tmp = vpmin_u16(tmp, tmp); + min = vget_lane_u16(tmp, 0); // Get the index of the minimum value int i; for (i = 0; i < 8; i++) { - if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { - idx = (uint16_t) i; + if (min == vgetq_lane_u16(_a, 0)) { + idx = _sse2neon_static_cast(uint16_t, i); break; } - a = _mm_srli_si128(a, 2); + _a = vreinterpretq_u16_s8( + vextq_s8(vreinterpretq_s8_u16(_a), vreinterpretq_s8_u16(_a), 2)); } #endif // Generate result - dst = _mm_setzero_si128(); - dst = vreinterpretq_m128i_u16( - vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); - dst = vreinterpretq_m128i_u16( - vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); - return dst; + uint16x8_t result = vdupq_n_u16(0); + result = vsetq_lane_u16(min, result, 0); + result = vsetq_lane_u16(idx, result, 1); + return vreinterpretq_m128i_u16(result); } // Compute the sum of absolute differences (SADs) of quadruplets of unsigned @@ -7298,9 +8563,9 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) vreinterpretq_u32_m128i(a), 1)); break; default: -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT __builtin_unreachable(); -#elif defined(_MSC_VER) +#elif SSE2NEON_COMPILER_MSVC __assume(0); #endif break; @@ -7324,9 +8589,9 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); break; default: -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT __builtin_unreachable(); -#elif 
defined(_MSC_VER) +#elif SSE2NEON_COMPILER_MSVC __assume(0); #endif break; @@ -7341,7 +8606,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); uint8x16_t _a_3 = vextq_u8(_a, _a, 3); c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 // |0|4|2|6| c04 = vpaddq_s16(c04, c26); // |1|5|3|7| @@ -7401,23 +8666,25 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) { -#if defined(__aarch64__) || defined(_M_ARM64) + rounding &= ~(_MM_FROUND_RAISE_EXC | _MM_FROUND_NO_EXC); + +#if SSE2NEON_ARCH_AARCH64 switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_NEAREST_INT: return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_NEG_INF: return _mm_floor_pd(a); - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_POS_INF: return _mm_ceil_pd(a); - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_ZERO: return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); } #else - double *v_double = (double *) &a; + double *v_double = _sse2neon_reinterpret_cast(double *, &a); - if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + if (rounding == _MM_FROUND_TO_NEAREST_INT || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { double res[2], tmp; @@ -7450,11 +8717,11 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) res[i] = (v_double[i] < 0) ? 
-res[i] : res[i]; } return _mm_set_pd(res[1], res[0]); - } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + } else if (rounding == _MM_FROUND_TO_NEG_INF || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_pd(a); - } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + } else if (rounding == _MM_FROUND_TO_POS_INF || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_pd(a); @@ -7470,49 +8737,58 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) { -#if (defined(__aarch64__) || defined(_M_ARM64)) || \ - defined(__ARM_FEATURE_DIRECTED_ROUNDING) + rounding &= ~(_MM_FROUND_RAISE_EXC | _MM_FROUND_NO_EXC); + +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_NEAREST_INT: return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_NEG_INF: return _mm_floor_ps(a); - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_POS_INF: return _mm_ceil_ps(a); - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_ZERO: return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); } #else - float *v_float = (float *) &a; + float *v_float = _sse2neon_reinterpret_cast(float *, &a); + float32x4_t v = vreinterpretq_f32_m128(a); - if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + /* Detect values safe to convert to int32. 
Values outside this range + * (including infinity, NaN, and large finite values) must be preserved + * as-is since integer conversion would produce undefined results. */ + const float32x4_t max_representable = vdupq_n_f32(2147483520.0f); + uint32x4_t is_safe = + vcleq_f32(vabsq_f32(v), max_representable); /* |v| <= max int32 */ + + if (rounding == _MM_FROUND_TO_NEAREST_INT || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { uint32x4_t signmask = vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), - vdupq_n_f32(0.5f)); /* +/- 0.5 */ - int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( - vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ - int32x4_t r_trunc = vcvtq_s32_f32( - vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + float32x4_t half = + vbslq_f32(signmask, v, vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = + vcvtq_s32_f32(vaddq_f32(v, half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32(v); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( - vreinterpretq_f32_m128(a), - vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + v, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return vreinterpretq_m128_f32( - vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); - } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + float32x4_t rounded = + vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)); + /* Preserve original value for inputs outside int32 range */ + return vreinterpretq_m128_f32(vbslq_f32(is_safe, rounded, v)); + } else if (rounding == _MM_FROUND_TO_NEG_INF || 
(rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_ps(a); - } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + } else if (rounding == _MM_FROUND_TO_POS_INF || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_ps(a); @@ -7557,13 +8833,16 @@ FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) // Load 128-bits of integer data from memory into dst using a non-temporal // memory hint. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. +// Note: On AArch64, __builtin_nontemporal_load generates LDNP (Load +// Non-temporal Pair), providing true non-temporal hint for 128-bit loads. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) { -#if __has_builtin(__builtin_nontemporal_store) +#if __has_builtin(__builtin_nontemporal_load) return __builtin_nontemporal_load(p); #else - return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); + return vreinterpretq_m128i_s64( + vld1q_s64(_sse2neon_reinterpret_cast(int64_t *, p))); #endif } @@ -7572,8 +8851,9 @@ FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones FORCE_INLINE int _mm_test_all_ones(__m128i a) { - return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == - ~(uint64_t) 0; + return _sse2neon_static_cast(uint64_t, + vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~_sse2neon_static_cast(uint64_t, 0); } // Compute the bitwise AND of 128 bits (representing integer data) in a and @@ -7592,14 +8872,22 @@ FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, // otherwise return 0. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// Note: Argument names may be wrong in the Intel intrinsics guide. FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) { - uint64x2_t zf = - vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); - uint64x2_t cf = - vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); - uint64x2_t result = vandq_u64(zf, cf); - return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1)); + uint64x2_t v = vreinterpretq_u64_m128i(a); + uint64x2_t m = vreinterpretq_u64_m128i(mask); + + // find ones (set-bits) and zeros (clear-bits) under clip mask + uint64x2_t ones = vandq_u64(m, v); + uint64x2_t zeros = vbicq_u64(m, v); + + // If both 128-bit variables are populated (non-zero) then return 1. + // For comparison purposes, first compact each var down to 32-bits. + uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); + + // if folding minimum is non-zero then both vars must be non-zero + return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, @@ -7609,9 +8897,9 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) { - int64x2_t s64 = + int64x2_t s64_vec = vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, @@ -7629,17 +8917,17 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) { - int64x2_t s64 
= + int64x2_t s64_vec = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1)); } /* SSE4.2 */ -const static uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { +static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; -const static uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { +static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; @@ -7799,40 +9087,40 @@ const static uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { SSE2NEON_CAT(u, size))) \ } while (0) -#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ - static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ - int lb) \ - { \ - __m128i mtx[16]; \ - PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ - return SSE2NEON_CAT( \ - _sse2neon_aggregate_equal_any_, \ - SSE2NEON_CAT( \ - SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ +#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ + static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_any_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ } -#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ - static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ - int lb) \ - { \ - __m128i mtx[16]; \ - PCMPSTR_RANGES( 
\ - a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ - return SSE2NEON_CAT( \ - _sse2neon_aggregate_ranges_, \ - SSE2NEON_CAT( \ - SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ +#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ + static uint16_t _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_RANGES( \ + a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_ranges_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ } #define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ - static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ - __m128i b, int lb) \ + static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ + __m128i b, int lb) \ { \ __m128i mtx[16]; \ PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ @@ -7846,40 +9134,88 @@ const static uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ } -static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) +static uint16_t _sse2neon_aggregate_equal_any_8x16(int la, + int lb, + __m128i mtx[16]) { - int res = 0; int m = (1 << la) - 1; uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x8_t t_lo = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m & 0xff)), vec_mask); + uint8x8_t t_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m >> 8)), vec_mask); uint8x16_t vec = vcombine_u8(t_lo, t_hi); - for 
(int j = 0; j < lb; j++) { - mtx[j] = vreinterpretq_m128i_u8( - vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); - mtx[j] = vreinterpretq_m128i_u8( - vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); - int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; - res |= (tmp << j); + + /* Process all 16 rows in parallel. + * For each row j, check if any element in mtx[j] (masked by vec) is + * non-zero. Result bit j = 1 if row j has any match. + * + * Key optimization: Process all rows, then mask by lb at the end. + * This allows full SIMD utilization without loop-carried dependencies. + */ +#if SSE2NEON_ARCH_AARCH64 + /* AArch64: Use vmaxvq for horizontal max (equivalent to OR for 0/1) */ +#define SSE2NEON_UMAXV_MATCH(i) \ + ((vmaxvq_u8(vandq_u8(vec, vreinterpretq_u8_m128i(mtx[i]))) ? 1U : 0U) \ + << (i)) + uint16_t res = _sse2neon_static_cast( + uint16_t, (SSE2NEON_UMAXV_MATCH(0) | SSE2NEON_UMAXV_MATCH(1) | + SSE2NEON_UMAXV_MATCH(2) | SSE2NEON_UMAXV_MATCH(3) | + SSE2NEON_UMAXV_MATCH(4) | SSE2NEON_UMAXV_MATCH(5) | + SSE2NEON_UMAXV_MATCH(6) | SSE2NEON_UMAXV_MATCH(7) | + SSE2NEON_UMAXV_MATCH(8) | SSE2NEON_UMAXV_MATCH(9) | + SSE2NEON_UMAXV_MATCH(10) | SSE2NEON_UMAXV_MATCH(11) | + SSE2NEON_UMAXV_MATCH(12) | SSE2NEON_UMAXV_MATCH(13) | + SSE2NEON_UMAXV_MATCH(14) | SSE2NEON_UMAXV_MATCH(15)) & + 0xFFFFu); +#undef SSE2NEON_UMAXV_MATCH +#else + /* ARMv7: Use OR-based horizontal reduction (faster than vpmax cascade). + * The _sse2neon_any_nonzero_u8x16 helper uses 3 OR ops vs 4 vpmax ops. + */ + uint16_t res = 0; + for (int j = 0; j < 16; j++) { + uint8x16_t masked = vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])); + res |= (_sse2neon_any_nonzero_u8x16(masked) ? 
1U : 0U) << j; } - return res; +#endif + /* Mask result to valid range based on lb */ + return res & _sse2neon_static_cast(uint16_t, (1 << lb) - 1); } -static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) +static uint16_t _sse2neon_aggregate_equal_any_16x8(int la, + int lb, + __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; + uint16_t m = _sse2neon_static_cast(uint16_t, 1 << la) - 1; uint16x8_t vec = vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); - for (int j = 0; j < lb; j++) { - mtx[j] = vreinterpretq_m128i_u16( - vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); - mtx[j] = vreinterpretq_m128i_u16( - vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); - int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0; - res |= (tmp << j); + + /* Process all 8 rows in parallel for 16-bit word mode. + * Result bit j = 1 if any element in row j matches. + */ +#if SSE2NEON_ARCH_AARCH64 + /* AArch64: Use vmaxvq for horizontal max */ +#define SSE2NEON_UMAXV_MATCH16(i) \ + ((vmaxvq_u16(vandq_u16(vec, vreinterpretq_u16_m128i(mtx[i]))) ? 1U : 0U) \ + << (i)) + uint16_t res = _sse2neon_static_cast( + uint16_t, (SSE2NEON_UMAXV_MATCH16(0) | SSE2NEON_UMAXV_MATCH16(1) | + SSE2NEON_UMAXV_MATCH16(2) | SSE2NEON_UMAXV_MATCH16(3) | + SSE2NEON_UMAXV_MATCH16(4) | SSE2NEON_UMAXV_MATCH16(5) | + SSE2NEON_UMAXV_MATCH16(6) | SSE2NEON_UMAXV_MATCH16(7)) & + 0xFFu); +#undef SSE2NEON_UMAXV_MATCH16 +#else + /* ARMv7: Use OR-based horizontal reduction */ + uint16_t res = 0; + for (int j = 0; j < 8; j++) { + uint16x8_t masked = vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])); + res |= (_sse2neon_any_nonzero_u16x8(masked) ? 
1U : 0U) << j; } - return res; +#endif + /* Mask result to valid range based on lb */ + return res & _sse2neon_static_cast(uint16_t, (1 << lb) - 1); } /* clang-format off */ @@ -7890,12 +9226,51 @@ static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_) -static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) +static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; + uint16_t m = _sse2neon_static_cast(uint16_t, 1 << la) - 1; uint16x8_t vec = vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + +#if SSE2NEON_ARCH_AARCH64 + /* Vectorized: process all 8 rows in parallel using vmaxvq. + * For RANGES mode with word elements: + * - Each row has 8 u16 values representing comparisons with 4 range pairs + * - Adjacent u16 elements [2k, 2k+1] form a range: (char >= low, char <= + * high) + * - Result bit j = 1 if any range pair matches for haystack position j + * + * Algorithm per row: + * 1. Mask by la validity: vand(vec, mtx[i]) + * 2. Swap adjacent u16 pairs: vrev32 swaps within each 32-bit lane + * 3. Pair-AND: AND original with swapped to get [m0&m1, m0&m1, ...] + * 4. Horizontal OR via vmaxvq_u16 (faster than vmaxvq_u32) + */ +#define SSE2NEON_RANGES_MATCH16(i) \ + do { \ + uint16x8_t masked = vandq_u16(vec, vreinterpretq_u16_m128i(mtx[i])); \ + uint16x8_t swapped = vrev32q_u16(masked); \ + uint16x8_t pair_and = vandq_u16(masked, swapped); \ + res |= _sse2neon_static_cast(uint16_t, \ + (vmaxvq_u16(pair_and) ? 
1U : 0U) << i); \ + } while (0) + + uint16_t res = 0; + SSE2NEON_RANGES_MATCH16(0); + SSE2NEON_RANGES_MATCH16(1); + SSE2NEON_RANGES_MATCH16(2); + SSE2NEON_RANGES_MATCH16(3); + SSE2NEON_RANGES_MATCH16(4); + SSE2NEON_RANGES_MATCH16(5); + SSE2NEON_RANGES_MATCH16(6); + SSE2NEON_RANGES_MATCH16(7); +#undef SSE2NEON_RANGES_MATCH16 + + /* Mask result to valid range based on lb */ + return res & _sse2neon_static_cast(uint16_t, (1 << lb) - 1); +#else + /* ARMv7 fallback: sequential loop */ + uint16_t res = 0; for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u16( vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); @@ -7905,25 +9280,70 @@ static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), vreinterpretq_u32_m128i(tmp)); -#if defined(__aarch64__) || defined(_M_ARM64) - int t = vaddvq_u32(vec_res) ? 1 : 0; -#else uint64x2_t sumh = vpaddlq_u32(vec_res); - int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); -#endif + uint16_t t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); res |= (t << j); } return res; +#endif } -static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) +static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; + uint16_t m = _sse2neon_static_cast(uint16_t, (1 << la) - 1); uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x8_t t_lo = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m & 0xff)), vec_mask); + uint8x8_t t_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m >> 8)), vec_mask); uint8x16_t vec = vcombine_u8(t_lo, t_hi); + +#if SSE2NEON_ARCH_AARCH64 + /* Vectorized: process all 16 rows in parallel using vmaxvq. 
+ * For RANGES mode with byte elements: + * - Each row has 16 bytes representing comparisons with 8 range pairs + * - Adjacent bytes [2k, 2k+1] form a range: (char >= low, char <= high) + * - Result bit j = 1 if any range pair matches for haystack position j + * + * Algorithm per row: + * 1. Mask by la validity: vand(vec, mtx[i]) + * 2. Swap adjacent bytes: vrev16 swaps within each 16-bit lane + * 3. Pair-AND: AND original with swapped to get [b0&b1, b0&b1, ...] + * 4. Horizontal OR via vmaxvq_u8 (faster than vmaxvq_u16) + */ +#define SSE2NEON_RANGES_MATCH8(i) \ + do { \ + uint8x16_t masked = vandq_u8(vec, vreinterpretq_u8_m128i(mtx[i])); \ + uint8x16_t swapped = vrev16q_u8(masked); \ + uint8x16_t pair_and = vandq_u8(masked, swapped); \ + res |= _sse2neon_static_cast(uint16_t, (vmaxvq_u8(pair_and) ? 1U : 0U) \ + << i); \ + } while (0) + + uint16_t res = 0; + SSE2NEON_RANGES_MATCH8(0); + SSE2NEON_RANGES_MATCH8(1); + SSE2NEON_RANGES_MATCH8(2); + SSE2NEON_RANGES_MATCH8(3); + SSE2NEON_RANGES_MATCH8(4); + SSE2NEON_RANGES_MATCH8(5); + SSE2NEON_RANGES_MATCH8(6); + SSE2NEON_RANGES_MATCH8(7); + SSE2NEON_RANGES_MATCH8(8); + SSE2NEON_RANGES_MATCH8(9); + SSE2NEON_RANGES_MATCH8(10); + SSE2NEON_RANGES_MATCH8(11); + SSE2NEON_RANGES_MATCH8(12); + SSE2NEON_RANGES_MATCH8(13); + SSE2NEON_RANGES_MATCH8(14); + SSE2NEON_RANGES_MATCH8(15); +#undef SSE2NEON_RANGES_MATCH8 + + /* Mask result to valid range based on lb */ + return res & _sse2neon_static_cast(uint16_t, (1 << lb) - 1); +#else + /* ARMv7 fallback: sequential loop */ + uint16_t res = 0; for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u8( vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); @@ -7933,10 +9353,11 @@ static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), vreinterpretq_u16_m128i(tmp)); - int t = _sse2neon_vaddvq_u16(vec_res) ? 
1 : 0; + uint16_t t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0; res |= (t << j); } return res; +#endif } #define SSE2NEON_CMP_RANGES_IS_BYTE 1 @@ -7955,22 +9376,29 @@ SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_) #undef SSE2NEON_CMP_RANGES_IS_BYTE #undef SSE2NEON_CMP_RANGES_IS_WORD -static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) +static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a, + int la, + __m128i b, + int lb) { uint8x16_t mtx = vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); - int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); - int m1 = 0x10000 - (1 << la); - int tb = 0x10000 - (1 << lb); + uint16_t m0 = + _sse2neon_static_cast(uint16_t, (la < lb) ? 0 : (1 << la) - (1 << lb)); + uint16_t m1 = _sse2neon_static_cast(uint16_t, 0x10000 - (1 << la)); + uint16_t tb = _sse2neon_static_cast(uint16_t, 0x10000 - (1 << lb)); uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); - vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); - vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); - vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); - vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); - tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); - tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); + vec0_lo = vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m0)), vec_mask); + vec0_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m0 >> 8)), vec_mask); + vec1_lo = vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m1)), vec_mask); + vec1_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m1 >> 8)), vec_mask); + tmp_lo = vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, tb)), vec_mask); + tmp_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, tb >> 8)), vec_mask); res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); @@ -7979,17 +9407,21 @@ static int _sse2neon_cmp_byte_equal_each(__m128i a, 
int la, __m128i b, int lb) res_lo = vand_u8(res_lo, vec_mask); res_hi = vand_u8(res_hi, vec_mask); - int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); - return res; + return _sse2neon_vaddv_u8(res_lo) + + _sse2neon_static_cast(uint16_t, _sse2neon_vaddv_u8(res_hi) << 8); } -static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) +static uint16_t _sse2neon_cmp_word_equal_each(__m128i a, + int la, + __m128i b, + int lb) { uint16x8_t mtx = vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); - int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); - int m1 = 0x100 - (1 << la); - int tb = 0x100 - (1 << lb); + uint16_t m0 = _sse2neon_static_cast( + uint16_t, (la < lb) ? 0 : ((1 << la) - (1 << lb))); + uint16_t m1 = _sse2neon_static_cast(uint16_t, 0x100 - (1 << la)); + uint16_t tb = _sse2neon_static_cast(uint16_t, 0x100 - (1 << lb)); uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); @@ -8000,53 +9432,308 @@ static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) return _sse2neon_vaddvq_u16(mtx); } -#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 -#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 +/* EQUAL_ORDERED aggregation for 8x16 (byte mode). + * The algorithm checks where string a appears in string b. + * For result bit i: AND together mtx[i][0] & mtx[i+1][1] & mtx[i+2][2] & ... + * + * Vectorization approach: transpose matrix FIRST, then apply masking to + * transposed matrix, then use vextq diagonal extraction. + * After transpose: mtx_T[j][i] = mtx[i][j] = (a[j] == b[i]) + * vextq on mtx_T gives: result[i] = mtx_T[0][i] & mtx_T[1][i+1] & ... + * = mtx[i][0] & mtx[i+1][1] & ... (correct!) 
+ */ +static uint16_t _sse2neon_aggregate_equal_ordered_8x16(int bound, + int la, + int lb, + __m128i mtx[16]) +{ +#if SSE2NEON_ARCH_AARCH64 + uint8x16_t rows[16]; + for (int i = 0; i < 16; i++) + rows[i] = vreinterpretq_u8_m128i(mtx[i]); -#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ - static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ - int bound, int la, int lb, __m128i mtx[16]) \ - { \ - int res = 0; \ - int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ - uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ - vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ - vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ - uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ - vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ - vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ - vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ - uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ - uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ - for (int j = 0; j < lb; j++) { \ - mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ - vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ - } \ - for (int j = lb; j < bound; j++) { \ - mtx[j] = vreinterpretq_m128i_u##size( \ - vbslq_u##size(vec1, vec_minusone, vec_zero)); \ - } \ - unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \ - (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \ - for (int i = 0; i < bound; i++) { \ - int val = 1; \ - for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ - val &= ptr[k * bound + j]; \ - res += val << i; \ - } \ - return res; \ + /* Transpose the 16x16 byte matrix using hierarchical vtrn operations. 
+ * After transpose: rows[j][i] = original mtx[i][j] + */ + /* Level 1: Transpose 2x2 blocks of 8-bit elements */ + for (int i = 0; i < 16; i += 2) { + uint8x16x2_t t = vtrnq_u8(rows[i], rows[i + 1]); + rows[i] = t.val[0]; + rows[i + 1] = t.val[1]; } -/* clang-format off */ -#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ - prefix##IMPL(8, 16, prefix##IS_UBYTE) \ - prefix##IMPL(16, 8, prefix##IS_UWORD) -/* clang-format on */ + /* Level 2: Transpose 2x2 blocks of 16-bit elements */ + for (int i = 0; i < 16; i += 4) { + uint16x8x2_t t0 = vtrnq_u16(vreinterpretq_u16_u8(rows[i]), + vreinterpretq_u16_u8(rows[i + 2])); + uint16x8x2_t t1 = vtrnq_u16(vreinterpretq_u16_u8(rows[i + 1]), + vreinterpretq_u16_u8(rows[i + 3])); + rows[i] = vreinterpretq_u8_u16(t0.val[0]); + rows[i + 2] = vreinterpretq_u8_u16(t0.val[1]); + rows[i + 1] = vreinterpretq_u8_u16(t1.val[0]); + rows[i + 3] = vreinterpretq_u8_u16(t1.val[1]); + } -SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) + /* Level 3: Transpose 2x2 blocks of 32-bit elements */ + for (int i = 0; i < 16; i += 8) { + uint32x4x2_t t0 = vtrnq_u32(vreinterpretq_u32_u8(rows[i]), + vreinterpretq_u32_u8(rows[i + 4])); + uint32x4x2_t t1 = vtrnq_u32(vreinterpretq_u32_u8(rows[i + 1]), + vreinterpretq_u32_u8(rows[i + 5])); + uint32x4x2_t t2 = vtrnq_u32(vreinterpretq_u32_u8(rows[i + 2]), + vreinterpretq_u32_u8(rows[i + 6])); + uint32x4x2_t t3 = vtrnq_u32(vreinterpretq_u32_u8(rows[i + 3]), + vreinterpretq_u32_u8(rows[i + 7])); + rows[i] = vreinterpretq_u8_u32(t0.val[0]); + rows[i + 4] = vreinterpretq_u8_u32(t0.val[1]); + rows[i + 1] = vreinterpretq_u8_u32(t1.val[0]); + rows[i + 5] = vreinterpretq_u8_u32(t1.val[1]); + rows[i + 2] = vreinterpretq_u8_u32(t2.val[0]); + rows[i + 6] = vreinterpretq_u8_u32(t2.val[1]); + rows[i + 3] = vreinterpretq_u8_u32(t3.val[0]); + rows[i + 7] = vreinterpretq_u8_u32(t3.val[1]); + } -#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE -#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD + /* 
Level 4: Swap 64-bit halves between row pairs */ + { + uint8x16_t tmp; +#define SSE2NEON_SWAP_HL_8(a, b) \ + tmp = vcombine_u8(vget_low_u8(a), vget_low_u8(b)); \ + b = vcombine_u8(vget_high_u8(a), vget_high_u8(b)); \ + a = tmp; + + SSE2NEON_SWAP_HL_8(rows[0], rows[8]); + SSE2NEON_SWAP_HL_8(rows[1], rows[9]); + SSE2NEON_SWAP_HL_8(rows[2], rows[10]); + SSE2NEON_SWAP_HL_8(rows[3], rows[11]); + SSE2NEON_SWAP_HL_8(rows[4], rows[12]); + SSE2NEON_SWAP_HL_8(rows[5], rows[13]); + SSE2NEON_SWAP_HL_8(rows[6], rows[14]); + SSE2NEON_SWAP_HL_8(rows[7], rows[15]); +#undef SSE2NEON_SWAP_HL_8 + } + + /* Apply masking to TRANSPOSED matrix: + * - Rows j >= la: set entire row to 0xFF (needle positions beyond la) + * - For rows j < la: columns k >= lb set to 0x00 (force AND fail for + * positions that would access haystack beyond lb) + * + * lb_valid has bits set for valid positions (0..lb-1) + * lb_clear has 0xFF for positions < lb, 0x00 for positions >= lb + */ + uint8x16_t vec_ff = vdupq_n_u8(0xFF); + uint16_t lb_valid = + _sse2neon_static_cast(uint16_t, (1U << lb) - 1); /* e.g. lb=6: 0x003F */ + uint8x8_t pos_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x16_t lb_clear = vcombine_u8( + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, lb_valid)), pos_mask), + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, lb_valid >> 8)), + pos_mask)); + + for (int j = 0; j < la; j++) { + rows[j] = vandq_u8(rows[j], lb_clear); /* clear positions >= lb */ + } + for (int j = la; j < 16; j++) { + rows[j] = vec_ff; + } + + /* vextq diagonal extraction: shift row k by k, then AND all rows. + * result[i] = rows[0][i] & rows[1][i+1] & rows[2][i+2] & ... 
+ */ + uint8x16_t result = vec_ff; + +/* Shift row K by K positions, filling with 0xFF, then AND into result */ +#define SSE2NEON_VEXT_AND_8(K) \ + do { \ + uint8x16_t shifted = vextq_u8(rows[K], vec_ff, K); \ + result = vandq_u8(result, shifted); \ + } while (0) + + SSE2NEON_VEXT_AND_8(0); + SSE2NEON_VEXT_AND_8(1); + SSE2NEON_VEXT_AND_8(2); + SSE2NEON_VEXT_AND_8(3); + SSE2NEON_VEXT_AND_8(4); + SSE2NEON_VEXT_AND_8(5); + SSE2NEON_VEXT_AND_8(6); + SSE2NEON_VEXT_AND_8(7); + SSE2NEON_VEXT_AND_8(8); + SSE2NEON_VEXT_AND_8(9); + SSE2NEON_VEXT_AND_8(10); + SSE2NEON_VEXT_AND_8(11); + SSE2NEON_VEXT_AND_8(12); + SSE2NEON_VEXT_AND_8(13); + SSE2NEON_VEXT_AND_8(14); + SSE2NEON_VEXT_AND_8(15); + +#undef SSE2NEON_VEXT_AND_8 + + /* Convert result to bitmask: each lane is 0xFF (match) or 0x00 (no match). + * Extract MSB of each byte to form 16-bit result using _mm_movemask_epi8 + * approach: shift right to get MSB in LSB, position each bit, sum halves. + */ + uint8x16_t msbs = vshrq_n_u8(result, 7); + static const int8_t shift_table[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7}; + int8x16_t shifts = vld1q_s8(shift_table); + uint8x16_t positioned = vshlq_u8(msbs, shifts); + return _sse2neon_static_cast(uint16_t, + vaddv_u8(vget_low_u8(positioned)) | + (vaddv_u8(vget_high_u8(positioned)) << 8)); +#else + /* ARMv7 fallback: apply masking and use scalar extraction */ + uint16_t m1 = _sse2neon_static_cast(uint16_t, 0x10000 - (1 << la)); + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x16_t vec1 = vcombine_u8( + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m1)), vec_mask), + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m1 >> 8)), vec_mask)); + uint8x16_t vec_minusone = vdupq_n_u8(0xFF); + uint8x16_t vec_zero = vdupq_n_u8(0); + + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vbslq_u8(vec1, vec_minusone, vreinterpretq_u8_m128i(mtx[j]))); + } + for (int j = lb; j < bound; j++) { + mtx[j] = vreinterpretq_m128i_u8(vbslq_u8(vec1, 
vec_minusone, vec_zero)); + } + + uint16_t res = 0; + unsigned char *ptr = _sse2neon_reinterpret_cast(unsigned char *, mtx); + for (int i = 0; i < bound; i++) { + int val = 1; + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) + val &= ptr[k * bound + j]; + res += _sse2neon_static_cast(uint16_t, val << i); + } + return res; +#endif +} + +/* EQUAL_ORDERED aggregation for 16x8 (word mode). + * Same algorithm as 8x16 but for 16-bit elements with 8 lanes. + * + * Vectorization approach: transpose matrix FIRST, then apply masking to + * transposed matrix, then use vextq diagonal extraction. + */ +static uint16_t _sse2neon_aggregate_equal_ordered_16x8(int bound, + int la, + int lb, + __m128i mtx[16]) +{ +#if SSE2NEON_ARCH_AARCH64 + uint16x8_t rows[8]; + for (int i = 0; i < 8; i++) + rows[i] = vreinterpretq_u16_m128i(mtx[i]); + + /* Transpose the 8x8 word matrix using hierarchical vtrn operations. + * After transpose: rows[j][i] = original mtx[i][j] + */ + /* Level 1: Transpose 2x2 blocks of 16-bit elements */ + for (int i = 0; i < 8; i += 2) { + uint16x8x2_t t = vtrnq_u16(rows[i], rows[i + 1]); + rows[i] = t.val[0]; + rows[i + 1] = t.val[1]; + } + + /* Level 2: Transpose 2x2 blocks of 32-bit elements */ + for (int i = 0; i < 8; i += 4) { + uint32x4x2_t t0 = vtrnq_u32(vreinterpretq_u32_u16(rows[i]), + vreinterpretq_u32_u16(rows[i + 2])); + uint32x4x2_t t1 = vtrnq_u32(vreinterpretq_u32_u16(rows[i + 1]), + vreinterpretq_u32_u16(rows[i + 3])); + rows[i] = vreinterpretq_u16_u32(t0.val[0]); + rows[i + 2] = vreinterpretq_u16_u32(t0.val[1]); + rows[i + 1] = vreinterpretq_u16_u32(t1.val[0]); + rows[i + 3] = vreinterpretq_u16_u32(t1.val[1]); + } + + /* Level 3: Swap 64-bit halves between row pairs */ + { + uint16x8_t tmp; +#define SSE2NEON_SWAP_HL_16(a, b) \ + tmp = vcombine_u16(vget_low_u16(a), vget_low_u16(b)); \ + b = vcombine_u16(vget_high_u16(a), vget_high_u16(b)); \ + a = tmp; + + SSE2NEON_SWAP_HL_16(rows[0], rows[4]); + SSE2NEON_SWAP_HL_16(rows[1], rows[5]); + 
SSE2NEON_SWAP_HL_16(rows[2], rows[6]); + SSE2NEON_SWAP_HL_16(rows[3], rows[7]); +#undef SSE2NEON_SWAP_HL_16 + } + + /* Apply masking to TRANSPOSED matrix: + * - Rows j >= la: set entire row to 0xFFFF + * - For rows j < la: columns k >= lb set to 0x0000 + */ + uint16x8_t vec_ff = vdupq_n_u16(0xFFFF); + uint16_t lb_valid = + _sse2neon_static_cast(uint16_t, (1U << lb) - 1); /* e.g. lb=6: 0x003F */ + uint16x8_t pos_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t lb_clear = vtstq_u16(vdupq_n_u16(lb_valid), pos_mask); + + for (int j = 0; j < la; j++) { + rows[j] = vandq_u16(rows[j], lb_clear); + } + for (int j = la; j < 8; j++) { + rows[j] = vec_ff; + } + + /* vextq diagonal extraction: shift row k by k, then AND all rows */ + uint16x8_t result = vec_ff; + +#define SSE2NEON_VEXT_AND_16(K) \ + do { \ + uint16x8_t shifted = vextq_u16(rows[K], vec_ff, K); \ + result = vandq_u16(result, shifted); \ + } while (0) + + SSE2NEON_VEXT_AND_16(0); + SSE2NEON_VEXT_AND_16(1); + SSE2NEON_VEXT_AND_16(2); + SSE2NEON_VEXT_AND_16(3); + SSE2NEON_VEXT_AND_16(4); + SSE2NEON_VEXT_AND_16(5); + SSE2NEON_VEXT_AND_16(6); + SSE2NEON_VEXT_AND_16(7); + +#undef SSE2NEON_VEXT_AND_16 + + /* Convert result to bitmask: each lane is 0xFFFF or 0x0000. + * Extract MSB of each word and form 8-bit result. 
+ */ + uint16x8_t msbs = vshrq_n_u16(result, 15); + uint16x8_t positioned = vmulq_u16(msbs, pos_mask); + return _sse2neon_static_cast(uint16_t, _sse2neon_vaddvq_u16(positioned)); +#else + /* ARMv7 fallback: apply masking and use scalar extraction */ + uint16_t m1 = _sse2neon_static_cast(uint16_t, 0x100 - (1 << la)); + uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); + uint16x8_t vec_minusone = vdupq_n_u16(0xFFFF); + uint16x8_t vec_zero = vdupq_n_u16(0); + + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vbslq_u16(vec1, vec_minusone, vreinterpretq_u16_m128i(mtx[j]))); + } + for (int j = lb; j < bound; j++) { + mtx[j] = + vreinterpretq_m128i_u16(vbslq_u16(vec1, vec_minusone, vec_zero)); + } + + uint16_t res = 0; + unsigned short *ptr = _sse2neon_reinterpret_cast(unsigned short *, mtx); + for (int i = 0; i < bound; i++) { + int val = 1; + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) + val &= ptr[k * bound + j]; + res += _sse2neon_static_cast(uint16_t, val << i); + } + return res; +#endif +} /* clang-format off */ #define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \ @@ -8056,42 +9743,48 @@ SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_) -#define SSE2NEON_CMPESTR_LIST \ - _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ - _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ - _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ - _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ - _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ - _(CMP_UWORD_RANGES, cmp_uword_ranges) \ - _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ - _(CMP_SWORD_RANGES, cmp_sword_ranges) \ - _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ - _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ - _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ - _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ - _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ - 
_(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ - _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ - _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) +#define SSE2NEON_CMPESTR_LIST \ + _SSE2NEON(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _SSE2NEON(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ + _SSE2NEON(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _SSE2NEON(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ + _SSE2NEON(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ + _SSE2NEON(CMP_UWORD_RANGES, cmp_uword_ranges) \ + _SSE2NEON(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ + _SSE2NEON(CMP_SWORD_RANGES, cmp_sword_ranges) \ + _SSE2NEON(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _SSE2NEON(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ + _SSE2NEON(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _SSE2NEON(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ + _SSE2NEON(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _SSE2NEON(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ + _SSE2NEON(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _SSE2NEON(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) enum { -#define _(name, func_suffix) name, +#define _SSE2NEON(name, func_suffix) name, SSE2NEON_CMPESTR_LIST -#undef _ +#undef _SSE2NEON }; -typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); +typedef uint16_t (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); static cmpestr_func_t _sse2neon_cmpfunc_table[] = { -#define _(name, func_suffix) _sse2neon_##func_suffix, +#define _SSE2NEON(name, func_suffix) _sse2neon_##func_suffix, SSE2NEON_CMPESTR_LIST -#undef _ +#undef _SSE2NEON }; -FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) +FORCE_INLINE uint16_t _sse2neon_sido_negative(int res, + int lb, + int imm8, + int bound) { switch (imm8 & 0x30) { case _SIDD_NEGATIVE_POLARITY: res ^= 0xffffffff; break; + case _SIDD_MASKED_POSITIVE_POLARITY: + res &= (1 << lb) - 1; + break; case _SIDD_MASKED_NEGATIVE_POLARITY: res ^= (1 << lb) - 1; break; @@ 
-8099,12 +9792,12 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) break; } - return res & ((bound == 8) ? 0xFF : 0xFFFF); + return _sse2neon_static_cast(uint16_t, res &((bound == 8) ? 0xFF : 0xFFFF)); } FORCE_INLINE int _sse2neon_clz(unsigned int x) { -#if _MSC_VER +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG unsigned long cnt = 0; if (_BitScanReverse(&cnt, x)) return 31 - cnt; @@ -8116,7 +9809,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x) FORCE_INLINE int _sse2neon_ctz(unsigned int x) { -#if _MSC_VER +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG unsigned long cnt = 0; if (_BitScanForward(&cnt, x)) return cnt; @@ -8128,7 +9821,7 @@ FORCE_INLINE int _sse2neon_ctz(unsigned int x) FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) { -#if _MSC_VER +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG unsigned long cnt; #if defined(SSE2NEON_HAS_BITSCAN64) if (_BitScanForward64(&cnt, x)) @@ -8148,7 +9841,7 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) #define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y) #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \ - const int var = (imm & 0x01) ? 8 : 16 + const int var = ((imm) & 0x01) ? 8 : 16 #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \ int tmp1 = la ^ (la >> 31); \ @@ -8163,32 +9856,35 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of // string a and b. 
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ - SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ - SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ - int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ +#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ + SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ + uint16_t r2 = (_sse2neon_cmpfunc_table[(imm8) & 0x0f])(a, la, b, lb); \ r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) -#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ - return (r2 == 0) ? bound \ - : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \ - : _sse2neon_ctz(r2)) +#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ + return (r2 == 0) ? bound \ + : (((imm8) & 0x40) ? (31 - _sse2neon_clz(r2)) \ + : _sse2neon_ctz(r2)) #define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ - if (imm8 & 0x40) { \ + if ((imm8) & 0x40) { \ if (bound == 8) { \ uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ vld1q_u16(_sse2neon_cmpestr_mask16b)); \ dst = vreinterpretq_m128i_u16(vbslq_u16( \ - tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ + tmp, vdupq_n_u16(_sse2neon_static_cast(uint16_t, -1)), \ + vreinterpretq_u16_m128i(dst))); \ } else { \ - uint8x16_t vec_r2 = \ - vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ + uint8x16_t vec_r2 = vcombine_u8( \ + vdup_n_u8(_sse2neon_static_cast(uint8_t, r2)), \ + vdup_n_u8(_sse2neon_static_cast(uint8_t, r2 >> 8))); \ uint8x16_t tmp = \ vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ dst = vreinterpretq_m128i_u8( \ - vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ + vbslq_u8(tmp, vdupq_n_u8(_sse2neon_static_cast(uint8_t, -1)), \ + vreinterpretq_u8_m128i(dst))); \ } \ } else { \ if (bound == 16) { \ @@ -8196,7 +9892,8 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ } else { \ dst = vreinterpretq_m128i_u8( \ - vsetq_lane_u8(r2 & 0xff, 
vreinterpretq_u8_m128i(dst), 0)); \ + vsetq_lane_u8(_sse2neon_static_cast(uint8_t, r2 & 0xff), \ + vreinterpretq_u8_m128i(dst), 0)); \ } \ } \ return dst @@ -8213,7 +9910,7 @@ FORCE_INLINE int _mm_cmpestra(__m128i a, { int lb_cpy = lb; SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); - return !r2 & (lb_cpy > bound); + return !r2 & (lb_cpy >= bound); } // Compare packed strings in a and b with lengths la and lb using the control in @@ -8274,6 +9971,9 @@ FORCE_INLINE int _mm_cmpestrs(__m128i a, int lb, const int imm8) { + (void) a; + (void) b; + (void) lb; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); return la <= (bound - 1); } @@ -8287,13 +9987,16 @@ FORCE_INLINE int _mm_cmpestrz(__m128i a, int lb, const int imm8) { + (void) a; + (void) b; + (void) la; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); return lb <= (bound - 1); } #define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ do { \ - if (imm8 & 0x01) { \ + if ((imm8) & 0x01) { \ uint16x8_t equal_mask_##str = \ vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ @@ -8368,6 +10071,7 @@ FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) { + (void) b; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); int la; SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); @@ -8379,6 +10083,7 @@ FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) { + (void) a; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); int lb; SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); @@ -8389,7 +10094,7 @@ FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) // in b for greater than. 
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_u64( vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else @@ -8399,21 +10104,76 @@ FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) #endif } +/* A function-like macro to generate CRC-32C calculation using Barrett + * reduction. + * + * The input parameters depict as follows: + * - 'crc' means initial value or CRC. + * - 'v' means the element of input message. + * - 'bit' means the element size of input message (e.g., if each message is one + * byte then 'bit' will be 8 as 1 byte equals 8 bits. + * - 'shift' represents a toggle to perform shifting. + * + * For a reminder, the CRC calculation uses bit-reflected sense. + * + * As there are two mysterious variables 'p' and 'mu', here are what they serve: + * 1. 'p' stands for Polynomial P(x) in CRC calculation. + * As we are using CRC-32C, 'p' has the value of 0x105EC76F1 (0x1EDC6F41 in + * bit-reflected form). + * 2. 'mu' stands for the multiplicative inverse of 'p' in GF(64). + * 'mu' has the value of 0x1dea713f1. + * (mu_{64} = \lfloor 2^{64} / P(x) \rfloor = 0x11f91caf6) + * (the bit-reflected form of 0x11f91caf6 is 0x1dea713f1) + * + * The CRC value is calculated as follows: + * 1. Update (XOR) 'crc' with new input message element 'v'. + * 2. Create 'orig' and 'tmp' vector. + * Before creating the vectors, We store 'crc' in lower half of vector + * then shift left by 'bit' bits so that the result of carry-less + * multiplication will always appear in the upper half of destination vector. + * Doing so can reduce some masking and subtraction operations. + * For one exception is that there is no need to perform shifting if 'bit' + * is 64. + * 3. Do carry-less multiplication on the lower half of 'tmp' with 'mu'. + * 4. Do carry-less multiplication on the upper half of 'tmp' with 'p'. + * 5. 
Extract the lower (in bit-reflected sense) 32 bits in the upper half of + * 'tmp'. + */ +#define SSE2NEON_CRC32C_BASE(crc, v, bit, shift) \ + do { \ + crc ^= v; \ + uint64x2_t orig = \ + vcombine_u64(_sse2neon_vcreate_u64(SSE2NEON_IIF(shift)( \ + (uint64_t) (crc) << (bit), (uint64_t) (crc))), \ + _sse2neon_vcreate_u64(0x0)); \ + uint64x2_t tmp = orig; \ + uint64_t p = 0x105EC76F1; \ + uint64_t mu = 0x1dea713f1; \ + tmp = \ + _sse2neon_vmull_p64(vget_low_u64(tmp), _sse2neon_vcreate_u64(mu)); \ + tmp = \ + _sse2neon_vmull_p64(vget_high_u64(tmp), _sse2neon_vcreate_u64(p)); \ + crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 2); \ + } while (0) + // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 16-bit integer v, and stores the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) +#if SSE2NEON_ARCH_AARCH64 && defined(__ARM_FEATURE_CRC32) && !SSE2NEON_ARM64EC __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); -#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ - (defined(_M_ARM64) && !defined(__clang__)) +#elif ((__ARM_ARCH >= 8) && defined(__ARM_FEATURE_CRC32)) || \ + (SSE2NEON_COMPILER_MSVC && defined(_M_ARM64) && !SSE2NEON_ARM64EC && \ + !SSE2NEON_COMPILER_CLANG) crc = __crc32ch(crc, v); +#elif defined(__ARM_FEATURE_CRYPTO) + SSE2NEON_CRC32C_BASE(crc, v, 16, 1); #else - crc = _mm_crc32_u8(crc, v & 0xff); - crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); + crc = _mm_crc32_u8(crc, _sse2neon_static_cast(uint8_t, v & 0xff)); + crc = _mm_crc32_u8(crc, _sse2neon_static_cast(uint8_t, (v >> 8) & 0xff)); #endif return crc; } @@ -8423,16 +10183,20 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32 FORCE_INLINE uint32_t 
_mm_crc32_u32(uint32_t crc, uint32_t v) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) +#if SSE2NEON_ARCH_AARCH64 && defined(__ARM_FEATURE_CRC32) && !SSE2NEON_ARM64EC __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); -#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ - (defined(_M_ARM64) && !defined(__clang__)) +#elif ((__ARM_ARCH >= 8) && defined(__ARM_FEATURE_CRC32)) || \ + (SSE2NEON_COMPILER_MSVC && defined(_M_ARM64) && !SSE2NEON_ARM64EC && \ + !SSE2NEON_COMPILER_CLANG) crc = __crc32cw(crc, v); +#elif defined(__ARM_FEATURE_CRYPTO) + SSE2NEON_CRC32C_BASE(crc, v, 32, 1); #else - crc = _mm_crc32_u16(crc, v & 0xffff); - crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); + crc = _mm_crc32_u16(crc, _sse2neon_static_cast(uint16_t, v & 0xffff)); + crc = + _mm_crc32_u16(crc, _sse2neon_static_cast(uint16_t, (v >> 16) & 0xffff)); #endif return crc; } @@ -8442,15 +10206,21 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64 FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) +#if SSE2NEON_ARCH_AARCH64 && defined(__ARM_FEATURE_CRC32) && !SSE2NEON_ARM64EC __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); -#elif (defined(_M_ARM64) && !defined(__clang__)) - crc = __crc32cd((uint32_t) crc, v); +#elif (SSE2NEON_COMPILER_MSVC && defined(_M_ARM64) && !SSE2NEON_ARM64EC && \ + !SSE2NEON_COMPILER_CLANG) + crc = __crc32cd(_sse2neon_static_cast(uint32_t, crc), v); +#elif defined(__ARM_FEATURE_CRYPTO) + SSE2NEON_CRC32C_BASE(crc, v, 64, 0); #else - crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff); - crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff); + crc = _mm_crc32_u32(_sse2neon_static_cast(uint32_t, crc), + _sse2neon_static_cast(uint32_t, v & 0xffffffff)); + crc = + 
_mm_crc32_u32(_sse2neon_static_cast(uint32_t, crc), + _sse2neon_static_cast(uint32_t, (v >> 32) & 0xffffffff)); #endif return crc; } @@ -8460,28 +10230,44 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) +#if SSE2NEON_ARCH_AARCH64 && defined(__ARM_FEATURE_CRC32) && !SSE2NEON_ARM64EC __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); -#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ - (defined(_M_ARM64) && !defined(__clang__)) +#elif ((__ARM_ARCH >= 8) && defined(__ARM_FEATURE_CRC32)) || \ + (SSE2NEON_COMPILER_MSVC && defined(_M_ARM64) && !SSE2NEON_ARM64EC && \ + !SSE2NEON_COMPILER_CLANG) crc = __crc32cb(crc, v); -#else +#elif defined(__ARM_FEATURE_CRYPTO) + SSE2NEON_CRC32C_BASE(crc, v, 8, 1); +#else // Fall back to the generic table lookup approach + // Adapted from: https://create.stephan-brumme.com/crc32/ + // Apply half-byte comparison algorithm for the best ratio between + // performance and lookup table. + crc ^= v; - for (int bit = 0; bit < 8; bit++) { - if (crc & 1) - crc = (crc >> 1) ^ UINT32_C(0x82f63b78); - else - crc = (crc >> 1); - } + + // The lookup table just needs to store every 16th entry + // of the standard look-up table. + static const uint32_t crc32_half_byte_tbl[] = { + 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3, + 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9, + 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75, + }; + + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; #endif return crc; } /* AES */ -#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) +/* AES software fallback tables. 
+ * Needed when __ARM_FEATURE_CRYPTO is not available, OR on ARM64EC where + * hardware crypto intrinsics may not be accessible despite the feature macro. + */ +#if !defined(__ARM_FEATURE_CRYPTO) || SSE2NEON_ARM64EC || defined(_M_ARM64EC) /* clang-format off */ #define SSE2NEON_AES_SBOX(w) \ { \ @@ -8571,8 +10357,67 @@ static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0); static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0); #undef SSE2NEON_AES_H0 +// File-scope constants for AES permutations - hoisted from inline functions +// to ensure single load across multiple intrinsic calls. +// ShiftRows permutation indices for encryption +static const uint8_t ALIGN_STRUCT(16) _sse2neon_aes_shift_rows[16] = { + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, +}; +// InvShiftRows permutation indices for decryption +static const uint8_t ALIGN_STRUCT(16) _sse2neon_aes_inv_shift_rows[16] = { + 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, + 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, +}; +// Rotate right by 8 bits within each 32-bit word (for MixColumns) +static const uint8_t ALIGN_STRUCT(16) _sse2neon_aes_ror32by8[16] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, +}; + +#if SSE2NEON_ARCH_AARCH64 +// NEON S-box lookup using 4x64-byte tables; reused by aesenc/dec/keygenassist. +// Uses vsubq_u8 instead of C++ operator- for MSVC compatibility. 
+FORCE_INLINE uint8x16_t _sse2neon_aes_subbytes(uint8x16_t x) +{ + uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), x); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), + vsubq_u8(x, vdupq_n_u8(0x40))); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), + vsubq_u8(x, vdupq_n_u8(0x80))); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), + vsubq_u8(x, vdupq_n_u8(0xc0))); + return v; +} + +FORCE_INLINE uint8x16_t _sse2neon_aes_inv_subbytes(uint8x16_t x) +{ + uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), x); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), + vsubq_u8(x, vdupq_n_u8(0x40))); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), + vsubq_u8(x, vdupq_n_u8(0x80))); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), + vsubq_u8(x, vdupq_n_u8(0xc0))); + return v; +} + +// AES xtime: multiply by {02} in GF(2^8) with reduction polynomial 0x11b +// Uses signed comparison to generate mask: if MSB set, XOR with 0x1b +FORCE_INLINE uint8x16_t _sse2neon_aes_xtime(uint8x16_t v) +{ + // Arithmetic right shift by 7 gives 0xFF for bytes >= 0x80, 0x00 otherwise + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v), 7)); + // AND with reduction polynomial 0x1b + uint8x16_t reduced = vandq_u8(mask, vdupq_n_u8(0x1b)); + // Shift left and XOR with reduction + return veorq_u8(vshlq_n_u8(v, 1), reduced); +} +#endif + /* x_time function and matrix multiply function */ -#if !defined(__aarch64__) && !defined(_M_ARM64) +#if !SSE2NEON_ARCH_AARCH64 #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) #define SSE2NEON_MULTIPLY(x, y) \ (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \ @@ -8588,49 +10433,42 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0); // for more information. 
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) { -#if defined(__aarch64__) || defined(_M_ARM64) - static const uint8_t shift_rows[] = { - 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, - 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, - }; - static const uint8_t ror32by8[] = { - 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, - 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, - }; - +#if SSE2NEON_ARCH_AARCH64 uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); /* shift rows */ - w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + w = vqtbl1q_u8(w, vld1q_u8(_sse2neon_aes_shift_rows)); /* sub bytes */ - // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and - // look up each of the table. After each lookup, we load the next table - // which locates at the next 64-bytes. In the meantime, the index in the - // table would be smaller than it was, so the index parameters of - // `vqtbx4q_u8()` need to be added the same constant as the loaded tables. - v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); - // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))' - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + v = _sse2neon_aes_subbytes(w); - /* mix columns */ - w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); - w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); - w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + /* mix columns: + * MixColumns multiplies each column by the matrix: + * [02 03 01 01] + * [01 02 03 01] + * [01 01 02 03] + * [03 01 01 02] + * Using: out = xtime(v) ^ ror8(xtime(v)^v) ^ rot16(v) + */ + w = _sse2neon_aes_xtime(v); // w = v * {02} + w = veorq_u8(w, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(v)))); + w = veorq_u8(w, + vqtbl1q_u8(veorq_u8(v, w), vld1q_u8(_sse2neon_aes_ror32by8))); /* add round key */ - return vreinterpretq_m128i_u8(w) ^ RoundKey; + 
return vreinterpretq_m128i_u8( + veorq_u8(w, vreinterpretq_u8_m128i(RoundKey))); #else /* ARMv7-A implementation for a table-based AES */ -#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ - (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \ - ((uint32_t) (b1) << 8) | (uint32_t) (b0)) -// muliplying 'x' by 2 in GF(2^8) +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + ((_sse2neon_static_cast(uint32_t, b3) << 24) | \ + (_sse2neon_static_cast(uint32_t, b2) << 16) | \ + (_sse2neon_static_cast(uint32_t, b1) << 8) | \ + _sse2neon_static_cast(uint32_t, b0)) +// multiplying 'x' by 2 in GF(2^8) #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) -// muliplying 'x' by 3 in GF(2^8) +// multiplying 'x' by 3 in GF(2^8) #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) #define SSE2NEON_AES_U0(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) @@ -8685,69 +10523,114 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) { -#if defined(__aarch64__) - static const uint8_t inv_shift_rows[] = { - 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, - 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, - }; - static const uint8_t ror32by8[] = { - 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, - 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, - }; - +#if SSE2NEON_ARCH_AARCH64 uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); // inverse shift rows - w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + w = vqtbl1q_u8(w, vld1q_u8(_sse2neon_aes_inv_shift_rows)); // inverse sub bytes - v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + v = _sse2neon_aes_inv_subbytes(w); - // 
inverse mix columns - // muliplying 'v' by 4 in GF(2^8) - w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); - w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); - v ^= w; - v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + /* inverse mix columns: + * InvMixColumns multiplies each column by the matrix: + * [0E 0B 0D 09] + * [09 0E 0B 0D] + * [0D 09 0E 0B] + * [0B 0D 09 0E] + * Computed as: v*{04} ^ v ^ rotate(v*{04}, 16) then standard MixColumns + */ + // v*{04} = xtime(xtime(v)) + w = _sse2neon_aes_xtime(v); + w = _sse2neon_aes_xtime(w); + v = veorq_u8(v, w); + v = veorq_u8(v, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(w)))); - w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & - 0x1b); // muliplying 'v' by 2 in GF(2^8) - w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); - w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + // Apply standard MixColumns to transformed v + w = _sse2neon_aes_xtime(v); + w = veorq_u8(w, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(v)))); + w = veorq_u8(w, + vqtbl1q_u8(veorq_u8(v, w), vld1q_u8(_sse2neon_aes_ror32by8))); // add round key - return vreinterpretq_m128i_u8(w) ^ RoundKey; + return vreinterpretq_m128i_u8( + veorq_u8(w, vreinterpretq_u8_m128i(RoundKey))); -#else /* ARMv7-A NEON implementation */ - /* FIXME: optimized for NEON */ - uint8_t i, e, f, g, h, v[4][4]; - uint8_t *_a = (uint8_t *) &a; - for (i = 0; i < 16; ++i) { - v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; - } +#else /* ARMv7-A implementation using inverse T-tables */ + // GF(2^8) multiplication helpers for InvMixColumns coefficients +#define SSE2NEON_AES_DEC_B2W(b0, b1, b2, b3) \ + ((_sse2neon_static_cast(uint32_t, b3) << 24) | \ + (_sse2neon_static_cast(uint32_t, b2) << 16) | \ + (_sse2neon_static_cast(uint32_t, b1) << 8) | \ + _sse2neon_static_cast(uint32_t, b0)) + // xtime: multiply by 2 in GF(2^8), using 0x011b to clear bit 8 +#define SSE2NEON_AES_DEC_X2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b)) + // multiply by 4 in 
GF(2^8) +#define SSE2NEON_AES_DEC_X4(x) SSE2NEON_AES_DEC_X2(SSE2NEON_AES_DEC_X2(x)) + // multiply by 8 in GF(2^8) +#define SSE2NEON_AES_DEC_X8(x) SSE2NEON_AES_DEC_X2(SSE2NEON_AES_DEC_X4(x)) + // InvMixColumns coefficients: 0x09, 0x0b, 0x0d, 0x0e +#define SSE2NEON_AES_DEC_F9(x) (SSE2NEON_AES_DEC_X8(x) ^ (x)) +#define SSE2NEON_AES_DEC_FB(x) \ + (SSE2NEON_AES_DEC_X8(x) ^ SSE2NEON_AES_DEC_X2(x) ^ (x)) +#define SSE2NEON_AES_DEC_FD(x) \ + (SSE2NEON_AES_DEC_X8(x) ^ SSE2NEON_AES_DEC_X4(x) ^ (x)) +#define SSE2NEON_AES_DEC_FE(x) \ + (SSE2NEON_AES_DEC_X8(x) ^ SSE2NEON_AES_DEC_X4(x) ^ SSE2NEON_AES_DEC_X2(x)) + // Inverse T-table generators combining InvSubBytes + InvMixColumns +#define SSE2NEON_AES_DEC_V0(p) \ + SSE2NEON_AES_DEC_B2W(SSE2NEON_AES_DEC_FE(p), SSE2NEON_AES_DEC_F9(p), \ + SSE2NEON_AES_DEC_FD(p), SSE2NEON_AES_DEC_FB(p)) +#define SSE2NEON_AES_DEC_V1(p) \ + SSE2NEON_AES_DEC_B2W(SSE2NEON_AES_DEC_FB(p), SSE2NEON_AES_DEC_FE(p), \ + SSE2NEON_AES_DEC_F9(p), SSE2NEON_AES_DEC_FD(p)) +#define SSE2NEON_AES_DEC_V2(p) \ + SSE2NEON_AES_DEC_B2W(SSE2NEON_AES_DEC_FD(p), SSE2NEON_AES_DEC_FB(p), \ + SSE2NEON_AES_DEC_FE(p), SSE2NEON_AES_DEC_F9(p)) +#define SSE2NEON_AES_DEC_V3(p) \ + SSE2NEON_AES_DEC_B2W(SSE2NEON_AES_DEC_F9(p), SSE2NEON_AES_DEC_FD(p), \ + SSE2NEON_AES_DEC_FB(p), SSE2NEON_AES_DEC_FE(p)) - // inverse mix columns - for (i = 0; i < 4; ++i) { - e = v[i][0]; - f = v[i][1]; - g = v[i][2]; - h = v[i][3]; + // Inverse T-tables: combine InvShiftRows + InvSubBytes + InvMixColumns + // Each table entry is the InvMixColumns result for that S-box output + static const uint32_t ALIGN_STRUCT(16) aes_inv_table[4][256] = { + SSE2NEON_AES_RSBOX(SSE2NEON_AES_DEC_V0), + SSE2NEON_AES_RSBOX(SSE2NEON_AES_DEC_V1), + SSE2NEON_AES_RSBOX(SSE2NEON_AES_DEC_V2), + SSE2NEON_AES_RSBOX(SSE2NEON_AES_DEC_V3), + }; +#undef SSE2NEON_AES_DEC_B2W +#undef SSE2NEON_AES_DEC_X2 +#undef SSE2NEON_AES_DEC_X4 +#undef SSE2NEON_AES_DEC_X8 +#undef SSE2NEON_AES_DEC_F9 +#undef SSE2NEON_AES_DEC_FB +#undef 
SSE2NEON_AES_DEC_FD +#undef SSE2NEON_AES_DEC_FE +#undef SSE2NEON_AES_DEC_V0 +#undef SSE2NEON_AES_DEC_V1 +#undef SSE2NEON_AES_DEC_V2 +#undef SSE2NEON_AES_DEC_V3 - v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ - SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); - v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ - SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); - v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ - SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); - v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ - SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); - } + uint32_t x0 = _mm_cvtsi128_si32(a); + uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); + uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); + uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); - return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; + // InvShiftRows is integrated into table indexing: + // Row 0: no shift, Row 1: right by 1, Row 2: right by 2, Row 3: right by 3 + __m128i out = _mm_set_epi32( + (aes_inv_table[0][x3 & 0xff] ^ aes_inv_table[1][(x2 >> 8) & 0xff] ^ + aes_inv_table[2][(x1 >> 16) & 0xff] ^ aes_inv_table[3][x0 >> 24]), + (aes_inv_table[0][x2 & 0xff] ^ aes_inv_table[1][(x1 >> 8) & 0xff] ^ + aes_inv_table[2][(x0 >> 16) & 0xff] ^ aes_inv_table[3][x3 >> 24]), + (aes_inv_table[0][x1 & 0xff] ^ aes_inv_table[1][(x0 >> 8) & 0xff] ^ + aes_inv_table[2][(x3 >> 16) & 0xff] ^ aes_inv_table[3][x2 >> 24]), + (aes_inv_table[0][x0 & 0xff] ^ aes_inv_table[1][(x3 >> 8) & 0xff] ^ + aes_inv_table[2][(x2 >> 16) & 0xff] ^ aes_inv_table[3][x1 >> 24])); + + return _mm_xor_si128(out, RoundKey); #endif } @@ -8756,26 +10639,19 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i 
RoundKey) { -#if defined(__aarch64__) - static const uint8_t shift_rows[] = { - 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, - 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, - }; - +#if SSE2NEON_ARCH_AARCH64 uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); - // shift rows - w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + // shift rows - use file-scope constant + w = vqtbl1q_u8(w, vld1q_u8(_sse2neon_aes_shift_rows)); // sub bytes - v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + v = _sse2neon_aes_subbytes(w); // add round key - return vreinterpretq_m128i_u8(v) ^ RoundKey; + return vreinterpretq_m128i_u8( + veorq_u8(v, vreinterpretq_u8_m128i(RoundKey))); #else /* ARMv7-A implementation */ uint8_t v[16] = { @@ -8797,7 +10673,126 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], }; - return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey; + return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey); +#endif +} + +FORCE_INLINE uint8x16_t _sse2neon_vqtbl1q_u8(uint8x16_t t, uint8x16_t idx) +{ +#if SSE2NEON_ARCH_AARCH64 + return vqtbl1q_u8(t, idx); +#else + // Split 'idx' into two D registers. + uint8x8_t idx_low = vget_low_u8(idx); + uint8x8_t idx_high = vget_high_u8(idx); + + uint8x8x2_t tbl = { + vget_low_u8(t), + vget_high_u8(t), + }; + + // Perform Lookup using vtbl2_u8. + // Perform lookup for the first 8 bytes of the result. + uint8x8_t ret_low = vtbl2_u8(tbl, idx_low); + // Perform lookup for the second 8 bytes of the result. + uint8x8_t ret_high = vtbl2_u8(tbl, idx_high); + + // Combine the results.
+ return vcombine_u8(ret_low, ret_high); +#endif +} + +FORCE_INLINE uint8x16_t _sse2neon_vqtbl4q_u8(uint8x16x4_t t, uint8x16_t idx) +{ +#if SSE2NEON_ARCH_AARCH64 + return vqtbl4q_u8(t, idx); +#else + // Split 'idx' into two D registers. + uint8x8_t idx_lo = vget_low_u8(idx); + uint8x8_t idx_hi = vget_high_u8(idx); + + uint8x8x4_t tbl_chunk_0 = { + vget_low_u8(t.val[0]), + vget_high_u8(t.val[0]), + vget_low_u8(t.val[1]), + vget_high_u8(t.val[1]), + }; + + uint8x8x4_t tbl_chunk_1 = { + vget_low_u8(t.val[2]), + vget_high_u8(t.val[2]), + vget_low_u8(t.val[3]), + vget_high_u8(t.val[3]), + }; + + // Shift indices down by 32 so index 32 becomes 0 for the new table. + uint8x16_t idx_minus_32 = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8_t idx_lo_mod = vget_low_u8(idx_minus_32); + uint8x8_t idx_hi_mod = vget_high_u8(idx_minus_32); + + // Pass 1: Use vtbl4_u8 (VTBL). + // NOTE: VTBL produces 0 if the indices are larger than 31. + uint8x8_t ret_lo = vtbl4_u8(tbl_chunk_0, idx_lo); + uint8x8_t ret_hi = vtbl4_u8(tbl_chunk_0, idx_hi); + + // Pass 2: Use vtbx4_u8 (VTBX). + // It takes the result of Pass 1 as the accumulator. + ret_lo = vtbx4_u8(ret_lo, tbl_chunk_1, idx_lo_mod); + ret_hi = vtbx4_u8(ret_hi, tbl_chunk_1, idx_hi_mod); + + // Combine the results + return vcombine_u8(ret_lo, ret_hi); +#endif +} + +FORCE_INLINE uint8x16_t _sse2neon_vqtbx4q_u8(uint8x16_t acc, + uint8x16x4_t t, + uint8x16_t idx) +{ +#if SSE2NEON_ARCH_AARCH64 + return vqtbx4q_u8(acc, t, idx); +#else + // Split 'acc' into two D registers. + uint8x8_t ret_low = vget_low_u8(acc); + uint8x8_t ret_high = vget_high_u8(acc); + // Split 'idx' into two D registers.
+ uint8x8_t idx_low = vget_low_u8(idx); + uint8x8_t idx_high = vget_high_u8(idx); + + uint8x8x4_t tbl_chunk_0 = { + vget_low_u8(t.val[0]), + vget_high_u8(t.val[0]), + vget_low_u8(t.val[1]), + vget_high_u8(t.val[1]), + }; + + uint8x8x4_t tbl_chunk_1 = { + vget_low_u8(t.val[2]), + vget_high_u8(t.val[2]), + vget_low_u8(t.val[3]), + vget_high_u8(t.val[3]), + }; + + // Adjust indices: We want to map index 32 to index 0 of this new table. + // To do so, we subtract 32 from all indices. + // NOTE: If the original index is smaller than 32, the adjusted index wraps + // around due to unsigned underflow (e.g., 5 - 32 = 229). + // Since 229 > 31, vtbx4_u8 (VTBX) preserves the result from Pass 1. + // This is the intended behavior. + uint8x16_t idx_minus_32 = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8_t idx_low_mod = vget_low_u8(idx_minus_32); + uint8x8_t idx_high_mod = vget_high_u8(idx_minus_32); + + // Perform vtbx4_u8 in the first chunk. + ret_low = vtbx4_u8(ret_low, tbl_chunk_0, idx_low); + ret_high = vtbx4_u8(ret_high, tbl_chunk_0, idx_high); + + // Perform vtbx4_u8 on the second chunk. + ret_low = vtbx4_u8(ret_low, tbl_chunk_1, idx_low_mod); + ret_high = vtbx4_u8(ret_high, tbl_chunk_1, idx_high_mod); + + // Combine the results. 
+ return vcombine_u8(ret_low, ret_high); #endif } @@ -8806,36 +10801,42 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) { -#if defined(__aarch64__) - static const uint8_t inv_shift_rows[] = { - 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, - 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, - }; - +#if SSE2NEON_ARCH_AARCH64 uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); - // inverse shift rows - w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + // inverse shift rows - use file-scope constant + w = vqtbl1q_u8(w, vld1q_u8(_sse2neon_aes_inv_shift_rows)); // inverse sub bytes - v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + v = _sse2neon_aes_inv_subbytes(w); // add round key - return vreinterpretq_m128i_u8(v) ^ RoundKey; + return vreinterpretq_m128i_u8( + veorq_u8(v, vreinterpretq_u8_m128i(RoundKey))); -#else /* ARMv7-A NEON implementation */ - /* FIXME: optimized for NEON */ - uint8_t v[4][4]; - uint8_t *_a = (uint8_t *) &a; - for (int i = 0; i < 16; ++i) { - v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; - } +#else /* ARMv7-A implementation */ + // Inverse shift rows indices: 0,13,10,7,4,1,14,11,8,5,2,15,12,9,6,3 + uint8_t v[16] = { + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)], + 
_sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)], + }; - return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; + return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey); #endif } @@ -8843,29 +10844,28 @@ FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) { -#if defined(__aarch64__) - static const uint8_t ror32by8[] = { - 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, - 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, - }; +#if SSE2NEON_ARCH_AARCH64 uint8x16_t v = vreinterpretq_u8_m128i(a); uint8x16_t w; - // multiplying 'v' by 4 in GF(2^8) - w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); - w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); - v ^= w; - v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + /* InvMixColumns: same algorithm as in _mm_aesdec_si128 */ + // v*{04} = xtime(xtime(v)) + w = _sse2neon_aes_xtime(v); + w = _sse2neon_aes_xtime(w); + v = veorq_u8(v, w); + v = veorq_u8(v, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(w)))); - // multiplying 'v' by 2 in GF(2^8) - w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); - w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); - w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + // Apply standard MixColumns pattern + w = 
_sse2neon_aes_xtime(v); + w = veorq_u8(w, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(v)))); + w = veorq_u8(w, + vqtbl1q_u8(veorq_u8(v, w), vld1q_u8(_sse2neon_aes_ror32by8))); return vreinterpretq_m128i_u8(w); #else /* ARMv7-A NEON implementation */ uint8_t i, e, f, g, h, v[4][4]; - vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a)); + vst1q_u8(_sse2neon_reinterpret_cast(uint8_t *, v), + vreinterpretq_u8_m128i(a)); for (i = 0; i < 4; ++i) { e = v[i][0]; f = v[i][1]; @@ -8882,7 +10882,8 @@ FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); } - return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)); + return vreinterpretq_m128i_u8( + vld1q_u8(_sse2neon_reinterpret_cast(uint8_t *, v))); #endif } @@ -8897,25 +10898,27 @@ FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) // for details. FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint8x16_t _a = vreinterpretq_u8_m128i(a); - uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); + uint8x16_t sub = _sse2neon_aes_subbytes(_a); - uint32x4_t v_u32 = vreinterpretq_u32_u8(v); - uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24)); - uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon)); + uint32x4_t sub_u32 = vreinterpretq_u32_u8(sub); + uint32x4_t rot = + vorrq_u32(vshrq_n_u32(sub_u32, 8), vshlq_n_u32(sub_u32, 24)); + uint32x4_t rcon_vec = + vdupq_n_u32(_sse2neon_static_cast(uint32_t, rcon)); // lane-wise xor + uint32x4_t rot_xor = veorq_u32(rot, rcon_vec); - return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v)); + return vreinterpretq_m128i_u32(vtrn2q_u32(sub_u32, rot_xor)); #else /* ARMv7-A NEON 
implementation */ uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); for (int i = 0; i < 4; ++i) { - ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]]; - ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]]; + (_sse2neon_reinterpret_cast(uint8_t *, &X1))[i] = + _sse2neon_sbox[(_sse2neon_reinterpret_cast(uint8_t *, &X1))[i]]; + (_sse2neon_reinterpret_cast(uint8_t *, &X3))[i] = + _sse2neon_sbox[(_sse2neon_reinterpret_cast(uint8_t *, &X3))[i]]; } return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); @@ -8924,7 +10927,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) #undef SSE2NEON_AES_SBOX #undef SSE2NEON_AES_RSBOX -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 #undef SSE2NEON_XT #undef SSE2NEON_MULTIPLY #endif @@ -8982,22 +10985,23 @@ FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) // Assist in expanding the AES cipher key by computing steps towards generating // a round key for encryption cipher using data from a and an 8-bit round -// constant specified in imm8, and store the result in dst." +// constant specified in imm8, and store the result in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { // AESE does ShiftRows and SubBytes on A - uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + uint8x16_t sb_ = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); -#ifndef _MSC_VER +#if !SSE2NEON_COMPILER_MSVC || SSE2NEON_COMPILER_CLANG uint8x16_t dest = { // Undo ShiftRows step from AESE and extract X1 and X3 - u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) - u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) - u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) - u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + sb_[0x4], sb_[0x1], sb_[0xE], sb_[0xB], // SubBytes(X1) + sb_[0x1], sb_[0xE], sb_[0xB], sb_[0x4], // ROT(SubBytes(X1)) + sb_[0xC], sb_[0x9], sb_[0x6], sb_[0x3], // SubBytes(X3) + sb_[0x9], sb_[0x6], sb_[0x3], sb_[0xC], // ROT(SubBytes(X3)) }; - uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + uint32x4_t r = {0, _sse2neon_static_cast(unsigned, rcon), 0, + _sse2neon_static_cast(unsigned, rcon)}; return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); #else // We have to do this hack because MSVC is strictly adhering to the CPP @@ -9006,20 +11010,23 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) // As per the Windows ARM64 ABI, it is always little endian, so this works __n128 dest{ - ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) | - ((uint64_t) u8.n128_u8[0xE] << 16) | - ((uint64_t) u8.n128_u8[0xB] << 24) | - ((uint64_t) u8.n128_u8[0x1] << 32) | - ((uint64_t) u8.n128_u8[0xE] << 40) | - ((uint64_t) u8.n128_u8[0xB] << 48) | - ((uint64_t) u8.n128_u8[0x4] << 56), - ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) | - ((uint64_t) u8.n128_u8[0x6] << 16) | - ((uint64_t) u8.n128_u8[0x3] << 24) | - ((uint64_t) u8.n128_u8[0x9] << 32) | - ((uint64_t) 
u8.n128_u8[0x6] << 40) | - ((uint64_t) u8.n128_u8[0x3] << 48) | - ((uint64_t) u8.n128_u8[0xC] << 56)}; + ((uint64_t) sb_.n128_u8[0x4] << 0) | + ((uint64_t) sb_.n128_u8[0x1] << 8) | + ((uint64_t) sb_.n128_u8[0xE] << 16) | + ((uint64_t) sb_.n128_u8[0xB] << 24) | + ((uint64_t) sb_.n128_u8[0x1] << 32) | + ((uint64_t) sb_.n128_u8[0xE] << 40) | + ((uint64_t) sb_.n128_u8[0xB] << 48) | + ((uint64_t) sb_.n128_u8[0x4] << 56), + ((uint64_t) sb_.n128_u8[0xC] << 0) | + ((uint64_t) sb_.n128_u8[0x9] << 8) | + ((uint64_t) sb_.n128_u8[0x6] << 16) | + ((uint64_t) sb_.n128_u8[0x3] << 24) | + ((uint64_t) sb_.n128_u8[0x9] << 32) | + ((uint64_t) sb_.n128_u8[0x6] << 40) | + ((uint64_t) sb_.n128_u8[0x3] << 48) | + ((uint64_t) sb_.n128_u8[0xC] << 56), + }; dest.n128_u32[1] = dest.n128_u32[1] ^ rcon; dest.n128_u32[3] = dest.n128_u32[3] ^ rcon; @@ -9056,18 +11063,18 @@ FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) } } -FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode() +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) { union { fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ @@ -9081,10 +11088,10 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode() // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 FORCE_INLINE int _mm_popcnt_u32(unsigned int a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 #if __has_builtin(__builtin_popcount) return __builtin_popcount(a); -#elif defined(_MSC_VER) +#elif SSE2NEON_COMPILER_MSVC return _CountOneBits(a); #else return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); @@ -9095,7 +11102,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned 
int a) uint16x4_t count16x4_val; uint32x2_t count32x2_val; - input_val = vld1_u8((uint8_t *) &a); + input_val = vld1_u8(_sse2neon_reinterpret_cast(uint8_t *, &a)); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); @@ -9110,10 +11117,10 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 #if __has_builtin(__builtin_popcountll) return __builtin_popcountll(a); -#elif defined(_MSC_VER) +#elif SSE2NEON_COMPILER_MSVC return _CountOneBits64(a); #else return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); @@ -9125,7 +11132,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) uint32x2_t count32x2_val; uint64x1_t count64x1_val; - input_val = vld1_u8((uint8_t *) &a); + input_val = vld1_u8(_sse2neon_reinterpret_cast(uint8_t *, &a)); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); @@ -9141,14 +11148,14 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) // regardless of the value of the FZ bit. 
union { fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ @@ -9156,10 +11163,10 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 _sse2neon_set_fpcr(r.value); #else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } @@ -9167,16 +11174,15 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc FORCE_INLINE uint64_t _rdtsc(void) { -#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_ARCH_AARCH64 uint64_t val; /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the - * system counter is at least 56 bits wide; from Armv8.6, the counter - * must be 64 bits wide. So the system counter could be less than 64 - * bits wide and it is attributed with the flag 'cap_user_time_short' - * is true. + * system counter is at least 56 bits wide; from Armv8.6, the counter must + * be 64 bits wide. So the system counter could be less than 64 bits wide + * and this is indicated by the flag 'cap_user_time_short' being true.
*/ -#if defined(_MSC_VER) +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); #else __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); @@ -9204,7 +11210,7 @@ FORCE_INLINE uint64_t _rdtsc(void) #endif } -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma pop_macro("ALIGN_STRUCT") #pragma pop_macro("FORCE_INLINE") #endif diff --git a/vendor.yaml b/vendor.yaml index eace983a89..a68535c65d 100644 --- a/vendor.yaml +++ b/vendor.yaml @@ -40,7 +40,7 @@ third-party/libtinyfiledialogs: alternatives: - https://github.com/btzy/nativefiledialog-extended (only file dialog support though!) third-party/sse2neon: - git: https://github.com/DLTcollab/sse2neon/commit/2eede22be8c5922e44616260c5eab728e3c5e26f + git: https://github.com/DLTcollab/sse2neon/releases/tag/v1.9.1 license: MIT third-party/curl: git: https://github.com/curl/curl/tree/curl-8_3_0