#include "SkyBlendCPU.h" #include "game/graphics/opengl_renderer/AdgifHandler.h" #include "common/util/os.h" #include SkyBlendCPU::SkyBlendCPU() { for (int i = 0; i < 2; i++) { glGenTextures(1, &m_textures[i].gl); glBindTexture(GL_TEXTURE_2D, m_textures[i].gl); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, m_sizes[i], m_sizes[i], 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, 0); m_texture_data[i].resize(4 * m_sizes[i] * m_sizes[i]); } } SkyBlendCPU::~SkyBlendCPU() { for (auto& tex : m_textures) { glDeleteTextures(1, &tex.gl); } } void blend_sky_initial_fast(u8 intensity, u8* out, const u8* in, u32 size) { if (get_cpu_info().has_avx2) { #ifdef __AVX2__ __m256i intensity_vec = _mm256_set1_epi16(intensity); for (u32 i = 0; i < size / 16; i++) { __m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16))); __m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8); tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec); tex_data16 = _mm256_srli_epi16(tex_data16, 7); auto hi = _mm256_extracti128_si256(tex_data16, 1); auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi); _mm_storeu_si128((__m128i*)(out + (i * 16)), result); } #else ASSERT(false); #endif } else { __m128i intensity_vec = _mm_set1_epi16(intensity); for (u32 i = 0; i < size / 8; i++) { __m128i tex_data8 = _mm_loadu_si64((const __m128i*)(in + (i * 8))); __m128i tex_data16 = _mm_cvtepu8_epi16(tex_data8); tex_data16 = _mm_mullo_epi16(tex_data16, intensity_vec); tex_data16 = _mm_srli_epi16(tex_data16, 7); auto result = _mm_packus_epi16(tex_data16, tex_data16); _mm_storeu_si64((__m128i*)(out + (i * 8)), result); } } } void blend_sky_fast(u8 intensity, u8* out, const u8* in, u32 size) { if (get_cpu_info().has_avx2) { #ifdef __AVX2__ __m256i intensity_vec = _mm256_set1_epi16(intensity); __m256i max_intensity = _mm256_set1_epi16(255); for (u32 i = 0; i < size / 16; i++) { __m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16))); __m128i out_val = _mm_loadu_si128((const __m128i*)(out + (i * 16))); __m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8); tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec); tex_data16 = _mm256_srli_epi16(tex_data16, 7); tex_data16 = _mm256_min_epi16(max_intensity, tex_data16); auto hi = _mm256_extracti128_si256(tex_data16, 1); auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi); out_val = _mm_adds_epu8(out_val, result); _mm_storeu_si128((__m128i*)(out + (i * 16)), out_val); } #else ASSERT(false); #endif } else { __m128i intensity_vec = _mm_set1_epi16(intensity); __m128i max_intensity = _mm_set1_epi16(255); for (u32 i = 0; i < size / 8; i++) { __m128i tex_data8 = _mm_loadu_si64((const __m128i*)(in + (i * 8))); __m128i out_val = _mm_loadu_si64((const __m128i*)(out + (i * 8))); __m128i tex_data16 = _mm_cvtepu8_epi16(tex_data8); tex_data16 = _mm_mullo_epi16(tex_data16, intensity_vec); tex_data16 = _mm_srli_epi16(tex_data16, 7); tex_data16 = _mm_min_epi16(max_intensity, tex_data16); auto result = _mm_packus_epi16(tex_data16, tex_data16); out_val = _mm_adds_epu8(out_val, result); _mm_storeu_si64((__m128i*)(out + (i * 8)), out_val); } } /* */ } SkyBlendStats SkyBlendCPU::do_sky_blends(DmaFollower& dma, SharedRenderState* render_state, ScopedProfilerNode& /*prof*/) { SkyBlendStats stats; while (dma.current_tag().qwc == 6) { // assuming that the vif and gif-tag is correct auto setup_data = dma.read_and_advance(); // first is an adgif AdgifHelper adgif(setup_data.data + 16); ASSERT(adgif.is_normal_adgif()); ASSERT(adgif.alpha().data == 0x8000000068); // Cs + Cd // next is the actual draw auto draw_data = dma.read_and_advance(); ASSERT(draw_data.size_bytes == 6 * 16); GifTag draw_or_blend_tag(draw_data.data); // the first draw overwrites the previous frame's draw by disabling alpha blend (ABE = 0) bool is_first_draw = !GsPrim(draw_or_blend_tag.prim()).abe(); // here's we're relying on the format of the drawing to get the alpha/offset. u32 coord; u32 intensity; memcpy(&coord, draw_data.data + (5 * 16), 4); memcpy(&intensity, draw_data.data + 16, 4); // we didn't parse the render-to-texture setup earlier, so we need a way to tell sky from // clouds. we can look at the drawing coordinates to tell - the sky is smaller than the clouds. int buffer_idx = 0; if (coord == 0x200) { // sky buffer_idx = 0; } else if (coord == 0x400) { buffer_idx = 1; } else { ASSERT(false); // bad data } // look up the source texture auto tex = render_state->texture_pool->lookup_gpu_texture(adgif.tex0().tbp0()); ASSERT(tex); // slow version /* if (is_first_draw) { memset(m_texture_data[buffer_idx].data(), 0, m_texture_data[buffer_idx].size()); } // intensities should be 0-128 (maybe higher is okay, but I don't see how this could be // generated with the GOAL code.) ASSERT(intensity <= 128); ASSERT(m_texture_data[buffer_idx].size() == tex->data.size()); for (size_t i = 0; i < m_texture_data[buffer_idx].size(); i++) { u32 val = tex->data[i] * intensity; val >>= 7; m_texture_data[buffer_idx][i] += val; } */ if (tex->get_data_ptr()) { if (is_first_draw) { blend_sky_initial_fast(intensity, m_texture_data[buffer_idx].data(), tex->get_data_ptr(), tex->data_size()); } else { blend_sky_fast(intensity, m_texture_data[buffer_idx].data(), tex->get_data_ptr(), tex->data_size()); } if (buffer_idx == 0) { if (is_first_draw) { stats.sky_draws++; } else { stats.sky_blends++; } } else { if (is_first_draw) { stats.cloud_draws++; } else { stats.cloud_blends++; } } glBindTexture(GL_TEXTURE_2D, m_textures[buffer_idx].gl); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, m_sizes[buffer_idx], m_sizes[buffer_idx], 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, m_texture_data[buffer_idx].data()); render_state->texture_pool->move_existing_to_vram(m_textures[buffer_idx].tex, m_textures[buffer_idx].tbp); } } return stats; } void SkyBlendCPU::init_textures(TexturePool& tex_pool) { for (int i = 0; i < 2; i++) { // update it glBindTexture(GL_TEXTURE_2D, m_textures[i].gl); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, m_sizes[i], m_sizes[i], 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, m_texture_data[i].data()); TextureInput in; in.gpu_texture = m_textures[i].gl; in.w = m_sizes[i]; in.h = m_sizes[i]; in.name = fmt::format("PC-SKY-CPU-{}", i); u32 tbp = SKY_TEXTURE_VRAM_ADDRS[i]; m_textures[i].tex = tex_pool.give_texture_and_load_to_vram(in, tbp); m_textures[i].tbp = tbp; } }