#include "background_common.h" #include "game/graphics/opengl_renderer/BucketRenderer.h" #include "game/graphics/pipelines/opengl.h" #include "common/util/os.h" #include DoubleDraw setup_opengl_from_draw_mode(DrawMode mode, u32 tex_unit, bool mipmap) { glActiveTexture(tex_unit); if (mode.get_zt_enable()) { glEnable(GL_DEPTH_TEST); switch (mode.get_depth_test()) { case GsTest::ZTest::NEVER: glDepthFunc(GL_NEVER); break; case GsTest::ZTest::ALWAYS: glDepthFunc(GL_ALWAYS); break; case GsTest::ZTest::GEQUAL: glDepthFunc(GL_GEQUAL); break; case GsTest::ZTest::GREATER: glDepthFunc(GL_GREATER); break; default: ASSERT(false); } } else { glDisable(GL_DEPTH_TEST); } if (mode.get_ab_enable() && mode.get_alpha_blend() != DrawMode::AlphaBlend::DISABLED) { glEnable(GL_BLEND); switch (mode.get_alpha_blend()) { case DrawMode::AlphaBlend::SRC_DST_SRC_DST: glBlendEquation(GL_FUNC_ADD); glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ZERO); break; case DrawMode::AlphaBlend::SRC_0_SRC_DST: glBlendEquation(GL_FUNC_ADD); glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE, GL_ONE, GL_ZERO); break; case DrawMode::AlphaBlend::SRC_0_FIX_DST: glBlendEquation(GL_FUNC_ADD); glBlendFuncSeparate(GL_ONE, GL_ONE, GL_ONE, GL_ZERO); break; case DrawMode::AlphaBlend::SRC_DST_FIX_DST: // Cv = (Cs - Cd) * FIX + Cd // Cs * FIX * 0.5 // Cd * FIX * 0.5 glBlendEquation(GL_FUNC_ADD); glBlendFuncSeparate(GL_CONSTANT_COLOR, GL_CONSTANT_COLOR, GL_ONE, GL_ZERO); glBlendColor(0.5, 0.5, 0.5, 0.5); break; case DrawMode::AlphaBlend::ZERO_SRC_SRC_DST: glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE, GL_ONE, GL_ZERO); glBlendEquation(GL_FUNC_REVERSE_SUBTRACT); break; default: ASSERT(false); } } else { glDisable(GL_BLEND); } if (mode.get_clamp_s_enable()) { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); } else { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); } if (mode.get_clamp_t_enable()) { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); } else { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); } if (mode.get_filt_enable()) { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, mipmap ? GL_LINEAR_MIPMAP_LINEAR : GL_LINEAR); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); } else { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); } // for some reason, they set atest NEVER + FB_ONLY to disable depth writes bool alpha_hack_to_disable_z_write = false; DoubleDraw double_draw; float alpha_min = 0.; if (mode.get_at_enable()) { switch (mode.get_alpha_test()) { case DrawMode::AlphaTest::ALWAYS: break; case DrawMode::AlphaTest::GEQUAL: alpha_min = mode.get_aref() / 127.f; switch (mode.get_alpha_fail()) { case GsTest::AlphaFail::KEEP: // ok, no need for double draw break; case GsTest::AlphaFail::FB_ONLY: // darn, we need to draw twice double_draw.kind = DoubleDrawKind::AFAIL_NO_DEPTH_WRITE; double_draw.aref_second = alpha_min; break; default: ASSERT(false); } break; case DrawMode::AlphaTest::NEVER: if (mode.get_alpha_fail() == GsTest::AlphaFail::FB_ONLY) { alpha_hack_to_disable_z_write = true; } else { ASSERT(false); } break; default: ASSERT(false); } } if (mode.get_depth_write_enable() && !alpha_hack_to_disable_z_write) { glDepthMask(GL_TRUE); } else { glDepthMask(GL_FALSE); } double_draw.aref_first = alpha_min; return double_draw; } DoubleDraw setup_tfrag_shader(SharedRenderState* render_state, DrawMode mode, ShaderId shader) { auto draw_settings = setup_opengl_from_draw_mode(mode, GL_TEXTURE0, true); glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "alpha_min"), draw_settings.aref_first); glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "alpha_max"), 10.f); return draw_settings; } void first_tfrag_draw_setup(const TfragRenderSettings& settings, SharedRenderState* render_state, ShaderId shader) { render_state->shaders[shader].activate(); glUniform1i(glGetUniformLocation(render_state->shaders[shader].id(), "tex_T0"), 0); glUniformMatrix4fv(glGetUniformLocation(render_state->shaders[shader].id(), "camera"), 1, GL_FALSE, settings.math_camera.data()); glUniform4f(glGetUniformLocation(render_state->shaders[shader].id(), "hvdf_offset"), settings.hvdf_offset[0], settings.hvdf_offset[1], settings.hvdf_offset[2], settings.hvdf_offset[3]); glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "fog_constant"), settings.fog.x()); glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "fog_min"), settings.fog.y()); glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "fog_max"), settings.fog.z()); glUniform4f(glGetUniformLocation(render_state->shaders[shader].id(), "fog_color"), render_state->fog_color[0], render_state->fog_color[1], render_state->fog_color[2], render_state->fog_intensity); } void interp_time_of_day_slow(const float weights[8], const std::vector& in, math::Vector* out) { // Timer interp_timer; for (size_t color = 0; color < in.size(); color++) { math::Vector4f result = math::Vector4f::zero(); for (int component = 0; component < 8; component++) { result += in[color].rgba[component].cast() * weights[component]; } result[0] = std::min(result[0], 255.f); result[1] = std::min(result[1], 255.f); result[2] = std::min(result[2], 255.f); result[3] = std::min(result[3], 128.f); // note: different for alpha! out[color] = result.cast(); } } // we want to absolutely minimize the number of time we have to "cross lanes" in AVX (meaning X // component of one vector interacts with Y component of another). We can make this a lot better by // taking groups of 4 time of day colors (each containing 8x RGBAs) and rearranging them with this // pattern. We want to compute: // [rgba][0][0] * weights[0] + [rgba][0][1] * weights[1] + [rgba][0][2]... + rgba[0][7] * weights[7] // RGBA is already a vector of 4 components, but with AVX we have vectors with 32 bytes which fit // 16 colors in them. // This makes each vector have: // colors0 = [rgba][0][0], [rgba][1][0], [rgba][2][0], [rgba][3][0] // colors1 = [rgba][0][1], [rgba][1][1], [rgba][2][1], [rgba][3][1] // ... // so we can basically add up the columns (multiplying by weights in between) // and we'll end up with [final0, final1, final2, final3, final4] // the swizzle function below rearranges to get this pattern. // it's not the most efficient way to do it, but it just runs during loading and not on every frame. SwizzledTimeOfDay swizzle_time_of_day(const std::vector& in) { SwizzledTimeOfDay out; out.data.resize((in.size() + 3) * 8 * 4); // we're rearranging per 4 colors (groups of 32 * 4 = 128) // color (lots of these) // component (8 of these) // channel (4 of these, rgba) for (u32 color_quad = 0; color_quad < (in.size() + 3) / 4; color_quad++) { u8* quad_out = out.data.data() + color_quad * 128; for (u32 component = 0; component < 8; component++) { for (u32 color = 0; color < 4; color++) { for (u32 channel = 0; channel < 4; channel++) { size_t in_idx = color_quad * 4 + color; if (in_idx < in.size()) { *quad_out = in.at(color_quad * 4 + color).rgba[component][channel]; } else { *quad_out = 0; } quad_out++; } } } } out.color_count = (in.size() + 3) & (~3); return out; } // This does the same thing as interp_time_of_day_slow, but is faster. // Due to using integers instead of floats, it may be a tiny bit different. // TODO: it might be possible to reorder the loop into two blocks of loads and avoid spilling xmms. // It's ~8x faster than the slow version. void interp_time_of_day_fast_avx2(const float weights[8], const SwizzledTimeOfDay& in, math::Vector* out) { // even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly #ifdef __AVX2__ // weight multipliers __m256i weights0 = _mm256_set1_epi16(weights[0] * 64.f); __m256i weights1 = _mm256_set1_epi16(weights[1] * 64.f); __m256i weights2 = _mm256_set1_epi16(weights[2] * 64.f); __m256i weights3 = _mm256_set1_epi16(weights[3] * 64.f); __m256i weights4 = _mm256_set1_epi16(weights[4] * 64.f); __m256i weights5 = _mm256_set1_epi16(weights[5] * 64.f); __m256i weights6 = _mm256_set1_epi16(weights[6] * 64.f); __m256i weights7 = _mm256_set1_epi16(weights[7] * 64.f); // saturation: note that alpha is saturated to 128 but the rest are 255. // TODO: maybe we should saturate to 255 for everybody (can do this using a single packus) and // change the shader to deal with this. __m256i sat = _mm256_set_epi16(128, 255, 255, 255, 128, 255, 255, 255, 128, 255, 255, 255, 128, 255, 255, 255); for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) { // first, load colors. We put 16 bytes / register and don't touch the upper half because we // convert u8s to u16s. const u8* base = in.data.data() + color_quad * 128; __m128i color0_p = _mm_loadu_si128((const __m128i*)(base + 0)); __m128i color1_p = _mm_loadu_si128((const __m128i*)(base + 16)); __m128i color2_p = _mm_loadu_si128((const __m128i*)(base + 32)); __m128i color3_p = _mm_loadu_si128((const __m128i*)(base + 48)); __m128i color4_p = _mm_loadu_si128((const __m128i*)(base + 64)); __m128i color5_p = _mm_loadu_si128((const __m128i*)(base + 80)); __m128i color6_p = _mm_loadu_si128((const __m128i*)(base + 96)); __m128i color7_p = _mm_loadu_si128((const __m128i*)(base + 112)); // unpack to 16-bits. each has 16x 16 bit colors. __m256i color0 = _mm256_cvtepu8_epi16(color0_p); __m256i color1 = _mm256_cvtepu8_epi16(color1_p); __m256i color2 = _mm256_cvtepu8_epi16(color2_p); __m256i color3 = _mm256_cvtepu8_epi16(color3_p); __m256i color4 = _mm256_cvtepu8_epi16(color4_p); __m256i color5 = _mm256_cvtepu8_epi16(color5_p); __m256i color6 = _mm256_cvtepu8_epi16(color6_p); __m256i color7 = _mm256_cvtepu8_epi16(color7_p); // multiply by weights color0 = _mm256_mullo_epi16(color0, weights0); color1 = _mm256_mullo_epi16(color1, weights1); color2 = _mm256_mullo_epi16(color2, weights2); color3 = _mm256_mullo_epi16(color3, weights3); color4 = _mm256_mullo_epi16(color4, weights4); color5 = _mm256_mullo_epi16(color5, weights5); color6 = _mm256_mullo_epi16(color6, weights6); color7 = _mm256_mullo_epi16(color7, weights7); // add. This order minimizes dependencies. color0 = _mm256_add_epi16(color0, color1); color2 = _mm256_add_epi16(color2, color3); color4 = _mm256_add_epi16(color4, color5); color6 = _mm256_add_epi16(color6, color7); color0 = _mm256_add_epi16(color0, color2); color4 = _mm256_add_epi16(color4, color6); color0 = _mm256_add_epi16(color0, color4); // divide, because we multiplied our weights by 2^7. color0 = _mm256_srli_epi16(color0, 6); // saturate color0 = _mm256_min_epu16(sat, color0); // back to u8s. auto hi = _mm256_extracti128_si256(color0, 1); auto result = _mm_packus_epi16(_mm256_castsi256_si128(color0), hi); // store result _mm_storeu_si128((__m128i*)(&out[color_quad * 4]), result); } #else // unreachable. (void)weights; (void)in; (void)out; ASSERT(false); #endif } void interp_time_of_day_fast(const float weights[8], const SwizzledTimeOfDay& in, math::Vector* out) { // even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly if (get_cpu_info().has_avx2) { interp_time_of_day_fast_avx2(weights, in, out); return; } // weight multipliers __m128i weights0 = _mm_set1_epi16(weights[0] * 64.f); __m128i weights1 = _mm_set1_epi16(weights[1] * 64.f); __m128i weights2 = _mm_set1_epi16(weights[2] * 64.f); __m128i weights3 = _mm_set1_epi16(weights[3] * 64.f); __m128i weights4 = _mm_set1_epi16(weights[4] * 64.f); __m128i weights5 = _mm_set1_epi16(weights[5] * 64.f); __m128i weights6 = _mm_set1_epi16(weights[6] * 64.f); __m128i weights7 = _mm_set1_epi16(weights[7] * 64.f); // saturation: note that alpha is saturated to 128 but the rest are 255. // TODO: maybe we should saturate to 255 for everybody (can do this using a single packus) and // change the shader to deal with this. __m128i sat = _mm_set_epi16(128, 255, 255, 255, 128, 255, 255, 255); for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) { // first, load colors. We put 16 bytes / register and don't touch the upper half because we // convert u8s to u16s. { const u8* base = in.data.data() + color_quad * 128; __m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0)); __m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16)); __m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32)); __m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48)); __m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64)); __m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80)); __m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96)); __m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112)); // unpack to 16-bits. each has 16x 16 bit colors. __m128i color0 = _mm_cvtepu8_epi16(color0_p); __m128i color1 = _mm_cvtepu8_epi16(color1_p); __m128i color2 = _mm_cvtepu8_epi16(color2_p); __m128i color3 = _mm_cvtepu8_epi16(color3_p); __m128i color4 = _mm_cvtepu8_epi16(color4_p); __m128i color5 = _mm_cvtepu8_epi16(color5_p); __m128i color6 = _mm_cvtepu8_epi16(color6_p); __m128i color7 = _mm_cvtepu8_epi16(color7_p); // multiply by weights color0 = _mm_mullo_epi16(color0, weights0); color1 = _mm_mullo_epi16(color1, weights1); color2 = _mm_mullo_epi16(color2, weights2); color3 = _mm_mullo_epi16(color3, weights3); color4 = _mm_mullo_epi16(color4, weights4); color5 = _mm_mullo_epi16(color5, weights5); color6 = _mm_mullo_epi16(color6, weights6); color7 = _mm_mullo_epi16(color7, weights7); // add. This order minimizes dependencies. color0 = _mm_add_epi16(color0, color1); color2 = _mm_add_epi16(color2, color3); color4 = _mm_add_epi16(color4, color5); color6 = _mm_add_epi16(color6, color7); color0 = _mm_add_epi16(color0, color2); color4 = _mm_add_epi16(color4, color6); color0 = _mm_add_epi16(color0, color4); // divide, because we multiplied our weights by 2^7. color0 = _mm_srli_epi16(color0, 6); // saturate color0 = _mm_min_epu16(sat, color0); // back to u8s. auto result = _mm_packus_epi16(color0, color0); // store result _mm_storeu_si64((__m128i*)(&out[color_quad * 4]), result); } { const u8* base = in.data.data() + color_quad * 128 + 8; __m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0)); __m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16)); __m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32)); __m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48)); __m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64)); __m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80)); __m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96)); __m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112)); // unpack to 16-bits. each has 16x 16 bit colors. __m128i color0 = _mm_cvtepu8_epi16(color0_p); __m128i color1 = _mm_cvtepu8_epi16(color1_p); __m128i color2 = _mm_cvtepu8_epi16(color2_p); __m128i color3 = _mm_cvtepu8_epi16(color3_p); __m128i color4 = _mm_cvtepu8_epi16(color4_p); __m128i color5 = _mm_cvtepu8_epi16(color5_p); __m128i color6 = _mm_cvtepu8_epi16(color6_p); __m128i color7 = _mm_cvtepu8_epi16(color7_p); // multiply by weights color0 = _mm_mullo_epi16(color0, weights0); color1 = _mm_mullo_epi16(color1, weights1); color2 = _mm_mullo_epi16(color2, weights2); color3 = _mm_mullo_epi16(color3, weights3); color4 = _mm_mullo_epi16(color4, weights4); color5 = _mm_mullo_epi16(color5, weights5); color6 = _mm_mullo_epi16(color6, weights6); color7 = _mm_mullo_epi16(color7, weights7); // add. This order minimizes dependencies. color0 = _mm_add_epi16(color0, color1); color2 = _mm_add_epi16(color2, color3); color4 = _mm_add_epi16(color4, color5); color6 = _mm_add_epi16(color6, color7); color0 = _mm_add_epi16(color0, color2); color4 = _mm_add_epi16(color4, color6); color0 = _mm_add_epi16(color0, color4); // divide, because we multiplied our weights by 2^7. color0 = _mm_srli_epi16(color0, 6); // saturate color0 = _mm_min_epu16(sat, color0); // back to u8s. auto result = _mm_packus_epi16(color0, color0); // store result _mm_storeu_si64((__m128i*)(&out[color_quad * 4 + 2]), result); } } } bool sphere_in_view_ref(const math::Vector4f& sphere, const math::Vector4f* planes) { math::Vector4f acc = planes[0] * sphere.x() + planes[1] * sphere.y() + planes[2] * sphere.z() - planes[3]; return acc.x() > -sphere.w() && acc.y() > -sphere.w() && acc.z() > -sphere.w() && acc.w() > -sphere.w(); } // this isn't super efficient, but we spend so little time here it's not worth it to go faster. void cull_check_all_slow(const math::Vector4f* planes, const std::vector& nodes, const u8* level_occlusion_string, u8* out) { if (level_occlusion_string) { for (size_t i = 0; i < nodes.size(); i++) { u16 my_id = nodes[i].my_id; bool not_occluded = my_id != 0xffff && level_occlusion_string[my_id / 8] & (1 << (7 - (my_id & 7))); out[i] = not_occluded && sphere_in_view_ref(nodes[i].bsphere, planes); } } else { for (size_t i = 0; i < nodes.size(); i++) { out[i] = sphere_in_view_ref(nodes[i].bsphere, planes); } } } void make_all_visible_multidraws(std::pair* draw_ptrs_out, GLsizei* counts_out, void** index_offsets_out, const std::vector& draws) { u64 md_idx = 0; for (size_t i = 0; i < draws.size(); i++) { const auto& draw = draws[i]; u64 iidx = draw.first_index_index; std::pair ds; ds.first = md_idx; ds.second = 1; counts_out[md_idx] = draw.num_indices; index_offsets_out[md_idx] = (void*)(iidx * sizeof(u32)); md_idx++; draw_ptrs_out[i] = ds; } } u32 make_all_visible_multidraws(std::pair* draw_ptrs_out, GLsizei* counts_out, void** index_offsets_out, const std::vector& draws) { u64 md_idx = 0; u32 num_tris = 0; for (size_t i = 0; i < draws.size(); i++) { const auto& draw = draws[i]; u64 iidx = draw.unpacked.idx_of_first_idx_in_full_buffer; std::pair ds; ds.first = md_idx; ds.second = 0; for (auto& grp : draw.vis_groups) { // visible! // let's use a multidraw counts_out[md_idx] = grp.num_inds; index_offsets_out[md_idx] = (void*)(iidx * sizeof(u32)); ds.second++; md_idx++; num_tris += grp.num_tris; iidx += grp.num_inds; } draw_ptrs_out[i] = ds; } return num_tris; } u32 make_all_visible_index_list(std::pair* group_out, u32* idx_out, const std::vector& draws, const u32* idx_in) { int idx_buffer_ptr = 0; for (size_t i = 0; i < draws.size(); i++) { const auto& draw = draws[i]; std::pair ds; ds.first = idx_buffer_ptr; memcpy(&idx_out[idx_buffer_ptr], idx_in + draw.first_index_index, draw.num_indices * sizeof(u32)); idx_buffer_ptr += draw.num_indices; ds.second = idx_buffer_ptr - ds.first; group_out[i] = ds; } return idx_buffer_ptr; } u32 make_multidraws_from_vis_string(std::pair* draw_ptrs_out, GLsizei* counts_out, void** index_offsets_out, const std::vector& draws, const std::vector& vis_data) { u64 md_idx = 0; u32 num_tris = 0; u32 sanity_check = 0; for (size_t i = 0; i < draws.size(); i++) { const auto& draw = draws[i]; u64 iidx = draw.unpacked.idx_of_first_idx_in_full_buffer; ASSERT(sanity_check == iidx); std::pair ds; ds.first = md_idx; ds.second = 0; for (auto& grp : draw.vis_groups) { sanity_check += grp.num_inds; if (grp.vis_idx_in_pc_bvh == 0xffffffff || vis_data[grp.vis_idx_in_pc_bvh]) { // visible! // let's use a multidraw counts_out[md_idx] = grp.num_inds; index_offsets_out[md_idx] = (void*)(iidx * sizeof(u32)); ds.second++; md_idx++; num_tris += grp.num_tris; } iidx += grp.num_inds; } draw_ptrs_out[i] = ds; } return num_tris; } u32 make_index_list_from_vis_string(std::pair* group_out, u32* idx_out, const std::vector& draws, const std::vector& vis_data, const u32* idx_in, u32* num_tris_out) { int idx_buffer_ptr = 0; u32 num_tris = 0; for (size_t i = 0; i < draws.size(); i++) { const auto& draw = draws[i]; int vtx_idx = 0; std::pair ds; ds.first = idx_buffer_ptr; bool building_run = false; int run_start_out = 0; int run_start_in = 0; for (auto& grp : draw.vis_groups) { bool vis = grp.vis_idx_in_pc_bvh == 0xffffffff || vis_data[grp.vis_idx_in_pc_bvh]; if (vis) { num_tris += grp.num_tris; } if (building_run) { if (vis) { idx_buffer_ptr += grp.num_inds; } else { building_run = false; memcpy(&idx_out[run_start_out], idx_in + draw.unpacked.idx_of_first_idx_in_full_buffer + run_start_in, (idx_buffer_ptr - run_start_out) * sizeof(u32)); } } else { if (vis) { building_run = true; run_start_out = idx_buffer_ptr; run_start_in = vtx_idx; idx_buffer_ptr += grp.num_inds; } } vtx_idx += grp.num_inds; } if (building_run) { memcpy(&idx_out[run_start_out], idx_in + draw.unpacked.idx_of_first_idx_in_full_buffer + run_start_in, (idx_buffer_ptr - run_start_out) * sizeof(u32)); } ds.second = idx_buffer_ptr - ds.first; group_out[i] = ds; } *num_tris_out = num_tris; return idx_buffer_ptr; } u32 make_all_visible_index_list(std::pair* group_out, u32* idx_out, const std::vector& draws, const u32* idx_in, u32* num_tris_out) { int idx_buffer_ptr = 0; u32 num_tris = 0; for (size_t i = 0; i < draws.size(); i++) { const auto& draw = draws[i]; std::pair ds; ds.first = idx_buffer_ptr; u32 num_inds = 0; for (auto& grp : draw.vis_groups) { num_inds += grp.num_inds; num_tris += grp.num_tris; } memcpy(&idx_out[idx_buffer_ptr], idx_in + draw.unpacked.idx_of_first_idx_in_full_buffer, num_inds * sizeof(u32)); idx_buffer_ptr += num_inds; ds.second = idx_buffer_ptr - ds.first; group_out[i] = ds; } *num_tris_out = num_tris; return idx_buffer_ptr; }