#include "background_common.h"
#include "game/graphics/opengl_renderer/BucketRenderer.h"
#include "game/graphics/pipelines/opengl.h"
#include "common/util/os.h"

#include <immintrin.h>

DoubleDraw setup_opengl_from_draw_mode(DrawMode mode, u32 tex_unit, bool mipmap) {
  glActiveTexture(tex_unit);

  if (mode.get_zt_enable()) {
    glEnable(GL_DEPTH_TEST);
    switch (mode.get_depth_test()) {
      case GsTest::ZTest::NEVER:
        glDepthFunc(GL_NEVER);
        break;
      case GsTest::ZTest::ALWAYS:
        glDepthFunc(GL_ALWAYS);
        break;
      case GsTest::ZTest::GEQUAL:
        glDepthFunc(GL_GEQUAL);
        break;
      case GsTest::ZTest::GREATER:
        glDepthFunc(GL_GREATER);
        break;
      default:
        ASSERT(false);
    }
  } else {
    glDisable(GL_DEPTH_TEST);
  }

  if (mode.get_ab_enable() && mode.get_alpha_blend() != DrawMode::AlphaBlend::DISABLED) {
    glEnable(GL_BLEND);
    switch (mode.get_alpha_blend()) {
      case DrawMode::AlphaBlend::SRC_DST_SRC_DST:
        glBlendEquation(GL_FUNC_ADD);
        glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ZERO);
        break;
      case DrawMode::AlphaBlend::SRC_0_SRC_DST:
        glBlendEquation(GL_FUNC_ADD);
        glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE, GL_ONE, GL_ZERO);
        break;
      case DrawMode::AlphaBlend::SRC_0_FIX_DST:
        glBlendEquation(GL_FUNC_ADD);
        glBlendFuncSeparate(GL_ONE, GL_ONE, GL_ONE, GL_ZERO);
        break;
      case DrawMode::AlphaBlend::SRC_DST_FIX_DST:
        // Cv = (Cs - Cd) * FIX + Cd
        // Cs * FIX * 0.5
        // Cd * FIX * 0.5
        glBlendEquation(GL_FUNC_ADD);
        glBlendFuncSeparate(GL_CONSTANT_COLOR, GL_CONSTANT_COLOR, GL_ONE, GL_ZERO);
        glBlendColor(0.5, 0.5, 0.5, 0.5);
        break;
      case DrawMode::AlphaBlend::ZERO_SRC_SRC_DST:
        glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE, GL_ONE, GL_ZERO);
        glBlendEquation(GL_FUNC_REVERSE_SUBTRACT);
        break;
      default:
        ASSERT(false);
    }
  } else {
    glDisable(GL_BLEND);
  }

  if (mode.get_clamp_s_enable()) {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
  } else {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
  }

  if (mode.get_clamp_t_enable()) {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
  } else {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
  }

  if (mode.get_filt_enable()) {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER,
                    mipmap ? GL_LINEAR_MIPMAP_LINEAR : GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
  } else {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
  }

  // for some reason, they set atest NEVER + FB_ONLY to disable depth writes
  bool alpha_hack_to_disable_z_write = false;
  DoubleDraw double_draw;

  float alpha_min = 0.;
  if (mode.get_at_enable()) {
    switch (mode.get_alpha_test()) {
      case DrawMode::AlphaTest::ALWAYS:
        break;
      case DrawMode::AlphaTest::GEQUAL:
        alpha_min = mode.get_aref() / 127.f;
        switch (mode.get_alpha_fail()) {
          case GsTest::AlphaFail::KEEP:
            // ok, no need for double draw
            break;
          case GsTest::AlphaFail::FB_ONLY:
            // darn, we need to draw twice
            double_draw.kind = DoubleDrawKind::AFAIL_NO_DEPTH_WRITE;
            double_draw.aref_second = alpha_min;
            break;
          default:
            ASSERT(false);
        }
        break;
      case DrawMode::AlphaTest::NEVER:
        if (mode.get_alpha_fail() == GsTest::AlphaFail::FB_ONLY) {
          alpha_hack_to_disable_z_write = true;
        } else {
          ASSERT(false);
        }
        break;
      default:
        ASSERT(false);
    }
  }

  if (mode.get_depth_write_enable() && !alpha_hack_to_disable_z_write) {
    glDepthMask(GL_TRUE);
  } else {
    glDepthMask(GL_FALSE);
  }
  double_draw.aref_first = alpha_min;
  return double_draw;
}

DoubleDraw setup_tfrag_shader(SharedRenderState* render_state, DrawMode mode, ShaderId shader) {
  auto draw_settings = setup_opengl_from_draw_mode(mode, GL_TEXTURE0, true);
  glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "alpha_min"),
              draw_settings.aref_first);
  glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "alpha_max"), 10.f);
  return draw_settings;
}

void first_tfrag_draw_setup(const TfragRenderSettings& settings,
                            SharedRenderState* render_state,
                            ShaderId shader) {
  render_state->shaders[shader].activate();
  glUniform1i(glGetUniformLocation(render_state->shaders[shader].id(), "tex_T0"), 0);
  glUniformMatrix4fv(glGetUniformLocation(render_state->shaders[shader].id(), "camera"), 1,
                     GL_FALSE, settings.math_camera.data());
  glUniform4f(glGetUniformLocation(render_state->shaders[shader].id(), "hvdf_offset"),
              settings.hvdf_offset[0], settings.hvdf_offset[1], settings.hvdf_offset[2],
              settings.hvdf_offset[3]);
  glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "fog_constant"),
              settings.fog.x());
  glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "fog_min"),
              settings.fog.y());
  glUniform1f(glGetUniformLocation(render_state->shaders[shader].id(), "fog_max"),
              settings.fog.z());
  glUniform4f(glGetUniformLocation(render_state->shaders[shader].id(), "fog_color"),
              render_state->fog_color[0], render_state->fog_color[1], render_state->fog_color[2],
              render_state->fog_intensity);
}

void interp_time_of_day_slow(const float weights[8],
                             const std::vector<tfrag3::TimeOfDayColor>& in,
                             math::Vector<u8, 4>* out) {
  // Timer interp_timer;
  for (size_t color = 0; color < in.size(); color++) {
    math::Vector4f result = math::Vector4f::zero();
    for (int component = 0; component < 8; component++) {
      result += in[color].rgba[component].cast<float>() * weights[component];
    }
    result[0] = std::min(result[0], 255.f);
    result[1] = std::min(result[1], 255.f);
    result[2] = std::min(result[2], 255.f);
    result[3] = std::min(result[3], 128.f);  // note: different for alpha!
    out[color] = result.cast<u8>();
  }
}

// we want to absolutely minimize the number of time we have to "cross lanes" in AVX (meaning X
// component of one vector interacts with Y component of another).  We can make this a lot better by
// taking groups of 4 time of day colors (each containing 8x RGBAs) and rearranging them with this
// pattern.  We want to compute:
// [rgba][0][0] * weights[0] + [rgba][0][1] * weights[1] + [rgba][0][2]... + rgba[0][7] * weights[7]
// RGBA is already a vector of 4 components, but with AVX we have vectors with 32 bytes which fit
// 16 colors in them.

// This makes each vector have:
// colors0 = [rgba][0][0], [rgba][1][0], [rgba][2][0], [rgba][3][0]
// colors1 = [rgba][0][1], [rgba][1][1], [rgba][2][1], [rgba][3][1]
// ...
// so we can basically add up the columns (multiplying by weights in between)
// and we'll end up with [final0, final1, final2, final3, final4]

// the swizzle function below rearranges to get this pattern.
// it's not the most efficient way to do it, but it just runs during loading and not on every frame.

SwizzledTimeOfDay swizzle_time_of_day(const std::vector<tfrag3::TimeOfDayColor>& in) {
  SwizzledTimeOfDay out;
  out.data.resize((in.size() + 3) * 8 * 4);

  // we're rearranging per 4 colors (groups of 32 * 4 = 128)
  // color (lots of these)
  // component (8 of these)
  // channel (4 of these, rgba)

  for (u32 color_quad = 0; color_quad < (in.size() + 3) / 4; color_quad++) {
    u8* quad_out = out.data.data() + color_quad * 128;
    for (u32 component = 0; component < 8; component++) {
      for (u32 color = 0; color < 4; color++) {
        for (u32 channel = 0; channel < 4; channel++) {
          size_t in_idx = color_quad * 4 + color;
          if (in_idx < in.size()) {
            *quad_out = in.at(color_quad * 4 + color).rgba[component][channel];
          } else {
            *quad_out = 0;
          }
          quad_out++;
        }
      }
    }
  }
  out.color_count = (in.size() + 3) & (~3);
  return out;
}

// This does the same thing as interp_time_of_day_slow, but is faster.
// Due to using integers instead of floats, it may be a tiny bit different.
// TODO: it might be possible to reorder the loop into two blocks of loads and avoid spilling xmms.
// It's ~8x faster than the slow version.

void interp_time_of_day_fast_avx2(const float weights[8],
                                  const SwizzledTimeOfDay& in,
                                  math::Vector<u8, 4>* out) {
  // even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
#ifdef __AVX2__
  // weight multipliers
  __m256i weights0 = _mm256_set1_epi16(weights[0] * 64.f);
  __m256i weights1 = _mm256_set1_epi16(weights[1] * 64.f);
  __m256i weights2 = _mm256_set1_epi16(weights[2] * 64.f);
  __m256i weights3 = _mm256_set1_epi16(weights[3] * 64.f);
  __m256i weights4 = _mm256_set1_epi16(weights[4] * 64.f);
  __m256i weights5 = _mm256_set1_epi16(weights[5] * 64.f);
  __m256i weights6 = _mm256_set1_epi16(weights[6] * 64.f);
  __m256i weights7 = _mm256_set1_epi16(weights[7] * 64.f);

  // saturation: note that alpha is saturated to 128 but the rest are 255.
  // TODO: maybe we should saturate to 255 for everybody (can do this using a single packus) and
  // change the shader to deal with this.
  __m256i sat = _mm256_set_epi16(128, 255, 255, 255, 128, 255, 255, 255, 128, 255, 255, 255, 128,
                                 255, 255, 255);

  for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
    // first, load colors. We put 16 bytes / register and don't touch the upper half because we
    // convert u8s to u16s.
    const u8* base = in.data.data() + color_quad * 128;
    __m128i color0_p = _mm_loadu_si128((const __m128i*)(base + 0));
    __m128i color1_p = _mm_loadu_si128((const __m128i*)(base + 16));
    __m128i color2_p = _mm_loadu_si128((const __m128i*)(base + 32));
    __m128i color3_p = _mm_loadu_si128((const __m128i*)(base + 48));
    __m128i color4_p = _mm_loadu_si128((const __m128i*)(base + 64));
    __m128i color5_p = _mm_loadu_si128((const __m128i*)(base + 80));
    __m128i color6_p = _mm_loadu_si128((const __m128i*)(base + 96));
    __m128i color7_p = _mm_loadu_si128((const __m128i*)(base + 112));

    // unpack to 16-bits. each has 16x 16 bit colors.
    __m256i color0 = _mm256_cvtepu8_epi16(color0_p);
    __m256i color1 = _mm256_cvtepu8_epi16(color1_p);
    __m256i color2 = _mm256_cvtepu8_epi16(color2_p);
    __m256i color3 = _mm256_cvtepu8_epi16(color3_p);
    __m256i color4 = _mm256_cvtepu8_epi16(color4_p);
    __m256i color5 = _mm256_cvtepu8_epi16(color5_p);
    __m256i color6 = _mm256_cvtepu8_epi16(color6_p);
    __m256i color7 = _mm256_cvtepu8_epi16(color7_p);

    // multiply by weights
    color0 = _mm256_mullo_epi16(color0, weights0);
    color1 = _mm256_mullo_epi16(color1, weights1);
    color2 = _mm256_mullo_epi16(color2, weights2);
    color3 = _mm256_mullo_epi16(color3, weights3);
    color4 = _mm256_mullo_epi16(color4, weights4);
    color5 = _mm256_mullo_epi16(color5, weights5);
    color6 = _mm256_mullo_epi16(color6, weights6);
    color7 = _mm256_mullo_epi16(color7, weights7);

    // add. This order minimizes dependencies.
    color0 = _mm256_add_epi16(color0, color1);
    color2 = _mm256_add_epi16(color2, color3);
    color4 = _mm256_add_epi16(color4, color5);
    color6 = _mm256_add_epi16(color6, color7);

    color0 = _mm256_add_epi16(color0, color2);
    color4 = _mm256_add_epi16(color4, color6);

    color0 = _mm256_add_epi16(color0, color4);

    // divide, because we multiplied our weights by 2^7.
    color0 = _mm256_srli_epi16(color0, 6);

    // saturate
    color0 = _mm256_min_epu16(sat, color0);

    // back to u8s.
    auto hi = _mm256_extracti128_si256(color0, 1);
    auto result = _mm_packus_epi16(_mm256_castsi256_si128(color0), hi);

    // store result
    _mm_storeu_si128((__m128i*)(&out[color_quad * 4]), result);
  }
#else
  // unreachable.
  (void)weights;
  (void)in;
  (void)out;
  ASSERT(false);
#endif
}

void interp_time_of_day_fast(const float weights[8],
                             const SwizzledTimeOfDay& in,
                             math::Vector<u8, 4>* out) {
  // even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
  if (get_cpu_info().has_avx2) {
    interp_time_of_day_fast_avx2(weights, in, out);
    return;
  }

  // weight multipliers
  __m128i weights0 = _mm_set1_epi16(weights[0] * 64.f);
  __m128i weights1 = _mm_set1_epi16(weights[1] * 64.f);
  __m128i weights2 = _mm_set1_epi16(weights[2] * 64.f);
  __m128i weights3 = _mm_set1_epi16(weights[3] * 64.f);
  __m128i weights4 = _mm_set1_epi16(weights[4] * 64.f);
  __m128i weights5 = _mm_set1_epi16(weights[5] * 64.f);
  __m128i weights6 = _mm_set1_epi16(weights[6] * 64.f);
  __m128i weights7 = _mm_set1_epi16(weights[7] * 64.f);

  // saturation: note that alpha is saturated to 128 but the rest are 255.
  // TODO: maybe we should saturate to 255 for everybody (can do this using a single packus) and
  // change the shader to deal with this.
  __m128i sat = _mm_set_epi16(128, 255, 255, 255, 128, 255, 255, 255);

  for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
    // first, load colors. We put 16 bytes / register and don't touch the upper half because we
    // convert u8s to u16s.
    {
      const u8* base = in.data.data() + color_quad * 128;
      __m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
      __m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
      __m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
      __m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
      __m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
      __m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
      __m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
      __m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));

      // unpack to 16-bits. each has 16x 16 bit colors.
      __m128i color0 = _mm_cvtepu8_epi16(color0_p);
      __m128i color1 = _mm_cvtepu8_epi16(color1_p);
      __m128i color2 = _mm_cvtepu8_epi16(color2_p);
      __m128i color3 = _mm_cvtepu8_epi16(color3_p);
      __m128i color4 = _mm_cvtepu8_epi16(color4_p);
      __m128i color5 = _mm_cvtepu8_epi16(color5_p);
      __m128i color6 = _mm_cvtepu8_epi16(color6_p);
      __m128i color7 = _mm_cvtepu8_epi16(color7_p);

      // multiply by weights
      color0 = _mm_mullo_epi16(color0, weights0);
      color1 = _mm_mullo_epi16(color1, weights1);
      color2 = _mm_mullo_epi16(color2, weights2);
      color3 = _mm_mullo_epi16(color3, weights3);
      color4 = _mm_mullo_epi16(color4, weights4);
      color5 = _mm_mullo_epi16(color5, weights5);
      color6 = _mm_mullo_epi16(color6, weights6);
      color7 = _mm_mullo_epi16(color7, weights7);

      // add. This order minimizes dependencies.
      color0 = _mm_add_epi16(color0, color1);
      color2 = _mm_add_epi16(color2, color3);
      color4 = _mm_add_epi16(color4, color5);
      color6 = _mm_add_epi16(color6, color7);

      color0 = _mm_add_epi16(color0, color2);
      color4 = _mm_add_epi16(color4, color6);

      color0 = _mm_add_epi16(color0, color4);

      // divide, because we multiplied our weights by 2^7.
      color0 = _mm_srli_epi16(color0, 6);

      // saturate
      color0 = _mm_min_epu16(sat, color0);

      // back to u8s.
      auto result = _mm_packus_epi16(color0, color0);

      // store result
      _mm_storeu_si64((__m128i*)(&out[color_quad * 4]), result);
    }

    {
      const u8* base = in.data.data() + color_quad * 128 + 8;
      __m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
      __m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
      __m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
      __m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
      __m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
      __m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
      __m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
      __m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));

      // unpack to 16-bits. each has 16x 16 bit colors.
      __m128i color0 = _mm_cvtepu8_epi16(color0_p);
      __m128i color1 = _mm_cvtepu8_epi16(color1_p);
      __m128i color2 = _mm_cvtepu8_epi16(color2_p);
      __m128i color3 = _mm_cvtepu8_epi16(color3_p);
      __m128i color4 = _mm_cvtepu8_epi16(color4_p);
      __m128i color5 = _mm_cvtepu8_epi16(color5_p);
      __m128i color6 = _mm_cvtepu8_epi16(color6_p);
      __m128i color7 = _mm_cvtepu8_epi16(color7_p);

      // multiply by weights
      color0 = _mm_mullo_epi16(color0, weights0);
      color1 = _mm_mullo_epi16(color1, weights1);
      color2 = _mm_mullo_epi16(color2, weights2);
      color3 = _mm_mullo_epi16(color3, weights3);
      color4 = _mm_mullo_epi16(color4, weights4);
      color5 = _mm_mullo_epi16(color5, weights5);
      color6 = _mm_mullo_epi16(color6, weights6);
      color7 = _mm_mullo_epi16(color7, weights7);

      // add. This order minimizes dependencies.
      color0 = _mm_add_epi16(color0, color1);
      color2 = _mm_add_epi16(color2, color3);
      color4 = _mm_add_epi16(color4, color5);
      color6 = _mm_add_epi16(color6, color7);

      color0 = _mm_add_epi16(color0, color2);
      color4 = _mm_add_epi16(color4, color6);

      color0 = _mm_add_epi16(color0, color4);

      // divide, because we multiplied our weights by 2^7.
      color0 = _mm_srli_epi16(color0, 6);

      // saturate
      color0 = _mm_min_epu16(sat, color0);

      // back to u8s.
      auto result = _mm_packus_epi16(color0, color0);

      // store result
      _mm_storeu_si64((__m128i*)(&out[color_quad * 4 + 2]), result);
    }
  }
}

bool sphere_in_view_ref(const math::Vector4f& sphere, const math::Vector4f* planes) {
  math::Vector4f acc =
      planes[0] * sphere.x() + planes[1] * sphere.y() + planes[2] * sphere.z() - planes[3];

  return acc.x() > -sphere.w() && acc.y() > -sphere.w() && acc.z() > -sphere.w() &&
         acc.w() > -sphere.w();
}

// this isn't super efficient, but we spend so little time here it's not worth it to go faster.
void cull_check_all_slow(const math::Vector4f* planes,
                         const std::vector<tfrag3::VisNode>& nodes,
                         const u8* level_occlusion_string,
                         u8* out) {
  if (level_occlusion_string) {
    for (size_t i = 0; i < nodes.size(); i++) {
      u16 my_id = nodes[i].my_id;
      bool not_occluded =
          my_id != 0xffff && level_occlusion_string[my_id / 8] & (1 << (7 - (my_id & 7)));
      out[i] = not_occluded && sphere_in_view_ref(nodes[i].bsphere, planes);
    }
  } else {
    for (size_t i = 0; i < nodes.size(); i++) {
      out[i] = sphere_in_view_ref(nodes[i].bsphere, planes);
    }
  }
}

void make_all_visible_multidraws(std::pair<int, int>* draw_ptrs_out,
                                 GLsizei* counts_out,
                                 void** index_offsets_out,
                                 const std::vector<tfrag3::ShrubDraw>& draws) {
  u64 md_idx = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& draw = draws[i];
    u64 iidx = draw.first_index_index;
    std::pair<int, int> ds;
    ds.first = md_idx;
    ds.second = 1;
    counts_out[md_idx] = draw.num_indices;
    index_offsets_out[md_idx] = (void*)(iidx * sizeof(u32));
    md_idx++;
    draw_ptrs_out[i] = ds;
  }
}

u32 make_all_visible_multidraws(std::pair<int, int>* draw_ptrs_out,
                                GLsizei* counts_out,
                                void** index_offsets_out,
                                const std::vector<tfrag3::StripDraw>& draws) {
  u64 md_idx = 0;
  u32 num_tris = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& draw = draws[i];
    u64 iidx = draw.unpacked.idx_of_first_idx_in_full_buffer;
    std::pair<int, int> ds;
    ds.first = md_idx;
    ds.second = 0;
    for (auto& grp : draw.vis_groups) {
      // visible!
      // let's use a multidraw
      counts_out[md_idx] = grp.num_inds;
      index_offsets_out[md_idx] = (void*)(iidx * sizeof(u32));
      ds.second++;
      md_idx++;
      num_tris += grp.num_tris;
      iidx += grp.num_inds;
    }
    draw_ptrs_out[i] = ds;
  }
  return num_tris;
}

u32 make_all_visible_index_list(std::pair<int, int>* group_out,
                                u32* idx_out,
                                const std::vector<tfrag3::ShrubDraw>& draws,
                                const u32* idx_in) {
  int idx_buffer_ptr = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& draw = draws[i];
    std::pair<int, int> ds;
    ds.first = idx_buffer_ptr;
    memcpy(&idx_out[idx_buffer_ptr], idx_in + draw.first_index_index,
           draw.num_indices * sizeof(u32));
    idx_buffer_ptr += draw.num_indices;
    ds.second = idx_buffer_ptr - ds.first;
    group_out[i] = ds;
  }
  return idx_buffer_ptr;
}

u32 make_multidraws_from_vis_string(std::pair<int, int>* draw_ptrs_out,
                                    GLsizei* counts_out,
                                    void** index_offsets_out,
                                    const std::vector<tfrag3::StripDraw>& draws,
                                    const std::vector<u8>& vis_data) {
  u64 md_idx = 0;
  u32 num_tris = 0;
  u32 sanity_check = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& draw = draws[i];
    u64 iidx = draw.unpacked.idx_of_first_idx_in_full_buffer;
    ASSERT(sanity_check == iidx);
    std::pair<int, int> ds;
    ds.first = md_idx;
    ds.second = 0;
    for (auto& grp : draw.vis_groups) {
      sanity_check += grp.num_inds;
      if (grp.vis_idx_in_pc_bvh == 0xffffffff || vis_data[grp.vis_idx_in_pc_bvh]) {
        // visible!
        // let's use a multidraw
        counts_out[md_idx] = grp.num_inds;
        index_offsets_out[md_idx] = (void*)(iidx * sizeof(u32));
        ds.second++;
        md_idx++;
        num_tris += grp.num_tris;
      }
      iidx += grp.num_inds;
    }
    draw_ptrs_out[i] = ds;
  }
  return num_tris;
}

u32 make_index_list_from_vis_string(std::pair<int, int>* group_out,
                                    u32* idx_out,
                                    const std::vector<tfrag3::StripDraw>& draws,
                                    const std::vector<u8>& vis_data,
                                    const u32* idx_in,
                                    u32* num_tris_out) {
  int idx_buffer_ptr = 0;
  u32 num_tris = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& draw = draws[i];
    int vtx_idx = 0;
    std::pair<int, int> ds;
    ds.first = idx_buffer_ptr;
    bool building_run = false;
    int run_start_out = 0;
    int run_start_in = 0;
    for (auto& grp : draw.vis_groups) {
      bool vis = grp.vis_idx_in_pc_bvh == 0xffffffff || vis_data[grp.vis_idx_in_pc_bvh];
      if (vis) {
        num_tris += grp.num_tris;
      }

      if (building_run) {
        if (vis) {
          idx_buffer_ptr += grp.num_inds;
        } else {
          building_run = false;
          memcpy(&idx_out[run_start_out],
                 idx_in + draw.unpacked.idx_of_first_idx_in_full_buffer + run_start_in,
                 (idx_buffer_ptr - run_start_out) * sizeof(u32));
        }
      } else {
        if (vis) {
          building_run = true;
          run_start_out = idx_buffer_ptr;
          run_start_in = vtx_idx;
          idx_buffer_ptr += grp.num_inds;
        }
      }
      vtx_idx += grp.num_inds;
    }

    if (building_run) {
      memcpy(&idx_out[run_start_out],
             idx_in + draw.unpacked.idx_of_first_idx_in_full_buffer + run_start_in,
             (idx_buffer_ptr - run_start_out) * sizeof(u32));
    }

    ds.second = idx_buffer_ptr - ds.first;
    group_out[i] = ds;
  }
  *num_tris_out = num_tris;
  return idx_buffer_ptr;
}

u32 make_all_visible_index_list(std::pair<int, int>* group_out,
                                u32* idx_out,
                                const std::vector<tfrag3::StripDraw>& draws,
                                const u32* idx_in,
                                u32* num_tris_out) {
  int idx_buffer_ptr = 0;
  u32 num_tris = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& draw = draws[i];
    std::pair<int, int> ds;
    ds.first = idx_buffer_ptr;
    u32 num_inds = 0;
    for (auto& grp : draw.vis_groups) {
      num_inds += grp.num_inds;
      num_tris += grp.num_tris;
    }
    memcpy(&idx_out[idx_buffer_ptr], idx_in + draw.unpacked.idx_of_first_idx_in_full_buffer,
           num_inds * sizeof(u32));
    idx_buffer_ptr += num_inds;
    ds.second = idx_buffer_ptr - ds.first;
    group_out[i] = ds;
  }
  *num_tris_out = num_tris;
  return idx_buffer_ptr;
}