SERVER-82971 Implement optimized block-based simple8b sum

This is significantly faster than the previous implementation. The block-based sum will be used to efficiently calculate the delta for the last element in a BSONColumn.
2023-11-30 18:17:58 +00:00 · 2023-11-30 18:17:58 +00:00 · 20d0dd80d4
parent dd52961205
commit 20d0dd80d4
7 changed files with 674 additions and 4 deletions
--- a/src/mongo/bson/util/SConscript
+++ b/src/mongo/bson/util/SConscript
@ -19,6 +19,7 @@ env.Library(
    source=[
        'bsoncolumn.cpp',
        'bsoncolumnbuilder.cpp',
        'simple8b.cpp',
        'simple8b_builder.cpp',
        'simple8b_type_util.cpp',
    ],
--- a/src/mongo/bson/util/simple8b.cpp
+++ b/src/mongo/bson/util/simple8b.cpp
@ -0,0 +1,530 @@
 /**
 *    Copyright (C) 2023-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */
 #include "mongo/bson/util/simple8b.h"
 #include "mongo/bson/util/simple8b_type_util.h"
 #include <limits>
 namespace mongo::simple8b {
 namespace {
 // Wrapping addition for signed 64-bit integers. The operands are added as unsigned values, where
 // overflow is ordinary modular arithmetic, and the result is converted back to signed. This avoids
 // the undefined behavior of signed integer overflow.
 static constexpr int64_t add(int64_t lhs, int64_t rhs) {
    const uint64_t wrapped = static_cast<uint64_t>(lhs) + static_cast<uint64_t>(rhs);
    return static_cast<int64_t>(wrapped);
 }
 // 128-bit counterpart of the wrapping addition: add as unsigned so that overflow wraps around, then
 // convert the result back to signed.
 static constexpr int128_t add(int128_t lhs, int128_t rhs) {
    const uint128_t wrapped = static_cast<uint128_t>(lhs) + static_cast<uint128_t>(rhs);
    return static_cast<int128_t>(wrapped);
 }
 // Straightforward decoder for basic simple8b blocks in which every slot bit belongs to the value.
 // Each slot is converted to a signed integer at runtime, which makes this decoder a good fit for
 // selectors with many bits per slot. The encoded block should be machine endian with the first
 // slot starting at the least significant bit.
 template <int bits>
 struct SimpleDecoder {
    // How many slots a block with this bit width holds.
    static constexpr int iters = 60 / bits;
    // All-ones pattern for one slot: isolates a slot and doubles as the 'missing' marker.
    static constexpr uint64_t mask = (1ull << bits) - 1;

    // Adds up every non-missing slot of the block using wrapping addition.
    template <typename T>
    static T sum(uint64_t encoded) {
        T total = 0;
        for (int remaining = iters; remaining > 0; --remaining, encoded >>= bits) {
            const uint64_t slot = encoded & mask;
            if (slot == mask) {
                // Missing value, contributes nothing to the sum.
                continue;
            }
            total = add(total, Simple8bTypeUtil::decodeInt64(slot));
        }
        return total;
    }

    // Decodes the final slot of the block. A missing value is reported as 0.
    static int64_t lastIgnoreSkip(uint64_t encoded) {
        const uint64_t lastSlot = encoded >> (bits * (iters - 1));
        return lastSlot == mask ? 0 : Simple8bTypeUtil::decodeInt64(lastSlot);
    }
 };
 // Table-based decoder that translates a slot's unsigned bit pattern into its signed value through
 // a lookup table built at compile time. Suitable for selectors with few bits per slot because the
 // table holds one entry per possible slot pattern (2^bits). The encoded block should be machine
 // endian with the first slot starting at the least significant bit.
 template <int bits>
 struct TableDecoder {
    // Element type of the lookup table, chosen from the bit width per slot. Intentionally not named
    // 'T' so it cannot be confused with (or hidden by) the template parameter of 'sum'.
    using TableValue = std::conditional_t<bits <= 8, int8_t, int16_t>;
    // Constant to constrain table size.
    static constexpr int kMaxTableSize = 1 << 13;
    // Number of bits consumed per decoding iteration, one slot at a time for this decoder.
    static constexpr int shift = bits;
    // Number of values in this block
    static constexpr int iters = (60 / bits * bits + shift - 1) / shift;
    // Number of entries in lookup table
    static constexpr int entries = 1 << shift;
    // Bit mask to extract a single slot and to check for the missing bit pattern.
    static constexpr uint64_t mask = (1ull << bits) - 1;
    // Largest possible value that can be stored in this slot
    static constexpr int64_t kMaxSlotValue = Simple8bTypeUtil::decodeInt64(mask - 1);
    // Smallest possible value that can be stored in this slot
    static constexpr int64_t kMinSlotValue = Simple8bTypeUtil::decodeInt64(mask - 2);
    // Verify that lookup table is within size limit and that it can store our possible range of
    // values
    static_assert(entries <= kMaxTableSize, "lookup table too large");
    static_assert(kMaxSlotValue <= std::numeric_limits<TableValue>::max(),
                  "lookup table cannot store full decoded value range");
    static_assert(kMinSlotValue >= std::numeric_limits<TableValue>::min(),
                  "lookup table cannot store full decoded value range");
    // Decoded value for every slot bit pattern. The entry for the 'missing' pattern stays 0.
    TableValue table[entries];

    // Initialize lookup table
    constexpr TableDecoder() : table() {
        for (int i = 0; i < entries; ++i) {
            const uint64_t slot = static_cast<uint64_t>(i);
            // The all-ones pattern means 'missing'; its zero-initialized entry makes it a no-op
            // when summing. The static_asserts above guarantee the narrowing cast is lossless.
            if (slot != mask) {
                table[i] = static_cast<TableValue>(Simple8bTypeUtil::decodeInt64(slot));
            }
        }
    }

    // Calculate the sum of all slots. Missing slots contribute 0. Accumulates with the wrapping
    // 'add' helper, consistent with the other decoders in this file.
    template <typename T>
    T sum(uint64_t encoded) const {
        T decoded = 0;
        for (int i = iters; i; --i) {
            decoded = add(decoded, table[encoded % entries]);
            encoded >>= shift;
        }
        return decoded;
    }

    // Returns value of last slot. Treats missing as 0. 'encoded' must not contain any bits above
    // the last slot as the remaining value is used directly as table index.
    int64_t lastIgnoreSkip(uint64_t encoded) const {
        encoded >>= (bits * (iters - 1));
        return table[encoded];
    }
 };
 // Lookup-table decoder that resolves several slots with a single table access: the table is
 // indexed by the concatenated bit patterns of 'parallel' adjacent slots and stores the sum of
 // their decoded values. Suitable for selectors with few bits per slot as the table grows
 // exponentially with the number of bits covered. The encoded block should be machine endian with
 // the first slot starting at the least significant bit.
 template <int bits>
 struct ParallelTableDecoder {
    // Upper bound for the table size, expressed as a power of two exponent.
    static constexpr int kMaxTableSizeExp = 13;
    static constexpr int kMaxTableSize = 1 << kMaxTableSizeExp;
    // How many slots are decoded by one table access.
    static constexpr int parallel = kMaxTableSizeExp / bits;
    // Bits consumed by one decoding iteration.
    static constexpr int shift = bits * parallel;
    // Decoding iterations needed to cover the whole block.
    static constexpr int iters = (60 / bits * bits + shift - 1) / shift;
    // Size of the lookup table.
    static constexpr int entries = 1 << shift;
    // All-ones pattern for one slot: isolates a slot and doubles as the 'missing' marker.
    static constexpr uint64_t mask = (1ull << bits) - 1;
    // Extremes of the values a single slot can hold.
    static constexpr int64_t kMaxSlotValue = Simple8bTypeUtil::decodeInt64(mask - 1);
    static constexpr int64_t kMinSlotValue = Simple8bTypeUtil::decodeInt64(mask - 2);

    // Compile time validation that the configuration makes sense and that a table entry is wide
    // enough for the sum of 'parallel' slots.
    static_assert(
        bits > 1,
        "simple8b slots needs to use at least 2 bits to be meaningful for parallel decoding");
    static_assert(parallel > 1, "bit size too large to fit in table for parallel decoding");
    static_assert(kMaxSlotValue * parallel <= std::numeric_limits<int8_t>::max(),
                  "lookup table cannot store full decoded value range");
    static_assert(kMinSlotValue * parallel >= std::numeric_limits<int8_t>::min(),
                  "lookup table cannot store full decoded value range");

    int8_t table[entries];

    // Fills the table: every entry is the sum of the non-missing slots packed in its index.
    constexpr ParallelTableDecoder() : table() {
        for (unsigned combined = 0; combined < entries; ++combined) {
            unsigned remaining = combined;
            for (int n = 0; n < parallel; ++n, remaining >>= bits) {
                const uint64_t slot = remaining & mask;
                if (slot == mask) {
                    continue;
                }
                table[combined] += Simple8bTypeUtil::decodeInt64(slot);
            }
        }
    }

    // Sums all slots of the block, 'parallel' slots per table access. Missing slots count as 0.
    template <typename T>
    T sum(uint64_t encoded) const {
        T total = 0;
        int remaining = iters;
        while (remaining-- > 0) {
            total = add(total, table[encoded % entries]);
            encoded >>= shift;
        }
        return total;
    }
 };
 // Special Simple8b decoder for decoding the extended selectors where the slot bits are split up in
 // a value and count for a left shift. Encoded should be be machine endian and first slot should
 // start at least significant bit.
 template <int valueBits, int countBits, int countScale>
 struct ExtendedDecoder {
    static constexpr int bits = valueBits + countBits;
    static constexpr int iters = 56 / bits;
    static constexpr uint64_t mask = (1ull << bits) - 1;
    static constexpr uint64_t valueMask = (1ull << valueBits) - 1;
    static constexpr uint64_t countMask = (1ull << countBits) - 1;
    // Calculate the sum of all slots
    template <typename T>
    T sum(uint64_t encoded) const {
        T decoded = 0;
        for (int i = iters; i; --i) {
            if ((encoded & mask) != mask) {
                uint64_t count = encoded & countMask;
                make_unsigned_t<T> value = (encoded >> countBits) & valueMask;
                decoded = add(decoded, Simple8bTypeUtil::decodeInt(value << (count * countScale)));
            }
            encoded >>= bits;
        };
        return decoded;
    }
    // Returns value of last slot. Treats missing as 0
    template <typename T>
    T lastIgnoreSkip(uint64_t encoded) const {
        encoded >>= (bits * (iters - 1));
        if ((encoded & mask) == mask)
            return 0;
        uint64_t count = encoded & countMask;
        make_unsigned_t<T> value = (encoded >> countBits) & valueMask;
        return Simple8bTypeUtil::decodeInt(value << (count * countScale));
    }
 };
 // Storage for all decoders that we need for our various selector types. All instances are
 // constexpr, so the table-based decoders have their lookup tables built by their constexpr
 // constructors.
 //
 // Parallel table decoders, used when summing blocks with 2 to 6 bits per slot.
 static constexpr ParallelTableDecoder<2> decoderParallel2;
 static constexpr ParallelTableDecoder<3> decoderParallel3;
 static constexpr ParallelTableDecoder<4> decoderParallel4;
 static constexpr ParallelTableDecoder<5> decoderParallel5;
 static constexpr ParallelTableDecoder<6> decoderParallel6;
 // Single slot table decoders. The 2 to 6 bit variants are only used to decode the last slot, the
 // wider ones are used both for summing and for decoding the last slot.
 static constexpr TableDecoder<2> decoder2;
 static constexpr TableDecoder<3> decoder3;
 static constexpr TableDecoder<4> decoder4;
 static constexpr TableDecoder<5> decoder5;
 static constexpr TableDecoder<6> decoder6;
 static constexpr TableDecoder<7> decoder7;
 static constexpr TableDecoder<8> decoder8;
 static constexpr TableDecoder<10> decoder10;
 static constexpr TableDecoder<12> decoder12;
 // Runtime decoders for the selectors with many bits per slot.
 static constexpr SimpleDecoder<15> decoder15;
 static constexpr SimpleDecoder<20> decoder20;
 static constexpr SimpleDecoder<30> decoder30;
 static constexpr SimpleDecoder<60> decoder60;
 // Extended decoders, template arguments are <valueBits, countBits, countScale>. The name suffix
 // 'X_Y' means base selector X with extended selector Y.
 static constexpr ExtendedDecoder<2, 4, 1> decoderExtended7_1;
 static constexpr ExtendedDecoder<3, 4, 1> decoderExtended7_2;
 static constexpr ExtendedDecoder<4, 4, 1> decoderExtended7_3;
 static constexpr ExtendedDecoder<5, 4, 1> decoderExtended7_4;
 static constexpr ExtendedDecoder<7, 4, 1> decoderExtended7_5;
 static constexpr ExtendedDecoder<10, 4, 1> decoderExtended7_6;
 static constexpr ExtendedDecoder<14, 4, 1> decoderExtended7_7;
 static constexpr ExtendedDecoder<24, 4, 1> decoderExtended7_8;
 static constexpr ExtendedDecoder<52, 4, 1> decoderExtended7_9;
 static constexpr ExtendedDecoder<4, 4, 4> decoderExtended8_1;
 static constexpr ExtendedDecoder<5, 4, 4> decoderExtended8_2;
 static constexpr ExtendedDecoder<7, 4, 4> decoderExtended8_3;
 static constexpr ExtendedDecoder<10, 4, 4> decoderExtended8_4;
 static constexpr ExtendedDecoder<14, 4, 4> decoderExtended8_5;
 static constexpr ExtendedDecoder<24, 4, 4> decoderExtended8_6;
 static constexpr ExtendedDecoder<52, 4, 4> decoderExtended8_7;
 static constexpr ExtendedDecoder<4, 5, 4> decoderExtended8_8;
 static constexpr ExtendedDecoder<6, 5, 4> decoderExtended8_9;
 static constexpr ExtendedDecoder<9, 5, 4> decoderExtended8_10;
 static constexpr ExtendedDecoder<13, 5, 4> decoderExtended8_11;
 static constexpr ExtendedDecoder<23, 5, 4> decoderExtended8_12;
 static constexpr ExtendedDecoder<51, 5, 4> decoderExtended8_13;
 // Decodes last slot for simple8b block. Treats missing as 0.
 template <typename T>
 T decodeLastSlotIgnoreSkip(uint64_t encoded) {
    auto selector = encoded & simple8b_internal::kBaseSelectorMask;
    encoded >>= 4;
    switch (selector) {
        case 1:  // Only 0 or missing deltas
            break;
        case 2:
            return decoder2.lastIgnoreSkip(encoded);
        case 3:
            return decoder3.lastIgnoreSkip(encoded);
        case 4:
            return decoder4.lastIgnoreSkip(encoded);
        case 5:
            return decoder5.lastIgnoreSkip(encoded);
        case 6:
            return decoder6.lastIgnoreSkip(encoded);
        case 7: {
            auto extended = encoded & simple8b_internal::kBaseSelectorMask;
            encoded >>= 4;
            switch (extended) {
                case 0:
                    return decoder7.lastIgnoreSkip(encoded);
                case 1:
                    return decoderExtended7_1.lastIgnoreSkip<T>(encoded);
                case 2:
                    return decoderExtended7_2.lastIgnoreSkip<T>(encoded);
                case 3:
                    return decoderExtended7_3.lastIgnoreSkip<T>(encoded);
                case 4:
                    return decoderExtended7_4.lastIgnoreSkip<T>(encoded);
                case 5:
                    return decoderExtended7_5.lastIgnoreSkip<T>(encoded);
                case 6:
                    return decoderExtended7_6.lastIgnoreSkip<T>(encoded);
                case 7:
                    return decoderExtended7_7.lastIgnoreSkip<T>(encoded);
                case 8:
                    return decoderExtended7_8.lastIgnoreSkip<T>(encoded);
                case 9:
                    return decoderExtended7_9.lastIgnoreSkip<T>(encoded);
                default:
                    invariant(false);  // invalid encoding
                    break;
            }
            break;
        }
        case 8: {
            auto extended = encoded & simple8b_internal::kBaseSelectorMask;
            encoded >>= 4;
            switch (extended) {
                case 0:
                    return decoder8.lastIgnoreSkip(encoded);
                case 1:
                    return decoderExtended8_1.lastIgnoreSkip<T>(encoded);
                case 2:
                    return decoderExtended8_2.lastIgnoreSkip<T>(encoded);
                case 3:
                    return decoderExtended8_3.lastIgnoreSkip<T>(encoded);
                case 4:
                    return decoderExtended8_4.lastIgnoreSkip<T>(encoded);
                case 5:
                    return decoderExtended8_5.lastIgnoreSkip<T>(encoded);
                case 6:
                    return decoderExtended8_6.lastIgnoreSkip<T>(encoded);
                case 7:
                    return decoderExtended8_7.lastIgnoreSkip<T>(encoded);
                case 8:
                    return decoderExtended8_8.lastIgnoreSkip<T>(encoded);
                case 9:
                    return decoderExtended8_9.lastIgnoreSkip<T>(encoded);
                case 10:
                    return decoderExtended8_10.lastIgnoreSkip<T>(encoded);
                case 11:
                    return decoderExtended8_11.lastIgnoreSkip<T>(encoded);
                case 12:
                    return decoderExtended8_12.lastIgnoreSkip<T>(encoded);
                case 13:
                    return decoderExtended8_13.lastIgnoreSkip<T>(encoded);
                default:
                    invariant(false);  // invalid encoding
                    break;
            }
            break;
        }
        case 9:
            return decoder10.lastIgnoreSkip(encoded);
        case 10:
            return decoder12.lastIgnoreSkip(encoded);
        case 11:
            return decoder15.lastIgnoreSkip(encoded);
        case 12:
            return decoder20.lastIgnoreSkip(encoded);
        case 13:
            return decoder30.lastIgnoreSkip(encoded);
        case 14:
            return decoder60.lastIgnoreSkip(encoded);
        case 15:
            break;
        default:
            break;
    }
    return 0;
 }
 // Decodes a single simple8b block and returns the sum of all its slots, missing values count as 0.
 //
 // The selector is read from the lowest bits of 'encoded' and determines which decoder is used.
 // Selectors 7 and 8 carry an additional extended selector in the next four bits. Whenever the
 // block is not an RLE block it is stored in '*prevNonRLE'. An RLE block repeats the last slot of
 // the block stored in '*prevNonRLE', so the caller must pass the same 'prevNonRLE' for consecutive
 // blocks.
 //
 // Terminates via fassertFailed(8297100) when the selector or extended selector has no decoder.
 template <typename T>
 T decodeAndSum(uint64_t encoded, uint64_t* prevNonRLE) {
    auto selector = encoded & simple8b_internal::kBaseSelectorMask;
    // Remember the full block (selector included) so a following RLE block can look up its value.
    if (selector != simple8b_internal::kRleSelector) {
        *prevNonRLE = encoded;
    }
    // Drop the selector, the decoders expect the first slot at the least significant bit.
    encoded >>= 4;
    switch (selector) {
        case 1:  // Only 0 or missing deltas
            return 0;
        case 2:
            return decoderParallel2.sum<T>(encoded);
        case 3:
            return decoderParallel3.sum<T>(encoded);
        case 4:
            return decoderParallel4.sum<T>(encoded);
        case 5:
            return decoderParallel5.sum<T>(encoded);
        case 6:
            return decoderParallel6.sum<T>(encoded);
        case 7: {
            // Extended selector follows the base selector, shift it out as well.
            auto extended = encoded & simple8b_internal::kBaseSelectorMask;
            encoded >>= 4;
            switch (extended) {
                case 0:
                    return decoder7.sum<T>(encoded);
                case 1:
                    return decoderExtended7_1.sum<T>(encoded);
                case 2:
                    return decoderExtended7_2.sum<T>(encoded);
                case 3:
                    return decoderExtended7_3.sum<T>(encoded);
                case 4:
                    return decoderExtended7_4.sum<T>(encoded);
                case 5:
                    return decoderExtended7_5.sum<T>(encoded);
                case 6:
                    return decoderExtended7_6.sum<T>(encoded);
                case 7:
                    return decoderExtended7_7.sum<T>(encoded);
                case 8:
                    return decoderExtended7_8.sum<T>(encoded);
                case 9:
                    return decoderExtended7_9.sum<T>(encoded);
                default:
                    // Unknown extended selector, handled by the fassert at the end.
                    break;
            }
            break;
        }
        case 8: {
            // Extended selector follows the base selector, shift it out as well.
            auto extended = encoded & simple8b_internal::kBaseSelectorMask;
            encoded >>= 4;
            switch (extended) {
                case 0:
                    return decoder8.sum<T>(encoded);
                case 1:
                    return decoderExtended8_1.sum<T>(encoded);
                case 2:
                    return decoderExtended8_2.sum<T>(encoded);
                case 3:
                    return decoderExtended8_3.sum<T>(encoded);
                case 4:
                    return decoderExtended8_4.sum<T>(encoded);
                case 5:
                    return decoderExtended8_5.sum<T>(encoded);
                case 6:
                    return decoderExtended8_6.sum<T>(encoded);
                case 7:
                    return decoderExtended8_7.sum<T>(encoded);
                case 8:
                    return decoderExtended8_8.sum<T>(encoded);
                case 9:
                    return decoderExtended8_9.sum<T>(encoded);
                case 10:
                    return decoderExtended8_10.sum<T>(encoded);
                case 11:
                    return decoderExtended8_11.sum<T>(encoded);
                case 12:
                    return decoderExtended8_12.sum<T>(encoded);
                case 13:
                    return decoderExtended8_13.sum<T>(encoded);
                default:
                    // Unknown extended selector, handled by the fassert at the end.
                    break;
            }
            break;
        }
        case 9:
            return decoder10.sum<T>(encoded);
        case 10:
            return decoder12.sum<T>(encoded);
        case 11:
            return decoder15.sum<T>(encoded);
        case 12:
            return decoder20.sum<T>(encoded);
        case 13:
            return decoder30.sum<T>(encoded);
        case 14:
            return decoder60.sum<T>(encoded);
        case simple8b_internal::kRleSelector:
            // The repeated value is the last slot of the previous non-RLE block (missing treated as
            // 0). The four bits after the selector hold the repeat count minus one, in units of
            // 'kRleMultiplier'.
            return decodeLastSlotIgnoreSkip<T>(*prevNonRLE) * ((encoded & 0xf) + 1) *
                simple8b_internal::kRleMultiplier;
        default:
            break;
    }
    // Reached for selectors or extended selectors without a decoder.
    fassertFailed(8297100);
    return 0;
 }
 }  // namespace
 // Sums the values of all simple8b blocks in 'buffer'. 'size' is the buffer length in bytes and
 // must be a multiple of the 8 byte block size. Blocks are read as little endian 64-bit words and
 // the per-block sums are accumulated with wrapping addition. 'prevNonRLE' is updated to the last
 // non-RLE block encountered so it can be passed on to a call for a subsequent buffer.
 template <typename T>
 T sum(const char* buffer, size_t size, uint64_t& prevNonRLE) {
    invariant(size % 8 == 0);
    T total = 0;
    for (const char* const end = buffer + size; buffer != end; buffer += sizeof(uint64_t)) {
        const uint64_t block = ConstDataView(buffer).read<LittleEndian<uint64_t>>();
        total = add(total, decodeAndSum<T>(block, &prevNonRLE));
    }
    return total;
 }

 // The template is defined in this translation unit only, instantiate it for the supported types.
 template int64_t sum<int64_t>(const char*, size_t, uint64_t&);
 template int128_t sum<int128_t>(const char*, size_t, uint64_t&);
 }  // namespace mongo::simple8b
--- a/src/mongo/bson/util/simple8b.h
+++ b/src/mongo/bson/util/simple8b.h
@ -334,4 +334,17 @@ typename Simple8b<T>::Iterator Simple8b<T>::end() const {
    return {_buffer + _size};
 }
 namespace simple8b {
 // Constant for a simple8b block containing a single 'missing' value: the selector for one 60-bit
 // slot with all slot bits set.
 static constexpr uint64_t kSingleSkip = 0xFFFFFFFFFFFFFFFE;
 /**
 * Calculates the sum for multiple simple8b blocks in a buffer. Missing values are treated as 0
 * and overflow wraps around.
 *
 * 'buffer' points to the first block and 'size' is the length of the buffer in bytes, it must be
 * a multiple of 8. Blocks are read as little endian 64-bit integers.
 *
 * 'prevNonRLE' should be initialized to 'kSingleSkip' when calculating sum for the first buffer.
 * It is updated to the last non-RLE block encountered. If the caller needs sum from multiple
 * buffers the value should be passed along between the calls.
 *
 * Explicitly instantiated for int64_t and int128_t.
 */
 template <typename T>
 T sum(const char* buffer, size_t size, uint64_t& prevNonRLE);
 }  // namespace simple8b
 }  // namespace mongo
--- a/src/mongo/bson/util/simple8b_bm.cpp
+++ b/src/mongo/bson/util/simple8b_bm.cpp
@ -31,16 +31,42 @@
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
 #include <random>
 #include <boost/cstdint.hpp>
 #include "mongo/bson/util/builder.h"
 #include "mongo/bson/util/simple8b.h"
 #include "mongo/bson/util/simple8b_builder.h"
 #include "mongo/bson/util/simple8b_type_util.h"
 #include "mongo/util/shared_buffer.h"
 namespace mongo {
 // Builds the input data for the sum benchmarks: 10k normally distributed integers (mean 100,
 // standard deviation 10) where roughly 5% of the positions are missing, encoded as simple8b
 // blocks. The random generators use a fixed seed.
 BufBuilder generateIntegers() {
    std::mt19937_64 seedSource(1337);
    std::mt19937 rng(seedSource());
    std::normal_distribution<> valueDist(100, 10);
    std::uniform_int_distribution missingRoll(1, 100);

    BufBuilder encodedBlocks;
    Simple8bBuilder<uint64_t> s8bBuilder(
        [&encodedBlocks](uint64_t simple8bBlock) { encodedBlocks.appendNum(simple8bBlock); });

    constexpr int kNumValues = 10000;
    for (int produced = 0; produced != kNumValues; ++produced) {
        // A roll of 1-5 out of 100 makes the value missing.
        if (missingRoll(rng) > 5) {
            s8bBuilder.append(std::lround(valueDist(rng)));
            continue;
        }
        s8bBuilder.skip();
    }
    // Write out whatever is still pending in the builder.
    s8bBuilder.flush();
    return encodedBlocks;
 }
 void BM_increasingValues(benchmark::State& state) {
    size_t totalBytes = 0;
    for (auto _ : state) {
@ -152,11 +178,52 @@ void BM_decode(benchmark::State& state) {
    state.SetBytesProcessed(totalBytes);
 }
 // Benchmarks the block-based simple8b::sum over the generated test data.
 void BM_sum(benchmark::State& state) {
    BufBuilder encoded = generateIntegers();
    // The length has to be read before the buffer is released from the builder.
    const auto numBytes = encoded.len();
    auto data = encoded.release();

    size_t processed = 0;
    for (auto _ : state) {
        benchmark::ClobberMemory();
        uint64_t prevNonRLE = simple8b::kSingleSkip;
        benchmark::DoNotOptimize(simple8b::sum<int64_t>(data.get(), numBytes, prevNonRLE));
        processed += numBytes;
    }
    state.SetBytesProcessed(processed);
 }
 // Baseline for BM_sum: calculates the same sum by iterating over every value with the regular
 // Simple8b iterator and decoding each one individually.
 void BM_sumUnoptimized(benchmark::State& state) {
    BufBuilder buffer = generateIntegers();
    // The length has to be read before the buffer is released from the builder.
    auto size = buffer.len();
    auto buf = buffer.release();
    size_t totalBytes = 0;
    for (auto _ : state) {
        benchmark::ClobberMemory();
        Simple8b<uint64_t> s8b(buf.get(), size);
        int64_t sum = 0;
        for (auto&& val : s8b) {
            // Missing values do not contribute to the sum.
            if (val) {
                sum += Simple8bTypeUtil::decodeInt64(*val);
            }
        }
        // Make the result observable. Without this the compiler is free to discard the whole
        // decoding loop which would make the comparison against BM_sum meaningless.
        benchmark::DoNotOptimize(sum);
        totalBytes += size;
    }
    state.SetBytesProcessed(totalBytes);
 }
 // Benchmark registrations. The first five benchmarks are registered with a single argument value
 // of 100, how the argument is interpreted is up to the respective benchmark.
 BENCHMARK(BM_increasingValues)->Arg(100);
 BENCHMARK(BM_rle)->Arg(100);
 BENCHMARK(BM_changingSmallValues)->Arg(100);
 BENCHMARK(BM_changingLargeValues)->Arg(100);
 BENCHMARK(BM_selectorSeven)->Arg(100);
 BENCHMARK(BM_decode);
 // Block-based sum and the iterator-based baseline it is compared against.
 BENCHMARK(BM_sum);
 BENCHMARK(BM_sumUnoptimized);
 }  // namespace mongo
--- a/src/mongo/bson/util/simple8b_test.cpp
+++ b/src/mongo/bson/util/simple8b_test.cpp
@ -46,6 +46,7 @@
 #include "mongo/base/string_data.h"
 #include "mongo/bson/util/builder.h"
 #include "mongo/bson/util/simple8b_builder.h"
 #include "mongo/bson/util/simple8b_type_util.h"
 #include "mongo/platform/int128.h"
 #include "mongo/stdx/type_traits.h"
 #include "mongo/unittest/assert.h"
@ -54,6 +55,16 @@
 using namespace mongo;
 // Reference addition for the tests: both operands are reinterpreted as unsigned so that overflow
 // wraps around instead of being undefined behavior, the result is converted back to signed.
 static constexpr int64_t add(int64_t lhs, int64_t rhs) {
    const auto unsignedLhs = static_cast<uint64_t>(lhs);
    const auto unsignedRhs = static_cast<uint64_t>(rhs);
    return static_cast<int64_t>(unsignedLhs + unsignedRhs);
 }
 // 128-bit overload of the wrapping reference addition: add as unsigned, convert back to signed.
 static constexpr int128_t add(int128_t lhs, int128_t rhs) {
    const auto unsignedLhs = static_cast<uint128_t>(lhs);
    const auto unsignedRhs = static_cast<uint128_t>(rhs);
    return static_cast<int128_t>(unsignedLhs + unsignedRhs);
 }
 template <typename T>
 void assertValuesEqual(const Simple8b<T>& actual, const std::vector<boost::optional<T>>& expected) {
    auto it = actual.begin();
@ -104,6 +115,17 @@ void testSimple8b(const std::vector<boost::optional<T>>& expectedValues,
    Simple8b<T> s8b(buffer.get(), size);
    assertValuesEqual(s8b, expectedValues);
    make_signed_t<T> sum = 0;
    for (auto&& val : expectedValues) {
        if (val) {
            sum = add(sum, Simple8bTypeUtil::decodeInt(*val));
        }
    }
    uint64_t prev = simple8b::kSingleSkip;
    auto s = simple8b::sum<make_signed_t<T>>(
        reinterpret_cast<const char*>(expectedBinary.data()), expectedBinary.size(), prev);
    ASSERT_EQ(s, sum);
 }
 template <typename T>
--- a/src/mongo/bson/util/simple8b_type_util.h
+++ b/src/mongo/bson/util/simple8b_type_util.h
@ -54,13 +54,13 @@ public:
    // store as an unsigned integer
    // the most significant bit position to the least significant bit and call simple8b as an
    // unsigned integer.
-    static uint64_t encodeInt64(int64_t val) {
+    static constexpr uint64_t encodeInt64(int64_t val) {
        return (static_cast<uint64_t>(val) << 1) ^ (val >> 63);
    }
-    static int64_t decodeInt64(uint64_t val) {
+    static constexpr int64_t decodeInt64(uint64_t val) {
        return (val >> 1) ^ (~(val & 1) + 1);
    }
-    static uint128_t encodeInt128(int128_t val) {
+    static constexpr uint128_t encodeInt128(int128_t val) {
        // The Abseil right shift implementation on signed int128 is not correct as an arithmetic
        // shift in their non-intrinsic implementation. When we detect this case we replace the
        // right arithmetic shift of 127 positions that needs to produce 0xFF..FF or 0x00..00
@ -77,10 +77,13 @@ public:
 #endif
    }
-    static int128_t decodeInt128(uint128_t val) {
+    static constexpr int128_t decodeInt128(uint128_t val) {
        return static_cast<int128_t>((val >> 1) ^ (~(val & 1) + 1));
    }
    template <typename T>
    static constexpr auto decodeInt(T val);
    // These methods are for encoding OID with simple8b. The unique identifier is not part of
    // the encoded integer and must thus be provided when decoding.
    // Re-organize the bytes so that most of the entropy is in the least significant bytes.
@ -129,4 +132,14 @@ public:
    static constexpr std::array<double, kMemoryAsInteger> kScaleMultiplier = {
        1, 10, 100, 10000, 100000000};
 };
 // Specialization for 64-bit encoded values, forwards to 'decodeInt64'.
 template <>
 inline auto Simple8bTypeUtil::decodeInt(uint64_t val) {
    const int64_t decoded = decodeInt64(val);
    return decoded;
 }
 // Specialization for 128-bit encoded values, forwards to 'decodeInt128'.
 template <>
 inline auto Simple8bTypeUtil::decodeInt(uint128_t val) {
    const int128_t decoded = decodeInt128(val);
    return decoded;
 }
 }  // namespace mongo
--- a/src/mongo/platform/int128.h
+++ b/src/mongo/platform/int128.h
@ -33,3 +33,27 @@
 using uint128_t = absl::uint128;
 using int128_t = absl::int128;
 namespace mongo {
 // Counterpart of std::make_unsigned that additionally maps int128_t to uint128_t. All other types
 // are forwarded to the standard trait.
 template <class T>
 struct make_unsigned : std::make_unsigned<T> {};
 template <>
 struct make_unsigned<int128_t> {
    using type = uint128_t;
 };
 // Convenience alias for the type selected by mongo::make_unsigned.
 template <class T>
 using make_unsigned_t = typename make_unsigned<T>::type;

 // Counterpart of std::make_signed that additionally maps uint128_t to int128_t. All other types
 // are forwarded to the standard trait.
 template <class T>
 struct make_signed : std::make_signed<T> {};
 template <>
 struct make_signed<uint128_t> {
    using type = int128_t;
 };
 // Convenience alias for the type selected by mongo::make_signed.
 template <class T>
 using make_signed_t = typename make_signed<T>::type;
 }  // namespace mongo