SERVER-82971 Implement optimized block-based simple8b sum

This is significantly faster than the previous implementation. The block-based sum will be used to efficiently calculate the delta for the last element in a BSONColumn.
2023-11-30 18:17:58 +00:00 · 2023-11-30 18:17:58 +00:00 · 20d0dd80d4
parent dd52961205
commit 20d0dd80d4
7 changed files with 674 additions and 4 deletions
--- a/src/mongo/bson/util/SConscript
+++ b/src/mongo/bson/util/SConscript
@ -19,6 +19,7 @@ env.Library(
    source=[
        'bsoncolumn.cpp',
        'bsoncolumnbuilder.cpp',
+        'simple8b.cpp',
        'simple8b_builder.cpp',
        'simple8b_type_util.cpp',
    ],
--- a/src/mongo/bson/util/simple8b.cpp
+++ b/src/mongo/bson/util/simple8b.cpp
@ -0,0 +1,530 @@
+/**
+ *    Copyright (C) 2023-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/bson/util/simple8b.h"
+#include "mongo/bson/util/simple8b_type_util.h"
+
+#include <limits>
+
+namespace mongo::simple8b {
+namespace {
+
+// Wrapping 64-bit addition. The operands are added as unsigned integers and the result is cast
+// back to signed, so overflow wraps around instead of being undefined behavior.
+static constexpr int64_t add(int64_t lhs, int64_t rhs) {
+    const uint64_t wrapped = static_cast<uint64_t>(lhs) + static_cast<uint64_t>(rhs);
+    return static_cast<int64_t>(wrapped);
+}
+
+// Wrapping 128-bit addition, performed as unsigned so that overflow wraps around.
+static constexpr int128_t add(int128_t lhs, int128_t rhs) {
+    const uint128_t wrapped = static_cast<uint128_t>(lhs) + static_cast<uint128_t>(rhs);
+    return static_cast<int128_t>(wrapped);
+}
+
+// Minimal decoder for the basic simple8b selectors where all bits of a slot hold the value. Every
+// slot is decoded to a signed integer at runtime, which suits selectors with many bits per slot.
+// Encoded should be machine endian and the first slot should start at the least significant bit.
+template <int bits>
+struct SimpleDecoder {
+    // Bit mask covering a single slot. A slot equal to the mask is the 'missing' bit pattern.
+    static constexpr uint64_t mask = (1ull << bits) - 1;
+
+    // Number of values in this block.
+    static constexpr int iters = 60 / bits;
+
+    // Adds up all non-missing slots using wrapping addition.
+    template <typename T>
+    static T sum(uint64_t encoded) {
+        T total = 0;
+        for (int remaining = iters; remaining > 0; --remaining, encoded >>= bits) {
+            const uint64_t slot = encoded & mask;
+            if (slot == mask) {
+                // Missing value, contributes nothing to the sum.
+                continue;
+            }
+            total = add(total, Simple8bTypeUtil::decodeInt64(slot));
+        }
+        return total;
+    }
+
+    // Returns value of last slot. Treats missing as 0.
+    static int64_t lastIgnoreSkip(uint64_t encoded) {
+        const uint64_t lastSlot = encoded >> (bits * (iters - 1));
+        return lastSlot == mask ? 0 : Simple8bTypeUtil::decodeInt64(lastSlot);
+    }
+};
+
+// Lookup-table decoder: every possible slot bit pattern is decoded from unsigned to signed once,
+// when the table is constructed. Suitable for selectors with few bits per slot as the internal
+// lookup table grows with bits per slot. Encoded should be machine endian and the first slot
+// should start at the least significant bit.
+template <int bits>
+struct TableDecoder {
+    // Type to store in lookup table, depends on bit width per slot.
+    using T = std::conditional_t<bits <= 8, int8_t, int16_t>;
+
+    // Upper bound on the number of lookup table entries.
+    static constexpr int kMaxTableSize = 1 << 13;
+
+    // Bits consumed per decoding iteration, this decoder handles one slot at a time.
+    static constexpr int shift = bits;
+
+    // Number of values in this block
+    static constexpr int iters = (60 / bits * bits + shift - 1) / shift;
+    // Number of entries in lookup table
+    static constexpr int entries = 1 << shift;
+    // Bit mask to extract a single slot and to check for the missing bit pattern.
+    static constexpr uint64_t mask = (1ull << bits) - 1;
+
+    // Largest possible value that can be stored in this slot
+    static constexpr int64_t kMaxSlotValue = Simple8bTypeUtil::decodeInt64(mask - 1);
+    // Smallest possible value that can be stored in this slot
+    static constexpr int64_t kMinSlotValue = Simple8bTypeUtil::decodeInt64(mask - 2);
+
+    // Compile-time checks: the table respects the size limit and its element type can hold every
+    // value a slot may decode to.
+    static_assert(entries <= kMaxTableSize, "lookup table too large");
+    static_assert(kMaxSlotValue <= std::numeric_limits<T>::max(),
+                  "lookup table cannot store full decoded value range");
+    static_assert(kMinSlotValue >= std::numeric_limits<T>::min(),
+                  "lookup table cannot store full decoded value range");
+
+    T table[entries];
+
+    // Builds the lookup table. The entry for the missing bit pattern stays 0.
+    constexpr TableDecoder() : table() {
+        for (unsigned pattern = 0; pattern < entries; ++pattern) {
+            const uint64_t slot = pattern;
+            if (slot == mask) {
+                continue;
+            }
+            table[pattern] += Simple8bTypeUtil::decodeInt64(slot);
+        }
+    }
+
+    // Calculate the sum of all slots, one table lookup per slot.
+    template <typename SumT>
+    SumT sum(uint64_t encoded) const {
+        SumT total = 0;
+        for (int remaining = iters; remaining > 0; --remaining) {
+            total += table[encoded % entries];
+            encoded >>= shift;
+        }
+        return total;
+    }
+
+    // Returns value of last slot. Treats missing as 0
+    int64_t lastIgnoreSkip(uint64_t encoded) const {
+        const uint64_t lastSlot = encoded >> (bits * (iters - 1));
+        return table[lastSlot];
+    }
+};
+
+// Lookup-table decoder that handles several slots per lookup: each table entry holds the sum of
+// the signed values of all slots packed in its index. Suitable for selectors with few bits per
+// slot as the internal lookup table grows with bits per slot. Encoded should be machine endian
+// and the first slot should start at the least significant bit.
+template <int bits>
+struct ParallelTableDecoder {
+    // Constant to constrain table size, 2^X.
+    static constexpr int kMaxTableSizeExp = 13;
+    static constexpr int kMaxTableSize = 1 << kMaxTableSizeExp;
+
+    // Number of slots that we can decode together
+    static constexpr int parallel = kMaxTableSizeExp / bits;
+
+    // Number of bits to shift to get to the next decoding iteration.
+    static constexpr int shift = bits * parallel;
+    // Number of decoding iterations in this block
+    static constexpr int iters = (60 / bits * bits + shift - 1) / shift;
+    // Number of entries in lookup table
+    static constexpr int entries = 1 << shift;
+    // Bit mask to extract a single slot and to check for the missing bit pattern.
+    static constexpr uint64_t mask = (1ull << bits) - 1;
+
+    // Largest possible value that can be stored in this slot
+    static constexpr int64_t kMaxSlotValue = Simple8bTypeUtil::decodeInt64(mask - 1);
+    // Smallest possible value that can be stored in this slot
+    static constexpr int64_t kMinSlotValue = Simple8bTypeUtil::decodeInt64(mask - 2);
+
+    // Compile-time checks: parallel decoding is meaningful for this bit width and an int8_t table
+    // entry can hold the sum of 'parallel' slots at either extreme.
+    static_assert(
+        bits > 1,
+        "simple8b slots needs to use at least 2 bits to be meaningful for parallel decoding");
+    static_assert(parallel > 1, "bit size too large to fit in table for parallel decoding");
+    static_assert(kMaxSlotValue * parallel <= std::numeric_limits<int8_t>::max(),
+                  "lookup table cannot store full decoded value range");
+    static_assert(kMinSlotValue * parallel >= std::numeric_limits<int8_t>::min(),
+                  "lookup table cannot store full decoded value range");
+
+    int8_t table[entries];
+
+    // Builds the lookup table. Every index is split into 'parallel' slots and the entry
+    // accumulates the decoded value of each slot that is not the missing bit pattern.
+    constexpr ParallelTableDecoder() : table() {
+        for (unsigned pattern = 0; pattern < entries; ++pattern) {
+            for (int lane = 0; lane < parallel; ++lane) {
+                const uint64_t slot = (pattern >> (lane * bits)) & mask;
+                if (slot == mask) {
+                    continue;
+                }
+                table[pattern] += Simple8bTypeUtil::decodeInt64(slot);
+            }
+        }
+    }
+
+    // Calculate the sum of all slots, 'parallel' slots per table lookup.
+    template <typename T>
+    T sum(uint64_t encoded) const {
+        T total = 0;
+        for (int remaining = iters; remaining > 0; --remaining, encoded >>= shift) {
+            total = add(total, table[encoded % entries]);
+        }
+        return total;
+    }
+};
+
+// Decoder for the extended simple8b selectors, where the bits of a slot are split up in a value
+// and a count for a left shift of that value. Encoded should be machine endian and the first slot
+// should start at the least significant bit.
+template <int valueBits, int countBits, int countScale>
+struct ExtendedDecoder {
+    static constexpr int bits = valueBits + countBits;
+    static constexpr int iters = 56 / bits;
+    static constexpr uint64_t mask = (1ull << bits) - 1;
+    static constexpr uint64_t valueMask = (1ull << valueBits) - 1;
+    static constexpr uint64_t countMask = (1ull << countBits) - 1;
+
+    // Decodes the slot found in the low 'bits' bits of 'slot'. The count sits below the value and
+    // the value is shifted left by count * countScale before being decoded to signed. The caller
+    // is responsible for checking for the missing bit pattern.
+    template <typename T>
+    static T decodeSlot(uint64_t slot) {
+        const uint64_t count = slot & countMask;
+        const make_unsigned_t<T> value = (slot >> countBits) & valueMask;
+        return Simple8bTypeUtil::decodeInt(value << (count * countScale));
+    }
+
+    // Calculate the sum of all slots
+    template <typename T>
+    T sum(uint64_t encoded) const {
+        T total = 0;
+        for (int remaining = iters; remaining > 0; --remaining, encoded >>= bits) {
+            if ((encoded & mask) == mask) {
+                // Missing value, contributes nothing to the sum.
+                continue;
+            }
+            total = add(total, decodeSlot<T>(encoded));
+        }
+        return total;
+    }
+
+    // Returns value of last slot. Treats missing as 0
+    template <typename T>
+    T lastIgnoreSkip(uint64_t encoded) const {
+        const uint64_t lastSlot = encoded >> (bits * (iters - 1));
+        if ((lastSlot & mask) == mask) {
+            return 0;
+        }
+        return decodeSlot<T>(lastSlot);
+    }
+};
+
+// Storage for all decoders that we need for our various selector types. The table-based decoders
+// build their lookup tables in constexpr constructors.
+
+// Multi-slot table decoders, used by decodeAndSum() for selectors 2-6.
+static constexpr ParallelTableDecoder<2> decoderParallel2;
+static constexpr ParallelTableDecoder<3> decoderParallel3;
+static constexpr ParallelTableDecoder<4> decoderParallel4;
+static constexpr ParallelTableDecoder<5> decoderParallel5;
+static constexpr ParallelTableDecoder<6> decoderParallel6;
+// Single-slot table decoders. decodeLastSlotIgnoreSkip() uses the 2-6 bit variants for selectors
+// 2-6; the 7, 8, 10 and 12 bit variants are used by both decodeLastSlotIgnoreSkip() and
+// decodeAndSum().
+static constexpr TableDecoder<2> decoder2;
+static constexpr TableDecoder<3> decoder3;
+static constexpr TableDecoder<4> decoder4;
+static constexpr TableDecoder<5> decoder5;
+static constexpr TableDecoder<6> decoder6;
+static constexpr TableDecoder<7> decoder7;
+static constexpr TableDecoder<8> decoder8;
+static constexpr TableDecoder<10> decoder10;
+static constexpr TableDecoder<12> decoder12;
+// Runtime decoders for the wide slots of selectors 11-14.
+static constexpr SimpleDecoder<15> decoder15;
+static constexpr SimpleDecoder<20> decoder20;
+static constexpr SimpleDecoder<30> decoder30;
+static constexpr SimpleDecoder<60> decoder60;
+// Extended decoders for base selector 7, named decoderExtended7_<extended selector>. Template
+// arguments are <valueBits, countBits, countScale>.
+static constexpr ExtendedDecoder<2, 4, 1> decoderExtended7_1;
+static constexpr ExtendedDecoder<3, 4, 1> decoderExtended7_2;
+static constexpr ExtendedDecoder<4, 4, 1> decoderExtended7_3;
+static constexpr ExtendedDecoder<5, 4, 1> decoderExtended7_4;
+static constexpr ExtendedDecoder<7, 4, 1> decoderExtended7_5;
+static constexpr ExtendedDecoder<10, 4, 1> decoderExtended7_6;
+static constexpr ExtendedDecoder<14, 4, 1> decoderExtended7_7;
+static constexpr ExtendedDecoder<24, 4, 1> decoderExtended7_8;
+static constexpr ExtendedDecoder<52, 4, 1> decoderExtended7_9;
+// Extended decoders for base selector 8, named decoderExtended8_<extended selector>. All of them
+// use a count scale of 4; extended selectors 8-13 use 5 count bits instead of 4.
+static constexpr ExtendedDecoder<4, 4, 4> decoderExtended8_1;
+static constexpr ExtendedDecoder<5, 4, 4> decoderExtended8_2;
+static constexpr ExtendedDecoder<7, 4, 4> decoderExtended8_3;
+static constexpr ExtendedDecoder<10, 4, 4> decoderExtended8_4;
+static constexpr ExtendedDecoder<14, 4, 4> decoderExtended8_5;
+static constexpr ExtendedDecoder<24, 4, 4> decoderExtended8_6;
+static constexpr ExtendedDecoder<52, 4, 4> decoderExtended8_7;
+static constexpr ExtendedDecoder<4, 5, 4> decoderExtended8_8;
+static constexpr ExtendedDecoder<6, 5, 4> decoderExtended8_9;
+static constexpr ExtendedDecoder<9, 5, 4> decoderExtended8_10;
+static constexpr ExtendedDecoder<13, 5, 4> decoderExtended8_11;
+static constexpr ExtendedDecoder<23, 5, 4> decoderExtended8_12;
+static constexpr ExtendedDecoder<51, 5, 4> decoderExtended8_13;
+
+// Decodes last slot for simple8b block. Treats missing as 0.
+//
+// 'encoded' is a full simple8b block including its selector. The selector is extracted with
+// 'kBaseSelectorMask' and dropped by shifting 4 bits before dispatching to the decoder for that
+// selector. Selectors 7 and 8 carry an extended selector in the following 4 bits, which is
+// extracted and dropped the same way. Returns 0 for selector 1, selector 15 and any selector
+// without a dedicated case. An unknown extended selector triggers an invariant failure.
+template <typename T>
+T decodeLastSlotIgnoreSkip(uint64_t encoded) {
+    auto selector = encoded & simple8b_internal::kBaseSelectorMask;
+    encoded >>= 4;
+    switch (selector) {
+        case 1:  // Only 0 or missing deltas
+            break;
+        case 2:
+            return decoder2.lastIgnoreSkip(encoded);
+        case 3:
+            return decoder3.lastIgnoreSkip(encoded);
+        case 4:
+            return decoder4.lastIgnoreSkip(encoded);
+        case 5:
+            return decoder5.lastIgnoreSkip(encoded);
+        case 6:
+            return decoder6.lastIgnoreSkip(encoded);
+        case 7: {
+
+            // Extended selector 0 is the regular 7 bit slot layout, 1-9 are value+count layouts.
+            auto extended = encoded & simple8b_internal::kBaseSelectorMask;
+            encoded >>= 4;
+            switch (extended) {
+                case 0:
+                    return decoder7.lastIgnoreSkip(encoded);
+                case 1:
+                    return decoderExtended7_1.lastIgnoreSkip<T>(encoded);
+                case 2:
+                    return decoderExtended7_2.lastIgnoreSkip<T>(encoded);
+                case 3:
+                    return decoderExtended7_3.lastIgnoreSkip<T>(encoded);
+                case 4:
+                    return decoderExtended7_4.lastIgnoreSkip<T>(encoded);
+                case 5:
+                    return decoderExtended7_5.lastIgnoreSkip<T>(encoded);
+                case 6:
+                    return decoderExtended7_6.lastIgnoreSkip<T>(encoded);
+                case 7:
+                    return decoderExtended7_7.lastIgnoreSkip<T>(encoded);
+                case 8:
+                    return decoderExtended7_8.lastIgnoreSkip<T>(encoded);
+                case 9:
+                    return decoderExtended7_9.lastIgnoreSkip<T>(encoded);
+                default:
+                    invariant(false);  // invalid encoding
+                    break;
+            }
+            break;
+        }
+        case 8: {
+            // Extended selector 0 is the regular 8 bit slot layout, 1-13 are value+count layouts.
+            auto extended = encoded & simple8b_internal::kBaseSelectorMask;
+            encoded >>= 4;
+            switch (extended) {
+                case 0:
+                    return decoder8.lastIgnoreSkip(encoded);
+                case 1:
+                    return decoderExtended8_1.lastIgnoreSkip<T>(encoded);
+                case 2:
+                    return decoderExtended8_2.lastIgnoreSkip<T>(encoded);
+                case 3:
+                    return decoderExtended8_3.lastIgnoreSkip<T>(encoded);
+                case 4:
+                    return decoderExtended8_4.lastIgnoreSkip<T>(encoded);
+                case 5:
+                    return decoderExtended8_5.lastIgnoreSkip<T>(encoded);
+                case 6:
+                    return decoderExtended8_6.lastIgnoreSkip<T>(encoded);
+                case 7:
+                    return decoderExtended8_7.lastIgnoreSkip<T>(encoded);
+                case 8:
+                    return decoderExtended8_8.lastIgnoreSkip<T>(encoded);
+                case 9:
+                    return decoderExtended8_9.lastIgnoreSkip<T>(encoded);
+                case 10:
+                    return decoderExtended8_10.lastIgnoreSkip<T>(encoded);
+                case 11:
+                    return decoderExtended8_11.lastIgnoreSkip<T>(encoded);
+                case 12:
+                    return decoderExtended8_12.lastIgnoreSkip<T>(encoded);
+                case 13:
+                    return decoderExtended8_13.lastIgnoreSkip<T>(encoded);
+                default:
+                    invariant(false);  // invalid encoding
+                    break;
+            }
+            break;
+        }
+        case 9:
+            return decoder10.lastIgnoreSkip(encoded);
+        case 10:
+            return decoder12.lastIgnoreSkip(encoded);
+        case 11:
+            return decoder15.lastIgnoreSkip(encoded);
+        case 12:
+            return decoder20.lastIgnoreSkip(encoded);
+        case 13:
+            return decoder30.lastIgnoreSkip(encoded);
+        case 14:
+            return decoder60.lastIgnoreSkip(encoded);
+        // NOTE(review): 15 is presumably the RLE selector (kRleSelector) - confirm. It has no last
+        // slot of its own and yields 0 here.
+        case 15:
+            break;
+        default:
+            break;
+    }
+    return 0;
+}
+
+// Decodes and sums all slots in simple8b block, writes last encountered non-rle block in
+// 'prevNonRLE'.
+//
+// 'encoded' is a full simple8b block including its selector. Unless the block uses the RLE
+// selector it is stored, selector included, in '*prevNonRLE' before any decoding happens. An RLE
+// block is summed from the last slot of the block currently stored in '*prevNonRLE'. Any selector
+// or extended selector without a case below triggers fassert 8297100.
+template <typename T>
+T decodeAndSum(uint64_t encoded, uint64_t* prevNonRLE) {
+    auto selector = encoded & simple8b_internal::kBaseSelectorMask;
+    if (selector != simple8b_internal::kRleSelector) {
+        *prevNonRLE = encoded;
+    }
+    // Drop the selector so the first slot (or the extended selector) starts at bit 0.
+    encoded >>= 4;
+    switch (selector) {
+        case 1:  // Only 0 or missing deltas
+            return 0;
+        case 2:
+            return decoderParallel2.sum<T>(encoded);
+        case 3:
+            return decoderParallel3.sum<T>(encoded);
+        case 4:
+            return decoderParallel4.sum<T>(encoded);
+        case 5:
+            return decoderParallel5.sum<T>(encoded);
+        case 6:
+            return decoderParallel6.sum<T>(encoded);
+        case 7: {
+            // Extended selector 0 is the regular 7 bit slot layout, 1-9 are value+count layouts.
+            auto extended = encoded & simple8b_internal::kBaseSelectorMask;
+            encoded >>= 4;
+            switch (extended) {
+                case 0:
+                    return decoder7.sum<T>(encoded);
+                case 1:
+                    return decoderExtended7_1.sum<T>(encoded);
+                case 2:
+                    return decoderExtended7_2.sum<T>(encoded);
+                case 3:
+                    return decoderExtended7_3.sum<T>(encoded);
+                case 4:
+                    return decoderExtended7_4.sum<T>(encoded);
+                case 5:
+                    return decoderExtended7_5.sum<T>(encoded);
+                case 6:
+                    return decoderExtended7_6.sum<T>(encoded);
+                case 7:
+                    return decoderExtended7_7.sum<T>(encoded);
+                case 8:
+                    return decoderExtended7_8.sum<T>(encoded);
+                case 9:
+                    return decoderExtended7_9.sum<T>(encoded);
+                default:
+                    break;
+            }
+            break;
+        }
+        case 8: {
+            // Extended selector 0 is the regular 8 bit slot layout, 1-13 are value+count layouts.
+            auto extended = encoded & simple8b_internal::kBaseSelectorMask;
+            encoded >>= 4;
+            switch (extended) {
+                case 0:
+                    return decoder8.sum<T>(encoded);
+                case 1:
+                    return decoderExtended8_1.sum<T>(encoded);
+                case 2:
+                    return decoderExtended8_2.sum<T>(encoded);
+                case 3:
+                    return decoderExtended8_3.sum<T>(encoded);
+                case 4:
+                    return decoderExtended8_4.sum<T>(encoded);
+                case 5:
+                    return decoderExtended8_5.sum<T>(encoded);
+                case 6:
+                    return decoderExtended8_6.sum<T>(encoded);
+                case 7:
+                    return decoderExtended8_7.sum<T>(encoded);
+                case 8:
+                    return decoderExtended8_8.sum<T>(encoded);
+                case 9:
+                    return decoderExtended8_9.sum<T>(encoded);
+                case 10:
+                    return decoderExtended8_10.sum<T>(encoded);
+                case 11:
+                    return decoderExtended8_11.sum<T>(encoded);
+                case 12:
+                    return decoderExtended8_12.sum<T>(encoded);
+                case 13:
+                    return decoderExtended8_13.sum<T>(encoded);
+                default:
+                    break;
+            }
+            break;
+        }
+        case 9:
+            return decoder10.sum<T>(encoded);
+        case 10:
+            return decoder12.sum<T>(encoded);
+        case 11:
+            return decoder15.sum<T>(encoded);
+        case 12:
+            return decoder20.sum<T>(encoded);
+        case 13:
+            return decoder30.sum<T>(encoded);
+        case 14:
+            return decoder60.sum<T>(encoded);
+        // RLE block: the last value of the previous non-RLE block (missing counts as 0) is
+        // multiplied by the repeat count, which is the 4 bits following the selector plus one,
+        // scaled by 'kRleMultiplier'.
+        case simple8b_internal::kRleSelector:
+            return decodeLastSlotIgnoreSkip<T>(*prevNonRLE) * ((encoded & 0xf) + 1) *
+                simple8b_internal::kRleMultiplier;
+        default:
+            break;
+    }
+    // No decoder matched the selector or extended selector.
+    fassertFailed(8297100);
+    return 0;
+}
+
+}  // namespace
+
+// Sums the values of all simple8b blocks stored in 'buffer'. 'size' is in bytes and must be a
+// multiple of 8. Blocks are read as little endian 64-bit words and the per-block sums are combined
+// with wrapping addition. 'prevNonRLE' is both read and updated while decoding.
+template <typename T>
+T sum(const char* buffer, size_t size, uint64_t& prevNonRLE) {
+    invariant(size % 8 == 0);
+    T total = 0;
+    for (const char* const end = buffer + size; buffer != end; buffer += sizeof(uint64_t)) {
+        const uint64_t block = ConstDataView(buffer).read<LittleEndian<uint64_t>>();
+        total = add(total, decodeAndSum<T>(block, &prevNonRLE));
+    }
+    return total;
+}
+
+// Explicit template instantiations for our supported types. The header only declares sum(), its
+// definition lives in this file, so these are the instantiations other translation units link
+// against.
+template int64_t sum<int64_t>(const char*, size_t, uint64_t&);
+template int128_t sum<int128_t>(const char*, size_t, uint64_t&);
+
+}  // namespace mongo::simple8b
--- a/src/mongo/bson/util/simple8b.h
+++ b/src/mongo/bson/util/simple8b.h
@ -334,4 +334,17 @@ typename Simple8b<T>::Iterator Simple8b<T>::end() const {
    return {_buffer + _size};
 }

+namespace simple8b {
+// Constant for a simple8b block containing a single 'missing' value. Used as the initial value for
+// the 'prevNonRLE' argument of sum().
+static constexpr uint64_t kSingleSkip = 0xFFFFFFFFFFFFFFFE;
+
+/**
+ * Calculates the sum for multiple simple8b blocks in a buffer. 'prevNonRLE' should be initialized
+ * to 'kSingleSkip' when calculating sum for the first buffer. If the caller needs sum from multiple
+ * buffers the value should be passed along between the calls.
+ *
+ * 'size' is the buffer size in bytes and must be a multiple of 8, one 64-bit block is decoded per
+ * 8 bytes. Missing values do not contribute to the sum and overflow wraps around. On return
+ * 'prevNonRLE' holds the last non-RLE block that was encountered, if any.
+ *
+ * Only declared here, explicit instantiations exist for int64_t and int128_t.
+ */
+template <typename T>
+T sum(const char* buffer, size_t size, uint64_t& prevNonRLE);
+}  // namespace simple8b
+
 }  // namespace mongo
--- a/src/mongo/bson/util/simple8b_bm.cpp
+++ b/src/mongo/bson/util/simple8b_bm.cpp
@ -31,16 +31,42 @@
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
+#include <random>

 #include <boost/cstdint.hpp>

 #include "mongo/bson/util/builder.h"
 #include "mongo/bson/util/simple8b.h"
 #include "mongo/bson/util/simple8b_builder.h"
+#include "mongo/bson/util/simple8b_type_util.h"
 #include "mongo/util/shared_buffer.h"

 namespace mongo {

+// Builds the benchmark input: 10k simple8b encoded values drawn from a normal distribution
+// (mean 100, standard deviation 10) where roughly 5% of the positions are missing. The random
+// generators use a fixed seed.
+BufBuilder generateIntegers() {
+    constexpr int kNumValues = 10000;
+    constexpr int kSkipPercent = 5;
+
+    std::mt19937_64 seedGen(1337);
+    std::mt19937 gen(seedGen());
+    std::normal_distribution<> valueDist(100, 10);
+    std::uniform_int_distribution skipDist(1, 100);
+
+    BufBuilder buffer;
+    Simple8bBuilder<uint64_t> s8bBuilder(
+        [&buffer](uint64_t simple8bBlock) { buffer.appendNum(simple8bBlock); });
+
+    for (int generated = 0; generated != kNumValues; ++generated) {
+        // Draw the skip decision first, a value is only drawn when the position is not missing.
+        if (skipDist(gen) > kSkipPercent) {
+            s8bBuilder.append(std::lround(valueDist(gen)));
+            continue;
+        }
+        s8bBuilder.skip();
+    }
+
+    s8bBuilder.flush();
+    return buffer;
+}
+
 void BM_increasingValues(benchmark::State& state) {
    size_t totalBytes = 0;
    for (auto _ : state) {
@ -152,11 +178,52 @@ void BM_decode(benchmark::State& state) {
    state.SetBytesProcessed(totalBytes);
 }

+// Benchmarks the block-based simple8b::sum() over the data from generateIntegers().
+void BM_sum(benchmark::State& state) {
+    BufBuilder encoded = generateIntegers();
+    auto encodedSize = encoded.len();
+    auto data = encoded.release();
+
+    size_t bytesProcessed = 0;
+
+    for (auto _ : state) {
+        benchmark::ClobberMemory();
+        // Every iteration starts from a fresh 'previous block' state.
+        uint64_t prevNonRLE = simple8b::kSingleSkip;
+        int64_t result = simple8b::sum<int64_t>(data.get(), encodedSize, prevNonRLE);
+        benchmark::DoNotOptimize(result);
+        bytesProcessed += encodedSize;
+    }
+
+    state.SetBytesProcessed(bytesProcessed);
+}
+
+// Baseline for BM_sum: computes the sum over the same data by iterating over every value with the
+// regular Simple8b decoder and decoding each one to a signed integer.
+void BM_sumUnoptimized(benchmark::State& state) {
+    BufBuilder buffer = generateIntegers();
+    auto size = buffer.len();
+    auto buf = buffer.release();
+
+    size_t totalBytes = 0;
+
+    for (auto _ : state) {
+        benchmark::ClobberMemory();
+        Simple8b<uint64_t> s8b(buf.get(), size);
+        int64_t sum = 0;
+        for (auto&& val : s8b) {
+            // Missing values are skipped.
+            if (val) {
+                sum += Simple8bTypeUtil::decodeInt64(*val);
+            }
+        }
+        // Keep the result observable. Without this the sum is unused and the compiler is free to
+        // remove the decoding loop, which would make the comparison against BM_sum meaningless.
+        benchmark::DoNotOptimize(sum);
+        totalBytes += size;
+    }
+
+    state.SetBytesProcessed(totalBytes);
+}
+
 BENCHMARK(BM_increasingValues)->Arg(100);
 BENCHMARK(BM_rle)->Arg(100);
 BENCHMARK(BM_changingSmallValues)->Arg(100);
 BENCHMARK(BM_changingLargeValues)->Arg(100);
 BENCHMARK(BM_selectorSeven)->Arg(100);
 BENCHMARK(BM_decode);
+BENCHMARK(BM_sum);
+BENCHMARK(BM_sumUnoptimized);

 }  // namespace mongo
--- a/src/mongo/bson/util/simple8b_test.cpp
+++ b/src/mongo/bson/util/simple8b_test.cpp
@ -46,6 +46,7 @@
 #include "mongo/base/string_data.h"
 #include "mongo/bson/util/builder.h"
 #include "mongo/bson/util/simple8b_builder.h"
+#include "mongo/bson/util/simple8b_type_util.h"
 #include "mongo/platform/int128.h"
 #include "mongo/stdx/type_traits.h"
 #include "mongo/unittest/assert.h"
@ -54,6 +55,16 @@

 using namespace mongo;

+// Wrapping 64-bit addition. The operands are added as unsigned integers and the result is cast
+// back to signed, so overflow wraps around instead of being undefined behavior.
+static constexpr int64_t add(int64_t lhs, int64_t rhs) {
+    const uint64_t wrapped = static_cast<uint64_t>(lhs) + static_cast<uint64_t>(rhs);
+    return static_cast<int64_t>(wrapped);
+}
+
+// Wrapping 128-bit addition, performed as unsigned so that overflow wraps around.
+static constexpr int128_t add(int128_t lhs, int128_t rhs) {
+    const uint128_t wrapped = static_cast<uint128_t>(lhs) + static_cast<uint128_t>(rhs);
+    return static_cast<int128_t>(wrapped);
+}
+
 template <typename T>
 void assertValuesEqual(const Simple8b<T>& actual, const std::vector<boost::optional<T>>& expected) {
    auto it = actual.begin();
@ -104,6 +115,17 @@ void testSimple8b(const std::vector<boost::optional<T>>& expectedValues,

    Simple8b<T> s8b(buffer.get(), size);
    assertValuesEqual(s8b, expectedValues);
+
+    make_signed_t<T> sum = 0;
+    for (auto&& val : expectedValues) {
+        if (val) {
+            sum = add(sum, Simple8bTypeUtil::decodeInt(*val));
+        }
+    }
+    uint64_t prev = simple8b::kSingleSkip;
+    auto s = simple8b::sum<make_signed_t<T>>(
+        reinterpret_cast<const char*>(expectedBinary.data()), expectedBinary.size(), prev);
+    ASSERT_EQ(s, sum);
 }

 template <typename T>
--- a/src/mongo/bson/util/simple8b_type_util.h
+++ b/src/mongo/bson/util/simple8b_type_util.h
@ -54,13 +54,13 @@ public:
    // store as an unsigned integer
    // the most significant bit position to the least significant bit and call simple8b as an
    // unsigned integer.
-    static uint64_t encodeInt64(int64_t val) {
+    static constexpr uint64_t encodeInt64(int64_t val) {
        return (static_cast<uint64_t>(val) << 1) ^ (val >> 63);
    }
-    static int64_t decodeInt64(uint64_t val) {
+    static constexpr int64_t decodeInt64(uint64_t val) {
        return (val >> 1) ^ (~(val & 1) + 1);
    }
-    static uint128_t encodeInt128(int128_t val) {
+    static constexpr uint128_t encodeInt128(int128_t val) {
        // The Abseil right shift implementation on signed int128 is not correct as an arithmetic
        // shift in their non-intrinsic implementation. When we detect this case we replace the
        // right arithmetic shift of 127 positions that needs to produce 0xFF..FF or 0x00..00
@ -77,10 +77,13 @@ public:
 #endif
    }

-    static int128_t decodeInt128(uint128_t val) {
+    static constexpr int128_t decodeInt128(uint128_t val) {
        return static_cast<int128_t>((val >> 1) ^ (~(val & 1) + 1));
    }

+    template <typename T>
+    static constexpr auto decodeInt(T val);
+
    // These methods are for encoding OID with simple8b. The unique identifier is not part of
    // the encoded integer and must thus be provided when decoding.
    // Re-organize the bytes so that most of the entropy is in the least significant bytes.
@ -129,4 +132,14 @@ public:
    static constexpr std::array<double, kMemoryAsInteger> kScaleMultiplier = {
        1, 10, 100, 10000, 100000000};
 };
+
+// 64-bit specialization of decodeInt(), forwards to decodeInt64(). Marked constexpr to match the
+// constexpr declaration of the primary template, an explicit specialization does not inherit the
+// specifier and would otherwise not be usable in constant expressions.
+template <>
+inline constexpr auto Simple8bTypeUtil::decodeInt(uint64_t val) {
+    return decodeInt64(val);
+}
+
+// 128-bit specialization of decodeInt(), forwards to decodeInt128(). Marked constexpr to match the
+// constexpr declaration of the primary template, an explicit specialization does not inherit the
+// specifier and would otherwise not be usable in constant expressions.
+template <>
+inline constexpr auto Simple8bTypeUtil::decodeInt(uint128_t val) {
+    return decodeInt128(val);
+}
 }  // namespace mongo
--- a/src/mongo/platform/int128.h
+++ b/src/mongo/platform/int128.h
@ -33,3 +33,27 @@

 using uint128_t = absl::uint128;
 using int128_t = absl::int128;
+
+namespace mongo {
+/**
+ * Variants of std::make_unsigned / std::make_signed that additionally support the 128-bit integer
+ * types 'int128_t' and 'uint128_t'. Every other type is forwarded to the std trait. Like the std
+ * traits, a type that already has the requested signedness maps to itself.
+ */
+template <typename T>
+struct make_unsigned : public std::make_unsigned<T> {};
+
+template <>
+struct make_unsigned<int128_t> {
+    using type = uint128_t;
+};
+
+// Identity mapping, the std trait is only specified for integral and enumeration types.
+template <>
+struct make_unsigned<uint128_t> {
+    using type = uint128_t;
+};
+
+template <typename T>
+struct make_signed : public std::make_signed<T> {};
+
+template <>
+struct make_signed<uint128_t> {
+    using type = int128_t;
+};
+
+// Identity mapping, the std trait is only specified for integral and enumeration types.
+template <>
+struct make_signed<int128_t> {
+    using type = int128_t;
+};
+
+template <typename T>
+using make_unsigned_t = typename make_unsigned<T>::type;
+
+template <typename T>
+using make_signed_t = typename make_signed<T>::type;
+}  // namespace mongo