SERVER-109844 Basic support for disaggregated storage clusters (#40566)

Co-authored-by: Benety Goh <benety@mongodb.com>
Co-authored-by: Mathias Stearn <mathias@mongodb.com>
Co-authored-by: Kaitlin Mahar <kaitlin.mahar@mongodb.com>
Co-authored-by: Brandon Stoll <bstoll@users.noreply.github.com>
Co-authored-by: Vanessa Noia <54818020+nessnoia@users.noreply.github.com>
Co-authored-by: graphite-app[bot] <96075541+graphite-app[bot]@users.noreply.github.com>
Co-authored-by: Vishnu K <vishnu.kaushik@mongodb.com>
Co-authored-by: Sunil Narasimhamurthy <suniltheta@gmail.com>
Co-authored-by: Jiawei Yang <youngyang0820@gmail.com>
Co-authored-by: Will Korteland <korteland@users.noreply.github.com>
Co-authored-by: Saman Memaripour <amirsaman.memaripour@mongodb.com>
Co-authored-by: huayu-ouyang <huayu.ouyang@mongodb.com>
Co-authored-by: Suganthi Mani <38441312+smani87@users.noreply.github.com>
Co-authored-by: Thomas Goyne <thomas.goyne@mongodb.com>
Co-authored-by: Haley Connelly <haley.connelly@mongodb.com>
Co-authored-by: Billy Donahue <BillyDonahue@users.noreply.github.com>
Co-authored-by: Kirollos Morkos <kiro.morkos@mongodb.com>
Co-authored-by: Lingzhi Deng <lingzhi.deng@mongodb.com>
Co-authored-by: Hartek Sabharwal <hartek.sabharwal@mongodb.com>
Co-authored-by: Aaron Himelman <aaron.himelman@mongodb.com>
Co-authored-by: Moustafa Maher <m.maher@mongodb.com>
Co-authored-by: prathmesh-kallurkar <prathmesh.kallurkar@mongodb.com>
Co-authored-by: Dan Larkin-York <13419935+dhly-etc@users.noreply.github.com>
Co-authored-by: Shreyas Kalyan <35750327+shreyaskalyan@users.noreply.github.com>
Co-authored-by: Shreyas Kalyan <shreyas.kalyan@mongodb.com>
Co-authored-by: Jonathan Reams <jbreams@mongodb.com>
Co-authored-by: adriangzz <adriangonzalezmontemayor@gmail.com>
Co-authored-by: Eric Milkie <milkie@users.noreply.github.com>
Co-authored-by: Aaron B <aaron.balsara@mongodb.com>
Co-authored-by: Ali Mir <ali.mir@mongodb.com>
Co-authored-by: Alex Blekhman <alexander.blekhman@mongodb.com>
Co-authored-by: mpobrien <mpobrien005@gmail.com>
Co-authored-by: Mark Benvenuto <mark.benvenuto@mongodb.com>
Co-authored-by: Ruby Chen <ruby.chen@mongodb.com>
Co-authored-by: Jagadish Nallapaneni <146780625+jagadishmdb@users.noreply.github.com>
Co-authored-by: Jonas Bergler <jonas.bergler@mongodb.com>
Co-authored-by: Peter Macko <peter.macko@mongodb.com>
Co-authored-by: Nic <nic.hollingum@mongodb.com>
Co-authored-by: Jiawei Yang <jiawei.yang@mongodb.com>
Co-authored-by: Jordi Serra Torrens <jordist@users.noreply.github.com>
Co-authored-by: Sunil Narasimhamurthy <sunil.narasimhamurthy@mongodb.com>
GitOrigin-RevId: a1c6609c820052137e2aa759711e86c337ae6f9f
Matthew Russotto 2025-08-29 17:49:01 -04:00 committed by MongoDB Bot
parent 7f3bd6ca62
commit 8d12269eec
182 changed files with 44273 additions and 689 deletions

.github/CODEOWNERS vendored

@ -81,6 +81,9 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
/buildscripts/idl/**/idl_compatibility_errors.py @10gen/query-optimization @svc-auto-approve-bot
/buildscripts/idl/**/test_compatibility.py @10gen/query-optimization @svc-auto-approve-bot
# The following patterns are parsed from ./buildscripts/modules/atlas/OWNERS.yml
/buildscripts/modules/atlas/ @10gen/server-disagg-storage @svc-auto-approve-bot
# The following patterns are parsed from ./buildscripts/monitor_build_status/OWNERS.yml
/buildscripts/monitor_build_status/ @10gen/devprod-correctness @svc-auto-approve-bot
@ -888,6 +891,8 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
/jstests/libs/**/catalog_list_operations_consistency_validator.js @10gen/server-catalog-and-routing @svc-auto-approve-bot
/jstests/libs/**/raw_operation_utils.js @10gen/server-collection-write-path @svc-auto-approve-bot
/jstests/libs/**/json_utils.js @10gen/query-integration-extensions @svc-auto-approve-bot
/jstests/libs/**/replicated_ident_utils.js @10gen/server-storage-engine-integration @svc-auto-approve-bot
/jstests/libs/**/replicated_record_ids_utils.js @10gen/server-storage-engine-integration @svc-auto-approve-bot
# The following patterns are parsed from ./jstests/libs/clustered_collections/OWNERS.yml
/jstests/libs/clustered_collections/**/* @10gen/server-collection-write-path @svc-auto-approve-bot
@ -1831,9 +1836,6 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
/src/mongo/db/commands/query_cmd/**/release_memory_cmd.* @10gen/query-execution @svc-auto-approve-bot
/src/mongo/db/commands/query_cmd/**/update_metrics.* @10gen/query-execution @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/disagg_storage/OWNERS.yml
/src/mongo/db/disagg_storage/**/* @10gen/server-disagg-storage @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/exec/OWNERS.yml
/src/mongo/db/exec/**/* @10gen/query-execution-classic @svc-auto-approve-bot
/src/mongo/db/exec/**/OWNERS.yml @10gen/query-execution-staff-leads @10gen/query-integration-staff-leads @10gen/query-optimization-staff-leads @svc-auto-approve-bot
@ -2043,6 +2045,15 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/modules/atlas/OWNERS.yml
/src/mongo/db/modules/atlas/**/* @10gen/server-disagg-storage @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/modules/atlas/jstests/disagg_storage/OWNERS.yml
/src/mongo/db/modules/atlas/jstests/disagg_storage/**/* @10gen/server-disagg-storage @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/modules/atlas/src/disagg_storage/OWNERS.yml
/src/mongo/db/modules/atlas/src/disagg_storage/**/* @10gen/server-disagg-storage @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/modules/atlas/src/disagg_storage/encryption/OWNERS.yml
/src/mongo/db/modules/atlas/src/disagg_storage/encryption/**/* @10gen/server-security @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/modules/enterprise/OWNERS.yml
/src/mongo/db/modules/enterprise/BUILD.bazel @10gen/devprod-build @svc-auto-approve-bot
/src/mongo/db/modules/enterprise/README.md @10gen/server-release @svc-auto-approve-bot
@ -2589,6 +2600,9 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/repl/split_horizon/OWNERS.yml
/src/mongo/db/repl/split_horizon/**/* @10gen/server-split-horizon @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/rss/OWNERS.yml
/src/mongo/db/rss/**/* @10gen/server-replication @10gen/server-storage-execution @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/s/OWNERS.yml
/src/mongo/db/s/**/* @10gen/server-cluster-scalability @svc-auto-approve-bot
/src/mongo/db/s/**/*transaction* @10gen/server-transactions @svc-auto-approve-bot
@ -3068,6 +3082,7 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
/src/third_party/**/croaring @10gen/query-execution @svc-auto-approve-bot
/src/third_party/**/fmt @10gen/server-programmability @svc-auto-approve-bot
/src/third_party/**/folly @10gen/server-workload-scheduling @svc-auto-approve-bot
/src/third_party/**/googletest_restricted_for_disagg_only @10gen/server-disagg-storage @svc-auto-approve-bot
/src/third_party/**/gperftools @10gen/server-workload-scheduling @svc-auto-approve-bot
/src/third_party/**/grpc @10gen/server-networking-and-observability @svc-auto-approve-bot
/src/third_party/**/icu4c* @10gen/query-execution @svc-auto-approve-bot


@ -66,6 +66,7 @@ a notice will be included in
| [pyiso8601] | MIT | 2.1.0 | unknown | |
| [RoaringBitmap/CRoaring] | Unknown License | v3.0.1 | | ✗ |
| [SchemaStore/schemastore] | Apache-2.0 | Unknown | | |
| [sls-proto] | Unknown License | 1.0 | unknown | ✗ |
| [smhasher] | Unknown License | Unknown | unknown | ✗ |
| [Snowball Stemming Algorithms] | BSD-3-Clause | 7b264ffa0f767c579d052fd8142558dc8264d795 | ✗ | ✗ |
| [subunit] | BSD-3-Clause, Apache-2.0 | 1.4.4 | unknown | |
@ -122,6 +123,7 @@ a notice will be included in
[opentelemetry-cpp]: https://github.com/open-telemetry/opentelemetry-cpp/
[opentelemetry-proto]: https://github.com/open-telemetry/opentelemetry-proto
[pyiso8601]: https://pypi.org/project/iso8601/
[sls-proto]: https://github.com/10gen/sls
[smhasher]: https://github.com/aappleby/smhasher/blob/a6bd3ce/
[subunit]: https://github.com/testing-cabal/subunit
[tcmalloc]: https://github.com/google/tcmalloc


@ -1,12 +1,12 @@
enterprise:
jstest_dirs:
- src/mongo/db/modules/enterprise/jstests
# atlas:
# fixture_dirs:
# - buildscripts/modules/atlas/fixtures
# hook_dirs:
# - buildscripts/modules/atlas/hooks
# suite_dirs:
# - buildscripts/modules/atlas/suites
# jstest_dirs:
# - buildscripts/modules/atlas/jstests
atlas:
fixture_dirs:
- buildscripts/modules/atlas/fixtures
hook_dirs:
- buildscripts/modules/atlas/hooks
suite_dirs:
- buildscripts/modules/atlas/suites
jstest_dirs:
- src/mongo/db/modules/atlas/jstests


@ -1,5 +1,6 @@
"""Utilities for constructing fixtures that may span multiple versions."""
import json
import logging
import threading
from abc import ABC, abstractmethod
@ -223,6 +224,33 @@ class ReplSetBuilder(FixtureBuilder):
)
replset.install_mongod(node)
if replset.disagg_base_config:
members = []
for idx, node in enumerate(replset.nodes):
member = {
"_id": idx,
"host": node.get_internal_connection_string(),
"priority": 1
}
members.append(member)
disagg_base_config = {
**replset.disagg_base_config,
"replSetConfig": {
"_id": replset.replset_name,
"version": 1,
"term": 1,
"members": members,
}
}
for node in replset.nodes:
opts = node.get_mongod_options()
opts["set_parameters"]["disaggregatedStorageConfig"] = json.dumps(
disagg_base_config)
opts["set_parameters"]["disaggregatedStorageEnabled"] = True
opts["set_parameters"]["logComponentVerbosity"] = json.dumps(
{"disaggregatedStorage": 5})
node.set_mongod_options(opts)
if replset.start_initial_sync_node:
if not replset.initial_sync_node:
replset.initial_sync_node_idx = replset.num_nodes


@ -74,6 +74,7 @@ class ReplicaSetFixture(interface.ReplFixture, interface._DockerComposeInterface
launch_mongot=False,
load_all_extensions=False,
router_endpoint_for_mongot: Optional[int] = None,
disagg_base_config=None,
):
"""Initialize ReplicaSetFixture."""
@ -139,6 +140,8 @@ class ReplicaSetFixture(interface.ReplFixture, interface._DockerComposeInterface
# Set the default oplogSize to 511MB.
self.mongod_options.setdefault("oplogSize", 511)
self.disagg_base_config = disagg_base_config
# The dbpath in mongod_options is used as the dbpath prefix for replica set members and
# takes precedence over other settings. The ShardedClusterFixture uses this parameter to
# create replica sets and assign their dbpath structure explicitly.
@ -462,12 +465,14 @@ class ReplicaSetFixture(interface.ReplFixture, interface._DockerComposeInterface
primary = self.nodes[0]
client = primary.mongo_client()
while True:
self.logger.info("Waiting for primary on port %d to be elected.", primary.port)
is_master = client.admin.command("isMaster")["ismaster"]
if is_master:
self.logger.info(
"Waiting for primary on port %d to be elected.", primary.port)
cmd_result = client.admin.command("isMaster")
if cmd_result["ismaster"]:
break
time.sleep(0.1) # Wait a little bit before trying again.
self.logger.info("Primary on port %d successfully elected.", primary.port)
self.logger.info(
"Primary on port %d successfully elected.", primary.port)
def _await_secondaries(self):
# Wait for the secondaries to become available.


@ -188,6 +188,12 @@ class MongoDFixture(interface.Fixture, interface._DockerComposeInterface):
self.logger.debug("Mongod not running when gathering standalone fixture pid.")
return out
def get_mongod_options(self):
return self.mongod_options
def set_mongod_options(self, options):
self.mongod_options = options
def _handle_await_ready_retry(self, deadline):
remaining = deadline - time.time()
if remaining <= 0.0:


@ -50,6 +50,10 @@ class FixtureSetupTestCase(FixtureTestCase):
self.fixture.await_ready()
if (
not isinstance(self.fixture, (fixture_interface.NoOpFixture, ExternalFixture))
# TODO(SERVER-109851): Remove this.
# disagg mongod does not yet support "refreshLogicalSessionCacheNow" because it requires
# wtimeout support.
and self.fixture.__class__.__name__ != "DisaggFixture"
# Replica set with --configsvr cannot run refresh unless it is part of a sharded cluster.
and not (
isinstance(self.fixture, ReplicaSetFixture)


@ -3,7 +3,9 @@
import copy
import os
import os.path
import random
import shutil
import string
import sys
import threading
import uuid
@ -67,6 +69,9 @@ class _SingleJSTestCase(interface.ProcessTestCase):
global_vars["MongoRunner.dataPath"] = data_path
test_data = global_vars.get("TestData", {}).copy()
test_run_id = "".join(random.choices(string.ascii_letters + string.digits, k=10))
self.fixture.test_run_id = test_run_id
test_data["test_run_id"] = test_run_id
test_data["minPort"] = core.network.PortAllocator.min_test_port(self.fixture.job_num)
test_data["maxPort"] = core.network.PortAllocator.max_test_port(self.fixture.job_num)
test_data["peerPids"] = self.fixture.pids()


@ -1 +0,0 @@
"""Empty."""


@ -1,51 +0,0 @@
import os
import subprocess
import sys
import unittest
import yaml
import buildscripts.burn_in_tests as under_test
class TestBurnInTestsEnd2End(unittest.TestCase):
@unittest.skip(
"Disabled since this test has behavior dependent on currently modified jstests. Re-enable with SERVER-108783."
)
@classmethod
def setUpClass(cls):
subprocess.run(
[
sys.executable,
"buildscripts/burn_in_tests.py",
"generate-test-membership-map-file-for-ci",
]
)
@classmethod
def tearDownClass(cls):
if os.path.exists(under_test.BURN_IN_TEST_MEMBERSHIP_FILE):
os.remove(under_test.BURN_IN_TEST_MEMBERSHIP_FILE)
def test_valid_yaml_output(self):
process = subprocess.run(
[
sys.executable,
"buildscripts/burn_in_tests.py",
"run",
"--yaml",
],
text=True,
capture_output=True,
)
self.assertEqual(
0,
process.returncode,
process.stderr,
)
output = process.stdout
try:
yaml.safe_load(output)
except Exception:
self.fail(msg="burn_in_tests.py does not output valid yaml.")


@ -93,8 +93,6 @@ include:
- filename: etc/evergreen_yml_components/variants/codecoverage/test_dev.yml
- filename: src/mongo/db/modules/atlas/atlas_dev.yml
parameters:
- key: evergreen_config_file_path
value: "etc/evergreen.yml"


@ -79,6 +79,7 @@ rules:
- assigned_to_jira_team_server_workload_scheduling
- assigned_to_jira_team_server_networking_and_observability
- assigned_to_jira_team_server_integration
- assigned_to_jira_team_server_disagg
# https://github.com/10gen/mothra/blob/main/mothra/teams/rnd_dev_prod.yaml
- assigned_to_jira_team_devprod_build
@ -130,6 +131,7 @@ rules:
- incompatible_inmemory
- incompatible_all_feature_flags
- incompatible_development_variant
- incompatible_disaggregated_storage
- requires_compile_variant
- requires_large_host
- requires_large_host_tsan


@ -1236,6 +1236,19 @@ functions:
args:
- "./src/evergreen/resmoke_tests_execute_bazel.sh"
"assume ECR role": &assume_ecr_role
command: ec2.assume_role
params:
role_arn: "${disagg_storage_ecr_arn}"
"fetch module images": &fetch_module_images
command: subprocess.exec
params:
binary: bash
add_expansions_to_env: true # needed to get the AWS secrets from ec2.assume_role
args:
- "./src/evergreen/fetch_module_images.sh"
"retrieve generated test configuration":
&retrieve_generated_test_configuration
command: s3.get
@ -1472,6 +1485,8 @@ functions:
- *f_expansions_write
- *sign_macos_dev_binaries
- *multiversion_exclude_tags_generate
- *assume_ecr_role
- *fetch_module_images
- *execute_resmoke_tests
# The existence of the "run_tests_infrastructure_failure" file indicates this failure isn't
# directly actionable. We use type=setup rather than type=system or type=test for this command
@ -1521,6 +1536,8 @@ functions:
- *configure_evergreen_api_credentials
- *sign_macos_dev_binaries
- *multiversion_exclude_tags_generate
- *assume_ecr_role
- *fetch_module_images
- *execute_resmoke_tests
# The existence of the "run_tests_infrastructure_failure" file indicates this failure isn't
# directly actionable. We use type=setup rather than type=system or type=test for this command
@ -3294,3 +3311,11 @@ functions:
[AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN]
args:
- "./src/evergreen/container_registry_login.sh"
"build and push module images": &build_and_push_module_images
command: subprocess.exec
params:
binary: bash
add_expansions_to_env: true # needed to get the AWS secrets from ec2.assume_role
args:
- "./src/evergreen/build_and_push_module_images.sh"


@ -212,6 +212,8 @@ tasks:
- "src/src/**.yml"
- "src/src/mongo/client/sdam/json_tests/sdam_tests/**"
- "src/src/mongo/client/sdam/json_tests/server_selection_tests/**"
- "src/src/mongo/db/modules/atlas/evergreen/**"
- "src/src/mongo/db/modules/atlas/jstests/**"
- "src/src/mongo/db/modules/enterprise/docs/**"
- "src/src/mongo/db/modules/enterprise/jstests/**"
- "src/src/mongo/db/modules/subscription/jstests/**"


@ -431,7 +431,13 @@ tasks:
- <<: *run_jepsen_template
name: jepsen_config_fuzzer_list-append
tags: ["assigned_to_jira_team_server_repl", "experimental", "jepsen_docker"]
tags:
[
"assigned_to_jira_team_server_repl",
"experimental",
"jepsen_docker",
"uses_docker",
]
commands:
- func: "do setup"
- func: "do jepsen docker setup"
@ -514,7 +520,13 @@ tasks:
- <<: *run_jepsen_template
name: jepsen_list-append
tags: ["assigned_to_jira_team_server_repl", "experimental", "jepsen_docker"]
tags:
[
"assigned_to_jira_team_server_repl",
"experimental",
"jepsen_docker",
"uses_docker",
]
commands:
- func: "do setup"
- func: "do jepsen docker setup"
@ -689,11 +701,7 @@ tasks:
# Check that the mutational fuzzer can parse JS files modified in a patch build.
- name: lint_fuzzer_sanity_patch
tags:
[
"assigned_to_jira_team_devprod_correctness",
"development_critical_single_variant",
]
tags: ["assigned_to_jira_team_devprod_correctness", "experimental"]
patch_only: true
commands:
- command: manifest.load
@ -1517,6 +1525,22 @@ tasks:
commands:
- func: "generate smoke test tasks"
- name: push_mongod_to_ecr
tags: ["assigned_to_jira_team_disag_mongod"]
depends_on:
- name: package
commands:
- command: manifest.load
- func: "git get project and add git tag"
- func: "f_expansions_write"
- func: "set up venv"
- func: "fetch dist tarball"
- func: "extract binaries"
- command: ec2.assume_role
params:
role_arn: "${disagg_storage_ecr_arn}"
- func: "build and push module images"
- name: selinux_rhel8_enterprise
tags: ["assigned_to_jira_team_server_security", "experimental"]
depends_on:


@ -1962,14 +1962,15 @@ tasks:
- <<: *jstestfuzz_template
name: resharding_timeseries_fuzzer_gen
tags:
[
tags: [
"assigned_to_jira_team_server_cluster_scalability",
"default",
"feature_flag_guarded",
"random_name",
"require_npm",
"requires_all_feature_flags",
# TODO SERVER-109849: Remove this tag.
"incompatible_disaggregated_storage",
]
commands:
- func: "generate resmoke tasks"


@ -667,12 +667,13 @@ tasks:
- <<: *jstestfuzz_template
name: initial_sync_fuzzer_sanity_patch_gen
patch_only: true
tags:
[
tags: [
"assigned_to_jira_team_server_repl",
"default",
"require_npm",
"random_name",
# TODO SERVER-109849: Remove this tag.
"incompatible_disaggregated_storage",
]
commands:
- func: "generate resmoke tasks"
@ -1551,13 +1552,14 @@ tasks:
- <<: *jstestfuzz_template
name: rollback_fuzzer_sanity_patch_gen
patch_only: true
tags:
[
tags: [
"assigned_to_jira_team_server_repl",
"default",
"rollbackfuzzer",
"require_npm",
"random_name",
# TODO SERVER-109849: Remove this tag.
"incompatible_disaggregated_storage",
]
commands:
- func: "generate resmoke tasks"
@ -2341,3 +2343,30 @@ tasks:
- func: "generate resmoke tasks"
vars:
suite: v1index_jscore_passthrough
################################################
# Disagg Storage tasks #
################################################
- <<: *gen_task_template
name: disagg_storage_gen
tags:
[
"assigned_to_jira_team_server_disagg",
"default",
"large",
"clustered_collections",
"uses_docker",
]
commands:
- func: "generate resmoke tasks"
vars:
suite: disagg_storage
use_large_distro: "true"
- <<: *task_template
name: disagg_repl_jscore_passthrough
tags: ["assigned_to_jira_team_server_disagg", "default", "uses_docker"]
commands:
- func: "do setup"
- func: "run tests"


@ -85,7 +85,7 @@ buildvariants:
- <<: *linux-arm64-dynamic-compile-params
name: &amazon-linux2023-arm64-static-compile amazon-linux2023-arm64-static-compile
display_name: "! Amazon Linux 2023 arm64 Enterprise Compile"
display_name: "! Amazon Linux 2023 arm64 Atlas Compile"
tags: ["required", "bazel_check", "forbid_tasks_tagged_with_experimental"]
expansions:
<<: *linux-arm64-static-enterprise-compile-expansions
@ -104,7 +104,6 @@ buildvariants:
# since it's running on a c6g.16xlarge
bazel_compile_flags: >-
--define=MONGO_DISTMOD=amazon2023
--//bazel/config:build_otel=True
--remote_execution_priority=3
--jobs=1600
--build_atlas=True
@ -169,8 +168,8 @@ buildvariants:
- name: .release_critical .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
distros:
- amazon2023-arm64-atlas-latest-large
- name: .default !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
- name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
- name: .default !.requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
- name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
distros:
- amazon2023-arm64-atlas-latest-large
- name: .fuzzer_deterministic


@ -168,10 +168,16 @@ buildvariants:
- name: .release_critical .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only
distros:
- amazon2023-arm64-atlas-latest-large
- name: .default !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only
- name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only
- name: .default !.requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only
- name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only
distros:
- amazon2023-arm64-atlas-latest-large
- name: .default !.requires_large_host .uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only
distros:
- amazon2023-arm64-latest-small
- name: .default .requires_large_host .uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only
distros:
- amazon2023-arm64-latest-large
- name: .fuzzer_deterministic
- <<: *enterprise-amazon-linux2023-arm64-all-feature-flags-template
@ -193,8 +199,8 @@ buildvariants:
- name: .release_critical .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.requires_all_feature_flags
distros:
- amazon2023-arm64-latest-large
- name: .default !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.requires_all_feature_flags
- name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.requires_all_feature_flags
- name: .default !.requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.requires_all_feature_flags
- name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.requires_all_feature_flags
distros:
- amazon2023-arm64-latest-large
- name: .fuzzer_deterministic
@ -219,8 +225,8 @@ buildvariants:
- name: .release_critical .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags !.multiversion !.suggested_excluding_required__for_devprod_mitigation_only
distros:
- amazon2023-arm64-atlas-latest-large
- name: .default !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags !.multiversion !.suggested_excluding_required__for_devprod_mitigation_only
- name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags !.multiversion !.suggested_excluding_required__for_devprod_mitigation_only
- name: .default !.requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags !.multiversion !.suggested_excluding_required__for_devprod_mitigation_only
- name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags !.multiversion !.suggested_excluding_required__for_devprod_mitigation_only
distros:
- amazon2023-arm64-atlas-latest-large
- name: .fuzzer_deterministic
@ -524,8 +530,8 @@ buildvariants:
- name: .release_critical .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.multiversion
distros:
- amazon2023-arm64-latest-large
- name: .default !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.multiversion
- name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.multiversion
- name: .default !.requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.multiversion
- name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.suggested_excluding_required__for_devprod_mitigation_only !.multiversion
distros:
- amazon2023-arm64-latest-large
- name: .fuzzer_deterministic !.multiversion
@ -548,8 +554,8 @@ buildvariants:
# - name: .release_critical .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.multiversion !.serverless !.exclude_when_record_ids_replicated
# distros:
# - amazon2023-arm64-atlas-latest-large
# - name: .default !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.multiversion !.serverless !.exclude_when_record_ids_replicated
# - name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.multiversion !.serverless !.exclude_when_record_ids_replicated
# - name: .default !.requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.multiversion !.serverless !.exclude_when_record_ids_replicated
# - name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags !.multiversion !.serverless !.exclude_when_record_ids_replicated
# distros:
# - amazon2023-arm64-atlas-latest-large
# expansions:


@ -129,8 +129,8 @@ buildvariants:
- name: .release_critical .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
distros:
- rhel8.8-medium
- name: .default !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
- name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
- name: .default !.requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
- name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.requires_all_feature_flags
distros:
- rhel8.8-medium


@ -303,8 +303,8 @@ buildvariants:
- name: .release_critical .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags
distros:
- rhel8.8-medium
- name: .default !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags
- name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags
- name: .default !.requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags
- name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_all_feature_flags
distros:
- rhel8.8-medium


@ -226,11 +226,11 @@ buildvariants:
- name: .release_critical .requires_large_host_debug_mode !.incompatible_development_variant !.incompatible_debug_mode !.incompatible_system_allocator !.requires_all_feature_flags
distros:
- *enterprise-rhel-8-64-bit-dynamic-debug-mode-large-distro-name
- name: .default !.requires_large_host !.requires_large_host_debug_mode !.incompatible_development_variant !.incompatible_debug_mode !.incompatible_system_allocator !.requires_all_feature_flags
- name: .default .requires_large_host !.incompatible_development_variant !.incompatible_debug_mode !.incompatible_system_allocator !.requires_all_feature_flags
- name: .default !.requires_large_host !.requires_large_host_debug_mode !.uses_docker !.incompatible_development_variant !.incompatible_debug_mode !.incompatible_system_allocator !.requires_all_feature_flags
- name: .default .requires_large_host !.uses_docker !.incompatible_development_variant !.incompatible_debug_mode !.incompatible_system_allocator !.requires_all_feature_flags
distros:
- *enterprise-rhel-8-64-bit-dynamic-debug-mode-large-distro-name
- name: .default .requires_large_host_debug_mode !.incompatible_development_variant !.incompatible_debug_mode !.incompatible_system_allocator !.requires_all_feature_flags
- name: .default .requires_large_host_debug_mode !.uses_docker !.incompatible_development_variant !.incompatible_debug_mode !.incompatible_system_allocator !.requires_all_feature_flags
distros:
- *enterprise-rhel-8-64-bit-dynamic-debug-mode-large-distro-name
- name: .non_deterministic !.requires_large_host !.requires_large_host_debug_mode !.incompatible_development_variant !.incompatible_debug_mode !.incompatible_system_allocator !.requires_all_feature_flags
@ -480,8 +480,8 @@ buildvariants:
- name: .release_critical .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_aubsan !.incompatible_system_allocator !.incompatible_all_feature_flags
distros:
- rhel8.8-xlarge
- name: .default !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_aubsan !.incompatible_system_allocator !.incompatible_all_feature_flags
- name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_aubsan !.incompatible_system_allocator !.incompatible_all_feature_flags
- name: .default !.requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_aubsan !.incompatible_system_allocator !.incompatible_all_feature_flags
- name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_aubsan !.incompatible_system_allocator !.incompatible_all_feature_flags
distros:
- rhel8.8-xlarge
- name: .non_deterministic !.requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_aubsan !.incompatible_system_allocator !.incompatible_all_feature_flags
@ -583,11 +583,11 @@ buildvariants:
- name: .release_critical .requires_large_host_tsan !.requires_compile_variant !.incompatible_development_variant !.incompatible_tsan !.incompatible_system_allocator !.incompatible_all_feature_flags
distros:
- *enterprise-rhel8-debug-tsan-large-distro-name
- name: .default !.requires_large_host !.requires_large_host_tsan !.requires_compile_variant !.incompatible_development_variant !.incompatible_tsan !.incompatible_system_allocator !.incompatible_all_feature_flags
- name: .default .requires_large_host !.requires_compile_variant !.incompatible_development_variant !.incompatible_tsan !.incompatible_system_allocator !.incompatible_all_feature_flags
- name: .default !.requires_large_host !.requires_large_host_tsan !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_tsan !.incompatible_system_allocator !.incompatible_all_feature_flags
- name: .default .requires_large_host !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_tsan !.incompatible_system_allocator !.incompatible_all_feature_flags
distros:
- *enterprise-rhel8-debug-tsan-large-distro-name
- name: .default .requires_large_host_tsan !.requires_compile_variant !.incompatible_development_variant !.incompatible_tsan !.incompatible_system_allocator !.incompatible_all_feature_flags
- name: .default .requires_large_host_tsan !.uses_docker !.requires_compile_variant !.incompatible_development_variant !.incompatible_tsan !.incompatible_system_allocator !.incompatible_all_feature_flags
distros:
- *enterprise-rhel8-debug-tsan-large-distro-name
- name: .non_deterministic !.requires_large_host !.requires_large_host_tsan !.requires_compile_variant !.incompatible_development_variant !.incompatible_tsan !.incompatible_system_allocator !.incompatible_all_feature_flags


@ -0,0 +1,7 @@
set -e

# Run each module's image build script, if the module provides one.
for dir in ./src/src/mongo/db/modules/*; do
    if test -f "$dir/evergreen/build_and_push_images.sh"; then
        bash "$dir/evergreen/build_and_push_images.sh"
    fi
done


@ -0,0 +1,7 @@
set -e

# Fetch each module's prebuilt images, if the module provides a fetch script.
for dir in ./src/src/mongo/db/modules/*; do
    if test -f "$dir/evergreen/fetch_images.sh"; then
        bash "$dir/evergreen/fetch_images.sh"
    fi
done


@ -87,3 +87,9 @@ filters:
- "json_utils.js":
approvers:
- 10gen/query-integration-extensions
- "replicated_ident_utils.js":
approvers:
- 10gen/server-storage-engine-integration
- "replicated_record_ids_utils.js":
approvers:
- 10gen/server-storage-engine-integration


@ -2,7 +2,7 @@
// documents, otherwise.
import {ReplSetTest} from "jstests/libs/replsettest.js";
function getShowRecordIdsCursor(node, dbName, replicatedCollName) {
export function getShowRecordIdsCursor(node, dbName, replicatedCollName) {
return node
.getDB(dbName)
[replicatedCollName].aggregate([{"$project": {"recordId": {"$meta": "recordId"}, "document": "$$ROOT"}}]);


@ -0,0 +1,86 @@
/*
* Helpers for basic testing of replicated idents.
*/
function getOplog(node) {
return node.getDB("local").oplog.rs;
}
export function getSortedCatalogEntries(node, sortField = "ident") {
const adminDB = node.getDB("admin");
const isSystemProfile = {"name": "system.profile"};
const isLocal = {"db": "local"};
const match = {$nor: [isSystemProfile, isLocal]};
return adminDB.aggregate([{$listCatalog: {}}, {$match: match}, {$sort: {[sortField]: 1}}]).toArray();
}
/**
* Given catalog entries for two nodes, each sorted by the same field, validates that the
* corresponding entries (including their 'ident' values) match.
*/
export function assertMatchingCatalogIdents(node0CatalogIdents, node1CatalogIdents) {
jsTest.log(
`Asserting catalog entries for node0 ${tojson(node0CatalogIdents)} with node1 ${tojson(node1CatalogIdents)}`,
);
assert.eq(
node0CatalogIdents.length,
node1CatalogIdents.length,
`Expected nodes to have same number of entries. Entries for node0 ${tojson(
node0CatalogIdents,
)}, entries for node1 ${tojson(node1CatalogIdents)}`,
);
const numCatalogEntries = node0CatalogIdents.length;
const entriesThatDontMatch = [];
for (let i = 0; i < numCatalogEntries; i++) {
const entryNode0 = node0CatalogIdents[i];
const entryNode1 = node1CatalogIdents[i];
if (bsonWoCompare(entryNode0, entryNode1) !== 0) {
// For visibility, collect all mismatched entries before failing.
entriesThatDontMatch.push([entryNode0, entryNode1]);
jsTest.log(
`Expected both nodes to have same entries. Node0 has ${tojson(
entryNode0,
)}, Node1 has ${tojson(entryNode1)}`,
);
}
}
assert.eq(
0,
entriesThatDontMatch.length,
`Catalog entries were expected to match, but don't. Entries that don't match ${tojson(
entriesThatDontMatch,
)}`,
);
}
// Validates that all 'create' collection oplog entries contain collection idents.
export function assertCreateOplogEntriesContainIdents(node) {
const createOps = getOplog(node)
.find({"op": "c", "o.create": {$exists: true}})
.toArray();
jsTest.log("Create oplog entries on node " + node.port + " " + tojson(createOps));
assert.lt(0, createOps.length);
for (let op of createOps) {
assert(
op.hasOwnProperty("o2"),
`Expected to have 'o2' field present in ${tojson(
op,
)}. Dumping all create oplog entries ${tojson(createOps)}`,
);
const o2 = op["o2"];
assert(
o2.hasOwnProperty("ident"),
`Expected to find 'ident' property in 'o2' field of ${tojson(
op,
)}. Dumping all create oplog entries ${tojson(createOps)}`,
);
assert(
o2.hasOwnProperty("idIndexIdent"),
`Expected to find 'idIndexIdent' property in 'o2' field of ${tojson(
op,
)}. Dumping all create oplog entries ${tojson(createOps)}`,
);
}
}


@ -1768,6 +1768,28 @@ export class ReplSetTest {
});
}
/**
* Runs replSetInitiate on the first node of the replica set.
*
* TODO (SERVER-109841): replSetInitiate is currently a no-op command for disagg. Determine the
* next steps for this function if additional functionality is to be incorporated.
*/
initiateForDisagg(cfg, initCmd) {
const startTime = new Date(); // Measure the execution time of this function.
// Blocks until there is a primary. We use a faster retry interval here since we
// expect the primary to be ready very soon.
this.getPrimary(this.kDefaultTimeoutMS, 25 /* retryIntervalMS */);
jsTest.log(
"ReplSetTest initiateForDisagg took " +
(new Date() - startTime) +
"ms for " +
this.nodes.length +
" nodes.",
);
}
/**
* Steps up 'node' as primary and by default it waits for the stepped up node to become a
* writable primary and waits for all nodes to reach the same optime before sending the
@ -3589,7 +3611,7 @@ function _constructStartNewInstances(rst, opts) {
rst._unbridgedPorts = Array.from({length: numNodes}, rst._allocatePortForNode);
rst._unbridgedNodes = [];
} else {
rst.ports = Array.from({length: numNodes}, rst._allocatePortForNode);
rst.ports = opts.ports || Array.from({length: numNodes}, rst._allocatePortForNode);
}
for (let i = 0; i < numNodes; i++) {


@ -530,6 +530,7 @@ encrypted_storage_engine:
slack: server-security
jira: Server Security
files:
- src/mongo/db/modules/atlas/src/disagg_storage/encryption
- src/mongo/db/modules/enterprise/src/encryptdb
security:
@ -854,8 +855,7 @@ disagg_storage:
slack: disaggregated-storage-mongod
jira: RSSD
files:
- src/mongo/db/modules/atlas
- src/mongo/db/disagg_storage
- src/mongo/db/modules/atlas/src/disagg_storage
storage_engine_api:
meta:
@ -992,6 +992,13 @@ installer:
files:
- src/mongo/installer/
replicated_storage_service:
meta:
slack: server-replication
jira: Server Replication
files:
- src/mongo/db/rss
replication:
meta:
slack: server-replication


@ -914,6 +914,49 @@
},
"scope": "required"
},
{
"supplier": {
"name": "Organization: github"
},
"name": "googletest",
"version": "1.17.0",
"licenses": [
{
"license": {
"id": "BSD-3-Clause"
}
}
],
"purl": "pkg:github/googletest/googletest@v1.17.0",
"properties": [
{
"name": "internal:team_responsible",
"value": "Disaggregated Storage"
},
{
"name": "emits_persisted_data",
"value": "false"
},
{
"name": "info_link",
"value": "https://github.com/google/googletest"
},
{
"name": "import_script_path",
"value": "src/third_party/googletest_restricted_for_disagg_only/scripts/import.sh"
}
],
"type": "library",
"bom-ref": "e57f94bd-b0b1-4e47-912e-c690a01e4f95",
"evidence": {
"occurrences": [
{
"location": "src/third_party/googletest_restricted_for_disagg_only"
}
]
},
"scope": "required"
},
{
"type": "library",
"bom-ref": "pkg:github/gperftools/gperftools@gperftools-2.9.1",


@ -21,6 +21,11 @@ generate_config_header(
"MONGO_CONFIG_OTEL": "1",
},
"//conditions:default": {},
}) | select({
"//bazel/config:build_atlas_enabled": {
"MONGO_CONFIG_DISAGG_STORAGE": "1",
},
"//conditions:default": {},
}) | select({
"//bazel/config:mutex_observation_enabled": {
"MONGO_CONFIG_MUTEX_OBSERVATION": "1",


@ -125,5 +125,8 @@
// Defined if the build includes OpenTelemetry
@mongo_config_otel@
// Defined if the build includes disaggregated storage
@mongo_config_disagg_storage@
// Defined if the build includes mutex observation
@mongo_config_mutex_observation@
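Downstream code can then gate disagg-only paths at compile time. A minimal standalone sketch, assuming the usual generated-header mechanism: the #define below stands in for including the generated mongo/config.h (whose value is driven by the build_atlas_enabled select above), and the gated output is purely illustrative.

#include <iostream>

// Stand-in for including the generated mongo/config.h; in a real build the
// macro is emitted only when --build_atlas=True enables build_atlas_enabled.
#define MONGO_CONFIG_DISAGG_STORAGE 1

int main() {
#ifdef MONGO_CONFIG_DISAGG_STORAGE
    std::cout << "built with disaggregated storage support\n";
#else
    std::cout << "disaggregated storage not compiled in\n";
#endif
    return 0;
}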


@ -3320,6 +3320,8 @@ mongo_cc_library(
"//src/mongo/db/repl:serveronly_repl",
"//src/mongo/db/repl:storage_interface_impl",
"//src/mongo/db/repl:topology_coordinator",
"//src/mongo/db/rss:persistence_provider_impl",
"//src/mongo/db/rss:service_lifecycle_impl",
"rw_concern_d",
"//src/mongo/db/session:kill_sessions_local",
"//src/mongo/db/session:service_liaison_mongod",
@ -3433,6 +3435,11 @@ mongo_cc_library(
"//src/mongo/db/modules/enterprise/src/kmip:kmip_configuration",
],
"//conditions:default": [],
}) + select({
"//bazel/config:build_atlas_enabled": [
"//src/mongo/db/modules/atlas/src/disagg_storage/encryption:sls_log_encryption_manager",
],
"//conditions:default": [],
}),
)
@ -3530,6 +3537,7 @@ mongo_cc_library(
"//src/mongo/db/repl:storage_interface_impl",
"//src/mongo/db/repl:topology_coordinator",
"//src/mongo/db/repl:wait_for_majority_service",
"//src/mongo/db/rss:replicated_storage_service",
"//src/mongo/db/s:query_analysis_writer",
"//src/mongo/db/s:sessions_collection_config_server",
"//src/mongo/db/s:sharding_commands_d",
@ -3563,11 +3571,6 @@ mongo_cc_library(
"//src/mongo/util/tracing_profiler",
],
"//conditions:default": [],
}) + select({
"//bazel/config:build_atlas_required_settings": [
"//src/mongo/db/modules/atlas:atlas_only",
],
"//conditions:default": [],
}),
)
@ -3697,6 +3700,7 @@ mongo_cc_library(
"//src/mongo/db/local_catalog:catalog_impl",
"//src/mongo/db/op_observer",
"//src/mongo/db/repl:replmocks",
"//src/mongo/db/rss:persistence_provider_impl",
"//src/mongo/db/s:sharding_runtime_d",
"//src/mongo/db/storage:storage_control",
"//src/mongo/db/storage:storage_options",
@ -4138,6 +4142,7 @@ mongo_cc_library(
"//src/mongo/db/local_catalog:database_holder",
"//src/mongo/db/op_observer",
"//src/mongo/db/repl:replmocks",
"//src/mongo/db/rss:persistence_provider_impl",
"//src/mongo/db/s:sharding_runtime_d",
"//src/mongo/db/stats:top",
"//src/mongo/db/storage:storage_control",
@ -4369,6 +4374,7 @@ mongo_cc_benchmark(
"//src/mongo/db/repl:repl_coordinator_impl",
"//src/mongo/db/repl:serveronly_repl",
"//src/mongo/db/repl:storage_interface_impl",
"//src/mongo/db/rss:persistence_provider_impl",
"//src/mongo/db/s:sharding_runtime_d",
"//src/mongo/db/storage:storage_control",
"//src/mongo/db/storage/wiredtiger:storage_wiredtiger",


@ -127,24 +127,62 @@ Timestamp getMedianAppliedTimestamp(const std::vector<repl::MemberData>& sortedM
const int sustainerIdx = sortedMemberData.size() / 2;
return sortedMemberData[sustainerIdx].getLastAppliedOpTime().getTimestamp();
}
} // namespace
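For intuition, the sustainer is the median of the members' applied optimes once they are sorted, which is exactly the size()/2 index used above. A standalone sketch with simplified integer optimes (medianApplied is a hypothetical helper, not code from this commit):

#include <algorithm>
#include <cassert>
#include <vector>

// Simplified stand-in for getMedianAppliedTimestamp: after sorting by applied
// optime, index size()/2 picks the median applier, i.e. the node whose
// progress sustains the majority commit point.
int medianApplied(std::vector<int> appliedOpTimes) {
    std::sort(appliedOpTimes.begin(), appliedOpTimes.end());
    return appliedOpTimes[appliedOpTimes.size() / 2];
}

int main() {
    assert(medianApplied({5, 1, 3}) == 3);       // 3-node set: the middle node
    assert(medianApplied({1, 7, 3, 5, 9}) == 5); // 5-node set
    return 0;
}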
namespace flow_control_details {
ReplicationTimestampProvider::ReplicationTimestampProvider(repl::ReplicationCoordinator* replCoord)
: _replCoord(replCoord) {}
Timestamp ReplicationTimestampProvider::getCurrSustainerTimestamp() const {
return getMedianAppliedTimestamp(_currMemberData);
}
Timestamp ReplicationTimestampProvider::getPrevSustainerTimestamp() const {
return getMedianAppliedTimestamp(_prevMemberData);
}
repl::TimestampAndWallTime ReplicationTimestampProvider::getTargetTimestampAndWallTime() const {
auto time = _replCoord->getLastCommittedOpTimeAndWallTime();
return {.timestamp = time.opTime.getTimestamp(), .wallTime = time.wallTime};
}
repl::TimestampAndWallTime ReplicationTimestampProvider::getLastWriteTimestampAndWallTime() const {
auto time = _replCoord->getMyLastAppliedOpTimeAndWallTime();
return {.timestamp = time.opTime.getTimestamp(), .wallTime = time.wallTime};
}
void ReplicationTimestampProvider::update() {
_prevMemberData = _currMemberData;
_currMemberData = _replCoord->getMemberData();
// Sort MemberData with the 0th index being the node with the lowest applied optime.
std::sort(_currMemberData.begin(),
_currMemberData.end(),
[](const repl::MemberData& left, const repl::MemberData& right) -> bool {
return left.getLastAppliedOpTime() < right.getLastAppliedOpTime();
});
}
bool ReplicationTimestampProvider::flowControlUsable() const {
return _replCoord->canAcceptNonLocalWrites();
}
/**
* Sanity checks whether the successive queries of topology data are comparable for doing a flow
* control calculation. In particular, the number of members must be the same and the median
* applier's timestamp must not go backwards.
*/
bool sustainerAdvanced(const std::vector<repl::MemberData>& prevMemberData,
const std::vector<repl::MemberData>& currMemberData) {
if (currMemberData.size() == 0 || currMemberData.size() != prevMemberData.size()) {
bool ReplicationTimestampProvider::sustainerAdvanced() const {
if (_currMemberData.size() == 0 || _currMemberData.size() != _prevMemberData.size()) {
LOGV2_WARNING(22223,
"Flow control detected a change in topology",
"prevSize"_attr = prevMemberData.size(),
"currSize"_attr = currMemberData.size());
"prevSize"_attr = _prevMemberData.size(),
"currSize"_attr = _currMemberData.size());
return false;
}
auto currSustainerAppliedTs = getMedianAppliedTimestamp(currMemberData);
auto prevSustainerAppliedTs = getMedianAppliedTimestamp(prevMemberData);
auto currSustainerAppliedTs = getMedianAppliedTimestamp(_currMemberData);
auto prevSustainerAppliedTs = getMedianAppliedTimestamp(_prevMemberData);
if (currSustainerAppliedTs < prevSustainerAppliedTs) {
LOGV2_WARNING(22224,
@ -156,13 +194,42 @@ bool sustainerAdvanced(const std::vector<repl::MemberData>& prevMemberData,
return true;
}
} // namespace
void ReplicationTimestampProvider::setCurrMemberData_forTest(
const std::vector<repl::MemberData>& memberData) {
_currMemberData = memberData;
std::sort(_currMemberData.begin(),
_currMemberData.end(),
[](const repl::MemberData& left, const repl::MemberData& right) -> bool {
return left.getLastAppliedOpTime() < right.getLastAppliedOpTime();
});
}
void ReplicationTimestampProvider::setPrevMemberData_forTest(
const std::vector<repl::MemberData>& memberData) {
_prevMemberData = memberData;
std::sort(_prevMemberData.begin(),
_prevMemberData.end(),
[](const repl::MemberData& left, const repl::MemberData& right) -> bool {
return left.getLastAppliedOpTime() < right.getLastAppliedOpTime();
});
}
} // namespace flow_control_details
FlowControl::FlowControl(repl::ReplicationCoordinator* replCoord)
: _replCoord(replCoord), _lastTimeSustainerAdvanced(Date_t::now()) {}
: _timestampProvider(
std::make_unique<flow_control_details::ReplicationTimestampProvider>(replCoord)),
_lastTimeSustainerAdvanced(Date_t::now()) {}
FlowControl::FlowControl(ServiceContext* service, repl::ReplicationCoordinator* replCoord)
: _replCoord(replCoord), _lastTimeSustainerAdvanced(Date_t::now()) {
: FlowControl(service,
std::make_unique<flow_control_details::ReplicationTimestampProvider>(replCoord)) {
}
FlowControl::FlowControl(ServiceContext* service,
std::unique_ptr<TimestampProvider> timestampProvider)
: _timestampProvider(std::move(timestampProvider)), _lastTimeSustainerAdvanced(Date_t::now()) {
// Initialize _lastTargetTicketsPermitted to maximum tickets to make sure flow control doesn't
// cause a slow start on start up.
FlowControlTicketholder::set(service, std::make_unique<FlowControlTicketholder>(kMaxTickets));
@ -254,44 +321,26 @@ void FlowControl::disableUntil(Date_t deadline) {
_disableUntil.store(deadline);
}
/**
* Advance the `_*MemberData` fields and sort the new data by the element's last applied optime.
*/
void FlowControl::_updateTopologyData() {
_prevMemberData = _currMemberData;
_currMemberData = _replCoord->getMemberData();
// Sort MemberData with the 0th index being the node with the lowest applied optime.
std::sort(_currMemberData.begin(),
_currMemberData.end(),
[](const repl::MemberData& left, const repl::MemberData& right) -> bool {
return left.getLastAppliedOpTime() < right.getLastAppliedOpTime();
});
}
int FlowControl::_calculateNewTicketsForLag(const std::vector<repl::MemberData>& prevMemberData,
const std::vector<repl::MemberData>& currMemberData,
int FlowControl::_calculateNewTicketsForLag(const Timestamp& prevSustainerTimestamp,
const Timestamp& currSustainerTimestamp,
std::int64_t locksUsedLastPeriod,
double locksPerOp,
std::uint64_t lagMillis,
std::uint64_t thresholdLagMillis) {
invariant(prevSustainerTimestamp <= currSustainerTimestamp,
fmt::format("PrevSustainer: {} CurrSustainer: {}",
prevSustainerTimestamp.toString(),
currSustainerTimestamp.toString()));
invariant(lagMillis >= thresholdLagMillis);
const auto currSustainerAppliedTs = getMedianAppliedTimestamp(currMemberData);
const auto prevSustainerAppliedTs = getMedianAppliedTimestamp(prevMemberData);
invariant(prevSustainerAppliedTs <= currSustainerAppliedTs,
fmt::format("PrevSustainer: {} CurrSustainer: {}",
prevSustainerAppliedTs.toString(),
currSustainerAppliedTs.toString()));
const std::int64_t sustainerAppliedCount =
_approximateOpsBetween(prevSustainerAppliedTs, currSustainerAppliedTs);
_approximateOpsBetween(prevSustainerTimestamp, currSustainerTimestamp);
LOGV2_DEBUG(22218,
DEBUG_LOG_LEVEL,
" PrevApplied: {prevSustainerAppliedTs} CurrApplied: {currSustainerAppliedTs} "
" PrevApplied: {prevSustainerTimestamp} CurrApplied: {currSustainerTimestamp} "
"NumSustainerApplied: {sustainerAppliedCount}",
"prevSustainerAppliedTs"_attr = prevSustainerAppliedTs,
"currSustainerAppliedTs"_attr = currSustainerAppliedTs,
"prevSustainerTimestamp"_attr = prevSustainerTimestamp,
"currSustainerTimestamp"_attr = currSustainerTimestamp,
"sustainerAppliedCount"_attr = sustainerAppliedCount);
if (sustainerAppliedCount > 0) {
_lastTimeSustainerAdvanced = Date_t::now();
@ -359,35 +408,35 @@ int FlowControl::getNumTickets(Date_t now) {
}
// Flow Control is only enabled on nodes that can accept writes.
const bool canAcceptWrites = _replCoord->canAcceptNonLocalWrites();
const bool flowControlUsable = _timestampProvider->flowControlUsable();
if (auto sfp = flowControlTicketOverride.scoped(); MONGO_unlikely(sfp.isActive())) {
int numTickets = sfp.getData().getIntField("numTickets");
if (numTickets > 0 && canAcceptWrites) {
if (numTickets > 0 && flowControlUsable) {
return numTickets;
}
}
// It's important to update the topology on each iteration.
_updateTopologyData();
const repl::OpTimeAndWallTime myLastApplied = _replCoord->getMyLastAppliedOpTimeAndWallTime();
const repl::OpTimeAndWallTime lastCommitted = _replCoord->getLastCommittedOpTimeAndWallTime();
_timestampProvider->update();
const auto lastWriteTime = _timestampProvider->getLastWriteTimestampAndWallTime();
const auto lastTargetTime = _timestampProvider->getTargetTimestampAndWallTime();
const double locksPerOp = _getLocksPerOp();
const std::int64_t locksUsedLastPeriod = _getLocksUsedLastPeriod();
if (gFlowControlEnabled.load() == false || canAcceptWrites == false || locksPerOp < 0.0) {
_trimSamples(std::min(lastCommitted.opTime.getTimestamp(),
getMedianAppliedTimestamp(_prevMemberData)));
if (gFlowControlEnabled.load() == false || flowControlUsable == false || locksPerOp < 0.0) {
_trimSamples(
std::min(lastTargetTime.timestamp, _timestampProvider->getPrevSustainerTimestamp()));
return kMaxTickets;
}
int ret = 0;
const auto thresholdLagMillis = getThresholdLagMillis();
// Successive lastCommitted and lastApplied wall clock time recordings are not guaranteed to be
// Successive lastTargetTime and lastWriteTime wall clock time recordings are not guaranteed to be
// monotonically increasing. Recordings that satisfy the following check result in a negative
// value for lag, so ignore them.
const bool ignoreWallTimes = lastCommitted.wallTime > myLastApplied.wallTime;
const bool ignoreWallTimes = lastTargetTime.wallTime > lastWriteTime.wallTime;
// _approximateOpsBetween will return -1 if the input timestamps are in the same "bucket".
// This is an indication that there are very few ops between the two timestamps.
@ -395,9 +444,8 @@ int FlowControl::getNumTickets(Date_t now) {
// Don't let the no-op writer on idle systems fool the sophisticated "is the replica set
// lagged" classifier.
const bool isHealthy = !ignoreWallTimes &&
(getLagMillis(myLastApplied.wallTime, lastCommitted.wallTime) < thresholdLagMillis ||
_approximateOpsBetween(lastCommitted.opTime.getTimestamp(),
myLastApplied.opTime.getTimestamp()) == -1);
(getLagMillis(lastWriteTime.wallTime, lastTargetTime.wallTime) < thresholdLagMillis ||
_approximateOpsBetween(lastTargetTime.timestamp, lastWriteTime.timestamp) == -1);
if (isHealthy) {
// The add/multiply technique is used to ensure ticket allocation can ramp up quickly,
@ -412,16 +460,16 @@ int FlowControl::getNumTickets(Date_t now) {
auto waitTime = curTimeMicros64() - _startWaitTime;
_isLaggedTimeMicros.fetchAndAddRelaxed(waitTime);
}
} else if (!ignoreWallTimes && sustainerAdvanced(_prevMemberData, _currMemberData)) {
} else if (!ignoreWallTimes && _timestampProvider->sustainerAdvanced()) {
// Expected case where flow control has meaningful data from the last period to make a new
// calculation.
ret =
_calculateNewTicketsForLag(_prevMemberData,
_currMemberData,
locksUsedLastPeriod,
locksPerOp,
getLagMillis(myLastApplied.wallTime, lastCommitted.wallTime),
thresholdLagMillis);
ret = _calculateNewTicketsForLag(
_timestampProvider->getPrevSustainerTimestamp(),
_timestampProvider->getCurrSustainerTimestamp(),
locksUsedLastPeriod,
locksPerOp,
getLagMillis(lastWriteTime.wallTime, lastTargetTime.wallTime),
thresholdLagMillis);
if (!_isLagged.load()) {
_isLagged.store(true);
_isLaggedCount.fetchAndAddRelaxed(1);
@ -443,9 +491,10 @@ int FlowControl::getNumTickets(Date_t now) {
DEBUG_LOG_LEVEL,
"FlowControl debug.",
"isLagged"_attr = (_isLagged.load() ? "true" : "false"),
"currlagMillis"_attr = getLagMillis(myLastApplied.wallTime, lastCommitted.wallTime),
"opsLagged"_attr = _approximateOpsBetween(lastCommitted.opTime.getTimestamp(),
myLastApplied.opTime.getTimestamp()),
"currlagMillis"_attr =
getLagMillis(lastWriteTime.wallTime, lastTargetTime.wallTime),
"opsLagged"_attr =
_approximateOpsBetween(lastTargetTime.timestamp, lastWriteTime.timestamp),
"granting"_attr = ret,
"lastGranted"_attr = _lastTargetTicketsPermitted.load(),
"lastSustainerApplied"_attr = _lastSustainerAppliedCount.load(),
@ -457,7 +506,7 @@ int FlowControl::getNumTickets(Date_t now) {
_lastTargetTicketsPermitted.store(ret);
_trimSamples(
std::min(lastCommitted.opTime.getTimestamp(), getMedianAppliedTimestamp(_prevMemberData)));
std::min(lastTargetTime.timestamp, _timestampProvider->getPrevSustainerTimestamp()));
return ret;
}


@ -34,6 +34,7 @@
#include "mongo/bson/timestamp.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/repl/member_data.h"
#include "mongo/db/repl/optime.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/repl/replication_coordinator_fwd.h"
#include "mongo/db/service_context.h"
@ -62,12 +63,66 @@ namespace mongo {
*/
class FlowControl {
public:
class TimestampProvider {
public:
virtual ~TimestampProvider() = default;
/**
* The sustainer timestamp is the timestamp which, if moved forward, will cause an
* advance in the target timestamp. For replication, it is the median applied timestamp
* on all the relevant nodes. We need to know this timestamp both for the current iteration
* and the previous iteration.
*/
virtual Timestamp getCurrSustainerTimestamp() const = 0;
virtual Timestamp getPrevSustainerTimestamp() const = 0;
/**
* The target time is the time we are trying to throttle to. For replication, it is the
* last committed time (majority snapshot time).
*/
virtual repl::TimestampAndWallTime getTargetTimestampAndWallTime() const = 0;
/**
* The last write time is what we are trying to control. For replication, it is
* the last applied time.
*/
virtual repl::TimestampAndWallTime getLastWriteTimestampAndWallTime() const = 0;
/**
* Is flow control possible with this timestamp provider? For replication,
* true if this is a primary and majority read concern is enabled.
*/
virtual bool flowControlUsable() const = 0;
/**
* Are the previous and current updates compatible? For replication,
* makes sure the number of nodes is the same and the median node timestamp (the sustainer)
* has not gone backwards.
*/
virtual bool sustainerAdvanced() const = 0;
/**
* Refresh the provider's state for a new iteration. For replication, advances the
* `_*MemberData` fields and sorts the new data by each element's last applied optime.
*/
virtual void update() = 0;
};
static constexpr int kMaxTickets = 1000 * 1000 * 1000;
/**
* Construct a flow control object based on a custom timestamp provider.
* Takes ownership of the timestamp provider.
*/
FlowControl(ServiceContext* service, std::unique_ptr<TimestampProvider> timestampProvider);
/**
* Construct a replication-based flow control object.
*/
FlowControl(ServiceContext* service, repl::ReplicationCoordinator* replCoord);
/**
* Construct a flow control object without adding a periodic job runner for testing.
* Construct a replication-based flow control object without adding a periodic job runner for
* testing.
*/
FlowControl(repl::ReplicationCoordinator* replCoord);
@ -122,13 +177,13 @@ public:
std::int64_t _approximateOpsBetween(Timestamp prevTs, Timestamp currTs);
void _updateTopologyData();
int _calculateNewTicketsForLag(const std::vector<repl::MemberData>& prevMemberData,
const std::vector<repl::MemberData>& currMemberData,
int _calculateNewTicketsForLag(const Timestamp& prevSustainerTimestamp,
const Timestamp& currSustainerTimestamp,
std::int64_t locksUsedLastPeriod,
double locksPerOp,
std::uint64_t lagMillis,
std::uint64_t thresholdLagMillis);
void _trimSamples(Timestamp trimSamplesTo);
// Sample of (timestamp, ops, lock acquisitions) where ops and lock acquisitions are
@ -139,7 +194,7 @@ public:
}
private:
repl::ReplicationCoordinator* _replCoord;
std::unique_ptr<TimestampProvider> _timestampProvider;
// These values are updated with each flow control computation and are also surfaced in server
// status.
@ -161,9 +216,6 @@ private:
std::int64_t _lastPollLockAcquisitions = 0;
std::vector<repl::MemberData> _currMemberData;
std::vector<repl::MemberData> _prevMemberData;
Date_t _lastTimeSustainerAdvanced;
// This value is used for calculating server status metrics.
@ -172,4 +224,27 @@ private:
PeriodicJobAnchor _jobAnchor;
};
namespace flow_control_details {
class ReplicationTimestampProvider final : public FlowControl::TimestampProvider {
public:
explicit ReplicationTimestampProvider(repl::ReplicationCoordinator* replCoord);
Timestamp getCurrSustainerTimestamp() const final;
Timestamp getPrevSustainerTimestamp() const final;
repl::TimestampAndWallTime getTargetTimestampAndWallTime() const final;
repl::TimestampAndWallTime getLastWriteTimestampAndWallTime() const final;
bool flowControlUsable() const final;
bool sustainerAdvanced() const final;
void update() final;
void setCurrMemberData_forTest(const std::vector<repl::MemberData>& memberData);
void setPrevMemberData_forTest(const std::vector<repl::MemberData>& memberData);
private:
repl::ReplicationCoordinator* _replCoord;
std::vector<repl::MemberData> _currMemberData;
std::vector<repl::MemberData> _prevMemberData;
};
} // namespace flow_control_details
} // namespace mongo
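The TimestampProvider interface above is designed to admit implementations other than replication. Purely as illustration (this class is not part of the commit), a provider whose sustainer is the median of externally supplied applied timestamps, mirroring what ReplicationTimestampProvider derives from member data, might look like the sketch below; it assumes flow_control.h is included.

#include <algorithm>
#include <utility>
#include <vector>

namespace mongo {

class MedianTimestampProvider final : public FlowControl::TimestampProvider {
public:
    // Feed in a fresh set of per-node applied timestamps (hypothetical source).
    void setAppliedTimestamps(std::vector<Timestamp> applied) {
        _pending = std::move(applied);
    }
    void setLastWriteTimestamp(Timestamp ts) {
        _lastWrite = ts;
    }

    Timestamp getCurrSustainerTimestamp() const final {
        return _median(_curr);
    }
    Timestamp getPrevSustainerTimestamp() const final {
        return _median(_prev);
    }
    repl::TimestampAndWallTime getTargetTimestampAndWallTime() const final {
        return {_median(_curr), Date_t::now()};
    }
    repl::TimestampAndWallTime getLastWriteTimestampAndWallTime() const final {
        return {_lastWrite, Date_t::now()};
    }
    bool flowControlUsable() const final {
        return !_curr.empty();
    }
    // Same shape of check the replication provider performs: node count
    // unchanged and the median has not gone backwards.
    bool sustainerAdvanced() const final {
        return _prev.size() == _curr.size() &&
            (_prev.empty() || _median(_prev) <= _median(_curr));
    }
    void update() final {
        _prev = std::exchange(_curr, _pending);
    }

private:
    static Timestamp _median(std::vector<Timestamp> v) {
        if (v.empty())
            return Timestamp();
        std::sort(v.begin(), v.end());
        return v[v.size() / 2];
    }

    std::vector<Timestamp> _pending, _curr, _prev;
    Timestamp _lastWrite;
};

}  // namespace mongo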

View File

@ -241,6 +241,15 @@ TEST_F(FlowControlTest, CalculatingTickets) {
currMemberData.emplace_back(constructMemberData(Timestamp(2000)));
currMemberData.emplace_back(constructMemberData(Timestamp(3000)));
flow_control_details::ReplicationTimestampProvider timestampProvider(replCoordMock);
timestampProvider.setPrevMemberData_forTest(prevMemberData);
timestampProvider.setCurrMemberData_forTest(currMemberData);
auto prevSustainerTimestamp = timestampProvider.getPrevSustainerTimestamp();
auto currSustainerTimestamp = timestampProvider.getCurrSustainerTimestamp();
ASSERT_EQ(Timestamp(1000), prevSustainerTimestamp);
ASSERT_EQ(Timestamp(2000), currSustainerTimestamp);
// Construct samples where Timestamp X maps to operation number X.
for (int ts = 1; ts <= 3000; ++ts) {
flowControl->sample(Timestamp(ts), 1);
@ -251,8 +260,8 @@ TEST_F(FlowControlTest, CalculatingTickets) {
const std::uint64_t thresholdLag = 1;
const std::uint64_t currLag = thresholdLag;
ASSERT_EQ(1900,
flowControl->_calculateNewTicketsForLag(prevMemberData,
currMemberData,
flowControl->_calculateNewTicketsForLag(prevSustainerTimestamp,
currSustainerTimestamp,
locksUsedLastPeriod,
locksPerOp,
currLag,

View File

@ -39,6 +39,7 @@
#include "mongo/db/op_observer/op_observer_util.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/repl/oplog_entry.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/decorable.h"
#include "mongo/util/namespace_string_util.h"
@ -123,7 +124,9 @@ void AuthOpObserver::onCreateCollection(
BSONObj o2;
if (createCollCatalogIdentifier.has_value() &&
shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(opCtx))) {
shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider(),
VersionContext::getDecoration(opCtx))) {
o2 = repl::MutableOplogEntry::makeCreateCollObject2(
createCollCatalogIdentifier->catalogId,
createCollCatalogIdentifier->ident,

View File

@ -1471,6 +1471,7 @@ mongo_cc_library(
"//src/mongo/db/repl:repl_server_parameters",
"//src/mongo/db/repl:replica_set_messages",
"//src/mongo/db/repl/dbcheck",
"//src/mongo/db/rss:replicated_storage_service",
"//src/mongo/db/s:sharding_catalog_manager",
"//src/mongo/db/s:sharding_commands_d",
"//src/mongo/db/s:transaction_coordinator",

View File

@ -35,6 +35,7 @@
#include "mongo/db/index_builds/index_builds_coordinator.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/s/transaction_coordinator_service.h"
#include "mongo/logv2/log.h"
#include "mongo/platform/compiler.h"
@ -71,7 +72,10 @@ Status stepDownForShutdown(OperationContext* opCtx,
// Specify a high freeze time, so that if there is a stall during shut down, the node
// does not run for election.
replCoord->stepDown(opCtx, false /* force */, waitTime, Days(1));
auto& rss = rss::ReplicatedStorageService::get(opCtx);
if (rss.getPersistenceProvider().shouldStepDownForShutdown()) {
replCoord->stepDown(opCtx, false /* force */, waitTime, Days(1));
}
if (MONGO_unlikely(hangInShutdownAfterStepdown.shouldFail())) {
LOGV2(4695100, "hangInShutdownAfterStepdown failpoint enabled");

View File

@ -1,31 +0,0 @@
load("//bazel:mongo_src_rules.bzl", "idl_generator", "mongo_cc_benchmark", "mongo_cc_library", "mongo_cc_unit_test")
package(default_visibility = ["//visibility:public"])
exports_files(
glob([
"*.h",
"*.cpp",
]),
)
idl_generator(
name = "server_parameters_gen",
src = "server_parameters.idl",
deps = [
"//src/mongo/db:basic_types_gen",
],
)
mongo_cc_library(
name = "server_parameters",
srcs = [
"server_parameters_gen",
],
hdrs = [
],
deps = [
"//src/mongo/db:server_base",
"//src/mongo/idl:idl_parser",
],
)

View File

@ -1,5 +0,0 @@
version: 1.0.0
filters:
- "*":
approvers:
- 10gen/server-disagg-storage

View File

@ -1,45 +0,0 @@
# Copyright (C) 2025-present MongoDB, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Server Side Public License, version 1,
# as published by MongoDB, Inc.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Server Side Public License for more details.
#
# You should have received a copy of the Server Side Public License
# along with this program. If not, see
# <http://www.mongodb.com/licensing/server-side-public-license>.
#
# As a special exception, the copyright holders give permission to link the
# code of portions of this program with the OpenSSL library under certain
# conditions as described in each individual source file and distribute
# linked combinations including the program with the OpenSSL library. You
# must comply with the Server Side Public License in all respects for
# all of the code used other than as permitted herein. If you modify file(s)
# with this exception, you may extend this exception to your version of the
# file(s), but you are not obligated to do so. If you do not wish to do so,
# delete this exception statement from your version. If you delete this
# exception statement from all source files in the program, then also delete
# it in the license file.
#
# server setParameters for disaggregated storage
global:
cpp_namespace: "mongo::disagg"
imports:
- "mongo/db/basic_types.idl"
server_parameters:
disaggregatedStorageEnabled:
description: >-
Set this to run the server as a compute node in a disaggregated storage cluster.
set_at: startup
cpp_vartype: bool
cpp_varname: gDisaggregatedStorageEnabled
default: false
redact: false

View File

@ -158,7 +158,7 @@ private:
X(abortAllTransactions) \
X(joinLogicalSessionCache) \
X(shutDownCursorManager) \
X(shutDownSLSStateMachine) \
X(shutDownStateRequiredForStorageAccess) \
/* For magic restore: */ \
X(magicRestoreToolTotal) \
X(readMagicRestoreConfig) \

View File

@ -33,20 +33,31 @@
#include "mongo/db/ftdc/collector.h"
#include "mongo/db/ftdc/controller.h"
#include "mongo/db/ftdc/ftdc_system_stats.h"
#include "mongo/logv2/log.h"
#include "mongo/util/errno_util.h"
#include "mongo/util/functional.h"
#include "mongo/util/processinfo.h"
#include "mongo/util/procparser.h"
#include <cstdint>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include <ifaddrs.h>
#include <linux/ethtool.h>
#include <linux/if.h>
#include <linux/sockios.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kFTDC
namespace mongo {
namespace {
@ -102,6 +113,188 @@ static const std::map<StringData, std::set<StringData>> kSockstatKeys{
{"TCP"_sd, {"inuse"_sd, "orphan"_sd, "tw"_sd, "alloc"_sd}},
};
/**
* Class to gather NIC stats by emulating `ethtool -S` via the SIOCETHTOOL ioctl.
*/
class EthTool {
public:
static std::unique_ptr<EthTool> create(StringData interface) {
int fd = socket(AF_INET, SOCK_DGRAM, 0);
if (fd == -1) {
auto ec = lastPosixError();
LOGV2_WARNING(
10985539, "Ethtool socket allocation failed", "error"_attr = errorMessage(ec));
return nullptr;
}
auto ethtool = std::unique_ptr<EthTool>(new EthTool(interface, fd));
auto drvinfo = ethtool->get_info();
// Some Linux interfaces cannot be found by ethtool IOCTL.
// Some Linux interfaces have no stats (e.g. the "bridge" driver used by containers).
if (!drvinfo.has_value() || drvinfo->n_stats == 0) {
LOGV2_WARNING(10985540,
"Skipping Ethtool stats collection for interface",
"interface"_attr = interface);
return nullptr;
}
return ethtool;
}
~EthTool() {
free(_gstrings);
close(_fd);
}
// Get a list of all non-loopback interfaces for the machine
static std::vector<std::string> interface_names() {
struct ifaddrs* ifaddr;
if (getifaddrs(&ifaddr) == -1) {
auto ec = lastPosixError();
uasserted(10985538, fmt::format("getifaddrs failed: {}", errorMessage(ec)));
}
ON_BLOCK_EXIT([&] { freeifaddrs(ifaddr); });
std::set<std::string> names;
for (ifaddrs* ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
if (ifa->ifa_addr == NULL) {
continue;
}
if ((ifa->ifa_flags & IFF_LOOPBACK) == IFF_LOOPBACK) {
continue;
}
names.insert(ifa->ifa_name);
}
std::vector<std::string> vec;
std::copy(names.begin(), names.end(), std::back_inserter(vec));
return vec;
}
// Get a list of stats names for a given interface
std::vector<StringData>& get_strings() {
if (!_names.has_value()) {
auto drvinfo = get_info();
_get_strings(drvinfo->n_stats);
}
return _names.get();
}
// Get a list of stats for a given interface
std::vector<uint64_t> get_stats() {
if (!_names.has_value()) {
return std::vector<uint64_t>();
}
return _get_stats(_names->size());
}
// Get some basic information about the interface
boost::optional<ethtool_drvinfo> get_info() {
ethtool_drvinfo drvinfo;
memset(&drvinfo, 0, sizeof(drvinfo));
drvinfo.cmd = ETHTOOL_GDRVINFO;
if (_ioctlNoThrow("drvinfo", &drvinfo)) {
return boost::none;
}
return boost::optional<ethtool_drvinfo>(drvinfo);
}
// Name of the interface this class monitors
StringData name() const {
return _interface;
}
private:
explicit EthTool(StringData interface, int fd) : _fd(fd), _interface(std::string(interface)) {}
void _get_strings(size_t count) {
_gstrings = static_cast<ethtool_gstrings*>(
calloc(1, sizeof(ethtool_gstrings) + count * ETH_GSTRING_LEN));
_gstrings->cmd = ETHTOOL_GSTRINGS;
_gstrings->string_set = ETH_SS_STATS;
_gstrings->len = count;
_names.emplace(std::vector<StringData>());
if (_ioctlNoThrow("get_strings", _gstrings)) {
return;
}
char* ptr = reinterpret_cast<char*>(_gstrings) + sizeof(ethtool_gstrings);
for (size_t i = 0; i < count; i++) {
auto s = StringData(ptr);
_names->push_back(s);
ptr += ETH_GSTRING_LEN;
}
}
std::vector<uint64_t> _get_stats(size_t count) {
std::vector<char> stats_buf(sizeof(ethtool_stats) + count * 8,
0); /* 8 == sizeof(__u64), the stat width specified in ethtool.h */
ethtool_stats* stats = reinterpret_cast<ethtool_stats*>(stats_buf.data());
stats->cmd = ETHTOOL_GSTATS;
stats->n_stats = count;
if (_ioctlNoThrow("get_stats", stats)) {
return std::vector<uint64_t>();
}
// ethtool_stats carries `count` __u64 values in its trailing data array;
// construct the vector from those 64-bit values, not from raw bytes.
std::vector<uint64_t> stats_vec(stats->data, stats->data + count);
return stats_vec;
}
// Returns non-zero on error
int _ioctlNoThrow(StringData name, void* cmd) {
ifreq ifr;
strcpy(ifr.ifr_name, _interface.c_str());
ifr.ifr_data = cmd;
auto ret = ioctl(_fd, SIOCETHTOOL, &ifr);
if (MONGO_unlikely(ret) && !_warningLogged) {
auto ec = lastPosixError();
_warningLogged = true;
LOGV2_WARNING(10985553,
"Failed to get strings for ethtool",
"interface"_attr = _interface,
"name"_attr = name,
"error"_attr = errorMessage(ec));
}
return ret;
}
private:
int _fd;
ethtool_gstrings* _gstrings{nullptr};
boost::optional<std::vector<StringData>> _names;
std::string _interface;
bool _warningLogged{false};
};
/**
* Collect metrics from the Linux /proc file system.
*/
@ -111,6 +304,16 @@ public:
for (const auto& disk : _disks) {
_disksStringData.emplace_back(disk);
}
auto interfaces = EthTool::interface_names();
_ethtools.reserve(interfaces.size());
for (const auto& ifn : interfaces) {
auto nic = EthTool::create(ifn);
if (nic) {
_ethtools.push_back(std::move(nic));
}
}
}
void collect(OperationContext* opCtx, BSONObjBuilder& builder) override {
@ -219,6 +422,29 @@ public:
&subObjBuilder);
subObjBuilder.doneFast();
}
{
BSONObjBuilder subObjBuilder(builder.subobjStart("ethtool"_sd));
for (auto& tool : _ethtools) {
BSONObjBuilder subNICBuilder(subObjBuilder.subobjStart(tool->name()));
auto names = tool->get_strings();
if (names.empty()) {
continue;
}
auto stats = tool->get_stats();
if (stats.empty()) {
continue;
}
invariant(stats.size() >= names.size());
for (size_t i = 0; i < names.size(); i++) {
subNICBuilder.append(names[i], static_cast<long long>(stats[i]));
}
}
}
}
private:
@ -227,6 +453,8 @@ private:
// List of physical disks to collect stats from as StringData to pass to parseProcDiskStatsFile.
std::vector<StringData> _disksStringData;
std::vector<std::unique_ptr<EthTool>> _ethtools;
};
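For readers unfamiliar with the ioctl flow that EthTool wraps, the stand-alone sketch below walks the same three-step SIOCETHTOOL sequence: driver info for the stat count, then the stat names, then the values. The interface name "eth0" is illustrative and error handling is reduced to early returns; this is a sketch of the kernel API usage, not code from the commit.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

#include <linux/ethtool.h>
#include <linux/if.h>
#include <linux/sockios.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
    const int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd == -1)
        return 1;

    ifreq ifr{};
    std::strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);  // illustrative NIC name

    auto ethtoolIoctl = [&](void* cmd) {
        ifr.ifr_data = reinterpret_cast<char*>(cmd);
        return ioctl(fd, SIOCETHTOOL, &ifr) == 0;
    };

    // Step 1: driver info tells us how many statistics the NIC exposes.
    ethtool_drvinfo drvinfo{};
    drvinfo.cmd = ETHTOOL_GDRVINFO;
    if (!ethtoolIoctl(&drvinfo) || drvinfo.n_stats == 0) {
        close(fd);
        return 1;
    }

    // Step 2: stat names, one fixed-width ETH_GSTRING_LEN slot per stat.
    std::vector<unsigned char> nameBuf(sizeof(ethtool_gstrings) +
                                       drvinfo.n_stats * ETH_GSTRING_LEN);
    auto* names = reinterpret_cast<ethtool_gstrings*>(nameBuf.data());
    names->cmd = ETHTOOL_GSTRINGS;
    names->string_set = ETH_SS_STATS;
    names->len = drvinfo.n_stats;

    // Step 3: stat values, one __u64 per stat, in the same order as the names.
    std::vector<unsigned char> statBuf(sizeof(ethtool_stats) +
                                       drvinfo.n_stats * sizeof(std::uint64_t));
    auto* stats = reinterpret_cast<ethtool_stats*>(statBuf.data());
    stats->cmd = ETHTOOL_GSTATS;
    stats->n_stats = drvinfo.n_stats;

    if (ethtoolIoctl(names) && ethtoolIoctl(stats)) {
        for (unsigned i = 0; i < drvinfo.n_stats; ++i) {
            std::printf("%.*s: %llu\n",
                        ETH_GSTRING_LEN,
                        reinterpret_cast<const char*>(names->data) + i * ETH_GSTRING_LEN,
                        static_cast<unsigned long long>(stats->data[i]));
        }
    }
    close(fd);
    return 0;
}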
class SimpleFunctionCollector final : public FTDCCollectorInterface {

View File

@ -106,6 +106,7 @@ mongo_cc_library(
"//src/mongo/db/repl:oplog_visibility_manager",
"//src/mongo/db/repl:optime",
"//src/mongo/db/repl:repl_coordinator_interface", # TODO(SERVER-93876): Remove.
"//src/mongo/db/rss:replicated_storage_service",
"//src/mongo/db/storage:oplog_truncate_markers",
"//src/mongo/db/storage:record_store_base",
],
@ -124,6 +125,7 @@ mongo_cc_library(
":durable_catalog_entry_metadata",
"//src/mongo/db:server_base",
"//src/mongo/db/op_observer:op_observer_util",
"//src/mongo/db/rss:replicated_storage_service",
"//src/mongo/db/storage:feature_document_util",
"//src/mongo/db/storage:ident",
"//src/mongo/db/storage:mdb_catalog",
@ -646,7 +648,6 @@ mongo_cc_library(
"//src/mongo/db:vector_clock",
"//src/mongo/db/collection_crud",
"//src/mongo/db/commands:server_status_core",
"//src/mongo/db/disagg_storage:server_parameters",
"//src/mongo/db/index:index_access_method",
"//src/mongo/db/index:preallocated_container_pool",
"//src/mongo/db/matcher/doc_validation",
@ -656,6 +657,7 @@ mongo_cc_library(
"//src/mongo/db/repl:oplog",
"//src/mongo/db/repl:repl_server_parameters",
"//src/mongo/db/repl:repl_settings",
"//src/mongo/db/rss:replicated_storage_service",
"//src/mongo/db/stats:top",
"//src/mongo/db/storage:mdb_catalog",
"//src/mongo/db/storage:record_store_base",

View File

@ -176,6 +176,8 @@ struct CollectionOptions {
boost::optional<EncryptedFieldConfig> encryptedFieldConfig;
// When 'true', will use the same recordIds across all nodes in the replica set.
// When using disaggregated storage, will be enabled implicitly when the collection
// is created.
bool recordIdsReplicated = false;
};

View File

@ -35,7 +35,6 @@
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/db/audit.h"
#include "mongo/db/basic_types_gen.h"
#include "mongo/db/disagg_storage/server_parameters_gen.h"
#include "mongo/db/index_builds/index_build_block.h"
#include "mongo/db/index_builds/index_builds_common.h"
#include "mongo/db/local_catalog/catalog_raii.h"
@ -65,6 +64,7 @@
#include "mongo/db/record_id.h"
#include "mongo/db/repl/oplog.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/server_feature_flags_gen.h"
#include "mongo/db/server_options.h"
#include "mongo/db/server_parameter.h"
@ -158,10 +158,9 @@ RecordId acquireCatalogId(
OperationContext* opCtx,
const boost::optional<CreateCollCatalogIdentifier>& createCollCatalogIdentifier,
MDBCatalog* mdbCatalog) {
if (disagg::gDisaggregatedStorageEnabled && createCollCatalogIdentifier.has_value()) {
// Replicated catalogIds aren't compatible with standard architecture, as a node may create a
// local collection whose catalogId collides with that of a replicated collection created on
// another node.
auto& rss = rss::ReplicatedStorageService::get(opCtx);
if (rss.getPersistenceProvider().shouldUseReplicatedCatalogIdentifiers() &&
createCollCatalogIdentifier.has_value()) {
return createCollCatalogIdentifier->catalogId;
}
return mdbCatalog->reserveCatalogId(opCtx);
@ -770,8 +769,10 @@ Collection* DatabaseImpl::_createCollection(
// Additionally, we do not set the recordIdsReplicated:true option on timeseries and
// clustered collections because in those cases the recordId is the _id, or on capped
// collections which utilizes a separate mechanism for ensuring uniform recordIds.
if (generatedUUID && !nss.isOnInternalDb() && !optionsWithUUID.timeseries &&
!optionsWithUUID.clusteredIndex && !optionsWithUUID.capped &&
const bool collectionTypeSupportsReplicatedRecordIds =
!optionsWithUUID.timeseries && !optionsWithUUID.clusteredIndex && !optionsWithUUID.capped;
const auto& provider = rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider();
if (generatedUUID && !nss.isOnInternalDb() && collectionTypeSupportsReplicatedRecordIds &&
gFeatureFlagRecordIdsReplicated.isEnabledUseLastLTSFCVWhenUninitialized(
VersionContext::getDecoration(opCtx),
serverGlobalParams.featureCompatibility.acquireFCVSnapshot()) &&
@ -781,6 +782,19 @@ Collection* DatabaseImpl::_createCollection(
"Collection will use recordIdsReplicated:true.",
"oldValue"_attr = optionsWithUUID.recordIdsReplicated);
optionsWithUUID.recordIdsReplicated = true;
} else if (provider.shouldUseReplicatedRecordIds() && nss.isReplicated() &&
!nss.isImplicitlyReplicated() && collectionTypeSupportsReplicatedRecordIds) {
tassert(10985561,
str::stream() << "Replicated record IDs must be enabled with " << provider.name(),
gFeatureFlagRecordIdsReplicated.isEnabledUseLatestFCVWhenUninitialized(
VersionContext::getDecoration(opCtx),
serverGlobalParams.featureCompatibility.acquireFCVSnapshot()));
LOGV2_DEBUG(10985560,
2,
"Collection will use recordIdsReplicated:true",
"provider"_attr = provider.name(),
"oldValue"_attr = optionsWithUUID.recordIdsReplicated);
optionsWithUUID.recordIdsReplicated = true;
}
uassert(ErrorCodes::CommandNotSupported,

View File

@ -39,6 +39,7 @@
#include "mongo/db/local_catalog/shard_role_api/transaction_resources.h"
#include "mongo/db/op_observer/op_observer_util.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/storage/feature_document_util.h"
#include "mongo/db/storage/kv/kv_engine.h"
#include "mongo/db/storage/mdb_catalog.h"
@ -331,10 +332,13 @@ Status createIndex(OperationContext* opCtx,
auto& ru = *shard_role_details::getRecoveryUnit(opCtx);
auto storageEngine = opCtx->getServiceContext()->getStorageEngine();
auto kvEngine = storageEngine->getEngine();
auto& provider = rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider();
invariant(collectionOptions.uuid);
bool replicateLocalCatalogIdentifiers =
shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(opCtx));
bool replicateLocalCatalogIdentifiers = shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider(),
VersionContext::getDecoration(opCtx));
if (replicateLocalCatalogIdentifiers) {
// If a previous attempt at creating this index was rolled back, the ident may still be drop
// pending. Complete that drop before creating the index if so.
@ -349,6 +353,7 @@ Status createIndex(OperationContext* opCtx,
}
Status status = kvEngine->createSortedDataInterface(
provider,
ru,
nss,
*collectionOptions.uuid,
@ -466,8 +471,10 @@ Status dropAndRecreateIndexIdentForResume(OperationContext* opCtx,
return status;
invariant(collectionOptions.uuid);
auto& provider = rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider();
status =
engine->createSortedDataInterface(*shard_role_details::getRecoveryUnit(opCtx),
engine->createSortedDataInterface(provider,
*shard_role_details::getRecoveryUnit(opCtx),
nss,
*collectionOptions.uuid,
ident,

View File

@ -39,6 +39,7 @@
#include "mongo/db/repl/oplog.h"
#include "mongo/db/repl/optime.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/storage/record_store.h"
#include "mongo/db/storage/recovery_unit.h"
#include "mongo/db/storage/storage_options.h"
@ -146,6 +147,10 @@ std::vector<OplogSlot> LocalOplogInfo::getNextOpTimes(OperationContext* opCtx, s
Timestamp ts;
// Provide a sample to FlowControl after the `oplogInfo.newOpMutex` is released.
ON_BLOCK_EXIT([opCtx, &ts, count] {
auto& rss = rss::ReplicatedStorageService::get(opCtx);
if (!rss.getPersistenceProvider().shouldUseOplogWritesForFlowControlSampling())
return;
auto flowControl = FlowControl::get(opCtx);
if (flowControl) {
flowControl->sample(ts, count);

View File

@ -154,17 +154,14 @@
#include "mongo/db/repl/repl_settings.h"
#include "mongo/db/repl/replication_consistency_markers_impl.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/repl/replication_coordinator_external_state_impl.h"
#include "mongo/db/repl/replication_coordinator_impl.h"
#include "mongo/db/repl/replication_coordinator_impl_gen.h"
#include "mongo/db/repl/replication_process.h"
#include "mongo/db/repl/replication_recovery.h"
#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/repl/storage_interface_impl.h"
#include "mongo/db/repl/topology_coordinator.h"
#include "mongo/db/repl/wait_for_majority_service.h"
#include "mongo/db/replication_state_transition_lock_guard.h"
#include "mongo/db/request_execution_context.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/s/migration_blocking_operation/multi_update_coordinator.h"
#include "mongo/db/s/migration_chunk_cloner_source_op_observer.h"
#include "mongo/db/s/query_analysis_op_observer_configsvr.h"
@ -606,9 +603,9 @@ ExitCode _initAndListen(ServiceContext* serviceContext) {
ec != ExitCode::clean)
return ec;
FlowControl::set(serviceContext,
std::make_unique<FlowControl>(
serviceContext, repl::ReplicationCoordinator::get(serviceContext)));
auto& rss = rss::ReplicatedStorageService::get(serviceContext);
auto& serviceLifecycle = rss.getServiceLifecycle();
serviceLifecycle.initializeFlowControl(serviceContext);
// If a crash occurred during file-copy based initial sync, we may need to finish or clean up.
{
@ -620,8 +617,20 @@ ExitCode _initAndListen(ServiceContext* serviceContext) {
admission::initializeExecutionControl(serviceContext);
auto lastShutdownState = catalog::startUpStorageEngineAndCollectionCatalog(
serviceContext, &cc(), StorageEngineInitFlags{}, &startupTimeElapsedBuilder);
serviceLifecycle.initializeStorageEngineExtensions(serviceContext);
auto lastShutdownState = [&]() {
if (rss.getPersistenceProvider().shouldDelayDataAccessDuringStartup()) {
// If data isn't ready yet, we shouldn't try to read it.
auto initializeStorageEngineOpCtx = serviceContext->makeOperationContext(&cc());
return catalog::startUpStorageEngine(initializeStorageEngineOpCtx.get(),
StorageEngineInitFlags{},
&startupTimeElapsedBuilder);
} else {
return catalog::startUpStorageEngineAndCollectionCatalog(
serviceContext, &cc(), StorageEngineInitFlags{}, &startupTimeElapsedBuilder);
}
}();
StorageControl::startStorageControls(serviceContext);
auto logStartupStats = std::make_unique<ScopeGuard<std::function<void()>>>([&] {
@ -898,7 +907,8 @@ ExitCode _initAndListen(ServiceContext* serviceContext) {
&startupTimeElapsedBuilder);
replCoord->startup(startupOpCtx.get(), lastShutdownState);
} else {
if (storageEngine->supportsCappedCollections()) {
if (rss.getPersistenceProvider().supportsLocalCollections() &&
storageEngine->supportsCappedCollections()) {
logStartup(startupOpCtx.get());
}
@ -1368,30 +1378,16 @@ auto makeReplicaSetNodeExecutor(ServiceContext* serviceContext) {
"ReplNodeDbWorkerNetwork", nullptr, makeShardingEgressHooksList(serviceContext)));
}
auto makeReplicationExecutor(ServiceContext* serviceContext) {
ThreadPool::Options tpOptions;
tpOptions.threadNamePrefix = "ReplCoord-";
tpOptions.poolName = "ReplCoordThreadPool";
tpOptions.maxThreads = 50;
tpOptions.onCreateThread = [serviceContext](const std::string& threadName) {
Client::initThread(threadName,
serviceContext->getService(ClusterRole::ShardServer),
Client::noSession(),
ClientOperationKillableByStepdown{false});
};
auto hookList = std::make_unique<rpc::EgressMetadataHookList>();
hookList->addHook(std::make_unique<rpc::VectorClockMetadataHook>(serviceContext));
return executor::ThreadPoolTaskExecutor::create(
std::make_unique<ThreadPool>(tpOptions),
executor::makeNetworkInterface("ReplNetwork", nullptr, std::move(hookList)));
}
void setUpReplicaSetDDLHooks(ServiceContext* serviceContext) {
ReplicaSetDDLTracker::create(serviceContext);
DirectConnectionDDLHook::create(serviceContext);
}
void setUpReplication(ServiceContext* serviceContext) {
auto& serviceLifecycle =
rss::ReplicatedStorageService::get(serviceContext).getServiceLifecycle();
serviceLifecycle.initializeStateRequiredForStorageAccess(serviceContext);
repl::StorageInterface::set(serviceContext, std::make_unique<repl::StorageInterfaceImpl>());
auto storageInterface = repl::StorageInterface::get(serviceContext);
@ -1403,22 +1399,10 @@ void setUpReplication(ServiceContext* serviceContext) {
serviceContext,
std::make_unique<repl::ReplicationProcess>(
storageInterface, std::move(consistencyMarkers), std::move(recovery)));
auto replicationProcess = repl::ReplicationProcess::get(serviceContext);
repl::TopologyCoordinator::Options topoCoordOptions;
topoCoordOptions.maxSyncSourceLagSecs = Seconds(repl::maxSyncSourceLagSecs);
topoCoordOptions.clusterRole = serverGlobalParams.clusterRole;
std::unique_ptr<repl::ReplicationCoordinator> replCoord =
serviceLifecycle.initializeReplicationCoordinator(serviceContext);
auto replCoord = std::make_unique<repl::ReplicationCoordinatorImpl>(
serviceContext,
getGlobalReplSettings(),
std::make_unique<repl::ReplicationCoordinatorExternalStateImpl>(
serviceContext, storageInterface, replicationProcess),
makeReplicationExecutor(serviceContext),
std::make_unique<repl::TopologyCoordinator>(topoCoordOptions),
replicationProcess,
storageInterface,
SecureRandom().nextInt64());
// Only create a ReplicaSetNodeExecutor if sharding is disabled and replication is enabled.
// Note that sharding sets up its own executors for scheduling work to remote nodes.
if (serverGlobalParams.clusterRole.has(ClusterRole::None) &&
@ -1840,8 +1824,12 @@ void shutdownTask(const ShutdownTaskArgs& shutdownArgs) {
SectionScopedTimer scopedTimer(serviceContext->getFastClockSource(),
TimedSectionId::killAllOperations,
&shutdownTimeElapsedBuilder);
serviceContext->setKillAllOperations(
[](const StringData t) { return t == kFTDCThreadName; });
auto& serviceLifecycle =
rss::ReplicatedStorageService::get(serviceContext).getServiceLifecycle();
serviceContext->setKillAllOperations([&serviceLifecycle](const StringData t) {
return t == kFTDCThreadName ||
serviceLifecycle.shouldKeepThreadAliveUntilStorageEngineHasShutDown(t);
});
if (MONGO_unlikely(pauseWhileKillingOperationsAtShutdown.shouldFail())) {
LOGV2_OPTIONS(4701700,
@ -1987,6 +1975,13 @@ void shutdownTask(const ShutdownTaskArgs& shutdownArgs) {
true /* memLeakAllowed */);
}
// Depending on the underlying implementation, there may be some state that needs to be shut
// down after the replication subsystem and the storage engine.
auto& serviceLifecycle =
rss::ReplicatedStorageService::get(serviceContext).getServiceLifecycle();
serviceLifecycle.shutdownStateRequiredForStorageAccess(serviceContext,
&shutdownTimeElapsedBuilder);
// We drop the scope cache because leak sanitizer can't see across the
// thread we use for proxying MozJS requests. Dropping the cache cleans up
// the memory and makes leak sanitizer happy.

View File

@ -27,7 +27,6 @@
* it in the license file.
*/
#include "mongo/db/op_msg_fuzzer_router_fixture.h"
#include "mongo/base/string_data.h"

View File

@ -141,8 +141,8 @@ mongo_cc_library(
"//src/mongo:base",
"//src/mongo/bson/dotted_path:dotted_path_support", # TODO(SERVER-93876): Remove.
"//src/mongo/db:shard_role_api",
"//src/mongo/db/disagg_storage:server_parameters",
"//src/mongo/db/local_catalog:collection_options",
"//src/mongo/db/rss:replicated_storage_service",
],
)

View File

@ -66,6 +66,7 @@
#include "mongo/db/repl/oplog_entry_gen.h"
#include "mongo/db/repl/read_concern_args.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/server_options.h"
#include "mongo/db/session/logical_session_id_helpers.h"
#include "mongo/db/session/session_txn_record_gen.h"
@ -330,8 +331,9 @@ void OpObserverImpl::onCreateIndex(OperationContext* opCtx,
return;
}
bool replicateLocalCatalogIdentifiers =
shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(opCtx));
bool replicateLocalCatalogIdentifiers = shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider(),
VersionContext::getDecoration(opCtx));
BSONObjBuilder builder;
// Note that despite using this constant, we are not building a CreateIndexCommand here
@ -417,7 +419,9 @@ void OpObserverImpl::onStartIndexBuild(OperationContext* opCtx,
oplogEntry.setNss(nss.getCommandNS());
oplogEntry.setUuid(collUUID);
oplogEntry.setObject(oplogEntryBuilder.done());
if (shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(opCtx))) {
if (shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider(),
VersionContext::getDecoration(opCtx))) {
// TODO (SERVER-109824): Move 'directoryPerDB' and 'directoryForIndexes' to the function
// parameters.
oplogEntry.setObject2(BSON("indexes" << o2IndexesArr.arr() << "directoryPerDB"
@ -1190,7 +1194,9 @@ void OpObserverImpl::onCreateCollection(
oplogEntry.setNss(collectionName.getCommandNS());
oplogEntry.setUuid(options.uuid);
oplogEntry.setObject(MutableOplogEntry::makeCreateCollObject(collectionName, options, idIndex));
if (shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(opCtx))) {
if (shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider(),
VersionContext::getDecoration(opCtx))) {
invariant(createCollCatalogIdentifier.has_value(),
"Missing catalog identifier required to log replicated "
"collection");

View File

@ -33,8 +33,8 @@
#include "mongo/bson/bsonelement.h"
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/bson/dotted_path/dotted_path_support.h"
#include "mongo/db/disagg_storage/server_parameters_gen.h"
#include "mongo/db/global_catalog/shard_key_pattern.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/storage/storage_parameters_gen.h"
#include "mongo/util/duration.h"
#include "mongo/util/fail_point.h"
@ -53,10 +53,9 @@ const OpStateAccumulator::Decoration<std::unique_ptr<ShardingWriteRouter>>
MONGO_FAIL_POINT_DEFINE(addDestinedRecipient);
MONGO_FAIL_POINT_DEFINE(sleepBetweenInsertOpTimeGenerationAndLogOp);
bool shouldReplicateLocalCatalogIdentifers(const VersionContext& vCtx) {
if (disagg::gDisaggregatedStorageEnabled) {
// Disaggregated storage relies on consistent catalog storage. Safeguard if FCV is not yet
// initialized despite the feature being enabled.
bool shouldReplicateLocalCatalogIdentifers(const rss::PersistenceProvider& provider,
const VersionContext& vCtx) {
if (provider.shouldUseReplicatedCatalogIdentifiers()) {
return true;
}
const auto fcvSnapshot = serverGlobalParams.featureCompatibility.acquireFCVSnapshot();
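The net effect of the new signature: the persistence provider can force replication of local catalog identifiers outright, and only otherwise does the decision fall through to the FCV-gated featureFlagReplicateLocalCatalogIdentifiers check (presumably what the fcvSnapshot above feeds; the remainder of the function is not shown in this hunk). Reduced to plain booleans for illustration:

// Hypothetical reduction of the visible logic; featureFlagEnabledForFCV
// stands in for the FCV-gated feature flag check that follows in the
// unshown remainder of the function.
bool shouldReplicateCatalogIdents(bool providerRequiresReplicatedIdents,
                                  bool featureFlagEnabledForFCV) {
    return providerRequiresReplicatedIdents || featureFlagEnabledForFCV;
}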

View File

@ -35,6 +35,7 @@
#include "mongo/db/local_catalog/collection_options.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/op_observer/op_observer.h"
#include "mongo/db/rss/persistence_provider.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/decorable.h"
#include "mongo/util/fail_point.h"
@ -54,7 +55,8 @@ extern FailPoint sleepBetweenInsertOpTimeGenerationAndLogOp;
/**
* Returns true when local catalog identifiers should be replicated through the oplog.
*/
bool shouldReplicateLocalCatalogIdentifers(const VersionContext& vCtx);
bool shouldReplicateLocalCatalogIdentifers(const rss::PersistenceProvider&,
const VersionContext& vCtx);
/**
* Returns true if gFeatureFlagPrimaryDrivenIndexBuilds is enabled.

View File

@ -972,6 +972,7 @@ mongo_cc_library(
"//src/mongo/db/pipeline:change_stream_preimage",
"//src/mongo/db/query/write_ops",
"//src/mongo/db/repl/dbcheck",
"//src/mongo/db/rss:replicated_storage_service",
"//src/mongo/db/session:session_catalog_mongod",
"//src/mongo/db/stats:counters",
"//src/mongo/db/stats:server_read_concern_write_concern_metrics",

View File

@ -105,6 +105,7 @@
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/repl/timestamp_block.h"
#include "mongo/db/repl/transaction_oplog_application.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/service_context.h"
#include "mongo/db/session/logical_session_id_gen.h"
#include "mongo/db/sharding_environment/shard_id.h"
@ -211,7 +212,10 @@ StringData getInvalidatingReason(const OplogApplication::Mode mode, const bool i
boost::optional<CreateCollCatalogIdentifier> extractReplicatedCatalogIdentifier(
OperationContext* opCtx, const OplogEntry& oplogEntry) {
auto& o2 = oplogEntry.getObject2();
if (!o2 || !shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(opCtx))) {
if (!o2 ||
!shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider(),
VersionContext::getDecoration(opCtx))) {
// Either no catalog identifier information was provided, or replicated local catalog
// identifiers are not supported.
return boost::none;
@ -365,7 +369,9 @@ void createIndexForApplyOps(OperationContext* opCtx,
IndexBuildInfo indexBuildInfo = [&] {
auto storageEngine = opCtx->getServiceContext()->getStorageEngine();
if (!indexMetadata ||
!shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(opCtx))) {
!shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider(),
VersionContext::getDecoration(opCtx))) {
return IndexBuildInfo(indexSpec, *storageEngine, indexCollection->ns().dbName());
}
@ -733,8 +739,13 @@ void createOplog(OperationContext* opCtx,
uow.commit();
});
/* sync here so we don't get any surprising lag later when we try to sync */
service->getStorageEngine()->flushAllFiles(opCtx, /*callerHoldsReadLock*/ false);
// We cannot guarantee that we have a stable timestamp at this point, but if the persistence
// provider supports unstable checkpoints, we can take a checkpoint now to avoid any surprising
// lag later when we try to sync.
auto& rss = rss::ReplicatedStorageService::get(service);
if (rss.getPersistenceProvider().supportsUnstableCheckpoints()) {
service->getStorageEngine()->flushAllFiles(opCtx, /*callerHoldsReadLock*/ false);
}
}
void createOplog(OperationContext* opCtx) {
@ -961,7 +972,9 @@ const StringMap<ApplyOpMetadata> kOpsMap = {
auto swOplogEntry = IndexBuildOplogEntry::parse(
opCtx,
entry,
shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(opCtx)));
shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider(),
VersionContext::getDecoration(opCtx)));
if (!swOplogEntry.isOK()) {
return swOplogEntry.getStatus().withContext(
"Error parsing 'startIndexBuild' oplog entry");

View File

@ -291,7 +291,7 @@ public:
}
// Should be called after all oplog entries have been processed to handle the deletes that
// were not superceded by a later write.
// were not superseded by a later write.
void handleLatestDeletes(std::function<void(OplogEntry*)> handler) {
std::for_each(_retryImageWrites.begin(),
_retryImageWrites.end(),

View File

@ -103,7 +103,6 @@
#include "mongo/db/session/session_txn_record_gen.h"
#include "mongo/db/sharding_environment/shard_id.h"
#include "mongo/db/stats/counters.h"
#include "mongo/db/storage/mdb_catalog.h"
#include "mongo/db/storage/write_unit_of_work.h"
#include "mongo/db/tenant_id.h"
#include "mongo/db/timeseries/timeseries_gen.h"
@ -141,21 +140,6 @@
namespace mongo {
namespace repl {
namespace {
CreateCollCatalogIdentifier newCatalogIdentifier(OperationContext* opCtx,
const DatabaseName& dbName,
bool includeIdIndexIdent) {
auto storageEngine = opCtx->getServiceContext()->getStorageEngine();
auto mdbCatalog = storageEngine->getMDBCatalog();
invariant(mdbCatalog);
CreateCollCatalogIdentifier catalogIdentifier;
catalogIdentifier.catalogId = mdbCatalog->reserveCatalogId(opCtx);
catalogIdentifier.ident = storageEngine->generateNewCollectionIdent(dbName);
if (includeIdIndexIdent) {
catalogIdentifier.idIndexIdent = storageEngine->generateNewIndexIdent(dbName);
}
return catalogIdentifier;
}
auto parseFromOplogEntryArray(const BSONObj& obj, int elem) {
BSONElement tsArray;
@ -632,45 +616,6 @@ TEST_F(OplogApplierImplTest, CreateCollectionCommand) {
ASSERT_TRUE(collectionExists(_opCtx.get(), nss));
}
TEST_F(OplogApplierImplTest, CreateCollectionCommandDisaggBasic) {
RAIIServerParameterControllerForTest disaggServer("disaggregatedStorageEnabled", true);
RAIIServerParameterControllerForTest replicateLocalCatalogInfoController(
"featureFlagReplicateLocalCatalogIdentifiers", true);
NamespaceString nss = NamespaceString::createNamespaceString_forTest("test.t");
auto catalogIdentifier =
newCatalogIdentifier(_opCtx.get(), nss.dbName(), true /* includeIdIndexIdent*/);
auto entry =
makeCreateCollectionOplogEntry(nextOpTime(),
nss,
CollectionOptions{.uuid = UUID::gen()},
BSON("v" << 2 << "key" << BSON("_id_" << 1) << "name"
<< "_id_") /* idIndex */,
catalogIdentifier);
bool applyCmdCalled = false;
_opObserver->onCreateCollectionFn =
[&](OperationContext* opCtx,
const NamespaceString& collNss,
const CollectionOptions&,
const BSONObj&,
const boost::optional<CreateCollCatalogIdentifier>& collCatalogIdentifier) {
applyCmdCalled = true;
ASSERT_TRUE(opCtx);
ASSERT_TRUE(
shard_role_details::getLocker(opCtx)->isDbLockedForMode(nss.dbName(), MODE_IX));
ASSERT_EQUALS(nss, collNss);
ASSERT(collCatalogIdentifier);
ASSERT_EQUALS(catalogIdentifier.catalogId, collCatalogIdentifier->catalogId);
ASSERT_EQUALS(catalogIdentifier.ident, collCatalogIdentifier->ident);
};
ASSERT_OK(_applyOplogEntryOrGroupedInsertsWrapper(
_opCtx.get(), ApplierOperation{&entry}, OplogApplication::Mode::kInitialSync));
ASSERT_TRUE(applyCmdCalled);
ASSERT_TRUE(collectionExists(_opCtx.get(), nss));
}
TEST_F(OplogApplierImplTest, CreateCollectionCommandMultitenant) {
setServerParameter("multitenancySupport", true);
setServerParameter("featureFlagRequireTenantID", true);

View File

@ -62,6 +62,7 @@
#include "mongo/db/repl/storage_interface_impl.h"
#include "mongo/db/session/session_catalog_mongod.h"
#include "mongo/db/sharding_environment/shard_id.h"
#include "mongo/db/storage/mdb_catalog.h"
#include "mongo/db/storage/write_unit_of_work.h"
#include "mongo/db/tenant_id.h"
#include "mongo/db/transaction/session_catalog_mongod_transaction_interface_impl.h"
@ -198,14 +199,18 @@ void OplogApplierImplOpObserver::onCollMod(OperationContext* opCtx,
onCollModFn(opCtx, nss, uuid, collModCmd, oldCollOptions, indexInfo);
}
std::unique_ptr<ReplicationCoordinator> OplogApplierImplTest::makeReplCoord(
ServiceContext* serviceContext) {
return std::make_unique<ReplicationCoordinatorMock>(serviceContext);
}
void OplogApplierImplTest::setUp() {
ServiceContextMongoDTest::setUp();
serviceContext = getServiceContext();
_opCtx = cc().makeOperationContext();
ReplicationCoordinator::set(serviceContext,
std::make_unique<ReplicationCoordinatorMock>(serviceContext));
ReplicationCoordinator::set(serviceContext, makeReplCoord(serviceContext));
ASSERT_OK(ReplicationCoordinator::get(_opCtx.get())->setFollowerMode(MemberState::RS_PRIMARY));
StorageInterface::set(serviceContext, std::make_unique<StorageInterfaceImpl>());
@ -625,5 +630,21 @@ void createIndex(OperationContext* opCtx,
opCtx, collUUID, spec, IndexBuildsManager::IndexConstraints::kEnforce, false);
}
CreateCollCatalogIdentifier newCatalogIdentifier(OperationContext* opCtx,
const DatabaseName& dbName,
bool includeIdIndexIdent) {
auto storageEngine = opCtx->getServiceContext()->getStorageEngine();
auto mdbCatalog = storageEngine->getMDBCatalog();
invariant(mdbCatalog);
CreateCollCatalogIdentifier catalogIdentifier;
catalogIdentifier.catalogId = mdbCatalog->reserveCatalogId(opCtx);
catalogIdentifier.ident = storageEngine->generateNewCollectionIdent(dbName);
if (includeIdIndexIdent) {
catalogIdentifier.idIndexIdent = storageEngine->generateNewIndexIdent(dbName);
}
return catalogIdentifier;
}
} // namespace repl
} // namespace mongo

View File

@ -277,6 +277,7 @@ protected:
return OpTime(Timestamp(Seconds(lastSecond++), 0), 1LL);
}
virtual std::unique_ptr<ReplicationCoordinator> makeReplCoord(ServiceContext*);
void setUp() override;
void tearDown() override;
@ -415,6 +416,13 @@ void createIndex(OperationContext* opCtx,
UUID collUUID,
const BSONObj& spec);
} // namespace MONGO_MOD_PUB repl
/**
* Generate a new catalog identifier.
*/
CreateCollCatalogIdentifier newCatalogIdentifier(OperationContext* opCtx,
const DatabaseName& dbName,
bool includeIdIndexIdent);
} // namespace MONGO_MOD_PUB repl
} // namespace mongo
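Example call, matching how the relocated test helper is used in the fixture (assumes an OperationContext* opCtx and a NamespaceString nss in scope):

// Reserves a catalogId and collection ident, plus an _id index ident.
auto catalogIdentifier =
    newCatalogIdentifier(opCtx, nss.dbName(), true /* includeIdIndexIdent */);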

View File

@ -212,4 +212,10 @@ public:
};
std::ostream& operator<<(std::ostream& out, const OpTimeAndWallTime& opTime);
// A convenience class for holding both a Timestamp and a Date_t.
struct TimestampAndWallTime {
Timestamp timestamp;
Date_t wallTime;
};
} // namespace mongo::repl

View File

@ -59,6 +59,7 @@
#include "mongo/db/repl/replication_consistency_markers.h"
#include "mongo/db/repl/replication_process.h"
#include "mongo/db/repl/rollback_test_fixture.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/server_options.h"
#include "mongo/db/service_context.h"
#include "mongo/db/session/logical_session_id.h"
@ -2250,7 +2251,9 @@ TEST_F(RollbackImplObserverInfoTest,
auto uuid = UUID::gen();
BSONObj indexObj;
if (shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(_opCtx.get()))) {
if (shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(_opCtx.get()).getPersistenceProvider(),
VersionContext::getDecoration(_opCtx.get()))) {
indexObj = BSON("createIndexes" << nss.coll() << "spec"
<< BSON("v" << 2 << "key"
<< "x"

View File

@ -106,6 +106,7 @@
#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/repl/storage_interface_impl.h"
#include "mongo/db/repl/timestamp_block.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/service_context.h"
#include "mongo/db/service_context_d_test_fixture.h"
#include "mongo/db/session/logical_session_id.h"
@ -916,7 +917,9 @@ public:
}
StringData indexNameOplogField() const {
return shouldReplicateLocalCatalogIdentifers(VersionContext::getDecoration(_opCtx))
return shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(_opCtx).getPersistenceProvider(),
VersionContext::getDecoration(_opCtx))
? "o.spec.name"
: "o.name";
}
@ -3088,14 +3091,16 @@ TEST_F(StorageTimestampTest, CreateCollectionWithSystemIndex) {
// supports 2 phase index build.
indexStartTs = op.getTimestamp();
indexCreateTs =
repl::OplogEntry(queryOplog(BSON("op" << "c"
<< "ns" << nss.getCommandNS().ns_forTest()
<< "o.createIndexes" << nss.coll()
<< (shouldReplicateLocalCatalogIdentifers(
VersionContext::getDecoration(_opCtx))
? "o.spec.name"
: "o.name")
<< "user_1_db_1")))
repl::OplogEntry(
queryOplog(BSON(
"op" << "c"
<< "ns" << nss.getCommandNS().ns_forTest() << "o.createIndexes" << nss.coll()
<< (shouldReplicateLocalCatalogIdentifers(
rss::ReplicatedStorageService::get(_opCtx).getPersistenceProvider(),
VersionContext::getDecoration(_opCtx))
? "o.spec.name"
: "o.name")
<< "user_1_db_1")))
.getTimestamp();
indexCompleteTs = indexCreateTs;

View File

@ -0,0 +1,77 @@
load("//bazel:mongo_src_rules.bzl", "mongo_cc_benchmark", "mongo_cc_library", "mongo_cc_unit_test")
package(default_visibility = ["//visibility:public"])
exports_files(
glob([
"*.h",
"*.cpp",
]),
)
mongo_cc_library(
name = "replicated_storage_service",
srcs = [
"replicated_storage_service.cpp",
],
hdrs = [
"replicated_storage_service.h",
],
deps = [
"//src/mongo:base",
"//src/mongo/db:service_context",
"//src/mongo/db/rss:persistence_provider",
"//src/mongo/db/rss:service_lifecycle",
],
)
mongo_cc_library(
name = "persistence_provider",
hdrs = [
"persistence_provider.h",
],
deps = [
"//src/mongo:base",
"//src/mongo/db:service_context",
],
)
mongo_cc_library(
name = "service_lifecycle",
hdrs = [
"service_lifecycle.h",
],
deps = [
"//src/mongo:base",
"//src/mongo/db:service_context",
"//src/mongo/db/repl:repl_coordinator_interface",
],
)
mongo_cc_library(
name = "persistence_provider_impl",
deps = [
"//src/mongo:base",
"//src/mongo/db/rss:persistence_provider",
"//src/mongo/db/rss/attached_storage:attached_persistence_provider",
] + select({
"//bazel/config:build_atlas_enabled": [
"//src/mongo/db/modules/atlas/src/disagg_storage:disaggregated_persistence_provider",
],
"//conditions:default": [],
}),
)
mongo_cc_library(
name = "service_lifecycle_impl",
deps = [
"//src/mongo:base",
"//src/mongo/db/rss:service_lifecycle",
"//src/mongo/db/rss/attached_storage:attached_service_lifecycle",
] + select({
"//bazel/config:build_atlas_enabled": [
"//src/mongo/db/modules/atlas/src/disagg_storage:disaggregated_service_lifecycle",
],
"//conditions:default": [],
}),
)

View File

@ -0,0 +1,6 @@
version: 1.0.0
filters:
- "*":
approvers:
- 10gen/server-storage-execution
- 10gen/server-replication

View File

@ -0,0 +1,43 @@
load("//bazel:mongo_src_rules.bzl", "mongo_cc_benchmark", "mongo_cc_library", "mongo_cc_unit_test")
package(default_visibility = ["//visibility:public"])
exports_files(
glob([
"*.h",
"*.cpp",
]),
)
mongo_cc_library(
name = "attached_persistence_provider",
srcs = [
"attached_persistence_provider.cpp",
],
hdrs = [
"attached_persistence_provider.h",
],
deps = [
"//src/mongo:base",
"//src/mongo/db:service_context",
"//src/mongo/db/rss:replicated_storage_service",
],
)
mongo_cc_library(
name = "attached_service_lifecycle",
srcs = [
"attached_service_lifecycle.cpp",
],
hdrs = [
"attached_service_lifecycle.h",
],
deps = [
"//src/mongo:base",
"//src/mongo/db:service_context",
"//src/mongo/db/admission:flow_control",
"//src/mongo/db/repl:repl_coordinator_impl",
"//src/mongo/db/repl:serveronly_repl",
"//src/mongo/db/rss:replicated_storage_service",
],
)

View File

@ -0,0 +1,101 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/rss/attached_storage/attached_persistence_provider.h"
#include "mongo/base/string_data.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/service_context.h"
namespace mongo::rss {
namespace {
ServiceContext::ConstructorActionRegisterer registerAttachedPersistenceProvider{
"AttachedPersistenceProvider", [](ServiceContext* service) {
auto& rss = ReplicatedStorageService::get(service);
rss.setPersistenceProvider(std::make_unique<AttachedPersistenceProvider>());
}};
} // namespace
std::string AttachedPersistenceProvider::name() const {
return "Attached Storage";
}
boost::optional<Timestamp> AttachedPersistenceProvider::getSentinelDataTimestamp() const {
return boost::none;
}
std::string AttachedPersistenceProvider::getWiredTigerConfig(int) const {
return "";
}
bool AttachedPersistenceProvider::shouldUseReplicatedCatalogIdentifiers() const {
return false;
}
bool AttachedPersistenceProvider::shouldUseReplicatedRecordIds() const {
return false;
}
bool AttachedPersistenceProvider::shouldUseOplogWritesForFlowControlSampling() const {
return true;
}
bool AttachedPersistenceProvider::shouldStepDownForShutdown() const {
return true;
}
bool AttachedPersistenceProvider::shouldDelayDataAccessDuringStartup() const {
return false;
}
bool AttachedPersistenceProvider::shouldAvoidDuplicateCheckpoints() const {
return false;
}
bool AttachedPersistenceProvider::supportsLocalCollections() const {
return true;
}
bool AttachedPersistenceProvider::supportsUnstableCheckpoints() const {
return true;
}
bool AttachedPersistenceProvider::supportsTableLogging() const {
return true;
}
bool AttachedPersistenceProvider::supportsMultiDocumentTransactions() const {
return true;
}
} // namespace mongo::rss
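Call sites reach the active provider through the ReplicatedStorageService decoration and branch on a capability flag. The pattern below mirrors the createOplog hunk earlier in this commit:

// Only take an unstable checkpoint when the active persistence provider
// supports one (mirrors the createOplog change above).
auto& provider = rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider();
if (provider.supportsUnstableCheckpoints()) {
    opCtx->getServiceContext()->getStorageEngine()->flushAllFiles(
        opCtx, false /* callerHoldsReadLock */);
}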

View File

@ -0,0 +1,105 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#pragma once
#include "mongo/db/rss/persistence_provider.h"
namespace mongo::rss {
class AttachedPersistenceProvider : public PersistenceProvider {
public:
std::string name() const override;
/**
* We do not have any specific initialization requirements.
*/
boost::optional<Timestamp> getSentinelDataTimestamp() const override;
/**
* We do not have any additional WT config to add.
*/
std::string getWiredTigerConfig(int) const override;
/**
* Replicated catalog identifiers are not currently compatible with attached storage, as a
* node may create a local collection whose catalog identifier collides with that of a
* replicated collection created on another node.
*/
bool shouldUseReplicatedCatalogIdentifiers() const override;
/**
* Attached storage does not require replicated RecordIds to function correctly.
*/
bool shouldUseReplicatedRecordIds() const override;
/**
* Flow control is based on the rate of generation of oplog data and the ability of the
* secondaries to keep the majority commit point relatively up-to-date.
*/
bool shouldUseOplogWritesForFlowControlSampling() const override;
/**
* Stepping down prior to shut down allows for a graceful and quick election most of the time.
*/
bool shouldStepDownForShutdown() const override;
/**
* We can safely initialize the catalog immediately after starting the storage engine.
*/
bool shouldDelayDataAccessDuringStartup() const override;
/**
* Running a duplicate checkpoint for a given timestamp has little effect other than being
* slightly inefficient, so there's no need to use extra synchronization to avoid it.
*/
bool shouldAvoidDuplicateCheckpoints() const override;
/**
* We can support local, fully unreplicated collections.
*/
bool supportsLocalCollections() const override;
/**
* We can support unstable checkpoints.
*/
bool supportsUnstableCheckpoints() const override;
/**
* We can support table logging.
*/
bool supportsTableLogging() const override;
/**
* We can support multi-document transactions.
*/
bool supportsMultiDocumentTransactions() const override;
};
} // namespace mongo::rss
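For contrast, a provider for a disaggregated deployment would flip several of these capability flags. The sketch below is purely illustrative: the flag values are guesses inferred from the gates added elsewhere in this commit, the real disaggregated provider lives in the Atlas module and is not shown, and the PersistenceProvider interface may declare members this header excerpt omits.

namespace mongo::rss {

class IllustrativeDisaggPersistenceProvider : public PersistenceProvider {
public:
    std::string name() const override {
        return "Illustrative Disaggregated Storage";
    }
    boost::optional<Timestamp> getSentinelDataTimestamp() const override {
        return Timestamp();  // placeholder sentinel value
    }
    std::string getWiredTigerConfig(int) const override {
        return "";  // provider-specific WT config would go here
    }
    bool shouldUseReplicatedCatalogIdentifiers() const override {
        return true;  // all nodes must agree on catalog identifiers
    }
    bool shouldUseReplicatedRecordIds() const override {
        return true;  // RecordIds must match across nodes
    }
    bool shouldUseOplogWritesForFlowControlSampling() const override {
        return false;  // flow control is driven by other timestamps
    }
    bool shouldStepDownForShutdown() const override {
        return false;
    }
    bool shouldDelayDataAccessDuringStartup() const override {
        return true;  // data may not be ready when the engine starts
    }
    bool shouldAvoidDuplicateCheckpoints() const override {
        return true;
    }
    bool supportsLocalCollections() const override {
        return false;
    }
    bool supportsUnstableCheckpoints() const override {
        return false;  // checkpoints must be at a stable timestamp
    }
    bool supportsTableLogging() const override {
        return false;
    }
    bool supportsMultiDocumentTransactions() const override {
        return true;
    }
};

}  // namespace mongo::rss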

View File

@ -0,0 +1,126 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/rss/attached_storage/attached_service_lifecycle.h"
#include "mongo/db/admission/flow_control.h"
#include "mongo/db/global_settings.h"
#include "mongo/db/repl/replication_consistency_markers_impl.h"
#include "mongo/db/repl/replication_coordinator_external_state_impl.h"
#include "mongo/db/repl/replication_coordinator_impl.h"
#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/repl/topology_coordinator.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/storage/storage_options.h"
#include "mongo/executor/network_interface_factory.h"
#include "mongo/executor/thread_pool_task_executor.h"
#include "mongo/rpc/metadata/egress_metadata_hook_list.h"
namespace mongo::rss {
namespace {
// Checkpoint every 60 seconds by default.
constexpr double kDefaultAttachedSyncDelaySeconds = 60.0;
ServiceContext::ConstructorActionRegisterer registerAttachedServiceLifecycle{
"AttachedServiceLifecycle", [](ServiceContext* service) {
auto& rss = ReplicatedStorageService::get(service);
rss.setServiceLifecycle(std::make_unique<AttachedServiceLifecycle>());
}};
auto makeReplicationExecutor(ServiceContext* serviceContext) {
ThreadPool::Options tpOptions;
tpOptions.threadNamePrefix = "ReplCoord-";
tpOptions.poolName = "ReplCoordThreadPool";
tpOptions.maxThreads = 50;
tpOptions.onCreateThread = [serviceContext](const std::string& threadName) {
Client::initThread(threadName,
serviceContext->getService(ClusterRole::ShardServer),
Client::noSession(),
ClientOperationKillableByStepdown{false});
};
auto hookList = std::make_unique<rpc::EgressMetadataHookList>();
hookList->addHook(std::make_unique<rpc::VectorClockMetadataHook>(serviceContext));
return executor::ThreadPoolTaskExecutor::create(
std::make_unique<ThreadPool>(tpOptions),
executor::makeNetworkInterface("ReplNetwork", nullptr, std::move(hookList)));
}
} // namespace
AttachedServiceLifecycle::AttachedServiceLifecycle()
: _initializedUsingDefaultSyncDelay{[]() {
if (storageGlobalParams.syncdelay.load() < 0.0) {
storageGlobalParams.syncdelay.store(kDefaultAttachedSyncDelaySeconds);
return true;
}
return false;
}()} {}
void AttachedServiceLifecycle::initializeFlowControl(ServiceContext* svcCtx) {
FlowControl::set(
svcCtx, std::make_unique<FlowControl>(svcCtx, repl::ReplicationCoordinator::get(svcCtx)));
}
void AttachedServiceLifecycle::initializeStorageEngineExtensions(ServiceContext*) {}
std::unique_ptr<repl::ReplicationCoordinator>
AttachedServiceLifecycle::initializeReplicationCoordinator(ServiceContext* svcCtx) {
auto storageInterface = repl::StorageInterface::get(svcCtx);
auto replicationProcess = repl::ReplicationProcess::get(svcCtx);
repl::TopologyCoordinator::Options topoCoordOptions;
topoCoordOptions.maxSyncSourceLagSecs = Seconds(repl::maxSyncSourceLagSecs);
topoCoordOptions.clusterRole = serverGlobalParams.clusterRole;
return std::make_unique<repl::ReplicationCoordinatorImpl>(
svcCtx,
getGlobalReplSettings(),
std::make_unique<repl::ReplicationCoordinatorExternalStateImpl>(
svcCtx, storageInterface, replicationProcess),
makeReplicationExecutor(svcCtx),
std::make_unique<repl::TopologyCoordinator>(topoCoordOptions),
replicationProcess,
storageInterface,
SecureRandom().nextInt64());
}
void AttachedServiceLifecycle::initializeStateRequiredForStorageAccess(ServiceContext*) {}
void AttachedServiceLifecycle::shutdownStateRequiredForStorageAccess(ServiceContext*,
BSONObjBuilder*) {}
bool AttachedServiceLifecycle::initializedUsingDefaultSyncDelay() const {
return _initializedUsingDefaultSyncDelay;
}
bool AttachedServiceLifecycle::shouldKeepThreadAliveUntilStorageEngineHasShutDown(
const StringData) const {
return false;
}
} // namespace mongo::rss


@ -0,0 +1,77 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#pragma once
#include "mongo/db/rss/service_lifecycle.h"
namespace mongo::rss {
class AttachedServiceLifecycle : public ServiceLifecycle {
public:
AttachedServiceLifecycle();
/**
* Initializes flow control based on oplog write rate.
*/
void initializeFlowControl(ServiceContext*) override;
/**
* There are no storage engine extensions utilized.
*/
void initializeStorageEngineExtensions(ServiceContext*) override;
/**
* Initializes a 'repl::ReplicationCoordinatorImpl'.
*/
std::unique_ptr<repl::ReplicationCoordinator> initializeReplicationCoordinator(
ServiceContext*) override;
/**
* There is no additional state required for storage access.
*/
void initializeStateRequiredForStorageAccess(ServiceContext*) override;
/**
* There is no additional state required for storage access.
*/
void shutdownStateRequiredForStorageAccess(ServiceContext*, BSONObjBuilder*) override;
bool initializedUsingDefaultSyncDelay() const override;
/**
* There are no specific persistence threads that must outlive the storage engine.
*/
bool shouldKeepThreadAliveUntilStorageEngineHasShutDown(StringData) const override;
private:
const bool _initializedUsingDefaultSyncDelay;
};
} // namespace mongo::rss


@ -0,0 +1,131 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#pragma once
#include "mongo/base/string_data.h"
#include "mongo/bson/bsonelement.h"
#include <string>
#include <utility>
#include <boost/optional.hpp>
namespace mongo {
namespace rss {
/**
* This class provides an abstraction around the persistence layer underlying the storage and
* replication subsystems. Depending on the configuration, the implementation may be backed by a
* local filesystem, a remote service, etc. The interface is built primarily around capabilities and
* expected behaviors, allowing consumers to act based on these flags, rather than needing to reason
* about how a particular provider would behave in a given context.
*/
class PersistenceProvider {
public:
virtual ~PersistenceProvider() = default;
/**
* The name of this provider, for use in e.g. logging and error messages.
*/
virtual std::string name() const = 0;
/**
* If not none, the KVEngine will use the returned Timestamp during initialization as the
* initial data timestamp.
*/
virtual boost::optional<Timestamp> getSentinelDataTimestamp() const = 0;
/**
 * Additional configuration that should be added to the WiredTiger config string for the
 * 'wiredtiger_open' call. The 'flattenLeafPageDelta' argument is expected to be the
 * corresponding WiredTigerConfig member value.
*/
virtual std::string getWiredTigerConfig(int flattenLeafPageDelta) const = 0;
/**
* If true, the provider expects that all catalog identifiers will be replicated and identical
* between nodes.
*/
virtual bool shouldUseReplicatedCatalogIdentifiers() const = 0;
/**
* If true, the provider expects that RecordIds will be replicated (either explicitly or
* implicitly) and identical between nodes.
*/
virtual bool shouldUseReplicatedRecordIds() const = 0;
/**
* If true, writes to the oplog should be used as the unit of progress for flow control
* sampling.
*/
virtual bool shouldUseOplogWritesForFlowControlSampling() const = 0;
/**
* If true, the node should step down prior to shutdown in order to minimize unavailability.
*/
virtual bool shouldStepDownForShutdown() const = 0;
/**
 * If true, data may not be available immediately after starting the storage engine, so systems
* like the catalog should not be initialized immediately.
*/
virtual bool shouldDelayDataAccessDuringStartup() const = 0;
/**
 * If true, the system should take precautions to avoid taking multiple checkpoints for the same
* stable timestamp. The underlying key-value engine likely does not provide the necessary
* coordination by default.
*/
virtual bool shouldAvoidDuplicateCheckpoints() const = 0;
/**
* If true, the storage provider supports the use of local, unreplicated collections.
*/
virtual bool supportsLocalCollections() const = 0;
/**
* If true, the provider can support unstable checkpoints.
*/
virtual bool supportsUnstableCheckpoints() const = 0;
/**
* If true, the provider can support logging (i.e. journaling) on individual tables.
*/
virtual bool supportsTableLogging() const = 0;
/**
* If true, the provider supports multi-document transactions.
*/
virtual bool supportsMultiDocumentTransactions() const = 0;
};
} // namespace rss
} // namespace mongo
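
As a hedged illustration of the intended consumption pattern, callers branch on these capability flags rather than on a concrete provider type. The helper names below are hypothetical, but the shouldDelayDataAccessDuringStartup() check mirrors the ones added elsewhere in this diff.

#include "mongo/db/rss/replicated_storage_service.h"

namespace mongo {

void initializeCatalog(ServiceContext*);  // Hypothetical catalog bootstrap routine.

// Hypothetical helper: consumers query the configured provider's flags instead
// of reasoning about which provider is installed.
void maybeInitializeCatalogAtStartup(ServiceContext* svcCtx) {
    auto& provider = rss::ReplicatedStorageService::get(svcCtx).getPersistenceProvider();
    if (provider.shouldDelayDataAccessDuringStartup()) {
        // Data may not be available yet; defer catalog initialization until the
        // storage engine has loaded its checkpoint.
        return;
    }
    initializeCatalog(svcCtx);
}

}  // namespace mongo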


@ -0,0 +1,63 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/rss/replicated_storage_service.h"
namespace mongo::rss {
namespace {
const auto getReplicatedStorageService =
ServiceContext::declareDecoration<ReplicatedStorageService>();
} // namespace
ReplicatedStorageService& ReplicatedStorageService::get(ServiceContext* svcCtx) {
return getReplicatedStorageService(svcCtx);
}
ReplicatedStorageService& ReplicatedStorageService::get(OperationContext* opCtx) {
return get(opCtx->getServiceContext());
}
PersistenceProvider& ReplicatedStorageService::getPersistenceProvider() {
invariant(_provider);
return *_provider;
}
void ReplicatedStorageService::setPersistenceProvider(std::unique_ptr<PersistenceProvider>&& p) {
_provider = std::move(p);
}
ServiceLifecycle& ReplicatedStorageService::getServiceLifecycle() {
invariant(_lifecycle);
return *_lifecycle;
}
void ReplicatedStorageService::setServiceLifecycle(std::unique_ptr<ServiceLifecycle>&& l) {
_lifecycle = std::move(l);
}
} // namespace mongo::rss


@ -0,0 +1,55 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#pragma once
#include "mongo/db/operation_context.h"
#include "mongo/db/rss/persistence_provider.h"
#include "mongo/db/rss/service_lifecycle.h"
#include "mongo/db/service_context.h"
namespace mongo::rss {
class ReplicatedStorageService {
public:
static ReplicatedStorageService& get(ServiceContext*);
static ReplicatedStorageService& get(OperationContext*);
PersistenceProvider& getPersistenceProvider();
void setPersistenceProvider(std::unique_ptr<PersistenceProvider>&&);
ServiceLifecycle& getServiceLifecycle();
void setServiceLifecycle(std::unique_ptr<ServiceLifecycle>&&);
private:
std::unique_ptr<PersistenceProvider> _provider;
std::unique_ptr<ServiceLifecycle> _lifecycle;
};
} // namespace mongo::rss
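
Installation would presumably mirror the ConstructorActionRegisterer pattern used for AttachedServiceLifecycle earlier in this diff; the sketch below is an assumption of how a provider could be registered, not code from this commit.

#include "mongo/db/rss/attached_storage/attached_persistence_provider.h"  // Assumed path.
#include "mongo/db/rss/replicated_storage_service.h"

namespace mongo::rss {

// Hypothetical registration; mirrors registerAttachedServiceLifecycle above.
ServiceContext::ConstructorActionRegisterer registerAttachedPersistenceProvider{
    "AttachedPersistenceProvider", [](ServiceContext* service) {
        auto& rss = ReplicatedStorageService::get(service);
        rss.setPersistenceProvider(std::make_unique<AttachedPersistenceProvider>());
    }};

}  // namespace mongo::rss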


@ -0,0 +1,97 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#pragma once
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/service_context.h"
#include <memory>
#include <string>
#include <utility>
namespace mongo {
namespace rss {
/**
* This class provides an abstraction for a set of functionalities related to the service lifecycle
* (i.e. startup and shutdown).
*
 * The implementation details are generally closely tied to the configured 'PersistenceProvider',
 * but we keep this class separate: 'PersistenceProvider' is primarily focused on capabilities
 * and behaviors, while this class represents a set of setup/teardown and related routines.
*/
class ServiceLifecycle {
public:
virtual ~ServiceLifecycle() = default;
/**
* Initializes the flow control algorithm for the current service configuration.
*/
virtual void initializeFlowControl(ServiceContext*) = 0;
/**
* Initializes any storage engine extensions necessary for the current service configuration.
*/
virtual void initializeStorageEngineExtensions(ServiceContext*) = 0;
/**
* Initializes and returns the replication coordinator appropriate for the current service
* configuration.
*/
virtual std::unique_ptr<repl::ReplicationCoordinator> initializeReplicationCoordinator(
ServiceContext*) = 0;
/**
* Initializes any state required to access 'repl::StorageInterface'. This method will be run
* prior to 'initializeReplicationCoordinator'.
*/
virtual void initializeStateRequiredForStorageAccess(ServiceContext*) = 0;
/**
* Tears down any state set up by 'initializeStateRequiredForStorageAccess'.
*/
virtual void shutdownStateRequiredForStorageAccess(ServiceContext*, BSONObjBuilder*) = 0;
/**
* If true, this instance was initialized using the default syncdelay parameter rather than any
* user-configured value.
*/
virtual bool initializedUsingDefaultSyncDelay() const = 0;
/**
* If true, the named thread must be kept alive until the storage engine has shut down.
*/
virtual bool shouldKeepThreadAliveUntilStorageEngineHasShutDown(
StringData threadName) const = 0;
};
} // namespace rss
} // namespace mongo


@ -216,7 +216,7 @@ MongoDScopedGlobalServiceContextForTest::~MongoDScopedGlobalServiceContextForTes
std::swap(storageGlobalParams.engineSetByUser, _stashedStorageParams.engineSetByUser);
std::swap(storageGlobalParams.repair, _stashedStorageParams.repair);
storageGlobalParams.reset();
storageGlobalParams.reset_forTest();
}
} // namespace mongo


@ -89,6 +89,7 @@
#include "mongo/db/repl/repl_set_member_in_standalone_mode.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/server_feature_flags_gen.h"
#include "mongo/db/service_context.h"
#include "mongo/db/storage/control/journal_flusher.h"
@ -878,6 +879,14 @@ void startupRecovery(OperationContext* opCtx,
StorageEngine* storageEngine,
StorageEngine::LastShutdownState lastShutdownState,
BSONObjBuilder* startupTimeElapsedBuilder = nullptr) {
auto& rss = rss::ReplicatedStorageService::get(opCtx);
if (rss.getPersistenceProvider().shouldDelayDataAccessDuringStartup()) {
LOGV2(10985327,
"Skip startupRecovery; it will be handled later when WT loads the "
"checkpoint");
return;
}
invariant(!storageGlobalParams.repair);
ServiceContext* svcCtx = opCtx->getServiceContext();


@ -262,6 +262,7 @@ mongo_cc_library(
name = "mdb_catalog",
srcs = [
"mdb_catalog.cpp",
"//src/mongo/db/rss:persistence_provider.h",
"//src/mongo/db/storage/kv:kv_engine.h",
],
hdrs = [
@ -270,6 +271,7 @@ mongo_cc_library(
deps = [
":record_store_base",
"//src/mongo/db:server_base",
"//src/mongo/db/rss:replicated_storage_service",
"//src/mongo/db/storage:feature_document_util",
"//src/mongo/db/storage:ident",
],
@ -423,6 +425,7 @@ mongo_cc_library(
],
hdrs = [
"oplog_truncate_markers.h",
"//src/mongo/db/rss:persistence_provider.h",
"//src/mongo/db/storage/kv:kv_engine.h",
],
deps = [
@ -501,6 +504,7 @@ mongo_cc_library(
":storage_repair_observer",
"//src/mongo/db:server_base",
"//src/mongo/db:shard_role",
"//src/mongo/db/rss:replicated_storage_service",
"//src/mongo/db/storage/kv:kv_drop_pending_ident_reaper",
],
)


@ -18,6 +18,7 @@ mongo_cc_library(
hdrs = [
"devnull_kv_engine.h",
"ephemeral_catalog_record_store.h",
"//src/mongo/db/rss:persistence_provider.h",
"//src/mongo/db/storage/kv:kv_engine.h",
],
deps = [


@ -68,7 +68,8 @@ public:
std::unique_ptr<RecoveryUnit> newRecoveryUnit() override;
Status createRecordStore(const NamespaceString& nss,
Status createRecordStore(const rss::PersistenceProvider&,
const NamespaceString& nss,
StringData ident,
const RecordStore::Options& options) override {
return Status::OK();
@ -89,6 +90,7 @@ public:
KeyFormat keyFormat) override;
Status createSortedDataInterface(
const rss::PersistenceProvider&,
RecoveryUnit&,
const NamespaceString& nss,
const UUID& uuid,


@ -90,7 +90,8 @@ public:
return nullptr;
}
Status createRecordStore(const NamespaceString& nss,
Status createRecordStore(const rss::PersistenceProvider&,
const NamespaceString& nss,
StringData ident,
const RecordStore::Options& options) override {
return Status::OK();
@ -108,6 +109,7 @@ public:
return {};
}
Status createSortedDataInterface(
const rss::PersistenceProvider&,
RecoveryUnit&,
const NamespaceString& nss,
const UUID& uuid,


@ -32,6 +32,7 @@
#include "mongo/base/status.h"
#include "mongo/base/string_data.h"
#include "mongo/bson/timestamp.h"
#include "mongo/db/rss/persistence_provider.h"
#include "mongo/db/storage/compact_options.h"
#include "mongo/db/storage/record_store.h"
#include "mongo/db/storage/sorted_data_interface.h"
@ -126,7 +127,8 @@ public:
*
* Creates a 'RecordStore' generated from the provided 'options'.
*/
virtual Status createRecordStore(const NamespaceString& nss,
virtual Status createRecordStore(const rss::PersistenceProvider&,
const NamespaceString& nss,
StringData ident,
const RecordStore::Options& options) = 0;
@ -201,6 +203,7 @@ public:
virtual bool underCachePressure(int concurrentWriteOuts, int concurrentReadOuts) = 0;
virtual Status createSortedDataInterface(
const rss::PersistenceProvider&,
RecoveryUnit&,
const NamespaceString& nss,
const UUID& uuid,
@ -257,10 +260,11 @@ public:
* This recovery process makes no guarantees about the integrity of data recovered or even that
* it still exists when recovered.
*/
virtual Status recoverOrphanedIdent(const NamespaceString& nss,
virtual Status recoverOrphanedIdent(const rss::PersistenceProvider& provider,
const NamespaceString& nss,
StringData ident,
const RecordStore::Options& recordStoreOptions) {
auto status = createRecordStore(nss, ident, recordStoreOptions);
auto status = createRecordStore(provider, nss, ident, recordStoreOptions);
if (status.isOK()) {
return {ErrorCodes::DataModifiedByRepair, "Orphan recovery created a new record store"};
}
@ -375,6 +379,22 @@ public:
*/
virtual void setJournalListener(JournalListener* jl) = 0;
/**
* See `StorageEngine::setLastMaterializedLsn`
*/
virtual void setLastMaterializedLsn(uint64_t lsn) {}
/**
* Configures the specified checkpoint as the starting point for recovery.
*/
virtual void setRecoveryCheckpointMetadata(StringData checkpointMetadata) {}
/**
* Configures the storage engine as the leader, allowing it to flush checkpoints to remote
* storage.
*/
virtual void promoteToLeader() {}
/**
* See `StorageEngine::setStableTimestamp`
*/

View File

@ -47,6 +47,7 @@
#include "mongo/db/namespace_string.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/record_id.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/service_context_test_fixture.h"
#include "mongo/db/storage/key_format.h"
#include "mongo/db/storage/key_string/key_string.h"
@ -116,8 +117,10 @@ protected:
auto clientAndCtx = makeClientAndCtx("opCtx");
auto opCtx = clientAndCtx.opCtx();
KVEngine* engine = helper->getEngine();
auto& provider = rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider();
ASSERT_OK(
engine->createRecordStore(NamespaceString::createNamespaceString_forTest("catalog"),
engine->createRecordStore(provider,
NamespaceString::createNamespaceString_forTest("catalog"),
"collection-catalog",
RecordStore::Options{}));
@ -231,7 +234,8 @@ protected:
const RecordStore::Options& recordStoreOptions,
boost::optional<UUID> uuid) {
auto opCtx = _makeOperationContext(engine);
ASSERT_OK(engine->createRecordStore(nss, ident, recordStoreOptions));
auto& provider = rss::ReplicatedStorageService::get(opCtx.get()).getPersistenceProvider();
ASSERT_OK(engine->createRecordStore(provider, nss, ident, recordStoreOptions));
auto rs = engine->getRecordStore(opCtx.get(), nss, ident, recordStoreOptions, uuid);
ASSERT(rs);
return rs;
@ -348,8 +352,14 @@ TEST_F(KVEngineTestHarness, SimpleSorted1) {
{
auto opCtx = _makeOperationContext(engine);
auto& ru = *shard_role_details::getRecoveryUnit(opCtx.get());
ASSERT_OK(engine->createSortedDataInterface(
ru, kNss, kUUID, kIdent, config, boost::none /* storageEngineIndexOptions */));
auto& provider = rss::ReplicatedStorageService::get(opCtx.get()).getPersistenceProvider();
ASSERT_OK(engine->createSortedDataInterface(provider,
ru,
kNss,
kUUID,
kIdent,
config,
boost::none /* storageEngineIndexOptions */));
sorted = engine->getSortedDataInterface(
opCtx.get(), ru, kNss, kUUID, kIdent, config, kRecordStoreOptions.keyFormat);
ASSERT(sorted);


@ -39,6 +39,7 @@
#include "mongo/db/namespace_string.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/record_id.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/service_context.h"
#include "mongo/db/service_context_test_fixture.h"
#include "mongo/db/storage/kv/kv_engine.h"
@ -237,7 +238,9 @@ public:
const auto nss = NamespaceString::createNamespaceString_forTest("a.b");
const auto ident = "collection-ident";
RecordStore::Options options;
ASSERT_OK(engine->createRecordStore(nss, ident, options));
auto& provider =
rss::ReplicatedStorageService::get(getGlobalServiceContext()).getPersistenceProvider();
ASSERT_OK(engine->createRecordStore(provider, nss, ident, options));
rs = engine->getRecordStore(op, nss, ident, options, UUID::gen());
ASSERT(rs);
}


@ -34,6 +34,7 @@
#include "mongo/db/namespace_string.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/record_id.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/storage/feature_document_util.h"
#include "mongo/db/storage/kv/kv_engine.h"
#include "mongo/db/storage/record_store.h"
@ -242,7 +243,9 @@ StatusWith<std::unique_ptr<RecordStore>> MDBCatalog::createRecordStoreForEntry(
const MDBCatalog::EntryIdentifier& entry,
const boost::optional<UUID>& uuid,
const RecordStore::Options& recordStoreOptions) {
Status status = _engine->createRecordStore(entry.nss, entry.ident, recordStoreOptions);
auto& provider = rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider();
Status status =
_engine->createRecordStore(provider, entry.nss, entry.ident, recordStoreOptions);
if (!status.isOK()) {
return status;
}


@ -715,6 +715,26 @@ public:
*/
virtual boost::optional<Timestamp> getLastStableRecoveryTimestamp() const = 0;
/**
* Sets the last materialized LSN, i.e. the highest phylog LSN that has been successfully
* written to the page server, below which there are no holes.
*
* TODO: Revisit how to handle cases where mongod speaks with a log server
* in a non-local zone due to failover.
*/
virtual void setLastMaterializedLsn(uint64_t lsn) = 0;
/**
* Configures the specified checkpoint as the starting point for recovery.
*/
virtual void setRecoveryCheckpointMetadata(StringData checkpointMetadata) = 0;
/**
* Configures the storage engine as the leader, allowing it to flush checkpoints to remote
* storage.
*/
virtual void promoteToLeader() = 0;
/**
* Sets the highest timestamp at which the storage engine is allowed to take a checkpoint. This
* timestamp must not decrease unless force=true is set, in which case we force the stable


@ -38,6 +38,7 @@
#include "mongo/db/local_catalog/catalog_raii.h"
#include "mongo/db/local_catalog/shard_role_api/transaction_resources.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/storage/backup_cursor_hooks.h"
#include "mongo/db/storage/deferred_drop_record_store.h"
#include "mongo/db/storage/disk_space_monitor.h"
@ -130,6 +131,15 @@ StorageEngineImpl::StorageEngineImpl(OperationContext* opCtx,
invariant(prevRecoveryUnit->isNoop());
shard_role_details::setRecoveryUnit(
opCtx, _engine->newRecoveryUnit(), WriteUnitOfWork::RecoveryUnitState::kNotInUnitOfWork);
auto& rss = rss::ReplicatedStorageService::get(opCtx->getServiceContext());
if (rss.getPersistenceProvider().shouldDelayDataAccessDuringStartup()) {
LOGV2(10985326,
"Skip loading catalog on startup; it will be handled later when WT loads the "
"checkpoint");
return;
}
// If we throw in this constructor, make sure to destroy the RecoveryUnit instance created above
// before '_engine' is destroyed.
ScopeGuard recoveryUnitResetGuard([&] {
@ -181,8 +191,9 @@ void StorageEngineImpl::loadMDBCatalog(OperationContext* opCtx,
if (!catalogExists) {
WriteUnitOfWork uow(opCtx);
auto& provider = rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider();
auto status = _engine->createRecordStore(
kCatalogInfoNamespace, ident::kMbdCatalog, catalogRecordStoreOpts);
provider, kCatalogInfoNamespace, ident::kMbdCatalog, catalogRecordStoreOpts);
// BadValue is usually caused by invalid configuration string.
// We still fassert() but without a stack trace.
@ -422,8 +433,9 @@ Status StorageEngineImpl::_recoverOrphanedCollection(OperationContext* opCtx,
WriteUnitOfWork wuow(opCtx);
const auto recordStoreOptions =
_catalog->getParsedRecordStoreOptions(opCtx, catalogId, collectionName);
Status status =
_engine->recoverOrphanedIdent(collectionName, collectionIdent, recordStoreOptions);
auto& provider = rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider();
Status status = _engine->recoverOrphanedIdent(
provider, collectionName, collectionIdent, recordStoreOptions);
bool dataModified = status.code() == ErrorCodes::DataModifiedByRepair;
if (!status.isOK() && !dataModified) {
@ -706,6 +718,18 @@ void StorageEngineImpl::setJournalListener(JournalListener* jl) {
_engine->setJournalListener(jl);
}
void StorageEngineImpl::setLastMaterializedLsn(uint64_t lsn) {
_engine->setLastMaterializedLsn(lsn);
}
void StorageEngineImpl::setRecoveryCheckpointMetadata(StringData checkpointMetadata) {
_engine->setRecoveryCheckpointMetadata(checkpointMetadata);
}
void StorageEngineImpl::promoteToLeader() {
_engine->promoteToLeader();
}
void StorageEngineImpl::setStableTimestamp(Timestamp stableTimestamp, bool force) {
_engine->setStableTimestamp(stableTimestamp, force);
}


@ -144,6 +144,12 @@ public:
void cleanShutdown(ServiceContext* svcCtx, bool memLeakAllowed) override;
void setLastMaterializedLsn(uint64_t lsn) override;
void setRecoveryCheckpointMetadata(StringData checkpointMetadata) override;
void promoteToLeader() override;
void setStableTimestamp(Timestamp stableTimestamp, bool force = false) override;
Timestamp getStableTimestamp() const override;


@ -129,7 +129,14 @@ public:
boost::optional<Timestamp> getLastStableRecoveryTimestamp() const final {
MONGO_UNREACHABLE;
}
void setStableTimestamp(Timestamp stableTimestamp, bool force = false) final {}
void setLastMaterializedLsn(uint64_t lsn) final {}
void setRecoveryCheckpointMetadata(StringData checkpointMetadata) final {}
void promoteToLeader() final {}
void setStableTimestamp(Timestamp stableTimestamp, bool force = false) override {}
Timestamp getStableTimestamp() const override {
return Timestamp();
}


@ -36,6 +36,7 @@
#include "mongo/db/local_catalog/durable_catalog.h"
#include "mongo/db/local_catalog/shard_role_api/transaction_resources.h"
#include "mongo/db/repl/storage_interface_impl.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/service_context_d_test_fixture.h"
#include "mongo/db/storage/kv/kv_engine.h"
#include "mongo/db/storage/mdb_catalog.h"
@ -117,8 +118,9 @@ public:
*/
Status createCollTable(OperationContext* opCtx, NamespaceString collName) {
const std::string identName = _storageEngine->generateNewCollectionIdent(collName.dbName());
auto& provider = rss::ReplicatedStorageService::get(opCtx).getPersistenceProvider();
return _storageEngine->getEngine()->createRecordStore(
collName, identName, RecordStore::Options{});
provider, collName, identName, RecordStore::Options{});
}
Status dropIndexTable(OperationContext* opCtx, NamespaceString nss, StringData indexName) {


@ -45,10 +45,14 @@
namespace mongo {
StorageGlobalParams::StorageGlobalParams() {
reset();
_reset();
}
void StorageGlobalParams::reset() {
void StorageGlobalParams::reset_forTest() {
_reset();
}
void StorageGlobalParams::_reset() {
engine = "wiredTiger";
engineSetByUser = false;
dbpath = kDefaultDbPath;
@ -60,7 +64,7 @@ void StorageGlobalParams::reset() {
noTableScan.store(false);
directoryperdb = false;
syncdelay.store(60.0);
syncdelay.store(-1.0);
queryableBackupMode = false;
groupCollections = false;
oplogMinRetentionHours.store(0.0);


@ -44,7 +44,7 @@ namespace mongo {
struct StorageGlobalParams {
StorageGlobalParams();
void reset();
void reset_forTest();
// Returns the directory path used by the spill storage engine to store spilled data.
std::string getSpillDbPath() const;
@ -109,13 +109,14 @@ struct StorageGlobalParams {
// --syncdelay
// Delay in seconds between triggering the next checkpoint after the completion of the previous
// one. A value of 0 indicates that checkpointing will be skipped.
// one. A value of 0 indicates that checkpointing will be skipped. A value <0
// will result in using the default value for the configured persistence provider.
// Do not set this value on production systems.
// In almost every situation, you should use the default setting.
// This parameter is both a server parameter and a configuration parameter, and to resolve
// conflicts between the two the default must be set here.
// conflicts between the two, a default sentinel (<0) must be set here.
static constexpr double kMaxSyncdelaySecs = 60 * 60; // 1hr
AtomicWord<double> syncdelay{60.0}; // seconds between checkpoints
AtomicWord<double> syncdelay{-1.0}; // seconds between checkpoints
// --queryableBackupMode
// Prevents user-originating operations from performing writes to the server. Internally
@ -139,6 +140,9 @@ struct StorageGlobalParams {
// Test-only option. Disables table logging.
bool forceDisableTableLogging = false;
private:
void _reset();
};
extern StorageGlobalParams storageGlobalParams;


@ -112,6 +112,7 @@ mongo_cc_library(
"//src/mongo/db:server_base",
"//src/mongo/db:server_feature_flags",
"//src/mongo/db:service_context",
"//src/mongo/db/rss:replicated_storage_service",
"//src/mongo/db/storage:container",
"//src/mongo/db/storage:exceptions",
"//src/mongo/db/storage:execution_context",
@ -173,6 +174,7 @@ mongo_cc_library(
deps = [
":storage_wiredtiger_core",
"//src/mongo/db:service_context_test_fixture",
"//src/mongo/db/rss:persistence_provider_impl",
"//src/mongo/db/storage:record_store_test_harness",
"//src/mongo/util:clock_source_mock",
],
@ -228,6 +230,7 @@ mongo_cc_unit_test(
"//src/mongo/db/storage:storage_options",
"//src/mongo/db/storage/kv:kv_engine_test_harness",
"//src/mongo/idl:server_parameter_test_controller",
"//src/mongo/idl:server_parameter_test_util",
"//src/mongo/util:clock_source_mock",
],
)


@ -69,7 +69,7 @@ SpillWiredTigerKVEngine::SpillWiredTigerKVEngine(const std::string& canonicalNam
}
std::string config =
generateWTOpenConfigString(_wtConfig, wtExtensions.getOpenExtensionsConfig());
generateWTOpenConfigString(_wtConfig, wtExtensions.getOpenExtensionsConfig(), "");
LOGV2(10158000, "Opening spill WiredTiger", "config"_attr = config);
auto startTime = Date_t::now();


@ -94,7 +94,8 @@ public:
MONGO_UNREACHABLE;
}
Status createRecordStore(const NamespaceString& nss,
Status createRecordStore(const rss::PersistenceProvider&,
const NamespaceString& nss,
StringData ident,
const RecordStore::Options& options) override {
MONGO_UNREACHABLE;
@ -126,6 +127,7 @@ public:
}
Status createSortedDataInterface(
const rss::PersistenceProvider&,
RecoveryUnit&,
const NamespaceString& nss,
const UUID& uuid,


@ -32,10 +32,12 @@
#include "mongo/base/init.h" // IWYU pragma: keep
#include "mongo/base/string_data.h"
#include "mongo/db/rss/replicated_storage_service.h"
#include "mongo/db/service_context.h"
#include "mongo/db/service_context_test_fixture.h"
#include "mongo/db/storage/wiredtiger/wiredtiger_extensions.h"
#include "mongo/db/storage/wiredtiger/wiredtiger_global_options_gen.h"
#include "mongo/idl/server_parameter_test_util.h"
#include "mongo/unittest/temp_dir.h"
#include "mongo/unittest/unittest.h"
#include "mongo/util/clock_source_mock.h"


@ -34,6 +34,7 @@
#include "mongo/db/service_context.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/decorable.h"
#include "mongo/util/str.h"
#include <memory>
#include <utility>
@ -49,8 +50,33 @@ ServiceContext::ConstructorActionRegisterer setWiredTigerCustomizationHooks{
const auto getCustomizationHooks =
ServiceContext::declareDecoration<std::unique_ptr<WiredTigerCustomizationHooks>>();
const auto getWiredTigerCustomizationHooksRegistry =
ServiceContext::declareDecoration<WiredTigerCustomizationHooksRegistry>();
} // namespace
WiredTigerCustomizationHooksRegistry& WiredTigerCustomizationHooksRegistry::get(
ServiceContext* service) {
return getWiredTigerCustomizationHooksRegistry(service);
}
void WiredTigerCustomizationHooksRegistry::addHook(
std::unique_ptr<WiredTigerCustomizationHooks> custHook) {
invariant(custHook);
_hooks.push_back(std::move(custHook));
}
std::string WiredTigerCustomizationHooksRegistry::getTableCreateConfig(StringData tableName) const {
str::stream config;
for (const auto& h : _hooks) {
config << h->getTableCreateConfig(tableName);
}
return config;
}
void WiredTigerCustomizationHooks::set(ServiceContext* service,
std::unique_ptr<WiredTigerCustomizationHooks> customHooks) {
auto& hooks = getCustomizationHooks(service);


@ -31,6 +31,7 @@
#include <memory>
#include <string>
#include <vector>
namespace mongo {
class StringData;
@ -58,4 +59,27 @@ public:
virtual std::string getTableCreateConfig(StringData tableName);
};
/**
* Registry to store multiple WiredTiger customization hooks.
*/
class WiredTigerCustomizationHooksRegistry {
public:
static WiredTigerCustomizationHooksRegistry& get(ServiceContext* serviceContext);
/**
* Adds a WiredTiger customization hook to the registry. Multiple hooks can be
* added, and their configurations will be combined.
*/
void addHook(std::unique_ptr<WiredTigerCustomizationHooks> custHook);
/**
* Gets a combined configuration string from all hooks in the registry for
* the provided table name during the `WT_SESSION::create` call.
*/
std::string getTableCreateConfig(StringData tableName) const;
private:
std::vector<std::unique_ptr<WiredTigerCustomizationHooks>> _hooks;
};
} // namespace mongo
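
A short usage sketch of the registry, assuming a toy hook subclass; the hook class, its config fragment, and the setup function are illustrative only, but the addHook/getTableCreateConfig calls match the interface above.

#include "mongo/db/storage/wiredtiger/wiredtiger_customization_hooks.h"  // Assumed path.

#include <memory>

namespace mongo {

// Toy hook; a real hook (e.g. for encryption) would contribute meaningful config.
class SnappyCompressorHook : public WiredTigerCustomizationHooks {
public:
    std::string getTableCreateConfig(StringData) override {
        return "block_compressor=snappy,";
    }
};

void configureTableHooks(ServiceContext* svcCtx) {  // Hypothetical setup function.
    auto& registry = WiredTigerCustomizationHooksRegistry::get(svcCtx);
    registry.addHook(std::make_unique<SnappyCompressorHook>());
    // The registry concatenates each hook's fragment into a single string that
    // is appended to the WT_SESSION::create configuration.
    std::string config = registry.getTableCreateConfig("table:collection-0");
}

}  // namespace mongo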


@ -69,6 +69,7 @@ public:
std::string liveRestoreSource;
int liveRestoreThreads;
double liveRestoreReadSizeMB;
int flattenLeafPageDelta;
std::string collectionBlockCompressor;
bool useIndexPrefixCompression;


@ -126,6 +126,18 @@ server_parameters:
lte: 100
redact: false
wiredTigerFlattenLeafPageDelta:
description: >-
WiredTiger page reads rewrite leaf pages with deltas to a new disk image when
successful. This parameter is a ternary: 0=Disabled, 1=Enabled if
disaggregatedStorageEnabled is true, 2=Enabled unconditionally.
set_at: startup
cpp_varname: "wiredTigerGlobalOptions.flattenLeafPageDelta"
default: 1
validator:
gte: 0
lte: 2
redact: false
wiredTigerEvictionDirtyTargetGB:
description: >-
Absolute dirty cache eviction target. Once eviction begins,

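One plausible reading of the wiredTigerFlattenLeafPageDelta ternary above, as a hedged C++ sketch; the helper name and the source of the disaggregated-storage flag are assumptions.

#include "mongo/util/assert_util.h"

namespace mongo {

// Hypothetical resolution of the ternary: 0=Disabled, 1=Enabled only when
// disaggregated storage is enabled, 2=Enabled unconditionally.
bool resolveFlattenLeafPageDelta(int value, bool disaggregatedStorageEnabled) {
    switch (value) {
        case 0:
            return false;
        case 1:
            return disaggregatedStorageEnabled;
        case 2:
            return true;
        default:
            MONGO_UNREACHABLE;  // The IDL validator restricts the value to [0, 2].
    }
}

}  // namespace mongo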
Some files were not shown because too many files have changed in this diff.