Skip to content

Commit a794c20

Browse files
committed
Add request limits for mongo
1 parent 5d625c2 commit a794c20

4 files changed

Lines changed: 138 additions & 30 deletions

File tree

nativelink-config/src/stores.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1490,6 +1490,7 @@ pub struct ExperimentalMongoSpec {
14901490
#[serde(default, deserialize_with = "convert_data_size_with_shellexpand")]
14911491
pub read_chunk_size: usize,
14921492

1493+
/// Deprecated, unused
14931494
/// Maximum number of concurrent uploads allowed.
14941495
/// Default: 10
14951496
#[serde(default, deserialize_with = "convert_numeric_with_shellexpand")]
@@ -1529,6 +1530,14 @@ pub struct ExperimentalMongoSpec {
15291530
deserialize_with = "convert_optional_numeric_with_shellexpand"
15301531
)]
15311532
pub write_concern_timeout_ms: Option<u32>,
1533+
1534+
/// Limits the number of requests at any one time
1535+
/// Default: Unlimited
1536+
#[serde(
1537+
default,
1538+
deserialize_with = "convert_optional_numeric_with_shellexpand"
1539+
)]
1540+
pub max_request_permits: Option<usize>,
15321541
}
15331542

15341543
impl Retry {

nativelink-error/src/lib.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,24 @@ impl From<zip::result::ZipError> for Error {
353353
}
354354
}
355355

356+
impl From<mongodb::error::Error> for Error {
357+
fn from(value: mongodb::error::Error) -> Self {
358+
Self::new(Code::Internal, value.to_string())
359+
}
360+
}
361+
362+
impl From<reqwest::Error> for Error {
363+
fn from(value: reqwest::Error) -> Self {
364+
Self::new(Code::Internal, value.to_string())
365+
}
366+
}
367+
368+
impl From<zip::result::ZipError> for Error {
369+
fn from(value: zip::result::ZipError) -> Self {
370+
Self::new(Code::Internal, value.to_string())
371+
}
372+
}
373+
356374
pub trait ResultExt<T> {
357375
/// # Errors
358376
///

nativelink-store/src/mongo_store.rs

Lines changed: 64 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use core::ops::Bound;
1717
use core::pin::Pin;
1818
use core::time::Duration;
1919
use std::borrow::Cow;
20+
use std::sync::atomic::{AtomicUsize, Ordering};
2021
use std::sync::{Arc, Weak};
2122

2223
use async_trait::async_trait;
@@ -39,9 +40,9 @@ use nativelink_util::store_trait::{
3940
use nativelink_util::task::JoinHandleDropGuard;
4041
use parking_lot::{Mutex, RwLock};
4142
use patricia_tree::StringPatriciaMap;
42-
use tokio::sync::watch;
43+
use tokio::sync::{Semaphore, SemaphorePermit, watch};
4344
use tokio::time::sleep;
44-
use tracing::{error, info, warn};
45+
use tracing::{debug, error, info, trace, warn};
4546

4647
use crate::cas_utils::is_zero_digest;
4748

@@ -63,9 +64,6 @@ const DEFAULT_CONNECTION_TIMEOUT_MS: u64 = 3000;
6364
/// The default command timeout in milliseconds if not specified.
6465
const DEFAULT_COMMAND_TIMEOUT_MS: u64 = 10_000;
6566

66-
/// The default maximum number of concurrent uploads.
67-
const DEFAULT_MAX_CONCURRENT_UPLOADS: usize = 10;
68-
6967
/// The name of the field in `MongoDB` documents that stores the key.
7068
const KEY_FIELD: &str = "_id";
7169

@@ -102,16 +100,18 @@ pub struct ExperimentalMongoStore {
102100
#[metric(help = "The amount of data to read from MongoDB at a time")]
103101
read_chunk_size: usize,
104102

105-
/// The maximum number of concurrent uploads.
106-
#[metric(help = "The maximum number of concurrent uploads")]
107-
max_concurrent_uploads: usize,
108-
109103
/// Enable change streams for real-time updates.
110104
#[metric(help = "Whether change streams are enabled")]
111105
enable_change_streams: bool,
112106

113107
/// A manager for subscriptions to keys in `MongoDB`.
114108
subscription_manager: Mutex<Option<Arc<ExperimentalMongoSubscriptionManager>>>,
109+
110+
/// Limits the number of requests at any one time
111+
request_permits: Arc<Semaphore>,
112+
113+
/// Keep track of the request_permits queue size
114+
waiting_permits: Arc<AtomicUsize>,
115115
}
116116

117117
impl ExperimentalMongoStore {
@@ -149,8 +149,13 @@ impl ExperimentalMongoStore {
149149
spec.command_timeout_ms = DEFAULT_COMMAND_TIMEOUT_MS;
150150
}
151151

152-
if spec.max_concurrent_uploads == 0 {
153-
spec.max_concurrent_uploads = DEFAULT_MAX_CONCURRENT_UPLOADS;
152+
if let Some(max_permits) = spec.max_request_permits {
153+
if max_permits == 0 {
154+
return Err(make_err!(
155+
Code::InvalidArgument,
156+
"max_request_permits was set to zero, which will block mongo_store from working at all"
157+
));
158+
}
154159
}
155160

156161
// Configure client options
@@ -224,9 +229,12 @@ impl ExperimentalMongoStore {
224229
scheduler_collection,
225230
key_prefix: spec.key_prefix.clone().unwrap_or_default(),
226231
read_chunk_size: spec.read_chunk_size,
227-
max_concurrent_uploads: spec.max_concurrent_uploads,
228232
enable_change_streams: spec.enable_change_streams,
229233
subscription_manager: Mutex::new(None),
234+
request_permits: Arc::new(Semaphore::new(
235+
spec.max_request_permits.unwrap_or(Semaphore::MAX_PERMITS),
236+
)),
237+
waiting_permits: Arc::new(AtomicUsize::new(0)),
230238
};
231239

232240
Ok(Arc::new(store))
@@ -282,6 +290,19 @@ impl ExperimentalMongoStore {
282290
key.strip_prefix(&self.key_prefix).map(ToString::to_string)
283291
}
284292
}
293+
294+
async fn acquire_permit(&self) -> Result<SemaphorePermit<'_>, Error> {
295+
let waiting = self.waiting_permits.fetch_add(1, Ordering::Relaxed);
296+
297+
if waiting > 0 && waiting % 100 == 0 {
298+
debug!(waiting, "Number of waiting permits for Mongo");
299+
} else {
300+
trace!(waiting, "Number of waiting permits for Mongo");
301+
}
302+
let permit = self.request_permits.acquire().await;
303+
self.waiting_permits.fetch_sub(1, Ordering::Relaxed);
304+
Ok(permit?)
305+
}
285306
}
286307

287308
#[async_trait]
@@ -301,6 +322,11 @@ impl StoreDriver for ExperimentalMongoStore {
301322
let encoded_key = self.encode_key(key);
302323
let filter = doc! { KEY_FIELD: encoded_key.as_ref() };
303324

325+
// We could do this with acquire_many, but that's unsafe if the number of keys is greater
326+
// than the number of permits, as it'll block forever. Doing this one at a time is guaranteed
327+
// not to block provided no-one sets permits to 0, and we check for that case at startup.
328+
let semaphore = self.acquire_permit().await?;
329+
304330
match self.cas_collection.find_one(filter).await {
305331
Ok(Some(doc)) => {
306332
*result = doc.get_i64(SIZE_FIELD).ok().map(|v| v as u64);
@@ -315,7 +341,9 @@ impl StoreDriver for ExperimentalMongoStore {
315341
));
316342
}
317343
}
344+
drop(semaphore);
318345
}
346+
319347
Ok(())
320348
}
321349

@@ -370,6 +398,8 @@ impl StoreDriver for ExperimentalMongoStore {
370398
}
371399
}
372400

401+
let semaphore = self.acquire_permit().await?;
402+
373403
let mut cursor = self
374404
.cas_collection
375405
.find(filter)
@@ -394,6 +424,8 @@ impl StoreDriver for ExperimentalMongoStore {
394424
}
395425
}
396426

427+
drop(semaphore);
428+
397429
Ok(count)
398430
}
399431

@@ -463,6 +495,8 @@ impl StoreDriver for ExperimentalMongoStore {
463495
SIZE_FIELD: size,
464496
};
465497

498+
let semaphore = self.acquire_permit().await?;
499+
466500
// Upsert the document
467501
self.cas_collection
468502
.update_one(
@@ -473,6 +507,8 @@ impl StoreDriver for ExperimentalMongoStore {
473507
.await
474508
.map_err(|e| make_err!(Code::Internal, "Failed to update document in MongoDB: {e}"))?;
475509

510+
drop(semaphore);
511+
476512
Ok(())
477513
}
478514

@@ -496,6 +532,8 @@ impl StoreDriver for ExperimentalMongoStore {
496532
let encoded_key = self.encode_key(&key);
497533
let filter = doc! { KEY_FIELD: encoded_key.as_ref() };
498534

535+
let semaphore = self.acquire_permit().await?;
536+
499537
let doc = self
500538
.cas_collection
501539
.find_one(filter)
@@ -553,6 +591,8 @@ impl StoreDriver for ExperimentalMongoStore {
553591
}
554592
}
555593

594+
drop(semaphore);
595+
556596
writer.send_eof().map_err(|e| {
557597
make_err!(
558598
Code::Internal,
@@ -593,6 +633,8 @@ impl HealthStatusIndicator for ExperimentalMongoStore {
593633
}
594634

595635
async fn check_health(&self, namespace: Cow<'static, str>) -> HealthStatus {
636+
// Note we do not acquire a request_permit here, as the health check needs to always go through
637+
// even if everything else is fully loaded
596638
match self.database.run_command(doc! { "ping": 1 }).await {
597639
Ok(_) => HealthStatus::new_ok(self, "Connection healthy".into()),
598640
Err(e) => HealthStatus::new_failed(
@@ -918,7 +960,9 @@ impl SchedulerStore for ExperimentalMongoStore {
918960
VERSION_FIELD: current_version,
919961
};
920962

921-
match self
963+
let semaphore = self.acquire_permit().await?;
964+
965+
let result = match self
922966
.scheduler_collection
923967
.find_one_and_update(filter, update_doc)
924968
.upsert(true)
@@ -931,7 +975,9 @@ impl SchedulerStore for ExperimentalMongoStore {
931975
Code::Internal,
932976
"MongoDB error in update_data: {e}"
933977
)),
934-
}
978+
};
979+
drop(semaphore);
980+
result
935981
} else {
936982
let data_bytes = data.try_into_bytes().map_err(|e| {
937983
make_err!(
@@ -959,6 +1005,8 @@ impl SchedulerStore for ExperimentalMongoStore {
9591005
);
9601006
}
9611007

1008+
let semaphore = self.acquire_permit().await?;
1009+
9621010
self.scheduler_collection
9631011
.update_one(
9641012
doc! { KEY_FIELD: encoded_key.as_ref() },
@@ -970,6 +1018,8 @@ impl SchedulerStore for ExperimentalMongoStore {
9701018
make_err!(Code::Internal, "Failed to update scheduler document: {e}")
9711019
})?;
9721020

1021+
drop(semaphore);
1022+
9731023
Ok(Some(0))
9741024
}
9751025
}

nativelink-store/tests/mongo_store_test.rs

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ fn create_test_spec_with_key_prefix(
7272
write_concern_w: Some("majority".to_string()),
7373
write_concern_j: Some(true),
7474
write_concern_timeout_ms: Some(10000), // Longer timeout for remote DB
75+
max_request_permits: None,
7576
}
7677
}
7778

@@ -967,25 +968,55 @@ async fn test_scheduler_store_operations() -> Result<(), Error> {
967968

968969
#[nativelink_test]
969970
async fn test_non_w_config() -> Result<(), Error> {
971+
let (mut spec, mongo_process) = TestMongoHelper::new_spec(None).await?;
972+
spec.write_concern_w = None;
973+
spec.write_concern_j = Some(true);
974+
spec.write_concern_timeout_ms = Some(1);
975+
970976
assert_eq!(
971977
Error::new(Code::InvalidArgument, "write_concern_w not set, but j and/or timeout set. Please set 'write_concern_w' to a non-default value. See https://www.mongodb.com/docs/manual/reference/write-concern/#w-option for options.".to_string()),
972-
ExperimentalMongoStore::new(ExperimentalMongoSpec {
973-
connection_string: "mongodb://dummy".to_string(),
974-
database: "dummy".to_string(),
975-
cas_collection: "test_cas".to_string(),
976-
scheduler_collection: "test_scheduler".to_string(),
977-
key_prefix: None,
978-
read_chunk_size: 1024,
979-
max_concurrent_uploads: 10,
980-
connection_timeout_ms: 10000,
981-
command_timeout_ms: 15000,
982-
enable_change_streams: false,
983-
write_concern_w: None,
984-
write_concern_j: Some(true),
985-
write_concern_timeout_ms: Some(1),
986-
})
987-
.await
978+
TestMongoHelper::new_with_spec_and_process(spec, mongo_process).await
988979
.unwrap_err()
989980
);
990981
Ok(())
991982
}
983+
984+
#[nativelink_test]
985+
async fn empty_request_permit() -> Result<(), Error> {
986+
let (mut spec, mongo_process) = TestMongoHelper::new_spec(None).await?;
987+
spec.max_request_permits = Some(0);
988+
989+
assert_eq!(
990+
Error::new(
991+
Code::InvalidArgument,
992+
"max_request_permits was set to zero, which will block mongo_store from working at all"
993+
.to_string()
994+
),
995+
TestMongoHelper::new_with_spec_and_process(spec, mongo_process)
996+
.await
997+
.unwrap_err()
998+
);
999+
Ok(())
1000+
}
1001+
1002+
#[nativelink_test]
1003+
async fn single_request_permit() -> Result<(), Error> {
1004+
let (mut spec, mongo_process) = TestMongoHelper::new_spec(None).await?;
1005+
spec.max_request_permits = Some(1);
1006+
let helper = TestMongoHelper::new_with_spec_and_process(spec, mongo_process).await?;
1007+
1008+
let data = Bytes::from_static(b"14");
1009+
let digest = DigestInfo::try_new(VALID_HASH1, 2)?;
1010+
helper.store.update_oneshot(digest, data.clone()).await?;
1011+
let result = helper.store.has(digest).await?;
1012+
assert!(
1013+
result.is_some(),
1014+
"Expected mongo store to have hash: {VALID_HASH1}",
1015+
);
1016+
1017+
assert!(logs_contain(
1018+
"Number of waiting permits for Mongo waiting=0"
1019+
));
1020+
1021+
Ok(())
1022+
}

0 commit comments

Comments
 (0)