Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
The `aws_s3` source now uses exponential backoff when retrying failed SQS `receive_message` operations. Previously, the source used a fixed 500ms delay between retries.

The new behavior starts at 500ms and doubles with each consecutive failure, capping at 30 seconds; the delay resets to 500ms after a successful receive. This avoids excessive API calls during prolonged AWS SQS outages, when IAM permissions are invalid, or while requests are being throttled, and the source still recovers quickly once the service is available again.

authors: medzin pront

Check failure

Code scanning / check-spelling

Unrecognized Spelling Error

medzin is not a recognized word. (unrecognized-spelling)
37 changes: 37 additions & 0 deletions src/common/backoff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,40 @@ impl Iterator for ExponentialBackoff {
Some(duration)
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the backoff used by every test here: the first delay is 500ms
    /// and no delay may exceed 30 seconds.
    fn capped_backoff() -> ExponentialBackoff {
        ExponentialBackoff::from_millis(500).max_delay(Duration::from_secs(30))
    }

    /// The delays double on every step until they hit the configured cap,
    /// after which the cap is returned on every further step.
    #[test]
    fn test_exponential_backoff_sequence() {
        let mut backoff = capped_backoff();

        // Expected delays in milliseconds. Doubling 16s would give 32s, so the
        // last two entries are the 30s cap.
        let expected_millis: [u64; 8] = [500, 1_000, 2_000, 4_000, 8_000, 16_000, 30_000, 30_000];

        for millis in expected_millis {
            let delay = backoff.next().unwrap();
            assert_eq!(delay, Duration::from_millis(millis));
        }
    }

    /// After `reset()`, the sequence starts over from the initial delay.
    #[test]
    fn test_backoff_reset() {
        let mut backoff = capped_backoff();

        // Discard the first two delays (500ms and 1s) so the backoff has advanced.
        let _ = backoff.next();
        let _ = backoff.next();

        let third = backoff.next().unwrap();
        assert_eq!(third, Duration::from_secs(2));

        backoff.reset();

        let first_after_reset = backoff.next().unwrap();
        assert_eq!(first_after_reset, Duration::from_millis(500));
    }
}
43 changes: 32 additions & 11 deletions src/sources/aws_s3/sqs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::{
num::NonZeroUsize,
panic,
sync::{Arc, LazyLock},
time::Duration,
};

use aws_sdk_s3::{Client as S3Client, operation::get_object::GetObjectError};
Expand Down Expand Up @@ -43,6 +44,7 @@ use crate::{
SourceSender,
aws::AwsTimeout,
codecs::Decoder,
common::backoff::ExponentialBackoff,
config::{SourceAcknowledgementsConfig, SourceContext},
event::{BatchNotifier, BatchStatus, EstimatedJsonEncodedSizeOf, Event, LogEvent},
internal_events::{
Expand Down Expand Up @@ -381,6 +383,7 @@ pub struct IngestorProcess {
log_namespace: LogNamespace,
bytes_received: Registered<BytesReceived>,
events_received: Registered<EventsReceived>,
backoff: ExponentialBackoff,
}

impl IngestorProcess {
Expand All @@ -399,6 +402,7 @@ impl IngestorProcess {
log_namespace,
bytes_received: register!(BytesReceived::from(Protocol::HTTP)),
events_received: register!(EventsReceived),
backoff: ExponentialBackoff::from_millis(500).max_delay(Duration::from_secs(30)),
}
}

Expand All @@ -409,23 +413,39 @@ impl IngestorProcess {
loop {
select! {
_ = &mut shutdown => break,
_ = self.run_once() => {},
result = self.run_once() => {
match result {
Ok(()) => {
// Reset backoff on successful receive
self.backoff.reset();
}
Err(_) => {
let delay = self.backoff.next().expect("backoff never ends");
trace!(
message = "run_once failed, will retry after delay",
delay_ms = delay.as_millis()
);
tokio::time::sleep(delay).await;
}
}
},
}
}
}

async fn run_once(&mut self) {
let messages = self.receive_messages().await;
let messages = messages
.inspect(|messages| {
async fn run_once(&mut self) -> Result<(), ()> {
let messages = match self.receive_messages().await {
Ok(messages) => {
emit!(SqsMessageReceiveSucceeded {
count: messages.len(),
});
})
.inspect_err(|err| {
emit!(SqsMessageReceiveError { error: err });
})
.unwrap_or_default();
messages
}
Err(err) => {
emit!(SqsMessageReceiveError { error: &err });
return Err(());
}
};

let mut delete_entries = Vec::new();
let mut deferred_entries = Vec::new();
Expand Down Expand Up @@ -521,7 +541,7 @@ impl IngestorProcess {
message = "Deferred queue not configured, but received deferred entries.",
internal_log_rate_limit = true
);
return;
return Ok(());
};
let cloned_entries = deferred_entries.clone();
match self
Expand Down Expand Up @@ -576,6 +596,7 @@ impl IngestorProcess {
}
}
}
Ok(())
}

async fn handle_sqs_message(&mut self, message: Message) -> Result<(), ProcessingError> {
Expand Down
Loading