Skip to content

Commit 350dc92

Browse files
committed
feat: add config ingester_traffic_overflow_action
1 parent 4f96fe9 commit 350dc92

File tree

7 files changed

+174
-15
lines changed

7 files changed

+174
-15
lines changed

agent/crates/public/src/queue/overwrite_queue.rs

+12
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ struct OverwriteQueue<T: Sized> {
5555

5656
counter: Counter,
5757

58+
total_overwritten_count: AtomicU64,
59+
5860
_marker: PhantomData<T>,
5961
}
6062

@@ -78,6 +80,7 @@ impl<T> OverwriteQueue<T> {
7880
notify: Condvar::new(),
7981
terminated: AtomicBool::new(false),
8082
counter: Counter::default(),
83+
total_overwritten_count: AtomicU64::new(0),
8184
_marker: PhantomData,
8285
}
8386
}
@@ -130,6 +133,8 @@ impl<T> OverwriteQueue<T> {
130133
self.counter
131134
.overwritten
132135
.fetch_add(to_overwrite as u64, Ordering::Relaxed);
136+
self.total_overwritten_count
137+
.fetch_add(to_overwrite as u64, Ordering::Relaxed);
133138
}
134139
}
135140
let free_after_end = self.size - (raw_end & (self.size - 1));
@@ -416,6 +421,13 @@ impl<T> Receiver<T> {
416421
}
417422
}
418423
}
424+
425+
pub fn total_overwritten_count(&self) -> u64 {
426+
self.counter()
427+
.queue
428+
.total_overwritten_count
429+
.load(Ordering::Relaxed)
430+
}
419431
}
420432

421433
impl<T> Drop for Receiver<T> {

agent/src/config/config.rs

+2
Original file line numberDiff line numberDiff line change
@@ -2129,6 +2129,7 @@ pub struct Communication {
21292129
#[serde(deserialize_with = "deser_usize_with_mega_unit")]
21302130
pub grpc_buffer_size: usize,
21312131
pub max_throughput_to_ingester: u64,
2132+
pub ingester_traffic_overflow_action: u8, // 0: wait, 1: drop
21322133
pub request_via_nat_ip: bool,
21332134
pub proxy_controller_ip: String,
21342135
pub proxy_controller_port: u16,
@@ -2145,6 +2146,7 @@ impl Default for Communication {
21452146
ingester_port: 30033,
21462147
grpc_buffer_size: 5 << 20,
21472148
max_throughput_to_ingester: 100,
2149+
ingester_traffic_overflow_action: 0,
21482150
request_via_nat_ip: false,
21492151
}
21502152
}

agent/src/config/handler.rs

+17-1
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,8 @@ pub struct SenderConfig {
231231
pub npb_bps_threshold: u64,
232232
pub npb_socket_type: agent::SocketType,
233233
pub multiple_sockets_to_ingester: bool,
234-
pub max_throughput_to_ingester: u64, // unit: Mbps
234+
pub max_throughput_to_ingester: u64, // unit: Mbps
235+
pub ingester_traffic_overflow_action: u8, // 0: wait, 1: drop
235236
pub collector_socket_type: agent::SocketType,
236237
pub standalone_data_file_size: u32,
237238
pub standalone_data_file_dir: String,
@@ -1740,6 +1741,10 @@ impl TryFrom<(Config, UserConfig)> for ModuleConfig {
17401741
.throughput_monitoring_interval,
17411742
multiple_sockets_to_ingester: conf.outputs.socket.multiple_sockets_to_ingester,
17421743
max_throughput_to_ingester: conf.global.communication.max_throughput_to_ingester,
1744+
ingester_traffic_overflow_action: conf
1745+
.global
1746+
.communication
1747+
.ingester_traffic_overflow_action,
17431748
collector_socket_type: conf.outputs.socket.data_socket_type,
17441749
standalone_data_file_size: conf.global.standalone_mode.max_data_file_size,
17451750
standalone_data_file_dir: conf.global.standalone_mode.data_file_dir.clone(),
@@ -3837,6 +3842,17 @@ impl ConfigHandler {
38373842
);
38383843
communication.max_throughput_to_ingester = new_communication.max_throughput_to_ingester;
38393844
}
3845+
if communication.ingester_traffic_overflow_action
3846+
!= new_communication.ingester_traffic_overflow_action
3847+
{
3848+
info!(
3849+
"Update global.communication.ingester_traffic_overflow_action from {:?} to {:?}.",
3850+
communication.ingester_traffic_overflow_action,
3851+
new_communication.ingester_traffic_overflow_action
3852+
);
3853+
communication.ingester_traffic_overflow_action =
3854+
new_communication.ingester_traffic_overflow_action;
3855+
}
38403856
if communication.ingester_ip != new_communication.ingester_ip {
38413857
info!(
38423858
"Update global.communication.ingester_ip from {:?} to {:?}.",

agent/src/sender/uniform_sender.rs

+44-10
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ pub struct SenderCounter {
5353
pub tx: AtomicU64,
5454
pub tx_bytes: AtomicU64,
5555
pub dropped: AtomicU64,
56+
pub waited: AtomicU64,
5657
}
5758

5859
impl RefCountable for SenderCounter {
@@ -89,6 +90,11 @@ impl RefCountable for SenderCounter {
8990
CounterType::Counted,
9091
CounterValue::Unsigned(self.dropped.swap(0, Ordering::Relaxed)),
9192
),
93+
(
94+
"waited",
95+
CounterType::Counted,
96+
CounterValue::Unsigned(self.waited.swap(0, Ordering::Relaxed)),
97+
),
9298
]
9399
}
94100
}
@@ -380,6 +386,7 @@ pub struct UniformSender<T> {
380386

381387
input: Arc<Receiver<T>>,
382388
counter: Arc<SenderCounter>,
389+
overwritten_count: u64,
383390

384391
encoder: Encoder<T>,
385392
private_conn: Mutex<Connection>,
@@ -426,6 +433,7 @@ impl<T: Sendable> UniformSender<T> {
426433
name,
427434
input,
428435
counter: Arc::new(SenderCounter::default()),
436+
overwritten_count: 0,
429437
encoder: Encoder::new(
430438
0,
431439
SendMessageType::TaggedFlow,
@@ -626,7 +634,11 @@ impl<T: Sendable> UniformSender<T> {
626634
}
627635
}
628636

629-
fn is_exceed_max_throughput(&mut self, max_throughput_mbps: u64) -> bool {
637+
fn is_exceed_max_throughput(
638+
&mut self,
639+
max_throughput_mbps: u64,
640+
ingester_traffic_overflow_action: u8,
641+
) -> bool {
630642
if max_throughput_mbps == 0 {
631643
return false;
632644
}
@@ -637,7 +649,7 @@ impl<T: Sendable> UniformSender<T> {
637649
.unwrap();
638650

639651
let used = now - Duration::from_nanos(SENT_START_DURATION.load(Ordering::Relaxed));
640-
if used > Duration::from_secs(1) {
652+
if used >= Duration::from_secs(1) {
641653
SENT_START_DURATION.store(now.as_nanos() as u64, Ordering::Relaxed);
642654
TOTAL_SENT_BYTES.store(0, Ordering::Relaxed);
643655
} else {
@@ -646,11 +658,23 @@ impl<T: Sendable> UniformSender<T> {
646658
> Duration::from_secs(5)
647659
{
648660
warn!(
649-
"{} sender dropping message, throughput execeed setting value 'max_throughput_to_ingester' {}Mbps",
650-
self.name, max_throughput_mbps
661+
"{} sender dropping message, throughput execeed setting value 'max_throughput_to_ingester' {}Mbps, action {} (0: wait, 1: drop), total overwrittern count {}",
662+
self.name, max_throughput_mbps, ingester_traffic_overflow_action, self.overwritten_count
651663
);
652664
LAST_LOGGING_DURATION.store(now.as_nanos() as u64, Ordering::Relaxed);
653665
}
666+
// action is wait
667+
if ingester_traffic_overflow_action == 0 {
668+
thread::sleep(Duration::from_secs(1) - used);
669+
// if the queue loses data, need to report the exception
670+
if self.input.total_overwritten_count() > self.overwritten_count {
671+
self.exception_handler
672+
.set(Exception::DataBpsThresholdExceeded);
673+
self.overwritten_count = self.input.total_overwritten_count();
674+
}
675+
return true;
676+
}
677+
// action is drop
654678
self.exception_handler
655679
.set(Exception::DataBpsThresholdExceeded);
656680
return true;
@@ -678,6 +702,7 @@ impl<T: Sendable> UniformSender<T> {
678702
let config = self.config.load();
679703
let socket_type = config.collector_socket_type;
680704
let max_throughput_mpbs = config.max_throughput_to_ingester;
705+
let ingester_traffic_overflow_action = config.ingester_traffic_overflow_action;
681706
match self.input.recv_all(
682707
&mut batch,
683708
Some(Duration::from_secs(Self::QUEUE_READ_TIMEOUT)),
@@ -688,12 +713,21 @@ impl<T: Sendable> UniformSender<T> {
688713
start_cached = Instant::now();
689714
self.cached = false;
690715
}
691-
if self.is_exceed_max_throughput(max_throughput_mpbs) {
692-
self.counter
693-
.dropped
694-
.fetch_add(batch.len() as u64, Ordering::Relaxed);
695-
batch.clear();
696-
continue;
716+
if self.is_exceed_max_throughput(
717+
max_throughput_mpbs,
718+
ingester_traffic_overflow_action,
719+
) {
720+
if ingester_traffic_overflow_action == 0 {
721+
self.counter
722+
.waited
723+
.fetch_add(batch.len() as u64, Ordering::Relaxed);
724+
} else {
725+
self.counter
726+
.dropped
727+
.fetch_add(batch.len() as u64, Ordering::Relaxed);
728+
batch.clear();
729+
continue;
730+
}
697731
}
698732
for send_item in batch.drain(..) {
699733
if !self.running.load(Ordering::Relaxed) {

server/agent_config/README-CH.md

+35-1
Original file line numberDiff line numberDiff line change
@@ -1051,9 +1051,43 @@ global:
10511051
**详细描述**:
10521052

10531053
向 Server 端 Ingester 模块发送可观测性数据的最大允许流量,
1054-
超过此限速时数据将会主动丢弃、且采集器会标记为异常状态并触发告警。
1054+
若`ingester_traffic_overflow_action` 配置为`丢弃`,超过此限速时数据将会主动丢弃、且采集器会标记为异常状态并触发告警。
10551055
配置为 0 表示不限速。
10561056

1057+
### Ingester 流量超限的动作 {#global.communication.ingester_traffic_overflow_action}
1058+
1059+
**标签**:
1060+
1061+
`hot_update`
1062+
1063+
**FQCN**:
1064+
1065+
`global.communication.ingester_traffic_overflow_action`
1066+
1067+
**默认值**:
1068+
```yaml
1069+
global:
1070+
communication:
1071+
ingester_traffic_overflow_action: 0
1072+
```
1073+
1074+
**枚举可选值**:
1075+
| Value | Note |
1076+
| ----- | ---------------------------- |
1077+
| 0 | 等待 |
1078+
| 1 | 丢弃 |
1079+
1080+
**模式**:
1081+
| Key | Value |
1082+
| ---- | ---------------------------- |
1083+
| Type | int |
1084+
1085+
**详细描述**:
1086+
1087+
Ingester 流量超限的动作
1088+
- 等待:暂停发送,数据缓存到队列,等待下次发送。
1089+
- 丢弃:直接丢弃数据,并触发 Agent `数据流量达到限速`异常。
1090+
10571091
### 请求 NAT IP 地址 {#global.communication.request_via_nat_ip}
10581092

10591093
**标签**:

server/agent_config/README.md

+36-1
Original file line numberDiff line numberDiff line change
@@ -1075,10 +1075,45 @@ global:
10751075
**Description**:
10761076

10771077
The maximum allowed flow rate for sending observability data to the server-side Ingester module.
1078-
When this rate limit is exceeded, the data will be actively discarded,
1078+
If `ingester_traffic_overflow_action` is set to `drop`,
1079+
when this rate limit is exceeded, the data will be actively discarded,
10791080
and the agent will be marked as abnormal and an alarm will be triggered.
10801081
Setting it to 0 means no speed limit.
10811082

1083+
### Action when the Ingester traffic exceeds the limit {#global.communication.ingester_traffic_overflow_action}
1084+
1085+
**Tags**:
1086+
1087+
`hot_update`
1088+
1089+
**FQCN**:
1090+
1091+
`global.communication.ingester_traffic_overflow_action`
1092+
1093+
**Default value**:
1094+
```yaml
1095+
global:
1096+
communication:
1097+
ingester_traffic_overflow_action: 0
1098+
```
1099+
1100+
**Enum options**:
1101+
| Value | Note |
1102+
| ----- | ---------------------------- |
1103+
| 0 | wait |
1104+
| 1 | drop |
1105+
1106+
**Schema**:
1107+
| Key | Value |
1108+
| ---- | ---------------------------- |
1109+
| Type | int |
1110+
1111+
**Description**:
1112+
1113+
Action when the Ingester traffic exceeds the limit
1114+
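- wait: pause sending and keep the data cached in the queue until the next sending window; if the queue overflows and data is lost while waiting, the Agent `DATA_BPS_THRESHOLD_EXCEEDED` exception is triggered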
1115+
- drop: discard the data immediately and trigger the Agent `DATA_BPS_THRESHOLD_EXCEEDED` exception
1116+
10821117
### Request via NAT IP Address {#global.communication.request_via_nat_ip}
10831118

10841119
**Tags**:

server/agent_config/template.yaml

+28-2
Original file line numberDiff line numberDiff line change
@@ -713,14 +713,40 @@ global:
713713
# description:
714714
# en: |-
715715
# The maximum allowed flow rate for sending observability data to the server-side Ingester module.
716-
# When this rate limit is exceeded, the data will be actively discarded,
716+
# If `ingester_traffic_overflow_action` is set to `drop`,
717+
# when this rate limit is exceeded, the data will be actively discarded,
717718
# and the agent will be marked as abnormal and an alarm will be triggered.
718719
# Setting it to 0 means no speed limit.
719720
# ch: |-
720721
# 向 Server 端 Ingester 模块发送可观测性数据的最大允许流量,
721-
# 超过此限速时数据将会主动丢弃、且采集器会标记为异常状态并触发告警。
722+
# 若`ingester_traffic_overflow_action` 配置为`丢弃`,超过此限速时数据将会主动丢弃、且采集器会标记为异常状态并触发告警。
722723
# 配置为 0 表示不限速。
723724
max_throughput_to_ingester: 100
725+
# type: int
726+
# name:
727+
# en: Action when the Ingester traffic exceeds the limit
728+
# ch: Ingester 流量超限的动作
729+
# unit:
730+
# range: []
731+
# enum_options:
732+
# - 0:
733+
# en: wait
734+
# ch: 等待
735+
# - 1:
736+
# en: drop
737+
# ch: 丢弃
738+
# modification: hot_update
739+
# ee_feature: false
740+
# description:
741+
# en: |-
742+
# Action when the Ingester traffic exceeds the limit
743+
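# - wait: pause sending and keep the data cached in the queue until the next sending window; if the queue overflows and data is lost while waiting, the Agent `DATA_BPS_THRESHOLD_EXCEEDED` exception is triggered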
744+
# - drop: discard the data immediately and trigger the Agent `DATA_BPS_THRESHOLD_EXCEEDED` exception
745+
# ch: |-
746+
# Ingester 流量超限的动作
747+
# - 等待:暂停发送,数据缓存到队列,等待下次发送。
748+
# - 丢弃:直接丢弃数据,并触发 Agent `数据流量达到限速`异常。
749+
ingester_traffic_overflow_action: 0
724750
# type: bool
725751
# name:
726752
# en: Request via NAT IP Address

0 commit comments

Comments
 (0)