
Commit ed5d40c
feat: separate out write path executor with unbounded memory
Currently, when an OOM occurs while snapshotting, the process keeps going without crashing. This behaviour is observed on main (commit: be25c6f), and it means the WAL files keep accumulating to the point that a restart can never replay them all. The cause is how memory is distributed: in Enterprise especially, there is no reason for an ingester to allocate only 20% of memory to the DataFusion memory pool (which runs the snapshot), since the parquet cache is not in use at all. That 20% is too conservative for an ingester. So, instead of redistributing the memory settings based on the mode the server runs in, this commit introduces a separate write path executor with no bound on memory (it still uses `GreedyMemoryPool` under the hood, with `usize::MAX` as the upper limit). As a result, when the write path executor does run out of memory, it is a real OOM that stops the whole process. It is also important to let the snapshotting process use as much memory as it needs: without that, the buffer keeps growing and runs into OOM anyway.

closes: #26422
1 parent be25c6f commit ed5d40c
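
For context on the memory claim above, here is a minimal standalone sketch of how a `GreedyMemoryPool` capped at `usize::MAX` behaves, using DataFusion's memory-pool API directly. The consumer name and reservation size are illustrative, not from the commit; influxdb3 wires the pool up through iox's `Executor` rather than constructing it by hand like this.

```rust
use std::sync::Arc;

use datafusion::execution::memory_pool::{GreedyMemoryPool, MemoryConsumer, MemoryPool};

fn main() {
    // A GreedyMemoryPool with usize::MAX as its limit still tracks
    // reservations, but will never refuse one in practice, so DataFusion
    // operators never see a resources-exhausted error. If memory truly
    // runs out, the OS OOM-kills the whole process instead.
    let pool: Arc<dyn MemoryPool> = Arc::new(GreedyMemoryPool::new(usize::MAX));

    // "snapshot" is an illustrative consumer name, not one from the commit.
    let mut reservation = MemoryConsumer::new("snapshot").register(&pool);
    reservation
        .try_grow(10 * 1024 * 1024 * 1024) // a 10 GiB reservation succeeds
        .expect("effectively unbounded pool should not refuse growth");

    println!("reserved: {} bytes", pool.reserved());
}
```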

2 files changed: +49 −4 lines

influxdb3/src/commands/serve.rs

Lines changed: 45 additions & 4 deletions
```diff
@@ -1,7 +1,6 @@
 //! Entrypoint for InfluxDB 3 Core Server
 
 use anyhow::{Context, bail};
-use datafusion_util::config::register_iox_object_store;
 use futures::{FutureExt, future::FusedFuture, pin_mut};
 use influxdb3_authz::TokenAuthenticator;
 use influxdb3_cache::{
```
```diff
@@ -546,6 +545,20 @@ pub async fn command(config: Config) -> Result<()> {
     let f = SendPanicsToTracing::new_with_metrics(&metrics);
     std::mem::forget(f);
 
+    // When you have an extra executor, you need a separate metrics registry!
+    // It is not clear what the impact would be.
+    // TODO: confirm this is not going to mess up downstream metrics consumers
+    let write_path_metrics = setup_metric_registry();
+
+    // Install custom panic handler and forget about it.
+    //
+    // This leaks the handler and prevents it from ever being dropped during the
+    // lifetime of the program - this is actually a good thing, as it prevents
+    // the panic handler from being removed while unwinding a panic (which in
+    // turn, causes a panic - see #548)
+    let write_path_panic_handler_fn = SendPanicsToTracing::new_with_metrics(&write_path_metrics);
+    std::mem::forget(write_path_panic_handler_fn);
+
     // Construct a token to trigger clean shutdown
     let frontend_shutdown = CancellationToken::new();
     let shutdown_manager = ShutdownManager::new(frontend_shutdown.clone());
```
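
The two `std::mem::forget` calls above are the usual install-and-leak guard pattern. A minimal sketch of the idea follows; `PanicHookGuard` is a hypothetical stand-in for `SendPanicsToTracing`, whose definition is not part of this diff.

```rust
// Hypothetical stand-in for SendPanicsToTracing: installs a panic hook on
// construction and would restore the previous hook on drop.
struct PanicHookGuard;

impl PanicHookGuard {
    fn install() -> Self {
        std::panic::set_hook(Box::new(|info| eprintln!("panic observed: {info}")));
        PanicHookGuard
    }
}

impl Drop for PanicHookGuard {
    fn drop(&mut self) {
        // Removing the hook here is exactly what forgetting the guard avoids:
        // the hook must stay installed even while a panic is unwinding, or
        // reporting the panic can itself panic.
        let _ = std::panic::take_hook();
    }
}

fn main() {
    let guard = PanicHookGuard::install();
    // Leak the guard so Drop never runs and the hook lives as long as the
    // process does - the same reason serve.rs forgets both handler guards.
    std::mem::forget(guard);
}
```
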
```diff
@@ -619,8 +632,36 @@ pub async fn command(config: Config) -> Result<()> {
             Arc::clone(&metrics),
         ),
     ));
-    let runtime_env = exec.new_context().inner().runtime_env();
-    register_iox_object_store(runtime_env, parquet_store.id(), Arc::clone(&object_store));
+
+    // Note: using the same metrics registry causes a runtime panic.
+    let write_path_executor = Arc::new(Executor::new_with_config_and_executor(
+        ExecutorConfig {
+            // should this be divided? or should this contend for threads with the
+            // executor that's set up for querying only?
+            target_query_partitions: tokio_datafusion_config.num_threads.unwrap(),
+            object_stores: [&parquet_store]
+                .into_iter()
+                .map(|store| (store.id(), Arc::clone(store.object_store())))
+                .collect(),
+            metric_registry: Arc::clone(&write_path_metrics),
+            // use as much memory as needed for persistence; could this be an
+            // UnboundedMemoryPool?
+            mem_pool_size: usize::MAX,
+            // These are new additions; just skimming through the code, it does not
+            // look like we can achieve the same effect as having a separate executor.
+            // It looks like it applies to "all" queries; it'd be nice to have a filter
+            // to say "when the query matches this pattern, apply these limits". If
+            // that's possible, maybe we could avoid creating a separate executor.
+            per_query_mem_pool_config: PerQueryMemoryPoolConfig::Disabled,
+            heap_memory_limit: None,
+        },
+        DedicatedExecutor::new(
+            "datafusion_write_path",
+            tokio_datafusion_config
+                .builder()
+                .map_err(Error::TokioRuntime)?,
+            Arc::clone(&write_path_metrics),
+        ),
+    ));
 
     let trace_header_parser = TraceHeaderParser::new()
         .with_jaeger_trace_context_header_name(
```
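
As a rough, hypothetical illustration (not part of the commit): the runtime that `mem_pool_size: usize::MAX` produces can be inspected through the executor's session context, via the same `new_context().inner().runtime_env()` path the removed lines used. This assumes `Executor` comes from `iox_query::exec`, as elsewhere in this crate.

```rust
use std::sync::Arc;

use iox_query::exec::Executor;

// Hypothetical helper, not in the commit: peek at the DataFusion runtime
// behind an executor the way the removed register_iox_object_store code
// did, and report the memory pool's current usage.
fn log_write_path_pool(write_path_executor: &Arc<Executor>) {
    let runtime_env = write_path_executor.new_context().inner().runtime_env();
    // nothing is reserved yet; with usize::MAX the pool effectively never
    // refuses growth, so failures surface as a process-level OOM instead
    println!("reserved: {} bytes", runtime_env.memory_pool.reserved());
}
```
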
```diff
@@ -685,7 +726,7 @@ pub async fn command(config: Config) -> Result<()> {
         last_cache,
         distinct_cache,
         time_provider: Arc::<SystemProvider>::clone(&time_provider),
-        executor: Arc::clone(&exec),
+        executor: Arc::clone(&write_path_executor),
         wal_config,
         parquet_cache,
         metric_registry: Arc::clone(&metrics),
```

influxdb3_wal/src/lib.rs

Lines changed: 4 additions & 0 deletions
```diff
@@ -16,6 +16,7 @@ use influxdb_line_protocol::FieldValue;
 use influxdb3_id::{ColumnId, DbId, SerdeVecMap, TableId};
 use influxdb3_shutdown::ShutdownToken;
 use iox_time::Time;
+use observability_deps::tracing::error;
 use serde::{Deserialize, Serialize};
 use serde_with::serde_as;
 use std::cmp::Ordering;
```
```diff
@@ -556,6 +557,9 @@ pub fn background_wal_flush<W: Wal>(
         {
             let snapshot_wal = Arc::clone(&wal);
             tokio::spawn(async move {
+                // since we're using a separate executor with unlimited memory,
+                // the errors here will never be due to lack of resources; only
+                // the OS can OOM-kill the whole process
                 let snapshot_details = snapshot_complete.await.expect("snapshot failed");
                 assert_eq!(snapshot_info, snapshot_details);
```

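The `expect` in that spawned task is the fail-fast lever: once resource errors are off the table, a snapshot failure means a bug, and the process should crash loudly rather than keep accumulating WAL files. Below is a self-contained sketch of the same await-and-panic pattern, with an illustrative oneshot channel standing in for the real snapshot completion future.

```rust
use tokio::sync::oneshot;

#[tokio::main]
async fn main() {
    // Illustrative stand-in for the snapshot completion future.
    let (tx, snapshot_complete) = oneshot::channel::<u64>();

    tokio::spawn(async move {
        // pretend the snapshot work finished and report its sequence number
        let _ = tx.send(42);
    });

    // Mirrors background_wal_flush: since the write path executor can no
    // longer return resource-exhausted errors, any failure here is a bug,
    // so panic (and let the leaked panic hook report it) instead of
    // carrying on with an ever-growing buffer.
    let snapshot_details = snapshot_complete.await.expect("snapshot failed");
    assert_eq!(snapshot_details, 42);
}
```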