- use crate::paths::ParquetFilePath;
  use crate::persister::Persister;
  use crate::write_buffer::persisted_files::PersistedFiles;
  use crate::write_buffer::table_buffer::TableBuffer;
  use crate::{ChunkFilter, ParquetFile, ParquetFileId, PersistedSnapshot};
  use crate::{chunk::BufferChunk, write_buffer::table_buffer::SnaphotChunkIter};
+ use crate::{paths::ParquetFilePath, write_buffer::table_buffer::array_ref_nulls_for_type};
  use anyhow::Context;
- use arrow::{
-     array::{AsArray, UInt64Array},
-     compute::take,
-     datatypes::TimestampNanosecondType,
-     record_batch::RecordBatch,
- };
+ use arrow::record_batch::RecordBatch;
  use async_trait::async_trait;
  use data_types::{
      ChunkId, ChunkOrder, PartitionHashId, PartitionId, PartitionKey, TimestampMinMax,
@@ -24,7 +19,10 @@ use influxdb3_cache::parquet_cache::{CacheRequest, ParquetCacheOracle};
  use influxdb3_cache::{distinct_cache::DistinctCacheProvider, last_cache::LastCacheProvider};
  use influxdb3_catalog::catalog::{Catalog, DatabaseSchema, TableDefinition};
  use influxdb3_id::{DbId, TableId};
- use influxdb3_wal::{CatalogOp, SnapshotDetails, WalContents, WalFileNotifier, WalOp, WriteBatch};
+ use influxdb3_wal::{
+     CatalogOp, SnapshotDetails, WalContents, WalFileNotifier, WalFileSequenceNumber, WalOp,
+     WriteBatch,
+ };
  use iox_query::QueryChunk;
  use iox_query::chunk_statistics::{NoColumnRanges, create_chunk_statistics};
  use iox_query::exec::Executor;
@@ -36,9 +34,9 @@ use parking_lot::RwLock;
  use parquet::format::FileMetaData;
  use schema::Schema;
  use schema::sort::SortKey;
- use std::sync::Arc;
+ use std::any::Any;
  use std::time::Duration;
- use std::{any::Any, collections::BTreeMap};
+ use std::{iter::Peekable, slice::Iter, sync::Arc};
  use tokio::sync::oneshot::{self, Receiver};
  use tokio::task::JoinSet;

@@ -217,6 +215,7 @@ impl QueryableBuffer {

                  let persist_job = PersistJob {
                      database_id: *database_id,
+                     database_name: Arc::clone(&db_schema.name),
                      table_id: *table_id,
                      table_name: Arc::clone(&table_name),
                      chunk_time: chunk.chunk_time,
@@ -231,14 +230,13 @@ impl QueryableBuffer {
                          None,
                      ),
                      // these clones are cheap and done one at a time
-                     batch: chunk.record_batch.clone(),
+                     batch: vec![chunk.record_batch.clone()],
                      schema: chunk.schema.clone(),
                      timestamp_min_max: chunk.timestamp_min_max,
                      sort_key: sort_key.clone(),
                  };
                  persisting_chunks.push(persist_job);
-                 snapshot_chunks.push_back(chunk);
-                 // snapshot_chunks.add_one(chunk);
+                 snapshot_chunks.push(chunk);
                  debug!(">>> finished with chunk");
              }
          }
@@ -322,6 +320,9 @@ impl QueryableBuffer {
              )));

              sort_dedupe_parallel(
+                 Arc::from(persister.node_identifier_prefix()),
+                 wal_file_number,
+                 Arc::clone(&catalog),
                  persist_jobs,
                  &persister,
                  executor,
@@ -421,7 +422,11 @@ impl QueryableBuffer {
      }
  }

+ #[allow(clippy::too_many_arguments)]
  async fn sort_dedupe_parallel(
+     host_prefix: Arc<str>,
+     wal_file_number: WalFileSequenceNumber,
+     catalog: Arc<Catalog>,
      persist_jobs: Vec<PersistJob>,
      persister: &Arc<Persister>,
      executor: Arc<Executor>,
@@ -430,10 +435,16 @@ async fn sort_dedupe_parallel(
      persisted_files: Arc<PersistedFiles>,
      persisted_snapshot: Arc<Mutex<PersistedSnapshot>>,
  ) {
-     // if gen1 duration is 1m we should combine upto 10 of them
-     // to create a single parquet file
+     let iterator = PersistJobGroupedIterator::new(
+         &persist_jobs,
+         Arc::clone(&host_prefix),
+         wal_file_number,
+         Arc::clone(&catalog),
+         10,
+     );
+
      let mut set = JoinSet::new();
-     for persist_job in persist_jobs {
+     for persist_job in iterator {
          let persister = Arc::clone(persister);
          let executor = Arc::clone(&executor);
          let persisted_snapshot = Arc::clone(&persisted_snapshot);
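
A note on the grouping above: the comment removed in this hunk states the intent (with a 1m gen1 duration, up to 10 chunk-sized persist jobs for the same table get combined into one parquet file), and the new `PersistJobGroupedIterator` implements it. Below is a minimal, self-contained sketch of the same peek-and-group pattern; the `GroupedByKey` type and the `(table_id, value)` toy data are hypothetical, not the PR's code:

```rust
use std::iter::Peekable;
use std::slice::Iter;

/// Groups consecutive `(table_id, value)` pairs that share a table_id,
/// up to `chunk_size` items per group.
struct GroupedByKey<'a> {
    iter: Peekable<Iter<'a, (u32, i64)>>,
    chunk_size: usize,
}

impl Iterator for GroupedByKey<'_> {
    type Item = (u32, Vec<i64>);

    fn next(&mut self) -> Option<Self::Item> {
        // start a group with the next item, then keep peeking: only consume the
        // following item if it belongs to the same table and the group has room
        let (key, first) = *self.iter.next()?;
        let mut group = vec![first];
        while group.len() < self.chunk_size {
            match self.iter.peek() {
                Some((next_key, _)) if *next_key == key => {
                    group.push(self.iter.next().unwrap().1);
                }
                _ => break,
            }
        }
        Some((key, group))
    }
}

fn main() {
    let jobs = vec![(1, 10), (1, 20), (1, 30), (2, 40), (2, 50)];
    let grouped = GroupedByKey { iter: jobs.iter().peekable(), chunk_size: 10 };
    for (table_id, values) in grouped {
        // prints: table 1: [10, 20, 30], then table 2: [40, 50]
        println!("table {table_id}: {values:?}");
    }
}
```

Peeking before consuming lets the iterator close a group at the first job belonging to a different table without dropping that job, which is presumably why the PR's iterator is built on `Peekable` as well.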
@@ -560,34 +571,7 @@ async fn sort_dedupe_serial(
              }
          }

-         persisted_snapshot
-             .add_parquet_file(database_id, table_id, parquet_file)
-     }
- }
-
- #[derive(Debug)]
- struct MinMax {
-     min: i64,
-     max: i64,
- }
-
- impl MinMax {
-     fn new(min: i64, max: i64) -> Self {
-         // this doesn't check if min < max, a lot of the times
-         // it's good to start with i64::MAX for min and i64::MIN
-         // for max in loops so this type unlike TimestampMinMax
-         // doesn't check this pre-condition
-         Self { min, max }
-     }
-
-     fn update(&mut self, other: i64) {
-         self.min = other.min(self.min);
-         self.max = other.max(self.max);
-     }
-
-     fn to_ts_min_max(&self) -> TimestampMinMax {
-         // at this point min < max
-         TimestampMinMax::new(self.min, self.max)
+         persisted_snapshot.add_parquet_file(database_id, table_id, parquet_file)
      }
  }

@@ -768,16 +752,176 @@ impl BufferState {
  #[derive(Debug)]
  struct PersistJob {
      database_id: DbId,
+     database_name: Arc<str>,
      table_id: TableId,
      table_name: Arc<str>,
      chunk_time: i64,
      path: ParquetFilePath,
-     batch: RecordBatch,
+     batch: Vec<RecordBatch>,
      schema: Schema,
      timestamp_min_max: TimestampMinMax,
      sort_key: SortKey,
  }

+ struct PersistJobGroupedIterator<'a> {
+     iter: Peekable<Iter<'a, PersistJob>>,
+     host_prefix: Arc<str>,
+     wal_file_number: WalFileSequenceNumber,
+     catalog: Arc<Catalog>,
+     chunk_size: usize,
+ }
+
+ impl<'a> PersistJobGroupedIterator<'a> {
+     fn new(
+         data: &'a [PersistJob],
+         host_prefix: Arc<str>,
+         wal_file_number: WalFileSequenceNumber,
+         catalog: Arc<Catalog>,
+         chunk_size: usize,
+     ) -> Self {
+         PersistJobGroupedIterator {
+             iter: data.iter().peekable(),
+             host_prefix: Arc::clone(&host_prefix),
+             wal_file_number,
+             catalog,
+             chunk_size,
+         }
+     }
+ }
+
+ impl Iterator for PersistJobGroupedIterator<'_> {
+     // This yields a grouped persist job. A group carries exactly the same
+     // fields, the only difference being that `batch` holds a vec of record
+     // batches, so the PersistJob type is reused for now. For clarity it might
+     // be better to introduce a separate type to represent the grouped state.
+     type Item = PersistJob;
+
+     fn next(&mut self) -> Option<Self::Item> {
+         let current_data = self.iter.next()?;
+         let current_table_id = &current_data.table_id;
+
+         let mut ts_min_max = current_data.timestamp_min_max;
+
+         let mut all_batches = Vec::with_capacity(self.chunk_size);
+         let mut all_schemas = Vec::with_capacity(self.chunk_size);
+         all_batches.extend_from_slice(&current_data.batch);
+         all_schemas.push(current_data.schema.clone());
+
+         let mut min_chunk_time = current_data.chunk_time;
+         // currently this naively assumes all batches have the same shape, but
+         // they may not - in that case we should use the most recent table
+         // definition to add null arrays for batches with missing columns.
+         while all_batches.len() < self.chunk_size {
+             if let Some(next_data) = self.iter.peek() {
+                 if next_data.table_id == *current_table_id {
+                     let next = self.iter.next().unwrap();
+                     ts_min_max = ts_min_max.union(&next.timestamp_min_max);
+                     min_chunk_time = min_chunk_time.min(next.chunk_time);
+                     all_batches.extend_from_slice(&next.batch);
+                     all_schemas.push(next.schema.clone());
+                 } else {
+                     break;
+                 }
+             } else {
+                 break;
+             }
+         }
+
+         // most recent table definition
+         let table_defn = self
+             .catalog
+             .db_schema_by_id(&current_data.database_id)?
+             .table_definition_by_id(&current_data.table_id)?;
+
+         let expected_schema = table_defn.schema.clone();
+         let batches_with_schema_mismatch: Vec<(usize, RecordBatch)> = all_batches
+             .iter()
+             .cloned()
+             .enumerate()
+             // TODO: check if these are in order..
+             .filter(|(idx, _)| {
+                 let schema = &all_schemas[*idx];
+                 for field_1 in expected_schema.iter() {
+                     let mut found_field = false;
+                     for field_2 in schema.iter() {
+                         if field_1.1.name() == field_2.1.name() {
+                             found_field = true;
+                             break;
+                         }
+                     }
+
+                     if !found_field {
+                         return true;
+                     }
+                 }
+                 false
+             })
+             .collect();
+
+         if !batches_with_schema_mismatch.is_empty() {
+             // we need to add the missing fields - since schema changes are additive,
+             // a mismatch means a new column has been added to the table but these
+             // batches are missing it.
+             for (idx, batch) in &batches_with_schema_mismatch {
+                 let mut cols = vec![];
+                 let new_schema = &table_defn.schema;
+                 // pick its current iox schema to add the columns (filling nulls for missing ones)
+                 let outdated_batch_schema = &all_schemas[*idx];
+                 debug!(
+                     ?outdated_batch_schema,
+                     ">>> outdated batch schema when aligning mismatched schema"
+                 );
+                 for col_idx_with_field_details in new_schema.iter().enumerate() {
+                     let (col_idx, (influx_col_type, field)) = col_idx_with_field_details;
+                     let batch_field = outdated_batch_schema.field_by_name(field.name());
+                     let len = batch.columns()[0].len();
+                     if batch_field.is_some() {
+                         let col = Arc::clone(&batch.columns()[col_idx]);
+                         cols.push(col);
+                     } else {
+                         let null_array_col = array_ref_nulls_for_type(influx_col_type, len);
+                         cols.push(null_array_col);
+                     }
+                 }
+
+                 let new_arrow_schema = new_schema.as_arrow();
+                 debug!(
+                     ?new_arrow_schema,
+                     ">>> new arrow schema for batch when aligning mismatched schema"
+                 );
+                 let new_rec_batch = RecordBatch::try_new(new_arrow_schema, cols).expect(
+                     "record batch to be created with new schema after fixing schema mismatch",
+                 );
+
+                 let _ = std::mem::replace(&mut all_batches[*idx], new_rec_batch);
+             }
+         }
+
+         Some(PersistJob {
+             database_id: current_data.database_id,
+             database_name: Arc::clone(&current_data.database_name),
+             table_id: current_data.table_id,
+             path: ParquetFilePath::new(
+                 &self.host_prefix,
+                 &current_data.database_name,
+                 current_data.database_id.as_u32(),
+                 &current_data.table_name,
+                 current_data.table_id.as_u32(),
+                 min_chunk_time,
+                 self.wal_file_number,
+                 None,
+             ),
+             table_name: Arc::clone(&current_data.table_name),
+             chunk_time: min_chunk_time,
+             batch: all_batches,
+             schema: current_data.schema.clone(),
+             timestamp_min_max: ts_min_max,
+             sort_key: current_data.sort_key.clone(),
+         })
+     }
+ }
+
  pub(crate) struct SortDedupePersistSummary {
      pub file_size_bytes: u64,
      pub file_meta_data: FileMetaData,
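
The schema-mismatch branch in `PersistJobGroupedIterator::next` pads older batches with null columns so every batch in a group matches the table's latest schema before they are compacted together. Below is a minimal sketch of that idea using plain arrow APIs (`new_null_array`, `Schema::column_with_name`) rather than the PR's `array_ref_nulls_for_type` and the iox `Schema` type; the `align_to_schema` helper and the toy schemas are hypothetical:

```rust
use std::sync::Arc;

use arrow::array::{new_null_array, Array, ArrayRef, Int64Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

/// Reorder/extend `batch`'s columns to match `target`, filling missing columns with nulls.
fn align_to_schema(batch: &RecordBatch, target: &Arc<Schema>) -> RecordBatch {
    let num_rows = batch.num_rows();
    let batch_schema = batch.schema();
    let cols: Vec<ArrayRef> = target
        .fields()
        .iter()
        .map(|field| match batch_schema.column_with_name(field.name()) {
            // column already present in the old batch: reuse it
            Some((idx, _)) => Arc::clone(batch.column(idx)),
            // column added later: fill with nulls of the right type and length
            None => new_null_array(field.data_type(), num_rows),
        })
        .collect();
    RecordBatch::try_new(Arc::clone(target), cols)
        .expect("aligned batch matches the target schema")
}

fn main() {
    // old batch only has column `a`; the newer table schema also has nullable `b`
    let old_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)]));
    let old_batch = RecordBatch::try_new(
        Arc::clone(&old_schema),
        vec![Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef],
    )
    .unwrap();

    let new_schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int64, true),
        Field::new("b", DataType::Utf8, true),
    ]));
    let aligned = align_to_schema(&old_batch, &new_schema);
    assert_eq!(aligned.num_columns(), 2);
    assert_eq!(aligned.column(1).null_count(), 3);
}
```

Because schema changes are additive and new columns are nullable, null-filling preserves row counts and lets `RecordBatch::try_new` succeed against the wider schema.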
@@ -799,7 +943,7 @@ async fn sort_dedupe_persist(
  ) -> Result<SortDedupePersistSummary, anyhow::Error> {
      // Dedupe and sort using the COMPACT query built into
      // iox_query
-     let row_count = persist_job.batch.num_rows();
+     let row_count = persist_job.batch.iter().map(|batch| batch.num_rows()).sum();
      info!(
          "Persisting {} rows for db id {} and table id {} and chunk {} to file {}",
          row_count,
@@ -818,7 +962,7 @@
      );

      let chunks: Vec<Arc<dyn QueryChunk>> = vec![Arc::new(BufferChunk {
-         batches: vec![persist_job.batch],
+         batches: persist_job.batch,
          schema: persist_job.schema.clone(),
          stats: Arc::new(chunk_stats),
          partition_id: TransitionPartitionId::from_parts(
@@ -904,7 +1048,7 @@ mod tests {
      use parquet_file::storage::{ParquetStorage, StorageId};
      use std::num::NonZeroUsize;

-     #[tokio::test]
+     #[test_log::test(tokio::test)]
      async fn snapshot_works_with_not_all_columns_in_buffer() {
          let object_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
          let metrics = Arc::new(metric::Registry::default());