faradayio · seamusabshere · Mar 4, 2025 · Mar 4, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/scrubcsv/CHANGELOG.md b/scrubcsv/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.1] - 2025-03-04
+
+### Added
+
+- Added `--select-columns`, which accepts a CSV list of column names (in their final, post-processed form) and outputs only those columns, in the order given. While this is technically redundant with `xsv select`, it allows earlier field selection when processing very large files.
+
 ## [1.0.0] - 2022-05-25
 
 ### Added

diff --git a/scrubcsv/src/main.rs b/scrubcsv/src/main.rs
@@ -9,6 +9,7 @@ use log::debug;
 use regex::{bytes::Regex as BytesRegex, Regex};
 use std::{
     borrow::Cow,
+    collections::HashSet,
     fs,
     io::{self, prelude::*},
     path::PathBuf,
@@ -93,6 +94,14 @@ struct Opt {
     #[structopt(long = "reserve-column-names")]
     reserve_column_names: Option<Regex>,
 
+    /// Allow selecting columns by name. Should be submitted as a CSV list of column names.
+    /// If not specified, all columns will be included.
+    /// Column names should be specified in their final form, after any cleaning.
+    /// So if you pass `--clean-column-names=stable`, and your original column name is "Column Name",
+    /// you should pass "column_name" here.
+    #[structopt(long = "select-columns")]
+    select_columns: Option<String>,
+
     /// Drop any rows where the specified column is empty or NULL. Can be passed
     /// more than once. Useful for cleaning primary key columns before
     /// upserting. Uses the cleaned form of column names.
@@ -226,13 +235,112 @@ fn run() -> Result<()> {
         hdr = new_hdr;
     }
 
+    // Record how many columns each input row is expected to have. This must be
+    // captured here, before `--select-columns` may replace `hdr` with a shorter header.
+    let expected_cols = hdr.len();
+
+    // If we have a list of columns to select, filter the header to only include those columns.
+    // Also create a Vec<bool> to track which columns to keep for later processing
+    let mut selected_cols = None;
+    // And store the length of the selected columns for later use
+    let mut selected_cols_len = 0;
+    // Store if we need to reorder the columns
+    let mut selected_cols_require_reordering = false;
+    // Store the order of the selected columns
+    let mut selected_cols_order = None;
+
+    if let Some(ref select_columns) = opt.select_columns {
+        let mut select_columns_rdr = csv::ReaderBuilder::new()
+            .has_headers(false)
+            .from_reader(select_columns.as_bytes());
+
+        let selected_columns_vec: Vec<String> = match select_columns_rdr
+            .records()
+            .next()
+        {
+            Some(Ok(record)) => record.iter().map(|s| s.to_string()).collect(),
+            Some(Err(_)) => {
+                return Err(format_err!("The provided CSV of headers is invalid."));
+            }
+            None => {
+                return Err(format_err!("The provided CSV of headers is empty."));
+            }
+        };
+
+        // You might have submitted a single newline
+        if selected_columns_vec.is_empty() {
+            return Err(format_err!("The provided CSV of headers is empty."));
+        }
+
+        // Now we make sure that all of the selected columns are in the header
+        for col in selected_columns_vec.iter() {
+            if !hdr
+                .iter()
+                .any(|h| &String::from_utf8_lossy(h).to_string() == col)
+            {
+                return Err(format_err!(
+                    "The provided CSV of headers does not contain the column {:?}",
+                    col
+                ));
+            }
+        }
+        let mut seen = HashSet::new();
+        if !selected_columns_vec.iter().all(|x| seen.insert(x)) {
+            return Err(format_err!(
+                "--select-columns cannot contain duplicate column names"
+            ));
+        }
+
+        // The positions of selected columns in the original header
+        selected_cols_len = selected_columns_vec.len();
+        let mut max_position_seen: i32 = -1;
+        let mut new_selected_cols: Vec<bool> = Vec::with_capacity(selected_cols_len);
+        // make a list of (final_position, original_position) pairs; `original_position` counts only the selected columns
+        let mut final_to_original_position: Vec<(usize, usize)> =
+            Vec::with_capacity(selected_cols_len);
+        // this is the last point at which the original order of the columns (from hdr) is known
+        let mut original_position = 0;
+        for col in hdr.iter() {
+            let col_str = String::from_utf8_lossy(col);
+            match selected_columns_vec
+                .iter()
+                .position(|c| c == &col_str.to_string())
+            {
+                Some(final_position) => {
+                    new_selected_cols.push(true);
+                    final_to_original_position
+                        .push((final_position, original_position));
+                    original_position += 1;
+                    if (final_position as i32) < max_position_seen {
+                        selected_cols_require_reordering = true;
+                    } else {
+                        max_position_seen = final_position as i32;
+                    }
+                }
+                None => new_selected_cols.push(false),
+            }
+        }
+        // so we'll have a final_to_original_position like { 2 => 1, 1 => 0, 0 => 2 }
+        // create a "selected_cols_order" vec which has the values ordered by the keys, e.g. [2, 0, 1]
+        final_to_original_position.sort_by_key(|(k, _)| *k);
+        let new_selected_cols_order: Vec<usize> =
+            final_to_original_position.iter().map(|(_, v)| *v).collect();
+
+        // The new header is the selected columns
+        let mut new_hdr = ByteRecord::default();
+        for col in selected_columns_vec.iter() {
+            new_hdr.push_field(col.as_bytes());
+        }
+
+        hdr = new_hdr;
+        selected_cols = Some(new_selected_cols);
+        selected_cols_order = Some(new_selected_cols_order);
+    }
+
     // Write our header to our output.
     wtr.write_byte_record(&hdr)
         .context("cannot write headers")?;
 
-    // Calculate the number of expected columns.
-    let expected_cols = hdr.len();
-
     // Just in case --drop-row-if-null was passed, precompute which columns are
     // required to contain a value.
     let required_cols = hdr
@@ -255,6 +363,7 @@ fn run() -> Result<()> {
     let use_fast_path = null_re.is_none()
         && !opt.replace_newlines
         && !opt.trim_whitespace
+        && opt.select_columns.is_none()
         && opt.drop_row_if_null.is_empty();
 
     // Iterate over all the rows, checking to make sure they look reasonable.
@@ -289,7 +398,20 @@ fn run() -> Result<()> {
                 .context("cannot write record")?;
         } else {
             // We need to apply one or more cleanups, so run the slow path.
-            let cleaned = record.into_iter().map(|mut val: &[u8]| -> Cow<'_, [u8]> {
+            // Process each column, but only keep selected columns if specified
+            let mut cleaned = if selected_cols.is_some() {
+                Vec::with_capacity(selected_cols_len)
+            } else {
+                Vec::with_capacity(record.len())
+            };
+
+            for (i, mut val) in record.into_iter().enumerate() {
+                // Skip this column if it's not in the selected columns
+                if let Some(ref selected_cols) = selected_cols {
+                    if !selected_cols[i] {
+                        continue;
+                    }
+                }
                 // Convert values matching `--null` regex to empty strings.
                 if let Some(ref null_re) = null_re {
                     if null_re.is_match(val) {
@@ -317,21 +439,51 @@ fn run() -> Result<()> {
                 }
 
                 // Fix newlines.
-                if opt.replace_newlines
+                let processed_val = if opt.replace_newlines
                     && (val.contains(&b'\n') || val.contains(&b'\r'))
                 {
                     NEWLINE_RE.replace_all(val, &b" "[..])
                 } else {
                     Cow::Borrowed(val)
+                };
+
+                cleaned.push(processed_val);
+            }
+            if selected_cols_require_reordering {
+                // Permute `data` in place so data[i] becomes the old data[indices[i]]; from https://stackoverflow.com/a/69774341/310192
+                fn sort_by_indices<T>(data: &mut [T], mut indices: Vec<usize>) {
+                    for idx in 0..data.len() { // indexing panics if `indices` is shorter than `data`
+                        if indices[idx] != idx { // slot not settled yet: follow its cycle
+                            let mut current_idx = idx;
+                            loop {
+                                let target_idx = indices[current_idx];
+                                indices[current_idx] = current_idx; // mark this slot as settled
+                                if indices[target_idx] == target_idx { // reached a settled slot: cycle is closed
+                                    break;
+                                }
+                                data.swap(current_idx, target_idx);
+                                current_idx = target_idx;
+                            }
+                        }
+                    }
                 }
-            });
+
+                // about 25% faster than creating a vector iteratively
+                sort_by_indices(
+                    &mut cleaned,
+                    selected_cols_order
+                        .as_ref()
+                        .expect("selected_cols_order should have a value here")
+                        .clone(),
+                );
+            }
             if opt.drop_row_if_null.is_empty() {
                 // Still somewhat fast!
                 wtr.write_record(cleaned).context("cannot write record")?;
             } else {
                 // We need to rebuild the record, check for null columns,
                 // and only output the record if everything's OK.
-                let row = cleaned.collect::<Vec<Cow<'_, [u8]>>>();
+                let row = &cleaned; // Use the cleaned Vec directly
                 for (value, &is_required_col) in row.iter().zip(required_cols.iter()) {
                     // If the column is NULL but shouldn't be, bail on this row.
                     if is_required_col && value.is_empty() {

diff --git a/scrubcsv/src/util.rs b/scrubcsv/src/util.rs
@@ -20,7 +20,7 @@ impl FromStr for CharSpecifier {
     type Err = Error;
 
     fn from_str(s: &str) -> Result<CharSpecifier> {
-        if s.as_bytes().len() == 1 {
+        if s.len() == 1 {
             Ok(CharSpecifier(Some(s.as_bytes()[0])))
         } else {
             match s {

diff --git a/scrubcsv/tests/tests.rs b/scrubcsv/tests/tests.rs
@@ -47,7 +47,7 @@ fn stdin_and_delimiter_and_quiet() {
     let testdir = TestDir::new("scrubcsv", "stdin_and_delimiter_and_quiet");
     let output = testdir
         .cmd()
-        .args(&["-d", "|"])
+        .args(["-d", "|"])
         .arg("-q")
         .output_with_stdin(
             "\
@@ -78,8 +78,8 @@ a\tb\tc
     );
     let output = testdir
         .cmd()
-        .args(&["-d", r"\t"])
-        .args(&["--quote", "none"])
+        .args(["-d", r"\t"])
+        .args(["--quote", "none"])
         .arg("in.csv")
         .expect_success();
     assert_eq!(
@@ -133,7 +133,7 @@ fn null_normalization() {
     let testdir = TestDir::new("scrubcsv", "null_normalization");
     let output = testdir
         .cmd()
-        .args(&["--null", "(?i)null|NIL"])
+        .args(["--null", "(?i)null|NIL"])
         .output_with_stdin("a,b,c,d,e\nnull,NIL,nil,,not null\n")
         .expect_success();
     assert_eq!(output.stdout_str(), "a,b,c,d,e\n,,,,not null\n")
@@ -144,7 +144,7 @@ fn null_normalization_of_null_bytes() {
     let testdir = TestDir::new("scrubcsv", "null_normalization_of_null_bytes");
     let output = testdir
         .cmd()
-        .args(&["--null", "\\x00"])
+        .args(["--null", "\\x00"])
         .output_with_stdin("a,b\n\0,\n")
         .expect_success();
     assert_eq!(output.stdout_str(), "a,b\n,\n")
@@ -237,7 +237,7 @@ fn drop_row_if_null() {
         .cmd()
         .arg("--drop-row-if-null=c1")
         .arg("--drop-row-if-null=c2")
-        .args(&["--null", "NULL"])
+        .args(["--null", "NULL"])
         .output_with_stdin(
             r#"c1,c2,c3
 1,,
@@ -256,3 +256,87 @@ a,b,c
 "#
     );
 }
+
+#[test]
+fn select_columns() {
+    let testdir = TestDir::new("scrubcsv", "select_columns");
+    let output = testdir
+        .cmd()
+        .arg("--select-columns=c1,c3")
+        .output_with_stdin(
+            r#"c1,c2,c3
+a,b,c
+d,e,f
+g,h,i
+"#,
+        )
+        .expect_success();
+    // Only c1 and c3 may appear in the output; c2 is dropped from header and rows.
+    assert_eq!(
+        output.stdout_str(),
+        r#"c1,c3
+a,c
+d,f
+g,i
+"#
+    );
+}
+
+#[test]
+fn select_columns_error_if_selected_columns_are_not_in_header() {
+    // Requesting columns that are absent from the input header must make the
+    // run fail and report the first missing name on stderr.
+    let test_name = "select_columns_error_if_selected_columns_are_not_in_header";
+    let expected = "The provided CSV of headers does not contain the column \"a\"";
+    let workdir = TestDir::new("scrubcsv", test_name);
+    let failed_run = workdir
+        .cmd()
+        .arg("--select-columns=a,b")
+        .output_with_stdin(r#"c1,c2,c3"#)
+        .expect_failure();
+    let stderr_text = failed_run.stderr_str();
+    assert!(stderr_text.contains(expected));
+}
+
+#[test]
+fn select_columns_respects_order() {
+    let testdir = TestDir::new("scrubcsv", "select_columns_respects_order");
+    let output = testdir
+        .cmd()
+        .arg("--select-columns=c5,c2,c4") // deliberately not in header order
+        .output_with_stdin(
+            r#"c1,c2,c3,c4,c5
+c1-1,c2-1,c3-1,c4-1,c5-1
+c1-2,c2-2,c3-2,c4-2,c5-2
+c1-3,c2-3,c3-3,c4-3,c5-3
+"#,
+        )
+        .expect_success();
+    assert_eq!(
+        output.stdout_str(),
+        r#"c5,c2,c4
+c5-1,c2-1,c4-1
+c5-2,c2-2,c4-2
+c5-3,c2-3,c4-3
+"#
+    );
+}
+
+#[test]
+fn select_columns_handles_duplicate_selected_columns() {
+    let name = "select_columns_handles_duplicate_selected_columns";
+    let testdir = TestDir::new("scrubcsv", name);
+    let output = testdir
+        .cmd()
+        .arg("--select-columns=c5,c5,c4") // c5 is listed twice and must be rejected
+        .output_with_stdin(
+            r#"c1,c2,c3,c4,c5
+c1-1,c2-1,c3-1,c4-1,c5-1
+c1-2,c2-2,c3-2,c4-2,c5-2
+c1-3,c2-3,c3-3,c4-3,c5-3
+"#,
+        )
+        .expect_failure();
+    let expected = "--select-columns cannot contain duplicate column names";
+    assert!(output.stderr_str().contains(expected));
+}