Skip to content

--select-columns #10

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions scrubcsv/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.0.1] - 2025-03-04

### Added

- Added `--select-columns`, which accepts a comma-separated list of column names (in their final, post-processed form) and outputs only those columns, in the order given. While this is technically redundant with `xsv select`, it allows earlier field selection when processing very large files.

## [1.0.0] - 2022-05-25

### Added
Expand Down
166 changes: 159 additions & 7 deletions scrubcsv/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use log::debug;
use regex::{bytes::Regex as BytesRegex, Regex};
use std::{
borrow::Cow,
collections::HashSet,
fs,
io::{self, prelude::*},
path::PathBuf,
Expand Down Expand Up @@ -93,6 +94,14 @@ struct Opt {
#[structopt(long = "reserve-column-names")]
reserve_column_names: Option<Regex>,

/// Allow selecting columns by name. Should be submitted as a CSV list of column names.
/// If not specified, all columns will be included.
/// Column names should be specified in their final form, after any cleaning.
/// So if you pass `--clean-column-names=stable`, and your original column name is "Column Name",
/// you should pass "column_name" here.
#[structopt(long = "select-columns")]
select_columns: Option<String>,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I chose to implement this as a single argument containing a CSV list, unlike `--drop-row-if-null` (which is passed once per column), because you could easily want to select 100 columns out of 1,000.


/// Drop any rows where the specified column is empty or NULL. Can be passed
/// more than once. Useful for cleaning primary key columns before
/// upserting. Uses the cleaned form of column names.
Expand Down Expand Up @@ -226,13 +235,112 @@ fn run() -> Result<()> {
hdr = new_hdr;
}

// Calculate the number of expected columns read by the reader
// Different from the number of columns in the header
let expected_cols = hdr.len();

// If we have a list of columns to select, filter the header to only include those columns.
// Also create a Vec<bool> to track which columns to keep for later processing
let mut selected_cols = None;
// And store the length of the selected columns for later use
let mut selected_cols_len = 0;
// Store if we need to reorder the columns
let mut selected_cols_require_reordering = false;
// Store the order of the selected columns
let mut selected_cols_order = None;

if let Some(ref select_columns) = opt.select_columns {
let mut select_columns_rdr = csv::ReaderBuilder::new()
.has_headers(false)
.from_reader(select_columns.as_bytes());

let selected_columns_vec: Vec<String> = match select_columns_rdr
.records()
.next()
{
Some(Ok(record)) => record.iter().map(|s| s.to_string()).collect(),
Some(Err(_)) => {
return Err(format_err!("The provided CSV of headers is invalid."));
}
None => {
return Err(format_err!("The provided CSV of headers is empty."));
}
};

// You might have submitted a single newline
if selected_columns_vec.is_empty() {
return Err(format_err!("The provided CSV of headers is empty."));
}

// Now we make sure that all of the selected columns are in the header
for col in selected_columns_vec.iter() {
if !hdr
.iter()
.any(|h| &String::from_utf8_lossy(h).to_string() == col)
{
return Err(format_err!(
"The provided CSV of headers does not contain the column {:?}",
col
));
}
}
let mut seen = HashSet::new();
if !selected_columns_vec.iter().all(|x| seen.insert(x)) {
return Err(format_err!(
"--select-columns cannot contain duplicate column names"
));
}

// The positions of selected columns in the original header
selected_cols_len = selected_columns_vec.len();
let mut max_position_seen: i32 = -1;
let mut new_selected_cols: Vec<bool> = Vec::with_capacity(selected_cols_len);
// make an array of [final_position, original_position]
let mut final_to_original_position: Vec<(usize, usize)> =
Vec::with_capacity(selected_cols_len);
// this is the last time that i will know the original order of the columns (from hdr)
let mut original_position = 0;
for col in hdr.iter() {
let col_str = String::from_utf8_lossy(col);
match selected_columns_vec
.iter()
.position(|c| c == &col_str.to_string())
{
Some(final_position) => {
new_selected_cols.push(true);
final_to_original_position
.push((final_position, original_position));
original_position += 1;
if (final_position as i32) < max_position_seen {
selected_cols_require_reordering = true;
} else {
max_position_seen = final_position as i32;
}
}
None => new_selected_cols.push(false),
}
}
// so we'll have a final_to_original_position like { 2 => 1, 1 => 0, 0 => 2 }
// create a "selected_cols_order" vec which has the values ordered by the keys
final_to_original_position.sort_by_key(|(k, _)| *k);
let new_selected_cols_order: Vec<usize> =
final_to_original_position.iter().map(|(_, v)| *v).collect();

// The new header is the selected columns
let mut new_hdr = ByteRecord::default();
for col in selected_columns_vec.iter() {
new_hdr.push_field(col.as_bytes());
}

hdr = new_hdr;
selected_cols = Some(new_selected_cols);
selected_cols_order = Some(new_selected_cols_order);
}

// Write our header to our output.
wtr.write_byte_record(&hdr)
.context("cannot write headers")?;

// Calculate the number of expected columns.
let expected_cols = hdr.len();

// Just in case --drop-row-if-null was passed, precompute which columns are
// required to contain a value.
let required_cols = hdr
Expand All @@ -255,6 +363,7 @@ fn run() -> Result<()> {
let use_fast_path = null_re.is_none()
&& !opt.replace_newlines
&& !opt.trim_whitespace
&& opt.select_columns.is_none()
&& opt.drop_row_if_null.is_empty();

// Iterate over all the rows, checking to make sure they look reasonable.
Expand Down Expand Up @@ -289,7 +398,20 @@ fn run() -> Result<()> {
.context("cannot write record")?;
} else {
// We need to apply one or more cleanups, so run the slow path.
let cleaned = record.into_iter().map(|mut val: &[u8]| -> Cow<'_, [u8]> {
// Process each column, but only keep selected columns if specified
let mut cleaned = if selected_cols.is_some() {
Vec::with_capacity(selected_cols_len)
} else {
Vec::with_capacity(record.len())
};

for (i, mut val) in record.into_iter().enumerate() {
// Skip this column if it's not in the selected columns
if let Some(ref selected_cols) = selected_cols {
if !selected_cols[i] {
continue;
}
}
// Convert values matching `--null` regex to empty strings.
if let Some(ref null_re) = null_re {
if null_re.is_match(val) {
Expand Down Expand Up @@ -317,21 +439,51 @@ fn run() -> Result<()> {
}

// Fix newlines.
if opt.replace_newlines
let processed_val = if opt.replace_newlines
&& (val.contains(&b'\n') || val.contains(&b'\r'))
{
NEWLINE_RE.replace_all(val, &b" "[..])
} else {
Cow::Borrowed(val)
};

cleaned.push(processed_val);
}
if selected_cols_require_reordering {
// https://stackoverflow.com/a/69774341/310192
/// Permutes `data` in place so that slot `i` ends up holding the element
/// that was originally at `indices[i]`.
///
/// `indices` is taken by value and used as scratch space: each entry is
/// overwritten with its own position once the matching slot has been
/// settled, which is how already-placed elements are recognised.
///
/// Indexing panics if an entry of `indices` is out of bounds for
/// `indices`/`data`, or if `indices` is shorter than `data`.
fn sort_by_indices<T>(data: &mut [T], mut indices: Vec<usize>) {
    for start in 0..data.len() {
        // Slot already holds its final element; nothing to do.
        if indices[start] == start {
            continue;
        }

        // Walk the permutation cycle that begins at `start`, swapping
        // elements into place one hop at a time.
        let mut pos = start;
        loop {
            // Mark `pos` as settled and fetch where its element comes from.
            let source = std::mem::replace(&mut indices[pos], pos);
            if indices[source] == source {
                // We are back at a settled slot, so the cycle is closed.
                break;
            }
            data.swap(pos, source);
            pos = source;
        }
    }
}
});

// about 25% faster than creating a vector iteratively
sort_by_indices(
&mut cleaned,
selected_cols_order
.as_ref()
.expect("selected_cols_order should have a value here")
.clone(),
);
}
if opt.drop_row_if_null.is_empty() {
// Still somewhat fast!
wtr.write_record(cleaned).context("cannot write record")?;
} else {
// We need to rebuild the record, check for null columns,
// and only output the record if everything's OK.
let row = cleaned.collect::<Vec<Cow<'_, [u8]>>>();
let row = &cleaned; // Use the cleaned Vec directly
for (value, &is_required_col) in row.iter().zip(required_cols.iter()) {
// If the column is NULL but shouldn't be, bail on this row.
if is_required_col && value.is_empty() {
Expand Down
2 changes: 1 addition & 1 deletion scrubcsv/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ impl FromStr for CharSpecifier {
type Err = Error;

fn from_str(s: &str) -> Result<CharSpecifier> {
if s.as_bytes().len() == 1 {
if s.len() == 1 {
Ok(CharSpecifier(Some(s.as_bytes()[0])))
} else {
match s {
Expand Down
96 changes: 90 additions & 6 deletions scrubcsv/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ fn stdin_and_delimiter_and_quiet() {
let testdir = TestDir::new("scrubcsv", "stdin_and_delimiter_and_quiet");
let output = testdir
.cmd()
.args(&["-d", "|"])
.args(["-d", "|"])
.arg("-q")
.output_with_stdin(
"\
Expand Down Expand Up @@ -78,8 +78,8 @@ a\tb\tc
);
let output = testdir
.cmd()
.args(&["-d", r"\t"])
.args(&["--quote", "none"])
.args(["-d", r"\t"])
.args(["--quote", "none"])
.arg("in.csv")
.expect_success();
assert_eq!(
Expand Down Expand Up @@ -133,7 +133,7 @@ fn null_normalization() {
let testdir = TestDir::new("scrubcsv", "null_normalization");
let output = testdir
.cmd()
.args(&["--null", "(?i)null|NIL"])
.args(["--null", "(?i)null|NIL"])
.output_with_stdin("a,b,c,d,e\nnull,NIL,nil,,not null\n")
.expect_success();
assert_eq!(output.stdout_str(), "a,b,c,d,e\n,,,,not null\n")
Expand All @@ -144,7 +144,7 @@ fn null_normalization_of_null_bytes() {
let testdir = TestDir::new("scrubcsv", "null_normalization_of_null_bytes");
let output = testdir
.cmd()
.args(&["--null", "\\x00"])
.args(["--null", "\\x00"])
.output_with_stdin("a,b\n\0,\n")
.expect_success();
assert_eq!(output.stdout_str(), "a,b\n,\n")
Expand Down Expand Up @@ -237,7 +237,7 @@ fn drop_row_if_null() {
.cmd()
.arg("--drop-row-if-null=c1")
.arg("--drop-row-if-null=c2")
.args(&["--null", "NULL"])
.args(["--null", "NULL"])
.output_with_stdin(
r#"c1,c2,c3
1,,
Expand All @@ -256,3 +256,87 @@ a,b,c
"#
);
}

#[test]
/// `--select-columns` keeps only the named columns; here they are requested
/// in the same order as they appear in the input header.
fn select_columns() {
    let testdir = TestDir::new("scrubcsv", "select_columns");
    let output = testdir
        .cmd()
        .arg("--select-columns=c1,c3")
        .output_with_stdin(
            r#"c1,c2,c3
a,b,c
d,e,f
g,h,i
"#,
        )
        // Check the exit status too, as the other success-path tests in this
        // file do, instead of only unwrapping the I/O result.
        .expect_success();
    assert_eq!(
        output.stdout_str(),
        r#"c1,c3
a,c
d,f
g,i
"#
    );
}

#[test]
/// Asking for columns that the input header does not contain must make the
/// command fail and name the first missing column on stderr.
fn select_columns_error_if_selected_columns_are_not_in_header() {
    let scratch = TestDir::new(
        "scrubcsv",
        "select_columns_error_if_selected_columns_are_not_in_header",
    );

    // The input header only has c1..c3, so neither `a` nor `b` exists.
    let result = scratch
        .cmd()
        .arg("--select-columns=a,b")
        .output_with_stdin(r#"c1,c2,c3"#)
        .expect_failure();

    let expected_message = "The provided CSV of headers does not contain the column \"a\"";
    let stderr = result.stderr_str();
    assert!(stderr.contains(expected_message));
}

#[test]
/// Output columns follow the order given to `--select-columns`, not the
/// order in which they appear in the input header.
fn select_columns_respects_order() {
    // Use this test's own name for the scratch directory, as every other
    // test in this file does, rather than reusing "select_columns".
    let testdir = TestDir::new("scrubcsv", "select_columns_respects_order");
    let output = testdir
        .cmd()
        .arg("--select-columns=c5,c2,c4")
        .output_with_stdin(
            r#"c1,c2,c3,c4,c5
c1-1,c2-1,c3-1,c4-1,c5-1
c1-2,c2-2,c3-2,c4-2,c5-2
c1-3,c2-3,c3-3,c4-3,c5-3
"#,
        )
        // Check the exit status too, instead of only unwrapping the I/O result.
        .expect_success();
    assert_eq!(
        output.stdout_str(),
        r#"c5,c2,c4
c5-1,c2-1,c4-1
c5-2,c2-2,c4-2
c5-3,c2-3,c4-3
"#
    );
}

#[test]
/// Listing the same column twice in `--select-columns` is rejected with a
/// dedicated error message rather than producing output.
fn select_columns_handles_duplicate_selected_columns() {
    // Use this test's own name for the scratch directory, as every other
    // test in this file does, rather than reusing "select_columns".
    let testdir = TestDir::new(
        "scrubcsv",
        "select_columns_handles_duplicate_selected_columns",
    );
    let output = testdir
        .cmd()
        .arg("--select-columns=c5,c5,c4")
        .output_with_stdin(
            r#"c1,c2,c3,c4,c5
c1-1,c2-1,c3-1,c4-1,c5-1
c1-2,c2-2,c3-2,c4-2,c5-2
c1-3,c2-3,c3-3,c4-3,c5-3
"#,
        )
        .expect_failure();
    assert!(output
        .stderr_str()
        .contains("--select-columns cannot contain duplicate column names"));
}
Loading