|
| 1 | +#!/usr/bin/env Rscript |
| 2 | + |
| 3 | +args <- commandArgs(trailingOnly = TRUE) |
| 4 | +if (length(args) != 2) { |
| 5 | + stop("Usage: Rscript dev/compare_tables.R <reference_dir> <generated_dir>") |
| 6 | +} |
| 7 | + |
| 8 | +reference_dir <- args[[1]] |
| 9 | +generated_dir <- args[[2]] |
| 10 | +comparison_digits <- 5L |
| 11 | + |
| 12 | +if (!dir.exists(reference_dir)) { |
| 13 | + stop("Reference directory not found: ", reference_dir) |
| 14 | +} |
| 15 | +if (!dir.exists(generated_dir)) { |
| 16 | + stop("Generated directory not found: ", generated_dir) |
| 17 | +} |
| 18 | + |
| 19 | +list_csv_files <- function(path) { |
| 20 | + sort(list.files(path, pattern = "\\.csv$", full.names = FALSE)) |
| 21 | +} |
| 22 | + |
| 23 | +reference_files <- list_csv_files(reference_dir) |
| 24 | +generated_files <- list_csv_files(generated_dir) |
| 25 | + |
| 26 | +if (!identical(reference_files, generated_files)) { |
| 27 | + missing_files <- setdiff(reference_files, generated_files) |
| 28 | + extra_files <- setdiff(generated_files, reference_files) |
| 29 | + details <- c( |
| 30 | + if (length(missing_files) > 0) paste("missing:", paste(missing_files, collapse = ", ")), |
| 31 | + if (length(extra_files) > 0) paste("extra:", paste(extra_files, collapse = ", ")) |
| 32 | + ) |
| 33 | + stop("CSV file set mismatch between reference and generated outputs. ", paste(details, collapse = "; ")) |
| 34 | +} |
| 35 | + |
| 36 | +first_mismatch <- function(reference_values, generated_values) { |
| 37 | + same <- (reference_values == generated_values) | (is.na(reference_values) & is.na(generated_values)) |
| 38 | + same[is.na(same)] <- FALSE |
| 39 | + mismatch <- which(!same) |
| 40 | + if (length(mismatch) == 0) { |
| 41 | + return(NA_integer_) |
| 42 | + } |
| 43 | + mismatch[[1]] |
| 44 | +} |
| 45 | + |
| 46 | +format_value <- function(value) { |
| 47 | + if (is.na(value)) { |
| 48 | + return("NA") |
| 49 | + } |
| 50 | + as.character(value) |
| 51 | +} |
| 52 | + |
| 53 | +compare_numeric_column <- function(reference_col, generated_col, file_label, col_name) { |
| 54 | + reference_cmp <- signif(reference_col, digits = comparison_digits) |
| 55 | + generated_cmp <- signif(generated_col, digits = comparison_digits) |
| 56 | + mismatch <- first_mismatch(reference_cmp, generated_cmp) |
| 57 | + if (!is.na(mismatch)) { |
| 58 | + stop( |
| 59 | + sprintf( |
| 60 | + "%s: numeric mismatch in column '%s' at row %d (reference=%s, generated=%s; compared at %d significant digits)", |
| 61 | + file_label, |
| 62 | + col_name, |
| 63 | + mismatch, |
| 64 | + format_value(reference_col[[mismatch]]), |
| 65 | + format_value(generated_col[[mismatch]]), |
| 66 | + comparison_digits |
| 67 | + ) |
| 68 | + ) |
| 69 | + } |
| 70 | +} |
| 71 | + |
| 72 | +compare_text_column <- function(reference_col, generated_col, file_label, col_name) { |
| 73 | + reference_chr <- ifelse(is.na(reference_col), NA_character_, as.character(reference_col)) |
| 74 | + generated_chr <- ifelse(is.na(generated_col), NA_character_, as.character(generated_col)) |
| 75 | + mismatch <- first_mismatch(reference_chr, generated_chr) |
| 76 | + if (!is.na(mismatch)) { |
| 77 | + stop( |
| 78 | + sprintf( |
| 79 | + "%s: value mismatch in column '%s' at row %d (reference=%s, generated=%s)", |
| 80 | + file_label, |
| 81 | + col_name, |
| 82 | + mismatch, |
| 83 | + format_value(reference_chr[[mismatch]]), |
| 84 | + format_value(generated_chr[[mismatch]]) |
| 85 | + ) |
| 86 | + ) |
| 87 | + } |
| 88 | +} |
| 89 | + |
| 90 | +for (filename in reference_files) { |
| 91 | + file_label <- paste0("results/tables/", filename) |
| 92 | + reference_df <- read.csv( |
| 93 | + file.path(reference_dir, filename), |
| 94 | + stringsAsFactors = FALSE, |
| 95 | + check.names = FALSE |
| 96 | + ) |
| 97 | + generated_df <- read.csv( |
| 98 | + file.path(generated_dir, filename), |
| 99 | + stringsAsFactors = FALSE, |
| 100 | + check.names = FALSE |
| 101 | + ) |
| 102 | + |
| 103 | + if (!identical(names(reference_df), names(generated_df))) { |
| 104 | + stop(file_label, ": column mismatch between reference and generated outputs") |
| 105 | + } |
| 106 | + if (nrow(reference_df) != nrow(generated_df) || ncol(reference_df) != ncol(generated_df)) { |
| 107 | + stop( |
| 108 | + file_label, |
| 109 | + ": dimension mismatch (reference=", |
| 110 | + nrow(reference_df), |
| 111 | + "x", |
| 112 | + ncol(reference_df), |
| 113 | + ", generated=", |
| 114 | + nrow(generated_df), |
| 115 | + "x", |
| 116 | + ncol(generated_df), |
| 117 | + ")" |
| 118 | + ) |
| 119 | + } |
| 120 | + |
| 121 | + for (col_name in names(reference_df)) { |
| 122 | + reference_col <- reference_df[[col_name]] |
| 123 | + generated_col <- generated_df[[col_name]] |
| 124 | + if (is.numeric(reference_col) && is.numeric(generated_col)) { |
| 125 | + compare_numeric_column(reference_col, generated_col, file_label, col_name) |
| 126 | + } else { |
| 127 | + compare_text_column(reference_col, generated_col, file_label, col_name) |
| 128 | + } |
| 129 | + } |
| 130 | +} |
| 131 | + |
| 132 | +message( |
| 133 | + "Tracked tables match regenerated outputs across ", |
| 134 | + length(reference_files), |
| 135 | + " files at ", |
| 136 | + comparison_digits, |
| 137 | + " significant digits." |
| 138 | +) |
0 commit comments