Skip to content

Track skips for problems #448

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/delimited_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,14 @@ delimited_index::delimited_index(

bool has_quoted_newlines = quote != '\0';

size_t skip_counter = 0;

size_t start = find_first_line(
mmap_, skip_, comment_, skip_empty_rows, has_quoted_newlines, quote);
mmap_, skip_, comment_, skip_empty_rows, has_quoted_newlines, quote, skip_counter);

if (skip_counter) {
errors->add_skips_at_start(skip_counter, filename);
}

// If an empty file, or a file with only a newline.
if (start >= file_size - 1) {
Expand Down
8 changes: 7 additions & 1 deletion src/delimited_index_connection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,15 @@ delimited_index_connection::delimited_index_connection(

bool has_quoted_newlines = quote != '\0';

size_t skip_counter = 0;

// Parse header
size_t start = find_first_line(
buf[i], skip_, comment_, skip_empty_rows, has_quoted_newlines, quote);
buf[i], skip_, comment_, skip_empty_rows, has_quoted_newlines, quote, skip_counter);

if (skip_counter) {
errors->add_skips_at_start(skip_counter, filename_);
}

if (delim == nullptr) {
delim_ = std::string(1, guess_delim(buf[i], start, 20, sz, quote));
Expand Down
11 changes: 10 additions & 1 deletion src/fixed_width_index.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "index.h"
#include "vroom_errors.h"

// clang-format off
#ifdef __clang__
Expand Down Expand Up @@ -53,6 +54,7 @@ class fixed_width_index
const char* comment,
const bool skip_empty_rows,
const size_t n_max,
std::shared_ptr<vroom_errors> errors,
const bool progress)
: col_starts_(col_starts),
col_ends_(col_ends),
Expand All @@ -76,14 +78,21 @@ class fixed_width_index
}

size_t file_size = mmap_.size();

size_t skip_counter = 0;

size_t start = find_first_line(
mmap_,
skip,
comment,
skip_empty_rows,
/* embedded_nl */ false,
/* quote */ '\0');
/* quote */ '\0',
skip_counter);

if (skip_counter) {
errors->add_skips_at_start(skip_counter, filename);
}

// Check for windows newlines
size_t first_nl;
Expand Down
10 changes: 9 additions & 1 deletion src/fixed_width_index_connection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ fixed_width_index_connection::fixed_width_index_connection(
const char* comment,
const bool skip_empty_rows,
const size_t n_max,
std::shared_ptr<vroom_errors> errors,
const bool progress,
const size_t chunk_size) {

Expand Down Expand Up @@ -56,14 +57,21 @@ fixed_width_index_connection::fixed_width_index_connection(
size_t sz = R_ReadConnection(con, buf[i].data(), chunk_size - 1);
buf[i][sz] = '\0';

size_t skip_counter = 0;

// Parse header
size_t start = find_first_line(
buf[i],
skip,
comment,
skip_empty_rows,
/* embedded_nl */ false,
/* quote */ '\0');
/* quote */ '\0',
skip_counter);

if (skip_counter) {
errors->add_skips_at_start(skip_counter, filename_);
}

// Check for windows newlines
size_t first_nl;
Expand Down
1 change: 1 addition & 0 deletions src/fixed_width_index_connection.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class fixed_width_index_connection : public fixed_width_index {
const char* comment,
const bool skip_empty_rows,
const size_t n_max,
std::shared_ptr<vroom_errors> errors,
const bool progress,
const size_t chunk_size);

Expand Down
6 changes: 6 additions & 0 deletions src/index_collection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ std::shared_ptr<vroom::index> make_fixed_width_index(
const char* comment,
const bool skip_empty_rows,
const size_t n_max,
const std::shared_ptr<vroom_errors>& errors,
const bool progress) {

auto standardise_one_path = cpp11::package("vroom")["standardise_one_path"];
Expand All @@ -308,6 +309,7 @@ std::shared_ptr<vroom::index> make_fixed_width_index(
comment,
skip_empty_rows,
n_max,
errors,
progress,
get_env("VROOM_CONNECTION_SIZE", 1 << 17));
} else {
Expand All @@ -321,6 +323,7 @@ std::shared_ptr<vroom::index> make_fixed_width_index(
comment,
skip_empty_rows,
n_max,
errors,
progress);
}
}
Expand All @@ -334,6 +337,7 @@ index_collection::index_collection(
const char* comment,
const bool skip_empty_rows,
const size_t n_max,
const std::shared_ptr<vroom_errors>& errors,
const bool progress)
: rows_(0), columns_(0) {

Expand All @@ -346,6 +350,7 @@ index_collection::index_collection(
comment,
skip_empty_rows,
n_max,
errors,
progress);

columns_ = first->num_columns();
Expand All @@ -363,6 +368,7 @@ index_collection::index_collection(
comment,
skip_empty_rows,
n_max,
errors,
progress);

check_column_consistency(first, idx, false, i);
Expand Down
1 change: 1 addition & 0 deletions src/index_collection.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class index_collection : public index,
const char* comment,
const bool skip_empty_rows,
const size_t n_max,
const std::shared_ptr<vroom_errors>& errors,
const bool progress);

string get(size_t row, size_t col) const override;
Expand Down
4 changes: 3 additions & 1 deletion src/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,8 @@ size_t find_first_line(
const char* comment,
const bool skip_empty_rows,
const bool embedded_nl,
const char quote) {
const char quote,
size_t &skip_counter) {

auto begin = skip_bom(source);
/* Skip skip parameters, comments and blank lines */
Expand All @@ -320,6 +321,7 @@ size_t find_first_line(
is_comment ? '\0' : quote); /* don't deal with quotes in comment lines*/
++begin;
skip = skip > 0 ? skip - 1 : skip;
++skip_counter;

std::tie(should_skip, is_comment) = is_blank_or_comment_line(
source.data() + begin,
Expand Down
2 changes: 2 additions & 0 deletions src/vroom.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@

auto errors = new std::shared_ptr<vroom_errors>(new vroom_errors());

(*errors)->has_header(has_header);

auto idx = std::make_shared<vroom::index_collection>(
inputs,
Rf_isNull(delim) ? nullptr : cpp11::as_cpp<const char*>(delim),
Expand Down
25 changes: 23 additions & 2 deletions src/vroom_errors.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <sstream>
#include <string>
#include <vector>
#include <map>

using namespace cpp11::literals;

Expand All @@ -29,7 +30,14 @@ class vroom_errors {
std::string actual = "",
std::string filename = "") {
std::lock_guard<std::mutex> guard(mutex_);
rows_.push_back(row + 1);
auto search = track_skips_.find(filename);
size_t skips_at_start = (search != track_skips_.end()) ? track_skips_.at(filename) : 0;
lines_.push_back(row + 1 + skips_at_start);
/* If the data contains a header line, the 0th iterator is the header line;
otherwise it is the first line of data. */
/* For rows, we don't want to count the header line,
so only add 1 if there is NOT a header. */
rows_.push_back(row + !has_header_);
columns_.push_back(column + 1);
expected_.emplace_back(expected);
actual_.emplace_back(actual);
Expand Down Expand Up @@ -73,7 +81,8 @@ class vroom_errors {

cpp11::data_frame error_table() const {
return cpp11::writable::data_frame(
{"row"_nm = rows_,
{"line"_nm = lines_,
"row"_nm = rows_,
"col"_nm = columns_,
"expected"_nm = expected_,
"actual"_nm = actual_,
Expand Down Expand Up @@ -104,6 +113,7 @@ class vroom_errors {

void clear() {
std::lock_guard<std::mutex> guard(mutex_);
lines_.clear();
rows_.clear();
columns_.clear();
expected_.clear();
Expand All @@ -112,13 +122,24 @@ class vroom_errors {
parse_errors_.clear();
}

/* Store the number of lines skipped at the start of `filename`.
   Calling this again with the same filename overwrites the stored count.
   add_error looks this count up (by filename) to offset the reported line.
   NOTE(review): add_error reads track_skips_ while holding mutex_, but this
   write does not take the lock -- confirm it only runs before any
   concurrent add_error calls. */
void add_skips_at_start(size_t skip_count, std::string filename) {
  // operator[] creates a zero-initialised entry when the filename is new.
  size_t& stored_count = track_skips_[filename];
  stored_count = skip_count;
}

void has_header(bool header){
has_header_ = header;
}

private:
mutable bool have_warned_ = false;
std::mutex mutex_;
std::vector<std::string> filenames_;
std::vector<parse_error> parse_errors_;
std::vector<size_t> lines_;
std::vector<size_t> rows_;
std::vector<size_t> columns_;
std::vector<std::string> expected_;
std::vector<std::string> actual_;
bool has_header_ = false;
std::map<std::string, size_t> track_skips_;
};
12 changes: 9 additions & 3 deletions src/vroom_fwf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@
filenames = get_filenames(inputs);
}

auto errors = new std::shared_ptr<vroom_errors>(new vroom_errors());

(*errors)->has_header(false);

auto idx = std::make_shared<vroom::index_collection>(
inputs,
col_starts,
Expand All @@ -48,10 +52,9 @@
comment,
skip_empty_rows,
n_max,
*errors,
progress);

auto errors = new std::shared_ptr<vroom_errors>(new vroom_errors());

return create_columns(
idx,
std::move(col_names),
Expand Down Expand Up @@ -116,13 +119,16 @@ std::vector<bool> find_empty_cols(Iterator begin, Iterator end, ptrdiff_t n) {
return cpp11::list();
}

size_t skip_counter = 0;

size_t s = find_first_line(
mmap,
skip,
comment.data(),
/* skip_empty_rows */ true,
/* embedded_nl */ false,
/* quote */ '\0');
/* quote */ '\0',
skip_counter);

std::vector<bool> empty = find_empty_cols(mmap.begin() + s, mmap.end(), n);
std::vector<int> begin, end;
Expand Down
Loading