Skip to content

Refactor jni writer data sink #12458

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
21c3deb
JNI bindings to write CSV
mythrocks Dec 15, 2022
7fa0204
Support for chunked CSV writes in JNI:
mythrocks Dec 21, 2022
faa64c4
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 21, 2022
2fa91e7
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 21, 2022
e446ae3
Added tests header inclusion.
mythrocks Dec 21, 2022
54a5a87
Formatting.
mythrocks Dec 21, 2022
c8f74de
Support to specify TRUE/FALSE strings.
mythrocks Dec 21, 2022
ebbfcb8
Added tests for combinations of True/False reps, header inclusion, etc.
mythrocks Dec 21, 2022
cce5574
Removed JNI's non-chunked CSV writes to memory.
mythrocks Dec 21, 2022
7089163
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 27, 2022
15693f1
Added newline at the end of the file, per CUDF guideline.
mythrocks Dec 27, 2022
15e84c5
Removed unnecessary whitespace at top of file.
mythrocks Dec 27, 2022
0da15a4
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 28, 2022
e9107c9
Re-added whitespace at end of file.
mythrocks Dec 28, 2022
52f62e2
Fixed header order. Removed trailing newlines.
mythrocks Dec 30, 2022
af7eed3
Postpone setting _first_write till after write.
mythrocks Dec 30, 2022
5728549
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 30, 2022
0d82984
Trailing newlines.
mythrocks Dec 30, 2022
fa24027
Review changes:
mythrocks Dec 30, 2022
f5e30c5
More formatting .
mythrocks Dec 30, 2022
d417d30
Refactor jni_writer_data_sink
mythrocks Dec 30, 2022
d2ac7a6
Renamed jni_writer_data_sink.
mythrocks Dec 31, 2022
9ed5294
Renamed rotate_buffer. Plus, formatting.
mythrocks Dec 31, 2022
bbfebdf
Renamed jni_writer_data_sink header.
mythrocks Dec 31, 2022
d3642a4
Updated documentation for _inter_column_delimiter.
mythrocks Jan 3, 2023
c83e0d9
Updated copyright date.
mythrocks Jan 3, 2023
6dee89a
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Jan 3, 2023
e4fa895
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Jan 4, 2023
e1b6703
Merge remote-tracking branch 'mythrocks/hive-text-writer' into refact…
mythrocks Jan 4, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -1332,7 +1332,7 @@ class csv_writer_options {
size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
// character to use for separating lines (default "\n")
std::string _line_terminator = "\n";
// character to use for separating column values (default ",")
char _inter_column_delimiter = ',';
// string to use for values != 0 in INT8 types (default 'true')
std::string _true_value = std::string{"true"};
Expand Down Expand Up @@ -1422,9 +1422,9 @@ class csv_writer_options {
[[nodiscard]] std::string get_line_terminator() const { return _line_terminator; }

/**
 * @brief Returns character used for separating column values.
 *
 * @return Character used for separating column values.
 */
[[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; }

Expand Down Expand Up @@ -1479,9 +1479,9 @@ class csv_writer_options {
void set_line_terminator(std::string term) { _line_terminator = term; }

/**
 * @brief Sets character used for separating column values.
 *
 * @param delim Character to delimit column values
 */
void set_inter_column_delimiter(char delim) { _inter_column_delimiter = delim; }

Expand All @@ -1498,6 +1498,13 @@ class csv_writer_options {
* @param val String to represent values == 0 in INT8 types
*/
void set_false_value(std::string val) { _false_value = val; }

/**
 * @brief Replaces the table view held by these options.
 *
 * @param table View of the table to be written
 */
void set_table(table_view const& table)
{
  this->_table = table;
}
};

/**
Expand Down Expand Up @@ -1586,9 +1593,9 @@ class csv_writer_options_builder {
}

/**
* @brief Sets character used for separating column values.
*
* @param delim Character to delimit column values
* @return this for chaining
*/
csv_writer_options_builder& inter_column_delimiter(char delim)
Expand Down
134 changes: 134 additions & 0 deletions java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package ai.rapids.cudf;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class CSVWriterOptions {

private String[] columnNames;
private Boolean includeHeader = false;
private String rowDelimiter = "\n";
private byte fieldDelimiter = ',';
private String nullValue = "\\N";
private String falseValue = "false";
private String trueValue = "true";

private CSVWriterOptions(Builder builder) {
this.columnNames = builder.columnNames.toArray(new String[builder.columnNames.size()]);
this.nullValue = builder.nullValue;
this.includeHeader = builder.includeHeader;
this.fieldDelimiter = builder.fieldDelimiter;
this.rowDelimiter = builder.rowDelimiter;
this.falseValue = builder.falseValue;
this.trueValue = builder.trueValue;
}

public String[] getColumnNames() {
return columnNames;
}

public Boolean getIncludeHeader() {
return includeHeader;
}

public String getRowDelimiter() {
return rowDelimiter;
}

public byte getFieldDelimiter() {
return fieldDelimiter;
}

public String getNullValue() {
return nullValue;
}

public String getTrueValue() {
return trueValue;
}

public String getFalseValue() {
return falseValue;
}

public static Builder builder() {
return new Builder();
}

public static class Builder {

private List<String> columnNames = Collections.emptyList();
private Boolean includeHeader = false;
private String rowDelimiter = "\n";
private byte fieldDelimiter = ',';
private String nullValue = "\\N";
private String falseValue = "false";
private String trueValue = "true";

public CSVWriterOptions build() {
return new CSVWriterOptions(this);
}

public Builder withColumnNames(List<String> columnNames) {
this.columnNames = columnNames;
return this;
}

public Builder withColumnNames(String... columnNames) {
List<String> columnNamesList = new ArrayList<>();
for (String columnName : columnNames) {
columnNamesList.add(columnName);
}
return withColumnNames(columnNamesList);
}

public Builder withIncludeHeader(Boolean includeHeader) {
this.includeHeader = includeHeader;
return this;
}

public Builder withRowDelimiter(String rowDelimiter) {
this.rowDelimiter = rowDelimiter;
return this;
}

public Builder withFieldDelimiter(byte fieldDelimiter) {
this.fieldDelimiter = fieldDelimiter;
return this;
}

public Builder withNullValue(String nullValue) {
this.nullValue = nullValue;
return this;
}

public Builder withTrueValue(String trueValue) {
this.trueValue = trueValue;
return this;
}

public Builder withFalseValue(String falseValue) {
this.falseValue = falseValue;
return this;
}
}
}
78 changes: 77 additions & 1 deletion java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -857,6 +857,82 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf
opts.getFalseValues()));
}

/**
 * Native counterpart of {@link #writeCSVToFile(CSVWriterOptions, String)}.
 * The public overload passes this table's nativeHandle as {@code table} and
 * takes every other option argument from the matching CSVWriterOptions getter.
 *
 * @param table native handle of the table
 * @param columnNames value of CSVWriterOptions.getColumnNames()
 * @param includeHeader value of CSVWriterOptions.getIncludeHeader()
 * @param rowDelimiter value of CSVWriterOptions.getRowDelimiter()
 * @param fieldDelimiter value of CSVWriterOptions.getFieldDelimiter()
 * @param nullValue value of CSVWriterOptions.getNullValue()
 * @param trueValue value of CSVWriterOptions.getTrueValue()
 * @param falseValue value of CSVWriterOptions.getFalseValue()
 * @param outputPath output path supplied by the caller
 */
private static native void writeCSVToFile(long table,
                                          String[] columnNames,
                                          boolean includeHeader,
                                          String rowDelimiter,
                                          byte fieldDelimiter,
                                          String nullValue,
                                          String trueValue,
                                          String falseValue,
                                          String outputPath) throws CudfException;

/**
 * Writes this table as CSV to the given path by handing the table's native
 * handle and the individual option values to the native writer.
 *
 * @param options CSV formatting options
 * @param outputPath path passed through to the native writer
 */
public void writeCSVToFile(CSVWriterOptions options, String outputPath) {
  final String[] names = options.getColumnNames();
  final boolean withHeader = options.getIncludeHeader();
  final String rowSeparator = options.getRowDelimiter();
  final byte fieldSeparator = options.getFieldDelimiter();
  final String nullText = options.getNullValue();
  final String trueText = options.getTrueValue();
  final String falseText = options.getFalseValue();

  writeCSVToFile(nativeHandle, names, withHeader, rowSeparator, fieldSeparator,
                 nullText, trueText, falseText, outputPath);
}

/**
 * Native call used by CSVTableWriter's constructor. The option arguments come
 * from the matching CSVWriterOptions getters.
 *
 * @param buffer consumer supplied by the caller of getCSVBufferWriter
 * @return a writer handle; CSVTableWriter stores it and passes it to
 *         writeCSVChunkToBuffer and endWriteCSVToBuffer
 */
private static native long startWriteCSVToBuffer(String[] columnNames,
                                                 boolean includeHeader,
                                                 String rowDelimiter,
                                                 byte fieldDelimiter,
                                                 String nullValue,
                                                 String trueValue,
                                                 String falseValue,
                                                 HostBufferConsumer buffer) throws CudfException;

/**
 * Native call used by CSVTableWriter.write.
 *
 * @param writerHandle handle returned by startWriteCSVToBuffer
 * @param tableHandle nativeHandle of the table being written
 */
private static native void writeCSVChunkToBuffer(long writerHandle, long tableHandle);

/**
 * Native call used by CSVTableWriter.close.
 *
 * @param writerHandle handle returned by startWriteCSVToBuffer
 */
private static native void endWriteCSVToBuffer(long writerHandle);

/**
 * {@link TableWriter} that writes tables as CSV chunks through a native writer.
 * The native writer is created with the supplied options and
 * {@link HostBufferConsumer}; the consumer's done() is called on close.
 */
private static class CSVTableWriter implements TableWriter {
  // Handle returned by startWriteCSVToBuffer; 0 once close() has run.
  private long writerHandle;
  // Consumer to notify on close; null once done() has been requested.
  private HostBufferConsumer consumer;

  private CSVTableWriter(CSVWriterOptions options, HostBufferConsumer consumer) {
    this.writerHandle = startWriteCSVToBuffer(options.getColumnNames(),
                                              options.getIncludeHeader(),
                                              options.getRowDelimiter(),
                                              options.getFieldDelimiter(),
                                              options.getNullValue(),
                                              options.getTrueValue(),
                                              options.getFalseValue(),
                                              consumer);
    this.consumer = consumer;
  }

  /**
   * Writes one table as the next CSV chunk.
   *
   * @param table table whose native handle is passed to the native writer
   * @throws IllegalStateException if this writer has already been closed
   */
  @Override
  public void write(Table table) {
    if (writerHandle == 0) {
      throw new IllegalStateException("Writer was already closed");
    }
    writeCSVChunkToBuffer(writerHandle, table.nativeHandle);
  }

  /**
   * Ends the native writer and notifies the consumer. Calling close() again
   * afterwards does nothing.
   *
   * If ending the native writer throws, the exception still propagates, but the
   * handle has already been cleared and the consumer is still notified, so a
   * repeated close() never ends the same handle twice.
   */
  @Override
  public void close() throws CudfException {
    try {
      if (writerHandle != 0) {
        // Clear the field before the native call so that a failure there cannot
        // leave a stale handle behind for a later close() or write().
        // NOTE(review): assumes the native end call must not be repeated for the
        // same handle - confirm against the JNI implementation.
        final long handle = writerHandle;
        writerHandle = 0;
        endWriteCSVToBuffer(handle);
      }
    } finally {
      if (consumer != null) {
        final HostBufferConsumer pending = consumer;
        consumer = null;
        pending.done();
      }
    }
  }
}

/**
 * Creates a writer that emits tables as CSV using the given options and
 * buffer consumer.
 *
 * @param options CSV formatting options
 * @param bufferConsumer consumer handed to the underlying native writer
 * @return a TableWriter backed by a new CSVTableWriter
 */
public static TableWriter getCSVBufferWriter(CSVWriterOptions options, HostBufferConsumer bufferConsumer) {
  final TableWriter csvWriter = new CSVTableWriter(options, bufferConsumer);
  return csvWriter;
}

/**
* Read a JSON file using the default JSONOptions.
* @param schema the schema of the file. You may use Schema.INFERRED to infer the schema.
Expand Down
Loading