From 21c3deb916237ebc744aee8999d4d2c7c42c4493 Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 15 Dec 2022 11:44:52 -0800 Subject: [PATCH 01/18] JNI bindings to write CSV This change adds JNI bindings to write tables out as CSV, to either the filesystem or memory. The Java Table class now has additional methods: 1. Table.writeCSVToFile(): Writes the current table out to the specified file on the filesystem. 2. Table.writeCSVToBuffer(): Writes the current table out to a HostBufferConsumer. These calls are analogous to cudf::io::write_csv(). Current limitations: 1. The cudf::io::csv_writer_options interface binds the CSV options tightly to the Table being written. This makes it a little clumsy to write multiple Tables to the same HostBufferConsumer, because each could be written with different, contradictory options. 2. cudf::io::write_csv(file_name) overwrites the specified file, if it exists. There currently isn't a way to keep a file open, and write multiple tables to it; each write call overwrites the previous file. --- .../java/ai/rapids/cudf/CSVWriterOptions.java | 110 ++++++++++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 38 ++++++ java/src/main/native/src/TableJni.cpp | 83 +++++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 90 ++++++++++++++ 4 files changed, 321 insertions(+) create mode 100644 java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java diff --git a/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java new file mode 100644 index 00000000000..fa963b54493 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java @@ -0,0 +1,110 @@ +/* + * + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class CSVWriterOptions { + + private String[] columnNames; + private Boolean includeHeader = false; + private String rowDelimiter = "\n"; + private byte fieldDelimiter = ','; + private String nullValue = "\\N"; + + private CSVWriterOptions(Builder builder) { + this.columnNames = builder.columnNames.toArray(new String[builder.columnNames.size()]); + this.nullValue = builder.nullValue; + this.includeHeader = builder.includeHeader; + this.fieldDelimiter = builder.fieldDelimiter; + this.rowDelimiter = builder.rowDelimiter; + } + + public String[] getColumnNames() { + return columnNames; + } + + public Boolean getIncludeHeader() { + return includeHeader; + } + + public String getRowDelimiter() { + return rowDelimiter; + } + + public byte getFieldDelimiter() { + return fieldDelimiter; + } + + public String getNullValue() { + return nullValue; + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + + private List columnNames = Collections.emptyList(); + private Boolean includeHeader = false; + private String rowDelimiter = "\n"; + private byte fieldDelimiter = ','; + private String nullValue = "\\N"; + + public CSVWriterOptions build() { + return new CSVWriterOptions(this); + } + + public Builder withColumnNames(List columnNames) { + this.columnNames = columnNames; + return this; + } + + public Builder withColumnNames(String... 
columnNames) { + List columnNamesList = new ArrayList<>(); + for (String columnName : columnNames) { + columnNamesList.add(columnName); + } + return withColumnNames(columnNamesList); + } + + public Builder withIncludeHeader(Boolean includeHeader) { + this.includeHeader = includeHeader; + return this; + } + + public Builder withRowDelimiter(String rowDelimiter) { + this.rowDelimiter = rowDelimiter; + return this; + } + + public Builder withFieldDelimiter(byte fieldDelimiter) { + this.fieldDelimiter = fieldDelimiter; + return this; + } + + public Builder withNullValue(String nullValue) { + this.nullValue = nullValue; + return this; + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b93352fa9ac..4332c18f58b 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -857,6 +857,44 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf opts.getFalseValues())); } + private static native void writeCSVToFile(long table, + String[] columnNames, + boolean includeHeader, + String rowDelimiter, + byte fieldDelimiter, + String nullValue, + String outputPath) throws CudfException; + + public void writeCSVToFile(CSVWriterOptions options, String outputPath) + { + writeCSVToFile(nativeHandle, + options.getColumnNames(), + options.getIncludeHeader(), + options.getRowDelimiter(), + options.getFieldDelimiter(), + options.getNullValue(), + outputPath); + } + + private static native void writeCSVToBuffer(long table, + String[] columnNames, + boolean includeHeader, + String rowDelimiter, + byte fieldDelimiter, + String nullValue, + HostBufferConsumer buffer) throws CudfException; + + public void writeCSVToBuffer(CSVWriterOptions options, HostBufferConsumer bufferConsumer) + { + writeCSVToBuffer(nativeHandle, + options.getColumnNames(), + options.getIncludeHeader(), + options.getRowDelimiter(), + options.getFieldDelimiter(), + 
options.getNullValue(), + bufferConsumer); + } + /** * Read a JSON file using the default JSONOptions. * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b70a7b5a615..48f9fd24127 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,4 +1,5 @@ /* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -1323,6 +1324,88 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( CATCH_STD(env, NULL); } +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile(JNIEnv *env, + jclass, + jlong j_table_handle, + jobjectArray j_column_names, + jboolean include_header, + jstring j_row_delimiter, + jbyte j_field_delimiter, + jstring j_null_value, + jstring j_output_path) +{ + JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); + JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", ); + JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", ); + JNI_NULL_CHECK(env, j_field_delimiter, "field delimiter cannot be null", ); + JNI_NULL_CHECK(env, j_null_value, "null representation string cannot be itself null", ); + JNI_NULL_CHECK(env, j_output_path, "output path cannot be null", ); + + try { + cudf::jni::auto_set_device(env); + + auto const native_output_path = cudf::jni::native_jstring{env, j_output_path}; + auto const output_path = native_output_path.get(); + + auto const table = reinterpret_cast(j_table_handle); + auto const n_column_names = cudf::jni::native_jstringArray{env, j_column_names}; + auto const column_names = n_column_names.as_cpp_vector(); + + auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter}; + auto const na_rep = cudf::jni::native_jstring{env, j_null_value}; + auto options = 
cudf::io::csv_writer_options::builder(cudf::io::sink_info{output_path}, *table) + .names(column_names) + .include_header(static_cast(include_header)) + .line_terminator(line_terminator.get()) + .inter_column_delimiter(j_field_delimiter) + .na_rep(na_rep.get()); + + cudf::io::write_csv(options.build()); + } + CATCH_STD(env, ); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer(JNIEnv *env, + jclass, + jlong j_table_handle, + jobjectArray j_column_names, + jboolean include_header, + jstring j_row_delimiter, + jbyte j_field_delimiter, + jstring j_null_value, + jobject j_buffer) +{ + JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); + JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", ); + JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", ); + JNI_NULL_CHECK(env, j_field_delimiter, "field delimiter cannot be null", ); + JNI_NULL_CHECK(env, j_null_value, "null representation string cannot be itself null", ); + JNI_NULL_CHECK(env, j_buffer, "output buffer cannot be null", ); + + try { + cudf::jni::auto_set_device(env); + + auto data_sink = cudf::jni::jni_writer_data_sink{env, j_buffer}; + + auto const table = reinterpret_cast(j_table_handle); + auto const n_column_names = cudf::jni::native_jstringArray{env, j_column_names}; + auto const column_names = n_column_names.as_cpp_vector(); + + auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter}; + auto const na_rep = cudf::jni::native_jstring{env, j_null_value}; + auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{&data_sink}, *table) + .names(column_names) + .include_header(static_cast(include_header)) + .line_terminator(line_terminator.get()) + .inter_column_delimiter(j_field_delimiter) + .na_rep(na_rep.get()); + + cudf::io::write_csv(options.build()); + data_sink.flush(); + } + CATCH_STD(env, ); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong 
buffer, jlong buffer_length, jboolean day_first, jboolean lines) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index bf951a871e7..b10fa4588e2 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -575,6 +575,96 @@ void testReadCSV() { } } + @Test + void testWriteCSVToFile() throws IOException { + File outputFile = File.createTempFile("testWriteCSVToFile", ".csv"); + Schema schema = Schema.builder() + .column(DType.INT32, "i") + .column(DType.FLOAT64, "f") + .column(DType.BOOL8, "b") + .column(DType.STRING, "str") + .build(); + CSVWriterOptions writeOptions = CSVWriterOptions.builder() + .withColumnNames(schema.getColumnNames()) + .withIncludeHeader(false) + .withFieldDelimiter((byte)'\u0001') + .withRowDelimiter("\n") + .build(); + try (Table inputTable + = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0) + .column(false, true, false, true, false, true, false, true, false, true) + .column("All", "the", "leaves", "are", "brown", "and", "the", "sky", "is", "grey") + .build()) { + inputTable.writeCSVToFile(writeOptions, outputFile.getAbsolutePath()); + + // Read back. 
+ CSVOptions readOptions = CSVOptions.builder() + .includeColumn("i") + .includeColumn("f") + .includeColumn("b") + .includeColumn("str") + .hasHeader(false) + .withDelim('\u0001') + .build(); + try (Table readTable = Table.readCSV(schema, readOptions, outputFile)) { + assertTablesAreEqual(inputTable, readTable); + } + } finally { + outputFile.delete(); + } + } + + private void testWriteCSVToBufferImpl(char fieldDelim) throws IOException { + Schema schema = Schema.builder() + .column(DType.INT32, "i") + .column(DType.FLOAT64, "f") + .column(DType.BOOL8, "b") + .column(DType.STRING, "str") + .build(); + CSVWriterOptions writeOptions = CSVWriterOptions.builder() + .withColumnNames(schema.getColumnNames()) + .withIncludeHeader(false) + .withFieldDelimiter((byte)fieldDelim) + .withRowDelimiter("\n") + .withNullValue("\\N") + .build(); + try (Table inputTable + = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, null) + .column(0.0, 1.0, 2.0, 3.0, 4.0, null, 6.0, 7.0, 8.0, 9.0) + .column(false, true, null, true, false, true, null, true, false, true) + .column("All", "the", "leaves", "are", "brown", "and", "the", "sky", "is", null) + .build(); + MyBufferConsumer consumer = new MyBufferConsumer()) { + inputTable.writeCSVToBuffer(writeOptions, consumer); + inputTable.writeCSVToBuffer(writeOptions, consumer); + inputTable.writeCSVToBuffer(writeOptions, consumer); + + // Read back. 
+ CSVOptions readOptions = CSVOptions.builder() + .includeColumn("i") + .includeColumn("f") + .includeColumn("b") + .includeColumn("str") + .hasHeader(false) + .withDelim(fieldDelim) + .withNullValue("\\N") + .build(); + try (Table readTable = Table.readCSV(schema, readOptions, consumer.buffer, 0, consumer.offset); + Table expected = Table.concatenate(inputTable, inputTable, inputTable)) { + assertTablesAreEqual(expected, readTable); + } + } + } + + @Test + void testWriteCSVToBuffer() throws IOException { + testWriteCSVToBufferImpl(','); + testWriteCSVToBufferImpl('\u0001'); + } + @Test void testReadParquet() { ParquetOptions opts = ParquetOptions.builder() From 7fa02041120d57f9bd33389afba359919f898003 Mon Sep 17 00:00:00 2001 From: MithunR Date: Tue, 20 Dec 2022 23:06:48 -0800 Subject: [PATCH 02/18] Support for chunked CSV writes in JNI: 1. Added setter to change Table instance in csv_writer_options. 2. Plumbing for new chunked writer. 3. Tests. --- cpp/include/cudf/io/csv.hpp | 9 +- java/src/main/java/ai/rapids/cudf/Table.java | 63 ++++- java/src/main/native/src/TableJni.cpp | 231 ++++++------------ .../test/java/ai/rapids/cudf/TableTest.java | 48 ++++ 4 files changed, 192 insertions(+), 159 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 1fc4114b94c..51d5b6aa143 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1332,7 +1332,7 @@ class csv_writer_options { size_type _rows_per_chunk = std::numeric_limits::max(); // character to use for separating lines (default "\n") std::string _line_terminator = "\n"; - // character to use for separating lines (default "\n") + // character to use for separating column values (default ",") char _inter_column_delimiter = ','; // string to use for values != 0 in INT8 types (default 'true') std::string _true_value = std::string{"true"}; @@ -1498,6 +1498,13 @@ class csv_writer_options { * @param val String to represent values == 0 in INT8 types */ void 
set_false_value(std::string val) { _false_value = val; } + + /** + * @brief (Re)sets the table being written. + * + * @param table Table to be written + */ + void set_table(table_view const& table) { _table = table; } }; /** diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4332c18f58b..4098d7b67a8 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -887,12 +887,63 @@ private static native void writeCSVToBuffer(long table, public void writeCSVToBuffer(CSVWriterOptions options, HostBufferConsumer bufferConsumer) { writeCSVToBuffer(nativeHandle, - options.getColumnNames(), - options.getIncludeHeader(), - options.getRowDelimiter(), - options.getFieldDelimiter(), - options.getNullValue(), - bufferConsumer); + options.getColumnNames(), + options.getIncludeHeader(), + options.getRowDelimiter(), + options.getFieldDelimiter(), + options.getNullValue(), + bufferConsumer); + } + + private static native long writeCSVToBufferBegin(String[] columnNames, + boolean includeHeader, + String rowDelimiter, + byte fieldDelimiter, + String nullValue, + HostBufferConsumer buffer) throws CudfException; + + private static native void writeCSVChunkToBuffer(long writerHandle, long tableHandle); + + private static native void writeCSVToBufferEnd(long writerHandle); + + private static class CSVTableWriter implements TableWriter { + private long writerHandle; + private HostBufferConsumer consumer; + + private CSVTableWriter(CSVWriterOptions options, HostBufferConsumer consumer) { + this.writerHandle = writeCSVToBufferBegin(options.getColumnNames(), + options.getIncludeHeader(), + options.getRowDelimiter(), + options.getFieldDelimiter(), + options.getNullValue(), + consumer); + this.consumer = consumer; + } + + @Override + public void write(Table table) { + if (writerHandle == 0) { + throw new IllegalStateException("Writer was already closed"); + } + 
writeCSVChunkToBuffer(writerHandle, table.nativeHandle); + } + + @Override + public void close() throws CudfException { + if (writerHandle != 0) { + writeCSVToBufferEnd(writerHandle); + writerHandle = 0; + } + if (consumer != null) { + consumer.done(); + consumer = null; + } + } + } + + public static TableWriter getCSVBufferWriter(CSVWriterOptions options, HostBufferConsumer bufferConsumer) + { + return new CSVTableWriter(options, bufferConsumer); } /** diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 48f9fd24127..7a61acbbc6c 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -48,165 +48,17 @@ #include #include +#include "csv_chunked_writer.hpp" #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" #include "jni_compiled_expr.hpp" #include "jni_utils.hpp" +#include "jni_writer_data_sink.hpp" #include "row_conversion.hpp" namespace cudf { namespace jni { -constexpr long MINIMUM_WRITE_BUFFER_SIZE = 10 * 1024 * 1024; // 10 MB - -class jni_writer_data_sink final : public cudf::io::data_sink { -public: - explicit jni_writer_data_sink(JNIEnv *env, jobject callback) { - if (env->GetJavaVM(&jvm) < 0) { - throw std::runtime_error("GetJavaVM failed"); - } - - jclass cls = env->GetObjectClass(callback); - if (cls == nullptr) { - throw cudf::jni::jni_exception("class not found"); - } - - handle_buffer_method = - env->GetMethodID(cls, "handleBuffer", "(Lai/rapids/cudf/HostMemoryBuffer;J)V"); - if (handle_buffer_method == nullptr) { - throw cudf::jni::jni_exception("handleBuffer method"); - } - - this->callback = env->NewGlobalRef(callback); - if (this->callback == nullptr) { - throw cudf::jni::jni_exception("global ref"); - } - } - - virtual ~jni_writer_data_sink() { - // This should normally be called by a JVM thread. If the JVM environment is missing then this - // is likely being triggered by the C++ runtime during shutdown. 
In that case the JVM may - // already be destroyed and this thread should not try to attach to get an environment. - JNIEnv *env = nullptr; - if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { - env->DeleteGlobalRef(callback); - if (current_buffer != nullptr) { - env->DeleteGlobalRef(current_buffer); - } - } - callback = nullptr; - current_buffer = nullptr; - } - - void host_write(void const *data, size_t size) override { - JNIEnv *env = cudf::jni::get_jni_env(jvm); - long left_to_copy = static_cast(size); - const char *copy_from = static_cast(data); - while (left_to_copy > 0) { - long buffer_amount_available = current_buffer_len - current_buffer_written; - if (buffer_amount_available <= 0) { - // should never be < 0, but just to be safe - rotate_buffer(env); - buffer_amount_available = current_buffer_len - current_buffer_written; - } - long amount_to_copy = - left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available; - char *copy_to = current_buffer_data + current_buffer_written; - - std::memcpy(copy_to, copy_from, amount_to_copy); - copy_from = copy_from + amount_to_copy; - current_buffer_written += amount_to_copy; - total_written += amount_to_copy; - left_to_copy -= amount_to_copy; - } - } - - bool supports_device_write() const override { return true; } - - void device_write(void const *gpu_data, size_t size, rmm::cuda_stream_view stream) override { - JNIEnv *env = cudf::jni::get_jni_env(jvm); - long left_to_copy = static_cast(size); - const char *copy_from = static_cast(gpu_data); - while (left_to_copy > 0) { - long buffer_amount_available = current_buffer_len - current_buffer_written; - if (buffer_amount_available <= 0) { - // should never be < 0, but just to be safe - stream.synchronize(); - rotate_buffer(env); - buffer_amount_available = current_buffer_len - current_buffer_written; - } - long amount_to_copy = - left_to_copy < buffer_amount_available ? 
left_to_copy : buffer_amount_available; - char *copy_to = current_buffer_data + current_buffer_written; - - CUDF_CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost, - stream.value())); - - copy_from = copy_from + amount_to_copy; - current_buffer_written += amount_to_copy; - total_written += amount_to_copy; - left_to_copy -= amount_to_copy; - } - stream.synchronize(); - } - - std::future device_write_async(void const *gpu_data, size_t size, - rmm::cuda_stream_view stream) override { - // Call the sync version until figuring out how to write asynchronously. - device_write(gpu_data, size, stream); - return std::async(std::launch::deferred, [] {}); - } - - void flush() override { - if (current_buffer_written > 0) { - JNIEnv *env = cudf::jni::get_jni_env(jvm); - handle_buffer(env, current_buffer, current_buffer_written); - if (current_buffer != nullptr) { - env->DeleteGlobalRef(current_buffer); - } - current_buffer = nullptr; - current_buffer_len = 0; - current_buffer_data = nullptr; - current_buffer_written = 0; - } - } - - size_t bytes_written() override { return total_written; } - - void set_alloc_size(long size) { this->alloc_size = size; } - -private: - void rotate_buffer(JNIEnv *env) { - if (current_buffer != nullptr) { - handle_buffer(env, current_buffer, current_buffer_written); - env->DeleteGlobalRef(current_buffer); - current_buffer = nullptr; - } - jobject tmp_buffer = allocate_host_buffer(env, alloc_size, true); - current_buffer = env->NewGlobalRef(tmp_buffer); - current_buffer_len = get_host_buffer_length(env, current_buffer); - current_buffer_data = reinterpret_cast(get_host_buffer_address(env, current_buffer)); - current_buffer_written = 0; - } - - void handle_buffer(JNIEnv *env, jobject buffer, jlong len) { - env->CallVoidMethod(callback, handle_buffer_method, buffer, len); - if (env->ExceptionCheck()) { - throw std::runtime_error("handleBuffer threw an exception"); - } - } - - JavaVM *jvm; - jobject callback; - jmethodID 
handle_buffer_method; - jobject current_buffer = nullptr; - char *current_buffer_data = nullptr; - long current_buffer_len = 0; - long current_buffer_written = 0; - size_t total_written = 0; - long alloc_size = MINIMUM_WRITE_BUFFER_SIZE; -}; - template class jni_table_writer_handle final { public: explicit jni_table_writer_handle(std::unique_ptr writer) @@ -1398,14 +1250,89 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer(JNIEnv *env, .include_header(static_cast(include_header)) .line_terminator(line_terminator.get()) .inter_column_delimiter(j_field_delimiter) - .na_rep(na_rep.get()); + .na_rep(na_rep.get()) + .build(); - cudf::io::write_csv(options.build()); + std::cout << "Unchunked write: rows per chunk: " << options.get_rows_per_chunk() << std::endl; + cudf::io::write_csv(options); data_sink.flush(); } CATCH_STD(env, ); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferBegin(JNIEnv *env, + jclass, + jobjectArray j_column_names, + jboolean include_header, + jstring j_row_delimiter, + jbyte j_field_delimiter, + jstring j_null_value, + jobject j_buffer) +{ + JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", 0); + JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", 0); + JNI_NULL_CHECK(env, j_field_delimiter, "field delimiter cannot be null", 0); + JNI_NULL_CHECK(env, j_null_value, "null representation string cannot be itself null", 0); + JNI_NULL_CHECK(env, j_buffer, "output buffer cannot be null", 0); + // TODO: Add support for true/false string representations. 
+ + try { + cudf::jni::auto_set_device(env); + + auto data_sink = std::make_unique(env, j_buffer); + + auto const n_column_names = cudf::jni::native_jstringArray{env, j_column_names}; + auto const column_names = n_column_names.as_cpp_vector(); + + auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter}; + auto const na_rep = cudf::jni::native_jstring{env, j_null_value}; + auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{data_sink.get()}, + cudf::table_view{}) + .names(column_names) + .include_header(static_cast(include_header)) + .line_terminator(line_terminator.get()) + .inter_column_delimiter(j_field_delimiter) + .na_rep(na_rep.get()) + .build(); + + std::cout << "writeBegin(): rows per chunk: " << options.get_rows_per_chunk() << std::endl; + return ptr_as_jlong(new cudf::jni::io::csv_chunked_writer{options, data_sink}); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVChunkToBuffer(JNIEnv *env, + jclass, + jlong j_writer_handle, + jlong j_table_handle) { + JNI_NULL_CHECK(env, j_writer_handle, "writer handle cannot be null.", ); + JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); + + auto const table = reinterpret_cast(j_table_handle); + auto writer = reinterpret_cast(j_writer_handle); + + try { + cudf::jni::auto_set_device(env); + writer->write(*table); + } + CATCH_STD(env, ); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferEnd(JNIEnv *env, + jclass, + jlong j_writer_handle) { + JNI_NULL_CHECK(env, j_writer_handle, "writer handle cannot be null.", ); + + using cudf::jni::io::csv_chunked_writer; + auto writer = std::unique_ptr{reinterpret_cast(j_writer_handle)}; + + try { + cudf::jni::auto_set_device(env); + writer->close(); + } + CATCH_STD(env, ); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { diff --git 
a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index b10fa4588e2..7493bfa9e73 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -665,6 +665,54 @@ void testWriteCSVToBuffer() throws IOException { testWriteCSVToBufferImpl('\u0001'); } + @Test + void testChunkedCSVWriter() throws IOException { + char fieldDelim = ','; + Schema schema = Schema.builder() + .column(DType.INT32, "i") + .column(DType.FLOAT64, "f") + .column(DType.BOOL8, "b") + .column(DType.STRING, "str") + .build(); + CSVWriterOptions writeOptions = CSVWriterOptions.builder() + .withColumnNames(schema.getColumnNames()) + .withIncludeHeader(false) + .withFieldDelimiter((byte)fieldDelim) + .withRowDelimiter("\n") + .withNullValue("\\N") + .build(); + try (Table inputTable + = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, null) + .column(0.0, 1.0, 2.0, 3.0, 4.0, null, 6.0, 7.0, 8.0, 9.0) + .column(false, true, null, true, false, true, null, true, false, true) + .column("All", "the", "leaves", "are", "brown", "and", "the", "sky", "is", null) + .build(); + MyBufferConsumer consumer = new MyBufferConsumer()) { + + try (TableWriter writer = Table.getCSVBufferWriter(writeOptions, consumer)) { + writer.write(inputTable); + writer.write(inputTable); + writer.write(inputTable); + } + + // Read back. 
+ CSVOptions readOptions = CSVOptions.builder() + .includeColumn("i") + .includeColumn("f") + .includeColumn("b") + .includeColumn("str") + .hasHeader(false) + .withDelim(fieldDelim) + .withNullValue("\\N") + .build(); + try (Table readTable = Table.readCSV(schema, readOptions, consumer.buffer, 0, consumer.offset); + Table expected = Table.concatenate(inputTable, inputTable, inputTable)) { + assertTablesAreEqual(expected, readTable); + } + } + } + @Test void testReadParquet() { ParquetOptions opts = ParquetOptions.builder() From e446ae338133f69552aa0cd15808d5f7a6fbae76 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 21 Dec 2022 13:53:08 -0800 Subject: [PATCH 03/18] Added tests for header inclusion. --- java/src/main/native/src/TableJni.cpp | 2 -- .../src/test/java/ai/rapids/cudf/TableTest.java | 17 +++++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 7a61acbbc6c..a6bee613e82 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1253,7 +1253,6 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer(JNIEnv *env, .na_rep(na_rep.get()) .build(); - std::cout << "Unchunked write: rows per chunk: " << options.get_rows_per_chunk() << std::endl; cudf::io::write_csv(options); data_sink.flush(); } @@ -1295,7 +1294,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferBegin(JNIEnv * .na_rep(na_rep.get()) .build(); - std::cout << "writeBegin(): rows per chunk: " << options.get_rows_per_chunk() << std::endl; return ptr_as_jlong(new cudf::jni::io::csv_chunked_writer{options, data_sink}); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 7493bfa9e73..410b31ad02c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -50,7 +50,6 @@ import 
java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.*; -import java.util.function.IntFunction; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -665,9 +664,7 @@ void testWriteCSVToBuffer() throws IOException { testWriteCSVToBufferImpl('\u0001'); } - @Test - void testChunkedCSVWriter() throws IOException { - char fieldDelim = ','; + private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader) throws IOException { Schema schema = Schema.builder() .column(DType.INT32, "i") .column(DType.FLOAT64, "f") @@ -676,7 +673,7 @@ void testChunkedCSVWriter() throws IOException { .build(); CSVWriterOptions writeOptions = CSVWriterOptions.builder() .withColumnNames(schema.getColumnNames()) - .withIncludeHeader(false) + .withIncludeHeader(includeHeader) .withFieldDelimiter((byte)fieldDelim) .withRowDelimiter("\n") .withNullValue("\\N") @@ -702,7 +699,7 @@ void testChunkedCSVWriter() throws IOException { .includeColumn("f") .includeColumn("b") .includeColumn("str") - .hasHeader(false) + .hasHeader(includeHeader) .withDelim(fieldDelim) .withNullValue("\\N") .build(); @@ -713,6 +710,14 @@ void testChunkedCSVWriter() throws IOException { } } + @Test + void testChunkedCSVWriter() throws IOException { + testChunkedCSVWriterImpl(',', false); + testChunkedCSVWriterImpl(',', true); + testChunkedCSVWriterImpl('\u0001', false); + testChunkedCSVWriterImpl('\u0001', true); + } + @Test void testReadParquet() { ParquetOptions opts = ParquetOptions.builder() From 54a5a87b16c0a0979714192784f2ef8ab520b536 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 21 Dec 2022 13:55:12 -0800 Subject: [PATCH 04/18] Formatting. 
--- cpp/include/cudf/io/csv.hpp | 2 +- java/src/main/native/src/TableJni.cpp | 91 +++++++++++---------------- 2 files changed, 36 insertions(+), 57 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 51d5b6aa143..2335cd8e947 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1501,7 +1501,7 @@ class csv_writer_options { /** * @brief (Re)sets the table being written. - * + * * @param table Table to be written */ void set_table(table_view const& table) { _table = table; } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index a6bee613e82..c5c88a59283 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1176,16 +1176,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( CATCH_STD(env, NULL); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile(JNIEnv *env, - jclass, - jlong j_table_handle, - jobjectArray j_column_names, - jboolean include_header, - jstring j_row_delimiter, - jbyte j_field_delimiter, - jstring j_null_value, - jstring j_output_path) -{ +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile( + JNIEnv *env, jclass, jlong j_table_handle, jobjectArray j_column_names, jboolean include_header, + jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jstring j_output_path) { JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", ); JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", ); @@ -1206,27 +1199,20 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile(JNIEnv *env, auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter}; auto const na_rep = cudf::jni::native_jstring{env, j_null_value}; auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{output_path}, *table) - .names(column_names) - 
.include_header(static_cast(include_header)) - .line_terminator(line_terminator.get()) - .inter_column_delimiter(j_field_delimiter) - .na_rep(na_rep.get()); + .names(column_names) + .include_header(static_cast(include_header)) + .line_terminator(line_terminator.get()) + .inter_column_delimiter(j_field_delimiter) + .na_rep(na_rep.get()); cudf::io::write_csv(options.build()); } CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer(JNIEnv *env, - jclass, - jlong j_table_handle, - jobjectArray j_column_names, - jboolean include_header, - jstring j_row_delimiter, - jbyte j_field_delimiter, - jstring j_null_value, - jobject j_buffer) -{ +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer( + JNIEnv *env, jclass, jlong j_table_handle, jobjectArray j_column_names, jboolean include_header, + jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jobject j_buffer) { JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", ); JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", ); @@ -1237,7 +1223,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer(JNIEnv *env, try { cudf::jni::auto_set_device(env); - auto data_sink = cudf::jni::jni_writer_data_sink{env, j_buffer}; + auto data_sink = cudf::jni::jni_writer_data_sink{env, j_buffer}; auto const table = reinterpret_cast(j_table_handle); auto const n_column_names = cudf::jni::native_jstringArray{env, j_column_names}; @@ -1246,12 +1232,12 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer(JNIEnv *env, auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter}; auto const na_rep = cudf::jni::native_jstring{env, j_null_value}; auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{&data_sink}, *table) - .names(column_names) - .include_header(static_cast(include_header)) - 
.line_terminator(line_terminator.get()) - .inter_column_delimiter(j_field_delimiter) - .na_rep(na_rep.get()) - .build(); + .names(column_names) + .include_header(static_cast(include_header)) + .line_terminator(line_terminator.get()) + .inter_column_delimiter(j_field_delimiter) + .na_rep(na_rep.get()) + .build(); cudf::io::write_csv(options); data_sink.flush(); @@ -1259,15 +1245,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer(JNIEnv *env, CATCH_STD(env, ); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferBegin(JNIEnv *env, - jclass, - jobjectArray j_column_names, - jboolean include_header, - jstring j_row_delimiter, - jbyte j_field_delimiter, - jstring j_null_value, - jobject j_buffer) -{ +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferBegin( + JNIEnv *env, jclass, jobjectArray j_column_names, jboolean include_header, + jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jobject j_buffer) { JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", 0); JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", 0); JNI_NULL_CHECK(env, j_field_delimiter, "field delimiter cannot be null", 0); @@ -1278,36 +1258,35 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferBegin(JNIEnv * try { cudf::jni::auto_set_device(env); - auto data_sink = std::make_unique(env, j_buffer); + auto data_sink = std::make_unique(env, j_buffer); auto const n_column_names = cudf::jni::native_jstringArray{env, j_column_names}; auto const column_names = n_column_names.as_cpp_vector(); auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter}; auto const na_rep = cudf::jni::native_jstring{env, j_null_value}; - auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{data_sink.get()}, + auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{data_sink.get()}, cudf::table_view{}) - .names(column_names) - 
.include_header(static_cast(include_header)) - .line_terminator(line_terminator.get()) - .inter_column_delimiter(j_field_delimiter) - .na_rep(na_rep.get()) - .build(); + .names(column_names) + .include_header(static_cast(include_header)) + .line_terminator(line_terminator.get()) + .inter_column_delimiter(j_field_delimiter) + .na_rep(na_rep.get()) + .build(); return ptr_as_jlong(new cudf::jni::io::csv_chunked_writer{options, data_sink}); } CATCH_STD(env, 0); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVChunkToBuffer(JNIEnv *env, - jclass, +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVChunkToBuffer(JNIEnv *env, jclass, jlong j_writer_handle, jlong j_table_handle) { JNI_NULL_CHECK(env, j_writer_handle, "writer handle cannot be null.", ); JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); auto const table = reinterpret_cast(j_table_handle); - auto writer = reinterpret_cast(j_writer_handle); + auto writer = reinterpret_cast(j_writer_handle); try { cudf::jni::auto_set_device(env); @@ -1316,13 +1295,13 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVChunkToBuffer(JNIEnv *e CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferEnd(JNIEnv *env, - jclass, - jlong j_writer_handle) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferEnd(JNIEnv *env, jclass, + jlong j_writer_handle) { JNI_NULL_CHECK(env, j_writer_handle, "writer handle cannot be null.", ); using cudf::jni::io::csv_chunked_writer; - auto writer = std::unique_ptr{reinterpret_cast(j_writer_handle)}; + auto writer = + std::unique_ptr{reinterpret_cast(j_writer_handle)}; try { cudf::jni::auto_set_device(env); From c8f74de3a5709b319c0c0e295f025a624fcc023c Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 21 Dec 2022 15:10:35 -0800 Subject: [PATCH 05/18] Support to specify TRUE/FALSE strings. 
--- .../java/ai/rapids/cudf/CSVWriterOptions.java | 24 +++ java/src/main/java/ai/rapids/cudf/Table.java | 8 + java/src/main/native/src/TableJni.cpp | 21 ++- .../main/native/src/csv_chunked_writer.hpp | 77 ++++++++ .../main/native/src/jni_writer_data_sink.hpp | 176 ++++++++++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 8 + 6 files changed, 310 insertions(+), 4 deletions(-) create mode 100644 java/src/main/native/src/csv_chunked_writer.hpp create mode 100644 java/src/main/native/src/jni_writer_data_sink.hpp diff --git a/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java index fa963b54493..9d654007209 100644 --- a/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java @@ -29,6 +29,8 @@ public class CSVWriterOptions { private String rowDelimiter = "\n"; private byte fieldDelimiter = ','; private String nullValue = "\\N"; + private String falseValue = "false"; + private String trueValue = "true"; private CSVWriterOptions(Builder builder) { this.columnNames = builder.columnNames.toArray(new String[builder.columnNames.size()]); @@ -36,6 +38,8 @@ private CSVWriterOptions(Builder builder) { this.includeHeader = builder.includeHeader; this.fieldDelimiter = builder.fieldDelimiter; this.rowDelimiter = builder.rowDelimiter; + this.falseValue = builder.falseValue; + this.trueValue = builder.trueValue; } public String[] getColumnNames() { @@ -58,6 +62,14 @@ public String getNullValue() { return nullValue; } + public String getTrueValue() { + return trueValue; + } + + public String getFalseValue() { + return falseValue; + } + public static Builder builder() { return new Builder(); } @@ -69,6 +81,8 @@ public static class Builder { private String rowDelimiter = "\n"; private byte fieldDelimiter = ','; private String nullValue = "\\N"; + private String falseValue = "false"; + private String trueValue = "true"; public CSVWriterOptions build() { return 
new CSVWriterOptions(this); @@ -106,5 +120,15 @@ public Builder withNullValue(String nullValue) { this.nullValue = nullValue; return this; } + + public Builder withTrueValue(String trueValue) { + this.trueValue = trueValue; + return this; + } + + public Builder withFalseValue(String falseValue) { + this.falseValue = falseValue; + return this; + } } } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4098d7b67a8..4ece07919e0 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -863,6 +863,8 @@ private static native void writeCSVToFile(long table, String rowDelimiter, byte fieldDelimiter, String nullValue, + String trueValue, + String falseValue, String outputPath) throws CudfException; public void writeCSVToFile(CSVWriterOptions options, String outputPath) @@ -873,6 +875,8 @@ public void writeCSVToFile(CSVWriterOptions options, String outputPath) options.getRowDelimiter(), options.getFieldDelimiter(), options.getNullValue(), + options.getTrueValue(), + options.getFalseValue(), outputPath); } @@ -900,6 +904,8 @@ private static native long writeCSVToBufferBegin(String[] columnNames, String rowDelimiter, byte fieldDelimiter, String nullValue, + String trueValue, + String falseValue, HostBufferConsumer buffer) throws CudfException; private static native void writeCSVChunkToBuffer(long writerHandle, long tableHandle); @@ -916,6 +922,8 @@ private CSVTableWriter(CSVWriterOptions options, HostBufferConsumer consumer) { options.getRowDelimiter(), options.getFieldDelimiter(), options.getNullValue(), + options.getTrueValue(), + options.getFalseValue(), consumer); this.consumer = consumer; } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c5c88a59283..5aff9421619 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1178,12 +1178,15 @@ JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_readCSV( JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile( JNIEnv *env, jclass, jlong j_table_handle, jobjectArray j_column_names, jboolean include_header, - jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jstring j_output_path) { + jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jstring j_true_value, + jstring j_false_value, jstring j_output_path) { JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", ); JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", ); JNI_NULL_CHECK(env, j_field_delimiter, "field delimiter cannot be null", ); JNI_NULL_CHECK(env, j_null_value, "null representation string cannot be itself null", ); + JNI_NULL_CHECK(env, j_true_value, "representation string for `true` cannot be null", ); + JNI_NULL_CHECK(env, j_false_value, "representation string for `false` cannot be null", ); JNI_NULL_CHECK(env, j_output_path, "output path cannot be null", ); try { @@ -1198,12 +1201,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile( auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter}; auto const na_rep = cudf::jni::native_jstring{env, j_null_value}; + auto const true_value = cudf::jni::native_jstring{env, j_true_value}; + auto const false_value = cudf::jni::native_jstring{env, j_false_value}; + auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{output_path}, *table) .names(column_names) .include_header(static_cast(include_header)) .line_terminator(line_terminator.get()) .inter_column_delimiter(j_field_delimiter) - .na_rep(na_rep.get()); + .na_rep(na_rep.get()) + .true_value(true_value.get()) + .false_value(false_value.get()); cudf::io::write_csv(options.build()); } @@ -1247,13 +1255,13 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer( JNIEXPORT jlong JNICALL 
Java_ai_rapids_cudf_Table_writeCSVToBufferBegin( JNIEnv *env, jclass, jobjectArray j_column_names, jboolean include_header, - jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jobject j_buffer) { + jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jstring j_true_value, + jstring j_false_value, jobject j_buffer) { JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", 0); JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", 0); JNI_NULL_CHECK(env, j_field_delimiter, "field delimiter cannot be null", 0); JNI_NULL_CHECK(env, j_null_value, "null representation string cannot be itself null", 0); JNI_NULL_CHECK(env, j_buffer, "output buffer cannot be null", 0); - // TODO: Add support for true/false string representations. try { cudf::jni::auto_set_device(env); @@ -1265,6 +1273,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferBegin( auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter}; auto const na_rep = cudf::jni::native_jstring{env, j_null_value}; + auto const true_value = cudf::jni::native_jstring{env, j_true_value}; + auto const false_value = cudf::jni::native_jstring{env, j_false_value}; + auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{data_sink.get()}, cudf::table_view{}) .names(column_names) @@ -1272,6 +1283,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferBegin( .line_terminator(line_terminator.get()) .inter_column_delimiter(j_field_delimiter) .na_rep(na_rep.get()) + .true_value(true_value.get()) + .false_value(false_value.get()) .build(); return ptr_as_jlong(new cudf::jni::io::csv_chunked_writer{options, data_sink}); diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp new file mode 100644 index 00000000000..9a916bd0646 --- /dev/null +++ b/java/src/main/native/src/csv_chunked_writer.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include "jni_writer_data_sink.hpp" + +namespace cudf::jni::io { + +/** + * @brief Class to write multiple Tables into the jni_writer_data_sink. + * + * TODO: Consider moving to cpp/ in the future, if there is interest. + */ +class csv_chunked_writer { + + cudf::io::csv_writer_options _options; + std::unique_ptr _sink; + + bool _first_write_completed = false; ///< Decides if header should be written. + +public: + explicit csv_chunked_writer(cudf::io::csv_writer_options options, + std::unique_ptr &sink) + : _options{options}, _sink{std::move(sink)} { + auto const &sink_info = _options.get_sink(); + // Assert invariants. + CUDF_EXPECTS(sink_info.type() != cudf::io::io_type::FILEPATH, + "Currently, chunked CSV writes to files is not supported."); + + // Note: csv_writer_options ties the sink(s) to the options, and exposes + // no way to modify the sinks afterwards. + // Ideally, the options would have been separate from the tables written, + // and the destination sinks. + // Here, we retain a modifiable reference to the sink, and confirm the + // options point to the same sink. 
+ CUDF_EXPECTS(sink_info.num_sinks() == 1, "csv_chunked_writer should have exactly one sink."); + CUDF_EXPECTS(sink_info.user_sinks()[0] == _sink.get(), "Sink mismatch."); + } + + void write(cudf::table_view const &table) { + if (_first_write_completed) { + _options.enable_include_header(false); // Don't write header after the first write. + } else { + _first_write_completed = true; + } + + _options.set_table(table); + _options.set_rows_per_chunk(table.num_rows()); + + cudf::io::write_csv(_options); + } + + void close() { + // Flush pending writes to sink. + _sink->flush(); + } +}; + +} // namespace cudf::jni::io \ No newline at end of file diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp new file mode 100644 index 00000000000..3656931acb2 --- /dev/null +++ b/java/src/main/native/src/jni_writer_data_sink.hpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "cudf_jni_apis.hpp" +#include "jni_utils.hpp" + +#pragma once + +namespace cudf::jni { + +constexpr long MINIMUM_WRITE_BUFFER_SIZE = 10 * 1024 * 1024; // 10 MB + +class jni_writer_data_sink final : public cudf::io::data_sink { +public: + explicit jni_writer_data_sink(JNIEnv *env, jobject callback) { + if (env->GetJavaVM(&jvm) < 0) { + throw std::runtime_error("GetJavaVM failed"); + } + + jclass cls = env->GetObjectClass(callback); + if (cls == nullptr) { + throw cudf::jni::jni_exception("class not found"); + } + + handle_buffer_method = + env->GetMethodID(cls, "handleBuffer", "(Lai/rapids/cudf/HostMemoryBuffer;J)V"); + if (handle_buffer_method == nullptr) { + throw cudf::jni::jni_exception("handleBuffer method"); + } + + this->callback = env->NewGlobalRef(callback); + if (this->callback == nullptr) { + throw cudf::jni::jni_exception("global ref"); + } + } + + virtual ~jni_writer_data_sink() { + // This should normally be called by a JVM thread. If the JVM environment is missing then this + // is likely being triggered by the C++ runtime during shutdown. In that case the JVM may + // already be destroyed and this thread should not try to attach to get an environment. 
+ JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + env->DeleteGlobalRef(callback); + if (current_buffer != nullptr) { + env->DeleteGlobalRef(current_buffer); + } + } + callback = nullptr; + current_buffer = nullptr; + } + + void host_write(void const *data, size_t size) override { + JNIEnv *env = cudf::jni::get_jni_env(jvm); + long left_to_copy = static_cast(size); + const char *copy_from = static_cast(data); + while (left_to_copy > 0) { + long buffer_amount_available = current_buffer_len - current_buffer_written; + if (buffer_amount_available <= 0) { + // should never be < 0, but just to be safe + rotate_buffer(env); + buffer_amount_available = current_buffer_len - current_buffer_written; + } + long amount_to_copy = + left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available; + char *copy_to = current_buffer_data + current_buffer_written; + + std::memcpy(copy_to, copy_from, amount_to_copy); + copy_from = copy_from + amount_to_copy; + current_buffer_written += amount_to_copy; + total_written += amount_to_copy; + left_to_copy -= amount_to_copy; + } + } + + bool supports_device_write() const override { return true; } + + void device_write(void const *gpu_data, size_t size, rmm::cuda_stream_view stream) override { + JNIEnv *env = cudf::jni::get_jni_env(jvm); + long left_to_copy = static_cast(size); + const char *copy_from = static_cast(gpu_data); + while (left_to_copy > 0) { + long buffer_amount_available = current_buffer_len - current_buffer_written; + if (buffer_amount_available <= 0) { + // should never be < 0, but just to be safe + stream.synchronize(); + rotate_buffer(env); + buffer_amount_available = current_buffer_len - current_buffer_written; + } + long amount_to_copy = + left_to_copy < buffer_amount_available ? 
left_to_copy : buffer_amount_available; + char *copy_to = current_buffer_data + current_buffer_written; + + CUDF_CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost, + stream.value())); + + copy_from = copy_from + amount_to_copy; + current_buffer_written += amount_to_copy; + total_written += amount_to_copy; + left_to_copy -= amount_to_copy; + } + stream.synchronize(); + } + + std::future device_write_async(void const *gpu_data, size_t size, + rmm::cuda_stream_view stream) override { + // Call the sync version until figuring out how to write asynchronously. + device_write(gpu_data, size, stream); + return std::async(std::launch::deferred, [] {}); + } + + void flush() override { + if (current_buffer_written > 0) { + JNIEnv *env = cudf::jni::get_jni_env(jvm); + handle_buffer(env, current_buffer, current_buffer_written); + if (current_buffer != nullptr) { + env->DeleteGlobalRef(current_buffer); + } + current_buffer = nullptr; + current_buffer_len = 0; + current_buffer_data = nullptr; + current_buffer_written = 0; + } + } + + size_t bytes_written() override { return total_written; } + + void set_alloc_size(long size) { this->alloc_size = size; } + +private: + void rotate_buffer(JNIEnv *env) { + if (current_buffer != nullptr) { + handle_buffer(env, current_buffer, current_buffer_written); + env->DeleteGlobalRef(current_buffer); + current_buffer = nullptr; + } + jobject tmp_buffer = allocate_host_buffer(env, alloc_size, true); + current_buffer = env->NewGlobalRef(tmp_buffer); + current_buffer_len = get_host_buffer_length(env, current_buffer); + current_buffer_data = reinterpret_cast(get_host_buffer_address(env, current_buffer)); + current_buffer_written = 0; + } + + void handle_buffer(JNIEnv *env, jobject buffer, jlong len) { + env->CallVoidMethod(callback, handle_buffer_method, buffer, len); + if (env->ExceptionCheck()) { + throw std::runtime_error("handleBuffer threw an exception"); + } + } + + JavaVM *jvm; + jobject callback; + jmethodID 
handle_buffer_method; + jobject current_buffer = nullptr; + char *current_buffer_data = nullptr; + long current_buffer_len = 0; + long current_buffer_written = 0; + size_t total_written = 0; + long alloc_size = MINIMUM_WRITE_BUFFER_SIZE; +}; + +} // namespace cudf::jni \ No newline at end of file diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 410b31ad02c..116900edbfc 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -588,6 +588,8 @@ void testWriteCSVToFile() throws IOException { .withIncludeHeader(false) .withFieldDelimiter((byte)'\u0001') .withRowDelimiter("\n") + .withTrueValue("T") + .withFalseValue("F") .build(); try (Table inputTable = new Table.TestBuilder() @@ -606,6 +608,8 @@ void testWriteCSVToFile() throws IOException { .includeColumn("str") .hasHeader(false) .withDelim('\u0001') + .withTrueValue("T") + .withFalseValue("F") .build(); try (Table readTable = Table.readCSV(schema, readOptions, outputFile)) { assertTablesAreEqual(inputTable, readTable); @@ -677,6 +681,8 @@ private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader) th .withFieldDelimiter((byte)fieldDelim) .withRowDelimiter("\n") .withNullValue("\\N") + .withTrueValue("T") + .withFalseValue("F") .build(); try (Table inputTable = new Table.TestBuilder() @@ -702,6 +708,8 @@ private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader) th .hasHeader(includeHeader) .withDelim(fieldDelim) .withNullValue("\\N") + .withTrueValue("T") + .withFalseValue("F") .build(); try (Table readTable = Table.readCSV(schema, readOptions, consumer.buffer, 0, consumer.offset); Table expected = Table.concatenate(inputTable, inputTable, inputTable)) { From ebbfcb824665e1843cab7338e29afac95ac359b1 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 21 Dec 2022 15:30:27 -0800 Subject: [PATCH 06/18] Added tests for combinations of True/False reps, 
header inclusion, etc. --- .../test/java/ai/rapids/cudf/TableTest.java | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 116900edbfc..4856a4b40fa 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -574,8 +574,8 @@ void testReadCSV() { } } - @Test - void testWriteCSVToFile() throws IOException { + private void testWriteCSVToFileImpl(char fieldDelim, boolean includeHeader, + String trueValue, String falseValue) throws IOException { File outputFile = File.createTempFile("testWriteCSVToFile", ".csv"); Schema schema = Schema.builder() .column(DType.INT32, "i") @@ -619,6 +619,16 @@ void testWriteCSVToFile() throws IOException { } } + @Test + void testWriteCSVToFile() throws IOException { + final boolean INCLUDE_HEADER = true; + final boolean NO_HEADER = false; + testWriteCSVToFileImpl(',', INCLUDE_HEADER, "true", "false"); + testWriteCSVToFileImpl(',', NO_HEADER, "TRUE", "FALSE"); + testWriteCSVToFileImpl('\u0001', INCLUDE_HEADER, "T", "F"); + testWriteCSVToFileImpl('\u0001', NO_HEADER, "True", "False"); + } + private void testWriteCSVToBufferImpl(char fieldDelim) throws IOException { Schema schema = Schema.builder() .column(DType.INT32, "i") @@ -668,7 +678,8 @@ void testWriteCSVToBuffer() throws IOException { testWriteCSVToBufferImpl('\u0001'); } - private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader) throws IOException { + private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader, + String trueValue, String falseValue) throws IOException { Schema schema = Schema.builder() .column(DType.INT32, "i") .column(DType.FLOAT64, "f") @@ -681,8 +692,8 @@ private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader) th .withFieldDelimiter((byte)fieldDelim) .withRowDelimiter("\n") .withNullValue("\\N") - 
.withTrueValue("T") - .withFalseValue("F") + .withTrueValue(trueValue) + .withFalseValue(falseValue) .build(); try (Table inputTable = new Table.TestBuilder() @@ -708,8 +719,8 @@ private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader) th .hasHeader(includeHeader) .withDelim(fieldDelim) .withNullValue("\\N") - .withTrueValue("T") - .withFalseValue("F") + .withTrueValue(trueValue) + .withFalseValue(falseValue) .build(); try (Table readTable = Table.readCSV(schema, readOptions, consumer.buffer, 0, consumer.offset); Table expected = Table.concatenate(inputTable, inputTable, inputTable)) { @@ -720,10 +731,12 @@ private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader) th @Test void testChunkedCSVWriter() throws IOException { - testChunkedCSVWriterImpl(',', false); - testChunkedCSVWriterImpl(',', true); - testChunkedCSVWriterImpl('\u0001', false); - testChunkedCSVWriterImpl('\u0001', true); + final boolean INCLUDE_HEADER = true; + final boolean NO_HEADER = false; + testChunkedCSVWriterImpl(',', NO_HEADER, "true", "false"); + testChunkedCSVWriterImpl(',', INCLUDE_HEADER, "TRUE", "FALSE"); + testChunkedCSVWriterImpl('\u0001', NO_HEADER, "T", "F"); + testChunkedCSVWriterImpl('\u0001', INCLUDE_HEADER, "True", "False"); } @Test From cce5574cbd8b41b28d77c0efba69a2bbbc347831 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 21 Dec 2022 15:38:05 -0800 Subject: [PATCH 07/18] Removed JNI's non-chunked CSV writes to memory. 
--- java/src/main/java/ai/rapids/cudf/Table.java | 19 ------- java/src/main/native/src/TableJni.cpp | 35 ------------- .../test/java/ai/rapids/cudf/TableTest.java | 49 ------------------- 3 files changed, 103 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4ece07919e0..ebdcaf00dd8 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -880,25 +880,6 @@ public void writeCSVToFile(CSVWriterOptions options, String outputPath) outputPath); } - private static native void writeCSVToBuffer(long table, - String[] columnNames, - boolean includeHeader, - String rowDelimiter, - byte fieldDelimiter, - String nullValue, - HostBufferConsumer buffer) throws CudfException; - - public void writeCSVToBuffer(CSVWriterOptions options, HostBufferConsumer bufferConsumer) - { - writeCSVToBuffer(nativeHandle, - options.getColumnNames(), - options.getIncludeHeader(), - options.getRowDelimiter(), - options.getFieldDelimiter(), - options.getNullValue(), - bufferConsumer); - } - private static native long writeCSVToBufferBegin(String[] columnNames, boolean includeHeader, String rowDelimiter, diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 5aff9421619..b1f758a8110 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1218,41 +1218,6 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile( CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBuffer( - JNIEnv *env, jclass, jlong j_table_handle, jobjectArray j_column_names, jboolean include_header, - jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jobject j_buffer) { - JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); - JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", ); - JNI_NULL_CHECK(env, j_row_delimiter, "row 
delimiter cannot be null", ); - JNI_NULL_CHECK(env, j_field_delimiter, "field delimiter cannot be null", ); - JNI_NULL_CHECK(env, j_null_value, "null representation string cannot be itself null", ); - JNI_NULL_CHECK(env, j_buffer, "output buffer cannot be null", ); - - try { - cudf::jni::auto_set_device(env); - - auto data_sink = cudf::jni::jni_writer_data_sink{env, j_buffer}; - - auto const table = reinterpret_cast(j_table_handle); - auto const n_column_names = cudf::jni::native_jstringArray{env, j_column_names}; - auto const column_names = n_column_names.as_cpp_vector(); - - auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter}; - auto const na_rep = cudf::jni::native_jstring{env, j_null_value}; - auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{&data_sink}, *table) - .names(column_names) - .include_header(static_cast(include_header)) - .line_terminator(line_terminator.get()) - .inter_column_delimiter(j_field_delimiter) - .na_rep(na_rep.get()) - .build(); - - cudf::io::write_csv(options); - data_sink.flush(); - } - CATCH_STD(env, ); -} - JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferBegin( JNIEnv *env, jclass, jobjectArray j_column_names, jboolean include_header, jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jstring j_true_value, diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 4856a4b40fa..833067a75fa 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -629,55 +629,6 @@ void testWriteCSVToFile() throws IOException { testWriteCSVToFileImpl('\u0001', NO_HEADER, "True", "False"); } - private void testWriteCSVToBufferImpl(char fieldDelim) throws IOException { - Schema schema = Schema.builder() - .column(DType.INT32, "i") - .column(DType.FLOAT64, "f") - .column(DType.BOOL8, "b") - .column(DType.STRING, "str") - .build(); - CSVWriterOptions 
writeOptions = CSVWriterOptions.builder() - .withColumnNames(schema.getColumnNames()) - .withIncludeHeader(false) - .withFieldDelimiter((byte)fieldDelim) - .withRowDelimiter("\n") - .withNullValue("\\N") - .build(); - try (Table inputTable - = new Table.TestBuilder() - .column(0, 1, 2, 3, 4, 5, 6, 7, 8, null) - .column(0.0, 1.0, 2.0, 3.0, 4.0, null, 6.0, 7.0, 8.0, 9.0) - .column(false, true, null, true, false, true, null, true, false, true) - .column("All", "the", "leaves", "are", "brown", "and", "the", "sky", "is", null) - .build(); - MyBufferConsumer consumer = new MyBufferConsumer()) { - inputTable.writeCSVToBuffer(writeOptions, consumer); - inputTable.writeCSVToBuffer(writeOptions, consumer); - inputTable.writeCSVToBuffer(writeOptions, consumer); - - // Read back. - CSVOptions readOptions = CSVOptions.builder() - .includeColumn("i") - .includeColumn("f") - .includeColumn("b") - .includeColumn("str") - .hasHeader(false) - .withDelim(fieldDelim) - .withNullValue("\\N") - .build(); - try (Table readTable = Table.readCSV(schema, readOptions, consumer.buffer, 0, consumer.offset); - Table expected = Table.concatenate(inputTable, inputTable, inputTable)) { - assertTablesAreEqual(expected, readTable); - } - } - } - - @Test - void testWriteCSVToBuffer() throws IOException { - testWriteCSVToBufferImpl(','); - testWriteCSVToBufferImpl('\u0001'); - } - private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader, String trueValue, String falseValue) throws IOException { Schema schema = Schema.builder() From 15693f1111c7a03ede370c0918d508c86f6ffb77 Mon Sep 17 00:00:00 2001 From: MithunR Date: Tue, 27 Dec 2022 14:40:44 -0800 Subject: [PATCH 08/18] Added newline at the end of the file, per CUDF guideline. 
--- java/src/main/native/src/csv_chunked_writer.hpp | 2 +- java/src/main/native/src/jni_writer_data_sink.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp index 9a916bd0646..a01fb9cc232 100644 --- a/java/src/main/native/src/csv_chunked_writer.hpp +++ b/java/src/main/native/src/csv_chunked_writer.hpp @@ -74,4 +74,4 @@ class csv_chunked_writer { } }; -} // namespace cudf::jni::io \ No newline at end of file +} // namespace cudf::jni::io diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp index 3656931acb2..a2c5eac12f9 100644 --- a/java/src/main/native/src/jni_writer_data_sink.hpp +++ b/java/src/main/native/src/jni_writer_data_sink.hpp @@ -173,4 +173,4 @@ class jni_writer_data_sink final : public cudf::io::data_sink { long alloc_size = MINIMUM_WRITE_BUFFER_SIZE; }; -} // namespace cudf::jni \ No newline at end of file +} // namespace cudf::jni From 15e84c5e540e69ece5d0ea5efa27eceb1a551947 Mon Sep 17 00:00:00 2001 From: MithunR Date: Tue, 27 Dec 2022 14:43:13 -0800 Subject: [PATCH 09/18] Removed unnecessary whitespace at top of file. --- java/src/main/native/src/TableJni.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b1f758a8110..fca6e56d031 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include #include From e9107c94edc453473b793990955739bbf8ead35e Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 28 Dec 2022 10:44:12 -0800 Subject: [PATCH 10/18] Re-added whitespace at end of file. 
--- java/src/main/native/src/csv_chunked_writer.hpp | 1 + java/src/main/native/src/jni_writer_data_sink.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp index a01fb9cc232..1c9e6fe19e4 100644 --- a/java/src/main/native/src/csv_chunked_writer.hpp +++ b/java/src/main/native/src/csv_chunked_writer.hpp @@ -75,3 +75,4 @@ class csv_chunked_writer { }; } // namespace cudf::jni::io + diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp index a2c5eac12f9..4a3025230bf 100644 --- a/java/src/main/native/src/jni_writer_data_sink.hpp +++ b/java/src/main/native/src/jni_writer_data_sink.hpp @@ -174,3 +174,4 @@ class jni_writer_data_sink final : public cudf::io::data_sink { }; } // namespace cudf::jni + From 52f62e216643ea18dafc5bb19148b6eab3116428 Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 30 Dec 2022 11:08:36 -0800 Subject: [PATCH 11/18] Fixed header order. Removed trailing newlines. --- java/src/main/native/src/csv_chunked_writer.hpp | 6 ++---- java/src/main/native/src/jni_writer_data_sink.hpp | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp index 1c9e6fe19e4..a3d5b2510b7 100644 --- a/java/src/main/native/src/csv_chunked_writer.hpp +++ b/java/src/main/native/src/csv_chunked_writer.hpp @@ -13,14 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once -#include +#include "jni_writer_data_sink.hpp" #include -#include "jni_writer_data_sink.hpp" +#include namespace cudf::jni::io { @@ -75,4 +74,3 @@ class csv_chunked_writer { }; } // namespace cudf::jni::io - diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp index 4a3025230bf..5b7c4798236 100644 --- a/java/src/main/native/src/jni_writer_data_sink.hpp +++ b/java/src/main/native/src/jni_writer_data_sink.hpp @@ -13,13 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#include +#pragma once #include "cudf_jni_apis.hpp" #include "jni_utils.hpp" -#pragma once +#include namespace cudf::jni { @@ -174,4 +173,3 @@ class jni_writer_data_sink final : public cudf::io::data_sink { }; } // namespace cudf::jni - From af7eed3f5d4f48cab00ab7b859efbf9b9594ac70 Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 30 Dec 2022 11:14:29 -0800 Subject: [PATCH 12/18] Postpone setting _first_write till after write. --- java/src/main/native/src/csv_chunked_writer.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp index a3d5b2510b7..ebcb9e39a7b 100644 --- a/java/src/main/native/src/csv_chunked_writer.hpp +++ b/java/src/main/native/src/csv_chunked_writer.hpp @@ -57,14 +57,13 @@ class csv_chunked_writer { void write(cudf::table_view const &table) { if (_first_write_completed) { _options.enable_include_header(false); // Don't write header after the first write. - } else { - _first_write_completed = true; } _options.set_table(table); _options.set_rows_per_chunk(table.num_rows()); cudf::io::write_csv(_options); + _first_write_completed = true; } void close() { From 0d8298430da130c8c655e60a3cbb572658e056fc Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 30 Dec 2022 11:52:44 -0800 Subject: [PATCH 13/18] Trailing newlines. 
--- java/src/main/native/src/csv_chunked_writer.hpp | 1 + java/src/main/native/src/jni_writer_data_sink.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp index ebcb9e39a7b..6377af89ee9 100644 --- a/java/src/main/native/src/csv_chunked_writer.hpp +++ b/java/src/main/native/src/csv_chunked_writer.hpp @@ -73,3 +73,4 @@ class csv_chunked_writer { }; } // namespace cudf::jni::io + diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp index 5b7c4798236..0089348a1f4 100644 --- a/java/src/main/native/src/jni_writer_data_sink.hpp +++ b/java/src/main/native/src/jni_writer_data_sink.hpp @@ -173,3 +173,4 @@ class jni_writer_data_sink final : public cudf::io::data_sink { }; } // namespace cudf::jni + From fa24027c4467c5e7a3cc3e0da9a4e5a55ecaf77f Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 30 Dec 2022 15:29:15 -0800 Subject: [PATCH 14/18] Review changes: 1. Formatting. 2. Better names for JNI CSV functions. 
--- java/src/main/java/ai/rapids/cudf/Table.java | 14 ++++++-------- java/src/main/native/src/TableJni.cpp | 4 ++-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index ebdcaf00dd8..f41414d26ff 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -867,8 +867,7 @@ private static native void writeCSVToFile(long table, String falseValue, String outputPath) throws CudfException; - public void writeCSVToFile(CSVWriterOptions options, String outputPath) - { + public void writeCSVToFile(CSVWriterOptions options, String outputPath) { writeCSVToFile(nativeHandle, options.getColumnNames(), options.getIncludeHeader(), @@ -880,7 +879,7 @@ public void writeCSVToFile(CSVWriterOptions options, String outputPath) outputPath); } - private static native long writeCSVToBufferBegin(String[] columnNames, + private static native long startWriteCSVToBuffer(String[] columnNames, boolean includeHeader, String rowDelimiter, byte fieldDelimiter, @@ -891,14 +890,14 @@ private static native long writeCSVToBufferBegin(String[] columnNames, private static native void writeCSVChunkToBuffer(long writerHandle, long tableHandle); - private static native void writeCSVToBufferEnd(long writerHandle); + private static native void endWriteCSVToBuffer(long writerHandle); private static class CSVTableWriter implements TableWriter { private long writerHandle; private HostBufferConsumer consumer; private CSVTableWriter(CSVWriterOptions options, HostBufferConsumer consumer) { - this.writerHandle = writeCSVToBufferBegin(options.getColumnNames(), + this.writerHandle = startWriteCSVToBuffer(options.getColumnNames(), options.getIncludeHeader(), options.getRowDelimiter(), options.getFieldDelimiter(), @@ -920,7 +919,7 @@ public void write(Table table) { @Override public void close() throws CudfException { if (writerHandle != 0) { - 
writeCSVToBufferEnd(writerHandle); + endWriteCSVToBuffer(writerHandle); writerHandle = 0; } if (consumer != null) { @@ -930,8 +929,7 @@ public void close() throws CudfException { } } - public static TableWriter getCSVBufferWriter(CSVWriterOptions options, HostBufferConsumer bufferConsumer) - { + public static TableWriter getCSVBufferWriter(CSVWriterOptions options, HostBufferConsumer bufferConsumer) { return new CSVTableWriter(options, bufferConsumer); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index fca6e56d031..a5da14f3c0c 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1217,7 +1217,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile( CATCH_STD(env, ); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferBegin( +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_startWriteCSVToBuffer( JNIEnv *env, jclass, jobjectArray j_column_names, jboolean include_header, jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jstring j_true_value, jstring j_false_value, jobject j_buffer) { @@ -1272,7 +1272,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVChunkToBuffer(JNIEnv *e CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToBufferEnd(JNIEnv *env, jclass, +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env, jclass, jlong j_writer_handle) { JNI_NULL_CHECK(env, j_writer_handle, "writer handle cannot be null.", ); From f5e30c5df6500827bb4a25be25409999504f1b0d Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 30 Dec 2022 15:40:38 -0800 Subject: [PATCH 15/18] More formatting . 
--- .../main/java/ai/rapids/cudf/CSVWriterOptions.java | 12 ++++++------ java/src/main/java/ai/rapids/cudf/Table.java | 12 ++++++------ java/src/main/native/src/csv_chunked_writer.hpp | 5 ++--- java/src/main/native/src/jni_writer_data_sink.hpp | 5 ++--- java/src/test/java/ai/rapids/cudf/TableTest.java | 14 +++++++------- 5 files changed, 23 insertions(+), 25 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java index 9d654007209..3cef560a7cb 100644 --- a/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java @@ -23,7 +23,7 @@ import java.util.List; public class CSVWriterOptions { - + private String[] columnNames; private Boolean includeHeader = false; private String rowDelimiter = "\n"; @@ -68,8 +68,8 @@ public String getTrueValue() { public String getFalseValue() { return falseValue; - } - + } + public static Builder builder() { return new Builder(); } @@ -83,7 +83,7 @@ public static class Builder { private String nullValue = "\\N"; private String falseValue = "false"; private String trueValue = "true"; - + public CSVWriterOptions build() { return new CSVWriterOptions(this); } @@ -120,12 +120,12 @@ public Builder withNullValue(String nullValue) { this.nullValue = nullValue; return this; } - + public Builder withTrueValue(String trueValue) { this.trueValue = trueValue; return this; } - + public Builder withFalseValue(String falseValue) { this.falseValue = falseValue; return this; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index f41414d26ff..c4a5f4053dc 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -868,12 +868,12 @@ private static native void writeCSVToFile(long table, String outputPath) throws CudfException; public void writeCSVToFile(CSVWriterOptions options, String outputPath) { - 
writeCSVToFile(nativeHandle, - options.getColumnNames(), - options.getIncludeHeader(), - options.getRowDelimiter(), - options.getFieldDelimiter(), - options.getNullValue(), + writeCSVToFile(nativeHandle, + options.getColumnNames(), + options.getIncludeHeader(), + options.getRowDelimiter(), + options.getFieldDelimiter(), + options.getNullValue(), options.getTrueValue(), options.getFalseValue(), outputPath); diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp index 6377af89ee9..97352251a90 100644 --- a/java/src/main/native/src/csv_chunked_writer.hpp +++ b/java/src/main/native/src/csv_chunked_writer.hpp @@ -15,11 +15,11 @@ */ #pragma once -#include "jni_writer_data_sink.hpp" +#include #include -#include +#include "jni_writer_data_sink.hpp" namespace cudf::jni::io { @@ -73,4 +73,3 @@ class csv_chunked_writer { }; } // namespace cudf::jni::io - diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp index 0089348a1f4..ad94cc35088 100644 --- a/java/src/main/native/src/jni_writer_data_sink.hpp +++ b/java/src/main/native/src/jni_writer_data_sink.hpp @@ -15,11 +15,11 @@ */ #pragma once +#include + #include "cudf_jni_apis.hpp" #include "jni_utils.hpp" -#include - namespace cudf::jni { constexpr long MINIMUM_WRITE_BUFFER_SIZE = 10 * 1024 * 1024; // 10 MB @@ -173,4 +173,3 @@ class jni_writer_data_sink final : public cudf::io::data_sink { }; } // namespace cudf::jni - diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 833067a75fa..42149385370 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -574,7 +574,7 @@ void testReadCSV() { } } - private void testWriteCSVToFileImpl(char fieldDelim, boolean includeHeader, + private void testWriteCSVToFileImpl(char fieldDelim, boolean includeHeader, String trueValue, String falseValue) 
throws IOException { File outputFile = File.createTempFile("testWriteCSVToFile", ".csv"); Schema schema = Schema.builder() @@ -582,7 +582,7 @@ private void testWriteCSVToFileImpl(char fieldDelim, boolean includeHeader, .column(DType.FLOAT64, "f") .column(DType.BOOL8, "b") .column(DType.STRING, "str") - .build(); + .build(); CSVWriterOptions writeOptions = CSVWriterOptions.builder() .withColumnNames(schema.getColumnNames()) .withIncludeHeader(false) @@ -591,7 +591,7 @@ private void testWriteCSVToFileImpl(char fieldDelim, boolean includeHeader, .withTrueValue("T") .withFalseValue("F") .build(); - try (Table inputTable + try (Table inputTable = new Table.TestBuilder() .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) .column(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0) @@ -599,7 +599,7 @@ private void testWriteCSVToFileImpl(char fieldDelim, boolean includeHeader, .column("All", "the", "leaves", "are", "brown", "and", "the", "sky", "is", "grey") .build()) { inputTable.writeCSVToFile(writeOptions, outputFile.getAbsolutePath()); - + // Read back. 
CSVOptions readOptions = CSVOptions.builder() .includeColumn("i") @@ -636,7 +636,7 @@ private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader, .column(DType.FLOAT64, "f") .column(DType.BOOL8, "b") .column(DType.STRING, "str") - .build(); + .build(); CSVWriterOptions writeOptions = CSVWriterOptions.builder() .withColumnNames(schema.getColumnNames()) .withIncludeHeader(includeHeader) @@ -646,7 +646,7 @@ private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader, .withTrueValue(trueValue) .withFalseValue(falseValue) .build(); - try (Table inputTable + try (Table inputTable = new Table.TestBuilder() .column(0, 1, 2, 3, 4, 5, 6, 7, 8, null) .column(0.0, 1.0, 2.0, 3.0, 4.0, null, 6.0, 7.0, 8.0, 9.0) @@ -654,7 +654,7 @@ private void testChunkedCSVWriterImpl(char fieldDelim, boolean includeHeader, .column("All", "the", "leaves", "are", "brown", "and", "the", "sky", "is", null) .build(); MyBufferConsumer consumer = new MyBufferConsumer()) { - + try (TableWriter writer = Table.getCSVBufferWriter(writeOptions, consumer)) { writer.write(inputTable); writer.write(inputTable); From d3642a4b409af5c5522031a6ab7533156c8067b3 Mon Sep 17 00:00:00 2001 From: MithunR Date: Tue, 3 Jan 2023 13:50:59 -0800 Subject: [PATCH 16/18] Updated documentation for _inter_column_delimiter. --- cpp/include/cudf/io/csv.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 2335cd8e947..ce7db0043da 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1422,9 +1422,9 @@ class csv_writer_options { [[nodiscard]] std::string get_line_terminator() const { return _line_terminator; } /** - * @brief Returns character used for separating lines. + * @brief Returns character used for separating column values. * - * @return Character used for separating lines + * @return Character used for separating column values. 
*/ [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; } @@ -1479,9 +1479,9 @@ class csv_writer_options { void set_line_terminator(std::string term) { _line_terminator = term; } /** - * @brief Sets character used for separating lines. + * @brief Sets character used for separating column values. * - * @param delim Character to indicate delimiting + * @param delim Character to delimit column values */ void set_inter_column_delimiter(char delim) { _inter_column_delimiter = delim; } @@ -1593,9 +1593,9 @@ class csv_writer_options_builder { } /** - * @brief Sets character used for separating lines. + * @brief Sets character used for separating column values. * - * @param delim Character to indicate delimiting + * @param delim Character to delimit column values * @return this for chaining */ csv_writer_options_builder& inter_column_delimiter(char delim) From c83e0d9a668b2a0a3cc84a3b494c80ec88447b52 Mon Sep 17 00:00:00 2001 From: MithunR Date: Tue, 3 Jan 2023 13:56:09 -0800 Subject: [PATCH 17/18] Updated copyright date. --- cpp/include/cudf/io/csv.hpp | 2 +- java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java | 2 +- java/src/main/java/ai/rapids/cudf/Table.java | 2 +- java/src/main/native/src/TableJni.cpp | 4 ++-- java/src/main/native/src/csv_chunked_writer.hpp | 2 +- java/src/main/native/src/jni_writer_data_sink.hpp | 2 +- java/src/test/java/ai/rapids/cudf/TableTest.java | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index ce7db0043da..92b5447527c 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java index 3cef560a7cb..c842522c167 100644 --- a/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index c4a5f4053dc..36dec194017 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index a5da14f3c0c..437b96c5f7f 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,6 +1,6 @@ /* - - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp index 97352251a90..768b2f2946a 100644 --- a/java/src/main/native/src/csv_chunked_writer.hpp +++ b/java/src/main/native/src/csv_chunked_writer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp index ad94cc35088..05fe594fcd5 100644 --- a/java/src/main/native/src/jni_writer_data_sink.hpp +++ b/java/src/main/native/src/jni_writer_data_sink.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 42149385370..44af54f34c4 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From bfd2cd35022b0d633cc8c28efdb628bb76019d41 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 4 Jan 2023 16:26:51 -0800 Subject: [PATCH 18/18] Review fixes: 1. Removed TODO. 2. Changed default null value to empty string. 
--- java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java | 4 ++-- java/src/main/native/src/csv_chunked_writer.hpp | 2 -- java/src/test/java/ai/rapids/cudf/TableTest.java | 1 + 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java index c842522c167..410eeab2b18 100644 --- a/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java @@ -28,7 +28,7 @@ public class CSVWriterOptions { private Boolean includeHeader = false; private String rowDelimiter = "\n"; private byte fieldDelimiter = ','; - private String nullValue = "\\N"; + private String nullValue = ""; private String falseValue = "false"; private String trueValue = "true"; @@ -80,7 +80,7 @@ public static class Builder { private Boolean includeHeader = false; private String rowDelimiter = "\n"; private byte fieldDelimiter = ','; - private String nullValue = "\\N"; + private String nullValue = ""; private String falseValue = "false"; private String trueValue = "true"; diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp index 768b2f2946a..1f1e73a1a4b 100644 --- a/java/src/main/native/src/csv_chunked_writer.hpp +++ b/java/src/main/native/src/csv_chunked_writer.hpp @@ -25,8 +25,6 @@ namespace cudf::jni::io { /** * @brief Class to write multiple Tables into the jni_writer_data_sink. - * - * TODO: Consider moving to cpp/ in the future, if there is interest. 
*/ class csv_chunked_writer { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 44af54f34c4..83e4cb536f3 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -588,6 +588,7 @@ private void testWriteCSVToFileImpl(char fieldDelim, boolean includeHeader, .withIncludeHeader(false) .withFieldDelimiter((byte)'\u0001') .withRowDelimiter("\n") + .withNullValue("\\N") .withTrueValue("T") .withFalseValue("F") .build();