apache · WillAyd · May 12, 2025
diff --git a/cpp/meson.build b/cpp/meson.build
@@ -59,14 +59,21 @@ endif
 needs_benchmarks = get_option('benchmarks').enabled()
 needs_compute = get_option('compute').enabled()
 needs_csv = get_option('csv').enabled()
+needs_dataset = get_option('dataset').enabled()  # also forces needs_filesystem and needs_acero below
 needs_azure = get_option('azure').enabled()
 needs_gcs = get_option('gcs').enabled()
 needs_hdfs = get_option('hdfs').enabled()
 needs_s3 = get_option('s3').enabled()
-needs_filesystem = get_option('filesystem').enabled() or needs_azure or needs_gcs or needs_hdfs or needs_s3
+needs_filesystem = (get_option('filesystem').enabled()
+or needs_azure
+or needs_dataset
+or needs_gcs
+or needs_hdfs
+or needs_s3
+)
 needs_integration = get_option('integration').enabled()
 needs_tests = get_option('tests').enabled()
-needs_acero = get_option('acero').enabled()
+needs_acero = get_option('acero').enabled() or needs_dataset  # the dataset library depends on arrow_acero_dep
 needs_ipc = get_option('ipc').enabled() or needs_tests or needs_acero or needs_benchmarks
 needs_fuzzing = get_option('fuzzing').enabled()
 needs_testing = (get_option('testing').enabled()

diff --git a/cpp/meson.options b/cpp/meson.options
@@ -27,6 +27,7 @@ option('brotli', type: 'feature', description: 'Build with Brotli compression')
 option('bz2', type: 'feature', description: 'Build with BZ2 compression')
 option('compute', type: 'feature', description: 'Build all Arrow Compute kernels')
 option('csv', type: 'feature', description: 'Build the Arrow CSV Parser Module')
+option('dataset', type: 'feature', description: 'Build the Arrow Dataset Modules')  # enabling this also enables acero and filesystem (see cpp/meson.build)
 option('filesystem', type: 'feature', description: 'Build the Arrow Filesystem Layer')
 option('fuzzing', type: 'feature', description: 'Build Arrow Fuzzing executables')
 

diff --git a/cpp/src/arrow/acero/meson.build b/cpp/src/arrow/acero/meson.build
@@ -85,7 +85,7 @@ arrow_acero_lib = library(
 
 arrow_acero_dep = declare_dependency(link_with: [arrow_acero_lib])
 
-arrow_acero_testing_sources = ['test_nodes.cc', 'test_util_internal.cc'] + arrow_compute_testing_srcs
+arrow_acero_testing_sources = files('test_nodes.cc', 'test_util_internal.cc') + arrow_compute_testing_srcs  # files() so dataset/meson.build can reuse this list from another directory
 
 arrow_acero_tests = {
     'plan-test': {'sources': ['plan_test.cc', 'test_nodes_test.cc']},

diff --git a/cpp/src/arrow/dataset/meson.build b/cpp/src/arrow/dataset/meson.build
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Public headers of the dataset module; subdir places them under
+# arrow/dataset inside the install include directory.
+install_headers(
+    'api.h',
+    'dataset.h',
+    'dataset_writer.h',
+    'discovery.h',
+    'file_base.h',
+    'file_csv.h',
+    'file_ipc.h',
+    'file_json.h',
+    'file_orc.h',
+    'file_parquet.h',
+    'parquet_encryption_config.h',
+    'partition.h',
+    'plan.h',
+    'projector.h',
+    'scanner.h',
+    'type_fwd.h',
+    'visibility.h',
+    subdir: 'arrow/dataset',
+)
+
+arrow_dataset_srcs = [  # sources built whenever the dataset module is enabled
+    'dataset.cc',
+    'dataset_writer.cc',
+    'discovery.cc',
+    'file_base.cc',
+    'file_ipc.cc',
+    'partition.cc',
+    'plan.cc',
+    'projector.cc',
+    'scanner.cc',
+    'scan_node.cc',
+]
+
+if needs_csv  # CSV file format is only built when the csv option is enabled
+    arrow_dataset_srcs += ['file_csv.cc']
+endif
+
+if needs_json  # JSON file format is only built when needs_json is set
+    arrow_dataset_srcs += ['file_json.cc']
+endif
+
+# ORC file format: disabled for now; requires https://github.com/apache/arrow/pull/46409
+# if needs_orc
+#    arrow_dataset_srcs += ['file_orc.cc']
+# endif
+
+# Parquet file format: disabled for now; requires https://github.com/apache/arrow/issues/46410
+# if needs_parquet
+#    arrow_dataset_srcs += ['file_parquet.cc']
+# endif
+
+arrow_dataset_lib = library(
+    'arrow-dataset',
+    sources: arrow_dataset_srcs,
+    dependencies: [arrow_dep, arrow_acero_dep],  # arrow_acero_dep is declared alongside arrow_acero_lib
+)
+
+arrow_dataset_testing_srcs = files('test_util_internal.cc')  # appended to the sources of every dataset test executable
+
+arrow_dataset_tests = {  # name suffix -> {'sources': [...], optional 'dependencies': [...]}
+    'dataset-test': {'sources': ['dataset_test.cc']},
+    'dataset-writer-test': {'sources': ['dataset_writer_test.cc']},
+    'discovery-test': {'sources': ['discovery_test.cc']},
+    'file-ipc-test': {'sources': ['file_ipc_test.cc']},
+    'file-test': {'sources': ['file_test.cc'] + arrow_acero_testing_sources},
+    'partition-test': {'sources': ['partition_test.cc']},
+    'scanner-test': {'sources': ['scanner_test.cc']},
+    'subtree-test': {'sources': ['subtree_test.cc']},
+    'write-node-test': {'sources': ['write_node_test.cc']},
+}
+
+if needs_csv
+    arrow_dataset_tests += {'file-csv-test': {'sources': ['file_csv_test.cc']}}
+endif
+
+if needs_json
+    arrow_dataset_tests += {
+        'file-json-test': {
+            'sources': ['file_json_test.cc'],
+            'dependencies': [rapidjson_dep],
+        },
+    }
+endif
+
+
+# ORC tests: disabled for now; requires https://github.com/apache/arrow/pull/46409
+# if needs_orc
+#    arrow_dataset_tests += {'file-orc-test': {'sources': ['file_orc_test.cc']}}
+# endif
+
+# Parquet tests: disabled for now; requires https://github.com/apache/arrow/issues/46410
+# if needs_parquet
+#    arrow_dataset_tests += {'file-parquet-test': {'sources': ['file_parquet_test.cc']}}
+#    if needs_parquet_encryption
+#        ...
+#    endif
+# endif
+
+# Build one executable per arrow_dataset_tests entry, named
+# arrow-dataset-<key>, and register it under the same name with test().
+# Each entry's own sources are combined with the shared testing sources;
+# an entry's optional 'dependencies' list is added to the default ones.
+foreach key, val : arrow_dataset_tests
+    exc = executable(
+        'arrow-dataset-' + key,
+        sources: val['sources'] + arrow_dataset_testing_srcs,
+        dependencies: [arrow_acero_dep, arrow_test_dep, val.get('dependencies', [])],
+        link_with: [arrow_dataset_lib],
+    )
+    test('arrow-dataset-' + key, exc)
+endforeach
+
+arrow_dataset_benchmarks = ['file_benchmark', 'scanner_benchmark']
+# Each name is compiled from <name>.cc into arrow-dataset-<name with '_' replaced by '-'>.
+foreach arrow_dataset_benchmark : arrow_dataset_benchmarks
+    exc = executable(
+        'arrow-dataset-@0@'.format(arrow_dataset_benchmark.replace('_', '-')),
+        sources: '@0@.cc'.format(arrow_dataset_benchmark),  # e.g. file_benchmark.cc
+        dependencies: [arrow_acero_dep, arrow_benchmark_dep, gmock_dep],
+        link_with: arrow_dataset_lib,
+    )
+endforeach
diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build
@@ -717,6 +717,10 @@ if needs_acero
     subdir('acero')
 endif
 
+if needs_dataset  # placed after acero: dataset/meson.build uses arrow_acero_dep and arrow_acero_testing_sources
+    subdir('dataset')
+endif
+
 if needs_filesystem
     subdir('filesystem')
 endif