Skip to content

Commit 028d22f

Browse files
feat(csharp/src/Drivers/Apache/Spark): Add Lz4 compression support to arrow batch reader (#2669)
1. Create a new file [Lz4Utilities.cs](main...eric-wang-1990:arrow-adbc:add_lz4_to_arrowbatch#diff-7275463b6b9fcc3cf3b954580e3ded8b0b7237a90e0f4aea33eb11e613f3db39) to abstract common Lz4 decompress util functions for both cloud fetch and arrow batch. 2. Add support for decompress arrow batch with Lz4 3. Rename adbc.spark.cloudfetch.lz4.enabled to adbc.spark.lz4Compression.enabled since it is not specific to cloudfetch 4. Add test to test both cloudFetch and arrowBatch in StatementTests.
1 parent aa1053a commit 028d22f

File tree

7 files changed

+198
-47
lines changed

7 files changed

+198
-47
lines changed

Diff for: csharp/src/Drivers/Apache/Spark/CloudFetch/SparkCloudFetchReader.cs

+35-42
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222
using System.Net.Http;
2323
using System.Threading;
2424
using System.Threading.Tasks;
25+
using Apache.Arrow.Adbc.Drivers.Apache;
2526
using Apache.Arrow.Adbc.Drivers.Apache.Hive2;
27+
using Apache.Arrow.Adbc.Drivers.Apache.Spark;
2628
using Apache.Arrow.Ipc;
2729
using Apache.Hive.Service.Rpc.Thrift;
2830
using K4os.Compression.LZ4.Streams;
@@ -161,59 +163,52 @@ private HttpClient HttpClient
161163
var link = this.resultLinks[this.linkIndex++];
162164
byte[]? fileData = null;
163165

164-
// Retry logic for downloading files
165-
for (int retry = 0; retry < this.maxRetries; retry++)
166+
try
166167
{
167-
try
168+
// Try to download with retry logic
169+
for (int retry = 0; retry < this.maxRetries; retry++)
168170
{
169-
fileData = await DownloadFileAsync(link.FileLink, cancellationToken);
170-
break; // Success, exit retry loop
171+
try
172+
{
173+
fileData = await DownloadFileAsync(link.FileLink, cancellationToken);
174+
break; // Success, exit retry loop
175+
}
176+
catch (Exception) when (retry < this.maxRetries - 1)
177+
{
178+
// Only delay and retry if we haven't reached max retries
179+
await Task.Delay(this.retryDelayMs * (retry + 1), cancellationToken);
180+
}
171181
}
172-
catch (Exception ex) when (retry < this.maxRetries - 1)
182+
183+
// If download still failed after all retries
184+
if (fileData == null)
173185
{
174-
// Log the error and retry
175-
Debug.WriteLine($"Error downloading file (attempt {retry + 1}/{this.maxRetries}): {ex.Message}");
176-
await Task.Delay(this.retryDelayMs * (retry + 1), cancellationToken);
186+
throw new AdbcException($"Failed to download CloudFetch data from {link.FileLink} after {this.maxRetries} attempts");
177187
}
178-
}
179188

180-
// Process the downloaded file data
181-
MemoryStream dataStream;
189+
ReadOnlyMemory<byte> dataToUse = new ReadOnlyMemory<byte>(fileData);
182190

183-
// If the data is LZ4 compressed, decompress it
184-
if (this.isLz4Compressed)
185-
{
186-
try
187-
{
188-
dataStream = new MemoryStream();
189-
using (var inputStream = new MemoryStream(fileData!))
190-
using (var decompressor = LZ4Stream.Decode(inputStream))
191-
{
192-
await decompressor.CopyToAsync(dataStream);
193-
}
194-
dataStream.Position = 0;
195-
}
196-
catch (Exception ex)
191+
// If the data is LZ4 compressed, decompress it
192+
if (this.isLz4Compressed)
197193
{
198-
Debug.WriteLine($"Error decompressing data: {ex.Message}");
199-
continue; // Skip this link and try the next one
194+
dataToUse = Lz4Utilities.DecompressLz4(fileData);
200195
}
201-
}
202-
else
203-
{
204-
dataStream = new MemoryStream(fileData!);
205-
}
206196

207-
try
208-
{
209-
this.currentReader = new ArrowStreamReader(dataStream);
197+
// Use ChunkStream which supports ReadOnlyMemory<byte> directly
198+
this.currentReader = new ArrowStreamReader(new ChunkStream(this.schema, dataToUse));
210199
continue;
211200
}
212201
catch (Exception ex)
213202
{
214-
Debug.WriteLine($"Error creating Arrow reader: {ex.Message}");
215-
dataStream.Dispose();
216-
continue; // Skip this link and try the next one
203+
// Create concise error message based on exception type
204+
string errorPrefix = $"CloudFetch link {this.linkIndex-1}:";
205+
string errorMessage = ex switch
206+
{
207+
_ when ex.GetType().Name.Contains("LZ4") => $"{errorPrefix} LZ4 decompression failed - Data may be corrupted",
208+
HttpRequestException or TaskCanceledException => $"{errorPrefix} Download failed - {ex.Message}",
209+
_ => $"{errorPrefix} Processing failed - {ex.Message}" // Default case for any other exception
210+
};
211+
throw new AdbcException(errorMessage, ex);
217212
}
218213
}
219214

@@ -242,9 +237,7 @@ private HttpClient HttpClient
242237
}
243238
catch (Exception ex)
244239
{
245-
Debug.WriteLine($"Error fetching results from server: {ex.Message}");
246-
this.statement = null; // Mark as done due to error
247-
return null;
240+
throw new AdbcException($"Server request failed - {ex.Message}", ex);
248241
}
249242

250243
// Check if we have URL-based results

Diff for: csharp/src/Drivers/Apache/Spark/Lz4Utilities.cs

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
using System;
using System.Buffers;
using System.IO;
using K4os.Compression.LZ4.Streams;

namespace Apache.Arrow.Adbc.Drivers.Apache.Spark
{
    /// <summary>
    /// Utility class for LZ4 compression/decompression operations.
    /// </summary>
    internal static class Lz4Utilities
    {
        /// <summary>
        /// Decompresses LZ4 compressed data into memory.
        /// </summary>
        /// <param name="compressedData">The compressed data bytes.</param>
        /// <returns>A <see cref="ReadOnlyMemory{T}"/> containing the decompressed data.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="compressedData"/> is null.</exception>
        /// <exception cref="AdbcException">Thrown when decompression fails.</exception>
        public static ReadOnlyMemory<byte> DecompressLz4(byte[] compressedData)
        {
            // Fail fast with a precise exception rather than letting a NullReferenceException
            // surface from inside the decoder and be wrapped in a misleading AdbcException below.
            if (compressedData == null)
            {
                throw new ArgumentNullException(nameof(compressedData));
            }

            try
            {
                var outputStream = new MemoryStream();
                using (var inputStream = new MemoryStream(compressedData))
                using (var decompressor = LZ4Stream.Decode(inputStream))
                {
                    decompressor.CopyTo(outputStream);
                }
                // Return the MemoryStream's internal buffer, bounded by its valid length,
                // to avoid copying the decompressed bytes a second time (ToArray would copy).
                // Note: outputStream is intentionally not disposed because its buffer is being
                // returned; the memory is reclaimed once the ReadOnlyMemory is no longer referenced.
                return new ReadOnlyMemory<byte>(outputStream.GetBuffer(), 0, (int)outputStream.Length);
            }
            catch (Exception ex)
            {
                // Normalize any decoder failure into the driver's exception type so callers
                // only need to handle AdbcException.
                throw new AdbcException($"Failed to decompress LZ4 data: {ex.Message}", ex);
            }
        }
    }
}

Diff for: csharp/src/Drivers/Apache/Spark/SparkDatabricksConnection.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ internal override IArrowArrayStream NewReader<T>(T statement, Schema schema, TGe
5656
}
5757
else
5858
{
59-
return new SparkDatabricksReader(statement, schema);
59+
return new SparkDatabricksReader(statement, schema, isLz4Compressed);
6060
}
6161
}
6262

Diff for: csharp/src/Drivers/Apache/Spark/SparkDatabricksReader.cs

+50-1
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,15 @@
1515
* limitations under the License.
1616
*/
1717

18+
using System;
1819
using System.Collections.Generic;
20+
using System.IO;
1921
using System.Threading;
2022
using System.Threading.Tasks;
2123
using Apache.Arrow.Adbc.Drivers.Apache.Hive2;
2224
using Apache.Arrow.Ipc;
2325
using Apache.Hive.Service.Rpc.Thrift;
26+
using K4os.Compression.LZ4.Streams;
2427

2528
namespace Apache.Arrow.Adbc.Drivers.Apache.Spark
2629
{
@@ -31,11 +34,18 @@ internal sealed class SparkDatabricksReader : IArrowArrayStream
3134
List<TSparkArrowBatch>? batches;
3235
int index;
3336
IArrowReader? reader;
37+
bool isLz4Compressed;
3438

3539
/// <summary>
/// Creates a reader that treats the server's result batches as uncompressed.
/// </summary>
public SparkDatabricksReader(HiveServer2Statement statement, Schema schema)
    : this(statement, schema, false)
{
}

/// <summary>
/// Creates a reader, optionally decompressing LZ4-compressed result batches.
/// </summary>
/// <param name="statement">The statement whose results are being read.</param>
/// <param name="schema">The Arrow schema of the result set.</param>
/// <param name="isLz4Compressed">True when the server sends LZ4-compressed batch data.</param>
public SparkDatabricksReader(HiveServer2Statement statement, Schema schema, bool isLz4Compressed)
{
    this.statement = statement;
    this.schema = schema;
    this.isLz4Compressed = isLz4Compressed;
}
4050

4151
public Schema Schema { get { return schema; } }
@@ -56,7 +66,7 @@ public SparkDatabricksReader(HiveServer2Statement statement, Schema schema)
5666

5767
if (this.batches != null && this.index < this.batches.Count)
5868
{
59-
this.reader = new ArrowStreamReader(new ChunkStream(this.schema, this.batches[this.index++].Batch));
69+
ProcessFetchedBatches();
6070
continue;
6171
}
6272

@@ -70,6 +80,8 @@ public SparkDatabricksReader(HiveServer2Statement statement, Schema schema)
7080

7181
TFetchResultsReq request = new TFetchResultsReq(this.statement.OperationHandle!, TFetchOrientation.FETCH_NEXT, this.statement.BatchSize);
7282
TFetchResultsResp response = await this.statement.Connection.Client!.FetchResults(request, cancellationToken);
83+
84+
// Make sure we get the arrowBatches
7385
this.batches = response.Results.ArrowBatches;
7486

7587
if (!response.HasMoreRows)
@@ -79,6 +91,43 @@ public SparkDatabricksReader(HiveServer2Statement statement, Schema schema)
7991
}
8092
}
8193

94+
// Turns the batch at the current index into an ArrowStreamReader,
// decompressing it first when the server sent LZ4-compressed data.
// Always advances the index, skipping batches with no payload.
private void ProcessFetchedBatches()
{
    var currentBatch = this.batches![this.index];

    // Nothing to read in this batch; move on to the next one.
    if (currentBatch.Batch == null || currentBatch.Batch.Length == 0)
    {
        this.index++;
        return;
    }

    try
    {
        // Wrap the raw bytes without copying, or decompress when LZ4 is enabled.
        ReadOnlyMemory<byte> payload = isLz4Compressed
            ? Lz4Utilities.DecompressLz4(currentBatch.Batch)
            : new ReadOnlyMemory<byte>(currentBatch.Batch);

        // ChunkStream prepends the schema so ArrowStreamReader can parse the payload.
        this.reader = new ArrowStreamReader(new ChunkStream(this.schema, payload));
    }
    catch (Exception ex)
    {
        // Create concise error message based on exception type
        string errorMessage = ex switch
        {
            _ when ex.GetType().Name.Contains("LZ4") => $"Batch {this.index}: LZ4 decompression failed - Data may be corrupted",
            _ => $"Batch {this.index}: Processing failed - {ex.Message}" // Default case for any other exception
        };
        throw new AdbcException(errorMessage, ex);
    }
    this.index++;
}
130+
82131
public void Dispose()
83132
{
84133
}

Diff for: csharp/src/Drivers/Apache/Spark/SparkStatement.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,10 @@ internal void SetMaxBytesPerFile(long maxBytesPerFile)
155155
/// </summary>
156156
public sealed class Options : ApacheParameters
{
    // Lz4 compression option (applies to both CloudFetch and arrow batch results)
    public const string CanDecompressLz4 = "adbc.spark.lz4_compression.enabled";

    // CloudFetch options
    public const string UseCloudFetch = "adbc.spark.cloudfetch.enabled";
    public const string MaxBytesPerFile = "adbc.spark.cloudfetch.max_bytes_per_file";
}
163164
}

Diff for: csharp/src/Drivers/Apache/Thrift/ChunkStream.cs

+8-2
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,17 @@ namespace Apache.Arrow.Adbc.Drivers.Apache
2626
internal class ChunkStream : Stream
2727
{
2828
ReadOnlyMemory<byte> currentBuffer;
29-
byte[] data;
29+
ReadOnlyMemory<byte> data;
3030
bool first;
3131
int position;
3232

3333
/// <summary>
/// Convenience overload for raw byte arrays; delegates to the
/// <see cref="ReadOnlyMemory{T}"/> constructor to avoid duplication.
/// </summary>
public ChunkStream(Schema schema, byte[] data)
    : this(schema, new ReadOnlyMemory<byte>(data))
{
}
38+
39+
public ChunkStream(Schema schema, ReadOnlyMemory<byte> data)
3440
{
3541
MemoryStream buffer = new MemoryStream();
3642
ArrowStreamWriter writer = new ArrowStreamWriter(buffer, schema, leaveOpen: true);
@@ -70,7 +76,7 @@ public override int Read(byte[] buffer, int offset, int count)
7076
{
7177
return 0;
7278
}
73-
this.currentBuffer = new ReadOnlyMemory<byte>(this.data);
79+
this.currentBuffer = this.data;
7480
this.position = 0;
7581
remaining = this.currentBuffer.Length - this.position;
7682
}

Diff for: csharp/test/Drivers/Apache/Spark/StatementTests.cs

+45
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
using System;
1919
using System.Collections.Generic;
20+
using System.Linq;
2021
using System.Threading.Tasks;
2122
using Apache.Arrow.Adbc.Drivers.Apache.Spark;
2223
using Apache.Arrow.Adbc.Tests.Drivers.Apache.Common;
@@ -39,6 +40,50 @@ internal override void StatementTimeoutTest(StatementWithExceptions statementWit
3940
base.StatementTimeoutTest(statementWithExceptions);
4041
}
4142

43+
[SkippableTheory]
[InlineData(true, "CloudFetch enabled")]
[InlineData(false, "CloudFetch disabled")]
public async Task LZ4DecompressionCapabilityTest(bool useCloudFetch, string configName)
{
    OutputHelper?.WriteLine($"Testing with LZ4 decompression capability enabled ({configName})");

    // Build a connection and statement from the shared test configuration.
    using AdbcConnection connection = NewConnection();
    using var statement = connection.CreateStatement();

    // LZ4 decompression is on by default; only CloudFetch is toggled per theory case.
    statement.SetOption(SparkStatement.Options.UseCloudFetch, useCloudFetch.ToString().ToLower());
    OutputHelper?.WriteLine($"CloudFetch is {(useCloudFetch ? "enabled" : "disabled")}");
    OutputHelper?.WriteLine("LZ4 decompression capability is enabled by default");

    // Run a query that produces a known number of rows.
    statement.SqlQuery = "SELECT id, CAST(id AS STRING) as id_string, id * 2 as id_doubled FROM RANGE(100)";
    QueryResult result = statement.ExecuteQuery();

    Assert.NotNull(result.Stream);

    // Drain every batch from the stream, counting rows as we go.
    int totalRows = 0;
    int batchCount = 0;

    while (result.Stream != null)
    {
        using var batch = await result.Stream.ReadNextRecordBatchAsync();
        if (batch is null)
        {
            break;
        }

        batchCount++;
        totalRows += batch.Length;
        OutputHelper?.WriteLine($"Batch {batchCount}: Read {batch.Length} rows");
    }

    // The query must yield exactly the 100 rows RANGE(100) produces.
    Assert.Equal(100, totalRows);
    OutputHelper?.WriteLine($"Successfully read {totalRows} rows in {batchCount} batches with {configName}");
    OutputHelper?.WriteLine("NOTE: Whether actual LZ4 compression was used is determined by the server");
}
86+
4287
internal class LongRunningStatementTimeoutTestData : ShortRunningStatementTimeoutTestData
4388
{
4489
public LongRunningStatementTimeoutTestData()

0 commit comments

Comments
 (0)