Skip to content

Commit 50c8ad9

Browse files
authored
Merge pull request #109 from mukunku/v3.0.0-release
2 parents 77c70c9 + b94c733 commit 50c8ad9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1236
-542
lines changed

.github/FUNDING.yml

-3
This file was deleted.

.github/ISSUE_TEMPLATE/bug_report.md

-2
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,3 @@ If applicable, add screenshots to help explain your problem.
2424

2525
**Additional context**
2626
Add any other context about the problem here.
27-
28-
Note: This tool relies on the [parquet-dotnet](https://github.com/aloneguid/parquet-dotnet) library for all the actual Parquet processing. So any issues where that library cannot process a parquet file will not be addressed by us. Please open a ticket on that library's repo to address such issues.

.github/workflows/run-tests.yaml

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
name: .NET Test
2+
3+
on:
4+
push:
5+
branches:
6+
- master
7+
pull_request:
8+
9+
jobs:
10+
test:
11+
runs-on: windows-latest
12+
continue-on-error: true
13+
steps:
14+
- uses: actions/checkout@v4
15+
with:
16+
sparse-checkout: |
17+
.github
18+
src
19+
20+
- name: Setup .NET
21+
uses: actions/setup-dotnet@v1
22+
with:
23+
dotnet-version: '8.0.x'
24+
25+
- name: Restore dependencies
26+
run: dotnet restore src/ParquetViewer.sln
27+
28+
- name: Build
29+
run: dotnet build src/ParquetViewer.sln --configuration Debug --no-restore
30+
31+
- name: Test
32+
run: dotnet test src/ParquetViewer.sln --no-build --logger trx
33+
34+
- name: Test Report
35+
uses: bibipkins/dotnet-test-reporter@v1.4.0
36+
with:
37+
github-token: ${{ secrets.GITHUB_TOKEN }}
38+
comment-title: 'Unit Test Results'
39+
results-path: ./src/ParquetViewer.Tests/TestResults/*.trx

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Checkout the [ParquetViewer Analytics Dashboard](https://app.amplitude.com/analy
2626
[^1]: Full privacy policy here: https://github.com/mukunku/ParquetViewer/wiki/Privacy-Policy
2727

2828
# Technical Details
29-
The latest version of this project was written in C# using Microsoft Visual Studio Community 2022 v17.8.3 and .NET 7
29+
The latest version of this project was written in C# using Microsoft Visual Studio Community 2022 v17.9.6 and .NET 8
3030

3131
# Acknowledgements
3232
This utility would not be possible without: https://github.com/aloneguid/parquet-dotnet

src/Directory.Packages.props

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
44
</PropertyGroup>
55
<ItemGroup>
6-
<PackageVersion Include="Apache.Arrow" Version="15.0.0" />
7-
<PackageVersion Include="Parquet.Net" Version="4.23.4" />
6+
<PackageVersion Include="Apache.Arrow" Version="16.0.0" />
7+
<PackageVersion Include="Parquet.Net" Version="4.23.5" />
88
<PackageVersion Include="Microsoft.CSharp" Version="4.7.0" />
99
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.6.0" />
1010
<PackageVersion Include="RichardSzalay.MockHttp" Version="6.0.0" />

src/ParquetViewer.Engine/DataTableLite.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ public DataTable ToDataTable(CancellationToken token, IProgress<int>? progress =
7474
throw new NotSupportedException($"Duplicate column '{columnPath}' detected. Column names are case insensitive and must be unique.");
7575
}
7676

77-
dataTable.Columns.Add(new DataColumn(columnLite.Name, columnLite.Type));
77+
var columnType = columnLite.Type;
78+
dataTable.Columns.Add(new DataColumn(columnName: columnLite.Name, dataType: columnType));
7879
}
7980

8081
dataTable.BeginLoadData();

src/ParquetViewer.Engine/Helpers.cs

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
namespace ParquetViewer.Engine
{
    /// <summary>
    /// Small, stateless utility routines shared inside the engine assembly.
    /// </summary>
    internal static class Helpers
    {
        /// <summary>
        /// Compares two loosely-typed values and returns their relative order.
        /// </summary>
        /// <remarks>
        /// <para><c>null</c> and <see cref="DBNull.Value"/> are treated as the same "missing" marker.
        /// A missing value orders before any present value, and two missing values are equal.</para>
        /// <para>When both values implement <see cref="IComparable"/> and have exactly the same runtime type,
        /// the comparison is delegated to <see cref="IComparable.CompareTo(object)"/>. Otherwise the values are
        /// compared by their <see cref="object.ToString"/> output using the current culture.</para>
        /// </remarks>
        /// <param name="value">Left-hand value; may be <c>null</c> or <see cref="DBNull.Value"/>.</param>
        /// <param name="otherValue">Right-hand value; may be <c>null</c> or <see cref="DBNull.Value"/>.</param>
        /// <returns>
        /// A negative number when <paramref name="value"/> orders first, zero when both are considered equal,
        /// and a positive number when <paramref name="otherValue"/> orders first.
        /// </returns>
        public static int CompareTo(object? value, object? otherValue)
        {
            // Normalize null to DBNull so that both "missing" representations take the same path.
            value ??= DBNull.Value;
            otherValue ??= DBNull.Value;

            bool isValueMissing = value is DBNull;
            bool isOtherValueMissing = otherValue is DBNull;

            if (isValueMissing && isOtherValueMissing)
            {
                return 0;
            }

            if (isOtherValueMissing)
            {
                return 1;
            }

            if (isValueMissing)
            {
                return -1;
            }

            // Only delegate to IComparable when the runtime types match exactly; most implementations
            // throw ArgumentException when handed an instance of a different type.
            if (value is IComparable comparableValue
                && otherValue is IComparable
                && value.GetType() == otherValue.GetType())
            {
                return comparableValue.CompareTo(otherValue);
            }

            // Fallback: order by textual representation. string.Compare accepts null operands, so a
            // ToString() override that returns null no longer causes a NullReferenceException, while
            // StringComparison.CurrentCulture keeps the same ordering string.CompareTo(string) produced.
            return string.Compare(value.ToString(), otherValue.ToString(), StringComparison.CurrentCulture);
        }
    }
}

src/ParquetViewer.Engine/ParquetEngine.Processor.cs

+103-49
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,19 @@ await ReadPrimitiveField(dataTable, groupReader, rowBeginIndex, field, skipRecor
9090
readRecords, isFirstColumn, cancellationToken, progress);
9191
break;
9292
case ParquetSchemaElement.FieldTypeId.List:
93-
await ReadListField(dataTable, groupReader, rowBeginIndex, field, skipRecords,
94-
readRecords, isFirstColumn, cancellationToken, progress);
93+
var listField = field.GetSingle("list");
94+
ParquetSchemaElement itemField;
95+
try
96+
{
97+
itemField = listField.GetSingle("item");
98+
}
99+
catch (Exception ex)
100+
{
101+
throw new UnsupportedFieldException($"Cannot load field `{field.Path}`. Invalid List type.", ex);
102+
}
103+
var fieldIndex = dataTable.Columns[field.Path]!.Ordinal;
104+
await ReadListField(dataTable, groupReader, rowBeginIndex, itemField, fieldIndex,
105+
skipRecords, readRecords, isFirstColumn, cancellationToken, progress);
95106
break;
96107
case ParquetSchemaElement.FieldTypeId.Map:
97108
await ReadMapField(dataTable, groupReader, rowBeginIndex, field, skipRecords,
@@ -115,64 +126,56 @@ private async Task ReadPrimitiveField(DataTableLite dataTable, ParquetRowGroupRe
115126
int skippedRecords = 0;
116127
var dataColumn = await groupReader.ReadColumnAsync(field.DataField ?? throw new Exception($"Pritimive field `{field.Path}` is missing its data field"), cancellationToken);
117128

129+
bool doesFieldBelongToAList = dataColumn.RepetitionLevels?.Any(l => l > 0) ?? false;
118130
int fieldIndex = dataTable.Columns[field.Path]?.Ordinal ?? throw new Exception($"Column `{field.Path}` is missing");
119-
var fieldType = dataTable.Columns[field.Path].Type; var byteArrayValueType = typeof(ByteArrayValue);
120-
foreach (var value in dataColumn.Data)
131+
if (doesFieldBelongToAList)
121132
{
122-
cancellationToken.ThrowIfCancellationRequested();
123-
124-
if (skipRecords > skippedRecords)
133+
dataColumn = null;
134+
await ReadListField(dataTable, groupReader, rowBeginIndex, field, fieldIndex, skipRecords, readRecords, isFirstColumn, cancellationToken, progress);
135+
}
136+
else
137+
{
138+
var fieldType = dataTable.Columns[field.Path].Type; var byteArrayValueType = typeof(ByteArrayValue);
139+
foreach (var value in dataColumn.Data)
125140
{
126-
skippedRecords++;
127-
continue;
128-
}
141+
cancellationToken.ThrowIfCancellationRequested();
129142

130-
if (rowIndex - rowBeginIndex >= readRecords)
131-
break;
143+
if (skipRecords > skippedRecords)
144+
{
145+
skippedRecords++;
146+
continue;
147+
}
132148

133-
if (isFirstColumn)
134-
{
135-
dataTable.NewRow();
136-
}
149+
if (rowIndex - rowBeginIndex >= readRecords)
150+
break;
137151

138-
if (value == DBNull.Value || value is null)
139-
{
140-
dataTable.Rows[rowIndex]![fieldIndex] = DBNull.Value;
141-
}
142-
else if (fieldType == byteArrayValueType)
143-
{
144-
dataTable.Rows[rowIndex]![fieldIndex] = new ByteArrayValue(field.Path, (byte[])value);
145-
}
146-
else
147-
{
148-
dataTable.Rows[rowIndex]![fieldIndex] = FixDateTime(value, field);
149-
}
152+
if (isFirstColumn)
153+
{
154+
dataTable.NewRow();
155+
}
150156

151-
rowIndex++;
152-
progress?.Report(1);
157+
if (value == DBNull.Value || value is null)
158+
{
159+
dataTable.Rows[rowIndex]![fieldIndex] = DBNull.Value;
160+
}
161+
else if (fieldType == byteArrayValueType)
162+
{
163+
dataTable.Rows[rowIndex]![fieldIndex] = new ByteArrayValue(field.Path, (byte[])value);
164+
}
165+
else
166+
{
167+
dataTable.Rows[rowIndex]![fieldIndex] = FixDateTime(value, field);
168+
}
169+
170+
rowIndex++;
171+
progress?.Report(1);
172+
}
153173
}
154174
}
155175

156-
/// <summary>
157-
/// This is a patch fix to handle malformed datetime fields. We assume TIMESTAMP fields are DateTime values.
158-
/// </summary>
159-
/// <param name="value">Original value</param>
160-
/// <param name="field">Schema element</param>
161-
/// <returns>If the field is a timestamp, a DateTime object will be returned. Otherwise the value will not be changed.</returns>
162-
private async Task ReadListField(DataTableLite dataTable, ParquetRowGroupReader groupReader, int rowBeginIndex, ParquetSchemaElement field,
176+
private async Task ReadListField(DataTableLite dataTable, ParquetRowGroupReader groupReader, int rowBeginIndex, ParquetSchemaElement itemField, int fieldIndex,
163177
long skipRecords, long readRecords, bool isFirstColumn, CancellationToken cancellationToken, IProgress<int>? progress)
164178
{
165-
var listField = field.GetSingle("list");
166-
ParquetSchemaElement itemField;
167-
try
168-
{
169-
itemField = listField.GetSingle("item");
170-
}
171-
catch (Exception ex)
172-
{
173-
throw new UnsupportedFieldException($"Cannot load field `{field.Path}`. Invalid List type.", ex);
174-
}
175-
176179
if (itemField.FieldType() == ParquetSchemaElement.FieldTypeId.Primitive)
177180
{
178181
int rowIndex = rowBeginIndex;
@@ -181,7 +184,6 @@ private async Task ReadListField(DataTableLite dataTable, ParquetRowGroupReader
181184
var dataColumn = await groupReader.ReadColumnAsync(itemField.DataField!, cancellationToken);
182185

183186
ArrayList? rowValue = null;
184-
var fieldIndex = dataTable.Columns[field.Path]!.Ordinal;
185187
for (int i = 0; i < dataColumn.Data.Length; i++)
186188
{
187189
cancellationToken.ThrowIfCancellationRequested();
@@ -230,6 +232,58 @@ bool IsEndOfRow() => (i + 1) == dataColumn.RepetitionLevels!.Length
230232
}
231233
}
232234
}
235+
else if (itemField.FieldType() == ParquetSchemaElement.FieldTypeId.Struct)
236+
{
237+
//Read struct data as a new datatable
238+
DataTableLite structFieldTable = BuildDataTable(itemField, itemField.Children.Select(f => f.Path).ToList(), (int)readRecords);
239+
240+
//Need to calculate progress differently for structs
241+
var structFieldReadProgress = StructReadProgress(progress, structFieldTable.Columns.Count);
242+
243+
//Read the struct data and populate the datatable
244+
await ProcessRowGroup(structFieldTable, groupReader, skipRecords, readRecords, cancellationToken, structFieldReadProgress);
245+
246+
//We need to pivot the data into a new data table (because we read it in columnar fashion above)
247+
int rowIndex = rowBeginIndex;
248+
foreach (var values in structFieldTable.Rows)
249+
{
250+
var newStructFieldTable = BuildDataTable(itemField, itemField.Children.Select(f => f.Path).ToList(), (int)readRecords);
251+
for (var columnOrdinal = 0; columnOrdinal < values.Length; columnOrdinal++)
252+
{
253+
if (values[columnOrdinal] == DBNull.Value)
254+
{
255+
//Empty array
256+
continue;
257+
}
258+
259+
var columnValues = (ListValue)values[columnOrdinal];
260+
for (var rowValueIndex = 0; rowValueIndex < columnValues.Data.Count; rowValueIndex++)
261+
{
262+
var columnValue = columnValues.Data[rowValueIndex] ?? throw new SystemException("This should never happen");
263+
bool isFirstValueColumn = columnOrdinal == 0;
264+
if (isFirstValueColumn)
265+
{
266+
newStructFieldTable.NewRow();
267+
}
268+
newStructFieldTable.Rows[rowValueIndex][columnOrdinal] = columnValue;
269+
}
270+
}
271+
272+
if (isFirstColumn)
273+
dataTable.NewRow();
274+
275+
var listValuesDataTable = newStructFieldTable.ToDataTable(cancellationToken);
276+
var listValues = new ArrayList(listValuesDataTable.Rows.Count);
277+
foreach (DataRow row in listValuesDataTable.Rows)
278+
{
279+
var newStructField = new StructValue(itemField.Path, row);
280+
listValues.Add(newStructField);
281+
}
282+
283+
dataTable.Rows[rowIndex][fieldIndex] = new ListValue(listValues, typeof(StructValue));
284+
rowIndex++;
285+
}
286+
}
233287
else
234288
{
235289
throw new NotSupportedException($"Lists of {itemField.FieldType()}s are not currently supported");

src/ParquetViewer.Engine/ParquetViewer.Engine.csproj

+14-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,24 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22

33
<PropertyGroup>
4-
<TargetFramework>net7.0-windows</TargetFramework>
4+
<TargetFramework>net8.0</TargetFramework>
55
<ImplicitUsings>enable</ImplicitUsings>
66
<Nullable>enable</Nullable>
77
<PlatformTarget>x64</PlatformTarget>
88
<Configurations>Debug;Release;Release_SelfContained</Configurations>
9+
<ProduceReferenceAssembly>False</ProduceReferenceAssembly>
10+
<EnforceCodeStyleInBuild>True</EnforceCodeStyleInBuild>
11+
</PropertyGroup>
12+
13+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release_SelfContained|AnyCPU'">
14+
<Optimize>True</Optimize>
15+
</PropertyGroup>
16+
17+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
18+
</PropertyGroup>
19+
20+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
21+
<Optimize>True</Optimize>
922
</PropertyGroup>
1023
<ItemGroup>
1124
<PackageReference Include="Parquet.Net" />

src/ParquetViewer.Engine/Types/ByteArrayValue.cs

+21-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
namespace ParquetViewer.Engine.Types
22
{
3-
public class ByteArrayValue
3+
public class ByteArrayValue : IComparable<ByteArrayValue>, IComparable
44
{
55
public string Name { get; }
66
public byte[] Data { get; }
@@ -12,5 +12,25 @@ public ByteArrayValue(string name, byte[] data)
1212
}
1313

1414
public override string ToString() => BitConverter.ToString(this.Data);
15+
16+
public int CompareTo(ByteArrayValue? other)
17+
{
18+
if (other?.Data is null)
19+
return 1;
20+
else if (this.Data is null)
21+
return -1;
22+
else
23+
return ByteArraysEqual(this.Data, other.Data);
24+
}
25+
26+
private static int ByteArraysEqual(ReadOnlySpan<byte> a1, ReadOnlySpan<byte> a2) => a1.SequenceCompareTo(a2);
27+
28+
public int CompareTo(object? obj)
29+
{
30+
if (obj is ByteArrayValue byteArray)
31+
return CompareTo(byteArray);
32+
else
33+
return 1;
34+
}
1535
}
1636
}

0 commit comments

Comments
 (0)