Skip to content

Commit cbfefce

Browse files
authored
Merge pull request #44 from mukunku/v2.3.4
v2.3.4
2 parents 0512e52 + 3ea6769 commit cbfefce

7 files changed

+159
-78
lines changed

src/ParquetFileViewer.sln

+11-2
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11

22
Microsoft Visual Studio Solution File, Format Version 12.00
3-
# Visual Studio 2013
4-
VisualStudioVersion = 12.0.40629.0
3+
# Visual Studio Version 16
4+
VisualStudioVersion = 16.0.29201.188
55
MinimumVisualStudioVersion = 10.0.40219.1
66
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ParquetFileViewer", "ParquetFileViewer\ParquetFileViewer.csproj", "{6019FC1B-3610-4682-BF96-8345C95CB7EC}"
77
EndProject
8+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Utilities", "Utilities\Utilities.csproj", "{F423D115-06A0-47AF-A86E-2775E2F894F8}"
9+
EndProject
810
Global
911
GlobalSection(SolutionConfigurationPlatforms) = preSolution
1012
Debug|Any CPU = Debug|Any CPU
@@ -15,8 +17,15 @@ Global
1517
{6019FC1B-3610-4682-BF96-8345C95CB7EC}.Debug|Any CPU.Build.0 = Debug|Any CPU
1618
{6019FC1B-3610-4682-BF96-8345C95CB7EC}.Release|Any CPU.ActiveCfg = Release|Any CPU
1719
{6019FC1B-3610-4682-BF96-8345C95CB7EC}.Release|Any CPU.Build.0 = Release|Any CPU
20+
{F423D115-06A0-47AF-A86E-2775E2F894F8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
21+
{F423D115-06A0-47AF-A86E-2775E2F894F8}.Debug|Any CPU.Build.0 = Debug|Any CPU
22+
{F423D115-06A0-47AF-A86E-2775E2F894F8}.Release|Any CPU.ActiveCfg = Release|Any CPU
23+
{F423D115-06A0-47AF-A86E-2775E2F894F8}.Release|Any CPU.Build.0 = Release|Any CPU
1824
EndGlobalSection
1925
GlobalSection(SolutionProperties) = preSolution
2026
HideSolutionNode = FALSE
2127
EndGlobalSection
28+
GlobalSection(ExtensibilityGlobals) = postSolution
29+
SolutionGuid = {24015CE0-473F-4A3B-89BE-E0CEEEA261B6}
30+
EndGlobalSection
2231
EndGlobal
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
1-
using Newtonsoft.Json;
2-
using Newtonsoft.Json.Linq;
3-
using System;
4-
using System.Collections.Generic;
1+
using System.Collections.Generic;
52
using System.Data;
6-
using System.IO;
73

84
namespace ParquetFileViewer.Helpers
95
{
@@ -23,33 +19,5 @@ public static IList<string> GetColumnNames(this DataTable datatable)
2319
}
2420
return columns;
2521
}
26-
27-
public static string FormatJSON(this string input)
28-
{
29-
if (input == null)
30-
return null;
31-
32-
try
33-
{
34-
return JValue.Parse(input).ToString(Formatting.Indented);
35-
}
36-
catch (Exception)
37-
{
38-
//malformed json detected
39-
return input;
40-
}
41-
}
42-
43-
public static string Base64Encode(this string plainText)
44-
{
45-
var plainTextBytes = System.Text.Encoding.UTF8.GetBytes(plainText);
46-
return Convert.ToBase64String(plainTextBytes);
47-
}
48-
49-
public static string Base64Decode(this string base64EncodedData)
50-
{
51-
var base64EncodedBytes = Convert.FromBase64String(base64EncodedData);
52-
return System.Text.Encoding.UTF8.GetString(base64EncodedBytes);
53-
}
5422
}
5523
}

src/ParquetFileViewer/MetadataViewer.cs

+6-39
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
1-
using Parquet.Thrift;
2-
using ParquetFileViewer.Helpers;
3-
using System;
4-
using System.Linq;
1+
using System;
52
using System.Collections.Generic;
63
using System.Drawing;
7-
using System.Text;
8-
using System.Threading.Tasks;
94
using System.Windows.Forms;
5+
using Utilities;
106

117
namespace ParquetFileViewer
128
{
@@ -58,34 +54,8 @@ private void MainBackgroundWorker_DoWork(object sender, System.ComponentModel.Do
5854
var metadataResult = new List<(string TabName, string Text)>();
5955
if (parquetReader.ThriftMetadata != null)
6056
{
61-
var thriftMetadata = parquetReader.ThriftMetadata;
62-
var jsonObject = new Newtonsoft.Json.Linq.JObject();
63-
jsonObject[nameof(thriftMetadata.Version)] = thriftMetadata.Version;
64-
jsonObject[nameof(thriftMetadata.Num_rows)] = thriftMetadata.Num_rows;
65-
jsonObject[nameof(thriftMetadata.Created_by)] = thriftMetadata.Created_by;
66-
67-
var schemas = new Newtonsoft.Json.Linq.JArray();
68-
foreach (var schema in thriftMetadata.Schema)
69-
{
70-
if ("schema".Equals(schema.Name) && schemas.Count == 0)
71-
continue;
72-
73-
var schemaObject = new Newtonsoft.Json.Linq.JObject();
74-
schemaObject[nameof(schema.Field_id)] = schema.Field_id;
75-
schemaObject[nameof(schema.Name)] = schema.Name;
76-
schemaObject[nameof(schema.Type)] = schema.Type.ToString();
77-
schemaObject[nameof(schema.Type_length)] = schema.Type_length;
78-
schemaObject[nameof(schema.LogicalType)] = schema.LogicalType?.ToString();
79-
schemaObject[nameof(schema.Scale)] = schema.Scale;
80-
schemaObject[nameof(schema.Precision)] = schema.Precision;
81-
schemaObject[nameof(schema.Repetition_type)] = schema.Repetition_type.ToString();
82-
schemaObject[nameof(schema.Converted_type)] = schema.Converted_type.ToString();
83-
84-
schemas.Add(schemaObject);
85-
}
86-
jsonObject[nameof(thriftMetadata.Schema)] = schemas;
87-
88-
metadataResult.Add((THRIFT_METADATA, jsonObject.ToString().FormatJSON()));
57+
string json = ParquetMetadataAnalyzers.ThriftMetadataToJSON(parquetReader.ThriftMetadata);
58+
metadataResult.Add((THRIFT_METADATA, json));
8959
}
9060
else
9161
metadataResult.Add((THRIFT_METADATA, "No thrift metadata available"));
@@ -97,14 +67,11 @@ private void MainBackgroundWorker_DoWork(object sender, System.ComponentModel.Do
9767
string value = _customMetadata.Value;
9868
if (PANDAS_SCHEMA.Equals(_customMetadata.Key))
9969
{
100-
value = value.FormatJSON();
70+
value = ParquetMetadataAnalyzers.PandasSchemaToJSON(value);
10171
}
10272
else if (APACHE_ARROW_SCHEMA.Equals(_customMetadata.Key))
10373
{
104-
//TODO: Base64 decode on its own doesn't accomplish anything.
105-
//Need some way to read the schema but there isn't anything in the apache arrow repo for this...
106-
//https://github.com/apache/arrow/blob/master/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
107-
//value = value.Base64Decode();
74+
value = ParquetMetadataAnalyzers.ApacheArrowToJSON(value);
10875
}
10976

11077
metadataResult.Add((_customMetadata.Key, value));

src/ParquetFileViewer/ParquetFileViewer.csproj

+7-3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
<Import Project="..\packages\Costura.Fody.4.1.0\build\Costura.Fody.props" Condition="Exists('..\packages\Costura.Fody.4.1.0\build\Costura.Fody.props')" />
44
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
55
<PropertyGroup>
6+
<RestoreProjectStyle>PackageReference</RestoreProjectStyle>
67
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
78
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
89
<ProjectGuid>{6019FC1B-3610-4682-BF96-8345C95CB7EC}</ProjectGuid>
@@ -46,9 +47,6 @@
4647
<Reference Include="IronSnappy, Version=1.3.0.0, Culture=neutral, PublicKeyToken=b1d4b1dc83bdcf31, processorArchitecture=MSIL">
4748
<HintPath>..\packages\IronSnappy.1.3.0\lib\netstandard2.0\IronSnappy.dll</HintPath>
4849
</Reference>
49-
<Reference Include="Newtonsoft.Json, Version=13.0.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
50-
<HintPath>..\packages\Newtonsoft.Json.13.0.1\lib\net45\Newtonsoft.Json.dll</HintPath>
51-
</Reference>
5250
<Reference Include="Parquet, Version=3.0.0.0, Culture=neutral, PublicKeyToken=d380b3dee6d01926, processorArchitecture=MSIL">
5351
<HintPath>..\packages\Parquet.Net.3.8.6\lib\netstandard2.0\Parquet.dll</HintPath>
5452
</Reference>
@@ -156,6 +154,12 @@
156154
<None Include="Resources\coffee.gif" />
157155
<Content Include="Resources\hourglass.gif" />
158156
</ItemGroup>
157+
<ItemGroup>
158+
<ProjectReference Include="..\Utilities\Utilities.csproj">
159+
<Project>{f423d115-06a0-47af-a86e-2775e2f894f8}</Project>
160+
<Name>Utilities</Name>
161+
</ProjectReference>
162+
</ItemGroup>
159163
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
160164
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
161165
<PropertyGroup>

src/ParquetFileViewer/packages.config

-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
<package id="Costura.Fody" version="4.1.0" targetFramework="net461" />
44
<package id="Fody" version="6.5.0" targetFramework="net461" developmentDependency="true" />
55
<package id="IronSnappy" version="1.3.0" targetFramework="net461" />
6-
<package id="Newtonsoft.Json" version="13.0.1" targetFramework="net461" />
76
<package id="Parquet.Net" version="3.8.6" targetFramework="net461" />
87
<package id="System.Buffers" version="4.5.1" targetFramework="net461" />
98
<package id="System.Memory" version="4.5.4" targetFramework="net461" />
+121
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
using Apache.Arrow.Ipc;
2+
using Apache.Arrow.Types;
3+
using Newtonsoft.Json;
4+
using Newtonsoft.Json.Linq;
5+
using Parquet.Thrift;
6+
using System;
7+
8+
namespace Utilities
{
    /// <summary>
    /// Converts the different metadata representations handled by the viewer
    /// (Apache Arrow schema, Thrift file metadata, pandas schema) into indented
    /// JSON text.
    /// </summary>
    /// <remarks>
    /// Every public method wraps its work in a catch-all handler and returns a
    /// string in the failure case (an error description or the unmodified input)
    /// instead of propagating the exception.
    /// </remarks>
    public static class ParquetMetadataAnalyzers
    {
        /// <summary>
        /// Decodes a Base64 string, reads it as an Arrow IPC stream and returns
        /// the stream's schema serialized as indented JSON.
        /// </summary>
        /// <param name="base64">Base64-encoded Arrow stream bytes.</param>
        /// <returns>
        /// The serialized schema, or an error description (including the exception
        /// details) when decoding, reading or serialization fails.
        /// </returns>
        public static string ApacheArrowToJSON(string base64)
        {
            try
            {
                byte[] bytes = Convert.FromBase64String(base64);
                using (ArrowStreamReader reader = new ArrowStreamReader(bytes))
                {
                    // NOTE(review): presumably required so that reader.Schema is
                    // populated before it is serialized; confirm against the
                    // Apache.Arrow version in use.
                    reader.ReadNextRecordBatch();

                    // The previous revision carried a hand-built JObject version of
                    // this output after the return statement; it could never run
                    // and has been removed.
                    return JsonConvert.SerializeObject(reader.Schema, Formatting.Indented);
                }
            }
            catch (Exception ex)
            {
                return DescribeFailure(ex);
            }
        }

        /// <summary>
        /// Builds an indented JSON document from the Thrift file metadata:
        /// version, row count, creator and one entry per schema element.
        /// </summary>
        /// <param name="thriftMetadata">The Thrift metadata to describe.</param>
        /// <returns>
        /// The JSON text, or an error description (including the exception details)
        /// when anything fails, e.g. because <paramref name="thriftMetadata"/> is null.
        /// </returns>
        public static string ThriftMetadataToJSON(FileMetaData thriftMetadata)
        {
            try
            {
                var jsonObject = new JObject();
                jsonObject[nameof(thriftMetadata.Version)] = thriftMetadata.Version;
                jsonObject[nameof(thriftMetadata.Num_rows)] = thriftMetadata.Num_rows;
                jsonObject[nameof(thriftMetadata.Created_by)] = thriftMetadata.Created_by;

                var schemas = new JArray();
                foreach (var schema in thriftMetadata.Schema)
                {
                    // Skip an element named "schema" as long as nothing has been
                    // emitted yet (i.e. leading elements with that name).
                    if ("schema".Equals(schema.Name) && schemas.Count == 0)
                    {
                        continue;
                    }

                    var schemaObject = new JObject();
                    schemaObject[nameof(schema.Field_id)] = schema.Field_id;
                    schemaObject[nameof(schema.Name)] = schema.Name;
                    schemaObject[nameof(schema.Type)] = schema.Type.ToString();
                    schemaObject[nameof(schema.Type_length)] = schema.Type_length;
                    schemaObject[nameof(schema.LogicalType)] = schema.LogicalType?.ToString();
                    schemaObject[nameof(schema.Scale)] = schema.Scale;
                    schemaObject[nameof(schema.Precision)] = schema.Precision;
                    schemaObject[nameof(schema.Repetition_type)] = schema.Repetition_type.ToString();
                    schemaObject[nameof(schema.Converted_type)] = schema.Converted_type.ToString();

                    schemas.Add(schemaObject);
                }
                jsonObject[nameof(thriftMetadata.Schema)] = schemas;

                return jsonObject.ToString(Formatting.Indented);
            }
            catch (Exception ex)
            {
                return DescribeFailure(ex);
            }
        }

        /// <summary>
        /// Pretty-prints a pandas schema, which is already JSON text.
        /// </summary>
        /// <param name="pandas">The pandas schema as a JSON string.</param>
        /// <returns>
        /// The re-indented JSON, or <paramref name="pandas"/> unchanged when it is
        /// null or cannot be parsed.
        /// </returns>
        public static string PandasSchemaToJSON(string pandas)
        {
            if (pandas == null)
            {
                // Same result the parse failure path produced for null input,
                // without going through an exception.
                return pandas;
            }

            try
            {
                // Pandas is already json; so just make it pretty.
                return JToken.Parse(pandas).ToString(Formatting.Indented);
            }
            catch (Exception)
            {
                // Malformed json detected: deliberately fall back to the raw text
                // so the caller still has something to display.
                return pandas;
            }
        }

        /// <summary>
        /// Formats the message returned to callers when a conversion fails.
        /// </summary>
        /// <param name="ex">The exception that interrupted the conversion.</param>
        /// <returns>A fixed heading, a blank line and the full exception text.</returns>
        private static string DescribeFailure(Exception ex)
        {
            return $"Something went wrong while processing the schema:{Environment.NewLine}{Environment.NewLine}{ex}";
        }
    }
}

src/Utilities/Utilities.csproj

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<TargetFramework>netstandard2.0</TargetFramework>
5+
</PropertyGroup>
6+
7+
<ItemGroup>
8+
<PackageReference Include="Apache.Arrow" Version="5.0.0" />
9+
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
10+
<PackageReference Include="Parquet.Net" Version="3.8.6" />
11+
</ItemGroup>
12+
13+
</Project>

0 commit comments

Comments
 (0)