6
6
using System . Collections . Generic ;
7
7
using System . IO ;
8
8
using System . Linq ;
9
- using System . Threading ;
10
9
using Microsoft . Spark . E2ETest . Utils ;
11
10
using Microsoft . Spark . Extensions . Delta . Tables ;
12
11
using Microsoft . Spark . Sql ;
@@ -41,7 +40,7 @@ public void TestTutorialScenario()
41
40
data . Write ( ) . Format ( "delta" ) . Save ( path ) ;
42
41
43
42
// Validate that data contains the sequence [0 ... 4].
44
- ValidateDataFrame ( Enumerable . Range ( 0 , 5 ) , data ) ;
43
+ ValidateRangeDataFrame ( Enumerable . Range ( 0 , 5 ) , data ) ;
45
44
46
45
// Create a second iteration of the table.
47
46
data = _spark . Range ( 5 , 10 ) ;
@@ -51,7 +50,7 @@ public void TestTutorialScenario()
51
50
var deltaTable = DeltaTable . ForPath ( path ) ;
52
51
53
52
// Validate that deltaTable contains the sequence [5 ... 9].
54
- ValidateDataFrame ( Enumerable . Range ( 5 , 5 ) , deltaTable . ToDF ( ) ) ;
53
+ ValidateRangeDataFrame ( Enumerable . Range ( 5 , 5 ) , deltaTable . ToDF ( ) ) ;
55
54
56
55
// Update every even value by adding 100 to it.
57
56
deltaTable . Update (
@@ -70,7 +69,7 @@ public void TestTutorialScenario()
70
69
// |106|
71
70
// |108|
72
71
// +---+
73
- ValidateDataFrame (
72
+ ValidateRangeDataFrame (
74
73
new List < int > ( ) { 5 , 7 , 9 , 106 , 108 } ,
75
74
deltaTable . ToDF ( ) ) ;
76
75
@@ -85,7 +84,7 @@ public void TestTutorialScenario()
85
84
// | 7|
86
85
// | 9|
87
86
// +---+
88
- ValidateDataFrame ( new List < int > ( ) { 5 , 7 , 9 } , deltaTable . ToDF ( ) ) ;
87
+ ValidateRangeDataFrame ( new List < int > ( ) { 5 , 7 , 9 } , deltaTable . ToDF ( ) ) ;
89
88
90
89
// Upsert (merge) new data.
91
90
DataFrame newData = _spark . Range ( 0 , 20 ) . As ( "newData" ) . ToDF ( ) ;
@@ -100,7 +99,7 @@ public void TestTutorialScenario()
100
99
. Execute ( ) ;
101
100
102
101
// Validate that the resulting table contains the sequence [0 ... 19].
103
- ValidateDataFrame ( Enumerable . Range ( 0 , 20 ) , deltaTable . ToDF ( ) ) ;
102
+ ValidateRangeDataFrame ( Enumerable . Range ( 0 , 20 ) , deltaTable . ToDF ( ) ) ;
104
103
}
105
104
}
106
105
@@ -134,17 +133,90 @@ public void TestStreamingScenario()
134
133
135
134
// Now read the sink DeltaTable and validate its content.
136
135
DeltaTable sink = DeltaTable . ForPath ( sinkPath ) ;
137
- ValidateDataFrame ( Enumerable . Range ( 0 , 5 ) , sink . ToDF ( ) ) ;
136
+ ValidateRangeDataFrame ( Enumerable . Range ( 0 , 5 ) , sink . ToDF ( ) ) ;
138
137
139
138
// Write [5,6,7,8,9] to the source and trigger another stream batch.
140
139
_spark . Range ( 5 , 10 ) . Write ( ) . Format ( "delta" ) . Mode ( "append" ) . Save ( sourcePath ) ;
141
140
dataStreamWriter . Trigger ( Trigger . Once ( ) ) . Start ( sinkPath ) . AwaitTermination ( ) ;
142
141
143
142
// Finally, validate that the new data made its way to the sink.
144
- ValidateDataFrame ( Enumerable . Range ( 0 , 10 ) , sink . ToDF ( ) ) ;
143
+ ValidateRangeDataFrame ( Enumerable . Range ( 0 , 10 ) , sink . ToDF ( ) ) ;
145
144
}
146
145
}
147
146
147
/// <summary>
/// Exercises both overloads of <c>DeltaTable.IsDeltaTable()</c> against a plain Parquet
/// directory and a Delta table directory that were written from the same data.
/// </summary>
[SkipIfSparkVersionIsLessThan(Versions.V2_4_2)]
public void TestIsDeltaTable()
{
    using (var tempDirectory = new TemporaryDirectory())
    {
        // Write the same rows [0 ... 4] once as Parquet and once as a Delta table.
        DataFrame rows = _spark.Range(0, 5);

        string parquetPath = Path.Combine(tempDirectory.Path, "parquet-data");
        rows.Write().Parquet(parquetPath);

        string deltaTablePath = Path.Combine(tempDirectory.Path, "delta-table");
        rows.Write().Format("delta").Save(deltaTablePath);

        // The Parquet directory must not be reported as a Delta table by either overload.
        Assert.False(DeltaTable.IsDeltaTable(parquetPath));
        Assert.False(DeltaTable.IsDeltaTable(_spark, parquetPath));

        // The Delta directory must be reported as a Delta table by both overloads.
        Assert.True(DeltaTable.IsDeltaTable(deltaTablePath));
        Assert.True(DeltaTable.IsDeltaTable(_spark, deltaTablePath));
    }
}
169
+
170
/// <summary>
/// Test <c>DeltaTable.ConvertToDelta()</c> API.
/// Parquet data is written to a temporary directory, converted through each overload under
/// test, and the directory is then checked to be reported as a Delta table holding the
/// original rows.
/// </summary>
[SkipIfSparkVersionIsLessThan(Versions.V2_4_2)]
public void TestConvertToDelta()
{
    string partitionColumnName = "id_plus_one";

    // Two columns: the range "id" plus a derived column that is used for partitioning.
    // The backticks must hug the column name; a space inside them would become part of
    // the quoted identifier.
    DataFrame data = _spark.Range(0, 5).Select(
        Functions.Col("id"),
        Functions.Expr($"(`id` + 1) AS `{partitionColumnName}`"));

    // Run the same test on the different overloads of DeltaTable.ConvertToDelta().
    //   dataFrame       - data written out as Parquet before conversion.
    //   convertToDelta  - invokes the overload under test with the table identifier.
    //   partitionColumn - when non-empty, the Parquet output is partitioned by this column.
    void testWrapper(
        DataFrame dataFrame,
        Func<string, DeltaTable> convertToDelta,
        string partitionColumn = null)
    {
        using (var tempDirectory = new TemporaryDirectory())
        {
            string path = Path.Combine(tempDirectory.Path, "parquet-data");
            DataFrameWriter dataWriter = dataFrame.Write();

            if (!string.IsNullOrEmpty(partitionColumn))
            {
                dataWriter = dataWriter.PartitionBy(partitionColumn);
            }

            dataWriter.Parquet(path);

            // Freshly written Parquet must not yet be reported as a Delta table.
            Assert.False(DeltaTable.IsDeltaTable(path));

            // Identifier has the form parquet.`<path>`; the path is quoted with backticks
            // and must not pick up any surrounding whitespace.
            string identifier = $"parquet.`{path}`";
            DeltaTable convertedDeltaTable = convertToDelta(identifier);

            // The converted table still holds [0 ... 4] and the directory is now reported
            // as a Delta table.
            ValidateRangeDataFrame(Enumerable.Range(0, 5), convertedDeltaTable.ToDF());
            Assert.True(DeltaTable.IsDeltaTable(path));
        }
    }

    // Unpartitioned data, overload without a partition schema.
    testWrapper(data, identifier => DeltaTable.ConvertToDelta(_spark, identifier));

    // Partitioned data, overload taking the partition schema as a "<column> bigint" string.
    testWrapper(
        data.Repartition(Functions.Col(partitionColumnName)),
        identifier => DeltaTable.ConvertToDelta(
            _spark,
            identifier,
            $"{partitionColumnName} bigint"),
        partitionColumnName);
    // TODO: Test with StructType partition schema once StructType is supported.
}
219
+
148
220
/// <summary>
149
221
/// Test that methods return the expected signature.
150
222
/// </summary>
@@ -161,7 +233,11 @@ public void TestSignatures()
161
233
DeltaTable table = Assert . IsType < DeltaTable > ( DeltaTable . ForPath ( path ) ) ;
162
234
table = Assert . IsType < DeltaTable > ( DeltaTable . ForPath ( _spark , path ) ) ;
163
235
236
+ Assert . IsType < bool > ( DeltaTable . IsDeltaTable ( _spark , path ) ) ;
237
+ Assert . IsType < bool > ( DeltaTable . IsDeltaTable ( path ) ) ;
238
+
164
239
Assert . IsType < DeltaTable > ( table . As ( "oldTable" ) ) ;
240
+ Assert . IsType < DeltaTable > ( table . Alias ( "oldTable" ) ) ;
165
241
Assert . IsType < DataFrame > ( table . History ( ) ) ;
166
242
Assert . IsType < DataFrame > ( table . History ( 200 ) ) ;
167
243
Assert . IsType < DataFrame > ( table . ToDF ( ) ) ;
@@ -221,17 +297,31 @@ public void TestSignatures()
221
297
. Option ( "path" , path )
222
298
. Load ( ) ) ;
223
299
Assert . IsType < DataFrame > ( _spark . ReadStream ( ) . Format ( "delta" ) . Load ( path ) ) ;
300
+
301
+ // Create Parquet data and convert it to DeltaTables.
302
+ string parquetIdentifier = $ "parquet.`{ path } `";
303
+ rangeRate . Write ( ) . Mode ( SaveMode . Overwrite ) . Parquet ( path ) ;
304
+ Assert . IsType < DeltaTable > ( DeltaTable . ConvertToDelta ( _spark , parquetIdentifier ) ) ;
305
+ rangeRate
306
+ . Select ( Functions . Col ( "id" ) , Functions . Expr ( $ "(`id` + 1) AS `id_plus_one`") )
307
+ . Write ( )
308
+ . PartitionBy ( "id" )
309
+ . Mode ( SaveMode . Overwrite )
310
+ . Parquet ( path ) ;
311
+ Assert . IsType < DeltaTable > ( DeltaTable . ConvertToDelta (
312
+ _spark ,
313
+ parquetIdentifier ,
314
+ "id bigint" ) ) ;
315
+ // TODO: Test with StructType partition schema once StructType is supported.
224
316
}
225
317
}
226
318
227
319
/// <summary>
228
- /// Validate that a tutorial DataFrame contains only the expected values.
320
+ /// Validate that a range DataFrame contains only the expected values.
229
321
/// </summary>
230
322
/// <param name="expectedValues"></param>
231
323
/// <param name="dataFrame"></param>
232
- private void ValidateDataFrame (
233
- IEnumerable < int > expectedValues ,
234
- DataFrame dataFrame )
324
+ private void ValidateRangeDataFrame ( IEnumerable < int > expectedValues , DataFrame dataFrame )
235
325
{
236
326
Assert . Equal ( expectedValues . Count ( ) , dataFrame . Count ( ) ) ;
237
327
0 commit comments