Skip to content

Commit 016a960

Browse files
authored
Document and test safeColumns in ParquetReadOptions (#190)
1 parent 1a6443a commit 016a960

File tree

3 files changed

+86
-3
lines changed

3 files changed

+86
-3
lines changed

docs/cookbook.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -369,8 +369,9 @@ dataframe> :declareColumns df0
369369
- `selectedColumns`
370370
- `predicate`
371371
- `rowRange`
372+
- `safeColumns`
372373

373-
Options are applied in this order: predicate filtering, column projection, then row range.
374+
Options are applied in this order: predicate filtering, column projection, row range, then safe column promotion.
374375

375376
**Exercise 11: Parquet projection**
376377

@@ -414,7 +415,21 @@ dataframe| "./data/mtcars.parquet"
414415

415416
When `selectedColumns` is set, columns referenced by `predicate` are automatically read as needed, then projected back to the requested output columns.
416417

417-
**Exercise 14: using the typed API**
418+
**Exercise 14: Safe column promotion**
419+
420+
Read the file while promoting every output column to an optional column.
421+
422+
### Solution
423+
424+
```haskell
425+
dataframe> D.readParquetWithOpts
426+
dataframe| (D.defaultParquetReadOptions{D.safeColumns = True})
427+
dataframe| "./data/mtcars.parquet"
428+
```
429+
430+
Use `safeColumns` when downstream code wants a uniformly nullable schema, even when the Parquet file marks some columns as non-nullable.
431+
432+
**Exercise 15: using the typed API**
418433
_This problem is called "Interviews" in HackerRank.
419434
Samantha interviews many candidates from different colleges using coding challenges and contests. Write a query to print the contest_id, hacker_id, name, and the sums of total_submissions, total_accepted_submissions, total_views, and total_unique_views for each contest sorted by contest_id. Exclude the contest from the result if all four sums are 0.
420435

src/DataFrame/IO/Parquet.hs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ These options are applied in this order:
6464
1. predicate filtering
6565
2. column projection
6666
3. row range
67+
4. safe column promotion
6768
6869
Column selection for @selectedColumns@ uses leaf column names only.
6970
-}

tests/Parquet.hs

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
module Parquet where
55

66
import Assertions (assertExpectException)
7+
import Control.Monad (forM_)
78
import qualified DataFrame as D
89
import qualified DataFrame.Functions as F
910
import qualified DataFrame.IO.Parquet as DP
@@ -29,7 +30,7 @@ import DataFrame.Internal.Binary (
2930
word32ToLittleEndian,
3031
word64ToLittleEndian,
3132
)
32-
import DataFrame.Internal.Column (hasMissing)
33+
import DataFrame.Internal.Column (hasElemType, hasMissing)
3334
import DataFrame.Internal.DataFrame (unsafeGetColumn)
3435
import GHC.IO (unsafePerformIO)
3536
import Test.HUnit
@@ -41,6 +42,18 @@ testBothReadParquetPaths test =
4142
, test (DP._readParquetWithOpts (Just True) D.defaultParquetReadOptions)
4243
]
4344

45+
assertColumnNullability ::
46+
String -> [(T.Text, Bool)] -> D.DataFrame -> Assertion
47+
assertColumnNullability label expected df =
48+
forM_ expected $ \(columnName, shouldBeNullable) ->
49+
assertBool
50+
( label
51+
<> ": expected "
52+
<> T.unpack columnName
53+
<> if shouldBeNullable then " to be nullable" else " to be non-nullable"
54+
)
55+
(hasMissing (unsafeGetColumn columnName df) == shouldBeNullable)
56+
4457
allTypesPlain :: Test
4558
allTypesPlain = testBothReadParquetPaths $ \readParquet ->
4659
TestCase
@@ -169,6 +182,58 @@ predicateUsesNonSelectedColumnWithOpts =
169182
)
170183
)
171184

185+
safeColumnsWithOpts :: Test
186+
safeColumnsWithOpts =
187+
TestCase $ do
188+
defaultDf <- D.readParquet "./tests/data/alltypes_plain.parquet"
189+
safeDf <-
190+
D.readParquetWithOpts
191+
(D.defaultParquetReadOptions{D.safeColumns = True})
192+
"./tests/data/alltypes_plain.parquet"
193+
194+
assertEqual
195+
"safeColumnsWithOpts dimensions"
196+
(D.dimensions defaultDf)
197+
(D.dimensions safeDf)
198+
assertColumnNullability
199+
"default read"
200+
[("id", False), ("bool_col", False)]
201+
defaultDf
202+
assertColumnNullability
203+
"safeColumns read"
204+
[("id", True), ("bool_col", True)]
205+
safeDf
206+
assertBool
207+
"safeColumns id type"
208+
(hasElemType @(Maybe Int32) (unsafeGetColumn "id" safeDf))
209+
assertBool
210+
"safeColumns bool_col type"
211+
(hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" safeDf))
212+
213+
safeColumnsWithSelectedColumns :: Test
214+
safeColumnsWithSelectedColumns =
215+
TestCase $ do
216+
df <-
217+
D.readParquetWithOpts
218+
( D.defaultParquetReadOptions
219+
{ D.selectedColumns = Just ["id", "bool_col"]
220+
, D.safeColumns = True
221+
}
222+
)
223+
"./tests/data/alltypes_plain.parquet"
224+
225+
assertEqual "safeColumnsWithSelectedColumns dimensions" (8, 2) (D.dimensions df)
226+
assertColumnNullability
227+
"safeColumns projected read"
228+
[("id", True), ("bool_col", True)]
229+
df
230+
assertBool
231+
"safeColumns projected id type"
232+
(hasElemType @(Maybe Int32) (unsafeGetColumn "id" df))
233+
assertBool
234+
"safeColumns projected bool_col type"
235+
(hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" df))
236+
172237
predicateWithOptsAcrossFiles :: Test
173238
predicateWithOptsAcrossFiles =
174239
TestCase
@@ -1029,6 +1094,8 @@ tests =
10291094
, rowRangeWithOpts
10301095
, predicateWithOpts
10311096
, predicateUsesNonSelectedColumnWithOpts
1097+
, safeColumnsWithOpts
1098+
, safeColumnsWithSelectedColumns
10321099
, predicateWithOptsAcrossFiles
10331100
, missingSelectedColumnWithOpts
10341101
, mtCars

0 commit comments

Comments
 (0)