@@ -41,9 +41,11 @@ import Data.Time
4141import qualified Data.Vector as V
4242import qualified Data.Vector.Unboxed as VU
4343import Data.Word
44+ import qualified Data.Set as S
4445import qualified DataFrame.IO.CSV as CSV
4546import qualified DataFrame.IO.Parquet as Parquet
4647import DataFrame.IO.Parquet.Thrift
48+ import DataFrame.IO.Parquet.Types (columnNullCount )
4749import DataFrame.Internal.Nullable (
4850 BaseType ,
4951 NullLift1Op (applyNull1 ),
@@ -537,33 +539,39 @@ declareColumnsFromCsvFile path = do
{- | Generate column declarations from the schema of a Parquet file, or of
every @*.parquet@ file matched directly inside a directory.

Only the files' metadata is read. A column is treated as nullable when any
row group of any matched file reports a null count greater than zero in its
column statistics; the lookup is keyed by the last component of the column's
path in the schema. The per-file schemas are then turned into empty frames
(see 'schemaToEmptyDataFrame'), combined left to right with '<>', and handed
to 'declareColumns'.
-}
declareColumnsFromParquetFile :: String -> DecsQ
declareColumnsFromParquetFile path = do
    isDir <- liftIO $ doesDirectoryExist path
    -- The pattern must not start with a space, otherwise it would only match
    -- file names that themselves begin with a space.
    let pat = if isDir then path </> "*.parquet" else path
    matches <- liftIO $ glob pat
    -- Drop any directories the glob picked up; only regular entries are read.
    files <- liftIO $ filterM (fmap Prelude.not . doesDirectoryExist) matches
    metas <- liftIO $ mapM (fmap fst . Parquet.readMetadataFromPath) files
    let nullableCols :: S.Set T.Text
        nullableCols =
            S.fromList
                [ T.pack leaf
                | meta <- metas
                , rg <- rowGroups meta
                , cc <- rowGroupColumns rg
                , let cm = columnMetaData cc
                , -- Total replacement for @last@: an empty path simply
                  -- contributes nothing.
                  leaf : _ <- [reverse (columnPathInSchema cm)]
                , columnNullCount (columnStatistics cm) > 0
                ]
        df =
            foldl
                (\acc meta -> acc <> schemaToEmptyDataFrame nullableCols (schema meta))
                DataFrame.Internal.DataFrame.empty
                metas
    declareColumns df
557561
{- | Build an empty 'DataFrame' from a list of Parquet schema elements.

Only leaf elements (those whose 'numChildren' is zero) become columns; each
one is converted with 'schemaElemToColumn', which receives the set of column
names to be treated as nullable.
-}
schemaToEmptyDataFrame :: S.Set T.Text -> [SchemaElement] -> DataFrame
schemaToEmptyDataFrame nullableCols elems =
    fromNamedColumns
        [ schemaElemToColumn nullableCols leaf
        | leaf <- elems
        , numChildren leaf == 0
        ]
562566
{- | Turn one schema element into a named, empty column.

The column name is the element's 'elementName'. When that name is a member of
the supplied set, the column is built with 'emptyNullableColumnForType';
otherwise 'emptyColumnForType' is used.
-}
schemaElemToColumn :: S.Set T.Text -> SchemaElement -> (T.Text, Column)
schemaElemToColumn nullableCols se = (colName, mkEmpty (elementType se))
  where
    colName = elementName se
    -- Pick the constructor once, based on membership in the nullable set.
    mkEmpty
        | S.member colName nullableCols = emptyNullableColumnForType
        | otherwise = emptyColumnForType
567575
568576emptyColumnForType :: TType -> Column
569577emptyColumnForType = \ case
@@ -578,6 +586,19 @@ emptyColumnForType = \case
578586 STRING -> fromList @ T. Text []
579587 other -> error $ " Unsupported parquet type for column: " <> show other
580588
{- | Create an empty column whose element type is @Maybe@ of the Haskell type
chosen for the given Parquet type tag.

Calls 'error' for any tag not listed below. Note that @I96@ uses the same
element type as @I64@.
-}
emptyNullableColumnForType :: TType -> Column
emptyNullableColumnForType ty = case ty of
    BOOL -> fromList @(Maybe Bool) []
    BYTE -> fromList @(Maybe Word8) []
    I16 -> fromList @(Maybe Int16) []
    I32 -> fromList @(Maybe Int32) []
    I64 -> fromList @(Maybe Int64) []
    I96 -> fromList @(Maybe Int64) []
    FLOAT -> fromList @(Maybe Float) []
    DOUBLE -> fromList @(Maybe Double) []
    STRING -> fromList @(Maybe T.Text) []
    unsupported ->
        error (" Unsupported parquet type for column: " <> show unsupported)
601+
581602declareColumnsFromCsvWithOpts :: CSV. ReadOptions -> String -> DecsQ
582603declareColumnsFromCsvWithOpts opts path = do
583604 df <- liftIO (CSV. readSeparated opts path)
0 commit comments