Skip to content

Commit d74f762

Browse files
committed
feat: with safeRead enabled, every column is now read as Maybe a
The Parquet reader has an analogous option (safeColumns).
1 parent 281703d commit d74f762

File tree

16 files changed

+1234
-5122
lines changed

16 files changed

+1234
-5122
lines changed

dataframe-persistent/dataframe-persistent.cabal

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,11 @@ library
3535
exposed-modules: DataFrame.IO.Persistent,
3636
DataFrame.IO.Persistent.TH
3737
build-depends: base >= 4 && <5,
38-
bytestring >= 0.11 && < 0.13,
3938
containers >= 0.6.7 && < 0.9,
4039
dataframe ^>= 1,
4140
persistent >= 2.14 && < 3,
4241
template-haskell >= 2.0 && < 3,
4342
text >= 2.0 && < 3,
44-
time >= 1.12 && < 2,
4543
transformers >= 0.5 && < 0.7,
4644
vector ^>= 0.13
4745
hs-source-dirs: src

dataframe-persistent/src/DataFrame/IO/Persistent.hs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,16 @@ module DataFrame.IO.Persistent (
4141
import Control.Monad (forM)
4242
import Control.Monad.IO.Class (MonadIO, liftIO)
4343
import Control.Monad.Trans.Reader (ReaderT)
44-
import Data.ByteString (ByteString)
4544
import qualified Data.Map.Strict as M
4645
import Data.Proxy (Proxy (..))
4746
import Data.Text (Text)
4847
import qualified Data.Text as T
49-
import Data.Time (Day, TimeOfDay, UTCTime)
5048
import qualified Data.Vector as V
5149
import qualified DataFrame.Internal.Column as DFCol
5250
import DataFrame.Internal.DataFrame (DataFrame (..))
5351
import qualified DataFrame.Internal.DataFrame as DF
5452
import Database.Persist
5553
import Database.Persist.Sql hiding (Column)
56-
import Database.Persist.Types (fieldHaskell, getEntityFields, unFieldNameHS)
5754
import Unsafe.Coerce (unsafeCoerce)
5855

5956
-- | Get number of rows in a DataFrame

dataframe-persistent/src/DataFrame/IO/Persistent/TH.hs

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,16 @@ module DataFrame.IO.Persistent.TH (
1414
derivePersistentDataFrame,
1515
) where
1616

17-
import Control.Monad (forM, when)
17+
import Control.Monad (forM)
1818
import Data.Char
19-
import Data.List (foldl')
20-
import Data.Text (Text)
2119
import qualified Data.Text as T
2220
import qualified Data.Vector as V
2321
import DataFrame.Functions (col)
2422
import DataFrame.IO.Persistent
25-
import qualified DataFrame.Internal.Column as DFCol
2623
import DataFrame.Internal.Expression
2724
import Database.Persist
28-
import Database.Persist.Sql (fromSqlKey)
29-
import Database.Persist.TH
3025
import Language.Haskell.TH
31-
import Language.Haskell.TH.Syntax (Lift, lift)
26+
import Language.Haskell.TH.Syntax (lift)
3227

3328
import Debug.Trace (trace)
3429

dataframe.cabal

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ extra-source-files: cbits/process_csv.h
2020
cbits/arrow_abi.h
2121
cbits/dataframe_arrow.h
2222
cbits/rts_init.c
23+
tests/data/typing/texts.txt
24+
tests/data/typing/texts_with_empties.txt
25+
tests/data/typing/texts_with_empties_and_nullish.txt
2326
data/titanic/*.csv
2427
data/sharded/*.parquet
2528
tests/data/*.csv
@@ -274,6 +277,7 @@ test-suite tests
274277
Operations.Typing,
275278
LazyParquet,
276279
Parquet,
280+
ParquetTestData,
277281
Properties,
278282
Monad
279283
build-depends: base >= 4 && < 5,

scripts/presubmit.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@ set -e
88
cabal build
99
cabal test
1010

11-
cd ../dataframe-persistent
11+
cd ./dataframe-persistent
1212

1313
cabal build
1414

1515
cd ../dataframe-hasktorch
1616

1717
cabal build
1818

19-
cd examples
19+
cd ../examples
2020

2121
cabal build all

src/DataFrame/IO/CSV.hs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ defaultReadOptions =
220220
ReadOptions
221221
{ headerSpec = UseFirstRow
222222
, typeSpec = InferFromSample 100
223-
, safeRead = True
223+
, safeRead = False
224224
, dateFormat = "%Y-%m-%d"
225225
, columnSeparator = ','
226226
, numColumns = Nothing
@@ -408,11 +408,14 @@ freezeBuilderColumn (BuilderBS _ _) =
408408
"freezeBuilderColumn: BuilderBS must be finalized via finalizeBuilderColumn"
409409

410410
finalizeBuilderColumn :: ReadOptions -> BuilderColumn -> IO Column
411-
finalizeBuilderColumn opts (BuilderBS gv validRef) = do
412-
vec <- freezePagedVector gv
413-
valid <- freezePagedUnboxedVector validRef
414-
return $! inferColumnFromBS opts vec valid
415-
finalizeBuilderColumn _ bc = freezeBuilderColumn bc
411+
finalizeBuilderColumn opts bc = do
412+
col <- case bc of
413+
BuilderBS gv validRef -> do
414+
vec <- freezePagedVector gv
415+
valid <- freezePagedUnboxedVector validRef
416+
return $! inferColumnFromBS opts vec valid
417+
_ -> freezeBuilderColumn bc
418+
return $! if safeRead opts then ensureOptional col else col
416419

417420
inferColumnFromBS ::
418421
ReadOptions -> V.Vector BS.ByteString -> VU.Vector Word8 -> Column

src/DataFrame/IO/Parquet.hs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@ import qualified Data.Text as T
1919
import Data.Text.Encoding
2020
import Data.Time
2121
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
22+
import qualified Data.Vector as V
2223
import DataFrame.Errors (DataFrameException (ColumnsNotFoundException))
2324
import DataFrame.Internal.Binary (littleEndianWord32)
2425
import qualified DataFrame.Internal.Column as DI
25-
import DataFrame.Internal.DataFrame (DataFrame)
26+
import DataFrame.Internal.DataFrame (DataFrame, columns)
2627
import DataFrame.Internal.Expression (Expr, getColumns)
2728
import qualified DataFrame.Operations.Core as DI
2829
import DataFrame.Operations.Merge ()
@@ -75,6 +76,8 @@ data ParquetReadOptions = ParquetReadOptions
7576
-- ^ Optional row filter expression applied before projection.
7677
, rowRange :: Maybe (Int, Int)
7778
-- ^ Optional row slice @(start, end)@ with start-inclusive/end-exclusive semantics.
79+
, safeColumns :: Bool
80+
-- ^ When True, every column is promoted to OptionalColumn after read, regardless of nullability in the schema.
7881
}
7982
deriving (Eq, Show)
8083

@@ -87,6 +90,7 @@ ParquetReadOptions
8790
{ selectedColumns = Nothing
8891
, predicate = Nothing
8992
, rowRange = Nothing
93+
, safeColumns = False
9094
}
9195
@
9296
-}
@@ -96,6 +100,7 @@ defaultParquetReadOptions =
96100
{ selectedColumns = Nothing
97101
, predicate = Nothing
98102
, rowRange = Nothing
103+
, safeColumns = False
99104
}
100105

101106
-- Public API --------------------------------------------------------------
@@ -349,9 +354,15 @@ applyPredicate :: ParquetReadOptions -> DataFrame -> DataFrame
349354
applyPredicate opts df =
350355
maybe df (`DS.filterWhere` df) (predicate opts)
351356

357+
applySafeRead :: ParquetReadOptions -> DataFrame -> DataFrame
358+
applySafeRead opts df
359+
| safeColumns opts = df{columns = V.map DI.ensureOptional (columns df)}
360+
| otherwise = df
361+
352362
applyReadOptions :: ParquetReadOptions -> DataFrame -> DataFrame
353363
applyReadOptions opts =
354-
applyRowRange opts
364+
applySafeRead opts
365+
. applyRowRange opts
355366
. applySelectedColumns opts
356367
. applyPredicate opts
357368

src/DataFrame/Internal/Column.hs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -872,6 +872,12 @@ freezeColumn' nulls (MUnboxedColumn col)
872872
)
873873
{-# INLINE freezeColumn' #-}
874874

875+
-- | Promote a non-nullable column to OptionalColumn. No-op when already optional.
876+
ensureOptional :: Column -> Column
877+
ensureOptional (BoxedColumn col) = OptionalColumn (VB.map Just col)
878+
ensureOptional (UnboxedColumn col) = OptionalColumn (VB.generate (VU.length col) (Just . (col `VU.unsafeIndex`)))
879+
ensureOptional c = c
880+
875881
-- | Fills the end of a column, up to n, with Nothing. Does nothing if column has length greater than n.
876882
expandColumn :: Int -> Column -> Column
877883
expandColumn n (OptionalColumn col) = OptionalColumn $ col <> VB.replicate (n - VG.length col) Nothing
@@ -1350,7 +1356,4 @@ toUnboxedVector column =
13501356
, errorColumnName = Nothing
13511357
}
13521358
)
1353-
{-# SPECIALIZE toUnboxedVector ::
1354-
Column -> Either DataFrameException (VU.Vector Double)
1355-
#-}
13561359
{-# INLINE toUnboxedVector #-}

src/DataFrame/Lazy/IO/CSV.hs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import DataFrame.Internal.Column (
3030
Column (..),
3131
MutableColumn (..),
3232
columnLength,
33+
ensureOptional,
3334
freezeColumn',
3435
writeColumn,
3536
)
@@ -60,7 +61,7 @@ defaultOptions =
6061
ReadOptions
6162
{ hasHeader = True
6263
, inferTypes = True
63-
, safeRead = True
64+
, safeRead = False
6465
, rowRange = Nothing
6566
, seekPos = Nothing
6667
, totalRows = Nothing
@@ -218,7 +219,8 @@ freezeColumn ::
218219
IO Column
219220
freezeColumn mutableCols nulls opts colIndex = do
220221
col <- VM.unsafeRead mutableCols colIndex
221-
freezeColumn' (nulls V.! colIndex) col
222+
frozen <- freezeColumn' (nulls V.! colIndex) col
223+
return $! if safeRead opts then ensureOptional frozen else frozen
222224
{-# INLINE freezeColumn #-}
223225

224226
-- ---------------------------------------------------------------------------

src/DataFrame/Operations/Typing.hs

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import Data.Maybe (fromMaybe)
1515
import qualified Data.Proxy as P
1616
import Data.Time
1717
import Data.Type.Equality (TestEquality (..))
18-
import DataFrame.Internal.Column (Column (..), fromVector)
18+
import DataFrame.Internal.Column (Column (..), ensureOptional, fromVector)
1919
import DataFrame.Internal.DataFrame (DataFrame (..), unsafeGetColumn)
2020
import DataFrame.Internal.Parsing
2121
import DataFrame.Internal.Schema
@@ -78,14 +78,16 @@ parseFromExamples opts cols =
7878
examples = V.map converter (V.take (sampleSize opts) cols)
7979
asMaybeText = V.map converter cols
8080
dfmt = parseDateFormat opts
81+
result =
82+
case makeParsingAssumption dfmt examples of
83+
BoolAssumption -> handleBoolAssumption asMaybeText
84+
IntAssumption -> handleIntAssumption asMaybeText
85+
DoubleAssumption -> handleDoubleAssumption asMaybeText
86+
TextAssumption -> handleTextAssumption asMaybeText
87+
DateAssumption -> handleDateAssumption dfmt asMaybeText
88+
NoAssumption -> handleNoAssumption dfmt asMaybeText
8189
in
82-
case makeParsingAssumption dfmt examples of
83-
BoolAssumption -> handleBoolAssumption asMaybeText
84-
IntAssumption -> handleIntAssumption asMaybeText
85-
DoubleAssumption -> handleDoubleAssumption asMaybeText
86-
TextAssumption -> handleTextAssumption asMaybeText
87-
DateAssumption -> handleDateAssumption dfmt asMaybeText
88-
NoAssumption -> handleNoAssumption dfmt asMaybeText
90+
if parseSafe opts then ensureOptional result else result
8991

9092
handleBoolAssumption :: V.Vector (Maybe T.Text) -> Column
9193
handleBoolAssumption asMaybeText

0 commit comments

Comments
 (0)