Skip to content

Commit 9b7daf5

Browse files
mchav and claude committed
refactor: remove OptionalColumn, fold nullability into BoxedColumn/UnboxedColumn via bit-packed bitmap
Replace the three-constructor Column GADT (BoxedColumn, UnboxedColumn, OptionalColumn) with a two-constructor design where BoxedColumn and UnboxedColumn carry an optional bit-packed validity bitmap (Nothing = no nulls, Just bm = bit i is 1 when row i is valid).

- Bitmap = VU.Vector Word8, ceil(n/8) bytes, same format as Apache Arrow
- fromMaybeVec / toMaybeVec bridge the new storage and the Maybe a interface
- KindOf (Maybe a) now dispatches to RNullableBoxed or RNullableUnboxed
- Arrow FFI simplifies: Arrow validity buffers map directly to/from Bitmap
- CSV and Binary I/O updated; Parquet nullable column assembly fixed
- All 457 tests pass

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent d74f762 commit 9b7daf5

30 files changed

+1037 / -890 lines changed

src/DataFrame/DecisionTree.hs

Lines changed: 16 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -638,13 +638,19 @@ numericCols :: DataFrame -> [NumExpr]
638638
numericCols df = concatMap extract (columnNames df)
639639
where
640640
extract col = case unsafeGetColumn col df of
641-
UnboxedColumn (_ :: VU.Vector b) ->
641+
UnboxedColumn Nothing (_ :: VU.Vector b) ->
642642
case testEquality (typeRep @b) (typeRep @Double) of
643643
Just Refl -> [NDouble (Col col)]
644644
Nothing -> case sIntegral @b of
645645
STrue -> [NDouble (F.toDouble (Col @b col))]
646646
SFalse -> []
647-
OptionalColumn (_ :: V.Vector (Maybe b)) ->
647+
BoxedColumn (Just _) (_ :: V.Vector b) ->
648+
case testEquality (typeRep @b) (typeRep @Double) of
649+
Just Refl -> [NMaybeDouble (Col @(Maybe b) col)]
650+
Nothing -> case sIntegral @b of
651+
STrue -> [NMaybeDouble (F.whenPresent (realToFrac @b @Double) (Col @(Maybe b) col))]
652+
SFalse -> []
653+
UnboxedColumn (Just _) (_ :: VU.Vector b) ->
648654
case testEquality (typeRep @b) (typeRep @Double) of
649655
Just Refl -> [NMaybeDouble (Col @(Maybe b) col)]
650656
Nothing -> case sIntegral @b of
@@ -697,18 +703,18 @@ generateConditionsOld cfg df =
697703
let
698704
genConds :: T.Text -> [Expr Bool]
699705
genConds colName = case unsafeGetColumn colName df of
700-
(BoxedColumn (col :: V.Vector a)) ->
706+
(BoxedColumn Nothing (col :: V.Vector a)) ->
701707
let ps = map (Lit . (`percentileOrd'` col)) [1, 25, 75, 99]
702708
in map (F.lift2 (==) (Col @a colName)) ps
703-
(OptionalColumn (col :: V.Vector (Maybe a))) -> case sFloating @a of
709+
(BoxedColumn (Just _) (col :: V.Vector a)) -> case sFloating @a of
704710
STrue -> [] -- handled by numericCols / numericExprs
705711
SFalse -> case sIntegral @a of
706712
STrue -> [] -- handled by numericCols / numericExprs
707713
SFalse ->
708714
map
709-
(F.lift2 (==) (Col @(Maybe a) colName) . Lit . (`percentileOrd'` col))
715+
(F.lift2 (==) (Col @(Maybe a) colName) . Lit . Just . (`percentileOrd'` col))
710716
[1, 25, 75, 99]
711-
(UnboxedColumn (_ :: VU.Vector a)) -> []
717+
(UnboxedColumn _ (_ :: VU.Vector a)) -> []
712718

713719
columnConds =
714720
concatMap
@@ -724,13 +730,13 @@ generateConditionsOld cfg df =
724730
]
725731
where
726732
colConds (!l, !r) = case (unsafeGetColumn l df, unsafeGetColumn r df) of
727-
(BoxedColumn (col1 :: V.Vector a), BoxedColumn (_ :: V.Vector b)) ->
733+
(BoxedColumn Nothing (col1 :: V.Vector a), BoxedColumn Nothing (_ :: V.Vector b)) ->
728734
case testEquality (typeRep @a) (typeRep @b) of
729735
Nothing -> []
730736
Just Refl -> [F.lift2 (==) (Col @a l) (Col @a r)]
731-
(UnboxedColumn (_ :: VU.Vector a), UnboxedColumn (_ :: VU.Vector b)) -> []
732-
( OptionalColumn (_ :: V.Vector (Maybe a))
733-
, OptionalColumn (_ :: V.Vector (Maybe b))
737+
(UnboxedColumn _ (_ :: VU.Vector a), UnboxedColumn _ (_ :: VU.Vector b)) -> []
738+
( BoxedColumn (Just _) (_ :: V.Vector a)
739+
, BoxedColumn (Just _) (_ :: V.Vector b)
734740
) -> case testEquality (typeRep @a) (typeRep @b) of
735741
Nothing -> []
736742
Just Refl -> case testEquality (typeRep @a) (typeRep @T.Text) of

src/DataFrame/Display/Terminal/Plot.hs

Lines changed: 7 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -339,15 +339,12 @@ getCategoricalCounts colName df =
339339
Just idx ->
340340
let col = columns df V.! idx
341341
in case col of
342-
BoxedColumn vec ->
342+
BoxedColumn _ vec ->
343343
let counts = countValues vec
344344
in Just [(T.pack (show k), fromIntegral v) | (k, v) <- counts]
345-
UnboxedColumn vec ->
345+
UnboxedColumn _ vec ->
346346
let counts = countValuesUnboxed vec
347347
in Just [(T.pack (show k), fromIntegral v) | (k, v) <- counts]
348-
OptionalColumn vec ->
349-
let counts = countValues vec
350-
in Just [(T.pack (show k), fromIntegral v) | (k, v) <- counts]
351348
where
352349
countValues :: (Ord a, Show a) => V.Vector a -> [(a, Int)]
353350
countValues vec = M.toList $ V.foldr' (\x acc -> M.insertWith (+) x 1 acc) M.empty vec
@@ -365,9 +362,9 @@ extractStringColumn colName df =
365362
Just idx ->
366363
let col = columns df V.! idx
367364
in case col of
368-
BoxedColumn vec -> V.toList $ V.map (T.pack . show) vec
369-
UnboxedColumn vec -> V.toList $ VG.map (T.pack . show) (VG.convert vec)
370-
OptionalColumn vec -> V.toList $ V.map (T.pack . show) vec
365+
BoxedColumn _ vec -> V.toList $ V.map (T.pack . show) vec
366+
UnboxedColumn _ vec -> V.toList $ VG.map (T.pack . show) (VG.convert vec)
367+
_ -> []
371368

372369
extractNumericColumn :: (HasCallStack) => T.Text -> DataFrame -> [Double]
373370
extractNumericColumn colName df =
@@ -376,8 +373,8 @@ extractNumericColumn colName df =
376373
Just idx ->
377374
let col = columns df V.! idx
378375
in case col of
379-
BoxedColumn vec -> vectorToDoubles vec
380-
UnboxedColumn vec -> unboxedVectorToDoubles vec
376+
BoxedColumn _ vec -> vectorToDoubles vec
377+
UnboxedColumn _ vec -> unboxedVectorToDoubles vec
381378
_ -> []
382379

383380
vectorToDoubles :: forall a. (Columnable a, Show a) => V.Vector a -> [Double]

src/DataFrame/Display/Web/Plot.hs

Lines changed: 6 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -850,13 +850,10 @@ extractStringColumn colName df =
850850
Just idx ->
851851
let col = columns df V.! idx
852852
in case col of
853-
BoxedColumn (vec :: V.Vector a) -> case testEquality (typeRep @a) (typeRep @T.Text) of
853+
BoxedColumn _ (vec :: V.Vector a) -> case testEquality (typeRep @a) (typeRep @T.Text) of
854854
Just Refl -> V.toList vec
855855
Nothing -> V.toList $ V.map (T.pack . show) vec
856-
UnboxedColumn vec -> V.toList $ VG.map (T.pack . show) (VG.convert vec)
857-
OptionalColumn (vec :: V.Vector (Maybe a)) -> case testEquality (typeRep @a) (typeRep @T.Text) of
858-
Nothing -> V.toList $ V.map (T.pack . show) vec
859-
Just Refl -> V.toList $ V.map (maybe "Nothing" ("Just " <>)) vec
856+
UnboxedColumn _ vec -> V.toList $ VG.map (T.pack . show) (VG.convert vec)
860857

861858
extractNumericColumn :: (HasCallStack) => T.Text -> DataFrame -> [Double]
862859
extractNumericColumn colName df =
@@ -865,8 +862,8 @@ extractNumericColumn colName df =
865862
Just idx ->
866863
let col = columns df V.! idx
867864
in case col of
868-
BoxedColumn vec -> vectorToDoubles vec
869-
UnboxedColumn vec -> unboxedVectorToDoubles vec
865+
BoxedColumn _ vec -> vectorToDoubles vec
866+
UnboxedColumn _ vec -> unboxedVectorToDoubles vec
870867
_ -> []
871868

872869
vectorToDoubles :: forall a. (Columnable a, Show a) => V.Vector a -> [Double]
@@ -898,21 +895,14 @@ getCategoricalCounts colName df =
898895
Just idx ->
899896
let col = columns df V.! idx
900897
in case col of
901-
BoxedColumn (vec :: V.Vector a) ->
898+
BoxedColumn _ (vec :: V.Vector a) ->
902899
let counts = countValues vec
903900
in case testEquality (typeRep @a) (typeRep @T.Text) of
904901
Nothing -> Just [(T.pack (show k), fromIntegral v) | (k, v) <- counts]
905902
Just Refl -> Just [(k, fromIntegral v) | (k, v) <- counts]
906-
UnboxedColumn vec ->
903+
UnboxedColumn _ vec ->
907904
let counts = countValuesUnboxed vec
908905
in Just [(T.pack (show k), fromIntegral v) | (k, v) <- counts]
909-
OptionalColumn (vec :: V.Vector (Maybe a)) ->
910-
let counts = countValues vec
911-
in case testEquality (typeRep @a) (typeRep @T.Text) of
912-
Nothing -> Just [((T.pack . show) k, fromIntegral v) | (k, v) <- counts]
913-
Just Refl ->
914-
Just
915-
[(maybe "Nothing" ("Just " <>) k, fromIntegral v) | (k, v) <- counts]
916906
where
917907
countValues :: (Ord a, Show a) => V.Vector a -> [(a, Int)]
918908
countValues vec = M.toList $ V.foldr' (\x acc -> M.insertWith (+) x 1 acc) M.empty vec

src/DataFrame/IO/CSV.hs

Lines changed: 9 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -389,19 +389,19 @@ freezeBuilderColumn (BuilderInt gv validRef) = do
389389
vec <- freezePagedUnboxedVector gv
390390
valid <- freezePagedUnboxedVector validRef
391391
if VU.all (== 1) valid
392-
then return $! UnboxedColumn vec
392+
then return $! UnboxedColumn Nothing vec
393393
else constructOptional vec valid
394394
freezeBuilderColumn (BuilderDouble gv validRef) = do
395395
vec <- freezePagedUnboxedVector gv
396396
valid <- freezePagedUnboxedVector validRef
397397
if VU.all (== 1) valid
398-
then return $! UnboxedColumn vec
398+
then return $! UnboxedColumn Nothing vec
399399
else constructOptional vec valid
400400
freezeBuilderColumn (BuilderText gv validRef) = do
401401
vec <- freezePagedVector gv
402402
valid <- freezePagedUnboxedVector validRef
403403
if VU.all (== 1) valid
404-
then return $! BoxedColumn vec
404+
then return $! BoxedColumn Nothing vec
405405
else constructOptionalBoxed vec valid
406406
freezeBuilderColumn (BuilderBS _ _) =
407407
error
@@ -526,23 +526,13 @@ handleBSNo dfmt asMaybe
526526
constructOptional ::
527527
(VU.Unbox a, Columnable a) => VU.Vector a -> VU.Vector Word8 -> IO Column
528528
constructOptional vec valid = do
529-
let size = VU.length vec
530-
mvec <- VM.new size
531-
forM_ [0 .. size - 1] $ \i ->
532-
if (valid VU.! i) == 0
533-
then VM.write mvec i Nothing
534-
else VM.write mvec i (Just (vec VU.! i))
535-
OptionalColumn <$> V.freeze mvec
529+
let bm = buildBitmapFromValid valid
530+
pure $ UnboxedColumn (Just bm) vec
536531

537532
constructOptionalBoxed :: V.Vector T.Text -> VU.Vector Word8 -> IO Column
538533
constructOptionalBoxed vec valid = do
539-
let size = V.length vec
540-
mvec <- VM.new size
541-
forM_ [0 .. size - 1] $ \i ->
542-
if (valid VU.! i) == 0
543-
then VM.write mvec i Nothing
544-
else VM.write mvec i (Just (vec V.! i))
545-
OptionalColumn <$> V.freeze mvec
534+
let bm = buildBitmapFromValid valid
535+
pure $ BoxedColumn (Just bm) vec
546536

547537
writeCsv :: FilePath -> DataFrame -> IO ()
548538
writeCsv = writeSeparated ','
@@ -569,7 +559,7 @@ getRowAsText :: DataFrame -> Int -> [T.Text]
569559
getRowAsText df i = V.ifoldr go [] (columns df)
570560
where
571561
indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
572-
go k (BoxedColumn (c :: V.Vector a)) acc = case c V.!? i of
562+
go k (BoxedColumn _ (c :: V.Vector a)) acc = case c V.!? i of
573563
Just e -> textRep : acc
574564
where
575565
textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
@@ -592,7 +582,7 @@ getRowAsText df i = V.ifoldr go [] (columns df)
592582
++ " has less items than "
593583
++ "the other columns at index "
594584
++ show i
595-
go k (UnboxedColumn c) acc = case c VU.!? i of
585+
go k (UnboxedColumn _ c) acc = case c VU.!? i of
596586
Just e -> T.pack (show e) : acc
597587
Nothing ->
598588
error $
@@ -601,17 +591,6 @@ getRowAsText df i = V.ifoldr go [] (columns df)
601591
++ " has less items than "
602592
++ "the other columns at index "
603593
++ show i
604-
go k (OptionalColumn (c :: V.Vector (Maybe a))) acc = case c V.!? i of
605-
Just e -> case testEquality (typeRep @a) (typeRep @T.Text) of
606-
Just Refl -> fromMaybe T.empty e : acc
607-
Nothing -> maybe T.empty (T.pack . show) e : acc
608-
Nothing ->
609-
error $
610-
"Column "
611-
++ T.unpack (indexMap M.! k)
612-
++ " has less items than "
613-
++ "the other columns at index "
614-
++ show i
615594

616595
stripQuotes :: T.Text -> T.Text
617596
stripQuotes txt =

src/DataFrame/IO/Parquet.hs

Lines changed: 7 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -191,9 +191,8 @@ _readParquetWithOpts extraConfig opts path = withFileBufferedOrSeekable extraCon
191191
)
192192
)
193193

194-
let totalRows = sum (map (fromIntegral . rowGroupNumRows) (rowGroups fileMetadata)) :: Int
195-
colMutMap <- newIORef (M.empty :: M.Map T.Text DI.MutableColumn)
196-
colOffMap <- newIORef (M.empty :: M.Map T.Text Int)
194+
-- Collect per-column chunk lists; concatenate at the end to preserve bitmaps.
195+
colListMap <- newIORef (M.empty :: M.Map T.Text [DI.Column])
197196
lTypeMap <- newIORef (M.empty :: M.Map T.Text LogicalType)
198197

199198
let schemaElements = schema fileMetadata
@@ -266,22 +265,13 @@ _readParquetWithOpts extraConfig opts path = withFileBufferedOrSeekable extraCon
266265
maybeTypeLength
267266
lType
268267

269-
mutMapSnap <- readIORef colMutMap
270-
case M.lookup colFullName mutMapSnap of
271-
Nothing -> do
272-
mc <- DI.newMutableColumn totalRows column
273-
DI.copyIntoMutableColumn mc 0 column
274-
modifyIORef' colMutMap (M.insert colFullName mc)
275-
modifyIORef' colOffMap (M.insert colFullName (DI.columnLength column))
276-
Just mc -> do
277-
off <- (M.! colFullName) <$> readIORef colOffMap
278-
DI.copyIntoMutableColumn mc off column
279-
modifyIORef' colOffMap (M.adjust (+ DI.columnLength column) colFullName)
268+
modifyIORef' colListMap (M.insertWith (++) colFullName [column])
280269
modifyIORef' lTypeMap (M.insert colFullName lType)
281270

282-
finalMutMap <- readIORef colMutMap
283-
finalColMap <-
284-
M.traverseWithKey (\_ mc -> DI.freezeMutableColumn mc) finalMutMap
271+
finalListMap <- readIORef colListMap
272+
-- Reverse the accumulated lists (they were prepended) and concat columns per-name,
273+
-- preserving bitmaps correctly via concatManyColumns.
274+
let finalColMap = M.map (DI.concatManyColumns . reverse) finalListMap
285275
finalLTypeMap <- readIORef lTypeMap
286276
let orderedColumns =
287277
map

src/DataFrame/IO/Parquet/Thrift.hs

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -68,9 +68,8 @@ createParquetSchema df = schemaDef : map toSchemaElement (DI.columnNames df)
6868
let
6969
colType :: TType
7070
colType = case unsafeGetColumn colName df of
71-
(DI.BoxedColumn (col :: V.Vector a)) -> haskellToTType @a
72-
(DI.UnboxedColumn (col :: VU.Vector a)) -> haskellToTType @a
73-
(DI.OptionalColumn (col :: V.Vector (Maybe a))) -> haskellToTType @a
71+
(DI.BoxedColumn _ (col :: V.Vector a)) -> haskellToTType @a
72+
(DI.UnboxedColumn _ (col :: VU.Vector a)) -> haskellToTType @a
7473
lType =
7574
if DI.hasElemType @T.Text (unsafeGetColumn colName df)
7675
|| DI.hasElemType @(Maybe T.Text) (unsafeGetColumn colName df)

0 commit comments

Comments
 (0)