Skip to content

Commit d667845

Browse files
committed
feat: export toCsv functionality for dataframe for simple ipc
1 parent c4c650b commit d667845

File tree

6 files changed

+135
-47
lines changed

6 files changed

+135
-47
lines changed

dataframe.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@ test-suite tests
261261
Operations.Nullable,
262262
Operations.Provenance,
263263
Operations.ReadCsv,
264+
Operations.WriteCsv,
264265
Operations.Shuffle,
265266
Operations.Sort,
266267
Operations.Subset,

src/DataFrame.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,8 +281,10 @@ import DataFrame.Internal.DataFrame as Dataframe (
281281
GroupedDataFrame,
282282
empty,
283283
null,
284+
toCsv,
284285
toMarkdown,
285286
toMarkdown',
287+
toSeparated,
286288
)
287289
import DataFrame.Internal.Expression as Expression (Expr, prettyPrint)
288290
import DataFrame.Internal.Row as Row (

src/DataFrame/IO/CSV.hs

Lines changed: 2 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ import Data.Maybe
3939
import Data.Type.Equality (TestEquality (testEquality))
4040
import Data.Word (Word8)
4141
import DataFrame.Internal.Column
42-
import DataFrame.Internal.DataFrame (DataFrame (..))
42+
import DataFrame.Internal.DataFrame (DataFrame (..), toSeparated)
4343
import DataFrame.Internal.Parsing
4444
import DataFrame.Internal.Schema
4545
import DataFrame.Operations.Typing
@@ -547,50 +547,7 @@ writeSeparated ::
547547
FilePath ->
548548
DataFrame ->
549549
IO ()
550-
writeSeparated c filepath df = withFile filepath WriteMode $ \handle -> do
551-
let (rows, _) = dataframeDimensions df
552-
let headers = map fst (L.sortBy (compare `on` snd) (M.toList (columnIndices df)))
553-
TIO.hPutStrLn handle (T.intercalate "," headers)
554-
forM_ [0 .. (rows - 1)] $ \i -> do
555-
let row = getRowAsText df i
556-
TIO.hPutStrLn handle (T.intercalate "," row)
557-
558-
getRowAsText :: DataFrame -> Int -> [T.Text]
559-
getRowAsText df i = V.ifoldr go [] (columns df)
560-
where
561-
indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
562-
go k (BoxedColumn _ (c :: V.Vector a)) acc = case c V.!? i of
563-
Just e -> textRep : acc
564-
where
565-
textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
566-
Just Refl -> e
567-
Nothing -> case typeRep @a of
568-
App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
569-
Just HRefl -> case testEquality t2 (typeRep @T.Text) of
570-
Just Refl -> fromMaybe "null" e
571-
Nothing -> (fromOptional . T.pack . show) e
572-
where
573-
fromOptional s
574-
| T.isPrefixOf "Just " s = T.drop (T.length "Just ") s
575-
| otherwise = "null"
576-
Nothing -> (T.pack . show) e
577-
_ -> (T.pack . show) e
578-
Nothing ->
579-
error $
580-
"Column "
581-
++ T.unpack (indexMap M.! k)
582-
++ " has less items than "
583-
++ "the other columns at index "
584-
++ show i
585-
go k (UnboxedColumn _ c) acc = case c VU.!? i of
586-
Just e -> T.pack (show e) : acc
587-
Nothing ->
588-
error $
589-
"Column "
590-
++ T.unpack (indexMap M.! k)
591-
++ " has less items than "
592-
++ "the other columns at index "
593-
++ show i
550+
writeSeparated c filepath df = TIO.writeFile filepath (toSeparated c df)
594551

595552
stripQuotes :: T.Text -> T.Text
596553
stripQuotes txt =

src/DataFrame/Internal/DataFrame.hs

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
{-# LANGUAGE GADTs #-}
55
{-# LANGUAGE InstanceSigs #-}
66
{-# LANGUAGE OverloadedStrings #-}
7+
{-# LANGUAGE PatternSynonyms #-}
78
{-# LANGUAGE ScopedTypeVariables #-}
89
{-# LANGUAGE TypeApplications #-}
910

@@ -18,13 +19,18 @@ import Control.DeepSeq (NFData (..), rnf)
1819
import Control.Exception (throw)
1920
import Data.Function (on)
2021
import Data.List (sortBy, transpose, (\\))
21-
import Data.Type.Equality (TestEquality (testEquality), type (:~:) (Refl))
22+
import Data.Maybe (fromMaybe)
23+
import Data.Type.Equality (
24+
TestEquality (testEquality),
25+
type (:~:) (Refl),
26+
type (:~~:) (HRefl),
27+
)
2228
import DataFrame.Display.Terminal.PrettyPrint
2329
import DataFrame.Errors
2430
import DataFrame.Internal.Column
2531
import DataFrame.Internal.Expression
2632
import Text.Printf
27-
import Type.Reflection (Typeable, typeRep)
33+
import Type.Reflection (Typeable, eqTypeRep, typeRep, pattern App)
2834
import Prelude hiding (null)
2935

3036
data DataFrame = DataFrame
@@ -196,3 +202,40 @@ Note that a dataframe with columns but no rows is not considered null.
196202
-}
197203
null :: DataFrame -> Bool
198204
null df = V.null (columns df)
205+
206+
-- | Convert a DataFrame to a CSV (comma-separated) text.
207+
toCsv :: DataFrame -> T.Text
208+
toCsv = toSeparated ','
209+
210+
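-- | Render a DataFrame as separator-delimited text using the given separator character: a header line followed by one line per row. A dataframe with no columns yields empty text.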
211+
toSeparated :: Char -> DataFrame -> T.Text
212+
toSeparated sep df
213+
| null df = T.empty
214+
| otherwise =
215+
let (rows, _) = dataframeDimensions df
216+
headers = map fst (sortBy (compare `on` snd) (M.toList (columnIndices df)))
217+
sepText = T.singleton sep
218+
headerLine = T.intercalate sepText headers
219+
dataLines = map (T.intercalate sepText . getRowAsText df) [0 .. rows - 1]
220+
in T.unlines (headerLine : dataLines)
221+
222+
getRowAsText :: DataFrame -> Int -> [T.Text]
223+
getRowAsText df i = map (`showElement` i) (V.toList (columns df))
224+
225+
showElement :: Column -> Int -> T.Text
226+
showElement (BoxedColumn _ (c :: V.Vector a)) i = case c V.!? i of
227+
Nothing -> error $ "Column index out of bounds at row " ++ show i
228+
Just e
229+
| Just Refl <- testEquality (typeRep @a) (typeRep @T.Text) -> e
230+
| App t1 t2 <- typeRep @a
231+
, Just HRefl <- eqTypeRep t1 (typeRep @Maybe) ->
232+
case testEquality t2 (typeRep @T.Text) of
233+
Just Refl -> fromMaybe "null" e
234+
Nothing -> stripJust (T.pack (show e))
235+
| otherwise -> T.pack (show e)
236+
showElement (UnboxedColumn _ c) i = case c VU.!? i of
237+
Nothing -> error $ "Column index out of bounds at row " ++ show i
238+
Just e -> T.pack (show e)
239+
240+
stripJust :: T.Text -> T.Text
241+
stripJust = fromMaybe "null" . T.stripPrefix "Just "

tests/Main.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import qualified Operations.Statistics
3333
import qualified Operations.Subset
3434
import qualified Operations.Take
3535
import qualified Operations.Typing
36+
import qualified Operations.WriteCsv
3637
import qualified Parquet
3738
import qualified Properties
3839

@@ -53,6 +54,7 @@ tests =
5354
++ Operations.Nullable.tests
5455
++ Operations.Provenance.tests
5556
++ Operations.ReadCsv.tests
57+
++ Operations.WriteCsv.tests
5658
++ Operations.Shuffle.tests
5759
++ Operations.Sort.tests
5860
++ Operations.Statistics.tests

tests/Operations/WriteCsv.hs

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
{-# LANGUAGE OverloadedStrings #-}
2+
{-# LANGUAGE TypeApplications #-}
3+
4+
module Operations.WriteCsv where
5+
6+
import qualified Data.Text as T
7+
import qualified Data.Text.IO as TIO
8+
import qualified DataFrame as D
9+
import qualified DataFrame.Internal.Column as DI
10+
import DataFrame.Internal.DataFrame (DataFrame (..), toCsv, toSeparated)
11+
import Test.HUnit
12+
13+
-- Basic test: Int and Text columns produce correct CSV
14+
toCsvBasic :: Test
15+
toCsvBasic = TestLabel "toCsv_basic" $ TestCase $ do
16+
let df =
17+
D.fromNamedColumns
18+
[ ("name", DI.fromList @T.Text ["Alice", "Bob", "Charlie"])
19+
, ("age", DI.fromList @Int [30, 25, 35])
20+
]
21+
expected = "name,age\nAlice,30\nBob,25\nCharlie,35\n"
22+
assertEqual "basic toCsv" expected (toCsv df)
23+
24+
-- Empty DataFrame produces empty text
25+
toCsvEmpty :: Test
26+
toCsvEmpty = TestLabel "toCsv_empty" $ TestCase $
27+
assertEqual "empty toCsv" T.empty (toCsv D.empty)
28+
29+
-- toSeparated with tab produces tab-delimited output
30+
toSeparatedTab :: Test
31+
toSeparatedTab = TestLabel "toSeparated_tab" $ TestCase $ do
32+
let df =
33+
D.fromNamedColumns
34+
[ ("x", DI.fromList @Int [1, 2])
35+
, ("y", DI.fromList @Int [3, 4])
36+
]
37+
expected = "x\ty\n1\t3\n2\t4\n"
38+
assertEqual "tab separated" expected (toSeparated '\t' df)
39+
40+
-- Double values render correctly
41+
toCsvDouble :: Test
42+
toCsvDouble = TestLabel "toCsv_double" $ TestCase $ do
43+
let df =
44+
D.fromNamedColumns
45+
[ ("value", DI.fromList @Double [1.5, 2.0, 2.5])
46+
]
47+
expected = "value\n1.5\n2.0\n2.5\n"
48+
assertEqual "double toCsv" expected (toCsv df)
49+
50+
-- Single column DataFrame
51+
toCsvSingleColumn :: Test
52+
toCsvSingleColumn = TestLabel "toCsv_single_column" $ TestCase $ do
53+
let df =
54+
D.fromNamedColumns
55+
[ ("id", DI.fromList @Int [10, 20, 30])
56+
]
57+
expected = "id\n10\n20\n30\n"
58+
assertEqual "single column toCsv" expected (toCsv df)
59+
60+
-- Round trip: toCsv then readCsv preserves data
61+
toCsvRoundTrip :: Test
62+
toCsvRoundTrip = TestLabel "toCsv_roundTrip" $ TestCase $ do
63+
let df =
64+
D.fromNamedColumns
65+
[ ("a", DI.fromList @Int [1, 2, 3])
66+
, ("b", DI.fromList @T.Text ["hello", "world", "test"])
67+
]
68+
let csvText = toCsv df
69+
let tmpPath = "/tmp/dataframe_test_toCsv_roundtrip.csv"
70+
TIO.writeFile tmpPath csvText
71+
df' <- D.readCsv tmpPath
72+
assertEqual "round trip dimensions" (dataframeDimensions df) (dataframeDimensions df')
73+
assertEqual "round trip data" df df'
74+
75+
tests :: [Test]
76+
tests =
77+
[ toCsvBasic
78+
, toCsvEmpty
79+
, toSeparatedTab
80+
, toCsvDouble
81+
, toCsvSingleColumn
82+
, toCsvRoundTrip
83+
]

0 commit comments

Comments
 (0)