feat: export toCsv functionality for dataframe for simple ipc

mchav · mchav · commit d667845c9be7 · 2026-03-29T23:42:02.000-06:00
diff --git a/dataframe.cabal b/dataframe.cabal
@@ -261,6 +261,7 @@ test-suite tests
                    Operations.Nullable,
                    Operations.Provenance,
                    Operations.ReadCsv,
+                   Operations.WriteCsv,
                    Operations.Shuffle,
                    Operations.Sort,
                    Operations.Subset,
diff --git a/src/DataFrame.hs b/src/DataFrame.hs
@@ -281,8 +281,10 @@ import DataFrame.Internal.DataFrame as Dataframe (
     GroupedDataFrame,
     empty,
     null,
+    toCsv,
     toMarkdown,
     toMarkdown',
+    toSeparated,
  )
 import DataFrame.Internal.Expression as Expression (Expr, prettyPrint)
 import DataFrame.Internal.Row as Row (
diff --git a/src/DataFrame/IO/CSV.hs b/src/DataFrame/IO/CSV.hs
@@ -39,7 +39,7 @@ import Data.Maybe
 import Data.Type.Equality (TestEquality (testEquality))
 import Data.Word (Word8)
 import DataFrame.Internal.Column
-import DataFrame.Internal.DataFrame (DataFrame (..))
+import DataFrame.Internal.DataFrame (DataFrame (..), toSeparated)
 import DataFrame.Internal.Parsing
 import DataFrame.Internal.Schema
 import DataFrame.Operations.Typing
@@ -547,50 +547,7 @@ writeSeparated ::
     FilePath ->
     DataFrame ->
     IO ()
-writeSeparated c filepath df = withFile filepath WriteMode $ \handle -> do
-    let (rows, _) = dataframeDimensions df
-    let headers = map fst (L.sortBy (compare `on` snd) (M.toList (columnIndices df)))
-    TIO.hPutStrLn handle (T.intercalate "," headers)
-    forM_ [0 .. (rows - 1)] $ \i -> do
-        let row = getRowAsText df i
-        TIO.hPutStrLn handle (T.intercalate "," row)
-
-getRowAsText :: DataFrame -> Int -> [T.Text]
-getRowAsText df i = V.ifoldr go [] (columns df)
-  where
-    indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
-    go k (BoxedColumn _ (c :: V.Vector a)) acc = case c V.!? i of
-        Just e -> textRep : acc
-          where
-            textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
-                Just Refl -> e
-                Nothing -> case typeRep @a of
-                    App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
-                        Just HRefl -> case testEquality t2 (typeRep @T.Text) of
-                            Just Refl -> fromMaybe "null" e
-                            Nothing -> (fromOptional . T.pack . show) e
-                              where
-                                fromOptional s
-                                    | T.isPrefixOf "Just " s = T.drop (T.length "Just ") s
-                                    | otherwise = "null"
-                        Nothing -> (T.pack . show) e
-                    _ -> (T.pack . show) e
-        Nothing ->
-            error $
-                "Column "
-                    ++ T.unpack (indexMap M.! k)
-                    ++ " has less items than "
-                    ++ "the other columns at index "
-                    ++ show i
-    go k (UnboxedColumn _ c) acc = case c VU.!? i of
-        Just e -> T.pack (show e) : acc
-        Nothing ->
-            error $
-                "Column "
-                    ++ T.unpack (indexMap M.! k)
-                    ++ " has less items than "
-                    ++ "the other columns at index "
-                    ++ show i
+writeSeparated c filepath df = TIO.writeFile filepath (toSeparated c df)
 
 stripQuotes :: T.Text -> T.Text
 stripQuotes txt =
diff --git a/src/DataFrame/Internal/DataFrame.hs b/src/DataFrame/Internal/DataFrame.hs
@@ -4,6 +4,7 @@
 {-# LANGUAGE GADTs #-}
 {-# LANGUAGE InstanceSigs #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE PatternSynonyms #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE TypeApplications #-}
 
@@ -18,13 +19,18 @@ import Control.DeepSeq (NFData (..), rnf)
 import Control.Exception (throw)
 import Data.Function (on)
 import Data.List (sortBy, transpose, (\\))
-import Data.Type.Equality (TestEquality (testEquality), type (:~:) (Refl))
+import Data.Maybe (fromMaybe)
+import Data.Type.Equality (
+    TestEquality (testEquality),
+    type (:~:) (Refl),
+    type (:~~:) (HRefl),
+ )
 import DataFrame.Display.Terminal.PrettyPrint
 import DataFrame.Errors
 import DataFrame.Internal.Column
 import DataFrame.Internal.Expression
 import Text.Printf
-import Type.Reflection (Typeable, typeRep)
+import Type.Reflection (Typeable, eqTypeRep, typeRep, pattern App)
 import Prelude hiding (null)
 
 data DataFrame = DataFrame
@@ -196,3 +202,72 @@ Note that a dataframe with columns but no rows is not considered null.
 -}
 null :: DataFrame -> Bool
 null df = V.null (columns df)
+
+{- | Convert a DataFrame to CSV (comma-separated) text.
+
+Shorthand for 'toSeparated' with a comma as the separator.
+-}
+toCsv :: DataFrame -> T.Text
+toCsv = toSeparated ','
+
+{- | Convert a DataFrame to delimited text using the given separator.
+
+The first line holds the column names ordered by their column index and is
+followed by one line per row; every line ends with a newline. A DataFrame for
+which 'null' holds yields empty text. Fields are joined as-is: this function
+applies no quoting or escaping, so a value that contains the separator or a
+line break is not protected.
+-}
+toSeparated :: Char -> DataFrame -> T.Text
+toSeparated sep df
+    | null df = T.empty
+    | otherwise =
+        let (rows, _) = dataframeDimensions df
+            headers = map fst (sortBy (compare `on` snd) (M.toList (columnIndices df)))
+            sepText = T.singleton sep
+            headerLine = T.intercalate sepText headers
+            dataLines = map (T.intercalate sepText . getRowAsText df) [0 .. rows - 1]
+         in T.unlines (headerLine : dataLines)
+
+-- | Text form of every field of row @i@, one entry per element of 'columns'.
+getRowAsText :: DataFrame -> Int -> [T.Text]
+getRowAsText df i = map (`showElement` i) (V.toList (columns df))
+
+{- | Text form of the element at row @i@ of a single column.
+
+'T.Text' values are emitted unchanged, an absent optional value becomes
+@"null"@, and every other value is rendered with 'show'. Calls 'error' when
+the column has no element at row @i@.
+-}
+showElement :: Column -> Int -> T.Text
+showElement (BoxedColumn _ (c :: V.Vector a)) i = case c V.!? i of
+    Nothing -> error $ "Column index out of bounds at row " ++ show i
+    Just e
+        | Just Refl <- testEquality (typeRep @a) (typeRep @T.Text) -> e
+        | App t1 t2 <- typeRep @a
+        , Just HRefl <- eqTypeRep t1 (typeRep @Maybe) ->
+            case testEquality t2 (typeRep @T.Text) of
+                Just Refl -> fromMaybe "null" e
+                -- The optional value is shown as a whole and its payload is
+                -- recovered from the resulting text.
+                Nothing -> stripJust (T.pack (show e))
+        | otherwise -> T.pack (show e)
+showElement (UnboxedColumn _ c) i = case c VU.!? i of
+    Nothing -> error $ "Column index out of bounds at row " ++ show i
+    Just e -> T.pack (show e)
+
+{- | Recover the payload from the 'show' output of a 'Maybe' value.
+
+Text without the @"Just "@ prefix (that is, @"Nothing"@) becomes @"null"@.
+'show' wraps a negative number in parentheses (@Just (-1)@); they are removed
+so the field reads @-1@ instead of @(-1)@. Any other parenthesised payload,
+such as a tuple, is returned unchanged.
+-}
+stripJust :: T.Text -> T.Text
+stripJust shown = maybe "null" unwrapNegative (T.stripPrefix "Just " shown)
+  where
+    unwrapNegative payload =
+        case T.stripPrefix "(-" payload >>= T.stripSuffix ")" of
+            Just magnitude
+                | T.all (`notElem` ("(), " :: String)) magnitude -> T.cons '-' magnitude
+            _ -> payload
diff --git a/tests/Main.hs b/tests/Main.hs
@@ -33,6 +33,7 @@ import qualified Operations.Statistics
 import qualified Operations.Subset
 import qualified Operations.Take
 import qualified Operations.Typing
+import qualified Operations.WriteCsv
 import qualified Parquet
 import qualified Properties
 
@@ -53,6 +54,7 @@ tests =
             ++ Operations.Nullable.tests
             ++ Operations.Provenance.tests
             ++ Operations.ReadCsv.tests
+            ++ Operations.WriteCsv.tests
             ++ Operations.Shuffle.tests
             ++ Operations.Sort.tests
             ++ Operations.Statistics.tests
diff --git a/tests/Operations/WriteCsv.hs b/tests/Operations/WriteCsv.hs
@@ -0,0 +1,83 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TypeApplications #-}
+
+module Operations.WriteCsv where
+
+import qualified Data.Text as T
+import qualified Data.Text.IO as TIO
+import qualified DataFrame as D
+import qualified DataFrame.Internal.Column as DI
+import DataFrame.Internal.DataFrame (DataFrame (..), toCsv, toSeparated)
+import Test.HUnit
+
+-- A Text column and an Int column render as a header line plus one line per row.
+toCsvBasic :: Test
+toCsvBasic =
+    TestLabel "toCsv_basic" . TestCase $
+        assertEqual "basic toCsv" "name,age\nAlice,30\nBob,25\nCharlie,35\n" (toCsv people)
+  where
+    people = D.fromNamedColumns
+        [ ("name", DI.fromList @T.Text ["Alice", "Bob", "Charlie"])
+        , ("age", DI.fromList @Int [30, 25, 35])
+        ]
+
+-- The empty DataFrame renders as empty text.
+toCsvEmpty :: Test
+toCsvEmpty =
+    TestLabel "toCsv_empty" (TestCase (assertEqual "empty toCsv" T.empty (toCsv D.empty)))
+
+-- With a tab as separator, the header and every row are tab-delimited.
+toSeparatedTab :: Test
+toSeparatedTab =
+    TestLabel "toSeparated_tab" . TestCase $
+        assertEqual "tab separated" "x\ty\n1\t3\n2\t4\n" (toSeparated '\t' grid)
+  where
+    grid = D.fromNamedColumns
+        [ ("x", DI.fromList @Int [1, 2])
+        , ("y", DI.fromList @Int [3, 4])
+        ]
+
+-- Double values keep their decimal point (2.0 is written as "2.0").
+toCsvDouble :: Test
+toCsvDouble =
+    TestLabel "toCsv_double" . TestCase $
+        assertEqual "double toCsv" "value\n1.5\n2.0\n2.5\n" (toCsv readings)
+  where
+    readings =
+        D.fromNamedColumns
+            [("value", DI.fromList @Double [1.5, 2.0, 2.5])]
+
+-- A one-column DataFrame renders without any separator characters.
+toCsvSingleColumn :: Test
+toCsvSingleColumn =
+    TestLabel "toCsv_single_column" . TestCase $
+        assertEqual "single column toCsv" "id\n10\n20\n30\n" (toCsv ids)
+  where
+    ids =
+        D.fromNamedColumns
+            [("id", DI.fromList @Int [10, 20, 30])]
+
+-- Writing toCsv output to a file and reading it back with readCsv gives an equal DataFrame.
+toCsvRoundTrip :: Test
+toCsvRoundTrip = TestLabel "toCsv_roundTrip" . TestCase $ do
+    TIO.writeFile scratchFile (toCsv source)
+    reloaded <- D.readCsv scratchFile
+    assertEqual "round trip dimensions" (dataframeDimensions source) (dataframeDimensions reloaded)
+    assertEqual "round trip data" source reloaded
+  where
+    scratchFile = "/tmp/dataframe_test_toCsv_roundtrip.csv"
+    source =
+        D.fromNamedColumns
+            [ ("a", DI.fromList @Int [1, 2, 3])
+            , ("b", DI.fromList @T.Text ["hello", "world", "test"])
+            ]
+
+tests :: [Test] -- every toCsv / toSeparated case of this module; appended to the suite in tests/Main.hs
+tests =
+    [ toCsvBasic
+    , toCsvEmpty
+    , toSeparatedTab
+    , toCsvDouble
+    , toCsvSingleColumn
+    , toCsvRoundTrip
+    ]