Initial broken but somewhat working version of scan API.

mchav · mchav · commit 8fc3708bd9e2 · 2025-06-22T00:02:02.000-07:00
diff --git a/dataframe.cabal b/dataframe.cabal
@@ -33,6 +33,7 @@ library
                    DataFrame.Internal.Row,
                    DataFrame.Errors,
                    DataFrame.Operations.Core,
+                   DataFrame.Operations.Merge,
                    DataFrame.Operations.Subset,
                    DataFrame.Operations.Sorting,
                    DataFrame.Operations.Statistics,
@@ -41,7 +42,7 @@ library
                    DataFrame.Operations.Aggregation,
                    DataFrame.Display.Terminal.Plot,
                    DataFrame.IO.CSV,
-                   DataFrame.Lazy.Internal.Column
+                   DataFrame.Lazy.Internal.DataFrame
     build-depends:    base >= 4.17.2.0 && < 4.22,
                       array ^>= 0.5,
                       attoparsec >= 0.12 && <= 0.14.4,
diff --git a/src/DataFrame/IO.hs b/src/DataFrame/IO.hs
@@ -0,0 +1,3 @@
+module DataFrame.IO where
+
+data InputTypes = CSV deriving Show
diff --git a/src/DataFrame/IO/CSV.hs b/src/DataFrame/IO/CSV.hs
@@ -23,7 +23,7 @@ import qualified Data.Vector.Mutable as VM
 import qualified Data.Vector.Unboxed.Mutable as VUM
 
 import Control.Applicative ((<$>), (<|>), (<*>), (<*), (*>), many)
-import Control.Monad (forM_, zipWithM_, unless, void)
+import Control.Monad (forM_, zipWithM_, unless, void, replicateM_)
 import Data.Attoparsec.Text
 import Data.Char
 import DataFrame.Internal.Column (Column(..), freezeColumn', writeColumn, columnLength)
@@ -48,13 +48,15 @@ import Type.Reflection
 data ReadOptions = ReadOptions {
     hasHeader :: Bool,
     inferTypes :: Bool,
-    safeRead :: Bool
+    safeRead :: Bool,
+    -- Optional window of data rows to read; 'Nothing' reads every row in the file.
+    rowRange :: Maybe (Int, Int),  -- (start, length)
+    -- NOTE(review): not read in any of the visible hunks; presumably a file offset
+    -- to seek to before reading -- confirm once the seek-based reader exists.
+    seekPos :: Maybe Integer
 }
 
 -- | By default we assume the file has a header, we infer the types on read
 -- and we convert any rows with nullish objects into Maybe (safeRead).
+-- No row range and no seek position are set by default.
 defaultOptions :: ReadOptions
-defaultOptions = ReadOptions { hasHeader = True, inferTypes = True, safeRead = True }
+defaultOptions = ReadOptions { hasHeader = True, inferTypes = True, safeRead = True, rowRange = Nothing, seekPos = Nothing }
 
 -- | Reads a CSV file from the given path.
 -- Note this file stores intermediate temporary files
@@ -71,7 +73,9 @@ readTsv = readSeparated '\t' defaultOptions
 -- | Reads a character separated file into a dataframe using mutable vectors.
 readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
 readSeparated c opts path = do
-    totalRows <- countRows c path
+    (begin, len) <- case rowRange opts of
+            Nothing           -> countRows c path >>= \totalRows -> return (0, if hasHeader opts then totalRows - 1 else totalRows)
+            Just (start, len) -> return (start, len)
     withFile path ReadMode $ \handle -> do
         firstRow <- map T.strip . parseSep c <$> TIO.hGetLine handle
         let columnNames = if hasHeader opts
@@ -80,9 +84,12 @@ readSeparated c opts path = do
         -- If there was no header rewind the file cursor.
         unless (hasHeader opts) $ hSeek handle AbsoluteSeek 0
 
+        -- Skip the first `begin` rows so reading starts at the requested row.
+        _ <- replicateM_ begin (TIO.hGetLine handle >> return () )
+
         -- Initialize mutable vectors for each column
         let numColumns = length columnNames
-        let numRows = if hasHeader opts then totalRows - 1 else totalRows
+        let numRows = len 
         -- Use this row to infer the types of the rest of the column.
         -- TODO: this isn't robust but in so far as this is a guess anyway
         -- it's probably fine. But we should probably sample n rows and pick
@@ -102,6 +109,7 @@ readSeparated c opts path = do
         -- Freeze the mutable vectors into immutable ones
         nulls' <- V.unsafeFreeze nullIndices
         cols <- V.mapM (freezeColumn mutableCols nulls' opts) (V.generate numColumns id)
+
         return $ DataFrame {
                 columns = cols,
                 freeIndices = [],
@@ -134,7 +142,7 @@ inferValueType s = let
 fillColumns :: Int -> Char -> VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Handle -> IO ()
 fillColumns n c mutableCols nullIndices handle = do
     input <- newIORef (mempty :: T.Text)
-    forM_ [1..n] $ \i -> do
+    forM_ [1..(n - 1)] $ \i -> do
         isEOF <- hIsEOF handle
         input' <- readIORef input
         unless (isEOF && input' == mempty) $ do
diff --git a/src/DataFrame/Internal/Column.hs b/src/DataFrame/Internal/Column.hs
@@ -521,6 +521,31 @@ expandColumn n (UnboxedColumn col) = OptionalColumn $ VB.map Just (VU.convert co
 expandColumn n (GroupedBoxedColumn col) = GroupedBoxedColumn $ col <> VB.replicate n VB.empty
 expandColumn n (GroupedUnboxedColumn col) = GroupedUnboxedColumn $ col <> VB.replicate n VU.empty
 
+-- | Pad a column on the left with @n@ empty slots.
+--
+-- Optional columns gain @n@ leading 'Nothing' values. Boxed and unboxed
+-- columns have every value wrapped in 'Just' and come back as an
+-- 'OptionalColumn' with @n@ leading 'Nothing' values. Grouped columns keep
+-- their constructor and gain @n@ leading empty vectors.
+leftExpandColumn :: Int -> Column -> Column
+leftExpandColumn n column = case column of
+  OptionalColumn values       -> OptionalColumn (VB.replicate n Nothing <> values)
+  BoxedColumn values          -> OptionalColumn (VB.replicate n Nothing <> VB.map Just values)
+  UnboxedColumn values        -> OptionalColumn (VB.replicate n Nothing <> VB.map Just (VU.convert values))
+  GroupedBoxedColumn groups   -> GroupedBoxedColumn (VB.replicate n VB.empty <> groups)
+  GroupedUnboxedColumn groups -> GroupedUnboxedColumn (VB.replicate n VU.empty <> groups)
+
+-- | Append the second column to the first.
+--
+-- Returns 'Nothing' when the two columns use different constructors or when
+-- their underlying vector types differ; otherwise the result keeps the shared
+-- constructor and holds the left values followed by the right values.
+concatColumns :: Column -> Column -> Maybe Column
+concatColumns (OptionalColumn xs) (OptionalColumn ys) = do
+  -- The 'Refl' witness is what lets the two vectors be appended.
+  Refl <- testEquality (typeOf xs) (typeOf ys)
+  pure (OptionalColumn (xs <> ys))
+concatColumns (BoxedColumn xs) (BoxedColumn ys) = do
+  Refl <- testEquality (typeOf xs) (typeOf ys)
+  pure (BoxedColumn (xs <> ys))
+concatColumns (UnboxedColumn xs) (UnboxedColumn ys) = do
+  Refl <- testEquality (typeOf xs) (typeOf ys)
+  pure (UnboxedColumn (xs <> ys))
+concatColumns (GroupedBoxedColumn xs) (GroupedBoxedColumn ys) = do
+  Refl <- testEquality (typeOf xs) (typeOf ys)
+  pure (GroupedBoxedColumn (xs <> ys))
+concatColumns (GroupedUnboxedColumn xs) (GroupedUnboxedColumn ys) = do
+  Refl <- testEquality (typeOf xs) (typeOf ys)
+  pure (GroupedUnboxedColumn (xs <> ys))
+concatColumns _ _ = Nothing
+
 toVector :: forall a . Columnable a => Column -> VB.Vector a
 toVector column@(OptionalColumn (col :: VB.Vector b)) =
   case testEquality (typeRep @a) (typeRep @b) of
diff --git a/src/DataFrame/Internal/Expression.hs b/src/DataFrame/Internal/Expression.hs
@@ -32,7 +32,7 @@ data Expr a where
     Apply :: (Columnable a, Columnable b) => T.Text -> (b -> a) -> Expr b -> Expr a
     BinOp :: (Columnable c, Columnable b, Columnable a) => T.Text -> (c -> b -> a) -> Expr c -> Expr b -> Expr a
 
-interpret :: forall a b . (Columnable a) => DataFrame -> Expr a -> TypedColumn a
+interpret :: forall a . (Columnable a) => DataFrame -> Expr a -> TypedColumn a
 interpret df (Lit value) = TColumn $ toColumn' $ V.replicate (fst $ dataframeDimensions df) value
 interpret df (Col name) = case getColumn name df of
     Nothing -> throw $ ColumnNotFoundException name "" (map fst $ M.toList $ columnIndices df)
diff --git a/src/DataFrame/Lazy/Internal/Column.hs b/src/DataFrame/Lazy/Internal/Column.hs
diff --git a/src/DataFrame/Lazy/Internal/DataFrame.hs b/src/DataFrame/Lazy/Internal/DataFrame.hs
@@ -1,19 +1,81 @@
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE InstanceSigs #-}
+{-# LANGUAGE ExistentialQuantification #-}
+{-# LANGUAGE AllowAmbiguousTypes #-}
+{-# LANGUAGE NumericUnderscores #-}
 module DataFrame.Lazy.Internal.DataFrame where
 
+import           Control.Monad (forM_)
 import           Data.IORef
+import           Data.Kind
 import qualified Data.Map as M
 import qualified Data.Text as T
 import qualified Data.Vector as V
-import qualified DataFrame.Lazy.Internal.Column as C
+import qualified DataFrame.Internal.DataFrame as D
+import qualified DataFrame.Internal.Column as C
+import qualified DataFrame.Internal.Expression as E
+import qualified DataFrame.Operations.Core as D
+import qualified DataFrame.Operations.Subset as D
+import qualified DataFrame.Operations.Transformations as D
+import qualified DataFrame.IO.CSV as D
 import           System.FilePath
 
-data DataFrame = DataFrame
-  { columns           :: V.Vector (Maybe C.Column)
-  , columnIndices     :: !(M.Map T.Text Int)
-  , freeIndices       :: !(IORef [Int])
-  , dataframeDims     :: !(IORef (Int, Int))   -- (rows , cols)
-  , memBudgetBytes    :: !Int                  -- e.g. 512 * 1024 * 1024
-  , liveMemBytes      :: !(IORef Int)          -- updated atomically
-  , chunkRowTarget    :: !Int                  -- e.g. 100_000
-  , spillDir          :: !FilePath
-  }
+-- | A deferred dataframe operation recorded on a 'LazyDataFrame' and applied
+-- later by 'eval'.
+data LazyOperation where
+  -- | Evaluated with @D.derive@ using the given column name and expression.
+  Derive :: C.Columnable a => T.Text -> E.Expr a -> LazyOperation
+  -- | Evaluated with @D.select@ on the given column names.
+  Select :: [T.Text] -> LazyOperation
+  -- | Evaluated with @D.filterWhere@ on the given boolean expression.
+  Filter :: E.Expr Bool -> LazyOperation
+
+-- | Human-readable rendering of a recorded operation, e.g. @name := expr@,
+-- @select([...])@ or @filter(expr)@.
+instance Show LazyOperation where
+  show :: LazyOperation -> String
+  show op = case op of
+    Derive name expr -> concat [T.unpack name, " := ", show expr]
+    Select names     -> concat ["select(", show names, ")"]
+    Filter cond      -> concat ["filter(", show cond, ")"]
+
+-- | Kind of input a 'LazyDataFrame' scans; only CSV ('ICSV') exists so far.
+data InputType = ICSV deriving Show
+
+-- | Description of a scan: where the data lives plus the operations to apply
+-- to each batch once it has been read.
+data LazyDataFrame = LazyDataFrame
+  { inputPath        :: FilePath
+    -- ^ Path of the file to scan.
+  , inputType        :: InputType
+    -- ^ Format of the input. NOTE(review): 'runDataFrame' does not consult
+    -- this field; it always reads comma-separated input.
+  , operations          :: [LazyOperation]
+    -- ^ Recorded operations, applied to each batch from first to last.
+  , batchSize        :: Int
+    -- ^ Number of rows requested per batch read.
+  } deriving Show
+
+-- | Apply one recorded operation to an in-memory dataframe by dispatching to
+-- the matching eager operation.
+eval :: LazyOperation -> D.DataFrame -> D.DataFrame
+eval op = case op of
+  Derive name expr -> D.derive name expr
+  Select names     -> D.select names
+  Filter cond      -> D.filterWhere cond
+
+-- | Run a lazy scan batch by batch.
+--
+-- Each batch is read from the input path as comma-separated data, every
+-- recorded operation is applied in order, and the result is printed unless it
+-- has no rows. The returned dataframe is always @D.empty@.
+--
+-- NOTE(review): batches are generated up to a hard-coded 1000000 rows rather
+-- than the file's real row count (see the commented-out @countRows@ call in
+-- the history) -- confirm how reads past the end of the file behave.
+-- NOTE(review): the @Columnable a@ constraint does not occur in the rest of
+-- the signature, so @a@ is ambiguous at call sites -- confirm it is intended.
+runDataFrame :: forall a . (C.Columnable a) => LazyDataFrame -> IO D.DataFrame
+runDataFrame df = do
+  mapM_ processBatch (batchRanges 1000000 rowsPerBatch)
+  pure D.empty
+  where
+    rowsPerBatch = batchSize df
+    -- Only the start of each range is used; the read length is the batch size.
+    -- TODO: implement specific read operations for batching that returns a seek instead of re-reading everything.
+    processBatch (start, _) = do
+      chunk <- D.readSeparated ',' (D.defaultOptions { D.rowRange = Just (start, rowsPerBatch) }) (inputPath df)
+      let result = foldl' (\acc op -> eval op acc) chunk (operations df)
+      if fst (D.dimensions result) == 0 then return () else print result
+
+-- | Split the half-open interval @[0, n)@ into consecutive @(start, end)@
+-- ranges of width @inc@; the last range is clipped to end at @n@.
+--
+-- Returns @[]@ when @n <= 0@ or @inc <= 0@: there is nothing to cover, or a
+-- non-positive step could never make progress. No empty @(n, n)@ range is
+-- produced when @n@ is an exact multiple of @inc@.
+batchRanges :: Int -> Int -> [(Int, Int)]
+batchRanges n inc
+  | n <= 0 || inc <= 0 = []
+  | otherwise          = [(start, min n (start + inc)) | start <- [0, inc .. n - 1]]
+
+-- | Start a lazy scan over the CSV file at the given path, with no recorded
+-- operations and a batch size of 1024 rows.
+scanCsv :: T.Text -> LazyDataFrame
+scanCsv path = LazyDataFrame
+  { inputPath  = T.unpack path
+  , inputType  = ICSV
+  , operations = []
+  , batchSize  = 1024
+  }
+
+-- | Record an operation at the end of the scan's operation list, so it runs
+-- after everything recorded before it.
+addOperation :: LazyOperation -> LazyDataFrame -> LazyDataFrame
+addOperation op df = df { operations = recorded <> [op] }
+  where
+    recorded = operations df
+
+-- | Record a 'Derive' operation for the given column name and expression.
+derive :: C.Columnable a => T.Text -> E.Expr a -> LazyDataFrame -> LazyDataFrame
+derive name = addOperation . Derive name
+
+-- | Record a 'Select' operation for the given column names.
+--
+-- NOTE(review): the @Columnable a@ constraint does not occur in the rest of
+-- the signature, so @a@ is ambiguous at call sites -- confirm it is intended.
+select :: C.Columnable a => [T.Text] -> LazyDataFrame -> LazyDataFrame
+select = addOperation . Select
+
+-- | Record a 'Filter' operation for the given boolean expression.
+--
+-- NOTE(review): the @Columnable a@ constraint does not occur in the rest of
+-- the signature, so @a@ is ambiguous at call sites -- confirm it is intended.
+filter :: C.Columnable a => E.Expr Bool -> LazyDataFrame -> LazyDataFrame
+filter = addOperation . Filter
diff --git a/src/DataFrame/Operations/Merge.hs b/src/DataFrame/Operations/Merge.hs
@@ -0,0 +1,29 @@
+{-# LANGUAGE InstanceSigs #-}
+module DataFrame.Operations.Merge where
+
+import qualified Data.List as L
+import qualified Data.Text as T
+import qualified Data.Vector as V
+import qualified DataFrame.Internal.Column as D
+import qualified DataFrame.Internal.DataFrame as D
+import qualified DataFrame.Operations.Core as D
+
+-- | Stack two dataframes: for every column name found in either side, the
+-- values from @a@ come first and the values from @b@ follow. A column that is
+-- missing from one side is padded with empty slots for that side's rows.
+--
+-- NOTE(review): when a shared column has different types on the two sides
+-- 'D.concatColumns' yields 'Nothing'; confirm how @D.insertColumn'@ treats it.
+-- NOTE(review): this instance lives outside the module that defines the type.
+instance Semigroup D.DataFrame where
+    (<>) :: D.DataFrame -> D.DataFrame -> D.DataFrame
+    -- 'L.foldl'' is qualified because the Prelude only exports it from base 4.20.
+    (<>) a b = L.foldl' addColumn D.empty (L.union (D.columnNames a) (D.columnNames b))
+      where
+        -- First component of the dimensions; used as the padding length
+        -- (presumably the row count -- confirm against 'D.dimensions').
+        rowsA = fst (D.dimensions a)
+        rowsB = fst (D.dimensions b)
+        addColumn df name = case (D.getColumn name a, D.getColumn name b) of
+            -- Not expected for names drawn from the union; kept for totality.
+            (Nothing, Nothing)     -> D.insertColumn' name (Just (D.toColumn ([] :: [T.Text]))) df
+            -- Only in @a@: pad after its values, one slot per row of @b@.
+            (Just colA, Nothing)   -> D.insertColumn' name (Just (D.expandColumn rowsB colA)) df
+            -- Only in @b@: pad before its values, one slot per row of @a@.
+            (Nothing, Just colB)   -> D.insertColumn' name (Just (D.leftExpandColumn rowsA colB)) df
+            -- In both: append the two columns.
+            (Just colA, Just colB) -> D.insertColumn' name (D.concatColumns colA colB) df

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+module DataFrame.IO where`
	`2`	`+`
	`3`	`+data InputTypes = CSV deriving Show`