Skip to content

Commit 8fc3708

Browse files
committed
Initial broken but somewhat working version of scan API.
1 parent bb621f1 commit 8fc3708

File tree

8 files changed

+147
-54
lines changed

8 files changed

+147
-54
lines changed

dataframe.cabal

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ library
3333
DataFrame.Internal.Row,
3434
DataFrame.Errors,
3535
DataFrame.Operations.Core,
36+
DataFrame.Operations.Merge,
3637
DataFrame.Operations.Subset,
3738
DataFrame.Operations.Sorting,
3839
DataFrame.Operations.Statistics,
@@ -41,7 +42,7 @@ library
4142
DataFrame.Operations.Aggregation,
4243
DataFrame.Display.Terminal.Plot,
4344
DataFrame.IO.CSV,
44-
DataFrame.Lazy.Internal.Column
45+
DataFrame.Lazy.Internal.DataFrame
4546
build-depends: base >= 4.17.2.0 && < 4.22,
4647
array ^>= 0.5,
4748
attoparsec >= 0.12 && <= 0.14.4,

src/DataFrame/IO.hs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module DataFrame.IO where
2+
3+
-- | Input formats known to the IO layer; 'CSV' is the only one declared here.
data InputTypes = CSV deriving Show

src/DataFrame/IO/CSV.hs

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import qualified Data.Vector.Mutable as VM
2323
import qualified Data.Vector.Unboxed.Mutable as VUM
2424

2525
import Control.Applicative ((<$>), (<|>), (<*>), (<*), (*>), many)
26-
import Control.Monad (forM_, zipWithM_, unless, void)
26+
import Control.Monad (forM_, zipWithM_, unless, void, replicateM_)
2727
import Data.Attoparsec.Text
2828
import Data.Char
2929
import DataFrame.Internal.Column (Column(..), freezeColumn', writeColumn, columnLength)
@@ -48,13 +48,15 @@ import Type.Reflection
4848
data ReadOptions = ReadOptions {
4949
hasHeader :: Bool,
5050
inferTypes :: Bool,
51-
safeRead :: Bool
51+
safeRead :: Bool,
52+
rowRange :: Maybe (Int, Int), -- (start, length)
53+
seekPos :: Maybe Integer
5254
}
5355

5456
-- | By default we assume the file has a header, we infer the types on read
5557
-- and we convert any rows with nullish objects into Maybe (safeRead).
5658
defaultOptions :: ReadOptions
57-
defaultOptions = ReadOptions { hasHeader = True, inferTypes = True, safeRead = True }
59+
defaultOptions = ReadOptions { hasHeader = True, inferTypes = True, safeRead = True, rowRange = Nothing, seekPos = Nothing }
5860

5961
-- | Reads a CSV file from the given path.
6062
-- Note this file stores intermediate temporary files
@@ -71,7 +73,9 @@ readTsv = readSeparated '\t' defaultOptions
7173
-- | Reads a character separated file into a dataframe using mutable vectors.
7274
readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
7375
readSeparated c opts path = do
74-
totalRows <- countRows c path
76+
(begin, len) <- case rowRange opts of
77+
Nothing -> countRows c path >>= \totalRows -> return (0, if hasHeader opts then totalRows - 1 else totalRows)
78+
Just (start, len) -> return (start, len)
7579
withFile path ReadMode $ \handle -> do
7680
firstRow <- map T.strip . parseSep c <$> TIO.hGetLine handle
7781
let columnNames = if hasHeader opts
@@ -80,9 +84,12 @@ readSeparated c opts path = do
8084
-- If there was no header rewind the file cursor.
8185
unless (hasHeader opts) $ hSeek handle AbsoluteSeek 0
8286

87+
-- skip rows (lines) until `begin`
88+
_ <- replicateM_ begin (TIO.hGetLine handle >> return () )
89+
8390
-- Initialize mutable vectors for each column
8491
let numColumns = length columnNames
85-
let numRows = if hasHeader opts then totalRows - 1 else totalRows
92+
let numRows = len
8693
-- Use this row to infer the types of the rest of the column.
8794
-- TODO: this isn't robust but in so far as this is a guess anyway
8895
-- it's probably fine. But we should probably sample n rows and pick
@@ -102,6 +109,7 @@ readSeparated c opts path = do
102109
-- Freeze the mutable vectors into immutable ones
103110
nulls' <- V.unsafeFreeze nullIndices
104111
cols <- V.mapM (freezeColumn mutableCols nulls' opts) (V.generate numColumns id)
112+
105113
return $ DataFrame {
106114
columns = cols,
107115
freeIndices = [],
@@ -134,7 +142,7 @@ inferValueType s = let
134142
fillColumns :: Int -> Char -> VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Handle -> IO ()
135143
fillColumns n c mutableCols nullIndices handle = do
136144
input <- newIORef (mempty :: T.Text)
137-
forM_ [1..n] $ \i -> do
145+
forM_ [1..(n - 1)] $ \i -> do
138146
isEOF <- hIsEOF handle
139147
input' <- readIORef input
140148
unless (isEOF && input' == mempty) $ do

src/DataFrame/Internal/Column.hs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,31 @@ expandColumn n (UnboxedColumn col) = OptionalColumn $ VB.map Just (VU.convert co
521521
expandColumn n (GroupedBoxedColumn col) = GroupedBoxedColumn $ col <> VB.replicate n VB.empty
522522
expandColumn n (GroupedUnboxedColumn col) = GroupedUnboxedColumn $ col <> VB.replicate n VU.empty
523523

524+
-- | Pad a column on the left with @n@ empty slots.
--
-- Optional, boxed and unboxed columns all come back as an 'OptionalColumn'
-- whose first @n@ entries are 'Nothing'; values from boxed and unboxed
-- columns are wrapped in 'Just'. Grouped columns keep their constructor and
-- receive @n@ empty vectors in front of the existing groups.
leftExpandColumn :: Int -> Column -> Column
leftExpandColumn n column = case column of
    OptionalColumn col ->
        OptionalColumn (VB.replicate n Nothing <> col)
    BoxedColumn col ->
        OptionalColumn (VB.replicate n Nothing <> VB.map Just col)
    UnboxedColumn col ->
        -- Unboxed vectors cannot hold 'Maybe', so convert to a boxed vector first.
        OptionalColumn (VB.replicate n Nothing <> VB.map Just (VU.convert col))
    GroupedBoxedColumn col ->
        GroupedBoxedColumn (VB.replicate n VB.empty <> col)
    GroupedUnboxedColumn col ->
        GroupedUnboxedColumn (VB.replicate n VU.empty <> col)
530+
531+
-- | Append the second column to the first.
--
-- Succeeds only when both columns use the same constructor and
-- 'testEquality' on the 'typeOf' of their underlying vectors finds them to
-- be the same type. Every other combination yields 'Nothing'.
concatColumns :: Column -> Column -> Maybe Column
concatColumns lhs rhs = case (lhs, rhs) of
    (OptionalColumn left, OptionalColumn right) -> do
        Refl <- testEquality (typeOf left) (typeOf right)
        pure (OptionalColumn (left <> right))
    (BoxedColumn left, BoxedColumn right) -> do
        Refl <- testEquality (typeOf left) (typeOf right)
        pure (BoxedColumn (left <> right))
    (UnboxedColumn left, UnboxedColumn right) -> do
        Refl <- testEquality (typeOf left) (typeOf right)
        pure (UnboxedColumn (left <> right))
    (GroupedBoxedColumn left, GroupedBoxedColumn right) -> do
        Refl <- testEquality (typeOf left) (typeOf right)
        pure (GroupedBoxedColumn (left <> right))
    (GroupedUnboxedColumn left, GroupedUnboxedColumn right) -> do
        Refl <- testEquality (typeOf left) (typeOf right)
        pure (GroupedUnboxedColumn (left <> right))
    -- Mismatched constructors are never merged.
    _ -> Nothing
548+
524549
toVector :: forall a . Columnable a => Column -> VB.Vector a
525550
toVector column@(OptionalColumn (col :: VB.Vector b)) =
526551
case testEquality (typeRep @a) (typeRep @b) of

src/DataFrame/Internal/Expression.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ data Expr a where
3232
Apply :: (Columnable a, Columnable b) => T.Text -> (b -> a) -> Expr b -> Expr a
3333
BinOp :: (Columnable c, Columnable b, Columnable a) => T.Text -> (c -> b -> a) -> Expr c -> Expr b -> Expr a
3434

35-
interpret :: forall a b . (Columnable a) => DataFrame -> Expr a -> TypedColumn a
35+
interpret :: forall a . (Columnable a) => DataFrame -> Expr a -> TypedColumn a
3636
interpret df (Lit value) = TColumn $ toColumn' $ V.replicate (fst $ dataframeDimensions df) value
3737
interpret df (Col name) = case getColumn name df of
3838
Nothing -> throw $ ColumnNotFoundException name "" (map fst $ M.toList $ columnIndices df)

src/DataFrame/Lazy/Internal/Column.hs

Lines changed: 0 additions & 35 deletions
This file was deleted.
Lines changed: 73 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,81 @@
1+
{-# LANGUAGE GADTs #-}
2+
{-# LANGUAGE FlexibleContexts #-}
3+
{-# LANGUAGE InstanceSigs #-}
4+
{-# LANGUAGE ExistentialQuantification #-}
5+
{-# LANGUAGE AllowAmbiguousTypes #-}
6+
{-# LANGUAGE NumericUnderscores #-}
17
module DataFrame.Lazy.Internal.DataFrame where
28

9+
import Control.Monad (forM_)
310
import Data.IORef
11+
import Data.Kind
412
import qualified Data.Map as M
513
import qualified Data.Text as T
614
import qualified Data.Vector as V
7-
import qualified DataFrame.Lazy.Internal.Column as C
15+
import qualified DataFrame.Internal.DataFrame as D
16+
import qualified DataFrame.Internal.Column as C
17+
import qualified DataFrame.Internal.Expression as E
18+
import qualified DataFrame.Operations.Core as D
19+
import qualified DataFrame.Operations.Subset as D
20+
import qualified DataFrame.Operations.Transformations as D
21+
import qualified DataFrame.IO.CSV as D
822
import System.FilePath
923

10-
data DataFrame = DataFrame
11-
{ columns :: V.Vector (Maybe C.Column)
12-
, columnIndices :: !(M.Map T.Text Int)
13-
, freeIndices :: !(IORef [Int])
14-
, dataframeDims :: !(IORef (Int, Int)) -- (rows , cols)
15-
, memBudgetBytes :: !Int -- e.g. 512 * 1024 * 1024
16-
, liveMemBytes :: !(IORef Int) -- updated atomically
17-
, chunkRowTarget :: !Int -- e.g. 100_000
18-
, spillDir :: !FilePath
19-
}
24+
-- | A single deferred step recorded on a 'LazyDataFrame'.
data LazyOperation
    = -- | Bind the result of an expression to the given column name.
      forall a . C.Columnable a => Derive T.Text (E.Expr a)
    | -- | Keep only the named columns.
      Select [T.Text]
    | -- | Keep only the rows for which the boolean expression holds.
      Filter (E.Expr Bool)
28+
29+
-- | Human-readable rendering of an operation, e.g. @name := expr@,
-- @select([...])@ or @filter(expr)@.
instance Show LazyOperation where
    show :: LazyOperation -> String
    show op = case op of
        Derive name expr -> concat [T.unpack name, " := ", show expr]
        Select columns -> concat ["select(", show columns, ")"]
        Filter expr -> concat ["filter(", show expr, ")"]
34+
35+
-- | Source format of a 'LazyDataFrame'; 'ICSV' is the only format declared here.
data InputType = ICSV deriving Show
36+
37+
-- | Description of a deferred computation over an input file: where the data
-- lives, what format it is in, which operations to apply and how many rows
-- to request per batch.
data LazyDataFrame = LazyDataFrame
38+
{ inputPath :: FilePath -- ^ Path of the file to read.
39+
, inputType :: InputType -- ^ Format of the input file.
40+
, operations :: [LazyOperation] -- ^ Recorded operations; 'addOperation' appends to the end.
41+
, batchSize :: Int -- ^ Row count used as the length of each batch read.
42+
} deriving Show
43+
44+
-- | Turn one recorded operation into the eager dataframe transformation
-- that carries it out.
eval :: LazyOperation -> D.DataFrame -> D.DataFrame
eval op = case op of
    Derive name expr -> D.derive name expr
    Select columns -> D.select columns
    Filter expr -> D.filterWhere expr
48+
49+
-- | Execute a lazy dataframe batch by batch.
--
-- For every range produced by 'batchRanges' the CSV at 'inputPath' is read
-- starting at the range's first row with a length of 'batchSize', the
-- recorded operations are applied in order, and the result is printed when
-- it has at least one row. The function itself always returns 'D.empty'.
--
-- NOTE(review): the type variable @a@ does not occur in the argument or
-- result types, so callers have to pick it with a type application.
-- NOTE(review): the row upper bound is hard-coded to 1000000 rather than
-- taken from the file (the row-count call is commented out in the original).
-- NOTE(review): unqualified @foldl'@ is only exported by Prelude from
-- base-4.20 on — confirm against the supported base range.
runDataFrame :: forall a . (C.Columnable a) => LazyDataFrame -> IO D.DataFrame
runDataFrame df = do
    let path = inputPath df
        rowsPerBatch = batchSize df
        -- totalRows <- D.countRows ',' path
        readOptions start = D.defaultOptions { D.rowRange = Just (start, rowsPerBatch) }
        applyOperations frame = foldl' (flip eval) frame (operations df)
    forM_ (batchRanges 1000000 rowsPerBatch) $ \(start, _) -> do
        -- TODO: implement specific read operations for batching that returns a seek instead of re-reading everything.
        batch <- D.readSeparated ',' (readOptions start) path
        let result = applyOperations batch
        case fst (D.dimensions result) of
            0 -> return ()
            _ -> print result
    return D.empty
60+
61+
-- | Split the half-open interval @[0, n)@ into consecutive @(start, end)@
-- ranges of at most @inc@ elements each.
--
-- * The final range is clipped to @n@, so it may be shorter than @inc@.
-- * No empty range is produced: when @inc@ divides @n@ the result ends with
--   @(n - inc, n)@ rather than a trailing @(n, n)@.
-- * A non-positive @n@ yields no ranges.
-- * A non-positive @inc@ cannot advance, so the whole interval is returned
--   as a single range instead of enumerating forever.
--
-- >>> batchRanges 10 3
-- [(0,3),(3,6),(6,9),(9,10)]
batchRanges :: Int -> Int -> [(Int, Int)]
batchRanges n inc
    | n <= 0 = []
    | inc <= 0 = [(0, n)]
    | otherwise = [(start, min n (start + inc)) | start <- [0, inc .. n - 1]]
67+
68+
-- | Start a lazy computation over the CSV file at the given path, with no
-- recorded operations and a batch size of 1024 rows.
scanCsv :: T.Text -> LazyDataFrame
scanCsv path = LazyDataFrame
    { inputPath = T.unpack path
    , inputType = ICSV
    , operations = []
    , batchSize = 1024
    }
70+
71+
-- | Record one more operation at the end of the pending operation list.
addOperation :: LazyOperation -> LazyDataFrame -> LazyDataFrame
addOperation op df = df { operations = pending <> [op] }
  where
    pending = operations df
73+
74+
-- | Queue a 'Derive' step that binds the expression's result to @name@.
derive :: C.Columnable a => T.Text -> E.Expr a -> LazyDataFrame -> LazyDataFrame
derive name = addOperation . Derive name
76+
77+
-- | Queue a 'Select' step that keeps only the named columns.
--
-- NOTE(review): the @C.Columnable a@ constraint mentions a type variable
-- that appears nowhere else in the signature; confirm whether it is needed.
select :: C.Columnable a => [T.Text] -> LazyDataFrame -> LazyDataFrame
select = addOperation . Select
79+
80+
-- | Queue a 'Filter' step that keeps the rows satisfying the condition.
--
-- NOTE(review): the @C.Columnable a@ constraint mentions a type variable
-- that appears nowhere else in the signature; confirm whether it is needed.
filter :: C.Columnable a => E.Expr Bool -> LazyDataFrame -> LazyDataFrame
filter = addOperation . Filter

src/DataFrame/Operations/Merge.hs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{-# LANGUAGE InstanceSigs #-}
2+
module DataFrame.Operations.Merge where
3+
4+
import qualified Data.List as L
5+
import qualified Data.Text as T
6+
import qualified Data.Vector as V
7+
import qualified DataFrame.Internal.Column as D
8+
import qualified DataFrame.Internal.DataFrame as D
9+
import qualified DataFrame.Operations.Core as D
10+
11+
-- | Combine two dataframes by stacking the second below the first.
--
-- The result has one column for every name in the union of both frames'
-- column names, inserted in 'L.union' order:
--
-- * a column present only in the left frame is extended with 'D.expandColumn'
--   by the first component of the right frame's dimensions;
-- * a column present only in the right frame is padded on the left with
--   'D.leftExpandColumn' by the first component of the left frame's dimensions;
-- * a column present in both is merged with 'D.concatColumns', whose
--   'Maybe' result is handed unchanged to 'D.insertColumn''.
--
-- NOTE(review): the first component of 'D.dimensions' is used elsewhere in
-- this codebase as the row count — confirm.
-- NOTE(review): neither 'Semigroup' nor the dataframe type is defined in this
-- module, which makes this an orphan instance.
instance Semigroup D.DataFrame where
    (<>) :: D.DataFrame -> D.DataFrame -> D.DataFrame
    a <> b = L.foldl' addColumn D.empty allNames
      where
        allNames = L.union (D.columnNames a) (D.columnNames b)
        lengthA = fst (D.dimensions a)
        lengthB = fst (D.dimensions b)

        -- Insert the merged version of one named column into the accumulator.
        addColumn df name = D.insertColumn' name merged df
          where
            merged = case (D.getColumn name a, D.getColumn name b) of
                -- Cannot happen for names drawn from the union; kept as a
                -- total fallback that inserts an empty text column.
                (Nothing, Nothing) -> Just (D.toColumn ([] :: [T.Text]))
                (Just colA, Nothing) -> Just (D.expandColumn lengthB colA)
                (Nothing, Just colB) -> Just (D.leftExpandColumn lengthA colB)
                (Just colA, Just colB) -> D.concatColumns colA colB

0 commit comments

Comments
 (0)