Skip to content

Commit 8d30680

Browse files
committed
feat: Specialize and inline aggregation functions to avoid expensive numeric conversions.
1 parent 016a960 commit 8d30680

File tree

7 files changed

+387
-24
lines changed

7 files changed

+387
-24
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,6 @@ flake.lock
3131
tags
3232
__pycache__
3333
venv
34-
1brc
34+
1brc
35+
benchmarks/
36+
uci_datasets/

src/DataFrame/DecisionTree.hs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ import qualified Data.Text as T
3434
import Data.Type.Equality
3535
import qualified Data.Vector as V
3636
import qualified Data.Vector.Unboxed as VU
37-
import Data.Word (Word, Word16, Word32, Word64, Word8)
38-
import Type.Reflection (SomeTypeRep (..), eqTypeRep, typeRep)
37+
import Data.Word (Word16, Word32, Word64, Word8)
38+
import Type.Reflection (SomeTypeRep (..), typeRep)
3939

4040
import DataFrame.Operators
4141

src/DataFrame/Functions.hs

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,9 +232,21 @@ not =
232232

233233
count :: (Columnable a) => Expr a -> Expr Int
234234
count = Agg (MergeAgg "count" (0 :: Int) (\c _ -> c + 1) (+) id)
235+
{-# SPECIALIZE count :: Expr Double -> Expr Int #-}
236+
{-# SPECIALIZE count :: Expr Float -> Expr Int #-}
237+
{-# SPECIALIZE count :: Expr Int -> Expr Int #-}
238+
{-# SPECIALIZE count :: Expr Int8 -> Expr Int #-}
239+
{-# SPECIALIZE count :: Expr Int16 -> Expr Int #-}
240+
{-# SPECIALIZE count :: Expr Int32 -> Expr Int #-}
241+
{-# SPECIALIZE count :: Expr Int64 -> Expr Int #-}
242+
{-# INLINEABLE count #-}
235243

236244
collect :: (Columnable a) => Expr a -> Expr [a]
237245
collect = Agg (FoldAgg "collect" (Just []) (flip (:)))
246+
{-# SPECIALIZE collect :: Expr Double -> Expr [Double] #-}
247+
{-# SPECIALIZE collect :: Expr Float -> Expr [Float] #-}
248+
{-# SPECIALIZE collect :: Expr Int -> Expr [Int] #-}
249+
{-# INLINEABLE collect #-}
238250

239251
mode :: (Ord a, Columnable a, Eq a) => Expr a -> Expr a
240252
mode =
@@ -247,21 +259,58 @@ mode =
247259
. V.foldl' (\m e -> M.insertWith (+) e 1 m) M.empty
248260
)
249261
)
262+
{-# SPECIALIZE mode :: Expr Double -> Expr Double #-}
263+
{-# SPECIALIZE mode :: Expr Float -> Expr Float #-}
264+
{-# SPECIALIZE mode :: Expr Int -> Expr Int #-}
265+
{-# SPECIALIZE mode :: Expr Int8 -> Expr Int8 #-}
266+
{-# SPECIALIZE mode :: Expr Int16 -> Expr Int16 #-}
267+
{-# SPECIALIZE mode :: Expr Int32 -> Expr Int32 #-}
268+
{-# SPECIALIZE mode :: Expr Int64 -> Expr Int64 #-}
269+
{-# INLINEABLE mode #-}
250270

251271
minimum :: (Columnable a, Ord a) => Expr a -> Expr a
252272
minimum = Agg (FoldAgg "minimum" Nothing Prelude.min)
273+
{-# SPECIALIZE DataFrame.Functions.minimum :: Expr Double -> Expr Double #-}
274+
{-# SPECIALIZE DataFrame.Functions.minimum :: Expr Float -> Expr Float #-}
275+
{-# SPECIALIZE DataFrame.Functions.minimum :: Expr Int -> Expr Int #-}
276+
{-# SPECIALIZE DataFrame.Functions.minimum :: Expr Int8 -> Expr Int8 #-}
277+
{-# SPECIALIZE DataFrame.Functions.minimum :: Expr Int16 -> Expr Int16 #-}
278+
{-# SPECIALIZE DataFrame.Functions.minimum :: Expr Int32 -> Expr Int32 #-}
279+
{-# SPECIALIZE DataFrame.Functions.minimum :: Expr Int64 -> Expr Int64 #-}
280+
{-# INLINEABLE DataFrame.Functions.minimum #-}
253281

254282
maximum :: (Columnable a, Ord a) => Expr a -> Expr a
255283
maximum = Agg (FoldAgg "maximum" Nothing Prelude.max)
284+
{-# SPECIALIZE DataFrame.Functions.maximum :: Expr Double -> Expr Double #-}
285+
{-# SPECIALIZE DataFrame.Functions.maximum :: Expr Float -> Expr Float #-}
286+
{-# SPECIALIZE DataFrame.Functions.maximum :: Expr Int -> Expr Int #-}
287+
{-# SPECIALIZE DataFrame.Functions.maximum :: Expr Int8 -> Expr Int8 #-}
288+
{-# SPECIALIZE DataFrame.Functions.maximum :: Expr Int16 -> Expr Int16 #-}
289+
{-# SPECIALIZE DataFrame.Functions.maximum :: Expr Int32 -> Expr Int32 #-}
290+
{-# SPECIALIZE DataFrame.Functions.maximum :: Expr Int64 -> Expr Int64 #-}
291+
{-# INLINEABLE DataFrame.Functions.maximum #-}
256292

257293
sum :: forall a. (Columnable a, Num a) => Expr a -> Expr a
258294
sum = Agg (FoldAgg "sum" Nothing (+))
259295
{-# SPECIALIZE DataFrame.Functions.sum :: Expr Double -> Expr Double #-}
296+
{-# SPECIALIZE DataFrame.Functions.sum :: Expr Float -> Expr Float #-}
260297
{-# SPECIALIZE DataFrame.Functions.sum :: Expr Int -> Expr Int #-}
298+
{-# SPECIALIZE DataFrame.Functions.sum :: Expr Int8 -> Expr Int8 #-}
299+
{-# SPECIALIZE DataFrame.Functions.sum :: Expr Int16 -> Expr Int16 #-}
300+
{-# SPECIALIZE DataFrame.Functions.sum :: Expr Int32 -> Expr Int32 #-}
301+
{-# SPECIALIZE DataFrame.Functions.sum :: Expr Int64 -> Expr Int64 #-}
261302
{-# INLINEABLE DataFrame.Functions.sum #-}
262303

263304
sumMaybe :: forall a. (Columnable a, Num a) => Expr (Maybe a) -> Expr a
264305
sumMaybe = Agg (CollectAgg "sumMaybe" (P.sum . Maybe.catMaybes . V.toList))
306+
{-# SPECIALIZE sumMaybe :: Expr (Maybe Double) -> Expr Double #-}
307+
{-# SPECIALIZE sumMaybe :: Expr (Maybe Float) -> Expr Float #-}
308+
{-# SPECIALIZE sumMaybe :: Expr (Maybe Int) -> Expr Int #-}
309+
{-# SPECIALIZE sumMaybe :: Expr (Maybe Int8) -> Expr Int8 #-}
310+
{-# SPECIALIZE sumMaybe :: Expr (Maybe Int16) -> Expr Int16 #-}
311+
{-# SPECIALIZE sumMaybe :: Expr (Maybe Int32) -> Expr Int32 #-}
312+
{-# SPECIALIZE sumMaybe :: Expr (Maybe Int64) -> Expr Int64 #-}
313+
{-# INLINEABLE sumMaybe #-}
265314

266315
mean :: (Columnable a, Real a) => Expr a -> Expr Double
267316
mean =
@@ -273,18 +322,58 @@ mean =
273322
(\(MeanAcc s1 c1) (MeanAcc s2 c2) -> MeanAcc (s1 + s2) (c1 + c2))
274323
(\(MeanAcc s c) -> if c == 0 then 0 / 0 else s / fromIntegral c)
275324
)
325+
{-# SPECIALIZE mean :: Expr Double -> Expr Double #-}
326+
{-# SPECIALIZE mean :: Expr Float -> Expr Double #-}
327+
{-# SPECIALIZE mean :: Expr Int -> Expr Double #-}
328+
{-# SPECIALIZE mean :: Expr Int8 -> Expr Double #-}
329+
{-# SPECIALIZE mean :: Expr Int16 -> Expr Double #-}
330+
{-# SPECIALIZE mean :: Expr Int32 -> Expr Double #-}
331+
{-# SPECIALIZE mean :: Expr Int64 -> Expr Double #-}
332+
{-# INLINEABLE mean #-}
276333

277334
meanMaybe :: forall a. (Columnable a, Real a) => Expr (Maybe a) -> Expr Double
278335
meanMaybe = Agg (CollectAgg "meanMaybe" (mean' . optionalToDoubleVector))
336+
{-# SPECIALIZE meanMaybe :: Expr (Maybe Double) -> Expr Double #-}
337+
{-# SPECIALIZE meanMaybe :: Expr (Maybe Float) -> Expr Double #-}
338+
{-# SPECIALIZE meanMaybe :: Expr (Maybe Int) -> Expr Double #-}
339+
{-# SPECIALIZE meanMaybe :: Expr (Maybe Int8) -> Expr Double #-}
340+
{-# SPECIALIZE meanMaybe :: Expr (Maybe Int16) -> Expr Double #-}
341+
{-# SPECIALIZE meanMaybe :: Expr (Maybe Int32) -> Expr Double #-}
342+
{-# SPECIALIZE meanMaybe :: Expr (Maybe Int64) -> Expr Double #-}
343+
{-# INLINEABLE meanMaybe #-}
279344

280345
variance :: (Columnable a, Real a, VU.Unbox a) => Expr a -> Expr Double
281346
variance = Agg (CollectAgg "variance" variance')
347+
{-# SPECIALIZE variance :: Expr Double -> Expr Double #-}
348+
{-# SPECIALIZE variance :: Expr Float -> Expr Double #-}
349+
{-# SPECIALIZE variance :: Expr Int -> Expr Double #-}
350+
{-# SPECIALIZE variance :: Expr Int8 -> Expr Double #-}
351+
{-# SPECIALIZE variance :: Expr Int16 -> Expr Double #-}
352+
{-# SPECIALIZE variance :: Expr Int32 -> Expr Double #-}
353+
{-# SPECIALIZE variance :: Expr Int64 -> Expr Double #-}
354+
{-# INLINEABLE variance #-}
282355

283356
median :: (Columnable a, Real a, VU.Unbox a) => Expr a -> Expr Double
284357
median = Agg (CollectAgg "median" median')
358+
{-# SPECIALIZE median :: Expr Double -> Expr Double #-}
359+
{-# SPECIALIZE median :: Expr Float -> Expr Double #-}
360+
{-# SPECIALIZE median :: Expr Int -> Expr Double #-}
361+
{-# SPECIALIZE median :: Expr Int8 -> Expr Double #-}
362+
{-# SPECIALIZE median :: Expr Int16 -> Expr Double #-}
363+
{-# SPECIALIZE median :: Expr Int32 -> Expr Double #-}
364+
{-# SPECIALIZE median :: Expr Int64 -> Expr Double #-}
365+
{-# INLINEABLE median #-}
285366

286367
medianMaybe :: (Columnable a, Real a) => Expr (Maybe a) -> Expr Double
287368
-- Median over the present ('Just') values of an optional numeric column.
-- The aggregation label must be this function's own name: it was previously
-- "meanMaybe" (copy-pasted from 'meanMaybe'), which made the two aggregations
-- share a label.
medianMaybe = Agg (CollectAgg "medianMaybe" (median' . optionalToDoubleVector))
369+
{-# SPECIALIZE medianMaybe :: Expr (Maybe Double) -> Expr Double #-}
370+
{-# SPECIALIZE medianMaybe :: Expr (Maybe Float) -> Expr Double #-}
371+
{-# SPECIALIZE medianMaybe :: Expr (Maybe Int) -> Expr Double #-}
372+
{-# SPECIALIZE medianMaybe :: Expr (Maybe Int8) -> Expr Double #-}
373+
{-# SPECIALIZE medianMaybe :: Expr (Maybe Int16) -> Expr Double #-}
374+
{-# SPECIALIZE medianMaybe :: Expr (Maybe Int32) -> Expr Double #-}
375+
{-# SPECIALIZE medianMaybe :: Expr (Maybe Int64) -> Expr Double #-}
376+
{-# INLINEABLE medianMaybe #-}
288377

289378
optionalToDoubleVector :: (Real a) => V.Vector (Maybe a) -> VU.Vector Double
290379
optionalToDoubleVector =
@@ -303,55 +392,115 @@ percentile n =
303392

304393
stddev :: (Columnable a, Real a, VU.Unbox a) => Expr a -> Expr Double
305394
stddev = Agg (CollectAgg "stddev" (sqrt . variance'))
395+
{-# SPECIALIZE stddev :: Expr Double -> Expr Double #-}
396+
{-# SPECIALIZE stddev :: Expr Float -> Expr Double #-}
397+
{-# SPECIALIZE stddev :: Expr Int -> Expr Double #-}
398+
{-# SPECIALIZE stddev :: Expr Int8 -> Expr Double #-}
399+
{-# SPECIALIZE stddev :: Expr Int16 -> Expr Double #-}
400+
{-# SPECIALIZE stddev :: Expr Int32 -> Expr Double #-}
401+
{-# SPECIALIZE stddev :: Expr Int64 -> Expr Double #-}
402+
{-# INLINEABLE stddev #-}
306403

307404
stddevMaybe :: forall a. (Columnable a, Real a) => Expr (Maybe a) -> Expr Double
308405
stddevMaybe = Agg (CollectAgg "stddevMaybe" (sqrt . variance' . optionalToDoubleVector))
406+
{-# SPECIALIZE stddevMaybe :: Expr (Maybe Double) -> Expr Double #-}
407+
{-# SPECIALIZE stddevMaybe :: Expr (Maybe Float) -> Expr Double #-}
408+
{-# SPECIALIZE stddevMaybe :: Expr (Maybe Int) -> Expr Double #-}
409+
{-# SPECIALIZE stddevMaybe :: Expr (Maybe Int8) -> Expr Double #-}
410+
{-# SPECIALIZE stddevMaybe :: Expr (Maybe Int16) -> Expr Double #-}
411+
{-# SPECIALIZE stddevMaybe :: Expr (Maybe Int32) -> Expr Double #-}
412+
{-# SPECIALIZE stddevMaybe :: Expr (Maybe Int64) -> Expr Double #-}
413+
{-# INLINEABLE stddevMaybe #-}
309414

310415
zScore :: Expr Double -> Expr Double
311416
zScore c = (c - mean c) / stddev c
312417

313418
pow :: (Columnable a, Num a) => Expr a -> Int -> Expr a
314419
pow expr i = lift2Decorated (^) "pow" (Just "^") True 8 expr (Lit i)
420+
{-# SPECIALIZE pow :: Expr Double -> Int -> Expr Double #-}
421+
{-# SPECIALIZE pow :: Expr Float -> Int -> Expr Float #-}
422+
{-# SPECIALIZE pow :: Expr Int -> Int -> Expr Int #-}
423+
{-# INLINEABLE pow #-}
315424

316425
relu :: (Columnable a, Num a, Ord a) => Expr a -> Expr a
317426
relu = liftDecorated (Prelude.max 0) "relu" Nothing
427+
{-# SPECIALIZE relu :: Expr Double -> Expr Double #-}
428+
{-# SPECIALIZE relu :: Expr Float -> Expr Float #-}
429+
{-# SPECIALIZE relu :: Expr Int -> Expr Int #-}
430+
{-# INLINEABLE relu #-}
318431

319432
min :: (Columnable a, Ord a) => Expr a -> Expr a -> Expr a
320433
min = lift2Decorated Prelude.min "min" Nothing True 1
434+
{-# SPECIALIZE DataFrame.Functions.min ::
435+
Expr Double -> Expr Double -> Expr Double
436+
#-}
437+
{-# SPECIALIZE DataFrame.Functions.min ::
438+
Expr Float -> Expr Float -> Expr Float
439+
#-}
440+
{-# SPECIALIZE DataFrame.Functions.min :: Expr Int -> Expr Int -> Expr Int #-}
441+
{-# INLINEABLE DataFrame.Functions.min #-}
321442

322443
max :: (Columnable a, Ord a) => Expr a -> Expr a -> Expr a
323444
max = lift2Decorated Prelude.max "max" Nothing True 1
445+
{-# SPECIALIZE DataFrame.Functions.max ::
446+
Expr Double -> Expr Double -> Expr Double
447+
#-}
448+
{-# SPECIALIZE DataFrame.Functions.max ::
449+
Expr Float -> Expr Float -> Expr Float
450+
#-}
451+
{-# SPECIALIZE DataFrame.Functions.max :: Expr Int -> Expr Int -> Expr Int #-}
452+
{-# INLINEABLE DataFrame.Functions.max #-}
324453

325454
reduce ::
326455
forall a b.
327456
(Columnable a, Columnable b) => Expr b -> a -> (a -> b -> a) -> Expr a
328457
reduce expr start f = Agg (FoldAgg "foldUdf" (Just start) f) expr
458+
{-# INLINEABLE reduce #-}
329459

330460
toMaybe :: (Columnable a) => Expr a -> Expr (Maybe a)
331461
toMaybe = liftDecorated Just "toMaybe" Nothing
462+
{-# SPECIALIZE toMaybe :: Expr Double -> Expr (Maybe Double) #-}
463+
{-# SPECIALIZE toMaybe :: Expr Float -> Expr (Maybe Float) #-}
464+
{-# SPECIALIZE toMaybe :: Expr Int -> Expr (Maybe Int) #-}
465+
{-# INLINEABLE toMaybe #-}
332466

333467
fromMaybe :: (Columnable a) => a -> Expr (Maybe a) -> Expr a
334468
fromMaybe d = liftDecorated (Maybe.fromMaybe d) "fromMaybe" Nothing
469+
{-# SPECIALIZE fromMaybe :: Double -> Expr (Maybe Double) -> Expr Double #-}
470+
{-# SPECIALIZE fromMaybe :: Float -> Expr (Maybe Float) -> Expr Float #-}
471+
{-# SPECIALIZE fromMaybe :: Int -> Expr (Maybe Int) -> Expr Int #-}
472+
{-# INLINEABLE fromMaybe #-}
335473

336474
isJust :: (Columnable a) => Expr (Maybe a) -> Expr Bool
337475
isJust = liftDecorated Maybe.isJust "isJust" Nothing
476+
{-# SPECIALIZE isJust :: Expr (Maybe Double) -> Expr Bool #-}
477+
{-# SPECIALIZE isJust :: Expr (Maybe Int) -> Expr Bool #-}
478+
{-# INLINEABLE isJust #-}
338479

339480
isNothing :: (Columnable a) => Expr (Maybe a) -> Expr Bool
340481
isNothing = liftDecorated Maybe.isNothing "isNothing" Nothing
482+
{-# SPECIALIZE isNothing :: Expr (Maybe Double) -> Expr Bool #-}
483+
{-# SPECIALIZE isNothing :: Expr (Maybe Int) -> Expr Bool #-}
484+
{-# INLINEABLE isNothing #-}
341485

342486
fromJust :: (Columnable a) => Expr (Maybe a) -> Expr a
343487
fromJust = liftDecorated Maybe.fromJust "fromJust" Nothing
488+
{-# SPECIALIZE fromJust :: Expr (Maybe Double) -> Expr Double #-}
489+
{-# SPECIALIZE fromJust :: Expr (Maybe Int) -> Expr Int #-}
490+
{-# INLINEABLE fromJust #-}
344491

345492
whenPresent ::
346493
forall a b.
347494
(Columnable a, Columnable b) => (a -> b) -> Expr (Maybe a) -> Expr (Maybe b)
348495
whenPresent f = liftDecorated (fmap f) "whenPresent" Nothing
496+
{-# INLINEABLE whenPresent #-}
349497

350498
whenBothPresent ::
351499
forall a b c.
352500
(Columnable a, Columnable b, Columnable c) =>
353501
(a -> b -> c) -> Expr (Maybe a) -> Expr (Maybe b) -> Expr (Maybe c)
354502
whenBothPresent f = lift2Decorated (\l r -> f <$> l <*> r) "whenBothPresent" Nothing False 0
503+
{-# INLINEABLE whenBothPresent #-}
355504

356505
recode ::
357506
forall a b.

src/DataFrame/Internal/Expression.hs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,9 @@ compareExpr e1 e2 = compare (exprKey e1) (exprKey e2)
283283
exprKey (Agg (FoldAgg name _ _) e) = "5:" ++ T.unpack name ++ exprKey e
284284
exprKey (Agg (MergeAgg name _ _ _ _) e) = "5:" ++ T.unpack name ++ exprKey e
285285

286+
instance (Ord a, Columnable a) => Ord (Expr a) where
287+
compare l r = compareExpr (normalize l) (normalize r)
288+
286289
instance (Eq a, Columnable a) => Eq (Expr a) where
287290
(==) l r = eqNormalized (normalize l) (normalize r)
288291
where

0 commit comments

Comments
 (0)