DataHaskell
diff --git a/‎app/Main.hs‎
Lines changed: 2 additions & 2 deletions b/‎app/Main.hs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎benchmark/Main.hs‎
Lines changed: 1 addition & 1 deletion b/‎benchmark/Main.hs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dataframe.cabal‎
Lines changed: 38 additions & 38 deletions b/‎dataframe.cabal‎
Lines changed: 38 additions & 38 deletions
diff --git a/‎docs/coming_from_pandas.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/coming_from_pandas.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/coming_from_polars.md‎
Lines changed: 5 additions & 5 deletions b/‎docs/coming_from_polars.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎docs/exploratory_data_analysis_primer.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/exploratory_data_analysis_primer.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/haskell_for_data_analysis.md‎
Lines changed: 87 additions & 0 deletions b/‎docs/haskell_for_data_analysis.md‎
Lines changed: 87 additions & 0 deletions
diff --git a/‎src/Data/DataFrame.hs‎
Lines changed: 0 additions & 26 deletions b/‎src/Data/DataFrame.hs‎
Lines changed: 0 additions & 26 deletions
diff --git a/‎src/DataFrame.hs‎
Lines changed: 26 additions & 0 deletions b/‎src/DataFrame.hs‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎…ta/DataFrame/Display/Terminal/Colours.hs‎ ‎src/DataFrame/Display/Terminal/Colours.hs‎src/Data/DataFrame/Display/Terminal/Colours.hs renamed to src/DataFrame/Display/Terminal/Colours.hs
Lines changed: 1 addition & 1 deletion b/‎…ta/DataFrame/Display/Terminal/Colours.hs‎ ‎src/DataFrame/Display/Terminal/Colours.hs‎src/Data/DataFrame/Display/Terminal/Colours.hs renamed to src/DataFrame/Display/Terminal/Colours.hs
Lines changed: 1 addition & 1 deletion
@@ -6,8 +6,8 @@
 
 module Main where
 
-import qualified Data.DataFrame as D
-import Data.DataFrame (dimensions, (|>))
+import qualified DataFrame as D
+import DataFrame (dimensions, (|>))
 import Data.List (delete)
 import Data.Maybe (fromMaybe, isJust, isNothing)
 import qualified Data.Text as T
 
@@ -1,7 +1,7 @@
 {-# LANGUAGE NumericUnderscores #-}
 {-# LANGUAGE OverloadedStrings #-}
 
-import qualified Data.DataFrame as D
+import qualified DataFrame as D
 import qualified Data.Vector.Unboxed as VU
 
 import Control.Monad (replicateM)
 
@@ -22,25 +22,25 @@ source-repository head
   location: https://github.com/mchav/dataframe
 
 library
-    exposed-modules: Data.DataFrame
-    other-modules: Data.DataFrame.Internal.Types,
-                   Data.DataFrame.Internal.Function,
-                   Data.DataFrame.Internal.Parsing,
-                   Data.DataFrame.Internal.Column,
-                   Data.DataFrame.Display.Terminal.PrettyPrint,
-                   Data.DataFrame.Display.Terminal.Colours,
-                   Data.DataFrame.Internal.DataFrame,
-                   Data.DataFrame.Internal.Row,
-                   Data.DataFrame.Errors,
-                   Data.DataFrame.Operations.Core,
-                   Data.DataFrame.Operations.Subset,
-                   Data.DataFrame.Operations.Sorting,
-                   Data.DataFrame.Operations.Statistics,
-                   Data.DataFrame.Operations.Transformations,
-                   Data.DataFrame.Operations.Typing,
-                   Data.DataFrame.Operations.Aggregation,
-                   Data.DataFrame.Display.Terminal.Plot,
-                   Data.DataFrame.IO.CSV
+    exposed-modules: DataFrame
+    other-modules: DataFrame.Internal.Types,
+                   DataFrame.Internal.Function,
+                   DataFrame.Internal.Parsing,
+                   DataFrame.Internal.Column,
+                   DataFrame.Display.Terminal.PrettyPrint,
+                   DataFrame.Display.Terminal.Colours,
+                   DataFrame.Internal.DataFrame,
+                   DataFrame.Internal.Row,
+                   DataFrame.Errors,
+                   DataFrame.Operations.Core,
+                   DataFrame.Operations.Subset,
+                   DataFrame.Operations.Sorting,
+                   DataFrame.Operations.Statistics,
+                   DataFrame.Operations.Transformations,
+                   DataFrame.Operations.Typing,
+                   DataFrame.Operations.Aggregation,
+                   DataFrame.Display.Terminal.Plot,
+                   DataFrame.IO.CSV
     build-depends:    base >= 4.17.2.0 && < 4.21,
                       array ^>= 0.5,
                       attoparsec >= 0.12 && <= 0.14.4,
@@ -58,25 +58,25 @@ library
 
 executable dataframe
     main-is:       Main.hs
-    other-modules: Data.DataFrame,
-                   Data.DataFrame.Internal.Types,
-                   Data.DataFrame.Internal.Function,
-                   Data.DataFrame.Internal.Parsing,
-                   Data.DataFrame.Internal.Column,
-                   Data.DataFrame.Display.Terminal.PrettyPrint,
-                   Data.DataFrame.Display.Terminal.Colours,
-                   Data.DataFrame.Internal.DataFrame,
-                   Data.DataFrame.Internal.Row,
-                   Data.DataFrame.Errors,
-                   Data.DataFrame.Operations.Core,
-                   Data.DataFrame.Operations.Subset,
-                   Data.DataFrame.Operations.Sorting,
-                   Data.DataFrame.Operations.Statistics,
-                   Data.DataFrame.Operations.Transformations,
-                   Data.DataFrame.Operations.Typing,
-                   Data.DataFrame.Operations.Aggregation,
-                   Data.DataFrame.Display.Terminal.Plot,
-                   Data.DataFrame.IO.CSV
+    other-modules: DataFrame,
+                   DataFrame.Internal.Types,
+                   DataFrame.Internal.Function,
+                   DataFrame.Internal.Parsing,
+                   DataFrame.Internal.Column,
+                   DataFrame.Display.Terminal.PrettyPrint,
+                   DataFrame.Display.Terminal.Colours,
+                   DataFrame.Internal.DataFrame,
+                   DataFrame.Internal.Row,
+                   DataFrame.Errors,
+                   DataFrame.Operations.Core,
+                   DataFrame.Operations.Subset,
+                   DataFrame.Operations.Sorting,
+                   DataFrame.Operations.Statistics,
+                   DataFrame.Operations.Transformations,
+                   DataFrame.Operations.Typing,
+                   DataFrame.Operations.Aggregation,
+                   DataFrame.Display.Terminal.Plot,
+                   DataFrame.IO.CSV
     build-depends:    base >= 4.17.2.0 && < 4.21,
                       array ^>= 0.5,
                       attoparsec >= 0.12 && <= 0.14.4,
 
@@ -58,7 +58,7 @@ python> df
 ```
 
 ```haskell
-ghci> import qualified Data.DataFrame as D
+ghci> import qualified DataFrame as D
 ghci> import qualified Data.Vector as V
 ghci> import System.Random (randomRIO)
 ghci> import Control.Monad (replicateM)
 
@@ -36,7 +36,7 @@ As a standalone dataframe script this would look like.
 
 
 ```haskell
-import qualified Data.DataFrame as D
+import qualified DataFrame as D
 import Data.Time.Calendar
 
 main :: IO ()
@@ -111,10 +111,10 @@ Would be written as:
 ```haskell
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE TypeApplications #-}
-import qualified Data.DataFrame as D
+import qualified DataFrame as D
 import qualified Data.Text as T
 
-import Data.DataFrame.Operations ( (|>) )
+import DataFrame ( (|>) )
 import Data.Time.Calendar
 
 main :: IO ()
@@ -133,10 +133,10 @@ Or, more clearly:
 ```haskell
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE TypeApplications #-}
-import qualified Data.DataFrame as D
+import qualified DataFrame as D
 import qualified Data.Text as T
 
-import Data.DataFrame.Operations ( (|>) )
+import DataFrame ( (|>) )
 import Data.Time.Calendar
 
 main :: IO ()
 
@@ -26,7 +26,7 @@ Univariate non-graphical analysis should give us a sense of the distribution of
 For categorical data the best univariate non-graphical analysis is a tabulation of the frequency of each category.
 
 ```haskell
-ghci> import qualified Data.DataFrame as D
+ghci> import qualified DataFrame as D
 ghci> D.frequencies "ocean_proximity" df
 
 ------------------------------------------------------------------------------
 
@@ -0,0 +1,87 @@
+# Haskell for Data Analysis
+
+This section ports/mirrors Wes McKinney's book [Python for Data Analysis](https://wesmckinney.com/book/). Examples and organization are drawn from there. This tutorial assumes an understanding of Haskell.
+
+## Data preparation
+Data in the wild doesn't always come in a form that's easy to work with. A data analysis tool should make preparing and cleaning data easy. There are a number of common issues that a data analysis tool must handle. We'll go through a few common ones and show how to deal with them in Haskell.
+
+### Handling missing data
+In Haskell, potentially missing values are represented by a "wrapper" type called [`Maybe`](https://en.wikibooks.org/wiki/Haskell/Understanding_monads/Maybe).
+
+```
+ghci> import qualified DataFrame as D
+ghci> let df = D.fromColumnList [D.toColumn [Just 1, Just 1, Nothing, Nothing], D.toColumn [Just 6.5, Nothing, Nothing, Just 6.5], D.toColumn [Just 3.0, Nothing, Nothing, Just 3.0]]
+ghci> df
+---------------------------------------------------
+index |       0       |      1       |      2      
+------|---------------|--------------|-------------
+ Int  | Maybe Integer | Maybe Double | Maybe Double
+------|---------------|--------------|-------------
+0     | Just 1        | Just 6.5     | Just 3.0    
+1     | Just 1        | Nothing      | Nothing     
+2     | Nothing       | Nothing      | Nothing     
+3     | Nothing       | Just 6.5     | Just 3.0    
+
+```
+
+If we'd like to drop all rows with missing values in a given column we can use the `filterJust` function.
+
+```haskell
+ghci> D.filterJust "0" df
+---------------------------------------------
+index |    0    |      1       |      2      
+------|---------|--------------|-------------
+ Int  | Integer | Maybe Double | Maybe Double
+------|---------|--------------|-------------
+0     | 1       | Just 6.5     | Just 3.0    
+1     | 1       | Nothing      | Nothing     
+```
+
+The function keeps only the rows where the given column is not `Nothing` and "unwraps" the `Maybe` type of that column. To remove every row that contains a `Nothing` in any column we use the `filterAllJust` function.
+
+```haskell
+ghci> D.filterAllJust df
+---------------------------------
+index |    0    |   1    |   2   
+------|---------|--------|-------
+ Int  | Integer | Double | Double
+------|---------|--------|-------
+0     | 1       | 6.5    | 3.0   
+```
+
+To fill in the missing values we use the `impute` function, which replaces all instances of `Nothing` in a column with a given value.
+
+```haskell
+ghci> D.impute "0" (0 :: Integer) df
+---------------------------------------------
+index |    0    |      1       |      2      
+------|---------|--------------|-------------
+ Int  | Integer | Maybe Double | Maybe Double
+------|---------|--------------|-------------
+0     | 1       | Just 6.5     | Just 3.0    
+1     | 1       | Nothing      | Nothing     
+2     | 0       | Nothing      | Nothing     
+3     | 0       | Just 6.5     | Just 3.0    
+```
+
+There is no general way to replace ALL `Nothing` values with a default since the default depends on the type of each column. In fact, trying to apply a function at the wrong type throws an error:
+
+```haskell
+ghci> D.impute @Double "0" 0 df
+*** Exception: 
+
+[Error]: Type Mismatch
+        While running your code I tried to get a column of type: "Maybe Double" but column was of type: "Maybe Integer"
+        This happened when calling function apply on the column 0
+
+
+
+        Try adding a type at the end of the function e.g change
+                apply arg1 arg2 to 
+                (apply arg1 arg2 :: <Type>)
+        or add {-# LANGUAGE TypeApplications #-} to the top of your file then change the call to 
+                apply @<Type> arg1 arg2
+```
+
+In general, Haskell would usually report a mistake like this as a compile-time error. But because dataframes are usually run in REPL-like environments which offer immediate feedback to users, `dataframe` is fine turning these into runtime exceptions.
+
@@ -0,0 +1,26 @@
+module DataFrame
+  ( module D,
+    (|>)
+  )
+where
+
+import DataFrame.Internal.Types as D
+import DataFrame.Internal.Function as D
+import DataFrame.Internal.Parsing as D
+import DataFrame.Internal.Column as D
+import DataFrame.Internal.DataFrame as D hiding (columnIndices, columns)
+import DataFrame.Internal.Row as D hiding (mkRowRep)
+import DataFrame.Errors as D
+import DataFrame.Operations.Core as D
+import DataFrame.Operations.Subset as D
+import DataFrame.Operations.Sorting as D
+import DataFrame.Operations.Statistics as D
+import DataFrame.Operations.Transformations as D
+import DataFrame.Operations.Typing as D
+import DataFrame.Operations.Aggregation as D
+import DataFrame.Display.Terminal.Plot as D
+import DataFrame.IO.CSV as D
+
+import Data.Function
+
+(|>) = (&)
@@ -1,4 +1,4 @@
-module Data.DataFrame.Display.Terminal.Colours where
+module DataFrame.Display.Terminal.Colours where
 
 -- terminal color functions
 red :: String -> String
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-module Data.DataFrame.Display.Terminal.Colours where`
	`1`	`+module DataFrame.Display.Terminal.Colours where`
`2`	`2`
`3`	`3`	`-- terminal color functions`
`4`	`4`	`red :: String -> String`