Skip to content

Commit 220750a

Browse files
authored
Switch to the DataScienceTraits.jl package (#210)
* Switch to the SciTypes.jl package
* SciTypes.jl is now DataScienceTraits.jl
* Update Coerce docstring
* Update Coerce code
* Update Coerce tests
1 parent 1480402 commit 220750a

File tree

11 files changed

+101
-58
lines changed

11 files changed

+101
-58
lines changed

Project.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ version = "1.15.4"
77
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
88
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
99
ColumnSelectors = "9cc86067-7e36-4c61-b350-1ac9833d277f"
10+
DataScienceTraits = "6cb2f572-2d2b-4ba6-bdb3-e710fa044d6c"
1011
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
1112
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1213
NelderMead = "2f6b4ddb-b4ff-44c0-b59b-2ab99302f970"
1314
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
1415
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
15-
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
1616
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1717
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
1818
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
@@ -24,10 +24,10 @@ Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
2424
AbstractTrees = "0.4"
2525
CategoricalArrays = "0.10"
2626
ColumnSelectors = "0.1"
27+
DataScienceTraits = "0.1"
2728
Distributions = "0.25"
2829
NelderMead = "0.4"
2930
PrettyTables = "2"
30-
ScientificTypes = "3.0"
3131
StatsBase = "0.33, 0.34"
3232
Tables = "1.6"
3333
Transducers = "0.4"

src/TableTransforms.jl

+2-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ module TableTransforms
66

77
using Tables
88
using Unitful
9-
using ScientificTypes
9+
using DataScienceTraits
1010
using Distributions: Normal
1111
using Transducers: tcollect
1212
using StatsBase: AbstractWeights
@@ -19,6 +19,7 @@ using CategoricalArrays
1919
using Random
2020
using NelderMead: optimise
2121

22+
using DataScienceTraits: SciType, Continuous, coerce
2223
using ColumnSelectors: ColumnSelector, SingleColumnSelector
2324
using ColumnSelectors: AllSelector, Column, selector, selectsingle
2425
using Unitful: AbstractQuantity, AffineQuantity, AffineUnits, Units

src/assertions.jl

+27-3
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
"""
66
SciTypeAssertion{T}(selector = AllSelector())
77
8-
Asserts that the columns in the `selector` have a scientific type `T`.
8+
Asserts that the elements of the columns in the `selector` have a scientific type `T`.
99
"""
10-
struct SciTypeAssertion{T,S<:ColumnSelector}
10+
struct SciTypeAssertion{T<:SciType,S<:ColumnSelector}
1111
selector::S
1212
end
1313

@@ -22,6 +22,30 @@ function (assertion::SciTypeAssertion{T})(table) where {T}
2222

2323
for nm in snames
2424
x = Tables.getcolumn(cols, nm)
25-
@assert elscitype(x) <: T "The column '$nm' is not of scientific type $T"
25+
@assert elscitype(x) <: T "the elements of the column '$nm' are not of scientific type $T"
26+
end
27+
end
28+
29+
"""
30+
ColumnTypeAssertion{T}(selector = AllSelector())
31+
32+
Asserts that the columns in the `selector` have a type `T`.
33+
"""
34+
struct ColumnTypeAssertion{T,S<:ColumnSelector}
35+
selector::S
36+
end
37+
38+
ColumnTypeAssertion{T}(selector::S) where {T,S<:ColumnSelector} = ColumnTypeAssertion{T,S}(selector)
39+
40+
ColumnTypeAssertion{T}() where {T} = ColumnTypeAssertion{T}(AllSelector())
41+
42+
function (assertion::ColumnTypeAssertion{T})(table) where {T}
43+
cols = Tables.columns(table)
44+
names = Tables.columnnames(cols)
45+
snames = assertion.selector(names)
46+
47+
for nm in snames
48+
x = Tables.getcolumn(cols, nm)
49+
@assert typeof(x) <: T "the column '$nm' is not of type $T"
2650
end
2751
end

src/transforms/coerce.jl

+27-15
Original file line numberDiff line numberDiff line change
@@ -3,47 +3,59 @@
33
# ------------------------------------------------------------------
44

55
"""
6-
Coerce(pairs, tight=false, verbosity=1)
6+
Coerce(col₁ => S₁, col₂ => S₂, ..., colₙ => Sₙ)
77
88
Return a copy of the table, ensuring that the scientific types of the columns match the new specification.
99
10-
This transform wraps the ScientificTypes.coerce function. Please see their docstring for more details.
10+
This transform uses the `DataScienceTraits.coerce` function. Please see its docstring for more details.
1111
1212
# Examples
1313
1414
```julia
15-
using ScientificTypes
16-
Coerce(:col1 => Continuous, :col2 => Count)
15+
import DataScienceTraits as DST
16+
Coerce(1 => DST.Continuous, 2 => DST.Continuous)
17+
Coerce(:a => DST.Continuous, :b => DST.Continuous)
18+
Coerce("a" => DST.Continuous, "b" => DST.Continuous)
1719
```
1820
"""
19-
struct Coerce{P} <: FeatureTransform
20-
pairs::P
21-
tight::Bool
22-
verbosity::Int
21+
struct Coerce{S<:ColumnSelector} <: StatelessFeatureTransform
22+
selector::S
23+
scitypes::Vector{DataType}
2324
end
2425

25-
Coerce(pairs::Pair{Symbol,<:Type}...; tight=false, verbosity=1) = Coerce(pairs, tight, verbosity)
26-
Coerce(pairs::Pair{<:AbstractString,<:Type}...; kwargs...) = Coerce((Symbol(k) => v for (k, v) in pairs)...; kwargs...)
26+
Coerce() = throw(ArgumentError("cannot create Coerce transform without arguments"))
27+
28+
Coerce(pairs::Pair{C,DataType}...) where {C<:Column} = Coerce(selector(first.(pairs)), collect(last.(pairs)))
2729

2830
isrevertible(::Type{<:Coerce}) = true
2931

3032
function applyfeat(transform::Coerce, feat, prep)
31-
newtable = coerce(feat, transform.pairs...; tight=transform.tight, verbosity=transform.verbosity)
32-
33+
cols = Tables.columns(feat)
34+
names = Tables.columnnames(cols)
3335
types = Tables.schema(feat).types
36+
snames = transform.selector(names)
37+
typedict = Dict(zip(snames, transform.scitypes))
38+
39+
columns = map(names) do name
40+
x = Tables.getcolumn(cols, name)
41+
name ∈ snames ? coerce(typedict[name], x) : x
42+
end
43+
44+
𝒯 = (; zip(names, columns)...)
45+
newfeat = 𝒯 |> Tables.materializer(feat)
3446

35-
newtable, types
47+
newfeat, types
3648
end
3749

3850
function revertfeat(::Coerce, newfeat, fcache)
3951
cols = Tables.columns(newfeat)
4052
names = Tables.columnnames(cols)
4153

42-
oldcols = map(zip(fcache, names)) do (T, n)
54+
columns = map(fcache, names) do T, n
4355
x = Tables.getcolumn(cols, n)
4456
collect(T, x)
4557
end
4658

47-
𝒯 = (; zip(names, oldcols)...)
59+
𝒯 = (; zip(names, columns)...)
4860
𝒯 |> Tables.materializer(newfeat)
4961
end

src/transforms/levels.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ Levels(pairs::Pair{C}...; ordered=nothing) where {C<:Column} =
2727

2828
Levels(; kwargs...) = throw(ArgumentError("cannot create Levels transform without arguments"))
2929

30-
assertions(transform::Levels) = [SciTypeAssertion{Finite}(transform.selector)]
30+
assertions(transform::Levels) = [ColumnTypeAssertion{CategoricalArray}(transform.selector)]
3131

3232
isrevertible(::Type{<:Levels}) = true
3333

src/transforms/onehot.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ end
2626

2727
OneHot(col::Column; categ=false) = OneHot(selector(col), categ)
2828

29-
assertions(transform::OneHot) = [SciTypeAssertion{Finite}(transform.selector)]
29+
assertions(transform::OneHot) = [ColumnTypeAssertion{CategoricalArray}(transform.selector)]
3030

3131
isrevertible(::Type{<:OneHot}) = true
3232

test/Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
[deps]
22
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
33
ColumnSelectors = "9cc86067-7e36-4c61-b350-1ac9833d277f"
4+
DataScienceTraits = "6cb2f572-2d2b-4ba6-bdb3-e710fa044d6c"
45
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
56
GR = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71"
67
ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
@@ -9,7 +10,6 @@ PairPlots = "43a3c2be-4208-490b-832a-a21dcd55d7da"
910
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
1011
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1112
ReferenceTests = "324d217c-45ce-50fc-942e-d289b448e8cf"
12-
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
1313
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1414
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
1515
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

test/assertions.jl

+15-8
Original file line numberDiff line numberDiff line change
@@ -8,23 +8,30 @@
88
table = Table(; a, b, c, d, e, f)
99

1010
selector = CS.selector([:a, :b])
11-
assertion = TT.SciTypeAssertion{Continuous}(selector)
11+
assertion = TT.SciTypeAssertion{DST.Continuous}(selector)
1212
@test isnothing(assertion(table))
1313
selector = CS.selector([:a, :b, :c])
14-
assertion = TT.SciTypeAssertion{Continuous}(selector)
14+
assertion = TT.SciTypeAssertion{DST.Continuous}(selector)
1515
@test_throws AssertionError assertion(table)
1616

1717
selector = CS.selector([:c, :d])
18-
assertion = TT.SciTypeAssertion{Count}(selector)
18+
assertion = TT.SciTypeAssertion{DST.Categorical}(selector)
1919
@test isnothing(assertion(table))
20-
selector = CS.selector([:c, :d, :e])
21-
assertion = TT.SciTypeAssertion{Count}(selector)
20+
selector = CS.selector([:a, :c, :d])
21+
assertion = TT.SciTypeAssertion{DST.Categorical}(selector)
2222
@test_throws AssertionError assertion(table)
2323

2424
selector = CS.selector([:e, :f])
25-
assertion = TT.SciTypeAssertion{Finite}(selector)
25+
assertion = TT.SciTypeAssertion{DST.Categorical}(selector)
2626
@test isnothing(assertion(table))
27-
selector = CS.selector([:d, :e, :f])
28-
assertion = TT.SciTypeAssertion{Finite}(selector)
27+
selector = CS.selector([:b, :e, :f])
28+
assertion = TT.SciTypeAssertion{DST.Categorical}(selector)
29+
@test_throws AssertionError assertion(table)
30+
31+
selector = CS.selector([:e, :f])
32+
assertion = TT.ColumnTypeAssertion{CategoricalArray}(selector)
33+
@test isnothing(assertion(table))
34+
selector = CS.selector([:b, :e, :f])
35+
assertion = TT.ColumnTypeAssertion{CategoricalArray}(selector)
2936
@test_throws AssertionError assertion(table)
3037
end

test/runtests.jl

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@ using Tables
44
using Unitful
55
using TypedTables
66
using CategoricalArrays
7-
using ScientificTypes: Continuous, Count, Finite, Multiclass
87
using LinearAlgebra
98
using Statistics
109
using Test, Random, Plots
1110
using ReferenceTests, ImageIO
1211
using StatsBase
1312
using PairPlots
13+
1414
import ColumnSelectors as CS
15+
import DataScienceTraits as DST
1516

1617
const TT = TableTransforms
1718

test/shows.jl

+4-5
Original file line numberDiff line numberDiff line change
@@ -200,19 +200,18 @@
200200
end
201201

202202
@testset "Coerce" begin
203-
T = Coerce(:a => Continuous, :b => Count)
203+
T = Coerce(:a => DST.Continuous, :b => DST.Categorical)
204204

205205
# compact mode
206206
iostr = sprint(show, T)
207-
@test iostr == "Coerce((:a => Continuous, :b => Count), false, 1)"
207+
@test iostr == "Coerce([:a, :b], DataType[DataScienceTraits.Continuous, DataScienceTraits.Categorical])"
208208

209209
# full mode
210210
iostr = sprint(show, MIME("text/plain"), T)
211211
@test iostr == """
212212
Coerce transform
213-
├─ pairs = (:a => Continuous, :b => Count)
214-
├─ tight = false
215-
└─ verbosity = 1"""
213+
├─ selector = [:a, :b]
214+
└─ scitypes = DataType[DataScienceTraits.Continuous, DataScienceTraits.Categorical]"""
216215
end
217216

218217
@testset "Levels" begin

test/transforms/coerce.jl

+19-20
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,38 @@
11
@testset "Coerce" begin
2-
x1 = [1.0, 2.0, 3.0, 4.0, 5.0]
3-
x2 = [1.0, 2.0, 3.0, 4.0, 5.0]
4-
x3 = [5.0, 5.0, 5.0, 5.0, 5.0]
5-
t = Table(; x1, x2, x3)
2+
a = [1, 2, 3, 4, 5]
3+
b = [1.0, 2.0, 3.0, 4.0, 5.0]
4+
t = Table(; a, b)
65

7-
T = Coerce(:x1 => Count, :x2 => Count)
6+
T = Coerce(1 => DST.Continuous, 2 => DST.Categorical)
87
n, c = apply(T, t)
9-
@test eltype(n.x1) == Int
10-
@test eltype(n.x2) == Int
8+
@test eltype(n.a) <: Float64
9+
@test eltype(n.b) <: Int
1110
n, c = apply(T, t)
1211
tₒ = revert(T, n, c)
13-
@test eltype(tₒ.x1) == eltype(t.x1)
14-
@test eltype(tₒ.x2) == eltype(t.x2)
12+
@test eltype(tₒ.a) == eltype(t.a)
13+
@test eltype(tₒ.b) == eltype(t.b)
1514

16-
T = Coerce(:x1 => Multiclass, :x2 => Multiclass)
15+
T = Coerce(:a => DST.Continuous, :b => DST.Categorical)
1716
n, c = apply(T, t)
18-
@test eltype(n.x1) <: CategoricalValue
19-
@test eltype(n.x2) <: CategoricalValue
17+
@test eltype(n.a) <: Float64
18+
@test eltype(n.b) <: Int
2019
n, c = apply(T, t)
2120
tₒ = revert(T, n, c)
22-
@test eltype(tₒ.x1) == eltype(t.x1)
23-
@test eltype(tₒ.x2) == eltype(t.x2)
21+
@test eltype(tₒ.a) == eltype(t.a)
22+
@test eltype(tₒ.b) == eltype(t.b)
2423

25-
T = Coerce("x1" => Multiclass, "x2" => Multiclass)
24+
T = Coerce("a" => DST.Continuous, "b" => DST.Categorical)
2625
n, c = apply(T, t)
27-
@test eltype(n.x1) <: CategoricalValue
28-
@test eltype(n.x2) <: CategoricalValue
26+
@test eltype(n.a) <: Float64
27+
@test eltype(n.b) <: Int
2928
n, c = apply(T, t)
3029
tₒ = revert(T, n, c)
31-
@test eltype(tₒ.x1) == eltype(t.x1)
32-
@test eltype(tₒ.x2) == eltype(t.x2)
30+
@test eltype(tₒ.a) == eltype(t.a)
31+
@test eltype(tₒ.b) == eltype(t.b)
3332

3433
# row table
3534
rt = Tables.rowtable(t)
36-
T = Coerce(:x1 => Count, :x2 => Count)
35+
T = Coerce(:a => DST.Continuous, :b => DST.Categorical)
3736
n, c = apply(T, rt)
3837
@test Tables.isrowtable(n)
3938
rtₒ = revert(T, n, c)

0 commit comments

Comments
 (0)