Skip to content

Commit 4401c74

Browse files
eliascarv and juliohm authored
Refactor Levels Transform (#91)
* Refactor Levels transform
* Add nothing to ColSpec
* Update ColSpec tests
* Remove unused constructor
* Update docstring
* Update Levels tests
* Add more tests
* Add header in levels.jl
* Update _categorical function
* Update choose docstring
* Add and update old tests
* Apply suggestions
  Co-authored-by: Júlio Hoffimann <[email protected]>
* Fix typo
  Co-authored-by: Júlio Hoffimann <[email protected]>
1 parent 90a30ba commit 4401c74

File tree

4 files changed

+131
-51
lines changed

4 files changed

+131
-51
lines changed

src/colspec.jl

+14-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ function MyTransform(args::T...) where {T<:ColSelector}
4040
end
4141
```
4242
"""
43-
const ColSpec = Union{Vector{T},NTuple{N,T},Regex,Colon} where {N,T<:ColSelector}
43+
const ColSpec = Union{Vector{T},NTuple{N,T},Regex,Colon,Nothing} where {N,T<:ColSelector}
4444

4545
"""
4646
choose(colspec::ColSpec, names) -> Vector{Symbol}
@@ -69,6 +69,18 @@ julia> choose(r"[ace]", names)
6969
:a
7070
:c
7171
:e
72+
73+
julia> choose(:, names)
74+
6-element Vector{Symbol}:
75+
:a
76+
:b
77+
:c
78+
:d
79+
:e
80+
:f
81+
82+
julia> choose(nothing, names)
83+
Symbol[]
7284
```
7385
"""
7486
function choose(colspec::Vector{Symbol}, names)
@@ -100,3 +112,4 @@ choose(colspec::Regex, names::Tuple) =
100112

101113
choose(::Colon, names::Vector) = names
102114
choose(::Colon, names::Tuple) = collect(names)
115+
choose(::Nothing, names) = Symbol[]

src/transforms/levels.jl

+43-39
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,76 @@
1+
# ------------------------------------------------------------------
2+
# Licensed under the MIT License. See LICENSE in the project root.
3+
# ------------------------------------------------------------------
4+
15
"""
2-
Levels(:a => ["yes", "no"])
6+
Levels(col₁ => levels₁, col₂ => levels₂, ..., colₙ => levelsₙ; ordered=nothing)
37
4-
Return a copy of the table with specified levels and orders for categorical columns
5-
allowing only changing the order of the column.
8+
Convert columns `col₁`, `col₂`, ..., `colₙ` to categorical arrays with given levels `levels₁`, `levels₂`, ..., `levelsₙ`.
9+
Optionally, specify which columns are `ordered`.
610
711
# Examples
812
913
```julia
10-
Levels(:a => ["yes, "no"], :c => [1, 2, 4], :d => ["a", "b", "c"])
11-
Levels("a" => ["yes", "no"], "c" => [1, 2, 4], ordered = ["a", "c"])
12-
Levels(:a => ["yes", "no"], :c => [1, 2, 4], :d => [1, 23, 5, 7], ordered = [:a, :b, :c])
14+
Levels(1 => 1:3, 2 => ["a", "b"], ordered=r"a")
15+
Levels(:a => 1:3, :b => ["a", "b"], ordered=[:a])
16+
Levels("a" => 1:3, "b" => ["a", "b"], ordered=["b"])
1317
```
1418
"""
15-
struct Levels{K} <: Stateless
16-
levelspec::K
17-
ordered::Vector{Symbol}
19+
struct Levels{S<:ColSpec,O<:ColSpec,L} <: Stateless
20+
colspec::S
21+
ordered::O
22+
levels::L
1823
end
1924

20-
Levels(pairs::Pair{Symbol}...; ordered=Symbol[]) =
21-
Levels(NamedTuple(pairs), ordered)
25+
Levels(pairs::Pair{T}...; ordered::ColSpec=nothing) where {T<:ColSelector} =
26+
Levels(first.(pairs), ordered, last.(pairs))
2227

23-
Levels(pairs::Pair{K}...; ordered=K[]) where {K<:AbstractString} =
24-
Levels(NamedTuple(Symbol(k) => v for (k,v) in pairs), Symbol.(ordered))
28+
Levels(; kwargs...) = throw(ArgumentError("Cannot create a Levels object without arguments."))
2529

2630
isrevertible(transform::Levels) = true
2731

28-
# when the col is already a categorical array and wanna change levels and order
29-
_categorify(l::AbstractVector, x::CategoricalVector, o) =
30-
categorical(x, levels=l, ordered=o), levels(x)
31-
32-
# when the col is normal array and want to change to categorical array
33-
_categorify(l::AbstractVector, x::AbstractVector, o) =
34-
categorical(x, levels=l, ordered=o), unwrap
32+
_categorical(x::AbstractVector, l, o) =
33+
categorical(x, levels=l, ordered=o), y -> unwrap.(y)
3534

36-
# when the col is not need for change or convert back to normal array
37-
_categorify(f::Function, x::AbstractVector, o) =
38-
o ? (categorical(x, ordered=true), levels(x)) : (f.(x), f)
35+
function _categorical(x::CategoricalArray, l, o)
36+
xl, xo = levels(x), isordered(x)
37+
revfunc = y -> categorical(y, levels=xl, ordered=xo)
38+
categorical(x, levels=l, ordered=o), revfunc
39+
end
3940

4041
function apply(transform::Levels, table)
4142
cols = Tables.columns(table)
4243
names = Tables.columnnames(cols)
43-
44-
result = map(names) do nm
44+
snames = choose(transform.colspec, names)
45+
ordered = choose(transform.ordered, snames)
46+
levels = transform.levels
47+
48+
results = map(names) do nm
4549
x = Tables.getcolumn(cols, nm)
46-
l = get(transform.levelspec, nm, identity)
47-
o = nm ∈ transform.ordered
48-
_categorify(l, x, o)
50+
if nm ∈ snames
51+
o = nm ∈ ordered
52+
l = levels[findfirst(==(nm), snames)]
53+
return _categorical(x, l, o)
54+
end
55+
x, identity
4956
end
50-
51-
categ = first.(result)
52-
cache = last.(result)
5357

54-
𝒯 = (; zip(names, categ)...)
58+
columns, cache = first.(results), last.(results)
59+
60+
𝒯 = (; zip(names, columns)...)
5561
newtable = 𝒯 |> Tables.materializer(table)
56-
5762
newtable, cache
5863
end
5964

60-
function revert(transform::Levels, newtable, cache)
65+
function revert(::Levels, newtable, cache)
6166
cols = Tables.columns(newtable)
6267
names = Tables.columnnames(cols)
6368

64-
ocols = map(zip(cache, names)) do (f, nm)
69+
columns = map(names, cache) do nm, revfunc
6570
x = Tables.getcolumn(cols, nm)
66-
c, _ = _categorify(f, x, false)
67-
c
71+
revfunc(x)
6872
end
6973

70-
𝒯 = (; zip(names, ocols)...)
74+
𝒯 = (; zip(names, columns)...)
7175
𝒯 |> Tables.materializer(newtable)
72-
end
76+
end

test/colspec.jl

+6
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@
5757
snames = TableTransforms.choose(:, tupnames)
5858
@test snames == [:a, :b, :c, :d, :e, :f]
5959

60+
# nothing
61+
snames = TableTransforms.choose(nothing, vecnames)
62+
@test snames == Symbol[]
63+
snames = TableTransforms.choose(nothing, tupnames)
64+
@test snames == Symbol[]
65+
6066
# throws
6167
@test_throws AssertionError TableTransforms.choose(r"x", vecnames)
6268
@test_throws AssertionError TableTransforms.choose(r"x", tupnames)

test/transforms.jl

+68-11
Original file line numberDiff line numberDiff line change
@@ -1275,8 +1275,39 @@
12751275
n, c = apply(T, t)
12761276
@test_throws AssertionError revert(T, n, c)
12771277
end
1278-
1278+
12791279
@testset "Levels" begin
1280+
a = rand([true, false], 50)
1281+
b = rand(["y", "n"], 50)
1282+
c = rand(1:3, 50)
1283+
t = Table(; a, b, c)
1284+
1285+
T = Levels(2 => ["n", "y", "m"])
1286+
n, c = apply(T, t)
1287+
@test levels(n.b) == ["n", "y", "m"]
1288+
@test isordered(n.b) == false
1289+
tₒ = revert(T, n, c)
1290+
@test tₒ == t
1291+
1292+
T = Levels(:b => ["n", "y", "m"], :c => 1:4, ordered=[:c])
1293+
n, c = apply(T, t)
1294+
@test levels(n.b) == ["n", "y", "m"]
1295+
@test isordered(n.b) == false
1296+
@test levels(n.c) == [1, 2, 3, 4]
1297+
@test isordered(n.c) == true
1298+
tₒ = revert(T, n, c)
1299+
@test tₒ == t
1300+
1301+
T = Levels("b" => ["n", "y", "m"], "c" => 1:4, ordered=["b"])
1302+
n, c = apply(T, t)
1303+
@test levels(n.b) == ["n", "y", "m"]
1304+
@test isordered(n.b) == true
1305+
@test levels(n.c) == [1, 2, 3, 4]
1306+
@test isordered(n.c) == false
1307+
tₒ = revert(T, n, c)
1308+
@test tₒ == t
1309+
1310+
# categorical columns
12801311
a = categorical(["yes", "no", "no", "no", "yes"])
12811312
b = categorical([1, 2, 4, 2, 8], ordered=false)
12821313
c = categorical([1, 2, 1, 2, 1])
@@ -1288,6 +1319,7 @@
12881319
n, c = apply(T, t)
12891320
@test levels(n.a) == ["yes", "no"]
12901321
@test levels(n.c) == [1, 2, 4]
1322+
@test levels(n.d) == [1, 23, 5, 7]
12911323
@test levels(n.e) == [1, 2, 3, 4, 5]
12921324
tₒ = revert(T, n, c)
12931325
@test levels(tₒ.a) == ["no", "yes"]
@@ -1302,35 +1334,60 @@
13021334
@test levels(tₒ.a) == ["no", "yes"]
13031335
@test levels(tₒ.c) == [1, 2]
13041336

1305-
T = Levels(:a => ["yes", "no"], :c => [1, 2, 4], :d => [1, 23, 5, 7], ordered=[:b])
1337+
T = Levels(:a => ["yes", "no"], :c => [1, 2, 4], :d => [1, 23, 5, 7])
13061338
n, c = apply(T, t)
13071339
@test levels(n.a) == ["yes", "no"]
1308-
@test isordered(n.b) == true
13091340
@test levels(n.c) == [1, 2, 4]
1341+
@test levels(n.d) == [1, 23, 5, 7]
13101342
tₒ = revert(T, n, c)
13111343
@test levels(tₒ.a) == ["no", "yes"]
13121344
@test levels(tₒ.c) == [1, 2]
1313-
@test isordered(tₒ.b) == false
13141345

1315-
T = Levels("a" => ["yes", "no"], "c" => [1, 2, 4], "e" => 5:-1:1, ordered=["b", "e"])
1346+
T = Levels("a" => ["yes", "no"], "c" => [1, 2, 4], "e" => 5:-1:1, ordered=["e"])
13161347
n, c = apply(T, t)
1317-
@test isordered(n.b) == true
1318-
@test levels(n.e) == [5, 4, 3, 2, 1]
1348+
@test levels(n.a) == ["yes", "no"]
13191349
@test levels(n.c) == [1, 2, 4]
1350+
@test levels(n.e) == [5, 4, 3, 2, 1]
1351+
@test isordered(n.a) == false
1352+
@test isordered(n.c) == false
1353+
@test isordered(n.e) == true
13201354
tₒ = revert(T, n, c)
1321-
@test isordered(tₒ.b) == false
13221355
@test levels(tₒ.e) == [1, 2, 3, 4]
1356+
@test isordered(tₒ.e) == false
13231357

1324-
T = Levels(:a => ["yes", "no"], :c => [1, 2, 4], :d => [1, 23, 5, 7], ordered=[:a, :b, :d])
1358+
T = Levels(:a => ["yes", "no"], :c => [1, 2, 4], :d => [1, 23, 5, 7], ordered=[:a, :d])
13251359
n, c = apply(T, t)
13261360
@test levels(n.a) == ["yes", "no"]
1361+
@test levels(n.c) == [1, 2, 4]
1362+
@test levels(n.d) == [1, 23, 5, 7]
13271363
@test isordered(n.a) == true
1328-
@test isordered(n.b) == true
1364+
@test isordered(n.c) == false
13291365
@test isordered(n.d) == true
13301366
tₒ = revert(T, n, c)
13311367
@test typeof(tₒ.d) == Vector{Int64}
13321368
@test isordered(tₒ.a) == false
1333-
@test isordered(tₒ.b) == false
1369+
1370+
a = rand([true, false], 50)
1371+
b = rand(["y", "n"], 50)
1372+
c = rand(1:3, 50)
1373+
t = Table(; a, b, c)
1374+
1375+
# throws: Levels without arguments
1376+
@test_throws ArgumentError Levels()
1377+
1378+
# throws: columns that do not exist in the original table
1379+
T = Levels(:x => ["n", "y", "m"], :y => 1:4)
1380+
@test_throws AssertionError apply(T, t)
1381+
T = Levels("x" => ["n", "y", "m"], "y" => 1:4)
1382+
@test_throws AssertionError apply(T, t)
1383+
1384+
# throws: invalid ordered column selection
1385+
T = Levels(:b => ["n", "y", "m"], :c => 1:4, ordered=[:a])
1386+
@test_throws AssertionError apply(T, t)
1387+
T = Levels("b" => ["n", "y", "m"], "c" => 1:4, ordered=["a"])
1388+
@test_throws AssertionError apply(T, t)
1389+
T = Levels("b" => ["n", "y", "m"], "c" => 1:4, ordered=r"xy")
1390+
@test_throws AssertionError apply(T, t)
13341391
end
13351392

13361393
@testset "Sort" begin

0 commit comments

Comments
 (0)