Skip to content

Commit 2ee2b76

Browse files
authored
Refactor Replace transform (#198)
* [WIP] Refactor 'Replace' transform * Add support for ColSpec in Replace & Update Map docstring * Update variable name * Add comments * Fix type promotion * Add tests * Fix typo * Update docstring * Fix typo * Update docstring * Update docstring
1 parent d414cd3 commit 2ee2b76

File tree

5 files changed

+204
-43
lines changed

5 files changed

+204
-43
lines changed

src/transforms/map.jl

+21-19
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,18 @@
33
# ------------------------------------------------------------------
44

55
"""
6-
Map(cols₁ => fun₁, cols₂ => fun₂ => target₂, ..., colsₙ => funₙ => targetₙ)
6+
Map(cols₁ => fun₁ => target₁, cols₂ => fun₂, ..., colsₙ => funₙ => targetₙ)
77
88
Applies the `funᵢ` function to the columns selected by `colsᵢ` using
99
the `map` function and saves the result in a new column named `targetᵢ`.
10-
The target column name is optional and when omitted a new name is generated
11-
by joining the selected column names with the function name.
10+
11+
The column selection can be a single column identifier (index or name),
12+
a collection of identifiers or a regular expression (regex).
13+
14+
Passing a target column name is optional and when omitted a new name
15+
is generated by joining the selected column names with the function name.
1216
If the target column already exists in the table, the original
13-
column will be replaced. The column selection can be a single
14-
column identifier (index or name), a collection of identifiers
15-
or a regular expression (regex).
17+
column will be replaced.
1618
1719
# Examples
1820
@@ -24,14 +26,18 @@ Map([:a, :c] => ((a, c) -> 2a * 3c) => :col1)
2426
Map(["c", "a"] => ((c, a) -> 3c / a) => :col1, "c" => tan)
2527
Map(r"[abc]" => ((a, b, c) -> a^2 - 2b + c) => "col1")
2628
```
29+
30+
## Notes
31+
32+
* Anonymous functions must be passed with parentheses as in the examples above.
2733
"""
2834
struct Map <: StatelessFeatureTransform
2935
colspecs::Vector{ColSpec}
3036
funs::Vector{Function}
3137
targets::Vector{Union{Nothing,Symbol}}
3238
end
3339

34-
isrevertible(::Type{Map}) = true
40+
Map() = throw(ArgumentError("cannot create a Map transform without arguments"))
3541

3642
# utility types
3743
const TargetName = Union{Symbol,AbstractString}
@@ -40,23 +46,19 @@ const PairWithoutTarget = Pair{<:Any,<:Function}
4046
const MapPair = Union{PairWithTarget,PairWithoutTarget}
4147

4248
# utility functions
43-
_extract(p::PairWithTarget) = first(p), first(last(p)), last(last(p))
44-
_extract(p::PairWithoutTarget) = first(p), last(p), nothing
45-
46-
_target(name) = name
47-
_target(name::AbstractString) = Symbol(name)
49+
_extract(p::PairWithTarget) = colspec(first(p)), first(last(p)), Symbol(last(last(p)))
50+
_extract(p::PairWithoutTarget) = colspec(first(p)), last(p), nothing
4851

4952
function Map(pairs::MapPair...)
50-
tuples = map(pairs) do p
51-
spec, fun, name = _extract(p)
52-
(colspec(spec), fun, _target(name))
53-
end
54-
colspecs = map(t -> t[1], tuples) |> collect
55-
funs = map(t -> t[2], tuples) |> collect
56-
targets = map(t -> t[3], tuples) |> collect
53+
tuples = map(_extract, pairs)
54+
colspecs = [t[1] for t in tuples]
55+
funs = [t[2] for t in tuples]
56+
targets = [t[3] for t in tuples]
5757
Map(colspecs, funs, targets)
5858
end
5959

60+
isrevertible(::Type{Map}) = true
61+
6062
_makename(snames, fun) = Symbol(join([snames; nameof(fun)], "_"))
6163

6264
function preprocess(transform::Map, table)

src/transforms/replace.jl

+89-20
Original file line numberDiff line numberDiff line change
@@ -3,42 +3,111 @@
33
# ------------------------------------------------------------------
44

55
"""
6-
Replace(old₁ => new₁, old₂ => new₂, ..., oldₙ => newₙ)
6+
Replace(cols₁ => pred₁ => new₁, pred₂ => new₂, ..., colsₙ => predₙ => newₙ)
77
8-
Replaces `oldᵢ` value with `newᵢ` value in the table.
8+
Replaces all values where `predᵢ` predicate returns `true` with `newᵢ` value
9+
in the columns selected by `colsᵢ`.
10+
11+
Passing a column selection is optional and when omitted all columns in the table
12+
will be selected. The column selection can be a single column identifier (index or name),
13+
a collection of identifiers, or a regular expression (regex).
14+
15+
The predicate can be a function that accepts a single argument
16+
and returns a boolean, or a value. If the predicate is a value,
17+
it will be transformed into the following function: `x -> x === value`.
918
1019
# Examples
1120
1221
```julia
1322
Replace(1 => -1, 5 => -5)
14-
Replace(1 => 1.5, 5 => 5.5, 4 => true)
23+
Replace(2 => 0.0 => 1.5, 5.0 => 5.5)
24+
Replace(:b => 0.0 => 1.5, 5.0 => 5.5)
25+
Replace("b" => 0.0 => 1.5, 5.0 => 5.5)
26+
Replace([1, 3] => >(5) => 5)
27+
Replace([:a, :c] => isequal(2) => -2)
28+
Replace(["a", "c"] => (x -> 4 < x < 6) => 0)
29+
Replace(r"[abc]" => (x -> isodd(x) && x > 10) => 2)
1530
```
31+
32+
## Notes
33+
34+
* Anonymous functions must be passed with parentheses as in the examples above.
35+
* Replacements are applied in the sequence in which they are defined, therefore,
36+
if there is more than one replacement for the same column, the first one whose predicate returns `true` will be applied.
1637
"""
17-
struct Replace{K,V} <: StatelessFeatureTransform
18-
pairs::IdDict{K,V}
38+
struct Replace <: StatelessFeatureTransform
39+
colspecs::Vector{ColSpec}
40+
preds::Vector{Function}
41+
news::Vector{Any}
1942
end
2043

21-
Replace() = throw(ArgumentError("Cannot create a Replace object without arguments."))
44+
Replace() = throw(ArgumentError("cannot create a Replace transform without arguments"))
2245

23-
Replace(pairs::Pair...) = Replace(IdDict(values(pairs)))
46+
# utility functions
47+
_extract(p::Pair) = AllSpec(), _pred(first(p)), last(p)
48+
_extract(p::Pair{<:Any,<:Pair}) = colspec(first(p)), _pred(first(last(p))), last(last(p))
49+
50+
_pred(f::Function) = f
51+
_pred(v) = Base.Fix2(===, v)
52+
53+
function Replace(pairs::Pair...)
54+
tuples = map(_extract, pairs)
55+
colspecs = [t[1] for t in tuples]
56+
preds = [t[2] for t in tuples]
57+
news = Any[t[3] for t in tuples]
58+
Replace(colspecs, preds, news)
59+
end
2460

2561
isrevertible(::Type{<:Replace}) = true
2662

27-
function applyfeat(transform::Replace, feat, prep)
63+
function preprocess(transform::Replace, table)
64+
cols = Tables.columns(table)
65+
names = Tables.columnnames(cols)
66+
67+
colspecs = transform.colspecs
68+
preds = transform.preds
69+
news = transform.news
70+
71+
# column replacements
72+
colreps = map(colspecs, preds, news) do colspec, pred, new
73+
snames = choose(colspec, names)
74+
snames => pred => new
75+
end
76+
77+
# join replacements of each column
78+
map(names) do name
79+
pairs = filter(p -> name ∈ first(p), colreps)
80+
reps = isempty(pairs) ? nothing : map(last, pairs)
81+
name => reps
82+
end
83+
end
84+
85+
function applyfeat(::Replace, feat, prep)
2886
cols = Tables.columns(feat)
2987
names = Tables.columnnames(cols)
3088

31-
olds = keys(transform.pairs)
32-
values = map(names) do nm
33-
x = Tables.getcolumn(cols, nm)
34-
y = [get(transform.pairs, xᵢ, xᵢ) for xᵢ in x]
35-
inds = [findall(xᵢ -> xᵢ === old, x) .=> old for old in olds]
36-
rev = Dict(reduce(vcat, inds))
37-
y, rev
89+
tuples = map(prep) do (name, reps)
90+
x = Tables.getcolumn(cols, name)
91+
if isnothing(reps)
92+
x, nothing
93+
else
94+
# reversal dict
95+
rev = Dict{Int,eltype(x)}()
96+
y = map(enumerate(x)) do (i, v)
97+
for (pred, new) in reps
98+
if pred(v)
99+
rev[i] = v
100+
return new
101+
end
102+
end
103+
v
104+
end
105+
y, rev
106+
end
38107
end
39108

40-
columns = first.(values)
41-
fcache = last.(values)
109+
columns = first.(tuples)
110+
fcache = last.(tuples)
42111

43112
𝒯 = (; zip(names, columns)...)
44113
newfeat = 𝒯 |> Tables.materializer(feat)
@@ -49,9 +118,9 @@ function revertfeat(::Replace, newfeat, fcache)
49118
cols = Tables.columns(newfeat)
50119
names = Tables.columnnames(cols)
51120

52-
columns = map(names, fcache) do nm, rev
53-
y = Tables.getcolumn(cols, nm)
54-
[get(rev, i, y[i]) for i in eachindex(y)]
121+
columns = map(names, fcache) do name, rev
122+
y = Tables.getcolumn(cols, name)
123+
isnothing(rev) ? y : [get(rev, i, y[i]) for i in 1:length(y)]
55124
end
56125

57126
𝒯 = (; zip(names, columns)...)

test/shows.jl

+5-4
Original file line numberDiff line numberDiff line change
@@ -152,18 +152,19 @@
152152
end
153153

154154
@testset "Replace" begin
155-
pairs = IdDict(1 => -1, 5 => -5)
156-
T = Replace(pairs)
155+
T = Replace(1 => -1, 5 => -5)
157156

158157
# compact mode
159158
iostr = sprint(show, T)
160-
@test iostr == "Replace($pairs)"
159+
@test iostr == "Replace(TableTransforms.ColSpec[all, all], Function[Base.Fix2{typeof(===), Int64}(===, 1), Base.Fix2{typeof(===), Int64}(===, 5)], Any[-1, -5])"
161160

162161
# full mode
163162
iostr = sprint(show, MIME("text/plain"), T)
164163
@test iostr == """
165164
Replace transform
166-
└─ pairs = $pairs"""
165+
├─ colspecs = TableTransforms.ColSpec[all, all]
166+
├─ preds = Function[Base.Fix2{typeof(===), Int64}(===, 1), Base.Fix2{typeof(===), Int64}(===, 5)]
167+
└─ news = Any[-1, -5]"""
167168
end
168169

169170
@testset "Coalesce" begin

test/transforms/map.jl

+3
Original file line numberDiff line numberDiff line change
@@ -68,4 +68,7 @@
6868
@test n.op1 == @. t.a^2 - 2 * t.b + t.c
6969
tₒ = revert(T, n, c)
7070
@test t == tₒ
71+
72+
# throws
73+
@test_throws ArgumentError Map()
7174
end

test/transforms/replace.jl

+86
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,92 @@
2020
tₒ = revert(T, n, c)
2121
@test t == tₒ
2222

23+
# with colspec
24+
T = Replace(2 => 4 => -4, :c => 1 => -1, "e" => 5 => -5)
25+
n, c = apply(T, t)
26+
@test n.a == [3, 2, 1, 4, 5, 3]
27+
@test n.b == [2, -4, -4, 5, 8, 5]
28+
@test n.c == [-1, -1, 6, 2, 4, -1]
29+
@test n.d == [4, 3, 7, 5, 4, 1]
30+
@test n.e == [-5, -5, 2, 6, -5, 2]
31+
@test n.f == [4, 4, 3, 4, 5, 2]
32+
@test isrevertible(T)
33+
tₒ = revert(T, n, c)
34+
@test t == tₒ
35+
36+
T = Replace([2, 5] => 5 => -5)
37+
n, c = apply(T, t)
38+
@test n.a == [3, 2, 1, 4, 5, 3]
39+
@test n.b == [2, 4, 4, -5, 8, -5]
40+
@test n.c == [1, 1, 6, 2, 4, 1]
41+
@test n.d == [4, 3, 7, 5, 4, 1]
42+
@test n.e == [-5, -5, 2, 6, -5, 2]
43+
@test n.f == [4, 4, 3, 4, 5, 2]
44+
@test isrevertible(T)
45+
tₒ = revert(T, n, c)
46+
@test t == tₒ
47+
48+
T = Replace([:b, :e] => 5 => -5)
49+
n, c = apply(T, t)
50+
@test n.a == [3, 2, 1, 4, 5, 3]
51+
@test n.b == [2, 4, 4, -5, 8, -5]
52+
@test n.c == [1, 1, 6, 2, 4, 1]
53+
@test n.d == [4, 3, 7, 5, 4, 1]
54+
@test n.e == [-5, -5, 2, 6, -5, 2]
55+
@test n.f == [4, 4, 3, 4, 5, 2]
56+
@test isrevertible(T)
57+
tₒ = revert(T, n, c)
58+
@test t == tₒ
59+
60+
T = Replace(["b", "e"] => 5 => -5)
61+
n, c = apply(T, t)
62+
@test n.a == [3, 2, 1, 4, 5, 3]
63+
@test n.b == [2, 4, 4, -5, 8, -5]
64+
@test n.c == [1, 1, 6, 2, 4, 1]
65+
@test n.d == [4, 3, 7, 5, 4, 1]
66+
@test n.e == [-5, -5, 2, 6, -5, 2]
67+
@test n.f == [4, 4, 3, 4, 5, 2]
68+
@test isrevertible(T)
69+
tₒ = revert(T, n, c)
70+
@test t == tₒ
71+
72+
T = Replace(r"[be]" => 5 => -5)
73+
n, c = apply(T, t)
74+
@test n.a == [3, 2, 1, 4, 5, 3]
75+
@test n.b == [2, 4, 4, -5, 8, -5]
76+
@test n.c == [1, 1, 6, 2, 4, 1]
77+
@test n.d == [4, 3, 7, 5, 4, 1]
78+
@test n.e == [-5, -5, 2, 6, -5, 2]
79+
@test n.f == [4, 4, 3, 4, 5, 2]
80+
@test isrevertible(T)
81+
tₒ = revert(T, n, c)
82+
@test t == tₒ
83+
84+
# with predicates
85+
T = Replace([:b, :d] => >(4) => 0)
86+
n, c = apply(T, t)
87+
@test n.a == [3, 2, 1, 4, 5, 3]
88+
@test n.b == [2, 4, 4, 0, 0, 0]
89+
@test n.c == [1, 1, 6, 2, 4, 1]
90+
@test n.d == [4, 3, 0, 0, 4, 1]
91+
@test n.e == [5, 5, 2, 6, 5, 2]
92+
@test n.f == [4, 4, 3, 4, 5, 2]
93+
@test isrevertible(T)
94+
tₒ = revert(T, n, c)
95+
@test t == tₒ
96+
97+
T = Replace([:a, :f] => (x -> 1 < x < 5) => 0)
98+
n, c = apply(T, t)
99+
@test n.a == [0, 0, 1, 0, 5, 0]
100+
@test n.b == [2, 4, 4, 5, 8, 5]
101+
@test n.c == [1, 1, 6, 2, 4, 1]
102+
@test n.d == [4, 3, 7, 5, 4, 1]
103+
@test n.e == [5, 5, 2, 6, 5, 2]
104+
@test n.f == [0, 0, 0, 0, 5, 0]
105+
@test isrevertible(T)
106+
tₒ = revert(T, n, c)
107+
@test t == tₒ
108+
23109
# table schema after apply and revert
24110
T = Replace(1 => -1, 5 => -5)
25111
n, c = apply(T, t)

0 commit comments

Comments
 (0)