Skip to content

Commit 3d41f2b

Browse files
author
Peter Gagarinov
committed
Recode is optimized by transforming arrays into sets (solves #343).
1 parent c511b4f commit 3d41f2b

File tree

2 files changed

+31
-6
lines changed

2 files changed

+31
-6
lines changed

benchmark/benchmarks.jl

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,4 +86,16 @@ SUITE["repeated assignment"]["empty dest"] =
8686
SUITE["repeated assignment"]["same levels dest"] =
8787
@benchmarkable mycopy!(c2, a) setup = c2=copy(c)
8888
SUITE["repeated assignment"]["many levels dest"] =
89-
@benchmarkable mycopy!(d2, a) setup = d2=copy(d)
89+
@benchmarkable mycopy!(d2, a) setup = d2=copy(d)
90+
91+
# Benchmark helper: calls `recode` on `orig_vec` with the single pair
# `cat2merge_vec => "None"` and returns whatever `recode` returns.
recode2none(orig_vec, cat2merge_vec) = recode(orig_vec, cat2merge_vec => "None")
94+
95+
# Benchmark inputs: 1,000,000 source strings and 100,000 strings to merge. Each element
# is the decimal form of `x % 1000` repeated 32 times, so both vectors contain at most
# 1000 distinct values. A single comprehension avoids the two intermediate arrays the
# broadcast chain would allocate.
orig_vec = [repeat(string(x % 1000), 32) for x in 1:1_000_000]
cat2merge_vec = [repeat(string(x % 1000), 32) for x in 1:100_000]

SUITE["recode"] = BenchmarkGroup()

# The inputs are non-constant globals, so they are interpolated with `$`: otherwise
# every sample would also measure untyped global lookup and dynamic dispatch.
# The `categorical`/`reshape` conversions stay inside the timed expression.
SUITE["recode"]["vectors"] =
    @benchmarkable recode2none($orig_vec, $cat2merge_vec)
SUITE["recode"]["categorical_vectors"] =
    @benchmarkable recode2none(categorical($orig_vec), categorical($cat2merge_vec))
SUITE["recode"]["matrices"] =
    @benchmarkable recode2none(reshape($orig_vec, :, 1), reshape($cat2merge_vec, :, 1))
SUITE["recode"]["categorical_matrices"] =
    @benchmarkable recode2none(categorical(reshape($orig_vec, :, 1)),
                               categorical(reshape($cat2merge_vec, :, 1)))

src/recode.jl

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -49,16 +49,27 @@ A user defined type could override this method to define an appropriate test fun
4949
# Fast path: membership in a `Set` is a hashed lookup through `in`.
@inline recode_in(x, collection::Set) = x in collection
# Generic fallback: linear scan over `collection`, comparing with `isequal` so that
# `missing` and `NaN` match themselves.
@inline recode_in(x, collection) = any(isequal(x, y) for y in collection)
5151

52+
53+
"""
    optimize_pair(pair::Pair) -> Pair

Return `pair` unchanged unless its first element is an `AbstractArray`; in that case
return a new pair whose first element is a `Set` built from that array and whose second
element is the original one.

Converting to a `Set` lets `recode_in` use its `Set` method instead of scanning the
array for every source element.
"""
function optimize_pair(pair::Pair)::Pair
    # Anything that is not an array is passed through untouched.
    pair.first isa AbstractArray || return pair
    return Set(pair.first) => pair.second
end
59+
60+
5261
function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T}
5362
if length(dest) != length(src)
5463
throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))"))
5564
end
5665

66+
opt_pairs = map(optimize_pair, pairs);
67+
5768
@inbounds for i in eachindex(dest, src)
5869
x = src[i]
5970

60-
for j in 1:length(pairs)
61-
p = pairs[j]
71+
for j in 1:length(opt_pairs)
72+
p = opt_pairs[j]
6273
# we use isequal and recode_in because we cannot really distinguish scalars from collections
6374
if x p.first || recode_in(x, p.first)
6475
dest[i] = p.second
@@ -96,7 +107,9 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa
96107
throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))"))
97108
end
98109

99-
vals = T[p.second for p in pairs]
110+
opt_pairs = map(optimize_pair, pairs);
111+
112+
vals = T[p.second for p in opt_pairs]
100113
default !== nothing && push!(vals, default)
101114

102115
levels!(dest.pool, filter!(!ismissing, unique(vals)))
@@ -110,8 +123,8 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa
110123
@inbounds for i in eachindex(drefs, src)
111124
x = src[i]
112125

113-
for j in 1:length(pairs)
114-
p = pairs[j]
126+
for j in 1:length(opt_pairs)
127+
p = opt_pairs[j]
115128
# we use isequal and recode_in because we cannot really distinguish scalars from collections
116129
if x p.first || recode_in(x, p.first)
117130
drefs[i] = dupvals ? pairmap[j] : j

0 commit comments

Comments
 (0)