Skip to content

Commit 38d0c04

Browse files
Vectorize remove_copy and unique_copy (#5355)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent 42cee97 commit 38d0c04

File tree

6 files changed

+662
-12
lines changed

6 files changed

+662
-12
lines changed

benchmarks/src/remove.cpp

+25
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,21 @@ void r(benchmark::State& state) {
2626
}
2727
}
2828

29+
template <alg_type Type, class T>
30+
void rc(benchmark::State& state) {
31+
std::vector<T> src(lorem_ipsum.begin(), lorem_ipsum.end());
32+
std::vector<T> v(lorem_ipsum.size());
33+
for (auto _ : state) {
34+
benchmark::DoNotOptimize(src);
35+
benchmark::DoNotOptimize(v);
36+
if constexpr (Type == alg_type::std_fn) {
37+
benchmark::DoNotOptimize(std::remove_copy(src.begin(), src.end(), v.begin(), T{'l'}));
38+
} else {
39+
benchmark::DoNotOptimize(std::ranges::remove_copy(src, v.begin(), T{'l'}));
40+
}
41+
}
42+
}
43+
2944
BENCHMARK(r<alg_type::std_fn, std::uint8_t>);
3045
BENCHMARK(r<alg_type::std_fn, std::uint16_t>);
3146
BENCHMARK(r<alg_type::std_fn, std::uint32_t>);
@@ -36,4 +51,14 @@ BENCHMARK(r<alg_type::rng, std::uint16_t>);
3651
BENCHMARK(r<alg_type::rng, std::uint32_t>);
3752
BENCHMARK(r<alg_type::rng, std::uint64_t>);
3853

54+
// Register the remove_copy benchmark (rc) for both algorithm flavors (std_fn, then rng),
// each with unsigned element types of 8, 16, 32 and 64 bits.
BENCHMARK(rc<alg_type::std_fn, std::uint8_t>);
55+
BENCHMARK(rc<alg_type::std_fn, std::uint16_t>);
56+
BENCHMARK(rc<alg_type::std_fn, std::uint32_t>);
57+
BENCHMARK(rc<alg_type::std_fn, std::uint64_t>);
58+
59+
BENCHMARK(rc<alg_type::rng, std::uint8_t>);
60+
BENCHMARK(rc<alg_type::rng, std::uint16_t>);
61+
BENCHMARK(rc<alg_type::rng, std::uint32_t>);
62+
BENCHMARK(rc<alg_type::rng, std::uint64_t>);
63+
3964
BENCHMARK_MAIN();

benchmarks/src/unique.cpp

+31
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,27 @@ void u(benchmark::State& state) {
3737
}
3838
}
3939

40+
template <alg_type Type, class T>
41+
void uc(benchmark::State& state) {
42+
std::mt19937_64 gen(22033);
43+
using TD = std::conditional_t<sizeof(T) == 1, int, T>;
44+
std::binomial_distribution<TD> dis(5);
45+
46+
std::vector<T, not_highly_aligned_allocator<T>> src(2552);
47+
std::generate(src.begin(), src.end(), [&] { return static_cast<T>(dis(gen)); });
48+
49+
std::vector<T, not_highly_aligned_allocator<T>> v(src.size());
50+
for (auto _ : state) {
51+
benchmark::DoNotOptimize(src);
52+
benchmark::DoNotOptimize(v);
53+
if constexpr (Type == alg_type::std_fn) {
54+
benchmark::DoNotOptimize(std::unique_copy(src.begin(), src.end(), v.begin()));
55+
} else {
56+
benchmark::DoNotOptimize(std::ranges::unique_copy(src, v.begin()));
57+
}
58+
}
59+
}
60+
4061
BENCHMARK(u<alg_type::std_fn, std::uint8_t>);
4162
BENCHMARK(u<alg_type::std_fn, std::uint16_t>);
4263
BENCHMARK(u<alg_type::std_fn, std::uint32_t>);
@@ -47,4 +68,14 @@ BENCHMARK(u<alg_type::rng, std::uint16_t>);
4768
BENCHMARK(u<alg_type::rng, std::uint32_t>);
4869
BENCHMARK(u<alg_type::rng, std::uint64_t>);
4970

71+
// Register the unique_copy benchmark (uc) for both algorithm flavors (std_fn, then rng),
// each with unsigned element types of 8, 16, 32 and 64 bits.
BENCHMARK(uc<alg_type::std_fn, std::uint8_t>);
72+
BENCHMARK(uc<alg_type::std_fn, std::uint16_t>);
73+
BENCHMARK(uc<alg_type::std_fn, std::uint32_t>);
74+
BENCHMARK(uc<alg_type::std_fn, std::uint64_t>);
75+
76+
BENCHMARK(uc<alg_type::rng, std::uint8_t>);
77+
BENCHMARK(uc<alg_type::rng, std::uint16_t>);
78+
BENCHMARK(uc<alg_type::rng, std::uint32_t>);
79+
BENCHMARK(uc<alg_type::rng, std::uint64_t>);
80+
5081
BENCHMARK_MAIN();

stl/inc/algorithm

+150
Original file line number | Diff line number | Diff line change
@@ -85,10 +85,20 @@ const void* __stdcall __std_search_n_2(const void* _First, const void* _Last, si
8585
const void* __stdcall __std_search_n_4(const void* _First, const void* _Last, size_t _Count, uint32_t _Value) noexcept;
8686
const void* __stdcall __std_search_n_8(const void* _First, const void* _Last, size_t _Count, uint64_t _Value) noexcept;
8787

88+
void* __stdcall __std_remove_copy_1(const void* _First, const void* _Last, void* _Out, uint8_t _Val) noexcept;
89+
void* __stdcall __std_remove_copy_2(const void* _First, const void* _Last, void* _Out, uint16_t _Val) noexcept;
90+
void* __stdcall __std_remove_copy_4(const void* _First, const void* _Last, void* _Out, uint32_t _Val) noexcept;
91+
void* __stdcall __std_remove_copy_8(const void* _First, const void* _Last, void* _Out, uint64_t _Val) noexcept;
92+
8893
void* __stdcall __std_unique_1(void* _First, void* _Last) noexcept;
8994
void* __stdcall __std_unique_2(void* _First, void* _Last) noexcept;
9095
void* __stdcall __std_unique_4(void* _First, void* _Last) noexcept;
9196
void* __stdcall __std_unique_8(void* _First, void* _Last) noexcept;
97+
98+
void* __stdcall __std_unique_copy_1(const void* _First, const void* _Last, void* _Dest) noexcept;
99+
void* __stdcall __std_unique_copy_2(const void* _First, const void* _Last, void* _Dest) noexcept;
100+
void* __stdcall __std_unique_copy_4(const void* _First, const void* _Last, void* _Dest) noexcept;
101+
void* __stdcall __std_unique_copy_8(const void* _First, const void* _Last, void* _Dest) noexcept;
92102
} // extern "C"
93103

94104
_STD_BEGIN
@@ -257,6 +267,43 @@ _Ty* _Unique_vectorized(_Ty* const _First, _Ty* const _Last) noexcept {
257267
}
258268
}
259269

270+
template <class _Ty, class _TVal>
271+
_Ty* _Remove_copy_vectorized(
272+
const _Ty* const _First, const _Ty* const _Last, _Ty* const _Dest, const _TVal _Val) noexcept {
273+
if constexpr (is_pointer_v<_Ty>) {
274+
#ifdef _WIN64
275+
return reinterpret_cast<_Ty*>(::__std_remove_copy_8(_First, _Last, _Dest, reinterpret_cast<uint64_t>(_Val)));
276+
#else // ^^^ defined(_WIN64) / !defined(_WIN64) vvv
277+
return reinterpret_cast<_Ty*>(::__std_remove_copy_4(_First, _Last, _Dest, reinterpret_cast<uint32_t>(_Val)));
278+
#endif // ^^^ !defined(_WIN64) ^^^
279+
} else if constexpr (sizeof(_Ty) == 1) {
280+
return reinterpret_cast<_Ty*>(::__std_remove_copy_1(_First, _Last, _Dest, static_cast<uint8_t>(_Val)));
281+
} else if constexpr (sizeof(_Ty) == 2) {
282+
return reinterpret_cast<_Ty*>(::__std_remove_copy_2(_First, _Last, _Dest, static_cast<uint16_t>(_Val)));
283+
} else if constexpr (sizeof(_Ty) == 4) {
284+
return reinterpret_cast<_Ty*>(::__std_remove_copy_4(_First, _Last, _Dest, static_cast<uint32_t>(_Val)));
285+
} else if constexpr (sizeof(_Ty) == 8) {
286+
return reinterpret_cast<_Ty*>(::__std_remove_copy_8(_First, _Last, _Dest, static_cast<uint64_t>(_Val)));
287+
} else {
288+
_STL_INTERNAL_STATIC_ASSERT(false); // Unexpected size
289+
}
290+
}
291+
292+
template <class _Ty>
293+
_Ty* _Unique_copy_vectorized(const _Ty* const _First, const _Ty* const _Last, _Ty* const _Dest) noexcept {
294+
if constexpr (sizeof(_Ty) == 1) {
295+
return reinterpret_cast<_Ty*>(::__std_unique_copy_1(_First, _Last, _Dest));
296+
} else if constexpr (sizeof(_Ty) == 2) {
297+
return reinterpret_cast<_Ty*>(::__std_unique_copy_2(_First, _Last, _Dest));
298+
} else if constexpr (sizeof(_Ty) == 4) {
299+
return reinterpret_cast<_Ty*>(::__std_unique_copy_4(_First, _Last, _Dest));
300+
} else if constexpr (sizeof(_Ty) == 8) {
301+
return reinterpret_cast<_Ty*>(::__std_unique_copy_8(_First, _Last, _Dest));
302+
} else {
303+
_STL_INTERNAL_STATIC_ASSERT(false); // Unexpected size
304+
}
305+
}
306+
260307
// Can we activate the vector algorithms for find_first_of?
261308
template <class _It1, class _It2, class _Pr>
262309
constexpr bool _Vector_alg_in_find_first_of_is_safe = _Equal_memcmp_is_safe<_It1, _It2, _Pr>;
@@ -282,6 +329,17 @@ constexpr bool _Vector_alg_in_search_n_is_safe = _Vector_alg_in_find_is_safe<_It
282329
// Can we activate the vector algorithms for unique?
283330
template <class _Iter, class _Pr>
284331
constexpr bool _Vector_alg_in_unique_is_safe = _Equal_memcmp_is_safe<_Iter, _Iter, _Pr>;
332+
333+
// Can we use this output iterator for remove_copy or unique_copy?
334+
template <class _Out, class _In>
335+
constexpr bool _Output_iterator_for_vector_alg_is_safe() {
336+
if constexpr (_Iterator_is_contiguous<_Out>) {
337+
return is_same_v<_Iter_value_t<_Out>, remove_const_t<_Iter_value_t<_In>>>;
338+
} else {
339+
return false;
340+
}
341+
}
342+
285343
_STD_END
286344
#endif // _USE_STD_VECTOR_ALGORITHMS
287345

@@ -4718,6 +4776,33 @@ _CONSTEXPR20 _OutIt remove_copy(_InIt _First, _InIt _Last, _OutIt _Dest, const _
47184776
auto _UFirst = _STD _Get_unwrapped(_First);
47194777
const auto _ULast = _STD _Get_unwrapped(_Last);
47204778
auto _UDest = _STD _Get_unwrapped_unverified(_Dest);
4779+
4780+
#if _USE_STD_VECTOR_ALGORITHMS
4781+
if constexpr (_Vector_alg_in_find_is_safe<decltype(_UFirst), _Ty>
4782+
&& _Output_iterator_for_vector_alg_is_safe<decltype(_UDest), decltype(_UFirst)>()) {
4783+
if (!_STD _Is_constant_evaluated()) {
4784+
if (!_STD _Could_compare_equal_to_value_type<decltype(_UFirst)>(_Val)) {
4785+
_UDest = _STD _Copy_unchecked(_UFirst, _ULast, _UDest);
4786+
_STD _Seek_wrapped(_Dest, _UDest);
4787+
return _Dest;
4788+
}
4789+
4790+
const auto _Dest_ptr = _STD _To_address(_UDest);
4791+
const auto _Result =
4792+
_STD _Remove_copy_vectorized(_STD _To_address(_UFirst), _STD _To_address(_ULast), _Dest_ptr, _Val);
4793+
4794+
if constexpr (is_pointer_v<decltype(_UDest)>) {
4795+
_UDest = _Result;
4796+
} else {
4797+
_UDest += _Result - _Dest_ptr;
4798+
}
4799+
4800+
_STD _Seek_wrapped(_Dest, _UDest);
4801+
return _Dest;
4802+
}
4803+
}
4804+
#endif // _USE_STD_VECTOR_ALGORITHMS
4805+
47214806
for (; _UFirst != _ULast; ++_UFirst) {
47224807
if (!(*_UFirst == _Val)) {
47234808
*_UDest = *_UFirst;
@@ -4943,6 +5028,31 @@ namespace ranges {
49435028
_STL_INTERNAL_STATIC_ASSERT(indirectly_copyable<_It, _Out>);
49445029
_STL_INTERNAL_STATIC_ASSERT(indirect_binary_predicate<ranges::equal_to, projected<_It, _Pj>, const _Ty*>);
49455030

5031+
#if _USE_STD_VECTOR_ALGORITHMS
5032+
if constexpr (_Vector_alg_in_find_is_safe<_It, _Ty> && _Output_iterator_for_vector_alg_is_safe<_Out, _It>()
5033+
&& sized_sentinel_for<_Se, _It> && is_same_v<_Pj, identity>) {
5034+
if (!_STD is_constant_evaluated()) {
5035+
const auto _Size = _Last - _First;
5036+
auto _End = _First + _Size;
5037+
5038+
if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) {
5039+
_Output = _STD _Copy_unchecked(_First, _Last, _Output);
5040+
return {_STD move(_End), _STD move(_Output)};
5041+
}
5042+
5043+
const auto _Dest_ptr = _STD to_address(_Output);
5044+
const auto _Result =
5045+
_STD _Remove_copy_vectorized(_STD to_address(_First), _STD to_address(_End), _Dest_ptr, _Val);
5046+
5047+
if constexpr (is_pointer_v<_Out>) {
5048+
return {_STD move(_End), _Result};
5049+
} else {
5050+
return {_STD move(_End), _STD move(_Output) + (_Result - _Dest_ptr)};
5051+
}
5052+
}
5053+
}
5054+
#endif // _USE_STD_VECTOR_ALGORITHMS
5055+
49465056
for (; _First != _Last; ++_First) {
49475057
if (_STD invoke(_Proj, *_First) != _Val) {
49485058
*_Output = *_First;
@@ -5190,6 +5300,26 @@ _CONSTEXPR20 _OutIt unique_copy(_InIt _First, _InIt _Last, _OutIt _Dest, _Pr _Pr
51905300

51915301
auto _UDest = _STD _Get_unwrapped_unverified(_Dest);
51925302

5303+
#if _USE_STD_VECTOR_ALGORITHMS
5304+
if constexpr (_Vector_alg_in_unique_is_safe<decltype(_UFirst), _Pr>
5305+
&& _Output_iterator_for_vector_alg_is_safe<decltype(_UDest), decltype(_UFirst)>()) {
5306+
if (!_STD _Is_constant_evaluated()) {
5307+
const auto _First_ptr = _STD _To_address(_UFirst);
5308+
const auto _Dest_ptr = _STD _To_address(_UDest);
5309+
const auto _Result = _STD _Unique_copy_vectorized(_First_ptr, _STD _To_address(_ULast), _Dest_ptr);
5310+
5311+
if constexpr (is_pointer_v<decltype(_UDest)>) {
5312+
_UDest = _Result;
5313+
} else {
5314+
_UDest += _Result - _Dest_ptr;
5315+
}
5316+
5317+
_STD _Seek_wrapped(_Dest, _UDest);
5318+
return _Dest;
5319+
}
5320+
}
5321+
#endif // _USE_STD_VECTOR_ALGORITHMS
5322+
51935323
if constexpr (_Is_ranges_fwd_iter_v<_InIt>) { // can reread the source for comparison
51945324
auto _Firstb = _UFirst;
51955325

@@ -5317,6 +5447,26 @@ namespace ranges {
53175447
return {_STD move(_First), _STD move(_Output)};
53185448
}
53195449

5450+
#if _USE_STD_VECTOR_ALGORITHMS
5451+
if constexpr (is_same_v<_Pj, identity> && sized_sentinel_for<_Se, _It>
5452+
&& _Vector_alg_in_unique_is_safe<_It, _Pr>
5453+
&& _Output_iterator_for_vector_alg_is_safe<_Out, _It>()) {
5454+
if (!_STD is_constant_evaluated()) {
5455+
const auto _Size = _Last - _First;
5456+
const auto _First_ptr = _STD to_address(_First);
5457+
const auto _Last_ptr = _First_ptr + static_cast<size_t>(_Size);
5458+
const auto _Output_ptr = _STD to_address(_Output);
5459+
const auto _Result = _STD _Unique_copy_vectorized(_First_ptr, _Last_ptr, _Output_ptr);
5460+
5461+
if constexpr (is_pointer_v<_It> && is_pointer_v<_Out>) {
5462+
return {_Last_ptr, _Result};
5463+
} else {
5464+
return {_STD move(_First) + _Size, _STD move(_Output) + (_Result - _Output_ptr)};
5465+
}
5466+
}
5467+
}
5468+
#endif // _USE_STD_VECTOR_ALGORITHMS
5469+
53205470
if constexpr (_Is_input_with_value_type<_Out, iter_value_t<_It>>) {
53215471
// Can reread _Output
53225472
*_Output = *_First;

0 commit comments

Comments
 (0)