Description
C++20 added built-in wait/notify operations on atomics: both the generic `std::atomic<T>` and the boolean-like `std::atomic_flag`.
On the WebAssembly target I'd expect those methods to compile to thin wrappers around the native `atomic.wait` / `atomic.notify` instructions (`memory.atomic.wait32` / `memory.atomic.wait64` / `memory.atomic.notify` in the current threads proposal), but the resulting codegen is much more complex. My guess is that libc++ re-implements in library code (a spin / back-off / sleep loop), compiled into the WebAssembly binary, what those instructions would do natively.
Example:
#include <atomic>
__attribute__((export_name("wait")))
extern "C" void wait(std::atomic<int>* x) {
x->wait(123);
}
int main() {}
Compiling this with `emcc temp.cpp -Os -g2 -o temp.js -pthread` results in the following codegen for `wait`, plus a large number of helper functions that it references:
(func $wait (type $t0) (param $p0 i32)
(local $l1 i32) (local $l2 i32) (local $l3 i32) (local $l4 i32) (local $l5 i32) (local $l6 i32) (local $l7 i32) (local $l8 i32) (local $l9 i32) (local $l10 i32) (local $l11 i32) (local $l12 i32) (local $l13 i32) (local $l14 i32) (local $l15 i32) (local $l16 i32) (local $l17 i32) (local $l18 i32) (local $l19 i64) (local $l20 i64) (local $l21 i64)
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l4
global.set $__stack_pointer
local.get $l4
i64.const 21474836603
i64.store offset=8 align=4
local.get $l4
local.get $p0
i32.store offset=4
local.get $l4
i32.const 5
i32.store offset=28
local.get $l4
local.get $p0
i32.store offset=16
local.get $l4
local.get $l4
i64.load offset=4 align=4
i64.store offset=20 align=4
local.get $l4
i32.const 4
i32.add
local.set $l11
local.get $l4
i32.const 16
i32.add
local.set $l7
i32.const 0
local.set $p0
call $std::__2::chrono::__libcpp_steady_clock_now__
local.set $l20
loop $L0
local.get $p0
i32.const 63
i32.gt_u
local.set $l12
loop $L1
block $B2
local.get $l11
call $std::__2::__cxx_atomic_wait_test_fn_impl<std::__2::__cxx_atomic_impl<int__std::__2::__cxx_atomic_base_impl<int>>__int>::operator___abi:v160006____const
br_if $B2
local.get $l12
i32.eqz
if $I3
local.get $p0
i32.const 1
i32.add
local.set $p0
br $L0
end
call $std::__2::chrono::__libcpp_steady_clock_now__
local.get $l20
i64.sub
local.tee $l19
i64.const 0
i64.gt_s
i32.const 0
i32.and
br_if $B2
block $B4 (result i32)
i32.const 0
local.set $l1
block $B5
local.get $l19
i64.const 64001
i64.ge_s
if $I6
i32.const 1
local.set $l1
local.get $l7
i32.load
call $std::__2::__libcpp_contention_state_void_const_volatile*_
i64.atomic.load offset=8
local.set $l19
local.get $l7
i32.const 4
i32.add
call $std::__2::__cxx_atomic_wait_test_fn_impl<std::__2::__cxx_atomic_impl<int__std::__2::__cxx_atomic_base_impl<int>>__int>::operator___abi:v160006____const
br_if $B5
i32.const 0
local.set $l8
local.get $l7
i32.load
call $std::__2::__libcpp_contention_state_void_const_volatile*_
local.tee $l1
local.tee $l13
i64.const 1
i64.atomic.rmw.add
drop
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l5
global.set $__stack_pointer
local.get $l5
local.get $l19
i64.store offset=24
local.get $l5
local.get $l1
i32.const 8
i32.add
i32.store offset=16
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::zero_abi:v160006___
local.set $l19
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l3
global.set $__stack_pointer
local.get $l3
local.get $l19
i64.store offset=24
local.get $l3
call $std::__2::chrono::__libcpp_steady_clock_now__
i64.store offset=16
loop $L7
local.get $l8
i32.const 63
i32.gt_u
local.set $l14
loop $L8
block $B9
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l1
local.get $l5
i32.load offset=16
i64.atomic.load
i64.store offset=8
local.get $l1
i64.load offset=8
local.get $l5
i64.load offset=24
i64.eq
local.set $l16
local.get $l1
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l16
i32.eqz
br_if $B9
local.get $l14
i32.eqz
if $I10
local.get $l8
i32.const 1
i32.add
local.set $l8
br $L7
end
local.get $l3
call $std::__2::chrono::__libcpp_steady_clock_now__
i64.store
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l2
global.set $__stack_pointer
local.get $l2
local.get $l3
i64.load
i64.store offset=8
local.get $l2
local.get $l3
i64.load offset=16
i64.store
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l1
local.get $l2
i64.load offset=8
i64.store offset=8
local.get $l1
i64.load offset=8
local.set $l19
local.get $l1
local.get $l2
i64.load
i64.store
local.get $l1
local.get $l19
local.get $l1
i64.load
i64.sub
i64.store offset=16
local.get $l1
i32.const 24
i32.add
local.get $l1
i32.const 16
i32.add
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long>_long_long_const&__std::__2::enable_if<is_convertible<long_long_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<long_long>::value___void>::type*_
i64.load
local.set $l19
local.get $l1
i32.const 32
i32.add
global.set $__stack_pointer
local.get $l2
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l3
local.get $l19
i64.store offset=8
local.get $l3
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::zero_abi:v160006___
i64.store
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l3
i32.const 24
i32.add
local.tee $l2
i64.load
local.get $l3
i64.load
i64.eq
local.set $l17
local.get $l1
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l17
i32.const 1
i32.xor
if $I11
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l2
i64.load
local.get $l3
i64.load offset=8
i64.lt_s
local.set $l18
local.get $l1
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l18
br_if $B9
end
local.get $l3
i64.load offset=8
local.set $l19
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l1
global.set $__stack_pointer
local.get $l1
local.get $l19
i64.store offset=24
local.get $l1
i32.const 128
i32.store offset=8
local.get $l1
i32.const 16
i32.add
local.tee $l6
local.get $l1
i32.const 8
i32.add
local.tee $l9
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
local.set $l10
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l15
global.set $__stack_pointer
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee $l2
global.set $__stack_pointer
local.get $l2
i32.const 8
i32.add
local.get $l10
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long__std::__2::ratio<1ll__1000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>_const&__std::__2::enable_if<__no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::value_&&__std::__2::integral_constant<bool__false>::value_||___no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::type::den_==_1_&&_!treat_as_floating_point<long_long>::value___void>::type*_
i64.load
local.set $l19
local.get $l2
local.get $l1
i64.load offset=24
i64.store
local.get $l2
i64.load
local.set $l21
local.get $l2
i32.const 16
i32.add
global.set $__stack_pointer
local.get $l15
i32.const 16
i32.add
global.set $__stack_pointer
block $B12
local.get $l19
local.get $l21
i64.lt_s
if $I13
local.get $l1
i32.const 8
i32.store offset=4
local.get $l6
local.get $l9
local.get $l1
i32.const 4
i32.add
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long__std::__2::ratio<1ll__1000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>_const&__std::__2::enable_if<__no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::value_&&__std::__2::integral_constant<bool__false>::value_||___no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::type::den_==_1_&&_!treat_as_floating_point<long_long>::value___void>::type*_
call $std::__2::__libcpp_thread_sleep_for_abi:v160006__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&_
br $B12
end
local.get $l1
i32.const 64
i32.store offset=8
local.get $l1
i32.const 24
i32.add
local.tee $l6
local.get $l1
i32.const 16
i32.add
local.tee $l9
local.get $l1
i32.const 8
i32.add
local.tee $l10
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
call $bool_std::__2::chrono::operator>_abi:v160006_<long_long__std::__2::ratio<1ll__1000000000ll>__long_long__std::__2::ratio<1ll__1000000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000ll>>_const&_
if $I14
local.get $l1
i32.const 2
i32.store offset=8
global.get $__stack_pointer
i32.const 32
i32.sub
local.tee $l2
global.set $__stack_pointer
local.get $l2
local.get $l6
i64.load
i64.store offset=8
local.get $l2
local.get $l2
i64.load offset=8
local.get $l10
i64.load32_s
i64.div_s
i64.store offset=16
local.get $l2
i32.const 24
i32.add
local.get $l2
i32.const 16
i32.add
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long>_long_long_const&__std::__2::enable_if<is_convertible<long_long_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<long_long>::value___void>::type*_
i64.load
local.set $l19
local.get $l2
i32.const 32
i32.add
global.set $__stack_pointer
local.get $l1
local.get $l19
i64.store offset=16
local.get $l9
call $std::__2::__libcpp_thread_sleep_for_abi:v160006__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&_
br $B12
end
local.get $l1
i32.const 4
i32.store offset=8
local.get $l1
i32.const 24
i32.add
local.get $l1
i32.const 16
i32.add
local.get $l1
i32.const 8
i32.add
call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
call $bool_std::__2::chrono::operator>_abi:v160006_<long_long__std::__2::ratio<1ll__1000000000ll>__long_long__std::__2::ratio<1ll__1000000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000ll>>_const&_
i32.eqz
br_if $B12
call $emscripten_get_now
call $_emscripten_yield
end
local.get $l1
i32.const 32
i32.add
global.set $__stack_pointer
br $L8
end
end
end
local.get $l3
i32.const 32
i32.add
global.set $__stack_pointer
local.get $l5
i32.const 32
i32.add
global.set $__stack_pointer
local.get $l13
i64.const 1
i64.atomic.rmw.sub
drop
i32.const 0
br $B4
end
local.get $l19
i64.const 4001
i64.lt_s
br_if $B5
call $emscripten_get_now
call $_emscripten_yield
end
local.get $l1
end
i32.eqz
br_if $L1
end
end
end
local.get $l4
i32.const 32
i32.add
global.set $__stack_pointer)
I know there is at least `emscripten/atomic.h`, which uses Clang builtins to get native atomic wait/notify, but it would be nice to be able to use those optimised instructions via the standard library too.