Skip to content

Suboptimal codegen for C++20 atomic wait/notify #20799

Open
@RReverser

Description

@RReverser

C++20 has added built-in wait/notify operations on atomics, both the generic std::atomic<T>, as well as a boolean-like std::atomic_flag.

On the WebAssembly target I'd expect those methods to compile to thin wrappers around the native atomic.wait / atomic.notify instructions, but it looks like it results in a much more complex codegen. I'm guessing it implements whatever those instructions do natively under the hood, but embedded into the WebAssembly binary.

Example:

#include <atomic>

__attribute__((export_name("wait")))
extern "C" void wait(std::atomic<int>* x) {
    x->wait(123);
}

int main() {}

with emcc temp.cpp -Os -g2 -o temp.js -pthread results in the following codegen + tons of the helper functions it references:

(func $wait (type $t0) (param $p0 i32)
  (local $l1 i32) (local $l2 i32) (local $l3 i32) (local $l4 i32) (local $l5 i32) (local $l6 i32) (local $l7 i32) (local $l8 i32) (local $l9 i32) (local $l10 i32) (local $l11 i32) (local $l12 i32) (local $l13 i32) (local $l14 i32) (local $l15 i32) (local $l16 i32) (local $l17 i32) (local $l18 i32) (local $l19 i64) (local $l20 i64) (local $l21 i64)
  global.get $__stack_pointer
  i32.const 32
  i32.sub
  local.tee $l4
  global.set $__stack_pointer
  local.get $l4
  i64.const 21474836603
  i64.store offset=8 align=4
  local.get $l4
  local.get $p0
  i32.store offset=4
  local.get $l4
  i32.const 5
  i32.store offset=28
  local.get $l4
  local.get $p0
  i32.store offset=16
  local.get $l4
  local.get $l4
  i64.load offset=4 align=4
  i64.store offset=20 align=4
  local.get $l4
  i32.const 4
  i32.add
  local.set $l11
  local.get $l4
  i32.const 16
  i32.add
  local.set $l7
  i32.const 0
  local.set $p0
  call $std::__2::chrono::__libcpp_steady_clock_now__
  local.set $l20
  loop $L0
    local.get $p0
    i32.const 63
    i32.gt_u
    local.set $l12
    loop $L1
      block $B2
        local.get $l11
        call $std::__2::__cxx_atomic_wait_test_fn_impl<std::__2::__cxx_atomic_impl<int__std::__2::__cxx_atomic_base_impl<int>>__int>::operator___abi:v160006____const
        br_if $B2
        local.get $l12
        i32.eqz
        if $I3
          local.get $p0
          i32.const 1
          i32.add
          local.set $p0
          br $L0
        end
        call $std::__2::chrono::__libcpp_steady_clock_now__
        local.get $l20
        i64.sub
        local.tee $l19
        i64.const 0
        i64.gt_s
        i32.const 0
        i32.and
        br_if $B2
        block $B4 (result i32)
          i32.const 0
          local.set $l1
          block $B5
            local.get $l19
            i64.const 64001
            i64.ge_s
            if $I6
              i32.const 1
              local.set $l1
              local.get $l7
              i32.load
              call $std::__2::__libcpp_contention_state_void_const_volatile*_
              i64.atomic.load offset=8
              local.set $l19
              local.get $l7
              i32.const 4
              i32.add
              call $std::__2::__cxx_atomic_wait_test_fn_impl<std::__2::__cxx_atomic_impl<int__std::__2::__cxx_atomic_base_impl<int>>__int>::operator___abi:v160006____const
              br_if $B5
              i32.const 0
              local.set $l8
              local.get $l7
              i32.load
              call $std::__2::__libcpp_contention_state_void_const_volatile*_
              local.tee $l1
              local.tee $l13
              i64.const 1
              i64.atomic.rmw.add
              drop
              global.get $__stack_pointer
              i32.const 32
              i32.sub
              local.tee $l5
              global.set $__stack_pointer
              local.get $l5
              local.get $l19
              i64.store offset=24
              local.get $l5
              local.get $l1
              i32.const 8
              i32.add
              i32.store offset=16
              call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::zero_abi:v160006___
              local.set $l19
              global.get $__stack_pointer
              i32.const 32
              i32.sub
              local.tee $l3
              global.set $__stack_pointer
              local.get $l3
              local.get $l19
              i64.store offset=24
              local.get $l3
              call $std::__2::chrono::__libcpp_steady_clock_now__
              i64.store offset=16
              loop $L7
                local.get $l8
                i32.const 63
                i32.gt_u
                local.set $l14
                loop $L8
                  block $B9
                    global.get $__stack_pointer
                    i32.const 16
                    i32.sub
                    local.tee $l1
                    global.set $__stack_pointer
                    local.get $l1
                    local.get $l5
                    i32.load offset=16
                    i64.atomic.load
                    i64.store offset=8
                    local.get $l1
                    i64.load offset=8
                    local.get $l5
                    i64.load offset=24
                    i64.eq
                    local.set $l16
                    local.get $l1
                    i32.const 16
                    i32.add
                    global.set $__stack_pointer
                    local.get $l16
                    i32.eqz
                    br_if $B9
                    local.get $l14
                    i32.eqz
                    if $I10
                      local.get $l8
                      i32.const 1
                      i32.add
                      local.set $l8
                      br $L7
                    end
                    local.get $l3
                    call $std::__2::chrono::__libcpp_steady_clock_now__
                    i64.store
                    global.get $__stack_pointer
                    i32.const 16
                    i32.sub
                    local.tee $l2
                    global.set $__stack_pointer
                    local.get $l2
                    local.get $l3
                    i64.load
                    i64.store offset=8
                    local.get $l2
                    local.get $l3
                    i64.load offset=16
                    i64.store
                    global.get $__stack_pointer
                    i32.const 32
                    i32.sub
                    local.tee $l1
                    global.set $__stack_pointer
                    local.get $l1
                    local.get $l2
                    i64.load offset=8
                    i64.store offset=8
                    local.get $l1
                    i64.load offset=8
                    local.set $l19
                    local.get $l1
                    local.get $l2
                    i64.load
                    i64.store
                    local.get $l1
                    local.get $l19
                    local.get $l1
                    i64.load
                    i64.sub
                    i64.store offset=16
                    local.get $l1
                    i32.const 24
                    i32.add
                    local.get $l1
                    i32.const 16
                    i32.add
                    call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long>_long_long_const&__std::__2::enable_if<is_convertible<long_long_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<long_long>::value___void>::type*_
                    i64.load
                    local.set $l19
                    local.get $l1
                    i32.const 32
                    i32.add
                    global.set $__stack_pointer
                    local.get $l2
                    i32.const 16
                    i32.add
                    global.set $__stack_pointer
                    local.get $l3
                    local.get $l19
                    i64.store offset=8
                    local.get $l3
                    call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::zero_abi:v160006___
                    i64.store
                    global.get $__stack_pointer
                    i32.const 16
                    i32.sub
                    local.tee $l1
                    global.set $__stack_pointer
                    local.get $l3
                    i32.const 24
                    i32.add
                    local.tee $l2
                    i64.load
                    local.get $l3
                    i64.load
                    i64.eq
                    local.set $l17
                    local.get $l1
                    i32.const 16
                    i32.add
                    global.set $__stack_pointer
                    local.get $l17
                    i32.const 1
                    i32.xor
                    if $I11
                      global.get $__stack_pointer
                      i32.const 16
                      i32.sub
                      local.tee $l1
                      global.set $__stack_pointer
                      local.get $l2
                      i64.load
                      local.get $l3
                      i64.load offset=8
                      i64.lt_s
                      local.set $l18
                      local.get $l1
                      i32.const 16
                      i32.add
                      global.set $__stack_pointer
                      local.get $l18
                      br_if $B9
                    end
                    local.get $l3
                    i64.load offset=8
                    local.set $l19
                    global.get $__stack_pointer
                    i32.const 32
                    i32.sub
                    local.tee $l1
                    global.set $__stack_pointer
                    local.get $l1
                    local.get $l19
                    i64.store offset=24
                    local.get $l1
                    i32.const 128
                    i32.store offset=8
                    local.get $l1
                    i32.const 16
                    i32.add
                    local.tee $l6
                    local.get $l1
                    i32.const 8
                    i32.add
                    local.tee $l9
                    call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
                    local.set $l10
                    global.get $__stack_pointer
                    i32.const 16
                    i32.sub
                    local.tee $l15
                    global.set $__stack_pointer
                    global.get $__stack_pointer
                    i32.const 16
                    i32.sub
                    local.tee $l2
                    global.set $__stack_pointer
                    local.get $l2
                    i32.const 8
                    i32.add
                    local.get $l10
                    call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long__std::__2::ratio<1ll__1000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>_const&__std::__2::enable_if<__no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::value_&&__std::__2::integral_constant<bool__false>::value_||___no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::type::den_==_1_&&_!treat_as_floating_point<long_long>::value___void>::type*_
                    i64.load
                    local.set $l19
                    local.get $l2
                    local.get $l1
                    i64.load offset=24
                    i64.store
                    local.get $l2
                    i64.load
                    local.set $l21
                    local.get $l2
                    i32.const 16
                    i32.add
                    global.set $__stack_pointer
                    local.get $l15
                    i32.const 16
                    i32.add
                    global.set $__stack_pointer
                    block $B12
                      local.get $l19
                      local.get $l21
                      i64.lt_s
                      if $I13
                        local.get $l1
                        i32.const 8
                        i32.store offset=4
                        local.get $l6
                        local.get $l9
                        local.get $l1
                        i32.const 4
                        i32.add
                        call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
                        call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long__std::__2::ratio<1ll__1000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>_const&__std::__2::enable_if<__no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::value_&&__std::__2::integral_constant<bool__false>::value_||___no_overflow<std::__2::ratio<1ll__1000ll>__std::__2::ratio<1ll__1000000000ll>>::type::den_==_1_&&_!treat_as_floating_point<long_long>::value___void>::type*_
                        call $std::__2::__libcpp_thread_sleep_for_abi:v160006__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&_
                        br $B12
                      end
                      local.get $l1
                      i32.const 64
                      i32.store offset=8
                      local.get $l1
                      i32.const 24
                      i32.add
                      local.tee $l6
                      local.get $l1
                      i32.const 16
                      i32.add
                      local.tee $l9
                      local.get $l1
                      i32.const 8
                      i32.add
                      local.tee $l10
                      call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
                      call $bool_std::__2::chrono::operator>_abi:v160006_<long_long__std::__2::ratio<1ll__1000000000ll>__long_long__std::__2::ratio<1ll__1000000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000ll>>_const&_
                      if $I14
                        local.get $l1
                        i32.const 2
                        i32.store offset=8
                        global.get $__stack_pointer
                        i32.const 32
                        i32.sub
                        local.tee $l2
                        global.set $__stack_pointer
                        local.get $l2
                        local.get $l6
                        i64.load
                        i64.store offset=8
                        local.get $l2
                        local.get $l2
                        i64.load offset=8
                        local.get $l10
                        i64.load32_s
                        i64.div_s
                        i64.store offset=16
                        local.get $l2
                        i32.const 24
                        i32.add
                        local.get $l2
                        i32.const 16
                        i32.add
                        call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>::duration_abi:v160006_<long_long>_long_long_const&__std::__2::enable_if<is_convertible<long_long_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<long_long>::value___void>::type*_
                        i64.load
                        local.set $l19
                        local.get $l2
                        i32.const 32
                        i32.add
                        global.set $__stack_pointer
                        local.get $l1
                        local.get $l19
                        i64.store offset=16
                        local.get $l9
                        call $std::__2::__libcpp_thread_sleep_for_abi:v160006__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&_
                        br $B12
                      end
                      local.get $l1
                      i32.const 4
                      i32.store offset=8
                      local.get $l1
                      i32.const 24
                      i32.add
                      local.get $l1
                      i32.const 16
                      i32.add
                      local.get $l1
                      i32.const 8
                      i32.add
                      call $std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000ll>>::duration_abi:v160006_<int>_int_const&__std::__2::enable_if<is_convertible<int_const&__long_long>::value_&&__std::__2::integral_constant<bool__false>::value_||_!treat_as_floating_point<int>::value___void>::type*_
                      call $bool_std::__2::chrono::operator>_abi:v160006_<long_long__std::__2::ratio<1ll__1000000000ll>__long_long__std::__2::ratio<1ll__1000000ll>>_std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000000ll>>_const&__std::__2::chrono::duration<long_long__std::__2::ratio<1ll__1000000ll>>_const&_
                      i32.eqz
                      br_if $B12
                      call $emscripten_get_now
                      call $_emscripten_yield
                    end
                    local.get $l1
                    i32.const 32
                    i32.add
                    global.set $__stack_pointer
                    br $L8
                  end
                end
              end
              local.get $l3
              i32.const 32
              i32.add
              global.set $__stack_pointer
              local.get $l5
              i32.const 32
              i32.add
              global.set $__stack_pointer
              local.get $l13
              i64.const 1
              i64.atomic.rmw.sub
              drop
              i32.const 0
              br $B4
            end
            local.get $l19
            i64.const 4001
            i64.lt_s
            br_if $B5
            call $emscripten_get_now
            call $_emscripten_yield
          end
          local.get $l1
        end
        i32.eqz
        br_if $L1
      end
    end
  end
  local.get $l4
  i32.const 32
  i32.add
  global.set $__stack_pointer)

I know there is at least emscripten/atomic.h which uses Clang builtins to get native atomic wait/notify, but it would be nice to be able to use those optimised instructions via the standard library too.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions