
[mlir][llvm][OpenMP] Hoist __atomic_load alloca #132888


Merged: 2 commits, Apr 9, 2025

Conversation

@NimishMishra (Contributor) commented Mar 25, 2025:

The current implementation of __atomic_compare_exchange uses an alloca for __atomic_load, leading to issues such as #120724. This PR hoists that alloca to AllocaIP.

Fixes: #120724

@llvmbot (Member) commented Mar 25, 2025:

@llvm/pr-subscribers-mlir-openmp
@llvm/pr-subscribers-flang-openmp
@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-llvm

Author: None (NimishMishra)

Changes

Hoist the alloca used in __atomic_load to the entry basic block of the parent function. This prevents the alloca from being wrapped in constructs (such as loops) surrounding the __atomic_load.

Fixes: #120724


Full diff: https://github.com/llvm/llvm-project/pull/132888.diff

3 Files Affected:

  • (modified) flang/test/Integration/OpenMP/atomic-capture-complex.f90 (+1-1)
  • (modified) llvm/lib/Frontend/Atomic/Atomic.cpp (+6)
  • (modified) mlir/test/Target/LLVMIR/openmp-llvm.mlir (+4-4)
diff --git a/flang/test/Integration/OpenMP/atomic-capture-complex.f90 b/flang/test/Integration/OpenMP/atomic-capture-complex.f90
index 4ffd18097d79e..69390427ff1ff 100644
--- a/flang/test/Integration/OpenMP/atomic-capture-complex.f90
+++ b/flang/test/Integration/OpenMP/atomic-capture-complex.f90
@@ -9,6 +9,7 @@
 !RUN: %if x86-registered-target %{ %flang_fc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fopenmp %s -o - | FileCheck --check-prefixes=CHECK,X86 %s %}
 !RUN: %if aarch64-registerd-target %{ %flang_fc1 -triple aarch64-unknown-linux-gnu -emit-llvm -fopenmp %s -o - | FileCheck --check-prefixes=CHECK,AARCH64 %s %}
 
+!CHECK: %[[ATOMIC_TEMP_LOAD:.*]] = alloca { float, float }, align 8
 !CHECK: %[[X_NEW_VAL:.*]] = alloca { float, float }, align 8
 !CHECK: %[[VAL_1:.*]] = alloca { float, float }, i64 1, align 8
 !CHECK: %[[ORIG_VAL:.*]] = alloca { float, float }, i64 1, align 8
@@ -16,7 +17,6 @@
 !CHECK: br label %entry
 
 !CHECK: entry:
-!CHECK: %[[ATOMIC_TEMP_LOAD:.*]] = alloca { float, float }, align 8
 !CHECK: call void @__atomic_load(i64 8, ptr %[[ORIG_VAL]], ptr %[[ATOMIC_TEMP_LOAD]], i32 0)
 !CHECK: %[[PHI_NODE_ENTRY_1:.*]] = load { float, float }, ptr %[[ATOMIC_TEMP_LOAD]], align 8
 !CHECK: br label %.atomic.cont
diff --git a/llvm/lib/Frontend/Atomic/Atomic.cpp b/llvm/lib/Frontend/Atomic/Atomic.cpp
index c9f9a9dcfb702..db9f1b96e4aa3 100644
--- a/llvm/lib/Frontend/Atomic/Atomic.cpp
+++ b/llvm/lib/Frontend/Atomic/Atomic.cpp
@@ -118,8 +118,14 @@ AtomicInfo::EmitAtomicLoadLibcall(AtomicOrdering AO) {
   Value *PtrVal = getAtomicPointer();
   PtrVal = Builder->CreateAddrSpaceCast(PtrVal, PointerType::getUnqual(Ctx));
   Args.push_back(PtrVal);
+
+  auto CurrentIP = Builder->saveIP();
+  BasicBlock &InsertBB =
+      Builder->GetInsertBlock()->getParent()->getEntryBlock();
+  Builder->SetInsertPoint(&InsertBB, InsertBB.getFirstInsertionPt());
   AllocaInst *AllocaResult =
       CreateAlloca(Ty, getAtomicPointer()->getName() + "atomic.temp.load");
+  Builder->restoreIP(CurrentIP);
   const Align AllocaAlignment = DL.getPrefTypeAlign(SizedIntTy);
   AllocaResult->setAlignment(AllocaAlignment);
   Args.push_back(AllocaResult);
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index d7f4d0a65b24c..47bb642518811 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -1368,6 +1368,7 @@ llvm.func @omp_atomic_read(%arg0 : !llvm.ptr, %arg1 : !llvm.ptr) -> () {
 
 // CHECK-LABEL: @omp_atomic_read_implicit_cast
 llvm.func @omp_atomic_read_implicit_cast () {
+//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = alloca { float, float }, align 8
 //CHECK: %[[Z:.*]] = alloca float, i64 1, align 4
 //CHECK: %[[Y:.*]] = alloca double, i64 1, align 8
 //CHECK: %[[X:.*]] = alloca [2 x { float, float }], i64 1, align 8
@@ -1392,7 +1393,6 @@ llvm.func @omp_atomic_read_implicit_cast () {
   %16 = llvm.mul %10, %9 overflow<nsw> : i64
   %17 = llvm.getelementptr %5[%15] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(f32, f32)>
 
-//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = alloca { float, float }, align 8
 //CHECK: call void @__atomic_load(i64 8, ptr %[[X_ELEMENT]], ptr %[[ATOMIC_LOAD_TEMP]], i32 0)
 //CHECK: %[[LOAD:.*]] = load { float, float }, ptr %[[ATOMIC_LOAD_TEMP]], align 8
 //CHECK: %[[EXT:.*]] = extractvalue { float, float } %[[LOAD]], 0
@@ -1480,6 +1480,7 @@ llvm.func @omp_atomic_update(%x:!llvm.ptr, %expr: i32, %xbool: !llvm.ptr, %exprb
 
 // -----
 
+//CHECK: %[[ATOMIC_TEMP_LOAD:.*]] = alloca { float, float }, align 8
 //CHECK: %[[X_NEW_VAL:.*]] = alloca { float, float }, align 8
 //CHECK: {{.*}} = alloca { float, float }, i64 1, align 8
 //CHECK: %[[ORIG_VAL:.*]] = alloca { float, float }, i64 1, align 8
@@ -1487,7 +1488,6 @@ llvm.func @omp_atomic_update(%x:!llvm.ptr, %expr: i32, %xbool: !llvm.ptr, %exprb
 //CHECK: br label %entry
 
 //CHECK: entry:
-//CHECK: %[[ATOMIC_TEMP_LOAD:.*]] = alloca { float, float }, align 8
 //CHECK: call void @__atomic_load(i64 8, ptr %[[ORIG_VAL]], ptr %[[ATOMIC_TEMP_LOAD]], i32 0)
 //CHECK: %[[PHI_NODE_ENTRY_1:.*]] = load { float, float }, ptr %[[ATOMIC_TEMP_LOAD]], align 8
 //CHECK: br label %.atomic.cont
@@ -1532,6 +1532,7 @@ llvm.func @_QPomp_atomic_update_complex() {
 
 // -----
 
+//CHECK: %[[ATOMIC_TEMP_LOAD:.*]] = alloca { float, float }, align 8
 //CHECK: %[[X_NEW_VAL:.*]] = alloca { float, float }, align 8
 //CHECK: %[[VAL_1:.*]] = alloca { float, float }, i64 1, align 8
 //CHECK: %[[ORIG_VAL:.*]] = alloca { float, float }, i64 1, align 8
@@ -1539,7 +1540,6 @@ llvm.func @_QPomp_atomic_update_complex() {
 //CHECK: br label %entry
 
 //CHECK: entry:							; preds = %0
-//CHECK: %[[ATOMIC_TEMP_LOAD:.*]] = alloca { float, float }, align 8
 //CHECK: call void @__atomic_load(i64 8, ptr %[[ORIG_VAL]], ptr %[[ATOMIC_TEMP_LOAD]], i32 0)
 //CHECK: %[[PHI_NODE_ENTRY_1:.*]] = load { float, float }, ptr %[[ATOMIC_TEMP_LOAD]], align 8
 //CHECK: br label %.atomic.cont
@@ -1597,9 +1597,9 @@ llvm.func @_QPomp_atomic_capture_complex() {
 // CHECK-LABEL: define void @omp_atomic_read_complex() {
 llvm.func @omp_atomic_read_complex(){
 
+// CHECK: %[[ATOMIC_TEMP_LOAD:.*]] = alloca { float, float }, align 8
 // CHECK: %[[a:.*]] = alloca { float, float }, i64 1, align 8
 // CHECK: %[[b:.*]] = alloca { float, float }, i64 1, align 8
-// CHECK: %[[ATOMIC_TEMP_LOAD:.*]] = alloca { float, float }, align 8
 // CHECK: call void @__atomic_load(i64 8, ptr %[[b]], ptr %[[ATOMIC_TEMP_LOAD]], i32 0)
 // CHECK: %[[LOADED_VAL:.*]] = load { float, float }, ptr %[[ATOMIC_TEMP_LOAD]], align 8
 // CHECK: store { float, float } %[[LOADED_VAL]], ptr %[[a]], align 4

@kiranchandramohan (Contributor) left a comment:

LG.

Please wait for @Meinersbur or @tblah.

@tblah (Contributor) left a comment:

Requesting changes because I don't think this alloca placement is safe in the presence of outlined OpenMP constructs.


auto CurrentIP = Builder->saveIP();
BasicBlock &InsertBB =
Builder->GetInsertBlock()->getParent()->getEntryBlock();
@tblah (Contributor) commented:

This will put the alloca too far out if it is used in the presence of OpenMP constructs that are moved into outlined functions (e.g. parallel, task, target). The rough order of events is:

  1. All code is generated inside of the same function (this is where EmitAtomicLoadLibcall runs)
  2. Each outlined region is moved to its own function

This means that the entry block here is the original entry block of the function that contains the outlined operation, not the entry block of the final outlined function.

For example, see the code generated for #120724 before and after this patch. The alloca is moved to the entry block of _QQmain and then this is passed into the parallel region like a shared variable (which could produce incorrect results if multiple threads try to use the shared alloca at the same time).

There are a few possible solutions here

  1. Keep the alloca in the same place as before this patch but add stack save/restore operations so that the stack usage remains constant no matter how many times this is executed
  2. OpenMPIRBuilder usually has a variable describing the correct insertion point for allocas, that could be passed to here
  3. Have some pass that runs after all of the functions have been outlined by OpenMPIRBuilder which tries to move allocas to the entry blocks of those functions

Of these, I think (1) would be easiest to implement. It will have some overhead but personally I would be happy to come back to that and try a more complex solution only if that overhead turns out to be a problem in practice.

@tblah (Contributor) commented:

Posting the diff for #120724 to make the above explanation clearer:

--- /tmp/old.ll	2025-03-26 11:04:27.390461691 +0000
+++ /tmp/new.ll	2025-03-26 11:36:09.594270400 +0000
@@ -13,7 +13,8 @@
 @2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @0 }, align 8
 
 define void @_QQmain() {
-  %structArg = alloca { ptr }, align 8
+  %structArg = alloca { ptr, ptr }, align 8
+  %atomic.temp.load = alloca { float, float }, align 8
   %1 = alloca i32, i64 1, align 4
   %2 = alloca { float, float }, i64 1, align 8
   %3 = alloca i32, i64 1, align 4
@@ -27,8 +28,10 @@
   br label %omp_parallel
 
 omp_parallel:                                     ; preds = %entry
-  %gep_ = getelementptr { ptr }, ptr %structArg, i32 0, i32 0
+  %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
   store ptr %2, ptr %gep_, align 8
+  %gep_atomic.temp.load = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
+  store ptr %atomic.temp.load, ptr %gep_atomic.temp.load, align 8
   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @_QQmain..omp_par, ptr %structArg)
   br label %omp.par.exit
 
@@ -45,8 +48,10 @@
 ; Function Attrs: nounwind
 define internal void @_QQmain..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
 omp.par.entry:
-  %gep_ = getelementptr { ptr }, ptr %0, i32 0, i32 0
+  %gep_ = getelementptr { ptr, ptr }, ptr %0, i32 0, i32 0
   %loadgep_ = load ptr, ptr %gep_, align 8
+  %gep_atomic.temp.load = getelementptr { ptr, ptr }, ptr %0, i32 0, i32 1
+  %loadgep_atomic.temp.load = load ptr, ptr %gep_atomic.temp.load, align 8
   %p.lastiter = alloca i32, align 4
   %p.lowerbound = alloca i32, align 4
   %p.upperbound = alloca i32, align 4
@@ -123,9 +128,8 @@
 
 omp.loop_nest.region:                             ; preds = %omp_loop.body
   store i32 %8, ptr %omp.private.alloc, align 4
-  %atomic.temp.load = alloca { float, float }, align 8
-  call void @__atomic_load(i64 8, ptr %loadgep_, ptr %atomic.temp.load, i32 0)
-  %9 = load { float, float }, ptr %atomic.temp.load, align 8
+  call void @__atomic_load(i64 8, ptr %loadgep_, ptr %loadgep_atomic.temp.load, i32 0)
+  %9 = load { float, float }, ptr %loadgep_atomic.temp.load, align 8
   br label %.atomic.cont
 
 .atomic.cont:                                     ; preds = %.atomic.cont, %omp.loop_nest.region
@@ -137,8 +141,8 @@
   %15 = insertvalue { float, float } undef, float %13, 0
   %16 = insertvalue { float, float } %15, float %14, 1
   store { float, float } %16, ptr %x.new.val, align 4
-  %17 = call i1 @__atomic_compare_exchange(i64 8, ptr %loadgep_, ptr %atomic.temp.load, ptr %x.new.val, i32 2, i32 2)
-  %18 = load { float, float }, ptr %atomic.temp.load, align 4
+  %17 = call i1 @__atomic_compare_exchange(i64 8, ptr %loadgep_, ptr %loadgep_atomic.temp.load, ptr %x.new.val, i32 2, i32 2)
+  %18 = load { float, float }, ptr %loadgep_atomic.temp.load, align 4
   br i1 %17, label %.atomic.exit, label %.atomic.cont
 
 .atomic.exit:                                     ; preds = %.atomic.cont
@@ -203,6 +207,6 @@
 
 !0 = !{i32 7, !"openmp", i32 11}
 !1 = !{i32 2, !"Debug Info Version", i32 3}
-!2 = !{!"flang version 21.0.0 ([email protected]:llvm/llvm-project.git 7967a6c7792af8fb7b8fe8957235e517839c4a96)"}
+!2 = !{!"flang version 21.0.0 ([email protected]:NimishMishra/f18-llvm-project.git 5753d49b7b989b0079b321c38576a1fc98ac6cbe)"}
 !3 = !{!4}
 !4 = !{i64 2, i64 -1, i64 -1, i1 true}

@NimishMishra (Author) replied:

This means that the entry block here is the original entry block of the function that contains the outlined operation not the entry block of the final outlined function.

Is that so? Thanks for pointing out. I thought the function outlining occurred before the codegen inside the function.

I'll then implement the stack save/restore solution.

@NimishMishra (Author) commented:

Hi @tblah,

I have enclosed the offending compare-exchange loop with a stack save/restore, and the segfault is now resolved in my testing.

Do you think we need the stack save/restore for atomic reads too (where there are no updates)? I could not reproduce such a problem, so I have not added the save/restore there for now. Do let me know if it is needed for atomic reads as well.

@tblah (Contributor) commented Mar 27, 2025:

Thanks for fixing this, but see Michael's comment.

I think this potentially needs fixing anywhere we are generating an alloca which we don't know for sure will end up in an entry block or otherwise outside of a long loop.

@NimishMishra (Author) replied:

I will make the changes Michael requested. Thanks.

@NimishMishra force-pushed the atomic_load_alloca_hoist branch from 5753d49 to 978e715 on March 27, 2025 at 12:26
@llvmbot added the clang:openmp (OpenMP related changes to Clang) label on Mar 27, 2025
@NimishMishra force-pushed the atomic_load_alloca_hoist branch from 978e715 to 2df1197 on March 27, 2025 at 12:27
@NimishMishra changed the title from "[mlir][llvm][OpenMP] Hoist __atomic_load alloca" to "[mlir][llvm][OpenMP] Emit llvm.stacksave and llvm.stackrestore in __atomic_compare_exchange" on Mar 27, 2025
@Meinersbur (Member) left a comment:

stacksave/stackrestore is for dynamically-sized allocas [1]. If you don't have dynamically-sized allocas, don't use them. Normal allocas must be emitted into the AllocaInsertPt (as done in #101966).

Footnotes

  1. And __builtin_alloca in a loop to build stack-based linked lists. Never seen this in the wild.

@NimishMishra (Author) replied:

stacksave/stackrestore is for dynamically-sized allocas [1]. If you don't have dynamically-sized allocas, don't use them. Normal allocas must be emitted into the AllocaInsertPt (as done in #101966).

Oh. Okay, understood. I'll revert the changes to the PR then.

Again, this is new info to me; could you explain why stack save/restore is not used for allocas whose size is known at compile time? I am assuming performance reasons, but is there something else too?

@tblah (Contributor) commented Apr 1, 2025:

Again, this is new info to me; could you explain why stack save/restore is not used for allocas whose size is known at compile time? I am assuming performance reasons, but is there something else too?

New to me too. I think this is done in other places in flang where there is no way to put allocas in the entry block.

@Meinersbur (Member) commented Apr 1, 2025:

Mem2Reg (and SROA) only ever consider allocas in the entry block. Other allocas will not be promoted to registers, resulting in a heavy performance penalty. For this reason the inliner will also move allocas from the callee into the caller's entry block. This is only impossible if the allocated size depends on computations in the caller (i.e. a variable-sized array), in which case the scope of the callee is surrounded by stacksave/stackrestore so the stack does not grow each time the callee would have been called, as apparently happens in #120724. There must be no unbounded stack growth within a function unless the user calls __builtin_alloca manually. Stack overflow due to recursion is already a known security issue; search for "stack probing".

You could technically have a statically-sized alloca outside the entry block if it is either surrounded by stacksave/stackrestore or guaranteed not to be in a loop, but optimization passes don't handle it well. When inserting stacksave/stackrestore, ensuring (post-)dominance is also difficult; StackColoring may complain or just miscompile your code. This was, for example, a bug in Polly, and I strongly suggest staying away from it if not necessary.

See also the discussion here: https://discourse.llvm.org/t/why-is-llvm-stacksave-necessary/45724

I think the AllocaInsertPt mechanism could have been designed a lot better, but here we are.

@tblah (Contributor) commented Apr 1, 2025:

Thank you for the detailed explanation Michael, that's very helpful for me.

@NimishMishra force-pushed the atomic_load_alloca_hoist branch from 63d3d76 to 82c11f7 on April 1, 2025 at 14:02
@NimishMishra (Author) commented:

Thanks @Meinersbur for the detailed explanation. It really helps.

The latest commit uses AllocaIP to hoist the __atomic_load alloca out of the compare-exchange loop. I checked the IR in the presence of enclosing constructs like parallel and no longer observe the issue @tblah pointed out (i.e. the alloca being hoisted to the entry block of the main function). Hope this is fine.

@NimishMishra changed the title from "[mlir][llvm][OpenMP] Emit llvm.stacksave and llvm.stackrestore in __atomic_compare_exchange" back to "[mlir][llvm][OpenMP] Hoist __atomic_load alloca" on Apr 1, 2025
@tblah (Contributor) left a comment:

LGTM. Thank you for all of the updates. Wait for approval from Michael before merging.

@Meinersbur the alloca still doesn't make it into the entry block when generating LLVMIR for the example in #120724, but I think that's a separate bug in OpenMPToLLVMIRTranslation.cpp beyond the scope of this PR. If I run the IR through opt -O3 the alloca is hoisted into the entry block. Is this okay with you?

@NimishMishra force-pushed the atomic_load_alloca_hoist branch from 82c11f7 to 629450c on April 7, 2025 at 11:14
@Meinersbur (Member) left a comment:

LGTM

@NimishMishra NimishMishra merged commit 53fa92d into llvm:main Apr 9, 2025
11 checks passed
@NimishMishra NimishMishra deleted the atomic_load_alloca_hoist branch April 9, 2025 10:02
AllinLeeYL pushed a commit to AllinLeeYL/llvm-project that referenced this pull request Apr 10, 2025
Current implementation of `__atomic_compare_exchange` uses an alloca for
`__atomic_load`, leading to issues like
llvm#120724. This PR hoists this
alloca to `AllocaIP`.


Fixes: llvm#120724
var-const pushed a commit to ldionne/llvm-project that referenced this pull request Apr 17, 2025
Current implementation of `__atomic_compare_exchange` uses an alloca for
`__atomic_load`, leading to issues like
llvm#120724. This PR hoists this
alloca to `AllocaIP`.


Fixes: llvm#120724
Labels: clang:openmp (OpenMP related changes to Clang), flang:openmp, flang (Flang issues not falling into any other category), mlir:llvm, mlir:openmp, mlir
Development

Successfully merging this pull request may close these issues.

[flang][openmp] crash in complex atomic
5 participants