Description
Given the following reduced test case (`reduced.ll`):

```llvm
target triple = "armv4t-unknown-linux-gnueabi"
@scpart_scan_partmap_offs = external global i64
define void @scpart_scan_partmap() {
entry:
br label %while.body.preheader
while.body.preheader: ; preds = %entry
br label %while.body
while.body: ; preds = %while.body, %while.body.preheader
%add13 = phi i64 [ %add, %while.body ], [ undef, %while.body.preheader ]
%add = add nsw i64 %add13, 12
%cmp = icmp slt i64 %add13, 0
br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge
while.cond.while.end_crit_edge: ; preds = %while.body
%add.lcssa = phi i64 [ %add13, %while.body ]
store i64 %add.lcssa, ptr null, align 8
ret void
}
```
Running `opt -passes=indvars reduced.ll -S` on it produces:
```llvm
source_filename = "reduced.ll"
target triple = "armv4t-unknown-linux-gnueabi"
@scpart_scan_partmap_offs = external global i64
define void @scpart_scan_partmap() {
entry:
br label %while.body.preheader
while.body.preheader: ; preds = %entry
%smax = call i64 @llvm.smax.i64(i64 undef, i64 0)
%0 = sub i64 %smax, undef
%umin = call i64 @llvm.umin.i64(i64 %0, i64 1)
%1 = sub i64 %0, %umin
%2 = udiv i64 %1, 12
%3 = add i64 %umin, %2
%4 = mul i64 %3, 12
br label %while.body
while.body: ; preds = %while.body, %while.body.preheader
br i1 false, label %while.body, label %while.cond.while.end_crit_edge
while.cond.while.end_crit_edge: ; preds = %while.body
%5 = add i64 %4, undef
store i64 %5, ptr null, align 8
ret void
}
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare i64 @llvm.smax.i64(i64, i64) #0
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare i64 @llvm.umin.i64(i64, i64) #0
attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
```
Unfortunately, the `udiv` we got from SCEV is going to be lowered to a relatively expensive libcall to `__aeabi_uldivmod` (32-bit x86 is also affected; there the libcall is `__udivdi3`). So we may have eliminated the loop body, but we traded it for a relatively expensive libcall.
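For illustration (a hypothetical standalone snippet, not part of the reduction): any i64 unsigned division on this target takes the libcall path, even division by a constant like the 12 that indvars introduced, since there is no 64-bit divide (and no wide multiply-high to rewrite it with) on armv4t.

```cpp
// Hypothetical example, not from the original report: a 64-bit unsigned
// divide on a 32-bit ARM target without hardware divide is lowered to a
// call to the EABI runtime helper rather than inline instructions, e.g.
//   clang --target=armv4t-unknown-linux-gnueabi -O2 -S div.cpp
// emits a call to __aeabi_uldivmod for the division below.
extern "C" unsigned long long div_by_12(unsigned long long n) {
  return n / 12; // same shape as the `udiv i64 %1, 12` indvars created
}
```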
Forked from ClangBuiltLinux/linux#1635. ClangBuiltLinux/linux#1635 (comment) has the reduced C code, and demonstrates GCC keeping the tight loop:
```
24: 04 f0 9d a4 ldrge pc, [sp], #4
28: 0c 30 93 e2 adds r3, r3, #12
2c: 00 20 a2 e2 adc r2, r2, #0
30: 01 00 53 e1 cmp r3, r1
34: 00 c0 d2 e0 sbcs r12, r2, r0
38: fa ff ff ba blt 0x28 <scpart_scan_partmap+0x28> @ imm = #-24
```
`llvm::rewriteLoopExitValues` in llvm/lib/Transforms/Utils/LoopUtils.cpp tries to replace the loop body regardless of cost if the loop can be deleted. I'm not sure whether we could exclude such transforms when the SCEV contains a udiv wider than the target word size, or when the target doesn't support division in hardware (`ARMSubtarget::hasDivideInARMMode()`), or something more generic?
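As a rough, untested sketch (hypothetical code, not anything that exists in LoopUtils.cpp today; it only assumes the existing `SCEVExprContains` helper and `DataLayout`), the kind of guard I have in mind is:

```cpp
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/DataLayout.h"

using namespace llvm;

// Return true if expanding this exit value would materialize a udiv wider
// than the target's largest legal integer type, i.e. something that will
// almost certainly become a libcall (__aeabi_uldivmod, __udivdi3, ...).
static bool containsWideUDiv(const SCEV *ExitValue, const DataLayout &DL) {
  return SCEVExprContains(ExitValue, [&](const SCEV *S) {
    if (const auto *UDiv = dyn_cast<SCEVUDivExpr>(S))
      return UDiv->getType()->getScalarSizeInBits() >
             DL.getLargestLegalIntTypeSizeInBits();
    return false;
  });
}
```

Consulting the target's actual division support (along the lines of `ARMSubtarget::hasDivideInARMMode()`, or some TTI cost query) would be more general, but even the width check above would catch the i64 udiv on these 32-bit targets.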
See also https://reviews.llvm.org/D9800 / e2538b5.