forked from llvm/llvm-project
-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ARM] Disable UpperBound loop unrolling for MVE tail predicated loops. (
llvm#69709) For MVE tail predicated loops, better code can be generated by keeping the loop whole than by unrolling to an upper bound, which requires the expansion of active lane masks that can be difficult to generate good code for. This patch disables UpperBound unrolling when we find an active_lane_mask in the loop.
- Loading branch information
1 parent
03ec84a
commit 75b3c3d
Showing
2 changed files
with
88 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py | ||
; RUN: opt -passes=loop-unroll -S -mtriple thumbv8.1m.main-none-eabi -mattr=+mve %s | FileCheck %s | ||
|
||
; The vector loop here is better kept as a loop than conditionally unrolled, | ||
; letting it transform into a tail predicted loop. | ||
|
||
; The loop below narrows 16-bit source elements to 8 bits (lshr by 8, then
; trunc) under an active-lane mask. After -passes=loop-unroll the CHECK lines
; expect a single vector.body loop: no upper-bound (conditional) unrolling.
define void @unroll_upper(ptr noundef %pSrc, ptr nocapture noundef writeonly %pDst, i32 noundef %blockSize) { | ||
; CHECK-LABEL: @unroll_upper( | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: [[CMP_NOT23:%.*]] = icmp ult i32 [[BLOCKSIZE:%.*]], 16 | ||
; CHECK-NEXT: [[AND:%.*]] = and i32 [[BLOCKSIZE]], 15 | ||
; CHECK-NEXT: [[CMP6_NOT28:%.*]] = icmp eq i32 [[AND]], 0 | ||
; CHECK-NEXT: br i1 [[CMP6_NOT28]], label [[WHILE_END12:%.*]], label [[VECTOR_MEMCHECK:%.*]] | ||
; CHECK: vector.memcheck: | ||
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[AND]] | ||
; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[AND]], 1 | ||
; CHECK-NEXT: [[SCEVGEP32:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i32 [[TMP0]] | ||
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[PDST]], [[SCEVGEP32]] | ||
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PSRC]], [[SCEVGEP]] | ||
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] | ||
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[AND]], 7 | ||
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], 24 | ||
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] | ||
; CHECK: vector.body: | ||
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_MEMCHECK]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] | ||
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[INDEX]] | ||
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 | ||
; CHECK-NEXT: [[NEXT_GEP37:%.*]] = getelementptr i8, ptr [[PSRC]], i32 [[TMP1]] | ||
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[AND]]) | ||
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[NEXT_GEP37]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) | ||
; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[WIDE_MASKED_LOAD]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> | ||
; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> | ||
; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP3]], ptr [[NEXT_GEP]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]]) | ||
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 | ||
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] | ||
; CHECK-NEXT: br i1 [[TMP4]], label [[WHILE_END12_LOOPEXIT:%.*]], label [[VECTOR_BODY]] | ||
; CHECK: while.end12.loopexit: | ||
; CHECK-NEXT: br label [[WHILE_END12]] | ||
; CHECK: while.end12: | ||
; CHECK-NEXT: ret void | ||
; | ||
entry: | ||
%cmp.not23 = icmp ult i32 %blockSize, 16 | ||
%and = and i32 %blockSize, 15 | ||
%cmp6.not28 = icmp eq i32 %and, 0 | ||
br i1 %cmp6.not28, label %while.end12, label %vector.memcheck | ||
 | ||
vector.memcheck: ; preds = %entry | ||
%scevgep = getelementptr i8, ptr %pDst, i32 %and | ||
%0 = shl nuw nsw i32 %and, 1 | ||
%scevgep32 = getelementptr i8, ptr %pSrc, i32 %0 | ||
%bound0 = icmp ult ptr %pDst, %scevgep32 | ||
%bound1 = icmp ult ptr %pSrc, %scevgep | ||
%found.conflict = and i1 %bound0, %bound1 | ||
%n.rnd.up = add nuw nsw i32 %and, 7 | ||
%n.vec = and i32 %n.rnd.up, 24 | ||
br label %vector.body | ||
 | ||
vector.body: ; preds = %vector.body, %vector.memcheck | ||
%index = phi i32 [ 0, %vector.memcheck ], [ %index.next, %vector.body ] | ||
%next.gep = getelementptr i8, ptr %pDst, i32 %index | ||
%1 = shl i32 %index, 1 | ||
%next.gep37 = getelementptr i8, ptr %pSrc, i32 %1 | ||
; The get.active.lane.mask call below is what the unroller keys off: its
; presence marks a tail-predicatable loop, so UpperBound unrolling is skipped.
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %and) | ||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep37, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison) | ||
%2 = lshr <8 x i16> %wide.masked.load, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> | ||
%3 = trunc <8 x i16> %2 to <8 x i8> | ||
call void @llvm.masked.store.v8i8.p0(<8 x i8> %3, ptr %next.gep, i32 1, <8 x i1> %active.lane.mask) | ||
%index.next = add i32 %index, 8 | ||
%4 = icmp eq i32 %index.next, %n.vec | ||
br i1 %4, label %while.end12, label %vector.body | ||
 | ||
while.end12: ; preds = %vector.body, %entry | ||
ret void | ||
} | ||
|
||
; Declarations of the masking intrinsics called by @unroll_upper above.
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) | ||
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr nocapture, i32 immarg, <8 x i1>, <8 x i16>) | ||
declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr nocapture, i32 immarg, <8 x i1>) |