-
Notifications
You must be signed in to change notification settings - Fork 15.6k
[AArch64][SVE] Upgrade PTRUE patterns to ALL when they match vector length. #172993
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][SVE] Upgrade PTRUE patterns to ALL when they match vector length. #172993
Conversation
When the number of active elements of a PTRUE pattern matches the scalable vector length, we can upgrade the pattern to ALL. This enables CSE with similar PTRUEs as well as other simplifications. There was similar logic in `getPredicateForFixedLengthVector`, which should no longer be needed with this change. I believe this change also makes the VLS handling in `isAllActivePredicate` redundant as I'm not aware of any PTRUEs it may match which are not created through `getPTrue`, but I left it as is in case that's not true. If others agree that this code should now be dead, I'm happy to remove it as well.
|
@llvm/pr-subscribers-backend-aarch64 Author: Ricardo Jesus (rj-jesus) Changes: When the number of active elements of a PTRUE pattern matches the scalable vector length, we can upgrade the pattern to ALL. This enables CSE with similar PTRUEs as well as other simplifications. There was similar logic in `getPredicateForFixedLengthVector`, which should no longer be needed with this change. I believe this change also makes the VLS handling in `isAllActivePredicate` redundant. Full diff: https://github.com/llvm/llvm-project/pull/172993.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 837393b0cbdcd..5c3eb8ad20a20 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5894,6 +5894,17 @@ static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
int Pattern) {
if (Pattern == AArch64SVEPredPattern::all)
return DAG.getConstant(1, DL, VT);
+
+ // When the number of active elements of a pattern matches the scalable vector
+ // length, we can upgrade the pattern to ALL and emit a splat instead.
+ if (unsigned PatNumElts = getNumElementsFromSVEPredPattern(Pattern)) {
+ const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ unsigned NumElts = VT.getVectorMinNumElements();
+ unsigned VScale = Subtarget.getSVEVectorSizeInBits() / 128;
+ if (PatNumElts == (NumElts * VScale))
+ return DAG.getConstant(1, DL, VT);
+ }
+
return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
DAG.getTargetConstant(Pattern, DL, MVT::i32));
}
@@ -30326,16 +30337,6 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
getSVEPredPatternFromNumElements(VT.getVectorNumElements());
assert(PgPattern && "Unexpected element count for SVE predicate");
- // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
- // AArch64SVEPredPattern::all, which can enable the use of unpredicated
- // variants of instructions when available.
- const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
- unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
- unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
- if (MaxSVESize && MinSVESize == MaxSVESize &&
- MaxSVESize == VT.getSizeInBits())
- PgPattern = AArch64SVEPredPattern::all;
-
MVT MaskVT;
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
default:
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 879dd4c12c0ba..b77e90f6fdc45 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -453,7 +453,7 @@ entry:
define <vscale x 16 x i1> @lane_mask_nxv16i1_imm256() vscale_range(16, 16) {
; CHECK-LABEL: lane_mask_nxv16i1_imm256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ret
entry:
%active.lane.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 256)
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
index c3322ca38f9e5..d0026db0176e1 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -208,7 +208,7 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr %
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.d, vl8
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index 72d839a21a29f..2aef74a91c056 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -563,7 +563,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg128(<vscale x 16 x i8> %a, <vscale
define <vscale x 16 x i8> @splice_nxv16i8_neg256(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) vscale_range(16,16) #0 {
; CHECK-LABEL: splice_nxv16i8_neg256:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll
index a82998473fe68..4005e7d99400d 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll
@@ -110,7 +110,7 @@ define <vscale x 16 x i1> @whilele_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
define <vscale x 16 x i1> @whilele_b_vl_maximum() vscale_range(16, 16) {
; CHECK-LABEL: whilele_b_vl_maximum:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ret
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilele.nxv16i1.i64(i64 0, i64 255)
ret <vscale x 16 x i1> %out
@@ -245,7 +245,7 @@ define <vscale x 16 x i1> @whilelo_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
define <vscale x 16 x i1> @whilelo_b_vl_maximum() vscale_range(16, 16) {
; CHECK-LABEL: whilelo_b_vl_maximum:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ret
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 0, i64 256)
ret <vscale x 16 x i1> %out
@@ -370,7 +370,7 @@ define <vscale x 16 x i1> @whilels_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
define <vscale x 16 x i1> @whilels_b_ii_vl_maximum() vscale_range(16, 16) {
; CHECK-LABEL: whilels_b_ii_vl_maximum:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ret
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilels.nxv16i1.i64(i64 0, i64 255)
ret <vscale x 16 x i1> %out
@@ -504,7 +504,7 @@ define <vscale x 16 x i1> @whilelt_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
define <vscale x 16 x i1> @whilelt_b_ii_vl_maximum() vscale_range(16, 16) {
; CHECK-LABEL: whilelt_b_ii_vl_maximum:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ret
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelt.nxv16i1.i64(i64 0, i64 256)
ret <vscale x 16 x i1> %out
|
paulwalker-arm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree the matching code in isAllActivePredicate will be redundant, or can be made to be redundant. If you remove the code and there are no test differences then go for it.
When the number of active elements of a PTRUE pattern matches the scalable vector length, we can upgrade the pattern to ALL. This enables CSE with similar PTRUEs as well as other simplifications.
There was similar logic in `getPredicateForFixedLengthVector`, which I've removed as it should no longer be needed with this change. I believe this change also makes the VLS handling in `isAllActivePredicate` (link) redundant, as I'm not aware of any PTRUEs it may match that are not created through `getPTrue`, but I left it as is in case that's not true. If others agree that this code can be removed, I'm happy to do so as well.