Conversation

@rj-jesus (Contributor)

When the number of active elements of a PTRUE pattern matches the scalable vector length, we can upgrade the pattern to ALL. This enables CSE with similar PTRUEs as well as other simplifications.

There was similar logic in `getPredicateForFixedLengthVector`, which I've removed as it should no longer be needed with this change.

I believe this change also makes the VLS handling in `isAllActivePredicate` redundant, as I'm not aware of any PTRUEs it may match that are not created through `getPTrue`, but I left it as-is in case that's not true. If others agree that this code can be removed, I'm happy to do so as well.
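To make the arithmetic concrete, here is a minimal standalone sketch of the upgrade check, mirroring the condition added to `getPTrue` in the diff below. The helper name `canUpgradeToAll` and its freestanding form are illustrative only, not the actual LLVM API:

```c++
#include <cstdio>

// Illustrative sketch (not the LLVM API): a PTRUE pattern with PatNumElts
// active elements can be upgraded to ALL when it covers every element of the
// predicate, i.e. when PatNumElts == MinNumElts * vscale.
static bool canUpgradeToAll(unsigned PatNumElts, unsigned MinNumElts,
                            unsigned SVEVectorSizeInBits) {
  // Each 128-bit granule of the scalable register holds MinNumElts elements.
  unsigned VScale = SVEVectorSizeInBits / 128;
  return PatNumElts == MinNumElts * VScale;
}

int main() {
  // vscale_range(16, 16) fixes the register at 2048 bits, so vscale == 16.
  // For a <vscale x 16 x i1> predicate, vl256 activates 16 * 16 == 256 lanes,
  // i.e. the whole register, so "ptrue p0.b, vl256" is just "ptrue p0.b".
  std::printf("%d\n", canUpgradeToAll(/*PatNumElts=*/256, /*MinNumElts=*/16,
                                      /*SVEVectorSizeInBits=*/2048)); // 1
  return 0;
}
```

Once both forms lower to the same splat-of-1 constant, previously distinct PTRUE nodes become identical and can be CSE'd, which is exactly why the test updates below drop the `vl256` suffix.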

@llvmbot (Member)

llvmbot commented Dec 19, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Ricardo Jesus (rj-jesus)

Full diff: https://github.com/llvm/llvm-project/pull/172993.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+11-10)
  • (modified) llvm/test/CodeGen/AArch64/active_lane_mask.ll (+1-1)
  • (modified) llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll (+1-1)
  • (modified) llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll (+1-1)
  • (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll (+4-4)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 837393b0cbdcd..5c3eb8ad20a20 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5894,6 +5894,17 @@ static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
                                int Pattern) {
   if (Pattern == AArch64SVEPredPattern::all)
     return DAG.getConstant(1, DL, VT);
+
+  // When the number of active elements of a pattern matches the scalable vector
+  // length, we can upgrade the pattern to ALL and emit a splat instead.
+  if (unsigned PatNumElts = getNumElementsFromSVEPredPattern(Pattern)) {
+    const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+    unsigned NumElts = VT.getVectorMinNumElements();
+    unsigned VScale = Subtarget.getSVEVectorSizeInBits() / 128;
+    if (PatNumElts == (NumElts * VScale))
+      return DAG.getConstant(1, DL, VT);
+  }
+
   return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
                      DAG.getTargetConstant(Pattern, DL, MVT::i32));
 }
@@ -30326,16 +30337,6 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
       getSVEPredPatternFromNumElements(VT.getVectorNumElements());
   assert(PgPattern && "Unexpected element count for SVE predicate");
 
-  // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
-  // AArch64SVEPredPattern::all, which can enable the use of unpredicated
-  // variants of instructions when available.
-  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
-  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
-  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
-  if (MaxSVESize && MinSVESize == MaxSVESize &&
-      MaxSVESize == VT.getSizeInBits())
-    PgPattern = AArch64SVEPredPattern::all;
-
   MVT MaskVT;
   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
   default:
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 879dd4c12c0ba..b77e90f6fdc45 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -453,7 +453,7 @@ entry:
 define <vscale x 16 x i1> @lane_mask_nxv16i1_imm256() vscale_range(16, 16) {
 ; CHECK-LABEL: lane_mask_nxv16i1_imm256:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    ret
 entry:
   %active.lane.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 256)
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
index c3322ca38f9e5..d0026db0176e1 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -208,7 +208,7 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr %
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x1]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index 72d839a21a29f..2aef74a91c056 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -563,7 +563,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg128(<vscale x 16 x i8> %a, <vscale
 define <vscale x 16 x i8> @splice_nxv16i8_neg256(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) vscale_range(16,16) #0 {
 ; CHECK-LABEL: splice_nxv16i8_neg256:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll
index a82998473fe68..4005e7d99400d 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll
@@ -110,7 +110,7 @@ define <vscale x 16 x i1> @whilele_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
 define <vscale x 16 x i1> @whilele_b_vl_maximum() vscale_range(16, 16) {
 ; CHECK-LABEL: whilele_b_vl_maximum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    ret
   %out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilele.nxv16i1.i64(i64 0, i64 255)
   ret <vscale x 16 x i1> %out
@@ -245,7 +245,7 @@ define <vscale x 16 x i1> @whilelo_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
 define <vscale x 16 x i1> @whilelo_b_vl_maximum() vscale_range(16, 16) {
 ; CHECK-LABEL: whilelo_b_vl_maximum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    ret
   %out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 0, i64 256)
   ret <vscale x 16 x i1> %out
@@ -370,7 +370,7 @@ define <vscale x 16 x i1> @whilels_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
 define <vscale x 16 x i1> @whilels_b_ii_vl_maximum() vscale_range(16, 16) {
 ; CHECK-LABEL: whilels_b_ii_vl_maximum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    ret
   %out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilels.nxv16i1.i64(i64 0, i64 255)
   ret <vscale x 16 x i1> %out
@@ -504,7 +504,7 @@ define <vscale x 16 x i1> @whilelt_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
 define <vscale x 16 x i1> @whilelt_b_ii_vl_maximum() vscale_range(16, 16) {
 ; CHECK-LABEL: whilelt_b_ii_vl_maximum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    ret
   %out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelt.nxv16i1.i64(i64 0, i64 256)
   ret <vscale x 16 x i1> %out

@paulwalker-arm (Collaborator) left a comment

I agree the matching code in `isAllActivePredicate` will be redundant, or can be made to be redundant. If you remove the code and there are no test differences, then go for it.

@rj-jesus merged commit b047a0e into llvm:main on Dec 22, 2025
10 checks passed
@rj-jesus deleted the rjj/aarch64-sve-simplify-ptrue-vl branch on December 22, 2025 at 12:24