[AArch64] SimplifyDemandedBitsForTargetNode - add AArch64ISD::BICi handling #76644
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64

Author: Sizov Nikita (snikitav)

Changes

Fold BICi if all destination bits are already known to be zeroes.

define <8 x i16> @haddu_known(<8 x i8> %a0, <8 x i8> %a1) {
%x0 = zext <8 x i8> %a0 to <8 x i16>
%x1 = zext <8 x i8> %a1 to <8 x i16>
%hadd = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
%res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
ret <8 x i16> %res
}
declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
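
For context: AArch64ISD::BICi models the NEON "BIC (vector, immediate)" instruction. Judging from the operand accesses in this patch (operand order assumed from the code, not stated in the original description), it takes the input vector, an immediate, and a left-shift amount, computing per element roughly:

  // per-element pseudo-semantics of AArch64ISD::BICi (sketch)
  result[i] = Op0[i] & ~(Imm << Shift)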
Full diff: https://github.com/llvm/llvm-project/pull/76644.diff — 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 81facf92e55ae9..c79d3d8246daaf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3390,10 +3390,15 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = KnownBits::mulhs(Known, Known2);
break;
}
- case ISD::AVGCEILU: {
+ case ISD::AVGFLOORU:
+ case ISD::AVGCEILU:
+ case ISD::AVGFLOORS:
+ case ISD::AVGCEILS: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
- Known = Known.zext(BitWidth + 1);
+ Known = (Opcode == ISD::AVGFLOORU || Opcode == ISD::AVGCEILU)
+ ? Known.zext(BitWidth + 1)
+ : Known.sext(BitWidth + 1);
Known2 = Known2.zext(BitWidth + 1);
KnownBits One = KnownBits::makeConstant(APInt(1, 1));
Known = KnownBits::computeForAddCarry(Known, Known2, One);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dffe69bdb900db..c5f906aa7473f7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23672,6 +23672,18 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
return R;
return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
+ case AArch64ISD::BICi: {
+ KnownBits Known;
+ APInt DemandedElts(32, N->getValueType(0).getVectorNumElements());
+ APInt EltSize(32, N->getValueType(0).getScalarSizeInBits());
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (SimplifyDemandedBitsForTargetNode(SDValue(N, 0), EltSize, DemandedElts,
+ Known, TLO, 0)) {
+ return TLO.New;
+ }
+ break;
+ }
case ISD::XOR:
return performXorCombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
@@ -26658,6 +26670,18 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
// used - simplify to just Val.
return TLO.CombineTo(Op, ShiftR->getOperand(0));
}
+ case AArch64ISD::BICi: {
+ // Fold BICi if all destination bits already known to be zeroed
+ SDValue Op0 = Op.getOperand(0);
+ KnownBits KnownOp0 = TLO.DAG.computeKnownBits(Op0, 0);
+ APInt Shift = Op.getConstantOperandAPInt(2);
+ APInt Op1Val = Op.getConstantOperandAPInt(1);
+ APInt BitsToClear = Op1Val.shl(Shift).zextOrTrunc(KnownOp0.getBitWidth());
+ APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
+ if (AlreadyZeroedBitsToClear == BitsToClear)
+ return TLO.CombineTo(Op, Op0);
+ return false;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
if (auto ElementSize = IsSVECntIntrinsic(Op)) {
unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-bici.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-bici.ll
new file mode 100644
index 00000000000000..55ac9c2f1b4075
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-bici.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-neon < %s | FileCheck %s
+
+declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @haddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: haddu_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %x0 = zext <8 x i8> %a0 to <8 x i16>
+ %x1 = zext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511,i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @rhaddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: rhaddu_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %x0 = zext <8 x i8> %a0 to <8 x i16>
+ %x1 = zext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @hadds_zext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: hadds_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %x0 = zext <8 x i8> %a0 to <8 x i16>
+ %x1 = zext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @shaddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: shaddu_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %x0 = zext <8 x i8> %a0 to <8 x i16>
+ %x1 = zext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+; negative tests
+
+define <8 x i16> @haddu_sext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: haddu_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #254, lsl #8
+; CHECK-NEXT: ret
+ %x0 = sext <8 x i8> %a0 to <8 x i16>
+ %x1 = sext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511,i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @urhadd_sext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: urhadd_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #254, lsl #8
+; CHECK-NEXT: ret
+ %x0 = sext <8 x i8> %a0 to <8 x i16>
+ %x1 = sext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511,i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @hadds_sext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: hadds_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #254, lsl #8
+; CHECK-NEXT: ret
+ %x0 = sext <8 x i8> %a0 to <8 x i16>
+ %x1 = sext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @shaddu_sext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: shaddu_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #254, lsl #8
+; CHECK-NEXT: ret
+ %x0 = sext <8 x i8> %a0 to <8 x i16>
+ %x1 = sext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
case AArch64ISD::BICi: {
  KnownBits Known;
  APInt DemandedElts(32, N->getValueType(0).getVectorNumElements());
  APInt EltSize(32, N->getValueType(0).getScalarSizeInBits());
Why is this hardcoded to 32?
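For reference, DemandedElts is conventionally a per-element bitmask rather than an element count, so the usual construction would be something like (a sketch of the convention, not the committed fix):

  // one demanded bit per vector element
  APInt DemandedElts = APInt::getAllOnes(N->getValueType(0).getVectorNumElements());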
  APInt EltSize(32, N->getValueType(0).getScalarSizeInBits());
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  if (SimplifyDemandedBitsForTargetNode(SDValue(N, 0), EltSize, DemandedElts,
Use TLI.SimplifyDemandedBits here
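A sketch of that suggestion, following the pattern used by neighbouring combines (assumed shape, not the committed code):

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedBits =
      APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
  // The generic wrapper recurses into SimplifyDemandedBitsForTargetNode
  // and handles CommitTargetLoweringOpt for us.
  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedBits, DCI))
    return SDValue(N, 0);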
  SDValue Op0 = Op.getOperand(0);
  KnownBits KnownOp0 = TLO.DAG.computeKnownBits(Op0, 0);
  APInt Shift = Op.getConstantOperandAPInt(2);
  APInt Op1Val = Op.getConstantOperandAPInt(1);
Just use getConstantOperandVal for these?
  APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
  if (AlreadyZeroedBitsToClear == BitsToClear)
    return TLO.CombineTo(Op, Op0);
  return false;
Return the calculated KnownBits (copy the AArch64TargetLowering::computeKnownBitsForTargetNode implementation).
case AArch64ISD::BICi: {
  // Fold BICi if all destination bits already known to be zeroed
  SDValue Op0 = Op.getOperand(0);
  KnownBits KnownOp0 = TLO.DAG.computeKnownBits(Op0, 0);
Use the computeKnownBits variant with OriginalDemandedElts
Don't reset the recursion depth - it must be Depth + 1
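Combining those two remarks, the call would presumably become something like:

  KnownBits KnownOp0 =
      TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);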
@snikitav reverse-ping. Are you still looking at this?
Yes. Sorry, been distracted. Will come up with a patch this weekend.
Ideally we need to separate the ISD::AVG knownbits handling into a followup patch - @davemgreen can you suggest any simpler tests that will create BICi nodes that the target node KnownBits handling could show?
  Known = Known.zext(BitWidth + 1);
  Known = (Opcode == ISD::AVGFLOORU || Opcode == ISD::AVGCEILU)
              ? Known.zext(BitWidth + 1)
              : Known.sext(BitWidth + 1);
  Known2 = Known2.zext(BitWidth + 1);
  KnownBits One = KnownBits::makeConstant(APInt(1, 1));
  Known = KnownBits::computeForAddCarry(Known, Known2, One);
Floor variants don't add the One:
bool IsCeil = (Opcode == ISD::AVGCEILU || Opcode == ISD::AVGCEILS);
KnownBits Carry = KnownBits::makeConstant(APInt(1, IsCeil ? 1 : 0));
Known = KnownBits::computeForAddCarry(Known, Known2, Carry);
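Putting the signedness and carry suggestions together, the whole case might look roughly like this (a sketch, not the committed code):

  case ISD::AVGFLOORU:
  case ISD::AVGCEILU:
  case ISD::AVGFLOORS:
  case ISD::AVGCEILS: {
    bool IsSigned = Opcode == ISD::AVGFLOORS || Opcode == ISD::AVGCEILS;
    bool IsCeil = Opcode == ISD::AVGCEILU || Opcode == ISD::AVGCEILS;
    Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
    Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
    // Extend by one bit, matching signedness, so the add below cannot wrap.
    Known = IsSigned ? Known.sext(BitWidth + 1) : Known.zext(BitWidth + 1);
    Known2 = IsSigned ? Known2.sext(BitWidth + 1) : Known2.zext(BitWidth + 1);
    // Ceil variants round up, modelled as a carry-in of one.
    KnownBits Carry = KnownBits::makeConstant(APInt(1, IsCeil ? 1 : 0));
    Known = KnownBits::computeForAddCarry(Known, Known2, Carry);
    // Halve the result and drop back to the original width.
    Known = Known.extractBits(BitWidth, 1);
    break;
  }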
declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
These need pre-committing to trunk so the patch shows the codegen diff
Should I make a separate PR for this test?
Yeah that sounds good, if you don't have access to push them directly. Hopefully it is a simple one to get committed either way.
Also, please can you update the title of the patch to be more descriptive? Does this sound OK? "[AArch64] SimplifyDemandedBitsForTargetNode - add AArch64ISD::BICi handling"
I'm not sure. Considering where they are created, it will be common for simple cases to already be optimized. Maybe they could be added to llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp? I agree it would be good if there were separate PRs for the BIC vs RADD parts.
Please consult FoldValue in SelectionDAG.cpp which has the full evaluation for each of the AVG nodes
  Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
  Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
  Known = Known.zext(BitWidth + 1);
  Known = IsSigned ? Known.sext(BitWidth + 1) : Known.zext(BitWidth + 1);
  Known2 = Known2.zext(BitWidth + 1);
Known2 = IsSigned ? Known2.sext(BitWidth + 1) : Known2.zext(BitWidth + 1);
BTW, shouldn't we sext/zext(BitWidth + 2) for Ceil versions?
Good point - we don't do this in FoldValue either
Actually, this seems fine as we don't care about the MSB: https://alive2.llvm.org/ce/z/iQdZVc. Panic over.
BTW, shouldn't we sext/zext(BitWidth + 2) for Ceil versions?
It's not necessary even though the Ceil versions add 1 for carry. E.g. with 8 bits the maximum you could get would be 0xFF + 0xFF + 1 which is 0x1FF which still fits in 9 bits - you don't need 10 bits.
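A quick sanity check of that bound:

  // Worst case for 8-bit AVGCEIL: all-ones operands plus the rounding carry.
  static_assert(0xFF + 0xFF + 1 == 0x1FF, "sum needs at most 9 bits");
  static_assert(0x1FF < (1 << 9), "so BitWidth + 1 == 9 bits is enough");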
uint64_t BitsToClear = Op->getConstantOperandVal(1)
                       << Op->getConstantOperandVal(2);
APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
if (AlreadyZeroedBitsToClear == BitsToClear)
Use APInt::isSubsetOf ?
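For reference, APInt::isSubsetOf(RHS) is true when every set bit is also set in RHS, so (assuming BitsToClear stays an APInt of element width) the check could read:

  // Every bit BIC would clear is already known zero in the input.
  if (BitsToClear.isSubsetOf(KnownOp0.Zero))
    return TLO.CombineTo(Op, Op0);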
@@ -27324,6 +27337,21 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
   // used - simplify to just Val.
   return TLO.CombineTo(Op, ShiftR->getOperand(0));
 }
 case AArch64ISD::BICi: {
// Fold BICi if all destination bits already known to be zeroed |
It's always helpful to include in the comment what each of the immediate operands does, or to name them:
uint64_t foo = Op->getConstantOperandVal(1);
uint64_t bar = Op->getConstantOperandVal(2);
uint64_t BitsToClear = foo << bar;
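For instance (names hypothetical):

  uint64_t BICImm = Op->getConstantOperandVal(1);   // 8-bit clear mask
  uint64_t ShiftAmt = Op->getConstantOperandVal(2); // per-element left shift
  uint64_t BitsToClear = BICImm << ShiftAmt;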
Add neon bici test for haddu and shadd, prerequisite for #76644
Any luck with creating a test in AArch64SelectionDAGTest.cpp for the BICi known bits?
@@ -796,4 +797,52 @@ TEST_F(AArch64SelectionDAGTest, computeKnownBits_extload_knownnegative) {
   EXPECT_EQ(Known.One, APInt(32, 0xfffffff0));
 }

TEST_F(AArch64SelectionDAGTest,
       computeKnownBits_AVGFLOORU_AVGFLOORS_AVGCEILU_AVGCEILS) {
I meant add BICi test coverage here, no need for the AVG nodes - the idea being that this PR can be purely about BICi demanded/knownbits and then you can create a followup PR that deals with the AVG known bits support
Or these tests could be used to create a PR just for the AVG nodes part - to get that part in and reviewed, and the BIC can be rebase on top.
Either is fine for me - I just want to see the generic ISD::AVG knownbits patch separate from the AArch64 specific BICi patch.
I'll do a separate patch then
@RKSimon @davemgreen #86754 PTAL
BTW, this particular PR is now temporarily broken, because it relies on the missing ISD::AVG* computeKnownBits.
        .isSubsetOf(AlreadyZeroedBitsToClear))
  return TLO.CombineTo(Op, Op0);

Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
Could this be Known = KnownOp0 & KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
? There is already some code for this in AArch64TargetLowering::computeKnownBitsForTargetNode, I would guess these should match in terms of what they produce, but sometimes I misunderstand exactly what SimplifyDemandedBitsForTargetNode should be doing. I think this could be quite similar to the ISD::AND case inside SimplifyDemandedBits, just with an always-constant second argument.
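A sketch of what such matching known-bits logic could look like, mirroring the AND-with-constant reasoning described above (assumed shape, not a verbatim quote of the existing computeKnownBitsForTargetNode code):

  case AArch64ISD::BICi: {
    // The result's known bits are those of the first operand, with the
    // (shifted) immediate bits additionally forced to zero.
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    uint64_t NotMask =
        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), NotMask));
    break;
  }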
@@ -796,4 +797,52 @@ TEST_F(AArch64SelectionDAGTest, computeKnownBits_extload_knownnegative) {
   EXPECT_EQ(Known.One, APInt(32, 0xfffffff0));
 }

TEST_F(AArch64SelectionDAGTest,
Thanks for adding these, they are useful for checking the produced results more precisely. Would it be possible to add something for the BICi changes too?
Are you sure? I don't see the AArch64ISD namespace being used anywhere in AArch64SelectionDAGTest.cpp. I feel like I'm missing something.
@snikitav reverse-ping
@RKSimon Are you sure? I don't see the AArch64ISD namespace being used anywhere in AArch64SelectionDAGTest.cpp. I feel like I'm missing something. Could you give me a bit of guidance here?
I don't think that will be necessary anymore, unless @davemgreen disagrees?
LGTM
AArch64SelectionDAGTest.cpp
I see what you mean, with them being specified in AArch64ISelLowering.h. I think so long as this is tested and looks sensible it should be OK (the C++ tests just make it easier to write more comprehensive tests).
If you can remove the unused variable then this LGTM too.
@@ -24555,6 +24555,19 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
     return R;
   return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
 case AArch64ISD::BICi: {
   KnownBits Known;
Known isn't used?
case AArch64ISD::BICi: {
  KnownBits Known;
  APInt DemandedBits =
      APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
I would expect this not to be all bits, but to only demand the bits that are not cleared by the BIC. Again this might be difficult to test though, so it might be best to leave it as-is for the moment.
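A sketch of that narrowing (untested, names assumed):

  // Only the bits BIC does not clear can influence users of the result.
  uint64_t BitsToClear = N->getConstantOperandVal(1)
                         << N->getConstantOperandVal(2);
  APInt DemandedBits =
      ~APInt(N->getValueType(0).getScalarSizeInBits(), BitsToClear);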
Thanks. LGTM
Fold BICi if all destination bits are already known to be zeroes
Fixes #53881
Fixes #53622