diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 288fd3639e5eb7..759fb352823f83 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -2102,7 +2103,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default); setOperationAction(ISD::BITREVERSE, VT, Default); setOperationAction(ISD::BSWAP, VT, Default); - setOperationAction(ISD::BUILD_VECTOR, VT, Default); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Default); setOperationAction(ISD::CTLZ, VT, Default); setOperationAction(ISD::CTPOP, VT, Default); @@ -14384,24 +14385,72 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, return SDValue(); } -SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, - SelectionDAG &DAG) const { +SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE( + SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + SDLoc DL(Op); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + auto *BVN = cast(Op); - if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) { - if (auto SeqInfo = cast(Op)->isConstantSequence()) { - SDLoc DL(Op); - EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); - SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT); - SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second); - SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps); - return convertFromScalableVector(DAG, Op.getValueType(), Seq); - } + if (auto SeqInfo = BVN->isConstantSequence()) { + SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT); + SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second); + SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps); + return convertFromScalableVector(DAG, VT, Seq); + } + + unsigned NumElems = VT.getVectorNumElements(); + if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 || + NumElems <= 1 || BVN->isConstant()) + return SDValue(); + + auto IsExtractElt = [](SDValue Op) { + return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT; + }; - // Revert to common legalisation for all other variants. + // For integer types that are not already in vectors limit to at most four + // elements. This is an arbitrary restriction to avoid many fmovs from GPRs. + if (VT.getScalarType().isInteger() && + NumElems - count_if(Op->op_values(), IsExtractElt) > 4) return SDValue(); + + // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s. + SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64); + SmallVector Intermediates = map_to_vector<16>( + Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) { + return Op.isUndef() ? Undef + : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, + ContainerVT, Undef, Op, ZeroI64); + }); + + ElementCount ZipEC = ContainerVT.getVectorElementCount(); + while (Intermediates.size() > 1) { + EVT ZipVT = getPackedSVEVectorVT(ZipEC); + + for (unsigned I = 0; I < Intermediates.size(); I += 2) { + SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]); + SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]); + Intermediates[I / 2] = + Op1.isUndef() ? Op0 + : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1); + } + + Intermediates.resize(Intermediates.size() / 2); + ZipEC = ZipEC.divideCoefficientBy(2); } + assert(Intermediates.size() == 1); + SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]); + return convertFromScalableVector(DAG, VT, Vec); +} + +SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) + return LowerFixedLengthBuildVectorToSVE(Op, DAG); + // Try to build a simple constant vector. Op = NormalizeBuildVector(Op, DAG); // Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 1bae7562f459a5..95489f85631801 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1256,6 +1256,7 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthBuildVectorToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const override; diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll index 276f23703df3df..20659cde83ee00 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll @@ -140,98 +140,65 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) { ; ; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops: ; SVE2_128_NOMAX: // %bb.0: -; SVE2_128_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_128_NOMAX-NEXT: ldr d0, [x1] -; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[7] -; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[6] -; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[4] -; SVE2_128_NOMAX-NEXT: fmov w8, s1 ; SVE2_128_NOMAX-NEXT: ldr d1, [x0] -; SVE2_128_NOMAX-NEXT: fmov w9, s2 ; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3] -; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1] -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #15] -; SVE2_128_NOMAX-NEXT: fmov w8, s3 ; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2] -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #14] -; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_128_NOMAX-NEXT: fmov w9, s2 -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #13] -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #12] -; SVE2_128_NOMAX-NEXT: fmov w8, s3 -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #11] -; SVE2_128_NOMAX-NEXT: fmov w9, s0 -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #10] -; SVE2_128_NOMAX-NEXT: fmov w8, s1 -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #9] -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #8] -; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_128_NOMAX-NEXT: add sp, sp, #16 +; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[7] +; SVE2_128_NOMAX-NEXT: mov z6.b, z0.b[6] +; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[4] +; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_128_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b +; SVE2_128_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b +; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_128_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h +; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_128_NOMAX-NEXT: ret ; ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops: ; SVE2_NOMIN_NOMAX: // %bb.0: -; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1] -; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[7] -; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[6] -; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[4] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 ; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 ; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3] -; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1] -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #15] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3 ; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2] -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #14] -; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #13] -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #12] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3 -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #11] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s0 -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #10] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #9] -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #8] -; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16 +; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[7] +; SVE2_NOMIN_NOMAX-NEXT: mov z6.b, z0.b[6] +; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[4] +; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_NOMIN_NOMAX-NEXT: ret ; ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops: ; SVE2_MIN_256_NOMAX: // %bb.0: -; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1] -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[7] -; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[6] -; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[4] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 ; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 ; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3] -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1] -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #15] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3 ; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2] -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #14] -; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #13] -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #12] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3 -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #11] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s0 -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #10] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #9] -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #8] -; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16 +; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[7] +; SVE2_MIN_256_NOMAX-NEXT: mov z6.b, z0.b[6] +; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[4] +; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_MIN_256_NOMAX-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b @@ -263,89 +230,59 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) { ; ; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value: ; SVE2_128_NOMAX: // %bb.0: -; SVE2_128_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_128_NOMAX-NEXT: ldr d0, [x1] -; SVE2_128_NOMAX-NEXT: ldr d3, [x0] -; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[6] -; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[4] -; SVE2_128_NOMAX-NEXT: fmov w8, s1 -; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[3] -; SVE2_128_NOMAX-NEXT: fmov w9, s2 -; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[2] -; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #14] -; SVE2_128_NOMAX-NEXT: fmov w8, s1 -; SVE2_128_NOMAX-NEXT: mov z1.b, z3.b[1] -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #13] -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #12] -; SVE2_128_NOMAX-NEXT: fmov w9, s2 -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #11] -; SVE2_128_NOMAX-NEXT: fmov w8, s0 -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #10] -; SVE2_128_NOMAX-NEXT: fmov w9, s1 -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #9] -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #8] -; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_128_NOMAX-NEXT: add sp, sp, #16 +; SVE2_128_NOMAX-NEXT: ldr d1, [x0] +; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3] +; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2] +; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[4] +; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[6] +; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_128_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b +; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_128_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h +; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_128_NOMAX-NEXT: ret ; ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value: ; SVE2_NOMIN_NOMAX: // %bb.0: -; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1] -; SVE2_NOMIN_NOMAX-NEXT: ldr d3, [x0] -; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[6] -; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[4] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 -; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[3] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 -; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[2] -; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #14] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 -; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z3.b[1] -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #13] -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #12] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #11] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s0 -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #10] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s1 -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #9] -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #8] -; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16 +; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0] +; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3] +; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2] +; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[4] +; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[6] +; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_NOMIN_NOMAX-NEXT: ret ; ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value: ; SVE2_MIN_256_NOMAX: // %bb.0: -; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1] -; SVE2_MIN_256_NOMAX-NEXT: ldr d3, [x0] -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[6] -; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[4] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[3] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 -; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[2] -; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #14] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z3.b[1] -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #13] -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #12] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #11] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s0 -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #10] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s1 -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #9] -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #8] -; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16 +; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0] +; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3] +; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2] +; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[4] +; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[6] +; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_MIN_256_NOMAX-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b @@ -401,34 +338,23 @@ define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) { define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) { ; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: mov z1.b, z0.b[7] -; CHECK-NEXT: mov z2.b, z0.b[6] -; CHECK-NEXT: mov z3.b, z0.b[4] -; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: fmov w9, s2 ; CHECK-NEXT: mov z2.b, z0.b[3] -; CHECK-NEXT: mov z1.b, z1.b[1] -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: mov z3.b, z0.b[2] -; CHECK-NEXT: strb w9, [sp, #14] -; CHECK-NEXT: mov z0.b, z0.b[1] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: strb w8, [sp, #12] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strb w9, [sp, #11] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w9, [sp, #9] -; CHECK-NEXT: strb w8, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z4.b, z0.b[1] +; CHECK-NEXT: mov z1.b, z1.b[1] +; CHECK-NEXT: mov z5.b, z0.b[7] +; CHECK-NEXT: mov z6.b, z0.b[6] +; CHECK-NEXT: mov z0.b, z0.b[4] +; CHECK-NEXT: zip1 z2.b, z3.b, z2.b +; CHECK-NEXT: zip1 z1.b, z1.b, z4.b +; CHECK-NEXT: zip1 z3.b, z6.b, z5.b +; CHECK-NEXT: zip1 z0.b, z0.b, z0.b +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll index 617b560713c3ab..478072d33d8c9b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll @@ -184,13 +184,11 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind { define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind { ; CHECK-LABEL: vls_sve_and_2xi16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: fmov s1, wzr ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: stp wzr, w8, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: vls_sve_and_2xi16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll index b9264ad5f77c37..6644be11a02ba7 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll @@ -91,19 +91,12 @@ define void @bitcast_v32i8(ptr %a, ptr %b) { define void @bitcast_v2i16(ptr %a, ptr %b) { ; CHECK-LABEL: bitcast_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: str w8, [x1] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: bitcast_v2i16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll index b8a2e0e0f4bd4c..9729a1d95cd916 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll @@ -222,3 +222,255 @@ define void @build_vector_no_stride_v4f64(ptr %a) { store <4 x double> , ptr %a, align 8 ret void } + +define void @build_vector_non_const_v4i1(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: orr w8, w0, w1, lsl #1 +; CHECK-NEXT: orr w8, w8, w2, lsl #2 +; CHECK-NEXT: orr w8, w8, w3, lsl #3 +; CHECK-NEXT: strb w8, [x4] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v4i1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr w8, w0, w1, lsl #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w2, lsl #2 +; NONEON-NOSVE-NEXT: orr w8, w8, w3, lsl #3 +; NONEON-NOSVE-NEXT: strb w8, [x4] +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <4 x i1> undef, i1 %a, i64 0 + %2 = insertelement <4 x i1> %1, i1 %b, i64 1 + %3 = insertelement <4 x i1> %2, i1 %c, i64 2 + %4 = insertelement <4 x i1> %3, i1 %d, i64 3 + store <4 x i1> %4, ptr %out + ret void +} + +define void @build_vector_non_const_v2f64(double %a, double %b, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <2 x double> undef, double %a, i64 0 + %2 = insertelement <2 x double> %1, double %b, i64 1 + store <2 x double> %2, ptr %out + ret void +} + +define void @build_vector_non_const_v2f32(float %a, float %b, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1 +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <2 x float> undef, float %a, i64 0 + %2 = insertelement <2 x float> %1, float %b, i64 1 + store <2 x float> %2, ptr %out + ret void +} + +define void @build_vector_non_const_v4f32(float %a, float %b, float %c, float %d, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: // kill: def $s3 killed $s3 def $z3 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1 +; CHECK-NEXT: zip1 z2.s, z2.s, z3.s +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: zip1 z0.d, z0.d, z2.d +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s2, s3, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <4 x float> undef, float %a, i64 0 + %2 = insertelement <4 x float> %1, float %b, i64 1 + %3 = insertelement <4 x float> %2, float %c, i64 2 + %4 = insertelement <4 x float> %3, float %d, i64 3 + store <4 x float> %4, ptr %out + ret void +} + +define void @build_vector_non_const_v4f64(double %a, double %b, double %c, double %d, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d3 killed $d3 def $z3 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: zip1 z2.d, z2.d, z3.d +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d2, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <4 x double> undef, double %a, i64 0 + %2 = insertelement <4 x double> %1, double %b, i64 1 + %3 = insertelement <4 x double> %2, double %c, i64 2 + %4 = insertelement <4 x double> %3, double %d, i64 3 + store <4 x double> %4, ptr %out + ret void +} + +define void @build_vector_non_const_v8f16(half %a, half %b, half %c, half %d, half %e, half %f, half %g, half %h, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h6 killed $h6 def $z6 +; CHECK-NEXT: // kill: def $h4 killed $h4 def $z4 +; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: // kill: def $h7 killed $h7 def $z7 +; CHECK-NEXT: // kill: def $h5 killed $h5 def $z5 +; CHECK-NEXT: // kill: def $h3 killed $h3 def $z3 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $z1 +; CHECK-NEXT: zip1 z6.h, z6.h, z7.h +; CHECK-NEXT: zip1 z4.h, z4.h, z5.h +; CHECK-NEXT: zip1 z2.h, z2.h, z3.h +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: zip1 z1.s, z4.s, z6.s +; CHECK-NEXT: zip1 z0.s, z0.s, z2.s +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str h7, [sp, #14] +; NONEON-NOSVE-NEXT: str h6, [sp, #12] +; NONEON-NOSVE-NEXT: str h5, [sp, #10] +; NONEON-NOSVE-NEXT: str h4, [sp, #8] +; NONEON-NOSVE-NEXT: str h3, [sp, #6] +; NONEON-NOSVE-NEXT: str h2, [sp, #4] +; NONEON-NOSVE-NEXT: str h1, [sp, #2] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <8 x half> undef, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %c, i64 2 + %4 = insertelement <8 x half> %3, half %d, i64 3 + %5 = insertelement <8 x half> %4, half %e, i64 4 + %6 = insertelement <8 x half> %5, half %f, i64 5 + %7 = insertelement <8 x half> %6, half %g, i64 6 + %8 = insertelement <8 x half> %7, half %h, i64 7 + store <8 x half> %8, ptr %out + ret void +} + +define void @build_vector_non_const_v2i32(i32 %a, i32 %b, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: str d0, [x2] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <2 x i32> undef, i32 %a, i64 0 + %2 = insertelement <2 x i32> %1, i32 %b, i64 1 + store <2 x i32> %2, ptr %out + ret void +} + +define void @build_vector_non_const_v8i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: strb w7, [sp, #15] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: strb w6, [sp, #14] +; CHECK-NEXT: strb w5, [sp, #13] +; CHECK-NEXT: strb w4, [sp, #12] +; CHECK-NEXT: strb w3, [sp, #11] +; CHECK-NEXT: strb w2, [sp, #10] +; CHECK-NEXT: strb w1, [sp, #9] +; CHECK-NEXT: strb w0, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strb w7, [sp, #15] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w6, [sp, #14] +; NONEON-NOSVE-NEXT: strb w5, [sp, #13] +; NONEON-NOSVE-NEXT: strb w4, [sp, #12] +; NONEON-NOSVE-NEXT: strb w3, [sp, #11] +; NONEON-NOSVE-NEXT: strb w2, [sp, #10] +; NONEON-NOSVE-NEXT: strb w1, [sp, #9] +; NONEON-NOSVE-NEXT: strb w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [x8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <8 x i8> undef, i8 %a, i64 0 + %2 = insertelement <8 x i8> %1, i8 %b, i64 1 + %3 = insertelement <8 x i8> %2, i8 %c, i64 2 + %4 = insertelement <8 x i8> %3, i8 %d, i64 3 + %5 = insertelement <8 x i8> %4, i8 %e, i64 4 + %6 = insertelement <8 x i8> %5, i8 %f, i64 5 + %7 = insertelement <8 x i8> %6, i8 %g, i64 6 + %8 = insertelement <8 x i8> %7, i8 %h, i64 7 + store <8 x i8> %8, ptr %out + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll index 4b6285b2732fe5..c1810c678ea522 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll @@ -12,34 +12,22 @@ target triple = "aarch64-unknown-linux-gnu" define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: concat_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: mov z2.h, z1.h[3] -; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z2.h, z1.h[3] ; CHECK-NEXT: mov z3.h, z1.h[2] -; CHECK-NEXT: mov z1.h, z1.h[1] -; CHECK-NEXT: mov z4.h, z0.h[3] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[2] -; CHECK-NEXT: mov z0.h, z0.h[1] -; CHECK-NEXT: strb w9, [sp, #8] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w9, [sp, #14] -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strb w8, [sp, #11] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [sp, #9] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z4.h, z1.h[1] +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z6.h, z0.h[2] +; CHECK-NEXT: mov z7.h, z0.h[1] +; CHECK-NEXT: zip1 z2.b, z3.b, z2.b +; CHECK-NEXT: zip1 z1.b, z1.b, z4.b +; CHECK-NEXT: zip1 z3.b, z6.b, z5.b +; CHECK-NEXT: zip1 z0.b, z0.b, z7.b +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: concat_v8i8: @@ -152,22 +140,14 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) { define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: concat_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z2.s, z1.s[1] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: strh w9, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z3.s, z0.s[1] +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: concat_v4i16: @@ -428,18 +408,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) { define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-LABEL: concat_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: str h1, [sp, #12] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: str h2, [sp, #14] -; CHECK-NEXT: str h1, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: concat_v4f16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 50a05cb4b1e277..7d6336a43a4fd1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -326,29 +326,29 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) { ; CHECK-LABEL: load_sext_v2i64i256: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: mov z1.d, z0.d[1] -; CHECK-NEXT: asr x9, x8, #63 -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: stp x8, x9, [sp, #-32]! -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: asr x8, x10, #63 -; CHECK-NEXT: mov z0.d, x9 -; CHECK-NEXT: stp x10, x8, [sp, #16] -; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ldp q2, q4, [sp], #32 -; CHECK-NEXT: mov z3.d, z0.d[1] -; CHECK-NEXT: mov z5.d, z1.d[1] -; CHECK-NEXT: mov z6.d, z2.d[1] -; CHECK-NEXT: fmov x2, d0 -; CHECK-NEXT: mov z0.d, z4.d[1] -; CHECK-NEXT: fmov x6, d1 -; CHECK-NEXT: fmov x0, d2 -; CHECK-NEXT: fmov x4, d4 -; CHECK-NEXT: fmov x3, d3 -; CHECK-NEXT: fmov x7, d5 -; CHECK-NEXT: fmov x1, d6 -; CHECK-NEXT: fmov x5, d0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: asr x8, x8, #63 +; CHECK-NEXT: fmov d3, x8 +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: asr x9, x9, #63 +; CHECK-NEXT: fmov d4, x9 +; CHECK-NEXT: zip1 z0.d, z0.d, z3.d +; CHECK-NEXT: mov z3.d, x9 +; CHECK-NEXT: fmov x2, d2 +; CHECK-NEXT: zip1 z1.d, z1.d, z4.d +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: mov z5.d, z0.d[1] +; CHECK-NEXT: mov z6.d, z3.d[1] +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: fmov x6, d3 +; CHECK-NEXT: mov z2.d, z1.d[1] +; CHECK-NEXT: fmov x3, d4 +; CHECK-NEXT: fmov x1, d5 +; CHECK-NEXT: fmov x4, d1 +; CHECK-NEXT: fmov x7, d6 +; CHECK-NEXT: fmov x5, d2 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: load_sext_v2i64i256: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll index 2665696308463f..a728cbe97056db 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll @@ -10,23 +10,15 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) { ; CHECK-LABEL: extract_subvector_v8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.b, z0.b[7] ; CHECK-NEXT: mov z2.b, z0.b[6] ; CHECK-NEXT: mov z3.b, z0.b[5] ; CHECK-NEXT: mov z0.b, z0.b[4] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: extract_subvector_v8i1: @@ -53,23 +45,15 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) { define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) { ; CHECK-LABEL: extract_subvector_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.b, z0.b[7] ; CHECK-NEXT: mov z2.b, z0.b[6] ; CHECK-NEXT: mov z3.b, z0.b[5] ; CHECK-NEXT: mov z0.b, z0.b[4] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: extract_subvector_v8i8: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index dad53b31db0b0f..f1771a753826cc 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -1126,49 +1126,39 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) { define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE-LABEL: test_copysign_v4f16_v4f64: ; SVE: // %bb.0: -; SVE-NEXT: sub sp, sp, #16 -; SVE-NEXT: .cfi_def_cfa_offset 16 -; SVE-NEXT: ldp q1, q0, [x1] -; SVE-NEXT: ldr d4, [x0] -; SVE-NEXT: and z4.h, z4.h, #0x7fff -; SVE-NEXT: mov z2.d, z0.d[1] -; SVE-NEXT: mov z3.d, z1.d[1] -; SVE-NEXT: fcvt h0, d0 +; SVE-NEXT: ldp q0, q1, [x1] +; SVE-NEXT: mov z2.d, z1.d[1] +; SVE-NEXT: mov z3.d, z0.d[1] ; SVE-NEXT: fcvt h1, d1 +; SVE-NEXT: fcvt h0, d0 ; SVE-NEXT: fcvt h2, d2 ; SVE-NEXT: fcvt h3, d3 -; SVE-NEXT: str h0, [sp, #12] -; SVE-NEXT: str h1, [sp, #8] -; SVE-NEXT: str h2, [sp, #14] -; SVE-NEXT: str h3, [sp, #10] -; SVE-NEXT: ldr d0, [sp, #8] +; SVE-NEXT: zip1 z1.h, z1.h, z2.h +; SVE-NEXT: zip1 z0.h, z0.h, z3.h +; SVE-NEXT: zip1 z0.s, z0.s, z1.s +; SVE-NEXT: ldr d1, [x0] +; SVE-NEXT: and z1.h, z1.h, #0x7fff ; SVE-NEXT: and z0.h, z0.h, #0x8000 -; SVE-NEXT: orr z0.d, z4.d, z0.d +; SVE-NEXT: orr z0.d, z1.d, z0.d ; SVE-NEXT: str d0, [x0] -; SVE-NEXT: add sp, sp, #16 ; SVE-NEXT: ret ; ; SVE2-LABEL: test_copysign_v4f16_v4f64: ; SVE2: // %bb.0: -; SVE2-NEXT: sub sp, sp, #16 -; SVE2-NEXT: .cfi_def_cfa_offset 16 -; SVE2-NEXT: ldp q2, q1, [x1] -; SVE2-NEXT: mov z0.h, #32767 // =0x7fff -; SVE2-NEXT: ldr d5, [x0] -; SVE2-NEXT: mov z3.d, z1.d[1] -; SVE2-NEXT: mov z4.d, z2.d[1] +; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: mov z2.d, z1.d[1] +; SVE2-NEXT: mov z3.d, z0.d[1] ; SVE2-NEXT: fcvt h1, d1 +; SVE2-NEXT: fcvt h0, d0 ; SVE2-NEXT: fcvt h2, d2 ; SVE2-NEXT: fcvt h3, d3 -; SVE2-NEXT: fcvt h4, d4 -; SVE2-NEXT: str h1, [sp, #12] -; SVE2-NEXT: str h2, [sp, #8] -; SVE2-NEXT: str h3, [sp, #14] -; SVE2-NEXT: str h4, [sp, #10] -; SVE2-NEXT: ldr d1, [sp, #8] -; SVE2-NEXT: bsl z5.d, z5.d, z1.d, z0.d -; SVE2-NEXT: str d5, [x0] -; SVE2-NEXT: add sp, sp, #16 +; SVE2-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2-NEXT: zip1 z0.h, z0.h, z3.h +; SVE2-NEXT: mov z2.h, #32767 // =0x7fff +; SVE2-NEXT: zip1 z0.s, z0.s, z1.s +; SVE2-NEXT: ldr d1, [x0] +; SVE2-NEXT: bsl z1.d, z1.d, z0.d, z2.d +; SVE2-NEXT: str d1, [x0] ; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index a206fbc5102953..11fee267660c03 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -443,9 +443,10 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) { ; CHECK-NEXT: mov z1.h, z0.h[1] ; CHECK-NEXT: fcvtzu x8, h0 ; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: stp x8, x9, [sp, #-16]! -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64: @@ -471,19 +472,20 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v4f16_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: fcvtzu x8, h0 -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: fcvtzu x10, h2 -; CHECK-NEXT: fcvtzu x11, h0 -; CHECK-NEXT: stp x8, x9, [sp, #-32]! -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: mov z1.h, z0.h[3] +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: fcvtzu x10, h0 +; CHECK-NEXT: fcvtzu x8, h1 +; CHECK-NEXT: fcvtzu x9, h2 +; CHECK-NEXT: fcvtzu x11, h3 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: fmov d1, x11 +; CHECK-NEXT: zip1 z1.d, z2.d, z1.d ; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64: @@ -521,31 +523,35 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: fcvtzu x12, h0 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z3.h, z0.h[2] +; CHECK-NEXT: mov z4.h, z0.h[1] +; CHECK-NEXT: fcvtzu x10, h0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: fcvtzu x8, h1 -; CHECK-NEXT: mov z3.h, z1.h[3] -; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: fcvtzu x9, h2 -; CHECK-NEXT: mov z2.h, z0.h[1] -; CHECK-NEXT: fcvtzu x10, h3 -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzu x11, h1 -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: stp x8, x9, [sp, #-64]! -; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: fcvtzu x8, h2 ; CHECK-NEXT: fcvtzu x9, h3 -; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: fcvtzu x10, h0 -; CHECK-NEXT: ldp q2, q3, [sp] -; CHECK-NEXT: stp x12, x8, [sp, #32] -; CHECK-NEXT: stp x10, x9, [sp, #48] -; CHECK-NEXT: ldp q1, q0, [sp, #32] -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: fcvtzu x11, h4 +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z6.h, z1.h[2] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: fcvtzu x14, h1 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: fmov d3, x11 +; CHECK-NEXT: fcvtzu x12, h5 +; CHECK-NEXT: fcvtzu x13, h6 +; CHECK-NEXT: fcvtzu x15, h2 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: fmov d1, x12 +; CHECK-NEXT: fmov d4, x13 +; CHECK-NEXT: zip1 z2.d, z2.d, z3.d +; CHECK-NEXT: fmov d3, x14 +; CHECK-NEXT: zip1 z1.d, z4.d, z1.d +; CHECK-NEXT: fmov d4, x15 +; CHECK-NEXT: stp q2, q0, [x1] +; CHECK-NEXT: zip1 z3.d, z3.d, z4.d +; CHECK-NEXT: stp q3, q1, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64: @@ -598,57 +604,67 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) { define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: mov z5.d, z1.d +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z4.h, z1.h[1] +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: fcvtzu x9, h1 +; CHECK-NEXT: fcvtzu x8, h0 +; CHECK-NEXT: mov z7.h, z0.h[1] ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: mov z4.h, z2.h[1] -; CHECK-NEXT: fcvtzu x8, h2 -; CHECK-NEXT: mov z5.h, z2.h[3] -; CHECK-NEXT: mov z2.h, z2.h[2] -; CHECK-NEXT: fcvtzu x12, h3 -; CHECK-NEXT: fcvtzu x9, h4 -; CHECK-NEXT: mov z4.h, z3.h[1] -; CHECK-NEXT: fcvtzu x10, h5 -; CHECK-NEXT: mov z5.h, z3.h[3] -; CHECK-NEXT: fcvtzu x11, h2 -; CHECK-NEXT: mov z2.h, z3.h[2] -; CHECK-NEXT: stp x8, x9, [sp, #-128]! -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: fcvtzu x8, h4 -; CHECK-NEXT: fcvtzu x9, h5 -; CHECK-NEXT: stp x11, x10, [sp, #16] +; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8 ; CHECK-NEXT: fcvtzu x10, h2 -; CHECK-NEXT: mov z3.h, z1.h[1] -; CHECK-NEXT: mov z4.h, z1.h[3] -; CHECK-NEXT: fcvtzu x11, h1 +; CHECK-NEXT: fcvtzu x11, h4 +; CHECK-NEXT: fcvtzu x12, h6 ; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: mov z2.h, z0.h[1] -; CHECK-NEXT: stp x12, x8, [sp, #64] -; CHECK-NEXT: fcvtzu x12, h3 -; CHECK-NEXT: fcvtzu x8, h4 -; CHECK-NEXT: stp x10, x9, [sp, #80] -; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzu x10, h0 ; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: stp x11, x12, [sp, #32] -; CHECK-NEXT: fcvtzu x11, h2 -; CHECK-NEXT: fcvtzu x12, h3 -; CHECK-NEXT: stp x9, x8, [sp, #48] -; CHECK-NEXT: fcvtzu x8, h0 -; CHECK-NEXT: ldp q0, q1, [sp] -; CHECK-NEXT: ldp q3, q4, [sp, #64] -; CHECK-NEXT: stp x10, x11, [sp, #96] -; CHECK-NEXT: ldp q6, q7, [sp, #32] -; CHECK-NEXT: stp x8, x12, [sp, #112] -; CHECK-NEXT: ldp q5, q2, [sp, #96] -; CHECK-NEXT: stp q0, q1, [x1, #32] -; CHECK-NEXT: stp q6, q7, [x1] -; CHECK-NEXT: stp q3, q4, [x1, #96] -; CHECK-NEXT: stp q5, q2, [x1, #64] -; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: fmov d16, x9 +; CHECK-NEXT: mov z2.h, z3.h[3] +; CHECK-NEXT: mov z4.h, z5.h[3] +; CHECK-NEXT: fcvtzu x14, h3 +; CHECK-NEXT: fcvtzu x13, h1 +; CHECK-NEXT: fcvtzu x15, h5 +; CHECK-NEXT: mov z1.h, z3.h[1] +; CHECK-NEXT: mov z6.h, z5.h[1] +; CHECK-NEXT: mov z5.h, z5.h[2] +; CHECK-NEXT: mov z3.h, z3.h[2] +; CHECK-NEXT: fcvtzu x9, h2 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: fcvtzu x10, h4 +; CHECK-NEXT: fmov d4, x11 +; CHECK-NEXT: fcvtzu x11, h7 +; CHECK-NEXT: fmov d7, x12 +; CHECK-NEXT: fcvtzu x12, h0 +; CHECK-NEXT: fmov d0, x13 +; CHECK-NEXT: fcvtzu x13, h1 +; CHECK-NEXT: fmov d1, x14 +; CHECK-NEXT: fcvtzu x14, h6 +; CHECK-NEXT: fmov d6, x15 +; CHECK-NEXT: fcvtzu x15, h5 +; CHECK-NEXT: fmov d5, x9 +; CHECK-NEXT: fcvtzu x9, h3 +; CHECK-NEXT: zip1 z4.d, z16.d, z4.d +; CHECK-NEXT: fmov d16, x8 +; CHECK-NEXT: zip1 z0.d, z0.d, z7.d +; CHECK-NEXT: fmov d3, x12 +; CHECK-NEXT: fmov d7, x10 +; CHECK-NEXT: stp q4, q0, [x1, #64] +; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: fmov d4, x9 +; CHECK-NEXT: zip1 z2.d, z3.d, z2.d +; CHECK-NEXT: fmov d3, x11 +; CHECK-NEXT: zip1 z0.d, z6.d, z0.d +; CHECK-NEXT: zip1 z4.d, z4.d, z5.d +; CHECK-NEXT: zip1 z3.d, z16.d, z3.d +; CHECK-NEXT: fmov d16, x15 +; CHECK-NEXT: stp q3, q2, [x1] +; CHECK-NEXT: fmov d2, x13 +; CHECK-NEXT: zip1 z7.d, z16.d, z7.d +; CHECK-NEXT: zip1 z1.d, z1.d, z2.d +; CHECK-NEXT: stp q0, q7, [x1, #96] +; CHECK-NEXT: stp q1, q4, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64: @@ -1216,26 +1232,18 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) { define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { ; CHECK-LABEL: fcvtzu_v4f64_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: mov z2.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z1.s[1] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: mov z3.s, z0.s[1] +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16: @@ -1270,40 +1278,29 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { ; CHECK-LABEL: fcvtzu_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.s, z3.s[1] -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z3.s[1] +; CHECK-NEXT: mov z7.s, z2.s[1] +; CHECK-NEXT: zip1 z0.h, z0.h, z4.h +; CHECK-NEXT: zip1 z1.h, z1.h, z5.h +; CHECK-NEXT: zip1 z3.h, z3.h, z6.h +; CHECK-NEXT: zip1 z2.h, z2.h, z7.h +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: zip1 z1.s, z2.s, z3.s +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16: @@ -1360,73 +1357,50 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q5, q6, [x0, #96] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: ldp q0, q4, [x0, #32] +; CHECK-NEXT: ldp q2, q7, [x0, #64] +; CHECK-NEXT: ldp q1, q3, [x0] +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d +; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.d ; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d -; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d -; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z16.s, z1.s[1] -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z3.s[1] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: movprfx z3, z7 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.d -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: uzp1 z1.s, z4.s, z4.s -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s -; CHECK-NEXT: mov z3.s, z5.s[1] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #28] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #24] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: ldp q1, q0, [sp] -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: mov z17.s, z6.s[1] +; CHECK-NEXT: mov z16.s, z4.s[1] +; CHECK-NEXT: mov z18.s, z5.s[1] +; CHECK-NEXT: mov z21.s, z0.s[1] +; CHECK-NEXT: mov z19.s, z7.s[1] +; CHECK-NEXT: mov z20.s, z2.s[1] +; CHECK-NEXT: mov z22.s, z3.s[1] +; CHECK-NEXT: mov z23.s, z1.s[1] +; CHECK-NEXT: zip1 z6.h, z6.h, z17.h +; CHECK-NEXT: zip1 z4.h, z4.h, z16.h +; CHECK-NEXT: zip1 z5.h, z5.h, z18.h +; CHECK-NEXT: zip1 z0.h, z0.h, z21.h +; CHECK-NEXT: zip1 z7.h, z7.h, z19.h +; CHECK-NEXT: zip1 z2.h, z2.h, z20.h +; CHECK-NEXT: zip1 z3.h, z3.h, z22.h +; CHECK-NEXT: zip1 z1.h, z1.h, z23.h +; CHECK-NEXT: zip1 z5.s, z5.s, z6.s +; CHECK-NEXT: zip1 z0.s, z0.s, z4.s +; CHECK-NEXT: zip1 z2.s, z2.s, z7.s +; CHECK-NEXT: zip1 z1.s, z1.s, z3.s +; CHECK-NEXT: zip1 z2.d, z2.d, z5.d +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: stp q0, q2, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16: @@ -2187,9 +2161,10 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) { ; CHECK-NEXT: mov z1.h, z0.h[1] ; CHECK-NEXT: fcvtzs x8, h0 ; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: stp x8, x9, [sp, #-16]! -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64: @@ -2215,19 +2190,20 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v4f16_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: fcvtzs x8, h0 -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: fcvtzs x10, h2 -; CHECK-NEXT: fcvtzs x11, h0 -; CHECK-NEXT: stp x8, x9, [sp, #-32]! -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: mov z1.h, z0.h[3] +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: fcvtzs x10, h0 +; CHECK-NEXT: fcvtzs x8, h1 +; CHECK-NEXT: fcvtzs x9, h2 +; CHECK-NEXT: fcvtzs x11, h3 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: fmov d1, x11 +; CHECK-NEXT: zip1 z1.d, z2.d, z1.d ; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64: @@ -2265,31 +2241,35 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: fcvtzs x12, h0 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z3.h, z0.h[2] +; CHECK-NEXT: mov z4.h, z0.h[1] +; CHECK-NEXT: fcvtzs x10, h0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: fcvtzs x8, h1 -; CHECK-NEXT: mov z3.h, z1.h[3] -; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: fcvtzs x9, h2 -; CHECK-NEXT: mov z2.h, z0.h[1] -; CHECK-NEXT: fcvtzs x10, h3 -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzs x11, h1 -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: stp x8, x9, [sp, #-64]! -; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: fcvtzs x8, h2 ; CHECK-NEXT: fcvtzs x9, h3 -; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: fcvtzs x10, h0 -; CHECK-NEXT: ldp q2, q3, [sp] -; CHECK-NEXT: stp x12, x8, [sp, #32] -; CHECK-NEXT: stp x10, x9, [sp, #48] -; CHECK-NEXT: ldp q1, q0, [sp, #32] -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: fcvtzs x11, h4 +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z6.h, z1.h[2] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: fcvtzs x14, h1 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: fmov d3, x11 +; CHECK-NEXT: fcvtzs x12, h5 +; CHECK-NEXT: fcvtzs x13, h6 +; CHECK-NEXT: fcvtzs x15, h2 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: fmov d1, x12 +; CHECK-NEXT: fmov d4, x13 +; CHECK-NEXT: zip1 z2.d, z2.d, z3.d +; CHECK-NEXT: fmov d3, x14 +; CHECK-NEXT: zip1 z1.d, z4.d, z1.d +; CHECK-NEXT: fmov d4, x15 +; CHECK-NEXT: stp q2, q0, [x1] +; CHECK-NEXT: zip1 z3.d, z3.d, z4.d +; CHECK-NEXT: stp q3, q1, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64: @@ -2342,57 +2322,67 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) { define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: mov z5.d, z1.d +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z4.h, z1.h[1] +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: fcvtzs x9, h1 +; CHECK-NEXT: fcvtzs x8, h0 +; CHECK-NEXT: mov z7.h, z0.h[1] ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: mov z4.h, z2.h[1] -; CHECK-NEXT: fcvtzs x8, h2 -; CHECK-NEXT: mov z5.h, z2.h[3] -; CHECK-NEXT: mov z2.h, z2.h[2] -; CHECK-NEXT: fcvtzs x12, h3 -; CHECK-NEXT: fcvtzs x9, h4 -; CHECK-NEXT: mov z4.h, z3.h[1] -; CHECK-NEXT: fcvtzs x10, h5 -; CHECK-NEXT: mov z5.h, z3.h[3] -; CHECK-NEXT: fcvtzs x11, h2 -; CHECK-NEXT: mov z2.h, z3.h[2] -; CHECK-NEXT: stp x8, x9, [sp, #-128]! -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: fcvtzs x8, h4 -; CHECK-NEXT: fcvtzs x9, h5 -; CHECK-NEXT: stp x11, x10, [sp, #16] +; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8 ; CHECK-NEXT: fcvtzs x10, h2 -; CHECK-NEXT: mov z3.h, z1.h[1] -; CHECK-NEXT: mov z4.h, z1.h[3] -; CHECK-NEXT: fcvtzs x11, h1 +; CHECK-NEXT: fcvtzs x11, h4 +; CHECK-NEXT: fcvtzs x12, h6 ; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: mov z2.h, z0.h[1] -; CHECK-NEXT: stp x12, x8, [sp, #64] -; CHECK-NEXT: fcvtzs x12, h3 -; CHECK-NEXT: fcvtzs x8, h4 -; CHECK-NEXT: stp x10, x9, [sp, #80] -; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzs x10, h0 ; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: stp x11, x12, [sp, #32] -; CHECK-NEXT: fcvtzs x11, h2 -; CHECK-NEXT: fcvtzs x12, h3 -; CHECK-NEXT: stp x9, x8, [sp, #48] -; CHECK-NEXT: fcvtzs x8, h0 -; CHECK-NEXT: ldp q0, q1, [sp] -; CHECK-NEXT: ldp q3, q4, [sp, #64] -; CHECK-NEXT: stp x10, x11, [sp, #96] -; CHECK-NEXT: ldp q6, q7, [sp, #32] -; CHECK-NEXT: stp x8, x12, [sp, #112] -; CHECK-NEXT: ldp q5, q2, [sp, #96] -; CHECK-NEXT: stp q0, q1, [x1, #32] -; CHECK-NEXT: stp q6, q7, [x1] -; CHECK-NEXT: stp q3, q4, [x1, #96] -; CHECK-NEXT: stp q5, q2, [x1, #64] -; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: fmov d16, x9 +; CHECK-NEXT: mov z2.h, z3.h[3] +; CHECK-NEXT: mov z4.h, z5.h[3] +; CHECK-NEXT: fcvtzs x14, h3 +; CHECK-NEXT: fcvtzs x13, h1 +; CHECK-NEXT: fcvtzs x15, h5 +; CHECK-NEXT: mov z1.h, z3.h[1] +; CHECK-NEXT: mov z6.h, z5.h[1] +; CHECK-NEXT: mov z5.h, z5.h[2] +; CHECK-NEXT: mov z3.h, z3.h[2] +; CHECK-NEXT: fcvtzs x9, h2 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: fcvtzs x10, h4 +; CHECK-NEXT: fmov d4, x11 +; CHECK-NEXT: fcvtzs x11, h7 +; CHECK-NEXT: fmov d7, x12 +; CHECK-NEXT: fcvtzs x12, h0 +; CHECK-NEXT: fmov d0, x13 +; CHECK-NEXT: fcvtzs x13, h1 +; CHECK-NEXT: fmov d1, x14 +; CHECK-NEXT: fcvtzs x14, h6 +; CHECK-NEXT: fmov d6, x15 +; CHECK-NEXT: fcvtzs x15, h5 +; CHECK-NEXT: fmov d5, x9 +; CHECK-NEXT: fcvtzs x9, h3 +; CHECK-NEXT: zip1 z4.d, z16.d, z4.d +; CHECK-NEXT: fmov d16, x8 +; CHECK-NEXT: zip1 z0.d, z0.d, z7.d +; CHECK-NEXT: fmov d3, x12 +; CHECK-NEXT: fmov d7, x10 +; CHECK-NEXT: stp q4, q0, [x1, #64] +; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: fmov d4, x9 +; CHECK-NEXT: zip1 z2.d, z3.d, z2.d +; CHECK-NEXT: fmov d3, x11 +; CHECK-NEXT: zip1 z0.d, z6.d, z0.d +; CHECK-NEXT: zip1 z4.d, z4.d, z5.d +; CHECK-NEXT: zip1 z3.d, z16.d, z3.d +; CHECK-NEXT: fmov d16, x15 +; CHECK-NEXT: stp q3, q2, [x1] +; CHECK-NEXT: fmov d2, x13 +; CHECK-NEXT: zip1 z7.d, z16.d, z7.d +; CHECK-NEXT: zip1 z1.d, z1.d, z2.d +; CHECK-NEXT: stp q0, q7, [x1, #96] +; CHECK-NEXT: stp q1, q4, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64: @@ -2962,26 +2952,18 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) { define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { ; CHECK-LABEL: fcvtzs_v4f64_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: mov z2.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z1.s[1] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: mov z3.s, z0.s[1] +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16: @@ -3016,40 +2998,29 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { ; CHECK-LABEL: fcvtzs_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.s, z3.s[1] -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z3.s[1] +; CHECK-NEXT: mov z7.s, z2.s[1] +; CHECK-NEXT: zip1 z0.h, z0.h, z4.h +; CHECK-NEXT: zip1 z1.h, z1.h, z5.h +; CHECK-NEXT: zip1 z3.h, z3.h, z6.h +; CHECK-NEXT: zip1 z2.h, z2.h, z7.h +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: zip1 z1.s, z2.s, z3.s +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16: @@ -3106,73 +3077,50 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q5, q6, [x0, #96] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: ldp q0, q4, [x0, #32] +; CHECK-NEXT: ldp q2, q7, [x0, #64] +; CHECK-NEXT: ldp q1, q3, [x0] +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d +; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.d ; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d -; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d -; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z16.s, z1.s[1] -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z3.s[1] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: movprfx z3, z7 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.d -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: uzp1 z1.s, z4.s, z4.s -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s -; CHECK-NEXT: mov z3.s, z5.s[1] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #28] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #24] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: ldp q1, q0, [sp] -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: mov z17.s, z6.s[1] +; CHECK-NEXT: mov z16.s, z4.s[1] +; CHECK-NEXT: mov z18.s, z5.s[1] +; CHECK-NEXT: mov z21.s, z0.s[1] +; CHECK-NEXT: mov z19.s, z7.s[1] +; CHECK-NEXT: mov z20.s, z2.s[1] +; CHECK-NEXT: mov z22.s, z3.s[1] +; CHECK-NEXT: mov z23.s, z1.s[1] +; CHECK-NEXT: zip1 z6.h, z6.h, z17.h +; CHECK-NEXT: zip1 z4.h, z4.h, z16.h +; CHECK-NEXT: zip1 z5.h, z5.h, z18.h +; CHECK-NEXT: zip1 z0.h, z0.h, z21.h +; CHECK-NEXT: zip1 z7.h, z7.h, z19.h +; CHECK-NEXT: zip1 z2.h, z2.h, z20.h +; CHECK-NEXT: zip1 z3.h, z3.h, z22.h +; CHECK-NEXT: zip1 z1.h, z1.h, z23.h +; CHECK-NEXT: zip1 z5.s, z5.s, z6.s +; CHECK-NEXT: zip1 z0.s, z0.s, z4.s +; CHECK-NEXT: zip1 z2.s, z2.s, z7.s +; CHECK-NEXT: zip1 z1.s, z1.s, z3.s +; CHECK-NEXT: zip1 z2.d, z2.d, z5.d +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: stp q0, q2, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index 035c76b569298a..ad5f91a5f39a49 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -8,25 +8,18 @@ target triple = "aarch64-unknown-linux-gnu" define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask) { ; CHECK-LABEL: select_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: mov z3.s, z2.s[1] -; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d2, [sp, #8] +; CHECK-NEXT: zip1 z2.h, z2.h, z3.h ; CHECK-NEXT: lsl z2.h, z2.h, #15 ; CHECK-NEXT: asr z2.h, z2.h, #15 ; CHECK-NEXT: and z2.h, z2.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: select_v2f16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index d77473ed8f08e5..275d13ebfd9491 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -506,14 +506,10 @@ define <4 x i64> @insertelement_v4i64(ptr %a) { define <2 x half> @insertelement_v2f16(<2 x half> %op1) { ; CHECK-LABEL: insertelement_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: fmov h1, #5.00000000 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: str h1, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: insertelement_v2f16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index afd3bb7161c155..f9f70d30a757eb 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -1160,18 +1160,14 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) { ; CHECK-LABEL: ucvtf_v2i64_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d[1] ; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: ucvtf h0, x8 -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: ucvtf h1, x8 -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: str h1, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ucvtf h1, x9 +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16: @@ -2618,18 +2614,14 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) { define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) { ; CHECK-LABEL: scvtf_v2i64_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d[1] ; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: scvtf h0, x8 -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: scvtf h1, x8 -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: str h1, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: scvtf h1, x9 +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index 270f05a806b82d..613543310f2c31 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -10,25 +10,20 @@ declare void @def(ptr) define void @alloc_v4i8(ptr %st_ptr) nounwind { ; CHECK-LABEL: alloc_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: add x0, sp, #28 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: add x20, sp, #28 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x20, sp, #12 ; CHECK-NEXT: bl def ; CHECK-NEXT: ptrue p0.b, vl2 ; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x20] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: mov z2.b, z0.b[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w8, w9, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: zip1 z0.s, z0.s, z2.s ; CHECK-NEXT: st1b { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: alloc_v4i8: @@ -62,32 +57,28 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind { define void @alloc_v6i8(ptr %st_ptr) nounwind { ; CHECK-LABEL: alloc_v6i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x0, sp, #8 ; CHECK-NEXT: bl def -; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: add x8, sp, #4 ; CHECK-NEXT: ptrue p1.s, vl2 ; CHECK-NEXT: mov z1.b, z0.b[3] -; CHECK-NEXT: mov z2.b, z0.b[5] -; CHECK-NEXT: mov z0.b, z0.b[1] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: add x8, sp, #20 -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: st1b { z0.h }, p0, [x8] -; CHECK-NEXT: ld1h { z0.s }, p1/z, [x8] -; CHECK-NEXT: strb w9, [x19, #2] +; CHECK-NEXT: mov z2.b, z0.b[1] +; CHECK-NEXT: mov z0.b, z0.b[5] +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z1.s, z1.s, z0.s +; CHECK-NEXT: st1b { z1.h }, p0, [x8] +; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w8, [x19, #2] +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strh w8, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: alloc_v6i8: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 5f4b9dd1592cf2..9055b2efba3282 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -1466,23 +1466,18 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) { define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) { ; CHECK-LABEL: masked_load_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: mov z2.s, z0.s[1] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: zip1 z0.h, z0.h, z2.h +; CHECK-NEXT: zip1 z1.h, z1.h, z1.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_load_v2f16: @@ -2318,33 +2313,21 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) { ; CHECK-LABEL: masked_load_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov z1.b, z0.b[3] ; CHECK-NEXT: mov z2.b, z0.b[2] +; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: mov z3.b, z0.b[1] ; CHECK-NEXT: mov z4.b, z0.b[7] -; CHECK-NEXT: strh w8, [sp, #-16]! -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.b, z0.b[6] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[5] -; CHECK-NEXT: mov z0.b, z0.b[4] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #4] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w9, [sp, #14] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: mov x8, #4 // =0x4 -; CHECK-NEXT: ldp d0, d1, [sp] +; CHECK-NEXT: mov z5.b, z0.b[6] +; CHECK-NEXT: mov z6.b, z0.b[5] +; CHECK-NEXT: mov z7.b, z0.b[4] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z2.h, z5.h, z4.h +; CHECK-NEXT: zip1 z3.h, z7.h, z6.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: zip1 z1.s, z3.s, z2.s ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 @@ -2357,7 +2340,6 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) { ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_load_v8f32: @@ -2684,23 +2666,21 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) { define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-LABEL: masked_load_zext_v3i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: strh w3, [sp, #12] +; CHECK-NEXT: fmov s0, w2 +; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: adrp x8, .LCPI13_0 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: strh w2, [sp, #10] -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: strh w1, [sp, #8] -; CHECK-NEXT: ldr d1, [sp, #8] -; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: zip1 z0.h, z1.h, z0.h +; CHECK-NEXT: fmov s1, w3 +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_load_zext_v3i32: @@ -2759,23 +2739,21 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-LABEL: masked_load_sext_v3i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: strh w3, [sp, #12] +; CHECK-NEXT: fmov s0, w2 +; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: adrp x8, .LCPI14_0 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: strh w2, [sp, #10] -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: strh w1, [sp, #8] -; CHECK-NEXT: ldr d1, [sp, #8] -; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: zip1 z0.h, z1.h, z0.h +; CHECK-NEXT: fmov s1, w3 +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_load_sext_v3i32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index 0c3411e5f55148..265480b571970f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -589,23 +589,18 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; CHECK-LABEL: masked_store_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: mov z2.s, z0.s[1] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: zip1 z0.h, z0.h, z2.h +; CHECK-NEXT: zip1 z1.h, z1.h, z1.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_store_v2f16: @@ -1014,48 +1009,33 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-LABEL: masked_store_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.b, z0.b[7] ; CHECK-NEXT: mov z2.b, z0.b[6] +; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: mov z3.b, z0.b[5] ; CHECK-NEXT: mov z4.b, z0.b[4] +; CHECK-NEXT: mov z5.b, z0.b[3] +; CHECK-NEXT: mov z6.b, z0.b[2] +; CHECK-NEXT: mov z7.b, z0.b[1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[3] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z0.b[2] -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z4.b, z0.b[1] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: mov x8, #4 // =0x4 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z2.h, z4.h, z3.h +; CHECK-NEXT: zip1 z3.h, z6.h, z5.h +; CHECK-NEXT: zip1 z0.h, z0.h, z7.h +; CHECK-NEXT: zip1 z1.s, z2.s, z1.s +; CHECK-NEXT: zip1 z0.s, z0.s, z3.s ; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 ; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr d0, [sp] -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z1.s }, p0, [x0] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_store_v8f32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index b91f813c5141bb..8b296d9fbc215d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -9,65 +9,44 @@ target triple = "aarch64-unknown-linux-gnu" define void @zip1_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: zip1_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: mov z2.b, z0.b[15] -; CHECK-NEXT: mov z3.b, z0.b[14] -; CHECK-NEXT: mov z4.b, z0.b[13] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.b, z0.b[11] -; CHECK-NEXT: mov z2.b, z0.b[12] -; CHECK-NEXT: strb w8, [sp, #14] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z0.b[10] -; CHECK-NEXT: strb w9, [sp, #12] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[9] -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z0.b[8] -; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: mov z4.b, z0.b[14] +; CHECK-NEXT: mov z6.b, z0.b[13] +; CHECK-NEXT: mov z3.b, z1.b[15] +; CHECK-NEXT: mov z5.b, z1.b[14] +; CHECK-NEXT: mov z7.b, z1.b[13] +; CHECK-NEXT: mov z16.b, z0.b[12] +; CHECK-NEXT: mov z17.b, z1.b[12] +; CHECK-NEXT: mov z18.b, z0.b[11] +; CHECK-NEXT: mov z19.b, z1.b[11] +; CHECK-NEXT: mov z20.b, z0.b[10] +; CHECK-NEXT: mov z21.b, z1.b[10] +; CHECK-NEXT: mov z22.b, z0.b[9] +; CHECK-NEXT: mov z23.b, z1.b[9] +; CHECK-NEXT: mov z24.b, z0.b[8] +; CHECK-NEXT: mov z25.b, z1.b[8] +; CHECK-NEXT: zip1 z2.b, z2.b, z3.b +; CHECK-NEXT: zip1 z3.b, z4.b, z5.b +; CHECK-NEXT: zip1 z4.b, z6.b, z7.b +; CHECK-NEXT: zip1 z5.b, z16.b, z17.b +; CHECK-NEXT: zip1 z6.b, z18.b, z19.b +; CHECK-NEXT: zip1 z7.b, z20.b, z21.b +; CHECK-NEXT: zip1 z16.b, z22.b, z23.b ; CHECK-NEXT: zip1 z0.b, z0.b, z1.b -; CHECK-NEXT: strb w8, [sp, #6] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[15] -; CHECK-NEXT: strb w8, [sp, #4] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[14] -; CHECK-NEXT: strb w8, [sp, #2] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[13] -; CHECK-NEXT: strb w8, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[12] -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[11] -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[10] -; CHECK-NEXT: strb w8, [sp, #11] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[9] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: strb w8, [sp, #9] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[8] -; CHECK-NEXT: strb w9, [sp, #5] -; CHECK-NEXT: strb w8, [sp, #7] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strb w8, [sp, #3] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w8, [sp, #1] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z17.b, z24.b, z25.b +; CHECK-NEXT: zip1 z2.h, z3.h, z2.h +; CHECK-NEXT: zip1 z3.h, z5.h, z4.h +; CHECK-NEXT: zip1 z4.h, z7.h, z6.h ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z5.h, z17.h, z16.h +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip1_v32i8: @@ -159,123 +138,97 @@ define void @zip1_v32i8(ptr %a, ptr %b) { define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-LABEL: zip_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: ldp q0, q4, [x0] -; CHECK-NEXT: ldp q2, q5, [x0, #32] -; CHECK-NEXT: mov z16.h, z3.h[7] -; CHECK-NEXT: mov z18.h, z3.h[6] -; CHECK-NEXT: mov z17.h, z4.h[7] -; CHECK-NEXT: ldp q6, q7, [x1, #32] -; CHECK-NEXT: mov z19.h, z4.h[6] -; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: .cfi_offset b14, -56 +; CHECK-NEXT: .cfi_offset b15, -64 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: mov z5.h, z1.h[7] +; CHECK-NEXT: mov z7.h, z1.h[6] +; CHECK-NEXT: mov z17.h, z1.h[5] +; CHECK-NEXT: mov z4.h, z3.h[7] +; CHECK-NEXT: mov z6.h, z3.h[6] ; CHECK-NEXT: mov z16.h, z3.h[5] -; CHECK-NEXT: fmov w9, s17 -; CHECK-NEXT: mov z17.h, z4.h[5] -; CHECK-NEXT: mov z20.h, z7.h[6] -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z20.h, z2.h[7] +; CHECK-NEXT: mov z21.h, z0.h[7] ; CHECK-NEXT: mov z18.h, z3.h[4] -; CHECK-NEXT: strh w9, [sp, #28] -; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: mov z19.h, z5.h[7] -; CHECK-NEXT: zip1 z3.h, z4.h, z3.h -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z4.h[4] -; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: zip1 z4.h, z5.h, z7.h -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z1.h[7] -; CHECK-NEXT: add z3.h, z3.h, z4.h -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z0.h[7] -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z1.h[6] -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z0.h[6] -; CHECK-NEXT: strh w8, [sp, #62] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z1.h[5] -; CHECK-NEXT: strh w8, [sp, #60] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z0.h[5] -; CHECK-NEXT: strh w8, [sp, #58] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z1.h[4] -; CHECK-NEXT: strh w8, [sp, #56] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z0.h[4] -; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: zip1 z1.h, z2.h, z6.h -; CHECK-NEXT: strh w8, [sp, #54] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: ldr q16, [sp, #16] -; CHECK-NEXT: add z0.h, z0.h, z1.h -; CHECK-NEXT: strh w8, [sp, #52] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: strh w8, [sp, #50] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z7.h[7] -; CHECK-NEXT: strh w8, [sp, #48] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z5.h[6] -; CHECK-NEXT: ldr q17, [sp, #48] -; CHECK-NEXT: strh w8, [sp, #46] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z7.h[5] -; CHECK-NEXT: strh w8, [sp, #44] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z5.h[5] -; CHECK-NEXT: strh w8, [sp, #42] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z7.h[4] -; CHECK-NEXT: strh w8, [sp, #40] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z5.h[4] -; CHECK-NEXT: strh w8, [sp, #38] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z6.h[7] -; CHECK-NEXT: strh w8, [sp, #36] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z2.h[7] -; CHECK-NEXT: strh w8, [sp, #34] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z6.h[6] -; CHECK-NEXT: strh w8, [sp, #32] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z2.h[6] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z6.h[5] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z2.h[5] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z6.h[4] -; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z2.h[4] -; CHECK-NEXT: strh w9, [sp, #4] -; CHECK-NEXT: ldr q2, [sp, #32] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: fmov w9, s18 -; CHECK-NEXT: add z2.h, z16.h, z2.h -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: ldr q4, [sp] -; CHECK-NEXT: stp q3, q2, [x0, #32] -; CHECK-NEXT: add z1.h, z17.h, z4.h -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: mov z19.h, z1.h[4] +; CHECK-NEXT: mov z22.h, z2.h[6] +; CHECK-NEXT: mov z23.h, z0.h[6] +; CHECK-NEXT: zip1 z24.h, z5.h, z4.h +; CHECK-NEXT: zip1 z25.h, z7.h, z6.h +; CHECK-NEXT: zip1 z17.h, z17.h, z16.h +; CHECK-NEXT: ldp q4, q6, [x0, #32] +; CHECK-NEXT: zip1 z16.h, z21.h, z20.h +; CHECK-NEXT: ldp q5, q7, [x1, #32] +; CHECK-NEXT: zip1 z18.h, z19.h, z18.h +; CHECK-NEXT: zip1 z19.s, z25.s, z24.s +; CHECK-NEXT: zip1 z22.h, z23.h, z22.h +; CHECK-NEXT: mov z23.h, z2.h[5] +; CHECK-NEXT: mov z21.h, z6.h[7] +; CHECK-NEXT: mov z24.h, z0.h[5] +; CHECK-NEXT: mov z25.h, z2.h[4] +; CHECK-NEXT: mov z20.h, z7.h[7] +; CHECK-NEXT: mov z26.h, z0.h[4] +; CHECK-NEXT: mov z27.h, z6.h[6] +; CHECK-NEXT: mov z28.h, z7.h[5] +; CHECK-NEXT: mov z29.h, z6.h[5] +; CHECK-NEXT: mov z30.h, z7.h[4] +; CHECK-NEXT: mov z31.h, z6.h[4] +; CHECK-NEXT: mov z8.h, z5.h[7] +; CHECK-NEXT: mov z9.h, z4.h[7] +; CHECK-NEXT: zip1 z20.h, z21.h, z20.h +; CHECK-NEXT: mov z21.h, z7.h[6] +; CHECK-NEXT: mov z10.h, z5.h[6] +; CHECK-NEXT: mov z11.h, z4.h[6] +; CHECK-NEXT: mov z12.h, z5.h[5] +; CHECK-NEXT: mov z13.h, z4.h[5] +; CHECK-NEXT: mov z14.h, z5.h[4] +; CHECK-NEXT: mov z15.h, z4.h[4] +; CHECK-NEXT: zip1 z23.h, z24.h, z23.h +; CHECK-NEXT: zip1 z21.h, z27.h, z21.h +; CHECK-NEXT: zip1 z27.h, z29.h, z28.h +; CHECK-NEXT: zip1 z28.h, z31.h, z30.h +; CHECK-NEXT: zip1 z24.h, z26.h, z25.h +; CHECK-NEXT: zip1 z25.h, z9.h, z8.h +; CHECK-NEXT: zip1 z26.h, z11.h, z10.h +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z29.h, z13.h, z12.h +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z30.h, z15.h, z14.h +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z17.s, z18.s, z17.s +; CHECK-NEXT: zip1 z18.s, z21.s, z20.s +; CHECK-NEXT: zip1 z20.s, z28.s, z27.s +; CHECK-NEXT: zip1 z16.s, z22.s, z16.s +; CHECK-NEXT: zip1 z21.s, z24.s, z23.s +; CHECK-NEXT: zip1 z1.h, z1.h, z3.h +; CHECK-NEXT: zip1 z3.s, z26.s, z25.s +; CHECK-NEXT: zip1 z22.s, z30.s, z29.s +; CHECK-NEXT: zip1 z6.h, z6.h, z7.h +; CHECK-NEXT: zip1 z7.d, z17.d, z19.d +; CHECK-NEXT: zip1 z17.d, z20.d, z18.d +; CHECK-NEXT: zip1 z0.h, z0.h, z2.h +; CHECK-NEXT: zip1 z2.h, z4.h, z5.h +; CHECK-NEXT: zip1 z4.d, z21.d, z16.d +; CHECK-NEXT: zip1 z3.d, z22.d, z3.d +; CHECK-NEXT: add z1.h, z1.h, z6.h +; CHECK-NEXT: add z5.h, z7.h, z17.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: add z2.h, z4.h, z3.h +; CHECK-NEXT: stp q1, q5, [x0, #32] +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip_v32i16: @@ -436,41 +389,28 @@ define void @zip_v32i16(ptr %a, ptr %b) { define void @zip1_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: zip1_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: mov z2.h, z0.h[7] -; CHECK-NEXT: mov z3.h, z0.h[6] -; CHECK-NEXT: mov z4.h, z0.h[5] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[4] -; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z0.h[6] +; CHECK-NEXT: mov z6.h, z0.h[5] ; CHECK-NEXT: mov z3.h, z1.h[7] +; CHECK-NEXT: mov z5.h, z1.h[6] +; CHECK-NEXT: mov z7.h, z1.h[5] +; CHECK-NEXT: mov z16.h, z0.h[4] +; CHECK-NEXT: mov z17.h, z1.h[4] ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[6] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.h, z1.h[5] -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.h, z1.h[4] -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #10] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.h, z2.h, z3.h +; CHECK-NEXT: zip1 z3.h, z4.h, z5.h +; CHECK-NEXT: zip1 z4.h, z6.h, z7.h +; CHECK-NEXT: zip1 z5.h, z16.h, z17.h ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip1_v16i16: @@ -530,8 +470,6 @@ define void @zip1_v16i16(ptr %a, ptr %b) { define void @zip1_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: zip1_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1, #16] @@ -539,18 +477,13 @@ define void @zip1_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mov z2.s, z0.s[3] ; CHECK-NEXT: mov z4.s, z0.s[2] ; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: mov z5.s, z1.s[2] ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z1.s[2] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: stp w8, w9, [sp, #8] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w8, w9, [sp] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.s, z2.s, z3.s +; CHECK-NEXT: zip1 z3.s, z4.s, z5.s ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip1_v8i32: @@ -636,25 +569,18 @@ define void @zip_v4f64(ptr %a, ptr %b) { define void @zip_v4i32(ptr %a, ptr %b) { ; CHECK-LABEL: zip_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: mov z2.s, z0.s[3] ; CHECK-NEXT: mov z3.s, z1.s[3] ; CHECK-NEXT: mov z4.s, z0.s[2] +; CHECK-NEXT: mov z5.s, z1.s[2] ; CHECK-NEXT: zip1 z0.s, z1.s, z0.s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z1.s[2] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: stp w9, w8, [sp, #8] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w9, w8, [sp] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip_v4i32: @@ -1209,65 +1135,44 @@ define void @trn_v8i32_undef(ptr %a) { define void @zip2_v32i8(ptr %a, ptr %b) #0{ ; CHECK-LABEL: zip2_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: mov z2.b, z0.b[15] -; CHECK-NEXT: mov z3.b, z0.b[14] -; CHECK-NEXT: mov z4.b, z0.b[13] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.b, z0.b[11] -; CHECK-NEXT: mov z2.b, z0.b[12] -; CHECK-NEXT: strb w8, [sp, #14] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z0.b[10] -; CHECK-NEXT: strb w9, [sp, #12] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[9] -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z0.b[8] -; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: mov z4.b, z0.b[14] +; CHECK-NEXT: mov z6.b, z0.b[13] +; CHECK-NEXT: mov z3.b, z1.b[15] +; CHECK-NEXT: mov z5.b, z1.b[14] +; CHECK-NEXT: mov z7.b, z1.b[13] +; CHECK-NEXT: mov z16.b, z0.b[12] +; CHECK-NEXT: mov z17.b, z1.b[12] +; CHECK-NEXT: mov z18.b, z0.b[11] +; CHECK-NEXT: mov z19.b, z1.b[11] +; CHECK-NEXT: mov z20.b, z0.b[10] +; CHECK-NEXT: mov z21.b, z1.b[10] +; CHECK-NEXT: mov z22.b, z0.b[9] +; CHECK-NEXT: mov z23.b, z1.b[9] +; CHECK-NEXT: mov z24.b, z0.b[8] +; CHECK-NEXT: mov z25.b, z1.b[8] +; CHECK-NEXT: zip1 z2.b, z2.b, z3.b +; CHECK-NEXT: zip1 z3.b, z4.b, z5.b +; CHECK-NEXT: zip1 z4.b, z6.b, z7.b +; CHECK-NEXT: zip1 z5.b, z16.b, z17.b +; CHECK-NEXT: zip1 z6.b, z18.b, z19.b +; CHECK-NEXT: zip1 z7.b, z20.b, z21.b +; CHECK-NEXT: zip1 z16.b, z22.b, z23.b ; CHECK-NEXT: zip1 z0.b, z0.b, z1.b -; CHECK-NEXT: strb w8, [sp, #6] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[15] -; CHECK-NEXT: strb w8, [sp, #4] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[14] -; CHECK-NEXT: strb w8, [sp, #2] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[13] -; CHECK-NEXT: strb w8, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[12] -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[11] -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[10] -; CHECK-NEXT: strb w8, [sp, #11] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[9] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: strb w8, [sp, #9] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[8] -; CHECK-NEXT: strb w9, [sp, #5] -; CHECK-NEXT: strb w8, [sp, #7] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strb w8, [sp, #3] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w8, [sp, #1] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z17.b, z24.b, z25.b +; CHECK-NEXT: zip1 z2.h, z3.h, z2.h +; CHECK-NEXT: zip1 z3.h, z5.h, z4.h +; CHECK-NEXT: zip1 z4.h, z7.h, z6.h ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z5.h, z17.h, z16.h +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip2_v32i8: @@ -1359,41 +1264,28 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{ define void @zip2_v16i16(ptr %a, ptr %b) #0{ ; CHECK-LABEL: zip2_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: mov z2.h, z0.h[7] -; CHECK-NEXT: mov z3.h, z0.h[6] -; CHECK-NEXT: mov z4.h, z0.h[5] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[4] -; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z0.h[6] +; CHECK-NEXT: mov z6.h, z0.h[5] ; CHECK-NEXT: mov z3.h, z1.h[7] +; CHECK-NEXT: mov z5.h, z1.h[6] +; CHECK-NEXT: mov z7.h, z1.h[5] +; CHECK-NEXT: mov z16.h, z0.h[4] +; CHECK-NEXT: mov z17.h, z1.h[4] ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[6] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.h, z1.h[5] -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.h, z1.h[4] -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #10] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.h, z2.h, z3.h +; CHECK-NEXT: zip1 z3.h, z4.h, z5.h +; CHECK-NEXT: zip1 z4.h, z6.h, z7.h +; CHECK-NEXT: zip1 z5.h, z16.h, z17.h ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip2_v16i16: @@ -1453,8 +1345,6 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{ define void @zip2_v8i32(ptr %a, ptr %b) #0{ ; CHECK-LABEL: zip2_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1] @@ -1462,18 +1352,13 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{ ; CHECK-NEXT: mov z2.s, z0.s[3] ; CHECK-NEXT: mov z4.s, z0.s[2] ; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: mov z5.s, z1.s[2] ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z1.s[2] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: stp w8, w9, [sp, #8] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w8, w9, [sp] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.s, z2.s, z3.s +; CHECK-NEXT: zip1 z3.s, z4.s, z5.s ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip2_v8i32: @@ -1547,197 +1432,139 @@ define void @zip2_v8i32_undef(ptr %a) #0{ define void @uzp_v32i8(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: mov z4.b, z3.b[14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z6.b, z3.b[10] -; CHECK-NEXT: mov z5.b, z3.b[12] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z7.b, z3.b[8] -; CHECK-NEXT: mov z17.b, z3.b[9] -; CHECK-NEXT: mov z18.b, z3.b[7] -; CHECK-NEXT: mov z16.b, z3.b[11] -; CHECK-NEXT: strb w8, [sp, #40] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z3.b[6] -; CHECK-NEXT: strb w9, [sp, #32] -; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.b, z3.b[4] -; CHECK-NEXT: strb w8, [sp, #47] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z3.b[2] -; CHECK-NEXT: strb w9, [sp, #46] -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: mov z7.b, z2.b[14] -; CHECK-NEXT: strb w8, [sp, #45] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z2.b[12] -; CHECK-NEXT: strb w9, [sp, #44] -; CHECK-NEXT: fmov w9, s16 -; CHECK-NEXT: mov z16.b, z2.b[11] -; CHECK-NEXT: strb w8, [sp, #43] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z2.b[10] -; CHECK-NEXT: strb w9, [sp, #61] -; CHECK-NEXT: fmov w9, s16 -; CHECK-NEXT: strb w8, [sp, #42] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z2.b[8] -; CHECK-NEXT: strb w9, [sp, #53] -; CHECK-NEXT: strb w8, [sp, #41] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z2.b[6] -; CHECK-NEXT: strb w8, [sp, #39] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z2.b[4] -; CHECK-NEXT: strb w8, [sp, #38] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z2.b[2] -; CHECK-NEXT: strb w8, [sp, #37] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z1.b[10] -; CHECK-NEXT: strb w8, [sp, #36] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z1.b[8] -; CHECK-NEXT: strb w8, [sp, #35] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[14] -; CHECK-NEXT: strb w8, [sp, #34] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z1.b[12] -; CHECK-NEXT: strb w8, [sp, #33] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w8, [sp, #8] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[6] -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z1.b[4] -; CHECK-NEXT: strb w8, [sp, #14] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z1.b[2] -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z0.b[14] -; CHECK-NEXT: strb w8, [sp, #12] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z0.b[12] -; CHECK-NEXT: strb w8, [sp, #11] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z0.b[10] -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z0.b[8] -; CHECK-NEXT: strb w8, [sp, #9] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z0.b[6] -; CHECK-NEXT: strb w8, [sp, #7] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z0.b[4] -; CHECK-NEXT: strb w8, [sp, #6] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z0.b[2] -; CHECK-NEXT: strb w8, [sp, #5] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z3.b[15] -; CHECK-NEXT: strb w8, [sp, #4] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z3.b[13] -; CHECK-NEXT: strb w8, [sp, #3] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: ldr q4, [sp, #32] -; CHECK-NEXT: strb w8, [sp, #2] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strb w8, [sp, #1] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z3.b[5] -; CHECK-NEXT: mov z3.b, z3.b[3] -; CHECK-NEXT: ldr q5, [sp] -; CHECK-NEXT: strb w8, [sp, #63] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z2.b[13] -; CHECK-NEXT: strb w8, [sp, #62] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: strb w8, [sp, #60] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: strb w8, [sp, #59] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z2.b[9] -; CHECK-NEXT: strb w8, [sp, #58] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z2.b[5] -; CHECK-NEXT: strb w8, [sp, #57] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z2.b[3] +; CHECK-NEXT: stp d13, d12, [sp, #-48]! // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov z2.b, z1.b[14] +; CHECK-NEXT: mov z3.b, z1.b[12] +; CHECK-NEXT: mov z4.b, z1.b[10] +; CHECK-NEXT: mov z5.b, z1.b[8] +; CHECK-NEXT: mov z6.b, z1.b[6] +; CHECK-NEXT: mov z7.b, z1.b[4] +; CHECK-NEXT: mov z16.b, z1.b[2] +; CHECK-NEXT: mov z18.b, z0.b[14] +; CHECK-NEXT: mov z19.b, z0.b[12] +; CHECK-NEXT: zip1 z3.b, z3.b, z2.b +; CHECK-NEXT: ldp q2, q17, [x1] +; CHECK-NEXT: mov z20.b, z0.b[10] +; CHECK-NEXT: zip1 z4.b, z5.b, z4.b +; CHECK-NEXT: zip1 z5.b, z7.b, z6.b +; CHECK-NEXT: zip1 z6.b, z1.b, z16.b +; CHECK-NEXT: mov z7.b, z0.b[8] +; CHECK-NEXT: mov z16.b, z0.b[6] +; CHECK-NEXT: mov z21.b, z0.b[4] +; CHECK-NEXT: mov z22.b, z0.b[2] +; CHECK-NEXT: mov z23.b, z17.b[14] +; CHECK-NEXT: mov z24.b, z17.b[12] +; CHECK-NEXT: mov z25.b, z17.b[10] +; CHECK-NEXT: mov z26.b, z17.b[8] +; CHECK-NEXT: mov z27.b, z17.b[6] +; CHECK-NEXT: mov z28.b, z17.b[4] +; CHECK-NEXT: mov z29.b, z17.b[2] +; CHECK-NEXT: zip1 z18.b, z19.b, z18.b +; CHECK-NEXT: zip1 z7.b, z7.b, z20.b +; CHECK-NEXT: zip1 z16.b, z21.b, z16.b +; CHECK-NEXT: zip1 z19.b, z0.b, z22.b +; CHECK-NEXT: zip1 z20.b, z24.b, z23.b +; CHECK-NEXT: zip1 z21.b, z26.b, z25.b +; CHECK-NEXT: zip1 z22.b, z28.b, z27.b +; CHECK-NEXT: mov z24.b, z2.b[14] +; CHECK-NEXT: mov z25.b, z2.b[12] +; CHECK-NEXT: mov z26.b, z2.b[10] +; CHECK-NEXT: mov z27.b, z2.b[8] +; CHECK-NEXT: zip1 z23.b, z17.b, z29.b +; CHECK-NEXT: zip1 z3.h, z4.h, z3.h +; CHECK-NEXT: zip1 z4.h, z6.h, z5.h +; CHECK-NEXT: zip1 z5.h, z7.h, z18.h +; CHECK-NEXT: zip1 z6.h, z19.h, z16.h +; CHECK-NEXT: zip1 z7.h, z21.h, z20.h +; CHECK-NEXT: zip1 z18.b, z25.b, z24.b +; CHECK-NEXT: zip1 z19.b, z27.b, z26.b +; CHECK-NEXT: mov z20.b, z2.b[6] +; CHECK-NEXT: mov z21.b, z2.b[4] +; CHECK-NEXT: mov z29.b, z17.b[3] +; CHECK-NEXT: mov z30.b, z17.b[1] +; CHECK-NEXT: mov z31.b, z2.b[15] +; CHECK-NEXT: mov z8.b, z2.b[13] +; CHECK-NEXT: zip1 z16.h, z23.h, z22.h +; CHECK-NEXT: mov z22.b, z2.b[2] +; CHECK-NEXT: mov z23.b, z17.b[15] +; CHECK-NEXT: mov z24.b, z17.b[13] +; CHECK-NEXT: mov z25.b, z17.b[11] +; CHECK-NEXT: mov z26.b, z17.b[9] +; CHECK-NEXT: mov z27.b, z17.b[7] +; CHECK-NEXT: mov z28.b, z17.b[5] +; CHECK-NEXT: zip1 z17.h, z19.h, z18.h +; CHECK-NEXT: zip1 z21.b, z21.b, z20.b +; CHECK-NEXT: zip1 z19.b, z30.b, z29.b +; CHECK-NEXT: zip1 z20.b, z8.b, z31.b +; CHECK-NEXT: mov z29.b, z1.b[15] +; CHECK-NEXT: mov z30.b, z1.b[13] +; CHECK-NEXT: mov z31.b, z1.b[11] +; CHECK-NEXT: mov z8.b, z1.b[9] +; CHECK-NEXT: zip1 z22.b, z2.b, z22.b +; CHECK-NEXT: zip1 z23.b, z24.b, z23.b +; CHECK-NEXT: zip1 z24.b, z26.b, z25.b +; CHECK-NEXT: zip1 z18.b, z28.b, z27.b +; CHECK-NEXT: mov z25.b, z2.b[11] +; CHECK-NEXT: mov z26.b, z2.b[9] +; CHECK-NEXT: mov z27.b, z2.b[7] +; CHECK-NEXT: mov z28.b, z2.b[5] +; CHECK-NEXT: mov z9.b, z1.b[7] +; CHECK-NEXT: mov z10.b, z1.b[5] +; CHECK-NEXT: mov z1.b, z1.b[3] +; CHECK-NEXT: mov z11.b, z0.b[11] +; CHECK-NEXT: mov z12.b, z0.b[9] +; CHECK-NEXT: zip1 z29.b, z30.b, z29.b +; CHECK-NEXT: mov z30.b, z0.b[3] +; CHECK-NEXT: mov z13.b, z0.b[1] +; CHECK-NEXT: zip1 z31.b, z8.b, z31.b +; CHECK-NEXT: mov z8.b, z2.b[3] ; CHECK-NEXT: mov z2.b, z2.b[1] -; CHECK-NEXT: strb w8, [sp, #54] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z1.b[15] -; CHECK-NEXT: strb w8, [sp, #52] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[13] -; CHECK-NEXT: strb w8, [sp, #50] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z1.b[11] -; CHECK-NEXT: strb w8, [sp, #49] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[9] -; CHECK-NEXT: strb w8, [sp, #48] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z1.b[7] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[15] -; CHECK-NEXT: strb w8, [sp, #31] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[5] -; CHECK-NEXT: strb w9, [sp, #28] -; CHECK-NEXT: strb w8, [sp, #30] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z1.b[3] -; CHECK-NEXT: mov z1.b, z1.b[1] -; CHECK-NEXT: strb w8, [sp, #29] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z0.b[11] -; CHECK-NEXT: strb w8, [sp, #27] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z0.b[13] -; CHECK-NEXT: strb w8, [sp, #26] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: strb w8, [sp, #25] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.b, z0.b[9] -; CHECK-NEXT: strb w8, [sp, #24] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z0.b[7] -; CHECK-NEXT: strb w8, [sp, #23] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z0.b[5] -; CHECK-NEXT: strb w8, [sp, #22] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z0.b[3] -; CHECK-NEXT: mov z0.b, z0.b[1] -; CHECK-NEXT: strb w8, [sp, #21] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w8, [sp, #20] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w8, [sp, #19] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strb w8, [sp, #18] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strb w8, [sp, #17] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: add z0.b, z4.b, z0.b -; CHECK-NEXT: strb w8, [sp, #16] -; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: add z1.b, z5.b, z1.b +; CHECK-NEXT: zip1 z9.b, z10.b, z9.b +; CHECK-NEXT: zip1 z10.b, z12.b, z11.b +; CHECK-NEXT: zip1 z1.b, z0.b, z1.b +; CHECK-NEXT: zip1 z30.b, z13.b, z30.b +; CHECK-NEXT: mov z11.b, z0.b[13] +; CHECK-NEXT: mov z0.b, z0.b[5] +; CHECK-NEXT: zip1 z25.b, z26.b, z25.b +; CHECK-NEXT: zip1 z26.b, z28.b, z27.b +; CHECK-NEXT: zip1 z2.b, z2.b, z8.b +; CHECK-NEXT: zip1 z21.h, z22.h, z21.h +; CHECK-NEXT: zip1 z22.h, z24.h, z23.h +; CHECK-NEXT: zip1 z23.h, z31.h, z29.h +; CHECK-NEXT: zip1 z1.h, z1.h, z9.h +; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z24.h, z10.h, z11.h +; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z0.h, z30.h, z0.h +; CHECK-NEXT: zip1 z18.h, z19.h, z18.h +; CHECK-NEXT: zip1 z19.h, z25.h, z20.h +; CHECK-NEXT: zip1 z2.h, z2.h, z26.h +; CHECK-NEXT: zip1 z3.s, z4.s, z3.s +; CHECK-NEXT: zip1 z4.s, z6.s, z5.s +; CHECK-NEXT: zip1 z5.s, z16.s, z7.s +; CHECK-NEXT: zip1 z1.s, z1.s, z23.s +; CHECK-NEXT: zip1 z6.s, z21.s, z17.s +; CHECK-NEXT: zip1 z0.s, z0.s, z24.s +; CHECK-NEXT: zip1 z7.s, z18.s, z22.s +; CHECK-NEXT: zip1 z2.s, z2.s, z19.s +; CHECK-NEXT: zip1 z3.d, z4.d, z3.d +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: zip1 z1.d, z6.d, z5.d +; CHECK-NEXT: zip1 z2.d, z2.d, z7.d +; CHECK-NEXT: add z0.b, z3.b, z0.b +; CHECK-NEXT: add z1.b, z1.b, z2.b ; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ldp d13, d12, [sp], #48 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v32i8: @@ -1922,110 +1749,71 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: mov z4.h, z3.h[6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z6.h, z3.h[2] -; CHECK-NEXT: mov z5.h, z3.h[4] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z7.h, z2.h[6] -; CHECK-NEXT: mov z17.h, z2.h[7] -; CHECK-NEXT: mov z16.h, z3.h[1] -; CHECK-NEXT: strh w8, [sp, #40] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z2.h[4] -; CHECK-NEXT: strh w9, [sp, #32] -; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z2.h[2] -; CHECK-NEXT: strh w8, [sp, #46] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z1.h[2] -; CHECK-NEXT: strh w9, [sp, #44] -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: mov z7.h, z0.h[6] -; CHECK-NEXT: strh w8, [sp, #42] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[6] -; CHECK-NEXT: strh w9, [sp, #38] -; CHECK-NEXT: fmov w9, s16 -; CHECK-NEXT: strh w8, [sp, #36] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z1.h[4] -; CHECK-NEXT: strh w9, [sp, #56] -; CHECK-NEXT: strh w8, [sp, #34] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z0.h[4] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z0.h[2] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z3.h[7] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.h, z3.h[5] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z3.h[3] -; CHECK-NEXT: ldr q3, [sp, #32] -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z2.h[5] -; CHECK-NEXT: ldr q4, [sp] -; CHECK-NEXT: strh w8, [sp, #62] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.h, z1.h[7] -; CHECK-NEXT: strh w8, [sp, #60] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z2.h[3] -; CHECK-NEXT: mov z2.h, z2.h[1] -; CHECK-NEXT: strh w8, [sp, #58] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.h, z0.h[7] -; CHECK-NEXT: strh w8, [sp, #54] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z1.h[5] -; CHECK-NEXT: strh w9, [sp, #48] -; CHECK-NEXT: strh w8, [sp, #52] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset b8, -16 +; CHECK-NEXT: ldp q1, q6, [x0] +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: mov z3.h, z6.h[6] +; CHECK-NEXT: mov z4.h, z6.h[4] +; CHECK-NEXT: mov z5.h, z6.h[2] +; CHECK-NEXT: mov z7.h, z1.h[6] +; CHECK-NEXT: mov z16.h, z1.h[4] +; CHECK-NEXT: mov z17.h, z1.h[2] +; CHECK-NEXT: mov z18.h, z2.h[6] +; CHECK-NEXT: mov z19.h, z2.h[4] +; CHECK-NEXT: mov z20.h, z2.h[2] +; CHECK-NEXT: mov z21.h, z0.h[6] +; CHECK-NEXT: mov z22.h, z0.h[4] +; CHECK-NEXT: zip1 z3.h, z4.h, z3.h +; CHECK-NEXT: zip1 z4.h, z6.h, z5.h +; CHECK-NEXT: zip1 z5.h, z16.h, z7.h +; CHECK-NEXT: zip1 z7.h, z1.h, z17.h +; CHECK-NEXT: zip1 z16.h, z19.h, z18.h +; CHECK-NEXT: zip1 z18.h, z2.h, z20.h +; CHECK-NEXT: mov z19.h, z0.h[2] +; CHECK-NEXT: zip1 z17.h, z22.h, z21.h +; CHECK-NEXT: mov z20.h, z6.h[7] +; CHECK-NEXT: mov z21.h, z6.h[5] +; CHECK-NEXT: mov z22.h, z6.h[3] +; CHECK-NEXT: mov z6.h, z6.h[1] +; CHECK-NEXT: mov z23.h, z1.h[7] +; CHECK-NEXT: mov z24.h, z1.h[5] +; CHECK-NEXT: mov z25.h, z1.h[3] ; CHECK-NEXT: mov z1.h, z1.h[1] -; CHECK-NEXT: strh w8, [sp, #50] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z0.h[5] -; CHECK-NEXT: strh w8, [sp, #28] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[1] -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #24] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: add z0.h, z3.h, z0.h -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: add z1.h, z4.h, z1.h -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: mov z26.h, z2.h[7] +; CHECK-NEXT: mov z27.h, z2.h[5] +; CHECK-NEXT: mov z28.h, z2.h[3] +; CHECK-NEXT: mov z2.h, z2.h[1] +; CHECK-NEXT: mov z29.h, z0.h[7] +; CHECK-NEXT: mov z30.h, z0.h[5] +; CHECK-NEXT: mov z31.h, z0.h[3] +; CHECK-NEXT: mov z8.h, z0.h[1] +; CHECK-NEXT: zip1 z0.h, z0.h, z19.h +; CHECK-NEXT: zip1 z19.h, z21.h, z20.h +; CHECK-NEXT: zip1 z6.h, z6.h, z22.h +; CHECK-NEXT: zip1 z20.h, z24.h, z23.h +; CHECK-NEXT: zip1 z1.h, z1.h, z25.h +; CHECK-NEXT: zip1 z21.h, z27.h, z26.h +; CHECK-NEXT: zip1 z2.h, z2.h, z28.h +; CHECK-NEXT: zip1 z22.h, z30.h, z29.h +; CHECK-NEXT: zip1 z23.h, z8.h, z31.h +; CHECK-NEXT: zip1 z3.s, z4.s, z3.s +; CHECK-NEXT: zip1 z4.s, z7.s, z5.s +; CHECK-NEXT: zip1 z5.s, z18.s, z16.s +; CHECK-NEXT: zip1 z6.s, z6.s, z19.s +; CHECK-NEXT: zip1 z1.s, z1.s, z20.s +; CHECK-NEXT: zip1 z0.s, z0.s, z17.s +; CHECK-NEXT: zip1 z2.s, z2.s, z21.s +; CHECK-NEXT: zip1 z7.s, z23.s, z22.s +; CHECK-NEXT: zip1 z3.d, z4.d, z3.d +; CHECK-NEXT: zip1 z1.d, z1.d, z6.d +; CHECK-NEXT: zip1 z0.d, z0.d, z5.d +; CHECK-NEXT: zip1 z2.d, z7.d, z2.d +; CHECK-NEXT: add z1.h, z3.h, z1.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v16i16: @@ -2116,32 +1904,28 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ define void @uzp_v8f32(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldp q6, q0, [x0] ; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: ldp q4, q1, [x1] +; CHECK-NEXT: ldp q1, q2, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov z3.s, z0.s[2] -; CHECK-NEXT: mov z5.s, z1.s[2] -; CHECK-NEXT: stp s0, s3, [sp, #24] -; CHECK-NEXT: mov z3.s, z4.s[2] -; CHECK-NEXT: stp s5, s2, [sp, #12] -; CHECK-NEXT: mov z5.s, z0.s[3] -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: stp s3, s1, [sp, #4] -; CHECK-NEXT: mov z1.s, z2.s[1] -; CHECK-NEXT: str s5, [sp, #44] +; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: mov z5.s, z0.s[1] +; CHECK-NEXT: mov z7.s, z2.s[2] +; CHECK-NEXT: mov z16.s, z1.s[2] +; CHECK-NEXT: zip1 z0.s, z0.s, z3.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: mov z4.s, z6.s[1] +; CHECK-NEXT: zip1 z2.s, z2.s, z7.s ; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI21_0] -; CHECK-NEXT: str s0, [sp, #40] -; CHECK-NEXT: ldp q3, q2, [sp] -; CHECK-NEXT: tbl z0.s, { z4.s }, z5.s -; CHECK-NEXT: str s1, [sp, #32] -; CHECK-NEXT: ldr q1, [sp, #32] -; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: zip1 z7.s, z0.s, z16.s +; CHECK-NEXT: tbl z1.s, { z1.s }, z5.s +; CHECK-NEXT: zip1 z0.d, z6.d, z0.d +; CHECK-NEXT: zip1 z3.d, z4.d, z3.d +; CHECK-NEXT: zip1 z2.d, z7.d, z2.d ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z3.s -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v8f32: @@ -2231,60 +2015,38 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{ define void @uzp_v8i16(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov z2.h, z1.h[6] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z4.h, z1.h[2] -; CHECK-NEXT: mov z6.h, z0.h[4] -; CHECK-NEXT: mov z3.h, z1.h[4] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z5.h, z0.h[6] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[2] -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.h, z1.h[7] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[5] -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z1.h[3] -; CHECK-NEXT: mov z1.h, z1.h[1] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[7] -; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z0.h[5] -; CHECK-NEXT: strh w8, [sp, #28] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[1] -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: ldp q3, q0, [sp] -; CHECK-NEXT: add z0.h, z3.h, z0.h +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z3.h, z0.h[4] +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: mov z5.h, z1.h[6] +; CHECK-NEXT: mov z6.h, z1.h[4] +; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z16.h, z0.h[7] +; CHECK-NEXT: mov z17.h, z0.h[5] +; CHECK-NEXT: mov z18.h, z0.h[3] +; CHECK-NEXT: mov z19.h, z0.h[1] +; CHECK-NEXT: mov z20.h, z1.h[7] +; CHECK-NEXT: mov z21.h, z1.h[5] +; CHECK-NEXT: mov z22.h, z1.h[3] +; CHECK-NEXT: mov z23.h, z1.h[1] +; CHECK-NEXT: zip1 z2.h, z3.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z4.h +; CHECK-NEXT: zip1 z3.h, z6.h, z5.h +; CHECK-NEXT: zip1 z1.h, z1.h, z7.h +; CHECK-NEXT: zip1 z4.h, z17.h, z16.h +; CHECK-NEXT: zip1 z5.h, z19.h, z18.h +; CHECK-NEXT: zip1 z6.h, z21.h, z20.h +; CHECK-NEXT: zip1 z7.h, z23.h, z22.h +; CHECK-NEXT: zip1 z0.s, z0.s, z2.s +; CHECK-NEXT: zip1 z1.s, z1.s, z3.s +; CHECK-NEXT: zip1 z2.s, z5.s, z4.s +; CHECK-NEXT: zip1 z3.s, z7.s, z6.s +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d +; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v8i16: @@ -2341,31 +2103,21 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{ define void @uzp_v8i32_undef(ptr %a) #0{ ; CHECK-LABEL: uzp_v8i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z2.s, z0.s[2] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z3.s, z1.s[2] -; CHECK-NEXT: mov z4.s, z0.s[3] -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.s, z1.s[3] -; CHECK-NEXT: stp w8, w9, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: stp w8, w9, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: stp w9, w8, [sp, #24] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: stp w9, w8, [sp, #16] -; CHECK-NEXT: ldp q0, q1, [sp] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: mov z3.s, z0.s[2] +; CHECK-NEXT: mov z4.s, z1.s[3] +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z0.s[3] +; CHECK-NEXT: mov z7.s, z0.s[1] +; CHECK-NEXT: zip1 z1.s, z1.s, z2.s +; CHECK-NEXT: zip1 z0.s, z0.s, z3.s +; CHECK-NEXT: zip1 z2.s, z5.s, z4.s +; CHECK-NEXT: zip1 z3.s, z7.s, z6.s +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: stp q0, q0, [x0] -; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v8i32_undef: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll index 88c83a214c7394..c942f1eca8ebaf 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll @@ -10,22 +10,14 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i1> @reshuffle_v4i1_nxv4i1( %a) { ; CHECK-LABEL: reshuffle_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z1.s, z0.s[3] -; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z2.s, z0.s[2] ; CHECK-NEXT: mov z3.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %el0 = extractelement %a, i32 0 %el1 = extractelement %a, i32 1