Merge pull request #6817 from Akira1Saitoh/aarch64VectorMaskReduction

AArch64: Implement mask reduction operation evaluators
eclipse · Nov 21, 2022 · 26e5221 · 26e5221
2 parents e3ccf74 + cbc4e20
commit 26e5221
Show file tree

Hide file tree

Showing 5 changed files with 512 additions and 5 deletions.
diff --git a/compiler/aarch64/codegen/ARM64Debug.cpp b/compiler/aarch64/codegen/ARM64Debug.cpp
@@ -1842,6 +1842,41 @@ TR_Debug::print(TR::FILE *pOutFile, TR::ARM64Trg1Src1ImmInstruction *instr)
          trfprintf(pOutFile, ", %d, %d", immr, imms);
          }
       }
+   else if (op == TR::InstOpCode::bfmx || op == TR::InstOpCode::bfmw)
+      {
+      // The bfm immediate packs both fields as (immr << 6) | imms.
+      const bool is64bitForm = (op == TR::InstOpCode::bfmx);
+      const uint32_t regBits = is64bitForm ? 64 : 32;
+      uint32_t imm12 = instr->getSourceImmediate();
+      auto immr = imm12 >> 6;
+      auto imms = imm12 & 0x3f;
+      // For the 32-bit form both fields must be below 32 for an alias to apply;
+      // anything else falls through to the raw bfm printout below.
+      if (is64bitForm || ((immr < 32) && (imms < 32)))
+         {
+         if (imms < immr)
+            {
+            // bfi alias: lsb = <register size> - immr, width = imms + 1
+            done = true;
+            trfprintf(pOutFile, "%s \t", is64bitForm ? "bfix" : "bfiw");
+            print(pOutFile, instr->getTargetRegister(), TR_WordReg); trfprintf(pOutFile, ", ");
+            print(pOutFile, instr->getSource1Register(), TR_WordReg);
+            // The register size must match the opcode: the 32-bit form encodes immr as 32 - lsb.
+            trfprintf(pOutFile, ", %d, %d", regBits - immr, imms + 1);
+            }
+         else
+            {
+            // bfxil alias: lsb = immr, width = imms + 1 - immr
+            done = true;
+            trfprintf(pOutFile, "%s \t", is64bitForm ? "bfxilx" : "bfxilw");
+            print(pOutFile, instr->getTargetRegister(), TR_WordReg); trfprintf(pOutFile, ", ");
+            print(pOutFile, instr->getSource1Register(), TR_WordReg);
+            trfprintf(pOutFile, ", %d, %d", immr, imms + 1 - immr);
+            }
+         }
+      if (!done)
+         {
+         // No alias applies: print the opcode name with the raw immr/imms fields.
+         done = true;
+         trfprintf(pOutFile, "%s \t", getOpCodeName(&instr->getOpCode()));
+         print(pOutFile, instr->getTargetRegister(), TR_WordReg); trfprintf(pOutFile, ", ");
+         print(pOutFile, instr->getSource1Register(), TR_WordReg);
+         trfprintf(pOutFile, ", %d, %d", immr, imms);
+         }
+      }
    else if (op == TR::InstOpCode::andimmx || op == TR::InstOpCode::andimmw ||
             op == TR::InstOpCode::andsimmx || op == TR::InstOpCode::andsimmw ||
             op == TR::InstOpCode::orrimmx || op == TR::InstOpCode::orrimmw ||

diff --git a/compiler/aarch64/codegen/GenerateInstructions.cpp b/compiler/aarch64/codegen/GenerateInstructions.cpp
@@ -634,6 +634,16 @@ TR::Instruction *generateUBFIZInstruction(TR::CodeGenerator *cg, TR::Node *node,
    return generateTrg1Src1ImmInstruction(cg, is64bit ? TR::InstOpCode::ubfmx : TR::InstOpCode::ubfmw, node, treg, sreg, (immr << 6) | imms, preced);
    }
 
+// Emits bfi as its underlying bfm encoding: copies the low <width> bits of sreg
+// into treg starting at bit <lsb>. Requires lsb < register size and
+// 1 <= width <= register size - lsb; anything else is a fatal assertion.
+TR::Instruction *generateBFIInstruction(TR::CodeGenerator *cg, TR::Node *node,
+   TR::Register *treg, TR::Register *sreg, uint32_t lsb, uint32_t width, bool is64bit, TR::Instruction *preced)
+   {
+   const uint32_t regSize = is64bit ? 64 : 32;
+   // lsb is checked first so that (regSize - lsb) cannot wrap around.
+   // A bitfield that runs past the top of the register would give imms >= immr,
+   // which is the bfxil encoding rather than bfi, so it is rejected here.
+   TR_ASSERT_FATAL((lsb < regSize) && (width >= 1) && (width <= regSize - lsb),
+                   "bitfield for bfi is out of range: is64bit=%d, lsb=%d, width=%d", is64bit, lsb, width);
+   uint32_t imms = width - 1;
+   // immr = (-lsb) mod regSize; the modulo maps lsb == 0 to immr == 0 instead of regSize.
+   uint32_t immr = (regSize - lsb) % regSize;
+   return generateTrg1Src1ImmInstruction(cg, is64bit ? TR::InstOpCode::bfmx : TR::InstOpCode::bfmw, node, treg, sreg, (immr << 6) | imms, preced);
+   }
+
 TR::Instruction *generateVectorShiftImmediateInstruction(TR::CodeGenerator *cg, TR::InstOpCode::Mnemonic op, TR::Node *node,
    TR::Register *treg, TR::Register *sreg, uint32_t shiftAmount, TR::Instruction *preced)
    {

diff --git a/compiler/aarch64/codegen/GenerateInstructions.hpp b/compiler/aarch64/codegen/GenerateInstructions.hpp
@@ -1166,6 +1166,34 @@ TR::Instruction *generateUBFIZInstruction(
                   bool is64bit,
                   TR::Instruction *preced = NULL);
 
+/**
+ * @brief Generates bfi instruction
+ *
+ * @details Generates bfi instruction which copies a bitfield of <width> bits
+ *          from the least significant bits of the source register to
+ *          the bit position <lsb> of the target register.
+ *          The bits above and below the bitfield in the target register are unchanged.
+ *
+ * @param[in] cg      : CodeGenerator
+ * @param[in] node    : node
+ * @param[in] treg    : target register
+ * @param[in] sreg    : source register
+ * @param[in] lsb     : the bit position in the target register where the bitfield is placed
+ * @param[in] width   : the bitfield width to copy
+ * @param[in] is64bit : true if 64bit
+ * @param[in] preced  : preceding instruction
+ * @return generated instruction
+ */
+TR::Instruction *generateBFIInstruction(
+                  TR::CodeGenerator *cg,
+                  TR::Node *node,
+                  TR::Register *treg,
+                  TR::Register *sreg,
+                  uint32_t lsb,
+                  uint32_t width,
+                  bool is64bit,
+                  TR::Instruction *preced = NULL);
+
 /**
  * @brief Generates vector shift left immediate instruction
  *

diff --git a/compiler/aarch64/codegen/OMRCodeGenerator.cpp b/compiler/aarch64/codegen/OMRCodeGenerator.cpp
@@ -706,6 +706,11 @@ bool OMR::ARM64::CodeGenerator::getSupportsOpCodeForAutoSIMD(TR::CPU *cpu, TR::I
       case TR::mstorei:
       case TR::mRegLoad:
       case TR::mRegStore:
+      case TR::mTrueCount:
+      case TR::mFirstTrue:
+      case TR::mLastTrue:
+      case TR::mToLongBits:
+      case TR::mLongBitsToMask:
       case TR::vsplats:
          return true;
       case TR::vfma: