diff --git a/compiler/p/codegen/OMRCodeGenerator.cpp b/compiler/p/codegen/OMRCodeGenerator.cpp
index 93c7da88738..753b2792dec 100644
--- a/compiler/p/codegen/OMRCodeGenerator.cpp
+++ b/compiler/p/codegen/OMRCodeGenerator.cpp
@@ -1791,6 +1791,9 @@ bool OMR::Power::CodeGenerator::getSupportsOpCodeForAutoSIMD(TR::CPU *cpu, TR::I
         case TR::m2v:
             // only P9 has splat byte immediate, otherwise it's too expensive
             return cpu->isAtLeast(OMR_PROCESSOR_PPC_P9);
+        case TR::l2m:
+            TR_ASSERT_FATAL(et == TR::Int16, "Unsupported vector type %s for l2m (must be Int16)\n", et.toString());
+            return cpu->isAtLeast(OMR_PROCESSOR_PPC_P8);
         default:
             return false;
     }
diff --git a/compiler/p/codegen/OMRInstOpCode.enum b/compiler/p/codegen/OMRInstOpCode.enum
index 005029bbf33..51969e4f9cf 100644
--- a/compiler/p/codegen/OMRInstOpCode.enum
+++ b/compiler/p/codegen/OMRInstOpCode.enum
@@ -962,7 +962,7 @@
 // xvxexpsp, // VSX Vector Extract Exponent SP
 // xvxsigdp, // VSX Vector Extract Significand DP
 // xvxsigsp, // VSX Vector Extract Significand SP
-// xxbrd, // VSX Vector Byte-Reverse Dword
+    xxbrd, // VSX Vector Byte-Reverse Dword
 // xxbrh, // VSX Vector Byte-Reverse Hword
 // xxbrw, // VSX Vector Byte-Reverse Word
     xxbrq, // VSX Vector Byte-Reverse Qword
diff --git a/compiler/p/codegen/OMRInstOpCodeProperties.hpp b/compiler/p/codegen/OMRInstOpCodeProperties.hpp
index e23c363e900..2ec9ba6d333 100644
--- a/compiler/p/codegen/OMRInstOpCodeProperties.hpp
+++ b/compiler/p/codegen/OMRInstOpCodeProperties.hpp
@@ -10938,17 +10938,16 @@
     /* PPCOpProp_SyncSideEffectFree, */
     /* }, */
 
-    /* { */
-    /* .mnemonic = OMR::InstOpCode::xxbrd, */
-    /* .name = "xxbrd", */
-    /* .description = "VSX Vector Byte-Reverse Dword", */
-    /* .prefix = 0x00000000, */
-    /* .opcode = 0xF017076C, */
-    /* .format = FORMAT_UNKNOWN, */
-    /* .minimumALS = OMR_PROCESSOR_PPC_P9, */
-    /* .properties = PPCOpProp_IsVSX | */
-    /* PPCOpProp_SyncSideEffectFree, */
-    /* }, */
+    {
+        /* .mnemonic = */ OMR::InstOpCode::xxbrd,
+        /* .name = */ "xxbrd",
+        /* .description = "VSX Vector Byte-Reverse Dword", */
+        /* .prefix = */ 0x00000000,
+        /* .opcode = */ 0xF017076C,
+        /* .format = */ FORMAT_XT_XB,
+        /* .minimumALS = */ OMR_PROCESSOR_PPC_P9,
+        /* .properties = */ PPCOpProp_IsVSX | PPCOpProp_SyncSideEffectFree,
+    },
 
     /* { */
     /* .mnemonic = OMR::InstOpCode::xxbrh, */
diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp
index 0eefb54eb16..6b05232a6f0 100644
--- a/compiler/p/codegen/OMRTreeEvaluator.cpp
+++ b/compiler/p/codegen/OMRTreeEvaluator.cpp
@@ -1037,7 +1037,77 @@ TR::Register *OMR::Power::TreeEvaluator::i2mEvaluator(TR::Node *node, TR::CodeGe
 
 TR::Register *OMR::Power::TreeEvaluator::l2mEvaluator(TR::Node *node, TR::CodeGenerator *cg)
 {
-    return TR::TreeEvaluator::unImpOpEvaluator(node, cg);
+    TR::Node *child = node->getFirstChild();
+    const bool isLittleEndian = cg->comp()->target().cpu.isLittleEndian();
+    const bool isP9OrLater = cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P9);
+
+    // l2m turns the boolean bytes packed in the long child into a mask with one halfword lane per byte
+    // (all ones for true, all zeros for false).
+    //
+    // In order to preserve the boolean array element order on little endian systems, we need to reverse the
+    // byte/element order of the given input. Due to factors such as instruction availability, there are
+    // three cases that each need to be handled differently:
+    //   1.) The child is an unevaluated load with refCount == 1, so it can be loaded byte-reversed directly
+    //   2.) Otherwise, the target system is P9 or higher (xxbrd is available)
+    //   3.) Otherwise, the target system is P8 or lower (halfwords are reversed with rotates and a permute)
+
+    TR::Register *srcReg;
+    bool reversed = false;
+
+    // Case (1): generateLoadNodeSequence expects a load node, so any other kind of child must take the
+    // generic evaluate() path even when it is single-use and unevaluated.
+    if (isLittleEndian && child->getReferenceCount() == 1 && child->getRegister() == NULL
+        && child->getOpCode().isLoadVar()) {
+        srcReg = cg->allocateRegister();
+        TR::LoadStoreHandler::generateLoadNodeSequence(cg, srcReg, child, TR::InstOpCode::ldbrx, 8, true);
+        reversed = true;
+    } else {
+        srcReg = cg->evaluate(child);
+    }
+
+    TR::Register *dstReg = cg->allocateRegister(TR_VRF);
+    TR::Register *tmpReg = cg->allocateRegister(TR_VRF);
+
+    node->setRegister(dstReg);
+
+    // move to VRF
+    generateTrg1Src1Instruction(cg, TR::InstOpCode::mtvsrd, node, dstReg, srcReg);
+
+    // In case (1) srcReg is a scratch register that was never attached to the child node, so nothing else
+    // will release it; free it here, right after its last use.
+    if (reversed)
+        cg->stopUsingRegister(srcReg);
+
+    // Case (2)
+    if (!reversed && isLittleEndian && isP9OrLater)
+        generateTrg1Src1Instruction(cg, TR::InstOpCode::xxbrd, node, dstReg, dstReg);
+
+    // unpack byte-length elements to halfword-length elements
+    generateTrg1Src1Instruction(cg, TR::InstOpCode::vupkhsb, node, dstReg, dstReg);
+
+    // Case (3): reverse the order of the eight halfwords after unpacking
+    if (!reversed && isLittleEndian && !isP9OrLater) {
+        // tmpReg = -16 in every word; vrlw only uses the low 5 bits (16), swapping halfwords within each word
+        generateTrg1ImmInstruction(cg, TR::InstOpCode::vspltisw, node, tmpReg, -16);
+        generateTrg1Src2Instruction(cg, TR::InstOpCode::vrlw, node, dstReg, dstReg, tmpReg);
+        // tmpReg = -32 in every word; vrld only uses the low 6 bits (32), swapping words within each doubleword
+        generateTrg1Src2Instruction(cg, TR::InstOpCode::vadduwm, node, tmpReg, tmpReg, tmpReg);
+        generateTrg1Src2Instruction(cg, TR::InstOpCode::vrld, node, dstReg, dstReg, tmpReg);
+        // swap the two doublewords
+        generateTrg1Src2ImmInstruction(cg, TR::InstOpCode::xxpermdi, node, dstReg, dstReg, dstReg, 2);
+    }
+
+    // since OMR assumes that boolean values are represented as 0x00 for false and 0x01 for true, we can create an
+    // all 0/1 mask by subtracting from 0:
+    //   0-1 = -1 = 0xFF...
+    //   0-0 = 0
+    generateTrg1ImmInstruction(cg, TR::InstOpCode::vspltisw, node, tmpReg, 0);
+    generateTrg1Src2Instruction(cg, TR::InstOpCode::vsubuhm, node, dstReg, tmpReg, dstReg);
+
+    cg->stopUsingRegister(tmpReg);
+    cg->decReferenceCount(child);
+
+    return dstReg;
 }
 
 TR::Register *OMR::Power::TreeEvaluator::v2mEvaluator(TR::Node *node, TR::CodeGenerator *cg)