-
Notifications
You must be signed in to change notification settings - Fork 429
Z: Implement vector bit compress and expand evaluators #8276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2253,24 +2253,206 @@ TR::Register *OMR::Z::TreeEvaluator::vmbyteswapEvaluator(TR::Node *node, TR::Cod | |
| return TR::TreeEvaluator::unImpOpEvaluator(node, cg); | ||
| } | ||
|
|
||
| /** | ||
| * \brief | ||
| * Compresses the lane-wise values of the source vector based on a bit mask. | ||
| * | ||
| * \details | ||
| * Performs lanewise compression similar to Integer/Long.compress(). | ||
| * For each bit position, if the corresponding bit in the mask is set, the bit from the source | ||
| * value is retained in the compressed result; otherwise, it is discarded. | ||
| * The retained bits are packed together at the low-order end of each element, keeping their order. | ||
| * e.g. mask= 10101010 | ||
| * source= 11001100 | ||
| * result= 00001010 | ||
| * | ||
| * vmcompressbitsEvaluator forwards to this evaluator; for a masked opcode the lane mask is read | ||
| * from the third child and lanes whose mask is false receive the unmodified source value. | ||
| * | ||
| * \param node | ||
| * The node. The first child is the source vector and the second child is the bit mask. | ||
| * | ||
| * \param cg | ||
| * The code generator. | ||
| * | ||
| * \return | ||
| * TR::Register with the compressed values. | ||
| */ | ||
| TR::Register *OMR::Z::TreeEvaluator::vcompressbitsEvaluator(TR::Node *node, TR::CodeGenerator *cg) | ||
| { | ||
| return TR::TreeEvaluator::unImpOpEvaluator(node, cg); | ||
| TR_ASSERT_FATAL_WITH_NODE(node, node->getDataType().getVectorLength() == TR::VectorLength128, | ||
| "Only 128-bit vectors are supported %s", node->getDataType().toString()); | ||
| const uint8_t elementSizeMask = getVectorElementSizeMask(node); | ||
| const uint32_t elementBitNum = getVectorElementSize(node) * 8; | ||
| TR::Register *resultReg = cg->allocateRegister(TR_VRF); | ||
| TR::Register *loopCountReg = cg->allocateRegister(); | ||
| TR::Register *scratchReg = cg->allocateRegister(TR_VRF); | ||
| TR::Register *sourceReg = cg->evaluate(node->getFirstChild()); | ||
| TR::Register *maskReg = cg->evaluate(node->getSecondChild()); | ||
| TR::RegisterDependencyConditions *dependencies = generateRegisterDependencyConditions(0, 5, cg); | ||
|
|
||
| // Initialize the result register to zero. | ||
| generateVRIaInstruction(cg, TR::InstOpCode::VGBM, node, resultReg, 0, 0); | ||
|
|
||
| // Set up loop counter to process all bits in vector elements (one iteration per bit, elementBitNum in total). | ||
| generateRIInstruction(cg, TR::InstOpCode::LHI, node, loopCountReg, elementBitNum); | ||
| TR::LabelSymbol *controlFlowStartLabel = generateLabelSymbol(cg); | ||
| controlFlowStartLabel->setStartInternalControlFlow(); | ||
|
|
||
| generateS390LabelInstruction(cg, TR::InstOpCode::label, node, controlFlowStartLabel); | ||
|
|
||
| const uint32_t msbPosition = elementBitNum - 1; | ||
| // Step 1: Extract the current mask MSB by shifting it down to the LSB position (each scratchReg element becomes 0 or 1). | ||
| generateVRSaInstruction(cg, TR::InstOpCode::VESRL, node, scratchReg, maskReg, | ||
| generateS390MemoryReference(msbPosition, cg), elementSizeMask); | ||
| // Step 2: Rotate source left by 1 so that its current MSB wraps around into the LSB position. | ||
| generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, sourceReg, sourceReg, generateS390MemoryReference(1, cg), | ||
| elementSizeMask); | ||
| // Step 3: Conditionally copy the source bit now sitting at the LSB into the result LSB if the mask bit is set. | ||
| // VSEL is a bitwise select: result bit = (scratchReg bit == 1) ? sourceReg bit : resultReg bit; scratchReg is 0 or 1, so only the LSB can change. | ||
| generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, resultReg, sourceReg, resultReg, scratchReg, 0, 0); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add comment -
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| // Step 4: Rotate result left by 1 if mask bit was set, making room for the next retained bit (scratchReg controls rotation amount: 0 or 1) | ||
| generateVRRcInstruction(cg, TR::InstOpCode::VERLLV, node, resultReg, resultReg, scratchReg, elementSizeMask); | ||
| // Step 5: Rotate mask left by 1 to advance to next bit position. Over the whole loop sourceReg and maskReg each make one full rotation, so both hold their original values again afterwards. | ||
| generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, maskReg, maskReg, generateS390MemoryReference(1, cg), | ||
| elementSizeMask); | ||
|
|
||
| generateS390BranchInstruction(cg, TR::InstOpCode::BRCT, node, loopCountReg, controlFlowStartLabel); | ||
|
|
||
| // Undo the one extra left rotation Step 4 applied after the last retained bit: rotating left by (element width - 1) is a right rotation by 1. | ||
| generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, resultReg, resultReg, | ||
| generateS390MemoryReference(msbPosition, cg), elementSizeMask); | ||
|
|
||
| dependencies->addPostCondition(sourceReg, TR::RealRegister::AssignAny); | ||
| dependencies->addPostCondition(resultReg, TR::RealRegister::AssignAny); | ||
| dependencies->addPostCondition(maskReg, TR::RealRegister::AssignAny); | ||
| dependencies->addPostCondition(scratchReg, TR::RealRegister::AssignAny); | ||
| dependencies->addPostCondition(loopCountReg, TR::RealRegister::AssignAny); | ||
| TR::LabelSymbol *controlFlowEndLabel = generateLabelSymbol(cg); | ||
| generateS390LabelInstruction(cg, TR::InstOpCode::label, node, controlFlowEndLabel, dependencies); | ||
| controlFlowEndLabel->setEndInternalControlFlow(); | ||
|
|
||
| cg->stopUsingRegister(scratchReg); | ||
| cg->stopUsingRegister(loopCountReg); | ||
|
|
||
| if (node->getOpCode().isVectorMasked()) { | ||
| TR::Node *maskChild = node->getThirdChild(); | ||
| // Apply the lane mask: the result reflects the compressed operation only for lanes where the mask is true; | ||
| // for false mask lanes, the original source value is preserved in the result register (sourceReg is back to its original value after the loop's full rotation). | ||
| generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, resultReg, resultReg, sourceReg, | ||
| cg->evaluate(maskChild), 0, 0); | ||
| cg->decReferenceCount(maskChild); | ||
| } | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TBH - this is the complex implementation but may be the only way we can achieve this operation ? We will have loop iteration that is controlled by the element size in number of bits. So worst case it executes 64 loop iteration for Long. Also in each iteration we have 5 vector operations. I do understand that we want this operation to work on all platform, but I really think that you should consider using bit extract and deposit instruction where it is available. Also
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am already working on that but I prefer to have it in a different PR so I can test it properly on the new hardware. |
||
| node->setRegister(resultReg); | ||
| cg->decReferenceCount(node->getFirstChild()); | ||
| cg->decReferenceCount(node->getSecondChild()); | ||
| return resultReg; | ||
| } | ||
|
|
||
| TR::Register *OMR::Z::TreeEvaluator::vmcompressbitsEvaluator(TR::Node *node, TR::CodeGenerator *cg) | ||
| { | ||
| return TR::TreeEvaluator::unImpOpEvaluator(node, cg); | ||
| return TR::TreeEvaluator::vcompressbitsEvaluator(node, cg); | ||
| } | ||
|
|
||
| /** | ||
| * \brief | ||
| * Expands the lane-wise values of the source vector based on a bit mask. | ||
| * | ||
| * \details | ||
| * Performs lanewise expansion similar to scalar Integer/Long.expand() | ||
| * The bits from the source are distributed to positions where the mask has set bits. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar to compressbits, comment should be made simpler for expand.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| * Specifically, the lowest-order bit of the source moves to the position of the lowest-order set bit in the mask, | ||
| * the next source bit moves to the position of the next set bit in the mask, and so on. | ||
| * e.g. mask= 10101010 | ||
| * source= 00001010 | ||
| * result= 10001000 | ||
| * | ||
| * vmexpandbitsEvaluator forwards to this evaluator; for a masked opcode the lane mask is read | ||
| * from the third child and lanes whose mask is false receive the unmodified source value. | ||
| * | ||
| * \param node | ||
| * The node. The first child is the source vector and the second child is the bit mask. | ||
| * | ||
| * \param cg | ||
| * The code generator. | ||
| * | ||
| * \return | ||
| * TR::Register with the expanded values. | ||
| */ | ||
| TR::Register *OMR::Z::TreeEvaluator::vexpandbitsEvaluator(TR::Node *node, TR::CodeGenerator *cg) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same feedback as compressBits for comments. |
||
| { | ||
| return TR::TreeEvaluator::unImpOpEvaluator(node, cg); | ||
| TR_ASSERT_FATAL_WITH_NODE(node, node->getDataType().getVectorLength() == TR::VectorLength128, | ||
| "Only 128-bit vectors are supported %s", node->getDataType().toString()); | ||
| const uint8_t elementSizeMask = getVectorElementSizeMask(node); | ||
| const uint32_t elementBitNum = getVectorElementSize(node) * 8; | ||
| TR::Register *resultReg = cg->allocateRegister(TR_VRF); | ||
| TR::Register *loopCountReg = cg->allocateRegister(); | ||
| TR::Register *scratchReg = cg->allocateRegister(TR_VRF); | ||
| const bool isMasked = node->getOpCode().isVectorMasked(); | ||
| TR::Register *sourceReg; | ||
| TR::Register *sourceCopyReg; | ||
| TR::RegisterDependencyConditions *dependencies = generateRegisterDependencyConditions(0, 6, cg); | ||
| if (isMasked) { | ||
| // For masked operations, keep the evaluated source untouched in sourceCopyReg (needed for the final lane select) and run the loop on a fresh copy in sourceReg, which Step 5 shifts destructively. | ||
| sourceCopyReg = cg->evaluate(node->getFirstChild()); | ||
| dependencies->addPostCondition(sourceCopyReg, TR::RealRegister::AssignAny); | ||
| sourceReg = cg->allocateRegister(TR_VRF); | ||
| generateVRRaInstruction(cg, TR::InstOpCode::VLR, node, sourceReg, sourceCopyReg); | ||
| } else { | ||
| sourceReg = cg->gprClobberEvaluate(node->getFirstChild()); | ||
| } | ||
| TR::Register *maskReg = cg->evaluate(node->getSecondChild()); | ||
|
|
||
| // Initialize the result register to zero. | ||
| generateVRIaInstruction(cg, TR::InstOpCode::VGBM, node, resultReg, 0, 0); | ||
|
|
||
| // Set up loop counter to process all bits in each vector element (one iteration per bit, elementBitNum in total). | ||
| generateRIInstruction(cg, TR::InstOpCode::LHI, node, loopCountReg, elementBitNum); | ||
| TR::LabelSymbol *controlFlowStartLabel = generateLabelSymbol(cg); | ||
| generateS390LabelInstruction(cg, TR::InstOpCode::label, node, controlFlowStartLabel); | ||
| controlFlowStartLabel->setStartInternalControlFlow(); | ||
|
|
||
| const uint32_t msbPosition = elementBitNum - 1; | ||
| // Step 1: Rotate the mask right by 1 (left by element width - 1) to move its LSB to the MSB position. Over the whole loop the mask makes one full rotation, so maskReg ends up unchanged. | ||
| generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, maskReg, maskReg, | ||
| generateS390MemoryReference(msbPosition, cg), elementSizeMask); | ||
| // Step 2: Extract that mask bit (now at the MSB) into the LSB of the scratch register, giving 0 or 1 per element. | ||
| generateVRSaInstruction(cg, TR::InstOpCode::VESRL, node, scratchReg, maskReg, | ||
| generateS390MemoryReference(msbPosition, cg), elementSizeMask); | ||
| // Step 3: Conditionally copy the least significant bit (LSB) from the source to the result if the mask bit is set. | ||
| generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, resultReg, sourceReg, resultReg, scratchReg, 0, 0); | ||
| // Step 4: Rotate the result register right by 1 bit (left by element width - 1); after the full loop each copied bit sits at the position of its mask bit. | ||
| generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, resultReg, resultReg, | ||
| generateS390MemoryReference(msbPosition, cg), elementSizeMask); | ||
| // Step 5: Advance to the next source bit position only if the current mask bit was set (logical right shift by scratchReg: 0 or 1). | ||
| generateVRRcInstruction(cg, TR::InstOpCode::VESRLV, node, sourceReg, sourceReg, scratchReg, elementSizeMask); | ||
|
|
||
| generateS390BranchInstruction(cg, TR::InstOpCode::BRCT, node, loopCountReg, controlFlowStartLabel); | ||
| dependencies->addPostCondition(sourceReg, TR::RealRegister::AssignAny); | ||
| dependencies->addPostCondition(resultReg, TR::RealRegister::AssignAny); | ||
| dependencies->addPostCondition(maskReg, TR::RealRegister::AssignAny); | ||
| dependencies->addPostCondition(scratchReg, TR::RealRegister::AssignAny); | ||
| dependencies->addPostCondition(loopCountReg, TR::RealRegister::AssignAny); | ||
| TR::LabelSymbol *controlFlowEndLabel = generateLabelSymbol(cg); | ||
| generateS390LabelInstruction(cg, TR::InstOpCode::label, node, controlFlowEndLabel, dependencies); | ||
| controlFlowEndLabel->setEndInternalControlFlow(); | ||
|
|
||
| cg->stopUsingRegister(scratchReg); | ||
| cg->stopUsingRegister(loopCountReg); | ||
|
|
||
| if (isMasked) { | ||
| TR::Node *maskChild = node->getThirdChild(); | ||
| cg->stopUsingRegister(sourceReg); | ||
| // Apply the lane mask: the result reflects the expanded operation only for lanes where the mask is true; | ||
| // for false mask lanes, the original source value (kept in sourceCopyReg) is preserved in the result register. | ||
| generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, resultReg, resultReg, sourceCopyReg, | ||
| cg->evaluate(maskChild), 0, 0); | ||
| cg->decReferenceCount(maskChild); | ||
| } | ||
|
|
||
| node->setRegister(resultReg); | ||
| cg->decReferenceCount(node->getFirstChild()); | ||
| cg->decReferenceCount(node->getSecondChild()); | ||
| return resultReg; | ||
| } | ||
|
|
||
| TR::Register *OMR::Z::TreeEvaluator::vmexpandbitsEvaluator(TR::Node *node, TR::CodeGenerator *cg) | ||
| { | ||
| return TR::TreeEvaluator::unImpOpEvaluator(node, cg); | ||
| return TR::TreeEvaluator::vexpandbitsEvaluator(node, cg); | ||
| } | ||
|
|
||
| TR::Register *OMR::Z::TreeEvaluator::f2iuEvaluator(TR::Node *node, TR::CodeGenerator *cg) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do think we can still improve the readability of the comment. Looking at the opcode properties, they clearly state that the implementation is equivalent to Java's Integer/Long.compress, so I would simplify the comment. The usage of MSB here kind of throws me; I thought it was something the opcode specifically requires. We could include an example of how it looks on one element in the lane-mask pair.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done