Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions compiler/z/codegen/OMRCodeGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4367,6 +4367,10 @@ bool OMR::Z::CodeGenerator::getSupportsOpCodeForAutoSIMD(TR::CPU *cpu, TR::ILOpC
case TR::vmreductionMin:
case TR::vcompress:
case TR::vexpand:
case TR::vcompressbits:
case TR::vmcompressbits:
case TR::vexpandbits:
case TR::vmexpandbits:
return true;
case TR::vmul:
case TR::vmmul:
Expand Down
190 changes: 186 additions & 4 deletions compiler/z/codegen/OMRTreeEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2253,24 +2253,206 @@ TR::Register *OMR::Z::TreeEvaluator::vmbyteswapEvaluator(TR::Node *node, TR::Cod
return TR::TreeEvaluator::unImpOpEvaluator(node, cg);
}

/**
* \brief
* Compresses the lane-wise values of the source vector based on a bit mask.
*
* \details

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do think we can still improve the readability of the comment. The opcode properties clearly state that the implementation is equivalent to Java's Integer/Long.compress, so I would simplify the comment. The use of MSB here kind of throws me; I thought it was something the opcode specifically requires. We could include an example of how it looks on one element of the lane-mask pair.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

* Performs lanewise compression similar to Integer/Long.compress()
* For each bit position, if the corresponding bit in the mask is set, the bit from the source
* value is retained in the compressed result; otherwise, it is discarded.
* e.g. mask= 10101010
* source= 11001100
* result= 00001010
*
* \param node
* The node.
*
* \param cg
* The code generator.
*
* \return
* TR::Register with the compressed values.
*/
TR::Register *OMR::Z::TreeEvaluator::vcompressbitsEvaluator(TR::Node *node, TR::CodeGenerator *cg)
{
    // Lane-wise bit compression (cf. Integer/Long.compress): for every element, the source bits selected
    // by the set bits of the bit mask (second child) are packed towards the least significant end of the
    // result. Also used for the masked opcode, in which case the third child is the lane mask.
    //
    // NOTE: the loop below runs once per element bit (up to 64 iterations for 64-bit elements) and issues
    // five vector instructions per iteration. A hardware bit extract/deposit based sequence, where
    // available, was suggested as a follow-up optimization.
    TR_ASSERT_FATAL_WITH_NODE(node, node->getDataType().getVectorLength() == TR::VectorLength128,
        "Only 128-bit vectors are supported %s", node->getDataType().toString());
    const uint8_t elementSizeMask = getVectorElementSizeMask(node);
    const uint32_t elementBitNum = getVectorElementSize(node) * 8;
    TR::Register *resultReg = cg->allocateRegister(TR_VRF);
    TR::Register *loopCountReg = cg->allocateRegister();
    TR::Register *scratchReg = cg->allocateRegister(TR_VRF);
    // sourceReg and maskReg are rotated in place by one bit per iteration. The loop runs elementBitNum
    // times, so both have completed a full rotation (and hold their original values) when it exits.
    TR::Register *sourceReg = cg->evaluate(node->getFirstChild());
    TR::Register *maskReg = cg->evaluate(node->getSecondChild());
    TR::RegisterDependencyConditions *dependencies = generateRegisterDependencyConditions(0, 5, cg);

    // Initialize the result register to zero.
    generateVRIaInstruction(cg, TR::InstOpCode::VGBM, node, resultReg, 0, 0);

    // Set up loop counter to process all bits in vector elements.
    generateRIInstruction(cg, TR::InstOpCode::LHI, node, loopCountReg, elementBitNum);
    TR::LabelSymbol *controlFlowStartLabel = generateLabelSymbol(cg);
    controlFlowStartLabel->setStartInternalControlFlow();

    generateS390LabelInstruction(cg, TR::InstOpCode::label, node, controlFlowStartLabel);

    const uint32_t msbPosition = elementBitNum - 1;
    // Step 1: Extract the current (most significant) mask bit by shifting it down to the LSB position,
    //         leaving 0 or 1 in every element of scratchReg.
    generateVRSaInstruction(cg, TR::InstOpCode::VESRL, node, scratchReg, maskReg,
        generateS390MemoryReference(msbPosition, cg), elementSizeMask);
    // Step 2: Rotate the source left by 1 so that its current most significant bit wraps into the LSB position.
    generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, sourceReg, sourceReg, generateS390MemoryReference(1, cg),
        elementSizeMask);
    // Step 3: Conditionally copy that source bit into the result if the mask bit is set.
    //         VSEL takes, bit by bit, sourceReg where scratchReg has a 1 and resultReg elsewhere; since
    //         scratchReg is 0 or 1 per element, only the least significant bit can come from the source.
    generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, resultReg, sourceReg, resultReg, scratchReg, 0, 0);
    // Step 4: Rotate result left by 1 if mask bit was set (scratchReg controls rotation amount: 0 or 1)
    generateVRRcInstruction(cg, TR::InstOpCode::VERLLV, node, resultReg, resultReg, scratchReg, elementSizeMask);
    // Step 5: Rotate mask left by 1 to advance to next bit position
    generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, maskReg, maskReg, generateS390MemoryReference(1, cg),
        elementSizeMask);

    generateS390BranchInstruction(cg, TR::InstOpCode::BRCT, node, loopCountReg, controlFlowStartLabel);

    // Undo the one extra rotation performed after the last collected bit: rotating left by
    // (elementBitNum - 1) is equivalent to rotating right by 1.
    generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, resultReg, resultReg,
        generateS390MemoryReference(msbPosition, cg), elementSizeMask);

    // Every register touched inside the internal control flow region is attached to the end label.
    dependencies->addPostCondition(sourceReg, TR::RealRegister::AssignAny);
    dependencies->addPostCondition(resultReg, TR::RealRegister::AssignAny);
    dependencies->addPostCondition(maskReg, TR::RealRegister::AssignAny);
    dependencies->addPostCondition(scratchReg, TR::RealRegister::AssignAny);
    dependencies->addPostCondition(loopCountReg, TR::RealRegister::AssignAny);
    TR::LabelSymbol *controlFlowEndLabel = generateLabelSymbol(cg);
    generateS390LabelInstruction(cg, TR::InstOpCode::label, node, controlFlowEndLabel, dependencies);
    controlFlowEndLabel->setEndInternalControlFlow();

    cg->stopUsingRegister(scratchReg);
    cg->stopUsingRegister(loopCountReg);

    if (node->getOpCode().isVectorMasked()) {
        TR::Node *maskChild = node->getThirdChild();
        // Apply the lane mask: the result reflects the compressed operation only for lanes where the mask is true;
        // for false mask lanes, the original source value is preserved in the result register.
        generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, resultReg, resultReg, sourceReg,
            cg->evaluate(maskChild), 0, 0);
        cg->decReferenceCount(maskChild);
    }

    node->setRegister(resultReg);
    cg->decReferenceCount(node->getFirstChild());
    cg->decReferenceCount(node->getSecondChild());
    return resultReg;
}

TR::Register *OMR::Z::TreeEvaluator::vmcompressbitsEvaluator(TR::Node *node, TR::CodeGenerator *cg)
{
    // Masked variant of vcompressbits: delegate to the shared evaluator, which contains the
    // lane-mask handling for opcodes reporting isVectorMasked().
    return TR::TreeEvaluator::vcompressbitsEvaluator(node, cg);
}

/**
* \brief
* Expands the lane-wise values of the source vector based on a bit mask.
*
* \details
* Performs lanewise expansion similar to scalar Integer/Long.expand()
* The bits from the source are distributed to positions where the mask has set bits.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to compressbits, comment should be made simpler for expand.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

* Specifically, the first bit of the source moves to the position of the first set bit in the mask,
* the second source bit moves to the position of the second set bit in the mask, and so on.
* e.g. mask= 10101010
* source= 00001010
* result= 10001000
*
* \param node
* The node.
*
* \param cg
* The code generator.
*
* \return
* TR::Register with the expanded values.
*/
TR::Register *OMR::Z::TreeEvaluator::vexpandbitsEvaluator(TR::Node *node, TR::CodeGenerator *cg)
{
    // Lane-wise bit expansion (cf. Integer/Long.expand): for every element, successive low-order source
    // bits are deposited at the positions of the set bits of the bit mask (second child). Also used for
    // the masked opcode, in which case the third child is the lane mask.
    TR_ASSERT_FATAL_WITH_NODE(node, node->getDataType().getVectorLength() == TR::VectorLength128,
        "Only 128-bit vectors are supported %s", node->getDataType().toString());
    const uint8_t elementSizeMask = getVectorElementSizeMask(node);
    const uint32_t elementBitNum = getVectorElementSize(node) * 8;
    TR::Register *resultReg = cg->allocateRegister(TR_VRF);
    TR::Register *loopCountReg = cg->allocateRegister();
    TR::Register *scratchReg = cg->allocateRegister(TR_VRF);
    const bool isMasked = node->getOpCode().isVectorMasked();
    TR::Register *sourceReg = NULL;
    // Only set (and only read) on the masked path, where it holds the untouched source value.
    TR::Register *sourceCopyReg = NULL;
    // Up to six post conditions: the five loop registers plus the preserved source on the masked path.
    TR::RegisterDependencyConditions *dependencies = generateRegisterDependencyConditions(0, 6, cg);
    if (isMasked) {
        // For masked operations, preserve the original source register by creating a copy.
        // The loop shifts sourceReg destructively, while the final lane select needs the original value.
        sourceCopyReg = cg->evaluate(node->getFirstChild());
        dependencies->addPostCondition(sourceCopyReg, TR::RealRegister::AssignAny);
        sourceReg = cg->allocateRegister(TR_VRF);
        generateVRRaInstruction(cg, TR::InstOpCode::VLR, node, sourceReg, sourceCopyReg);
    } else {
        sourceReg = cg->gprClobberEvaluate(node->getFirstChild());
    }
    // maskReg is rotated in place by one bit per iteration. The loop runs elementBitNum times, so it
    // has completed a full rotation (and holds its original value) when the loop exits.
    TR::Register *maskReg = cg->evaluate(node->getSecondChild());

    // Initialize the result register to zero.
    generateVRIaInstruction(cg, TR::InstOpCode::VGBM, node, resultReg, 0, 0);

    // Set up loop counter to process all bits in each vector element.
    generateRIInstruction(cg, TR::InstOpCode::LHI, node, loopCountReg, elementBitNum);
    TR::LabelSymbol *controlFlowStartLabel = generateLabelSymbol(cg);
    generateS390LabelInstruction(cg, TR::InstOpCode::label, node, controlFlowStartLabel);
    controlFlowStartLabel->setStartInternalControlFlow();

    const uint32_t msbPosition = elementBitNum - 1;
    // Step 1: Rotate the mask to move LSB to the MSB position
    //         (rotating left by elementBitNum - 1 is a rotate right by 1).
    generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, maskReg, maskReg,
        generateS390MemoryReference(msbPosition, cg), elementSizeMask);
    // Step 2: Extract that mask bit into the scratch register, leaving 0 or 1 in every element.
    generateVRSaInstruction(cg, TR::InstOpCode::VESRL, node, scratchReg, maskReg,
        generateS390MemoryReference(msbPosition, cg), elementSizeMask);
    // Step 3: Conditionally copy the least significant bit (LSB) from the source to the result if the mask bit is set.
    generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, resultReg, sourceReg, resultReg, scratchReg, 0, 0);
    // Step 4: Rotate the result register right by 1 bit to position the extracted bit correctly.
    generateVRSaInstruction(cg, TR::InstOpCode::VERLL, node, resultReg, resultReg,
        generateS390MemoryReference(msbPosition, cg), elementSizeMask);
    // Step 5: Advance to the next source bit position only if the current mask bit was set
    //         (the shift amount is the 0 or 1 held in scratchReg).
    generateVRRcInstruction(cg, TR::InstOpCode::VESRLV, node, sourceReg, sourceReg, scratchReg, elementSizeMask);

    generateS390BranchInstruction(cg, TR::InstOpCode::BRCT, node, loopCountReg, controlFlowStartLabel);
    // Every register touched inside the internal control flow region is attached to the end label.
    dependencies->addPostCondition(sourceReg, TR::RealRegister::AssignAny);
    dependencies->addPostCondition(resultReg, TR::RealRegister::AssignAny);
    dependencies->addPostCondition(maskReg, TR::RealRegister::AssignAny);
    dependencies->addPostCondition(scratchReg, TR::RealRegister::AssignAny);
    dependencies->addPostCondition(loopCountReg, TR::RealRegister::AssignAny);
    TR::LabelSymbol *controlFlowEndLabel = generateLabelSymbol(cg);
    generateS390LabelInstruction(cg, TR::InstOpCode::label, node, controlFlowEndLabel, dependencies);
    controlFlowEndLabel->setEndInternalControlFlow();

    cg->stopUsingRegister(scratchReg);
    cg->stopUsingRegister(loopCountReg);

    if (isMasked) {
        TR::Node *maskChild = node->getThirdChild();
        // The working copy of the source is no longer needed once the loop is done.
        cg->stopUsingRegister(sourceReg);
        // Apply the lane mask: the result reflects the expanded operation only for lanes where the mask is true;
        // for false mask lanes, the original source value is preserved in the result register.
        generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, resultReg, resultReg, sourceCopyReg,
            cg->evaluate(maskChild), 0, 0);
        cg->decReferenceCount(maskChild);
    }

    node->setRegister(resultReg);
    cg->decReferenceCount(node->getFirstChild());
    cg->decReferenceCount(node->getSecondChild());
    return resultReg;
}

TR::Register *OMR::Z::TreeEvaluator::vmexpandbitsEvaluator(TR::Node *node, TR::CodeGenerator *cg)
{
    // Masked variant of vexpandbits: delegate to the shared evaluator, which contains the
    // lane-mask handling for opcodes reporting isVectorMasked().
    return TR::TreeEvaluator::vexpandbitsEvaluator(node, cg);
}

TR::Register *OMR::Z::TreeEvaluator::f2iuEvaluator(TR::Node *node, TR::CodeGenerator *cg)
Expand Down