diff --git a/compiler/aarch64/codegen/ARM64Debug.cpp b/compiler/aarch64/codegen/ARM64Debug.cpp
index 528ef6130ff..2eb7d8d4756 100644
--- a/compiler/aarch64/codegen/ARM64Debug.cpp
+++ b/compiler/aarch64/codegen/ARM64Debug.cpp
@@ -921,6 +921,10 @@ static const char *opCodeToNameMap[] =
    "vuzp2_8h",
    "vuzp2_4s",
    "vuzp2_2d",
+   "vtrn1_8b",
+   "vtrn1_16b",
+   "vtrn2_8b",
+   "vtrn2_16b",
    "vext16b",
    "vneg16b",
    "vneg8h",
diff --git a/compiler/aarch64/codegen/OMRInstOpCode.enum b/compiler/aarch64/codegen/OMRInstOpCode.enum
index 310a27f746f..d180f4fb44f 100644
--- a/compiler/aarch64/codegen/OMRInstOpCode.enum
+++ b/compiler/aarch64/codegen/OMRInstOpCode.enum
@@ -906,6 +906,10 @@
    vuzp2_8h,  /* 0x4E405800 UZP2 */
    vuzp2_4s,  /* 0x4E805800 UZP2 */
    vuzp2_2d,  /* 0x4EC05800 UZP2 */
+   vtrn1_8b,  /* 0x0E002800 TRN1 */
+   vtrn1_16b, /* 0x4E002800 TRN1 */
+   vtrn2_8b,  /* 0x0E006800 TRN2 */
+   vtrn2_16b, /* 0x4E006800 TRN2 */
 /* Vector extract */
    vext16b,   /* 0x6E000000 EXT */
 /* Vector Data-processing (1 source) */
diff --git a/compiler/aarch64/codegen/OpBinary.cpp b/compiler/aarch64/codegen/OpBinary.cpp
index 7580f981155..f21849b604b 100644
--- a/compiler/aarch64/codegen/OpBinary.cpp
+++ b/compiler/aarch64/codegen/OpBinary.cpp
@@ -907,6 +907,10 @@ const OMR::ARM64::InstOpCode::OpCodeBinaryEntry OMR::ARM64::InstOpCode::binaryEn
    0x4E405800, /* UZP2 vuzp2_8h */
    0x4E805800, /* UZP2 vuzp2_4s */
    0x4EC05800, /* UZP2 vuzp2_2d */
+   0x0E002800, /* TRN1 vtrn1_8b */
+   0x4E002800, /* TRN1 vtrn1_16b */
+   0x0E006800, /* TRN2 vtrn2_8b */
+   0x4E006800, /* TRN2 vtrn2_16b */
 /* Vector extract */
    0x6E000000, /* EXT vext16b */
 /* Vector Data-processing (1 source) */
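The four TRN entries added to the tables above differ only in bit 30 (the Q bit, which selects the 64-bit 8B or the 128-bit 16B form) and bit 14 (which selects TRN1 versus TRN2). A minimal compile-time check of that relationship, using the constants exactly as listed (the names below are illustrative, not from the codebase):

```cpp
#include <cstdint>

// Base opcodes as listed in OpBinary.cpp above.
constexpr uint32_t vtrn1_8b  = 0x0E002800;
constexpr uint32_t vtrn1_16b = 0x4E002800;
constexpr uint32_t vtrn2_8b  = 0x0E006800;
constexpr uint32_t vtrn2_16b = 0x4E006800;

constexpr uint32_t Q_BIT  = 1u << 30; // 128-bit (16B) vs 64-bit (8B) form
constexpr uint32_t OP_BIT = 1u << 14; // TRN2 vs TRN1

static_assert(vtrn1_16b == (vtrn1_8b | Q_BIT), "Q bit selects the 16B form");
static_assert(vtrn2_8b  == (vtrn1_8b | OP_BIT), "op bit selects TRN2");
static_assert(vtrn2_16b == (vtrn1_8b | Q_BIT | OP_BIT), "both bits together");
```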
diff --git a/compiler/codegen/OMRCodeGenPhase.cpp b/compiler/codegen/OMRCodeGenPhase.cpp
index 8bf2997ba2d..fefac29de17 100644
--- a/compiler/codegen/OMRCodeGenPhase.cpp
+++ b/compiler/codegen/OMRCodeGenPhase.cpp
@@ -251,7 +251,20 @@ OMR::CodeGenPhase::performEmitSnippetsPhase(TR::CodeGenerator * cg, TR::CodeGenP
    TR::LexicalMemProfiler mp("Emit Snippets", comp->phaseMemProfiler());
    LexicalTimer pt("Emit Snippets", comp->phaseTimer());
 
-   cg->emitSnippets();
+   if (cg->getLastWarmInstruction() &&
+       comp->getOption(TR_MoveSnippetsToWarmCode))
+      {
+      // Snippets will follow the warm blocks
+      uint8_t * oldCursor = cg->getBinaryBufferCursor();
+      cg->setBinaryBufferCursor(cg->getWarmCodeEnd());
+      cg->emitSnippets();
+      cg->setWarmCodeEnd(cg->getBinaryBufferCursor());
+      cg->setBinaryBufferCursor(oldCursor);
+      }
+   else
+      {
+      cg->emitSnippets();
+      }
 
    if (comp->getOption(TR_EnableOSR))
       {
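The new branch swaps the buffer cursor into the warm region before emitting snippets and restores it afterwards, so the snippets land directly after the last warm instruction while cold emission resumes where it left off. A host-side sketch of the same bookkeeping (the CodeRegions struct is illustrative, not the real CodeGenerator API):

```cpp
#include <cstdint>

// Toy stand-in for the two code regions the code generator tracks; the real
// accessors are getBinaryBufferCursor()/getWarmCodeEnd() used above.
struct CodeRegions
   {
   uint8_t *binaryBufferCursor; // current emission point (cold code)
   uint8_t *warmCodeEnd;        // first free byte after the warm code
   };

// Mirrors the control flow added to performEmitSnippetsPhase: emit the
// snippets at the end of the warm region, grow warmCodeEnd past them, then
// resume cold-code emission exactly where it left off.
void emitSnippetsAfterWarm(CodeRegions &cg, uint8_t *(*emitSnippets)(uint8_t *cursor))
   {
   uint8_t *oldCursor = cg.binaryBufferCursor;
   cg.binaryBufferCursor = cg.warmCodeEnd;
   cg.binaryBufferCursor = emitSnippets(cg.binaryBufferCursor);
   cg.warmCodeEnd = cg.binaryBufferCursor;
   cg.binaryBufferCursor = oldCursor;
   }
```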
diff --git a/compiler/codegen/OMRCodeGenerator.cpp b/compiler/codegen/OMRCodeGenerator.cpp
index 0304aee166c..ad1addfc7cc 100644
--- a/compiler/codegen/OMRCodeGenerator.cpp
+++ b/compiler/codegen/OMRCodeGenerator.cpp
@@ -363,7 +363,79 @@ OMR::CodeGenerator::generateCodeFromIL()
    return false;
    }
 
-void OMR::CodeGenerator::findLastWarmBlock()
+void
+OMR::CodeGenerator::insertGotoIntoLastBlock(TR::Block *lastBlock)
+   {
+   // If the last tree in the last block is not a TR_goto, insert a goto tree
+   // at the end of the block.
+   // If there is a following block, the goto will branch to it so that when the
+   // code is split any fall-through will go to the right place.
+   // If there is no following block, the goto will branch to the first block; in
+   // this case the goto should never be reached. It is there only to make sure
+   // that the instruction following the last real treetop will be in the
+   // method's code, so if it is a helper call (e.g. for a throw) the return
+   // address is in this method's code.
+   //
+   TR::Compilation *comp = self()->comp();
+   TR::TreeTop * tt;
+   TR::Node * node;
+
+   if (lastBlock->getNumberOfRealTreeTops() == 0)
+      tt = lastBlock->getEntry();
+   else
+      tt = lastBlock->getLastRealTreeTop();
+
+   node = tt->getNode();
+
+   if (!(node->getOpCode().isGoto() ||
+         node->getOpCode().isJumpWithMultipleTargets() ||
+         node->getOpCode().isReturn()))
+      {
+      if (comp->getOption(TR_TraceCG))
+         {
+         traceMsg(comp, "%s Inserting goto at the end of block_%d\n", SPLIT_WARM_COLD_STRING, lastBlock->getNumber());
+         }
+
+      // Find the block to be branched to
+      //
+      TR::TreeTop * targetTreeTop = lastBlock->getExit()->getNextTreeTop();
+
+      if (targetTreeTop)
+         // Branch to the following block. Make sure it is not marked as an
+         // extension block so that it will get a label generated.
+         //
+         targetTreeTop->getNode()->getBlock()->setIsExtensionOfPreviousBlock(false);
+      else
+         // Branch to the first block. This will not be marked as an extension
+         // block.
+         //
+         targetTreeTop = comp->getStartBlock()->getEntry();
+
+      // Generate the goto and insert it at the end of the block.
+      //
+      TR::TreeTop *gotoTreeTop = TR::TreeTop::create(comp, TR::Node::create(node, TR::Goto, 0, targetTreeTop));
+
+      // Move the reg deps from the BBEnd to the goto
+      //
+      TR::Node *bbEnd = lastBlock->getExit()->getNode();
+
+      if (bbEnd->getNumChildren() > 0)
+         {
+         TR::Node *glRegDeps = bbEnd->getChild(0);
+
+         gotoTreeTop->getNode()->setNumChildren(1);
+         gotoTreeTop->getNode()->setChild(0, glRegDeps);
+
+         bbEnd->setChild(0, NULL);
+         bbEnd->setNumChildren(0);
+         }
+
+      tt->insertAfter(gotoTreeTop);
+      }
+   }
+
+void OMR::CodeGenerator::prepareLastWarmBlockForCodeSplitting()
    {
    TR::Compilation *comp = self()->comp();
    TR::TreeTop * tt;
@@ -457,62 +529,18 @@ void OMR::CodeGenerator::findLastWarmBlock()
             (numColdBlocks - numNonOutlinedColdBlocks)*100/numColdBlocks);
       }
 
-   // If the last tree in the last warm block is not a TR_goto, insert a goto tree
-   // at the end of the block.
-   // If there is a following block the goto will branch to it so that when the
-   // code is split any fall-through will go to the right place.
-   // If there is no following block the goto will branch to the first block; in
-   // this case the goto should never be reached, it is there only to
-   // make sure that the instruction following the last real treetop will be in
-   // warm code, so if it is a helper call (e.g. for a throw) the return address
-   // is in this method's code.
-   //
-   if (lastWarmBlock->getNumberOfRealTreeTops() == 0)
-      tt = lastWarmBlock->getEntry();
-   else
-      tt = lastWarmBlock->getLastRealTreeTop();
-   node = tt->getNode();
+   insertGotoIntoLastBlock(lastWarmBlock);
+   TR::Block *lastBlock = comp->findLastTree()->getNode()->getBlock();
 
-   if (!(node->getOpCode().isGoto() ||
-         node->getOpCode().isJumpWithMultipleTargets() ||
-         node->getOpCode().isReturn()))
+   // If disclaiming is enabled, it may happen that nothing follows the mainline
+   // code (no snippets or OOL code). In that case we need to insert a goto at
+   // the end, for the reasons described in insertGotoIntoLastBlock().
+   //
+   if (TR::Options::getCmdLineOptions()->getOption(TR_EnableCodeCacheDisclaiming) &&
+       lastBlock != lastWarmBlock)
       {
-      // Find the block to be branched to
-      //
-      TR::TreeTop * targetTreeTop = lastWarmBlock->getExit()->getNextTreeTop();
-
-      if (targetTreeTop)
-         // Branch to following block. Make sure it is not marked as an
-         // extension block so that it will get a label generated.
-         //
-         targetTreeTop->getNode()->getBlock()->setIsExtensionOfPreviousBlock(false);
-      else
-         // Branch to the first block. This will not be marked as an extension
-         // block.
-         //
-         targetTreeTop = comp->getStartBlock()->getEntry();
-
-      // Generate the goto and insert it into the end of the last warm block.
-      //
-      TR::TreeTop *gotoTreeTop = TR::TreeTop::create(comp, TR::Node::create(node, TR::Goto, 0, targetTreeTop));
-
-      // Move reg deps from BBEnd to goto
-      //
-      TR::Node *bbEnd = lastWarmBlock->getExit()->getNode();
-
-      if (bbEnd->getNumChildren() > 0)
-         {
-         TR::Node *glRegDeps = bbEnd->getChild(0);
-
-         gotoTreeTop->getNode()->setNumChildren(1);
-         gotoTreeTop->getNode()->setChild(0, glRegDeps);
-
-         bbEnd->setChild(0,NULL);
-         bbEnd->setNumChildren(0);
-         }
-
-      tt->insertAfter(gotoTreeTop);
+      insertGotoIntoLastBlock(lastBlock);
       }
    }
 
@@ -570,7 +598,7 @@ void OMR::CodeGenerator::postLowerTrees()
    if (comp()->getOption(TR_SplitWarmAndColdBlocks) &&
       !comp()->compileRelocatableCode())
       {
-      self()->findLastWarmBlock();
+      self()->prepareLastWarmBlockForCodeSplitting();
      }
    }
diff --git a/compiler/codegen/OMRCodeGenerator.hpp b/compiler/codegen/OMRCodeGenerator.hpp
index 5c3fb60f6c5..35129b51af8 100644
--- a/compiler/codegen/OMRCodeGenerator.hpp
+++ b/compiler/codegen/OMRCodeGenerator.hpp
@@ -344,7 +344,16 @@ class OMR_EXTENSIBLE CodeGenerator
 
    void lowerTreesPropagateBlockToNode(TR::Node *node);
 
-   void findLastWarmBlock();
+   /**
+    * @brief Inserts a goto into the last block if necessary
+    */
+   void insertGotoIntoLastBlock(TR::Block *lastBlock);
+
+   /**
+    * @brief Finds the last warm block and inserts the gotos needed for
+    *        splitting the code into warm and cold sections
+    */
+   void prepareLastWarmBlockForCodeSplitting();
 
    void setUpForInstructionSelection();
    void doInstructionSelection();
diff --git a/compiler/compile/OMRSymbolReferenceTable.cpp b/compiler/compile/OMRSymbolReferenceTable.cpp
index 7792db843b1..050aeefaf3e 100644
--- a/compiler/compile/OMRSymbolReferenceTable.cpp
+++ b/compiler/compile/OMRSymbolReferenceTable.cpp
@@ -1528,7 +1528,7 @@ OMR::SymbolReferenceTable::findOrCreateMethodSymbol(
    if (!resolvedMethod)
       symRef->setUnresolved();
    else if (callKind == TR::MethodSymbol::Virtual && cpIndex != -1)
-      symRef->setOffset(resolvedMethod->virtualCallSelector(cpIndex));
+      symRef->setOffset(resolvedMethod->virtualCallSelector());
 
    aliasBuilder.methodSymRefs().set(symRef->getReferenceNumber());
 
diff --git a/compiler/compile/ResolvedMethod.cpp b/compiler/compile/ResolvedMethod.cpp
index 13827f20094..3c7280e6aa4 100644
--- a/compiler/compile/ResolvedMethod.cpp
+++ b/compiler/compile/ResolvedMethod.cpp
@@ -391,11 +391,11 @@ char * TR_ResolvedMethod::fieldNameChars(int32_t, int32_t &) { TR_
 char * TR_ResolvedMethod::fieldSignatureChars(int32_t, int32_t &) { TR_UNIMPLEMENTED(); return 0; }
 char * TR_ResolvedMethod::staticSignatureChars(int32_t, int32_t &) { TR_UNIMPLEMENTED(); return 0; }
 void * & TR_ResolvedMethod::addressOfClassOfMethod() { TR_UNIMPLEMENTED(); throw std::exception(); }
-uint32_t TR_ResolvedMethod::vTableSlot(uint32_t) { TR_UNIMPLEMENTED(); return 0; }
+uint32_t TR_ResolvedMethod::vTableSlot() { TR_UNIMPLEMENTED(); return 0; }
 bool TR_ResolvedMethod::virtualMethodIsOverridden() { TR_UNIMPLEMENTED(); return false; }
 void TR_ResolvedMethod::setVirtualMethodIsOverridden() { TR_UNIMPLEMENTED(); }
 void * TR_ResolvedMethod::addressContainingIsOverriddenBit() { TR_UNIMPLEMENTED(); return 0; }
-int32_t TR_ResolvedMethod::virtualCallSelector(uint32_t) { TR_UNIMPLEMENTED(); return 0; }
+int32_t TR_ResolvedMethod::virtualCallSelector() { TR_UNIMPLEMENTED(); return 0; }
 uint32_t TR_ResolvedMethod::numberOfExceptionHandlers() { TR_UNIMPLEMENTED(); return 0; }
 uint8_t * TR_ResolvedMethod::allocateException(uint32_t,TR::Compilation*){ TR_UNIMPLEMENTED(); return 0; }
diff --git a/compiler/compile/ResolvedMethod.hpp b/compiler/compile/ResolvedMethod.hpp
index c2d4cf6f9e6..4c96024aebb 100644
--- a/compiler/compile/ResolvedMethod.hpp
+++ b/compiler/compile/ResolvedMethod.hpp
@@ -218,7 +218,7 @@ class TR_ResolvedMethod
    virtual uint32_t classCPIndexOfMethod(uint32_t);
    virtual void * & addressOfClassOfMethod();
 
-   virtual uint32_t vTableSlot(uint32_t);
+   virtual uint32_t vTableSlot();
 
    virtual TR_OpaqueClassBlock *getResolvedInterfaceMethod(int32_t cpIndex, uintptr_t * pITableIndex);
 
@@ -236,7 +236,7 @@ class TR_ResolvedMethod
    virtual bool virtualMethodIsOverridden();
    virtual void setVirtualMethodIsOverridden();
    virtual void *addressContainingIsOverriddenBit();
-   virtual int32_t virtualCallSelector(uint32_t cpIndex);
+   virtual int32_t virtualCallSelector();
 
    virtual int32_t exceptionData(int32_t exceptionNumber, int32_t * startIndex, int32_t * endIndex, int32_t * catchType);
    virtual uint32_t numberOfExceptionHandlers();
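Since vTableSlot() and virtualCallSelector() lose their (apparently unused) cpIndex parameter, every overrider has to drop it as well. A hypothetical overrider, before and after (the class and fields are illustrative, not part of this change):

```cpp
#include <cstdint>

// Hypothetical downstream overrider shape, for illustration only: the unused
// cpIndex parameter disappears from both virtual functions.
struct MyResolvedMethod // : public TR_ResolvedMethod
   {
   // before: virtual uint32_t vTableSlot(uint32_t cpIndex);
   virtual uint32_t vTableSlot() { return _vTableSlot; }

   // before: virtual int32_t virtualCallSelector(uint32_t cpIndex);
   virtual int32_t virtualCallSelector() { return _selector; }

   virtual ~MyResolvedMethod() {}

   uint32_t _vTableSlot = 0;
   int32_t _selector = 0;
   };
```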
diff --git a/compiler/control/OMROptions.cpp b/compiler/control/OMROptions.cpp
index d5d4b719ee6..f44e0b48de1 100644
--- a/compiler/control/OMROptions.cpp
+++ b/compiler/control/OMROptions.cpp
@@ -689,6 +689,7 @@ TR::OptionTable OMR::Options::_jitOptions[] = {
    {"enableClassChainValidationCaching", "M\tenable class chain validation caching", SET_OPTION_BIT(TR_EnableClassChainValidationCaching), "F", NOT_IN_SUBSET},
    {"enableCodeCacheConsolidation", "M\tenable code cache consolidation", SET_OPTION_BIT(TR_EnableCodeCacheConsolidation), "F", NOT_IN_SUBSET},
    {"enableCodeCacheDisclaiming", "M\tenable memory disclaiming for code cache (linux specific).", SET_OPTION_BIT(TR_EnableCodeCacheDisclaiming),"F", NOT_IN_SUBSET},
+   {"enableCodeCacheDisclaimingSupport", "M\tenable all experimental options that help code cache disclaiming.", SET_OPTION_BIT(TR_EnableCodeCacheDisclaimingSupport),"F", NOT_IN_SUBSET},
    {"enableColdCheapTacticalGRA", "O\tenable cold cheap tactical GRA", SET_OPTION_BIT(TR_EnableColdCheapTacticalGRA), "F"},
    {"enableCompilationBeforeCheckpoint", "C\tenable compilation before checkpoint", RESET_OPTION_BIT(TR_DisableCompilationBeforeCheckpoint), "F", NOT_IN_SUBSET},
    {"enableCompilationSpreading", "C\tenable adding spreading invocations to methods before compiling", SET_OPTION_BIT(TR_EnableCompilationSpreading), "F", NOT_IN_SUBSET},
@@ -1004,7 +1005,9 @@ TR::OptionTable OMR::Options::_jitOptions[] = {
    {"minSleepTimeMsForCompThrottling=", "M\tLower bound for sleep time during compilation throttling (ms)",
        TR::Options::setStaticNumeric, (intptr_t)&OMR::Options::_minSleepTimeMsForCompThrottling, 0, "F%d", NOT_IN_SUBSET },
    {"moveOOLInstructionsToWarmCode", "M\tmove out-of-line instructions to after last warm instruction", SET_OPTION_BIT(TR_MoveOOLInstructionsToWarmCode), "F"},
+   {"moveSnippetsToWarmCode", "M\tmove snippets to after last warm instruction", SET_OPTION_BIT(TR_MoveSnippetsToWarmCode), "F"},
    {"noAotSecondRunDetection", "M\tdo not do second run detection for AOT", SET_OPTION_BIT(TR_NoAotSecondRunDetection), "F", NOT_IN_SUBSET },
+
 #ifdef DEBUG
    {"noExceptions", "C\tfail compilation for methods with exceptions", TR::Options::setDebug, (intptr_t)"noExceptions"},
@@ -2454,6 +2457,15 @@ OMR::Options::jitLatePostProcess(TR::OptionSet *optionSet, void * jitConfig)
       self()->setOption(TR_ReservingLocks, false);
       }
 
+   if (self()->getOption(TR_EnableCodeCacheDisclaimingSupport))
+      {
+      self()->setOption(TR_SplitWarmAndColdBlocks);
+      self()->setOption(TR_DisclaimMemoryOnSwap);
+      self()->setOption(TR_InstallAOTToColdCode);
+      self()->setOption(TR_MoveOOLInstructionsToWarmCode);
+      self()->setOption(TR_MoveSnippetsToWarmCode);
+      }
+
    return true;
    }
diff --git a/compiler/control/OMROptions.hpp b/compiler/control/OMROptions.hpp
index 8a99bf4398f..314a6ec0a5c 100644
--- a/compiler/control/OMROptions.hpp
+++ b/compiler/control/OMROptions.hpp
@@ -382,7 +382,7 @@ enum TR_CompilationOptions
    TR_DisableInliningUnrecognizedIntrinsics = 0x10000000 + 9,
    TR_EnableVectorAPIExpansion            = 0x20000000 + 9,
    TR_MoveOOLInstructionsToWarmCode       = 0x40000000 + 9,
-   // Available                           = 0x80000000 + 9,
+   TR_MoveSnippetsToWarmCode              = 0x80000000 + 9,
 
    // Option word 10
    //
@@ -392,7 +392,7 @@ enum TR_CompilationOptions
    TR_FirstLevelProfiling                 = 0x00000100 + 10,
    TR_EnableCodeCacheDisclaiming          = 0x00000200 + 10,
    // Available                           = 0x00000400 + 10,
-   // Available                           = 0x00000800 + 10,
+   TR_EnableCodeCacheDisclaimingSupport   = 0x00000800 + 10,
    // Available                           = 0x00001000 + 10,
    TR_DisableNewMethodOverride            = 0x00002000 + 10,
    // Available                           = 0x00004000 + 10,
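jitLatePostProcess above makes the new umbrella flag imply five existing options; on the command line the grouping corresponds to the new enableCodeCacheDisclaimingSupport option string, with moveSnippetsToWarmCode also available on its own. A standalone toy model of the implication logic (the Opt enum is a stand-in for the real TR_CompilationOptions bits, which additionally encode an option-word index):

```cpp
#include <cassert>
#include <cstdint>

// Toy stand-ins for the TR_CompilationOptions bits involved.
enum Opt : uint32_t
   {
   SplitWarmAndColdBlocks            = 1u << 0,
   DisclaimMemoryOnSwap              = 1u << 1,
   InstallAOTToColdCode              = 1u << 2,
   MoveOOLInstructionsToWarmCode     = 1u << 3,
   MoveSnippetsToWarmCode            = 1u << 4,
   EnableCodeCacheDisclaimingSupport = 1u << 5
   };

// Mirrors the new block in jitLatePostProcess: the umbrella bit turns on
// all five constituent options.
uint32_t latePostProcess(uint32_t opts)
   {
   if (opts & EnableCodeCacheDisclaimingSupport)
      opts |= SplitWarmAndColdBlocks | DisclaimMemoryOnSwap | InstallAOTToColdCode
            | MoveOOLInstructionsToWarmCode | MoveSnippetsToWarmCode;
   return opts;
   }

int main()
   {
   uint32_t opts = latePostProcess(EnableCodeCacheDisclaimingSupport);
   assert(opts & MoveSnippetsToWarmCode); // likewise for the other four bits
   return 0;
   }
```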
diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp
index 6c0ebccdd3e..84e180da920 100644
--- a/compiler/p/codegen/OMRTreeEvaluator.cpp
+++ b/compiler/p/codegen/OMRTreeEvaluator.cpp
@@ -5902,114 +5902,367 @@ OMR::Power::TreeEvaluator::generateHelperBranchAndLinkInstruction(
 TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR::CodeGenerator *cg)
    {
    TR::Compilation *comp = cg->comp();
-   TR::Node *dstAddrNode, *lengthNode, *valueNode;
-   dstAddrNode = node->getChild(0);
-   lengthNode = node->getChild(1);
-   valueNode = node->getChild(2);
+   TR::Node *dstBaseAddrNode, *dstOffsetNode, *dstAddrNode, *lengthNode, *valueNode;
 
-   TR::Register *dstAddrReg, *lengthReg, *valueReg;
-   bool stopUsingCopyReg1, stopUsingCopyReg2 = false, stopUsingCopyReg3 = false;
+   bool arrayCheckNeeded;
 
-   stopUsingCopyReg1 = TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg);
+   // The IL tree shape depends on whether it has been determined that a runtime array check is needed:
+   // if the node has four children (i.e. the object base address and offset are separate), an array check is needed;
+   // if the node has three children (i.e. the base address and offset have already been added together), it is not.
+   if (node->getNumChildren() == 4)
+      {
+      arrayCheckNeeded = true;
+
+      dstBaseAddrNode = node->getChild(0);
+      dstOffsetNode = node->getChild(1);
+      dstAddrNode = NULL;
+      lengthNode = node->getChild(2);
+      valueNode = node->getChild(3);
+      }
+   else // i.e. node->getNumChildren() == 3
+      {
+      arrayCheckNeeded = false;
+
+      dstBaseAddrNode = NULL;
+      dstOffsetNode = NULL;
+      dstAddrNode = node->getChild(0);
+      lengthNode = node->getChild(1);
+      valueNode = node->getChild(2);
+      }
+
+   TR::Register *dstBaseAddrReg, *dstOffsetReg, *dstAddrReg, *lengthReg, *valueReg;
+
+   // If the offset is a constant that fits in a signed 16-bit immediate, we don't need a separate register for it
+   bool useOffsetAsImmVal = dstOffsetNode && dstOffsetNode->getOpCode().isLoadConst() &&
+      (dstOffsetNode->getConstValue() >= LOWER_IMMED) && (dstOffsetNode->getConstValue() <= UPPER_IMMED);
+
+   bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false;
+   bool stopUsingCopyRegAddr = dstAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg) : false;
+
+   bool stopUsingCopyRegOffset = false, stopUsingCopyRegLen = false, stopUsingCopyRegVal = false;
+
+   // dstOffsetNode (type: long)
+   // Only allocate a register for dstOffset if we are using it for the array check AND it isn't a constant
+   if (dstOffsetNode && !useOffsetAsImmVal)
+      {
+      if (!cg->canClobberNodesRegister(dstOffsetNode)) // only copy dstOffset into another register if the current one isn't clobberable
+         {
+         if (cg->comp()->target().is32Bit()) // on 32-bit systems, need to grab the lower 32 bits of the offset from the register pair
+            {
+            dstOffsetReg = cg->evaluate(dstOffsetNode);
+            TR::Register *offsetCopyReg = cg->allocateRegister();
+            generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, dstOffsetNode, offsetCopyReg, dstOffsetReg->getLowOrder());
+
+            dstOffsetReg = offsetCopyReg;
+            stopUsingCopyRegOffset = true;
+            }
+         else
+            {
+            stopUsingCopyRegOffset = TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg);
+            }
+         }
+      else
+         {
+         dstOffsetReg = cg->evaluate(dstOffsetNode);
+
+         if (cg->comp()->target().is32Bit()) // on 32-bit systems, need to grab the lower 32 bits of the offset from the register pair
+            dstOffsetReg = dstOffsetReg->getLowOrder();
+
+         stopUsingCopyRegOffset = false;
+         }
+      }
+
+   // lengthNode (type: long)
    lengthReg = cg->evaluate(lengthNode);
    if (!cg->canClobberNodesRegister(lengthNode))
       {
-      TR::Register *lenCopyReg = cg->allocateRegister();
-      generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg);
+      TR::Register *lenCopyReg = cg->allocateRegister();
+
+      if (cg->comp()->target().is32Bit()) // on 32-bit systems, need to grab the lower 32 bits of the length from the register pair
+         generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg->getLowOrder());
+      else // on 64-bit systems we can just do a normal copy
+         generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg);
+
       lengthReg = lenCopyReg;
-      stopUsingCopyReg2 = true;
+      stopUsingCopyRegLen = true;
+      }
+   else
+      {
+      if (cg->comp()->target().is32Bit()) // on 32-bit systems, need to grab the lower 32 bits of the length from the register pair
+         lengthReg = lengthReg->getLowOrder();
+
+      stopUsingCopyRegLen = false;
       }
 
+   // valueNode (type: byte)
    valueReg = cg->evaluate(valueNode);
-   if (!cg->canClobberNodesRegister(valueNode))
+   if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8))
       {
-      TR::Register *valCopyReg = cg->allocateRegister();
+      // On P8 and higher we can use vector instructions to cut down on loop iterations
+      // and residual tests, so copy valueReg into a VSX register
+      TR::Register *valVectorReg = cg->allocateRegister(TR_VRF);
+      generateTrg1Src1Instruction(cg, TR::InstOpCode::mtvsrd, valueNode, valVectorReg, valueReg);
+
+      valueReg = valVectorReg;
+      stopUsingCopyRegVal = true;
+      }
+   else if (!cg->canClobberNodesRegister(valueNode))
+      {
+      TR::Register *valCopyReg = cg->allocateRegister();
       generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, valueNode, valCopyReg, valueReg);
+
       valueReg = valCopyReg;
-      stopUsingCopyReg3 = true;
+      stopUsingCopyRegVal = true;
       }
 
    TR::LabelSymbol * residualLabel = generateLabelSymbol(cg);
    TR::LabelSymbol * loopStartLabel = generateLabelSymbol(cg);
    TR::LabelSymbol * doneLabel = generateLabelSymbol(cg);
-   TR::LabelSymbol * label8aligned = generateLabelSymbol(cg);
-   TR::LabelSymbol * label4aligned = generateLabelSymbol(cg);
-   TR::LabelSymbol * label2aligned = generateLabelSymbol(cg);
-   TR::LabelSymbol * label1aligned = generateLabelSymbol(cg);
+
+   // These labels are not needed for the vector approach to storing the residual bytes (i.e. P10+)
+   TR::LabelSymbol *label8aligned, *label4aligned, *label2aligned, *label1aligned;
+
+   if (!cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P10))
+      {
+      label8aligned = generateLabelSymbol(cg);
+      label4aligned = generateLabelSymbol(cg);
+      label2aligned = generateLabelSymbol(cg);
+      label1aligned = generateLabelSymbol(cg);
+      }
 
    TR::RegisterDependencyConditions *conditions;
-   int32_t numDeps = 5;
+   int32_t numDeps = 6;
+
+   // We need an extra register for the offset only if it isn't already included
+   // in the destination address AND it isn't a constant
+   if (arrayCheckNeeded && !useOffsetAsImmVal)
+      numDeps++;
+
    conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory());
    TR::Register *cndReg = cg->allocateRegister(TR_CCR);
    TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg);
-   TR::addDependency(conditions, dstAddrReg, TR::RealRegister::NoReg, TR_GPR, cg);
+
+   if (arrayCheckNeeded)
+      {
+      // dstBaseAddrReg holds the address of the object being written to, so exclude GPR0
+      TR::addDependency(conditions, dstBaseAddrReg, TR::RealRegister::NoReg, TR_GPR, cg);
+      conditions->getPostConditions()->getRegisterDependency(conditions->getAddCursorForPost() - 1)->setExcludeGPR0();
+
+      if (!useOffsetAsImmVal)
+         TR::addDependency(conditions, dstOffsetReg, TR::RealRegister::NoReg, TR_GPR, cg);
+      }
+   else
+      {
+      // dstAddrReg holds the address of the object being written to, so exclude GPR0
+      TR::addDependency(conditions, dstAddrReg, TR::RealRegister::NoReg, TR_GPR, cg);
+      conditions->getPostConditions()->getRegisterDependency(1)->setExcludeGPR0();
+      }
+
    TR::addDependency(conditions, lengthReg, TR::RealRegister::NoReg, TR_GPR, cg);
    TR::addDependency(conditions, valueReg, TR::RealRegister::NoReg, TR_GPR, cg);
-   TR::Register * tempReg = cg->allocateRegister();
-   TR::addDependency(conditions, tempReg, TR::RealRegister::NoReg, TR_GPR, cg);
+
+   // temp1Reg will later be used to hold the J9Class flags for the object at dst, so exclude GPR0
+   TR::Register * temp1Reg = cg->allocateRegister();
+   TR::addDependency(conditions, temp1Reg, TR::RealRegister::NoReg, TR_GPR, cg);
+   conditions->getPostConditions()->getRegisterDependency(conditions->getAddCursorForPost() - 1)->setExcludeGPR0();
+
+   TR::Register * temp2Reg = cg->allocateRegister();
+   TR::addDependency(conditions, temp2Reg, TR::RealRegister::NoReg, TR_GPR, cg);
+
+#if defined(J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION)
+   if (arrayCheckNeeded)
+      {
+      // There are two scenarios in which we DON'T want to modify the dest base address:
+      // 1.) the object is NULL (we can't load dataAddr from a NULL pointer)
+      // 2.) the object is a non-array object
+      // So two checks (NULL, array) are required to determine whether dataAddr should be loaded or not
+      TR::LabelSymbol *noDataAddr = generateLabelSymbol(cg);
+
+      // We only want to generate a runtime NULL test if it is NOT known whether the
+      // object is NULL or non-NULL. Note that if the object is known to be NULL,
+      // arrayCheckNeeded will be false, so that case cannot reach here.
+      if (!dstBaseAddrNode->isNonNull())
+         {
+         // generate NULL test
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpi, node, cndReg, dstBaseAddrReg, 0);
+         generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, noDataAddr, cndReg);
+         }
+
+      // Array check
+      TR::Register *dstClassInfoReg = temp1Reg;
+      TR::Register *arrayFlagReg = temp2Reg;
+
+      // load dst class info into temp1Reg
+      if (TR::Compiler->om.compressObjectReferences())
+         generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, dstClassInfoReg,
+               TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast<int32_t>(TR::Compiler->om.offsetOfObjectVftField()), 4));
+      else
+         generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstClassInfoReg,
+               TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast<int32_t>(TR::Compiler->om.offsetOfObjectVftField()), TR::Compiler->om.sizeofReferenceAddress()));
+
+      TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, dstClassInfoReg);
+
+      TR::MemoryReference *dstClassMR = TR::MemoryReference::createWithDisplacement(cg, dstClassInfoReg, offsetof(J9Class, classDepthAndFlags), TR::Compiler->om.sizeofReferenceAddress());
+      generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstClassInfoReg, dstClassMR);
+
+      // generate array check
+      int32_t arrayFlagValue = comp->fej9()->getFlagValueForArrayCheck();
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, arrayFlagReg, dstClassInfoReg, arrayFlagValue >> 16);
+
+      // if the object is not an array (i.e. classDepthAndFlags & arrayFlagValue == 0),
+      // skip adjusting dstBaseAddr and dstOffset
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, noDataAddr, cndReg);
+
+      // load dataAddr if the object is an array:
+      TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress());
+      generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR);
+
+      // the array check skips to here if the object is not an array
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, noDataAddr);
+
+      // calculate dstAddr = dstBaseAddr + dstOffset
+      dstAddrReg = dstBaseAddrReg;
+
+      if (useOffsetAsImmVal)
+         {
+         int offsetImmVal = dstOffsetNode->getConstValue();
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstBaseAddrReg, offsetImmVal);
+         }
+      else
+         generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg);
+      }
+#endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */
 
    // assemble the double word value from byte value
-   generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 8, 0xff00);
-   generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 16, 0xffff0000);
-   generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, 0xffffffff00000000);
+   if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8))
+      {
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::vspltb, valueNode, valueReg, valueReg, 7);
+      }
+   else
+      {
+      generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 8, CONSTANT64(0x000000000000FF00));
+      generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 16, CONSTANT64(0x00000000FFFF0000));
+      generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, CONSTANT64(0xFFFFFFFF00000000));
+      }
 
-   generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::cmpli4 : TR::InstOpCode::cmpli8, node, cndReg, lengthReg, 32);
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndReg, lengthReg, 32);
    generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, residualLabel, cndReg);
-   generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, tempReg, lengthReg, 5);
-   generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, tempReg);
+   generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, temp1Reg, lengthReg, 5);
+   generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, temp1Reg);
 
    generateLabelInstruction(cg, TR::InstOpCode::label, node, loopStartLabel);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 16, 8), valueReg);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 24, 8), valueReg);
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32);
+
+   // store the designated value to memory in chunks of 32 bytes
+   if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8))
+      {
+      // on P8 and higher we can use vector instructions to cut down on loop iterations / number of stores
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stxvd2x, node, TR::MemoryReference::createWithIndexReg(cg, NULL, dstAddrReg, 16), valueReg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stxvd2x, node, TR::MemoryReference::createWithIndexReg(cg, NULL, dstAddrReg, 16), valueReg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16);
+      }
+   else
+      {
+      generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 16, 8), valueReg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 24, 8), valueReg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32);
+      }
+
+   // decrement the counter and return to the start of the loop
    generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, loopStartLabel, cndReg);
 
-   generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel); //check 16 aligned
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 16);
-   generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg);
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16);
-
-   generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); //check 8 aligned
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 8);
-   generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg);
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 8);
-
-   generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); //check 4 aligned
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 4);
-   generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueReg);
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4);
-
-   generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); //check 2 aligned
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 2);
-   generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueReg);
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2);
-
-   generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); //check 1 aligned
-   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 1);
-   generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg);
-   generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueReg);
+   // loop exit
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel);
+
+   // Store the residual bytes (max number of residual bytes = 31 = 0x1F)
+   if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) // on P10 we can use stxvl to store all residual bytes efficiently
+      {
+      // First 16-byte segment
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); // isolate bit 4 of the length (residual >= 16 bytes?)
+      generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, temp2Reg, temp1Reg); // keep a copy of that partial length
+
+      // store to memory
+      // NOTE: due to a quirk of the stxvl instruction on P10, the number of residual
+      // bytes must be shifted into the top byte of the register before it can be used
+      generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldicr, node, temp1Reg, temp1Reg, 56, CONSTANT64(0xFF00000000000000));
+      generateSrc3Instruction(cg, TR::InstOpCode::stxvl, node, valueReg, dstAddrReg, temp1Reg);
+
+      // advance to the next 16-byte chunk IF the number of residual bytes was >= 16
+      generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstAddrReg, temp2Reg);
+
+      // Second 16-byte segment
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 15); // remaining 0-15 bytes
+      generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldicr, node, temp1Reg, temp1Reg, 56, CONSTANT64(0xFF00000000000000)); // shift the residual count
+      generateSrc3Instruction(cg, TR::InstOpCode::stxvl, node, valueReg, dstAddrReg, temp1Reg); // store to memory
+      }
+   else
+      {
+      TR::Register *valueResidueReg;
+
+      if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8))
+         {
+         // since P8 and P9 use the vector approach, we first need to copy valueReg back into a GPR
+         generateTrg1Src1Instruction(cg, TR::InstOpCode::mfvsrd, node, temp2Reg, valueReg);
+         valueResidueReg = temp2Reg;
+         }
+      else
+         valueResidueReg = valueReg;
+
+      // check if residual < 16
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueResidueReg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueResidueReg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16);
+
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); // check if residual < 8
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 8);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueResidueReg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 8);
+
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); // check if residual < 4
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 4);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueResidueReg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4);
+
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); // check if residual < 2
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 2);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueResidueReg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2);
+
+      generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); // residual <= 1
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 1);
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg);
+      generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueResidueReg);
+      }
 
    generateDepLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions);
 
-   if (stopUsingCopyReg1)
+   if (stopUsingCopyRegBase)
+      cg->stopUsingRegister(dstBaseAddrReg);
+   if (stopUsingCopyRegOffset)
+      cg->stopUsingRegister(dstOffsetReg);
+   if (stopUsingCopyRegAddr)
       cg->stopUsingRegister(dstAddrReg);
-   if (stopUsingCopyReg2)
+   if (stopUsingCopyRegLen)
       cg->stopUsingRegister(lengthReg);
-   if (stopUsingCopyReg3)
+   if (stopUsingCopyRegVal)
       cg->stopUsingRegister(valueReg);
 
    cg->stopUsingRegister(cndReg);
-   cg->stopUsingRegister(tempReg);
+   cg->stopUsingRegister(temp1Reg);
+   cg->stopUsingRegister(temp2Reg);
 
-   cg->decReferenceCount(dstAddrNode);
+   if (dstBaseAddrNode) cg->decReferenceCount(dstBaseAddrNode);
+   if (dstOffsetNode) cg->decReferenceCount(dstOffsetNode);
+   if (dstAddrNode) cg->decReferenceCount(dstAddrNode);
    cg->decReferenceCount(lengthNode);
    cg->decReferenceCount(valueNode);
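For reference, this is the store pattern the rewritten evaluator produces, modeled on the host side (a sketch of the semantics only, not the generated POWER code; on P10 the two residual segments are instead written with stxvl, whose length operand must sit in the top byte of the register):

```cpp
#include <cstdint>
#include <cstring>

// Host-side model of setmemoryEvaluator's store sequence: a 32-byte main
// loop driven by length >> 5 (the mtctr count), then residual stores
// selected by bits 16/8/4/2/1 of the length, exactly as the andi_r tests
// in the pre-P10 path above do.
static void modelSetMemory(uint8_t *dst, uint64_t len, uint8_t value)
   {
   for (uint64_t i = 0, n = len >> 5; i < n; ++i) // 32 bytes per iteration
      {
      memset(dst, value, 32);
      dst += 32;
      }

   for (uint64_t bit = 16; bit >= 1; bit >>= 1) // residue is at most 31 bytes
      {
      if (len & bit)
         {
         memset(dst, value, bit);
         dst += bit;
         }
      }
   }
```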
diff --git a/compiler/x/codegen/OMRCodeGenerator.cpp b/compiler/x/codegen/OMRCodeGenerator.cpp
index d7a5499e029..f579df20b04 100644
--- a/compiler/x/codegen/OMRCodeGenerator.cpp
+++ b/compiler/x/codegen/OMRCodeGenerator.cpp
@@ -2059,6 +2059,8 @@ void OMR::X86::CodeGenerator::doBinaryEncoding()
    //
    bool skipOneReturn = false;
    int32_t estimatedPrologueStartOffset = estimate;
+   bool snippetsAfterWarm = self()->comp()->getOption(TR_MoveSnippetsToWarmCode);
+
    while (estimateCursor)
       {
       // Update the info bits on the register mask.
@@ -2152,6 +2154,9 @@ void OMR::X86::CodeGenerator::doBinaryEncoding()
       //
       if (estimateCursor->isLastWarmInstruction())
          {
+         if (snippetsAfterWarm)
+            estimate = setEstimatedLocationsForSnippetLabels(estimate);
+
          warmEstimate = (estimate+7) & ~7;
          estimate = warmEstimate + MIN_DISTANCE_BETWEEN_WARM_AND_COLD_CODE;
          }
@@ -2165,7 +2170,8 @@ void OMR::X86::CodeGenerator::doBinaryEncoding()
    if (self()->comp()->getOption(TR_TraceCG))
       traceMsg(self()->comp(), "\n\n");
 
-   estimate = self()->setEstimatedLocationsForSnippetLabels(estimate);
+   if (!snippetsAfterWarm || !warmEstimate)
+      estimate = self()->setEstimatedLocationsForSnippetLabels(estimate);
 
    // When using copyBinaryToBuffer() to copy the encoding of an instruction we
    // indiscriminatelly copy a whole integer, even if the size of the encoding
@@ -2240,6 +2246,8 @@ void OMR::X86::CodeGenerator::doBinaryEncoding()
    // Generate binary for the rest of the instructions
    //
+   int32_t accumulatedErrorBeforeSnippets = 0;
+
    while (cursorInstruction)
       {
       uint8_t * const instructionStart = self()->getBinaryBufferCursor();
@@ -2279,6 +2287,8 @@ void OMR::X86::CodeGenerator::doBinaryEncoding()
                self()->getWarmCodeEnd(), cursorInstruction, coldCode);
          }
 
+      accumulatedErrorBeforeSnippets = getAccumulatedInstructionLengthError();
+
       // Adjust the accumulated length error so that distances within the cold
       // code are calculated properly using the estimated code locations.
       //
@@ -2328,6 +2338,12 @@ void OMR::X86::CodeGenerator::doBinaryEncoding()
          traceMsg(self()->comp(), "\n");
          }
 
+      if (self()->comp()->getOption(TR_SplitWarmAndColdBlocks))
+         {
+         if (snippetsAfterWarm) // snippets will follow the warm code
+            setAccumulatedInstructionLengthError(accumulatedErrorBeforeSnippets);
+         }
+
       }
 
 // different from evaluate in that it returns a clobberable register
diff --git a/compiler/z/codegen/OMRPeephole.cpp b/compiler/z/codegen/OMRPeephole.cpp
index 6db7ba5c9d1..3f02f106e37 100644
--- a/compiler/z/codegen/OMRPeephole.cpp
+++ b/compiler/z/codegen/OMRPeephole.cpp
@@ -38,7 +38,7 @@ isBarrierToPeepHoleLookback(TR::Instruction* cursor)
    {
    if (cursor == NULL)
       return true;
-
+
    if (cursor->isLabel())
       return true;
 
@@ -198,6 +198,11 @@ OMR::Z::Peephole::performOnInstruction(TR::Instruction* cursor)
          performed |= performedCurrentPeephole;
         break;
         }
+      case TR::InstOpCode::LGFR:
+         {
+         performed |= self()->tryToRemoveRedundant32To64BitExtend(true);
+         break;
+         }
       case TR::InstOpCode::LHI:
         {
         performed |= self()->tryToReduceLHIToXR();
@@ -213,6 +218,11 @@ OMR::Z::Peephole::performOnInstruction(TR::Instruction* cursor)
         performed |= self()->tryToReduceLToLZRF(TR::InstOpCode::LLZRGF);
         break;
         }
+      case TR::InstOpCode::LLGFR:
+         {
+         performed |= self()->tryToRemoveRedundant32To64BitExtend(false);
+         break;
+         }
       case TR::InstOpCode::LR:
         {
         bool performedCurrentPeephole = false;
@@ -254,7 +264,7 @@ OMR::Z::Peephole::performOnInstruction(TR::Instruction* cursor)
 
         if (!performedCurrentPeephole)
            performedCurrentPeephole |= self()->tryToRemoveDuplicateLoadRegister();
-
+
         performed |= performedCurrentPeephole;
         break;
         }
@@ -358,7 +368,7 @@ OMR::Z::Peephole::tryLoadStoreReduction(TR::InstOpCode::Mnemonic storeOpCode, ui
       return false;
       }
 
-   if (performTransformation(self()->comp(), "O^O S390 PEEPHOLE: Transforming load-store sequence at %p to MVC.", storeInst))
+   if (performTransformation(self()->comp(), "O^O S390 PEEPHOLE: Transforming load-store sequence at %p to MVC.\n", storeInst))
      {
      TR::DebugCounter::incStaticDebugCounter(self()->comp(), "z/peephole/load-store");
 
@@ -942,7 +952,7 @@ OMR::Z::Peephole::tryToReduceAGI()
      {
      if (performTransformation(self()->comp(), "O^O S390 PEEPHOLE: AGI LA reduction on [%p] from source load [%p].\n", current, cursor))
         {
-         auto laInst = generateRXInstruction(self()->cg(), TR::InstOpCode::LA, cursor->getNode(), lgrTargetReg,
+         auto laInst = generateRXInstruction(self()->cg(), TR::InstOpCode::LA, cursor->getNode(), lgrTargetReg,
            generateS390MemoryReference(lgrSourceReg, 0, self()->cg()), cursor->getPrev());
 
         self()->cg()->replaceInst(cursor, laInst);
@@ -1328,7 +1338,7 @@ OMR::Z::Peephole::tryToReduceLLCToLLGC()
      memRef->resetMemRefUsedBefore();
      auto llgcInst = generateRXInstruction(self()->cg(), TR::InstOpCode::LLGC, cursor->getNode(), llcTgtReg, memRef, cursor->getPrev());
      self()->cg()->replaceInst(cursor, llgcInst);
-
+
      return true;
      }
   }
@@ -1419,7 +1429,7 @@ OMR::Z::Peephole::tryToReduceLTRToCHI()
    TR::InstOpCode lgrOpCode = cursor->getOpCode();
 
    if (lgrTargetReg == lgrSourceReg &&
-      (lgrOpCode.getOpCodeValue() == TR::InstOpCode::LTR ||
+      (lgrOpCode.getOpCodeValue() == TR::InstOpCode::LTR ||
       lgrOpCode.getOpCodeValue() == TR::InstOpCode::LTGR))
      {
      if (seekRegInFutureMemRef(cursor, 4, lgrTargetReg))
@@ -1528,7 +1538,7 @@ OMR::Z::Peephole::tryToRemoveDuplicateLoadRegister()
         windowSize = 0;
         setCC = setCC || current->getOpCode().setsCC();
         useCC = useCC || current->getOpCode().readsCC();
-
+
         rrInst->remove();
 
         continue;
@@ -1740,7 +1750,7 @@ OMR::Z::Peephole::tryToRemoveRedundantLA()
    if (performTransformation(self()->comp(), "O^O S390 PEEPHOLE: Removing redundant LA [%p].\n", cursor))
       {
       cursor->remove();
-
+
       return true;
       }
    }
@@ -1828,7 +1838,7 @@ OMR::Z::Peephole::tryToRemoveRedundantLTR()
 
    TR::Register *lgrSourceReg = cursor->getRegisterOperand(2);
    TR::Register *lgrTargetReg = cursor->getRegisterOperand(1);
-
+
    if (lgrTargetReg == lgrSourceReg)
       {
       TR::Instruction *prevInst = cursor->getPrev();
@@ -1861,3 +1871,142 @@ OMR::Z::Peephole::tryToRemoveRedundantLTR()
 
    return false;
    }
+
+bool
+OMR::Z::Peephole::tryToRemoveRedundant32To64BitExtend(bool isSigned)
+   {
+   static const bool disableRemoveExtend = feGetEnv("TR_DisableRemoveRedundant32to64Extend") != NULL;
+   if (disableRemoveExtend)
+      {
+      return false;
+      }
+
+   int32_t windowSize = 0;
+   const int32_t maxWindowSize = 10;
+
+   const char *lgfrMnemonicName = isSigned ? "LGFR" : "LLGFR";
+   TR::Compilation *comp = self()->comp();
+   TR::Instruction *lgfr = cursor;
+   TR::Register *lgfrReg = lgfr->getRegisterOperand(1);
+
+   if (lgfrReg != lgfr->getRegisterOperand(2))
+      return false;
+
+   TR::Instruction *current = lgfr->getPrev();
+
+   while ((current != NULL) &&
+          !isBarrierToPeepHoleLookback(current) &&
+          windowSize < maxWindowSize)
+      {
+      TR::InstOpCode::Mnemonic curOpMnemonic = current->getOpCode().getMnemonic();
+
+      if (current->getNumRegisterOperands() > 0 && lgfrReg == current->getRegisterOperand(1))
+         {
+         TR::MemoryReference *mr = NULL;
+         TR::Instruction *replacement = NULL;
+         switch (curOpMnemonic)
+            {
+            case TR::InstOpCode::L:
+               if (performTransformation(comp, "O^O S390 PEEPHOLE: Merging L [%p] and %s [%p] into %s.\n",
+                                         current, lgfrMnemonicName, lgfr, isSigned ? "LGF" : "LLGF"))
+                  {
+                  mr = current->getMemoryReference();
+                  mr->resetMemRefUsedBefore();
+                  replacement = generateRXInstruction(self()->cg(), isSigned ? TR::InstOpCode::LGF : TR::InstOpCode::LLGF, current->getNode(), lgfrReg, mr, current->getPrev());
+                  }
+               break;
+            case TR::InstOpCode::LH:
+               if (isSigned && performTransformation(comp, "O^O S390 PEEPHOLE: Merging LH [%p] and LGFR [%p] into LGH.\n", current, lgfr))
+                  {
+                  mr = current->getMemoryReference();
+                  mr->resetMemRefUsedBefore();
+                  replacement = generateRXInstruction(self()->cg(), TR::InstOpCode::LGH, current->getNode(), lgfrReg, mr, current->getPrev());
+                  }
+               break;
+            case TR::InstOpCode::LLH:
+               if (performTransformation(comp, "O^O S390 PEEPHOLE: Merging LLH [%p] and %s [%p] into LLGH.\n", current, lgfrMnemonicName, lgfr))
+                  {
+                  mr = current->getMemoryReference();
+                  mr->resetMemRefUsedBefore();
+                  replacement = generateRXInstruction(self()->cg(), TR::InstOpCode::LLGH, current->getNode(), lgfrReg, mr, current->getPrev());
+                  }
+               break;
+            case TR::InstOpCode::LB:
+               if (isSigned && performTransformation(comp, "O^O S390 PEEPHOLE: Merging LB [%p] and LGFR [%p] into LGB.\n", current, lgfr))
+                  {
+                  mr = current->getMemoryReference();
+                  mr->resetMemRefUsedBefore();
+                  replacement = generateRXInstruction(self()->cg(), TR::InstOpCode::LGB, current->getNode(), lgfrReg, mr, current->getPrev());
+                  }
+               break;
+            case TR::InstOpCode::LLC:
+               if (performTransformation(comp, "O^O S390 PEEPHOLE: Merging LLC [%p] and %s [%p] into LLGC.\n", current, lgfrMnemonicName, lgfr))
+                  {
+                  mr = current->getMemoryReference();
+                  mr->resetMemRefUsedBefore();
+                  replacement = generateRXInstruction(self()->cg(), TR::InstOpCode::LLGC, current->getNode(), lgfrReg, mr, current->getPrev());
+                  }
+               break;
+            case TR::InstOpCode::XR:
+               // The following sequence of instructions
+               //   XR GPR1,GPR1         ; zero out the bottom 32 bits of GPR1
+               //   LGFR/LLGFR GPR1,GPR1 ; extend those zeros to all 64 bits of GPR1
+               // can be converted to
+               //   XGR GPR1,GPR1        ; zero out all 64 bits of GPR1
+               if (lgfrReg == current->getRegisterOperand(2) &&
+                   performTransformation(comp, "O^O S390 PEEPHOLE: Merging XR [%p] and %s [%p] into XGR.\n", current, lgfrMnemonicName, lgfr))
+                  replacement = generateRRInstruction(self()->cg(), TR::InstOpCode::XGR, current->getNode(), lgfrReg, lgfrReg, current->getPrev());
+               break;
+            case TR::InstOpCode::IILF:
+               if (performTransformation(comp, "O^O S390 PEEPHOLE: Merging IILF [%p] and %s [%p] into %s.\n", current, lgfrMnemonicName, lgfr, isSigned ? "LGFI" : "LLILF"))
+                  replacement = generateRILInstruction(self()->cg(), isSigned ? TR::InstOpCode::LGFI : TR::InstOpCode::LLILF, current->getNode(), lgfrReg, toS390RILInstruction(current)->getSourceImmediate(), current->getPrev());
+               break;
+            case TR::InstOpCode::LHI:
+               if (isSigned && performTransformation(comp, "O^O S390 PEEPHOLE: Merging LHI [%p] and LGFR [%p] into LGHI.\n", current, lgfr))
+                  {
+                  replacement = generateRIInstruction(self()->cg(), TR::InstOpCode::LGHI, current->getNode(), lgfrReg, toS390RIInstruction(current)->getSourceImmediate(), current->getPrev());
+                  }
+               else if (!isSigned && performTransformation(comp, "O^O S390 PEEPHOLE: Merging LHI [%p] and LLGFR [%p] into LLILF.\n", current, lgfr))
+                  {
+                  // The following sequence of instructions:
+                  //   LHI GPR1,IMM    ; sign extend IMM from 16 to 32 bits
+                  //   LLGFR GPR1,GPR1 ; zero extend from 32 to 64 bits
+                  // can be converted to
+                  //   LLILF GPR1,IMM' ; where IMM' is IMM sign extended from 16 to 32 bits
+                  int16_t imm = toS390RIInstruction(current)->getSourceImmediate();
+                  replacement = generateRILInstruction(self()->cg(), TR::InstOpCode::LLILF, current->getNode(), lgfrReg, static_cast<int32_t>(imm), current->getPrev());
+                  }
+               break;
+            case TR::InstOpCode::LR:
+            case TR::InstOpCode::LGR:
+               replacement = generateRRInstruction(self()->cg(), isSigned ? TR::InstOpCode::LGFR : TR::InstOpCode::LLGFR, current->getNode(), lgfrReg, current->getRegisterOperand(2), current->getPrev());
+               break;
+            }
+
+         if (replacement != NULL)
+            {
+            TR::DebugCounter::incStaticDebugCounter(comp,
+               TR::DebugCounter::debugCounterName(comp, "z/peephole/redundant32To64BitExtend/%s/%s/%s/(%s)",
+                  current->getOpCode().getMnemonicName(),
+                  lgfr->getOpCode().getMnemonicName(),
+                  replacement->getOpCode().getMnemonicName(),
+                  comp->signature()));
+            self()->cg()->replaceInst(current, replacement);
+            lgfr->remove();
+            return true;
+            }
+         }
+
+      // Ensure the extend acts on the correct register values
+      if (current->isDefRegister(lgfrReg))
+         break;
+
+      current = current->getPrev();
+
+      windowSize++;
+      }
+
+   return false;
+   }
diff --git a/compiler/z/codegen/OMRPeephole.hpp b/compiler/z/codegen/OMRPeephole.hpp
index 0b41ad0f9e6..4e40d0be44e 100644
--- a/compiler/z/codegen/OMRPeephole.hpp
+++ b/compiler/z/codegen/OMRPeephole.hpp
@@ -70,7 +70,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     *    true if the reduction was successful; false otherwise.
     */
    bool tryLoadStoreReduction(TR::InstOpCode::Mnemonic storeOpCode, uint16_t size);
-
+
    /** \brief
     *     Tries to fold a load register instruction (\c LR or \c LGR) into a subsequent three-operand instruction if
     *     possible. For example:
@@ -92,7 +92,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     *    true if the reduction was successful; false otherwise.
     */
    bool tryToFoldLoadRegisterIntoSubsequentInstruction();
-
+
    /** \brief
    *     Tries to forward a branch target if the branch instruction transfers control to another unconditional
    *     branch instruction (i.e. a trampoline). For example:
@@ -170,7 +170,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     *    true if the reduction was successful; false otherwise.
     */
    bool tryToReduceAGI();
-
+
    /** \brief
     *     Tries to reduce a compare logical (\c CLR) insturction followed by a branch to a compare and branch
     *     instruction (\c CLRJ) For example:
@@ -190,7 +190,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     *    true if the reduction was successful; false otherwise.
     */
    bool tryToReduceCLRToCLRJ();
-
+
    /** \brief
     *     Tries to reduce a simple branch conditional load of an immediate to a load immediate on condition branch-
    *     less sequence. For example:
@@ -218,7 +218,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     *    true if the reduction was successful; false otherwise.
     */
    bool tryToReduceCRJLHIToLOCHI(TR::InstOpCode::Mnemonic compareMnemonic);
-
+
    /** \brief
     *     Tries to reduce a load instruction (\c L) to an insert character under mask (\c ICM) instruction. This can
     *     be done if following the load we have a load and test or a compare against certain immediates. For example:
@@ -261,7 +261,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     *    true if the reduction was successful; false otherwise.
     */
    bool tryToReduceLToLZRF(TR::InstOpCode::Mnemonic loadAndZeroRightMostByteMnemonic);
-
+
    /** \brief
     *     Tries to reduce a load register instruction (\c LGR or \c LTGR) followed by a sign extension to \c LGFR.
     *     For example:
@@ -300,7 +300,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     *    true if the reduction was successful; false otherwise.
     */
    bool tryToReduceLHIToXR();
-
+
    /** \brief
     *     Tries to reduce a load logical character instruction (\c LLC) followed by a zero extension to \c LLGC.
     *     For example:
@@ -320,7 +320,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     *    true if the reduction was successful; false otherwise.
     */
    bool tryToReduceLLCToLLGC();
-
+
    /** \brief
     *     Tries to reduce a load register instruction (\c LR or \c LGR) and a future compare (\c CHI) against the
     *     target register to \c LTR or \c LTGR. For example:
@@ -347,7 +347,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     *    true if the reduction was successful; false otherwise.
     */
    bool tryToReduceLRCHIToLTR();
-
+
    /** \brief
     *     Tries to reduce a load and test register instruction (\c LTR or \c LTGR) to a compare halfword immediate if
     *     the target register of the load is used in a future memory reference. This is an attempt to reduce the AGI
@@ -480,6 +480,30 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole
     */
    bool tryToRemoveRedundantLTR();
 
+   /** \brief
+    *     Tries to remove redundant 32 to 64 bit extensions with \c LGFR or \c LLGFR on register
+    *     values originating from 32 bit loads if the 32 bit load instruction can be replaced with
+    *     an equivalent extending 32 bit load. For example:
+    *
+    *     <code>
+    *     L    R1,N(R2,R3)
+    *     LGFR R1,R1
+    *     </code>
+    *
+    *     can be reduced to:
+    *
+    *     <code>
+    *     LGF  R1,N(R2,R3)
+    *     </code>
+    *
+    *  \param isSigned
+    *     true if operating on an LGFR instruction; false if on an LLGFR.
+    *
+    *  \return
+    *     true if the reduction was successful; false otherwise.
+    */
+   bool tryToRemoveRedundant32To64BitExtend(bool isSigned);
+
 private:
 
    /// The instruction cursor currently being processed by the peephole optimization
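The doc comment above describes the signed case; here is a host-side sketch of the integer semantics the new peephole relies on (illustrative values only, no z/Architecture instructions involved):

```cpp
#include <cassert>
#include <cstdint>

int main()
   {
   // L + LGFR (32-bit load, then sign-extend to 64) == LGF (sign-extending load)
   int32_t m32 = -42;
   assert(static_cast<int64_t>(m32) == int64_t{-42});

   // LLH + LLGFR (zero-extend to 32, then to 64) == LLGH (zero-extending load)
   uint16_t m16 = 0xBEEF;
   assert(static_cast<uint64_t>(static_cast<uint32_t>(m16)) == uint64_t{0xBEEF});

   // LHI + LLGFR == LLILF with the immediate pre-sign-extended to 32 bits
   int16_t imm = -1;
   uint64_t viaLhiLlgfr = static_cast<uint32_t>(static_cast<int32_t>(imm));
   uint64_t viaLlilf    = static_cast<uint32_t>(0xFFFFFFFFu); // IMM' = 0xFFFFFFFF
   assert(viaLhiLlgfr == viaLlilf);
   return 0;
   }
```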
diff --git a/fvtest/compilerunittest/aarch64/BinaryEncoder.cpp b/fvtest/compilerunittest/aarch64/BinaryEncoder.cpp
index 50d1f88b89e..9eb62998f8a 100644
--- a/fvtest/compilerunittest/aarch64/BinaryEncoder.cpp
+++ b/fvtest/compilerunittest/aarch64/BinaryEncoder.cpp
@@ -1636,6 +1636,36 @@ INSTANTIATE_TEST_CASE_P(VectorUnzip2, ARM64Trg1Src2EncodingTest, ::testing::Valu
     std::make_tuple(TR::InstOpCode::vuzp2_2d, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "4edf5800")
 ));
 
+INSTANTIATE_TEST_CASE_P(VectorTrn1, ARM64Trg1Src2EncodingTest, ::testing::Values(
+    std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "0e00280f"),
+    std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "0e00281f"),
+    std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v0, TR::RealRegister::v15, TR::RealRegister::v0, "0e0029e0"),
+    std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v0, TR::RealRegister::v31, TR::RealRegister::v0, "0e002be0"),
+    std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v15, "0e0f2800"),
+    std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "0e1f2800"),
+    std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "4e00280f"),
+    std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "4e00281f"),
+    std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v0, TR::RealRegister::v15, TR::RealRegister::v0, "4e0029e0"),
+    std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v0, TR::RealRegister::v31, TR::RealRegister::v0, "4e002be0"),
+    std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v15, "4e0f2800"),
+    std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "4e1f2800")
+));
+
+INSTANTIATE_TEST_CASE_P(VectorTrn2, ARM64Trg1Src2EncodingTest, ::testing::Values(
+    std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "0e00680f"),
+    std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "0e00681f"),
+    std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v0, TR::RealRegister::v15, TR::RealRegister::v0, "0e0069e0"),
+    std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v0, TR::RealRegister::v31, TR::RealRegister::v0, "0e006be0"),
+    std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v15, "0e0f6800"),
+    std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "0e1f6800"),
+    std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "4e00680f"),
+    std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "4e00681f"),
+    std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v0, TR::RealRegister::v15, TR::RealRegister::v0, "4e0069e0"),
+    std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v0, TR::RealRegister::v31, TR::RealRegister::v0, "4e006be0"),
+    std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v15, "4e0f6800"),
+    std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "4e1f6800")
+));
+
 INSTANTIATE_TEST_CASE_P(VectorUMLAL, ARM64Trg1Src2EncodingTest, ::testing::Values(
     std::make_tuple(TR::InstOpCode::vumlal_8h, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "2e20800f"),
     std::make_tuple(TR::InstOpCode::vumlal_8h, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "2e20801f"),
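The expected strings above follow from the usual AArch64 three-register field layout: Rd in bits 0-4, Rn in bits 5-9, Rm in bits 16-20, OR'd into the base opcode from OpBinary.cpp. A small cross-check of a few of the listed vectors (encodeTrn is a hypothetical helper, not part of the test suite):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Compose a TRN1/TRN2 encoding from its base opcode and register numbers.
static uint32_t encodeTrn(uint32_t base, unsigned rd, unsigned rn, unsigned rm)
   {
   return base | (rm << 16) | (rn << 5) | rd;
   }

int main()
   {
   assert(encodeTrn(0x0E002800, 15, 0, 0) == 0x0E00280F); // vtrn1_8b  v15,v0,v0
   assert(encodeTrn(0x0E002800, 0, 15, 0) == 0x0E0029E0); // vtrn1_8b  v0,v15,v0
   assert(encodeTrn(0x0E002800, 0, 0, 15) == 0x0E0F2800); // vtrn1_8b  v0,v0,v15
   assert(encodeTrn(0x4E006800, 0, 31, 0) == 0x4E006BE0); // vtrn2_16b v0,v31,v0
   printf("all sampled TRN encodings match\n");
   return 0;
   }
```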