From ac77de5e76b419a8878fc4c98e9045a26e6e63c1 Mon Sep 17 00:00:00 2001 From: Sarwat Shaheen Date: Mon, 22 Jan 2024 12:39:24 -0500 Subject: [PATCH 1/2] Enable inlining of fmax/fmin/dmax/dmin on Z - Adds java_lang_Math_max/min_float/double as a recognized method - Adds a SupportsInlineMath_MaxMin_FD flag to the Z code generator - Flag is only set in Z if the TR_disableInlineMath_MaxMin_FD environment variable is not set - If the flag is set, call nodes are transformed to a functionally equivalent tree that uses fmin/fmax/dmin/dmax nodes Signed-off-by: Sarwat Shaheen --- runtime/compiler/codegen/J9CodeGenerator.hpp | 11 +++++++++++ runtime/compiler/env/j9method.cpp | 4 ++++ .../optimizer/J9RecognizedCallTransformer.cpp | 16 ++++++++++++++++ runtime/compiler/z/codegen/J9CodeGenerator.cpp | 6 ++++++ 4 files changed, 37 insertions(+) diff --git a/runtime/compiler/codegen/J9CodeGenerator.hpp b/runtime/compiler/codegen/J9CodeGenerator.hpp index b5d425b2fe2..e286569e533 100644 --- a/runtime/compiler/codegen/J9CodeGenerator.hpp +++ b/runtime/compiler/codegen/J9CodeGenerator.hpp @@ -512,6 +512,16 @@ void addMonClass(TR::Node* monNode, TR_OpaqueClassBlock* clazz); */ void setSupportsInlineVectorizedHashCode() { _j9Flags.set(SupportsInlineVectorizedHashCode); } + /** \brief + * Determines whether the code generator supports inlining of java_lang_Math_max/min_F/D + */ + bool getSupportsInlineMath_MaxMin_FD() { return _j9Flags.testAny(SupportsInlineMath_MaxMin_FD); } + + /** \brief + * The code generator supports inlining of java_lang_Math_max/min_F/D + */ + void setSupportsInlineMath_MaxMin_FD() { _j9Flags.set(SupportsInlineMath_MaxMin_FD); } + /** * \brief * The number of nodes between a monext and the next monent before @@ -677,6 +687,7 @@ void addMonClass(TR::Node* monNode, TR_OpaqueClassBlock* clazz); SavesNonVolatileGPRsForGC = 0x00000800, SupportsInlineVectorizedMismatch = 0x00001000, SupportsInlineVectorizedHashCode = 0x00002000, + SupportsInlineMath_MaxMin_FD = 
0x00002000, }; flags32_t _j9Flags; diff --git a/runtime/compiler/env/j9method.cpp b/runtime/compiler/env/j9method.cpp index 41c7b767625..69a742f18f9 100644 --- a/runtime/compiler/env/j9method.cpp +++ b/runtime/compiler/env/j9method.cpp @@ -5014,6 +5014,10 @@ TR_ResolvedJ9Method::setRecognizedMethodInfo(TR::RecognizedMethod rm) case TR::java_lang_Math_min_I: case TR::java_lang_Math_max_L: case TR::java_lang_Math_min_L: + case TR::java_lang_Math_max_F: + case TR::java_lang_Math_min_F: + case TR::java_lang_Math_max_D: + case TR::java_lang_Math_min_D: case TR::java_lang_Math_abs_I: case TR::java_lang_Math_abs_L: case TR::java_lang_Math_abs_F: diff --git a/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp b/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp index ca4274de589..615a91b12fb 100644 --- a/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp +++ b/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp @@ -1366,6 +1366,10 @@ bool J9::RecognizedCallTransformer::isInlineable(TR::TreeTop* treetop) case TR::java_lang_Math_min_I: case TR::java_lang_Math_max_L: case TR::java_lang_Math_min_L: + case TR::java_lang_Math_max_F: + case TR::java_lang_Math_min_F: + case TR::java_lang_Math_max_D: + case TR::java_lang_Math_min_D: return !comp()->getOption(TR_DisableMaxMinOptimization); case TR::java_lang_Math_multiplyHigh: return cg()->getSupportsLMulHigh(); @@ -1495,6 +1499,18 @@ void J9::RecognizedCallTransformer::transform(TR::TreeTop* treetop) case TR::java_lang_Math_min_L: processIntrinsicFunction(treetop, node, TR::lmin); break; + case TR::java_lang_Math_max_F: + processIntrinsicFunction(treetop, node, TR::fmax); + break; + case TR::java_lang_Math_min_F: + processIntrinsicFunction(treetop, node, TR::fmin); + break; + case TR::java_lang_Math_max_D: + processIntrinsicFunction(treetop, node, TR::dmax); + break; + case TR::java_lang_Math_min_D: + processIntrinsicFunction(treetop, node, TR::dmin); + break; case TR::java_lang_Math_multiplyHigh: 
processIntrinsicFunction(treetop, node, TR::lmulh); break; diff --git a/runtime/compiler/z/codegen/J9CodeGenerator.cpp b/runtime/compiler/z/codegen/J9CodeGenerator.cpp index ca13349d986..8bc9478abca 100644 --- a/runtime/compiler/z/codegen/J9CodeGenerator.cpp +++ b/runtime/compiler/z/codegen/J9CodeGenerator.cpp @@ -125,6 +125,12 @@ J9::Z::CodeGenerator::initialize() cg->setSupportsInlineEncodeASCII(); } + static bool disableInlineMath_MaxMin_FD = feGetEnv("TR_disableInlineMath_MaxMin_FD") != NULL; + if (!disableInlineMath_MaxMin_FD) + { + cg->setSupportsInlineMath_MaxMin_FD(); + } + static bool disableInlineVectorizedMismatch = feGetEnv("TR_disableInlineVectorizedMismatch") != NULL; if (cg->getSupportsArrayCmpLen() && #if defined(J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION) From 5b061e9c9b119ea8351b35a256fdb512f4cb80b2 Mon Sep 17 00:00:00 2001 From: Matthew Hall Date: Wed, 2 Oct 2024 16:12:00 -0400 Subject: [PATCH 2/2] Support Math.max/min for floating points w.r.t java spec - separate evaluators for J9 vs OMR to support differing behaviour (OMR complies with IEEE_754, while J9 returns the first NaN (if present)) - +0.0 compares as strictly greater than -0.0 Signed-off-by: Matthew Hall --- runtime/compiler/codegen/J9CodeGenerator.hpp | 2 +- .../optimizer/J9RecognizedCallTransformer.cpp | 3 +- .../compiler/z/codegen/J9CodeGenerator.cpp | 20 ++-- .../compiler/z/codegen/J9TreeEvaluator.cpp | 110 ++++++------------ .../compiler/z/codegen/J9TreeEvaluator.hpp | 6 +- 5 files changed, 54 insertions(+), 87 deletions(-) diff --git a/runtime/compiler/codegen/J9CodeGenerator.hpp b/runtime/compiler/codegen/J9CodeGenerator.hpp index e286569e533..a1a2e545ce1 100644 --- a/runtime/compiler/codegen/J9CodeGenerator.hpp +++ b/runtime/compiler/codegen/J9CodeGenerator.hpp @@ -687,7 +687,7 @@ void addMonClass(TR::Node* monNode, TR_OpaqueClassBlock* clazz); SavesNonVolatileGPRsForGC = 0x00000800, SupportsInlineVectorizedMismatch = 0x00001000, SupportsInlineVectorizedHashCode = 0x00002000, -
SupportsInlineMath_MaxMin_FD = 0x00002000, + SupportsInlineMath_MaxMin_FD = 0x00004000, }; flags32_t _j9Flags; diff --git a/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp b/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp index 615a91b12fb..3341837ba7d 100644 --- a/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp +++ b/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp @@ -1366,11 +1366,12 @@ bool J9::RecognizedCallTransformer::isInlineable(TR::TreeTop* treetop) case TR::java_lang_Math_min_I: case TR::java_lang_Math_max_L: case TR::java_lang_Math_min_L: + return !comp()->getOption(TR_DisableMaxMinOptimization); case TR::java_lang_Math_max_F: case TR::java_lang_Math_min_F: case TR::java_lang_Math_max_D: case TR::java_lang_Math_min_D: - return !comp()->getOption(TR_DisableMaxMinOptimization); + return !comp()->getOption(TR_DisableMaxMinOptimization) && cg()->getSupportsInlineMath_MaxMin_FD(); case TR::java_lang_Math_multiplyHigh: return cg()->getSupportsLMulHigh(); case TR::java_lang_StringUTF16_toBytes: diff --git a/runtime/compiler/z/codegen/J9CodeGenerator.cpp b/runtime/compiler/z/codegen/J9CodeGenerator.cpp index 8bc9478abca..73f81a8ac87 100644 --- a/runtime/compiler/z/codegen/J9CodeGenerator.cpp +++ b/runtime/compiler/z/codegen/J9CodeGenerator.cpp @@ -125,7 +125,7 @@ J9::Z::CodeGenerator::initialize() cg->setSupportsInlineEncodeASCII(); } - static bool disableInlineMath_MaxMin_FD = feGetEnv("TR_disableInlineMath_MaxMin_FD") != NULL; + static bool disableInlineMath_MaxMin_FD = feGetEnv("TR_disableInlineMaxMin") != NULL; if (!disableInlineMath_MaxMin_FD) { cg->setSupportsInlineMath_MaxMin_FD(); @@ -4079,20 +4079,24 @@ J9::Z::CodeGenerator::inlineDirectCall( } } - if (!comp->getOption(TR_DisableSIMDDoubleMaxMin) && cg->getSupportsVectorRegisters()) - { - switch (methodSymbol->getRecognizedMethod()) - { + if (!self()->comp()->getOption(TR_DisableMaxMinOptimization) && cg->getSupportsInlineMath_MaxMin_FD()) { + switch 
(methodSymbol->getRecognizedMethod()) { case TR::java_lang_Math_max_D: - resultReg = TR::TreeEvaluator::inlineDoubleMax(node, cg); + resultReg = J9::Z::TreeEvaluator::dmaxEvaluator(node, cg); return true; case TR::java_lang_Math_min_D: - resultReg = TR::TreeEvaluator::inlineDoubleMin(node, cg); + resultReg = J9::Z::TreeEvaluator::dminEvaluator(node, cg); + return true; + case TR::java_lang_Math_max_F: + resultReg = J9::Z::TreeEvaluator::fmaxEvaluator(node, cg); + return true; + case TR::java_lang_Math_min_F: + resultReg = J9::Z::TreeEvaluator::fminEvaluator(node, cg); return true; default: break; - } } + } switch (methodSymbol->getRecognizedMethod()) { diff --git a/runtime/compiler/z/codegen/J9TreeEvaluator.cpp b/runtime/compiler/z/codegen/J9TreeEvaluator.cpp index 2c7dbe4b29b..1c9d4eecf91 100644 --- a/runtime/compiler/z/codegen/J9TreeEvaluator.cpp +++ b/runtime/compiler/z/codegen/J9TreeEvaluator.cpp @@ -906,76 +906,48 @@ allocateWriteBarrierInternalPointerRegister(TR::CodeGenerator * cg, TR::Node * s } -extern TR::Register * -doubleMaxMinHelper(TR::Node *node, TR::CodeGenerator *cg, bool isMaxOp) +TR::Register * +J9::Z::TreeEvaluator::dmaxEvaluator(TR::Node * node, TR::CodeGenerator * cg) { - TR_ASSERT(node->getNumChildren() >= 1 || node->getNumChildren() <= 2, "node has incorrect number of children"); - - /* ===================== Allocating Registers ===================== */ - - TR::Register * v16 = cg->allocateRegister(TR_VRF); - TR::Register * v17 = cg->allocateRegister(TR_VRF); - TR::Register * v18 = cg->allocateRegister(TR_VRF); - - /* ===================== Generating instructions ===================== */ - - /* ====== LD FPR0,16(GPR5) Load a ====== */ - TR::Register * v0 = cg->fprClobberEvaluate(node->getFirstChild()); - - /* ====== LD FPR2, 0(GPR5) Load b ====== */ - TR::Register * v2 = cg->evaluate(node->getSecondChild()); - - /* ====== WFTCIDB V16,V0,X'F' a == NaN ====== */ - generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, v16, v0, 0xF, 8, 3); - 
- /* ====== For Max: WFCHE V17,V0,V2 Compare a >= b ====== */ - if(isMaxOp) + if (cg->getSupportsVectorRegisters()) { - generateVRRcInstruction(cg, TR::InstOpCode::VFCH, node, v17, v0, v2, 0, 8, 3); + cg->generateDebugCounter("z13/simd/doubleMax", 1, TR::DebugCounter::Free); + return OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg); } - /* ====== For Min: WFCHE V17,V0,V2 Compare a <= b ====== */ - else + return OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); + } + +TR::Register * +J9::Z::TreeEvaluator::dminEvaluator(TR::Node * node, TR::CodeGenerator * cg) + { + if (cg->getSupportsVectorRegisters()) { - generateVRRcInstruction(cg, TR::InstOpCode::VFCH, node, v17, v2, v0, 0, 8, 3); + cg->generateDebugCounter("z13/simd/doubleMin", 1, TR::DebugCounter::Free); + return OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg); } + return OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); + } - /* ====== VO V16,V16,V17 (a >= b) || (a == NaN) ====== */ - generateVRRcInstruction(cg, TR::InstOpCode::VO, node, v16, v16, v17, 0, 0, 0); - - /* ====== For Max: WFTCIDB V17,V0,X'800' a == +0 ====== */ - if(isMaxOp) - { - generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, v17, v0, 0x800, 8, 3); - } - /* ====== For Min: WFTCIDB V17,V0,X'400' a == -0 ====== */ - else - { - generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, v17, v0, 0x400, 8, 3); - } - /* ====== WFTCIDB V18,V2,X'C00' b == 0 ====== */ - generateVRIeInstruction(cg, TR::InstOpCode::VFTCI, node, v18, v2, 0xC00, 8, 3); - - /* ====== VN V17,V17,V18 (a == -0) && (b == 0) ====== */ - generateVRRcInstruction(cg, TR::InstOpCode::VN, node, v17, v17, v18, 0, 0, 0); - - /* ====== VO V16,V16,V17 (a >= b) || (a == NaN) || ((a == -0) && (b == 0)) ====== */ - generateVRRcInstruction(cg, TR::InstOpCode::VO, node, v16, v16, v17, 0, 0, 0); - - /* ====== VSEL V0,V0,V2,V16 ====== */ - generateVRReInstruction(cg, TR::InstOpCode::VSEL, node, v0, v0, v2, v16); - - /* ===================== Deallocating Registers 
===================== */ - cg->stopUsingRegister(v2); - cg->stopUsingRegister(v16); - cg->stopUsingRegister(v17); - cg->stopUsingRegister(v18); - - node->setRegister(v0); - - cg->decReferenceCount(node->getFirstChild()); - cg->decReferenceCount(node->getSecondChild()); +TR::Register * +J9::Z::TreeEvaluator::fmaxEvaluator(TR::Node * node, TR::CodeGenerator * cg) + { + if (cg->getSupportsVectorRegisters()) + { + cg->generateDebugCounter("z13/simd/floatMax", 1, TR::DebugCounter::Free); + return OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg); + } + return OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); + } - return node->getRegister(); +TR::Register * +J9::Z::TreeEvaluator::fminEvaluator(TR::Node * node, TR::CodeGenerator * cg) + { + if (cg->getSupportsVectorRegisters()) + { + cg->generateDebugCounter("z13/simd/floatMin", 1, TR::DebugCounter::Free); + return OMR::Z::TreeEvaluator::fpMinMaxVectorHelper(node, cg); + } + return OMR::Z::TreeEvaluator::xmaxxminHelper(node, cg); } TR::Register* @@ -2945,19 +2917,7 @@ J9::Z::TreeEvaluator::toLowerIntrinsic(TR::Node *node, TR::CodeGenerator *cg, bo return caseConversionHelper(node, cg, false, isCompressedString); } -TR::Register* -J9::Z::TreeEvaluator::inlineDoubleMax(TR::Node *node, TR::CodeGenerator *cg) - { - cg->generateDebugCounter("z13/simd/doubleMax", 1, TR::DebugCounter::Free); - return doubleMaxMinHelper(node, cg, true); - } -TR::Register* -J9::Z::TreeEvaluator::inlineDoubleMin(TR::Node *node, TR::CodeGenerator *cg) - { - cg->generateDebugCounter("z13/simd/doubleMin", 1, TR::DebugCounter::Free); - return doubleMaxMinHelper(node, cg, false); - } TR::Register * J9::Z::TreeEvaluator::inlineMathFma(TR::Node *node, TR::CodeGenerator *cg) diff --git a/runtime/compiler/z/codegen/J9TreeEvaluator.hpp b/runtime/compiler/z/codegen/J9TreeEvaluator.hpp index da2286d3b73..c21264611ae 100644 --- a/runtime/compiler/z/codegen/J9TreeEvaluator.hpp +++ b/runtime/compiler/z/codegen/J9TreeEvaluator.hpp @@ -126,8 +126,10 @@ class 
OMR_EXTENSIBLE TreeEvaluator: public J9::TreeEvaluator */ static TR::Register *inlineVectorizedStringIndexOf(TR::Node *node, TR::CodeGenerator *cg, bool isCompressed); static TR::Register *inlineIntrinsicIndexOf(TR::Node *node, TR::CodeGenerator *cg, bool isLatin1); - static TR::Register *inlineDoubleMax(TR::Node *node, TR::CodeGenerator *cg); - static TR::Register *inlineDoubleMin(TR::Node *node, TR::CodeGenerator *cg); + static TR::Register *fminEvaluator(TR::Node *node, TR::CodeGenerator *cg); + static TR::Register *dminEvaluator(TR::Node *node, TR::CodeGenerator *cg); + static TR::Register *fmaxEvaluator(TR::Node *node, TR::CodeGenerator *cg); + static TR::Register *dmaxEvaluator(TR::Node *node, TR::CodeGenerator *cg); static TR::Register *inlineMathFma(TR::Node *node, TR::CodeGenerator *cg); /* This Evaluator generates the SIMD routine for methods