Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AArch64: Implement arraytranslateTRTO255 #7499

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion compiler/aarch64/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ compiler_library(aarch64
${CMAKE_CURRENT_LIST_DIR}/codegen/UnaryEvaluator.cpp
${CMAKE_CURRENT_LIST_DIR}/env/OMRCPU.cpp
${CMAKE_CURRENT_LIST_DIR}/env/OMRDebugEnv.cpp
${CMAKE_CURRENT_LIST_DIR}/runtime/ARM64arrayCopy.spp
${CMAKE_CURRENT_LIST_DIR}/runtime/ARM64ArrayCopy.spp
${CMAKE_CURRENT_LIST_DIR}/runtime/ARM64ArrayTranslate.spp
${CMAKE_CURRENT_LIST_DIR}/runtime/CodeSync.cpp
)
15 changes: 11 additions & 4 deletions compiler/aarch64/codegen/OMRCodeGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,10 +190,17 @@ OMR::ARM64::CodeGenerator::initialize()
cg->setSupportsArrayCmpLen();
}
}
if (!comp->getOption(TR_DisableArraySetOpts))
{
cg->setSupportsArraySet();
}

if (!comp->getOption(TR_DisableArraySetOpts))
{
cg->setSupportsArraySet();
}

static bool disableTRTO255 = (feGetEnv("TR_disableTRTO255") != NULL);
if (!disableTRTO255)
{
cg->setSupportsArrayTranslateTRTO255();
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think support for this opcode should be enabled in OMR when there isn't an implementation for it in OMR (the helper implementation is missing). In the absence of that, this logic should be enabled in the downstream projects that do provide the helper.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both p and x platforms do the same: Implement the helper functions in the downstream project, and enable the feature in OMR.
On the other hand, z inlines the vectorized code instead of calling helper functions.

I can move the implementation of the AArch64 helper functions to OMR. In that case, maybe we want to move the helper functions for p and x to OMR for consistency.

[p]

[x]

  • https://github.com/eclipse-openj9/openj9/blob/master/runtime/compiler/x/runtime/X86ArrayTranslate.nasm
  • static bool disableX86TRTO = feGetEnv("TR_disableX86TRTO") != NULL;
    if (!disableX86TRTO)
    {
    TR_ASSERT_FATAL(comp->compileRelocatableCode() || comp->isOutOfProcessCompilation() || comp->compilePortableCode() || comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE4_1) == self()->getX86ProcessorInfo().supportsSSE4_1(), "supportsSSE4_1() failed\n");
    if (comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE4_1))
    {
    self()->setSupportsArrayTranslateTRTO();
    }
    }
    static bool disableX86TROT = feGetEnv("TR_disableX86TROT") != NULL;
    if (!disableX86TROT)
    {
    TR_ASSERT_FATAL(comp->compileRelocatableCode() || comp->isOutOfProcessCompilation() || comp->compilePortableCode() || comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE4_1) == self()->getX86ProcessorInfo().supportsSSE4_1(), "supportsSSE4_1() failed\n");
    TR_ASSERT_FATAL(comp->compileRelocatableCode() || comp->isOutOfProcessCompilation() || comp->compilePortableCode() || comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE2) == self()->getX86ProcessorInfo().supportsSSE2(), "supportsSSE4_1() failed\n");
    if (comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE4_1))
    {
    self()->setSupportsArrayTranslateTROT();
    }
    if (comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE2))
    {
    self()->setSupportsArrayTranslateTROTNoBreak();
    }
    }
    }

}

void
Expand Down
84 changes: 80 additions & 4 deletions compiler/aarch64/codegen/OMRTreeEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6400,10 +6400,86 @@ OMR::ARM64::TreeEvaluator::arraytranslateAndTestEvaluator(TR::Node *node, TR::Co

TR::Register *
OMR::ARM64::TreeEvaluator::arraytranslateEvaluator(TR::Node *node, TR::CodeGenerator *cg)
{
// TODO:ARM64: Enable TR::TreeEvaluator::arraytranslateEvaluator in compiler/aarch64/codegen/TreeEvaluatorTable.hpp when Implemented.
return OMR::ARM64::TreeEvaluator::unImpOpEvaluator(node, cg);
}
{
// tree looks as follows:
// arraytranslate
// (0) input ptr
// (1) output ptr
// (2) translation table (dummy)
// (3) stop character (terminal character, either 0xff00ff00 (ISO8859) or 0xff80ff80 (ASCII))
// (4) input length (in elements)
// (5) stopping char (dummy)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the dummy nodes used by other architectures? Just wondering why they're dummy on AArch64.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is different between p/x/aarch64 and z. See the code below:

if (cg()->getSupportsArrayTranslateTRTO255()|| cg()->getSupportsArrayTranslateTRTO())
{
tableNode = TR::Node::create(callNode, TR::aconst, 0, 0);
if (isISO88591Encoder)
termchar = 0xff00ff00;
else
termchar = 0xff80ff80;
}
else //z
{
bool genSIMD = comp()->cg()->getSupportsVectorRegisters() && !comp()->getOption(TR_DisableSIMDArrayTranslate);
stopIndex = isISO88591Encoder ? 255: 127;
termchar = isISO88591Encoder ? 0x0B: 0xff;
if (genSIMD)
{
tableNode = TR::Node::create(callNode, TR::aconst, 0, 0); //dummy table node, it's not gonna be used
}
else
{
uint8_t *table = (uint8_t*)comp()->trMemory()->allocateMemory(65536, stackAlloc);
int i;
for (i = 0; i <= stopIndex; i++)
table[i] = (uint8_t)i;
for (i = stopIndex+1; i < 65536; i++)
table[i] = (uint8_t)termchar;
tableNode = createTableLoad(comp(), callNode, 16, 8, table, false);
stopIndex=-1;
}
}

The code for z from line 2543 generates a non-dummy node for child(2) at line 2562, while the non-z path generates a dummy aconst 0 at line 2537.
AArch64 takes the non-z path.

Copy link
Contributor Author

@knn-k knn-k Oct 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An example of the arraytranslate node for TRTO255:

 n819n    (  0)  treetop                                                                              [       0x12313efb0] bci=[0,13,180] rc=0 vc=2947 vn=- li=57 udi=- nc=1
 n818n    (  1)    arraytranslate  <arraytranslate>[#275  helper Method] [flags 0x400 0x0 ] (in GPR_0115) (char2byteXlate byteArrayXlate tableBackedByRawStorage )  [       0x12313ef60] bci=[0,13,180] rc=1 vc=2947 vn=- li=57 udi=13472 nc=6 flg=0xa020
 n795n    (  0)      aladd (in GPR_0112) (X>=0 internalPtr )                                          [       0x12313e830] bci=[0,12,180] rc=0 vc=2947 vn=- li=57 udi=12416 nc=2 flg=0x8100
 n460n    (  0)        ==>aRegLoad (in &GPR_0016)
 n797n    (  0)        lsub (in GPR_0017) (X>=0 cannotOverflow )                                      [       0x12313e8d0] bci=[0,12,180] rc=0 vc=2947 vn=- li=57 udi=29968 nc=2 flg=0x1100
 n798n    (  0)          lshl (in GPR_0017) (X>=0 )                                                   [       0x12313e920] bci=[0,12,180] rc=0 vc=2947 vn=- li=57 udi=29968 nc=2 flg=0x100
 n799n    (  0)            i2l (highWordZero X>=0 )                                                   [       0x12313e970] bci=[0,12,180] rc=0 vc=2947 vn=- li=57 udi=- nc=1 flg=0x4100
 n12n     (  0)              ==>iRegLoad (in GPR_0017) (cannotOverflow )
 n803n    (  0)            iconst 1 (Unsigned X!=0 X>=0 )                                             [       0x12313eab0] bci=[0,12,180] rc=0 vc=2947 vn=- li=57 udi=- nc=0 flg=0x4104
 n804n    (  0)          lconst -16 (X!=0 X<=0 )                                                      [       0x12313eb00] bci=[0,12,180] rc=0 vc=2947 vn=- li=57 udi=- nc=0 flg=0x204
 n805n    (  0)      aladd (in GPR_0113) (X>=0 internalPtr )                                          [       0x12313eb50] bci=[0,34,185] rc=0 vc=2947 vn=- li=57 udi=12960 nc=2 flg=0x8100
 n7n      (  0)        ==>newarray (in &GPR_0038) (X!=0 )
 n807n    (  0)        lconst 16 (highWordZero X!=0 X>=0 cannotOverflow )                             [       0x12313ebf0] bci=[0,34,185] rc=0 vc=2947 vn=- li=57 udi=- nc=0 flg=0x5104
 n1286n   (  0)      iconst 0 (X==0 X>=0 X<=0 )                                                       [       0x1234d81b0] bci=[-1,0,161] rc=0 vc=2947 vn=- li=- udi=- nc=0 flg=0x302
 n816n    (  0)      iconst 0xff00ff00 (X!=0 X<=0 )                                                   [       0x12313eec0] bci=[0,10,180] rc=0 vc=2947 vn=- li=57 udi=- nc=0 flg=0x204
 n815n    (  0)      i2l (in GPR_0114) (highWordZero X>=0 )                                           [       0x12313ee70] bci=[0,5,179] rc=0 vc=2947 vn=- li=57 udi=13216 nc=1 flg=0x4100
 n5n      (  1)        ==>iRegLoad (in GPR_0018) (cannotOverflow )
 n1287n   (  0)      iconst -1 (X!=0 X<=0 )                                                           [       0x1234d8200] bci=[-1,0,161] rc=0 vc=2947 vn=- li=- udi=- nc=0 flg=0x204

//
// Number of translated elements is returned

TR::Compilation *comp = cg->comp();

TR_ASSERT_FATAL(!node->isSourceByteArrayTranslate(), "Source is byte[] for arraytranslate");
TR_ASSERT_FATAL(node->isTargetByteArrayTranslate(), "Target is char[] for arraytranslate");
TR_ASSERT_FATAL(node->getChild(3)->getOpCodeValue() == TR::iconst && node->getChild(3)->getInt() == 0x0ff00ff00, "Non-ISO8859 stop character for arraytranslate");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is child(3) used for? It does not appear to be used in this evaluator, so is this assert necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

child(3) is used by arraytranslateTRTO for ASCII translation. I already have the code for that, and I plan to open a separate PR for it after arraytranslateTRTO255 (for ISO 8859-1 translation) is merged.

Do you want to also review arraytranslateTRTO together with arraytranslateTRTO255 in this PR?


static bool verboseArrayTranslate = (feGetEnv("TR_verboseArrayTranslate") != NULL);
if (verboseArrayTranslate)
{
fprintf(stderr, "arrayTranslateTRTO255: %s @ %s\n",
comp->signature(),
comp->getHotnessName(comp->getMethodHotness())
);
}

TR::Register *inputReg = cg->gprClobberEvaluate(node->getChild(0));
TR::Register *outputReg = cg->gprClobberEvaluate(node->getChild(1));
TR::Register *inputLenReg = cg->gprClobberEvaluate(node->getChild(4));
TR::Register *outputLenReg = cg->allocateRegister();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Children 2,3,5 still need to be evaluated even if they are "dummy".

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arraytranslate evaluators of p and x platforms leave those dummy children unevaluated.

  • p:
    TR::Register *inputReg = cg->gprClobberEvaluate(node->getChild(0));
    TR::Register *outputReg = cg->gprClobberEvaluate(node->getChild(1));
    TR::Register *stopCharReg = arraytranslateTRTO255 ? NULL : cg->gprClobberEvaluate(node->getChild(3));
    TR::Register *inputLenReg = cg->gprClobberEvaluate(node->getChild(4));
  • x:
    bool stopUsingCopyReg1 = TR::TreeEvaluator::stopUsingCopyRegAddr(node->getChild(0), srcPtrReg, cg);
    bool stopUsingCopyReg2 = TR::TreeEvaluator::stopUsingCopyRegAddr(node->getChild(1), dstPtrReg, cg);
    bool stopUsingCopyReg4 = TR::TreeEvaluator::stopUsingCopyRegInteger(node->getChild(3), termCharReg, cg);
    bool stopUsingCopyReg5 = TR::TreeEvaluator::stopUsingCopyRegInteger(node->getChild(4), lengthReg, cg);

Reference counts of all the child nodes are decremented.


int numDeps = 10;

TR::RegisterDependencyConditions *deps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(1, numDeps, cg->trMemory());

deps->addPreCondition(inputReg, TR::RealRegister::x0);

deps->addPostCondition(outputLenReg, TR::RealRegister::x0);
deps->addPostCondition(outputReg, TR::RealRegister::x1);
deps->addPostCondition(inputLenReg, TR::RealRegister::x2);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Register x3 will be used by arraytranslateTRTO later.


// Clobbered by the helper
TR::Register *clobberedReg;
deps->addPostCondition(clobberedReg = cg->allocateRegister(), TR::RealRegister::x4);
cg->stopUsingRegister(clobberedReg);
deps->addPostCondition(clobberedReg = cg->allocateRegister(), TR::RealRegister::x5);
cg->stopUsingRegister(clobberedReg);
deps->addPostCondition(clobberedReg = cg->allocateRegister(), TR::RealRegister::x6);
cg->stopUsingRegister(clobberedReg);

deps->addPostCondition(clobberedReg = cg->allocateRegister(TR_VRF), TR::RealRegister::v0);
cg->stopUsingRegister(clobberedReg);
deps->addPostCondition(clobberedReg = cg->allocateRegister(TR_VRF), TR::RealRegister::v1);
cg->stopUsingRegister(clobberedReg);
deps->addPostCondition(clobberedReg = cg->allocateRegister(TR_VRF), TR::RealRegister::v2);
cg->stopUsingRegister(clobberedReg);

// Array Translate helper call
TR_RuntimeHelper helper = TR_ARM64arrayTranslateTRTO255;
TR::SymbolReference *helperSym = cg->symRefTab()->findOrCreateRuntimeHelper(helper);
uintptr_t addr = reinterpret_cast<uintptr_t>(helperSym->getMethodAddress());
generateImmSymInstruction(cg, TR::InstOpCode::bl, node, addr, deps, helperSym, NULL);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What you've provided here is essentially a stub that calls a helper that must be implemented by any downstream project that wants to support arraytranslate opcodes. You should either have a comment at the beginning of this evaluator explaining this, or you should consider moving the entire evaluator into the downstream project itself.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved the helper implementation to OMR as discussed yesterday.
At the same time I renamed ARM64arrayCopy.spp to ARM64ArrayCopy.spp for the consistency of file names.


for (uint32_t i = 0; i < node->getNumChildren(); i++)
cg->decReferenceCount(node->getChild(i));

if (inputReg != node->getChild(0)->getRegister())
cg->stopUsingRegister(inputReg);

if (outputReg != node->getChild(1)->getRegister())
cg->stopUsingRegister(outputReg);

if (inputLenReg != node->getChild(4)->getRegister())
cg->stopUsingRegister(inputLenReg);

cg->machine()->setLinkRegisterKilled(true);
node->setRegister(outputLenReg);
return outputLenReg;
}

TR::Register *
OMR::ARM64::TreeEvaluator::arraysetEvaluator(TR::Node *node, TR::CodeGenerator *cg)
Expand Down
130 changes: 130 additions & 0 deletions compiler/aarch64/runtime/ARM64ArrayTranslate.spp
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*******************************************************************************
* Copyright IBM Corp. and others 2024
*
* This program and the accompanying materials are made available under
* the terms of the Eclipse Public License 2.0 which accompanies this
* distribution and is available at https://www.eclipse.org/legal/epl-2.0/
* or the Apache License, Version 2.0 which accompanies this distribution
* and is available at https://www.apache.org/licenses/LICENSE-2.0.
*
* This Source Code may also be made available under the following Secondary
* Licenses when the conditions for such availability set forth in the
* Eclipse Public License, v. 2.0 are satisfied: GNU General Public License,
* version 2 with the GNU Classpath Exception [1] and GNU General Public
* License, version 2 with the OpenJDK Assembly Exception [2].
*
* [1] https://www.gnu.org/software/classpath/license.html
* [2] https://openjdk.org/legal/assembly-exception.html
*
* SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0-only WITH Classpath-exception-2.0 OR GPL-2.0-only WITH OpenJDK-assembly-exception-1.0
*******************************************************************************/

.file "ARM64ArrayTranslate.s"

#include "aarch64/runtime/arm64asmdefs.inc"

.globl FUNC_LABEL(__arrayTranslateTRTO255)

.text
.align 2

// ----
// arrayTranslateTRTO255
// ----
// TO stands for Two bytes to One byte
//
// Copies 16-bit input elements to 8-bit output elements until either
// len elements have been written or an element greater than 0xFF is found.
// The offending element is not written. Equivalent pseudo code:
//
//   uint16 input[];
//   uint8 output[];
//   int32 len;
//
//   int32 i = 0;
//   while (i < len) {
//     uint16 ch = input[i];
//     if (ch > 0xFF) break;
//     output[i] = ch & 0xFF;
//     i++;
//   }
//   return i;
//
// in:    x0: input
//        x1: output
//        x2: len (only the low 32 bits, w2, are read; compared as unsigned)
// out:   x0: num of translated elements
// trash: x4-x6, v0-v2
//        x1 is also advanced past the last byte written, and the
//        condition flags are modified (cmp/tst/ands/subs).
//
// Strategy: process blocks of 16, then 8, then 4 elements with SIMD, and the
// last 0-3 elements with a scalar loop. Each SIMD block is checked before
// anything is stored or any pointer is advanced; when the check fails the
// block is redone one element at a time at atTRTO255_Fail.

FUNC_LABEL(__arrayTranslateTRTO255):
// preserve output address
// (x6 keeps the original output pointer so the result can be computed as x1 - x6)
mov x6, x1
// fewer than 16 elements: skip the 16-element loop
cmp w2, #16
b.cc atTRTO255_15
// w4 = number of full 16-element blocks (len / 16), at least 1 here
lsr w4, w2, #4
atTRTO255_16Loop:
// load 16 elements
ldp q0, q1, [x0]
// collect upper 8 bits
// (uzp2 gathers the odd-numbered byte lanes of v0 and v1)
uzp2 v2.16b, v0.16b, v1.16b
// fail when any one of them is non-zero
// (pairwise unsigned max folds the four 32-bit lanes into the low 64 bits,
//  which are zero only if every collected byte is zero)
umaxp v2.4s, v2.4s, v2.4s
mov x5, v2.D[0]
cbnz x5, atTRTO255_Fail
// collect lower 8 bits
// (uzp1 gathers the even-numbered byte lanes of v0 and v1)
uzp1 v2.16b, v0.16b, v1.16b
add x0, x0, #32
// decrement the block counter; the flags set here are consumed by b.ne below
// (the str in between does not modify flags)
subs w4, w4, #1
// store 16 elements
str q2, [x1], #16
b.ne atTRTO255_16Loop
atTRTO255_15:
// 15 elements or less remaining
// bit 3 of len set: a block of 8 elements remains
tst w2, #8
b.eq atTRTO255_7
// load 8 elements
ldr q0, [x0]
// collect upper 8 bits
// (trn2 with the same source twice yields each odd-numbered byte lane, duplicated)
trn2 v2.16b, v0.16b, v0.16b
// fail when any one of them is non-zero
umaxp v2.4s, v2.4s, v2.4s
mov x5, v2.D[0]
cbnz x5, atTRTO255_Fail
// collect lower 8 bits
// (xtn narrows each 16-bit lane to its low 8 bits)
xtn v2.8b, v0.8h
add x0, x0, #16
// store 8 elements
str d2, [x1], #8
atTRTO255_7:
// 7 elements or less remaining
// bit 2 of len set: a block of 4 elements remains
tst w2, #4
b.eq atTRTO255_3
// load 4 elements
ldr d0, [x0]
// collect upper 8 bits
trn2 v2.8b, v0.8b, v0.8b
// fail when any one of them is non-zero
// (the vector is only 64 bits wide here, so no pairwise reduction is needed)
mov x5, v2.D[0]
cbnz x5, atTRTO255_Fail
// collect lower 8 bits
xtn v2.8b, v0.8h
add x0, x0, #8
// store 4 elements
// (only the low 4 bytes of the narrowed result are written)
str s2, [x1], #4
atTRTO255_3:
// 3 elements or less remaining
// w4 = len & 3; Z is set when nothing remains
ands w4, w2, #3
atTRTO255_1Loop:
// Z comes from the ands above on first entry and from the subs below on
// later iterations (strb and b do not modify flags)
b.eq atTRTO255_Done
ldrh w5, [x0], #2
// stop at the first element >= 256 (unsigned compare), without storing it
cmp w5, #256
b.cs atTRTO255_Done
subs w4, w4, #1
strb w5, [x1], #1
b atTRTO255_1Loop
atTRTO255_Fail:
// Reached only from a SIMD block whose check found an element > 0xFF.
// x0 and x1 still point to the start of that block, so copy element by
// element; the loop ends at the offending element, hence no counter is needed.
ldrh w5, [x0], #2
cmp w5, #256
b.cs atTRTO255_Done
strb w5, [x1], #1
b atTRTO255_Fail
atTRTO255_Done:
// number of translated elements
// (bytes written = current output pointer - original output pointer)
sub x0, x1, x6
ret
1 change: 1 addition & 0 deletions compiler/ras/Debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4237,6 +4237,7 @@ TR_Debug::getRuntimeHelperName(int32_t index)
case TR_ARM64interfaceCompleteSlot2: return "_interfaceCompleteSlot2";
case TR_ARM64interfaceSlotsUnavailable: return "_interfaceSlotsUnavailable";
case TR_ARM64PatchGCRHelper: return "_patchGCRHelper" ;
case TR_ARM64arrayTranslateTRTO255: return "__arrayTranslateTRTO255";
}
}
#endif
Expand Down
3 changes: 2 additions & 1 deletion compiler/runtime/Helpers.inc
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,8 @@ SETVAL(TR_ARM64interfaceCompleteSlot2,TR_FSRH+42)
SETVAL(TR_ARM64interfaceSlotsUnavailable,TR_FSRH+43)
SETVAL(TR_ARM64PatchGCRHelper,TR_FSRH+44)
SETVAL(TR_ARM64fieldWatchHelper,TR_FSRH+45)
SETVAL(TR_ARM64numRuntimeHelpers,TR_FSRH+46)
SETVAL(TR_ARM64arrayTranslateTRTO255,TR_FSRH+46)
SETVAL(TR_ARM64numRuntimeHelpers,TR_FSRH+47)

SETVAL(TR_S390longDivide,TR_FSRH)
SETVAL(TR_S390interfaceCallHelper,TR_FSRH+1)
Expand Down