From c5497983a097132256eb1df2fd5a9fac85e705ac Mon Sep 17 00:00:00 2001 From: KONNO Kazuhiro Date: Mon, 2 Dec 2024 15:47:59 +0900 Subject: [PATCH] Revert "Stop recognizing UTF16_Encoder.encodeUTF16 methods" This commit restores the changes for encodeUTF16Big and encodeUTF16Little that were removed by #20613. Signed-off-by: KONNO Kazuhiro --- .../share/classes/com/ibm/jit/JITHelpers.java | 4 + runtime/compiler/build/files/host/p.mk | 1 + runtime/compiler/build/files/host/x.mk | 1 + .../codegen/J9RecognizedMethodsEnum.hpp | 5 + runtime/compiler/compile/J9Compilation.cpp | 10 + runtime/compiler/env/j9method.cpp | 5 + .../compiler/p/codegen/J9TreeEvaluator.cpp | 98 ++++ runtime/compiler/p/runtime/CMakeLists.txt | 1 + .../compiler/p/runtime/J9PPCEncodeUTF16.spp | 523 ++++++++++++++++++ runtime/compiler/runtime/Runtime.cpp | 14 + .../compiler/x/codegen/J9TreeEvaluator.cpp | 83 +++ .../compiler/x/codegen/J9TreeEvaluator.hpp | 1 + runtime/compiler/x/runtime/.gitignore | 1 + runtime/compiler/x/runtime/CMakeLists.txt | 1 + .../compiler/x/runtime/X86EncodeUTF16.nasm | 185 +++++++ .../compiler/z/codegen/J9CodeGenerator.cpp | 4 + .../compiler/z/codegen/J9TreeEvaluator.cpp | 355 ++++++++++++ .../compiler/z/codegen/J9TreeEvaluator.hpp | 2 + 18 files changed, 1294 insertions(+) create mode 100644 runtime/compiler/p/runtime/J9PPCEncodeUTF16.spp create mode 100644 runtime/compiler/x/runtime/X86EncodeUTF16.nasm diff --git a/jcl/src/java.base/share/classes/com/ibm/jit/JITHelpers.java b/jcl/src/java.base/share/classes/com/ibm/jit/JITHelpers.java index 4280486aaf9..74748ad8209 100644 --- a/jcl/src/java.base/share/classes/com/ibm/jit/JITHelpers.java +++ b/jcl/src/java.base/share/classes/com/ibm/jit/JITHelpers.java @@ -65,6 +65,10 @@ private static JITHelpers jitHelpers() { return helpers; } + public native int transformedEncodeUTF16Big(long src, long dest, int num); + + public native int transformedEncodeUTF16Little(long src, long dest, int num); + /* * Constants for getSuperclass. 
*/ diff --git a/runtime/compiler/build/files/host/p.mk b/runtime/compiler/build/files/host/p.mk index ab846055364..49ada1f34f9 100644 --- a/runtime/compiler/build/files/host/p.mk +++ b/runtime/compiler/build/files/host/p.mk @@ -29,6 +29,7 @@ JIT_PRODUCT_SOURCE_FILES+=\ compiler/p/runtime/J9PPCArrayTranslate.spp \ compiler/p/runtime/J9PPCCRC32.spp \ compiler/p/runtime/J9PPCCRC32_wrapper.c \ + compiler/p/runtime/J9PPCEncodeUTF16.spp \ compiler/p/runtime/Math.spp \ compiler/p/runtime/PPCHWProfiler.cpp \ compiler/p/runtime/PPCRelocationTarget.cpp \ diff --git a/runtime/compiler/build/files/host/x.mk b/runtime/compiler/build/files/host/x.mk index dd60923e066..7a34e1bb5a4 100644 --- a/runtime/compiler/build/files/host/x.mk +++ b/runtime/compiler/build/files/host/x.mk @@ -26,6 +26,7 @@ JIT_PRODUCT_SOURCE_FILES+=\ compiler/x/runtime/X86RelocationTarget.cpp \ compiler/x/runtime/X86ArrayTranslate.nasm \ compiler/x/runtime/X86Codert.nasm \ + compiler/x/runtime/X86EncodeUTF16.nasm \ compiler/x/runtime/X86LockReservation.nasm \ compiler/x/runtime/X86PicBuilder.nasm \ compiler/x/runtime/X86Unresolveds.nasm diff --git a/runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp b/runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp index 49ad355acb8..9762669ad9e 100644 --- a/runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp +++ b/runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp @@ -514,6 +514,11 @@ sun_nio_cs_UTF_8_Encoder_encodeUTF_8, sun_nio_cs_ext_IBM1388_Encoder_encodeArrayLoop, + sun_nio_cs_UTF16_Encoder_encodeUTF16Big, + sun_nio_cs_UTF16_Encoder_encodeUTF16Little, + com_ibm_jit_JITHelpers_transformedEncodeUTF16Big, + com_ibm_jit_JITHelpers_transformedEncodeUTF16Little, + java_lang_Integer_bitCount, java_lang_Integer_highestOneBit, java_lang_Integer_lowestOneBit, diff --git a/runtime/compiler/compile/J9Compilation.cpp b/runtime/compiler/compile/J9Compilation.cpp index c8a056f63e0..f06fca26130 100644 --- a/runtime/compiler/compile/J9Compilation.cpp +++ 
b/runtime/compiler/compile/J9Compilation.cpp @@ -449,6 +449,8 @@ J9::Compilation::isConverterMethod(TR::RecognizedMethod rm) case TR::sun_nio_cs_ext_SBCS_Decoder_decodeSBCS: case TR::sun_nio_cs_UTF_8_Encoder_encodeUTF_8: case TR::sun_nio_cs_UTF_8_Decoder_decodeUTF_8: + case TR::sun_nio_cs_UTF16_Encoder_encodeUTF16Big: + case TR::sun_nio_cs_UTF16_Encoder_encodeUTF16Little: return true; default: return false; @@ -495,6 +497,14 @@ J9::Compilation::canTransformConverterMethod(TR::RecognizedMethod rm) case TR::sun_nio_cs_ext_SBCS_Decoder_decodeSBCS: return genTRxx; + // devinmp: I'm not sure whether these could be transformed in AOT, but + // they haven't been so far. + case TR::sun_nio_cs_UTF16_Encoder_encodeUTF16Little: + return !aot && self()->cg()->getSupportsEncodeUtf16LittleWithSurrogateTest(); + + case TR::sun_nio_cs_UTF16_Encoder_encodeUTF16Big: + return !aot && self()->cg()->getSupportsEncodeUtf16BigWithSurrogateTest(); + default: return false; } diff --git a/runtime/compiler/env/j9method.cpp b/runtime/compiler/env/j9method.cpp index e9a247dc3db..8ec59eeb20a 100644 --- a/runtime/compiler/env/j9method.cpp +++ b/runtime/compiler/env/j9method.cpp @@ -3193,6 +3193,8 @@ void TR_ResolvedJ9Method::construct() {x(TR::com_ibm_jit_JITHelpers_getPackedDataSizeFromJ9Class64, "getPackedDataSizeFromJ9Class64", "(J)J")}, {x(TR::com_ibm_jit_JITHelpers_getComponentTypeFromJ9Class32, "getComponentTypeFromJ9Class32", "(I)I")}, {x(TR::com_ibm_jit_JITHelpers_getComponentTypeFromJ9Class64, "getComponentTypeFromJ9Class64", "(J)J")}, + {x(TR::com_ibm_jit_JITHelpers_transformedEncodeUTF16Big, "transformedEncodeUTF16Big", "(JJI)I")}, + {x(TR::com_ibm_jit_JITHelpers_transformedEncodeUTF16Little, "transformedEncodeUTF16Little", "(JJI)I")}, {x(TR::com_ibm_jit_JITHelpers_getIntFromObject, "getIntFromObject", "(Ljava/lang/Object;J)I")}, {x(TR::com_ibm_jit_JITHelpers_getIntFromObjectVolatile, "getIntFromObjectVolatile", "(Ljava/lang/Object;J)I")}, 
{x(TR::com_ibm_jit_JITHelpers_getLongFromObject, "getLongFromObject", "(Ljava/lang/Object;J)J")}, @@ -3780,6 +3782,8 @@ void TR_ResolvedJ9Method::construct() {x(TR::sun_nio_cs_ext_SBCS_Decoder_decodeSBCS, "decodeSBCS", "([BII[CI[C)I")}, {x(TR::sun_nio_cs_UTF_8_Encoder_encodeUTF_8, "encodeUTF_8", "([CII[BI)I")}, {x(TR::sun_nio_cs_UTF_8_Decoder_decodeUTF_8, "decodeUTF_8", "([BII[CI)I")}, + {x(TR::sun_nio_cs_UTF16_Encoder_encodeUTF16Big, "encodeUTF16Big", "([CII[BI)I")}, + {x(TR::sun_nio_cs_UTF16_Encoder_encodeUTF16Little, "encodeUTF16Little", "([CII[BI)I")}, { TR::unknownMethod} }; @@ -4188,6 +4192,7 @@ void TR_ResolvedJ9Method::construct() { "java/lang/reflect/Method", MethodMethods }, { "sun/nio/cs/UTF_8$Decoder", EncodeMethods }, { "sun/nio/cs/UTF_8$Encoder", EncodeMethods }, + { "sun/nio/cs/UTF16_Encoder", EncodeMethods }, { "jdk/internal/misc/Unsafe", UnsafeMethods }, { 0 } }; diff --git a/runtime/compiler/p/codegen/J9TreeEvaluator.cpp b/runtime/compiler/p/codegen/J9TreeEvaluator.cpp index 1d246026c35..cf7f96a1fd1 100644 --- a/runtime/compiler/p/codegen/J9TreeEvaluator.cpp +++ b/runtime/compiler/p/codegen/J9TreeEvaluator.cpp @@ -10597,6 +10597,95 @@ static TR::Register *inlineStringHashcode(TR::Node *node, TR::CodeGenerator *cg) return hashReg; } +static TR::Register *inlineEncodeUTF16(TR::Node *node, TR::CodeGenerator *cg) + { + // tree looks like: + // icall com.ibm.jit.JITHelpers.encodeUtf16{Big,Little}() + // input ptr + // output ptr + // input length (in elements) + // Number of elements converted returned + + TR::MethodSymbol *symbol = node->getSymbol()->castToMethodSymbol(); + bool bigEndian = symbol->getRecognizedMethod() == TR::com_ibm_jit_JITHelpers_transformedEncodeUTF16Big; + + // Set up register dependencies + const int gprClobberCount = 5; + const int fprClobberCount = 4; + const int vrClobberCount = 6; + const int crClobberCount = 2; + const int totalDeps = crClobberCount + gprClobberCount + fprClobberCount + vrClobberCount + 3; + 
TR::RegisterDependencyConditions *deps = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(1, totalDeps, cg->trMemory()); + + TR::Register *inputReg = cg->gprClobberEvaluate(node->getChild(0)); + TR::Register *outputReg = cg->gprClobberEvaluate(node->getChild(1)); + TR::Register *inputLenReg = cg->gprClobberEvaluate(node->getChild(2)); + TR::Register *outputLenReg = cg->allocateRegister(); + + // Allocate clobbered registers + TR::Register *gprClobbers[gprClobberCount], *fprClobbers[fprClobberCount], *vrClobbers[vrClobberCount], *crClobbers[crClobberCount]; + for (int i = 0; i < gprClobberCount; ++i) gprClobbers[i] = cg->allocateRegister(TR_GPR); + for (int i = 0; i < fprClobberCount; ++i) fprClobbers[i] = cg->allocateRegister(TR_FPR); + for (int i = 0; i < vrClobberCount; ++i) vrClobbers[i] = cg->allocateRegister(TR_VRF); + for (int i = 0; i < crClobberCount; ++i) crClobbers[i] = cg->allocateRegister(TR_CCR); + + // Add the pre and post conditions + // Input and output registers + deps->addPreCondition(inputReg, TR::RealRegister::gr3); + + deps->addPostCondition(outputLenReg, TR::RealRegister::gr3); + deps->addPostCondition(outputReg, TR::RealRegister::gr4); + deps->addPostCondition(inputLenReg, TR::RealRegister::gr5); + + //CCR. 
+ deps->addPostCondition(crClobbers[0], TR::RealRegister::cr0); + deps->addPostCondition(crClobbers[1], TR::RealRegister::cr6); + + //GPRs + Trampoline + deps->addPostCondition(gprClobbers[0], TR::RealRegister::gr6); + deps->addPostCondition(gprClobbers[1], TR::RealRegister::gr7); + deps->addPostCondition(gprClobbers[2], TR::RealRegister::gr8); + deps->addPostCondition(gprClobbers[3], TR::RealRegister::gr9); + deps->addPostCondition(gprClobbers[4], TR::RealRegister::gr11); + + //VR's + deps->addPostCondition(vrClobbers[0], TR::RealRegister::vr0); + deps->addPostCondition(vrClobbers[1], TR::RealRegister::vr1); + deps->addPostCondition(vrClobbers[2], TR::RealRegister::vr2); + deps->addPostCondition(vrClobbers[3], TR::RealRegister::vr3); + deps->addPostCondition(vrClobbers[4], TR::RealRegister::vr4); + deps->addPostCondition(vrClobbers[5], TR::RealRegister::vr5); + + //FP/VSR + deps->addPostCondition(fprClobbers[0], TR::RealRegister::fp0); + deps->addPostCondition(fprClobbers[1], TR::RealRegister::fp1); + deps->addPostCondition(fprClobbers[2], TR::RealRegister::fp2); + deps->addPostCondition(fprClobbers[3], TR::RealRegister::fp3); + + // Generate helper call + TR_RuntimeHelper helper; + helper = bigEndian ? 
TR_PPCencodeUTF16Big : TR_PPCencodeUTF16Little; + TR::SymbolReference *helperSym = cg->comp()->getSymRefTab()->findOrCreateRuntimeHelper(helper); + generateDepImmSymInstruction(cg, TR::InstOpCode::bl, node, (uintptr_t)helperSym->getMethodAddress(), deps, helperSym); + + for (uint32_t i = 0; i < node->getNumChildren(); ++i) cg->decReferenceCount(node->getChild(i)); + + // Release the registers that are no longer needed after the helper call + if (inputReg != node->getChild(0)->getRegister()) cg->stopUsingRegister(inputReg); + if (outputReg != node->getChild(1)->getRegister()) cg->stopUsingRegister(outputReg); + if (inputLenReg != node->getChild(2)->getRegister()) cg->stopUsingRegister(inputLenReg); + for (int i = 0; i < gprClobberCount; ++i) cg->stopUsingRegister(gprClobbers[i]); + for (int i = 0; i < vrClobberCount; ++i) cg->stopUsingRegister(vrClobbers[i]); + for (int i = 0; i < fprClobberCount; ++i) cg->stopUsingRegister(fprClobbers[i]); + for (int i = 0; i < crClobberCount; ++i) cg->stopUsingRegister(crClobbers[i]); + + cg->machine()->setLinkRegisterKilled(true); + cg->setHasCall(); + node->setRegister(outputLenReg); + + return outputLenReg; + } + static TR::Register *inlineIntrinsicIndexOf_P10(TR::Node *node, TR::CodeGenerator *cg, bool isLatin1) { static bool disableIndexOfStringIntrinsic = feGetEnv("TR_DisableIndexOfStringIntrinsic") != NULL; @@ -12036,6 +12125,15 @@ J9::Power::CodeGenerator::inlineDirectCall(TR::Node *node, TR::Register *&result } break; + case TR::com_ibm_jit_JITHelpers_transformedEncodeUTF16Big: + case TR::com_ibm_jit_JITHelpers_transformedEncodeUTF16Little: + if (comp->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P7) && comp->target().cpu.supportsFeature(OMR_FEATURE_PPC_HAS_VSX)) + { + resultReg = inlineEncodeUTF16(node, cg); + return true; + } + break; + case TR::java_lang_StringLatin1_indexOfChar: case TR::java_lang_StringUTF16_indexOfCharUnsafe: case TR::com_ibm_jit_JITHelpers_intrinsicIndexOfLatin1: diff --git a/runtime/compiler/p/runtime/CMakeLists.txt 
b/runtime/compiler/p/runtime/CMakeLists.txt index e96bb674e29..4253cc430aa 100644 --- a/runtime/compiler/p/runtime/CMakeLists.txt +++ b/runtime/compiler/p/runtime/CMakeLists.txt @@ -40,6 +40,7 @@ j9jit_files( ${omr_SOURCE_DIR}/compiler/p/runtime/OMRCodeCacheConfig.cpp p/runtime/J9PPCArrayCopy.spp p/runtime/J9PPCArrayTranslate.spp + p/runtime/J9PPCEncodeUTF16.spp p/runtime/J9PPCCRC32.spp p/runtime/J9PPCCRC32_wrapper.c p/runtime/CodeSync.cpp diff --git a/runtime/compiler/p/runtime/J9PPCEncodeUTF16.spp b/runtime/compiler/p/runtime/J9PPCEncodeUTF16.spp new file mode 100644 index 00000000000..0807d2ead51 --- /dev/null +++ b/runtime/compiler/p/runtime/J9PPCEncodeUTF16.spp @@ -0,0 +1,523 @@ +!! Copyright IBM Corp. and others 2000 +!! +!! This program and the accompanying materials are made available under +!! the terms of the Eclipse Public License 2.0 which accompanies this +!! distribution and is available at https://www.eclipse.org/legal/epl-2.0/ +!! or the Apache License, Version 2.0 which accompanies this distribution and +!! is available at https://www.apache.org/licenses/LICENSE-2.0. +!! +!! This Source Code may also be made available under the following +!! Secondary Licenses when the conditions for such availability set +!! forth in the Eclipse Public License, v. 2.0 are satisfied: GNU +!! General Public License, version 2 with the GNU Classpath +!! Exception [1] and GNU General Public License, version 2 with the +!! OpenJDK Assembly Exception [2]. +!! +!! [1] https://www.gnu.org/software/classpath/license.html +!! [2] https://openjdk.org/legal/assembly-exception.html +!! +!! 
SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0-only WITH Classpath-exception-2.0 OR GPL-2.0-only WITH OpenJDK-assembly-exception-1.0 + +#include "j9cfg.h" +#include "jilconsts.inc" +#include "p/runtime/ppcasmdefines.inc" + +#define SURR_MASK_RES_SWAP 0xD8 +#define SURR_MASK_RES 0xD800 + +#ifdef AIXPPC + .globl .__encodeUTF16Big + .globl __encodeUTF16Big{DS} + .globl .__encodeUTF16Little + .globl __encodeUTF16Little{DS} + +#elif defined(LINUXPPC64) + .globl FUNC_LABEL(__encodeUTF16Big) + .type FUNC_LABEL(__encodeUTF16Big),@function + .globl FUNC_LABEL(__encodeUTF16Little) + .type FUNC_LABEL(__encodeUTF16Little),@function + +#elif defined(LINUX) || defined(NEUTRINO) + .globl __encodeUTF16Big + .globl __encodeUTF16Little +#endif + +#ifdef AIXPPC +! .text section + .align 5 + .csect Utf16ToByte_TEXT{PR} +#elif defined(LINUXPPC64) + .section ".text" + .align 5 +#else + .align 5 +#endif + +!------------------------------------------------------------------------------- +! entry encodeUTF16Big +!------------------------------------------------------------------------------- +! The conversion process copies 2-byte UTF16 characters to the destination +! unless a surrogate pair is encountered, when the process stops. A surrogate +! pair encodes one code point in range U+10000 - U+10FFFF and is represented by +! a sequence of 4 bytes in range 0xD800 - 0xDBFF and 0xDC00 - 0xDFFF (2 bytes +! per surrogate). The surrogate pair can appear swapped in a text +! stream or missing one of the surrogates. We can generalize the test for it +! by masking a code unit with 0xF800 and checking if the result is equal to +! 0xD800. +! The conversion process is implemented as follows: +! +! size_t i = 0; +! while (i < n) { +! uint16_t u16 = *(src + i); +! +! // surrogate check +! if ((u16 & 0xF800) == 0xD800) break; +! +! *(dest++) = u16; +! ++i; +! } +! +! 
return i; + +#ifdef AIXPPC +.__encodeUTF16Big: + .function .__encodeUTF16Big,startproc.__encodeUTF16Big,16,0,(endproc.__encodeUTF16Big-startproc.__encodeUTF16Big) + .machine "push" + .machine "pwr7" +#elif defined(LINUXPPC64) +FUNC_LABEL(__encodeUTF16Big): +#else +__encodeUTF16Big: +#endif + +!---------------------------------------------------------------------- +! input: +! r3 = input ptr +! r4 = output ptr +! r5 = num_elements +! output: +! r3 = number_elements_processed +! r4 = input ptr +! clobbered: +! r3 = input ptr +! r4 = output ptr +! r5 = element to process count +! r6 = elements to process in vec loop +! r7 = elements to process in residue loop +! r8 = temp storage area +! r9 = result of masking code points with 0xF800 +! r11 = 0xD800 mask result for half word reads +! vr0 = 0xF800 mask for the vector reads +! vr1 = 0xD800 mask result for vector reads +! vr2 = input half words 0 - 7 +! vr3 = input half words 8 - 15 +! vr4 = temp mask generation reg +! vr5 = half word rotate amount register for endian correction (0008) +! fp0-3 = output registers (using FP registers to perform 16-byte misaligned stores) +! cr0 = all sorts of condition checks + + startproc.__encodeUTF16Big: + +#ifdef STANDALONE + ! Prologue + stdu 1, -112(1) + mflr r0 + std r0, 128(1) + std r4, 160(1) +#else + staddr r4, -ALen(J9SP) ! preserve start of output buffer for number of elements processed calculation +#endif + + cmpi cr0, 0, r5, 0 ! done if no elements to process + beq .L.__done_big + li r11, 0 ! load surrogate masking result + +#if !defined(__LITTLE_ENDIAN__) + ori r11, r11, SURR_MASK_RES +#else + ori r11, r11, SURR_MASK_RES_SWAP +#endif + +.L.__align_loop_big: + andi. r8, r3, 0xF ! bits in 0xf? + beq cr0, .L.__main_big ! 16 byte aligned if clear +#if !defined(__LITTLE_ENDIAN__) + lhz r8, 0(r3) ! load code pt + rlwinm r9, r8, 0, 16, 20 ! mask out equivalent of 0xF800 from code point +#else + lhbrx r8, 0, r3 + rlwinm r9, r8, 0, 24, 28 ! 
mask out equivalent of 0xF800 from code point +#endif + cmpw cr0, r9, r11 ! surrogate if == 0xD800 + beq cr0, .L.__done_big ! skip processing if surrogate + sth r8, 0(r4) ! store hb at dest + addi r4, r4, 2 + addi r3, r3, 2 + addi r5, r5, -1 + cmpi cr0, 0, r5, 0 + beq .L.__done_big ! the count reached zero before alignment + b .L.__align_loop_big +.L.__main_big: + cmpi cr0, 0, r5, 16 ! if we have less than 16 items to process, jump into the alignment residue loop + ble cr0, .L.__alignResidue_big + rlwinm r6, r5, 32-4, 4, 31 ! r6 = n / 16, processed in vector loop + mtctr r6 + rlwinm r7, r5, 0, 28, 31 ! r7 = n % 16, processed in residue loop + + ! set up HB and LB permute masks + vspltisb vr0, -8 ! vr0: F8F8F8F8F8F8F8F8F8F8F8F8F8F8F8F8 + vspltisb vr4, 8 ! vr4: 08080808080808080808080808080808 + vslh vr0, vr0, vr4 ! vr0: F800F800F800F800F800F800F800F800 + li r8, 0xD ! r8: 0xD + lvsl vr1, 0, r8 ! vr1: 0D0E0F101112131415161718191A1B1C + vspltisb vr4, 4 ! vr4: 04040404040404040404040404040404 + vslb vr1, vr1, vr4 ! vr1: D0E0F101112131415161718191A1B1C0 + li r8, 0x8 ! r8: 0x8 + lvsl vr4, 0, r8 ! vr4: 08090A0B0C0D0E0F1011121314151617 + vaddubs vr1, vr1, vr4 ! vr1: D8E9FB0C1D2E3F5061728394A5B6C7D7 + vspltb vr1, vr1, 0 ! vr1: D8D8D8D8D8D8D8D8D8D8D8D8D8D8D8D8 + vspltisb vr4, 8 ! vr4: 08080808080808080808080808080808 + vslh vr1, vr1, vr4 ! vr1: D800D800D800D800D800D800D800D800 + li r8, 16 ! r8: input/output ptr increment + +#if defined(__LITTLE_ENDIAN__) + vspltish vr5, 8 ! vr5: 00080008000800080008000800080008 +#endif + +.L.__vectorLoop_big: + lvx vr2, 0, r3 ! vr2: hw 0 - 7 + lvx vr3, r3, r8 ! vr3: hw 8 - 15 + vand vr4, vr2, vr0 ! vr4: high bytes masked with F8 + vcmpequh_r vr4, vr4, vr1 ! vr4: all 0 if no HB & F8 == D8 + bne cr6, .L.__alignResidue_big ! skip to residue processing if we have a surrogate + vand vr4, vr3, vr0 ! vr4: high bytes masked with F8 + vcmpequh_r vr4, vr4, vr1 ! vr4: all 0 if no HB & F8 == D8 + bne cr6, .L.__alignResidue_big ! 
skip to residue processing if we have a surrogate + +#if defined(__LITTLE_ENDIAN__) + vrlh vr2, vr2, vr5 ! vr2: hw 0 - 7 byte-swapped + vrlh vr3, vr3, vr5 ! vr3: hw 8 - 15 byte-swapped + xxlor vs1, vs34, vs34 ! least significant double word of vr2 + xxpermdi vs0, vs34, vs34, 2 ! most significant double word of vr2 +#else + xxlor vs0, vs34, vs34 ! most significant double word of vr2 + xxpermdi vs1, vs34, vs34, 2 ! least significant double word of vr2 +#endif + stfd fp0, 0(r4) ! hw 0 - 3 stored at the output ptr + stfd fp1, 8(r4) ! hw 4 - 7 stored at the output ptr +#if defined(__LITTLE_ENDIAN__) + xxlor vs3, vs35, vs35 ! least significant double word of vr3 + xxpermdi vs2, vs35, vs35, 2 ! most significant double word of vr3 +#else + xxlor vs2, vs35, vs35 ! most significant double word of vr3 + xxpermdi vs3, vs35, vs35, 2 ! least significant double word of vr3 +#endif + stfd fp2, 16(r4) ! hw 8 - 11 stored at the output ptr + stfd fp3, 24(r4) ! hw 12 - 15 stored at the output ptr + addi r3, r3, 32 ! bump input ptr + addi r4, r4, 32 ! bump output ptr + subi r5, r5, 16 ! decrement the element count + bdnz .L.__vectorLoop_big + cmpi cr0, 0, r5, 0 ! check for work to be done in residue loop + beq cr0, .L.__done_big ! nothing to do, return + +! residue work starts here +.L.__alignResidue_big: + mtctr r5 ! move count of remaining elements into ctr + addi r3, r3, -2 ! dec. input ptr so we use lhzu instruction to load + addi r4, r4, -2 ! ditto for the output ptr. + +.L.__residueLoop_big: +#if !defined(__LITTLE_ENDIAN__) + lhzu r8, 2(r3) ! load utf16 code point and bump src + rlwinm r9, r8, 0, 16, 20 ! repeat test against surrogate mask +#else + addi r3, r3, 2 + lhbrx r8, 0, r3 + rlwinm r9, r8, 0, 24, 28 ! repeat test against surrogate mask +#endif + cmpw cr0, r9, r11 + beq cr0, .L.__doneResidue_big ! stop copying if we have a surrogate + sthu r8, 2(r4) + addi r5, r5, -1 + bdnz .L.__residueLoop_big + +.L.__doneResidue_big: + addi r4, r4, 2 ! inc. 
output ptr so we can use it for calculating number of elements processed + +.L.__done_big: +#ifdef STANDALONE + ld r8, 160(1) +#else + laddr r8, -ALen(J9SP) +#endif + sub r3, r4, r8 ! r3 = output_ptr - output_base_ptr = 2x num elements processed + srwi r3, r3, 1 ! divide diff by 2 to obtain num elements processed + +#ifdef STANDALONE + ! Epilogue + ld 0, 128(1) + mtlr 0 + ld 1, 0(1) +#endif + blr + +#ifdef AIXPPC + .machine "pop" +#endif + endproc.__encodeUTF16Big: + + +!------------------------------------------------------------------------------- +! entry encodeUTF16Little +!------------------------------------------------------------------------------- +! The conversion process copies 2-byte UTF16 characters to the destination +! unless a surrogate pair is encountered, when the process stops. A surrogate +! pair encodes one code point in range U+10000 - U+10FFFF and is represented by +! a sequence of 4 bytes in range 0xD800 - 0xDBFF and 0xDC00 - 0xDFFF (2 bytes +! per surrogate). The surrogate pair can appear swapped in a text +! stream or missing one of the surrogates. We can generalize the test for it +! by masking a code unit with 0xF800 and checking if the result is equal to +! 0xD800. +! The conversion process is implemented as follows: +! +! size_t i = 0; +! while (i < n) { +! uint16_t u16 = *(src + i); +! +! // surrogate check +! if ((u16 & 0xF800) == 0xD800) break; +! +! *(dest++) = u16; +! ++i; +! } +! +! return i; + +#ifdef AIXPPC +.__encodeUTF16Little: + .function .__encodeUTF16Little,startproc.__encodeUTF16Little,16,0,(endproc.__encodeUTF16Little-startproc.__encodeUTF16Little) + .machine "push" + .machine "pwr7" +#elif defined(LINUXPPC64) +FUNC_LABEL(__encodeUTF16Little): +#else +__encodeUTF16Little: +#endif + +!---------------------------------------------------------------------- +! input: +! r3 = input ptr +! r4 = output ptr +! r5 = num_elements +! output: +! r3 = number_elements_processed +! clobbered: +! r3 = output ptr +! 
r4 = input ptr +! r5 = element to process count +! r6 = elements to process in vec loop +! r7 = elements to process in residue loop +! r8 = temp storage area +! r9 = result of masking code points with 0xF800 +! r11 = 0xD800 mask result for half word reads +! vr0 = 0xF800 mask for the vector reads +! vr1 = 0xD800 mask result for vector reads +! vr2 = input half words 0 - 7 +! vr3 = input half words 8 - 15 +! vr4 = temp mask generation reg +! vr5 = half word rotate amount register for endian correction (0008) +! fp0-3 = output registers (using FP registers to perform 16-byte misaligned stores) +! cr0 = all sorts of condition checks + + startproc.__encodeUTF16Little: + +#ifdef STANDALONE + ! Prologue + stdu 1, -112(1) + mflr r0 + std r0, 128(1) + std r4, 160(1) +#else + staddr r4, -ALen(J9SP) ! preserve start of output buffer for number of elements processed calculation +#endif + + cmpi cr0, 0, r5, 0 ! done if no elements to process + beq .L.__done + li r11, 0 ! load surrogate masking result + +#if defined(__LITTLE_ENDIAN__) + ori r11, r11, SURR_MASK_RES +#else + ori r11, r11, SURR_MASK_RES_SWAP +#endif + +.L.__align_loop: + andi. r8, r3, 0xF ! bits in 0xf? + beq cr0, .L.__main ! 16 byte aligned if clear + +#if defined(__LITTLE_ENDIAN__) + lhz r8, 0(r3) ! load code pt + rlwinm r9, r8, 0, 16, 20 ! mask out equivalent of 0xF800 from code point +#else + lhbrx r8, r0, r3 + rlwinm r9, r8, 0, 24, 28 ! mask out equivalent of 0xF8 from code point +#endif + + cmpw cr0, r9, r11 ! surrogate if == 0xD8 + beq cr0, .L.__done ! skip processing if surrogate + sth r8, 0(r4) ! store hb at dest + addi r3, r3, 2 + addi r4, r4, 2 + addi r5, r5, -1 + cmpi cr0, 0, r5, 0 + beq .L.__done ! the count reached zero before alignment + b .L.__align_loop +.L.__main: + cmpi cr0, 0, r5, 16 ! if we have less than 16 items to process, jump into the alignment residue loop + ble cr0, .L.__alignResidue + rlwinm r6, r5, 32-4, 4, 31 ! 
r6 = n / 16, processed in vector loop + mtctr r6 + rlwinm r7, r5, 0, 28, 31 ! r7 = n % 16, processed in residue loop + ! set up HB and LB permute masks + vspltisb vr0, -8 ! vr0: F8F8F8F8F8F8F8F8F8F8F8F8F8F8F8F8 + vspltisb vr4, 8 ! vr4: 08080808080808080808080808080808 + vslh vr0, vr0, vr4 ! vr0: F800F800F800F800F800F800F800F800 + li r8, 0xD ! r8: 0xD + lvsl vr1, 0, r8 ! vr1: 0D0E0F101112131415161718191A1B1C + vspltisb vr4, 4 ! vr4: 04040404040404040404040404040404 + vslb vr1, vr1, vr4 ! vr1: D0E0F101112131415161718191A1B1C0 + li r8, 0x8 ! r8: 0x8 + lvsl vr4, 0, r8 ! vr4: 08090A0B0C0D0E0F1011121314151617 + vaddubs vr1, vr1, vr4 ! vr1: D8E9FB0C1D2E3F5061728394A5B6C7D7 + vspltb vr1, vr1, 0 ! vr1: D8D8D8D8D8D8D8D8D8D8D8D8D8D8D8D8 + vspltisb vr4, 8 ! vr4: 08080808080808080808080808080808 + vslh vr1, vr1, vr4 ! vr1: D800D800D800D800D800D800D800D800 + +#if !defined(__LITTLE_ENDIAN__) + vspltish vr5, 8 ! vr5: 00080008000800080008000800080008 +#endif + + li r8, 16 ! r8: input/output ptr increment +.L.__vectorLoop: + lvx vr2, 0, r3 ! vr2: hw 0 - 7 + lvx vr3, r3, r8 ! vr3: hw 8 - 15 + vand vr4, vr2, vr0 ! vr4: high bytes masked with F8 + vcmpequh_r vr4, vr4, vr1 ! vr4: all 0 if no HB & F8 == D8 + bne cr6, .L.__alignResidue ! skip to residue processing if we have a surrogate + vand vr4, vr3, vr0 ! vr4: high bytes masked with F8 + vcmpequh_r vr4, vr4, vr1 ! vr4: all 0 if no HB & F8 == D8 + bne cr6, .L.__alignResidue ! skip to residue processing if we have a surrogate + +#if !defined(__LITTLE_ENDIAN__) + vrlh vr2, vr2, vr5 ! vr2: hw 0 - 7 byte-swapped + vrlh vr3, vr3, vr5 ! vr3: hw 8 - 15 byte-swapped + xxlor vs1, vs34, vs34 ! MSDW of vr2 + xxpermdi vs0, vs34, vs34, 2 +#else + xxlor vs0, vs34, vs34 ! LSDW of vr2 + xxpermdi vs1, vs34, vs34, 2 +#endif + stfd fp0, 0(r4) ! hw 0 - 7 stored at the output ptr + stfd fp1, 8(r4) ! hw 8 - 15 stored at the output ptr + +#if !defined(__LITTLE_ENDIAN__) + xxlor vs3, vs35, vs35 ! MSDW of vr3 + xxpermdi vs2, vs35, vs35, 2 ! 
LSDW of vr3 +#else + xxlor vs2, vs35, vs35 ! LSDW of vr3 + xxpermdi vs3, vs35, vs35, 2 ! MSDW of vr3 +#endif + stfd fp2, 16(r4) ! 8 bytes stored at output ptr + 16 + stfd fp3, 24(r4) ! 8 bytes stored at output ptr + 24 + + addi r3, r3, 32 ! bump input ptr + addi r4, r4, 32 ! bump output ptr + subi r5, r5, 16 ! decrement the element count + bdnz .L.__vectorLoop + cmpi cr0, 0, r5, 0 ! check for work to be done in residue loop + beq cr0, .L.__done ! nothing to do, return + +! residue work starts here +.L.__alignResidue: + mtctr r5 ! move count of remaining elements into ctr + addi r3, r3, -2 ! dec. input ptr so we use lhzu instruction to load + addi r4, r4, -2 ! ditto for the output ptr. + +.L.__residueLoop: +#if defined(__LITTLE_ENDIAN__) + lhzu r8, 2(r3) ! load utf16 code point and bump src + rlwinm r9, r8, 0, 16, 20 ! repeat test against surrogate mask +#else + addi r3, r3, 2 + lhbrx r8, r0, r3 + rlwinm r9, r8, 0, 24, 28 ! repeat test against surrogate mask +#endif + + cmpw cr0, r9, r11 + beq cr0, .L.__doneResidue ! stop copying if we have a surrogate + sthu r8, 2(r4) ! store lw at dest + addi r5, r5, -1 + bdnz .L.__residueLoop + +.L.__doneResidue: + addi r4, r4, 2 ! inc. output ptr so we can use it for calculating number of elements processed + +.L.__done: +#ifdef STANDALONE + ld r8, 160(1) +#else + laddr r8, -ALen(J9SP) +#endif + sub r3, r4, r8 ! r3 = output_ptr - output_base_ptr = 2x num elements processed + srwi r3, r3, 1 ! divide diff by 2 to obtain num elements processed + +#ifdef STANDALONE + ! Epilogue + ld 0, 128(1) + mtlr 0 + ld 1, 0(1) +#endif + blr + +#ifdef AIXPPC + .machine "pop" +#endif + endproc.__encodeUTF16Little: + +! .data section +#ifdef AIXPPC + .toc + .csect __encodeUTF16Big{DS} + ADDR .__encodeUTF16Big + ADDR TOC{TC0} + ADDR 0x00000000 +! End csect __encodeUTF16Big{DS} + + .csect __encodeUTF16Little{DS} + ADDR .__encodeUTF16Little + ADDR TOC{TC0} + ADDR 0x00000000 +! 
End csect __encodeUTF16Little{DS} + +#elif defined(LINUXPPC64) + .section ".toc" +#if !defined(__LITTLE_ENDIAN__) + .section ".opd","aw" + .align 3 + .globl __encodeUTF16Big + .size __encodeUTF16Big,24 +__encodeUTF16Big: + .quad .__encodeUTF16Big + .quad .TOC.@tocbase + .long 0x00000000 + .long 0x00000000 + .globl __encodeUTF16Little + .size __encodeUTF16Little,24 +__encodeUTF16Little: + .quad .__encodeUTF16Little + .quad .TOC.@tocbase + .long 0x00000000 + .long 0x00000000 +#endif +#endif diff --git a/runtime/compiler/runtime/Runtime.cpp b/runtime/compiler/runtime/Runtime.cpp index cfd735467fc..af74dbde33b 100644 --- a/runtime/compiler/runtime/Runtime.cpp +++ b/runtime/compiler/runtime/Runtime.cpp @@ -342,6 +342,9 @@ JIT_HELPER(icallVMprJavaSendVirtualL); JIT_HELPER(icallVMprJavaSendVirtualF); JIT_HELPER(icallVMprJavaSendVirtualD); +JIT_HELPER(encodeUTF16Big); +JIT_HELPER(encodeUTF16Little); + #ifdef J9VM_OPT_JAVA_CRYPTO_ACCELERATION JIT_HELPER(doAESENCEncrypt); JIT_HELPER(doAESENCDecrypt); @@ -366,6 +369,9 @@ JIT_HELPER(SSEfloatRemainderIA32Thunk); JIT_HELPER(SSEdoubleRemainderIA32Thunk); JIT_HELPER(SSEdouble2LongIA32); +JIT_HELPER(encodeUTF16Big); +JIT_HELPER(encodeUTF16Little); + JIT_HELPER(SMPVPicInit); #endif /* TR_HOST_64BIT */ @@ -432,6 +438,8 @@ JIT_HELPER(__forwardWordArrayCopy_dp); JIT_HELPER(__forwardHalfWordArrayCopy_dp); JIT_HELPER(__referenceArrayCopy); JIT_HELPER(__generalArrayCopy); +JIT_HELPER(__encodeUTF16Big); +JIT_HELPER(__encodeUTF16Little); JIT_HELPER(__quadWordArrayCopy_vsx); JIT_HELPER(__forwardQuadWordArrayCopy_vsx); @@ -1217,6 +1225,8 @@ void initializeCodeRuntimeHelperTable(J9JITConfig *jitConfig, char isSMP) SET(TR_AMD64arrayTranslateTRTO, (void *)arrayTranslateTRTO, TR_Helper); SET(TR_AMD64arrayTranslateTROTNoBreak, (void *)arrayTranslateTROTNoBreak, TR_Helper); SET(TR_AMD64arrayTranslateTROT, (void *)arrayTranslateTROT, TR_Helper); + SET(TR_AMD64encodeUTF16Big, (void *)encodeUTF16Big, TR_Helper); + SET(TR_AMD64encodeUTF16Little, (void 
*)encodeUTF16Little, TR_Helper); #ifdef J9VM_OPT_JAVA_CRYPTO_ACCELERATION SET(TR_AMD64doAESENCEncrypt, (void *)doAESENCEncrypt, TR_Helper); SET(TR_AMD64doAESENCDecrypt, (void *)doAESENCDecrypt, TR_Helper); @@ -1263,6 +1273,8 @@ void initializeCodeRuntimeHelperTable(J9JITConfig *jitConfig, char isSMP) SET(TR_IA32arrayTranslateTRTO, (void *)arrayTranslateTRTO, TR_Helper); SET(TR_IA32arrayTranslateTROTNoBreak, (void *)arrayTranslateTROTNoBreak, TR_Helper); SET(TR_IA32arrayTranslateTROT, (void *)arrayTranslateTROT, TR_Helper); + SET(TR_IA32encodeUTF16Big, (void *)encodeUTF16Big, TR_Helper); + SET(TR_IA32encodeUTF16Little, (void *)encodeUTF16Little, TR_Helper); SET(TR_jitAddPicToPatchOnClassUnload, (void *)jitAddPicToPatchOnClassUnload, TR_Helper); @@ -1378,6 +1390,8 @@ void initializeCodeRuntimeHelperTable(J9JITConfig *jitConfig, char isSMP) SET(TR_PPCarrayTranslateTRTO255, (void *) __arrayTranslateTRTO255, TR_Helper); SET(TR_PPCarrayTranslateTROT255, (void *) __arrayTranslateTROT255, TR_Helper); SET(TR_PPCarrayTranslateTROT, (void *) __arrayTranslateTROT, TR_Helper); + SET(TR_PPCencodeUTF16Big, (void *) __encodeUTF16Big, TR_Helper); + SET(TR_PPCencodeUTF16Little, (void *) __encodeUTF16Little, TR_Helper); #elif defined(TR_HOST_ARM) SET(TR_ARMdouble2Long, (void *) __double2Long, TR_Helper); diff --git a/runtime/compiler/x/codegen/J9TreeEvaluator.cpp b/runtime/compiler/x/codegen/J9TreeEvaluator.cpp index f71fe81d73f..be75ccb0a48 100644 --- a/runtime/compiler/x/codegen/J9TreeEvaluator.cpp +++ b/runtime/compiler/x/codegen/J9TreeEvaluator.cpp @@ -11614,6 +11614,10 @@ J9::X86::TreeEvaluator::directCallEvaluator(TR::Node *node, TR::CodeGenerator *c return inlineIntrinsicIndexOf(node, cg, false); break; + case TR::com_ibm_jit_JITHelpers_transformedEncodeUTF16Big: + case TR::com_ibm_jit_JITHelpers_transformedEncodeUTF16Little: + return TR::TreeEvaluator::encodeUTF16Evaluator(node, cg); + case TR::java_lang_String_hashCodeImplDecompressed: if 
(cg->getSupportsInlineStringHashCode()) returnRegister = inlineStringHashCode(node, false, cg); @@ -11927,6 +11931,85 @@ J9::X86::TreeEvaluator::inlineStringLatin1Inflate(TR::Node *node, TR::CodeGenera return NULL; } +TR::Register * +J9::X86::TreeEvaluator::encodeUTF16Evaluator(TR::Node *node, TR::CodeGenerator *cg) + { + // tree looks like: + // icall com.ibm.jit.JITHelpers.encodeUTF16{Big,Little}() + // input ptr + // output ptr + // input length (in elements) + // Number of elements translated is returned + + TR::MethodSymbol *symbol = node->getSymbol()->castToMethodSymbol(); + bool bigEndian = symbol->getRecognizedMethod() == TR::com_ibm_jit_JITHelpers_transformedEncodeUTF16Big; + + // Set up register dependencies + const int gprClobberCount = 2; + const int maxFprClobberCount = 5; + const int fprClobberCount = bigEndian ? 5 : 4; // xmm4 only needed for big-endian + TR::Register *srcPtrReg, *dstPtrReg, *lengthReg, *resultReg; + TR::Register *gprClobbers[gprClobberCount], *fprClobbers[maxFprClobberCount]; + bool killSrc = TR::TreeEvaluator::stopUsingCopyRegAddr(node->getChild(0), srcPtrReg, cg); + bool killDst = TR::TreeEvaluator::stopUsingCopyRegAddr(node->getChild(1), dstPtrReg, cg); + bool killLen = TR::TreeEvaluator::stopUsingCopyRegInteger(node->getChild(2), lengthReg, cg); + resultReg = cg->allocateRegister(); + for (int i = 0; i < gprClobberCount; i++) + gprClobbers[i] = cg->allocateRegister(); + for (int i = 0; i < fprClobberCount; i++) + fprClobbers[i] = cg->allocateRegister(TR_FPR); + + int depCount = 11; + TR::RegisterDependencyConditions *deps = + generateRegisterDependencyConditions((uint8_t)0, depCount, cg); + + deps->addPostCondition(srcPtrReg, TR::RealRegister::esi, cg); + deps->addPostCondition(dstPtrReg, TR::RealRegister::edi, cg); + deps->addPostCondition(lengthReg, TR::RealRegister::edx, cg); + deps->addPostCondition(resultReg, TR::RealRegister::eax, cg); + + deps->addPostCondition(gprClobbers[0], TR::RealRegister::ecx, cg); + 
deps->addPostCondition(gprClobbers[1], TR::RealRegister::ebx, cg); + + deps->addPostCondition(fprClobbers[0], TR::RealRegister::xmm0, cg); + deps->addPostCondition(fprClobbers[1], TR::RealRegister::xmm1, cg); + deps->addPostCondition(fprClobbers[2], TR::RealRegister::xmm2, cg); + deps->addPostCondition(fprClobbers[3], TR::RealRegister::xmm3, cg); + if (bigEndian) + deps->addPostCondition(fprClobbers[4], TR::RealRegister::xmm4, cg); + + deps->stopAddingConditions(); + + // Generate helper call + TR_RuntimeHelper helper; + if (cg->comp()->target().is64Bit()) + helper = bigEndian ? TR_AMD64encodeUTF16Big : TR_AMD64encodeUTF16Little; + else + helper = bigEndian ? TR_IA32encodeUTF16Big : TR_IA32encodeUTF16Little; + + generateHelperCallInstruction(node, helper, deps, cg); + + // Free up registers + for (int i = 0; i < gprClobberCount; i++) + cg->stopUsingRegister(gprClobbers[i]); + for (int i = 0; i < fprClobberCount; i++) + cg->stopUsingRegister(fprClobbers[i]); + + for (uint16_t i = 0; i < node->getNumChildren(); i++) + cg->decReferenceCount(node->getChild(i)); + + TR_LiveRegisters *liveRegs = cg->getLiveRegisters(TR_GPR); + if (killSrc) + liveRegs->registerIsDead(srcPtrReg); + if (killDst) + liveRegs->registerIsDead(dstPtrReg); + if (killLen) + liveRegs->registerIsDead(lengthReg); + + node->setRegister(resultReg); + return resultReg; + } + /* * The CaseConversionManager is used to store info about the conversion. 
It defines the lower bound and upper bound value depending on diff --git a/runtime/compiler/x/codegen/J9TreeEvaluator.hpp b/runtime/compiler/x/codegen/J9TreeEvaluator.hpp index 7f71c2e9f79..ee7bccb07f1 100644 --- a/runtime/compiler/x/codegen/J9TreeEvaluator.hpp +++ b/runtime/compiler/x/codegen/J9TreeEvaluator.hpp @@ -130,6 +130,7 @@ class OMR_EXTENSIBLE TreeEvaluator: public J9::TreeEvaluator */ static void generateFillInDataBlockSequenceForUnresolvedField (TR::CodeGenerator *cg, TR::Node *node, TR::Snippet *dataSnippet, bool isWrite, TR::Register *sideEffectRegister, TR::Register *dataSnippetRegister); static TR::Register *directCallEvaluator(TR::Node *node, TR::CodeGenerator *cg); + static TR::Register *encodeUTF16Evaluator(TR::Node *node, TR::CodeGenerator *cg); static TR::Register *toUpperIntrinsicUTF16Evaluator(TR::Node *node, TR::CodeGenerator *cg); static TR::Register *toLowerIntrinsicUTF16Evaluator(TR::Node *node, TR::CodeGenerator *cg); static TR::Register *toUpperIntrinsicLatin1Evaluator(TR::Node *node, TR::CodeGenerator *cg); diff --git a/runtime/compiler/x/runtime/.gitignore b/runtime/compiler/x/runtime/.gitignore index a0a30d216f4..62fa885b13a 100644 --- a/runtime/compiler/x/runtime/.gitignore +++ b/runtime/compiler/x/runtime/.gitignore @@ -25,3 +25,4 @@ /X86Unresolveds.s /X86Unresolveds.asm /X86Crypto.s +/X86EncodeUTF16.s diff --git a/runtime/compiler/x/runtime/CMakeLists.txt b/runtime/compiler/x/runtime/CMakeLists.txt index 295618c213e..a80b59adefe 100644 --- a/runtime/compiler/x/runtime/CMakeLists.txt +++ b/runtime/compiler/x/runtime/CMakeLists.txt @@ -25,6 +25,7 @@ j9jit_files( x/runtime/Recomp.cpp x/runtime/X86ArrayTranslate.nasm x/runtime/X86Codert.nasm + x/runtime/X86EncodeUTF16.nasm x/runtime/X86LockReservation.nasm x/runtime/X86PicBuilder.nasm x/runtime/X86RelocationTarget.cpp diff --git a/runtime/compiler/x/runtime/X86EncodeUTF16.nasm b/runtime/compiler/x/runtime/X86EncodeUTF16.nasm new file mode 100644 index 00000000000..8095f5faf67 --- 
/dev/null
+++ b/runtime/compiler/x/runtime/X86EncodeUTF16.nasm
@@ -0,0 +1,185 @@
+; Copyright IBM Corp. and others 2014
+;
+; This program and the accompanying materials are made available under
+; the terms of the Eclipse Public License 2.0 which accompanies this
+; distribution and is available at https://www.eclipse.org/legal/epl-2.0/
+; or the Apache License, Version 2.0 which accompanies this distribution and
+; is available at https://www.apache.org/licenses/LICENSE-2.0.
+;
+; This Source Code may also be made available under the following
+; Secondary Licenses when the conditions for such availability set
+; forth in the Eclipse Public License, v. 2.0 are satisfied: GNU
+; General Public License, version 2 with the GNU Classpath
+; Exception [1] and GNU General Public License, version 2 with the
+; OpenJDK Assembly Exception [2].
+;
+; [1] https://www.gnu.org/software/classpath/license.html
+; [2] https://openjdk.org/legal/assembly-exception.html
+;
+; SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0-only WITH Classpath-exception-2.0 OR GPL-2.0-only WITH OpenJDK-assembly-exception-1.0
+
+    SURROGATE_MASK   equ 0f800h
+    SURROGATE_MASK32 equ 0f800f800h
+    SURROGATE_BITS   equ 0d800h
+    SURROGATE_BITS32 equ 0d800d800h
+    SSE_MIN_CHARS    equ 32
+
+%include "jilconsts.inc"
+
+segment .text
+
+    DECLARE_GLOBAL encodeUTF16Big
+    DECLARE_GLOBAL encodeUTF16Little
+
+    align 16
+encodeUTF16Big_shufmask:
+    dq 0607040502030001h
+    dq 0e0f0c0d0a0b0809h
+
+%macro DefineUTF16EncodeHelper 2 ; args: helperName, bigEndian
+; UTF16 encoding for BMP characters; stops at the first surrogate code unit
+; pseudocode(uint8_t *dest, uint16_t *src, int n):
+; {
+;     for (int i = 0; i < n; i++)
+;     {
+;         uint16_t c = src[i];
+;         if ((c & SURROGATE_MASK) == SURROGATE_BITS) break;
+; #if bigEndian
+;         *dest++ = (uint8_t)(c >> 8);
+;         *dest++ = (uint8_t)(c & 0xff);
+; #else
+;         *dest++ = (uint8_t)(c & 0xff);
+;         *dest++ = (uint8_t)(c >> 8);
+; #endif
+;     }
+;     return i;
+; }
+
+; NB. c is a surrogate code unit
+; iff SURROGATE_MIN = 0xd800 <= c <= 0xdfff = SURROGATE_MAX
+; iff (c & SURROGATE_MASK) == SURROGATE_BITS,
+; where SURROGATE_MASK = 0xf800, SURROGATE_BITS = 0xd800
+
+; registers:
+; _rdi   dest ptr (into byte array)
+; _rsi   src ptr (into char array)
+; _rdx   n
+; [_r]cx c (one-at-a-time); tmp when using SSE
+; bx     c & SURROGATE_MASK (one-at-a-time)
+; _rax   original n / return value
+; xmm0   constant SURROGATE_MASK vector (0xf800..f800)
+; xmm1   constant SURROGATE_BITS vector (0xd800..d800)
+; xmm2   current 8 characters (8-at-a-time)
+; xmm3   surrogate bitmask
+; xmm4   byte shuffle mask (big-endian only)
+
+    align 16
+%1: ; helperName
+    ; Remember the original count in _rax; the count still remaining in _rdx
+    ; is subtracted from it at Lend to give the number of chars converted
+    mov _rax, _rdx
+    cmp _rdx, 0
+    je Lend_%1 ; helperName
+    sub _rdi, _rsi ; make _rdi the offset of dest from src, so that only _rsi has to advance
+    cmp _rdx, SSE_MIN_CHARS
+    jl Lresidue_loop_%1 ; helperName
+
+Lprealign_%1: ; helperName
+    test _rsi, 0fh
+    jz Laligned16_%1 ; helperName
+
+    mov cx, word [_rsi]
+
+    ; return if surrogate
+    mov bx, cx
+    and bx, SURROGATE_MASK
+    cmp bx, SURROGATE_BITS
+    je Lend_%1 ; helperName
+
+    ; not surrogate
+%if %2 ; bigEndian
+    xchg cl, ch
+%endif
+    mov word [_rsi + _rdi], cx
+    add _rsi, 2
+    dec _rdx
+    jg Lprealign_%1 ; helperName
+    jmp Lend_%1 ; helperName
+
+Laligned16_%1: ; helperName
+    sub _rdx, 8
+    jl Lresidue_%1 ; helperName
+
+    ; initialize constant vectors:
+    ; SURROGATE_MASK
+    mov ecx, SURROGATE_MASK32
+    movd xmm0, ecx
+    pshufd xmm0, xmm0, 0
+
+    ; SURROGATE_BITS
+    mov ecx, SURROGATE_BITS32
+    movd xmm1, ecx
+    pshufd xmm1, xmm1, 0
+
+%if %2 ; bigEndian
+    ; shuffle mask for PSHUFB (swaps the two bytes of every 16-bit char)
+    movdqa xmm4, oword [rel encodeUTF16Big_shufmask]
+%endif
+
+L8_at_a_time_%1: ; helperName
+    ; read 8 chars (_rsi is 16-byte aligned here, as checked at Lprealign)
+    ; TODO: should this use movdqu, and start once 8-byte aligned?
+    movdqa xmm2, oword [_rsi]
+
+    ; leave the rest to the one-at-a-time residue loop if any of the 8 chars is a surrogate
+    movdqa xmm3, xmm2
+    pand xmm3, xmm0
+    pcmpeqw xmm3, xmm1
+    ptest xmm3, xmm3 ; SSE4.1
+    jnz Lresidue_%1 ; helperName
+
+    ; no surrogates
+%if %2 ; bigEndian
+    pshufb xmm2, xmm4 ; SSSE3
+%endif
+
+    ; write 8 chars
+    movdqu oword [_rsi + _rdi], xmm2
+
+    add _rsi, 16
+    sub _rdx, 8
+    jge L8_at_a_time_%1 ; helperName
+
+Lresidue_%1: ; helperName
+    add _rdx, 8
+    cmp _rdx, 0
+    je Lend_%1 ; helperName
+
+Lresidue_loop_%1: ; helperName
+    mov cx, word [_rsi]
+
+    ; return if surrogate
+    mov bx, cx
+    and bx, SURROGATE_MASK
+    cmp bx, SURROGATE_BITS
+    je Lend_%1 ; helperName
+
+    ; not surrogate
+%if %2 ; bigEndian
+    xchg cl, ch
+%endif
+    mov word [_rsi + _rdi], cx
+    add _rsi, 2
+    dec _rdx
+    jg Lresidue_loop_%1 ; helperName
+
+Lend_%1: ; helperName
+    sub _rax, _rdx
+    ret
+
+%endmacro
+
+; Expand the macro into the two helpers
+
+DefineUTF16EncodeHelper encodeUTF16Big, 1
+DefineUTF16EncodeHelper encodeUTF16Little, 0
diff --git a/runtime/compiler/z/codegen/J9CodeGenerator.cpp b/runtime/compiler/z/codegen/J9CodeGenerator.cpp
index 8fb71450f0e..a7921c12a62 100644
--- a/runtime/compiler/z/codegen/J9CodeGenerator.cpp
+++ b/runtime/compiler/z/codegen/J9CodeGenerator.cpp
@@ -4045,6 +4045,10 @@ J9::Z::CodeGenerator::inlineDirectCall(
                return true;
                }
             break;
+         case TR::com_ibm_jit_JITHelpers_transformedEncodeUTF16Big:
+            return resultReg = comp->getOption(TR_DisableUTF16BEEncoder) ?
TR::TreeEvaluator::inlineUTF16BEEncodeSIMD(node, cg) + : TR::TreeEvaluator::inlineUTF16BEEncode (node, cg); + break; case TR::java_lang_Integer_stringSize: case TR::java_lang_Long_stringSize: if (cg->getSupportsIntegerStringSize()) diff --git a/runtime/compiler/z/codegen/J9TreeEvaluator.cpp b/runtime/compiler/z/codegen/J9TreeEvaluator.cpp index faabbcca960..1fbf9caaad8 100644 --- a/runtime/compiler/z/codegen/J9TreeEvaluator.cpp +++ b/runtime/compiler/z/codegen/J9TreeEvaluator.cpp @@ -1993,6 +1993,152 @@ J9::Z::TreeEvaluator::inlineIntrinsicIndexOf(TR::Node * node, TR::CodeGenerator return indexRegister; } +TR::Register* +J9::Z::TreeEvaluator::inlineUTF16BEEncode(TR::Node *node, TR::CodeGenerator *cg) + { + TR::Compilation* comp = cg->comp(); + + // Create the necessary registers + TR::Register* output = cg->gprClobberEvaluate(node->getChild(1)); + TR::Register* input = cg->gprClobberEvaluate(node->getChild(0)); + + TR::Register* inputLen = cg->gprClobberEvaluate(node->getChild(2)); + TR::Register* inputLen8 = cg->allocateRegister(); + + TR::Register* temp1 = cg->allocateRegister(); + TR::Register* temp2 = cg->allocateRegister(); + + // Number of bytes currently translated (also used as a stride register) + TR::Register* translated = cg->allocateRegister(); + + // Convert input length in number of characters to number of bytes + generateRSInstruction(cg, TR::InstOpCode::getShiftLeftLogicalSingleOpCode(), node, inputLen, inputLen, 1); + + // Calculate inputLen8 = inputLen / 8 + generateRSInstruction(cg, TR::InstOpCode::SRLK, node, inputLen8, inputLen, 3); + + // Initialize the number of translated bytes to 0 + generateRREInstruction(cg, TR::InstOpCode::getXORRegOpCode(), node, translated, translated); + + // Create the necessary labels + TR::LabelSymbol * processChar4 = generateLabelSymbol( cg); + TR::LabelSymbol * processChar4End = generateLabelSymbol( cg); + TR::LabelSymbol * processChar1 = generateLabelSymbol( cg); + TR::LabelSymbol * processChar1End = 
generateLabelSymbol( cg); + TR::LabelSymbol * processChar1Copy = generateLabelSymbol( cg); + + const uint16_t surrogateRange1 = 0xD800; + const uint16_t surrogateRange2 = 0xDFFF; + + const uint32_t surrogateMaskAND = 0xF800F800; + const uint32_t surrogateMaskXOR = 0xD800D800; + + TR::RegisterDependencyConditions* dependencies = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 7, cg); + + // ----------------- Incoming branch ----------------- + + generateS390LabelInstruction(cg, TR::InstOpCode::label, node, processChar4); + processChar4->setStartInternalControlFlow(); + + // Branch to the end if there are no more multiples of 4 chars left to process + generateS390CompareAndBranchInstruction(cg, TR::InstOpCode::getCmpLogicalOpCode(), node, inputLen8, 0, TR::InstOpCode::COND_MASK8, processChar4End, false, false, NULL, dependencies); + + // Load 4 input characters from memory and make a copy + generateRXInstruction(cg, TR::InstOpCode::LG, node, temp1, generateS390MemoryReference(input, translated, 0, cg)); + generateRREInstruction(cg, TR::InstOpCode::LGR, node, temp2, temp1); + + // AND temp2 by the surrogate mask + generateRILInstruction(cg, TR::InstOpCode::NIHF, node, temp2, surrogateMaskAND); + generateRILInstruction(cg, TR::InstOpCode::NILF, node, temp2, surrogateMaskAND); + + // XOR temp2 by the surrogate mask and branch if CC = 1 (meaning there is a surrogate) + generateRILInstruction(cg, TR::InstOpCode::XIHF, node, temp2, surrogateMaskXOR); + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_CC1, node, processChar4End); + generateRILInstruction(cg, TR::InstOpCode::XILF, node, temp2, surrogateMaskXOR); + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_CC1, node, processChar4End); + + generateRXInstruction(cg, TR::InstOpCode::STG, node, temp1, generateS390MemoryReference(output, translated, 0, cg)); + + // Advance the number of bytes processed + generateRIInstruction(cg, 
TR::InstOpCode::getAddHalfWordImmOpCode(), node, translated, 8); + + // Branch back to the start of the loop + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK15, node, processChar4); + + // ----------------- Incoming branch ----------------- + + generateS390LabelInstruction(cg, TR::InstOpCode::label, node, processChar4End); + processChar4End->setEndInternalControlFlow(); + generateS390LabelInstruction(cg, TR::InstOpCode::label, node, processChar1); + processChar1->setStartInternalControlFlow(); + + // Branch to the end if there are no more characters left to process + generateS390CompareAndBranchInstruction(cg, TR::InstOpCode::getCmpRegOpCode(), node, translated, inputLen, TR::InstOpCode::COND_BNL, processChar1End, false, false); + + // Load an input character from memory + generateRXInstruction(cg, TR::InstOpCode::LLH, node, temp1, generateS390MemoryReference(input, translated, 0, cg)); + + // Compare the input character against the lower bound surrogate character range + generateRILInstruction(cg, TR::InstOpCode::getCmpImmOpCode(), node, temp1, surrogateRange1); + + // Branch if < (non-surrogate char) + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK4, node, processChar1Copy); + + // Compare the input character against the upper bound surrogate character range + generateRILInstruction(cg, TR::InstOpCode::getCmpImmOpCode(), node, temp1, surrogateRange2); + + // Branch if > (non-surrogate char) + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK2, node, processChar1Copy); + + // If we get here it must be a surrogate char + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK15, node, processChar1End); + + // ----------------- Incoming branch ----------------- + + generateS390LabelInstruction(cg, TR::InstOpCode::label, node, processChar1Copy); + + // Store the lower byte of the character into the output buffer + generateRXInstruction (cg, 
TR::InstOpCode::STH, node, temp1, generateS390MemoryReference(output, translated, 0, cg)); + + // Advance the number of bytes processed + generateRIInstruction(cg, TR::InstOpCode::getAddHalfWordImmOpCode(), node, translated, 2); + + // Branch back to the start of the loop + generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_MASK15, node, processChar1); + + // Set up the proper register dependencies + dependencies->addPostCondition(input, TR::RealRegister::AssignAny); + dependencies->addPostCondition(inputLen, TR::RealRegister::AssignAny); + dependencies->addPostCondition(inputLen8, TR::RealRegister::AssignAny); + dependencies->addPostCondition(temp1, TR::RealRegister::AssignAny); + dependencies->addPostCondition(temp2, TR::RealRegister::AssignAny); + dependencies->addPostCondition(output, TR::RealRegister::AssignAny); + dependencies->addPostCondition(translated, TR::RealRegister::AssignAny); + + // ----------------- Incoming branch ----------------- + + generateS390LabelInstruction(cg, TR::InstOpCode::label, node, processChar1End, dependencies); + processChar1End->setEndInternalControlFlow(); + + // Convert translated length in number of bytes to number of characters + generateRSInstruction(cg, TR::InstOpCode::getShiftRightLogicalSingleOpCode(), node, translated, translated, 1); + + // Cleanup nodes before returning + cg->decReferenceCount(node->getChild(0)); + cg->decReferenceCount(node->getChild(1)); + cg->decReferenceCount(node->getChild(2)); + + // Cleanup registers before returning + cg->stopUsingRegister(input); + cg->stopUsingRegister(inputLen); + cg->stopUsingRegister(inputLen8); + cg->stopUsingRegister(temp1); + cg->stopUsingRegister(temp2); + cg->stopUsingRegister(output); + + return node->setRegister(translated); + } + /** * \brief Generate inline assembly for CRC32C.updateBytes and CRC32C.updateDirectByteBuffer * \details @@ -2445,6 +2591,215 @@ J9::Z::TreeEvaluator::inlineCRC32CUpdateBytes(TR::Node *node, TR::CodeGenerator 
return crc;
    }
 
+TR::Register*
+J9::Z::TreeEvaluator::inlineUTF16BEEncodeSIMD(TR::Node *node, TR::CodeGenerator *cg)
+   {
+   TR::Compilation* comp = cg->comp();
+
+   // Create the necessary registers (child 0 is the input address, child 1 the output address)
+   TR::Register* output = cg->gprClobberEvaluate(node->getChild(1));
+   TR::Register* input = cg->gprClobberEvaluate(node->getChild(0));
+
+   TR::Register* inputLen;
+   TR::Register* inputLen16 = cg->allocateRegister();
+   TR::Register* inputLenMinus1 = inputLen16;
+
+   // Number of bytes currently translated (also used as a stride register)
+   TR::Register* translated = cg->allocateRegister();
+
+   // Initialize the number of translated bytes to 0
+   generateRREInstruction(cg, TR::InstOpCode::getXORRegOpCode(), node, translated, translated);
+
+   TR::Node* inputLenNode = node->getChild(2);
+
+   // Optimize the constant length case
+   bool isLenConstant = inputLenNode->getOpCode().isLoadConst() && performTransformation(comp, "O^O [%p] Reduce input length to constant.\n", inputLenNode);
+
+   if (isLenConstant)
+      {
+      inputLen = cg->allocateRegister();
+
+      // Convert input length in number of characters to number of bytes
+      generateLoad32BitConstant(cg, inputLenNode, ((getIntegralValue(inputLenNode) * 2)), inputLen, true);
+      generateLoad32BitConstant(cg, inputLenNode, ((getIntegralValue(inputLenNode) * 2) >> 4) << 4, inputLen16, true);
+      }
+   else
+      {
+      inputLen = cg->gprClobberEvaluate(inputLenNode, true);
+
+      // Convert input length in number of characters to number of bytes
+      generateRSInstruction(cg, TR::InstOpCode::getShiftLeftLogicalSingleOpCode(), node, inputLen, inputLen, 1);
+
+      // Sign extend the value if needed
+      if (cg->comp()->target().is64Bit() && !(inputLenNode->getOpCode().isLong()))
+         {
+         generateRRInstruction(cg, TR::InstOpCode::getLoadRegWidenOpCode(), node, inputLen, inputLen);
+         generateRRInstruction(cg, TR::InstOpCode::getLoadRegWidenOpCode(), node, inputLen16, inputLen);
+         }
+      else
+         {
+         generateRRInstruction(cg, TR::InstOpCode::getLoadRegOpCode(), node, inputLen16, inputLen);
+         }
+
+      // Truncate the 4 right most bits, i.e. round down to a multiple of 16 bytes (8 chars)
+      generateRIInstruction(cg, TR::InstOpCode::NILL, node, inputLen16, static_cast<int16_t>(0xFFF0));
+      }
+
+   // Create the necessary vector registers
+   TR::Register* vInput = cg->allocateRegister(TR_VRF);
+   TR::Register* vSurrogate = cg->allocateRegister(TR_VRF); // Track index of first surrogate char
+
+   TR::Register* vRange = cg->allocateRegister(TR_VRF);
+   TR::Register* vRangeControl = cg->allocateRegister(TR_VRF);
+
+   // Initialize the vector registers
+   uint16_t surrogateRange1 = 0xD800;
+   uint16_t surrogateRange2 = 0xDFFF;
+
+   uint16_t surrogateControl1 = 0xA000; // >= comparison
+   uint16_t surrogateControl2 = 0xC000; // <= comparison
+
+   generateVRIaInstruction(cg, TR::InstOpCode::VGBM, node, vRange, 0, 0 /*unused*/);
+   generateVRIaInstruction(cg, TR::InstOpCode::VGBM, node, vRangeControl, 0, 0 /*unused*/);
+
+   generateVRIaInstruction(cg, TR::InstOpCode::VLEIH, node, vRange, surrogateRange1, 0);
+   generateVRIaInstruction(cg, TR::InstOpCode::VLEIH, node, vRange, surrogateRange2, 1);
+
+   generateVRIaInstruction(cg, TR::InstOpCode::VLEIH, node, vRangeControl, surrogateControl1, 0);
+   generateVRIaInstruction(cg, TR::InstOpCode::VLEIH, node, vRangeControl, surrogateControl2, 1);
+
+   // Create the necessary labels
+   TR::LabelSymbol * process8Chars = generateLabelSymbol(cg);
+   TR::LabelSymbol * process8CharsEnd = generateLabelSymbol(cg);
+
+   TR::LabelSymbol * processUnder8Chars = generateLabelSymbol(cg);
+   TR::LabelSymbol * processUnder8CharsEnd = generateLabelSymbol(cg);
+
+   TR::LabelSymbol * processSurrogate = generateLabelSymbol(cg);
+   TR::LabelSymbol * processSurrogateEnd = generateLabelSymbol(cg);
+
+   // Skip the vector loop if there is not even one multiple of 8 chars to process
+   generateS390CompareAndBranchInstruction(cg, TR::InstOpCode::getCmpLogicalOpCode(), node, inputLen16, 0, TR::InstOpCode::COND_MASK8, process8CharsEnd, false, false);
+
+   // ----------------- Incoming branch -----------------
+
+   generateS390LabelInstruction(cg, TR::InstOpCode::label, node, process8Chars);
+   process8Chars->setStartInternalControlFlow();
+
+   // Load 16 bytes (8 chars) into vector register
+   generateVRXInstruction(cg, TR::InstOpCode::VL, node, vInput, generateS390MemoryReference(input, translated, 0, cg));
+
+   // Check for vector surrogates and branch to copy the non-surrogate bytes
+   generateVRRdInstruction(cg, TR::InstOpCode::VSTRC, node, vSurrogate, vInput, vRange, vRangeControl, 0x1, 1);
+   generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_CC1, node, processSurrogate);
+
+   // Store the result
+   generateVRXInstruction(cg, TR::InstOpCode::VST, node, vInput, generateS390MemoryReference(output, translated, 0, cg));
+
+   // Advance the stride register
+   generateRIInstruction(cg, TR::InstOpCode::getAddHalfWordImmOpCode(), node, translated, 16);
+
+   // Loop back if there is at least 8 chars left to process
+   generateS390CompareAndBranchInstruction(cg, TR::InstOpCode::getCmpRegOpCode(), node, translated, inputLen16, TR::InstOpCode::COND_BL, process8Chars, false, false);
+
+   generateS390LabelInstruction(cg, TR::InstOpCode::label, node, process8CharsEnd);
+   process8CharsEnd->setEndInternalControlFlow();
+
+   // ----------------- Incoming branch -----------------
+
+   generateS390LabelInstruction(cg, TR::InstOpCode::label, node, processUnder8Chars);
+   processUnder8Chars->setStartInternalControlFlow();
+
+   // Calculate the number of residue bytes available
+   generateRRInstruction(cg, TR::InstOpCode::getSubstractRegOpCode(), node, inputLen, translated);
+
+   // Branch to the end if there is no residue
+   generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_CC0, node, processUnder8CharsEnd);
+
+   // VLL and VSTL work on indices so we must subtract 1
+   generateRIEInstruction(cg, TR::InstOpCode::getAddLogicalRegRegImmediateOpCode(), node, inputLenMinus1, inputLen, -1);
+
+   // Zero out the input register to avoid invalid VSTRC result
+   generateVRIaInstruction(cg, TR::InstOpCode::VGBM, node, vInput, 0, 0 /*unused*/);
+
+   // VLL instruction can only handle memory references of type D(B), so increment the base input address
+   generateRRInstruction (cg, TR::InstOpCode::getAddRegOpCode(), node, input, translated);
+
+   // Load residue bytes into vector register
+   generateVRSbInstruction(cg, TR::InstOpCode::VLL, node, vInput, inputLenMinus1, generateS390MemoryReference(input, 0, cg));
+
+   // Check for vector surrogates and branch to copy the non-surrogate bytes
+   generateVRRdInstruction(cg, TR::InstOpCode::VSTRC, node, vSurrogate, vInput, vRange, vRangeControl, 0x1, 1);
+
+   generateS390BranchInstruction(cg, TR::InstOpCode::BRC, TR::InstOpCode::COND_CC3, node, processSurrogateEnd);
+
+   // ----------------- Incoming branch -----------------
+
+   generateS390LabelInstruction(cg, TR::InstOpCode::label, node, processSurrogate);
+
+   // Extract the index of the first surrogate char
+   generateVRScInstruction(cg, TR::InstOpCode::VLGV, node, inputLen, vSurrogate, generateS390MemoryReference(7, cg), 0);
+
+   // Return in the case of saturation at index 0
+   generateS390CompareAndBranchInstruction(cg, TR::InstOpCode::getCmpLogicalOpCode(), node, inputLen, 0, TR::InstOpCode::COND_CC0, processUnder8CharsEnd, false, false);
+
+   // VLL and VSTL work on indices so we must subtract 1
+   generateRIEInstruction(cg, TR::InstOpCode::getAddLogicalRegRegImmediateOpCode(), node, inputLenMinus1, inputLen, -1);
+
+   // ----------------- Incoming branch -----------------
+
+   generateS390LabelInstruction(cg, TR::InstOpCode::label, node, processSurrogateEnd);
+
+   // VSTL instruction can only handle memory references of type D(B), so increment the base output address
+   generateRRInstruction (cg, TR::InstOpCode::getAddRegOpCode(), node, output, translated);
+
+   // Store the result
+   generateVRSbInstruction(cg, TR::InstOpCode::VSTL, node, vInput, inputLenMinus1, generateS390MemoryReference(output, 0, cg), 0);
+
+   // Advance the stride register
+   generateRRInstruction(cg, TR::InstOpCode::getAddRegOpCode(), node, translated, inputLen);
+
+   // Set up the proper register dependencies
+   TR::RegisterDependencyConditions* dependencies = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 9, cg);
+
+   dependencies->addPostCondition(input, TR::RealRegister::AssignAny);
+   dependencies->addPostCondition(inputLen, TR::RealRegister::AssignAny);
+   dependencies->addPostCondition(inputLen16, TR::RealRegister::AssignAny);
+   dependencies->addPostCondition(output, TR::RealRegister::AssignAny);
+   dependencies->addPostCondition(translated, TR::RealRegister::AssignAny);
+
+   dependencies->addPostCondition(vInput, TR::RealRegister::AssignAny);
+   dependencies->addPostCondition(vSurrogate, TR::RealRegister::AssignAny);
+   dependencies->addPostCondition(vRange, TR::RealRegister::AssignAny);
+   dependencies->addPostCondition(vRangeControl, TR::RealRegister::AssignAny);
+
+   // ----------------- Incoming branch -----------------
+
+   generateS390LabelInstruction(cg, TR::InstOpCode::label, node, processUnder8CharsEnd, dependencies);
+   processUnder8CharsEnd->setEndInternalControlFlow();
+
+   // Convert translated length in number of bytes to number of characters
+   generateRSInstruction(cg, TR::InstOpCode::getShiftRightLogicalSingleOpCode(), node, translated, translated, 1);
+
+   // Cleanup nodes before returning
+   cg->decReferenceCount(node->getChild(0));
+   cg->decReferenceCount(node->getChild(1));
+   cg->decReferenceCount(node->getChild(2));
+
+   // Cleanup registers before returning
+   cg->stopUsingRegister(input);
+   cg->stopUsingRegister(inputLen);
+   cg->stopUsingRegister(inputLen16);
+   cg->stopUsingRegister(output);
+
+   cg->stopUsingRegister(vInput);
+   cg->stopUsingRegister(vSurrogate);
+   cg->stopUsingRegister(vRange);
+   cg->stopUsingRegister(vRangeControl);
+
+   return node->setRegister(translated);
+   }
+
 static TR::Register*
 hashCodeHelper(TR::Node* node, TR::CodeGenerator* cg, TR::DataType elementType, TR::Node* nodeHash, bool
isSigned) { diff --git a/runtime/compiler/z/codegen/J9TreeEvaluator.hpp b/runtime/compiler/z/codegen/J9TreeEvaluator.hpp index 4adf0414fe5..ecd7ff519c7 100644 --- a/runtime/compiler/z/codegen/J9TreeEvaluator.hpp +++ b/runtime/compiler/z/codegen/J9TreeEvaluator.hpp @@ -138,6 +138,8 @@ class OMR_EXTENSIBLE TreeEvaluator: public J9::TreeEvaluator */ static TR::Register *inlineStringHashCode(TR::Node *node, TR::CodeGenerator *cg, bool isCompressed); static TR::Register *inlineVectorizedHashCode(TR::Node* node, TR::CodeGenerator* cg); + static TR::Register *inlineUTF16BEEncodeSIMD(TR::Node *node, TR::CodeGenerator *cg); + static TR::Register* inlineUTF16BEEncode (TR::Node *node, TR::CodeGenerator *cg); static TR::Register *inlineCRC32CUpdateBytes(TR::Node *node, TR::CodeGenerator *cg, bool isDirectBuffer); static TR::Register *zdloadEvaluator(TR::Node *node, TR::CodeGenerator *cg);