From d1efcf509673cc47b269679f4c2ba97cd16bd686 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Brunk?= Date: Mon, 3 Jul 2023 22:41:58 +0200 Subject: [PATCH] Add sentencepiece preset --- .github/workflows/sentencepiece.yml | 46 +++ pom.xml | 6 + sentencepiece/LICENSE | 202 +++++++++ sentencepiece/cppbuild.sh | 28 ++ sentencepiece/platform/pom.xml | 135 ++++++ sentencepiece/pom.xml | 55 +++ .../samples/SentencepieceExample.java | 23 ++ sentencepiece/samples/pom.xml | 21 + .../ImmutableNBestSentencePieceText.java | 47 +++ .../ImmutableSentencePieceText.java | 47 +++ ...tencePieceText_ImmutableSentencePiece.java | 39 ++ .../org/bytedeco/sentencepiece/IntVector.java | 78 ++++ .../IntVectorFloatPairVector.java | 39 ++ .../bytedeco/sentencepiece/ModelProto.java | 17 + .../sentencepiece/NBestSentencePieceText.java | 72 ++++ .../sentencepiece/NormalizerSpec.java | 17 + .../PretokenizerForTrainingInterface.java | 17 + .../sentencepiece/SentenceIterator.java | 33 ++ .../sentencepiece/SentencePieceProcessor.java | 383 ++++++++++++++++++ .../sentencepiece/SentencePieceText.java | 17 + .../SentencePieceText_SentencePiece.java | 17 + .../sentencepiece/SentencePieceTrainer.java | 94 +++++ .../org/bytedeco/sentencepiece/Status.java | 46 +++ .../sentencepiece/StringStringMap.java | 40 ++ .../bytedeco/sentencepiece/StringVector.java | 99 +++++ .../StringVectorFloatPairVector.java | 39 ++ .../bytedeco/sentencepiece/TrainerSpec.java | 18 + .../sentencepiece/global/sentencepiece.java | 183 +++++++++ .../sentencepiece/presets/sentencepiece.java | 41 ++ sentencepiece/src/main/java9/module-info.java | 6 + 30 files changed, 1905 insertions(+) create mode 100644 .github/workflows/sentencepiece.yml create mode 100644 sentencepiece/LICENSE create mode 100755 sentencepiece/cppbuild.sh create mode 100644 sentencepiece/platform/pom.xml create mode 100644 sentencepiece/pom.xml create mode 100644 sentencepiece/samples/SentencepieceExample.java create mode 100644 sentencepiece/samples/pom.xml 
create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableNBestSentencePieceText.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableSentencePieceText.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableSentencePieceText_ImmutableSentencePiece.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/IntVector.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/IntVectorFloatPairVector.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ModelProto.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/NBestSentencePieceText.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/NormalizerSpec.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/PretokenizerForTrainingInterface.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentenceIterator.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceProcessor.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceText.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceText_SentencePiece.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceTrainer.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/Status.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringStringMap.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringVector.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringVectorFloatPairVector.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/TrainerSpec.java create mode 100644 sentencepiece/src/gen/java/org/bytedeco/sentencepiece/global/sentencepiece.java 
create mode 100644 sentencepiece/src/main/java/org/bytedeco/sentencepiece/presets/sentencepiece.java create mode 100644 sentencepiece/src/main/java9/module-info.java diff --git a/.github/workflows/sentencepiece.yml b/.github/workflows/sentencepiece.yml new file mode 100644 index 00000000000..241597a1e21 --- /dev/null +++ b/.github/workflows/sentencepiece.yml @@ -0,0 +1,46 @@ +name: sentencepiece +on: + push: + paths: + - sentencepiece/** + - .github/workflows/sentencepiece.yml + pull_request: + paths: + - sentencepiece/** + - .github/workflows/sentencepiece.yml + workflow_dispatch: +env: + CI_DEPLOY_MODULE: ${{ github.workflow }} + CI_DEPLOY_PLATFORM: ${{ github.job }} + CI_DEPLOY_SETTINGS: ${{ secrets.CI_DEPLOY_SETTINGS }} + CI_DEPLOY_USERNAME: ${{ secrets.CI_DEPLOY_USERNAME }} + CI_DEPLOY_PASSWORD: ${{ secrets.CI_DEPLOY_PASSWORD }} + STAGING_REPOSITORY: ${{ secrets.STAGING_REPOSITORY }} +jobs: + linux-arm64: + runs-on: ubuntu-20.04 + container: centos:7 + steps: + - uses: bytedeco/javacpp-presets/.github/actions/deploy-centos@actions + linux-x86_64: + runs-on: ubuntu-20.04 + container: centos:7 + steps: + - uses: bytedeco/javacpp-presets/.github/actions/deploy-centos@actions + macosx-arm64: + runs-on: macos-11 + steps: + - uses: bytedeco/javacpp-presets/.github/actions/deploy-macosx@actions + macosx-x86_64: + runs-on: macos-11 + steps: + - uses: bytedeco/javacpp-presets/.github/actions/deploy-macosx@actions + windows-x86_64: + runs-on: windows-2019 + steps: + - uses: bytedeco/javacpp-presets/.github/actions/deploy-windows@actions + redeploy: + needs: [linux-arm64, linux-x86_64, macosx-arm64, macosx-x86_64, windows-x86_64] + runs-on: ubuntu-20.04 + steps: + - uses: bytedeco/javacpp-presets/.github/actions/redeploy@actions diff --git a/pom.xml b/pom.xml index f3d05cc6724..a970712a8d9 100644 --- a/pom.xml +++ b/pom.xml @@ -636,6 +636,7 @@ cpu_features modsecurity systems + sentencepiece ${os.name}-${os.arch} @@ -1203,6 +1204,7 @@ onnxruntime cpu_features systems + 
sentencepiece @@ -1419,6 +1421,7 @@ cpu_features modsecurity systems + sentencepiece @@ -1460,6 +1463,7 @@ libraw leptonica tesseract + sentencepiece @@ -1544,6 +1548,7 @@ cpu_features modsecurity systems + sentencepiece @@ -1692,6 +1697,7 @@ qt cpu_features systems + sentencepiece diff --git a/sentencepiece/LICENSE b/sentencepiece/LICENSE new file mode 100644 index 00000000000..d6456956733 --- /dev/null +++ b/sentencepiece/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/sentencepiece/cppbuild.sh b/sentencepiece/cppbuild.sh new file mode 100755 index 00000000000..9a7e4db8e57 --- /dev/null +++ b/sentencepiece/cppbuild.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# This file is meant to be included by the parent cppbuild.sh script +if [[ -z "$PLATFORM" ]]; then + pushd .. + bash cppbuild.sh "$@" sentencepiece + popd + exit +fi + +SENTENCEPIECE_VERSION=0.1.99 +download https://github.com/google/sentencepiece/archive/refs/tags/v$SENTENCEPIECE_VERSION.tar.gz sentencepiece-$SENTENCEPIECE_VERSION.tar.gz + +mkdir -p $PLATFORM +cd $PLATFORM +INSTALL_PATH=`pwd` +echo "Decompressing archives..." +tar -xzvf ../sentencepiece-$SENTENCEPIECE_VERSION.tar.gz +cd sentencepiece-$SENTENCEPIECE_VERSION + +CMAKE_CONFIG="-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH -DCMAKE_INSTALL_LIBDIR=$INSTALL_PATH/lib" + +mkdir -p build +cd build +$CMAKE $CMAKE_CONFIG .. +make -j $MAKEJ +make install + +cd ../.. 
diff --git a/sentencepiece/platform/pom.xml b/sentencepiece/platform/pom.xml new file mode 100644 index 00000000000..5f0c36d3803 --- /dev/null +++ b/sentencepiece/platform/pom.xml @@ -0,0 +1,135 @@ + + + 4.0.0 + + + org.bytedeco + javacpp-presets + 1.5.10-SNAPSHOT + ../../ + + + org.bytedeco + sentencepiece-platform + 0.1.99-${project.parent.version} + JavaCPP Presets Platform for sentencepiece + + + sentencepiece + + + + + org.bytedeco + javacpp-platform + ${project.parent.version} + + + ${project.groupId} + ${javacpp.moduleId} + ${project.version} + + + ${project.groupId} + ${javacpp.moduleId} + ${project.version} + ${javacpp.platform.linux-arm64} + + + ${project.groupId} + ${javacpp.moduleId} + ${project.version} + ${javacpp.platform.linux-x86_64} + + + ${project.groupId} + ${javacpp.moduleId} + ${project.version} + ${javacpp.platform.macosx-arm64} + + + ${project.groupId} + ${javacpp.moduleId} + ${project.version} + ${javacpp.platform.macosx-x86_64} + + + ${project.groupId} + ${javacpp.moduleId} + ${project.version} + ${javacpp.platform.windows-x86_64} + + + + + + + maven-jar-plugin + + + default-jar + + + + ${javacpp.moduleId}.jar ${javacpp.moduleId}-linux-arm64.jar ${javacpp.moduleId}-linux-x86_64.jar ${javacpp.moduleId}-macosx-arm64.jar ${javacpp.moduleId}-macosx-x86_64.jar ${javacpp.moduleId}-windows-x86_64.jar + + + + + + empty-javadoc-jar + + jar + + + javadoc + + + + empty-sources-jar + + jar + + + sources + + + + + + org.moditect + moditect-maven-plugin + + + add-module-infos + none + + + add-platform-module-info + package + + add-module-info + + + + + ${project.build.directory}/${project.artifactId}.jar + + module org.bytedeco.${javacpp.moduleId}.platform { + requires static org.bytedeco.${javacpp.moduleId}.linux.arm64; + requires static org.bytedeco.${javacpp.moduleId}.linux.x86_64; + requires static org.bytedeco.${javacpp.moduleId}.macosx.arm64; + requires static org.bytedeco.${javacpp.moduleId}.macosx.x86_64; + requires static 
org.bytedeco.${javacpp.moduleId}.windows.x86_64; + } + + + + + + + + + + diff --git a/sentencepiece/pom.xml b/sentencepiece/pom.xml new file mode 100644 index 00000000000..c636c76ae17 --- /dev/null +++ b/sentencepiece/pom.xml @@ -0,0 +1,55 @@ + + + 4.0.0 + + + org.bytedeco + javacpp-presets + 1.5.10-SNAPSHOT + + + org.bytedeco + sentencepiece + 0.1.99-${project.parent.version} + JavaCPP Presets for sentencepiece + + + + org.bytedeco + javacpp + + + + + + + maven-resources-plugin + + + maven-compiler-plugin + + + org.bytedeco + javacpp + + + maven-jar-plugin + + + org.moditect + moditect-maven-plugin + + + maven-dependency-plugin + + + maven-source-plugin + + + maven-javadoc-plugin + + + + + diff --git a/sentencepiece/samples/SentencepieceExample.java b/sentencepiece/samples/SentencepieceExample.java new file mode 100644 index 00000000000..6d918ec00eb --- /dev/null +++ b/sentencepiece/samples/SentencepieceExample.java @@ -0,0 +1,23 @@ +import java.nio.ByteBuffer; +import org.bytedeco.javacpp.*; +import org.bytedeco.sentencepiece.*; + +public final class SentencepieceExample { + + public static void main(String[] args) { + + SentencePieceProcessor processor = new SentencePieceProcessor(); + Status status = processor.Load(args[0]); + if (!status.ok()) { + throw new RuntimeException(status.ToString().getString()); + } + + IntVector ids = new IntVector(); + processor.Encode("hello world!", ids); + + for (int id : ids.get()) { + System.out.print(id + " "); + } + } + +} diff --git a/sentencepiece/samples/pom.xml b/sentencepiece/samples/pom.xml new file mode 100644 index 00000000000..e4211472a6c --- /dev/null +++ b/sentencepiece/samples/pom.xml @@ -0,0 +1,21 @@ + + 4.0.0 + org.bytedeco.sentencepiece + sentencepiece-example + 1.5.10-SNAPSHOT + + SentencepieceExample + 1.8 + 1.8 + + + + org.bytedeco + sentencepiece-platform + 0.1.99-1.5.10-SNAPSHOT + + + + . 
+ + diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableNBestSentencePieceText.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableNBestSentencePieceText.java new file mode 100644 index 00000000000..dee8b069a50 --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableNBestSentencePieceText.java @@ -0,0 +1,47 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + + +// Wrapper class of SentencePieceText +// This wrapper only allows an immutable access to the proto and +// hides the actual implementation of protobuf. +// See sentencepiece.proto for the details of this class. +@Namespace("sentencepiece") @NoOffset @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class ImmutableNBestSentencePieceText extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public ImmutableNBestSentencePieceText(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ + public ImmutableNBestSentencePieceText(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public ImmutableNBestSentencePieceText position(long position) { + return (ImmutableNBestSentencePieceText)super.position(position); + } + @Override public ImmutableNBestSentencePieceText getPointer(long i) { + return new ImmutableNBestSentencePieceText((Pointer)this).offsetAddress(i); + } + + public ImmutableNBestSentencePieceText() { super((Pointer)null); allocate(); } + private native void allocate(); + + public native @StdVector ImmutableSentencePieceText nbests(); + + public native @Cast("size_t") long nbests_size(); + public native @ByVal ImmutableSentencePieceText nbests(int index); + + public native @StdString BytePointer SerializeAsString(); + + // Returns the actual mutable proto. + // Do not use this outside of SentencePieceProcessor, as + // it returns the raw pointer managed by the shared_ptr. + public native NBestSentencePieceText mutable_proto(); + + public native void ConvertToUnicodeSpans(); +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableSentencePieceText.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableSentencePieceText.java new file mode 100644 index 00000000000..6a8ea2b4627 --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableSentencePieceText.java @@ -0,0 +1,47 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + + +@Namespace("sentencepiece") @NoOffset @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class ImmutableSentencePieceText extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ + public ImmutableSentencePieceText(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public ImmutableSentencePieceText(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public ImmutableSentencePieceText position(long position) { + return (ImmutableSentencePieceText)super.position(position); + } + @Override public ImmutableSentencePieceText getPointer(long i) { + return new ImmutableSentencePieceText((Pointer)this).offsetAddress(i); + } + + public ImmutableSentencePieceText() { super((Pointer)null); allocate(); } + private native void allocate(); + + public native @StdVector ImmutableSentencePieceText_ImmutableSentencePiece pieces(); + + public native @Cast("size_t") long pieces_size(); + public native @ByVal ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index); + + public native @StdString BytePointer text(); + public native float score(); + + public native @StdString BytePointer SerializeAsString(); + + // Returns the actual mutable proto. + // Do not use this outside of SentencePieceProcessor, as + // it returns the raw pointer managed by the shared_ptr. + public native SentencePieceText mutable_proto(); + + // Converts the utf8 byte spans into Unicode char span. 
+ public native void ConvertToUnicodeSpans(); +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableSentencePieceText_ImmutableSentencePiece.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableSentencePieceText_ImmutableSentencePiece.java new file mode 100644 index 00000000000..c3848314298 --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ImmutableSentencePieceText_ImmutableSentencePiece.java @@ -0,0 +1,39 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + + +// Wrapper class of SentencePieceText +// This wrapper only allows an immutable access to the proto and +// hides the actual implementation of protobuf. +// See sentencepiece.proto for the details of this class. +@Namespace("sentencepiece") @NoOffset @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class ImmutableSentencePieceText_ImmutableSentencePiece extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public ImmutableSentencePieceText_ImmutableSentencePiece(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ + public ImmutableSentencePieceText_ImmutableSentencePiece(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public ImmutableSentencePieceText_ImmutableSentencePiece position(long position) { + return (ImmutableSentencePieceText_ImmutableSentencePiece)super.position(position); + } + @Override public ImmutableSentencePieceText_ImmutableSentencePiece getPointer(long i) { + return new ImmutableSentencePieceText_ImmutableSentencePiece((Pointer)this).offsetAddress(i); + } + + public ImmutableSentencePieceText_ImmutableSentencePiece() { super((Pointer)null); allocate(); } + private native void allocate(); + + public native @StdString BytePointer piece(); + public native @StdString BytePointer surface(); + public native @Cast("uint32_t") int id(); + public native @Cast("uint32_t") int begin(); + public native @Cast("uint32_t") int end(); +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/IntVector.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/IntVector.java new file mode 100644 index 00000000000..2f76cb30911 --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/IntVector.java @@ -0,0 +1,78 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + +@Name("std::vector") @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class IntVector extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public IntVector(Pointer p) { super(p); } + public IntVector(int ... 
array) { this(array.length); put(array); } + public IntVector() { allocate(); } + public IntVector(long n) { allocate(n); } + private native void allocate(); + private native void allocate(@Cast("size_t") long n); + public native @Name("operator =") @ByRef IntVector put(@ByRef IntVector x); + + public boolean empty() { return size() == 0; } + public native long size(); + public void clear() { resize(0); } + public native void resize(@Cast("size_t") long n); + + @Index(function = "at") public native int get(@Cast("size_t") long i); + public native IntVector put(@Cast("size_t") long i, int value); + + public native @ByVal Iterator insert(@ByVal Iterator pos, int value); + public native @ByVal Iterator erase(@ByVal Iterator pos); + public native @ByVal Iterator begin(); + public native @ByVal Iterator end(); + @NoOffset @Name("iterator") public static class Iterator extends Pointer { + public Iterator(Pointer p) { super(p); } + public Iterator() { } + + public native @Name("operator ++") @ByRef Iterator increment(); + public native @Name("operator ==") boolean equals(@ByRef Iterator it); + public native @Name("operator *") int get(); + } + + public int[] get() { + int[] array = new int[size() < Integer.MAX_VALUE ? (int)size() : Integer.MAX_VALUE]; + for (int i = 0; i < array.length; i++) { + array[i] = get(i); + } + return array; + } + @Override public String toString() { + return java.util.Arrays.toString(get()); + } + + public int pop_back() { + long size = size(); + int value = get(size - 1); + resize(size - 1); + return value; + } + public IntVector push_back(int value) { + long size = size(); + resize(size + 1); + return put(size, value); + } + public IntVector put(int value) { + if (size() != 1) { resize(1); } + return put(0, value); + } + public IntVector put(int ... 
array) { + if (size() != array.length) { resize(array.length); } + for (int i = 0; i < array.length; i++) { + put(i, array[i]); + } + return this; + } +} + diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/IntVectorFloatPairVector.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/IntVectorFloatPairVector.java new file mode 100644 index 00000000000..b705e56cace --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/IntVectorFloatPairVector.java @@ -0,0 +1,39 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + +@Name("std::vector,float> >") @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class IntVectorFloatPairVector extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ + public IntVectorFloatPairVector(Pointer p) { super(p); } + public IntVectorFloatPairVector(IntVector[] firstValue, float[] secondValue) { this(Math.min(firstValue.length, secondValue.length)); put(firstValue, secondValue); } + public IntVectorFloatPairVector() { allocate(); } + public IntVectorFloatPairVector(long n) { allocate(n); } + private native void allocate(); + private native void allocate(@Cast("size_t") long n); + public native @Name("operator =") @ByRef IntVectorFloatPairVector put(@ByRef IntVectorFloatPairVector x); + + public boolean empty() { return size() == 0; } + public native long size(); + public void clear() { resize(0); } + public native void resize(@Cast("size_t") long n); + + @Index(function = "at") public native @ByRef IntVector first(@Cast("size_t") long i); public native IntVectorFloatPairVector first(@Cast("size_t") long i, IntVector first); + @Index(function = "at") public native float second(@Cast("size_t") long i); public native IntVectorFloatPairVector second(@Cast("size_t") long i, float second); + + public IntVectorFloatPairVector put(IntVector[] firstValue, float[] secondValue) { + for (int i = 0; i < firstValue.length && i < secondValue.length; i++) { + first(i, firstValue[i]); + second(i, secondValue[i]); + } + return this; + } +} + diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ModelProto.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ModelProto.java new file mode 100644 index 00000000000..0961a5ab411 --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/ModelProto.java @@ -0,0 +1,17 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + +@Namespace("sentencepiece") @Opaque @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) 
+public class ModelProto extends Pointer { + /** Empty constructor. Calls {@code super((Pointer)null)}. */ + public ModelProto() { super((Pointer)null); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public ModelProto(Pointer p) { super(p); } +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/NBestSentencePieceText.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/NBestSentencePieceText.java new file mode 100644 index 00000000000..0e39fab22ed --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/NBestSentencePieceText.java @@ -0,0 +1,72 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + // namespace util + +// SentencePieceProcessor: +// Simple and language independent tokenizer and de-tokenizer for +// Neural Network Machine Translation. +// +// SentencePieceProcessor provides Encode() and Decode() methods, +// which correspond to tokenization and de-tokenization respectively. +// +// - Encode: +// Given a raw source sentence, encode it into a sequence +// of pieces or vocabulary ids. +// +// - Decode: +// Given a sequence of pieces or vocabulary ids, decode it +// into a de-tokenized raw sentence. +// +// SentencePieceProcessor provides a lossless data conversion +// that allows the original raw sentence to be perfectly reconstructed +// from the encoded data, i.e., Decode(Encode(input)) == input. +// This characteristic is useful, as we can make the de-tokenization +// completely language independent. 
+// +// Usage: +// SentencePieceProcessor sp; +// sp.Load("//path/to/model"); +// +// vector<string> sps; +// sp.Encode("hello world.", &sps).IgnoreError(); +// +// vector<int> ids; +// sp.Encode("hello world.", &ids).IgnoreError(); +// +// string detok; +// sp.Decode(sps, &detok); +// CHECK_EQ("hello world.", detok).IgnoreError(); +// +// sp.Decode(ids, &detok); +// CHECK_EQ("hello world.", detok).IgnoreError(); +// +// We can also use SentencePieceText which manages the byte-offsets +// between user input (output) and internal sentence pieces. +// +// SentencePieceText spt; +// sp.Encode("hello world.", &spt); +// // Emits the byte range of each piece. +// for (const auto &piece : spt.pieces()) { +// LOG(INFO) << piece.begin() << " " << piece.end(); +// } +// +// sp.Decode({0, 1, 2, 3..}, &spt); +// for (const auto &piece : spt.pieces()) { +// LOG(INFO) << piece.begin() << " " << piece.end(); +// } +// + +@Namespace("sentencepiece") @Opaque @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class NBestSentencePieceText extends Pointer { + /** Empty constructor. Calls {@code super((Pointer)null)}. */ + public NBestSentencePieceText() { super((Pointer)null); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ + public NBestSentencePieceText(Pointer p) { super(p); } +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/NormalizerSpec.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/NormalizerSpec.java new file mode 100644 index 00000000000..a1e8684b835 --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/NormalizerSpec.java @@ -0,0 +1,17 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + +@Namespace("sentencepiece") @Opaque @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class NormalizerSpec extends Pointer { + /** Empty constructor. Calls {@code super((Pointer)null)}. */ + public NormalizerSpec() { super((Pointer)null); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public NormalizerSpec(Pointer p) { super(p); } +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/PretokenizerForTrainingInterface.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/PretokenizerForTrainingInterface.java new file mode 100644 index 00000000000..31ad601103a --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/PretokenizerForTrainingInterface.java @@ -0,0 +1,17 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + +@Namespace("sentencepiece::pretokenizer") @Opaque @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class PretokenizerForTrainingInterface extends Pointer { + /** Empty constructor. Calls {@code super((Pointer)null)}. 
*/ + public PretokenizerForTrainingInterface() { super((Pointer)null); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public PretokenizerForTrainingInterface(Pointer p) { super(p); } +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentenceIterator.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentenceIterator.java new file mode 100644 index 00000000000..b205edf1065 --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentenceIterator.java @@ -0,0 +1,33 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + // namespace pretokenizer + +// Iterator over the training sentences. +// Training sentences are loaded sequentially as follows: +// +// for (; !it.done(); it.Next()) { +// const std::string &s = it.value(); +// } +// RETURN_IF_ERROR(it.status()); +// +@Namespace("sentencepiece") @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class SentenceIterator extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public SentenceIterator(Pointer p) { super(p); } + + // Returns true if iteration finishes (including error case). + // Uses SentenceIterator::status() method to know whether + // all sentences are loaded successfully. 
+ public native @Cast("bool") boolean done(); + public native void Next(); + public native @StdString BytePointer value(); + public native @ByVal Status status(); +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceProcessor.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceProcessor.java new file mode 100644 index 00000000000..17d60bd2033 --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceProcessor.java @@ -0,0 +1,383 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + + +@Namespace("sentencepiece") @NoOffset @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class SentencePieceProcessor extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public SentencePieceProcessor(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public SentencePieceProcessor(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public SentencePieceProcessor position(long position) { + return (SentencePieceProcessor)super.position(position); + } + @Override public SentencePieceProcessor getPointer(long i) { + return new SentencePieceProcessor((Pointer)this).offsetAddress(i); + } + + public SentencePieceProcessor() { super((Pointer)null); allocate(); } + private native void allocate(); + + // Loads model from `filename`. + // Returns false if `filename` cannot be loaded. + public native @ByVal Status Load(@ByVal @StdString String filename); + + // Loads model from `filename`. + // Crash if `filename` cannot be loaded. 
+ public native void LoadOrDie(@ByVal @StdString String filename); + + // Loads model from `model_proto`. + // `model_proto` is copied. + public native @ByVal Status Load(@Const @ByRef ModelProto model_proto); + + // Loads model from `model_proto`. + // `model_proto` is moved. + + // Loads model from `serialized`, which is a string-serialized model proto. + // Useful to load the model from a platform independent blob object. + public native @ByVal Status LoadFromSerializedProto(@ByVal @StdString String serialized); + + // Returns the status. Encode/Decode methods are valid when status is OK. + public native @ByVal Status status(); + + // Sets encode extra_option sequence. + public native @ByVal Status SetEncodeExtraOptions(@ByVal @StdString String extra_option); + + // Sets decode extra_option sequence. + public native @ByVal Status SetDecodeExtraOptions(@ByVal @StdString String extra_option); + + ////////////////////////////////////////////////////////////// + // Vocabulary restriction. + // Background: + // https://github.com/rsennrich/subword-nmt#best-practice-advice-for-byte-pair-encoding-in-nmt + + // Restricts the vocabulary set. + // The input sentences are encoded into the tokens in `valid_vocab`. + + + // Reverts the vocabulary restriction. + public native @ByVal Status ResetVocabulary(); + + // Loads the valid vocabulary set from `filename` in TSV format. + // Format: . + // Any token with frequency < threshold will be treated as OOV. + public native @ByVal Status LoadVocabulary(@ByVal @StdString String filename, + int threshold); + + ////////////////////////////////////////////////////////////// + // Simple Encode and Decode API. + // + // Given a UTF8 input, encodes it into a sequence of sentence pieces. + public native @ByVal Status Encode(@ByVal @StdString String input, + StringVector pieces); + + // Given a UTF8 input, encodes it into a sequence of ids. 
+ public native @ByVal Status Encode(@ByVal @StdString String input, + IntVector ids); + + // Given a sequence of pieces, decodes it into a detokenized output. + + + // Given a sequence of pieces, decodes it into a detokenized output. + + + // Given a sequence of ids, decodes it into a detokenized output. + + + ////////////////////////////////////////////////////////////// + // NBest API. + // + // Same as Encode, but returns nbest results. + public native @ByVal Status NBestEncode( + @ByVal @StdString String input, int nbest_size, + @StdVector StringVector pieces); + + // Same as Encode, but returns nbest results. + public native @ByVal Status NBestEncode(@ByVal @StdString String input, int nbest_size, + @StdVector IntVector ids); + + ////////////////////////////////////////////////////////////// + // Sampling API. + // + // Unigram and BPE support sampling mode. + // - Unigram (--model_type=unigram): + // `nbest_size`: When `nbest_size` is positive value, approximately samples + // one segmentation from nbest candidates. When `nbest_size` is negative + // value, samples one segmentation from the hypotheses (Lattice) according to + // the generation probabilities using forward-filtering and backward-sampling + // algorithm. + // `alpha`: Smoothing parameter (inverse temperature). The best segmentation + // (Viterbi segmentation) is more likely sampled when setting larger alpha. + // When alpha is 0.0, one segmentation is uniformly sampled from the nbest or + // lattice. `nbest_size` and `alpha` correspond to parameters `l` and `alpha` + // in https://arxiv.org/abs/1804.10959 (nbest_size < 0 means l = infinity) + // + // - BPE (--model_type=bpe): + // `alpha`: The dropout probability `p` of bpe merge operations in + // https://arxiv.org/abs/1910.13267 Nbest-based sampling is not supported so + // nbest_size parameter is ignored in BPE. 
+ public native @ByVal Status SampleEncode(@ByVal @StdString String input, int nbest_size, + float alpha, + StringVector pieces); + + // Same as above, but returns a sequence of ids. + public native @ByVal Status SampleEncode(@ByVal @StdString String input, int nbest_size, + float alpha, IntVector ids); + + ////////////////////////////////////////////////////////////// + // SampleEncodeAndScore API. + // + // Sample `num_samples` many tokenisations from the segmentation lattice. + // These methods are only available in model_type=unigram. + // + // `alpha`: smoothing parameter (inverse temperature). The same as `alpha` in + // `Sample` method. + // `wor`: If `wor` is true, the samples are taken without replacement, and the + // scores are the inclusion probabilities of the elements in the sample; + // otherwise the samples are taken with replacement and the scores are the + // log-probs of sample elements. + // `include_best`: If `include_best` is true, the best tokenisation is always + // included in the sample, and the remaining elements are sampled excluding + // the best. + public native @ByVal Status SampleEncodeAndScore( + @ByVal @StdString String input, int num_samples, float alpha, @Cast("bool") boolean wor, + @Cast("bool") boolean include_best, + StringVectorFloatPairVector pieces); + + // Same as above, but returns a sequence of ids. + public native @ByVal Status SampleEncodeAndScore( + @ByVal @StdString String input, int num_samples, float alpha, @Cast("bool") boolean wor, + @Cast("bool") boolean include_best, + IntVectorFloatPairVector ids); + + ////////////////////////////////////////////////////////////// + // Entropy API. + // + // This is only available in model_type=unigram. 
+ // Calculate entropy of possible tokenisations + public native @ByVal Status CalculateEntropy(@ByVal @StdString String input, float alpha, + FloatPointer entropy); + public native @ByVal Status CalculateEntropy(@ByVal @StdString String input, float alpha, + FloatBuffer entropy); + public native @ByVal Status CalculateEntropy(@ByVal @StdString String input, float alpha, + float[] entropy); + + ////////////////////////////////////////////////////////////// + // Advanced API returning SentencePieceText, which manages + // utf8-byte alignments between user-input/detokenized text + // and internal sentencepiece sequence. + // + // Given a UTF8 input, encodes it into SentencePieceText. + // + // When using these APIs, sentencepiece.pb.h header files must be included. + // We can also use ImmutableSentencePieceText as follows. + // + // ImmutableSentencePieceText spt; + // Encode("hello", spt.mutable_proto()).IgnoreError(); + // std::cout << spt.pieces_size() << std::endl; + public native @ByVal Status Encode(@ByVal @StdString String input, + SentencePieceText spt); + + public native @ByVal Status NBestEncode(@ByVal @StdString String input, int nbest_size, + NBestSentencePieceText nbest_spt); + + public native @ByVal Status SampleEncode(@ByVal @StdString String input, int nbest_size, + float alpha, SentencePieceText spt); + + public native @ByVal Status SampleEncodeAndScore( + @ByVal @StdString String input, int num_samples, float alpha, @Cast("bool") boolean wor, + @Cast("bool") boolean include_best, NBestSentencePieceText samples_spt); + + // DEPRECATED: Remove this API and use std::vector + + + + + +// #ifdef SWIG +// #define SPP_SWIG_CHECK_AND_THROW +// if (!status.ok()) throw status; +// #else +// #define SPP_SWIG_CHECK_AND_THROW +// if (!status.ok()) { +// } +// #endif // SWIG + +// #define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) 
+// OutType output; +// const auto status = FuncName(__VA_ARGS__, &output); +// SPP_SWIG_CHECK_AND_THROW; +// return output; + +// #define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) +// OutType output; +// const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); +// SPP_SWIG_CHECK_AND_THROW; +// return output.SerializeAsString(); + +// #define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) +// OutType output; +// const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); +// SPP_SWIG_CHECK_AND_THROW; +// return output; + + ////////////////////////////////////////////////////////////// + // Handy methods that return the result directly. + // These functions ignore internal errors. + public native @ByVal StringVector EncodeAsPieces( + @ByVal @StdString String input); + + public native @ByVal IntVector EncodeAsIds(@ByVal @StdString String input); + + public native @StdVector StringVector NBestEncodeAsPieces( + @ByVal @StdString String input, int nbest_size); + + public native @StdVector IntVector NBestEncodeAsIds( + @ByVal @StdString String input, int nbest_size); + + public native @ByVal StringVector SampleEncodeAsPieces(@ByVal @StdString String input, + int nbest_size, + float alpha); + + public native @ByVal IntVector SampleEncodeAsIds(@ByVal @StdString String input, + int nbest_size, + float alpha); + + public native @ByVal StringVectorFloatPairVector SampleEncodeAndScoreAsPieces(@ByVal @StdString String input, int num_samples, + float alpha, @Cast("bool") boolean wor, @Cast("bool") boolean include_best); + + public native @ByVal IntVectorFloatPairVector SampleEncodeAndScoreAsIds(@ByVal @StdString String input, int num_samples, + float alpha, @Cast("bool") boolean wor, @Cast("bool") boolean include_best); + + // DEPRECATED: Remove this API and use std::vector + + + + + public native @StdString BytePointer DecodeIds(@Const @ByRef IntVector ids); + + public native float CalculateEntropy(@ByVal @StdString String text, float 
alpha); + + ////////////////////////////////////////////////////////////// + // SerializedProto API. (DEPRECATED). Use ImmutableProto API. + // They are used in the Python interface. Returns serialized proto. + // In the Python module, we can get access to the full Proto after + // deserializing the returned byte sequence. + public native @StdString BytePointer EncodeAsSerializedProto(@ByVal @StdString String input); + + public native @StdString BytePointer SampleEncodeAsSerializedProto(@ByVal @StdString String input, + int nbest_size, + float alpha); + + public native @StdString BytePointer NBestEncodeAsSerializedProto(@ByVal @StdString String input, + int nbest_size); + + public native @StdString BytePointer SampleEncodeAndScoreAsSerializedProto( + @ByVal @StdString String input, int num_samples, float alpha, @Cast("bool") boolean wor, + @Cast("bool") boolean include_best); + + // TODO(taku): Remove this API and use std::vector + + + + + public native @StdString BytePointer DecodeIdsAsSerializedProto( + @Const @ByRef IntVector ids); + + ////////////////////////////////////////////////////////////// + // ImmutableProto API. 
+ public native @ByVal ImmutableSentencePieceText EncodeAsImmutableProto( + @ByVal @StdString String input); + + public native @ByVal ImmutableSentencePieceText SampleEncodeAsImmutableProto( + @ByVal @StdString String input, int nbest_size, float alpha); + + public native @ByVal ImmutableNBestSentencePieceText NBestEncodeAsImmutableProto( + @ByVal @StdString String input, int nbest_size); + + public native @ByVal ImmutableNBestSentencePieceText SampleEncodeAndScoreAsImmutableProto( + @ByVal @StdString String input, int num_samples, float alpha, @Cast("bool") boolean wor, + @Cast("bool") boolean include_best); + + // TODO(taku): Remove this API and use std::vector + + + + + public native @ByVal ImmutableSentencePieceText DecodeIdsAsImmutableProto( + @Const @ByRef IntVector ids); + +// #undef DEFINE_SPP_DIRECT_FUNC_IMPL +// #undef DEFINE_SPP_SERIALIZED_PROTO_IMPL +// #undef DEFINE_SPP_IMMUTABLE_PROTO_IMPL + + ////////////////////////////////////////////////////////////// + // Vocabulary management methods. + // + // Returns the size of sentence pieces, which is the same as + // the size of vocabulary for NMT. + public native int GetPieceSize(); + + // Returns the vocab id of `piece`. + // Returns UNK(0) if `piece` is unknown. + public native int PieceToId(@ByVal @StdString String piece); + + // Returns the string representation of vocab with `id`. + public native @StdString BytePointer IdToPiece(int id); + + // Returns the score of `id`. + // Usually score is an emission log probability of unigram language + // model. + public native float GetScore(int id); + + // Returns true if `id` is unknown symbol. + public native @Cast("bool") boolean IsUnknown(int id); + + // Returns true if `id` is control symbol. + public native @Cast("bool") boolean IsControl(int id); + + // Returns true if `id` is unused symbol. + public native @Cast("bool") boolean IsUnused(int id); + + // Returns true if `id` is byte symbol. 
+ public native @Cast("bool") boolean IsByte(int id); + + // Returns the reserved id. + // Returns -1 if not defined. + + // Returns unknown (<unk>) id. + public native int unk_id(); + + // Returns BOS (<s>) id. + public native int bos_id(); + + // Returns EOS (</s>) id. + public native int eos_id(); + + // Returns PAD (<pad>) id. + public native int pad_id(); + + ////////////////////////////////////////////////////////////// + // Model management. + // + // Allows injection of a mock model instance. `model` is moved. + + // Allows injection of a normalizer instance. `normalizer` is moved. + + // Returns immutable model proto. Useful to obtain extended + // or experimental parameters encoded in model_proto. + public native @Const @ByRef ModelProto model_proto(); + + // Returns immutable model proto as std::string. + // Useful to save the state of this instance via Python's pickle object. + public native @StdString BytePointer serialized_model_proto(); +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceText.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceText.java new file mode 100644 index 00000000000..fadca5c148e --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceText.java @@ -0,0 +1,17 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + +@Namespace("sentencepiece") @Opaque @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class SentencePieceText extends Pointer { + /** Empty constructor. Calls {@code super((Pointer)null)}. */ + public SentencePieceText() { super((Pointer)null); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ + public SentencePieceText(Pointer p) { super(p); } +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceText_SentencePiece.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceText_SentencePiece.java new file mode 100644 index 00000000000..e9ff2367916 --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceText_SentencePiece.java @@ -0,0 +1,17 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + +@Namespace("sentencepiece") @Opaque @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class SentencePieceText_SentencePiece extends Pointer { + /** Empty constructor. Calls {@code super((Pointer)null)}. */ + public SentencePieceText_SentencePiece() { super((Pointer)null); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ + public SentencePieceText_SentencePiece(Pointer p) { super(p); } +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceTrainer.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceTrainer.java new file mode 100644 index 00000000000..3155b2b2d9f --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/SentencePieceTrainer.java @@ -0,0 +1,94 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + + +@Namespace("sentencepiece") @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class SentencePieceTrainer extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public SentencePieceTrainer(Pointer p) { super(p); } + + // Trains SentencePiece model with `trainer_spec`. + // Default `normalizer_spec` is used. + // When `sentence_iterator` is passed, load sentences from the iterator. + + + // Trains SentencePiece model with `trainer_spec` and + // `normalizer_spec`. + // When `sentence_iterator` is passed, load sentences from the iterator. + + + // Trains SentencePiece model with `trainer_spec`, `normalizer_spec` + // and `denormalizer_spec`. + // When `sentence_iterator` is passed, load sentences from the iterator. + + // Trains SentencePiece model with command-line string in `args`, + // e.g., + // '--input=data --model_prefix=m --vocab_size=8192 --model_type=unigram' + // When `sentence_iterator` is passed, load sentences from the iterator. + + + // Trains SentencePiece model with map in `kwargs`. 
+ // e.g., {{"input", "data"}, {"model_prefix", "m"}, {"vocab_size", "8192"}...} + + + // Handy function to make a normalizer spec from the pre-compiled + // normalization name. 
Do not use this method in production as it crashes + // When `name` is invalid. Useful for unittesting. + + + // Populates necessary fields (precompiled_charmap) from + // `NormalizerSpec::name` or `NormalizerSpec::normalization_rule_tsv`. + public static native @ByVal Status PopulateNormalizerSpec(NormalizerSpec normalizer_spec, + @Cast("bool") boolean is_denormalizer/*=false*/); + public static native @ByVal Status PopulateNormalizerSpec(NormalizerSpec normalizer_spec); + + // Overrides `trainer_spec`, `normalizer_spec`, `denormalizer_spec` with the + // std::unordered_map in `kargs`. + public static native @ByVal Status MergeSpecsFromArgs( + @Const @ByRef StringStringMap kwargs, + TrainerSpec trainer_spec, NormalizerSpec normalizer_spec, + NormalizerSpec denormalizer_spec); + + // Overrides `trainer_spec`, `normalizer_spec`, `denormalizer_spec` with the + // command line flags in `args`. + public static native @ByVal Status MergeSpecsFromArgs(@ByVal @StdString String args, + TrainerSpec trainer_spec, + NormalizerSpec normalizer_spec, + NormalizerSpec denormalizer_spec); + + // Injects global pre-tokenizer that are applied in training time. + // Pretokenizer is only used for extracting pieces. + // TODO(taku): It would be better to inject per `trainer_spec`. + public static native @ByVal Status SetPretokenizerForTraining( + @Const PretokenizerForTrainingInterface pretokenizer); + + // Returns the current pretokenizer. if no pretokenizer is defined, returns + // nullptr. + public static native @Const PretokenizerForTrainingInterface GetPretokenizerForTraining(); + + // Helper function to set `field_name=value` in `message`. + // When `field_name` is repeated, multiple values can be passed + // with comma-separated values. `field_name` must not be a nested message. 
+ // The body of these functions are automatically generated with + // data/gen_spec_parser.pl + public static native @ByVal Status SetProtoField(@ByVal @StdString String name, + @ByVal @StdString String value, + TrainerSpec message); + + public static native @ByVal Status SetProtoField(@ByVal @StdString String name, + @ByVal @StdString String value, + NormalizerSpec message); + + // Populates model type from string representation, e.g., "bpe". + // Supported model: "unigram", "bpe", "word", "char". + public static native @ByVal Status PopulateModelTypeFromString(@ByVal @StdString String type, + TrainerSpec trainer_spec); +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/Status.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/Status.java new file mode 100644 index 00000000000..b27d76aed1e --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/Status.java @@ -0,0 +1,46 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.sentencepiece.global.sentencepiece.*; + + +@Namespace("sentencepiece::util") @NoOffset @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class) +public class Status extends Pointer { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public Status(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ + public Status(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public Status position(long position) { + return (Status)super.position(position); + } + @Override public Status getPointer(long i) { + return new Status((Pointer)this).offsetAddress(i); + } + + public Status() { super((Pointer)null); allocate(); } + private native void allocate(); + public Status(@Cast("sentencepiece::util::StatusCode") int code, @ByVal @StdString String error_message) { super((Pointer)null); allocate(code, error_message); } + private native void allocate(@Cast("sentencepiece::util::StatusCode") int code, @ByVal @StdString String error_message); + public Status(@Const @ByRef Status s) { super((Pointer)null); allocate(s); } + private native void allocate(@Const @ByRef Status s); + public native @Name("operator =") void put(@Const @ByRef Status s); + public native @Cast("bool") @Name("operator ==") boolean equals(@Const @ByRef Status s); + public native @Cast("bool") @Name("operator !=") boolean notEquals(@Const @ByRef Status s); + public native @Cast("bool") boolean ok(); + + public native void set_error_message(@Cast("const char*") BytePointer str); + public native void set_error_message(String str); + public native @Cast("const char*") BytePointer error_message(); + public native @Cast("const char*") BytePointer message(); + public native @Cast("sentencepiece::util::StatusCode") int code(); + public native @StdString BytePointer ToString(); + + public native void IgnoreError(); +} diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringStringMap.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringStringMap.java new file mode 100644 index 00000000000..952580cba5b --- /dev/null +++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringStringMap.java @@ -0,0 +1,40 @@ +// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.sentencepiece; 
+
+import java.nio.*;
+import org.bytedeco.javacpp.*;
+import org.bytedeco.javacpp.annotation.*;
+
+import static org.bytedeco.sentencepiece.global.sentencepiece.*;
+
+@Name("std::unordered_map<std::string,std::string>") @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class)
+public class StringStringMap extends Pointer {
+    static { Loader.load(); }
+    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+    public StringStringMap(Pointer p) { super(p); }
+    public StringStringMap() { allocate(); }
+    private native void allocate();
+    public native @Name("operator =") @ByRef StringStringMap put(@ByRef StringStringMap x);
+
+    public boolean empty() { return size() == 0; }
+    public native long size();
+
+    @Index public native @StdString BytePointer get(@StdString BytePointer i);
+    public native StringStringMap put(@StdString BytePointer i, BytePointer value);
+    @ValueSetter @Index public native StringStringMap put(@StdString BytePointer i, @StdString String value);
+
+    public native void erase(@ByVal Iterator pos);
+    public native @ByVal Iterator begin();
+    public native @ByVal Iterator end();
+    @NoOffset @Name("iterator") public static class Iterator extends Pointer {
+        public Iterator(Pointer p) { super(p); }
+        public Iterator() { }
+
+        public native @Name("operator ++") @ByRef Iterator increment();
+        public native @Name("operator ==") boolean equals(@ByRef Iterator it);
+        public native @Name("operator *().first") @MemberGetter @StdString BytePointer first();
+        public native @Name("operator *().second") @MemberGetter @StdString BytePointer second();
+    }
+}
+
diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringVector.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringVector.java
new file mode 100644
index 00000000000..74855bbc255
--- /dev/null
+++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringVector.java
@@ -0,0 +1,99 @@
+// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE
+
+package org.bytedeco.sentencepiece;
+
+import java.nio.*;
+import org.bytedeco.javacpp.*;
+import org.bytedeco.javacpp.annotation.*;
+
+import static org.bytedeco.sentencepiece.global.sentencepiece.*;
+
+@Name("std::vector<std::string>") @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class)
+public class StringVector extends Pointer {
+    static { Loader.load(); }
+    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+    public StringVector(Pointer p) { super(p); }
+    public StringVector(BytePointer value) { this(1); put(0, value); }
+    public StringVector(BytePointer ... array) { this(array.length); put(array); }
+    public StringVector(String value) { this(1); put(0, value); }
+    public StringVector(String ... array) { this(array.length); put(array); }
+    public StringVector() { allocate(); }
+    public StringVector(long n) { allocate(n); }
+    private native void allocate();
+    private native void allocate(@Cast("size_t") long n);
+    public native @Name("operator =") @ByRef StringVector put(@ByRef StringVector x);
+
+    public boolean empty() { return size() == 0; }
+    public native long size();
+    public void clear() { resize(0); }
+    public native void resize(@Cast("size_t") long n);
+
+    @Index(function = "at") public native @StdString BytePointer get(@Cast("size_t") long i);
+    public native StringVector put(@Cast("size_t") long i, BytePointer value);
+    @ValueSetter @Index(function = "at") public native StringVector put(@Cast("size_t") long i, @StdString String value);
+
+    public native @ByVal Iterator insert(@ByVal Iterator pos, @StdString BytePointer value);
+    public native @ByVal Iterator erase(@ByVal Iterator pos);
+    public native @ByVal Iterator begin();
+    public native @ByVal Iterator end();
+    @NoOffset @Name("iterator") public static class Iterator extends Pointer {
+        public Iterator(Pointer p) { super(p); }
+        public Iterator() { }
+
+        public native @Name("operator ++") @ByRef Iterator increment();
+        public native @Name("operator ==") boolean equals(@ByRef Iterator it);
+        public native @Name("operator *") @StdString BytePointer get();
+    }
+
+    public BytePointer[] get() {
+        BytePointer[] array = new BytePointer[size() < Integer.MAX_VALUE ? (int)size() : Integer.MAX_VALUE];
+        for (int i = 0; i < array.length; i++) {
+            array[i] = get(i);
+        }
+        return array;
+    }
+    @Override public String toString() {
+        return java.util.Arrays.toString(get());
+    }
+
+    public BytePointer pop_back() {
+        long size = size();
+        BytePointer value = get(size - 1);
+        resize(size - 1);
+        return value;
+    }
+    public StringVector push_back(BytePointer value) {
+        long size = size();
+        resize(size + 1);
+        return put(size, value);
+    }
+    public StringVector put(BytePointer value) {
+        if (size() != 1) { resize(1); }
+        return put(0, value);
+    }
+    public StringVector put(BytePointer ... array) {
+        if (size() != array.length) { resize(array.length); }
+        for (int i = 0; i < array.length; i++) {
+            put(i, array[i]);
+        }
+        return this;
+    }
+
+    public StringVector push_back(String value) {
+        long size = size();
+        resize(size + 1);
+        return put(size, value);
+    }
+    public StringVector put(String value) {
+        if (size() != 1) { resize(1); }
+        return put(0, value);
+    }
+    public StringVector put(String ... array) {
+        if (size() != array.length) { resize(array.length); }
+        for (int i = 0; i < array.length; i++) {
+            put(i, array[i]);
+        }
+        return this;
+    }
+}
+
diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringVectorFloatPairVector.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringVectorFloatPairVector.java
new file mode 100644
index 00000000000..783f218cec2
--- /dev/null
+++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/StringVectorFloatPairVector.java
@@ -0,0 +1,39 @@
+// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE
+
+package org.bytedeco.sentencepiece;
+
+import java.nio.*;
+import org.bytedeco.javacpp.*;
+import org.bytedeco.javacpp.annotation.*;
+
+import static org.bytedeco.sentencepiece.global.sentencepiece.*;
+
+@Name("std::vector<std::pair<std::vector<std::string>,float> >") @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class)
+public class StringVectorFloatPairVector extends Pointer {
+    static { Loader.load(); }
+    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+    public StringVectorFloatPairVector(Pointer p) { super(p); }
+    public StringVectorFloatPairVector(StringVector[] firstValue, float[] secondValue) { this(Math.min(firstValue.length, secondValue.length)); put(firstValue, secondValue); }
+    public StringVectorFloatPairVector() { allocate(); }
+    public StringVectorFloatPairVector(long n) { allocate(n); }
+    private native void allocate();
+    private native void allocate(@Cast("size_t") long n);
+    public native @Name("operator =") @ByRef StringVectorFloatPairVector put(@ByRef StringVectorFloatPairVector x);
+
+    public boolean empty() { return size() == 0; }
+    public native long size();
+    public void clear() { resize(0); }
+    public native void resize(@Cast("size_t") long n);
+
+    @Index(function = "at") public native @ByRef StringVector first(@Cast("size_t") long i); public native StringVectorFloatPairVector first(@Cast("size_t") long i, StringVector first);
+    @Index(function = "at") public native float second(@Cast("size_t") long i); public native StringVectorFloatPairVector second(@Cast("size_t") long i, float second);
+
+    public StringVectorFloatPairVector put(StringVector[] firstValue, float[] secondValue) {
+        for (int i = 0; i < firstValue.length && i < secondValue.length; i++) {
+            first(i, firstValue[i]);
+            second(i, secondValue[i]);
+        }
+        return this;
+    }
+}
+
diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/TrainerSpec.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/TrainerSpec.java
new file mode 100644
index 00000000000..14bf58bf497
--- /dev/null
+++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/TrainerSpec.java
@@ -0,0 +1,18 @@
+// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE
+
+package org.bytedeco.sentencepiece;
+
+import java.nio.*;
+import org.bytedeco.javacpp.*;
+import org.bytedeco.javacpp.annotation.*;
+
+import static org.bytedeco.sentencepiece.global.sentencepiece.*;
+
+
+@Namespace("sentencepiece") @Opaque @Properties(inherit = org.bytedeco.sentencepiece.presets.sentencepiece.class)
+public class TrainerSpec extends Pointer {
+    /** Empty constructor. Calls {@code super((Pointer)null)}. */
+    public TrainerSpec() { super((Pointer)null); }
+    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+    public TrainerSpec(Pointer p) { super(p); }
+}
diff --git a/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/global/sentencepiece.java b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/global/sentencepiece.java
new file mode 100644
index 00000000000..85959b04b04
--- /dev/null
+++ b/sentencepiece/src/gen/java/org/bytedeco/sentencepiece/global/sentencepiece.java
@@ -0,0 +1,183 @@
+// Targeted by JavaCPP version 1.5.10-SNAPSHOT: DO NOT EDIT THIS FILE
+
+package org.bytedeco.sentencepiece.global;
+
+import org.bytedeco.sentencepiece.*;
+
+import java.nio.*;
+import org.bytedeco.javacpp.*;
+import org.bytedeco.javacpp.annotation.*;
+
+public class sentencepiece extends org.bytedeco.sentencepiece.presets.sentencepiece {
+    static { Loader.load(); }
+
+// Targeting ../StringVector.java
+
+
+// Targeting ../IntVector.java
+
+
+// Targeting ../StringVectorFloatPairVector.java
+
+
+// Targeting ../IntVectorFloatPairVector.java
+
+
+// Targeting ../StringStringMap.java
+
+
+// Parsed from <sentencepiece_processor.h>
+
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.!
+
+// #ifndef SENTENCEPIECE_PROCESSOR_H_
+// #define SENTENCEPIECE_PROCESSOR_H_
+
+// #include
+// #include
+// #include
+// #include
+// #include
+// #include
+
+// #ifndef SWIG
+  // namespace absl
+// #endif // SWIG
+
+/** enum class sentencepiece::util::StatusCode */
+public static final int
+  kOk = 0,
+  kCancelled = 1,
+  kUnknown = 2,
+  kInvalidArgument = 3,
+  kDeadlineExceeded = 4,
+  kNotFound = 5,
+  kAlreadyExists = 6,
+  kPermissionDenied = 7,
+  kResourceExhausted = 8,
+  kFailedPrecondition = 9,
+  kAborted = 10,
+  kOutOfRange = 11,
+  kUnimplemented = 12,
+  kInternal = 13,
+  kUnavailable = 14,
+  kDataLoss = 15,
+  kUnauthenticated = 16;
+// Targeting ../Status.java
+
+
+
+// Targeting ../NBestSentencePieceText.java
+
+
+// Targeting ../SentencePieceText.java
+
+
+// Targeting ../ModelProto.java
+
+
+  // namespace normalizer
+
+// #ifndef SWIGGO
+// Redefine std::string for serialized_proto interface as Python's string is
+// a Unicode string. We can enforce the return value to be raw byte sequence
+// with SWIG's typemap.
+  // namespace util
+// #endif // SWIGGO
+// Targeting ../SentencePieceText_SentencePiece.java
+
+
+// Targeting ../ImmutableSentencePieceText_ImmutableSentencePiece.java
+
+
+// Targeting ../ImmutableSentencePieceText.java
+
+
+// Targeting ../ImmutableNBestSentencePieceText.java
+
+
+// Targeting ../SentencePieceProcessor.java
+
+
+
+// Set seed value of random generator.
+// Do not set static_cast<unsigned int>(-1),
+// as this seed is reserved for initializing from
+// std::random_device.
+@Namespace("sentencepiece") public static native void SetRandomGeneratorSeed(@Cast("unsigned int") int seed);
+
+// IO related functions to absorb model formats.
+// Loads `model_proto` from `filename`.
+// We can instantiate SentencePieceProcessor as follows:
+//
+//   auto model_proto = absl::make_unique<ModelProto>();
+//   io::LoadModelProto("//path/spm.model", model_proto.get());
+//   SentencePieceProcessor sp;
+//   CHECK_OK(sp.Load(std::move(model_proto)));
+@Namespace("sentencepiece::io") public static native @ByVal Status LoadModelProto(@ByVal @StdString String arg0, ModelProto model_proto);
+
+// Saves `model_proto` as `filename`.
+@Namespace("sentencepiece::io") public static native @ByVal Status SaveModelProto(@ByVal @StdString String arg0, @Const @ByRef ModelProto model_proto);
+  // namespace io
+  // namespace sentencepiece
+// #endif // SENTENCEPIECE_PROCESSOR_H_
+
+
+// Parsed from <sentencepiece_trainer.h>
+
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.!
+
+// #ifndef SENTENCEPIECE_TRAINER_H_
+// #define SENTENCEPIECE_TRAINER_H_
+
+// #include
+// #include
+
+// #include "sentencepiece_processor.h"
+// Targeting ../TrainerSpec.java
+
+
+// Targeting ../NormalizerSpec.java
+
+
+// Targeting ../PretokenizerForTrainingInterface.java
+
+
+
+// Targeting ../SentenceIterator.java
+
+
+// Targeting ../SentencePieceTrainer.java
+
+
+
+  // namespace sentencepiece
+
+// #endif // SENTENCEPIECE_TRAINER_H_
+
+
+}
diff --git a/sentencepiece/src/main/java/org/bytedeco/sentencepiece/presets/sentencepiece.java b/sentencepiece/src/main/java/org/bytedeco/sentencepiece/presets/sentencepiece.java
new file mode 100644
index 00000000000..a5239654af3
--- /dev/null
+++ b/sentencepiece/src/main/java/org/bytedeco/sentencepiece/presets/sentencepiece.java
@@ -0,0 +1,58 @@
+package org.bytedeco.sentencepiece.presets;
+
+import org.bytedeco.javacpp.*;
+import org.bytedeco.javacpp.annotation.*;
+import org.bytedeco.javacpp.tools.*;
+
+/**
+ * JavaCPP preset configuration for the sentencepiece native library.
+ *
+ * <p>The {@code @Properties} annotation below names the C++ headers to parse and the library to
+ * link, and routes generated classes to {@code target} and global declarations to {@code global}.
+ */
+@Properties(
+    value = {
+        @Platform(compiler = "cpp17", include = {"<sentencepiece_processor.h>", "<sentencepiece_trainer.h>"}, link = "sentencepiece"),
+        @Platform(value = "windows", link = "sentencepiece#")
+    },
+    target = "org.bytedeco.sentencepiece",
+    global = "org.bytedeco.sentencepiece.global.sentencepiece"
+)
+public class sentencepiece implements InfoMapper {
+    static {
+        // Run JavaCPP's version check for the org.bytedeco:sentencepiece artifact at class-load time.
+        Loader.checkVersion("org.bytedeco", "sentencepiece");
+    }
+
+    /**
+     * Registers the parser hints used when generating the bindings.
+     *
+     * <p>Each C++ container instantiation is mapped to its own generated pointer class,
+     * {@code string_view} is mapped to {@code @StdString String}, and the symbols listed in the
+     * last {@code Info} are skipped.
+     *
+     * @param infoMap the map that receives the {@link Info} entries
+     */
+    @Override
+    public void map(InfoMap infoMap) {
+        infoMap
+            .put(new Info("SPP_SWIG_CHECK_AND_THROW").cppTypes().annotations())
+            .put(new Info("std::unordered_map<std::string,std::string>").pointerTypes("StringStringMap").define())
+            .put(new Info("string_view", "absl::string_view").pointerTypes("@StdString String"))
+            .put(new Info("std::vector<std::string>").pointerTypes("StringVector").define())
+            .put(new Info("std::vector<int>").pointerTypes("IntVector").define())
+            .put(new Info("std::vector<std::pair<std::vector<std::string>,float> >").pointerTypes("StringVectorFloatPairVector").define())
+            .put(new Info("std::vector<std::pair<std::vector<int>,float> >").pointerTypes("IntVectorFloatPairVector").define())
+            .put(new Info(
+                "sentencepiece::ModelInterface",
+                "sentencepiece::normalizer::Normalizer",
+                "sentencepiece::SentencePieceTrainer::GetNormalizerSpec",
+                "sentencepiece::SentencePieceTrainer::Train",
+                "sentencepiece::SentencePieceProcessor::DecodePiecesAsImmutableProto",
+                "sentencepiece::SentencePieceProcessor::DecodePiecesAsSerializedProto",
+                "sentencepiece::SentencePieceProcessor::DecodePieces",
+                "sentencepiece::SentencePieceProcessor::Decode",
+                "sentencepiece::SentencePieceProcessor::SetVocabulary"
+            ).skip());
+    }
+}
\ No newline at end of file
diff --git a/sentencepiece/src/main/java9/module-info.java b/sentencepiece/src/main/java9/module-info.java
new file mode 100644
index 00000000000..bb865b013e3
--- /dev/null
+++ b/sentencepiece/src/main/java9/module-info.java
@@ -0,0 +1,6 @@
+module org.bytedeco.sentencepiece {
+  requires transitive org.bytedeco.javacpp;
+  exports org.bytedeco.sentencepiece.global;
+  exports org.bytedeco.sentencepiece.presets;
+  exports org.bytedeco.sentencepiece;
+}