diff --git a/.github/workflows/arm64_centos7.yml b/.github/workflows/arm64_centos7.yml index 3f218d93..5380fdf9 100644 --- a/.github/workflows/arm64_centos7.yml +++ b/.github/workflows/arm64_centos7.yml @@ -10,11 +10,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true lfs: true - - uses: bab2min/run-on-arch-action@use-custom-image + - uses: bab2min/run-on-arch-action@multiple-step id: runcmd with: image: quay.io/pypa/manylinux2014_aarch64 @@ -23,24 +23,30 @@ jobs: mkdir -p "${PWD}/artifacts" dockerRunArgs: | --volume "${PWD}/artifacts:/artifacts" - run: | - yum install java-1.8.0-openjdk-devel -y - mkdir build && pushd build && cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_USE_MIMALLOC=0 -DKIWI_JAVA_BINDING=1 .. - make -j2 && popd - ./build/test/kiwi-test - mkdir eval_results && ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt -o eval_results/ && ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt --sbg -o eval_results/ - mkdir eval_results_balanced && KIWI_ARCH_TYPE=balanced ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt -o eval_results_balanced/ - cp -r build /artifacts/ - cp -r eval_results /artifacts/ - - curl -OL https://latina.bab2min.pe.kr/_data/kowiki1000.txt - KIWI_ARCH_TYPE=none ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt - KIWI_ARCH_TYPE=balanced ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt - KIWI_ARCH_TYPE=neon ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt + multipleRun: | + - name: Install dependencies + run: | + yum install java-1.8.0-openjdk-devel -y + - name: Build + run: | + mkdir build && pushd build && cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_USE_MIMALLOC=0 -DKIWI_JAVA_BINDING=1 .. + make -j2 && popd + - name: Test + run: | + ./build/test/kiwi-test + mkdir eval_results && ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt -o eval_results/ && ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt --sbg -o eval_results/ + cp -r build /artifacts/ + cp -r eval_results /artifacts/ + - name: Benchmark + run: | + curl -OL https://latina.bab2min.pe.kr/_data/kowiki1000.txt + KIWI_ARCH_TYPE=none ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt + KIWI_ARCH_TYPE=balanced ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt + KIWI_ARCH_TYPE=neon ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt - name: Archive binaries - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: Artifacts ${{ matrix.name }} + name: Artifacts Arm64-Centos7 path: | artifacts/build/*kiwi* artifacts/build/test/*kiwi* diff --git a/.github/workflows/centos5.yml b/.github/workflows/centos5.yml deleted file mode 100644 index 65e4f2a6..00000000 --- a/.github/workflows/centos5.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Centos5 - -on: - pull_request: - branches: [ main ] - -jobs: - build-centos: - name: Centos5 - runs-on: ubuntu-latest - container: - image: docker://hoshizora/manylinux1-clang_x86_64 - - steps: - - name: Install Git LFS - run: | - mkdir ../gitlfs && pushd ../gitlfs - curl -L https://github.com/git-lfs/git-lfs/releases/download/v2.13.2/git-lfs-linux-amd64-v2.13.2.tar.gz | tar -zxv - ./install.sh - popd - - uses: actions/checkout@v1 - with: - submodules: true - - name: Pull LFS files - run: git lfs pull - - name: Update CMake - run: /opt/python/cp36-cp36m/bin/pip install "cmake<3.20" - - name: Checkout Old Version googletest - run: cd third_party/googletest && git checkout v1.8.x && cd ../.. - - name: Configure Build - run: mkdir build && cd build && /opt/python/cp36-cp36m/bin/cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_USE_MIMALLOC=0 -DKIWI_USE_CPUINFO=0 .. - - name: Build - run: cd build && make -j2 - - name: Run Unit Test - run: ./build/test/kiwi-test - - name: Run Unit Test in Debug mode - if: failure() - run: | - mkdir build_debug && cd build_debug && /opt/python/cp36-cp36m/bin/cmake -DCMAKE_BUILD_TYPE=Debug -DKIWI_USE_MIMALLOC=0 -DKIWI_USE_CPUINFO=0 .. && make -j2 - cd .. && gdb -ex run -ex bt ./build_debug/test/kiwi-test - - name: Run Evaluator - run: | - mkdir eval_results - ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt -o eval_results/ - ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt --sbg -o eval_results/ - - run: tar -zcvf arts.tgz build/*kiwi* build/test/*kiwi* eval_results/*.txt - - name: Archive binaries - uses: actions/upload-artifact@v1 - with: - name: Artifacts Centos5 - path: arts.tgz diff --git a/.github/workflows/centos6.yml b/.github/workflows/centos6.yml deleted file mode 100644 index cc3b786e..00000000 --- a/.github/workflows/centos6.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: Centos6 - -on: - pull_request: - branches: [ main ] - -jobs: - build-centos6: - name: Centos6 - runs-on: ubuntu-latest - container: - image: docker://quay.io/pypa/manylinux2010_x86_64 - - steps: - - name: Install Git LFS - run: | - mkdir ../gitlfs && pushd ../gitlfs - curl -L https://github.com/git-lfs/git-lfs/releases/download/v2.10.0/git-lfs-linux-amd64-v2.10.0.tar.gz | tar -zxv - ./install.sh - popd - - uses: actions/checkout@v1 - with: - submodules: true - - name: Pull LFS files - run: git config --global --add safe.directory /__w/Kiwi/Kiwi && git lfs pull - - name: Update CMake - run: | - /opt/python/cp36-cp36m/bin/pip install cmake - yum install java-1.8.0-openjdk-devel.x86_64 -y - - name: Configure Build - run: mkdir build && cd build && /opt/python/cp36-cp36m/bin/cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_JAVA_BINDING=1 .. - - name: Build - run: cd build && make -j2 - - name: Run Unit Test - run: ./build/test/kiwi-test - - name: Run Java Unit Test - run: | - cd bindings/java - curl -OL https://repo1.maven.org/maven2/junit/junit/4.13.2/junit-4.13.2.jar - curl -OL https://repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar - javac -cp "*:../../build/bindings/java/*:." -encoding utf-8 kr/pe/bab2min/KiwiTest.java - java -cp "*:../../build/bindings/java/*:." org.junit.runner.JUnitCore kr.pe.bab2min.KiwiTest - - name: Run Evaluator - run: | - mkdir eval_results - ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt -o eval_results/ - ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt --sbg -o eval_results/ - - run: tar -zcvf arts.tgz build/*kiwi* build/test/*kiwi* eval_results/*.txt build/bindings/java/*.jar - - name: Archive binaries - uses: actions/upload-artifact@v1 - with: - name: Artifacts Centos6 - path: arts.tgz diff --git a/.github/workflows/centos7.yml b/.github/workflows/centos7.yml new file mode 100644 index 00000000..6a152373 --- /dev/null +++ b/.github/workflows/centos7.yml @@ -0,0 +1,50 @@ +name: Centos7 + +on: + pull_request: + branches: [ main ] + +jobs: + build-centos7: + name: Centos7 + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + with: + submodules: true + lfs: true + - uses: bab2min/run-on-arch-action@multiple-step + id: runcmd + with: + image: quay.io/pypa/manylinux2014_x86_64 + githubToken: ${{ github.token }} + multipleRun: | + - name: Update CMake + run: | + /opt/python/cp310-cp310/bin/pip install cmake + yum install java-1.8.0-openjdk-devel.x86_64 -y + - name: Configure Build + run: mkdir build && cd build && /opt/python/cp310-cp310/bin/cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_JAVA_BINDING=1 .. + - name: Build + run: cd build && make -j2 + - name: Run Unit Test + run: ./build/test/kiwi-test + - name: Run Java Unit Test + run: | + cd bindings/java + curl -OL https://repo1.maven.org/maven2/junit/junit/4.13.2/junit-4.13.2.jar + curl -OL https://repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar + javac -cp "*:../../build/bindings/java/*:." -encoding utf-8 kr/pe/bab2min/KiwiTest.java + java -cp "*:../../build/bindings/java/*:." org.junit.runner.JUnitCore kr.pe.bab2min.KiwiTest + - name: Run Evaluator + run: | + mkdir eval_results + ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt -o eval_results/ + ./build/kiwi-evaluator -m ./ModelGenerator eval_data/*.txt --sbg -o eval_results/ + - run: tar -zcvf arts.tgz build/*kiwi* build/test/*kiwi* eval_results/*.txt build/bindings/java/*.jar + - name: Archive binaries + uses: actions/upload-artifact@v3 + with: + name: Artifacts Centos7 + path: arts.tgz diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 6860dcdd..0a8d55f0 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -24,7 +24,7 @@ jobs: name: ${{ matrix.name }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true lfs: true @@ -80,7 +80,7 @@ jobs: KIWI_ARCH_TYPE=neon ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out --typos 6 kowiki1000.txt fi - name: Archive binaries - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: Artifacts ${{ matrix.name }} path: | diff --git a/.github/workflows/ppc64le_centos7.yml b/.github/workflows/ppc64le_centos7.yml index 714e41f0..2837048b 100644 --- a/.github/workflows/ppc64le_centos7.yml +++ b/.github/workflows/ppc64le_centos7.yml @@ -10,11 +10,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true lfs: true - - uses: bab2min/run-on-arch-action@use-custom-image + - uses: bab2min/run-on-arch-action@multiple-step id: runcmd with: image: quay.io/pypa/manylinux2014_ppc64le @@ -32,11 +32,11 @@ jobs: cp -r build /artifacts/ cp -r eval_results /artifacts/ - name: Archive binaries - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: Artifacts ${{ matrix.name }} + name: Artifacts PPC64LE-Centos7 path: | artifacts/build/*kiwi* artifacts/build/test/*kiwi* artifacts/eval_results/*.txt - artifacts/build/binding/java/*.jar \ No newline at end of file + artifacts/build/binding/java/*.jar diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c6625944..1070c9f3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -5,82 +5,32 @@ on: types: [created] jobs: - build-centos: - name: Centos5 + build-centos7: + name: Centos7 runs-on: ubuntu-latest - container: - image: docker://hoshizora/manylinux1-clang_x86_64 steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v3 with: submodules: true - - name: Checkout Old Version googletest - run: cd third_party/googletest && git checkout v1.8.x && cd ../.. - - name: Update CMake - run: /opt/python/cp36-cp36m/bin/pip install "cmake<3.20" - - name: Configure Build - run: mkdir build && cd build && /opt/python/cp36-cp36m/bin/cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_USE_MIMALLOC=0 -DKIWI_USE_CPUINFO=0 .. - - name: Build - run: cd build && make -j2 - - run: | - mkdir result - cd result - mkdir include && mkdir lib && mkdir bin - mv ../build/libkiwi* lib/ - mv ../build/kiwi-* bin/ - mv ../build/test/kiwi-* bin/ - cp -r ../include/kiwi include/ - tar -zcvf ../asset.tgz include/ lib/ bin/ - - name: Archive binaries - uses: actions/upload-artifact@v1 - with: - name: asset_Centos5 - path: asset.tgz - - upload-asset-from-centos: - name: Upload asset from Centos5 - needs: build-centos - runs-on: ubuntu-latest - - steps: - - uses: actions/download-artifact@v2 - with: - name: asset_Centos5 - path: arts - name: Get release id: get_release - uses: bruceadams/get-release@v1.2.2 - env: - GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} - - name: Upload release binary - uses: actions/upload-release-asset@v1.0.2 + uses: bruceadams/get-release@v1.3.2 env: GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} + - uses: bab2min/run-on-arch-action@multiple-step with: - upload_url: ${{ steps.get_release.outputs.upload_url }} - asset_path: arts/asset.tgz - asset_name: kiwi_lnx_centos5_x86_64_${{ steps.get_release.outputs.tag_name }}.tgz - asset_content_type: application/octet-stream - - build-centos6: - name: Centos6 - runs-on: ubuntu-latest - container: - image: docker://quay.io/pypa/manylinux2010_x86_64 - - steps: - - uses: actions/checkout@v1 - with: - submodules: true - - name: Update CMake - run: | - /opt/python/cp36-cp36m/bin/pip install cmake - yum install java-1.8.0-openjdk-devel.x86_64 -y - - name: Configure Build - run: mkdir build && cd build && /opt/python/cp36-cp36m/bin/cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_JAVA_BINDING=1 .. - - name: Build - run: cd build && make -j2 + image: quay.io/pypa/manylinux2014_x86_64 + githubToken: ${{ github.token }} + multipleRun: | + - name: Update CMake + run: | + /opt/python/cp310-cp310/bin/pip install cmake + yum install java-1.8.0-openjdk-devel.x86_64 -y + - name: Configure Build + run: mkdir build && cd build && /opt/python/cp310-cp310/bin/cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_JAVA_BINDING=1 .. + - name: Build + run: cd build && make -j2 - run: | mkdir result cd result @@ -90,54 +40,24 @@ jobs: mv ../build/test/kiwi-* bin/ cp -r ../include/kiwi include/ tar -zcvf ../asset.tgz include/ lib/ bin/ - - name: Archive binaries - uses: actions/upload-artifact@v1 - with: - name: asset_Centos6 - path: asset.tgz - - run: | - mv build/bindings/java/kiwi-java*.jar kiwi-java.jar - - name: Archive jar binaries - uses: actions/upload-artifact@v1 - with: - name: jar_Centos6 - path: kiwi-java.jar - - upload-asset-from-centos6: - name: Upload asset from Centos6 - needs: build-centos6 - runs-on: ubuntu-latest - - steps: - - name: Get release - id: get_release - uses: bruceadams/get-release@v1.2.2 - env: - GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} - - uses: actions/download-artifact@v2 - with: - name: asset_Centos6 - path: arts - name: Upload release binary uses: actions/upload-release-asset@v1.0.2 env: GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} with: upload_url: ${{ steps.get_release.outputs.upload_url }} - asset_path: arts/asset.tgz + asset_path: asset.tgz asset_name: kiwi_lnx_x86_64_${{ steps.get_release.outputs.tag_name }}.tgz asset_content_type: application/octet-stream - - uses: actions/download-artifact@v2 - with: - name: jar_Centos6 - path: arts + - run: | + mv build/bindings/java/kiwi-java*.jar kiwi-java.jar - name: Upload release jar uses: actions/upload-release-asset@v1.0.2 env: GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} with: upload_url: ${{ steps.get_release.outputs.upload_url }} - asset_path: arts/kiwi-java.jar + asset_path: kiwi-java.jar asset_name: kiwi-java-${{ steps.get_release.outputs.tag_name }}-lnx-x86-64.jar asset_content_type: application/octet-stream @@ -151,7 +71,7 @@ jobs: name: ${{ matrix.os }} - ${{ matrix.arch }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true lfs: true @@ -216,7 +136,7 @@ jobs: name: ${{ matrix.os }} ${{ matrix.arch }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true lfs: true @@ -276,7 +196,7 @@ jobs: name: Centos7-${{ matrix.arch }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true lfs: true @@ -331,7 +251,7 @@ jobs: name: Emscripten runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true lfs: true diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 700b24c0..e7235c25 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -26,7 +26,7 @@ jobs: name: ${{ matrix.name }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true lfs: true @@ -75,7 +75,7 @@ jobs: KIWI_ARCH_TYPE=avx512bw ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out --sbg kowiki1000.txt KIWI_ARCH_TYPE=avx512bw ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out --typos 6 kowiki1000.txt - name: Archive binaries - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: Artifacts ${{ matrix.name }} path: | diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index c37fc220..a1123b4e 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -15,7 +15,7 @@ jobs: name: ${{ matrix.os }} ${{ matrix.arch }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true lfs: true @@ -38,7 +38,7 @@ jobs: .\build\Release\kiwi-evaluator.exe -m .\ModelGenerator (Get-ChildItem eval_data\*.txt | Select-Object -Expand FullName) -o eval_results\ .\build\Release\kiwi-evaluator.exe -m .\ModelGenerator --sbg (Get-ChildItem eval_data\*.txt | Select-Object -Expand FullName) -o eval_results\ - name: Archive binaries - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: Artifacts ${{ matrix.os }} ${{ matrix.arch }}bit path: | diff --git a/bindings/java/kiwi_java.cpp b/bindings/java/kiwi_java.cpp index 5db1f758..9013d9fe 100644 --- a/bindings/java/kiwi_java.cpp +++ b/bindings/java/kiwi_java.cpp @@ -493,6 +493,16 @@ class JTypoTransformer : public kiwi::TypoTransformer, jni::JObject @@ -547,7 +557,11 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved) jni::define() .template ctor<>() .template method<&JTypoTransformer::addTypo>("_addTypo") - .template method<&JTypoTransformer::setContinualTypoCost>("_setContinualTypoCost"), + .template method<&JTypoTransformer::setContinualTypoCost>("_setContinualTypoCost") + .template method<&JTypoTransformer::setLengtheningTypoCost>("_setLengtheningTypoCost") + .template method<&JTypoTransformer::copy>("copy") + .template method<&JTypoTransformer::update>("_update") + .template method<&JTypoTransformer::scaleCost>("_scaleCost"), jni::define() .template ctor() diff --git a/bindings/java/kr/pe/bab2min/KiwiBuilder.java b/bindings/java/kr/pe/bab2min/KiwiBuilder.java index a440c602..5cfdfd17 100644 --- a/bindings/java/kr/pe/bab2min/KiwiBuilder.java +++ b/bindings/java/kr/pe/bab2min/KiwiBuilder.java @@ -63,8 +63,12 @@ public boolean isAlive() { @Override public native void close() throws Exception; + public native TypoTransformer copy(); public native void _addTypo(String orig, String error, float cost, byte convVowel); + public native void _update(TypoTransformer src); + public native void _scaleCost(float scale); public native void _setContinualTypoCost(float cost); + public native void _setLengtheningTypoCost(float cost); public TypoTransformer addTypo(String orig, String error, float cost, byte convVowel) { _addTypo(orig, error, cost, convVowel); @@ -80,10 +84,29 @@ public TypoTransformer addTypo(String[] orig, String[] error, float cost, byte c return this; } + // Set continual typo cost (inplace) public TypoTransformer setContinualTypoCost(float cost) { _setContinualTypoCost(cost); return this; } + + // Set lengthening typo cost (inplace) + public TypoTransformer setLengtheningTypoCost(float cost) { + _setLengtheningTypoCost(cost); + return this; + } + + // Inplace update + public TypoTransformer update(TypoTransformer src) { + _update(src); + return this; + } + + // Inplace scaling + public TypoTransformer scaleCost(float scale) { + _scaleCost(scale); + return this; + } } public KiwiBuilder(long _inst) { @@ -241,106 +264,7 @@ public Kiwi build(TypoTransformer typos) { .addTypo(new String[]{"ᆶ"}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none) .addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none); - final public static TypoTransformer basicTypoSetWithContinual = new TypoTransformer() - .addTypo(new String[]{"ㅐ", "ㅔ"}, new String[]{"ㅐ", "ㅔ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ㅐ", "ㅔ"}, new String[]{"ㅒ", "ㅖ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ㅒ", "ㅖ"}, new String[]{"ㅐ", "ㅔ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ㅒ", "ㅖ"}, new String[]{"ㅒ", "ㅖ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ㅚ", "ㅙ", "ㅞ"}, new String[]{"ㅚ", "ㅙ", "ㅞ", "ㅐ", "ㅔ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ㅝ"}, new String[]{"ㅗ", "ㅓ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ㅟ", "ㅢ"}, new String[]{"ㅣ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"위", "의"}, new String[]{"이"}, Float.POSITIVE_INFINITY, CondVowel.none) - .addTypo(new String[]{"위", "의"}, new String[]{"이"}, 1.f, CondVowel.any) - .addTypo(new String[]{"자", "쟈"}, new String[]{"자", "쟈"}, 1.f, CondVowel.none) - .addTypo(new String[]{"재", "쟤"}, new String[]{"재", "쟤"}, 1.f, CondVowel.none) - .addTypo(new String[]{"저", "져"}, new String[]{"저", "져"}, 1.f, CondVowel.none) - .addTypo(new String[]{"제", "졔"}, new String[]{"제", "졔"}, 1.f, CondVowel.none) - .addTypo(new String[]{"조", "죠", "줘"}, new String[]{"조", "죠", "줘"}, 1.f, CondVowel.none) - .addTypo(new String[]{"주", "쥬"}, new String[]{"주", "쥬"}, 1.f, CondVowel.none) - .addTypo(new String[]{"차", "챠"}, new String[]{"차", "챠"}, 1.f, CondVowel.none) - .addTypo(new String[]{"채", "챼"}, new String[]{"채", "챼"}, 1.f, CondVowel.none) - .addTypo(new String[]{"처", "쳐"}, new String[]{"처", "쳐"}, 1.f, CondVowel.none) - .addTypo(new String[]{"체", "쳬"}, new String[]{"체", "쳬"}, 1.f, CondVowel.none) - .addTypo(new String[]{"초", "쵸", "춰"}, new String[]{"초", "쵸", "춰"}, 1.f, CondVowel.none) - .addTypo(new String[]{"추", "츄"}, new String[]{"추", "츄"}, 1.f, CondVowel.none) - .addTypo(new String[]{"유", "류"}, new String[]{"유", "류"}, 1.f, CondVowel.none) - .addTypo(new String[]{"므", "무"}, new String[]{"므", "무"}, 1.f, CondVowel.none) - .addTypo(new String[]{"브", "부"}, new String[]{"브", "부"}, 1.f, CondVowel.none) - .addTypo(new String[]{"프", "푸"}, new String[]{"프", "푸"}, 1.f, CondVowel.none) - .addTypo(new String[]{"르", "루"}, new String[]{"르", "루"}, 1.f, CondVowel.none) - .addTypo(new String[]{"러", "뤄"}, new String[]{"러", "뤄"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆩ", "ᆪ"}, new String[]{"ᆨ", "ᆩ", "ᆪ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ᆬ", "ᆭ"}, new String[]{"ᆫ", "ᆬ", "ᆭ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ᆰ", "ᆱ", "ᆲ", "ᆳ", "ᆴ", "ᆵ", "ᆶ"}, new String[]{"ᆯ", "ᆰ", "ᆱ", "ᆲ", "ᆳ", "ᆴ", "ᆵ", "ᆶ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ᆺ", "ᆻ"}, new String[]{"ᆺ", "ᆻ"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"안"}, new String[]{"않"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"맞추", "맞히"}, new String[]{"맞추", "맞히"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"맞춰", "맞혀"}, new String[]{"맞춰", "맞혀"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"받치", "바치", "받히"}, new String[]{"받치", "바치", "받히"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"받쳐", "바쳐", "받혀"}, new String[]{"받쳐", "바쳐", "받혀"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"던", "든"}, new String[]{"던", "든"}, 1.f, CondVowel.none) - .addTypo(new String[]{"때", "데"}, new String[]{"때", "데"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"빛", "빚"}, new String[]{"빛", "빚"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"ᆮ이", "지"}, new String[]{"ᆮ이", "지"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆮ여", "져"}, new String[]{"ᆮ여", "져"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᇀ이", "치"}, new String[]{"ᇀ이", "치"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᇀ여", "쳐"}, new String[]{"ᇀ여", "쳐"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"ᄀ", "ᄁ"}, new String[]{"ᄀ", "ᄁ"}, 1.f, CondVowel.applosive) - .addTypo(new String[]{"ᄃ", "ᄄ"}, new String[]{"ᄃ", "ᄄ"}, 1.f, CondVowel.applosive) - .addTypo(new String[]{"ᄇ", "ᄈ"}, new String[]{"ᄇ", "ᄈ"}, 1.f, CondVowel.applosive) - .addTypo(new String[]{"ᄉ", "ᄊ"}, new String[]{"ᄉ", "ᄊ"}, 1.f, CondVowel.applosive) - .addTypo(new String[]{"ᄌ", "ᄍ"}, new String[]{"ᄌ", "ᄍ"}, 1.f, CondVowel.applosive) - - .addTypo(new String[]{"ᇂᄒ", "ᆨᄒ", "ᇂᄀ"}, new String[]{"ᇂᄒ", "ᆨᄒ", "ᇂᄀ"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"ᆨᄂ", "ᆩᄂ", "ᆪᄂ", "ᆿᄂ", "ᆼᄂ"}, new String[]{"ᆨᄂ", "ᆩᄂ", "ᆪᄂ", "ᆿᄂ", "ᆼᄂ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆨᄆ", "ᆩᄆ", "ᆪᄆ", "ᆿᄆ", "ᆼᄆ"}, new String[]{"ᆨᄆ", "ᆩᄆ", "ᆪᄆ", "ᆿᄆ", "ᆼᄆ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆨᄅ", "ᆩᄅ", "ᆪᄅ", "ᆿᄅ", "ᆼᄅ", "ᆼᄂ",}, new String[]{"ᆨᄅ", "ᆩᄅ", "ᆪᄅ", "ᆿᄅ", "ᆼᄅ", "ᆼᄂ",}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆮᄂ", "ᆺᄂ", "ᆻᄂ", "ᆽᄂ", "ᆾᄂ", "ᇀᄂ", "ᆫᄂ"}, new String[]{"ᆮᄂ", "ᆺᄂ", "ᆻᄂ", "ᆽᄂ", "ᆾᄂ", "ᇀᄂ", "ᆫᄂ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆮᄆ", "ᆺᄆ", "ᆻᄆ", "ᆽᄆ", "ᆾᄆ", "ᇀᄆ", "ᆫᄆ"}, new String[]{"ᆮᄆ", "ᆺᄆ", "ᆻᄆ", "ᆽᄆ", "ᆾᄆ", "ᇀᄆ", "ᆫᄆ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆮᄅ", "ᆺᄅ", "ᆻᄅ", "ᆽᄅ", "ᆾᄅ", "ᇀᄅ", "ᆫᄅ", "ᆫᄂ",}, new String[]{"ᆮᄅ", "ᆺᄅ", "ᆻᄅ", "ᆽᄅ", "ᆾᄅ", "ᇀᄅ", "ᆫᄅ", "ᆫᄂ",}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆸᄂ", "ᆹᄂ", "ᇁᄂ", "ᆷᄂ"}, new String[]{"ᆸᄂ", "ᆹᄂ", "ᇁᄂ", "ᆷᄂ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆸᄆ", "ᆹᄆ", "ᇁᄆ", "ᆷᄆ"}, new String[]{"ᆸᄆ", "ᆹᄆ", "ᇁᄆ", "ᆷᄆ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆸᄅ", "ᆹᄅ", "ᇁᄅ", "ᆷᄅ", "ᆷᄂ",}, new String[]{"ᆸᄅ", "ᆹᄅ", "ᇁᄅ", "ᆷᄅ", "ᆷᄂ",}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆫᄅ", "ᆫᄂ", "ᆯᄅ", "ᆯᄂ"}, new String[]{"ᆫᄅ", "ᆫᄂ", "ᆯᄅ", "ᆯᄂ"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"ᆨᄋ", "ᄀ"}, new String[]{"ᆨᄋ", "ᄀ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆩᄋ", "ᄁ"}, new String[]{"ᆩᄋ", "ᄁ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆫᄋ", "ᆫᄒ", "ᄂ"}, new String[]{"ᆫᄋ", "ᆫᄒ", "ᄂ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆬᄋ", "ᆫᄌ"}, new String[]{"ᆬᄋ", "ᆫᄌ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆭᄋ", "ᄂ"}, new String[]{"ᆭᄋ", "ᄂ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆮᄋ", "ᄃ"}, new String[]{"ᆮᄋ", "ᄃ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆯᄋ", "ᆯᄒ", "ᄅ"}, new String[]{"ᆯᄋ", "ᆯᄒ", "ᄅ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆰᄋ", "ᆯᄀ"}, new String[]{"ᆰᄋ", "ᆯᄀ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆰᄒ", "ᆯᄏ"}, new String[]{"ᆰᄒ", "ᆯᄏ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆷᄋ", "ᄆ"}, new String[]{"ᆷᄋ", "ᄆ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆸᄋ", "ᄇ"}, new String[]{"ᆸᄋ", "ᄇ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆺᄋ", "ᄉ"}, new String[]{"ᆺᄋ", "ᄉ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆻᄋ", "ᆺᄉ", "ᄊ"}, new String[]{"ᆻᄋ", "ᆺᄉ", "ᄊ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆽᄋ", "ᄌ"}, new String[]{"ᆽᄋ", "ᄌ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᄎ"}, new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᄎ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᄏ"}, new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᄏ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᄐ"}, new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᄐ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᄑ"}, new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᄑ"}, 1.f, CondVowel.vowel) - - .addTypo(new String[]{"은", "는"}, new String[]{"은", "는"}, 2.f, CondVowel.none) - .addTypo(new String[]{"을", "를"}, new String[]{"을", "를"}, 2.f, CondVowel.none) - - .addTypo(new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, 1.5f, CondVowel.none) - .setContinualTypoCost(1.f) - .addTypo(new String[]{"ᆪ"}, new String[]{"ᆨᆺ", "ᆨᆻ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆬ"}, new String[]{"ᆫᆽ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆭ"}, new String[]{"ᆫᇂ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆰ"}, new String[]{"ᆯᆨ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆱ"}, new String[]{"ᆯᆷ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆲ"}, new String[]{"ᆯᆸ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆳ"}, new String[]{"ᆯᆺ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆴ"}, new String[]{"ᆯᇀ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆵ"}, new String[]{"ᆯᇁ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆶ"}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none); + final public static TypoTransformer basicTypoSetWithContinual = basicTypoSet.copy().update(continualTypoSet); + final public static TypoTransformer lengtheningTypoSet = new TypoTransformer().setLengtheningTypoCost(0.5f); } diff --git a/bindings/java/kr/pe/bab2min/KiwiTest.java b/bindings/java/kr/pe/bab2min/KiwiTest.java index 326ff831..6ee7dfa0 100644 --- a/bindings/java/kr/pe/bab2min/KiwiTest.java +++ b/bindings/java/kr/pe/bab2min/KiwiTest.java @@ -4,6 +4,9 @@ import java.util.concurrent.Future; import org.junit.Test; + +import kr.pe.bab2min.KiwiBuilder.TypoTransformer; + import static org.junit.Assert.*; public class KiwiTest { @@ -155,6 +158,30 @@ public void testContinualTypos() throws Exception { assertEquals(tokens[3].form, "어"); } + @Test + public void testCustomTypos() throws Exception { + System.gc(); + KiwiBuilder builder = new KiwiBuilder(modelPath); + TypoTransformer typoSet = KiwiBuilder.basicTypoSet.copy() + .update(KiwiBuilder.continualTypoSet) + .update(KiwiBuilder.lengtheningTypoSet); + Kiwi kiwi = builder.build(typoSet); + + Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", Kiwi.Match.allWithNormalizing); + System.out.println(Arrays.deepToString(tokens)); + assertEquals(tokens[0].form, "프로그램"); + assertEquals(tokens[1].form, "이"); + + tokens = kiwi.tokenize("지인짜?", Kiwi.Match.allWithNormalizing); + System.out.println(Arrays.deepToString(tokens)); + assertEquals(tokens[0].form, "진짜"); + assertEquals(tokens[1].form, "?"); + + tokens = kiwi.tokenize("맗은 물", Kiwi.Match.allWithNormalizing); + System.out.println(Arrays.deepToString(tokens)); + assertEquals(tokens[0].form, "맑"); + } + @Test public void testBlocklist() throws Exception { System.gc(); diff --git a/include/kiwi/ArchUtils.h b/include/kiwi/ArchUtils.h index 06191001..ba2d0e85 100644 --- a/include/kiwi/ArchUtils.h +++ b/include/kiwi/ArchUtils.h @@ -30,13 +30,13 @@ namespace kiwi template<> struct ArchInfo { - static constexpr size_t alignment = 0; + static constexpr size_t alignment = 4; }; template<> struct ArchInfo { - static constexpr size_t alignment = 0; + static constexpr size_t alignment = 4; }; template<> diff --git a/include/kiwi/Kiwi.h b/include/kiwi/Kiwi.h index f69c4eb7..c8ebede1 100644 --- a/include/kiwi/Kiwi.h +++ b/include/kiwi/Kiwi.h @@ -69,6 +69,7 @@ namespace kiwi float spacePenalty = 7; float typoCostWeight = 6; float continualTypoCost = INFINITY; + float lengtheningTypoCost = INFINITY; size_t maxUnkFormSize = 6; size_t spaceTolerance = 0; @@ -127,7 +128,11 @@ namespace kiwi * @note 이 생성자는 기본 생성자로 이를 통해 생성된 객체는 바로 형태소 분석에 사용할 수 없다. * kiwi::KiwiBuilder 를 통해 생성된 객체만이 형태소 분석에 사용할 수 있다. */ - Kiwi(ArchType arch = ArchType::default_, LangModel _langMdl = {}, bool typoTolerant = false, bool continualTypoTolerant = false); + Kiwi(ArchType arch = ArchType::default_, + LangModel _langMdl = {}, + bool typoTolerant = false, + bool continualTypoTolerant = false, + bool lengtheningTypoTolerant = false); ~Kiwi(); diff --git a/include/kiwi/Types.h b/include/kiwi/Types.h index 6af80524..c43c7b9e 100644 --- a/include/kiwi/Types.h +++ b/include/kiwi/Types.h @@ -136,6 +136,9 @@ namespace kiwi template> using UnorderedMap = std::unordered_map<_K, _V, _Hash, std::equal_to<_K>, mi_stl_allocator>>; + template> + using UnorderedSet = std::unordered_set<_K, _Hash, std::equal_to<_K>, mi_stl_allocator<_K>>; + using KString = std::basic_string, mi_stl_allocator>; using KStringStream = std::basic_stringstream, mi_stl_allocator>; using KcVector = Vector; @@ -168,6 +171,9 @@ namespace kiwi template> using UnorderedMap = std::unordered_map<_K, _V, _Hash>; + template> + using UnorderedSet = std::unordered_set<_K, _Hash>; + /** * @brief std::u16string의 내부용 타입. mimalloc 옵션에 따라 mi_stl_allocator로부터 메모리를 할당받는다. * diff --git a/include/kiwi/TypoTransformer.h b/include/kiwi/TypoTransformer.h index f13576be..cf2a9757 100644 --- a/include/kiwi/TypoTransformer.h +++ b/include/kiwi/TypoTransformer.h @@ -1,3 +1,12 @@ +/** + * @file TypoTransformer.h + * @author bab2min (bab2min@gmail.com) + * @brief 오타 교정에 사용되는 TypoTransformer 및 관련 클래스들을 정의합니다. + * @version 0.19.0 + * @date 2024-09-15 + * + * + */ #pragma once #include "Types.h" @@ -114,6 +123,9 @@ namespace kiwi class KiwiBuilder; class TypoTransformer; + /** + * @brief 오타 생성 및 교정 준비가 완료된 오타 생성기. kiwi::TypoTransformer::prepare()로부터 생성됩니다. + */ class PreparedTypoTransformer { friend class KiwiBuilder; @@ -165,6 +177,7 @@ namespace kiwi KString strPool; Vector replacements; float continualTypoThreshold = INFINITY; + float lengtheningTypoThreshold = INFINITY; template TypoCandidates _generate(const KString& orig, float costThreshold = 2.5f) const; @@ -185,33 +198,33 @@ namespace kiwi return continualTypoThreshold; } + float getLengtheningTypoCost() const + { + return lengtheningTypoThreshold; + } + + /** + * @brief 주어진 문자열에 대해 오타 후보를 생성합니다. + * + * @param orig 원본 문자열 + * @param costThreshold 생성할 오타 후보의 비용 상한 + */ TypoCandidates generate(const std::u16string& orig, float costThreshold = 2.5f) const; }; + /** + * @brief 오타 교정에 사용되는 오타 생성기 정의자 + */ class TypoTransformer { friend class KiwiBuilder; friend class PreparedTypoTransformer; - using TrieNode = utils::TrieNode>>; - - struct ReplInfo - { - uint32_t begin, end; - float cost; - CondVowel leftCond; - - ReplInfo(uint32_t _begin = 0, uint32_t _end = 0, float _cost = 0, CondVowel _leftCond = CondVowel::none) - : begin{ _begin }, end{ _end }, cost{ _cost }, leftCond{ _leftCond } - {} - }; - - utils::ContinuousTrie patTrie; - KString strPool; - Vector> replacements; float continualTypoThreshold = INFINITY; + float lengtheningTypoThreshold = INFINITY; + + UnorderedMap, float> typos; - void addTypoImpl(const KString& orig, const KString& error, float cost, CondVowel leftCond = CondVowel::none); void addTypoWithCond(const KString& orig, const KString& error, float cost, CondVowel leftCond = CondVowel::none); void addTypoNormalized(const KString& orig, const KString& error, float cost = 1, CondVowel leftCond = CondVowel::none); @@ -233,12 +246,21 @@ namespace kiwi TypoTransformer& operator=(TypoTransformer&&); bool isContinualTypoEnabled() const; - - bool empty() const - { - return replacements.empty() && !isContinualTypoEnabled(); - } - + bool isLengtheningTypoEnabled() const; + bool empty() const; + + /** + * @brief 새 오타를 정의합니다. + * + * @param orig 원본 문자열 + * @param error 오류 문자열 + * @param cost 오류 문자열로 변환하는데 드는 비용. 이 값을 무한대로 설정하면 해당 오타가 비활성화됩니다. + * @param leftCond 원본 문자열이 오류 문자열로 변환될 때 요구되는 왼쪽 모음의 조건 + * + * @note orig, error는 모두 완전한 음절이거나 모음이거나 초성이어야 합니다. 그렇지 않은 경우 invalid_argument 예외가 발생합니다. + * addTypo(u"ㅐ", u"ㅔ")는 비용 1을 들여 ㅐ를 ㅔ로 바꾸는 변환을 새로 정의합니다. + * addTypo(u"ㅐ", u"에")는 실패하고 예외를 발생시킵니다. + */ void addTypo(const std::u16string& orig, const std::u16string& error, float cost = 1, CondVowel leftCond = CondVowel::none); TypoTransformer& addTypos(std::initializer_list lst) @@ -256,13 +278,108 @@ namespace kiwi return *this; } + const UnorderedMap, float>& getTypos() const + { + return typos; + } + + /** + * @brief 연철 오타의 비용을 새로 설정합니다. + * + * @param threshold 연철 오타의 비용 + * @note 연철 오타의 초기값은 무한대, 즉 비활성화 상태입니다. 유한한 값으로 설정하면 연철 오타가 활성화됩니다. + */ void setContinualTypoCost(float threshold) { continualTypoThreshold = threshold; } + float getContinualTypoCost() const + { + return continualTypoThreshold; + } + + static TypoTransformer fromContinualTypoCost(float threshold) + { + TypoTransformer ret; + ret.setContinualTypoCost(threshold); + return ret; + } + TypoTransformer copyWithNewContinualTypoCost(float threshold) const; + /** + * @brief 장음화 오타의 비용을 새로 설정합니다. + * + * @param threshold 장음화 오타의 비용 + * @note 장음화 오타의 초기값은 무한대, 즉 비활성화 상태입니다. 유한한 값으로 설정하면 장음화 오타가 활성화됩니다. + */ + void setLengtheningTypoCost(float threshold) + { + lengtheningTypoThreshold = threshold; + } + + float getLengtheningTypoCost() const + { + return lengtheningTypoThreshold; + } + + static TypoTransformer fromLengtheningTypoCost(float threshold) + { + TypoTransformer ret; + ret.setLengtheningTypoCost(threshold); + return ret; + } + + TypoTransformer copyWithNewLengtheningTypoCost(float threshold) const; + + /** + * @brief 다른 TypoTransformer의 오타를 현재 TypoTransformer에 추가합니다. + * + * @param o 추가할 TypoTransformer + * @note 현재 TypoTransformer와 o에서 동일한 오타를 정의하고 있는 경우 비용이 더 낮은 정의가 선택됩니다. + * 연철 오타와 장음화 오타 역시 마찬가지로 양쪽 중 더 낮은 쪽의 비용이 선택됩니다. + */ + void update(const TypoTransformer& o); + + TypoTransformer& operator|=(const TypoTransformer& o) + { + update(o); + return *this; + } + + TypoTransformer operator|(const TypoTransformer& o) const + { + TypoTransformer ret = *this; + ret.update(o); + return ret; + } + + /** + * @brief 현재 TypoTransformer의 모든 오타의 비용을 scale배 합니다. + * + * @param scale 배율 + * @note scale은 0보다 큰 양수여야 합니다. 0, 음수, 무한대의 경우 invalid_argument 예외가 발생합니다. + */ + void scaleCost(float scale); + + TypoTransformer& operator*=(float scale) + { + scaleCost(scale); + return *this; + } + + TypoTransformer operator*(float scale) const + { + TypoTransformer ret = *this; + ret.scaleCost(scale); + return ret; + } + + /** + * @brief 현재 TypoTransformer를 사용하여 PreparedTypoTransformer를 생성합니다. + * PreparedTypoTransformer는 실제로 오타를 생성하거나 kiwi::KiwiBuilder에 전달되어 오타 교정에 사용될 수 있습니다. + */ PreparedTypoTransformer prepare() const { return { *this }; @@ -275,7 +392,13 @@ namespace kiwi basicTypoSet, continualTypoSet, basicTypoSetWithContinual, + lengtheningTypoSet, }; + /** + * @brief 기본 내장 오타 생성기를 반환합니다. + * + * @param set 사용할 기본 내장 오타 생성기의 종류 + */ const TypoTransformer& getDefaultTypoSet(DefaultTypoSet set); } diff --git a/include/kiwi/capi.h b/include/kiwi/capi.h index a437e0e0..76dd282e 100644 --- a/include/kiwi/capi.h +++ b/include/kiwi/capi.h @@ -332,11 +332,11 @@ DECL_DLL kiwi_h kiwi_builder_build(kiwi_builder_h handle, kiwi_typo_h typos, flo /** - * @brief + * @brief 오타 교정기를 새로 생성합니다. * - * @return + * @return 성공 시 오타 교정기의 핸들을 반환합니다. 실패 시 null를 반환하고 에러 메세지를 설정합니다. * - * @note + * @note 생성된 오타 교정기는 kiwi_typo_close를 통해 반드시 해제되어야 합니다. */ DECL_DLL kiwi_typo_h kiwi_typo_init(); @@ -358,32 +358,84 @@ enum KIWI_TYPO_BASIC_TYPO_SET = 1, KIWI_TYPO_CONTINUAL_TYPO_SET = 2, KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL = 3, + KIWI_TYPO_LENGTHENING_TYPO_SET = 4, }; /** * @brief Kiwi에 기본적으로 내장된 오타 교정기의 핸들을 반환합니다. * - * @return + * @return 성공 시 오타 교정기의 핸들을 반환합니다. 실패 시 null를 반환하고 에러 메세지를 설정합니다. * * @note 이 핸들은 kiwi_typo_close에 사용할 수 없음. */ DECL_DLL kiwi_typo_h kiwi_typo_get_default(int kiwi_typo_set); /** - * @brief + * @brief 오타 교정기에 새로운 오타 정의를 추가합니다. * * @return * - * @note + * @note 이 함수는 kiwi_typo_get_default로 얻은 핸들에는 사용할 수 없습니다. */ DECL_DLL int kiwi_typo_add(kiwi_typo_h handle, const char** orig, int orig_size, const char** error, int error_size, float cost, int condition); /** - * @brief +* @brief 오타 교정기를 복사하여 새로운 핸들을 생성합니다. +* +* @return 성공 시 새로운 오타 교정기의 핸들을 반환합니다. 실패 시 null를 반환하고 에러 메세지를 설정합니다. +* +* @note 복사하여 새로 생성된 오타 교정기의 핸들은 kiwi_typo_close를 통해 반드시 해제되어야 합니다. +*/ +DECL_DLL kiwi_typo_h kiwi_typo_copy(kiwi_typo_h handle); + +/** +* @brief 현재 오타 교정기에 다른 오타 교정기 내의 오타 정의들을 추가합니다. +* +* @param handle 오타가 삽입될 교정기의 핸들 +* @param src 오타 정의 출처 +* @return 성공 시 0를 반환합니다. 실패 시 음수를 반환하고 에러 메세지를 설정합니다. +* +* @note kiwi_typo_get_default로 얻은 핸들은 handle로 사용할 수 없습니다. src로 사용하는 것은 가능합니다. +*/ +DECL_DLL int kiwi_typo_update(kiwi_typo_h handle, kiwi_typo_h src); + +/** +* @brief 현재 오타 교정기의 오타 비용을 일정한 비율로 늘리거나 줄입니다. +* +* @param handle 오타 교정기의 핸들 +* @param scale 0보다 큰 실수. 모든 오타 비용에 이 값이 곱해집니다. +* @return 성공 시 0를 반환합니다. 실패 시 음수를 반환하고 에러 메세지를 설정합니다. +*/ +DECL_DLL int kiwi_typo_scale_cost(kiwi_typo_h handle, float scale); + +/** +* @brief 현재 오타 교정기의 연철 오타 비용을 설정합니다. +* +* @param handle 오타 교정기의 핸들 +* @param threshold 연철 오타의 새로운 비용 +* @return 성공 시 0를 반환합니다. 실패 시 음수를 반환하고 에러 메세지를 설정합니다. +* +* @note 연철 오타의 초기값은 무한대, 즉 비활성화 상태입니다. 유한한 값으로 설정하면 연철 오타가 활성화됩니다. +*/ +DECL_DLL int kiwi_typo_set_continual_typo_cost(kiwi_typo_h handle, float threshold); + +/** +* @brief 현재 오타 교정기의 장음화 오타 비용을 설정합니다. +* +* @param handle 오타 교정기의 핸들 +* @param threshold 장음화 오타의 새로운 비용 +* @return 성공 시 0를 반환합니다. 실패 시 음수를 반환하고 에러 메세지를 설정합니다. +* +* @note 장음화 오타의 초기값은 무한대, 즉 비활성화 상태입니다. 유한한 값으로 설정하면 장음화 오타가 활성화됩니다. +*/ +DECL_DLL int kiwi_typo_set_lengthening_typo_cost(kiwi_typo_h handle, float threshold); + +/** + * @brief 생성된 오타 교정기를 해제합니다. * - * @return + * @return 성공 시 0를 반환합니다. 실패 시 음수를 반환하고 에러 메세지를 설정합니다. * - * @note + * @note kiwi_typo_get_default로 얻은 핸들은 절대 해제해서는 안됩니다. */ DECL_DLL int kiwi_typo_close(kiwi_typo_h handle); diff --git a/src/KTrie.cpp b/src/KTrie.cpp index d3ac246d..83593fc8 100644 --- a/src/KTrie.cpp +++ b/src/KTrie.cpp @@ -42,7 +42,7 @@ namespace kiwi return true; } - template + template struct FormCandidate { const Form* form = nullptr; @@ -51,8 +51,17 @@ namespace kiwi uint32_t typoId = 0; uint32_t end = 0; // only used in continual typo tolerant mode - FormCandidate(const Form* _form = nullptr, float _cost = 0, uint32_t _start = 0, uint32_t _typoId = 0, uint32_t _end = 0) - : form{ _form }, cost{ _cost }, start{ _start }, typoId{ _typoId }, end{ _end } + FormCandidate(const Form* _form = nullptr, + float _cost = 0, + uint32_t _start = 0, + uint32_t _typoId = 0, + uint32_t _end = 0, + uint32_t = 0) + : form{ _form }, + cost{ _cost }, + start{ _start }, + typoId{ _typoId }, + end{ _end } {} size_t getStartPos(size_t ) const @@ -87,11 +96,11 @@ namespace kiwi }; template<> - struct FormCandidate + struct FormCandidate { const Form* form = nullptr; - FormCandidate(const Form* _form = nullptr, float = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0) + FormCandidate(const Form* _form = nullptr, float = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0) : form{ _form } {} @@ -126,6 +135,28 @@ namespace kiwi } }; + template + struct FormCandidate : public FormCandidate + { + using BaseType = FormCandidate; + uint32_t lengthenedSize = 0; + + FormCandidate(const Form* _form = nullptr, + float _cost = 0, + uint32_t _start = 0, + uint32_t _typoId = 0, + uint32_t _end = 0, + uint32_t _lengthenedSize = 0) + : FormCandidate{ _form, _cost, _start, _typoId, _end, _lengthenedSize }, + lengthenedSize{ _lengthenedSize } + {} + + size_t getFormSizeWithTypos(const size_t* typoPtrs) const + { + return BaseType::getFormSizeWithTypos(typoPtrs) + lengthenedSize; + } + }; + template bool getZCodaAppendable( const Form* foundCand, @@ -143,9 +174,9 @@ namespace kiwi } } - template + template bool insertCandidates( - Vector>& candidates, + Vector>& candidates, const Form* foundCand, const Form* formBase, const size_t* typoPtrs, @@ -153,7 +184,8 @@ namespace kiwi const Vector& nonSpaces, uint32_t startPosition = 0, uint32_t endPosition = 0, - float cost = 0 + float cost = 0, + uint32_t lengthenedSize = 0 ) { static constexpr size_t posMultiplier = continualTypoTolerant ? 4 : 1; @@ -164,12 +196,17 @@ namespace kiwi while (1) { - const auto typoFormSize = typoPtrs[tCand->typoId + 1] - typoPtrs[tCand->typoId]; + const auto typoFormSize = typoPtrs[tCand->typoId + 1] - typoPtrs[tCand->typoId] + lengthenedSize; auto cand = &tCand->form(formBase); if (FeatureTestor::isMatched(&str[0], &str[nonSpaces[nonSpaces.size() - typoFormSize]], tCand->leftCond) && FeatureTestor::isMatchedApprox(&str[0], &str[nonSpaces[nonSpaces.size() - typoFormSize]], cand->vowel, cand->polar)) { - candidates.emplace_back(cand, tCand->score() + cost, startPosition ? startPosition : ((nonSpaces.size() - typoFormSize) * posMultiplier), tCand->typoId, endPosition); + candidates.emplace_back(cand, + tCand->score() + cost, + startPosition ? startPosition : ((nonSpaces.size() - typoFormSize) * posMultiplier), + tCand->typoId, + endPosition, + lengthenedSize); } if (tCand[0].hash() != tCand[1].hash()) break; ++tCand; @@ -472,9 +509,9 @@ namespace kiwi } }; - template + template inline void insertContinualTypoNode( - Vector>& candidates, + Vector>& candidates, Vector::Node*>>& continualTypoRightNodes, Decomposer decomposer, float continualTypoCost, @@ -490,7 +527,7 @@ namespace kiwi if (!continualTypoTolerant) return; static constexpr size_t posMultiplier = continualTypoTolerant ? 4 : 1; - char16_t codaFromContinual = decomposer.onsetToCoda(c), + const char16_t codaFromContinual = decomposer.onsetToCoda(c), droppedSyllable = decomposer.dropRightSyllable(c); if (!codaFromContinual || !droppedSyllable) return; @@ -537,7 +574,11 @@ inline bool isDiscontinuous(POSTag prevTag, POSTag curTag, ScriptType prevScript return prevTag != curTag; } -template +template size_t kiwi::splitByTrie( Vector& ret, const Form* formBase, @@ -549,6 +590,7 @@ size_t kiwi::splitByTrie( size_t maxUnkFormSize, size_t spaceTolerance, float continualTypoCost, + float lengtheningTypoCost, const PretokenizedSpanGroup::Span*& pretokenizedFirst, const PretokenizedSpanGroup::Span* pretokenizedLast ) @@ -587,11 +629,13 @@ size_t kiwi::splitByTrie( out.clear(); out.emplace_back(); size_t n = 0; - Vector> candidates; + Vector> candidates; + using NodePtrTy = decltype(trie.root()); auto* curNode = trie.root(); - auto* curNodeForContinualTypo = trie.root(); + auto* curNodeForTypo = trie.root(); auto* nextNode = trie.root(); - Vector> continualTypoRightNodes; + Vector> continualTypoRightNodes; + Vector> lengtheningTypoNodes; size_t lastSpecialEndPos = 0, specialStartPos = 0; POSTag chrType, lastChrType = POSTag::unknown, lastMatchedPattern = POSTag::unknown; @@ -921,7 +965,7 @@ size_t kiwi::splitByTrie( goto continueFor; } - curNodeForContinualTypo = curNode; + curNodeForTypo = curNode; nextNode = curNode->template nextOpt(trie, c); while (!nextNode) // if curNode has no exact next node, goto fail { @@ -973,6 +1017,17 @@ size_t kiwi::splitByTrie( } zCodaFollowable = false; + // invalidate typo nodes + if (continualTypoTolerant) + { + continualTypoRightNodes.clear(); + } + + if (lengtheningTypoTolerant) + { + lengtheningTypoNodes.clear(); + } + goto continueFor; } } @@ -989,6 +1044,63 @@ size_t kiwi::splitByTrie( continualTypoRightNodes.resize(outputIdx); } + if (lengtheningTypoTolerant) + { + static uint8_t lengthenVowelTable[] = { + 0, // ㅏ + 1, // ㅐ + 0, // ㅑ + 1, // ㅒ + 4, // ㅓ + 5, // ㅔ + 4, // ㅕ + 5, // ㅖ + 8, // ㅗ + 0, // ㅘ + 1, // ㅙ + 1, // ㅚ + 8, // ㅛ + 13, // ㅜ + 4, // ㅝ + 5, // ㅞ + 20, // ㅟ + 13, // ㅠ + 18, // ㅡ + 20, // ㅢ + 20, // ㅣ + }; + const size_t prevLengtheningSize = lengtheningTypoNodes.size(); + if (n > 0 && isHangulSyllable(str[n - 1]) && + (u'아' <= c && c < u'자') && lengthenVowelTable[extractVowel(str[n - 1])] == extractVowel(c)) + { + lengtheningTypoNodes.emplace_back(1, curNodeForTypo); + for (size_t i = 0; i < prevLengtheningSize; ++i) + { + auto& node = lengtheningTypoNodes[i]; + lengtheningTypoNodes.emplace_back(node.first + 1, node.second); + } + } + + thread_local UnorderedSet> uniq; + uniq.clear(); + size_t outputIdx = 0; + for (size_t i = 0; i < prevLengtheningSize; ++i) + { + auto& node = lengtheningTypoNodes[i]; + node.second = node.second->template nextOpt(trie, c); + if (!node.second) continue; + if (!uniq.emplace(node).second) continue; + lengtheningTypoNodes[outputIdx++] = node; + } + for (size_t i = prevLengtheningSize; i < lengtheningTypoNodes.size(); ++i) + { + auto& node = lengtheningTypoNodes[i]; + if (!uniq.emplace(node).second) continue; + lengtheningTypoNodes[outputIdx++] = node; + } + lengtheningTypoNodes.erase(lengtheningTypoNodes.begin() + outputIdx, lengtheningTypoNodes.end()); + } + if (chrType != POSTag::max) { flushBranch(specialStartPos, specialStartPos < nonSpaces.size() ? nonSpaces[specialStartPos] : n); @@ -1009,13 +1121,13 @@ size_t kiwi::splitByTrie( if (continualTypoTolerant && lastChrType == POSTag::max) { insertContinualTypoNode(candidates, continualTypoRightNodes, ContinualIeungDecomposer{}, - continualTypoCost, c, formBase, typoPtrs, trie, str, nonSpaces, curNodeForContinualTypo); + continualTypoCost, c, formBase, typoPtrs, trie, str, nonSpaces, curNodeForTypo); insertContinualTypoNode(candidates, continualTypoRightNodes, ContinualHieutDecomposer{}, - continualTypoCost, c, formBase, typoPtrs, trie, str, nonSpaces, curNodeForContinualTypo); + continualTypoCost, c, formBase, typoPtrs, trie, str, nonSpaces, curNodeForTypo); insertContinualTypoNode(candidates, continualTypoRightNodes, ContinualCodaDecomposer{}, - continualTypoCost, c, formBase, typoPtrs, trie, str, nonSpaces, curNodeForContinualTypo); + continualTypoCost, c, formBase, typoPtrs, trie, str, nonSpaces, curNodeForTypo); } - + // from this, curNode has the exact next node curNode = nextNode; // if it has exit node, patterns have been found @@ -1041,6 +1153,19 @@ size_t kiwi::splitByTrie( } } } + + if (lengtheningTypoTolerant) + { + for (auto& node : lengtheningTypoNodes) + { + const Form* cand = node.second->val(trie); + if (cand && !trie.hasSubmatch(cand)) + { + insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, 0, lengtheningTypoCost * node.first, node.first); + } + } + } + continueFor: lastChrType = chrType; lastScriptType = scriptType; @@ -1117,29 +1242,34 @@ const Form* kiwi::findForm( namespace kiwi { - template + template struct SplitByTrieGetter { template struct Wrapper { - static constexpr FnSplitByTrie value = &splitByTrie(i), typoTolerant, continualTypoTolerant>; + static constexpr FnSplitByTrie value = &splitByTrie(i), typoTolerant, continualTypoTolerant, lengtheningTypoTolerant>; }; }; } -FnSplitByTrie kiwi::getSplitByTrieFn(ArchType arch, bool typoTolerant, bool continualTypoTolerant) +FnSplitByTrie kiwi::getSplitByTrieFn(ArchType arch, bool typoTolerant, bool continualTypoTolerant, bool lengtheningTypoTolerant) { - static std::array, 4> table{ - SplitByTrieGetter{}, - SplitByTrieGetter{}, - SplitByTrieGetter{}, - SplitByTrieGetter{} + static std::array, 8> table{ + SplitByTrieGetter{}, + SplitByTrieGetter{}, + SplitByTrieGetter{}, + SplitByTrieGetter{}, + SplitByTrieGetter{}, + SplitByTrieGetter{}, + SplitByTrieGetter{}, + SplitByTrieGetter{}, }; size_t idx = 0; if (typoTolerant) idx += 1; if (continualTypoTolerant) idx += 2; + if (lengtheningTypoTolerant) idx += 4; return table[idx][static_cast(arch)]; } diff --git a/src/KTrie.h b/src/KTrie.h index 71bd9e5a..c07e0636 100644 --- a/src/KTrie.h +++ b/src/KTrie.h @@ -80,8 +80,13 @@ namespace kiwi * @tparam arch Trie탐색에 사용할 CPU 아키텍처 타입 * @tparam typoTolerant 오타가 포함된 형태를 탐색할지 여부 * @tparam continualTypoTolerant 연철된 오타를 탐색할지 여부 + * @tparam lengtheningTypoTolerant 여러 음절로 늘려진 오타를 탐색할지 여부 */ - template + template size_t splitByTrie( Vector& out, const Form* formBase, @@ -93,6 +98,7 @@ namespace kiwi size_t maxUnkFormSize, size_t spaceTolerance, float continualTypoCost, + float lengtheningTypoCost, const PretokenizedSpanGroup::Span*& pretokenizedFirst, const PretokenizedSpanGroup::Span* pretokenizedLast ); @@ -104,7 +110,7 @@ namespace kiwi ); using FnSplitByTrie = decltype(&splitByTrie); - FnSplitByTrie getSplitByTrieFn(ArchType arch, bool typoTolerant, bool continualTypoTolerant); + FnSplitByTrie getSplitByTrieFn(ArchType arch, bool typoTolerant, bool continualTypoTolerant, bool lengtheningTypoTolerant); using FnFindForm = decltype(&findForm); FnFindForm getFindFormFn(ArchType arch); diff --git a/src/Kiwi.cpp b/src/Kiwi.cpp index 25bf54f5..7e5aed9e 100644 --- a/src/Kiwi.cpp +++ b/src/Kiwi.cpp @@ -41,11 +41,18 @@ namespace kiwi return buf; } - Kiwi::Kiwi(ArchType arch, LangModel _langMdl, bool typoTolerant, bool continualTypoTolerant) + Kiwi::Kiwi(ArchType arch, + LangModel _langMdl, + bool typoTolerant, + bool continualTypoTolerant, + bool lengtheningTypoTolerant) : langMdl(_langMdl) { selectedArch = arch; - dfSplitByTrie = (void*)getSplitByTrieFn(selectedArch, typoTolerant, continualTypoTolerant); + dfSplitByTrie = (void*)getSplitByTrieFn(selectedArch, + typoTolerant, + continualTypoTolerant, + lengtheningTypoTolerant); dfFindForm = (void*)getFindFormFn(selectedArch); static tp::Table lmKnLM_8{ FindBestPathGetter::type>{} }; @@ -1005,6 +1012,7 @@ namespace kiwi maxUnkFormSize, spaceTolerance, continualTypoCost, + lengtheningTypoCost, pretokenizedFirst, pretokenizedLast ); diff --git a/src/KiwiBuilder.cpp b/src/KiwiBuilder.cpp index 60dd20e4..4ffe5dc7 100644 --- a/src/KiwiBuilder.cpp +++ b/src/KiwiBuilder.cpp @@ -1683,7 +1683,7 @@ namespace kiwi Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) const { - Kiwi ret{ archType, langMdl, !typos.empty(), typos.isContinualTypoEnabled() }; + Kiwi ret{ archType, langMdl, !typos.empty(), typos.isContinualTypoEnabled(), typos.isLengtheningTypoEnabled()}; Vector combinedForms; Vector combinedMorphemes; @@ -1808,6 +1808,7 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c UnorderedMap> typoGroup; auto ptypos = typos.prepare(); ret.continualTypoCost = ptypos.getContinualTypoCost(); + ret.lengtheningTypoCost = ptypos.getLengtheningTypoCost(); for (auto f : sortedForms) { // 현재는 공백이 없는 단일 단어에 대해서만 오타 교정을 수행. diff --git a/src/TypoTransformer.cpp b/src/TypoTransformer.cpp index 6b6076e3..96da2813 100644 --- a/src/TypoTransformer.cpp +++ b/src/TypoTransformer.cpp @@ -205,10 +205,7 @@ TypoIterator& TypoIterator::operator++() } TypoTransformer::TypoTransformer() - : patTrie{ 1 } { - char16_t c = 0; - patTrie.build(&c, &c + 1, 0); } TypoTransformer::~TypoTransformer() = default; @@ -217,39 +214,17 @@ TypoTransformer::TypoTransformer(TypoTransformer&&) noexcept = default; TypoTransformer& TypoTransformer::operator=(const TypoTransformer&) = default; TypoTransformer& TypoTransformer::operator=(TypoTransformer&&) = default; -void TypoTransformer::addTypoImpl(const KString& orig, const KString& error, float cost, CondVowel leftCond) -{ - if (orig == error) return; - - size_t replIdx = patTrie.build(orig.begin(), orig.end(), replacements.size() + 1)->val - 1; - if (replIdx == replacements.size()) - { - replacements.emplace_back(); - } - bool updated = false; - for (auto& p : replacements[replIdx]) - { - if (p.leftCond == leftCond && strPool.substr(p.begin, p.end - p.begin) == error) - { - p.cost = isfinite(cost) ? min(p.cost, cost) : cost; - updated = true; - break; - } - } - if (!updated) - { - replacements[replIdx].emplace_back(strPool.size(), strPool.size() + error.size(), cost, leftCond); - strPool += error; - } -} - void TypoTransformer::addTypoWithCond(const KString& orig, const KString& error, float cost, CondVowel leftCond) { if (orig == error) return; if (leftCond == CondVowel::none || leftCond == CondVowel::vowel || leftCond == CondVowel::any) { - addTypoImpl(orig, error, cost, leftCond); + auto inserted = typos.emplace(make_tuple(orig, error, leftCond), cost); + if (!inserted.second) + { + inserted.first->second = isfinite(cost) ? min(inserted.first->second, cost) : cost; + } } else if (leftCond == CondVowel::applosive) { @@ -260,7 +235,11 @@ void TypoTransformer::addTypoWithCond(const KString& orig, const KString& error, o += orig; if (c) e.push_back(c); e += error; - addTypoImpl(o, e, cost, c ? CondVowel::none : leftCond); + auto inserted = typos.emplace(make_tuple(o, e, c ? CondVowel::none : leftCond), cost); + if (!inserted.second) + { + inserted.first->second = isfinite(cost) ? min(inserted.first->second, cost) : cost; + } } } else @@ -312,6 +291,16 @@ bool TypoTransformer::isContinualTypoEnabled() const return isfinite(continualTypoThreshold); } +bool TypoTransformer::isLengtheningTypoEnabled() const +{ + return isfinite(lengtheningTypoThreshold); +} + +bool TypoTransformer::empty() const +{ + return typos.empty() && !isContinualTypoEnabled() && !isLengtheningTypoEnabled(); +} + TypoTransformer TypoTransformer::copyWithNewContinualTypoCost(float threshold) const { TypoTransformer ret = *this; @@ -319,17 +308,114 @@ TypoTransformer TypoTransformer::copyWithNewContinualTypoCost(float threshold) c return ret; } +TypoTransformer TypoTransformer::copyWithNewLengtheningTypoCost(float threshold) const +{ + TypoTransformer ret = *this; + ret.lengtheningTypoThreshold = threshold; + return ret; +} + +void TypoTransformer::update(const TypoTransformer& o) +{ + for (auto& p : o.typos) + { + auto inserted = typos.emplace(p); + if (!inserted.second) + { + inserted.first->second = min(inserted.first->second, p.second); + } + } + + continualTypoThreshold = min(continualTypoThreshold, o.continualTypoThreshold); + lengtheningTypoThreshold = min(lengtheningTypoThreshold, o.lengtheningTypoThreshold); +} + +void TypoTransformer::scaleCost(float scale) +{ + if (!isfinite(scale) || scale <= 0) throw invalid_argument{ "`scale` must be positive real." }; + + for (auto& p : typos) + { + p.second *= scale; + } + if (isfinite(continualTypoThreshold)) continualTypoThreshold *= scale; + if (isfinite(lengtheningTypoThreshold)) lengtheningTypoThreshold *= scale; +} + +namespace kiwi +{ + struct IntermediateTypoTransformer + { + using TrieNode = utils::TrieNode>>; + + struct ReplInfo + { + uint32_t begin, end; + float cost; + CondVowel leftCond; + + ReplInfo(uint32_t _begin = 0, uint32_t _end = 0, float _cost = 0, CondVowel _leftCond = CondVowel::none) + : begin{ _begin }, end{ _end }, cost{ _cost }, leftCond{ _leftCond } + {} + }; + + utils::ContinuousTrie patTrie; + KString strPool; + Vector> replacements; + + IntermediateTypoTransformer() + : patTrie{ 1 } + { + char16_t c = 0; + patTrie.build(&c, &c + 1, 0); + } + + void addTypo(const KString& orig, const KString& error, float cost, CondVowel leftCond = CondVowel::none) + { + if (orig == error) return; + + size_t replIdx = patTrie.build(orig.begin(), orig.end(), replacements.size() + 1)->val - 1; + if (replIdx == replacements.size()) + { + replacements.emplace_back(); + } + bool updated = false; + for (auto& p : replacements[replIdx]) + { + if (p.leftCond == leftCond && strPool.substr(p.begin, p.end - p.begin) == error) + { + p.cost = isfinite(cost) ? min(p.cost, cost) : cost; + updated = true; + break; + } + } + if (!updated) + { + replacements[replIdx].emplace_back(strPool.size(), strPool.size() + error.size(), cost, leftCond); + strPool += error; + } + } + }; +} + PreparedTypoTransformer::PreparedTypoTransformer() = default; PreparedTypoTransformer::PreparedTypoTransformer(const TypoTransformer& tt) - : strPool{ tt.strPool }, continualTypoThreshold{ tt.continualTypoThreshold } + : continualTypoThreshold{ tt.continualTypoThreshold }, lengtheningTypoThreshold{ tt.lengtheningTypoThreshold } { + IntermediateTypoTransformer itt; + for (auto& t : tt.typos) + { + itt.addTypo(get<0>(t.first), get<1>(t.first), t.second, get<2>(t.first)); + } + strPool = std::move(itt.strPool); + size_t tot = 0; - for (auto& rs : tt.replacements) tot += rs.size(); + for (auto& rs : itt.replacements) tot += rs.size(); replacements.reserve(tot); Vector> patData; - for (auto& rs : tt.replacements) + for (auto& rs : itt.replacements) { patData.emplace_back(replacements.data() + replacements.size(), rs.size()); for (auto& r : rs) @@ -338,7 +424,7 @@ PreparedTypoTransformer::PreparedTypoTransformer(const TypoTransformer& tt) } } - patTrie = decltype(patTrie){ tt.patTrie, ArchTypeHolder{}, [&](const TypoTransformer::TrieNode& o) -> PatInfo + patTrie = decltype(patTrie){ itt.patTrie, ArchTypeHolder{}, [&](const IntermediateTypoTransformer::TrieNode& o) -> PatInfo { uint32_t depth = o.depth; if (o.val && patData[o.val - 1].first->leftCond == CondVowel::applosive) @@ -557,21 +643,8 @@ namespace kiwi TypoDef{ {u"ㅣ워", u"ㅣ어", u"ㅕ"}, {u"ㅣ워", u"ㅣ어", u"ㅕ"}, 1.5f, CondVowel::none}, }; - static const TypoTransformer continualTypoSet = withoutTypo.copyWithNewContinualTypoCost(1.f).addTypos({ - TypoDef{ {u"ᆪ"}, {u"ᆨᆺ", u"ᆨᆻ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆬ"}, {u"ᆫᆽ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆭ"}, {u"ᆫᇂ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆰ"}, {u"ᆯᆨ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆱ"}, {u"ᆯᆷ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆲ"}, {u"ᆯᆸ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆳ"}, {u"ᆯᆺ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆴ"}, {u"ᆯᇀ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆵ"}, {u"ᆯᇁ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆶ"}, {u"ᆯᇂ"}, 1e-12f, CondVowel::none }, - TypoDef{ {u"ᆹ"}, {u"ᆸᆺ", u"ᆸᆻ"}, 1e-12f, CondVowel::none }, - }); - static const TypoTransformer basicTypoSetWithContinual = basicTypoSet.copyWithNewContinualTypoCost(1.f).addTypos({ + static const TypoTransformer continualTypoSet = TypoTransformer::fromContinualTypoCost(1.f).addTypos({ TypoDef{ {u"ᆪ"}, {u"ᆨᆺ", u"ᆨᆻ"}, 1e-12f, CondVowel::none }, TypoDef{ {u"ᆬ"}, {u"ᆫᆽ"}, 1e-12f, CondVowel::none }, TypoDef{ {u"ᆭ"}, {u"ᆫᇂ"}, 1e-12f, CondVowel::none }, @@ -585,24 +658,23 @@ namespace kiwi TypoDef{ {u"ᆹ"}, {u"ᆸᆺ", u"ᆸᆻ"}, 1e-12f, CondVowel::none }, }); - if (set == DefaultTypoSet::withoutTypo) + static const TypoTransformer basicTypoSetWithContinual = basicTypoSet | continualTypoSet; + + static const TypoTransformer lengtheningTypoSet = TypoTransformer::fromLengtheningTypoCost(0.5f); + + switch (set) { + case kiwi::DefaultTypoSet::withoutTypo: return withoutTypo; - } - else if (set == DefaultTypoSet::basicTypoSet) - { + case kiwi::DefaultTypoSet::basicTypoSet: return basicTypoSet; - } - else if (set == DefaultTypoSet::continualTypoSet) - { + case kiwi::DefaultTypoSet::continualTypoSet: return continualTypoSet; - } - else if (set == DefaultTypoSet::basicTypoSetWithContinual) - { + case kiwi::DefaultTypoSet::basicTypoSetWithContinual: return basicTypoSetWithContinual; - } - else - { + case kiwi::DefaultTypoSet::lengtheningTypoSet: + return lengtheningTypoSet; + default: throw invalid_argument{ "Invalid `DefaultTypoSet`" }; } } diff --git a/src/capi/kiwi_c.cpp b/src/capi/kiwi_c.cpp index 88d0fec8..1ba7abf4 100644 --- a/src/capi/kiwi_c.cpp +++ b/src/capi/kiwi_c.cpp @@ -443,6 +443,80 @@ int kiwi_typo_add(kiwi_typo_h handle, const char** orig, int orig_size, const ch } } +kiwi_typo_h kiwi_typo_copy(kiwi_typo_h handle) +{ + if (!handle) return nullptr; + try + { + return new kiwi_typo{ *handle }; + } + catch (...) + { + currentError = current_exception(); + return nullptr; + } +} + +int kiwi_typo_update(kiwi_typo_h handle, kiwi_typo_h src) +{ + if (!handle) return KIWIERR_INVALID_HANDLE; + try + { + handle->update(*src); + return 0; + } + catch (...) + { + currentError = current_exception(); + return -1; + } +} + +int kiwi_typo_scale_cost(kiwi_typo_h handle, float cost) +{ + if (!handle) return KIWIERR_INVALID_HANDLE; + try + { + handle->scaleCost(cost); + return 0; + } + catch (...) + { + currentError = current_exception(); + return -1; + } +} + +int kiwi_typo_set_continual_typo_cost(kiwi_typo_h handle, float threshold) +{ + if (!handle) return KIWIERR_INVALID_HANDLE; + try + { + handle->setContinualTypoCost(threshold); + return 0; + } + catch (...) + { + currentError = current_exception(); + return -1; + } +} + +int kiwi_typo_set_lengthening_typo_cost(kiwi_typo_h handle, float threshold) +{ + if (!handle) return KIWIERR_INVALID_HANDLE; + try + { + handle->setLengtheningTypoCost(threshold); + return 0; + } + catch (...) + { + currentError = current_exception(); + return -1; + } +} + int kiwi_typo_close(kiwi_typo_h handle) { if (!handle) return KIWIERR_INVALID_HANDLE; diff --git a/test/test_c.cpp b/test/test_c.cpp index 257c750a..23015001 100644 --- a/test/test_c.cpp +++ b/test/test_c.cpp @@ -268,6 +268,37 @@ TEST(KiwiC, AnalyzeBasicTypoSet) EXPECT_EQ(kiwi_close(typo_kw), 0); } +TEST(KiwiC, CustomTypoSet) +{ + kiwi_h okw = reuse_kiwi_instance(), typo_kw; + kiwi_builder_h builder = kiwi_builder_init(MODEL_PATH, 0, KIWI_BUILD_DEFAULT); + kiwi_typo_h basic_typo = kiwi_typo_get_default(KIWI_TYPO_BASIC_TYPO_SET), + continual_typo = kiwi_typo_get_default(KIWI_TYPO_CONTINUAL_TYPO_SET), + lengthening_typo = kiwi_typo_get_default(KIWI_TYPO_LENGTHENING_TYPO_SET), + custom_typo = kiwi_typo_init(); + + kiwi_typo_update(custom_typo, basic_typo); + kiwi_typo_update(custom_typo, continual_typo); + kiwi_typo_update(custom_typo, lengthening_typo); + + typo_kw = kiwi_builder_build(builder, custom_typo, 2.5f); + kiwi_set_option_f(typo_kw, KIWI_TYPO_COST_WEIGHT, 5); + + kiwi_res_h o, c; + for (const char* s : { u8"외않됀데?", u8"나 죰 도와죠.", u8"자알했따", u8"외구거 공부", u8"맗은 믈을 마셧다!" }) + { + o = kiwi_analyze(okw, s, 1, KIWI_MATCH_ALL_WITH_NORMALIZING, nullptr, nullptr); + c = kiwi_analyze(typo_kw, s, 1, KIWI_MATCH_ALL_WITH_NORMALIZING, nullptr, nullptr); + EXPECT_TRUE(kiwi_res_prob(o, 0) < kiwi_res_prob(c, 0)); + EXPECT_EQ(kiwi_res_close(o), 0); + EXPECT_EQ(kiwi_res_close(c), 0); + } + + EXPECT_EQ(kiwi_typo_close(custom_typo), 0); + EXPECT_EQ(kiwi_builder_close(builder), 0); + EXPECT_EQ(kiwi_close(typo_kw), 0); +} + TEST(KiwiC, Tokenizer) { kiwi_h okw = reuse_kiwi_instance(); diff --git a/test/test_typo.cpp b/test/test_typo.cpp index 7f662631..2b6a004d 100644 --- a/test/test_typo.cpp +++ b/test/test_typo.cpp @@ -204,3 +204,42 @@ TEST(KiwiTypo, BasicTypoSetWithContinual) EXPECT_EQ(res[1].str, u"많"); EXPECT_EQ(res[2].str, u"다"); } + +TEST(KiwiTypo, LengtheningTypoSet) +{ + KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, }; + Kiwi typoKiwi = builder.build(DefaultTypoSet::lengtheningTypoSet); + const float typoCost = typoKiwi.getTypoCostWeight() * 0.5f; + + auto ref = typoKiwi.analyze(u"진짜?", Match::allWithNormalizing); + auto res = typoKiwi.analyze(u"지인짜?", Match::allWithNormalizing); + EXPECT_FLOAT_EQ(ref.second - typoCost, res.second); + EXPECT_EQ(res.first.size(), 2); + EXPECT_EQ(res.first[0].str, u"진짜"); + EXPECT_EQ(res.first[1].str, u"?"); + + res = typoKiwi.analyze(u"지인짜아?", Match::allWithNormalizing); + EXPECT_FLOAT_EQ(ref.second - 2 * typoCost, res.second); + EXPECT_EQ(res.first.size(), 2); + EXPECT_EQ(res.first[0].str, u"진짜"); + EXPECT_EQ(res.first[1].str, u"?"); + + res = typoKiwi.analyze(u"그으으래?", Match::allWithNormalizing); + EXPECT_EQ(res.first.size(), 2); + EXPECT_EQ(res.first[0].str, u"그래"); + EXPECT_EQ(res.first[1].str, u"?"); + + res = typoKiwi.analyze(u"그으으으으래?", Match::allWithNormalizing); + EXPECT_EQ(res.first.size(), 2); + EXPECT_EQ(res.first[0].str, u"그래"); + EXPECT_EQ(res.first[1].str, u"?"); + + res = typoKiwi.analyze(u"학교오를 가야아해", Match::allWithNormalizing); + EXPECT_EQ(res.first.size(), 6); + EXPECT_EQ(res.first[0].str, u"학교"); + EXPECT_EQ(res.first[1].str, u"를"); + EXPECT_EQ(res.first[2].str, u"가"); + EXPECT_EQ(res.first[3].str, u"어야"); + EXPECT_EQ(res.first[4].str, u"하"); + EXPECT_EQ(res.first[5].str, u"어"); +}