Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add zh-cn, zh-tw support #69

Merged
merged 1 commit into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Whether you are new to the process and are building your first AI assistant or y
- Why is the assistant responding incorrectly to this question?
- How do I improve my assistant’s ability to understand questions?

Currently Supported Languages: en, fr, cs, de, es, it, pt, nl
Currently Supported Languages: en, fr, cs, de, es, it, pt, nl, zh-cn, zh-tw

## Usage
If you clone the notebook from this repository locally, please use the steps below. For usage in Watson studio, please refer to the
Expand Down
155 changes: 155 additions & 0 deletions assistant_skill_analysis/resources/zh-cn/stopwords
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
<
>
|
-
,
;
:
!
?
.
''
'
"
(
)
[
]
{
}
*
%
+
<SE>
一会儿
一边
一面
不但
不光
不可
不如
不是
不管
不论
与其
之所以
也不
也许
也许是
他们
你们
便
倘若
即使
只有
只要
可以
可是
可能
哪怕
因为
因此
她们
如果
宁可
它们
尽管
已经
并且
我们
所以
无论
既然
是因为
没有
然后
然而
由于
而且
而是
自己
虽然
认为
还是
通过
那么
51 changes: 51 additions & 0 deletions assistant_skill_analysis/resources/zh-tw/stopwords
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
一個
沒有
我們
你們
妳們
他們
她們
是否
18 changes: 17 additions & 1 deletion assistant_skill_analysis/utils/lang_utils.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,31 @@
import os
import re
from types import SimpleNamespace
import sys
import jieba
from nltk.stem.snowball import SnowballStemmer
from spacy.tokenizer import Tokenizer
import unicodedata
import assistant_skill_analysis


SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl"]
SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl", "zh-cn", "zh-tw"]
# Every code point below sys.maxunicode whose Unicode general category starts
# with "P" (punctuation), each stored with a single leading backslash.
# NOTE(review): the backslash prefix presumably makes each entry safe to drop
# into a regular expression -- confirm against where PUNCTUATION is consumed.
PUNCTUATION = [
    "\\" + char
    for char in map(chr, range(sys.maxunicode))
    if unicodedata.category(char).startswith("P")
]


class _JiebaTokenizerWrapper:
    """Callable tokenizer backed by jieba, used for zh-cn and zh-tw.

    Calling an instance with the text as its first positional argument
    lazily yields one ``SimpleNamespace`` per item produced by
    ``jieba.tokenize``, exposing the item's first field as ``.text``.

    NOTE(review): the ``.text`` attribute presumably mirrors the token
    objects returned by the spaCy tokenizers used for other languages --
    confirm against the code that consumes ``self.tokenizer``.
    """

    def __call__(self, *args, **kwargs):
        # Only the first positional argument is consumed; any further
        # positional or keyword arguments are accepted but ignored.
        segments = jieba.tokenize(args[0])
        yield from (SimpleNamespace(text=segment[0]) for segment in segments)


class LanguageUtility:
def __init__(self, language_code):
if language_code not in SUPPORTED_LANGUAGE:
Expand Down Expand Up @@ -96,6 +107,11 @@ def init_resources(self):
self.tokenizer = Tokenizer(Dutch().vocab)
self.stemmer = SnowballStemmer(language="dutch")
self.stop_words = self.load_stop_words(stopwords_path)

elif self.language_code in ["zh-cn", "zh-tw"]:
self.tokenizer = _JiebaTokenizerWrapper()
self.stop_words = self.load_stop_words(stopwords_path)

else:
raise Exception("language code %s is not supported", self.language_code)

Expand Down
2 changes: 1 addition & 1 deletion classic_dialog_skill_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"metadata": {},
"source": [
"Pick the language code corresponding to your workspace data: \n",
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion classic_dialog_skill_analysis_cp4d.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"metadata": {},
"source": [
"Pick the language code corresponding to your workspace data: \n",
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion new_experience_skill_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
"### Assistant Settings\n",
"Please set values for the variables in the cell below to configure this notebook.\n",
"\n",
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
"- **LANGUAGE_CODE:** language code corresponding to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
"\n",
"- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
"\n",
Expand Down
2 changes: 1 addition & 1 deletion new_experience_skill_analysis_cp4d.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
"### Assistant Settings\n",
"Please set values for the variables in the cell below to configure this notebook. The notebook uses CloudPakForDataAuthenticator to authenticate the APIs.\n",
"\n",
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
"- **LANGUAGE_CODE:** language code corresponding to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
"\n",
"- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
"\n",
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ scipy>=1.2.0
jupyter
spacy~=2.3.2
ibm-cos-sdk>=2.11.0
nbconvert>=7.7.1
nbconvert>=7.7.1
jieba
14 changes: 14 additions & 0 deletions tests/utils/test_lang_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,20 @@ def test_de(self):
sent = util.tokenize(sent)
self.assertEqual(sent, ["autobahn"])

def test_zh_cn(self):
    """Check preprocessing and tokenization for Simplified Chinese (zh-cn)."""
    lang_util = LanguageUtility("zh-cn")

    # The sample sentence is expected to come out of preprocess unchanged.
    cleaned = lang_util.preprocess("不想当兼职")
    self.assertEqual(cleaned, "不想当兼职")

    # The preprocessed sentence is expected to split into three tokens.
    tokens = lang_util.tokenize(cleaned)
    self.assertEqual(tokens, ["不想", "当", "兼职"])

def test_zh_tw(self):
    """Check preprocessing and tokenization for Traditional Chinese (zh-tw)."""
    lang_util = LanguageUtility("zh-tw")

    # The sample sentence is expected to come out of preprocess unchanged.
    cleaned = lang_util.preprocess("畀到機會我嘗試")
    self.assertEqual(cleaned, "畀到機會我嘗試")

    # The preprocessed sentence is expected to split into five tokens.
    tokens = lang_util.tokenize(cleaned)
    self.assertEqual(tokens, ["畀", "到", "機會", "我", "嘗試"])

def tearDown(self):
unittest.TestCase.tearDown(self)
self.skill_file.close()
Expand Down
Loading