Skip to content

Commit

Permalink
Merge pull request #7 from toriving/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
toriving authored Sep 26, 2021
2 parents fe88c2c + 0b1847a commit 5dfbb0e
Show file tree
Hide file tree
Showing 15 changed files with 254 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.6', '3.7', '3.8', '3.9']
python-version: ['3.7', '3.8', '3.9']

steps:
- uses: actions/checkout@v2
Expand Down
124 changes: 98 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,69 +20,141 @@ KoEDA
<p>Easy Data Augmentation for Korean
</h3>

This project re-implements Easy Data Augmentation (EDA) and An Easier Data Augmentation (AEDA), both originally implemented for English, so that they work for Korean.

## Prerequisites
- python >= 3.6
- python >= 3.7

## Installation
This repository is tested on Python 3.6 - 3.9.
This repository is tested on Python 3.7 - 3.9.

KoEDA can be installed using pip as follows:
```shell script
$ pip install koeda
```

## Quick Start

- EDA
```python
from koeda import EasyDataAugmentation
from koeda import EDA


EDA = EasyDataAugmentation(
morpheme_analyzer=None, alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3
eda = EDA(
morpheme_analyzer="Okt", alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3
)

text = "아버지가 방에 들어가신다"

result = EDA(text)
result = eda(text)
print(result)
# 아버지가 정실에 들어가신다

result = eda(text, p=(0.9, 0.9, 0.9, 0.9), repetition=2)
print(result)
# ['아버지가 객실 아빠 안방 방에 정실 들어가신다', '아버지가 탈의실 방 휴게실 에 안방 탈의실 들어가신다']
```

- AEDA
```python
from koeda import AEDA


aeda = AEDA(
morpheme_analyzer="Okt", punc_ratio=0.3, punctuations=[".", ",", "!", "?", ";", ":"]
)

text = "어머니가 집을 나가신다"

result = aeda(text)
print(result)
# 어머니가 ! 집을 , 나가신다

result = aeda(text, p=0.9, repetition=2)
print(result)
# ['! 어머니가 ! 집 ; 을 ? 나가신다', '. 어머니 ? 가 . 집 , 을 , 나가신다']
```
## Augmenters
- EasyDataAugmentation (EDA)
- AEasierDataAugmentation (AEDA)
- RandomDeletion (RD)
- RandomInsertion (RI)
- SynonymReplacement (SR)
- RandomSwap (RS)

There are two ways to import an augmenter.

The first is to use the full name.
```python
from koeda import EasyDataAugmentation
```
The second is to use abbreviations.
```python
from koeda import EDA
```

## Usage
- EDA class
- EDA
```python
EDA = EasyDataAugmentation(
morpheme_analyzer: str = None,
alpha_sr: float = 0.1,
alpha_ri: float = 0.1,
alpha_rs: float = 0.1,
prob_rd: float = 0.1,
):

text = "아버지가방에들어가신다"

# EDA(data: Union[List[str], str], p: List[float] = None, repetition: int = 1)
result = EDA(data=text, p=None, repetition=1)
augmenter = EDA(
morpheme_analyzer: str = None, # Default = "Okt"
alpha_sr: float = 0.1,
alpha_ri: float = 0.1,
alpha_rs: float = 0.1,
prob_rd: float = 0.1
)

result = augmenter(
data: Union[List[str], str],
p: List[float] = None, # Default = (0.1, 0.1, 0.1, 0.1)
repetition: int = 1
)
```

- The others (RD, RI, SR, RS)
- AEDA
```python
augmenter = Augmenter(morpheme_analyzer: str = None, stopword: bool = False)

text = "아버지가방에들어가신다"
augmenter = AEDA(
morpheme_analyzer: str = None, # Default = "Okt"
punc_ratio: float = 0.3,
punctuations: List[str] = None # Default = ('.', ',', '!', '?', ';', ':')
)

result = augmenter(
data: Union[List[str], str],
p: float = None, # Default = 0.3
repetition: int = 1
)
```

# augmenter(data: Union[List[str], str], p: float = 0.1, repetition: int = 1)
result = augmenter(data=text, p=0.5, repetition=1)
- The others (RD, RI, SR, RS)
```python
augmenter = RD(
morpheme_analyzer: str = None,
)

augmenter = RI(
morpheme_analyzer: str = None,
stopword: bool = False
)

augmenter = SR(
morpheme_analyzer: str = None,
stopword: bool = False
)

augmenter = RS(
morpheme_analyzer: str = None,
)

result = augmenter(
data: Union[List[str], str],
p: float = 0.1,
repetition: int = 1
)
```

## Reference
[Easy Data Augmentation Paper](https://www.aclweb.org/anthology/D19-1670.pdf)
[Easy Data Augmentation Repository](https://github.com/jasonwei20/eda_nlp)
[An Easier Data Augmentation Paper](https://arxiv.org/pdf/2108.13230.pdf)
[An Easier Data Augmentation Repository](https://github.com/akkarimi/aeda_nlp)
[Korean WordNet](http://wordnet.kaist.ac.kr/)
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
numpy==1.19.4
konlpy==0.5.2
numpy>=1.19.4
konlpy>=0.5.2
tweepy==3.10.0
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="koeda",
version="0.0.3",
version="0.0.4",
description="Korean Easy Data Augmentation Package",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
Expand All @@ -17,7 +17,7 @@
install_requires=requirements,
keywords=["NLP deep learning koeda korean easy data augmentation"],
license="MIT",
python_requires=">=3.6.0",
python_requires=">=3.7.0",
include_package_data=True,
zip_safe=False,
classifiers=[
Expand All @@ -28,7 +28,6 @@
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
Expand Down
14 changes: 10 additions & 4 deletions src/koeda/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
__title__ = "KoEDA"
__version__ = "0.0.3"
__version__ = "0.0.4"

__author__ = "Dongju Park"
__email__ = "[email protected]"
Expand All @@ -11,7 +11,13 @@


from .eda import EasyDataAugmentation
from .aeda import AEasierDataAugmentation
from .augmenters import RandomDeletion, RandomInsertion, \
SynonymReplacement, RandomSwap

from .augmenters import RandomDeletion, RandomInsertion, SynonymReplacement, RandomSwap

from .utils import STOPWORD, WORDNET, get_synonyms
from .aeda import AEasierDataAugmentation as AEDA
from .eda import EasyDataAugmentation as EDA
from .augmenters import RandomDeletion as RD
from .augmenters import RandomInsertion as RI
from .augmenters import SynonymReplacement as SR
from .augmenters import RandomSwap as RS
94 changes: 94 additions & 0 deletions src/koeda/aeda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import random
from typing import Union, List
from itertools import repeat, chain

from konlpy.tag import *

from .utils import replace_space, revert_space, SPACE_TOKEN


class AEasierDataAugmentation:
    """AEDA augmenter: insert random punctuation marks in front of morphemes.

    The text is split into morphemes by a morpheme analyzer, a random subset
    of the (non-space) morpheme positions is chosen, and a randomly chosen
    punctuation mark is inserted in front of each chosen morpheme.

    Attributes:
        punctuations: Sequence of punctuation marks to draw from.
        morpheme_analyzer: Object exposing a ``morphs(text)`` method.
        ratio: Default punctuation ratio used when ``p`` is not given.
    """

    _DEFAULT_PUNCTUATIONS = ('.', ',', '!', '?', ';', ':')
    _SUPPORTED_ANALYZERS = ("Okt", "Kkma", "Komoran", "Mecab", "Hannanum")

    def __init__(
        self,
        morpheme_analyzer: str = None,
        punc_ratio: float = 0.3,
        punctuations: List[str] = None
    ):
        """Create the augmenter.

        Args:
            morpheme_analyzer: ``None`` (use ``Okt``), the name of a supported
                analyzer, or any object that has a ``morphs`` method.
            punc_ratio: Default ratio controlling how many punctuation marks
                are inserted relative to the number of morphemes.
            punctuations: Non-empty list or tuple of punctuation marks. Any
                other value (``None``, empty, or another type) selects the
                default set.

        Raises:
            ValueError: If ``morpheme_analyzer`` is neither a supported name
                nor an object with a ``morphs`` method.
        """
        # Tuples are accepted as well as lists (the default itself is a
        # tuple); an empty sequence would make every call fail later, so it
        # falls back to the defaults like any other unusable value.
        if isinstance(punctuations, (list, tuple)) and punctuations:
            self.punctuations = punctuations
        else:
            self.punctuations = self._DEFAULT_PUNCTUATIONS

        if morpheme_analyzer is None:
            self.morpheme_analyzer = Okt()
        elif (
            isinstance(morpheme_analyzer, str)
            and morpheme_analyzer in self._SUPPORTED_ANALYZERS
        ):
            # The analyzer classes are in module scope via the module's
            # konlpy.tag star import; a plain name lookup avoids eval().
            self.morpheme_analyzer = globals()[morpheme_analyzer]()
        elif hasattr(morpheme_analyzer, "morphs"):
            self.morpheme_analyzer = morpheme_analyzer
        else:
            raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. '
                             f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]')

        self.ratio = punc_ratio

    def __call__(self, *args, **kwargs):
        """Shortcut for :meth:`aeda`."""
        return self.aeda(*args, **kwargs)

    def aeda(
        self, data: Union[List[str], str], p: float = None, repetition: int = 1
    ) -> Union[List[str], str]:
        """Augment one text or a list of texts.

        Args:
            data: A single string or a list of strings.
            p: Punctuation ratio; ``None`` uses the ratio given at
                construction time.
            repetition: Number of augmented variants per input text. Values
                of 1 or less produce exactly one variant.

        Returns:
            A string for a single text with ``repetition <= 1``; otherwise a
            list in which the variants of each input text are adjacent.

        Raises:
            TypeError: If ``data`` is neither ``str`` nor ``list``.
        """
        if isinstance(data, str):
            if repetition <= 1:
                return self._aeda(data, p)
            return [self._aeda(data, p) for _ in range(repetition)]

        if isinstance(data, list):
            times = max(repetition, 1)
            return [self._aeda(text, p) for text in data for _ in range(times)]

        raise TypeError(f"Does not support the data type : {type(data)}")

    def _aeda(self, data: str, p: float) -> str:
        """Return one augmented variant of ``data``.

        Between 1 and ``int(p * n_morphemes + 1)`` punctuation marks are
        inserted, never more than there are non-space tokens. If there is no
        token a mark could precede (e.g. empty input), ``data`` is returned
        unchanged.
        """
        if p is None:
            p = self.ratio

        # NOTE(review): replace_space / revert_space live in utils (not shown
        # here); presumably they encode spaces as SPACE_TOKEN so the original
        # spacing survives morpheme analysis, and decode them again.
        split_words = self.morpheme_analyzer.morphs(replace_space(data))
        words = self.morpheme_analyzer.morphs(data)

        # max(1, ...) keeps randint's range valid even for a negative p.
        upper = max(1, int(p * len(words) + 1))
        q = random.randint(1, upper)

        # Sampling directly from the non-space positions yields the same
        # distribution as re-drawing until no space token is hit, but cannot
        # loop forever when q exceeds the number of valid positions.
        qs = self._pick_positions(split_words, q, SPACE_TOKEN)
        if not qs:
            return data

        new_words = []
        for j, word in enumerate(split_words):
            if j in qs:
                # Surround the mark with space tokens so it stands alone.
                new_words.extend(
                    (SPACE_TOKEN, random.choice(self.punctuations), SPACE_TOKEN)
                )
            new_words.append(word)

        return revert_space(new_words)

    @staticmethod
    def _pick_positions(split_words: list, count: int, space_token: str) -> set:
        """Pick up to ``count`` distinct indices of tokens != ``space_token``."""
        candidates = [
            i for i, token in enumerate(split_words) if token != space_token
        ]
        count = min(count, len(candidates))
        if count <= 0:
            return set()
        return set(random.sample(candidates, count))

    @staticmethod
    def check_special_selection(split_words: list, qs: list) -> bool:
        """Return True if any index in ``qs`` points at a space token."""
        return any(split_words[i] == SPACE_TOKEN for i in qs)
7 changes: 4 additions & 3 deletions src/koeda/augmenters/deletion.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from konlpy.tag import *

from koeda.utils import replace_space, revert_space, SPACE_TOKEN
from ..utils import replace_space, revert_space, SPACE_TOKEN


class RandomDeletion:
Expand All @@ -16,7 +16,8 @@ def __init__(self, morpheme_analyzer: str = None):
elif hasattr(morpheme_analyzer, "morphs"):
self.morpheme_analyzer = morpheme_analyzer
else:
raise Exception("Does not support morpheme analyzer.")
raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. '
f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]')

def __call__(self, *args, **kwargs):
return self.random_deletion(*args, **kwargs)
Expand All @@ -43,7 +44,7 @@ def random_deletion(
)
)
else:
raise Exception(f"Does not support the data type : {type(data)}")
raise TypeError(f"Does not support the data type : {type(data)}")

def _deletion(self, data: str, p: float = 0.1) -> str:
split_words = self.morpheme_analyzer.morphs(replace_space(data))
Expand Down
7 changes: 4 additions & 3 deletions src/koeda/augmenters/insertion.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from konlpy.tag import *

from koeda.utils import replace_space, revert_space, get_synonyms, STOPWORD, SPACE_TOKEN
from ..utils import replace_space, revert_space, get_synonyms, STOPWORD, SPACE_TOKEN


class RandomInsertion:
Expand All @@ -18,7 +18,8 @@ def __init__(self, morpheme_analyzer: str = None, stopword: bool = False):
elif hasattr(morpheme_analyzer, "morphs"):
self.morpheme_analyzer = morpheme_analyzer
else:
raise Exception("Does not support morpheme analyzer.")
raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. '
f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]')

def __call__(self, *args, **kwargs):
return self.random_insertion(*args, **kwargs)
Expand Down Expand Up @@ -47,7 +48,7 @@ def random_insertion(
)
)
else:
raise Exception(f"Does not support the data type : {type(data)}")
raise TypeError(f"Does not support the data type : {type(data)}")

def _insertion(self, data: str, p: float = 0.1) -> str:
split_words = self.morpheme_analyzer.morphs(replace_space(data))
Expand Down
Loading

0 comments on commit 5dfbb0e

Please sign in to comment.