-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from toriving/develop
Develop
- Loading branch information
Showing
28 changed files
with
205,268 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
[flake8] | ||
max-line-length = 88 | ||
select = C,E,F,W,B,B950 | ||
ignore = F403, F405, E203, E501, W503 | ||
per-file-ignores = __init__.py:F401 | ||
exclude = | ||
.git, | ||
__pycache__, | ||
build, | ||
dist | ||
venv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# This workflow will upload a Python Package using Twine when a release is created | ||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries | ||
|
||
name: deploy | ||
|
||
on: | ||
release: | ||
types: [created] | ||
|
||
jobs: | ||
deploy: | ||
|
||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
- uses: actions/checkout@v2 | ||
- name: Set up Python | ||
uses: actions/setup-python@v2 | ||
with: | ||
python-version: '3.x' | ||
- name: Install dependencies | ||
run: | | ||
python -m pip install --upgrade pip | ||
pip install setuptools wheel twine | ||
- name: Build and publish | ||
env: | ||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} | ||
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} | ||
run: | | ||
python setup.py sdist bdist_wheel | ||
twine upload dist/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions | ||
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions | ||
|
||
name: test | ||
|
||
on: | ||
push: | ||
branches: [ main, develop ] | ||
|
||
jobs: | ||
build: | ||
|
||
runs-on: ubuntu-latest | ||
strategy: | ||
matrix: | ||
python-version: ['3.6', '3.7', '3.8', '3.9'] | ||
|
||
steps: | ||
- uses: actions/checkout@v2 | ||
- name: Set up Python ${{ matrix.python-version }} | ||
uses: actions/setup-python@v2 | ||
with: | ||
python-version: ${{ matrix.python-version }} | ||
- name: Install dependencies | ||
run: | | ||
python -m pip install --upgrade pip | ||
python -m pip install flake8 pytest | ||
python setup.py install | ||
- name: Lint with flake8 | ||
run: | | ||
# stop the build if there are Python syntax errors or undefined names | ||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics | ||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide | ||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics | ||
- name: Test with pytest | ||
run: | | ||
pytest |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -131,5 +131,9 @@ dmypy.json | |
# IDEA | ||
.idea/ | ||
|
||
# Temporary folder | ||
temp/ | ||
|
||
# Deploy | ||
deploy.sh | ||
deploy.sh | ||
.pre-commit-config.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
graft src/koeda/corpora |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,88 @@ | ||
# KoEDA | ||
Easy Data Augmentation for Korean | ||
<h1 align="center"> | ||
KoEDA | ||
</h1> | ||
<p align="center"> | ||
<a href="https://github.com/toriving/KoEDA/actions"> | ||
<img alt="Deploy" src="https://github.com/toriving/KoEDA/workflows/deploy/badge.svg"> | ||
</a> | ||
<a href="https://github.com/toriving/KoEDA/actions"> | ||
<img alt="Test" src="https://github.com/toriving/KoEDA/workflows/test/badge.svg"> | ||
</a> | ||
<a href="https://github.com/toriving/KoEDA/releases"> | ||
<img alt="Release" src="https://img.shields.io/github/release/toriving/KoEDA.svg"> | ||
</a> | ||
<a href="https://github.com/psf/black"> | ||
<img alt="Black" src="https://img.shields.io/badge/code%20style-black-000000.svg"> | ||
</a> | ||
</p> | ||
|
||
## Install | ||
<h3 align="center"> | ||
<p>Easy Data Augmentation for Korean</p> | ||
</h3> | ||
|
||
## Prerequisites | ||
- python >= 3.6 | ||
|
||
## Installation | ||
This repository is tested on Python 3.6 - 3.9. | ||
KoEDA can be installed using pip as follows: | ||
```shell script | ||
$ pip install koeda | ||
``` | ||
``` | ||
|
||
## Quick Start | ||
|
||
```python | ||
from koeda import EasyDataAugmentation | ||
|
||
|
||
EDA = EasyDataAugmentation( | ||
morpheme_analyzer=None, alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3 | ||
) | ||
|
||
text = "아버지가 방에 들어가신다" | ||
|
||
result = EDA(text) | ||
print(result) | ||
# 아버지가 정실에 들어가신다 | ||
``` | ||
|
||
## Augmenters | ||
- EasyDataAugmentation (EDA) | ||
- RandomDeletion (RD) | ||
- RandomInsertion (RI) | ||
- SynonymReplacement (SR) | ||
- RandomSwap (RS) | ||
|
||
|
||
## Usage | ||
- EDA class | ||
```python | ||
EDA = EasyDataAugmentation( | ||
morpheme_analyzer: str = None, | ||
alpha_sr: float = 0.1, | ||
alpha_ri: float = 0.1, | ||
alpha_rs: float = 0.1, | ||
prob_rd: float = 0.1, | ||
): | ||
|
||
text = "아버지가방에들어가신다" | ||
|
||
# EDA(data: Union[List[str], str], p: List[float] = None, repetition: int = 1) | ||
result = EDA(data=text, p=None, repetition=1) | ||
``` | ||
|
||
- The others (RD, RI, SR, RS) | ||
```python | ||
augmenter = Augmenter(morpheme_analyzer: str = None, stopword: bool = False) | ||
|
||
text = "아버지가방에들어가신다" | ||
|
||
# augmenter(data: Union[List[str], str], p: float = 0.1, repetition: int = 1) | ||
result = augmenter(data=text, p=0.5, repetition=1) | ||
``` | ||
|
||
## Reference | ||
[Easy Data Augmentation Paper](https://www.aclweb.org/anthology/D19-1670.pdf) | ||
[Easy Data Augmentation Repository](https://github.com/jasonwei20/eda_nlp) | ||
[Korean WordNet](http://wordnet.kaist.ac.kr/) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
numpy==1.19.4 | ||
konlpy==0.5.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,34 +1,37 @@ | ||
from setuptools import setup, find_packages | ||
|
||
setup( | ||
name='koeda', | ||
version='0.0.2', | ||
description='Korean Easy Data Augmentation Package', | ||
long_description=open("README.md", "r", encoding="utf-8").read(), | ||
long_description_content_type="text/markdown", | ||
author='Dongju.Park', | ||
author_email='[email protected]', | ||
url='https://github.com/toriving/KoEDA', | ||
package_dir={"": "src"}, | ||
packages=find_packages("src"), | ||
install_requires=[], | ||
keywords=['NLP deep learning koeda korean easy data augmentation'], | ||
license="MIT", | ||
python_requires='>=3.6.0', | ||
package_data={}, | ||
zip_safe=False, | ||
classifiers=[ | ||
"Development Status :: 5 - Production/Stable", | ||
"Intended Audience :: Developers", | ||
"Intended Audience :: Education", | ||
"Intended Audience :: Science/Research", | ||
"License :: OSI Approved :: MIT License", | ||
"Operating System :: OS Independent", | ||
'Programming Language :: Python :: 3', | ||
'Programming Language :: Python :: 3.6', | ||
'Programming Language :: Python :: 3.7', | ||
'Programming Language :: Python :: 3.8', | ||
'Programming Language :: Python :: 3.9', | ||
"Topic :: Scientific/Engineering :: Artificial Intelligence", | ||
], | ||
) | ||
from setuptools import setup, find_packages | ||
|
||
# Runtime dependencies are kept in requirements.txt so that the file and
# `install_requires` cannot drift apart. Blank lines and `#` comments are
# dropped because they are not valid requirement specifiers.
# NOTE(review): requirements.txt has to be shipped in the sdist (e.g. listed in
# MANIFEST.in), otherwise this read fails when installing from source - confirm.
with open("requirements.txt", encoding="utf-8") as f:
    requirements = [
        line.strip()
        for line in f
        if line.strip() and not line.lstrip().startswith("#")
    ]

# Read the README through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="koeda",
    version="0.0.3",
    description="Korean Easy Data Augmentation Package",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Dongju.Park",
    author_email="[email protected]",
    url="https://github.com/toriving/KoEDA",
    package_dir={"": "src"},
    packages=find_packages("src"),
    install_requires=requirements,
    keywords=["NLP deep learning koeda korean easy data augmentation"],
    license="MIT",
    python_requires=">=3.6.0",
    # Package data is selected by MANIFEST.in.
    include_package_data=True,
    zip_safe=False,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,17 @@ | ||
if __name__ == "__main__": | ||
print("Hello KoEDA") | ||
__title__ = "KoEDA" | ||
__version__ = "0.0.3" | ||
|
||
def hello(): | ||
print("Hello KoEDA") | ||
__author__ = "Dongju Park" | ||
__email__ = "[email protected]" | ||
__url__ = "http://toriving.github.io" | ||
|
||
__summary__ = "Easy Data Augmentation for Korean" | ||
__license__ = "MIT License" | ||
__copyright__ = "Copyright 2020 {}".format(__author__) | ||
|
||
|
||
from .eda import EasyDataAugmentation | ||
|
||
from .augmenters import RandomDeletion, RandomInsertion, SynonymReplacement, RandomSwap | ||
|
||
from .utils import STOPWORD, WORDNET, get_synonyms |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
__all__ = ["RandomDeletion", "RandomInsertion", "SynonymReplacement", "RandomSwap"] | ||
|
||
from .deletion import * | ||
from .insertion import * | ||
from .replacement import * | ||
from .swap import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import random | ||
from typing import Union, List | ||
from itertools import repeat, chain | ||
|
||
from konlpy.tag import * | ||
|
||
from koeda.utils import replace_space, revert_space, SPACE_TOKEN | ||
|
||
|
||
class RandomDeletion:
    """Augment Korean text by randomly deleting morphemes (the "RD" step of EDA).

    Instances are callable; calling one is the same as calling
    :meth:`random_deletion`.
    """

    def __init__(self, morpheme_analyzer: str = None):
        """Select the morpheme analyzer used to tokenize the input.

        Args:
            morpheme_analyzer: ``None`` (use ``Okt``), the name of one of the
                KoNLPy taggers ``"Okt"``, ``"Kkma"``, ``"Komoran"``,
                ``"Mecab"`` or ``"Hannanum"``, or any object that exposes a
                ``morphs`` method.

        Raises:
            ValueError: if the argument is none of the above.
        """
        if morpheme_analyzer is None:
            self.morpheme_analyzer = Okt()
        elif morpheme_analyzer in ("Okt", "Kkma", "Komoran", "Mecab", "Hannanum"):
            # Look the tagger class up in the module namespace (populated by
            # the star import from konlpy.tag) instead of calling eval().
            self.morpheme_analyzer = globals()[morpheme_analyzer]()
        elif hasattr(morpheme_analyzer, "morphs"):
            # Duck typing: anything with a `morphs` method is accepted as-is.
            self.morpheme_analyzer = morpheme_analyzer
        else:
            raise ValueError("Does not support morpheme analyzer.")

    def __call__(self, *args, **kwargs):
        """Forward to :meth:`random_deletion`."""
        return self.random_deletion(*args, **kwargs)

    def random_deletion(
        self, data: Union[List[str], str], p: float = 0.1, repetition: int = 1
    ) -> Union[List[str], str]:
        """Apply random deletion to one sentence or to a list of sentences.

        Args:
            data: a single string or a list of strings.
            p: probability with which each morpheme is deleted.
            repetition: number of augmented samples produced per input
                sentence; values <= 1 produce exactly one.

        Returns:
            For a string with ``repetition <= 1`` a single string; for a
            string with ``repetition > 1`` a list of ``repetition`` strings.
            For a list, a flat list in which the samples of each input
            sentence are adjacent and inputs keep their original order.

        Raises:
            TypeError: if ``data`` is neither ``str`` nor ``list``.
        """
        if isinstance(data, str):
            if repetition <= 1:
                return self._deletion(data, p)
            return [self._deletion(data, p) for _ in range(repetition)]

        if isinstance(data, list):
            per_sentence = max(repetition, 1)
            return [
                self._deletion(sentence, p)
                for sentence in data
                for _ in range(per_sentence)
            ]

        raise TypeError(f"Does not support the data type : {type(data)}")

    def _deletion(self, data: str, p: float = 0.1) -> str:
        """Return ``data`` with each morpheme dropped with probability ``p``.

        Input with zero or one morpheme is returned unchanged. If every
        morpheme ends up deleted, a single randomly chosen morpheme of the
        input is returned instead, so the result is never empty for
        non-empty input.
        """
        words = self.morpheme_analyzer.morphs(data)

        # Nothing sensible can be deleted from zero or one morpheme; return the
        # input itself so the result is always a str (never the morph list).
        if len(words) <= 1:
            return data

        # Tokenize a copy in which spaces are marked with SPACE_TOKEN
        # (presumably so the original spacing can be restored by revert_space).
        split_words = self.morpheme_analyzer.morphs(replace_space(data))

        # Space markers are always kept; a random number is drawn only for
        # real morphemes, which survive when the draw exceeds p.
        new_words = [
            word
            for word in split_words
            if word == SPACE_TOKEN or random.uniform(0, 1) > p
        ]

        # Everything except space markers was deleted: fall back to one random
        # morpheme (picked from the morph list, not from the raw characters).
        if all(word == SPACE_TOKEN for word in new_words):
            return revert_space([random.choice(words)])

        return revert_space(new_words)
Oops, something went wrong.