Skip to content

Commit

Permalink
Merge pull request #5 from toriving/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
toriving authored Dec 21, 2020
2 parents c84a806 + 565e310 commit fe88c2c
Show file tree
Hide file tree
Showing 28 changed files with 205,268 additions and 43 deletions.
11 changes: 11 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[flake8]
max-line-length = 88
select = C,E,F,W,B,B950
ignore = F403, F405, E203, E501, W503
per-file-ignores = __init__.py:F401
exclude =
.git,
__pycache__,
build,
dist
venv
31 changes: 31 additions & 0 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: deploy

on:
release:
types: [created]

jobs:
deploy:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install setuptools wheel twine
- name: Build and publish
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
python setup.py sdist bdist_wheel
twine upload dist/*
37 changes: 37 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: test

on:
push:
branches: [ main, develop ]

jobs:
build:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.6', '3.7', '3.8', '3.9']

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
python setup.py install
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,9 @@ dmypy.json
# IDEA
.idea/

# Temporary folder
temp/

# Deploy
deploy.sh
deploy.sh
.pre-commit-config.yaml
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
graft src/koeda/corpora
89 changes: 85 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,88 @@
# KoEDA
Easy Data Augmentation for Korean
<h1 align="center">
KoEDA
</h1>
<p align="center">
<a href="https://github.com/toriving/KoEDA/actions">
<img alt="Deploy" src="https://github.com/toriving/KoEDA/workflows/deploy/badge.svg">
</a>
<a href="https://github.com/toriving/KoEDA/actions">
<img alt="Test" src="https://github.com/toriving/KoEDA/workflows/test/badge.svg">
</a>
<a href="https://github.com/toriving/KoEDA/releases">
<img alt="Release" src="https://img.shields.io/github/release/toriving/KoEDA.svg">
</a>
<a href="https://github.com/psf/black">
<img alt="Black" src="https://img.shields.io/badge/code%20style-black-000000.svg">
</a>
</p>

## Install
<h3 align="center">
<p>Easy Data Augmentation for Korean</p>
</h3>

## Prerequisites
- python >= 3.6

## Installation
This repository is tested on Python 3.6 - 3.9.
KoEDA can be installed using pip as follows:
```shell script
$ pip install koeda
```
```

## Quick Start

```python
from koeda import EasyDataAugmentation


EDA = EasyDataAugmentation(
morpheme_analyzer=None, alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3
)

text = "아버지가 방에 들어가신다"

result = EDA(text)
print(result)
# 아버지가 정실에 들어가신다
```

## Augmenters
- EasyDataAugmentation (EDA)
- RandomDeletion (RD)
- RandomInsertion (RI)
- SynonymReplacement (SR)
- RandomSwap (RS)


## Usage
- EDA class
```python
EDA = EasyDataAugmentation(
morpheme_analyzer: str = None,
alpha_sr: float = 0.1,
alpha_ri: float = 0.1,
alpha_rs: float = 0.1,
prob_rd: float = 0.1,
):

text = "아버지가방에들어가신다"

# EDA(data: Union[List[str], str], p: List[float] = None, repetition: int = 1)
result = EDA(data=text, p=None, repetition=1)
```

- The others (RD, RI, SR, RS)
```python
augmenter = Augmenter(morpheme_analyzer: str = None, stopword: bool = False)

text = "아버지가방에들어가신다"

# augmenter(data: Union[List[str], str], p: float = 0.1, repetition: int = 1)
result = augmenter(data=text, p=0.5, repetition=1)
```

## Reference
- [Easy Data Augmentation Paper](https://www.aclweb.org/anthology/D19-1670.pdf)
- [Easy Data Augmentation Repository](https://github.com/jasonwei20/eda_nlp)
- [Korean WordNet](http://wordnet.kaist.ac.kr/)
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
numpy==1.19.4
konlpy==0.5.2
71 changes: 37 additions & 34 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,37 @@
from setuptools import setup, find_packages

setup(
name='koeda',
version='0.0.2',
description='Korean Easy Data Augmentation Package',
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
author='Dongju.Park',
author_email='[email protected]',
url='https://github.com/toriving/KoEDA',
package_dir={"": "src"},
packages=find_packages("src"),
install_requires=[],
keywords=['NLP deep learning koeda korean easy data augmentation'],
license="MIT",
python_requires='>=3.6.0',
package_data={},
zip_safe=False,
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
)
from setuptools import setup, find_packages

# Runtime dependencies are maintained in requirements.txt. Blank lines and
# comment lines are skipped so they never end up in install_requires.
# NOTE(review): requirements.txt has to be present next to setup.py at install
# time; confirm it is shipped in the sdist (e.g. listed in MANIFEST.in),
# otherwise installing from a source distribution will fail on this open().
with open("requirements.txt", encoding="utf-8") as f:
    requirements = [
        line.strip()
        for line in f
        if line.strip() and not line.strip().startswith("#")
    ]

# Read the long description inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="koeda",
    version="0.0.3",
    description="Korean Easy Data Augmentation Package",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Dongju.Park",
    author_email="[email protected]",
    url="https://github.com/toriving/KoEDA",
    package_dir={"": "src"},
    packages=find_packages("src"),
    install_requires=requirements,
    keywords=["NLP deep learning koeda korean easy data augmentation"],
    license="MIT",
    python_requires=">=3.6.0",
    # Pick up the data files declared in MANIFEST.in.
    include_package_data=True,
    zip_safe=False,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
)
20 changes: 16 additions & 4 deletions src/koeda/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
if __name__ == "__main__":
print("Hello KoEDA")
__title__ = "KoEDA"
__version__ = "0.0.3"

def hello():
print("Hello KoEDA")
__author__ = "Dongju Park"
__email__ = "[email protected]"
__url__ = "http://toriving.github.io"

__summary__ = "Easy Data Augmentation for Korean"
__license__ = "MIT License"
__copyright__ = "Copyright 2020 {}".format(__author__)


from .eda import EasyDataAugmentation

from .augmenters import RandomDeletion, RandomInsertion, SynonymReplacement, RandomSwap

from .utils import STOPWORD, WORDNET, get_synonyms
6 changes: 6 additions & 0 deletions src/koeda/augmenters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
__all__ = ["RandomDeletion", "RandomInsertion", "SynonymReplacement", "RandomSwap"]

from .deletion import *
from .insertion import *
from .replacement import *
from .swap import *
71 changes: 71 additions & 0 deletions src/koeda/augmenters/deletion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import random
from typing import Union, List
from itertools import repeat, chain

from konlpy.tag import *

from koeda.utils import replace_space, revert_space, SPACE_TOKEN


class RandomDeletion:
    """Random-deletion augmenter: drops tokens from a text with probability ``p``.

    The text is tokenized with ``morpheme_analyzer.morphs``; every token other
    than ``SPACE_TOKEN`` is removed independently with probability ``p``.
    Instances are callable: ``augmenter(data, p, repetition)`` is the same as
    ``augmenter.random_deletion(data, p, repetition)``.
    """

    def __init__(self, morpheme_analyzer: str = None):
        """Select the morpheme analyzer used for tokenization.

        Args:
            morpheme_analyzer: ``None`` (uses ``Okt``), one of the names
                ``"Okt"``, ``"Kkma"``, ``"Komoran"``, ``"Mecab"``,
                ``"Hannanum"``, or any object exposing a ``morphs`` attribute.

        Raises:
            Exception: if the argument is none of the above.
        """
        if morpheme_analyzer is None:
            self.morpheme_analyzer = Okt()
        elif morpheme_analyzer in ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]:
            # Resolve the whitelisted class name from the module namespace
            # (populated by ``from konlpy.tag import *``) rather than eval().
            self.morpheme_analyzer = globals()[morpheme_analyzer]()
        elif hasattr(morpheme_analyzer, "morphs"):
            self.morpheme_analyzer = morpheme_analyzer
        else:
            raise Exception("Does not support morpheme analyzer.")

    def __call__(self, *args, **kwargs):
        """Delegate to :meth:`random_deletion`."""
        return self.random_deletion(*args, **kwargs)

    def random_deletion(
        self, data: Union[List[str], str], p: float = 0.1, repetition: int = 1
    ) -> Union[List[str], str]:
        """Apply random deletion to one text or a list of texts.

        Args:
            data: a single string or a list of strings.
            p: deletion probability applied to each token.
            repetition: number of augmented samples per input text; values
                ``<= 1`` produce exactly one sample per text.

        Returns:
            For a string with ``repetition <= 1``: one augmented string.
            For a string with ``repetition > 1``: a list of ``repetition``
            augmented strings. For a list: a flat list in which the samples
            of each input text are adjacent and inputs keep their order.

        Raises:
            Exception: if ``data`` is neither ``str`` nor ``list``.
        """
        if isinstance(data, str):
            if repetition <= 1:
                return self._deletion(data, p)
            return [self._deletion(data, p) for _ in range(repetition)]

        if isinstance(data, list):
            samples_per_text = max(repetition, 1)
            return [
                self._deletion(text, p)
                for text in data
                for _ in range(samples_per_text)
            ]

        raise Exception(f"Does not support the data type : {type(data)}")

    def _deletion(self, data: str, p: float = 0.1) -> str:
        """Return ``data`` with each token deleted with probability ``p``.

        Texts that tokenize to zero or one token are returned unchanged. If
        every token ends up deleted, a single randomly chosen token is
        returned instead so the result is never empty.
        """
        # Tokenize twice: once with spaces swapped out by replace_space (so
        # SPACE_TOKEN entries can be kept and later turned back by
        # revert_space -- presumably restoring the original spacing; verify
        # against koeda.utils), and once on the raw text to get the tokens.
        split_words = self.morpheme_analyzer.morphs(replace_space(data))
        words = self.morpheme_analyzer.morphs(data)

        # Nothing sensible to delete from zero or one token; return the text
        # itself so the result is always a str (the token list used to leak
        # out here, and empty input crashed in random.randint below).
        if len(words) <= 1:
            return data

        # Keep every SPACE_TOKEN; keep any other token with probability 1 - p.
        # The short-circuit ``or`` means no random number is drawn for spaces.
        new_words = [
            word
            for word in split_words
            if word == SPACE_TOKEN or random.uniform(0, 1) > p
        ]

        # If only space markers survived, fall back to one random token.
        # Index into ``words`` (tokens), not ``data`` (raw characters).
        if all(word == SPACE_TOKEN for word in new_words):
            rand_int = random.randint(0, len(words) - 1)
            return revert_space([words[rand_int]])

        return revert_space(new_words)
Loading

0 comments on commit fe88c2c

Please sign in to comment.