Merge pull request #5 from toriving/develop

Develop
toriving · Dec 21, 2020 · fe88c2c · fe88c2c
2 parents c84a806 + 565e310
commit fe88c2c
Show file tree

Hide file tree

Showing 28 changed files with 205,268 additions and 43 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,11 @@
+[flake8]
+max-line-length = 88
+select = C,E,F,W,B,B950
+ignore = F403, F405, E203, E501, W503
+per-file-ignores = __init__.py:F401
+exclude =
+    .git,
+    __pycache__,
+    build,
+    dist
+    venv
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -0,0 +1,31 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: deploy
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine
+    - name: Build and publish
+      env:
+        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+      run: |
+        python setup.py sdist bdist_wheel
+        twine upload dist/*
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,37 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: test
+
+on:
+  push:
+    branches: [ main, develop ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.6', '3.7', '3.8', '3.9']
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest
+        python setup.py install
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest
diff --git a/.gitignore b/.gitignore
@@ -131,5 +131,9 @@ dmypy.json
 # IDEA
 .idea/
 
+# Temporary folder
+temp/
+
 # Deploy
-deploy.sh
+deploy.sh
+.pre-commit-config.yaml
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+graft src/koeda/corpora
diff --git a/README.md b/README.md
@@ -1,7 +1,88 @@
-# KoEDA
-Easy Data Augmentation for Korean
+<h1 align="center">
+KoEDA
+</h1>
+<p align="center">
+    <a href="https://github.com/toriving/KoEDA/actions">
+        <img alt="Deploy" src="https://github.com/toriving/KoEDA/workflows/deploy/badge.svg">
+    </a>
+    <a href="https://github.com/toriving/KoEDA/actions">
+        <img alt="Test" src="https://github.com/toriving/KoEDA/workflows/test/badge.svg">
+    </a>
+    <a href="https://github.com/toriving/KoEDA/releases">
+        <img alt="Release" src="https://img.shields.io/github/release/toriving/KoEDA.svg">
+    </a>
+    <a href="https://github.com/psf/black">
+        <img alt="Black" src="https://img.shields.io/badge/code%20style-black-000000.svg">
+    </a>
+</p>
 
-## Install
+<h3 align="center">
+<p>Easy Data Augmentation for Korean</p>
+</h3>
+
+## Prerequisites
+- python >= 3.6
+
+## Installation
+This repository is tested on Python 3.6 - 3.9.  
+KoEDA can be installed using pip as follows:
 ```shell script
 $ pip install koeda
-```
+```
+
+## Quick Start
+
+```python
+from koeda import EasyDataAugmentation
+
+
+EDA = EasyDataAugmentation(
+    morpheme_analyzer=None, alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3
+)
+
+text = "아버지가 방에 들어가신다"
+
+result = EDA(text)
+print(result)
+# 아버지가 정실에 들어가신다
+```
+
+## Augmenters
+- EasyDataAugmentation (EDA)
+- RandomDeletion (RD)
+- RandomInsertion (RI)
+- SynonymReplacement (SR)
+- RandomSwap (RS)
+
+
+## Usage
+- EDA class
+```python
+EDA = EasyDataAugmentation(
+    morpheme_analyzer: str = None,
+    alpha_sr: float = 0.1,
+    alpha_ri: float = 0.1,
+    alpha_rs: float = 0.1,
+    prob_rd: float = 0.1,
+):
+
+text = "아버지가방에들어가신다"
+
+# EDA(data: Union[List[str], str], p: List[float] = None, repetition: int = 1)
+result = EDA(data=text, p=None, repetition=1)
+```
+
+- The others (RD, RI, SR, RS)
+```python
+augmenter = Augmenter(morpheme_analyzer: str = None, stopword: bool = False)
+
+text = "아버지가방에들어가신다"
+
+# augmenter(data: Union[List[str], str], p: float = 0.1, repetition: int = 1)
+result = augmenter(data=text, p=0.5, repetition=1)
+```
+
+## Reference
+[Easy Data Augmentation Paper](https://www.aclweb.org/anthology/D19-1670.pdf)  
+[Easy Data Augmentation Repository](https://github.com/jasonwei20/eda_nlp)  
+[Korean WordNet](http://wordnet.kaist.ac.kr/)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+numpy==1.19.4
+konlpy==0.5.2
diff --git a/setup.py b/setup.py
@@ -1,34 +1,37 @@
-from setuptools import setup, find_packages
-
-setup(
-    name='koeda',
-    version='0.0.2',
-    description='Korean Easy Data Augmentation Package',
-    long_description=open("README.md", "r", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    author='Dongju.Park',
-    author_email='[email protected]',
-    url='https://github.com/toriving/KoEDA',
-    package_dir={"": "src"},
-    packages=find_packages("src"),
-    install_requires=[],
-    keywords=['NLP deep learning koeda korean easy data augmentation'],
-    license="MIT",
-    python_requires='>=3.6.0',
-    package_data={},
-    zip_safe=False,
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Education",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    ],
-)
+from setuptools import setup, find_packages
+
+# Runtime dependencies live in requirements.txt so they are declared in one
+# place. Blank lines and "#" comment lines are skipped because they are not
+# valid requirement specifiers for install_requires.
+with open("requirements.txt", encoding="utf-8") as f:
+    requirements = [
+        line.strip()
+        for line in f
+        if line.strip() and not line.lstrip().startswith("#")
+    ]
+
+# Read the long description inside a context manager so the file handle is
+# closed deterministically instead of being left open until garbage collection.
+with open("README.md", "r", encoding="utf-8") as f:
+    long_description = f.read()
+
+setup(
+    name="koeda",
+    version="0.0.3",
+    description="Korean Easy Data Augmentation Package",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author="Dongju.Park",
+    author_email="[email protected]",
+    url="https://github.com/toriving/KoEDA",
+    package_dir={"": "src"},
+    packages=find_packages("src"),
+    install_requires=requirements,
+    keywords=["NLP deep learning koeda korean easy data augmentation"],
+    license="MIT",
+    python_requires=">=3.6.0",
+    # Ship the non-Python files selected by MANIFEST.in with the package.
+    include_package_data=True,
+    zip_safe=False,
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Education",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+)
diff --git a/src/koeda/__init__.py b/src/koeda/__init__.py
@@ -1,5 +1,17 @@
-if __name__ == "__main__":
-    print("Hello KoEDA")
+__title__ = "KoEDA"
+__version__ = "0.0.3"
 
-def hello():
-    print("Hello KoEDA")
+__author__ = "Dongju Park"
+__email__ = "[email protected]"
+__url__ = "http://toriving.github.io"
+
+__summary__ = "Easy Data Augmentation for Korean"
+__license__ = "MIT License"
+__copyright__ = "Copyright 2020 {}".format(__author__)
+
+
+from .eda import EasyDataAugmentation
+
+from .augmenters import RandomDeletion, RandomInsertion, SynonymReplacement, RandomSwap
+
+from .utils import STOPWORD, WORDNET, get_synonyms
diff --git a/src/koeda/augmenters/__init__.py b/src/koeda/augmenters/__init__.py
@@ -0,0 +1,6 @@
+__all__ = ["RandomDeletion", "RandomInsertion", "SynonymReplacement", "RandomSwap"]
+
+from .deletion import *
+from .insertion import *
+from .replacement import *
+from .swap import *
diff --git a/src/koeda/augmenters/deletion.py b/src/koeda/augmenters/deletion.py
@@ -0,0 +1,71 @@
+import random
+from typing import Union, List
+from itertools import repeat, chain
+
+from konlpy.tag import *
+
+from koeda.utils import replace_space, revert_space, SPACE_TOKEN
+
+
+class RandomDeletion:
+    """Augmenter that randomly deletes morphemes from text.
+
+    Calling an instance is equivalent to calling ``random_deletion``.
+    """
+
+    def __init__(self, morpheme_analyzer: str = None):
+        """Select the morpheme analyzer used to tokenize input text.
+
+        Args:
+            morpheme_analyzer: ``None`` to use ``Okt``; one of the names
+                ``"Okt"``, ``"Kkma"``, ``"Komoran"``, ``"Mecab"`` or
+                ``"Hannanum"`` to instantiate that analyzer; or any object
+                that has a ``morphs`` attribute, which is used as-is.
+
+        Raises:
+            ValueError: If ``morpheme_analyzer`` is none of the above.
+        """
+        if morpheme_analyzer is None:
+            self.morpheme_analyzer = Okt()
+        elif morpheme_analyzer in ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]:
+            # NOTE(review): eval() is only reached with one of the fixed names
+            # listed above, never with arbitrary caller text. Keep that guard
+            # in place if this list is ever extended.
+            self.morpheme_analyzer = eval(morpheme_analyzer)()
+        elif hasattr(morpheme_analyzer, "morphs"):
+            self.morpheme_analyzer = morpheme_analyzer
+        else:
+            raise ValueError("Does not support morpheme analyzer.")
+
+    def __call__(self, *args, **kwargs):
+        """Forward all arguments to ``random_deletion``."""
+        return self.random_deletion(*args, **kwargs)
+
+    def random_deletion(
+        self, data: Union[List[str], str], p: float = 0.1, repetition: int = 1
+    ) -> Union[List[str], str]:
+        """Apply random deletion to one text or a list of texts.
+
+        Args:
+            data: A single string or a list of strings to augment.
+            p: Probability with which each morpheme is deleted.
+            repetition: Number of augmented outputs to produce per input.
+                Values of 1 or less produce one output per input.
+
+        Returns:
+            For a string with ``repetition <= 1``, one augmented string.
+            For a string with ``repetition > 1``, a list of ``repetition``
+            results. For a list, a list holding ``repetition`` consecutive
+            results for each element, in input order.
+
+        Raises:
+            TypeError: If ``data`` is neither a ``str`` nor a ``list``.
+        """
+        if isinstance(data, str):
+            if repetition <= 1:
+                return self._deletion(data, p)
+            else:
+                return list(
+                    map(self._deletion, repeat(data, repetition), repeat(p, repetition))
+                )
+        elif isinstance(data, list):
+            if repetition <= 1:
+                return list(map(self._deletion, data, repeat(p, len(data))))
+            else:
+                return list(
+                    map(
+                        self._deletion,
+                        # Repeat each element back-to-back so the outputs for
+                        # one input stay adjacent in the result.
+                        chain.from_iterable(repeat(x, repetition) for x in data),
+                        repeat(p, len(data) * repetition),
+                    )
+                )
+        else:
+            raise TypeError(f"Does not support the data type : {type(data)}")
+
+    def _deletion(self, data: str, p: float = 0.1) -> str:
+        """Delete each morpheme of ``data`` independently with probability ``p``.
+
+        Args:
+            data: The text to augment.
+            p: Deletion probability applied to every non-space morpheme.
+
+        Returns:
+            The augmented text. Input with at most one morpheme is returned
+            unchanged. If every morpheme gets deleted, one randomly chosen
+            morpheme is returned instead.
+        """
+        words = self.morpheme_analyzer.morphs(data)
+
+        # Nothing can be deleted from empty or single-morpheme input, so hand
+        # the text back unchanged. Returning ``data`` keeps the declared str
+        # return type and avoids random.randint(0, -1) below for empty input.
+        if len(words) <= 1:
+            return data
+
+        # Tokenize a second time with spaces replaced by SPACE_TOKEN so the
+        # space markers appear as tokens and are carried through to the output.
+        split_words = self.morpheme_analyzer.morphs(replace_space(data))
+
+        # Space markers are always kept and consume no random draw; every
+        # other morpheme survives only when its draw exceeds p.
+        new_words = [
+            word
+            for word in split_words
+            if word == SPACE_TOKEN or random.uniform(0, 1) > p
+        ]
+
+        # If every real morpheme was deleted, fall back to one random morpheme.
+        # Index into ``words`` (the morphemes), not ``data`` (raw characters).
+        if all(word == SPACE_TOKEN for word in new_words):
+            rand_int = random.randint(0, len(words) - 1)
+            return revert_space([words[rand_int]])
+
+        return revert_space(new_words)