From 83baa9bb740aab9f5fe34bc87d90ad2e0b11a2dd Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Tue, 19 Sep 2023 14:15:07 +0000 Subject: [PATCH 1/5] Don't load package version from source file --- setup.py | 8 ++------ src/reynir_correct/__init__.py | 2 +- src/reynir_correct/version.py | 1 - 3 files changed, 3 insertions(+), 8 deletions(-) delete mode 100644 src/reynir_correct/version.py diff --git a/setup.py b/setup.py index 7c0b521..6e7c83a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ """ Greynir: Natural language processing for Icelandic - Setup.py + setup.py Copyright (C) 2023 Miðeind ehf. Original author: Vilhjálmur Þorsteinsson @@ -59,13 +59,9 @@ def read(*names: str, **kwargs: str): return "" -# Load version string from file -__version__ = "[missing]" -exec(open(join("src", "reynir_correct", "version.py")).read()) - setup( name="reynir-correct", - version=__version__, + version="4.0.0", # Remember to update in __init__.py as well license="MIT", description="A spelling and grammar corrector for Icelandic", long_description="{0}\n{1}".format( diff --git a/src/reynir_correct/__init__.py b/src/reynir_correct/__init__.py index 9afb89e..92f01cb 100644 --- a/src/reynir_correct/__init__.py +++ b/src/reynir_correct/__init__.py @@ -43,11 +43,11 @@ from .errtokenizer import Correct_TOK, CorrectionPipeline, CorrectToken from .readability import FleschKincaidFeedback, FleschKincaidScorer, RareWordsFinder from .settings import Settings -from .version import __version__ from .wrappers import CorrectedSentence, CorrectionResult, GreynirCorrectAPI, ParseResultStats, check_errors __author__ = "Miðeind ehf" __copyright__ = "(C) 2023 Miðeind ehf." 
+__version__ = "4.0.0" # Remember to update in setup.py as well __all__ = ( "Greynir", diff --git a/src/reynir_correct/version.py b/src/reynir_correct/version.py deleted file mode 100644 index ce1305b..0000000 --- a/src/reynir_correct/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "4.0.0" From b8e477376da00ca0c9765cce2a7809abe55da466 Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Tue, 19 Sep 2023 16:20:39 +0000 Subject: [PATCH 2/5] Typing fixes + formatted excessively long lines in readability.py --- src/reynir_correct/errtokenizer.py | 2 +- src/reynir_correct/readability.py | 7 +++++-- src/reynir_correct/spelling.py | 2 +- src/reynir_correct/wrappers.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/reynir_correct/errtokenizer.py b/src/reynir_correct/errtokenizer.py index 4663fc8..f57e50c 100644 --- a/src/reynir_correct/errtokenizer.py +++ b/src/reynir_correct/errtokenizer.py @@ -2972,7 +2972,7 @@ def __init__( self._suggest_not_correct = options.pop("suggest_not_correct", False) # Wordlist for words that should not be marked as errors or corrected self._ignore_wordlist = options.pop("ignore_wordlist", set()) - self._ignore_rules = options.pop("ignore_rules", set()) + self._ignore_rules = cast(frozenset, options.pop("ignore_rules", frozenset())) self.settings = settings def correct_tokens(self, stream: TokenIterator) -> TokenIterator: diff --git a/src/reynir_correct/readability.py b/src/reynir_correct/readability.py index 61b5b55..928b1fc 100644 --- a/src/reynir_correct/readability.py +++ b/src/reynir_correct/readability.py @@ -31,6 +31,7 @@ This module implements the Flesch reading ease score for Icelandic text. A high score indicates that the text is easy to read, while a low score indicates that the text is difficult to read. + """ from __future__ import annotations @@ -190,7 +191,8 @@ class RareWordsFinder: Rare words are defined as words which have a probability lower than the low_prob_cutoff. 
The probability of a word is calculated by looking up the word in an n-gram model. - The class is designed to be used with the tokenizer module and maintains an internal state which needs to be reset manually. + The class is designed to be used with the tokenizer module and maintains an internal + state which needs to be reset manually. """ def __init__(self): @@ -200,7 +202,8 @@ def __init__(self): def get_rare_words_from_stream( self, tok_stream: Iterable[tokenizer.Tok], max_words: int, low_prob_cutoff: float ) -> List[Tuple[str, float]]: - """Tracks the probability of each word in a token stream. This is done by yielding the tokens in the token stream.""" + """Tracks the probability of each word in a token stream. + This is done by yielding the tokens in the token stream.""" rare_words_dict: Dict[str, float] = {} for token in tok_stream: # Only consider words, not punctuation, numbers, etc. diff --git a/src/reynir_correct/spelling.py b/src/reynir_correct/spelling.py index dc31a5b..07311f3 100644 --- a/src/reynir_correct/spelling.py +++ b/src/reynir_correct/spelling.py @@ -661,7 +661,7 @@ def _case_of(text: str) -> Callable[[str], str]: # We don't use .istitle() and .title() because # they consider apostrophes to be word separators return lambda s: s[0].upper() + s[1:] - return str + return str # noqa def _cast(self, word: str) -> str: """Cast the word to lowercase and correct accents""" diff --git a/src/reynir_correct/wrappers.py b/src/reynir_correct/wrappers.py index b9d0e9f..709924c 100644 --- a/src/reynir_correct/wrappers.py +++ b/src/reynir_correct/wrappers.py @@ -294,7 +294,7 @@ def _correct_spelling( # TODO: The pipeline needs a refactoring. # We use some hacks here to avoid having to rewrite the pipeline at this point. 
self.gc.pipeline._text_or_gen = text - self.gc.pipeline._ignore_rules = ignore_rules or set() + self.gc.pipeline._ignore_rules = cast(frozenset, ignore_rules or frozenset()) self.gc.pipeline._suppress_suggestions = suppress_suggestions return self.gc.pipeline.tokenize() # type: ignore From 09089f5be0561752fb3dc35e932aa68f72fab697 Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Tue, 19 Sep 2023 20:29:07 +0000 Subject: [PATCH 3/5] Added package build settings and metadata to pyproject.toml --- pyproject.toml | 54 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 12f05b0..6d6987f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,54 @@ +[project] +name = "reynir-correct" +version = "4.0.0" +description = "Spelling and grammar correction for Icelandic" +authors = [{ name = "Miðeind ehf", email = "mideind@mideind.is" }] +readme = { file = "README.rst", content-type = "text/x-rst" } +license = { file = "LICENSE.txt" } +# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: Unix", + "Operating System :: POSIX", + "Operating System :: Microsoft :: Windows", + "Operating System :: MacOS", + "Natural Language :: Icelandic", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Utilities", + "Topic :: Text Processing :: Linguistic", 
+] +requires-python = ">=3.8" +dependencies = ["reynir>=3.5.3", "icegrams>=1.1.2", "typing_extensions"] + +[project.urls] +Repository = "https://github.com/mideind/GreynirCorrect" + +[project.optional-dependencies] +# Dev dependencies +dev = [] + +# *** Configuration of tools *** + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +filterwarnings = [ + # Ignore deprecation warnings in libraries, their problem not ours + "ignore::DeprecationWarning", +] + [tool.ruff] line-length = 120 @@ -5,6 +56,7 @@ line-length = 120 line-length = 120 [tool.isort] -known_future_library = ["__future__", "typing", "typing_extensions"] # This forces these imports to placed at the top +# This forces these imports to be placed at the top +known_future_library = ["__future__", "typing", "typing_extensions"] profile = "black" line_length = 120 From d97a4d5957d8927d9fc7f86475ffef113d112bfc Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Wed, 20 Sep 2023 20:08:13 +0000 Subject: [PATCH 4/5] Reformatted excessively long lines of code --- src/reynir_correct/errtokenizer.py | 13 ++++++++++--- src/reynir_correct/main.py | 12 +++++++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/reynir_correct/errtokenizer.py b/src/reynir_correct/errtokenizer.py index f57e50c..863918c 100644 --- a/src/reynir_correct/errtokenizer.py +++ b/src/reynir_correct/errtokenizer.py @@ -1841,7 +1841,8 @@ def add_ritmyndir_error(token: CorrectToken) -> CorrectToken: return token def get_details(code: str, txt: str, correct: str, lemma: str) -> Tuple[str, str, List[str]]: - """Return short and detailed descriptions for the error category plus a link to grammar references where possible""" + """Return short and detailed descriptions for the error category + plus a link to grammar references where possible""" # text is the short version, about the category and the error. # details is the long version with references.
try: @@ -2497,7 +2498,12 @@ def number_error(token: CorrectToken, replace: str, code: str, instruction_txt: suggest=correct, ) ) - if suppress_suggestions and token.error_code == "Z001" and isinstance(token.val, list) and any(v.ordfl == "lo" for v in token.val): # type: ignore + if ( + suppress_suggestions + and token.error_code == "Z001" + and isinstance(token.val, list) + and any(v.ordfl == "lo" for v in token.val) + ): orig = token.original.strip() if token.original else token.txt token.remove_error(orig) @@ -2968,7 +2974,8 @@ def __init__( self._generate_suggestion_list = options.pop("generate_suggestion_list", False) # Skip spelling suggestions self._suppress_suggestions = options.pop("suppress_suggestions", False) - # Only give suggestions, don't correct everything automatically. Currently only applies to lookup_unknown_words and check_wording. + # Only give suggestions, don't correct everything automatically. + # Currently only applies to lookup_unknown_words and check_wording. self._suggest_not_correct = options.pop("suggest_not_correct", False) # Wordlist for words that should not be marked as errors or corrected self._ignore_wordlist = options.pop("ignore_wordlist", set()) diff --git a/src/reynir_correct/main.py b/src/reynir_correct/main.py index 45158b5..f16bea5 100644 --- a/src/reynir_correct/main.py +++ b/src/reynir_correct/main.py @@ -82,7 +82,11 @@ nargs="?", type=str, default="text", - help="Determine output format.\ntext: Corrected text only.\ncsv: One token per line in CSV format.\njson: One token per line in JSON format.\nm2: M2 format, GEC standard.", + help="""Determine output format. +text: Corrected text only. +csv: One token per line in CSV format. +json: One token per line in JSON format. 
+m2: M2 format, GEC standard.""", ) # Determines whether we supply only token-level annotations or also sentence-level annotations @@ -127,7 +131,8 @@ parser.add_argument( "--sentence_prefilter", - help="Run a heuristic filter on sentences to determine whether they are probably correct. Probably correct sentences will not go through the full parsing process.", + help="""Run a heuristic filter on sentences to determine whether they are probably +correct. Probably correct sentences will not go through the full parsing process.""", action="store_true", ) parser.add_argument( @@ -145,7 +150,8 @@ "--tov_config", nargs=1, type=str, - help="Add additional use-specific rules in a configuration file to check for custom tone-of-voice issues. Uses the same format as the default GreynirCorrect.conf file", + help="""Add additional use-specific rules in a configuration file to check for custom +tone-of-voice issues. Uses the same format as the default GreynirCorrect.conf file""", default=None, ) From 5fbf2f13f1e323c94d7d8241902f7c2613dbe86d Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Fri, 22 Sep 2023 16:01:36 +0000 Subject: [PATCH 5/5] Updated pyproject.toml, rm setup.py, updated CI config --- .github/workflows/python-package.yml | 4 +- pyproject.toml | 13 ++- setup.py | 116 --------------------------- src/reynir_correct/__init__.py | 2 +- 4 files changed, 14 insertions(+), 121 deletions(-) delete mode 100644 setup.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 1d85460..bc91e49 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -18,9 +18,9 @@ jobs: python-version: [ "3.8", "3.9", "3.10", "3.11", "pypy-3.9", "pypy-3.10"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install 
GreynirCorrect diff --git a/pyproject.toml b/pyproject.toml index 6d6987f..795d9b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,14 +35,23 @@ dependencies = ["reynir>=3.5.3", "icegrams>=1.1.2", "typing_extensions"] Repository = "https://github.com/mideind/GreynirCorrect" [project.optional-dependencies] -# Dev dependencies -dev = [] +# dev dependencies +dev = ["pytest"] +# sentence_classifier dependencies +sentence_classifier = ["transformers", "datasets", "torch"] + +[project.scripts] +# 'correct' command line tool +correct = "reynir_correct.main:main" # *** Configuration of tools *** [tool.setuptools.packages.find] where = ["src"] +[tool.setuptools.package-data] +where = ["src"] + [tool.pytest.ini_options] filterwarnings = [ # Ignore deprecation warnings in libraries, their problem not ours diff --git a/setup.py b/setup.py deleted file mode 100644 index 6e7c83a..0000000 --- a/setup.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python3 -""" - Greynir: Natural language processing for Icelandic - - setup.py - - Copyright (C) 2023 Miðeind ehf. - Original author: Vilhjálmur Þorsteinsson - - This software is licensed under the MIT License: - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - This module sets up the GreynirCorrect package and installs the - 'correct' command-line utility. - - This package requires Python >= 3.8, and supports PyPy >= 3.8. - -""" - -import io -import re -import sys - -from glob import glob -from os.path import basename, dirname, join, splitext - -from setuptools import find_packages # type: ignore -from setuptools import setup # type: ignore - - -if sys.version_info < (3, 8): - print("GreynirCorrect requires Python >= 3.8") - sys.exit(1) - - -def read(*names: str, **kwargs: str): - try: - return io.open(join(dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8")).read() - except (IOError, OSError): - return "" - - -setup( - name="reynir-correct", - version="4.0.0", # Remember to update in __init__.py as well - license="MIT", - description="A spelling and grammar corrector for Icelandic", - long_description="{0}\n{1}".format( - re.compile("^.. start-badges.*^.. 
end-badges", re.M | re.S).sub("", read("README.rst")), - re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", read("CHANGELOG.rst")), - ), - author="Miðeind ehf", - author_email="mideind@mideind.is", - url="https://github.com/mideind/GreynirCorrect", - packages=find_packages("src"), - package_dir={"": "src"}, - py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], - package_data={"reynir_correct": ["py.typed"]}, - include_package_data=True, - zip_safe=True, - classifiers=[ - # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Operating System :: Unix", - "Operating System :: POSIX", - "Operating System :: Microsoft :: Windows", - "Operating System :: MacOS", - "Natural Language :: Icelandic", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Utilities", - "Topic :: Text Processing :: Linguistic", - ], - keywords=["nlp", "parser", "icelandic", "spellchecker"], - setup_requires=[], - install_requires=["reynir>=3.5.3", "icegrams>=1.1.2", "typing_extensions"], - extras_require={ - "sentence_classifier": ["transformers", "datasets", "torch"], - }, - # Set up a 'correct' command ('correct.exe' on Windows), - # which calls main() in src/reynir-correct/main.py - entry_points={ - "console_scripts": [ - "correct=reynir_correct.main:main", - ], - }, -) diff --git a/src/reynir_correct/__init__.py b/src/reynir_correct/__init__.py index 92f01cb..49d6b3e 100644 --- 
a/src/reynir_correct/__init__.py +++ b/src/reynir_correct/__init__.py @@ -47,7 +47,7 @@ __author__ = "Miðeind ehf" __copyright__ = "(C) 2023 Miðeind ehf." -__version__ = "4.0.0" # Remember to update in setup.py as well +__version__ = "4.0.0" # Remember to update in pyproject.toml as well __all__ = ( "Greynir",