diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..e994796 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,8 @@ +[bumpversion] +current_version = 0.3.6 +commit = True +tag = True +tag_name = {new_version} + +[bumpversion:file:setup.py] +parse = version\s*=\s*['"](?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+) \ No newline at end of file diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..7062cb6 --- /dev/null +++ b/.flake8 @@ -0,0 +1,30 @@ +[flake8] +max-line-length = 119 +per-file-ignores = +# Exclude files that are meant to provide top-level imports +# E402: Module level import not at top of file +# F401: Module imported but unused + sklearn_crfsuite/__init__.py:E402 + +# Issues pending a review: +ignore = + B017, + D100, + D101, + D102, + D103, + D104, + D105, + D107, + D200, + D205, + D400, + E203, + E501, + E701, + E704, + F401, + W503 +exclude = + docs/conf.py + setup.py diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..203d495 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,22 @@ +name: publish +on: + push: + tags: + - "[0-9]+.[0-9]+.[0-9]+" +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Install dependencies + run: | + pip install --upgrade build twine + python -m build + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@v1.8.14 + with: + password: ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..7a3424a --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,53 @@ +name: tox +on: + pull_request: + push: + branches: [ main ] +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - python-version: "3.8" + - python-version: "3.9" + - python-version: "3.10" + - python-version: "3.11" + - 
python-version: "3.12" + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox + - name: tox + run: | + tox -e ${{ matrix.toxenv || 'py' }} + - name: coverage + if: ${{ success() }} + run: bash <(curl -s https://codecov.io/bash) + check: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.12'] + tox-job: ["pre-commit", "mypy", "twinecheck"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox + - name: tox + run: | + tox -e ${{ matrix.tox-job }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..83e03b0 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +default_language_version: + python: python3 +repos: +- hooks: + - id: black + repo: https://github.com/ambv/black + rev: 24.3.0 +- hooks: + - id: isort + repo: https://github.com/PyCQA/isort + rev: 5.13.2 +- hooks: + - id: flake8 + additional_dependencies: + - flake8-bugbear + - flake8-comprehensions + - flake8-debugger + - flake8-docstrings + - flake8-string-format + repo: https://github.com/pycqa/flake8 + rev: 7.0.0 +- repo: https://github.com/asottile/pyupgrade + rev: v3.15.2 + hooks: + - id: pyupgrade + args: [--py38-plus, --keep-runtime-typing] diff --git a/docs/conf.py b/docs/conf.py index 7c94736..2eb78cf 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # sklearn-crfsuite documentation build configuration file, created by # sphinx-quickstart on Fri Nov 27 03:50:38 2015. 
@@ -13,103 +12,106 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os import shlex +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath("..")) from unittest.mock import MagicMock + class Mock(MagicMock): @classmethod def __getattr__(cls, name): return MagicMock() -MOCK_MODULES = ['sklearn', 'sklearn.metrics', 'tqdm', 'tabulate', 'numpy', - 'pycrfsuite'] + +MOCK_MODULES = ["sklearn", "sklearn.metrics", "tqdm", "tabulate", "numpy", "pycrfsuite"] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.viewcode', - 'sphinx.ext.napoleon', - 'sphinx.ext.mathjax', + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", + "sphinx.ext.autodoc", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "sphinx.ext.mathjax", ] numpydoc_show_class_members = False -autodoc_member_order = 'bysource' +autodoc_member_order = "bysource" # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. 
-master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'sklearn-crfsuite' -copyright = '2015, Mikhail Korobov' -author = 'Mikhail Korobov' +project = "sklearn-crfsuite" +copyright = "Mikhail Korobov" +author = "Mikhail Korobov" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.3' +version = "0.3" # The full version, including alpha/beta/rc tags. -release = '0.3' +release = "0.3" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build', 'CoNLL2002.rst'] +exclude_patterns = ["_build", "CoNLL2002.rst"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. 
-#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -127,135 +129,137 @@ def __getattr__(cls, name): # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # "<project> v<release> documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. 
-#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a <link> tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' -#html_search_language = 'en' +# html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} +# html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. 
If empty, the default will be used. -#html_search_scorer = 'scorer.js' +# html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'sklearn-crfsuitedoc' +htmlhelp_basename = "sklearn-crfsuitedoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', + # Latex figure (float) alignment + #'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'sklearn-crfsuite.tex', 'sklearn-crfsuite Documentation', - 'Mikhail Korobov', 'manual'), + ( + master_doc, + "sklearn-crfsuite.tex", + "sklearn-crfsuite Documentation", + "Mikhail Korobov", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. 
-#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -263,12 +267,11 @@ def __getattr__(cls, name): # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'sklearn-crfsuite', 'sklearn-crfsuite Documentation', - [author], 1) + (master_doc, "sklearn-crfsuite", "sklearn-crfsuite Documentation", [author], 1) ] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -277,19 +280,25 @@ def __getattr__(cls, name): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'sklearn-crfsuite', 'sklearn-crfsuite Documentation', - author, 'sklearn-crfsuite', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "sklearn-crfsuite", + "sklearn-crfsuite Documentation", + author, + "sklearn-crfsuite", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. 
-#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..499a35d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,6 @@ +ipython +matplotlib +numpy +pickleshare # ipython optional dependency +sphinx==6.2.1 +sphinx-rtd-theme==2.0.0 diff --git a/setup.py b/setup.py index c454d67..02724c7 100755 --- a/setup.py +++ b/setup.py @@ -2,33 +2,34 @@ from setuptools import setup setup( - name='sklearn-crfsuite', - version='0.3.6', - author='Mikhail Korobov', - author_email='kmike84@gmail.com', - license='MIT license', - long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(), + name="sklearn-crfsuite", + version="0.3.6", + author="Mikhail Korobov", + author_email="kmike84@gmail.com", + license="MIT license", + long_description=open("README.rst").read() + "\n\n" + open("CHANGES.rst").read(), + long_description_content_type="text/x-rst", description="CRFsuite (python-crfsuite) wrapper which provides interface simlar to scikit-learn", - url='https://github.com/TeamHG-Memex/sklearn-crfsuite', + url="https://github.com/TeamHG-Memex/sklearn-crfsuite", zip_safe=False, - packages=['sklearn_crfsuite'], + packages=["sklearn_crfsuite"], install_requires=[ + "python-crfsuite >= 0.9.7", + "scikit-learn >= 0.24.0", + "tabulate >= 0.4.2", "tqdm >= 2.0", - "six", - "tabulate", - "python-crfsuite >= 0.8.3" ], classifiers=[ - 'Development Status :: 3 - Alpha', - 'License :: OSI Approved :: MIT License', - 'Intended Audience :: Developers', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', + "Development Status :: 3 - Alpha", + "License :: OSI Approved :: MIT License", + "Intended Audience 
:: Developers", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ], ) diff --git a/sklearn_crfsuite/__init__.py b/sklearn_crfsuite/__init__.py index fe61bef..623fc0e 100644 --- a/sklearn_crfsuite/__init__.py +++ b/sklearn_crfsuite/__init__.py @@ -1,2 +1 @@ -# -*- coding: utf-8 -*- from .estimator import CRF diff --git a/sklearn_crfsuite/_fileresource.py b/sklearn_crfsuite/_fileresource.py index 3569cad..c28d63c 100644 --- a/sklearn_crfsuite/_fileresource.py +++ b/sklearn_crfsuite/_fileresource.py @@ -1,17 +1,16 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import os import tempfile -class FileResource(object): +class FileResource: """ Object that "owns" a file on a filesystem. If the ``filename`` is None, it maintains a temporary file which name is accessible via ``name`` attribute; when pickling, the contents of this file is pickled; when unpickling, a new temp file is created; temp files are auto-deleted. 
""" - def __init__(self, filename=None, keep_tempfiles=False, suffix='', prefix=''): + + def __init__(self, filename=None, keep_tempfiles=False, suffix="", prefix=""): self.name = filename self.auto = filename is None self.keep_tempfiles = keep_tempfiles @@ -19,7 +18,7 @@ def __init__(self, filename=None, keep_tempfiles=False, suffix='', prefix=''): self.prefix = prefix def ensure_name(self): - """ Ensure that a filename is available """ + """Ensure that a filename is available""" if self.name is not None: return if self.auto: @@ -28,7 +27,7 @@ def ensure_name(self): raise ValueError("File name is not provided") def cleanup(self): - """ Clean temporary files if needed """ + """Clean temporary files if needed""" if self.keep_tempfiles or not self.auto: return @@ -54,24 +53,23 @@ def __getstate__(self): dct = self.__dict__.copy() if self.auto: - filename = dct['name'] + filename = dct["name"] if filename is not None: try: - with open(filename, 'rb') as f: - dct['__FILE_RESOURCE_DATA__'] = f.read() - except IOError: + with open(filename, "rb") as f: + dct["__FILE_RESOURCE_DATA__"] = f.read() + except OSError: pass - dct['name'] = None + dct["name"] = None return dct def __setstate__(self, state): - data = state.pop('__FILE_RESOURCE_DATA__', None) + data = state.pop("__FILE_RESOURCE_DATA__", None) self.__dict__.update(state) if data is not None: assert self.name is None self.ensure_name() - with open(self.name, 'wb') as f: + with open(self.name, "wb") as f: f.write(data) - diff --git a/sklearn_crfsuite/compat.py b/sklearn_crfsuite/compat.py index 34c2315..5360600 100644 --- a/sklearn_crfsuite/compat.py +++ b/sklearn_crfsuite/compat.py @@ -1,6 +1,6 @@ -# -*- coding: utf-8 -*- try: from sklearn.base import BaseEstimator except ImportError: - class BaseEstimator(object): + + class BaseEstimator: pass diff --git a/sklearn_crfsuite/estimator.py b/sklearn_crfsuite/estimator.py index 298e20e..65f0891 100644 --- a/sklearn_crfsuite/estimator.py +++ 
b/sklearn_crfsuite/estimator.py @@ -1,13 +1,9 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - -from six.moves import zip -from tqdm import tqdm import pycrfsuite +from tqdm import tqdm from sklearn_crfsuite._fileresource import FileResource -from sklearn_crfsuite.trainer import LinePerIterationTrainer from sklearn_crfsuite.compat import BaseEstimator +from sklearn_crfsuite.trainer import LinePerIterationTrainer class CRF(BaseEstimator): @@ -207,37 +203,38 @@ class CRF(BaseEstimator): is to use pickle (or its alternatives like joblib). """ - def __init__(self, - algorithm=None, - - min_freq=None, - all_possible_states=None, - all_possible_transitions=None, - c1=None, - c2=None, - max_iterations=None, - num_memories=None, - epsilon=None, - period=None, - delta=None, - linesearch=None, - max_linesearch=None, - calibration_eta=None, - calibration_rate=None, - calibration_samples=None, - calibration_candidates=None, - calibration_max_trials=None, - pa_type=None, - c=None, - error_sensitive=None, - averaging=None, - variance=None, - gamma=None, - - verbose=False, - model_filename=None, - keep_tempfiles=False, - trainer_cls=None): + + def __init__( + self, + algorithm=None, + min_freq=None, + all_possible_states=None, + all_possible_transitions=None, + c1=None, + c2=None, + max_iterations=None, + num_memories=None, + epsilon=None, + period=None, + delta=None, + linesearch=None, + max_linesearch=None, + calibration_eta=None, + calibration_rate=None, + calibration_samples=None, + calibration_candidates=None, + calibration_max_trials=None, + pa_type=None, + c=None, + error_sensitive=None, + averaging=None, + variance=None, + gamma=None, + verbose=False, + model_filename=None, + keep_tempfiles=False, + trainer_cls=None, + ): self.algorithm = algorithm self.min_freq = min_freq @@ -263,12 +260,14 @@ def __init__(self, self.averaging = averaging self.variance = variance self.gamma = gamma + self.model_filename = model_filename + self.keep_tempfiles = 
keep_tempfiles self.modelfile = FileResource( filename=model_filename, keep_tempfiles=keep_tempfiles, suffix=".crfsuite", - prefix="model" + prefix="model", ) self.verbose = verbose self.trainer_cls = trainer_cls @@ -295,7 +294,9 @@ def fit(self, X, y, X_dev=None, y_dev=None): y_dev : (optional) list of lists of strings Labels corresponding to X_dev. """ - if (X_dev is None and y_dev is not None) or (X_dev is not None and y_dev is None): + if (X_dev is None and y_dev is not None) or ( + X_dev is not None and y_dev is None + ): raise ValueError("Pass both X_dev and y_dev to use the holdout data") if self._tagger is not None: @@ -308,7 +309,9 @@ def fit(self, X, y, X_dev=None, y_dev=None): train_data = zip(X, y) if self.verbose: - train_data = tqdm(train_data, "loading training data to CRFsuite", len(X), leave=True) + train_data = tqdm( + train_data, "loading training data to CRFsuite", len(X), leave=True + ) for xseq, yseq in train_data: trainer.append(xseq, yseq) @@ -320,7 +323,9 @@ def fit(self, X, y, X_dev=None, y_dev=None): test_data = zip(X_dev, y_dev) if self.verbose: - test_data = tqdm(test_data, "loading dev data to CRFsuite", len(X_dev), leave=True) + test_data = tqdm( + test_data, "loading dev data to CRFsuite", len(X_dev), leave=True + ) for xseq, yseq in test_data: trainer.append(xseq, yseq, 1) @@ -412,6 +417,7 @@ def score(self, X, y): For other metrics check :mod:`sklearn_crfsuite.metrics`. 
""" from sklearn_crfsuite.metrics import flat_accuracy_score + y_pred = self.predict(X) return flat_accuracy_score(y, y_pred) @@ -446,7 +452,7 @@ def size_(self): """ if self._info is None: return None - return int(self._info.header['size']) + return int(self._info.header["size"]) @property def num_attributes_(self): @@ -455,7 +461,7 @@ def num_attributes_(self): """ if self._info is None: return None - return int(self._info.header['num_attrs']) + return int(self._info.header["num_attrs"]) @property def attributes_(self): @@ -502,29 +508,29 @@ def _info(self): def _get_trainer(self): trainer_cls = self.trainer_cls or LinePerIterationTrainer params = { - 'feature.minfreq': self.min_freq, - 'feature.possible_states': self.all_possible_states, - 'feature.possible_transitions': self.all_possible_transitions, - 'c1': self.c1, - 'c2': self.c2, - 'max_iterations': self.max_iterations, - 'num_memories': self.num_memories, - 'epsilon': self.epsilon, - 'period': self.period, - 'delta': self.delta, - 'linesearch': self.linesearch, - 'max_linesearch': self.max_linesearch, - 'calibration.eta': self.calibration_eta, - 'calibration.rate': self.calibration_rate, - 'calibration.samples': self.calibration_samples, - 'calibration.candidates': self.calibration_candidates, - 'calibration.max_trials': self.calibration_max_trials, - 'type': self.pa_type, - 'c': self.c, - 'error_sensitive': self.error_sensitive, - 'averaging': self.averaging, - 'variance': self.variance, - 'gamma': self.gamma, + "feature.minfreq": self.min_freq, + "feature.possible_states": self.all_possible_states, + "feature.possible_transitions": self.all_possible_transitions, + "c1": self.c1, + "c2": self.c2, + "max_iterations": self.max_iterations, + "num_memories": self.num_memories, + "epsilon": self.epsilon, + "period": self.period, + "delta": self.delta, + "linesearch": self.linesearch, + "max_linesearch": self.max_linesearch, + "calibration.eta": self.calibration_eta, + "calibration.rate": self.calibration_rate, 
+ "calibration.samples": self.calibration_samples, + "calibration.candidates": self.calibration_candidates, + "calibration.max_trials": self.calibration_max_trials, + "type": self.pa_type, + "c": self.c, + "error_sensitive": self.error_sensitive, + "averaging": self.averaging, + "variance": self.variance, + "gamma": self.gamma, } params = {k: v for k, v in params.items() if v is not None} return trainer_cls( @@ -535,6 +541,6 @@ def _get_trainer(self): def __getstate__(self): dct = self.__dict__.copy() - dct['_tagger'] = None - dct['_info_cached'] = None + dct["_tagger"] = None + dct["_info_cached"] = None return dct diff --git a/sklearn_crfsuite/metrics.py b/sklearn_crfsuite/metrics.py index fe7d026..6f1d0b1 100644 --- a/sklearn_crfsuite/metrics.py +++ b/sklearn_crfsuite/metrics.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import, division from functools import wraps from sklearn_crfsuite.utils import flatten @@ -11,6 +9,7 @@ def wrapper(y_true, y_pred, *args, **kwargs): y_true_flat = flatten(y_true) y_pred_flat = flatten(y_pred) return func(y_true_flat, y_pred_flat, *args, **kwargs) + return wrapper @@ -20,6 +19,7 @@ def flat_accuracy_score(y_true, y_pred): Return accuracy score for sequence items. """ from sklearn import metrics + return metrics.accuracy_score(y_true, y_pred) @@ -29,6 +29,7 @@ def flat_precision_score(y_true, y_pred, **kwargs): Return precision score for sequence items. """ from sklearn import metrics + return metrics.precision_score(y_true, y_pred, **kwargs) @@ -38,6 +39,7 @@ def flat_recall_score(y_true, y_pred, **kwargs): Return recall score for sequence items. """ from sklearn import metrics + return metrics.recall_score(y_true, y_pred, **kwargs) @@ -47,6 +49,7 @@ def flat_f1_score(y_true, y_pred, **kwargs): Return F1 score for sequence items. 
""" from sklearn import metrics + return metrics.f1_score(y_true, y_pred, **kwargs) @@ -56,7 +59,8 @@ def flat_fbeta_score(y_true, y_pred, beta, **kwargs): Return F-beta score for sequence items. """ from sklearn import metrics - return metrics.fbeta_score(y_true, y_pred, beta, **kwargs) + + return metrics.fbeta_score(y_true, y_pred, beta=beta, **kwargs) @_flattens_y @@ -65,6 +69,7 @@ def flat_classification_report(y_true, y_pred, labels=None, **kwargs): Return classification report for sequence items. """ from sklearn import metrics + return metrics.classification_report(y_true, y_pred, labels, **kwargs) @@ -77,7 +82,8 @@ def sequence_accuracy_score(y_true, y_pred): if not total: return 0 - matches = sum(1 for yseq_true, yseq_pred in zip(y_true, y_pred) - if yseq_true == yseq_pred) + matches = sum( + 1 for yseq_true, yseq_pred in zip(y_true, y_pred) if yseq_true == yseq_pred + ) return matches / total diff --git a/sklearn_crfsuite/scorers.py b/sklearn_crfsuite/scorers.py index 0e51a43..24a19f9 100644 --- a/sklearn_crfsuite/scorers.py +++ b/sklearn_crfsuite/scorers.py @@ -1,12 +1,11 @@ -# -*- coding: utf-8 -*- """ Scorer functions to use with scikit-learn ``cross_val_score``, ``GridSearchCV``, ``RandomizedSearchCV`` and other similar classes. """ + from sklearn.metrics import make_scorer from sklearn_crfsuite import metrics - flat_accuracy = make_scorer(metrics.flat_accuracy_score) sequence_accuracy = make_scorer(metrics.sequence_accuracy_score) diff --git a/sklearn_crfsuite/trainer.py b/sklearn_crfsuite/trainer.py index 3aa2cd3..3457584 100644 --- a/sklearn_crfsuite/trainer.py +++ b/sklearn_crfsuite/trainer.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import pycrfsuite from tabulate import tabulate @@ -8,6 +7,7 @@ class LinePerIterationTrainer(pycrfsuite.Trainer): This pycrfsuite.Trainer prints information about each iteration on a single line. 
""" + def on_iteration(self, log, info): parts = [ "Iter {num:<3} ", @@ -15,18 +15,18 @@ def on_iteration(self, log, info): "loss={loss:<8.2f} ", ] - if 'active_features' in info: + if "active_features" in info: parts += ["active={active_features:<5} "] - if 'avg_precision' in info: + if "avg_precision" in info: parts += [ "precision={avg_precision:0.3f} ", "recall={avg_recall:0.3f} ", "F1={avg_f1:0.3f} ", - "Acc(item/seq)={item_accuracy_float:0.3f} {instance_accuracy_float:0.3f} " + "Acc(item/seq)={item_accuracy_float:0.3f} {instance_accuracy_float:0.3f} ", ] - if 'feature_norm' in info: + if "feature_norm" in info: parts += ["feature_norm={feature_norm:<8.2f}"] line = "".join(parts) @@ -34,17 +34,18 @@ def on_iteration(self, log, info): def on_optimization_end(self, log): last_iter = self.logparser.last_iteration - if last_iter.get('scores', None): + if last_iter.get("scores", None): data = [ [entity, score.precision, score.recall, score.f1 or 0, score.ref] - for entity, score in sorted(last_iter['scores'].items()) + for entity, score in sorted(last_iter["scores"].items()) ] - table = tabulate(data, + table = tabulate( + data, headers=["Label", "Precision", "Recall", "F1", "Support"], floatfmt="0.3f", ) size = len(table.splitlines()[0]) - print("="*size) + print("=" * size) print(table) - print("-"*size) - super(LinePerIterationTrainer, self).on_optimization_end(log) + print("-" * size) + super().on_optimization_end(log) diff --git a/sklearn_crfsuite/utils.py b/sklearn_crfsuite/utils.py index 7999ba8..788453b 100644 --- a/sklearn_crfsuite/utils.py +++ b/sklearn_crfsuite/utils.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from itertools import chain diff --git a/tests/conftest.py b/tests/conftest.py index 93f710a..96810f1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,24 +1,33 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import pytest @pytest.fixture() def xseq(): return [ - {'walk': 1, 'shop': 0.5}, - {'walk': 1}, - {'walk': 1, 
'clean': 0.5}, - {u'shop': 0.5, u'clean': 0.5}, - {'walk': 0.5, 'clean': 1}, - {'clean': 1, u'shop': 0.1}, - {'walk': 1, 'shop': 0.5}, + {"walk": 1, "shop": 0.5}, + {"walk": 1}, + {"walk": 1, "clean": 0.5}, + {"shop": 0.5, "clean": 0.5}, + {"walk": 0.5, "clean": 1}, + {"clean": 1, "shop": 0.1}, + {"walk": 1, "shop": 0.5}, {}, - {'clean': 1}, - {u'солнце': u'не светит'.encode('utf8'), 'clean': 1}, + {"clean": 1}, + {"солнце": "не светит".encode(), "clean": 1}, ] + @pytest.fixture def yseq(): - return ['sunny', 'sunny', u'sunny', 'rainy', 'rainy', 'rainy', - 'sunny', 'sunny', 'rainy', 'rainy'] + return [ + "sunny", + "sunny", + "sunny", + "rainy", + "rainy", + "rainy", + "sunny", + "sunny", + "rainy", + "rainy", + ] diff --git a/tests/test_crf.py b/tests/test_crf.py index 13ed675..303324b 100644 --- a/tests/test_crf.py +++ b/tests/test_crf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import os import pickle @@ -7,8 +6,7 @@ from sklearn_crfsuite import CRF - -ALGORITHMS = ["lbfgs", "l2sgd", "pa", "ap", "arow"] +ALGORITHMS = ["lbfgs", "l2sgd", "pa", "ap", "arow"] @pytest.mark.parametrize("algorithm", ALGORITHMS) @@ -17,7 +15,7 @@ def test_crf(xseq, yseq, algorithm): crf.fit([xseq], [yseq]) y_pred = crf.predict([xseq]) - if algorithm != 'ap': # Averaged Perceptron is regularized too much + if algorithm != "ap": # Averaged Perceptron is regularized too much assert y_pred == [yseq] @@ -31,14 +29,9 @@ def test_crf_verbose(xseq, yseq, algorithm, use_dev): else: X_dev, y_dev = None, None - crf.fit( - X=[xseq, xseq], - y=[yseq, yseq], - X_dev=X_dev, - y_dev=y_dev - ) + crf.fit(X=[xseq, xseq], y=[yseq, yseq], X_dev=X_dev, y_dev=y_dev) y_pred = crf.predict([xseq]) - if algorithm != 'ap': # Averaged Perceptron is regularized too much + if algorithm != "ap": # Averaged Perceptron is regularized too much assert y_pred == [yseq] @@ -72,7 +65,7 @@ def test_crf_score(xseq, yseq, algorithm): crf.fit([xseq], [yseq]) score = crf.score([xseq], [yseq]) - if algorithm != 'ap': + if 
algorithm != "ap": assert score == 1.0 else: # Averaged Perceptron is regularized too much assert score > 0.8 @@ -86,7 +79,7 @@ def test_crf_pickling(xseq, yseq, algorithm): crf2 = pickle.loads(data) score = crf2.score([xseq], [yseq]) - if algorithm != 'ap': + if algorithm != "ap": assert score == 1.0 else: # Averaged Perceptron is regularized too much assert score > 0.8 @@ -138,16 +131,16 @@ def test_attributes(xseq, yseq): assert crf.state_features_ is None assert crf.transition_features_ is None - crf.fit([xseq]*20, [yseq]*20) + crf.fit([xseq] * 20, [yseq] * 20) assert crf.tagger_ is not None assert crf.size_ > 1000 - assert set(crf.classes_) == {'sunny', 'rainy'} + assert set(crf.classes_) == {"sunny", "rainy"} assert crf.num_attributes_ > 0 assert len(crf.attributes_) == crf.num_attributes_ assert all(crf.attributes_) - assert 'clean' in crf.attributes_ + assert "clean" in crf.attributes_ assert len(crf.state_features_) > 0 assert all(isinstance(c, float) for c in crf.state_features_.values()) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index c5a505e..f98e514 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,11 +1,7 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import, division - import pytest from sklearn_crfsuite import metrics - y1 = [["x", "z", "y"], ["x", "x"]] y2 = [["y", "z", "y"], ["x", "y"]] @@ -16,30 +12,34 @@ def test_flat_accuracy(): def test_flat_precision(): - score = metrics.flat_precision_score(y1, y2, average='micro') + score = metrics.flat_precision_score(y1, y2, average="micro") assert score == 3 / 5 def test_flat_recall(): - score = metrics.flat_recall_score(y1, y2, average='micro') + score = metrics.flat_recall_score(y1, y2, average="micro") assert score == 3 / 5 def test_flat_fscore(): - score = metrics.flat_f1_score(y1, y2, average='macro') + score = metrics.flat_f1_score(y1, y2, average="macro") assert score == 2 / 3 - assert metrics.flat_fbeta_score(y1, y2, beta=1, average='macro') == 
score + assert metrics.flat_fbeta_score(y1, y2, beta=1, average="macro") == score -@pytest.mark.xfail(reason="see https://github.com/TeamHG-Memex/sklearn-crfsuite/issues/1") +@pytest.mark.xfail( + reason="see https://github.com/TeamHG-Memex/sklearn-crfsuite/issues/1" +) def test_flat_f1_score_binary(): s = [["x", "y"], ["x", "y"]] - score = metrics.flat_f1_score(s, s, average='weighted') + score = metrics.flat_f1_score(s, s, average="weighted") assert score == 1.0 def test_sequence_accuracy(): assert metrics.sequence_accuracy_score(y1, y2) == 0 assert metrics.sequence_accuracy_score([], []) == 0 - assert metrics.sequence_accuracy_score([[1,2], [3], [4]], [[1,2], [4], [4]]) == 2 / 3 - assert metrics.sequence_accuracy_score([[1,2], [3]], [[1,2], [3]]) == 1.0 + assert ( + metrics.sequence_accuracy_score([[1, 2], [3], [4]], [[1, 2], [4], [4]]) == 2 / 3 + ) + assert metrics.sequence_accuracy_score([[1, 2], [3]], [[1, 2], [3]]) == 1.0 diff --git a/tox.ini b/tox.ini index b204ad6..fdb8599 100644 --- a/tox.ini +++ b/tox.ini @@ -1,15 +1,40 @@ [tox] -envlist = py27,py34,py35,py36 +envlist = pre-commit,docs,twinecheck,min,py38,py39,py310,py311,py312 [testenv] deps= pytest pytest-cov - numpy - commands= - pip install -U wheel pip - pip install scipy - pip install scikit-learn - pip install -e . - py.test --doctest-modules --cov=sklearn_crfsuite --cov-report= {posargs: sklearn_crfsuite tests} + pytest --doctest-modules --cov=sklearn_crfsuite --cov-report= {posargs:sklearn_crfsuite tests} + +[testenv:min] +basepython = python3.8 +deps = + {[testenv]deps} + python-crfsuite==0.9.7 + scikit-learn==0.24.0 + tabulate==0.4.2 + tqdm==2.0 + +[testenv:pre-commit] +deps = + pre-commit +commands = pre-commit run --all-files --show-diff-on-failure + +[testenv:docs] +basepython = python3 +changedir = docs +deps = + -rdocs/requirements.txt +commands = + sphinx-build -W -b html . 
{envtmpdir}/html + +[testenv:twinecheck] +basepython = python3 +deps = + twine==5.0.0 + build==1.1.1 +commands = + python -m build --sdist + twine check dist/*