From c5c4030391d992d76cdae0ac4f6975662a46d4dd Mon Sep 17 00:00:00 2001 From: David Beauchemin Date: Sun, 17 Dec 2023 20:00:23 +0800 Subject: [PATCH 1/2] Update CODE_OF_CONDUCT.md --- .github/CODE_OF_CONDUCT.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md index c3e3b0ec..7e54263c 100644 --- a/.github/CODE_OF_CONDUCT.md +++ b/.github/CODE_OF_CONDUCT.md @@ -1,4 +1,4 @@ -# Contributor Covenant Code of Conduct +# Contributor Deepparse Code of Conduct ## Our Pledge @@ -11,7 +11,7 @@ appearance, race, religion, or sexual identity and orientation. ## Our Standards -Examples of behavior that contributes to creating a positive environment +Examples of behaviour that contributes to creating a positive environment include: * Using welcoming and inclusive language @@ -20,13 +20,13 @@ include: * Focusing on what is best for the community * Showing empathy towards other community members -Examples of unacceptable behavior by participants include: +Examples of unacceptable behaviour by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment -* Publishing others' private information, such as a physical or electronic +* Publishing others' private information, such as physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting @@ -34,13 +34,13 @@ Examples of unacceptable behavior by participants include: ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. +behaviour and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behaviour. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, +permanently any contributor for other behaviours that they deem inappropriate, threatening, offensive, or harmful. ## Scope @@ -54,8 +54,8 @@ further defined and clarified by project maintainers. ## Enforcement -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at waf2107@columbia.edu. All +Instances of abusive, harassing, or otherwise unacceptable behaviour may be +reported by contacting the project team at david.beauchemin@ift.ulaval.ca. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. 
From 2ecb38d779e1596777a9e7b3691508e8decec5e5 Mon Sep 17 00:00:00 2001 From: David Beauchemin Date: Sun, 23 Jun 2024 18:39:24 -0400 Subject: [PATCH 2/2] Dev (#230) * bump black version * Improve documentation (#211) * fix error in documentation and improve it * add api.rst documentation file * update changelog * fix disk memory usage problem with some github actions * add disk space cleaning for disk space errors * delete windows cleaning since use unix command * Add Training Guides (#212) and Improve Doc * fix error in documentation and improve it * add api.rst documentation file * first draft of training_guidelines * removed installation and getting started from index to specific files for easier readability * added training guide & fixed warnings * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * Update docs/source/training_guide.rst * fix header error * clean spacing * clean spacing README * added details about the data * changed countries names to english * Update docs/source/training_guide.rst Co-authored-by: David Beauchemin * Update docs/source/training_guide.rst Co-authored-by: David Beauchemin * formatting - removed blank line * Update docs/source/training_guide.rst --------- Co-authored-by: Marouane Yassine Co-authored-by: Marouane Yassine <46830666+MAYAS3@users.noreply.github.com> * improve documentation * bump black version * improve documentation * Bump black from 23.9.1 to 24.3.0 (#218) * Update CODE_OF_CONDUCT.md * Bump black from 23.9.1 to 24.3.0 Bumps [black](https://github.com/psf/black) from 23.9.1 to 24.3.0. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/23.9.1...24.3.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * pyproject.toml * fix black * fix requirements --------- Signed-off-by: dependabot[bot] Co-authored-by: David Beauchemin Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * fix fastapi security RDoS breach * Improve data validation (#227) * fix typos in documentation * fix pylint * fix black version and black * fix errors in tests * remove fixed version * fix app * fix pylint disable * fix pylint disable * correction in changelog and added stuff for this PR * Remove context manager (#228) * fix imports * add interface for data cleaning pre processing during loading * add interface for data cleaning pre processing during loading * Bpemb hot fix (#229) * hot-fix the problem with BPEmb broken base URL * add changelog * bump version to 0.9.10 * fix numpy due to makor release breaking change * fix numpy due to makor release breaking change * fix numpy due to makor release breaking change * fix tests with new bpemb wrapper * fix tests with new bpemb wrapper * remove unecessary tests * fix app tests * fix app tests * fix app circular import * fix sentry deprecated argument * Fix typos in changelog --------- Signed-off-by: dependabot[bot] Co-authored-by: Marouane Yassine Co-authored-by: Marouane Yassine <46830666+MAYAS3@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- CHANGELOG.md | 146 ++++++++++-------- app_requirements.txt | 9 +- deepparse/app/__init__.py | 1 - deepparse/app/address.py | 5 + deepparse/app/app.py | 56 +------ deepparse/app/request_examples.http | 2 +- deepparse/app/sentry.py | 2 +- deepparse/app/tools.py | 45 ++++++ deepparse/cli/download_model.py | 5 +- deepparse/cli/download_models.py | 4 +- deepparse/cli/parse.py | 12 +- deepparse/cli/parser_arguments_adder.py | 4 +- deepparse/cli/retrain.py | 12 +- deepparse/cli/test.py | 2 +- deepparse/comparer/addresses_comparer.py | 26 ++-- .../comparer/formatted_compared_addresses.py | 31 ++-- .../formatted_compared_addresses_raw.py | 4 +- .../formatted_compared_addresses_tags.py | 2 +- deepparse/converter/data_padder.py | 28 ++-- deepparse/converter/data_processor.py | 6 +- deepparse/converter/target_converter.py | 5 +- deepparse/data_validation/data_validation.py | 82 ++++++++-- .../dataset_container/dataset_container.py | 145 ++++++++++++----- deepparse/dataset_container/tools.py | 4 +- deepparse/download_tools.py | 46 +++--- .../bpemb_embeddings_model.py | 15 +- .../embeddings_model_factory.py | 4 +- .../fasttext_embeddings_model.py | 4 +- .../magnitude_embeddings_model.py | 8 +- deepparse/errors/data_error.py | 2 +- deepparse/errors/model_error.py | 2 +- deepparse/errors/server_error.py | 2 +- deepparse/metrics/accuracy.py | 4 +- deepparse/metrics/nll_loss.py | 10 +- deepparse/network/bpemb_seq2seq.py | 32 ++-- deepparse/network/decoder.py | 4 +- deepparse/network/embedding_network.py | 19 +-- deepparse/network/encoder.py | 6 +- deepparse/network/fasttext_seq2seq.py | 24 +-- deepparse/network/model_factory.py | 18 ++- deepparse/network/seq2seq.py | 50 +++--- deepparse/parser/address_parser.py | 77 +++++---- deepparse/parser/formatted_parsed_address.py | 10 +- deepparse/validations.py | 14 +- deepparse/vectorizer/bpemb_vectorizer.py | 2 +- deepparse/vectorizer/fasttext_vectorizer.py | 4 +- deepparse/vectorizer/magnitude_vectorizer.py | 4 +- deepparse/weights_tools.py | 4 +- docs/source/api.rst | 12 +- docs/source/cli.rst | 26 ++-- .../retrain_with_new_seq2seq_params.rst | 2 +- docs/source/parser.rst | 10 
+- docs/source/training_guide.rst | 16 +- examples/retrain_with_new_seq2seq_params.py | 2 +- models_evaluation/timer/timer.py | 2 +- pyproject.toml | 3 +- requirements.txt | 3 +- setup.py | 9 +- styling_requirements.txt | 8 +- tests/app/test_app.py | 21 ++- tests/cli/test_retrain.py | 4 +- tests/converter/test_data_processor.py | 8 +- tests/data_validation/test_data_validation.py | 91 +++++++++-- .../test_dataset_container.py | 18 +++ .../test_bpemb_embeddings_model.py | 6 +- .../test_embeddings_model_factory.py | 2 +- tests/network/test_seq2seq.py | 9 +- tests/parser/test_address_parser.py | 41 +++-- .../parser/test_address_parser_retrain_api.py | 85 +--------- tests/requirements.txt | 16 +- tests/test_download_tools.py | 4 +- tests/vectorizer/test_vectorizer_factory.py | 2 +- version.txt | 2 +- 73 files changed, 791 insertions(+), 614 deletions(-) delete mode 100644 deepparse/app/__init__.py create mode 100644 deepparse/app/address.py create mode 100644 deepparse/app/tools.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cd0153c..651cc362 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,18 +6,18 @@ ## 0.1.2 - Modification of assets URL -- Bugfix dictionary use +- Bug-fix dictionary use - Fixed logo - Fixed typo deepParse -> deepparse -- Fixed authors in setup +- Fixed authors in the setup ## 0.1.3 - Added "contributing to" - Added fix for comma problem (#56) -- Added content in Address Parser doc for tags definition +- Added content in Address Parser documentation for tags definition - Fixed Pylint bug with PyTorch 1.6 -- Fixed `pack_padded` cpu error with PyTorch new release +- Fixed `pack_padded` CPU error with PyTorch's new release ## 0.1.3.1 @@ -49,7 +49,7 @@ - Added Libpostal time in the doc - Documentation improvement -- Added new models evaluation to doc +- Added new model evaluation to doc - Release new models ## 0.3.3 @@ -61,35 +61,35 @@ ## 0.3.4 -- Fixed a bug when using batched address. Since we were sorting the address during the forward pass, the output +- Fixed a bug when using a batched address. Since we were sorting the address during the forward pass, the output prediction tags were not aligned with the supposed parsed address. We have removed the sorting, and now the results are more aligned with our research. ## 0.3.5 -- Added verbose flag to training and test based on the __init__ of address parser. +- Added a verbose flag to the training and test based on the __init__ of the address parser. - **Breaking change** Since [SciPy 1.6](https://github.com/scipy/scipy/releases/tag/v1.6.0) is released on Python `3.7+` , we don't support Python `3.6`. -- Added management for Windows where the FastText model cannot be pickled. On Windows, we use Gensim fasttext model, +- Added management for Windows where the FastText model cannot be pickled. We use the Gensim fasttext model on Windows, which takes more RAM. ## 0.3.6 -- Added a method for a dict conversion of parsed addresses for simpler `Pandas` integration. +- Added a method for dictionary conversion of parsed addresses for simpler `Pandas` integration. - Added examples for parsing addresses and how to convert them into a DataFrame. -- Fixed error with download module. +- Fixed error with the download module. ## 0.4 -- Added verbose flag to training and test based on the __init__ of address parser. -- Added a feature to retrain our models with prediction tags dictionary different from the default one. -- Added in-doc code examples. 
+- Added a verbose flag to the training and a test based on the __init__ of the address parser. +- Added a feature to retrain our models with a prediction tags dictionary that is different from the default one. +- Added in-documentation code examples. - Added code examples. -- Small improvement of models implementation. +- Small improvement of model implementation. ## 0.4.1 -- Added method to specify the format of address components of an `FormattedParsedAddress`. Formatting can specify the +- Added method to specify the format of address components of a `FormattedParsedAddress.` Formatting can specify the field separator, the field to be capitalized, and the field to be upper case. ## 0.4.2 @@ -100,9 +100,9 @@ ## 0.4.3 -- Fixed typos in one of a file name. +- Fixed typos in one of the file names. - Added tools to compare addresses (tagged or not). -- Fixed some tests errors. +- Fixed some test errors. ## 0.4.4 @@ -119,8 +119,8 @@ - Fixed address_comparer hint typing error - Fixed some docs errors -- Retrain and test now have more defaults parameters -- Various small code and tests improvements +- Retrain and test now have more default parameters +- Various small code and test improvements ## 0.6 @@ -133,28 +133,28 @@ ## 0.6.2 -- Improved (slightly) code speed of data padding method as per PyTorch list or array to Tensor recommendation. -- Improved doc for RuntimeError due to retraining FastText and BPEmb model in the same directory. +- Improved (slightly) data padding method code speed as per PyTorch list or array to the Tensor recommendation. +- Improved documentation for RuntimeError due to retraining FastText and BPEmb model in the same directory. - Added error handling RuntimeError when retraining. ## 0.6.3 -- Fixed the printing capture to raise the error with Poutyne as of version 1.8. We keep the previous approach as for - compatibilities with the previous Poutyne version. -- Added a flag to disable or not Tensorboard during retraining. +- Fixed the printing capture to raise the error with Poutyne as of version 1.8. We keep the previous approach for + compatibility with the previous Poutyne version. +- Added a flag to disable or not use Tensorboard during retraining. ## 0.6.4 -- Bugfix reloading of retrain attention model (PR #110) +- Bug-fix reloading of retraining attention model (PR #110) - Improve error handling - Improve doc ## 0.6.5 - Improve error handling of empty data and whitespace-only data. -- Parsing now include two validation on the data quality (not empty and not whitespace only) -- DataContainer now includes data quality test (not empty, not whitespace only, tags not empty, tag the same len as an - address and data is a list of tuple) +- Parsing now includes two validations on the data quality (not empty and not whitespace only) +- DataContainer now includes data quality test (not empty, not whitespace only, tags not empty, tags the same length as an + address and data is a list of tuples) - New CSVDatasetContainer - DataContainer can now be used to predict using a flag. - Add a CLI to parse addresses from the command line. 
@@ -162,53 +162,53 @@ ## 0.6.6 - Fixed errors in code examples -- Improved doc of download_from_url -- Improve error management of retrain and test +- Improved documentation of download_from_url +- Improve error management of retraining and test ## 0.6.7 - Fixed errors in data validation -- Improved doc over data validation -- Bugfix data slicing error with data containers -- Add an example on how to use a retrained model +- Improved documentation over data validation +- Bug-fix data slicing error with data containers +- Add an example of how to use a retrained model ## 0.7 - Improved CLI - Fixed bug in CLI export dataset -- Improved the doc of the CLI +- Improved the documentation of the CLI ## 0.7.1 -- Hotfix for missing dependency +- Hot-fix for missing dependency - Fixed bug with poutyne version handling ## 0.7.2 - Added JSON output support - Add logging output of parse CLI function -- Hotfix Poutyne version handling +- Hot-fix Poutyne version handling ## 0.7.3 -- Add freeze layers parameters to freeze layers during retraining +- Add freeze layer parameters to freeze layers during retraining ## 0.7.4 - Improve parsed address print - Bug-fix #124: comma-separated list without whitespace in CSVDatasetContainer - Add a report when addresses to parse and tags list len differ -- Add an example on how to fine-tune using our CSVDatasetContainer +- Add an example of how to fine-tune using our CSVDatasetContainer - Improve data validation for data to parse ## 0.7.5 - Bug-fix Poutyne version handling that causes a print error when a version is 1.11 when retraining -- Add the option to create a named retrain parsing model using by default the architecture setting or using the +- Add the option to create a named retrain the parsing model using, by default, the architecture setting or using the user-given name - Hot-fix missing raise for DataError validation of address to parse when address is tuple - Bug-fix handling of string column name for CSVDatasetContainer that raised ValueError -- Improve parse CLI doc and fix error in doc stating JSON format is supported as input data +- Improve parse CLI documentation and fix errors in documentation stating JSON format is supported as input data - Add batch_size to parse CLI - Add minimum version to Gensim 4.0.0. - Add a new CLI function, retrain, to retrain from the command line @@ -217,39 +217,39 @@ models weights cache directory - Change the `saving_dir` argument of `download_fastext_embeddings` and `download_fasttext_magnitude_embeddings` function - to `cache_dir`. `saving_dir` is now deprecated and will be remove in version 0.8. + to `cache_dir`. `saving_dir` is now deprecated and will be removed in version 0.8. - Add a new CLI function, test, to test from the command line ## 0.7.6 -- Re-release the version 0.7.5 into 0.7.6 due to manipulation error and change in PyPi (now delete does not delete - release by yank does). +- Re-release version 0.7.5 into 0.7.6 due to manipulation error and change in PyPi (now delete does not delete + release). ## 0.8 - Improve SEO - Add cache_dir arg in all CLI functions -- Improve handling of HTTP error in models version verification +- Improve handling of HTTP errors in models version verification - Improve doc - Add a note for parsing data cleaning (i.e. lowercase, commas removal, and hyphen replacing). - Add hyphen parsing cleaning step (with a bool flag to activate or not) to improve some country address parsing ( see [issue 137](https://github.com/GRAAL-Research/deepparse/issues/137)). 
-- Add ListDatasetContainer for Python list dataset. +- Add ListDatasetContainer for the Python list dataset. ## 0.8.1 - Refactored function `download_from_url` to `download_from_public_repository`. -- Add error management when retrain a FastText like model on Windows with a number of workers (`num_workers`) greater +- Add error management when retraining a FastText-like model on Windows with several workers (`num_workers`) greater than 0. - Improve dev tooling - Improve CI - Improve code coverage and pylint -- Add codacy +- Add Codacy ## 0.8.2 - Bug-fix retrain attention model naming parsing -- Improve error handling when not a DatasetContainer is use in retrain and test API +- Improve error handling when not a DatasetContainer is used in retraining and testing API ## 0.8.3 @@ -259,17 +259,17 @@ - Add `save_model_weights` method to `AddressParser` to save model weights (PyTorch state dictionary) - Improve CI -- Added verbose flag for test to activate or deactivate the test verbosity (it override the AddressParser verbosity) +- Added verbose flag for the test to activate or deactivate the test verbosity (it overrides the AddressParser verbosity) - Add Docker image - Add `val_dataset` to retrain API to allow the use of a specific val dataset for training -- Remove deprecated `download_from_url` function -- Remove deprecated `dataset_container` argument -- Fixed error and docs +- Remove the deprecated `download_from_url` function +- Remove the deprecated `dataset_container` argument +- Fixed errors and docs - Added the UK retrain example ## 0.9.1 -- Hotfix cli.download_model attention model bug +- Hot-fix cli.download_model attention model bug ## 0.9.2 @@ -277,15 +277,15 @@ - Remove deprecated argument `saving_dir` in `download_fasttext_magnitude_embeddings` and `download_fasttext_embeddings` functions. - Add offline argument to remove verification of the latest version. -- Bug-fix cache handling in download model. -- Add `download_models` CLI function. +- Bug-fix cache handling in the download model. +- Add the `download_models` CLI function. - [Temporary hot-fix BPEmb SSL certificate error](https://github.com/GRAAL-Research/deepparse/issues/156). ## 0.9.3 - Improve error handling. -- Bug-fix FastText error not handled in test API. -- Add feature to allow new_prediction_tags to retrain CLI. +- Bug-fix FastText error is not handled in the test API. +- Add a feature to allow new_prediction_tags to retrain CLI. ## 0.9.4 @@ -293,7 +293,7 @@ ## 0.9.5 -- Fixed tags converter bug with data processor. +- Fixed tags converter bug with the data processor. ## 0.9.6 @@ -303,8 +303,8 @@ by [Torch documentation](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html). - Add `torch.no_grad()` context manager in `__call__()` to increase performance. - Reduce memory swap between CPU and GPU by instantiating Tensor directly on the GPU device. -- Improve some Warnings clarity (i.e. category and message). -- Bug-fix MacOS multiprocessing. It was impossible to use in multiprocess since we were not testing whether torch +- Improve some warnings' clarity (i.e., category and message). +- Bug-fix MacOS multiprocessing. It was impossible to use in multiprocess since we were not testing whether Torch multiprocess was set properly. Now, we set it properly and raise a warning instead of an error. 
- Drop Python 3.7 support since newer Python versions are faster and [Torch 2.0 does not support Python 3.7](https://dev-discuss.pytorch.org/t/dropping-support-for-cuda-11-6-and-python-3-7-from-pytorch-2-0-release/1021). @@ -321,14 +321,14 @@ - Add an example of how to use URI for parsing from and uploading to. - Improve error handling of `path_to_retrain_model`. - Bug-fix pre-processor error. -- Add verbose override and improve verbosity handling in retrain. +- Add verbose override and improve verbosity handling in retraining. - Bug-fix the broken FastText installation using `fasttext-wheel` instead of `fasttext` ( see [here](https://github.com/facebookresearch/fastText/issues/512#issuecomment-1534519551) and [here](https://github.com/facebookresearch/fastText/pull/1292)). ## 0.9.8 -- Hot-Fix wheel install (See [issue 196](https://github.com/GRAAL-Research/deepparse/issues/196)). +- Hot-fix wheel install (See [issue 196](https://github.com/GRAAL-Research/deepparse/issues/196)). - Starting now, we also include model weights release in the GitHub release. ## 0.9.9 @@ -336,8 +336,20 @@ - Add version to Seq2Seq and AddressParser. - Add a Deepparse as an API using FastAPI. - Add a Dockerfile and a `docker-compose.yml` to build a Docker container for the API. -- Bug-fix the default pre-processors that were not all apply but only the last one. - -## dev - -- Improve documentation \ No newline at end of file +- Bug-fix the default pre-processors that did not all apply but only the last one. + +## 0.9.10 + +- Fix and improve documentation. +- Remove fixed dependencies version. +- Fix app errors. +- Add data validation for 1) multiple consecutive whitespace and 2) newline. +- Fixes some errors in tests. +- Add an argument to the `DatasetContainer` interface to use a pre-processing data cleaning function before validation. +- Hot-fix the issue with the BPEmb base URL download problem. See [issue 221](https://github.com/GRAAL-Research/deepparse/issues/221). +- Fix the NumPy version due to a major release with breaking changes. +- Fix the SciPy version due to breaking change with Gensim. +- Fix circular import in the API app. +- Fix deprecated `max_request_body_size` in Sentry. 
+ +## dev \ No newline at end of file diff --git a/app_requirements.txt b/app_requirements.txt index 38df9d85..c642b056 100644 --- a/app_requirements.txt +++ b/app_requirements.txt @@ -1,4 +1,5 @@ -fastapi[all]==0.99.1 -uvicorn==0.22.0 -sentry-sdk[fastapi]==1.28.1 -python-decouple==3.8 \ No newline at end of file +fastapi[all] +uvicorn +sentry-sdk[fastapi] +python-decouple +pydantic \ No newline at end of file diff --git a/deepparse/app/__init__.py b/deepparse/app/__init__.py deleted file mode 100644 index 1afb6e3b..00000000 --- a/deepparse/app/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .app import * diff --git a/deepparse/app/address.py b/deepparse/app/address.py new file mode 100644 index 00000000..622b1e7c --- /dev/null +++ b/deepparse/app/address.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class Address(BaseModel): + raw: str diff --git a/deepparse/app/app.py b/deepparse/app/app.py index dba68fb1..6d8a2274 100644 --- a/deepparse/app/app.py +++ b/deepparse/app/app.py @@ -1,32 +1,28 @@ """REST API.""" -from typing import List, Dict, Union -from contextlib import asynccontextmanager + import logging +from contextlib import asynccontextmanager +from typing import List +from deepparse.app.address import Address +from deepparse.app.tools import format_parsed_addresses, address_parser_mapping from deepparse.download_tools import MODEL_MAPPING_CHOICES, download_models from deepparse.parser import AddressParser - try: from deepparse.app.sentry import configure_sentry - from pydantic import BaseModel from fastapi import FastAPI, Depends from fastapi.responses import JSONResponse import uvicorn - - except ModuleNotFoundError as e: raise ModuleNotFoundError("Ensure you installed the extra packages using: 'pip install deepparse[app]'") from e - logger = logging.getLogger(__name__) FORMAT = "%(asctime)s; %(levelname)s: %(message)s" logging.basicConfig(format=FORMAT, level=logging.DEBUG) configure_sentry() -address_parser_mapping: Dict[str, AddressParser] = {} - @asynccontextmanager async def lifespan(application: FastAPI): # pylint: disable=unused-argument @@ -53,48 +49,6 @@ async def lifespan(application: FastAPI): # pylint: disable=unused-argument app = FastAPI(lifespan=lifespan) -class Address(BaseModel): - raw: str - - -def format_parsed_addresses( - parsing_model: str, addresses: List[Address], model_mapping=None -) -> Dict[str, Union[str, Dict[str, str]]]: - """ - Format parsed addresses. - - Args: - - **parsing_model** (str): The parsing model to use for address parsing. - - **addresses** (List[Address]): List of addresses to parse. - - Returns: - - **JSONResponse**: JSON response containing the parsed addresses, along with the model type and version. 
- """ - assert addresses, "Addresses parameter must not be empty" - assert ( - parsing_model in MODEL_MAPPING_CHOICES - ), f"Parsing model not implemented, available choices: {MODEL_MAPPING_CHOICES}" - - if model_mapping is None: - model_mapping = address_parser_mapping - - parsed_addresses = model_mapping[parsing_model]([address.raw for address in addresses]) - - if not isinstance(parsed_addresses, list): - parsed_addresses = [parsed_addresses] - - response_payload = { - "model_type": model_mapping[parsing_model].model_type, - "parsed_addresses": { - raw_address.raw: parsed_address.to_dict() - for parsed_address, raw_address in zip(parsed_addresses, addresses) - }, - "version": model_mapping[parsing_model].version, - } - - return response_payload - - @app.post("/parse/{parsing_model}") def parse(parsing_model: str, addresses: List[Address], resp=Depends(format_parsed_addresses)): """ diff --git a/deepparse/app/request_examples.http b/deepparse/app/request_examples.http index 1bd8a2f8..403dc391 100644 --- a/deepparse/app/request_examples.http +++ b/deepparse/app/request_examples.http @@ -16,5 +16,5 @@ Content-Type: application/json [ {"raw": "16 rue Grande-Place, Victoriaville, QC, G6S 1E6"}, - {"raw": "123 rue Valancourt, Val-Alain, quebec, g9v1s3"} + {"raw": "123 rue valancourt, val-alain, quebec, g9v 1s3"} ] \ No newline at end of file diff --git a/deepparse/app/sentry.py b/deepparse/app/sentry.py index 028a0ebe..c8d03e3d 100644 --- a/deepparse/app/sentry.py +++ b/deepparse/app/sentry.py @@ -15,5 +15,5 @@ def configure_sentry() -> None: release=f"deepparse@{get_version('deepparse')}", profiles_sample_rate=1.0, environment=environment, - request_bodies="small", + max_request_body_size="small", ) diff --git a/deepparse/app/tools.py b/deepparse/app/tools.py new file mode 100644 index 00000000..f5023294 --- /dev/null +++ b/deepparse/app/tools.py @@ -0,0 +1,45 @@ +from typing import List, Dict, Union + +from deepparse.app.address import Address +from deepparse.download_tools import MODEL_MAPPING_CHOICES +from deepparse.parser import AddressParser + +address_parser_mapping: Dict[str, AddressParser] = {} + + +def format_parsed_addresses( + parsing_model: str, addresses: List[Address], model_mapping=None +) -> Dict[str, Union[str, Dict[str, str]]]: + """ + Format parsed addresses. + + Args: + - **parsing_model** (str): The parsing model to use for address parsing. + - **addresses** (List[Address]): List of addresses to parse. + + Returns: + - **JSONResponse**: JSON response containing the parsed addresses, along with the model type and version. 
+ """ + assert addresses, "Addresses parameter must not be empty" + assert ( + parsing_model in MODEL_MAPPING_CHOICES + ), f"Parsing model not implemented, available choices: {MODEL_MAPPING_CHOICES}" + + if model_mapping is None: + model_mapping = address_parser_mapping + + parsed_addresses = model_mapping[parsing_model]([address.raw for address in addresses]) + + if not isinstance(parsed_addresses, list): + parsed_addresses = [parsed_addresses] + + response_payload = { + "model_type": model_mapping[parsing_model].model_type, + "parsed_addresses": { + raw_address.raw: parsed_address.to_dict() + for parsed_address, raw_address in zip(parsed_addresses, addresses) + }, + "version": model_mapping[parsing_model].version, + } + + return response_payload diff --git a/deepparse/cli/download_model.py b/deepparse/cli/download_model.py index d748bc89..6ccaa5e4 100644 --- a/deepparse/cli/download_model.py +++ b/deepparse/cli/download_model.py @@ -1,13 +1,12 @@ import argparse import sys - from deepparse.download_tools import download_model, MODEL_MAPPING_CHOICES def main(args=None) -> None: """ - CLI function to manually download all the dependencies for a pretrained model. + CLI function to download all the dependencies for a pretrained model manually. Example of usage: @@ -41,7 +40,7 @@ def get_parser() -> argparse.ArgumentParser: "--saving_cache_dir", type=str, default=None, - help="To change the default saving cache directory (default to None e.g. default path).", + help="To change the default saving cache directory (default to None, e.g. default path).", ) return parser diff --git a/deepparse/cli/download_models.py b/deepparse/cli/download_models.py index 6ab6f359..658c8816 100644 --- a/deepparse/cli/download_models.py +++ b/deepparse/cli/download_models.py @@ -6,7 +6,7 @@ def main(args=None) -> None: """ - CLI function to manually download all the dependencies for all pretrained models. + CLI function to download all the dependencies for all pretrained models manually. Example of usage: @@ -34,7 +34,7 @@ def get_parser() -> argparse.ArgumentParser: "--saving_cache_dir", type=str, default=None, - help="To change the default saving cache directory (default to None e.g. default path).", + help="To change the default saving cache directory (default to None, e.g. default path).", ) return parser diff --git a/deepparse/cli/parse.py b/deepparse/cli/parse.py index 37e8c13d..a96746ec 100644 --- a/deepparse/cli/parse.py +++ b/deepparse/cli/parse.py @@ -32,7 +32,7 @@ def main(args=None) -> None: # pylint: disable=too-many-locals, too-many-branches """ - CLI function to rapidly parse an addresses dataset and output it in another file. + CLI function to easily parse an address dataset and output it in another file. Examples of usage: @@ -40,7 +40,7 @@ def main(args=None) -> None: parse fasttext ./dataset_path.csv parsed_address.pickle - Using a gpu device + Using a GPU device .. code-block:: sh @@ -119,7 +119,7 @@ def main(args=None) -> None: def get_parser() -> argparse.ArgumentParser: - """Return ArgumentParser for the cli.""" + """Return ArgumentParser for the CLI.""" parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) parser.add_argument( @@ -137,11 +137,11 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument( "export_filename", help=wrap( - "The filename to use to export the parsed addresses. We will infer the file format base on the " + "The filename to use to export the parsed addresses. We will infer the file format based on the " "file extension. 
That is, if the file is a pickle (.p or .pickle), we will export it into a pickle file. " - "The supported format are Pickle, CSV and JSON. " + "The supported formats are Pickle, CSV and JSON. " "The file will be exported in the same repositories as the dataset_path. " - "See the doc for more details on the format exporting." + "See the documentation for more details on the format exporting." ), type=str, ) diff --git a/deepparse/cli/parser_arguments_adder.py b/deepparse/cli/parser_arguments_adder.py index 72eeaf2b..52c50755 100644 --- a/deepparse/cli/parser_arguments_adder.py +++ b/deepparse/cli/parser_arguments_adder.py @@ -25,7 +25,7 @@ def add_csv_column_name_arg(parser: ArgumentParser) -> None: parser.add_argument( "--csv_column_name", help=wrap( - "The column name to extract address in the CSV. Need to be specified if the provided dataset_path " + "The column name to extract the address in the CSV. It needs to be specified if the provided dataset_path " "leads to a CSV file." ), type=str, @@ -37,7 +37,7 @@ def add_csv_column_names_arg(parser: ArgumentParser) -> None: parser.add_argument( "--csv_column_names", help=wrap( - "The column names to extract address and tags in the CSV. Need to be specified if the provided " + "The column names to extract addresses and tags in the CSV. It needs to be specified if the provided " "dataset_path leads to a CSV file. Column names have to be separated by a whitespace. For" "example, --csv_column_names column1 column2. By default, None." ), diff --git a/deepparse/cli/retrain.py b/deepparse/cli/retrain.py index 5d070a6b..7ba8c7eb 100644 --- a/deepparse/cli/retrain.py +++ b/deepparse/cli/retrain.py @@ -64,7 +64,7 @@ def handle_prediction_tags(parsed_args): def main(args=None) -> None: # pylint: disable=too-many-locals, too-many-branches """ - CLI function to rapidly retrain an addresses parser and saves it. One can retrain a base pretrained model + CLI function to easily retrain an address parser and save it. One can retrain a base pretrained model using most of the arguments as the :meth:`~AddressParser.retrain` method. By default, all the parameters have the same default value as the :meth:`~AddressParser.retrain` method. The supported parameters are the following: @@ -86,7 +86,7 @@ def main(args=None) -> None: retrain fasttext ./train_dataset_path.csv - Using a gpu device + Using a GPU device .. code-block:: sh @@ -142,7 +142,7 @@ def main(args=None) -> None: def get_parser() -> argparse.ArgumentParser: - """Return ArgumentParser for the cli.""" + """Return ArgumentParser for the CLI.""" parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) @@ -198,8 +198,8 @@ def get_parser() -> argparse.ArgumentParser: "--logging_path", help=wrap( "The logging path for the checkpoints and the retrained model. " - "Note that training creates checkpoints, and we use Poutyne library that use the best epoch " - "model and reloads the state if any checkpoints are already there. " + "Note that training creates checkpoints, and we use the Poutyne library that uses the best epoch " + "model and reload the state if any checkpoints are already there. " "Thus, an error will be raised if you change the model type. For example, " "you retrain a FastText model and then retrain a BPEmb in the same logging path directory." "By default, the path is './checkpoints'." @@ -241,7 +241,7 @@ def get_parser() -> argparse.ArgumentParser: help=wrap( "Path to a JSON file of prediction tags to use to retrain. 
Tags are in a key-value style, where " "the key is the tag name, and the value is the index one." - "The last element has to be an EOS tag. Read the doc for more detail about EOS tag." + "The last element has to be an EOS tag. Read the documentation for more details about the EOS tag." ), default=None, type=str, diff --git a/deepparse/cli/test.py b/deepparse/cli/test.py index 648e3dc7..853505c0 100644 --- a/deepparse/cli/test.py +++ b/deepparse/cli/test.py @@ -108,7 +108,7 @@ def main(args=None) -> None: def get_parser() -> argparse.ArgumentParser: - """Return ArgumentParser for the cli.""" + """Return ArgumentParser for the CLI.""" parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) diff --git a/deepparse/comparer/addresses_comparer.py b/deepparse/comparer/addresses_comparer.py index c7dabf06..8ed24400 100644 --- a/deepparse/comparer/addresses_comparer.py +++ b/deepparse/comparer/addresses_comparer.py @@ -10,8 +10,9 @@ @dataclass(frozen=True) class AddressesComparer: """ - Address comparer to compare addresses with each other and retrieves the differences between them. The addresses - are parsed using an address parser based on one of the seq2seq pretrained networks, either with fastText or BPEmb. + Address comparer is used to compare addresses with each other and retrieve the differences between them. The + addresses are parsed using an address parser based on one of the seq2seq pretrained networks, either with + FastText or BPEmb. The address comparer can compare already parsed addresses. The address parser first recomposes the raw addresses then suggest its own tags; then it makes a comparison with the tags of the source parsing and the @@ -43,12 +44,12 @@ def compare_tags( raw address from the parsing, AddressParser generates tags and compares the two parsings. Args: - addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuple that contains + addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuples that contain the tags for the address components from the source. Can compare multiple parsings if passed as a list of tuples. - with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report. - The probabilities are not compared but only included in the report. - The default value is None, which means not taking into account. + with_prob (Union[None, bool]): An option flag to either or not include probabilities in the comparison + report. The probabilities are not compared but only included in the report. The default value is + ``None``, which means not taking into account. Return: Either a :class:`~FormattedComparedAddressesTags` or a list of :class:`~FormattedComparedAddressTags` @@ -123,15 +124,14 @@ def compare_raw( ) -> List[FormattedComparedAddressesRaw]: """ Compare a list of raw addresses together. It starts by parsing the addresses - with the setted parser and then return the differences between the addresses components - retrieved with our model. + with the parser and then return the differences between the parsed address components of the two addresses. Args: raw_addresses_to_compare (Union[Tuple[str], List[Tuple[str]]]): List of strings that represent raw addresses to compare. - with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report. - The probabilities are not compared but only included in the report. - The default value is None, which means not taking into account. 
+ with_prob (Union[None, bool]): An option flag to either or not include probabilities in the comparison + report. The probabilities are not compared but only included in the report. The default value is + ``None``, which means not taking into account. Return: Either a :class:`~FormattedComparedAddressesRaw` or a list of @@ -184,8 +184,8 @@ def compare_raw( @staticmethod def _format_comparisons_dict(comparison_tuples: List, origin_tuple: Tuple[str, str], with_prob: bool) -> List[Dict]: """ - Return formatted dict that contains two FormattedParsedAddress and the origin name tuple and output it in a - dict format. + Return formatted dictionary that contains two FormattedParsedAddress and the origin name tuple and output it + in a dictionary format. """ list_of_formatted_comparisons_dict = [] diff --git a/deepparse/comparer/formatted_compared_addresses.py b/deepparse/comparer/formatted_compared_addresses.py index f90f699e..b96f15b8 100644 --- a/deepparse/comparer/formatted_compared_addresses.py +++ b/deepparse/comparer/formatted_compared_addresses.py @@ -110,16 +110,16 @@ def _comparison_report_builder(self) -> str: @abstractmethod def _get_probs(self) -> Dict: """ - To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each - class because they don't use the probabilities the same way. + A method to get the tags from the parsing with their associated probabilities, it needs to be implemented in + each class because they don't use the probabilities the same way. """ @staticmethod def _get_color_diff(string_one: str, string_two: str, highlight: bool = False) -> str: """ - Compare two strings and determine the difference between the two. The differences are noted with colour code; - if the first string has more elements than the second one, it will be noted in one colour; on the contrary, - if the other string has something more, it will have a different colour notation. + Compare two strings and determine the difference between the two. The differences are highlighted with a + coloured scheme; if the first string has more elements than the second one, it will be noted in one colour; + on the contrary, if the other string has something more, it will have a different colour notation. Args: string_one (str): The first string to compare. @@ -129,7 +129,7 @@ def _get_color_diff(string_one: str, string_two: str, highlight: bool = False) - two strings are spaces. The default is False. Notes: - the method is colorblind-friendly, which means that the output will be + The method is colorblind-friendly, which means that the output will be in colours that minimize the risk that a user cannot see the difference as defined here https://davidmathlogic.com/colorblind/#%23D81B60-%231E88E5-%23FFC107-%23004D40. @@ -137,7 +137,7 @@ def _get_color_diff(string_one: str, string_two: str, highlight: bool = False) - If the first string has something more than the second one, it will be indicated in blue. If the second string has something more than the first one, it will be noted in yellow. - It uses SequenceMatcher to get the different codes to be later converted into colour codes. + It uses SequenceMatcher to convert the different codes into colour codes later. Return: str: The two strings joined, and the differences are noted in colour codes @@ -176,13 +176,16 @@ def _get_tags_diff_color( verbose: bool = True, ) -> str: """ - Print the output of the string with colour codes that represent the differences between the two strings. 
+ Print the output of the string with colour codes representing the differences between the two strings. Args: - name_one (str, optional) : Name associated with first color. The default value is the first address. - name_two (str, optional) : Name associated with the second colour. The default value is the second address. - verbose (bool, optional): If True, it will print a presentation of the colours and what they mean. - The default value is True. + name_one (str, optional) : Name associated with first color. The default value is ``"first address"``, + namely the first address of the two. We recommend using a whitespace characters between the words. + name_two (str, optional) : Name associated with the second colour. The default value is + ``"second address"``, namely the second address of the two. We recommend using a whitespace + characters between the words. + verbose (bool, optional): If True, it will print a presentation of the colours and their meaning. + The default value is ``True``. """ @@ -220,7 +223,7 @@ def _get_tags_diff_color( def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tuple]], List[tuple]]) -> List[tuple]: """ - Compare addresses components and put the differences in a dictionary where the keys are the + Compare the components between two addresses and put the differences in a dictionary where the keys are the names of the addresses components, and the values are the values of the addresses component. Args: @@ -228,7 +231,7 @@ def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tupl address components' names for the parsed addresses. Return: - List[tuple]: List of tuples that contain all addresses components that differ from each other. + List[tuple]: List of tuples containing the components that differ from the two addresses. """ unique_address_component_names = self._unique_addresses_component_names(parsed_addresses) diff --git a/deepparse/comparer/formatted_compared_addresses_raw.py b/deepparse/comparer/formatted_compared_addresses_raw.py index de94c05d..860f268b 100644 --- a/deepparse/comparer/formatted_compared_addresses_raw.py +++ b/deepparse/comparer/formatted_compared_addresses_raw.py @@ -12,8 +12,8 @@ class FormattedComparedAddressesRaw(FormattedComparedAddresses): def _get_probs(self) -> Dict: """ - To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each - class because they don't use the probabilities the same way. + Method to get the tags from the parsing with their associated probabilities, a method needs to be + implemented in each class because they don't use the probabilities the same way. """ return { self.first_address.raw_address: self.first_address.address_parsed_components, diff --git a/deepparse/comparer/formatted_compared_addresses_tags.py b/deepparse/comparer/formatted_compared_addresses_tags.py index 775335d8..c071194d 100644 --- a/deepparse/comparer/formatted_compared_addresses_tags.py +++ b/deepparse/comparer/formatted_compared_addresses_tags.py @@ -12,7 +12,7 @@ class FormattedComparedAddressesTags(FormattedComparedAddresses): def _get_probs(self) -> Dict: """ - To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each + To get the tags from the parsing with their associated probabilities, A method needs to be implemented in each class because they don't use the probabilities the same way. 
""" return { diff --git a/deepparse/converter/data_padder.py b/deepparse/converter/data_padder.py index e21c32cb..3edc5d2b 100644 --- a/deepparse/converter/data_padder.py +++ b/deepparse/converter/data_padder.py @@ -15,20 +15,20 @@ class DataPadder: def __init__(self, padding_value: int = -100) -> None: self.padding_value = padding_value - def pad_word_embeddings_batch( - self, batch: List[Tuple[List, List]], teacher_forcing: bool = False - ) -> Union[ + def pad_word_embeddings_batch(self, batch: List[Tuple[List, List]], teacher_forcing: bool = False) -> Union[ Tuple[Tuple[torch.Tensor, List], torch.Tensor], Tuple[Tuple[torch.Tensor, List, torch.Tensor], torch.Tensor], ]: """ - Method to pad a batch of word embeddings sequences and their targets to the length of the longest one. + A method to apply padding to a batch of word embeddings sequences and their targets to the length of the + longest one. + Args: batch (list[Tuple[list, list]]): a list of tuples where the first element is a list of word embeddings (the sequence) and the second is a list of targets. teacher_forcing (bool): if True, the padded target vectors are returned twice, once with the sequences and their lengths, and once on their own. This enables - the use of teacher forcing during the training of sequence to sequence models. + the use of teacher forcing during the training of sequence-to-sequence models. Return: A tuple of two elements: - a tuple containing either a @@ -52,7 +52,7 @@ def pad_word_embeddings_batch( def pad_word_embeddings_sequences(self, sequences_batch: List) -> Tuple[torch.Tensor, List]: """ - Method to pad a batch of word embeddings sequences. + A method to apply batch padding to sequences of word embeddings. Args: sequences_batch (list): a tuple containing lists of word embeddings (the sequences) Return: @@ -81,15 +81,17 @@ def pad_subword_embeddings_batch( Tuple[Tuple[torch.Tensor, List, List, torch.Tensor], torch.Tensor], ]: """ - Method to pad a batch of subword embeddings sequences and their targets to the length of the longest one. + A method to apply padding to a batch of subword embeddings sequences and their targets to the length of the + longest one. + Args: batch (list[Tuple[Tuple[list, list], list]]): a list of tuples containing the two following elements: - - a tuple where the first element is a list of words represented as subword embeddings and the + - a tuple where the first element is a list of words represented as subword embeddings, and the second element is a list of the number of subword embeddings that each word is decomposed into. - a list of targets. teacher_forcing (bool): if True, the padded target vectors are returned twice, once with the sequences and their lengths, and once on their own. This enables - the use of teacher forcing during the training of sequence to sequence models. + the use of teacher forcing during the training of sequence-to-sequence models. Return: A tuple of two elements: - A tuple (``x``, ``y`` , ``z``). The element ``x`` is a :class:`~torch.Tensor` of @@ -122,9 +124,9 @@ def pad_subword_embeddings_sequences( self, sequences_batch: List[Tuple[List, List]] ) -> Tuple[torch.Tensor, List, List]: """ - Method to pad a batch of subword embeddings sequences. + A method to apply padding to a batch of subword embeddings sequences. 
Args: - sequences_batch (list[Tuple[list, list]]): a list of tuple containing tuples of two elements: + sequences_batch (list[Tuple[list, list]]): a list of tuples containing tuples of two elements: - a list of lists representing words as lists of subword embeddings. - a list of the number of subword embeddings that each word is decomposed into. Return: @@ -158,7 +160,7 @@ def pad_subword_embeddings_sequences( def pad_targets(self, target_batch: List) -> torch.Tensor: """ - Method to pad a batch of target indices to the longest one. + A method to apply padding to a batch of target indices to the longest one. Args: target_batch (list): a tuple containing lists of target indices. Return: @@ -170,7 +172,7 @@ def pad_targets(self, target_batch: List) -> torch.Tensor: def _extract_word_embeddings_sequences_and_target(self, batch: List[Tuple[List, List]]) -> Tuple[List, List]: """ - Method that takes a list of word embedding sequences and targets and zips the + A method that takes a list of word embedding sequences and targets and zips the sequences together and the targets together. """ sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True) diff --git a/deepparse/converter/data_processor.py b/deepparse/converter/data_processor.py index dd45e25a..8b708cdc 100644 --- a/deepparse/converter/data_processor.py +++ b/deepparse/converter/data_processor.py @@ -37,7 +37,7 @@ def process_for_inference( self, addresses: List[str] ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, List, torch.Tensor]]: """ - Method to vectorize addresses for inference. + A method to vectorize the addresses for inference. Args: addresses (List[str]): a list of addresses Return: @@ -60,13 +60,13 @@ def process_for_training( ], ]: """ - Method to vectorize addresses and tags for training. + A method to vectorize the addresses and the tags for training. Args: addresses_and_targets (List[Tuple[str, List[str]]]): a list of tuples where the first element is an address and the second is a list of tags. teacher_forcing (bool): if True, the padded target vectors are returned twice, once with the sequences and their lengths, and once on their own. This enables - the use of teacher forcing during the training of sequence to sequence models. + the use of teacher forcing during the training of sequence-to-sequence models. Return: A padded batch. Check out :meth:`~deepparse.converter.DataPadder.pad_word_embeddings_batch` and :meth:`~DataPadder.pad_subword_embeddings_batch` for more details. diff --git a/deepparse/converter/target_converter.py b/deepparse/converter/target_converter.py index c55e4017..34825c5c 100644 --- a/deepparse/converter/target_converter.py +++ b/deepparse/converter/target_converter.py @@ -3,7 +3,7 @@ class TagsConverter: """ - Class to define logic of tag to idx conversion and vice versa. + Class to define the logic of tag to idx conversion and vice versa. Args: tags_to_idx (Dict): A dictionary where the keys are the tags (e.g. StreetNumber) and the values are @@ -16,7 +16,8 @@ def __init__(self, tags_to_idx: Dict) -> None: def __call__(self, key: Union[str, int]) -> int: """ - If str convert from a tag to idx and if int convert from a idx to a tag using the convert table. + If it is a ``str, ``, it will convert from a "tag" to an IDX, and if ``int``, it will convert from an IDX to + a "tag" using the convert table. 
""" if isinstance(key, str): return self.tags_to_idx[key] diff --git a/deepparse/data_validation/data_validation.py b/deepparse/data_validation/data_validation.py index 2e95e85d..2a338a3d 100644 --- a/deepparse/data_validation/data_validation.py +++ b/deepparse/data_validation/data_validation.py @@ -1,10 +1,17 @@ +import re from typing import List +# Regular expression to find if a string contains consecutive whitespace +consecutive_whitespace_regular_expression = re.compile(r"\s{2,}") + +# Regular expression to find if a string contains a newline character +newline_regular_expression = re.compile(r"\n") + def validate_if_any_empty(string_elements: List) -> bool: """ Return ``True`` if one of the string elements is empty. For example, the second element in the following list is - an empty address: ``["An address", "", "Another address"]``. Thus, it will return ``False``. + an empty address: ``["An address", "", "Another address"]``. Thus, it will return ``True``. Args: string_elements (list): A list of strings to validate. @@ -15,7 +22,7 @@ def validate_if_any_empty(string_elements: List) -> bool: def validate_if_any_whitespace_only(string_elements: List) -> bool: """ Return ``True`` if one of the string elements is only whitespace. For example, the second element in the - following list is only whitespace: ``["An address", " ", "Another address"]``. Thus, it will return ``False``. + following list is only whitespace: ``["An address", " ", "Another address"]``. Thus, it will return ``True``. Args: string_elements (list): A list of strings to validate. @@ -26,7 +33,7 @@ def validate_if_any_whitespace_only(string_elements: List) -> bool: def validate_if_any_none(string_elements: List) -> bool: """ Return ``True`` if one string element is a ``None`` value. For example, the second element in the following - list is a ``None`` value: ``["An address", None, "Another address"]``. Thus, it will return ``False``. + list is a ``None`` value: ``["An address", None, "Another address"]``. Thus, it will return ``True``. Args: string_elements (list): A list of strings to validate. @@ -34,40 +41,91 @@ def validate_if_any_none(string_elements: List) -> bool: return any(is_none(string_element) for string_element in string_elements) -def is_whitespace_only(a_string: str) -> bool: +def validate_if_any_multiple_consecutive_whitespace(string_elements: List) -> bool: + """ + Return ``True`` if one string element include multiple consecutive_whitespace. + For example, the second element in the following list has two consecutive whitespace: + ``["An address", "An address", "Another address"]``. Thus, it will return ``True``. + + Args: + string_elements (list): A list of strings to validate. + """ + return any(is_multiple_consecutive_whitespace(string_element) for string_element in string_elements) + + +def validate_if_any_newline_character(string_elements: List) -> bool: + """ + Return ``True`` if one string element include a newline character. + For example, the second element in the following list include a newline character. + ``["An address", "An address\n", "Another address"]``. Thus, it will return ``True``. + + Args: + string_elements (list): A list of strings to validate. + """ + return any(is_newline(string_element) for string_element in string_elements) + + +def is_whitespace_only(string_element: str) -> bool: """ Validate if a string is composed of only whitespace. Args: - a_string (str): A string to validate. + string_element (str): A string to validate. 
Return: Either or not, the string is composed only of whitespace or not. """ - return len(a_string.strip(" ").split()) == 0 + return len(string_element.strip(" ").split()) == 0 -def is_empty(a_string: str) -> bool: +def is_empty(string_element: str) -> bool: """ Validate if a string is empty. Args: - a_string (str): A string to validate. + string_element (str): A string to validate. Return: Either or not, the string is empty. """ - return len(a_string) == 0 + return len(string_element) == 0 -def is_none(a_string: str) -> bool: +def is_none(string_element: str) -> bool: """ Validate if a string is a None. Args: - a_string (str): A string to validate. + string_element (str): A string to validate. Return: Either or not, the string is a None type. """ - return a_string is None + return string_element is None + + +def is_multiple_consecutive_whitespace(string_element: str) -> bool: + """ + Validate if a string include consecutive whitespace. Consecutive whitespace will break matching between the + address components and the tags during splitting. + + Args: + string_element (str): A string to validate. + + Return: + Either or not, the string include consecutive whitespace. + """ + return consecutive_whitespace_regular_expression.search(string_element) is not None + + +def is_newline(string_element: str) -> bool: + """ + Validate if a string include a newline character. + + Args: + string_element (str): A string to validate. + + Return: + Either or not, the string include a newline character. + """ + return newline_regular_expression.search(string_element) is not None diff --git a/deepparse/dataset_container/dataset_container.py b/deepparse/dataset_container/dataset_container.py index 9ffd7588..cd4db5ea 100644 --- a/deepparse/dataset_container/dataset_container.py +++ b/deepparse/dataset_container/dataset_container.py @@ -7,8 +7,14 @@ from torch.utils.data import Dataset from .tools import former_python_list, validate_column_names +from ..data_validation import ( + validate_if_any_empty, + validate_if_any_whitespace_only, + validate_if_any_none, + validate_if_any_multiple_consecutive_whitespace, + validate_if_any_newline_character, +) from ..errors.data_error import DataError -from ..data_validation import validate_if_any_empty, validate_if_any_whitespace_only, validate_if_any_none class DatasetContainer(Dataset, ABC): @@ -21,31 +27,38 @@ class DatasetContainer(Dataset, ABC): For a training container, it validates the following: - - all addresses are not None value, - - all addresses are not empty, - - all addresses are not whitespace string, - - all tags are not empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and + - no address is a ``None`` value, + - no address is empty, + - no address is composed of only whitespace, + - no address includes consecutive whitespace (e.g. "An address"), + - no tags list is empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and - if the addresses (whitespace-split) are the same length as their respective tags list. While for a predict container (unknown prediction tag), it validates the following: - - all addresses are not None, - - all addresses are not empty, and - - all addresses are not whitespace string. + - no address is a ``None`` value, + - no address is empty, + - no address is composed of only whitespace, and + - no address includes consecutive whitespace (e.g. "An address"). Args: is_training_container (bool): Either or not, the dataset container is a training container. 
This will determine the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags. - The default value is true. + The default value is ``True``. + data_cleaning_pre_processing_fn (Callable): Function to apply as data clea ning pre-processing step after + loading the data, but before applying the validation steps. The default value is ``None``. """ @abstractmethod - def __init__(self, is_training_container: bool = True) -> None: + def __init__( + self, is_training_container: bool = True, data_cleaning_pre_processing_fn: Union[None, Callable] = None + ) -> None: """ - Need to be defined by the child class. + The method to init the class. It needs to be defined by the child's class. """ self.data = None self.is_training_container = is_training_container + self.data_cleaning_pre_processing_fn = data_cleaning_pre_processing_fn def __len__(self) -> int: return len(self.data) @@ -59,7 +72,7 @@ def __getitem__( - it can be a list of string items (e.g. a list of addresses (str)), or - it can be a unique string item (e.g. one address). - If the DatasetContainer is a training one: + If the DatasetContainer is a "training" one: - it can be a list of tuple (str, list) items, namely a list of parsed examples (e.g. an address with the tags), or @@ -86,7 +99,7 @@ def validate_dataset(self) -> None: data_to_validate = self.data if validate_if_any_none(string_elements=data_to_validate): - raise DataError("Some addresses data points are None value.") + raise DataError("Some addresses data points are 'None' value.") if self.is_training_container: # Not done in previous similar if since none test not applied @@ -98,6 +111,14 @@ def validate_dataset(self) -> None: if validate_if_any_whitespace_only(string_elements=data_to_validate): raise DataError("Some addresses only include whitespace thus cannot be parsed.") + if validate_if_any_multiple_consecutive_whitespace(string_elements=data_to_validate): + raise DataError( + "Some addresses include consecutive whitespaces (i.e. 'An address') thus cannot be properly parsed." + ) + + if validate_if_any_newline_character(string_elements=data_to_validate): + raise DataError("Some addresses include newline characters (i.e. '\n') thus cannot be properly parsed.") + def _data_is_a_list(self) -> bool: return isinstance(self.data, list) @@ -114,12 +135,14 @@ def _training_validation(self) -> None: if not self._data_tags_is_same_len_then_address(): print( - f"Some addresses (whitespace-split) and the associated tags are not the same len. " - f"If you are using a CSVDatasetContainer, consider using the tag_seperator_reformat_fn argument." + f"Some addresses (whitespace-split) and the associated tags are not the same length. " + f"If you use a CSVDatasetContainer, consider using the tag_seperator_reformat_fn argument." f"Here is the report of those cases where len differ to help you out:\n" f"{self._data_tags_not_the_same_len_diff()}" ) - raise DataError("Some addresses (whitespace-split) and the tags associated with them are not the same len.") + raise DataError( + "Some addresses (whitespace-split) and the tags associated with them are not the same length." + ) def _data_is_list_of_tuple(self) -> bool: """ @@ -157,33 +180,44 @@ class PickleDatasetContainer(DatasetContainer): The dataset needs to be a list of tuples where the first element of each tuple is the address (a string), and the second is a list of the expected tag to predict (e.g. ``[('an address', ['a_tag', 'another_tag']), ...]``). 
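The new ``data_cleaning_pre_processing_fn`` hook is a plain callable that receives the loaded data and returns it cleaned, before the validation above runs. A sketch of such a callable for a predict-style container (a list of address strings); the cleaning rules below are only an example, not something Deepparse applies by default::

    # A user-supplied cleaning callable for data_cleaning_pre_processing_fn.
    import re

    def clean_addresses(addresses):
        cleaned = []
        for address in addresses:
            address = address.replace("\n", " ")                 # drop newline characters
            address = re.sub(r"\s{2,}", " ", address).strip()    # collapse consecutive whitespace
            cleaned.append(address)
        return cleaned

    print(clean_addresses(["305  rue des Lilas\n"]))  # ['305 rue des Lilas']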
- The len of the tags needs to be the same as the len of the address when whitespace split. + The length of the tags needs to be the same as the length of the address when the whitespace-split is used. For a training container, the validation tests applied on the dataset are the following: - - all addresses are not None value, - - all addresses are not empty, - - all addresses are not whitespace string, - - all tags are not empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and + - no address is a ``None`` value, + - no address is empty, + - no address is composed of only whitespace, + - no address includes consecutive whitespace (e.g. "An address"), + - no tags list is empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and - if the addresses (whitespace-split) are the same length as their respective tags list. While for a predict container (unknown prediction tag), the validation tests applied on the dataset are the following: - - all addresses are not None value, - - all addresses are not empty, and - - all addresses are not whitespace string. + - no address is a ``None`` value, + - no address is empty, + - no address is composed of only whitespace, and + - no address includes consecutive whitespace (e.g. "An address"). Args: data_path (str): The path to the pickle dataset file. is_training_container (bool): Either or not, the dataset container is a training container. This will determine the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags. - The default value is true. + The default value is ``True``. + data_cleaning_pre_processing_fn (Callable): Function to apply as data clea ning pre-processing step after + loading the data, but before applying the validation steps. The default value is ``None``. """ - def __init__(self, data_path: str, is_training_container: bool = True) -> None: - super().__init__(is_training_container=is_training_container) + def __init__( + self, + data_path: str, + is_training_container: bool = True, + data_cleaning_pre_processing_fn: Union[None, Callable] = None, + ) -> None: + super().__init__( + is_training_container=is_training_container, data_cleaning_pre_processing_fn=data_cleaning_pre_processing_fn + ) with open(data_path, "rb") as f: self.data = load(f) @@ -193,6 +227,8 @@ def __init__(self, data_path: str, is_training_container: bool = True) -> None: "The data is a list of tuples, but the dataset container is a predict container. " "Predict container should contain only a list of addresses." ) + if self.data_cleaning_pre_processing_fn is not None: + self.data = self.data_cleaning_pre_processing_fn(self.data) self.validate_dataset() @@ -202,25 +238,28 @@ def _test_predict_container_is_list_of_tuple(self) -> bool: class CSVDatasetContainer(DatasetContainer): """ - CSV dataset container that imports a CSV of addresses. If the dataset is a predict one, it needs to have at least - one column with some addresses. If the dataset is a training one (with prediction tags), it needs to have at + CSV dataset container that imports a CSV of addresses. If the dataset is a predict one, it must have at least + one column with some addresses. If the dataset is a training one (with prediction tags), it must have at least two columns, one with some addresses and another with a list of tags for each address. After loading the CSV dataset, some tests will be applied depending on its type. 
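A small sketch of a training dataset laid out as described above (a pickled list of tuples, with the tags aligned to the whitespace-split address); the file name is illustrative::

    # Build and save a tiny training dataset in the expected pickle layout.
    import pickle

    training_data = [
        ("305 rue des Lilas", ["StreetNumber", "StreetName", "StreetName", "StreetName"]),
        ("350 rue des Lilas Ouest", ["StreetNumber", "StreetName", "StreetName", "StreetName", "Orientation"]),
    ]

    for address, tags in training_data:
        # The container enforces this alignment between tokens and tags.
        assert len(address.split()) == len(tags)

    with open("training_dataset.p", "wb") as file:
        pickle.dump(training_data, file)

Such a file could then be loaded with a ``PickleDatasetContainer``.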
For a training container, the validation tests applied on the dataset are the following: - - all addresses are not None value, - - all addresses are not empty, - - all addresses are not whitespace string, and + - no address is a ``None`` value, + - no address is empty, + - no address is composed of only whitespace, + - no address includes consecutive whitespace (e.g. "An address"), + - no tags list is empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and - if the addresses (whitespace-split) are the same length as their respective tags list. While for a predict container (unknown prediction tag), the validation tests applied on the dataset are the following: - - all addresses are not None value, - - all addresses are not empty, and - - all addresses are not whitespace string. + - no address is a ``None`` value, + - no address is empty, + - no address is composed of only whitespace, and + - no address includes consecutive whitespace (e.g. "An address"). Args: @@ -231,7 +270,7 @@ class CSVDatasetContainer(DatasetContainer): of exactly two elements: addresses and tags. is_training_container (bool): Either or not, the dataset container is a training container. This will determine the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags. - The default value is true. + The default value is ``True``. separator (str): The CSV columns separator to use. By default, ``"\\t"``. tag_seperator_reformat_fn (Callable, optional): A function to parse a tags string and return a list of address tags. For example, if the tag column is a former Python list saved with pandas, the characters ``]`` @@ -241,6 +280,8 @@ class CSVDatasetContainer(DatasetContainer): csv_reader_kwargs (dict, optional): Keyword arguments to pass to pandas ``read_csv`` use internally. By default, the ``data_path`` is passed along with our default ``sep`` value ( ``"\\t"``) and the ``"utf-8"`` encoding format. However, this can be overridden by using this argument again. + data_cleaning_pre_processing_fn (Callable): Function to apply as data clea ning pre-processing step after + loading the data, but before applying the validation steps. The default value is ``None``. """ def __init__( @@ -251,18 +292,21 @@ def __init__( separator: str = "\t", tag_seperator_reformat_fn: Union[None, Callable] = None, csv_reader_kwargs: Union[None, Dict] = None, + data_cleaning_pre_processing_fn: Union[None, Callable] = None, ) -> None: - super().__init__(is_training_container=is_training_container) + super().__init__( + is_training_container=is_training_container, data_cleaning_pre_processing_fn=data_cleaning_pre_processing_fn + ) if is_training_container: if isinstance(column_names, str): raise ValueError( - "When the dataset is a training container, the column names should be a list of column name." + "When the dataset is a training container, the column names should be a list of column names." 
) if len(column_names) != 2: raise ValueError("When the dataset is a training container, two column names must be provided.") else: # It means it is a predict container if isinstance(column_names, str): - # We transform the str into a list to assess is len + # We transform the str into a list to assess its length column_names = [column_names] if len(column_names) != 1: raise ValueError("When the dataset is a predict container, one column name must be provided.") @@ -289,6 +333,10 @@ def __init__( else: data = [data_point[0] for data_point in pd.read_csv(**csv_reader_kwargs)[column_names].to_numpy()] self.data = data + + if self.data_cleaning_pre_processing_fn is not None: + self.data = self.data_cleaning_pre_processing_fn(self.data) + self.validate_dataset() @@ -302,10 +350,23 @@ class ListDatasetContainer(DatasetContainer): identical as the :class:`~deepparse.dataset_container.PickleDatasetContainer`. is_training_container (bool): Either or not, the dataset container is a training container. This will determine the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags. - The default value is true. + The default value is ``True``. + data_cleaning_pre_processing_fn (Callable): Function to apply as data clea ning pre-processing step after + loading the data, but before applying the validation steps. The default value is ``None``. """ - def __init__(self, data: List, is_training_container: bool = True) -> None: - super().__init__(is_training_container=is_training_container) + def __init__( + self, + data: List, + is_training_container: bool = True, + data_cleaning_pre_processing_fn: Union[None, Callable] = None, + ) -> None: + super().__init__( + is_training_container=is_training_container, data_cleaning_pre_processing_fn=data_cleaning_pre_processing_fn + ) self.data = data + + if self.data_cleaning_pre_processing_fn is not None: + self.data = self.data_cleaning_pre_processing_fn(self.data) + self.validate_dataset() diff --git a/deepparse/dataset_container/tools.py b/deepparse/dataset_container/tools.py index 133522aa..5ef211c6 100644 --- a/deepparse/dataset_container/tools.py +++ b/deepparse/dataset_container/tools.py @@ -13,9 +13,9 @@ def former_python_list(tags: str) -> List: Return: A list of the parsed tag set. """ - # We remove the [ and ] of the list. + # We removed the ``"["`` and ``"]"`` from the list. # Then, we split each element using a comma as a separator. - # Finally, in some cases, the element are separated by a comma (e.g. element1,element2) + # Finally, in some cases, the elements are separated by a comma (e.g. element1,element2) # or a comma and a whitespace (e.g. element1, element2), we strip the whitespace on all tags to # remove the trailing whitespace when a coma and a whitespace separate elements. # To fix https://github.com/GRAAL-Research/deepparse/issues/124. diff --git a/deepparse/download_tools.py b/deepparse/download_tools.py index dd6759d1..035c3a47 100644 --- a/deepparse/download_tools.py +++ b/deepparse/download_tools.py @@ -35,9 +35,9 @@ def download_fasttext_magnitude_embeddings(cache_dir: str, verbose: bool = True, offline: bool = False) -> str: """ - Function to download the magnitude pretrained fastText model. + Function to download the magnitude pretrained FastText model. - Return the full path to the fastText embeddings. + Return the full path to the FastText embeddings. 
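A sketch of the ``column_names`` contract described above, assuming hypothetical CSV files and column names: a training container expects two column names, while a predict container expects only one::

    # Hypothetical CSV files and column names; the argument names come from the
    # CSVDatasetContainer signature shown in this patch.
    from deepparse.dataset_container import CSVDatasetContainer

    training_container = CSVDatasetContainer(
        "train_dataset.csv",
        column_names=["Address", "Tags"],
        is_training_container=True,
    )

    predict_container = CSVDatasetContainer(
        "addresses_to_parse.csv",
        column_names=["Address"],  # a single str also works; it is wrapped into a list internally
        is_training_container=False,
    )

If the tag column was saved as a stringified Python list, the ``tag_seperator_reformat_fn`` argument mentioned above can be used to parse it back into a proper list.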
""" os.makedirs(cache_dir, exist_ok=True) @@ -48,7 +48,7 @@ def download_fasttext_magnitude_embeddings(cache_dir: str, verbose: bool = True, if not os.path.isfile(file_name) and not offline: if verbose: print( - "The fastText pretrained word embeddings will be download in magnitude format (2.3 GO), " + "The FastText pretrained word embeddings will be download in magnitude format (2.3 GO), " "this process will take several minutes." ) extension = extension + ".gz" @@ -67,7 +67,7 @@ def download_weights(model_filename: str, saving_dir: str, verbose: bool = True) Args: model_filename: The network type (i.e. ``fasttext`` or ``bpemb``). saving_dir: The path to the saving directory. - verbose (bool): Either or not to be verbose during the download of a model. The default value is True. + verbose (bool): Either or not to be verbose during the download of a model. The default value is ``True``. """ if verbose: print(f"Downloading the pre-trained weights for the network {model_filename}.") @@ -83,7 +83,7 @@ def download_weights(model_filename: str, saving_dir: str, verbose: bool = True) def download_from_public_repository(file_name: str, saving_dir: str, file_extension: str) -> None: """ - Simple function to download the content of a file from Deepparse public repository. + Simple function to download the content of a file from the Deepparse public repository. The repository URL string is `'https://graal.ift.ulaval.ca/public/deepparse/{}.{}'`` where the first bracket is the file name and the second is the file extension. """ @@ -97,7 +97,7 @@ def download_from_public_repository(file_name: str, saving_dir: str, file_extens def download_models(saving_cache_path: Union[Path, None] = None) -> None: """ - Function to download all the pretrained models. It will download all the models checkpoint and version file. + Function to download all the pretrained models. It will download all the model's checkpoints and version files. Args: saving_cache_path: The path to the saving cache directory for the specified model. @@ -129,7 +129,7 @@ def download_model( elif "bpemb" in model_type: BPEmb( lang="multi", vs=100000, dim=300, cache_dir=saving_cache_path - ) # The class manage the download of the pretrained words embedding + ) # The class manages the download of the pretrained words embedding model_type_filename = MODEL_MAPPING_CHOICES[model_type] model_path = os.path.join(saving_cache_path, f"{model_type_filename}.ckpt") @@ -165,15 +165,15 @@ def latest_version(model: str, cache_path: str, verbose: bool) -> bool: except HTTPError as exception: # HTTP connection error handling if HTTP_CLIENT_ERROR_STATUS_CODE <= exception.response.status_code < NEXT_RANGE_STATUS_CODE: - # Case where Deepparse server is down. + # Case where the Deepparse server is down. if verbose: warnings.warn( - f"We where not able to verify the cached model in the cache directory {cache_path}. It seems like" - f"Deepparse server is not available at the moment. We recommend to attempt to verify " + f"We could not verify the cached model in the cache directory {cache_path}. It seems like" + f"Deepparse server is not available at the moment. We recommend attempting to verify " f"the model version another time using our download CLI function.", category=RuntimeWarning, ) - # The is_lastest_version is set to True even if we were not able to validate the version. We do so not to + # The is_lastest_version is set to True even if we cannot validate the version. We do so not to # block the rest of the process. 
is_latest_version = True else: @@ -182,15 +182,15 @@ def latest_version(model: str, cache_path: str, verbose: bool) -> bool: raise except MaxRetryError: # Case where the user does not have an Internet connection. For example, one can run it in a - # Docker container not connected to the Internet. + # The Docker container is not connected to the Internet. if verbose: warnings.warn( - f"We where not able to verify the cached model in the cache directory {cache_path}. It seems like" - f"you are not connected to the Internet. We recommend to verify if you have the latest using our " + f"We could not verify the cached model in the cache directory {cache_path}. It seems like" + f"you are not connected to the Internet. We recommend verifying if you have the latest using our " f"download CLI function.", category=RuntimeWarning, ) - # The is_lastest_version is set to True even if we were not able to validate the version. We do so not to + # The is_lastest_version is set to True even if we cannot validate the version. We do so not to # block the rest of the process. is_latest_version = True finally: @@ -203,7 +203,7 @@ def latest_version(model: str, cache_path: str, verbose: bool) -> bool: # pylint: disable=pointless-string-statement FASTTEXT_COPYRIGHT_MIT_LICENSE = """ -The code below was copied from the fastText project, and has been modified for the purpose of this package. +The code below was copied from the FastText project, and has been modified for the purpose of this package. COPYRIGHT @@ -237,11 +237,11 @@ def latest_version(model: str, cache_path: str, verbose: bool) -> bool: def download_fasttext_embeddings(cache_dir: str, verbose: bool = True, offline: bool = False) -> str: """ - Simpler version of the download_model function from fastText to download pretrained common-crawl - vectors from fastText's website https://fasttext.cc/docs/en/crawl-vectors.html and save it in the + Simpler version of the download_model function from FastText to download pretrained common-crawl + vectors from FastText's website https://fasttext.cc/docs/en/crawl-vectors.html and save it in the saving directory (saving_dir). - Return the full path to the fastText embeddings. + Return the full path to the FastText embeddings. """ os.makedirs(cache_dir, exist_ok=True) @@ -258,21 +258,21 @@ def download_fasttext_embeddings(cache_dir: str, verbose: bool = True, offline: shutil.copyfileobj(f, f_out) os.remove(os.path.join(cache_dir, gz_file_name)) - return file_name_path # return the full path to the fastText embeddings + return file_name_path # return the full path to the FastText embeddings # Now use a saving path and don't return a bool def download_gz_model(gz_file_name: str, saving_path: str, verbose: bool = True) -> None: """ - Simpler version of the _download_gz_model function from fastText to download pretrained common-crawl - vectors from fastText's website https://fasttext.cc/docs/en/crawl-vectors.html and save it in the + Simpler version of the _download_gz_model function from FastText to download pretrained common-crawl + vectors from FastText's website https://fasttext.cc/docs/en/crawl-vectors.html and save it in the saving directory (saving_path). """ url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{gz_file_name}" if verbose: print( - "The fastText pretrained word embeddings will be downloaded (6.8 GO), " + "The FastText pretrained word embeddings will be downloaded (6.8 GO), " "this process will take several minutes." 
) _download_file(url, saving_path, verbose=verbose) diff --git a/deepparse/embeddings_models/bpemb_embeddings_model.py b/deepparse/embeddings_models/bpemb_embeddings_model.py index 414bb78d..4697b175 100644 --- a/deepparse/embeddings_models/bpemb_embeddings_model.py +++ b/deepparse/embeddings_models/bpemb_embeddings_model.py @@ -4,13 +4,18 @@ import requests from bpemb import BPEmb - from numpy.core.multiarray import ndarray from urllib3.exceptions import InsecureRequestWarning from .embeddings_model import EmbeddingsModel +class BPEmbBaseURLWrapperBugFix(BPEmb): + def __init__(self, **kwargs): + self.base_url = "https://bpemb.h-its.org/multi/" + super().__init__(**kwargs) + + class BPEmbEmbeddingsModel(EmbeddingsModel): """ BPEmb embeddings network from `BPEmb: Tokenization-free Pre-trained Subword Embeddings in 275 Languages @@ -19,7 +24,7 @@ class BPEmbEmbeddingsModel(EmbeddingsModel): Params: cache_dir (str): Path to the cache directory to the embeddings' bin vector and the model. - verbose (bool): Wether or not to make the loading of the embeddings verbose. + verbose (bool): Whether or not to make the loading of the embeddings verbose. """ def __init__(self, cache_dir: str, verbose: bool = True) -> None: @@ -31,7 +36,9 @@ def __init__(self, cache_dir: str, verbose: bool = True) -> None: # hotfix until https://github.com/bheinzerling/bpemb/issues/63 # is resolved. with no_ssl_verification(): - model = BPEmb(lang="multi", vs=100000, dim=300, cache_dir=Path(cache_dir)) # defaults parameters + # We use the default parameters other than the dim at 300 and a vs of 100,000 + # We use a BPEmb wrapper since the base URL is broken and the issue is not resolved as of june 23rd. + model = BPEmbBaseURLWrapperBugFix(lang="multi", vs=100000, dim=300, cache_dir=Path(cache_dir)) self.model = model def __call__(self, word: str) -> ndarray: @@ -53,7 +60,7 @@ def no_ssl_verification(): Reference: https://gist.github.com/ChenTanyi/0c47652bd916b61dc196968bca7dad1d. - Will be removed when https://github.com/bheinzerling/bpemb/issues/63 is resolved. + It will be removed when https://github.com/bheinzerling/bpemb/issues/63 is resolved. """ opened_adapters = set() old_merge_environment_settings = requests.Session.merge_environment_settings diff --git a/deepparse/embeddings_models/embeddings_model_factory.py b/deepparse/embeddings_models/embeddings_model_factory.py index 6360f752..c5aae0e2 100644 --- a/deepparse/embeddings_models/embeddings_model_factory.py +++ b/deepparse/embeddings_models/embeddings_model_factory.py @@ -11,12 +11,12 @@ def create(self, embedding_model_type: str, cache_dir: str, verbose: bool = True """ Embeddings model creation method. Args: - embeddings_model_type (str): the type of the embeddings model to create. Valid options: + embedding_model_type (str): the type of the embeddings model to create. Valid options: - bpemb - fasttext - fasttext_magnitude cache_dir (str): Path to the cache directory where the embeddings model exists or is to be downloaded. - verbose (bool): Wether or not to make the loading of the embeddings verbose. + verbose (bool): Whether or not to make the loading of the embeddings verbose. 
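The wrapper above patches the download URL before ``BPEmb.__init__`` runs. A minimal reproduction of that pattern, assuming the ``bpemb`` package is installed and that its ``embed`` method is available::

    from pathlib import Path

    from bpemb import BPEmb

    class PatchedBPEmb(BPEmb):
        def __init__(self, **kwargs):
            # Override the broken base URL before the parent class downloads anything.
            self.base_url = "https://bpemb.h-its.org/multi/"
            super().__init__(**kwargs)

    model = PatchedBPEmb(lang="multi", vs=100000, dim=300, cache_dir=Path("~/.cache/bpemb").expanduser())
    subword_embeddings = model.embed("rue des Lilas")  # one 300-dim vector per subword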
Return: An :class:`~EmbeddingsModel` """ diff --git a/deepparse/embeddings_models/fasttext_embeddings_model.py b/deepparse/embeddings_models/fasttext_embeddings_model.py index b4b34e08..abb1e5a4 100644 --- a/deepparse/embeddings_models/fasttext_embeddings_model.py +++ b/deepparse/embeddings_models/fasttext_embeddings_model.py @@ -18,7 +18,7 @@ class FastTextEmbeddingsModel(EmbeddingsModel): Note: Since Windows uses ``spawn`` instead of ``fork`` during multiprocess (for the data loading pre-processing - ``num_worker`` > 0) we use the Gensim model, which takes more RAM (~10 GO) than the Fasttext one (~8 GO). + ``num_worker`` > 0), we use the Gensim model, which takes more RAM (~10 GO) than the Fasttext one (~8 GO). It also takes a longer time to load. See here the `issue `_. """ @@ -39,7 +39,7 @@ def __call__(self, word: str) -> ndarray: word (str): Word to get vector. Return: - The fastText embedding for a word. + The FastText embedding for a word. """ return self.model[word] diff --git a/deepparse/embeddings_models/magnitude_embeddings_model.py b/deepparse/embeddings_models/magnitude_embeddings_model.py index f2d85720..0cd40ac6 100644 --- a/deepparse/embeddings_models/magnitude_embeddings_model.py +++ b/deepparse/embeddings_models/magnitude_embeddings_model.py @@ -7,7 +7,7 @@ class MagnitudeEmbeddingsModel(EmbeddingsModel): """ FastText embeddings network from `Enriching Word Vectors with Subword Information `_ using the magnitude mapping - (`_), which reduce memory footprint. + (`_), which reduces the memory footprint. Args: embeddings_path (str): Path to the bin embeddings vector (.bin). @@ -20,13 +20,13 @@ def __init__(self, embeddings_path: str, verbose: bool = True) -> None: def __call__(self, words: str) -> ndarray: """ - Callable method to get word vector of a complete address. + Callable method to get the word vector of a complete address. Args: words (str): Address to get vector for words. Return: - The fastText embedding for a list of words. + The FastText embedding for a list of words. """ - # we leverage the multiple word query which are faster than single word query + # We leverage the multiple-word query which is faster than a single word query return self.model.query(words.split()) diff --git a/deepparse/errors/data_error.py b/deepparse/errors/data_error.py index 20f41a9c..e829c06b 100644 --- a/deepparse/errors/data_error.py +++ b/deepparse/errors/data_error.py @@ -1,6 +1,6 @@ class DataError(Exception): """ - User error when data is not construct as expected. + User error occurs when the data structure is not as expected. """ def __init__(self, value: str) -> None: diff --git a/deepparse/errors/model_error.py b/deepparse/errors/model_error.py index 8ee4196a..889b26c1 100644 --- a/deepparse/errors/model_error.py +++ b/deepparse/errors/model_error.py @@ -1,6 +1,6 @@ class FastTextModelError(Exception): """ - User error when user uses a FastText-like model on an OS that does not support properly multithreading. + User error occurs when a user uses a FastText-like model on an OS that does not correctly support multithreading. """ def __init__(self, value: str) -> None: diff --git a/deepparse/errors/server_error.py b/deepparse/errors/server_error.py index 9e98dc02..903e5c13 100644 --- a/deepparse/errors/server_error.py +++ b/deepparse/errors/server_error.py @@ -1,6 +1,6 @@ class ServerError(Exception): """ - User error when Deepparse server is not responding. + User error occurs when the Deepparse server is not responding. 
""" def __init__(self, value: str) -> None: diff --git a/deepparse/metrics/accuracy.py b/deepparse/metrics/accuracy.py index ed8dbeeb..f1ca46af 100644 --- a/deepparse/metrics/accuracy.py +++ b/deepparse/metrics/accuracy.py @@ -2,8 +2,8 @@ from poutyne.framework.metrics import acc -def accuracy(pred: torch.Tensor, ground_truth: torch.Tensor) -> float: +def accuracy(predictions: torch.Tensor, ground_truths: torch.Tensor) -> float: """ Accuracy per tag. """ - return acc(pred.transpose(0, 1).transpose(-1, 1), ground_truth) + return acc(predictions.transpose(0, 1).transpose(-1, 1), ground_truths) diff --git a/deepparse/metrics/nll_loss.py b/deepparse/metrics/nll_loss.py index 4ea1042f..92cf5530 100644 --- a/deepparse/metrics/nll_loss.py +++ b/deepparse/metrics/nll_loss.py @@ -4,13 +4,13 @@ criterion = NLLLoss() -def nll_loss(pred: torch.Tensor, ground_truth: torch.Tensor) -> float: +def nll_loss(predictions: torch.Tensor, ground_truths: torch.Tensor) -> float: """ - NLL loss compute per tag. + NLL loss to compute loss per tag. """ loss = 0 - ground_truth = ground_truth.transpose(0, 1) - for i in range(pred.size(0)): - loss += criterion(pred[i], ground_truth[i]) + ground_truths = ground_truths.transpose(0, 1) + for i in range(predictions.size(0)): + loss += criterion(predictions[i], ground_truths[i]) return loss diff --git a/deepparse/network/bpemb_seq2seq.py b/deepparse/network/bpemb_seq2seq.py index 0195eb31..d08db831 100644 --- a/deepparse/network/bpemb_seq2seq.py +++ b/deepparse/network/bpemb_seq2seq.py @@ -10,22 +10,25 @@ class BPEmbSeq2SeqModel(Seq2SeqModel): """ - BPEmb Seq2Seq network, the best of the two model we propose, but takes more ``GPU``/``CPU`` resources. + BPEmb Seq2Seq network is the best of the two proposed models but takes more ``GPU``/``CPU`` resources. Args: cache_dir (str): The path to the cached directory to use for downloading (and loading) the model weights. - device (~torch.device): The device tu use for the prediction. - input_size (int): The input size of the encoder (i.e. the embeddings size). It will also be used to initialize - the internal embeddings network input size, hidden size and output dim. The default value is 300. - encoder_hidden_size (int): The size of the hidden layer(s) of the encoder. The default value is 1024. - encoder_num_layers (int): The number of hidden layers of the encoder. The default value is 1. - decoder_hidden_size (int): The size of the hidden layer(s) of the decoder. The default value is 1024. - decoder_num_layers (int): The number of hidden layers of the decoder. The default value is 1. - output_size (int): The size of the prediction layers (i.e. the number of tag to predict). - attention_mechanism (bool): Either or not to use attention mechanism. The default value is False. - verbose (bool): Turn on/off the verbosity of the model. The default value is True. - path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. + device (~torch.device): The device to use for the prediction. + input_size (int): The input size of the encoder (i.e. the size of the embedding). It will also be used to + initialize the internal embeddings network input size, hidden size and output dim. The default value is + ``300``. + encoder_hidden_size (int): The size of the hidden layer(s) of the encoder. The default value is ``1024``. + encoder_num_layers (int): The number of hidden layers of the encoder. The default value is ``1``. + decoder_hidden_size (int): The size of the hidden layer(s) of the decoder. 
The default value is ``1024``. + decoder_num_layers (int): The number of hidden layers of the decoder. The default value is ``1``. + output_size (int): The size of the prediction layers (i.e. the number of tags to predict). The default value is + ``9``. + attention_mechanism (bool): Either or not to use the attention mechanism. The default value is ``False``. + verbose (bool): Turn on/off the verbosity of the model. The default value is ``True`. + path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq.`The default + value is ``None``. """ def __init__( @@ -92,9 +95,10 @@ def forward( to_predict (~torch.Tensor): The elements to predict the tags. decomposition_lengths (list) : The lengths of the decomposed words of the batch elements (since packed). lengths (list) : The lengths of the batch elements (since packed). - target (~torch.LongTensor) : The target of the batch element, use only when we retrain the model since we do + target (~torch.LongTensor) : The target of the batch element, used only when we retrain the model since + we do `teacher forcing `_. - Default value is None since we mostly don't have the target except for retrain. + The default value is ``None`` since we mostly don't have the target except for retraining. Return: A Tensor of the predicted sequence. """ diff --git a/deepparse/network/decoder.py b/deepparse/network/decoder.py index f0c347ab..c434344b 100644 --- a/deepparse/network/decoder.py +++ b/deepparse/network/decoder.py @@ -11,7 +11,7 @@ class Decoder(nn.Module): """ - Decoder module that use a LSTM to decode a previously encoded sequence and a linear layer to map + Decoder module that uses a LSTM to decode a previously encoded sequence and a linear layer to map the decoded sequence tags. Args: @@ -33,7 +33,7 @@ def __init__( super().__init__() self.attention_mechanism = attention_mechanism if attention_mechanism: - # Since layer also have attention mechanism + # Since layer also has attention mechanism self.hidden_size = hidden_size input_size = input_size + hidden_size self._attention_mechanism_set_up() diff --git a/deepparse/network/embedding_network.py b/deepparse/network/embedding_network.py index 31721569..297909c1 100644 --- a/deepparse/network/embedding_network.py +++ b/deepparse/network/embedding_network.py @@ -1,7 +1,7 @@ # Bug with PyTorch source code makes torch.tensor as not callable for pylint. # pylint: disable=not-callable -# temporary fix for _forward_unimplemented for PyTorch 1.6 https://github.com/pytorch/pytorch/issues/42305 +# Temporary fix for _forward_unimplemented for PyTorch 1.6 https://github.com/pytorch/pytorch/issues/42305 # pylint: disable=W0223 from typing import Tuple, List @@ -13,14 +13,15 @@ class EmbeddingNetwork(nn.Module): """ - Embedding Network to represent the address components byte-pair embedding representation using a LSTM. + Embedding Network to represent the address components byte-pair embedding representation using an LSTM. Args: input_size (int): The input size of the LSTM. hidden_size (int): The hidden size of the LSTM. - num_layers (int): The number of layer of the LSTM. Default is one (1) layer. - maxpool (bool): Either or not to add a maximum pooling layer after the embedding composition. Default is false. - maxpool_kernel_size (int): The kernel size of the maximum pooling layer. Default is three (3). + num_layers (int): The number of layers of the LSTM. The default value is ``1``, namely one layer. 
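A sketch of the dimensionality bookkeeping behind the attention branch above: the context vector (of size ``hidden_size``) is concatenated to the decoder input, so the LSTM must be created with ``input_size + hidden_size``. Sizes and tensors are illustrative::

    import torch
    import torch.nn as nn

    input_size, hidden_size, batch_size = 300, 1024, 2

    # With attention, the decoder LSTM consumes the previous-step input plus the context vector.
    decoder_lstm = nn.LSTM(input_size + hidden_size, hidden_size, num_layers=1)

    previous_step_input = torch.zeros(1, batch_size, input_size)   # stand-in for the decoder input
    attention_context = torch.zeros(1, batch_size, hidden_size)    # stand-in for the weighted encoder outputs
    decoder_input = torch.cat([previous_step_input, attention_context], dim=2)

    output, _ = decoder_lstm(decoder_input)
    print(output.shape)  # torch.Size([1, 2, 1024])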
+ maxpool (bool): Either or not to add a maximum pooling layer after the embedding composition. The default + value is ``False``. + maxpool_kernel_size (int): The kernel size of the maximum pooling layer. The default value is ``3``. """ def __init__( @@ -73,7 +74,7 @@ def forward(self, to_predict: torch.Tensor, decomposition_lengths: Tuple[List]) for i in range(to_predict.size(0)): lengths = [] - # reorder decomposition, could use a transpose but take a LOT (like a LOT) of memory + # Reorder decomposition, could use a transpose but take a LOT (like a LOT) of memory for decomposition_length in decomposition_lengths: lengths.append(decomposition_length[i]) @@ -86,15 +87,15 @@ def forward(self, to_predict: torch.Tensor, decomposition_lengths: Tuple[List]) packed_output, _ = self.model(packed_sequence) - # pad packed the output to be applied later on in the projection layer + # Pad packed the output to be applied later on in the projection layer. padded_output, padded_output_lengths = pad_packed_sequence(packed_output, batch_first=True) - # filling the embedding by idx + # Filling the embedding by IDX. word_context = torch.zeros(padded_output.size(0), padded_output.size(2), device=device) for j in range(batch_size): word_context[j] = padded_output[j, padded_output_lengths[j] - 1, :] - # projection layer from dim 600 to 300 + # Projection layer from dim 600 to 300. projection_output = self.projection_layer(word_context) if self.maxpooling_layer is not None: diff --git a/deepparse/network/encoder.py b/deepparse/network/encoder.py index 5fafb917..e275d875 100644 --- a/deepparse/network/encoder.py +++ b/deepparse/network/encoder.py @@ -1,4 +1,4 @@ -# temporary fix for _forward_unimplemented for torch 1.6 https://github.com/pytorch/pytorch/issues/42305 +# Temporary fix for _forward_unimplemented for torch 1.6 https://github.com/pytorch/pytorch/issues/42305 # pylint: disable=W0223 from typing import Tuple, List @@ -12,12 +12,12 @@ class Encoder(nn.Module): """ - Encoder module that use a LSTM to encode a sequence. + Encoder module that uses an LSTM to encode a sequence. Args: input_size (int): The input size of the encoder. hidden_size (int): The hidden size of the encoder. - num_layers (int): The number of layer to the encoder. + num_layers (int): The number of layers to the encoder. """ def __init__(self, input_size: int, hidden_size: int, num_layers: int) -> None: diff --git a/deepparse/network/fasttext_seq2seq.py b/deepparse/network/fasttext_seq2seq.py index dd08059a..34ae72e9 100644 --- a/deepparse/network/fasttext_seq2seq.py +++ b/deepparse/network/fasttext_seq2seq.py @@ -9,22 +9,24 @@ class FastTextSeq2SeqModel(Seq2SeqModel): """ - FastText Seq2Seq network, the lightest of the two model we propose (in ``GPU``/``CPU`` consumption) for a little + FastText Seq2Seq network, the lightest of the two models we propose (in ``GPU``/``CPU`` consumption) for a little less accuracy. Args: cache_dir (str): The path to the cached directory to use for downloading (and loading) the model weights. device (~torch.device): The device tu use for the prediction. - input_size (int): The input size of the encoder (i.e. the embeddings size). The default value is 300. - encoder_hidden_size (int): The size of the hidden layer(s) of the encoder. The default value is 1024. - encoder_num_layers (int): The number of hidden layers of the encoder. The default value is 1. - decoder_hidden_size (int): The size of the hidden layer(s) of the decoder. The default value is 1024. 
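A toy-sized sketch of the "last non-padded timestep" trick used in the forward pass above: after ``pad_packed_sequence``, each element's real last output sits at position ``length - 1``. Dimensions are illustrative::

    import torch
    import torch.nn as nn
    from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

    lstm = nn.LSTM(input_size=4, hidden_size=6, batch_first=True)

    batch = torch.rand(3, 5, 4)        # 3 words, up to 5 subwords, 4-dim subword embeddings
    lengths = torch.tensor([5, 3, 2])  # real number of subwords per word

    packed = pack_padded_sequence(batch, lengths, batch_first=True, enforce_sorted=False)
    packed_output, _ = lstm(packed)
    padded_output, padded_lengths = pad_packed_sequence(packed_output, batch_first=True)

    # Keep only the last valid timestep of each sequence.
    word_context = torch.zeros(padded_output.size(0), padded_output.size(2))
    for j in range(padded_output.size(0)):
        word_context[j] = padded_output[j, padded_lengths[j] - 1, :]

    print(word_context.shape)  # torch.Size([3, 6])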
- decoder_num_layers (int): The number of hidden layers of the decoder. The default value is 1. - output_size (int): The size of the prediction layers (i.e. the number of tag to predict). - attention_mechanism (bool): Either or not to use attention mechanism. The default value is False. - verbose (bool): Turn on/off the verbosity of the model. The default value is True. - path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. + input_size (int): The input size of the encoder (i.e. the size of the embedding). The default value is ``300``. + encoder_hidden_size (int): The size of the encoder's hidden layer(s). The default value is ``1024``. + encoder_num_layers (int): The number of hidden layers of the encoder. The default value is ``1``. + decoder_hidden_size (int): The size of the decoder's hidden layer(s). The default value is ``1024``. + decoder_num_layers (int): The number of hidden layers of the decoder. The default value is ``1``. + output_size (int): The size of the prediction layers (i.e. the number of tags to predict). The default value + is ``9``. + attention_mechanism (bool): Either or not to use the attention mechanism. The default value is ``False``. + verbose (bool): Turn on/off the verbosity of the model. The default value is ``True``. + path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. The default + value is ``None``. """ def __init__( @@ -88,7 +90,7 @@ def forward( lengths (list) : The lengths of the batch elements (since packed). target (~torch.LongTensor) : The target of the batch element, use only when we retrain the model since we do `teacher forcing `_. - Default value is None since we mostly don't have the target except for retrain. + The default value is ``None`` since we mostly don't have the target except for retrain. Return: A Tensor of the predicted sequence. diff --git a/deepparse/network/model_factory.py b/deepparse/network/model_factory.py index 4893a9c7..a107ab07 100644 --- a/deepparse/network/model_factory.py +++ b/deepparse/network/model_factory.py @@ -8,7 +8,7 @@ class ModelFactory: """ - A factory for the creation of neural network models that predict the tags from addresses + A factory for creating neural network models that predict the tags from addresses. """ def create( @@ -32,12 +32,14 @@ def create( - bpemb cache_dir (str): The path to the cached directory to use for downloading (and loading) the model weights. - device (~torch.device): The device tu use for the prediction. - output_size (int): The size of the prediction layers (i.e. the number of tag to predict). - attention_mechanism (bool): Either or not to use attention mechanism. The default value is False. - path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. - offline (bool): Wether or not the model is an offline or an online. - verbose (bool): Turn on/off the verbosity of the model. The default value is True. + device (~torch.device): The device to use for the prediction. + output_size (int): The size of the prediction layers (i.e. the number of tags to predict). The default + value is ``9``. + attention_mechanism (bool): Either or not to use the attention mechanism. The default value is ``False``. + path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. The + default value is ``None``. + offline (bool): Whether or not the model is an offline or an online. The default value is ``False``. 
+ verbose (bool): Turn on/off the verbosity of the model. The default value is ``True``. Return: A :class:`~Seq2SeqModel`. @@ -69,7 +71,7 @@ def create( else: raise NotImplementedError( f""" - There is no {model_type} network implemented. model_type should be either fasttext or bpemb + There is no {model_type} network implemented. model_type should be either "fasttext" or "bpemb". """ ) diff --git a/deepparse/network/seq2seq.py b/deepparse/network/seq2seq.py index faf5b808..d222b462 100644 --- a/deepparse/network/seq2seq.py +++ b/deepparse/network/seq2seq.py @@ -21,14 +21,15 @@ class Seq2SeqModel(ABC, nn.Module): Args: device (~torch.device): The device tu use for the prediction. - input_size (int): The input size of the encoder (i.e. the embeddings size). The default value is 300. - encoder_hidden_size (int): The size of the hidden layer(s) of the encoder. The default value is 1024. - encoder_num_layers (int): The number of hidden layers of the encoder. The default value is 1. - decoder_hidden_size (int): The size of the hidden layer(s) of the decoder. The default value is 1024. - decoder_num_layers (int): The number of hidden layers of the decoder. The default value is 1. - output_size (int): The size of the prediction layers (i.e. the number of tag to predict). - attention_mechanism (bool): Either or not to use attention mechanism. The default value is False. - verbose (bool): Turn on/off the verbosity of the model. The default value is True. + input_size (int): The input size of the encoder (i.e. the size of the embedding). The default value is ``300``. + encoder_hidden_size (int): The size of the encoder's hidden layer(s). The default value is ``1024``. + encoder_num_layers (int): The number of hidden layers of the encoder. The default value is ``1``. + decoder_hidden_size (int): The size of the decoder's hidden layer(s). The default value is ``1024``. + decoder_num_layers (int): The number of hidden layers of the decoder. The default value is ``1``. + output_size (int): The size of the prediction layers (i.e. the number of tags to predict). The default value is + ``9``. + attention_mechanism (bool): Either or not to use the attention mechanism. The default value is ``False``. + verbose (bool): Turn on/off the verbosity of the model. The default value is ``True``. """ def __init__( @@ -80,14 +81,14 @@ def same_output_dim(self, size: int) -> bool: def handle_new_output_dim(self, new_dim: int) -> None: """ - Update the new output dimension + Update the new output dimension. """ self.decoder.linear_layer_set_up(output_size=new_dim) self.output_size = new_dim def _load_pre_trained_weights(self, model_type: str, cache_dir: str, offline: bool) -> None: """ - Method to download and resolved the loading (into the network) of the pretrained weights. + Method to download and resolve the loading (into the network) of the pre-trained weights. Args: model_type (str): The network pretrained weights to load. @@ -126,7 +127,7 @@ def _load_weights(self, path_to_model_torch_archive: str) -> None: path_to_model_to_upload=path_to_model_torch_archive, device=self.device ) - # All the time, our torch archive include meta-data along with the model weights + # All the time, our torch archive includes meta-data along with the model weights. 
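A sketch of the archive layout described in the comment above: metadata saved alongside the weights, with the state dict stored under the ``"address_tagger_model"`` key. The metadata key and file name below are illustrative::

    import torch
    import torch.nn as nn

    # Build a tiny stand-in model and save it in the same layout.
    model = nn.Linear(4, 2)
    torch.save({"some_metadata": "fasttext", "address_tagger_model": model.state_dict()}, "archive.ckpt")

    # Loading mirrors _load_weights: read the archive, then pick the state dict out of it.
    checkpoint = torch.load("archive.ckpt", map_location="cpu")
    model.load_state_dict(checkpoint.get("address_tagger_model"))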
all_layers_params = all_layers_params.get("address_tagger_model") self.load_state_dict(all_layers_params) @@ -154,11 +155,11 @@ def _encoder_step(self, to_predict: torch.Tensor, lengths: List, batch_size: int Args: to_predict (~torch.Tensor): The elements to predict the tags. lengths (list): The lengths of the batch elements (since packed). - batch_size (int): The number of element in the batch. + batch_size (int): The number of elements in the batch. Return: A tuple (``x``, ``y``, ``z``) where ``x`` is the decoder input (a zeros tensor), ``y`` is the decoder - hidden states and ``z`` is the encoder outputs for the attention weighs if needed. + hidden states, and ``z`` is the encoder output for the attention weighs if needed. """ encoder_outputs, decoder_hidden = self.encoder(to_predict, lengths) @@ -181,41 +182,42 @@ def _decoder_step( Args: decoder_input (~torch.Tensor): The decoder input (so the encode output). - decoder_hidden (~torch.Tensor): The encoder hidden state (so the encode hidden state). + decoder_hidden (~torch.Tensor): The encoder's hidden state (so the encode hidden state). encoder_outputs (~torch.Tensor): The encoder outputs for the attention mechanism weighs if needed. - target (~torch.LongTensor) : The target of the batch element, use only when we retrain the model since we do + target (~torch.LongTensor) : The target of the batch element, used only when we retrain the model since + we do `teacher forcing `_. lengths (list): The lengths of the batch elements (since packed). - batch_size (int): Number of element in the batch. + batch_size (int): Number of elements in the batch. Return: A Tensor of the predicted sequence. """ longest_sequence_length = max(lengths) - # The empty prediction sequence - # +1 for the EOS + # The empty prediction sequence. + # +1 for the EOS. prediction_sequence = torch.zeros(longest_sequence_length + 1, batch_size, self.output_size, device=self.device) - # We decode the first token + # We decode the first token. decoder_output, decoder_hidden, attention_weights = self.decoder( decoder_input, decoder_hidden, encoder_outputs, lengths ) if attention_weights is not None: - # We fill the attention + # We fill the attention. attention_output = torch.ones(longest_sequence_length + 1, batch_size, 1, longest_sequence_length) attention_output[0] = attention_weights - # We fill the first token prediction + # We fill the first token prediction. prediction_sequence[0] = decoder_output - # The decoder next step input (the predicted idx of the previous token) + # The decoder's next step input (the predicted idx of the previous token). _, decoder_input = decoder_output.topk(1) - # we loop the same steps for the rest of the sequence + # We loop the same steps for the rest of the sequence. if target is not None and random.random() < 0.5: - # force the real target value instead of the predicted one to help learning + # Force the real target value instead of the predicted one to help learning. target = target.transpose(0, 1) for idx in range(longest_sequence_length): decoder_input = target[idx].view(1, batch_size, 1) diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index ceba52e0..2b9bf789 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -4,7 +4,6 @@ # It must be due to the complex try, except else case. 
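A compact sketch of the teacher-forcing behaviour implemented in ``_decoder_step``: with probability 0.5, the decoder is fed the ground-truth token instead of its own previous prediction for the whole sequence. The decoder here is a stand-in, not the real network::

    import random

    import torch

    def decode_sequence(decoder_step, first_input, targets, longest_sequence_length):
        predictions = []
        decoder_input = first_input
        # The coin flip is done once and applies to the entire sequence.
        use_teacher_forcing = targets is not None and random.random() < 0.5
        for idx in range(longest_sequence_length):
            decoder_output = decoder_step(decoder_input)
            predictions.append(decoder_output)
            if use_teacher_forcing:
                decoder_input = targets[idx]                   # feed the ground truth
            else:
                _, decoder_input = decoder_output.topk(1)      # feed the model's own prediction
        return torch.stack(predictions)

    # Dummy "decoder" that returns random scores over 9 tags for a batch of 4.
    dummy_step = lambda _: torch.rand(4, 9)
    targets = torch.randint(0, 9, (6, 4, 1))
    print(decode_sequence(dummy_step, torch.zeros(4, 1), targets, 6).shape)  # torch.Size([6, 4, 9])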
# pylint: disable=inconsistent-return-statements -import contextlib import os import re import warnings @@ -19,11 +18,7 @@ from torch.optim import SGD from torch.utils.data import DataLoader, Subset -from ..download_tools import CACHE_PATH -from ..pre_processing.pre_processor_list import PreProcessorList -from ..validations import valid_poutyne_version from . import formatted_parsed_address -from .capturing import Capturing from .formatted_parsed_address import FormattedParsedAddress from .tools import ( get_address_parser_in_directory, @@ -39,13 +34,15 @@ from .. import validate_data_to_parse from ..converter import TagsConverter, DataProcessorFactory, DataPadder from ..dataset_container import DatasetContainer +from ..download_tools import CACHE_PATH from ..embeddings_models import EmbeddingsModelFactory from ..errors import FastTextModelError from ..metrics import nll_loss, accuracy from ..network import ModelFactory from ..pre_processing import coma_cleaning, lower_cleaning, hyphen_cleaning from ..pre_processing import trailing_whitespace_cleaning, double_whitespaces_cleaning - +from ..pre_processing.pre_processor_list import PreProcessorList +from ..validations import valid_poutyne_version from ..vectorizer import VectorizerFactory from ..weights_tools import handle_weights_upload @@ -95,8 +92,8 @@ class AddressParser: ``None``. To further improve performance, consider using the models (fasttext or BPEmb) with their counterparts using an attention mechanism with the ``attention_mechanism`` flag. attention_mechanism (bool): Whether to use the model with an attention mechanism. The model will use an - attention mechanism that takes an extra 100 MB on GPU usage (see the doc for more statistics). - The default value is False. + attention mechanism that takes an extra 100 MB on GPU usage (see the documentation for more statistics). + The default value is ``False``. device (Union[int, str, torch.torch.device]): The device to use can be either: - a ``GPU`` index in int format (e.g. ``0``), @@ -104,28 +101,31 @@ class AddressParser: - a :class:`~torch.torch.device` object, - ``"cpu"`` for a ``CPU`` use. - The default value is GPU with the index ``0`` if it exists. Otherwise, the value is ``CPU``. - rounding (int): The rounding to use when asking the probability of the tags. The default value is four digits. - verbose (bool): Turn on/off the verbosity of the model weights download and loading. The default value is True. + The default value is ``0``, witch is a GPU device with the index ``0`` if it exists. Otherwise, + the value is ``CPU``. + rounding (int): The rounding to use when asking the probability of the tags. The default value is ``4``, + namely four digits. + verbose (bool): Turn on/off the verbosity of the model weights download and loading. The default value is + ``True``. path_to_retrained_model (Union[S3Path, str, None]): The path to the retrained model to use for prediction. We will infer the ``model_type`` of the retrained model. The default value is ``None``, meaning we use our pretrained model. If the retrained model uses an attention mechanism, ``attention_mechanism`` needs to be set to True. The path_to_retrain_model can also be a S3-like (Azure, AWS, Google) bucket URI string path (e.g. ``"s3://path/to/aws/s3/bucket.ckpt"``). Or it can be a ``S3Path`` S3-like URI using `cloudpathlib` to handle S3-like bucket. See `cloudpathlib ` - for detail on supported S3 buckets provider and URI condition. The default value is None. 
+ for detail on supported S3 buckets provider and URI condition. The default value is ``None``. cache_dir (Union[str, None]): The path to the cached directory to use for downloading (and loading) the embeddings model and the model pretrained weights. offline (bool): Whether or not the model is an offline one, meaning you have already downloaded the pre-trained weights and embeddings weights in either the default Deepparse cache directory (``"~./cache/deepparse"``) or the ``cache_dir`` directory. When offline, we will not verify if the model is the latest. You can use our - ``download_models`` CLI function to download all the requirements for a model. The default value is False - (not an offline parsing model). + ``download_models`` CLI function to download all the requirements for a model. The default value is + ``False`` (not an offline parsing model). Note: For both networks, we will download the pretrained weights and embeddings in the ``.cache`` directory - for the root user. The pretrained weights take at most 44 MB. The fastText embeddings take 6.8 GO, - the fastText-light embeddings take 3.3 GO and bpemb take 116 MB (in ``".cache/bpemb"``). + for the root user. The pretrained weights take at most 44 MB. The FastText embeddings take 6.8 GO, + the FastText-light (``"fasttext-light"``) embeddings take 3.3 GO and bpemb take 116 MB (in ``".cache/bpemb"``). Also, one can download all the dependencies of our pretrained model using our CLI (e.g. download_model fasttext) before sending it to a node without access to Internet. @@ -788,14 +788,12 @@ def retrain( verbose = self.verbose try: - with_capturing_context = False if not valid_poutyne_version(min_major=1, min_minor=8): - print( + raise ImportError( "You are using an older version of Poutyne that does not support proper error management." - " Due to that, we cannot show retrain progress. To fix that, update Poutyne to " + " Due to that, we cannot show retrain progress. To fix that, please update Poutyne to " "the newest version." ) - with_capturing_context = True train_res = self._retrain( experiment=exp, train_generator=train_generator, @@ -804,7 +802,6 @@ def retrain( seed=seed, callbacks=callbacks, disable_tensorboard=disable_tensorboard, - capturing_context=with_capturing_context, verbose=verbose, ) except RuntimeError as error: @@ -858,9 +855,11 @@ def retrain( torch_save.update( { - "named_parser": name_of_the_retrain_parser - if name_of_the_retrain_parser is not None - else self._formatted_named_parser_name(prediction_tags, seq2seq_params, layers_to_freeze) + "named_parser": ( + name_of_the_retrain_parser + if name_of_the_retrain_parser is not None + else self._formatted_named_parser_name(prediction_tags, seq2seq_params, layers_to_freeze) + ) } ) @@ -914,7 +913,7 @@ def test( seed (int): Seed to use (by default, ``42``). verbose (Union[None, bool]): To override the AddressParser verbosity for the test. When set to True or False, it will override (but it does not change the AddressParser verbosity) the test verbosity. - If set to the default value None, the AddressParser verbosity is used as the test verbosity. + If set to the default value ``None``, the AddressParser verbosity is used as the test verbosity. Return: A dictionary with the stats (see `Experiment class @@ -964,7 +963,7 @@ def test( if "fasttext-light" in self.model_type: raise FastTextModelError( "It's not possible to test a fasttext-light due to pymagnitude problem. " - "See the Retrain method doc for more details." 
+ "See the Retrain method documentation for more details." ) if not isinstance(test_dataset_container, DatasetContainer): @@ -1192,22 +1191,18 @@ def _retrain( seed: int, callbacks: List, disable_tensorboard: bool, - capturing_context: bool, verbose: Union[None, bool], ) -> List[Dict]: # pylint: disable=too-many-arguments - # If Poutyne 1.7 and before, we capture poutyne print since it prints some exception. - # Otherwise, we use a null context manager. - with Capturing() if capturing_context else contextlib.nullcontext(): - train_res = experiment.train( - train_generator, - valid_generator=valid_generator, - epochs=epochs, - seed=seed, - callbacks=callbacks, - disable_tensorboard=disable_tensorboard, - verbose=verbose, - ) + train_res = experiment.train( + train_generator, + valid_generator=valid_generator, + epochs=epochs, + seed=seed, + callbacks=callbacks, + disable_tensorboard=disable_tensorboard, + verbose=verbose, + ) return train_res def _freeze_model_params(self, layers_to_freeze: Union[str]) -> None: @@ -1215,7 +1210,7 @@ def _freeze_model_params(self, layers_to_freeze: Union[str]) -> None: if layers_to_freeze not in ("encoder", "decoder", "prediction_layer", "seq2seq"): raise ValueError( f"{layers_to_freeze} freezing setting is not supported. Value can be 'encoder', 'decoder', " - f"'prediction_layer' and 'seq2seq'. See doc for more details." + f"'prediction_layer' and 'seq2seq'. See documentation for more details." ) layer_exclude = None if layers_to_freeze == "decoder": @@ -1271,7 +1266,7 @@ def _retrain_argumentation_validations( if "fasttext-light" in self.model_type: raise FastTextModelError( "It's not possible to retrain a fasttext-light due to pymagnitude problem. " - "See the Retrain method doc for more details." + "See the Retrain method documentation for more details." ) if not isinstance(train_dataset_container, DatasetContainer): diff --git a/deepparse/parser/formatted_parsed_address.py b/deepparse/parser/formatted_parsed_address.py index 9a013741..28265882 100644 --- a/deepparse/parser/formatted_parsed_address.py +++ b/deepparse/parser/formatted_parsed_address.py @@ -86,7 +86,7 @@ def __repr__(self) -> str: def __eq__(self, other) -> bool: """ - Equal if all address components elements are equals. If attributes are not the same, will return False. + Equal if all address components elements are equals. If attributes are not the same, it will return False. """ for field in self.__dict__: address_component = getattr(self, field) @@ -114,7 +114,7 @@ def format_address( Args: fields (Union[list, None]): Optional argument to define the fields to order the address components of - the address. If None, we will use the inferred order base on the address tags appearance. For example, + the address. If None, we will use the inferred order based on the address tags' appearance. For example, if the parsed address is ``(305, StreetNumber), (rue, StreetName), (des, StreetName), (Lilas, StreetName)``, the inferred order will be ``StreetNumber, StreetName``. 
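A minimal usage sketch of the parser together with ``format_address`` and ``to_pandas`` as documented in this class (the address string and the selected fields are illustrative):

.. code-block:: python

    from deepparse.parser import AddressParser

    address_parser = AddressParser(model_type="bpemb", device="cpu")

    parsed_address = address_parser("350 rue des Lilas Ouest Quebec Quebec G1L 1B6")

    # Format the components in the given order, capitalizing the street name.
    print(parsed_address.format_address(fields=["StreetNumber", "StreetName"],
                                        capitalize_fields=["StreetName"]))

    # Dictionary view suitable for pandas, e.g. {'Address': '...', 'StreetNumber': '350', ...}.
    print(parsed_address.to_pandas())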
capitalize_fields (Union[list, None]): Optional argument to define the capitalized fields for the formatted @@ -138,7 +138,7 @@ def format_address( # > 350, rue des lilas, ouest, quebec city, quebec, g1l 1b6 parse_address.formatted_address(fields_separator=", ", capitalize_fields=["StreetName", "Orientation"]) - # > 350, Rue des lilas, Ouest, quebec city, quebec, g1l 1b6 + # > 350, rue des lilas, ouest, quebec city, quebec, g1l 1b6 parse_address.formatted_address(fields_separator=", ", upper_case_fields=["PostalCode""]) # > 350 rue des lilas ouest quebec city quebec G1L 1B6 @@ -214,7 +214,7 @@ def to_list_of_tuples(self, fields: Union[List, None] = None) -> List[tuple]: def to_pandas(self) -> Dict: """ Method to convert a parsed address into a dictionary for pandas where the first key is the raw address and - the followings keys are the address components, and the values are the value of those components. + the following keys are the address components, and the values are the values of those components. For example, the parsed address `` 305 rue des Lilas`` will be converted into the following dictionary: ``{'Address': '305 rue des Lilas', 'StreetNumber':'305', 'StreetName': 'rue des Lilas'}``. @@ -228,7 +228,7 @@ def to_pandas(self) -> Dict: def to_pickle(self) -> Tuple[str, List]: """ Method to convert a parsed address into a list of tuple for pickle where the first tuple element is the - raw address and the followings tuples are the address components, and the values are the value of + raw address and the following tuples are the address components, and the values are the values of those components. For example, the parsed address `` 305 rue des Lilas`` will be converted into the following list of tuples: ``'305 rue des Lilas', ('305', 'StreetNumber'), ('rue des Lilas', 'StreetName')]``. diff --git a/deepparse/validations.py b/deepparse/validations.py index e2007944..a6c6d555 100644 --- a/deepparse/validations.py +++ b/deepparse/validations.py @@ -12,7 +12,7 @@ def extract_package_version(package) -> str: """ - Handle the retrieval of the major and minor version part of a Python package. + Handle the retrieval of a Python package's major and minor version parts. """ full_version = package.version.__version__ components_parts = full_version.split(".") @@ -24,8 +24,8 @@ def extract_package_version(package) -> str: def valid_poutyne_version(min_major: int = 1, min_minor: int = 2) -> bool: """ - Validate Poutyne version is greater than min_major.min_minor for using a str checkpoint. Some version before - does not support all the features we need. By default, min_major.min_minor equal version 1.2 which is the + Validate that the Poutyne version is greater than min_major.min_minor for using a str checkpoint. Some versions + do not support all the features we need. By default, min_major.min_minor equals version 1.2, which is the lowest version we can use. """ version_components = extract_package_version(package=poutyne).split(".") @@ -45,13 +45,13 @@ def validate_data_to_parse(addresses_to_parse: List) -> None: """ Validation tests on the addresses to parse to respect the following two criteria: - addresses are not tuple, - - no addresses are None value, - - no addresses are empty strings, and - - no addresses are whitespace-only strings. + - no address is a ``None`` value, + - no address is empty, and + - no address is composed of only whitespace. """ if isinstance(addresses_to_parse[0], tuple): raise DataError( - "Addresses to parsed are tuples. They need to be a list of string. 
Are you using training data?" + "Addresses to parsed are tuples. They need to be a list of strings. Are you using training data?" ) if validate_if_any_none(addresses_to_parse): raise DataError("Some addresses are None value.") diff --git a/deepparse/vectorizer/bpemb_vectorizer.py b/deepparse/vectorizer/bpemb_vectorizer.py index a20d4c12..69ca91dd 100644 --- a/deepparse/vectorizer/bpemb_vectorizer.py +++ b/deepparse/vectorizer/bpemb_vectorizer.py @@ -50,7 +50,7 @@ def _vectorize_sequence(self, address: str) -> Tuple[List, List]: address (str): Address to vectorize using BPEmb. Return: - A tuple of list of word vector and the word decomposition lengths. + A tuple of the list of word vectors and the word decomposition lengths. """ input_sequence = [] diff --git a/deepparse/vectorizer/fasttext_vectorizer.py b/deepparse/vectorizer/fasttext_vectorizer.py index f6506a36..8fcc1990 100644 --- a/deepparse/vectorizer/fasttext_vectorizer.py +++ b/deepparse/vectorizer/fasttext_vectorizer.py @@ -6,7 +6,7 @@ class FastTextVectorizer(Vectorizer): """ - FastText vectorizer to convert an address into fastText embeddings. + FastText vectorizer to convert an address into FastText embeddings. """ def __call__(self, addresses: List[str]) -> List: @@ -28,7 +28,7 @@ def _vectorize_sequence(self, address: str) -> List: Method to vectorize the address. Args: - address (str): Address to vectorize using fastText. + address (str): Address to vectorize using FastText. Return: A list of word vector. diff --git a/deepparse/vectorizer/magnitude_vectorizer.py b/deepparse/vectorizer/magnitude_vectorizer.py index ded630be..0ffff493 100644 --- a/deepparse/vectorizer/magnitude_vectorizer.py +++ b/deepparse/vectorizer/magnitude_vectorizer.py @@ -8,7 +8,7 @@ class MagnitudeVectorizer(Vectorizer): """ - FastText Magnitude vectorizer to convert an address into fastText embeddings using magnitude mapping. + FastText Magnitude vectorizer to convert an address into FastText embeddings using magnitude mapping. """ def __call__(self, addresses: List[str]) -> List: @@ -30,7 +30,7 @@ def _vectorize_sequence(self, address: str) -> ndarray: Method to vectorize the address. Args: - address (str): Address to vectorize using fastText. + address (str): Address to vectorize using FastText. Return: A list of word vector. diff --git a/deepparse/weights_tools.py b/deepparse/weights_tools.py index dd5831bd..b8e8e238 100644 --- a/deepparse/weights_tools.py +++ b/deepparse/weights_tools.py @@ -8,7 +8,7 @@ def weights_init(m: nn.Module) -> None: """ - Function to initialize the weights of a model layers. + Function to initialize the weights of model layers. Usage: network = Model() @@ -50,7 +50,7 @@ def handle_weights_upload( except FileNotFoundError as error: if "s3" in path_to_model_to_upload or "//" in path_to_model_to_upload or ":" in path_to_model_to_upload: raise FileNotFoundError( - "Are You trying to use a AWS S3 URI? If so path need to start with s3://." + "Are You trying to use an AWS S3 URI? If so, the path needs to start with s3://." ) from error raise FileNotFoundError(f"The file {path_to_model_to_upload} was not found.") from error return checkpoint_weights diff --git a/docs/source/api.rst b/docs/source/api.rst index c39ff937..52b343f1 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -4,13 +4,13 @@ Parse Address With Our Out-Of-The-Box API ========================================= -We also offer an out-of-the-box RESTAPI to parse addresses using FastAPI. 
+We also offer an out-of-the-box REST API to parse addresses using FastAPI. Installation ************ -First, ensure that you have Docker Engine and Docker Compose installed on your machine. -If not, you can install them using the following documentations in the following order: +First, ensure you have Docker Engine and Docker Compose installed on your machine. +If not, you can install them using the following documentation in the following order: 1. `Docker Engine `_ 2. `Docker Compose `_ @@ -24,7 +24,7 @@ Once you have Docker Engine and Docker Compose installed, you can run the follow Sentry ****** -Also, you can monitor your application usage with `Sentry `_ by setting the environment variable ``SENTRY_DSN`` to your Sentry's project +Also, you can monitor your application usage with `Sentry `_ by setting the environment variable ``SENTRY_DSN`` to your Sentry project DSN. There is an example of the ``.env`` file in the project's root named ``.env_example``. You can copy it using the following command: .. code-block:: sh @@ -34,7 +34,7 @@ DSN. There is an example of the ``.env`` file in the project's root named ``.env Request Examples ---------------- -Once the application is up and running and port ``8000`` is exported on your localhost, you can send a request with one +Once the application is up and running and port ``8000`` is exported on your ``localhost``, you can send a request with one of the following methods: cURL POST request @@ -65,4 +65,4 @@ Python POST request response = requests.post(url, json=addresses) parsed_addresses = response.json() - print(parsed_addresses) \ No newline at end of file + print(parsed_addresses) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 60f52be0..09d50a09 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -13,17 +13,17 @@ The parsing of the addresses to parse ``dataset_path`` is done using the selecte The exported parsed addresses are to be exported in the same directory as the addresses to parse but given the ``export_file_name`` using the encoding format of the address dataset file. For example, if the dataset is in a CSV format, the output file format will be a CSV. Moreover, by default, -we log some information (``--log``) such as the parser model name, the parsed dataset path +we log some information (``--log``), such as the parser model name, the parsed dataset path and the number of parsed addresses. Here is the list of the arguments, their descriptions and default values. One can use the command ``parse --help`` to output the same description in your command line. - ``parsing_model``: The parsing module to use. - ``dataset_path``: The path to the dataset file in a pickle (``.p``, ``.pickle`` or ``.pckl``) or CSV format. - - ``export_file_name``: The filename to use to export the parsed addresses. We will infer the file format base on the file extension. That is, if the file is a pickle (``.p`` or ``.pickle``), we will export it into a pickle file. The supported formats are Pickle, CSV and JSON. The file will be exported in the same repositories as the dataset_path. See the doc for more details on the format exporting. + - ``export_file_name``: The filename to use to export the parsed addresses. We will infer the file format base on the file extension. That is, if the file is a pickle (``.p`` or ``.pickle``), we will export it into a pickle file. The supported formats are Pickle, CSV and JSON. The file will be exported in the same repositories as the dataset_path. 
See the documentation for more details on the format exporting. - ``--device``: The device to use. It can be 'cpu' or a GPU device index such as ``'0'`` or ``'1'``. By default, ``'0'``. - ``--batch_size``: The batch size to use to process the dataset. By default, ``32``. - ``--path_to_retrained_model``: A path to a retrained model to use for parsing. By default, ``None``. - - ``--csv_column_name``: The column name to extract address in the CSV. Need to be specified if the provided ``dataset_path`` leads to a CSV file. By default, ``None``. + - ``--csv_column_name``: The column name to extract address in the CSV. It needs to be specified if the provided ``dataset_path`` leads to a CSV file. By default, ``None``. - ``--csv_column_separator``: The column separator for the dataset container will only be used if the dataset is a CSV one. By default, ``'\t'``. - ``--log``: Either or not to log the parsing process into a ``.log`` file exported at the same place as the parsed data using the same name as the export file. The bool value can be (not case sensitive) ``'true/false'``, ``'t/f'``, ``'yes/no'``, ``'y/n'`` or ``'0/1'``. By default, ``True``. - ``--cache_dir``: To change the default cache directory (default to ``None``, e.g. default path). @@ -40,7 +40,7 @@ We support three types of export formats: CSV, Pickle and JSON. The first export uses the following pattern column pattern: ``"Address", "First address components class", "Second class", ...``. -Which means the address ``305 rue des Lilas 0 app 2`` will output the table bellow +Which means the address ``305 rue des Lilas 0 app 2`` will output the table below using our default tags: .. list-table:: @@ -65,17 +65,17 @@ using our default tags: - None - None -The second export uses a similar approach but using tuples and list. Using the same example will return the following +The second export uses a similar approach but uses tuples and lists. Using the same example will return the following tuple ``("305 rue des Lilas 0 app 2", [("305", "StreetNumber"), ("rue des lilas", "StreetName"), ...])``. The third export uses a similar approach to the CSV format but uses dictionary-like formatting. Using the -same example will return the following dict ``{"Address": "305 rue des Lilas 0 app 2", "StreetNumber": "305", ...}``. +same example will return the following dictionary ``{"Address": "305 rue des Lilas 0 app 2", "StreetNumber": "305", ...}``. Retrain ******* This command allows a user to retrain the ``base_parsing_model`` on the ``train_dataset_path`` dataset. -For the training, the CSV or Pickle dataset is loader in a specific dataloader (see +For the training, the CSV or Pickle dataset is loaded in a specific dataloader (see :class:`~deepparse.dataset_container.DatasetContainer` for more details). We use Poutyne's automatic logging functionalities during training. Thus, it creates an epoch checkpoint and outputs the epoch metrics in a TSV file. Moreover, we save the best epoch model under the retrain model name (either the default one or a given name using @@ -94,11 +94,11 @@ One can use the command ``parse --help`` to output the same description in your - ``--disable_tensorboard``: To disable Poutyne automatic Tensorboard monitoring. By default, we disable them (``True``). - ``--layers_to_freeze``: Name of the portion of the seq2seq to freeze layers, thus reducing the number of parameters to learn. Default to ``None``. 
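For reference, a complete invocation of the ``parse`` command described earlier could look like the following sketch (paths and the CSV column name are illustrative):

.. code-block:: sh

    # Parse a CSV dataset with the fasttext model on CPU; the export format is
    # inferred from the export file extension.
    parse fasttext ./addresses_to_parse.csv parsed_addresses.csv --csv_column_name Address --device cpu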
- ``--name_of_the_retrain_parser``: Name to give to the retrained parser that will be used when reloaded as the printed name, and to the saving file name. By default, ``None``, thus, the default name. See the complete parser retrain method for more details. - - ``--device``: The device to use. It can be ``'cpu'`` or a GPU device index such as ``'0'`` or ``'1'``. By default ``'0'``. - - ``--csv_column_names``: The column names to extract address in the CSV. Need to be specified if the provided dataset_path leads to a CSV file. Column names have to be separated by whitespace. For example, ``--csv_column_names column1 column2``. + - ``--device``: The device to use. It can be ``'cpu'`` or a GPU device index such as ``'0'`` or ``'1'``. By default, ``'0'``. + - ``--csv_column_names``: The column names to extract the address in the CSV. It must be specified if the provided dataset_path leads to a CSV file. Column names have to be separated by whitespace. For example, ``--csv_column_names column1 column2``. - ``--csv_column_separator``: The column separator for the dataset container will only be used if the dataset is a CSV one. By default, ``'\t'``. - ``--cache_dir``: To change the default cache directory (default to ``None``, e.g. default path). - - ``prediction_tags``: To change the prediction tags. The ``prediction_tags`` is a path leading to a JSON file of the new tags in a key-value style. For example, the path can be ``"a_path/file.json"`` and the content can be ``{"new_tag": 0, "other_tag": 1, "EOS": 2}``. + - ``prediction_tags``: To change the prediction tags. The ``prediction_tags`` path leads to a JSON file of the new tags in a key-value style. For example, the path can be ``"a_path/file.json"`` and the content can be ``{"new_tag": 0, "other_tag": 1, "EOS": 2}``. .. autofunction:: deepparse.cli.retrain.main @@ -109,9 +109,9 @@ Test This command allows a user to test the ``base_parsing_model`` (or the retrained one using the ``--path_to_retrained_model``) on the ``train_dataset_path`` dataset. -For the testing, the CSV or Pickle dataset is loader in a specific dataloader (see +For the testing, the CSV or Pickle dataset is loaded in a specific dataloader (see :class:`~deepparse.dataset_container.DatasetContainer` for more details). Moreover, by default, -we log some information (``--log``) such as the tested address parser model name and the parsed dataset path. Plus, +we log some information (``--log``), such as the tested address parser model name and the parsed dataset path. Plus, we also log the testing results in a TSV file. The two files are exported at the same path as the testing dataset. Here is the list of the arguments, their descriptions and default values. One can use the command ``parse --help`` to output the same description in your command line. @@ -123,7 +123,7 @@ One can use the command ``parse --help`` to output the same description in your - ``--batch_size``: The batch size to use to process the dataset. By default, ``32``. - ``--num_workers``: The number of workers to use for the data loader (default is ``1`` worker). - ``--seed``: The seed to use to make the sampling deterministic (default ``42``). - - ``--csv_column_name``: The column name to extract address in the CSV. Need to be specified if the provided ``dataset_path`` leads to a CSV file. By default, ``None``. + - ``--csv_column_name``: The column name to extract the address in the CSV. It must be specified if the provided ``dataset_path`` leads to a CSV file. By default, ``None``. 
- ``--csv_column_separator``: The column separator for the dataset container will only be used if the dataset is a CSV one. By default, ``'\t'``. - ``--log``: Either or not to log the parsing process into a ``.log`` file exported at the same place as the parsed data using the same name as the export file. The bool value can be (not case sensitive) ``'true/false'``, ``'t/f'``, ``'yes/no'``, ``'y/n'`` or ``'0/1'``. By default, ``True``. - ``--cache_dir``: To change the default cache directory (default to ``None``, e.g. default path). diff --git a/docs/source/examples/retrain_with_new_seq2seq_params.rst b/docs/source/examples/retrain_with_new_seq2seq_params.rst index 334291b5..d7e4bc8f 100644 --- a/docs/source/examples/retrain_with_new_seq2seq_params.rst +++ b/docs/source/examples/retrain_with_new_seq2seq_params.rst @@ -56,7 +56,7 @@ Let's start with the default learning rate of ``0.01`` and use a learning rate s logging_path = "./checkpoints" # The new seq2seq params settings using smaller hidden size - # See the doc for the list of tunable seq2seq parameters + # See the documentation for the list of tunable seq2seq parameters seq2seq_params = { "encoder_hidden_size": 512, "decoder_hidden_size": 512 diff --git a/docs/source/parser.rst b/docs/source/parser.rst index 620fb811..f6855880 100644 --- a/docs/source/parser.rst +++ b/docs/source/parser.rst @@ -12,10 +12,10 @@ Pre-trained Complete Model This is the complete pretrained address parser model. This model allows using the pretrained weights to predict the tags of any address. -We offer, for now, only two pretrained models, FastText and BPEmb. The first one relies on +For now, we offer only two pretrained models, FastText and BPEmb. The first one relies on `fastText `__ French pretrained embeddings to parse the address, and the second use the `byte-pair multilingual subword `_ pretrained embeddings. In both cases, -the architecture is similar, and performances are comparable; our results are available in this +the architecture and performances are similar; our results are available in this `article `_. Memory Usage and Time Performance @@ -27,7 +27,7 @@ we report the RAM usage, and in the first table, we also report the GPU memory u Also, for both tables, we report the mean-time of execution that was obtained by processing ~183,000 addresses using different batch sizes (2^0, ..., 2^9) (i.e. :math:`\frac{\text{Total time to process all addresses}}{~183,000} =` time per address). -In addition, we proposed a lighter version (fasttext-light) of our fastText model using +In addition, we proposed a lighter version (``"fasttext-light"``) of our fastText model using `Magnitude embeddings mapping `_. For this lighter model, on average, results are a little bit lower for the trained country (around ~2%) but are similar for the zero-shot country (see our `article `_ for more details). @@ -108,10 +108,10 @@ are a little bit lower for the trained country (around ~2%) but are similar for .. [2] Note that on Windows, we use the Gensim FastText models that use ~10 GO with similar performance. -Thus, the more address is, the faster each address can be processed. You can also improve performance by using more +Thus, the more addresses there are, the faster each address can be processed. You can also improve performance by using more workers for the data loader created with your data within the call. But note that this performance improvement is not linear. 
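For instance, a batched call could look like the following sketch (the batch size, the number of workers and the keyword names are assumptions based on the data loader options mentioned above):

.. code-block:: python

    from deepparse.parser import AddressParser

    address_parser = AddressParser(model_type="fasttext", device=0)

    addresses = ["350 rue des Lilas Ouest Quebec Quebec G1L 1B6"] * 1024

    # Larger batches and more data loader workers usually increase throughput,
    # but the improvement is not linear.
    parsed_addresses = address_parser(addresses, batch_size=256, num_workers=4)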
Furthermore, as of version ``0.9.6``, we now use Torch 2.0 and many other tricks to improve -processing performance. Here a few: if the parser uses a GPU, it will pin the memory in the Dataloader and reduce some +processing performance. Here are a few: if the parser uses a GPU, it will pin the memory in the Dataloader and reduce some operations (e.g. useless ``.to(device)``). AddressParser diff --git a/docs/source/training_guide.rst b/docs/source/training_guide.rst index 4649bbd0..6aa8cdc1 100644 --- a/docs/source/training_guide.rst +++ b/docs/source/training_guide.rst @@ -5,7 +5,7 @@ Training Guide ============== In addition to parsing addresses out-of-the-box, Deepparse allows you to retrain the pre-trained models to fit your data and use cases. -In the world of machine learning, this is what's refered to as ``fine-tuning``, which can make it easier to obtain well-performing +In the world of machine learning, this is what's referred to as ``fine-tuning``, which can make it easier to obtain well-performing models more efficiently and with less data. Since fine-tuning models can be tricky, this section of the documentation provides some guidelines and insights that may @@ -19,14 +19,14 @@ how to retrain our models. A few use cases may lead you to want to retrain Deepparse's models. Whether you wish to obtain a better performance on a single or multiple countries that our models weren't trained on, or your data and address schemes require a more complex -architecture, or the tag structure of your dataset, is different from ours; deepparse's retraining features accommodate all these use cases and more. +architecture, or your dataset's tag structure, differs from ours; Deepparse's retraining features accommodate all these use cases and more. In practice, our models were trained on 20 countries. They demonstrated very accurate results on all of them, so we advise you to use our models without retraining unless you wish to predict -different tags (e.g., StreetNumber ...). Also, suppose you want to retrain +different tags (e.g., StreetNumber, ...). Also, suppose you want to retrain our models to perform better on countries outside of the 20 used in the original training set. In that case, you can look at `our dataset `__ which includes an additional 41 countries used only for testing. -There are two main concerns to keep in mind when fine-tuning a model: the model's convergence (i.e, its ability actually to learn from the new data) +There are two main concerns to keep in mind when fine-tuning a model: the model's convergence (i.e., its ability actually to learn from the new data) and the possibility of ``catastrophic forgetting`` (i.e., losing the model's previous knowledge after training on the new data). Learning Successfully @@ -37,7 +37,7 @@ of fine-tuning, the models have already developed a base knowledge of the task t This is especially true in the case of Deepparse since the task you are fine-tuning remains the same (i.e. parsing addresses). However, there are a couple of points to consider to obtain favourable results: -- **Make sure you have enough data**: deep learning models are notorious for being pretty data hungry, so unless you have enough data, the models +- **Make sure you have enough data**: deep learning models are notorious for being pretty data-hungry, so unless you have enough data, the models will have a hard time learning. Since Deepparse's models have already been trained on a few million addresses, the need for data is mitigated for fine-tuning. 
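As a rough sketch, fine-tuning on data from a new country might look like the following; the dataset container class, keyword arguments and paths are assumptions, so refer to the :meth:`~deepparse.parser.AddressParser.retrain` documentation for the exact signature:

.. code-block:: python

    from deepparse.dataset_container import PickleDatasetContainer
    from deepparse.parser import AddressParser

    address_parser = AddressParser(model_type="fasttext", device=0)

    # A few thousand annotated addresses from the new country (illustrative path).
    training_container = PickleDatasetContainer("./new_country_addresses.p")

    address_parser.retrain(
        training_container,
        train_ratio=0.8,  # assumed split between training and validation data
        epochs=5,
        logging_path="./checkpoints",
    )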
However, it is recommended to use at least a few thousand examples per new country when retraining. @@ -59,7 +59,7 @@ However, there are a couple of points to consider to obtain favourable results: Do Not Forget! ************** -As mentionned above, catastrophic forgetting can happen when fine-tuning machine learning models. This is because the models' internal parameters are +As mentioned above, catastrophic forgetting can happen when fine-tuning machine learning models. This is because the models' internal parameters are modified to accommodate the new task/data, which can impact their ability to be appropriate for the previous task/data. There are many fancy ways to mitigate catastrophic forgetting when fine-tuning models. Still, given the task and data that Deepparse handles, we recommend including some of the previous data when constructing your retraining dataset. The amount @@ -95,5 +95,5 @@ Modifying the Architecture The :meth:`~deepparse.parser.AddressParser.retrain` method allows you to change the architecture of the models using the ``seq2seq_params`` argument. This can be useful if you need a more complex model or a lighter model, for example. However, if you -change the models' architecture, you will end up with a completely new model that will be retrained from scratch. This -means that all the previous knowledge that the initial model had will disapear. +change the models' architecture, a completely new model will be retrained from scratch. This +means that all the previous knowledge that the initial model had will disappear. diff --git a/examples/retrain_with_new_seq2seq_params.py b/examples/retrain_with_new_seq2seq_params.py index a402240d..99a42920 100644 --- a/examples/retrain_with_new_seq2seq_params.py +++ b/examples/retrain_with_new_seq2seq_params.py @@ -34,7 +34,7 @@ logging_path = "./checkpoints" # The new seq2seq params settings using smaller hidden size -# See the doc for the list of tunable seq2seq parameters +# See the documentation for the list of tunable seq2seq parameters seq2seq_params = {"encoder_hidden_size": 512, "decoder_hidden_size": 512} address_parser.retrain( diff --git a/models_evaluation/timer/timer.py b/models_evaluation/timer/timer.py index 285f1777..07fc2c6e 100644 --- a/models_evaluation/timer/timer.py +++ b/models_evaluation/timer/timer.py @@ -30,7 +30,7 @@ class Timer: The class can be used as a context manager to time the code inside the 'with' statement, as a decorator of a function or a method to time it at each call, or as an iterator to have the total running time of a - for loop as well as the mean time taken per iteration. See the doc of the init method for usage examples. + for loop as well as the mean time taken per iteration. See the documentation of the init method for usage examples. 
""" def __init__( diff --git a/pyproject.toml b/pyproject.toml index 44d1fe5f..54b8d8c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,11 +3,10 @@ target-version = ['py38', 'py39', 'py310', 'py311'] line-length = 120 skip-string-normalization = true -required-version = "23.9.1" extend-exclude = "/(slides)/" [tool.pylint.ini_options] DJANGO_SETTINGS_MODULE = "settings" [build-system] -requires = ["setuptools", "wheel", "pybind11"] \ No newline at end of file +requires = ["setuptools", "wheel", "pybind11"] diff --git a/requirements.txt b/requirements.txt index 3df78e60..f93a8e98 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ torch bpemb -numpy +numpy<2.0.0 +scipy<=1.10.1 requests pymagnitude-light colorama>=0.4.3 diff --git a/setup.py b/setup.py index 1cd42fe1..f83097f2 100644 --- a/setup.py +++ b/setup.py @@ -69,15 +69,16 @@ def main(): ], packages=packages, install_requires=[ - "numpy", + "numpy<2.0.0", "torch", "bpemb", + "scipy<=1.10.1", "gensim>=4.0.0", "requests", "fasttext-wheel", "pymagnitude-light", "poutyne", - "pandas==2.0.3", + "pandas", "urllib3", "cloudpathlib[s3, gs, azure]", ], @@ -90,11 +91,11 @@ def main(): "app": ["fastapi[all]==0.99.1", "uvicorn==0.22.0", "sentry-sdk[fastapi]==1.28.1", "python-decouple==3.8"], "all": [ "colorama>=0.4.3", # colorama - "fastapi[all]==0.99.1", # app requirements + "fastapi[all]==0.109.1", # app requirements "uvicorn==0.22.0", "sentry-sdk[fastapi]>=1.28.1", "python-decouple==3.8", - "black==23.3.0", # code formatting requirements + "black", # code formatting requirements "pylint==2.16.2", "pylint-django[with_django]==2.5.3", "pre-commit==3.3.3", diff --git a/styling_requirements.txt b/styling_requirements.txt index 8a4776f6..335cae47 100644 --- a/styling_requirements.txt +++ b/styling_requirements.txt @@ -1,4 +1,4 @@ -black==23.9.1 -pylint==2.16.2 -pylint-django[with_django]==2.5.3 -pre-commit==3.3.3 \ No newline at end of file +black +pylint +pylint-django[with_django] +pre-commit \ No newline at end of file diff --git a/tests/app/test_app.py b/tests/app/test_app.py index 8695a30c..0598c3d4 100644 --- a/tests/app/test_app.py +++ b/tests/app/test_app.py @@ -1,8 +1,8 @@ -from typing import Dict, List, Union import os - -from unittest.mock import MagicMock +from typing import Dict, List, Union from unittest import skipIf +from unittest.mock import MagicMock + import pytest try: @@ -13,12 +13,21 @@ "Ensure you installed the packages for the app_requirements.txt file found in the root of the project" ) from e -from deepparse.app.app import app, format_parsed_addresses, Address, AddressParser -from deepparse.parser import FormattedParsedAddress +if os.environ["TEST_LEVEL"] == "all": + from deepparse.app.app import app + from deepparse.app.tools import format_parsed_addresses, Address + from deepparse.parser import FormattedParsedAddress, AddressParser +else: + # To handle pylint error E0606. 
+ app = None + format_parsed_addresses = None + Address = None + FormattedParsedAddress = None + AddressParser = None -@skipIf(os.environ["TEST_LEVEL"] == "unit", "Cannot run test without a proper GPU or RAM.") @pytest.fixture(scope="session", name="client") +@skipIf(os.environ["TEST_LEVEL"] == "unit", "Cannot run test without a proper GPU or RAM.") def fixture_client(): with TestClient(app) as _client: yield _client diff --git a/tests/cli/test_retrain.py b/tests/cli/test_retrain.py index fad25bf0..e3e32570 100644 --- a/tests/cli/test_retrain.py +++ b/tests/cli/test_retrain.py @@ -201,8 +201,8 @@ def test_integration_csv(self): def test_ifIsCSVFile_noColumnName_raiseValueError(self): with self.assertRaises(ValueError): - # We set up the params with the default value of csv_column_names of the test case method set_up_params, - # which is None, thus no column names. + # We set up the params with the default value of ``"csv_column_names"`` of the test case method + # set_up_params, which is None, thus no column names. parser_params = self.set_up_params(train_dataset_path=self.a_train_csv_dataset_path) retrain.main(parser_params) diff --git a/tests/converter/test_data_processor.py b/tests/converter/test_data_processor.py index 4686caed..bea57c72 100644 --- a/tests/converter/test_data_processor.py +++ b/tests/converter/test_data_processor.py @@ -64,8 +64,8 @@ def setUp(self): ) self.fasttext_batch_padding_callback_mock = Mock() - self.fasttext_batch_padding_callback_mock.side_effect = ( - lambda *params: ( + self.fasttext_batch_padding_callback_mock.side_effect = lambda *params: ( + ( ( self.a_padded_word_embedding_sequence, self.a_sequence_lengths_list, @@ -89,8 +89,8 @@ def setUp(self): self.a_padded_tag_targets, ) ) - self.bpemb_batch_padding_callback_mock.side_effect = ( - lambda *params: ( + self.bpemb_batch_padding_callback_mock.side_effect = lambda *params: ( + ( ( self.a_padded_subword_embedding_sequence, self.a_word_decomposition_lengths_list, diff --git a/tests/data_validation/test_data_validation.py b/tests/data_validation/test_data_validation.py index 4ada1013..7b2e4b55 100644 --- a/tests/data_validation/test_data_validation.py +++ b/tests/data_validation/test_data_validation.py @@ -1,21 +1,37 @@ +# pylint: disable=too-many-public-methods + from unittest import TestCase from deepparse import validate_if_any_empty, validate_if_any_whitespace_only, validate_if_any_none -from deepparse.data_validation import is_whitespace_only, is_empty, is_none +from deepparse.data_validation import ( + is_whitespace_only, + is_empty, + is_none, + validate_if_any_multiple_consecutive_whitespace, + is_multiple_consecutive_whitespace, + is_newline, + validate_if_any_newline_character, +) class DataValidationTest(TestCase): - def test_integration_validate_if_any_empty(self): - a_list_of_string_element = ["an address", "another address"] - self.assertFalse(validate_if_any_empty(a_list_of_string_element)) + def setUp(self): + self.a_list_of_string_element = ["an address", "another address"] + + def test_integration_validate_if_any_empty_return_false(self): + self.assertFalse(validate_if_any_empty(self.a_list_of_string_element)) + + def test_integration_validate_if_any_whitespace_only_return_false(self): + self.assertFalse(validate_if_any_whitespace_only(self.a_list_of_string_element)) + + def test_integration_validate_if_any_none_return_false(self): + self.assertFalse(validate_if_any_none(self.a_list_of_string_element)) - def test_integration_validate_if_any_whitespace_only(self): - 
a_list_of_string_element = ["an address", "another address"] - self.assertFalse(validate_if_any_whitespace_only(a_list_of_string_element)) + def test_integration_validate_if_any_multiple_consecutive_whitespace_return_false(self): + self.assertFalse(validate_if_any_multiple_consecutive_whitespace(self.a_list_of_string_element)) - def test_integration_validate_if_any_none(self): - a_list_of_string_element = ["an address", "another address"] - self.assertFalse(validate_if_any_none(a_list_of_string_element)) + def test_integration_validate_if_any_newline_character_return_false(self): + self.assertFalse(validate_if_any_newline_character(self.a_list_of_string_element)) def test_integration_validate_if_any_empty_with_empty_return_true(self): a_list_of_string_element = ["an address", ""] @@ -29,6 +45,23 @@ def test_integration_validate_if_any_none_with_none_return_true(self): a_list_of_string_element = ["an address", None] self.assertTrue(validate_if_any_none(a_list_of_string_element)) + def test_integration_validate_if_any_multiple_consecutive_whitespace_with_multiple_whitespace_return_true(self): + a_list_of_string_element = ["an address", "an address"] + self.assertTrue(validate_if_any_multiple_consecutive_whitespace(a_list_of_string_element)) + + a_list_of_string_element = ["an address", "an address"] + self.assertTrue(validate_if_any_multiple_consecutive_whitespace(a_list_of_string_element)) + + a_list_of_string_element = ["an address", "an address", "an address"] + self.assertTrue(validate_if_any_multiple_consecutive_whitespace(a_list_of_string_element)) + + def test_integration_validate_if_any_newline_character_with_newline_return_true(self): + a_list_of_string_element = ["an address", "an address\n"] + self.assertTrue(validate_if_any_newline_character(a_list_of_string_element)) + + a_list_of_string_element = ["an address", "an\n address"] + self.assertTrue(validate_if_any_newline_character(a_list_of_string_element)) + def test_if_no_whitespace_address_when_is_white_space_return_false(self): an_address_not_whitespace_only = "an address" @@ -78,7 +111,7 @@ def test_if_no_none_when_is_none_return_false(self): self.assertFalse(is_none(another_address_not_empty)) - def test_if_no_none_when_is_none_return_true(self): + def test_if_none_when_is_none_return_true(self): an_address_not_empty = None self.assertTrue(is_none(an_address_not_empty)) @@ -86,3 +119,39 @@ def test_if_no_none_when_is_none_return_true(self): another_address_not_empty = None self.assertTrue(is_none(another_address_not_empty)) + + def test_if_no_consecutive_whitespace_when_is_multiple_consecutive_whitespace_return_false(self): + an_address_not_empty = "an address" + + self.assertFalse(is_multiple_consecutive_whitespace(an_address_not_empty)) + + another_address_not_empty = "address" + + self.assertFalse(is_multiple_consecutive_whitespace(another_address_not_empty)) + + def test_if_consecutive_whitespace_when_is_multiple_consecutive_whitespace_return_true(self): + an_address_not_empty = "an address" + + self.assertTrue(is_multiple_consecutive_whitespace(an_address_not_empty)) + + another_address_not_empty = "address " + + self.assertTrue(is_multiple_consecutive_whitespace(another_address_not_empty)) + + def test_if_no_newline_when_is_newline_return_false(self): + an_address_not_empty = "an address" + + self.assertFalse(is_newline(an_address_not_empty)) + + another_address_not_empty = "address" + + self.assertFalse(is_newline(another_address_not_empty)) + + def test_if_newline_when_is_newline_return_true(self): + 
an_address_not_empty = "an address\n" + + self.assertTrue(is_newline(an_address_not_empty)) + + another_address_not_empty = "address \n" + + self.assertTrue(is_newline(another_address_not_empty)) diff --git a/tests/dataset_container/test_dataset_container.py b/tests/dataset_container/test_dataset_container.py index 82bac989..94fd94d6 100644 --- a/tests/dataset_container/test_dataset_container.py +++ b/tests/dataset_container/test_dataset_container.py @@ -104,6 +104,24 @@ def test_when_whitespace_only_address_then_raise_data_error(self): with self.assertRaises(DataError): ADatasetContainer(some_invalid_data) + def test_when_multiple_consecutive_whitespace_address_then_raise_data_error(self): + some_invalid_data = [("An address", [1, 0]), ("An address", [1, 0]), ("A last address", [3, 4, 0])] + with self.assertRaises(DataError): + ADatasetContainer(some_invalid_data) + + some_invalid_data = [("An address", [1, 0]), ("A second Address", [2, 3, 4]), ("A last address", [3, 4, 0])] + with self.assertRaises(DataError): + ADatasetContainer(some_invalid_data) + + def test_when_newline_address_then_raise_data_error(self): + some_invalid_data = [("An address", [1, 0]), ("An address\n", [1, 0]), ("A last address", [3, 4, 0])] + with self.assertRaises(DataError): + ADatasetContainer(some_invalid_data) + + some_invalid_data = [("An address", [1, 0]), ("A second \nAddress", [2, 3, 4]), ("A last address", [3, 4, 0])] + with self.assertRaises(DataError): + ADatasetContainer(some_invalid_data) + def test_when_empty_tags_set_then_raise_data_error(self): some_invalid_data = [("An address", [1, 0]), ("another address", []), ("A last address", [3, 4, 0])] with self.assertRaises(DataError): diff --git a/tests/embeddings_models/test_bpemb_embeddings_model.py b/tests/embeddings_models/test_bpemb_embeddings_model.py index a14ecac9..a1663150 100644 --- a/tests/embeddings_models/test_bpemb_embeddings_model.py +++ b/tests/embeddings_models/test_bpemb_embeddings_model.py @@ -19,7 +19,7 @@ def setUp(self): def test_whenInstantiatedWithPath_thenShouldLoadBPEmbModel(self): with patch( - "deepparse.embeddings_models.bpemb_embeddings_model.BPEmb", + "deepparse.embeddings_models.bpemb_embeddings_model.BPEmbBaseURLWrapperBugFix", return_value=self.model, ) as loader: _ = BPEmbEmbeddingsModel(self.a_path, verbose=False) @@ -28,7 +28,7 @@ def test_whenInstantiatedWithPath_thenShouldLoadBPEmbModel(self): def test_whenCalledToEmbed_thenShouldCallLoadedModel(self): with patch( - "deepparse.embeddings_models.bpemb_embeddings_model.BPEmb", + "deepparse.embeddings_models.bpemb_embeddings_model.BPEmbBaseURLWrapperBugFix", return_value=self.model, ): embeddings_model = BPEmbEmbeddingsModel(self.a_path, verbose=False) @@ -39,7 +39,7 @@ def test_whenCalledToEmbed_thenShouldCallLoadedModel(self): def test_givenADimOf9_whenAskDimProperty_thenReturnProperDim(self): with patch( - "deepparse.embeddings_models.bpemb_embeddings_model.BPEmb", + "deepparse.embeddings_models.bpemb_embeddings_model.BPEmbBaseURLWrapperBugFix", return_value=self.model, ): embeddings_model = BPEmbEmbeddingsModel(self.a_path, verbose=False) diff --git a/tests/embeddings_models/test_embeddings_model_factory.py b/tests/embeddings_models/test_embeddings_model_factory.py index b9e9cc07..e19e95b9 100644 --- a/tests/embeddings_models/test_embeddings_model_factory.py +++ b/tests/embeddings_models/test_embeddings_model_factory.py @@ -25,7 +25,7 @@ def setUpClass(cls): def setUp(self): self.embeddings_model_factory = EmbeddingsModelFactory() - 
@patch("deepparse.embeddings_models.bpemb_embeddings_model.BPEmb") + @patch("deepparse.embeddings_models.bpemb_embeddings_model.BPEmbBaseURLWrapperBugFix") def test_givenABpembEmbeddingsModelType_whenCreatingEmbeddingsModel_thenShouldReturnCorrectEmbeddingsModel( self, bpemb_mock ): diff --git a/tests/network/test_seq2seq.py b/tests/network/test_seq2seq.py index cacdee64..7697bf0f 100644 --- a/tests/network/test_seq2seq.py +++ b/tests/network/test_seq2seq.py @@ -13,7 +13,6 @@ from unittest import skipIf from unittest.mock import patch, MagicMock, call -import pytest import torch from deepparse.network import Seq2SeqModel @@ -283,9 +282,7 @@ def test_givenSeq2seqModel_whenLoadPreTrainedWeightsNotVerboseGPU_thenWarningsNo isfile_mock.return_value = True last_version_mock.return_value = False with patch("deepparse.network.seq2seq.download_weights"): - with pytest.warns(None) as record: - seq2seq_model._load_pre_trained_weights(self.a_model_type, cache_dir=self.cache_dir, offline=False) - self.assertEqual(0, len(record)) + seq2seq_model._load_pre_trained_weights(self.a_model_type, cache_dir=self.cache_dir, offline=False) @patch("deepparse.network.seq2seq.latest_version") @patch("os.path.isfile") @@ -330,9 +327,7 @@ def test_givenSeq2seqModel_whenLoadPreTrainedWeightsNotVerboseCPU_thenWarningsNo isfile_mock.return_value = True last_version_mock.return_value = False with patch("deepparse.network.seq2seq.download_weights"): - with pytest.warns(None) as record: - seq2seq_model._load_pre_trained_weights(self.a_model_type, cache_dir=self.cache_dir, offline=False) - self.assertEqual(0, len(record)) + seq2seq_model._load_pre_trained_weights(self.a_model_type, cache_dir=self.cache_dir, offline=False) @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") diff --git a/tests/parser/test_address_parser.py b/tests/parser/test_address_parser.py index 77573d2a..3b1ce85d 100644 --- a/tests/parser/test_address_parser.py +++ b/tests/parser/test_address_parser.py @@ -109,7 +109,13 @@ def assert_equal_not_ordered(self, actual, expected_elements): self.assertIn(expected, actual) def test_givenAModel_whenInit_thenProperFieldsSet(self): - address_parser = AddressParser(model_type=self.a_bpemb_model_type, device=self.a_cpu_device, verbose=True) + # We use BPEmb but could use FastText also + with patch("deepparse.parser.address_parser.EmbeddingsModelFactory") as _: + with patch("deepparse.parser.address_parser.VectorizerFactory") as _: + with patch("deepparse.parser.address_parser.DataProcessorFactory") as _: + address_parser = AddressParser( + model_type=self.a_bpemb_model_type, device=self.a_cpu_device, verbose=True + ) expected_fields = self.expected_fields actual_tags = list(address_parser.tags_converter.tags_to_idx.keys()) @@ -120,25 +126,32 @@ def test_givenAModel_whenInit_thenProperFieldsSet(self): self.assert_equal_not_ordered(actual_fields, expected_fields) def test_givenACPUDeviceSetup_whenInstantiatingParser_thenDeviceIsCPU(self): - address_parser = AddressParser( - model_type=self.a_best_model_type.capitalize(), - # we use BPEmb for simplicity - device=self.a_cpu_device, - ) + # We use BPEmb but could use FastText also + with patch("deepparse.parser.address_parser.EmbeddingsModelFactory") as _: + with patch("deepparse.parser.address_parser.VectorizerFactory") as _: + with patch("deepparse.parser.address_parser.DataProcessorFactory") as _: + address_parser = AddressParser( + model_type=self.a_best_model_type.capitalize(), + # we use BPEmb for simplicity + 
device=self.a_cpu_device, + ) actual = address_parser.device expected = self.a_cpu_torch_device self.assertEqual(actual, expected) - # We use BPEmb but could use FastText also @patch("deepparse.parser.address_parser.torch.cuda") def test_givenAGPUDeviceSetup_whenInstantiatingParserWithoutGPU_thenRaiseWarningAndCPU(self, cuda_mock): - cuda_mock.is_available.return_value = False - with self.assertWarns(UserWarning): - address_parser = AddressParser( - model_type=self.a_best_model_type.capitalize(), - # we use BPEmb for simplicity - device=self.a_gpu_device, - ) + # We use BPEmb but could use FastText also + with patch("deepparse.parser.address_parser.EmbeddingsModelFactory") as _: + with patch("deepparse.parser.address_parser.VectorizerFactory") as _: + with patch("deepparse.parser.address_parser.DataProcessorFactory") as _: + cuda_mock.is_available.return_value = False + with self.assertWarns(UserWarning): + address_parser = AddressParser( + model_type=self.a_best_model_type.capitalize(), + # we use BPEmb for simplicity + device=self.a_gpu_device, + ) actual = address_parser.device expected = self.a_cpu_torch_device self.assertEqual(actual, expected) diff --git a/tests/parser/test_address_parser_retrain_api.py b/tests/parser/test_address_parser_retrain_api.py index f3b12b22..a84a7e23 100644 --- a/tests/parser/test_address_parser_retrain_api.py +++ b/tests/parser/test_address_parser_retrain_api.py @@ -166,87 +166,6 @@ def test_givenAFasttextModel_whenRetrain_thenInstantiateOptimizer( optimizer_mock.assert_called_with(self.model_mock.parameters(), self.a_learning_rate) - @patch("deepparse.validations.poutyne") - @patch("deepparse.parser.address_parser.torch.save") - @patch("deepparse.parser.address_parser.Experiment") - @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.ModelFactory") - @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") - @patch("deepparse.parser.address_parser.VectorizerFactory") - @patch("deepparse.parser.address_parser.DataProcessorFactory") - @patch("deepparse.parser.address_parser.DataPadder") - def test_givenAModel_whenRetrainWithPoutyneBefore18_thenPrintMessage( - self, - data_padder_mock, - data_processor_factory_mock, - vectorizer_factory_mock, - embeddings_factory_mock, - model_factory_mock, - optimizer_mock, - experiment_mock, - torch_save_mock, - poutyne_mock, - ): - poutyne_mock.version.__version__ = "1.7" - self._capture_output() - self.address_parser = AddressParser( - model_type=self.a_fasttext_model_type, - device=self.a_device, - verbose=self.verbose, - ) - self.address_parser_retrain_call() - - actual = self.test_out.getvalue() - expected = ( - "You are using an older version of Poutyne that does not support proper error management." - " Due to that, we cannot show retrain progress. 
To fix that, update Poutyne to the newest version.\n" - ) - - self.assertEqual(actual, expected) - - @patch("deepparse.validations.poutyne") - @patch("deepparse.parser.address_parser.torch.save") - @patch("deepparse.parser.address_parser.Experiment") - @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.ModelFactory") - @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") - @patch("deepparse.parser.address_parser.VectorizerFactory") - @patch("deepparse.parser.address_parser.DataProcessorFactory") - @patch("deepparse.parser.address_parser.DataPadder") - def test_givenAModel_whenRetrainWithPoutyneAfter17_thenDoNotPrintMessage( - self, - data_padder_mock, - data_processor_factory_mock, - vectorizer_factory_mock, - embeddings_factory_mock, - model_factory_mock, - optimizer_mock, - experiment_mock, - torch_save_mock, - poutyne_mock, - ): - poutyne_mock.version.__version__ = "1.8" - self._capture_output() - self.address_parser = AddressParser( - model_type=self.a_fasttext_model_type, - device=self.a_device, - verbose=self.verbose, - ) - self.address_parser_retrain_call() - - actual = self.test_out.getvalue() - - expected = "" - self.assertEqual(actual, expected) - - not_expected = ( - "You are using a older version of Poutyne that does not support properly error management." - " Due to that, we cannot show retrain progress. To fix that, update Poutyne to the newest " - "version.\n" - ) - - self.assertNotRegex(actual, not_expected) - @patch("deepparse.parser.address_parser.torch.save") @patch( "deepparse.parser.address_parser.Experiment", @@ -1433,8 +1352,8 @@ def test_givenRetrainSettings_whenFormattedNameParserName_thenReturnProperNaming ) # We set possible params type with a value - prediction_tags_settings = [{"A dict": 1.0}, None] # Can be a dict or a None - seq2seq_params_settings = [{"A dict": 1.0}, None] # Can be a dict or a None + prediction_tags_settings = [{"A dict": 1.0}, None] # Can be a dictionary or a None + seq2seq_params_settings = [{"A dict": 1.0}, None] # Can be a dictionary or a None layers_to_freeze_settings = [None, "encoder", "decoder", "prediction_layer", "seq2seq"] # From the doc # We loop all possible settings diff --git a/tests/requirements.txt b/tests/requirements.txt index 76ddef32..7ea2025f 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,8 +1,8 @@ -pycountry==22.3.5 -pytest==7.4.0 -pytest-asyncio==0.21.1 -pytest_cov==4.1.0 -pytest-env==0.8.2 -pytest-mock==3.11.1 -pytest-xdist[psutil]==3.3.1 -tensorboard==2.13.0 +pycountry +pytest +pytest-asyncio +pytest_cov +pytest-env +pytest-mock +pytest-xdist[psutil] +tensorboard diff --git a/tests/test_download_tools.py b/tests/test_download_tools.py index 6973a76b..202c7c89 100644 --- a/tests/test_download_tools.py +++ b/tests/test_download_tools.py @@ -228,7 +228,7 @@ def test_givenAFasttextLightEmbeddingsNotLocal_whenDownloadFasttextEmbeddingsVer download_fasttext_magnitude_embeddings(self.a_directory_path, verbose=True) expected = ( - "The fastText pretrained word embeddings will be download in magnitude format (2.3 GO), " + "The FastText pretrained word embeddings will be download in magnitude format (2.3 GO), " "this process will take several minutes." 
) @@ -311,7 +311,7 @@ def test_givenADownloadFasttext_whenPrintProgressSetToVerbose_thenPrint( actual = self.test_out.getvalue().strip() expected = ( - "The fastText pretrained word embeddings will be downloaded (6.8 GO), " + "The FastText pretrained word embeddings will be downloaded (6.8 GO), " "this process will take several minutes." ) self.assertIn(expected, actual) diff --git a/tests/vectorizer/test_vectorizer_factory.py b/tests/vectorizer/test_vectorizer_factory.py index 4839ce41..2a75b921 100644 --- a/tests/vectorizer/test_vectorizer_factory.py +++ b/tests/vectorizer/test_vectorizer_factory.py @@ -13,7 +13,7 @@ class VectorizerFactoryTest(TestCase): @classmethod - @patch("deepparse.embeddings_models.bpemb_embeddings_model.BPEmb") + @patch("deepparse.embeddings_models.bpemb_embeddings_model.BPEmbBaseURLWrapperBugFix") @patch("deepparse.embeddings_models.fasttext_embeddings_model.load_fasttext_embeddings") @patch("deepparse.embeddings_models.fasttext_embeddings_model.load_facebook_vectors") @patch("deepparse.embeddings_models.magnitude_embeddings_model.Magnitude") diff --git a/version.txt b/version.txt index 6f060dcb..ea8f4fd6 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.9.9 \ No newline at end of file +0.9.10 \ No newline at end of file