From 325242877bb514c05e15052f1dd6cb951558666e Mon Sep 17 00:00:00 2001 From: davebulaval Date: Thu, 28 Dec 2023 11:49:53 -0400 Subject: [PATCH] improve documentation --- CHANGELOG.md | 16 ++-- deepparse/app/request_examples.http | 2 +- deepparse/cli/download_model.py | 5 +- deepparse/cli/download_models.py | 4 +- deepparse/cli/parse.py | 12 +-- deepparse/cli/parser_arguments_adder.py | 4 +- deepparse/cli/retrain.py | 12 +-- deepparse/cli/test.py | 2 +- deepparse/comparer/addresses_comparer.py | 26 +++---- .../comparer/formatted_compared_addresses.py | 31 ++++---- .../formatted_compared_addresses_raw.py | 4 +- .../formatted_compared_addresses_tags.py | 2 +- deepparse/converter/data_padder.py | 24 +++--- deepparse/converter/data_processor.py | 6 +- deepparse/converter/target_converter.py | 5 +- .../dataset_container/dataset_container.py | 73 ++++++++++--------- deepparse/dataset_container/tools.py | 4 +- deepparse/download_tools.py | 46 ++++++------ .../bpemb_embeddings_model.py | 4 +- .../embeddings_model_factory.py | 4 +- .../fasttext_embeddings_model.py | 4 +- .../magnitude_embeddings_model.py | 8 +- deepparse/errors/data_error.py | 2 +- deepparse/errors/model_error.py | 2 +- deepparse/errors/server_error.py | 2 +- deepparse/metrics/accuracy.py | 4 +- deepparse/metrics/nll_loss.py | 10 +-- deepparse/network/bpemb_seq2seq.py | 32 ++++---- deepparse/network/decoder.py | 4 +- deepparse/network/embedding_network.py | 19 ++--- deepparse/network/encoder.py | 6 +- deepparse/network/fasttext_seq2seq.py | 24 +++--- deepparse/network/model_factory.py | 18 +++-- deepparse/network/seq2seq.py | 50 +++++++------ deepparse/parser/address_parser.py | 31 ++++---- deepparse/parser/formatted_parsed_address.py | 10 +-- deepparse/validations.py | 14 ++-- deepparse/vectorizer/bpemb_vectorizer.py | 2 +- deepparse/vectorizer/fasttext_vectorizer.py | 4 +- deepparse/vectorizer/magnitude_vectorizer.py | 4 +- deepparse/weights_tools.py | 4 +- docs/source/api.rst | 12 +-- docs/source/cli.rst 
| 26 +++---- .../retrain_with_new_seq2seq_params.rst | 2 +- docs/source/parser.rst | 10 +-- docs/source/training_guide.rst | 16 ++-- examples/retrain_with_new_seq2seq_params.py | 2 +- models_evaluation/timer/timer.py | 2 +- tests/cli/test_retrain.py | 4 +- .../parser/test_address_parser_retrain_api.py | 4 +- tests/test_download_tools.py | 4 +- 51 files changed, 323 insertions(+), 299 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cd0153c..fc87338d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ - Added "contributing to" - Added fix for comma problem (#56) -- Added content in Address Parser doc for tags definition +- Added content in Address Parser documentation for tags definition - Fixed Pylint bug with PyTorch 1.6 - Fixed `pack_padded` cpu error with PyTorch new release @@ -75,7 +75,7 @@ ## 0.3.6 -- Added a method for a dict conversion of parsed addresses for simpler `Pandas` integration. +- Added a method for dictionary conversion of parsed addresses for simpler `Pandas` integration. - Added examples for parsing addresses and how to convert them into a DataFrame. - Fixed error with download module. @@ -83,7 +83,7 @@ - Added verbose flag to training and test based on the __init__ of address parser. - Added a feature to retrain our models with prediction tags dictionary different from the default one. -- Added in-doc code examples. +- Added in-documentation code examples. - Added code examples. - Small improvement of models implementation. @@ -134,7 +134,7 @@ ## 0.6.2 - Improved (slightly) code speed of data padding method as per PyTorch list or array to Tensor recommendation. -- Improved doc for RuntimeError due to retraining FastText and BPEmb model in the same directory. +- Improved documentation for RuntimeError due to retraining FastText and BPEmb model in the same directory. - Added error handling RuntimeError when retraining. 
## 0.6.3 @@ -162,13 +162,13 @@ ## 0.6.6 - Fixed errors in code examples -- Improved doc of download_from_url +- Improved documentation of download_from_url - Improve error management of retrain and test ## 0.6.7 - Fixed errors in data validation -- Improved doc over data validation +- Improved documentation over data validation - Bugfix data slicing error with data containers - Add an example on how to use a retrained model @@ -176,7 +176,7 @@ - Improved CLI - Fixed bug in CLI export dataset -- Improved the doc of the CLI +- Improved the documentation of the CLI ## 0.7.1 @@ -208,7 +208,7 @@ user-given name - Hot-fix missing raise for DataError validation of address to parse when address is tuple - Bug-fix handling of string column name for CSVDatasetContainer that raised ValueError -- Improve parse CLI doc and fix error in doc stating JSON format is supported as input data +- Improve parse CLI documentation and fix error in documentation stating JSON format is supported as input data - Add batch_size to parse CLI - Add minimum version to Gensim 4.0.0. 
- Add a new CLI function, retrain, to retrain from the command line diff --git a/deepparse/app/request_examples.http b/deepparse/app/request_examples.http index 1bd8a2f8..403dc391 100644 --- a/deepparse/app/request_examples.http +++ b/deepparse/app/request_examples.http @@ -16,5 +16,5 @@ Content-Type: application/json [ {"raw": "16 rue Grande-Place, Victoriaville, QC, G6S 1E6"}, - {"raw": "123 rue Valancourt, Val-Alain, quebec, g9v1s3"} + {"raw": "123 rue valancourt, val-alain, quebec, g9v 1s3"} ] \ No newline at end of file diff --git a/deepparse/cli/download_model.py b/deepparse/cli/download_model.py index d748bc89..6ccaa5e4 100644 --- a/deepparse/cli/download_model.py +++ b/deepparse/cli/download_model.py @@ -1,13 +1,12 @@ import argparse import sys - from deepparse.download_tools import download_model, MODEL_MAPPING_CHOICES def main(args=None) -> None: """ - CLI function to manually download all the dependencies for a pretrained model. + CLI function to download all the dependencies for a pretrained model manually. Example of usage: @@ -41,7 +40,7 @@ def get_parser() -> argparse.ArgumentParser: "--saving_cache_dir", type=str, default=None, - help="To change the default saving cache directory (default to None e.g. default path).", + help="To change the default saving cache directory (default to None, e.g. default path).", ) return parser diff --git a/deepparse/cli/download_models.py b/deepparse/cli/download_models.py index 6ab6f359..658c8816 100644 --- a/deepparse/cli/download_models.py +++ b/deepparse/cli/download_models.py @@ -6,7 +6,7 @@ def main(args=None) -> None: """ - CLI function to manually download all the dependencies for all pretrained models. + CLI function to download all the dependencies for all pretrained models manually. Example of usage: @@ -34,7 +34,7 @@ def get_parser() -> argparse.ArgumentParser: "--saving_cache_dir", type=str, default=None, - help="To change the default saving cache directory (default to None e.g. 
default path).", + help="To change the default saving cache directory (default to None, e.g. default path).", ) return parser diff --git a/deepparse/cli/parse.py b/deepparse/cli/parse.py index 37e8c13d..a96746ec 100644 --- a/deepparse/cli/parse.py +++ b/deepparse/cli/parse.py @@ -32,7 +32,7 @@ def main(args=None) -> None: # pylint: disable=too-many-locals, too-many-branches """ - CLI function to rapidly parse an addresses dataset and output it in another file. + CLI function to easily parse an address dataset and output it in another file. Examples of usage: @@ -40,7 +40,7 @@ def main(args=None) -> None: parse fasttext ./dataset_path.csv parsed_address.pickle - Using a gpu device + Using a GPU device .. code-block:: sh @@ -119,7 +119,7 @@ def main(args=None) -> None: def get_parser() -> argparse.ArgumentParser: - """Return ArgumentParser for the cli.""" + """Return ArgumentParser for the CLI.""" parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) parser.add_argument( @@ -137,11 +137,11 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument( "export_filename", help=wrap( - "The filename to use to export the parsed addresses. We will infer the file format base on the " + "The filename to use to export the parsed addresses. We will infer the file format based on the " "file extension. That is, if the file is a pickle (.p or .pickle), we will export it into a pickle file. " - "The supported format are Pickle, CSV and JSON. " + "The supported formats are Pickle, CSV and JSON. " "The file will be exported in the same repositories as the dataset_path. " - "See the doc for more details on the format exporting." + "See the documentation for more details on the format exporting." 
), type=str, ) diff --git a/deepparse/cli/parser_arguments_adder.py b/deepparse/cli/parser_arguments_adder.py index 72eeaf2b..52c50755 100644 --- a/deepparse/cli/parser_arguments_adder.py +++ b/deepparse/cli/parser_arguments_adder.py @@ -25,7 +25,7 @@ def add_csv_column_name_arg(parser: ArgumentParser) -> None: parser.add_argument( "--csv_column_name", help=wrap( - "The column name to extract address in the CSV. Need to be specified if the provided dataset_path " + "The column name to extract the address in the CSV. It needs to be specified if the provided dataset_path " "leads to a CSV file." ), type=str, @@ -37,7 +37,7 @@ def add_csv_column_names_arg(parser: ArgumentParser) -> None: parser.add_argument( "--csv_column_names", help=wrap( - "The column names to extract address and tags in the CSV. Need to be specified if the provided " + "The column names to extract addresses and tags in the CSV. It needs to be specified if the provided " "dataset_path leads to a CSV file. Column names have to be separated by a whitespace. For" "example, --csv_column_names column1 column2. By default, None." ), diff --git a/deepparse/cli/retrain.py b/deepparse/cli/retrain.py index 5d070a6b..7ba8c7eb 100644 --- a/deepparse/cli/retrain.py +++ b/deepparse/cli/retrain.py @@ -64,7 +64,7 @@ def handle_prediction_tags(parsed_args): def main(args=None) -> None: # pylint: disable=too-many-locals, too-many-branches """ - CLI function to rapidly retrain an addresses parser and saves it. One can retrain a base pretrained model + CLI function to easily retrain an address parser and save it. One can retrain a base pretrained model using most of the arguments as the :meth:`~AddressParser.retrain` method. By default, all the parameters have the same default value as the :meth:`~AddressParser.retrain` method. The supported parameters are the following: @@ -86,7 +86,7 @@ def main(args=None) -> None: retrain fasttext ./train_dataset_path.csv - Using a gpu device + Using a GPU device .. 
code-block:: sh @@ -142,7 +142,7 @@ def main(args=None) -> None: def get_parser() -> argparse.ArgumentParser: - """Return ArgumentParser for the cli.""" + """Return ArgumentParser for the CLI.""" parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) @@ -198,8 +198,8 @@ def get_parser() -> argparse.ArgumentParser: "--logging_path", help=wrap( "The logging path for the checkpoints and the retrained model. " - "Note that training creates checkpoints, and we use Poutyne library that use the best epoch " - "model and reloads the state if any checkpoints are already there. " + "Note that training creates checkpoints, and we use the Poutyne library that uses the best epoch " + "model and reloads the state if any checkpoints are already there. " "Thus, an error will be raised if you change the model type. For example, " "you retrain a FastText model and then retrain a BPEmb in the same logging path directory." "By default, the path is './checkpoints'." @@ -241,7 +241,7 @@ def get_parser() -> argparse.ArgumentParser: help=wrap( "Path to a JSON file of prediction tags to use to retrain. Tags are in a key-value style, where " "the key is the tag name, and the value is the index one." - "The last element has to be an EOS tag. Read the doc for more detail about EOS tag." + "The last element has to be an EOS tag. Read the documentation for more details about the EOS tag." 
), default=None, type=str, diff --git a/deepparse/cli/test.py b/deepparse/cli/test.py index 648e3dc7..853505c0 100644 --- a/deepparse/cli/test.py +++ b/deepparse/cli/test.py @@ -108,7 +108,7 @@ def main(args=None) -> None: def get_parser() -> argparse.ArgumentParser: - """Return ArgumentParser for the cli.""" + """Return ArgumentParser for the CLI.""" parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) diff --git a/deepparse/comparer/addresses_comparer.py b/deepparse/comparer/addresses_comparer.py index c7dabf06..8ed24400 100644 --- a/deepparse/comparer/addresses_comparer.py +++ b/deepparse/comparer/addresses_comparer.py @@ -10,8 +10,9 @@ @dataclass(frozen=True) class AddressesComparer: """ - Address comparer to compare addresses with each other and retrieves the differences between them. The addresses - are parsed using an address parser based on one of the seq2seq pretrained networks, either with fastText or BPEmb. + Address comparer is used to compare addresses with each other and retrieve the differences between them. The + addresses are parsed using an address parser based on one of the seq2seq pretrained networks, either with + FastText or BPEmb. The address comparer can compare already parsed addresses. The address parser first recomposes the raw addresses then suggest its own tags; then it makes a comparison with the tags of the source parsing and the @@ -43,12 +44,12 @@ def compare_tags( raw address from the parsing, AddressParser generates tags and compares the two parsings. Args: - addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuple that contains + addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuples that contain the tags for the address components from the source. Can compare multiple parsings if passed as a list of tuples. - with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report. 
- The probabilities are not compared but only included in the report. - The default value is None, which means not taking into account. + with_prob (Union[None, bool]): An option flag to either or not include probabilities in the comparison + report. The probabilities are not compared but only included in the report. The default value is + ``None``, which means not taking into account. Return: Either a :class:`~FormattedComparedAddressesTags` or a list of :class:`~FormattedComparedAddressTags` @@ -123,15 +124,14 @@ def compare_raw( ) -> List[FormattedComparedAddressesRaw]: """ Compare a list of raw addresses together. It starts by parsing the addresses - with the setted parser and then return the differences between the addresses components - retrieved with our model. + with the parser and then return the differences between the parsed address components of the two addresses. Args: raw_addresses_to_compare (Union[Tuple[str], List[Tuple[str]]]): List of strings that represent raw addresses to compare. - with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report. - The probabilities are not compared but only included in the report. - The default value is None, which means not taking into account. + with_prob (Union[None, bool]): An option flag to either or not include probabilities in the comparison + report. The probabilities are not compared but only included in the report. The default value is + ``None``, which means not taking into account. Return: Either a :class:`~FormattedComparedAddressesRaw` or a list of @@ -184,8 +184,8 @@ def compare_raw( @staticmethod def _format_comparisons_dict(comparison_tuples: List, origin_tuple: Tuple[str, str], with_prob: bool) -> List[Dict]: """ - Return formatted dict that contains two FormattedParsedAddress and the origin name tuple and output it in a - dict format. 
+ Return formatted dictionary that contains two FormattedParsedAddress and the origin name tuple and output it + in a dictionary format. """ list_of_formatted_comparisons_dict = [] diff --git a/deepparse/comparer/formatted_compared_addresses.py b/deepparse/comparer/formatted_compared_addresses.py index f90f699e..b96f15b8 100644 --- a/deepparse/comparer/formatted_compared_addresses.py +++ b/deepparse/comparer/formatted_compared_addresses.py @@ -110,16 +110,16 @@ def _comparison_report_builder(self) -> str: @abstractmethod def _get_probs(self) -> Dict: """ - To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each - class because they don't use the probabilities the same way. + A method to get the tags from the parsing with their associated probabilities, it needs to be implemented in + each class because they don't use the probabilities the same way. """ @staticmethod def _get_color_diff(string_one: str, string_two: str, highlight: bool = False) -> str: """ - Compare two strings and determine the difference between the two. The differences are noted with colour code; - if the first string has more elements than the second one, it will be noted in one colour; on the contrary, - if the other string has something more, it will have a different colour notation. + Compare two strings and determine the difference between the two. The differences are highlighted with a + coloured scheme; if the first string has more elements than the second one, it will be noted in one colour; + on the contrary, if the other string has something more, it will have a different colour notation. Args: string_one (str): The first string to compare. @@ -129,7 +129,7 @@ def _get_color_diff(string_one: str, string_two: str, highlight: bool = False) - two strings are spaces. The default is False. 
Notes: - the method is colorblind-friendly, which means that the output will be + The method is colorblind-friendly, which means that the output will be in colours that minimize the risk that a user cannot see the difference as defined here https://davidmathlogic.com/colorblind/#%23D81B60-%231E88E5-%23FFC107-%23004D40. @@ -137,7 +137,7 @@ def _get_color_diff(string_one: str, string_two: str, highlight: bool = False) - If the first string has something more than the second one, it will be indicated in blue. If the second string has something more than the first one, it will be noted in yellow. - It uses SequenceMatcher to get the different codes to be later converted into colour codes. + It uses SequenceMatcher to convert the different codes into colour codes later. Return: str: The two strings joined, and the differences are noted in colour codes @@ -176,13 +176,16 @@ def _get_tags_diff_color( verbose: bool = True, ) -> str: """ - Print the output of the string with colour codes that represent the differences between the two strings. + Print the output of the string with colour codes representing the differences between the two strings. Args: - name_one (str, optional) : Name associated with first color. The default value is the first address. - name_two (str, optional) : Name associated with the second colour. The default value is the second address. - verbose (bool, optional): If True, it will print a presentation of the colours and what they mean. - The default value is True. + name_one (str, optional) : Name associated with first color. The default value is ``"first address"``, + namely the first address of the two. We recommend using a whitespace character between the words. + name_two (str, optional) : Name associated with the second colour. The default value is + ``"second address"``, namely the second address of the two. We recommend using a whitespace + character between the words. 
+ verbose (bool, optional): If True, it will print a presentation of the colours and their meaning. + The default value is ``True``. """ @@ -220,7 +223,7 @@ def _get_tags_diff_color( def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tuple]], List[tuple]]) -> List[tuple]: """ - Compare addresses components and put the differences in a dictionary where the keys are the + Compare the components between two addresses and put the differences in a dictionary where the keys are the names of the addresses components, and the values are the values of the addresses component. Args: @@ -228,7 +231,7 @@ def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tupl address components' names for the parsed addresses. Return: - List[tuple]: List of tuples that contain all addresses components that differ from each other. + List[tuple]: List of tuples containing the components that differ from the two addresses. """ unique_address_component_names = self._unique_addresses_component_names(parsed_addresses) diff --git a/deepparse/comparer/formatted_compared_addresses_raw.py b/deepparse/comparer/formatted_compared_addresses_raw.py index de94c05d..860f268b 100644 --- a/deepparse/comparer/formatted_compared_addresses_raw.py +++ b/deepparse/comparer/formatted_compared_addresses_raw.py @@ -12,8 +12,8 @@ class FormattedComparedAddressesRaw(FormattedComparedAddresses): def _get_probs(self) -> Dict: """ - To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each - class because they don't use the probabilities the same way. + Method to get the tags from the parsing with their associated probabilities, a method needs to be + implemented in each class because they don't use the probabilities the same way. 
""" return { self.first_address.raw_address: self.first_address.address_parsed_components, diff --git a/deepparse/comparer/formatted_compared_addresses_tags.py b/deepparse/comparer/formatted_compared_addresses_tags.py index 775335d8..c071194d 100644 --- a/deepparse/comparer/formatted_compared_addresses_tags.py +++ b/deepparse/comparer/formatted_compared_addresses_tags.py @@ -12,7 +12,7 @@ class FormattedComparedAddressesTags(FormattedComparedAddresses): def _get_probs(self) -> Dict: """ - To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each + To get the tags from the parsing with their associated probabilities, a method needs to be implemented in each class because they don't use the probabilities the same way. """ return { diff --git a/deepparse/converter/data_padder.py b/deepparse/converter/data_padder.py index e21c32cb..ef434381 100644 --- a/deepparse/converter/data_padder.py +++ b/deepparse/converter/data_padder.py @@ -22,13 +22,15 @@ def pad_word_embeddings_batch( Tuple[Tuple[torch.Tensor, List, torch.Tensor], torch.Tensor], ]: """ - Method to pad a batch of word embeddings sequences and their targets to the length of the longest one. + A method to apply padding to a batch of word embeddings sequences and their targets to the length of the + longest one. + Args: batch (list[Tuple[list, list]]): a list of tuples where the first element is a list of word embeddings (the sequence) and the second is a list of targets. teacher_forcing (bool): if True, the padded target vectors are returned twice, once with the sequences and their lengths, and once on their own. This enables - the use of teacher forcing during the training of sequence to sequence models. + the use of teacher forcing during the training of sequence-to-sequence models. 
Return: A tuple of two elements: - a tuple containing either a @@ -52,7 +54,7 @@ def pad_word_embeddings_batch( def pad_word_embeddings_sequences(self, sequences_batch: List) -> Tuple[torch.Tensor, List]: """ - Method to pad a batch of word embeddings sequences. + A method to apply batch padding to sequences of word embeddings. Args: sequences_batch (list): a tuple containing lists of word embeddings (the sequences) Return: @@ -81,15 +83,17 @@ def pad_subword_embeddings_batch( Tuple[Tuple[torch.Tensor, List, List, torch.Tensor], torch.Tensor], ]: """ - Method to pad a batch of subword embeddings sequences and their targets to the length of the longest one. + A method to apply padding to a batch of subword embeddings sequences and their targets to the length of the + longest one. + Args: batch (list[Tuple[Tuple[list, list], list]]): a list of tuples containing the two following elements: - - a tuple where the first element is a list of words represented as subword embeddings and the + - a tuple where the first element is a list of words represented as subword embeddings, and the second element is a list of the number of subword embeddings that each word is decomposed into. - a list of targets. teacher_forcing (bool): if True, the padded target vectors are returned twice, once with the sequences and their lengths, and once on their own. This enables - the use of teacher forcing during the training of sequence to sequence models. + the use of teacher forcing during the training of sequence-to-sequence models. Return: A tuple of two elements: - A tuple (``x``, ``y`` , ``z``). The element ``x`` is a :class:`~torch.Tensor` of @@ -122,9 +126,9 @@ def pad_subword_embeddings_sequences( self, sequences_batch: List[Tuple[List, List]] ) -> Tuple[torch.Tensor, List, List]: """ - Method to pad a batch of subword embeddings sequences. + A method to apply padding to a batch of subword embeddings sequences. 
Args: - sequences_batch (list[Tuple[list, list]]): a list of tuple containing tuples of two elements: + sequences_batch (list[Tuple[list, list]]): a list of tuples containing tuples of two elements: - a list of lists representing words as lists of subword embeddings. - a list of the number of subword embeddings that each word is decomposed into. Return: @@ -158,7 +162,7 @@ def pad_subword_embeddings_sequences( def pad_targets(self, target_batch: List) -> torch.Tensor: """ - Method to pad a batch of target indices to the longest one. + A method to apply padding to a batch of target indices to the longest one. Args: target_batch (list): a tuple containing lists of target indices. Return: @@ -170,7 +174,7 @@ def pad_targets(self, target_batch: List) -> torch.Tensor: def _extract_word_embeddings_sequences_and_target(self, batch: List[Tuple[List, List]]) -> Tuple[List, List]: """ - Method that takes a list of word embedding sequences and targets and zips the + A method that takes a list of word embedding sequences and targets and zips the sequences together and the targets together. """ sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True) diff --git a/deepparse/converter/data_processor.py b/deepparse/converter/data_processor.py index dd45e25a..8b708cdc 100644 --- a/deepparse/converter/data_processor.py +++ b/deepparse/converter/data_processor.py @@ -37,7 +37,7 @@ def process_for_inference( self, addresses: List[str] ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, List, torch.Tensor]]: """ - Method to vectorize addresses for inference. + A method to vectorize the addresses for inference. Args: addresses (List[str]): a list of addresses Return: @@ -60,13 +60,13 @@ def process_for_training( ], ]: """ - Method to vectorize addresses and tags for training. + A method to vectorize the addresses and the tags for training. 
Args: addresses_and_targets (List[Tuple[str, List[str]]]): a list of tuples where the first element is an address and the second is a list of tags. teacher_forcing (bool): if True, the padded target vectors are returned twice, once with the sequences and their lengths, and once on their own. This enables - the use of teacher forcing during the training of sequence to sequence models. + the use of teacher forcing during the training of sequence-to-sequence models. Return: A padded batch. Check out :meth:`~deepparse.converter.DataPadder.pad_word_embeddings_batch` and :meth:`~DataPadder.pad_subword_embeddings_batch` for more details. diff --git a/deepparse/converter/target_converter.py b/deepparse/converter/target_converter.py index c55e4017..34825c5c 100644 --- a/deepparse/converter/target_converter.py +++ b/deepparse/converter/target_converter.py @@ -3,7 +3,7 @@ class TagsConverter: """ - Class to define logic of tag to idx conversion and vice versa. + Class to define the logic of tag to idx conversion and vice versa. Args: tags_to_idx (Dict): A dictionary where the keys are the tags (e.g. StreetNumber) and the values are @@ -16,7 +16,8 @@ def __init__(self, tags_to_idx: Dict) -> None: def __call__(self, key: Union[str, int]) -> int: """ - If str convert from a tag to idx and if int convert from a idx to a tag using the convert table. + If it is a ``str``, it will convert from a "tag" to an IDX, and if ``int``, it will convert from an IDX to + a "tag" using the convert table. 
""" if isinstance(key, str): return self.tags_to_idx[key] diff --git a/deepparse/dataset_container/dataset_container.py b/deepparse/dataset_container/dataset_container.py index 9ffd7588..0448ac5b 100644 --- a/deepparse/dataset_container/dataset_container.py +++ b/deepparse/dataset_container/dataset_container.py @@ -21,28 +21,28 @@ class DatasetContainer(Dataset, ABC): For a training container, it validates the following: - - all addresses are not None value, - - all addresses are not empty, - - all addresses are not whitespace string, - - all tags are not empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and + - no address is a ``None`` value, + - no address is empty, + - no address is composed of only whitespace, + - no tags list is empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and - if the addresses (whitespace-split) are the same length as their respective tags list. While for a predict container (unknown prediction tag), it validates the following: - - all addresses are not None, - - all addresses are not empty, and - - all addresses are not whitespace string. + - no address is a ``None`` value, + - no address is empty, and + - no address is composed of only whitespace. Args: is_training_container (bool): Either or not, the dataset container is a training container. This will determine the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags. - The default value is true. + The default value is ``True``. """ @abstractmethod def __init__(self, is_training_container: bool = True) -> None: """ - Need to be defined by the child class. + The method to init the class. It needs to be defined by the child's class. """ self.data = None self.is_training_container = is_training_container @@ -59,7 +59,7 @@ def __getitem__( - it can be a list of string items (e.g. a list of addresses (str)), or - it can be a unique string item (e.g. one address). 
- If the DatasetContainer is a training one: + If the DatasetContainer is a "training" one: - it can be a list of tuple (str, list) items, namely a list of parsed examples (e.g. an address with the tags), or @@ -114,12 +114,14 @@ def _training_validation(self) -> None: if not self._data_tags_is_same_len_then_address(): print( - f"Some addresses (whitespace-split) and the associated tags are not the same len. " - f"If you are using a CSVDatasetContainer, consider using the tag_seperator_reformat_fn argument." + f"Some addresses (whitespace-split) and the associated tags are not the same length. " + f"If you use a CSVDatasetContainer, consider using the tag_seperator_reformat_fn argument." f"Here is the report of those cases where len differ to help you out:\n" f"{self._data_tags_not_the_same_len_diff()}" ) - raise DataError("Some addresses (whitespace-split) and the tags associated with them are not the same len.") + raise DataError( + "Some addresses (whitespace-split) and the tags associated with them are not the same length." + ) def _data_is_list_of_tuple(self) -> bool: """ @@ -157,28 +159,28 @@ class PickleDatasetContainer(DatasetContainer): The dataset needs to be a list of tuples where the first element of each tuple is the address (a string), and the second is a list of the expected tag to predict (e.g. ``[('an address', ['a_tag', 'another_tag']), ...]``). - The len of the tags needs to be the same as the len of the address when whitespace split. + The length of the tags needs to be the same as the length of the address when the whitespace-split is used. 
For a training container, the validation tests applied on the dataset are the following: - - all addresses are not None value, - - all addresses are not empty, - - all addresses are not whitespace string, - - all tags are not empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and + - no address is a ``None`` value, + - no address is empty, + - no address is composed of only whitespace, + - no tags list is empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and - if the addresses (whitespace-split) are the same length as their respective tags list. While for a predict container (unknown prediction tag), the validation tests applied on the dataset are the following: - - all addresses are not None value, - - all addresses are not empty, and - - all addresses are not whitespace string. + - no address is a ``None`` value, + - no address is empty, and + - no address is composed of only whitespace. Args: data_path (str): The path to the pickle dataset file. is_training_container (bool): Either or not, the dataset container is a training container. This will determine the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags. - The default value is true. + The default value is ``True``. """ @@ -202,25 +204,26 @@ def _test_predict_container_is_list_of_tuple(self) -> bool: class CSVDatasetContainer(DatasetContainer): """ - CSV dataset container that imports a CSV of addresses. If the dataset is a predict one, it needs to have at least - one column with some addresses. If the dataset is a training one (with prediction tags), it needs to have at + CSV dataset container that imports a CSV of addresses. If the dataset is a predict one, it must have at least + one column with some addresses. If the dataset is a training one (with prediction tags), it must have at least two columns, one with some addresses and another with a list of tags for each address. 
After loading the CSV dataset, some tests will be applied depending on its type. For a training container, the validation tests applied on the dataset are the following: - - all addresses are not None value, - - all addresses are not empty, - - all addresses are not whitespace string, and + - no address is a ``None`` value, + - no address is empty, + - no address is composed of only whitespace, + - no tags list is empty, if data is a list of tuple (``[('an address', ['a_tag', 'another_tag']), ...]``), and - if the addresses (whitespace-split) are the same length as their respective tags list. While for a predict container (unknown prediction tag), the validation tests applied on the dataset are the following: - - all addresses are not None value, - - all addresses are not empty, and - - all addresses are not whitespace string. + - no address is a ``None`` value, + - no address is empty, and + - no address is composed of only whitespace. Args: @@ -231,7 +234,7 @@ class CSVDatasetContainer(DatasetContainer): of exactly two elements: addresses and tags. is_training_container (bool): Either or not, the dataset container is a training container. This will determine the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags. - The default value is true. + The default value is ``True``. separator (str): The CSV columns separator to use. By default, ``"\\t"``. tag_seperator_reformat_fn (Callable, optional): A function to parse a tags string and return a list of address tags. For example, if the tag column is a former Python list saved with pandas, the characters ``]`` @@ -240,7 +243,7 @@ class CSVDatasetContainer(DatasetContainer): That is, it removes the ``[],`` characters and splits the sequence at each comma (``","``). csv_reader_kwargs (dict, optional): Keyword arguments to pass to pandas ``read_csv`` use internally. 
By default, the ``data_path`` is passed along with our default ``sep`` value ( ``"\\t"``) and the ``"utf-8"`` encoding - format. However, this can be overridden by using this argument again. + format. However, this can be overridden by using this argument again. """ def __init__( @@ -256,13 +259,13 @@ def __init__( if is_training_container: if isinstance(column_names, str): raise ValueError( - "When the dataset is a training container, the column names should be a list of column name." + "When the dataset is a training container, the column names should be a list of column names." ) if len(column_names) != 2: raise ValueError("When the dataset is a training container, two column names must be provided.") else: # It means it is a predict container if isinstance(column_names, str): - # We transform the str into a list to assess is len + # We transform the str into a list to assess its length column_names = [column_names] if len(column_names) != 1: raise ValueError("When the dataset is a predict container, one column name must be provided.") @@ -302,7 +305,7 @@ class ListDatasetContainer(DatasetContainer): identical as the :class:`~deepparse.dataset_container.PickleDatasetContainer`. is_training_container (bool): Either or not, the dataset container is a training container. This will determine the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags. - The default value is true. + The default value is ``True``. """ def __init__(self, data: List, is_training_container: bool = True) -> None: diff --git a/deepparse/dataset_container/tools.py b/deepparse/dataset_container/tools.py index 133522aa..5ef211c6 100644 --- a/deepparse/dataset_container/tools.py +++ b/deepparse/dataset_container/tools.py @@ -13,9 +13,9 @@ def former_python_list(tags: str) -> List: Return: A list of the parsed tag set. """ - # We remove the [ and ] of the list. + # We remove the ``"["`` and ``"]"`` from the list.
# Then, we split each element using a comma as a separator. - # Finally, in some cases, the element are separated by a comma (e.g. element1,element2) + # Finally, in some cases, the elements are separated by a comma (e.g. element1,element2) # or a comma and a whitespace (e.g. element1, element2), we strip the whitespace on all tags to # remove the trailing whitespace when a coma and a whitespace separate elements. # To fix https://github.com/GRAAL-Research/deepparse/issues/124. diff --git a/deepparse/download_tools.py b/deepparse/download_tools.py index dd6759d1..035c3a47 100644 --- a/deepparse/download_tools.py +++ b/deepparse/download_tools.py @@ -35,9 +35,9 @@ def download_fasttext_magnitude_embeddings(cache_dir: str, verbose: bool = True, offline: bool = False) -> str: """ - Function to download the magnitude pretrained fastText model. + Function to download the magnitude pretrained FastText model. - Return the full path to the fastText embeddings. + Return the full path to the FastText embeddings. """ os.makedirs(cache_dir, exist_ok=True) @@ -48,7 +48,7 @@ def download_fasttext_magnitude_embeddings(cache_dir: str, verbose: bool = True, if not os.path.isfile(file_name) and not offline: if verbose: print( - "The fastText pretrained word embeddings will be download in magnitude format (2.3 GO), " + "The FastText pretrained word embeddings will be download in magnitude format (2.3 GO), " "this process will take several minutes." ) extension = extension + ".gz" @@ -67,7 +67,7 @@ def download_weights(model_filename: str, saving_dir: str, verbose: bool = True) Args: model_filename: The network type (i.e. ``fasttext`` or ``bpemb``). saving_dir: The path to the saving directory. - verbose (bool): Either or not to be verbose during the download of a model. The default value is True. + verbose (bool): Either or not to be verbose during the download of a model. The default value is ``True``. 
""" if verbose: print(f"Downloading the pre-trained weights for the network {model_filename}.") @@ -83,7 +83,7 @@ def download_weights(model_filename: str, saving_dir: str, verbose: bool = True) def download_from_public_repository(file_name: str, saving_dir: str, file_extension: str) -> None: """ - Simple function to download the content of a file from Deepparse public repository. + Simple function to download the content of a file from the Deepparse public repository. The repository URL string is `'https://graal.ift.ulaval.ca/public/deepparse/{}.{}'`` where the first bracket is the file name and the second is the file extension. """ @@ -97,7 +97,7 @@ def download_from_public_repository(file_name: str, saving_dir: str, file_extens def download_models(saving_cache_path: Union[Path, None] = None) -> None: """ - Function to download all the pretrained models. It will download all the models checkpoint and version file. + Function to download all the pretrained models. It will download all the model's checkpoints and version files. Args: saving_cache_path: The path to the saving cache directory for the specified model. @@ -129,7 +129,7 @@ def download_model( elif "bpemb" in model_type: BPEmb( lang="multi", vs=100000, dim=300, cache_dir=saving_cache_path - ) # The class manage the download of the pretrained words embedding + ) # The class manages the download of the pretrained words embedding model_type_filename = MODEL_MAPPING_CHOICES[model_type] model_path = os.path.join(saving_cache_path, f"{model_type_filename}.ckpt") @@ -165,15 +165,15 @@ def latest_version(model: str, cache_path: str, verbose: bool) -> bool: except HTTPError as exception: # HTTP connection error handling if HTTP_CLIENT_ERROR_STATUS_CODE <= exception.response.status_code < NEXT_RANGE_STATUS_CODE: - # Case where Deepparse server is down. + # Case where the Deepparse server is down. if verbose: warnings.warn( - f"We where not able to verify the cached model in the cache directory {cache_path}. 
It seems like" - f"Deepparse server is not available at the moment. We recommend to attempt to verify " + f"We could not verify the cached model in the cache directory {cache_path}. It seems like" + f"Deepparse server is not available at the moment. We recommend attempting to verify " f"the model version another time using our download CLI function.", category=RuntimeWarning, ) - # The is_lastest_version is set to True even if we were not able to validate the version. We do so not to + # The is_latest_version is set to True even if we cannot validate the version. We do so not to # block the rest of the process. is_latest_version = True else: @@ -182,15 +182,15 @@ def latest_version(model: str, cache_path: str, verbose: bool) -> bool: raise except MaxRetryError: # Case where the user does not have an Internet connection. For example, one can run it in a - # Docker container not connected to the Internet. + # Docker container that is not connected to the Internet. if verbose: warnings.warn( - f"We where not able to verify the cached model in the cache directory {cache_path}. It seems like" - f"you are not connected to the Internet. We recommend to verify if you have the latest using our " + f"We could not verify the cached model in the cache directory {cache_path}. It seems like" + f"you are not connected to the Internet. We recommend verifying if you have the latest using our " f"download CLI function.", category=RuntimeWarning, ) - # The is_lastest_version is set to True even if we were not able to validate the version. We do so not to + # The is_latest_version is set to True even if we cannot validate the version. We do so not to # block the rest of the process.
is_latest_version = True finally: @@ -203,7 +203,7 @@ def latest_version(model: str, cache_path: str, verbose: bool) -> bool: # pylint: disable=pointless-string-statement FASTTEXT_COPYRIGHT_MIT_LICENSE = """ -The code below was copied from the fastText project, and has been modified for the purpose of this package. +The code below was copied from the FastText project, and has been modified for the purpose of this package. COPYRIGHT @@ -237,11 +237,11 @@ def latest_version(model: str, cache_path: str, verbose: bool) -> bool: def download_fasttext_embeddings(cache_dir: str, verbose: bool = True, offline: bool = False) -> str: """ - Simpler version of the download_model function from fastText to download pretrained common-crawl - vectors from fastText's website https://fasttext.cc/docs/en/crawl-vectors.html and save it in the + Simpler version of the download_model function from FastText to download pretrained common-crawl + vectors from FastText's website https://fasttext.cc/docs/en/crawl-vectors.html and save it in the saving directory (saving_dir). - Return the full path to the fastText embeddings. + Return the full path to the FastText embeddings. 
""" os.makedirs(cache_dir, exist_ok=True) @@ -258,21 +258,21 @@ def download_fasttext_embeddings(cache_dir: str, verbose: bool = True, offline: shutil.copyfileobj(f, f_out) os.remove(os.path.join(cache_dir, gz_file_name)) - return file_name_path # return the full path to the fastText embeddings + return file_name_path # return the full path to the FastText embeddings # Now use a saving path and don't return a bool def download_gz_model(gz_file_name: str, saving_path: str, verbose: bool = True) -> None: """ - Simpler version of the _download_gz_model function from fastText to download pretrained common-crawl - vectors from fastText's website https://fasttext.cc/docs/en/crawl-vectors.html and save it in the + Simpler version of the _download_gz_model function from FastText to download pretrained common-crawl + vectors from FastText's website https://fasttext.cc/docs/en/crawl-vectors.html and save it in the saving directory (saving_path). """ url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{gz_file_name}" if verbose: print( - "The fastText pretrained word embeddings will be downloaded (6.8 GO), " + "The FastText pretrained word embeddings will be downloaded (6.8 GO), " "this process will take several minutes." ) _download_file(url, saving_path, verbose=verbose) diff --git a/deepparse/embeddings_models/bpemb_embeddings_model.py b/deepparse/embeddings_models/bpemb_embeddings_model.py index 414bb78d..16abdb21 100644 --- a/deepparse/embeddings_models/bpemb_embeddings_model.py +++ b/deepparse/embeddings_models/bpemb_embeddings_model.py @@ -19,7 +19,7 @@ class BPEmbEmbeddingsModel(EmbeddingsModel): Params: cache_dir (str): Path to the cache directory to the embeddings' bin vector and the model. - verbose (bool): Wether or not to make the loading of the embeddings verbose. + verbose (bool): Whether or not to make the loading of the embeddings verbose. 
""" def __init__(self, cache_dir: str, verbose: bool = True) -> None: @@ -53,7 +53,7 @@ def no_ssl_verification(): Reference: https://gist.github.com/ChenTanyi/0c47652bd916b61dc196968bca7dad1d. - Will be removed when https://github.com/bheinzerling/bpemb/issues/63 is resolved. + It will be removed when https://github.com/bheinzerling/bpemb/issues/63 is resolved. """ opened_adapters = set() old_merge_environment_settings = requests.Session.merge_environment_settings diff --git a/deepparse/embeddings_models/embeddings_model_factory.py b/deepparse/embeddings_models/embeddings_model_factory.py index 6360f752..c5aae0e2 100644 --- a/deepparse/embeddings_models/embeddings_model_factory.py +++ b/deepparse/embeddings_models/embeddings_model_factory.py @@ -11,12 +11,12 @@ def create(self, embedding_model_type: str, cache_dir: str, verbose: bool = True """ Embeddings model creation method. Args: - embeddings_model_type (str): the type of the embeddings model to create. Valid options: + embedding_model_type (str): the type of the embeddings model to create. Valid options: - bpemb - fasttext - fasttext_magnitude cache_dir (str): Path to the cache directory where the embeddings model exists or is to be downloaded. - verbose (bool): Wether or not to make the loading of the embeddings verbose. + verbose (bool): Whether or not to make the loading of the embeddings verbose. Return: An :class:`~EmbeddingsModel` """ diff --git a/deepparse/embeddings_models/fasttext_embeddings_model.py b/deepparse/embeddings_models/fasttext_embeddings_model.py index b4b34e08..abb1e5a4 100644 --- a/deepparse/embeddings_models/fasttext_embeddings_model.py +++ b/deepparse/embeddings_models/fasttext_embeddings_model.py @@ -18,7 +18,7 @@ class FastTextEmbeddingsModel(EmbeddingsModel): Note: Since Windows uses ``spawn`` instead of ``fork`` during multiprocess (for the data loading pre-processing - ``num_worker`` > 0) we use the Gensim model, which takes more RAM (~10 GO) than the Fasttext one (~8 GO). 
+ ``num_worker`` > 0), we use the Gensim model, which takes more RAM (~10 GO) than the Fasttext one (~8 GO). It also takes a longer time to load. See here the `issue `_. """ @@ -39,7 +39,7 @@ def __call__(self, word: str) -> ndarray: word (str): Word to get vector. Return: - The fastText embedding for a word. + The FastText embedding for a word. """ return self.model[word] diff --git a/deepparse/embeddings_models/magnitude_embeddings_model.py b/deepparse/embeddings_models/magnitude_embeddings_model.py index f2d85720..0cd40ac6 100644 --- a/deepparse/embeddings_models/magnitude_embeddings_model.py +++ b/deepparse/embeddings_models/magnitude_embeddings_model.py @@ -7,7 +7,7 @@ class MagnitudeEmbeddingsModel(EmbeddingsModel): """ FastText embeddings network from `Enriching Word Vectors with Subword Information `_ using the magnitude mapping - (`_), which reduce memory footprint. + (`_), which reduces the memory footprint. Args: embeddings_path (str): Path to the bin embeddings vector (.bin). @@ -20,13 +20,13 @@ def __init__(self, embeddings_path: str, verbose: bool = True) -> None: def __call__(self, words: str) -> ndarray: """ - Callable method to get word vector of a complete address. + Callable method to get the word vector of a complete address. Args: words (str): Address to get vector for words. Return: - The fastText embedding for a list of words. + The FastText embedding for a list of words. """ - # we leverage the multiple word query which are faster than single word query + # We leverage the multiple-word query which is faster than a single word query return self.model.query(words.split()) diff --git a/deepparse/errors/data_error.py b/deepparse/errors/data_error.py index 20f41a9c..e829c06b 100644 --- a/deepparse/errors/data_error.py +++ b/deepparse/errors/data_error.py @@ -1,6 +1,6 @@ class DataError(Exception): """ - User error when data is not construct as expected. + User error occurs when the data structure is not as expected. 
""" def __init__(self, value: str) -> None: diff --git a/deepparse/errors/model_error.py b/deepparse/errors/model_error.py index 8ee4196a..889b26c1 100644 --- a/deepparse/errors/model_error.py +++ b/deepparse/errors/model_error.py @@ -1,6 +1,6 @@ class FastTextModelError(Exception): """ - User error when user uses a FastText-like model on an OS that does not support properly multithreading. + User error occurs when a user uses a FastText-like model on an OS that does not correctly support multithreading. """ def __init__(self, value: str) -> None: diff --git a/deepparse/errors/server_error.py b/deepparse/errors/server_error.py index 9e98dc02..903e5c13 100644 --- a/deepparse/errors/server_error.py +++ b/deepparse/errors/server_error.py @@ -1,6 +1,6 @@ class ServerError(Exception): """ - User error when Deepparse server is not responding. + User error occurs when the Deepparse server is not responding. """ def __init__(self, value: str) -> None: diff --git a/deepparse/metrics/accuracy.py b/deepparse/metrics/accuracy.py index ed8dbeeb..f1ca46af 100644 --- a/deepparse/metrics/accuracy.py +++ b/deepparse/metrics/accuracy.py @@ -2,8 +2,8 @@ from poutyne.framework.metrics import acc -def accuracy(pred: torch.Tensor, ground_truth: torch.Tensor) -> float: +def accuracy(predictions: torch.Tensor, ground_truths: torch.Tensor) -> float: """ Accuracy per tag. """ - return acc(pred.transpose(0, 1).transpose(-1, 1), ground_truth) + return acc(predictions.transpose(0, 1).transpose(-1, 1), ground_truths) diff --git a/deepparse/metrics/nll_loss.py b/deepparse/metrics/nll_loss.py index 4ea1042f..92cf5530 100644 --- a/deepparse/metrics/nll_loss.py +++ b/deepparse/metrics/nll_loss.py @@ -4,13 +4,13 @@ criterion = NLLLoss() -def nll_loss(pred: torch.Tensor, ground_truth: torch.Tensor) -> float: +def nll_loss(predictions: torch.Tensor, ground_truths: torch.Tensor) -> float: """ - NLL loss compute per tag. + NLL loss to compute loss per tag. 
""" loss = 0 - ground_truth = ground_truth.transpose(0, 1) - for i in range(pred.size(0)): - loss += criterion(pred[i], ground_truth[i]) + ground_truths = ground_truths.transpose(0, 1) + for i in range(predictions.size(0)): + loss += criterion(predictions[i], ground_truths[i]) return loss diff --git a/deepparse/network/bpemb_seq2seq.py b/deepparse/network/bpemb_seq2seq.py index 0195eb31..d08db831 100644 --- a/deepparse/network/bpemb_seq2seq.py +++ b/deepparse/network/bpemb_seq2seq.py @@ -10,22 +10,25 @@ class BPEmbSeq2SeqModel(Seq2SeqModel): """ - BPEmb Seq2Seq network, the best of the two model we propose, but takes more ``GPU``/``CPU`` resources. + BPEmb Seq2Seq network is the best of the two proposed models but takes more ``GPU``/``CPU`` resources. Args: cache_dir (str): The path to the cached directory to use for downloading (and loading) the model weights. - device (~torch.device): The device tu use for the prediction. - input_size (int): The input size of the encoder (i.e. the embeddings size). It will also be used to initialize - the internal embeddings network input size, hidden size and output dim. The default value is 300. - encoder_hidden_size (int): The size of the hidden layer(s) of the encoder. The default value is 1024. - encoder_num_layers (int): The number of hidden layers of the encoder. The default value is 1. - decoder_hidden_size (int): The size of the hidden layer(s) of the decoder. The default value is 1024. - decoder_num_layers (int): The number of hidden layers of the decoder. The default value is 1. - output_size (int): The size of the prediction layers (i.e. the number of tag to predict). - attention_mechanism (bool): Either or not to use attention mechanism. The default value is False. - verbose (bool): Turn on/off the verbosity of the model. The default value is True. - path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. + device (~torch.device): The device to use for the prediction. 
+ input_size (int): The input size of the encoder (i.e. the size of the embedding). It will also be used to + initialize the internal embeddings network input size, hidden size and output dim. The default value is + ``300``. + encoder_hidden_size (int): The size of the hidden layer(s) of the encoder. The default value is ``1024``. + encoder_num_layers (int): The number of hidden layers of the encoder. The default value is ``1``. + decoder_hidden_size (int): The size of the hidden layer(s) of the decoder. The default value is ``1024``. + decoder_num_layers (int): The number of hidden layers of the decoder. The default value is ``1``. + output_size (int): The size of the prediction layers (i.e. the number of tags to predict). The default value is + ``9``. + attention_mechanism (bool): Either or not to use the attention mechanism. The default value is ``False``. + verbose (bool): Turn on/off the verbosity of the model. The default value is ``True``. + path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. The default + value is ``None``. """ def __init__( @@ -92,9 +95,10 @@ def forward( to_predict (~torch.Tensor): The elements to predict the tags. decomposition_lengths (list) : The lengths of the decomposed words of the batch elements (since packed). lengths (list) : The lengths of the batch elements (since packed). - target (~torch.LongTensor) : The target of the batch element, use only when we retrain the model since we do + target (~torch.LongTensor) : The target of the batch element, used only when we retrain the model since + we do `teacher forcing `_. - Default value is None since we mostly don't have the target except for retrain. + The default value is ``None`` since we mostly don't have the target except for retraining. Return: A Tensor of the predicted sequence.
""" diff --git a/deepparse/network/decoder.py b/deepparse/network/decoder.py index f0c347ab..c434344b 100644 --- a/deepparse/network/decoder.py +++ b/deepparse/network/decoder.py @@ -11,7 +11,7 @@ class Decoder(nn.Module): """ - Decoder module that use a LSTM to decode a previously encoded sequence and a linear layer to map + Decoder module that uses a LSTM to decode a previously encoded sequence and a linear layer to map the decoded sequence tags. Args: @@ -33,7 +33,7 @@ def __init__( super().__init__() self.attention_mechanism = attention_mechanism if attention_mechanism: - # Since layer also have attention mechanism + # Since layer also has attention mechanism self.hidden_size = hidden_size input_size = input_size + hidden_size self._attention_mechanism_set_up() diff --git a/deepparse/network/embedding_network.py b/deepparse/network/embedding_network.py index 31721569..297909c1 100644 --- a/deepparse/network/embedding_network.py +++ b/deepparse/network/embedding_network.py @@ -1,7 +1,7 @@ # Bug with PyTorch source code makes torch.tensor as not callable for pylint. # pylint: disable=not-callable -# temporary fix for _forward_unimplemented for PyTorch 1.6 https://github.com/pytorch/pytorch/issues/42305 +# Temporary fix for _forward_unimplemented for PyTorch 1.6 https://github.com/pytorch/pytorch/issues/42305 # pylint: disable=W0223 from typing import Tuple, List @@ -13,14 +13,15 @@ class EmbeddingNetwork(nn.Module): """ - Embedding Network to represent the address components byte-pair embedding representation using a LSTM. + Embedding Network to represent the address components byte-pair embedding representation using an LSTM. Args: input_size (int): The input size of the LSTM. hidden_size (int): The hidden size of the LSTM. - num_layers (int): The number of layer of the LSTM. Default is one (1) layer. - maxpool (bool): Either or not to add a maximum pooling layer after the embedding composition. Default is false. 
- maxpool_kernel_size (int): The kernel size of the maximum pooling layer. Default is three (3). + num_layers (int): The number of layers of the LSTM. The default value is ``1``, namely one layer. + maxpool (bool): Either or not to add a maximum pooling layer after the embedding composition. The default + value is ``False``. + maxpool_kernel_size (int): The kernel size of the maximum pooling layer. The default value is ``3``. """ def __init__( @@ -73,7 +74,7 @@ def forward(self, to_predict: torch.Tensor, decomposition_lengths: Tuple[List]) for i in range(to_predict.size(0)): lengths = [] - # reorder decomposition, could use a transpose but take a LOT (like a LOT) of memory + # Reorder decomposition, could use a transpose but take a LOT (like a LOT) of memory for decomposition_length in decomposition_lengths: lengths.append(decomposition_length[i]) @@ -86,15 +87,15 @@ def forward(self, to_predict: torch.Tensor, decomposition_lengths: Tuple[List]) packed_output, _ = self.model(packed_sequence) - # pad packed the output to be applied later on in the projection layer + # Pad packed the output to be applied later on in the projection layer. padded_output, padded_output_lengths = pad_packed_sequence(packed_output, batch_first=True) - # filling the embedding by idx + # Filling the embedding by IDX. word_context = torch.zeros(padded_output.size(0), padded_output.size(2), device=device) for j in range(batch_size): word_context[j] = padded_output[j, padded_output_lengths[j] - 1, :] - # projection layer from dim 600 to 300 + # Projection layer from dim 600 to 300. 
projection_output = self.projection_layer(word_context) if self.maxpooling_layer is not None: diff --git a/deepparse/network/encoder.py b/deepparse/network/encoder.py index 5fafb917..e275d875 100644 --- a/deepparse/network/encoder.py +++ b/deepparse/network/encoder.py @@ -1,4 +1,4 @@ -# temporary fix for _forward_unimplemented for torch 1.6 https://github.com/pytorch/pytorch/issues/42305 +# Temporary fix for _forward_unimplemented for torch 1.6 https://github.com/pytorch/pytorch/issues/42305 # pylint: disable=W0223 from typing import Tuple, List @@ -12,12 +12,12 @@ class Encoder(nn.Module): """ - Encoder module that use a LSTM to encode a sequence. + Encoder module that uses an LSTM to encode a sequence. Args: input_size (int): The input size of the encoder. hidden_size (int): The hidden size of the encoder. - num_layers (int): The number of layer to the encoder. + num_layers (int): The number of layers to the encoder. """ def __init__(self, input_size: int, hidden_size: int, num_layers: int) -> None: diff --git a/deepparse/network/fasttext_seq2seq.py b/deepparse/network/fasttext_seq2seq.py index dd08059a..34ae72e9 100644 --- a/deepparse/network/fasttext_seq2seq.py +++ b/deepparse/network/fasttext_seq2seq.py @@ -9,22 +9,24 @@ class FastTextSeq2SeqModel(Seq2SeqModel): """ - FastText Seq2Seq network, the lightest of the two model we propose (in ``GPU``/``CPU`` consumption) for a little + FastText Seq2Seq network, the lightest of the two models we propose (in ``GPU``/``CPU`` consumption) for a little less accuracy. Args: cache_dir (str): The path to the cached directory to use for downloading (and loading) the model weights. device (~torch.device): The device tu use for the prediction. - input_size (int): The input size of the encoder (i.e. the embeddings size). The default value is 300. - encoder_hidden_size (int): The size of the hidden layer(s) of the encoder. The default value is 1024. - encoder_num_layers (int): The number of hidden layers of the encoder. 
The default value is 1. - decoder_hidden_size (int): The size of the hidden layer(s) of the decoder. The default value is 1024. - decoder_num_layers (int): The number of hidden layers of the decoder. The default value is 1. - output_size (int): The size of the prediction layers (i.e. the number of tag to predict). - attention_mechanism (bool): Either or not to use attention mechanism. The default value is False. - verbose (bool): Turn on/off the verbosity of the model. The default value is True. - path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. + input_size (int): The input size of the encoder (i.e. the size of the embedding). The default value is ``300``. + encoder_hidden_size (int): The size of the encoder's hidden layer(s). The default value is ``1024``. + encoder_num_layers (int): The number of hidden layers of the encoder. The default value is ``1``. + decoder_hidden_size (int): The size of the decoder's hidden layer(s). The default value is ``1024``. + decoder_num_layers (int): The number of hidden layers of the decoder. The default value is ``1``. + output_size (int): The size of the prediction layers (i.e. the number of tags to predict). The default value + is ``9``. + attention_mechanism (bool): Either or not to use the attention mechanism. The default value is ``False``. + verbose (bool): Turn on/off the verbosity of the model. The default value is ``True``. + path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. The default + value is ``None``. """ def __init__( @@ -88,7 +90,7 @@ def forward( lengths (list) : The lengths of the batch elements (since packed). target (~torch.LongTensor) : The target of the batch element, use only when we retrain the model since we do `teacher forcing `_. - Default value is None since we mostly don't have the target except for retrain. + The default value is ``None`` since we mostly don't have the target except for retrain. 
Return: A Tensor of the predicted sequence. diff --git a/deepparse/network/model_factory.py b/deepparse/network/model_factory.py index 4893a9c7..a107ab07 100644 --- a/deepparse/network/model_factory.py +++ b/deepparse/network/model_factory.py @@ -8,7 +8,7 @@ class ModelFactory: """ - A factory for the creation of neural network models that predict the tags from addresses + A factory for creating neural network models that predict the tags from addresses. """ def create( @@ -32,12 +32,14 @@ def create( - bpemb cache_dir (str): The path to the cached directory to use for downloading (and loading) the model weights. - device (~torch.device): The device tu use for the prediction. - output_size (int): The size of the prediction layers (i.e. the number of tag to predict). - attention_mechanism (bool): Either or not to use attention mechanism. The default value is False. - path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. - offline (bool): Wether or not the model is an offline or an online. - verbose (bool): Turn on/off the verbosity of the model. The default value is True. + device (~torch.device): The device to use for the prediction. + output_size (int): The size of the prediction layers (i.e. the number of tags to predict). The default + value is ``9``. + attention_mechanism (bool): Either or not to use the attention mechanism. The default value is ``False``. + path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. The + default value is ``None``. + offline (bool): Whether or not the model is an offline or an online one. The default value is ``False``. + verbose (bool): Turn on/off the verbosity of the model. The default value is ``True``. Return: A :class:`~Seq2SeqModel`. @@ -69,7 +71,7 @@ def create( else: raise NotImplementedError( f""" - There is no {model_type} network implemented. model_type should be either fasttext or bpemb + There is no {model_type} network implemented.
model_type should be either "fasttext" or "bpemb". """ ) diff --git a/deepparse/network/seq2seq.py b/deepparse/network/seq2seq.py index faf5b808..d222b462 100644 --- a/deepparse/network/seq2seq.py +++ b/deepparse/network/seq2seq.py @@ -21,14 +21,15 @@ class Seq2SeqModel(ABC, nn.Module): Args: device (~torch.device): The device tu use for the prediction. - input_size (int): The input size of the encoder (i.e. the embeddings size). The default value is 300. - encoder_hidden_size (int): The size of the hidden layer(s) of the encoder. The default value is 1024. - encoder_num_layers (int): The number of hidden layers of the encoder. The default value is 1. - decoder_hidden_size (int): The size of the hidden layer(s) of the decoder. The default value is 1024. - decoder_num_layers (int): The number of hidden layers of the decoder. The default value is 1. - output_size (int): The size of the prediction layers (i.e. the number of tag to predict). - attention_mechanism (bool): Either or not to use attention mechanism. The default value is False. - verbose (bool): Turn on/off the verbosity of the model. The default value is True. + input_size (int): The input size of the encoder (i.e. the size of the embedding). The default value is ``300``. + encoder_hidden_size (int): The size of the encoder's hidden layer(s). The default value is ``1024``. + encoder_num_layers (int): The number of hidden layers of the encoder. The default value is ``1``. + decoder_hidden_size (int): The size of the decoder's hidden layer(s). The default value is ``1024``. + decoder_num_layers (int): The number of hidden layers of the decoder. The default value is ``1``. + output_size (int): The size of the prediction layers (i.e. the number of tags to predict). The default value is + ``9``. + attention_mechanism (bool): Whether or not to use the attention mechanism. The default value is ``False``. + verbose (bool): Turn on/off the verbosity of the model. The default value is ``True``.
""" def __init__( @@ -80,14 +81,14 @@ def same_output_dim(self, size: int) -> bool: def handle_new_output_dim(self, new_dim: int) -> None: """ - Update the new output dimension + Update the new output dimension. """ self.decoder.linear_layer_set_up(output_size=new_dim) self.output_size = new_dim def _load_pre_trained_weights(self, model_type: str, cache_dir: str, offline: bool) -> None: """ - Method to download and resolved the loading (into the network) of the pretrained weights. + Method to download and resolve the loading (into the network) of the pre-trained weights. Args: model_type (str): The network pretrained weights to load. @@ -126,7 +127,7 @@ def _load_weights(self, path_to_model_torch_archive: str) -> None: path_to_model_to_upload=path_to_model_torch_archive, device=self.device ) - # All the time, our torch archive include meta-data along with the model weights + # All the time, our torch archive includes meta-data along with the model weights. all_layers_params = all_layers_params.get("address_tagger_model") self.load_state_dict(all_layers_params) @@ -154,11 +155,11 @@ def _encoder_step(self, to_predict: torch.Tensor, lengths: List, batch_size: int Args: to_predict (~torch.Tensor): The elements to predict the tags. lengths (list): The lengths of the batch elements (since packed). - batch_size (int): The number of element in the batch. + batch_size (int): The number of elements in the batch. Return: A tuple (``x``, ``y``, ``z``) where ``x`` is the decoder input (a zeros tensor), ``y`` is the decoder - hidden states and ``z`` is the encoder outputs for the attention weighs if needed. + hidden states, and ``z`` is the encoder output for the attention weighs if needed. """ encoder_outputs, decoder_hidden = self.encoder(to_predict, lengths) @@ -181,41 +182,42 @@ def _decoder_step( Args: decoder_input (~torch.Tensor): The decoder input (so the encode output). - decoder_hidden (~torch.Tensor): The encoder hidden state (so the encode hidden state). 
+ decoder_hidden (~torch.Tensor): The encoder's hidden state (so the encode hidden state). encoder_outputs (~torch.Tensor): The encoder outputs for the attention mechanism weighs if needed. - target (~torch.LongTensor) : The target of the batch element, use only when we retrain the model since we do + target (~torch.LongTensor) : The target of the batch element, used only when we retrain the model since + we do `teacher forcing `_. lengths (list): The lengths of the batch elements (since packed). - batch_size (int): Number of element in the batch. + batch_size (int): Number of elements in the batch. Return: A Tensor of the predicted sequence. """ longest_sequence_length = max(lengths) - # The empty prediction sequence - # +1 for the EOS + # The empty prediction sequence. + # +1 for the EOS. prediction_sequence = torch.zeros(longest_sequence_length + 1, batch_size, self.output_size, device=self.device) - # We decode the first token + # We decode the first token. decoder_output, decoder_hidden, attention_weights = self.decoder( decoder_input, decoder_hidden, encoder_outputs, lengths ) if attention_weights is not None: - # We fill the attention + # We fill the attention. attention_output = torch.ones(longest_sequence_length + 1, batch_size, 1, longest_sequence_length) attention_output[0] = attention_weights - # We fill the first token prediction + # We fill the first token prediction. prediction_sequence[0] = decoder_output - # The decoder next step input (the predicted idx of the previous token) + # The decoder's next step input (the predicted idx of the previous token). _, decoder_input = decoder_output.topk(1) - # we loop the same steps for the rest of the sequence + # We loop the same steps for the rest of the sequence. if target is not None and random.random() < 0.5: - # force the real target value instead of the predicted one to help learning + # Force the real target value instead of the predicted one to help learning. 
target = target.transpose(0, 1) for idx in range(longest_sequence_length): decoder_input = target[idx].view(1, batch_size, 1) diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index ceba52e0..febb139c 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -95,8 +95,8 @@ class AddressParser: ``None``. To further improve performance, consider using the models (fasttext or BPEmb) with their counterparts using an attention mechanism with the ``attention_mechanism`` flag. attention_mechanism (bool): Whether to use the model with an attention mechanism. The model will use an - attention mechanism that takes an extra 100 MB on GPU usage (see the doc for more statistics). - The default value is False. + attention mechanism that takes an extra 100 MB on GPU usage (see the documentation for more statistics). + The default value is ``False``. device (Union[int, str, torch.torch.device]): The device to use can be either: - a ``GPU`` index in int format (e.g. ``0``), @@ -104,28 +104,31 @@ class AddressParser: - a :class:`~torch.torch.device` object, - ``"cpu"`` for a ``CPU`` use. - The default value is GPU with the index ``0`` if it exists. Otherwise, the value is ``CPU``. - rounding (int): The rounding to use when asking the probability of the tags. The default value is four digits. - verbose (bool): Turn on/off the verbosity of the model weights download and loading. The default value is True. + The default value is ``0``, which is a GPU device with the index ``0`` if it exists. Otherwise, + the value is ``CPU``. + rounding (int): The rounding to use when asking the probability of the tags. The default value is ``4``, + namely four digits. + verbose (bool): Turn on/off the verbosity of the model weights download and loading. The default value is + ``True``. path_to_retrained_model (Union[S3Path, str, None]): The path to the retrained model to use for prediction.
We will infer the ``model_type`` of the retrained model. The default value is ``None``, meaning we use our pretrained model. If the retrained model uses an attention mechanism, ``attention_mechanism`` needs to be set to True. The path_to_retrain_model can also be a S3-like (Azure, AWS, Google) bucket URI string path (e.g. ``"s3://path/to/aws/s3/bucket.ckpt"``). Or it can be a ``S3Path`` S3-like URI using `cloudpathlib` to handle S3-like bucket. See `cloudpathlib ` - for detail on supported S3 buckets provider and URI condition. The default value is None. + for detail on supported S3 buckets provider and URI condition. The default value is ``None``. cache_dir (Union[str, None]): The path to the cached directory to use for downloading (and loading) the embeddings model and the model pretrained weights. offline (bool): Whether or not the model is an offline one, meaning you have already downloaded the pre-trained weights and embeddings weights in either the default Deepparse cache directory (``"~./cache/deepparse"``) or the ``cache_dir`` directory. When offline, we will not verify if the model is the latest. You can use our - ``download_models`` CLI function to download all the requirements for a model. The default value is False - (not an offline parsing model). + ``download_models`` CLI function to download all the requirements for a model. The default value is + ``False`` (not an offline parsing model). Note: For both networks, we will download the pretrained weights and embeddings in the ``.cache`` directory - for the root user. The pretrained weights take at most 44 MB. The fastText embeddings take 6.8 GO, - the fastText-light embeddings take 3.3 GO and bpemb take 116 MB (in ``".cache/bpemb"``). + for the root user. The pretrained weights take at most 44 MB. The FastText embeddings take 6.8 GO, + the FastText-light (``"fasttext-light"``) embeddings take 3.3 GO and bpemb take 116 MB (in ``".cache/bpemb"``). 
Also, one can download all the dependencies of our pretrained model using our CLI (e.g. download_model fasttext) before sending it to a node without access to Internet. @@ -914,7 +917,7 @@ def test( seed (int): Seed to use (by default, ``42``). verbose (Union[None, bool]): To override the AddressParser verbosity for the test. When set to True or False, it will override (but it does not change the AddressParser verbosity) the test verbosity. - If set to the default value None, the AddressParser verbosity is used as the test verbosity. + If set to the default value ``None``, the AddressParser verbosity is used as the test verbosity. Return: A dictionary with the stats (see `Experiment class @@ -964,7 +967,7 @@ def test( if "fasttext-light" in self.model_type: raise FastTextModelError( "It's not possible to test a fasttext-light due to pymagnitude problem. " - "See the Retrain method doc for more details." + "See the Retrain method documentation for more details." ) if not isinstance(test_dataset_container, DatasetContainer): @@ -1215,7 +1218,7 @@ def _freeze_model_params(self, layers_to_freeze: Union[str]) -> None: if layers_to_freeze not in ("encoder", "decoder", "prediction_layer", "seq2seq"): raise ValueError( f"{layers_to_freeze} freezing setting is not supported. Value can be 'encoder', 'decoder', " - f"'prediction_layer' and 'seq2seq'. See doc for more details." + f"'prediction_layer' and 'seq2seq'. See documentation for more details." ) layer_exclude = None if layers_to_freeze == "decoder": @@ -1271,7 +1274,7 @@ def _retrain_argumentation_validations( if "fasttext-light" in self.model_type: raise FastTextModelError( "It's not possible to retrain a fasttext-light due to pymagnitude problem. " - "See the Retrain method doc for more details." + "See the Retrain method documentation for more details." 
) if not isinstance(train_dataset_container, DatasetContainer): diff --git a/deepparse/parser/formatted_parsed_address.py b/deepparse/parser/formatted_parsed_address.py index 9a013741..28265882 100644 --- a/deepparse/parser/formatted_parsed_address.py +++ b/deepparse/parser/formatted_parsed_address.py @@ -86,7 +86,7 @@ def __repr__(self) -> str: def __eq__(self, other) -> bool: """ - Equal if all address components elements are equals. If attributes are not the same, will return False. + Equal if all address components elements are equals. If attributes are not the same, it will return False. """ for field in self.__dict__: address_component = getattr(self, field) @@ -114,7 +114,7 @@ def format_address( Args: fields (Union[list, None]): Optional argument to define the fields to order the address components of - the address. If None, we will use the inferred order base on the address tags appearance. For example, + the address. If None, we will use the inferred order based on the address tags' appearance. For example, if the parsed address is ``(305, StreetNumber), (rue, StreetName), (des, StreetName), (Lilas, StreetName)``, the inferred order will be ``StreetNumber, StreetName``. 
capitalize_fields (Union[list, None]): Optional argument to define the capitalized fields for the formatted @@ -138,7 +138,7 @@ def format_address( # > 350, rue des lilas, ouest, quebec city, quebec, g1l 1b6 parse_address.formatted_address(fields_separator=", ", capitalize_fields=["StreetName", "Orientation"]) - # > 350, Rue des lilas, Ouest, quebec city, quebec, g1l 1b6 + # > 350, rue des lilas, ouest, quebec city, quebec, g1l 1b6 parse_address.formatted_address(fields_separator=", ", upper_case_fields=["PostalCode""]) # > 350 rue des lilas ouest quebec city quebec G1L 1B6 @@ -214,7 +214,7 @@ def to_list_of_tuples(self, fields: Union[List, None] = None) -> List[tuple]: def to_pandas(self) -> Dict: """ Method to convert a parsed address into a dictionary for pandas where the first key is the raw address and - the followings keys are the address components, and the values are the value of those components. + the following keys are the address components, and the values are the values of those components. For example, the parsed address `` 305 rue des Lilas`` will be converted into the following dictionary: ``{'Address': '305 rue des Lilas', 'StreetNumber':'305', 'StreetName': 'rue des Lilas'}``. @@ -228,7 +228,7 @@ def to_pandas(self) -> Dict: def to_pickle(self) -> Tuple[str, List]: """ Method to convert a parsed address into a list of tuple for pickle where the first tuple element is the - raw address and the followings tuples are the address components, and the values are the value of + raw address and the following tuples are the address components, and the values are the values of those components. For example, the parsed address `` 305 rue des Lilas`` will be converted into the following list of tuples: ``'305 rue des Lilas', ('305', 'StreetNumber'), ('rue des Lilas', 'StreetName')]``. 
diff --git a/deepparse/validations.py b/deepparse/validations.py index e2007944..a6c6d555 100644 --- a/deepparse/validations.py +++ b/deepparse/validations.py @@ -12,7 +12,7 @@ def extract_package_version(package) -> str: """ - Handle the retrieval of the major and minor version part of a Python package. + Handle the retrieval of a Python package's major and minor version parts. """ full_version = package.version.__version__ components_parts = full_version.split(".") @@ -24,8 +24,8 @@ def extract_package_version(package) -> str: def valid_poutyne_version(min_major: int = 1, min_minor: int = 2) -> bool: """ - Validate Poutyne version is greater than min_major.min_minor for using a str checkpoint. Some version before - does not support all the features we need. By default, min_major.min_minor equal version 1.2 which is the + Validate that the Poutyne version is greater than min_major.min_minor for using a str checkpoint. Some versions + do not support all the features we need. By default, min_major.min_minor equals version 1.2, which is the lowest version we can use. """ version_components = extract_package_version(package=poutyne).split(".") @@ -45,13 +45,13 @@ def validate_data_to_parse(addresses_to_parse: List) -> None: """ Validation tests on the addresses to parse to respect the following two criteria: - addresses are not tuple, - - no addresses are None value, - - no addresses are empty strings, and - - no addresses are whitespace-only strings. + - no address is a ``None`` value, + - no address is empty, and + - no address is composed of only whitespace. """ if isinstance(addresses_to_parse[0], tuple): raise DataError( - "Addresses to parsed are tuples. They need to be a list of string. Are you using training data?" + "Addresses to parsed are tuples. They need to be a list of strings. Are you using training data?" 
) if validate_if_any_none(addresses_to_parse): raise DataError("Some addresses are None value.") diff --git a/deepparse/vectorizer/bpemb_vectorizer.py b/deepparse/vectorizer/bpemb_vectorizer.py index a20d4c12..69ca91dd 100644 --- a/deepparse/vectorizer/bpemb_vectorizer.py +++ b/deepparse/vectorizer/bpemb_vectorizer.py @@ -50,7 +50,7 @@ def _vectorize_sequence(self, address: str) -> Tuple[List, List]: address (str): Address to vectorize using BPEmb. Return: - A tuple of list of word vector and the word decomposition lengths. + A tuple of the list of word vectors and the word decomposition lengths. """ input_sequence = [] diff --git a/deepparse/vectorizer/fasttext_vectorizer.py b/deepparse/vectorizer/fasttext_vectorizer.py index f6506a36..8fcc1990 100644 --- a/deepparse/vectorizer/fasttext_vectorizer.py +++ b/deepparse/vectorizer/fasttext_vectorizer.py @@ -6,7 +6,7 @@ class FastTextVectorizer(Vectorizer): """ - FastText vectorizer to convert an address into fastText embeddings. + FastText vectorizer to convert an address into FastText embeddings. """ def __call__(self, addresses: List[str]) -> List: @@ -28,7 +28,7 @@ def _vectorize_sequence(self, address: str) -> List: Method to vectorize the address. Args: - address (str): Address to vectorize using fastText. + address (str): Address to vectorize using FastText. Return: A list of word vector. diff --git a/deepparse/vectorizer/magnitude_vectorizer.py b/deepparse/vectorizer/magnitude_vectorizer.py index ded630be..0ffff493 100644 --- a/deepparse/vectorizer/magnitude_vectorizer.py +++ b/deepparse/vectorizer/magnitude_vectorizer.py @@ -8,7 +8,7 @@ class MagnitudeVectorizer(Vectorizer): """ - FastText Magnitude vectorizer to convert an address into fastText embeddings using magnitude mapping. + FastText Magnitude vectorizer to convert an address into FastText embeddings using magnitude mapping. 
""" def __call__(self, addresses: List[str]) -> List: @@ -30,7 +30,7 @@ def _vectorize_sequence(self, address: str) -> ndarray: Method to vectorize the address. Args: - address (str): Address to vectorize using fastText. + address (str): Address to vectorize using FastText. Return: A list of word vector. diff --git a/deepparse/weights_tools.py b/deepparse/weights_tools.py index dd5831bd..b8e8e238 100644 --- a/deepparse/weights_tools.py +++ b/deepparse/weights_tools.py @@ -8,7 +8,7 @@ def weights_init(m: nn.Module) -> None: """ - Function to initialize the weights of a model layers. + Function to initialize the weights of model layers. Usage: network = Model() @@ -50,7 +50,7 @@ def handle_weights_upload( except FileNotFoundError as error: if "s3" in path_to_model_to_upload or "//" in path_to_model_to_upload or ":" in path_to_model_to_upload: raise FileNotFoundError( - "Are You trying to use a AWS S3 URI? If so path need to start with s3://." + "Are You trying to use an AWS S3 URI? If so, the path needs to start with s3://." ) from error raise FileNotFoundError(f"The file {path_to_model_to_upload} was not found.") from error return checkpoint_weights diff --git a/docs/source/api.rst b/docs/source/api.rst index c39ff937..52b343f1 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -4,13 +4,13 @@ Parse Address With Our Out-Of-The-Box API ========================================= -We also offer an out-of-the-box RESTAPI to parse addresses using FastAPI. +We also offer an out-of-the-box REST API to parse addresses using FastAPI. Installation ************ -First, ensure that you have Docker Engine and Docker Compose installed on your machine. -If not, you can install them using the following documentations in the following order: +First, ensure you have Docker Engine and Docker Compose installed on your machine. +If not, you can install them using the following documentation in the following order: 1. `Docker Engine `_ 2. 
`Docker Compose `_ @@ -24,7 +24,7 @@ Once you have Docker Engine and Docker Compose installed, you can run the follow Sentry ****** -Also, you can monitor your application usage with `Sentry `_ by setting the environment variable ``SENTRY_DSN`` to your Sentry's project +Also, you can monitor your application usage with `Sentry `_ by setting the environment variable ``SENTRY_DSN`` to your Sentry project DSN. There is an example of the ``.env`` file in the project's root named ``.env_example``. You can copy it using the following command: .. code-block:: sh @@ -34,7 +34,7 @@ DSN. There is an example of the ``.env`` file in the project's root named ``.env Request Examples ---------------- -Once the application is up and running and port ``8000`` is exported on your localhost, you can send a request with one +Once the application is up and running and port ``8000`` is exported on your ``localhost``, you can send a request with one of the following methods: cURL POST request @@ -65,4 +65,4 @@ Python POST request response = requests.post(url, json=addresses) parsed_addresses = response.json() - print(parsed_addresses) \ No newline at end of file + print(parsed_addresses) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 60f52be0..09d50a09 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -13,17 +13,17 @@ The parsing of the addresses to parse ``dataset_path`` is done using the selecte The exported parsed addresses are to be exported in the same directory as the addresses to parse but given the ``export_file_name`` using the encoding format of the address dataset file. For example, if the dataset is in a CSV format, the output file format will be a CSV. Moreover, by default, -we log some information (``--log``) such as the parser model name, the parsed dataset path +we log some information (``--log``), such as the parser model name, the parsed dataset path and the number of parsed addresses. 
Here is the list of the arguments, their descriptions and default values. One can use the command ``parse --help`` to output the same description in your command line. - ``parsing_model``: The parsing module to use. - ``dataset_path``: The path to the dataset file in a pickle (``.p``, ``.pickle`` or ``.pckl``) or CSV format. - - ``export_file_name``: The filename to use to export the parsed addresses. We will infer the file format base on the file extension. That is, if the file is a pickle (``.p`` or ``.pickle``), we will export it into a pickle file. The supported formats are Pickle, CSV and JSON. The file will be exported in the same repositories as the dataset_path. See the doc for more details on the format exporting. + - ``export_file_name``: The filename to use to export the parsed addresses. We will infer the file format based on the file extension. That is, if the file is a pickle (``.p`` or ``.pickle``), we will export it into a pickle file. The supported formats are Pickle, CSV and JSON. The file will be exported in the same directory as the dataset_path. See the documentation for more details on the format exporting. - ``--device``: The device to use. It can be 'cpu' or a GPU device index such as ``'0'`` or ``'1'``. By default, ``'0'``. - ``--batch_size``: The batch size to use to process the dataset. By default, ``32``. - ``--path_to_retrained_model``: A path to a retrained model to use for parsing. By default, ``None``. - - ``--csv_column_name``: The column name to extract address in the CSV. Need to be specified if the provided ``dataset_path`` leads to a CSV file. By default, ``None``. + - ``--csv_column_name``: The column name to extract address in the CSV. It needs to be specified if the provided ``dataset_path`` leads to a CSV file. By default, ``None``. - ``--csv_column_separator``: The column separator for the dataset container will only be used if the dataset is a CSV one. By default, ``'\t'``.
- ``--log``: Either or not to log the parsing process into a ``.log`` file exported at the same place as the parsed data using the same name as the export file. The bool value can be (not case sensitive) ``'true/false'``, ``'t/f'``, ``'yes/no'``, ``'y/n'`` or ``'0/1'``. By default, ``True``. - ``--cache_dir``: To change the default cache directory (default to ``None``, e.g. default path). @@ -40,7 +40,7 @@ We support three types of export formats: CSV, Pickle and JSON. The first export uses the following pattern column pattern: ``"Address", "First address components class", "Second class", ...``. -Which means the address ``305 rue des Lilas 0 app 2`` will output the table bellow +Which means the address ``305 rue des Lilas 0 app 2`` will output the table below using our default tags: .. list-table:: @@ -65,17 +65,17 @@ using our default tags: - None - None -The second export uses a similar approach but using tuples and list. Using the same example will return the following +The second export uses a similar approach but uses tuples and lists. Using the same example will return the following tuple ``("305 rue des Lilas 0 app 2", [("305", "StreetNumber"), ("rue des lilas", "StreetName"), ...])``. The third export uses a similar approach to the CSV format but uses dictionary-like formatting. Using the -same example will return the following dict ``{"Address": "305 rue des Lilas 0 app 2", "StreetNumber": "305", ...}``. +same example will return the following dictionary ``{"Address": "305 rue des Lilas 0 app 2", "StreetNumber": "305", ...}``. Retrain ******* This command allows a user to retrain the ``base_parsing_model`` on the ``train_dataset_path`` dataset. -For the training, the CSV or Pickle dataset is loader in a specific dataloader (see +For the training, the CSV or Pickle dataset is loaded in a specific dataloader (see :class:`~deepparse.dataset_container.DatasetContainer` for more details). We use Poutyne's automatic logging functionalities during training. 
Thus, it creates an epoch checkpoint and outputs the epoch metrics in a TSV file. Moreover, we save the best epoch model under the retrain model name (either the default one or a given name using @@ -94,11 +94,11 @@ One can use the command ``parse --help`` to output the same description in your - ``--disable_tensorboard``: To disable Poutyne automatic Tensorboard monitoring. By default, we disable them (``True``). - ``--layers_to_freeze``: Name of the portion of the seq2seq to freeze layers, thus reducing the number of parameters to learn. Default to ``None``. - ``--name_of_the_retrain_parser``: Name to give to the retrained parser that will be used when reloaded as the printed name, and to the saving file name. By default, ``None``, thus, the default name. See the complete parser retrain method for more details. - - ``--device``: The device to use. It can be ``'cpu'`` or a GPU device index such as ``'0'`` or ``'1'``. By default ``'0'``. - - ``--csv_column_names``: The column names to extract address in the CSV. Need to be specified if the provided dataset_path leads to a CSV file. Column names have to be separated by whitespace. For example, ``--csv_column_names column1 column2``. + - ``--device``: The device to use. It can be ``'cpu'`` or a GPU device index such as ``'0'`` or ``'1'``. By default, ``'0'``. + - ``--csv_column_names``: The column names to extract the address in the CSV. It must be specified if the provided dataset_path leads to a CSV file. Column names have to be separated by whitespace. For example, ``--csv_column_names column1 column2``. - ``--csv_column_separator``: The column separator for the dataset container will only be used if the dataset is a CSV one. By default, ``'\t'``. - ``--cache_dir``: To change the default cache directory (default to ``None``, e.g. default path). - - ``prediction_tags``: To change the prediction tags. The ``prediction_tags`` is a path leading to a JSON file of the new tags in a key-value style. 
For example, the path can be ``"a_path/file.json"`` and the content can be ``{"new_tag": 0, "other_tag": 1, "EOS": 2}``. + - ``prediction_tags``: To change the prediction tags. The ``prediction_tags`` path leads to a JSON file of the new tags in a key-value style. For example, the path can be ``"a_path/file.json"`` and the content can be ``{"new_tag": 0, "other_tag": 1, "EOS": 2}``. .. autofunction:: deepparse.cli.retrain.main @@ -109,9 +109,9 @@ Test This command allows a user to test the ``base_parsing_model`` (or the retrained one using the ``--path_to_retrained_model``) on the ``train_dataset_path`` dataset. -For the testing, the CSV or Pickle dataset is loader in a specific dataloader (see +For the testing, the CSV or Pickle dataset is loaded in a specific dataloader (see :class:`~deepparse.dataset_container.DatasetContainer` for more details). Moreover, by default, -we log some information (``--log``) such as the tested address parser model name and the parsed dataset path. Plus, +we log some information (``--log``), such as the tested address parser model name and the parsed dataset path. Plus, we also log the testing results in a TSV file. The two files are exported at the same path as the testing dataset. Here is the list of the arguments, their descriptions and default values. One can use the command ``parse --help`` to output the same description in your command line. @@ -123,7 +123,7 @@ One can use the command ``parse --help`` to output the same description in your - ``--batch_size``: The batch size to use to process the dataset. By default, ``32``. - ``--num_workers``: The number of workers to use for the data loader (default is ``1`` worker). - ``--seed``: The seed to use to make the sampling deterministic (default ``42``). - - ``--csv_column_name``: The column name to extract address in the CSV. Need to be specified if the provided ``dataset_path`` leads to a CSV file. By default, ``None``. 
+ - ``--csv_column_name``: The column name to extract the address in the CSV. It must be specified if the provided ``dataset_path`` leads to a CSV file. By default, ``None``. - ``--csv_column_separator``: The column separator for the dataset container will only be used if the dataset is a CSV one. By default, ``'\t'``. - ``--log``: Either or not to log the parsing process into a ``.log`` file exported at the same place as the parsed data using the same name as the export file. The bool value can be (not case sensitive) ``'true/false'``, ``'t/f'``, ``'yes/no'``, ``'y/n'`` or ``'0/1'``. By default, ``True``. - ``--cache_dir``: To change the default cache directory (default to ``None``, e.g. default path). diff --git a/docs/source/examples/retrain_with_new_seq2seq_params.rst b/docs/source/examples/retrain_with_new_seq2seq_params.rst index 334291b5..d7e4bc8f 100644 --- a/docs/source/examples/retrain_with_new_seq2seq_params.rst +++ b/docs/source/examples/retrain_with_new_seq2seq_params.rst @@ -56,7 +56,7 @@ Let's start with the default learning rate of ``0.01`` and use a learning rate s logging_path = "./checkpoints" # The new seq2seq params settings using smaller hidden size - # See the doc for the list of tunable seq2seq parameters + # See the documentation for the list of tunable seq2seq parameters seq2seq_params = { "encoder_hidden_size": 512, "decoder_hidden_size": 512 diff --git a/docs/source/parser.rst b/docs/source/parser.rst index 620fb811..f6855880 100644 --- a/docs/source/parser.rst +++ b/docs/source/parser.rst @@ -12,10 +12,10 @@ Pre-trained Complete Model This is the complete pretrained address parser model. This model allows using the pretrained weights to predict the tags of any address. -We offer, for now, only two pretrained models, FastText and BPEmb. The first one relies on +For now, we offer only two pretrained models, FastText and BPEmb. 
The first one relies on `fastText `__ French pretrained embeddings to parse the address, and the second use the `byte-pair multilingual subword `_ pretrained embeddings. In both cases, -the architecture is similar, and performances are comparable; our results are available in this +the architecture and performances are similar; our results are available in this `article `_. Memory Usage and Time Performance @@ -27,7 +27,7 @@ we report the RAM usage, and in the first table, we also report the GPU memory u Also, for both tables, we report the mean-time of execution that was obtained by processing ~183,000 addresses using different batch sizes (2^0, ..., 2^9) (i.e. :math:`\frac{\text{Total time to process all addresses}}{~183,000} =` time per address). -In addition, we proposed a lighter version (fasttext-light) of our fastText model using +In addition, we proposed a lighter version (``"fasttext-light"``) of our fastText model using `Magnitude embeddings mapping `_. For this lighter model, on average, results are a little bit lower for the trained country (around ~2%) but are similar for the zero-shot country (see our `article `_ for more details). @@ -108,10 +108,10 @@ are a little bit lower for the trained country (around ~2%) but are similar for .. [2] Note that on Windows, we use the Gensim FastText models that use ~10 GO with similar performance. -Thus, the more address is, the faster each address can be processed. You can also improve performance by using more +Thus, the more addresses there are, the faster each address can be processed. You can also improve performance by using more workers for the data loader created with your data within the call. But note that this performance improvement is not linear. Furthermore, as of version ``0.9.6``, we now use Torch 2.0 and many other tricks to improve -processing performance. Here a few: if the parser uses a GPU, it will pin the memory in the Dataloader and reduce some +processing performance. 
Here are a few: if the parser uses a GPU, it will pin the memory in the Dataloader and reduce some operations (e.g. useless ``.to(device)``). AddressParser diff --git a/docs/source/training_guide.rst b/docs/source/training_guide.rst index 4649bbd0..6aa8cdc1 100644 --- a/docs/source/training_guide.rst +++ b/docs/source/training_guide.rst @@ -5,7 +5,7 @@ Training Guide ============== In addition to parsing addresses out-of-the-box, Deepparse allows you to retrain the pre-trained models to fit your data and use cases. -In the world of machine learning, this is what's refered to as ``fine-tuning``, which can make it easier to obtain well-performing +In the world of machine learning, this is what's referred to as ``fine-tuning``, which can make it easier to obtain well-performing models more efficiently and with less data. Since fine-tuning models can be tricky, this section of the documentation provides some guidelines and insights that may @@ -19,14 +19,14 @@ how to retrain our models. A few use cases may lead you to want to retrain Deepparse's models. Whether you wish to obtain a better performance on a single or multiple countries that our models weren't trained on, or your data and address schemes require a more complex -architecture, or the tag structure of your dataset, is different from ours; deepparse's retraining features accommodate all these use cases and more. +architecture, or your dataset's tag structure differs from ours; Deepparse's retraining features accommodate all these use cases and more. In practice, our models were trained on 20 countries. They demonstrated very accurate results on all of them, so we advise you to use our models without retraining unless you wish to predict -different tags (e.g., StreetNumber ...). Also, suppose you want to retrain +different tags (e.g., StreetNumber, ...). Also, suppose you want to retrain our models to perform better on countries outside of the 20 used in the original training set. 
In that case, you can look at `our dataset `__ which includes an additional 41 countries used only for testing. -There are two main concerns to keep in mind when fine-tuning a model: the model's convergence (i.e, its ability actually to learn from the new data) +There are two main concerns to keep in mind when fine-tuning a model: the model's convergence (i.e., its ability to actually learn from the new data) and the possibility of ``catastrophic forgetting`` (i.e., losing the model's previous knowledge after training on the new data). Learning Successfully @@ -37,7 +37,7 @@ of fine-tuning, the models have already developed a base knowledge of the task t This is especially true in the case of Deepparse since the task you are fine-tuning remains the same (i.e. parsing addresses). However, there are a couple of points to consider to obtain favourable results: -- **Make sure you have enough data**: deep learning models are notorious for being pretty data hungry, so unless you have enough data, the models +- **Make sure you have enough data**: deep learning models are notorious for being pretty data-hungry, so unless you have enough data, the models will have a hard time learning. Since Deepparse's models have already been trained on a few million addresses, the need for data is mitigated for fine-tuning. However, it is recommended to use at least a few thousand examples per new country when retraining. @@ -59,7 +59,7 @@ However, there are a couple of points to consider to obtain favourable results: Do Not Forget! ************** -As mentionned above, catastrophic forgetting can happen when fine-tuning machine learning models. This is because the models' internal parameters are +As mentioned above, catastrophic forgetting can happen when fine-tuning machine learning models. This is because the models' internal parameters are modified to accommodate the new task/data, which can impact their ability to be appropriate for the previous task/data. 
There are many fancy ways to mitigate catastrophic forgetting when fine-tuning models. Still, given the task and data that Deepparse handles, we recommend including some of the previous data when constructing your retraining dataset. The amount @@ -95,5 +95,5 @@ Modifying the Architecture The :meth:`~deepparse.parser.AddressParser.retrain` method allows you to change the architecture of the models using the ``seq2seq_params`` argument. This can be useful if you need a more complex model or a lighter model, for example. However, if you -change the models' architecture, you will end up with a completely new model that will be retrained from scratch. This -means that all the previous knowledge that the initial model had will disapear. +change the models' architecture, a completely new model will be retrained from scratch. This +means that all the previous knowledge that the initial model had will disappear. diff --git a/examples/retrain_with_new_seq2seq_params.py b/examples/retrain_with_new_seq2seq_params.py index a402240d..99a42920 100644 --- a/examples/retrain_with_new_seq2seq_params.py +++ b/examples/retrain_with_new_seq2seq_params.py @@ -34,7 +34,7 @@ logging_path = "./checkpoints" # The new seq2seq params settings using smaller hidden size -# See the doc for the list of tunable seq2seq parameters +# See the documentation for the list of tunable seq2seq parameters seq2seq_params = {"encoder_hidden_size": 512, "decoder_hidden_size": 512} address_parser.retrain( diff --git a/models_evaluation/timer/timer.py b/models_evaluation/timer/timer.py index 285f1777..07fc2c6e 100644 --- a/models_evaluation/timer/timer.py +++ b/models_evaluation/timer/timer.py @@ -30,7 +30,7 @@ class Timer: The class can be used as a context manager to time the code inside the 'with' statement, as a decorator of a function or a method to time it at each call, or as an iterator to have the total running time of a - for loop as well as the mean time taken per iteration. 
See the doc of the init method for usage examples. + for loop as well as the mean time taken per iteration. See the documentation of the init method for usage examples. """ def __init__( diff --git a/tests/cli/test_retrain.py b/tests/cli/test_retrain.py index fad25bf0..e3e32570 100644 --- a/tests/cli/test_retrain.py +++ b/tests/cli/test_retrain.py @@ -201,8 +201,8 @@ def test_integration_csv(self): def test_ifIsCSVFile_noColumnName_raiseValueError(self): with self.assertRaises(ValueError): - # We set up the params with the default value of csv_column_names of the test case method set_up_params, - # which is None, thus no column names. + # We set up the params with the default value of ``"csv_column_names"`` of the test case method + # set_up_params, which is None, thus no column names. parser_params = self.set_up_params(train_dataset_path=self.a_train_csv_dataset_path) retrain.main(parser_params) diff --git a/tests/parser/test_address_parser_retrain_api.py b/tests/parser/test_address_parser_retrain_api.py index f3b12b22..2b5e4f38 100644 --- a/tests/parser/test_address_parser_retrain_api.py +++ b/tests/parser/test_address_parser_retrain_api.py @@ -1433,8 +1433,8 @@ def test_givenRetrainSettings_whenFormattedNameParserName_thenReturnProperNaming ) # We set possible params type with a value - prediction_tags_settings = [{"A dict": 1.0}, None] # Can be a dict or a None - seq2seq_params_settings = [{"A dict": 1.0}, None] # Can be a dict or a None + prediction_tags_settings = [{"A dict": 1.0}, None] # Can be a dictionary or a None + seq2seq_params_settings = [{"A dict": 1.0}, None] # Can be a dictionary or a None layers_to_freeze_settings = [None, "encoder", "decoder", "prediction_layer", "seq2seq"] # From the doc # We loop all possible settings diff --git a/tests/test_download_tools.py b/tests/test_download_tools.py index 6973a76b..202c7c89 100644 --- a/tests/test_download_tools.py +++ b/tests/test_download_tools.py @@ -228,7 +228,7 @@ def 
test_givenAFasttextLightEmbeddingsNotLocal_whenDownloadFasttextEmbeddingsVer download_fasttext_magnitude_embeddings(self.a_directory_path, verbose=True) expected = ( - "The fastText pretrained word embeddings will be download in magnitude format (2.3 GO), " + "The FastText pretrained word embeddings will be download in magnitude format (2.3 GO), " "this process will take several minutes." ) @@ -311,7 +311,7 @@ def test_givenADownloadFasttext_whenPrintProgressSetToVerbose_thenPrint( actual = self.test_out.getvalue().strip() expected = ( - "The fastText pretrained word embeddings will be downloaded (6.8 GO), " + "The FastText pretrained word embeddings will be downloaded (6.8 GO), " "this process will take several minutes." ) self.assertIn(expected, actual)