From 2f396dc32fabf27d865463a13acbee10ea6c4b8b Mon Sep 17 00:00:00 2001 From: davebulaval Date: Mon, 16 Oct 2023 20:41:30 -0400 Subject: [PATCH] improve documentation --- deepparse/comparer/addresses_comparer.py | 20 ++++++------- .../comparer/formatted_compared_addresses.py | 28 +++++++++---------- .../formatted_compared_addresses_raw.py | 6 ++-- .../formatted_compared_addresses_tags.py | 6 ++-- deepparse/data_validation/data_validation.py | 15 ++++++---- .../dataset_container/dataset_container.py | 20 ++++++------- deepparse/dataset_container/tools.py | 10 +++---- deepparse/parser/address_parser.py | 26 ++++++++--------- .../parser/test_address_parser_retrain_api.py | 2 +- 9 files changed, 68 insertions(+), 65 deletions(-) diff --git a/deepparse/comparer/addresses_comparer.py b/deepparse/comparer/addresses_comparer.py index 70b7f704..c7dabf06 100644 --- a/deepparse/comparer/addresses_comparer.py +++ b/deepparse/comparer/addresses_comparer.py @@ -11,14 +11,14 @@ class AddressesComparer: """ Address comparer to compare addresses with each other and retrieves the differences between them. The addresses - are parsed using an address parser based on one of the seq2seq pretrained networks either with fastText or BPEmb. + are parsed using an address parser based on one of the seq2seq pretrained networks, either with fastText or BPEmb. - The address comparer can compare already parsed addresses. The address parser first recompose the raw - addresses then suggests its own tags, then it makes a comparison with the tags of the source parsing and the + The address comparer can compare already parsed addresses. The address parser first recomposes the raw + addresses then suggest its own tags; then it makes a comparison with the tags of the source parsing and the newly parsed address The address comparer is also able to compare raw addresses by first parsing the addresses using the - address parser and then brings out the differences among the parsed addresses. 
+ address parser and then bring out the differences among the parsed addresses. Args: @@ -40,13 +40,13 @@ def compare_tags( ) -> Union[List[FormattedComparedAddressesTags], FormattedComparedAddressesTags]: """ Compare tags of a source parsing with the parsing from AddressParser. First, it reconstructs the - raw address from the parsing, then AddressParser generates tags and then compares the two parsings. + raw address from the parsing, AddressParser generates tags and compares the two parsings. Args: addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuple that contains - the tags for the address components from the source. Can compare multiples parsings if passed as a + the tags for the address components from the source. Can compare multiple parsings if passed as a list of tuples. - with_prob (Union[None, bool]): A option flag to either or not include prob in the comparison report. + with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report. The probabilities are not compared but only included in the report. The default value is None, which means not taking into account. @@ -122,14 +122,14 @@ def compare_raw( with_prob: Union[None, bool] = None, ) -> List[FormattedComparedAddressesRaw]: """ - Compare a list of raw addresses together, it starts by parsing the addresses + Compare a list of raw addresses together. It starts by parsing the addresses with the setted parser and then return the differences between the addresses components retrieved with our model. Args: raw_addresses_to_compare (Union[Tuple[str], List[Tuple[str]]]): - List of string that represent raw addresses to compare. - with_prob (Union[None, bool]): A option flag to either or not include prob in the comparison report. + List of strings that represent raw addresses to compare. + with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report. 
The probabilities are not compared but only included in the report. The default value is None, which means not taking into account. diff --git a/deepparse/comparer/formatted_compared_addresses.py b/deepparse/comparer/formatted_compared_addresses.py index f29eaca2..f90f699e 100644 --- a/deepparse/comparer/formatted_compared_addresses.py +++ b/deepparse/comparer/formatted_compared_addresses.py @@ -20,7 +20,7 @@ class FormattedComparedAddresses(ABC): for the first one. second_address(FormattedParsedAddress): A formatted parsed address that contains the parsing information for the second one. - origin: (Tuple[str, str]): The origin of the parsing (ex : from source or from a deepparse pretrained model). + origin: (Tuple[str, str]): The origin of the parsing (ex : from source or a Deepparse pretrained model). Example: @@ -40,7 +40,7 @@ class FormattedComparedAddresses(ABC): @property def list_of_bool(self) -> List: """ - A list of boolean that contains all the address components names and indicates if it is the same for the + A list of boolean that contains all the address components' names and indicates if it is the same for the two addresses. Return: @@ -86,7 +86,7 @@ def comparison_report(self, nb_delimiters: Union[int, None] = None) -> None: def _comparison_report(self, nb_delimiters: Union[int, None]) -> str: """ - Builds a comparison_report with delimiters to make the beginning and the end of the comparison easier to spot. + Builds a comparison_report with delimiters to make the comparison's beginning and end easier to spot. """ # Get terminal size to adapt the output to the user @@ -102,15 +102,15 @@ def _comparison_report(self, nb_delimiters: Union[int, None]) -> str: @abstractmethod def _comparison_report_builder(self) -> str: """ - Builds the core of a comparison report for the different comparisons. Since the procedure to make a tags - comparison and the raw addresses comparison is different, the comparison report is not the same for the two. 
+ Builds the core of a comparison report for the various comparisons. Since the procedure to make a tags + comparison and the raw addresses comparison are different, the comparison report is not the same for the two. It is then implemented in each specific class. """ @abstractmethod def _get_probs(self) -> Dict: """ - Get the tags from the parsing with their associated probabilities, the method needs to be implemented in each + To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each class because they don't use the probabilities the same way. """ @@ -180,7 +180,7 @@ def _get_tags_diff_color( Args: name_one (str, optional) : Name associated with first color. The default value is the first address. - name_two (str, optional) : Name associated with second color. The default value is the second address. + name_two (str, optional) : Name associated with the second colour. The default value is the second address. verbose (bool, optional): If True, it will print a presentation of the colours and what they mean. The default value is True. @@ -221,14 +221,14 @@ def _get_tags_diff_color( def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tuple]], List[tuple]]) -> List[tuple]: """ Compare addresses components and put the differences in a dictionary where the keys are the - names of the addresses components, and the values are the value of the addresses component. + names of the addresses components, and the values are the values of the addresses component. Args: parsed_addresses (Union[List[List[tuple]], List[tuple]]): Contains the tags and the - address components name for the parsed addresses. + address components' names for the parsed addresses. Return: - List[tuple]: List of tuples that contains all addresses components that differ from each other. + List[tuple]: List of tuples that contain all addresses components that differ from each other. 
""" unique_address_component_names = self._unique_addresses_component_names(parsed_addresses) @@ -258,16 +258,16 @@ def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tupl @staticmethod def _unique_addresses_component_names(parsed_addresses: List[List[tuple]]) -> List: """ - Retrieves all the unique address components names from the comparison then returns it. + Retrieves all the unique address component names from the comparison, then returns it. Args: parsed_addresses (List[List[tuple]]): Contains the tags and the - address components name for the parsed addresses. + address components' names for the parsed addresses. Return: - Returns a list of all the unique address components names. + Returns a list of all the unique address component names. """ - # Here we don't use a set since order will change and report will also change. + # We don't use a set here since the order and report will change. unique_address_component_names = [] for tuple_values in parsed_addresses: for address_component in tuple_values: diff --git a/deepparse/comparer/formatted_compared_addresses_raw.py b/deepparse/comparer/formatted_compared_addresses_raw.py index 614ee313..de94c05d 100644 --- a/deepparse/comparer/formatted_compared_addresses_raw.py +++ b/deepparse/comparer/formatted_compared_addresses_raw.py @@ -12,7 +12,7 @@ class FormattedComparedAddressesRaw(FormattedComparedAddresses): def _get_probs(self) -> Dict: """ - Get the tags from the parsing with their associated probabilities, the method needs to be implemented in each + To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each class because they don't use the probabilities the same way. """ return { @@ -45,8 +45,8 @@ def _get_raw_diff_color(self, verbose=True) -> str: def _comparison_report_builder(self) -> str: """ - Builds the core of a comparison report for the different comparisons. 
Since the procedure to make a tags - comparison and the raw addresses comparison is different, the comparison report is not the same for the two. + Builds the core of a comparison report for the various comparisons. Since the procedure to make a tags + comparison and the raw addresses comparison are different, the comparison report is not the same for the two. It is then implemented in each specific class. """ str_formatted = "" diff --git a/deepparse/comparer/formatted_compared_addresses_tags.py b/deepparse/comparer/formatted_compared_addresses_tags.py index 104643af..775335d8 100644 --- a/deepparse/comparer/formatted_compared_addresses_tags.py +++ b/deepparse/comparer/formatted_compared_addresses_tags.py @@ -12,7 +12,7 @@ class FormattedComparedAddressesTags(FormattedComparedAddresses): def _get_probs(self) -> Dict: """ - Get the tags from the parsing with their associated probabilities, the method needs to be implemented in each + To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each class because they don't use the probabilities the same way. """ return { @@ -37,8 +37,8 @@ def _get_probs_of_tags(self, verbose: bool = True) -> str: def _comparison_report_builder(self) -> str: """ - Builds the core of a comparison report for the different comparisons. Since the procedure to make a tags - comparison and the raw addresses comparison is different, the comparison report is not the same for the two. + Builds the core of a comparison report for the various comparisons. Since the procedure to make a tags + comparison and the raw addresses comparison are different, the comparison report is not the same for the two. It is then implemented in each specific class. 
""" diff --git a/deepparse/data_validation/data_validation.py b/deepparse/data_validation/data_validation.py index 3b98a8c3..2e95e85d 100644 --- a/deepparse/data_validation/data_validation.py +++ b/deepparse/data_validation/data_validation.py @@ -3,30 +3,33 @@ def validate_if_any_empty(string_elements: List) -> bool: """ - Return true if one of the string element is an empty one. + Return ``True`` if one of the string elements is empty. For example, the second element in the following list is + an empty address: ``["An address", "", "Another address"]``. Thus, it will return ``True``. Args: - string_elements (list): A list of string to validate. + string_elements (list): A list of strings to validate. """ return any(is_empty(string_element) for string_element in string_elements) def validate_if_any_whitespace_only(string_elements: List) -> bool: """ - Return true if one of the string element is only whitespace. + Return ``True`` if one of the string elements is only whitespace. For example, the second element in the + following list is only whitespace: ``["An address", " ", "Another address"]``. Thus, it will return ``True``. Args: - string_elements (list): A list of string to validate. + string_elements (list): A list of strings to validate. """ return any(is_whitespace_only(string_element) for string_element in string_elements) def validate_if_any_none(string_elements: List) -> bool: """ - Return true if one of the string element is a None value. + Return ``True`` if one string element is a ``None`` value. For example, the second element in the following + list is a ``None`` value: ``["An address", None, "Another address"]``. Thus, it will return ``True``. Args: - string_elements (list): A list of string to validate. + string_elements (list): A list of strings to validate. 
""" return any(is_none(string_element) for string_element in string_elements) diff --git a/deepparse/dataset_container/dataset_container.py b/deepparse/dataset_container/dataset_container.py index 2e691305..9ffd7588 100644 --- a/deepparse/dataset_container/dataset_container.py +++ b/deepparse/dataset_container/dataset_container.py @@ -54,14 +54,14 @@ def __getitem__( self, idx: Union[int, slice] ) -> Union[List[str], str, List[List[Tuple[str, List]]], Tuple[str, List]]: """ - If the DatasetContainer is a predict one: + If the DatasetContainer is a "predict" one: - - it can be a list of string items (e.g. a list of address (str)), or + - it can be a list of string items (e.g. a list of addresses (str)), or - it can be a unique string item (e.g. one address). If the DatasetContainer is a training one: - - it can be a list of tuple (str, list) items, namely a list of parsed example (e.g. an address with + - it can be a list of tuple (str, list) items, namely a list of parsed examples (e.g. an address with the tags), or - it can be a tuple (str, list) item. @@ -114,7 +114,7 @@ def _training_validation(self) -> None: if not self._data_tags_is_same_len_then_address(): print( - f"Some addresses (whitespace-split) and the tags associated with them are not the same len. " + f"Some addresses (whitespace-split) and the associated tags are not the same len. " f"If you are using a CSVDatasetContainer, consider using the tag_seperator_reformat_fn argument." f"Here is the report of those cases where len differ to help you out:\n" f"{self._data_tags_not_the_same_len_diff()}" @@ -190,8 +190,8 @@ def __init__(self, data_path: str, is_training_container: bool = True) -> None: if not is_training_container: if self._test_predict_container_is_list_of_tuple(): raise DataError( - "The data is a list of tuple by the dataset container is a predict container. " - "Predict container should contains only a list of address." 
+ "The data is a list of tuples, but the dataset container is a predict container. " + "Predict container should contain only a list of addresses." ) self.validate_dataset() @@ -226,17 +226,17 @@ class CSVDatasetContainer(DatasetContainer): data_path (str): The path to the CSV dataset file. column_names (list): A column name list to extract the dataset element. - If the dataset container is a predict one, the list must be of exactly one element - (i.e. the address column). On the other hand, if the dataset container is a training one, the list must be + If the dataset container is a "predict" one, the list must be of exactly one element + (i.e. the address column). On the other hand, if the dataset container is a "training" one, the list must be of exactly two elements: addresses and tags. is_training_container (bool): Either or not, the dataset container is a training container. This will determine the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags. The default value is true. separator (str): The CSV columns separator to use. By default, ``"\\t"``. tag_seperator_reformat_fn (Callable, optional): A function to parse a tags string and return a list of - address tags. For example, if the tag column is a former python list saved with pandas, the characters ``]`` + address tags. For example, if the tag column is a former Python list saved with pandas, the characters ``]`` , ``]`` and ``'`` will be included as the tags' element. Thus, a parsing function will take a string as is - parameter and output a python list. The default function process it as a former python list. + parameter and output a python list. The default function processes it as a former Python list. That is, it removes the ``[],`` characters and splits the sequence at each comma (``","``). csv_reader_kwargs (dict, optional): Keyword arguments to pass to pandas ``read_csv`` use internally. 
By default, the ``data_path`` is passed along with our default ``sep`` value ( ``"\\t"``) and the ``"utf-8"`` encoding diff --git a/deepparse/dataset_container/tools.py b/deepparse/dataset_container/tools.py index 2971b58f..133522aa 100644 --- a/deepparse/dataset_container/tools.py +++ b/deepparse/dataset_container/tools.py @@ -14,23 +14,23 @@ def former_python_list(tags: str) -> List: A list of the parsed tag set. """ # We remove the [ and ] of the list. - # Then, we split each element using a comma as separator. - # Finally, since some case the element are separated by a comma (e.g. element1,element2) + # Then, we split each element using a comma as a separator. + # Finally, since in some cases the elements are separated by a comma (e.g. element1,element2) # or a comma and a whitespace (e.g. element1, element2), we strip the whitespace on all tags to - # remove the trailing whitespace when element are separated by a coma and a whitespace. + # remove the trailing whitespace when a comma and a whitespace separate elements. # To fix https://github.com/GRAAL-Research/deepparse/issues/124. return [tag.strip() for tag in tags.replace("[", "").replace("]", "").replace("'", "").split(",")] def validate_column_names(column_names: List[str]) -> bool: """ - Function validate if element of a list of column name are valid. + Function to validate if the elements of a list of column names are valid. Args: column_names (List[str]): A list of column names. Return: - Either or not, the colum name are valid. + Whether or not the column names are valid. 
""" improper_column_names = False if validate_if_any_empty(column_names) or validate_if_any_whitespace_only(column_names): diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index 2dc48c3d..ceba52e0 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -539,9 +539,9 @@ def retrain( disable_tensorboard (bool): To disable Poutyne automatic Tensorboard monitoring. 
By default, we disable them (true). prediction_tags (Union[dict, None]): A dictionary where the keys are the address components - (e.g. street name) and the values are the components indices (from 0 to N + 1) to use during retraining - of a model. The ``+ 1`` corresponds to the End Of Sequence (EOS) token that needs to be included in the - dictionary. We will use this dictionary's length for the prediction layer's output size. + (e.g. street name) and the values are the components indices (from 0 to N + 1) to use during the + retraining of a model. The ``+ 1`` corresponds to the End Of Sequence (EOS) token that needs to be + included in the dictionary. We will use this dictionary's length for the prediction layer's output size. We also save the dictionary to be used later on when you load the model. The default value is ``None``, meaning we use our pretrained model prediction tags. seq2seq_params (Union[dict, None]): A dictionary of seq2seq parameters to modify the seq2seq architecture @@ -582,7 +582,7 @@ def retrain( - if layers_to_freeze is not ``None``, the following tag: ``FreezedLayer{portion}``. verbose (Union[None, bool]): To override the AddressParser verbosity for the test. When set to True or False, it will override (but it does not change the AddressParser verbosity) the test verbosity. - If set to the default value None, the AddressParser verbosity is used as the test verbosity. + If set to the default value ``None``, the AddressParser verbosity is used as the test verbosity. 
Return: @@ -745,7 +745,7 @@ def retrain( self.processor.tags_converter = self.tags_converter if not self.model.same_output_dim(self.tags_converter.dim): - # Since we have change the output layer dim, we need to handle the prediction layer dim + # Since we have changed the output layer dim, we need to handle the prediction layer dim new_dim = self.tags_converter.dim if seq2seq_params is None: self.model.handle_new_output_dim(new_dim) @@ -759,7 +759,7 @@ def retrain( seq2seq_params.update({"pre_trained_weights": False}) model_factory_dict.update({"seq2seq_kwargs": seq2seq_params}) - # We set verbose to false since model is reloaded + # We set verbose to false since the model is reloaded self._setup_model(verbose=False, path_to_retrained_model=None, **model_factory_dict) callbacks = [] if callbacks is None else callbacks @@ -791,7 +791,7 @@ def retrain( with_capturing_context = False if not valid_poutyne_version(min_major=1, min_minor=8): print( - "You are using a older version of Poutyne that does not support properly error management." + "You are using an older version of Poutyne that does not support proper error management." " Due to that, we cannot show retrain progress. To fix that, update Poutyne to " "the newest version." 
) @@ -811,7 +811,7 @@ def retrain( list_of_file_path = os.listdir(path=".") if list_of_file_path: if pretrained_parser_in_directory(logging_path): - # Mean we might already have checkpoint in the training directory + # Mean we might already have a checkpoint in the training directory files_in_directory = get_files_in_directory(logging_path) retrained_address_parser_in_directory = get_address_parser_in_directory(files_in_directory)[ 0 @@ -853,7 +853,7 @@ def retrain( # Means we have changed the seq2seq params torch_save.update({"seq2seq_params": seq2seq_params}) if prediction_tags is not None: - # Means we have changed the predictions tags + # Means we have changed the prediction tags torch_save.update({"prediction_tags": prediction_tags}) torch_save.update( @@ -885,7 +885,7 @@ def retrain( except FileNotFoundError as error: if "s3" in file_path or "//" in file_path or ":" in file_path: raise FileNotFoundError( - "Are You trying to use a AWS S3 URI? If so path need to start with s3://." + "Are You trying to use an AWS S3 URI? If so path needs to start with s3://." ) from error return train_res @@ -906,7 +906,7 @@ def test( Args: test_dataset_container (~deepparse.dataset_container.DatasetContainer): The test dataset container of the data to use. - batch_size (int): The size of the batch (by default, ``32``). + batch_size (int): The batch size (by default, ``32``). num_workers (int): Number of workers to use for the data loader (by default, ``1`` worker). callbacks (Union[list, None]): List of callbacks to use during training. See Poutyne `callback `_ for more information. @@ -1008,7 +1008,7 @@ def save_model_weights(self, file_path: Union[str, Path]) -> None: Method to save, in a Pickle format, the address parser model weights (PyTorch state dictionary). file_path (Union[str, Path]): A complete file path with a pickle extension to save the model weights. - It can either be a string (e.g. 'path/to/save.p') or a path like path (e.g. Path('path/to/save.p'). 
+ It can either be a string (e.g. 'path/to/save.p') or a path-like object (e.g. Path('path/to/save.p')). Examples: @@ -1196,7 +1196,7 @@ def _retrain( verbose: Union[None, bool], ) -> List[Dict]: # pylint: disable=too-many-arguments - # If Poutyne 1.7 and before, we capture poutyne print since it print some exception. + # If Poutyne 1.7 and before, we capture poutyne print since it prints some exceptions. # Otherwise, we use a null context manager. with Capturing() if capturing_context else contextlib.nullcontext(): train_res = experiment.train( diff --git a/tests/parser/test_address_parser_retrain_api.py b/tests/parser/test_address_parser_retrain_api.py index 5253626e..f3b12b22 100644 --- a/tests/parser/test_address_parser_retrain_api.py +++ b/tests/parser/test_address_parser_retrain_api.py @@ -198,7 +198,7 @@ def test_givenAModel_whenRetrainWithPoutyneBefore18_thenPrintMessage( actual = self.test_out.getvalue() expected = ( - "You are using a older version of Poutyne that does not support properly error management." + "You are using an older version of Poutyne that does not support proper error management." " Due to that, we cannot show retrain progress. To fix that, update Poutyne to the newest version.\n" )