diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 542da21c..d4830b8f 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -14,6 +14,15 @@ jobs: runs-on: ubuntu-latest steps: + # Appears that we get disk memory space problem, thus as recommended by this + # thread (https://github.com/actions/runner-images/issues/2840#issuecomment-790492173) + # we clean the runner before starting the tests to free some spaces. + - name: Remove unnecessary files + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" - uses: actions/checkout@v3 - name: Build the Docker image run: | diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 030faa11..16a7dd6b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,6 +10,15 @@ jobs: python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: + # Appears that we get disk memory space problem, thus as recommended by this + # thread (https://github.com/actions/runner-images/issues/2840#issuecomment-790492173) + # we clean the runner before starting the tests to free some spaces. + - name: Remove unnecessary files + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -29,6 +38,15 @@ jobs: python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: + # Appears that we get disk memory space problem, thus as recommended by this + # thread (https://github.com/actions/runner-images/issues/2840#issuecomment-790492173) + # we clean the runner before starting the tests to free some spaces. 
+ - name: Remove unnecessary files + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index cb22812e..1cd0153c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -338,4 +338,6 @@ - Add a Dockerfile and a `docker-compose.yml` to build a Docker container for the API. - Bug-fix the default pre-processors that were not all apply but only the last one. -## dev \ No newline at end of file +## dev + +- Improve documentation \ No newline at end of file diff --git a/README.md b/README.md index 9f4da070..289dc6e9 100644 --- a/README.md +++ b/README.md @@ -224,14 +224,14 @@ address_parser = AddressParser( address_parser("350 rue des Lilas Ouest Québec Québec G1L 1B6") ``` -### Parse Address With Our Out-Of-The-Box FastAPI Parse Model +### Parse Address With Our Out-Of-The-Box API -You can use Out-Of-The-Box RESTAPI to parse addresses: +We also offer an out-of-the-box RESTAPI to parse addresses using FastAPI. #### Installation: First, ensure that you have Docker Engine and Docker Compose installed on your machine. -if not, you can install them using the following documentations in the following order: +If not, you can install them using the following documentations in the following order: 1. [Docker Engine](https://docs.docker.com/engine/install/) 2. [Docker Compose](https://docs.docker.com/compose/install/linux/#install-using-the-repository) @@ -246,7 +246,7 @@ docker compose up app #### Sentry: Also, you can monitor your application usage with [Sentry](https://sentry.io) by setting the environment variable `SENTRY_DSN` to your Sentry's project -DSN. There is an example of the .env file in the root of the project named `.env_example`. you can just copy it using the following command: +DSN. 
There is an example of the `.env` file in the project's root named `.env_example`. You can copy it using the following command: ```shell cp .env_example .env @@ -259,7 +259,7 @@ the `.env` file will also work. The application will run without any problem if #### Request Examples: -Once the application is up and running and the port 8000 is exported on your localhost, you can send request with one +Once the application is up and running and port `8000` is exported on your localhost, you can send a request with one of the following methods: ##### cURL POST request: diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index fe70e9c8..2dc48c3d 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -348,7 +348,7 @@ def __call__( replaced as ``'3 305'`` for the parsing. Where ``'3'`` is the unit, and ``'305'`` is the street number. We use a regular expression to replace alphanumerical characters separated by a hyphen at the start of the string. We do so since some cities use hyphens in their names. The default - is ``False``. If True, it adds the :func:`~deepparse.pre_processing.pre_processor.hyphen_cleaning` + is ``False``. If True, it adds the :func:`~deepparse.pre_processing.address_cleaner.hyphen_cleaning` pre-processor **at the end** of the pre-processor list to apply. pre_processors (Union[None, List[Callable]]): A list of functions (callable) to apply pre-processing on all the addresses to parse before parsing. See :ref:`pre_processor_label` for examples of diff --git a/deepparse/pre_processing/address_cleaner.py b/deepparse/pre_processing/address_cleaner.py index b4a180c8..b7ad962a 100644 --- a/deepparse/pre_processing/address_cleaner.py +++ b/deepparse/pre_processing/address_cleaner.py @@ -3,7 +3,7 @@ def double_whitespaces_cleaning(address: str) -> str: """ - Pre-processor to remove double whitespace by one whitespace. 
+ Pre-processor to remove double whitespace (``" "``) by one whitespace (``" "``). The regular expression use to clean multiple whitespaces is the following ``" {2,}"``. Args: @@ -17,10 +17,10 @@ def double_whitespaces_cleaning(address: str) -> str: def trailing_whitespace_cleaning(address: str) -> str: """ - Pre-processor to remove trailing whitespace. + Pre-processor to remove trailing whitespace (``" "``). Args: - address: The address to apply trailing whitespace cleaning on. + address: The address to apply trailing whitespace (``" "``) cleaning on. Return: The trailing whitespace cleaned address. @@ -64,16 +64,16 @@ def hyphen_cleaning(address: str) -> str: """ Pre-processor to clean hyphen between the street number and unit in an address. Since some addresses use the hyphen to split the unit and street address, we replace the hyphen with whitespaces to allow a - proper splitting of the address. For example, the proper parsing of the address 3-305 street name is - Unit: 3, StreetNumber: 305, StreetName: street name. + proper splitting of the address. For example, the proper parsing of the address ``"3-305 street name"`` is + ``"Unit": "3", "StreetNumber": "305", "StreetName": "street name"``. See `issue 137 `_ for more details. The regular expression use to clean hyphen is the following ``"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "``. The first group is the unit, and the second is the street number. Both include letters since they can include - letters in some countries. For example, unit 3a or address 305a. + letters in some countries. For example, ``unit 3a`` or address ``305a``. - Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, we use regex to detect + Note: the hyphen is also used in some cities' names, such as ``"Saint-Jean"``; thus, we use regex to detect the proper hyphen to replace. 
Args: diff --git a/docs/source/api.rst b/docs/source/api.rst new file mode 100644 index 00000000..c39ff937 --- /dev/null +++ b/docs/source/api.rst @@ -0,0 +1,68 @@ +.. role:: hidden + :class: hidden-section + +Parse Address With Our Out-Of-The-Box API +========================================= + +We also offer an out-of-the-box RESTAPI to parse addresses using FastAPI. + +Installation +************ + +First, ensure that you have Docker Engine and Docker Compose installed on your machine. +If not, you can install them using the following documentations in the following order: + +1. `Docker Engine `_ +2. `Docker Compose `_ + +Once you have Docker Engine and Docker Compose installed, you can run the following command to start the FastAPI application: + +.. code-block:: sh + + docker compose up app + +Sentry +****** + +Also, you can monitor your application usage with `Sentry `_ by setting the environment variable ``SENTRY_DSN`` to your Sentry's project +DSN. There is an example of the ``.env`` file in the project's root named ``.env_example``. You can copy it using the following command: + +.. code-block:: sh + + cp .env_example .env + +Request Examples +---------------- + +Once the application is up and running and port ``8000`` is exported on your localhost, you can send a request with one +of the following methods: + +cURL POST request +~~~~~~~~~~~~~~~~~ + +.. code-block:: shell + + curl -X POST --location "http://127.0.0.1:8000/parse/bpemb-attention" --http1.1 \ + -H "Host: 127.0.0.1:8000" \ + -H "Content-Type: application/json" \ + -d "[ + {\"raw\": \"350 rue des Lilas Ouest Quebec city Quebec G1L 1B6\"}, + {\"raw\": \"2325 Rue de l'Université, Québec, QC G1V 0A6\"} + ]" + +Python POST request +~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + import requests + + url = 'http://localhost:8000/parse/bpemb' + addresses = [ + {"raw": "350 rue des Lilas Ouest Quebec city Quebec G1L 1B6"}, + {"raw": "2325 Rue de l'Université, Québec, QC G1V 0A6"} + ] + + response = requests.post(url, json=addresses) + parsed_addresses = response.json() + print(parsed_addresses) \ No newline at end of file diff --git a/docs/source/cli.rst b/docs/source/cli.rst index c4ae4e3a..60f52be0 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -106,6 +106,7 @@ We do not handle the ``seq2seq_params`` fine-tuning argument for now. Test **** + This command allows a user to test the ``base_parsing_model`` (or the retrained one using the ``--path_to_retrained_model``) on the ``train_dataset_path`` dataset. For the testing, the CSV or Pickle dataset is loader in a specific dataloader (see @@ -136,4 +137,4 @@ Command to pre-download model weights and requirements. Here is the list of argu - ``model_type``: The parsing module to download. The possible choice are ``'fasttext'``, ``'fasttext-attention'``, ``'fasttext-light'``, ``'bpemb'`` and ``'bpemb-attention'``. - ``--saving_cache_dir``: To change the default saving cache directory (default to ``None``, e.g. default path). -.. autofunction:: deepparse.cli.download.main +.. autofunction:: deepparse.cli.download_model.main diff --git a/docs/source/conf.py b/docs/source/conf.py index 41bdbe4f..430d87e6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,7 +73,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = 'en' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
diff --git a/docs/source/index.rst b/docs/source/index.rst index 5cfe2bef..270fdc31 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -653,32 +653,40 @@ class name) when reloading it. address_parser.retrain(training_container, train_ratio=0.8, epochs=5, batch_size=8, name_of_the_retrain_parser="MyNewParser") -Parse Address With Our Out-Of-The-Box FastAPI Parse Model -********************************************************* -You can use Out-Of-The-Box RESTAPI to parse addresses: +Parse Address With Our Out-Of-The-Box API +***************************************** +We also offer an out-of-the-box RESTAPI to parse addresses using FastAPI. Installation ------------ First, ensure that you have Docker Engine and Docker Compose installed on your machine. -if not, you can install them using the following documentations in the following order: +If not, you can install them using the following documentations in the following order: 1. `Docker Engine `_ - 2. `Docker Compose `_ -Also, you can monitor your application usage with `Sentry `_ by setting the environment variable SENTRY_DSN to your Sentry's project DSN. There is an example of the .env file in the root of the project named .env_example. - Once you have Docker Engine and Docker Compose installed, you can run the following command to start the FastAPI application: .. code-block:: shell docker compose up app +Sentry +****** + +Also, you can monitor your application usage with `Sentry `_ by setting the environment variable ``SENTRY_DSN`` to your Sentry's project +DSN. There is an example of the ``.env`` file in the project's root named ``.env_example``. You can copy it using the following command: + +.. 
code-block:: sh + + cp .env_example .env + Request Examples ---------------- -Once the application is up and running and the port 8000 is exported on your localhost, you can send request with one of the following methods: +Once the application is up and running and port ``8000`` is exported on your localhost, you can send a request with one +of the following methods: cURL POST request ~~~~~~~~~~~~~~~~~ @@ -828,6 +836,7 @@ API Reference dataset_container comparer cli + api .. toctree:: :glob: diff --git a/docs/source/pre_processor.rst b/docs/source/pre_processor.rst index 49fc2641..ebb8fdcc 100644 --- a/docs/source/pre_processor.rst +++ b/docs/source/pre_processor.rst @@ -9,8 +9,8 @@ Pre-Processors Here are the available pre-processor in Deepparse. The first four are used as default settings when parsing addresses. -.. autofunction:: deepparse.pre_processing.pre_processor.coma_cleaning -.. autofunction:: deepparse.pre_processing.pre_processor.lower_cleaning -.. autofunction:: deepparse.pre_processing.pre_processor.trailing_whitespace_cleaning -.. autofunction:: deepparse.pre_processing.pre_processor.double_whitespaces_cleaning -.. autofunction:: deepparse.pre_processing.pre_processor.hyphen_cleaning \ No newline at end of file +.. autofunction:: deepparse.pre_processing.address_cleaner.coma_cleaning +.. autofunction:: deepparse.pre_processing.address_cleaner.lower_cleaning +.. autofunction:: deepparse.pre_processing.address_cleaner.trailing_whitespace_cleaning +.. autofunction:: deepparse.pre_processing.address_cleaner.double_whitespaces_cleaning +.. 
autofunction:: deepparse.pre_processing.address_cleaner.hyphen_cleaning \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index b82d01bb..44d1fe5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ target-version = ['py38', 'py39', 'py310', 'py311'] line-length = 120 skip-string-normalization = true -required-version = "23.3.0" +required-version = "23.9.1" extend-exclude = "/(slides)/" [tool.pylint.ini_options] diff --git a/styling_requirements.txt b/styling_requirements.txt index 05a7d8a6..8a4776f6 100644 --- a/styling_requirements.txt +++ b/styling_requirements.txt @@ -1,4 +1,4 @@ -black==23.3.0 +black==23.9.1 pylint==2.16.2 pylint-django[with_django]==2.5.3 pre-commit==3.3.3 \ No newline at end of file