diff --git a/doc/adb.rst b/doc/adb.rst index b2e9aca..5a03975 100644 --- a/doc/adb.rst +++ b/doc/adb.rst @@ -1,4 +1,39 @@ Asian Development Bank ********************** +This module handles the `Asian Transport Outlook (ATO) `_ source maintained by the Asian Development Bank (ADB, initially) and Asian Infrastructure Investment Bank (AIIB, more recently), specifically the `ATO National Database `_. + +In particular, it converts data from the :ref:`ATO native Excel file format <ato-format>`—both the 2022-10-07 and 2024-05-20 formats—to SDMX and extracts metadata. + +.. contents:: + :local: + :backlinks: none + +.. _ato-format: + +ATO National Database format +============================ + +The ATO native Excel file format is characterized by the following. +There is an `ATO National Database User Guide `_ that contains some of the information below, but does not describe the file format. + +- Data flow IDs like ``TAS-PAT-001(1)``, wherein: + + - ``TAS`` is the ID of a ‘category’ code with a corresponding name like “Transport Activity & Services”. + Individual files (‘workbooks’) contain data for flows within one category. + - ``PAT`` is the ID of a ‘subcategory’ code with a corresponding name like “Passenger Activity Transit”. + - ``001(1)`` is the ID of an ‘indicator’ code with a corresponding name like “Passengers Kilometer Travel - Railways”. + - All data flows with the same initial part like ``TAS-PAT-001`` contain data for the same *measure*. + The final part ``(1)`` indicates data from alternate sources for the same measure. +- Files contain a “TOC” sheet and further individual sheets. + See :func:`~.adb.read_sheet`, which reads these sheets, for a detailed description of the apparent format. +- The files are periodically updated. +- The update schedule is not fixed in advance. +- Previous versions of the files do not appear to be available. +- The file metadata contains a "Created by:" field with information like "2022-10-07, 11:41:56, openpyxl". 
+ At least two different dates have been observed: + + - 2022-10-07 + - 2024-05-20. + .. include:: _api/transport_data.adb.rst diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst index 7a87431..5e64e72 100644 --- a/doc/whatsnew.rst +++ b/doc/whatsnew.rst @@ -5,6 +5,8 @@ Next release ============ - Add :doc:`standards` and :doc:`roadmap` documentation pages (:pull:`9`). +- Adjust :mod:`.adb` for changes in data format in the 2024-05-20 edition of the ATO National Database (:pull:`20`, :issue:`18`). + Document the :ref:`current file format <ato-format>` that the code supports. v24.2.5 ======= diff --git a/transport_data/adb/__init__.py b/transport_data/adb/__init__.py index 9efa3e2..ea7c505 100644 --- a/transport_data/adb/__init__.py +++ b/transport_data/adb/__init__.py @@ -4,6 +4,7 @@ from typing import Callable, Tuple from urllib.parse import quote +import numpy as np import pandas as pd import sdmx.model.v21 as m @@ -167,11 +168,24 @@ def read_sheet( data_col_mask = list(map(str.isnumeric, df.columns)) # Handle values not parsed by pd.ExcelFile.parse(). Some cells have values like - # "12,345.6\t", which are not parsed to float. Strip the trailing whitespace and - # thousands separators, then convert. + # "12,345.6\t", which are not parsed to float. + # + # - Strip trailing whitespace. + # - Remove thousands separators ("," in 2022-10-07 edition; " " in 2024-05-20 + # edition) and whitespace before the decimal separator ("10860 .6", 2024-05-20 + # edition). + # - Replace "-" (no data) and "long ton" (erroneous) appearing since 2024-05-20 + # edition. + # - Finally, convert to float. 
dtypes = df.loc[:, data_col_mask].dtypes # Dtypes of data columns only for col, _ in filter(lambda x: x[1] != "float", dtypes.items()): - df[col] = df[col].str.strip().str.replace(",", "").astype(float) + df[col] = ( + df[col] + .str.strip() + .str.replace(r"(\d)[, ]([\d\.])", r"\1\2", regex=True) + .replace("^(-|long ton)$", np.nan, regex=True) + .astype(float) + ) # Identify remarks columns: any entries at the *end* of `df.columns` with non- # numeric labels. Use the index of last data column, counting backwards.