diff --git a/.gitattributes b/.gitattributes index d4661d8..e66f66a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ # Reduce the number of merge/rebase conflicts doc/whatsnew.rst merge=union +# Git LFS +*.xlsx filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index 5b9299c..b3673fa 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -29,14 +29,6 @@ jobs: - "3.11" - "3.12" - # Work around https://github.com/actions/setup-python/issues/696 - exclude: - - {os: macos-latest, python-version: "3.8"} - - {os: macos-latest, python-version: "3.9"} - include: - - {os: macos-13, python-version: "3.8"} - - {os: macos-13, python-version: "3.9"} - fail-fast: false runs-on: ${{ matrix.os }} @@ -44,6 +36,8 @@ jobs: steps: - uses: actions/checkout@v4 + with: + lfs: true - uses: actions/setup-python@v5 with: diff --git a/.gitignore b/.gitignore index f2002e3..bbe7e94 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,6 @@ dmypy.json # Editors/IDEs .vscode .ruff_cache + +# Generated by transport_data +*.xlsx diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 103f8dd..8f10388 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,6 +10,7 @@ repos: - pytest - sdmx1 - Sphinx + - types-openpyxl - types-requests args: [] - repo: https://github.com/astral-sh/ruff-pre-commit diff --git a/doc/api.in b/doc/api.in index 62f0072..98af50b 100644 --- a/doc/api.in +++ b/doc/api.in @@ -9,6 +9,7 @@ config estat iamc + ipcc jrc oica org diff --git a/doc/giz.rst b/doc/giz.rst new file mode 100644 index 0000000..aa0bb12 --- /dev/null +++ b/doc/giz.rst @@ -0,0 +1,12 @@ +Gesellschaft für Internationale Zusammenarbeit (GIZ) +**************************************************** + +GIZ GmbH (`website `_, lit. *Corporation for International Development*) is the main German development agency. +It is not currently a direct provider of (meta)data through TDC, but its members initiated what is now the TDCI and support its activities, including development of this :mod:`.transport_data` package. +This work mainly appears in the :mod:`.org` and :mod:`.proto` modules. + +TUEWAS +====== + +- “Transport, Environment, Energy, and Water in Asia” is an “internal sector network” of GIZ. +- Website: https://tuewas-asia.org/ diff --git a/doc/howto/index.rst b/doc/howto/index.rst new file mode 100644 index 0000000..9e89722 --- /dev/null +++ b/doc/howto/index.rst @@ -0,0 +1,13 @@ +HOWTOs +****** + +This section contains practical **how-to guides**, instructions, and tutorials. +These are intended to illustrate and demonstrate how to work with TDC-compliant data and metadata, and develop code and other tools process such (meta)data. + +In contrast, the :doc:`/standards` are *prescriptive*. +Put another way, a HOWTO shows *just one possible or suggested* way to comply with the standards. + +.. toctree:: + :maxdepth: 1 + + metadata diff --git a/doc/howto/metadata.rst b/doc/howto/metadata.rst new file mode 100644 index 0000000..3bc87da --- /dev/null +++ b/doc/howto/metadata.rst @@ -0,0 +1,238 @@ +Record and update TDC-structured metadata +***************************************** + +This guide explains how to record and update **metadata** in a TDC-specific file format. [1]_ + +.. contents:: + :local: + +.. [1] The guide was developed as part of a project funded by :doc:`/giz` under the `TUEWAS Asia `_ network. 
+ +Introduction +============ + +This HOWTO does *not* set out to give a comprehensive explanation of data and metadata. +For more information, you could consult some of the references linked under :ref:`std-defs` on the page :doc:`/standards`. + +What is metadata? +----------------- + +Metadata is **facts or information about data**, distinct from the *data itself* (i.e. particular numbers). +By recording, exchanging, and analyzing metadata, it is possible to understand and make decisions about data processing and usage—even *without* the actual data. + +Two kinds of metadata +--------------------- + +We are concerned with two kinds of metadata. + +**Structural metadata**, as the name implies, give information about the *structure* of data. +For example, suppose we know that: + + In data set ‘X’, individual observations look like: in Canada, in 2023, 17.4 million apples and 13.8 million bananas were sold. + +We could describe the structure of this data by saying: + +- The data have 3 conceptual **dimensions**: country, time period, and kind of fruit. +- One dimension refers to *countries*—perhaps with labels like “Canada” (literally), or perhaps with short **codes** like “CA”. + These codes might form a certain, fixed, **code list**: *only* the codes in this list appear in the ‘country’ dimension. + + This information also can tell us about the **scope** and **resolution** of the data, along this particular dimension. + For example, the very name or ID of this dimension, ‘country’, implies the spatial resolution of the data: entire countries. + Or, if the ‘country’ code list includes (CA, MX, US), we understand the spatial scope of the data is “North America”. + +- A second dimension refers to the *time period*. + As with the ‘country’ dimension, we can understand the temporal resolution (here, probably years/annual) and scope (at least the year 2023 is included; perhaps also other years). +- A third dimension refers to *kinds of fruit*. + We see the scope is, at least, ‘apples’ and ‘bananas’. + We might also infer that there is no further resolution related to ‘kind of fruit’: for example, the data may not distinguish “honey crisp apple” from “Fuji apple”. +- We also see that the values are for a specific **measure**. + In this case, the measure is “number pieces of fruit were sold”. + The same real-world activity can be measured in different ways. + For example, fruit sold could also be measured as “total market value”, in **units of measurement** such as “US dollars equivalent at market exchange rates”. + +If structural metadata explain the ‘what’ of data (answering the question: “What do the data consist of?”), then **provenance metadata** is a general term for other information about *who* provides data; *how* data is collected, prepared, and published; *when* these things happen; and *where* the data can be found. + +How are metadata described, stored, and exchanged? +-------------------------------------------------- + +Once we have a metadata fact like, “Data set ‘A’ is published by the United Nations,” (Fact 1) we have to decide how to store this, exchange it, and compare it along with possibly many other pieces of metadata. + +This is most commonly done as plain text. + +TDC follows the **SDMX** standards for **Statistical Data and Metadata eXchange**. +Describing and storing metadata in a standards-based way allows to be clear and precise about its meaning. 
+
+For example, suppose we have a second fact, expressed as plain text: “Data set ‘B’ is UN data.”
+
+- Is Fact 2 the same as Fact 1, only phrased differently?
+- Or are they different?
+  For example, is data set B “published by” a different agency than the UN, but contains “UN data” from UN data flows?
+
+By specifying SDMX **metadata attributes** and giving distinct **metadata values** for each, we try to reduce or eliminate this ambiguity.
+In this example, we can distinguish *the identity of the data provider/publisher* from *the ‘upstream’ source of the data contained in the data flow*.
+
+These standards specify ways to describe and exchange metadata in different **file formats** that are machine-readable, such as `XML `_, JSON, and CSV.
+Using machine-readable formats allows for semi-automated processing that can handle large amounts of (meta)data.
+
+XML and JSON, however, are not easily **human-readable**.
+For this reason, human-readable presentations of the same metadata are also provided.
+
+For example, see https://ec.europa.eu/eurostat/cache/metadata/en/avia_if_esms.htm
+
+- This file contains metadata expressed in a human-readable HTML format.
+- Click “Download” at the above URL, or access https://ec.europa.eu/eurostat/api/dissemination/files?file=metadata/avia_if_esms.sdmx.zip
+- This archive contains:
+
+  - The same :file:`.htm` file.
+  - Two :file:`.xml` files giving the metadata specification (:file:`ESMS_MSD.msd.xml`) and the actual metadata for this data flow (:file:`avia_if_esms.sdmx.xml`).
+  - A spreadsheet in Office Open XML (“Microsoft Excel”) format with an alternate, human-readable format (:file:`avia_if_esms.xlsx`).
+
+Understand the TDC metadata format
+==================================
+
+TDC uses an unofficial, prototype format for metadata.
+This loosely imitates the above-mentioned Eurostat format.
+These files contain metadata (information *about* data) based on the SDMX information model, but their file type (.xlsx) and layout (sheet names, columns, etc.) is not specified by the SDMX standard, hence ‘unofficial’.
+
+The files have the following sheets:
+
+“README”
+   Repeats information from this section of the HOWTO.
+
+“Attributes”
+   - One row per metadata attribute (or 'field').
+   - Columns for the name; description; and ID (short and machine-readable) of each attribute.
+     See these descriptions to learn what to write for each attribute.
+
+One or more additional sheets named, e.g. “XX001”
+   - The name (or title) of each sheet corresponds to the identity (ID) of the data flow that is described by the metadata in that sheet.
+   - In Column A, the name of the metadata attribute.
+     Each name **must** exactly match one appearing in the "Attributes" sheet.
+   - In Column B, the actual metadata values.
+     These **may** be empty, but **should** contain some indication of why the metadata value is not available or recorded.
+
+“TEMPLATE”
+   To add information about additional data flows not included in existing sheets (above), you can copy and rename this sheet.
+
+Record and update metadata
+==========================
+
+- Metadata will be provided as one or more spreadsheets.
+  These may be in a web-based, common, editable document, or as e-mail attachments, etc.
+- Communicate clearly about which files are exchanged or edited.
+- If files are not in a web-based, common, editable format, use “track changes” features in documents to distinguish your edits from existing comments.
+
+Update or correct existing sheets
+---------------------------------
+
+- Identify and change incorrect metadata.
+- If reviewing existing metadata, update the “Comment” field even if all metadata appear correct.
+- Preface comments with your initials or other identifying mark and, if necessary, the date.
+  For example:
+
+    ABC (2024-08-11) Added UNIT_MEASURE.
+    XY (2024-09-12) Expanded DATA_DESCR; corrected URL.
+    MN (2024-11-18) Checked & confirmed.
+
+Add additional sheets
+---------------------
+
+- Duplicate either “TEMPLATE” or any other existing sheet.
+- Choose a new, distinct ID for the data flow.
+- Be detailed!
+  The ``DATA_DESCR`` attribute is intended as a catch-all; use blank lines to separate different points of information about the data flow.
+- Use simple language.
+
+Use common IDs for concepts/dimensions
+--------------------------------------
+
+- If a similar concept, dimension, or code list appears in metadata for multiple data flows, try to use the same ID to identify these.
+- Some known concepts/dimensions are listed below.
+- If there are important distinctions from an existing concept ID—for example, if two data providers use the same name to mean very different things—add extra text in the ``DIMENSION``, ``DATA_DESCR``, or other fields to explain.
+
+=================== ===
+ID                  Possible values
+=================== ===
+ACCIDENT_TYPE       e.g. fatal accidents, non-fatal injury accidents, injury accidents, vehicle damage only accidents
+DESTINATION         e.g. urban, rural
+DRIVER_PASSENGER    e.g. driver, passenger
+FUEL_TYPE           e.g. electric, petrol
+GEO; REF_AREA       Specific countries or regions
+IMPORT_REG          e.g. new import, first registration, used import
+INJURY_TYPE         e.g. killed, injured
+INSTITUTION         e.g. government, private firm, individual
+MANUFACTURER        e.g. Renault, Toyota
+MODE                e.g. road, rail
+NEW_USED            new; used
+PUBLIC_PRIVATE      public; private
+ROAD_CONDITION      e.g. paved, unpaved
+ROAD_TYPE           e.g. motorway, highway
+ROAD_USER           e.g. pedestrian, four-wheeled vehicle
+SERVICE             freight; passenger
+SEX                 e.g. female, male, other
+SOURCE (of revenue) e.g. toll, tax
+TIME_PERIOD
+TYPE_OF_SPEND       e.g. construction, maintenance
+VEHICLE_AGE
+VEHICLE_TYPE        e.g. passenger car, bus, scooter
+=================== ===
+
+Avoid common ‘gotchas’
+======================
+
+When handling metadata, there are some common issues that can arise.
+This section lists a few, and appropriate responses.
+
+Large/composite “databases”
+---------------------------
+
+Often, the term “data set” is used informally to refer to a collection of many kinds of data.
+An easy way to notice this happening is to check whether each metadata attribute has a complicated value or multiple values.
+
+For example:
+
+    Measure: GDP; population.
+
+    Unit of measure: 2020 U.S. dollars; millions of people.
+
+    Dimensions: time and country; time, country, sex, and age.
+
+In this example, we see there are in fact **two** data flows.
+It is simpler to describe these separately.
+If other metadata values for one data flow are identical to the values for another, make such a reference:
+
+    Data description: Same as [DF00X].
+
+Mixing measures and dimensions: the word “by”
+---------------------------------------------
+
+For example:
+
+- Data set A may be described as “Sales of cars by manufacturer”
+- Data set B may be described as “Sales of cars by weight class”
+
+In this case, the word **“by”** is a clue that *the data have at least one specific dimension*.
+For data set A, that specific named dimension is “manufacturer”.
+For data set B, the dimension is “weight class”.
+ +However: + +- Both data sets actually capture *the same measure*—sales of cars—and may use the same units of measurement. +- Each data set probably has additional dimensions, besides the one singled out in the name or title. + For example, both data flow A and data flow B may have GEO and TIME_PERIOD dimensions. + It is possible that data flow B *also* has a “manufacturer” dimension, but this is merely omitted from the name or title. + +To avoid this ambiguity is to: + +- Always give the complete list of dimensions. +- Do not combine dimensions with the measure. +- Avoid mentioning just one or a few dimensions. + +Mixing measures and units of measure +------------------------------------ + +For example: + +- For data flow A, the measure is given as “passenger miles traveled”. +- For data flow B, the measure is given as “passenger kilometres”. + +With the above information, we can understand that these are *the same measure* (one we might call “passenger distance traveled”), but the *units of measurement* are different (in one case, miles; in the other, kilometres). diff --git a/doc/index.rst b/doc/index.rst index 425a155..a63651d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -24,6 +24,7 @@ For more on the design, status, and plans for this package, see :doc:`dev`. usage dev standards + howto/index roadmap whatsnew @@ -33,6 +34,7 @@ General - :doc:`usage` - :doc:`dev` - :doc:`standards` +- :doc:`howto/index` - :doc:`roadmap` - :doc:`whatsnew` @@ -53,7 +55,9 @@ They handle tasks including: adb estat + giz iamc + ipcc jrc oica org @@ -61,7 +65,9 @@ They handle tasks including: - :mod:`.adb`: :doc:`adb` - :mod:`.estat`: :doc:`estat` +- :doc:`giz` - :mod:`.iamc`: :doc:`iamc` +- :mod:`.ipcc`: :doc:`ipcc` - :mod:`.jrc`: :doc:`jrc` - :mod:`.oica`: :doc:`oica` - :mod:`.org`: :doc:`org` diff --git a/doc/ipcc.rst b/doc/ipcc.rst new file mode 100644 index 0000000..804ad67 --- /dev/null +++ b/doc/ipcc.rst @@ -0,0 +1,14 @@ +Intergovernmental Panel on Climate Change (IPCC) +************************************************ + +.. include:: _api/transport_data.ipcc.rst + +References +========== + +Some of these references are to documents or webpages authored not by the IPCC *per se*, but by individuals or groups connected to the United Nations Framework Convention on Climate Change (UN FCCC). +Since :mod:`transport_data` does not currently have a :mod:`.unfccc` module, they are included here. + +- 2006 Guidelines for National Greenhouse Gas Inventories (`HTML `__) — Volume 2: Energy (`HTML `__) — Chapter 3: Mobile Combustion (`PDF (en) `__). +- 2023-01 Technical handbook for developing country Parties on Preparing for implementation of the enhanced transparency framework [ETF] under the Paris Agreement (`HTML `__, `PDF (en) `__). +- 2024-01-24 Compendium on Greenhouse Gas Baselines and Monitoring Passenger and Freight Transport (`HTML `__, `PDF (en) `__). diff --git a/doc/roadmap.rst b/doc/roadmap.rst index 42e3b55..282460c 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -1,4 +1,5 @@ Roadmap ******* -This page will give a longer-term overview of future development of :mod:`transport_data`, focused on the tools in this package, but with relevant details about the broader TDC and TDCI. +This page gives a medium- and long-term overview of future development of :mod:`transport_data`, focused on the tools in this package, but with relevant details about the broader TDC and TDCI. +See also `transport-data/projects/1 `_ on GitHub. 
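The two-column sheet layout described in doc/howto/metadata.rst above (attribute names in column A, values in column B, one sheet per data flow) can be read with a few lines of openpyxl. The following is a minimal sketch only; the file name ``metadata.xlsx`` and sheet name ``VN001`` are hypothetical, and the fuller implementation added in ``transport_data/org/metadata.py`` later in this diff also handles the metadata structure definition, dimension parsing, and validation::

    from openpyxl import load_workbook

    # Column A holds the metadata attribute name; column B holds its value.
    # The first row is a header (as in the "TEMPLATE" sheet), so start at row 2.
    wb = load_workbook("metadata.xlsx", read_only=True)
    ws = wb["VN001"]  # One sheet per data flow, named with the data flow ID

    metadata = {}
    for name, value in ws.iter_rows(min_row=2, max_col=2, values_only=True):
        if name is not None and value is not None:
            metadata[str(name)] = str(value)

The resulting mapping is keyed by the attribute *names* that appear in the "Attributes" sheet; mapping these back to attribute IDs such as ``MEASURE`` or ``DATA_DESCR`` is done by the metadata structure definition in the code below.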
diff --git a/doc/standards.rst b/doc/standards.rst index 4f4b3db..eb6e2f7 100644 --- a/doc/standards.rst +++ b/doc/standards.rst @@ -35,6 +35,8 @@ These standards: TDCI develops and supports community development of tutorials, guides, explainers, in multiple languages and media, targeted to different audiences, that help with the adoption of these standards. These **must** promote a correct implementation of the standards, and **may** give more detailed and elaborate examples. +.. _std-defs: + Definitions =========== @@ -129,6 +131,10 @@ If used, annotations with these IDs **must** conform to the given requirements: The function :func:`.anno_generated` generates such an annotation and **should** be called on all objects created in this package. +``preferred-unit`` + Especially for :class:`.Concept` in :class:`.ConceptScheme`, the preferred units of measurement if the concept is used as a measure. + These correspond to the well-known SDMX concept and attribute ``Concept=SDMX:CROSS_DOMAIN_CONCEPTS(2.0).UNIT_MEASURE``. + Codes ----- diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst index 5e64e72..b81f8d2 100644 --- a/doc/whatsnew.rst +++ b/doc/whatsnew.rst @@ -4,6 +4,7 @@ What's new Next release ============ +- Add :mod:`.ipcc` (:doc:`ipcc`) module (:issue:`15`, :pull:`21`). - Add :doc:`standards` and :doc:`roadmap` documentation pages (:pull:`9`). - Adjust :mod:`.adb` for changes in data format in the 2024-05-20 edition of the ATO National Database (:pull:`20`, :issue:`18`). Document the :ref:`current file format ` that the code supports. diff --git a/pyproject.toml b/pyproject.toml index 58ed5f2..cdd9d9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,10 +23,12 @@ readme = "README.rst" requires-python = ">=3.8" dependencies = [ "click", + "Jinja2", "openpyxl", "packaging", "pandas", "platformdirs", + "pluggy", "pooch", "pycountry", "requests", diff --git a/transport_data/adb/__init__.py b/transport_data/adb/__init__.py index ea7c505..97b2a07 100644 --- a/transport_data/adb/__init__.py +++ b/transport_data/adb/__init__.py @@ -9,12 +9,13 @@ import sdmx.model.v21 as m from transport_data import STORE as registry +from transport_data.util.pluggy import hookimpl from transport_data.util.pooch import Pooch from transport_data.util.sdmx import anno_generated -def get_agency() -> m.Agency: - # Agency +@hookimpl +def get_agencies(): a = m.Agency( id="ADB", name="Asian Transport Outlook team at the Asian Development Bank", @@ -26,7 +27,7 @@ def get_agency() -> m.Agency: c3 = m.Contact(name="Sudhir Gota", email=["sudhirgota@gmail.com"]) a.contact.extend([c1, c2, c3]) - return a + return (a,) BASE_URL = "https://asiantransportoutlook.com/exportdl?orig=1" @@ -240,10 +241,12 @@ def prepare(aa: m.AnnotableArtefact) -> Tuple[m.DataSet, Callable]: # Data structure definition with an ID matching the measure # NB here we set ADB as the maintainer. Precisely, ADB establishes the data # structure, but TDCI is maintaining the SDMX representation of it. 
- dsd = m.DataStructureDefinition(id=measure_id, maintainer=get_agency()) + dsd = m.DataStructureDefinition(id=measure_id, maintainer=get_agencies()[0]) anno_generated(dsd) - dfd = m.DataflowDefinition(id=measure_id, maintainer=get_agency(), structure=dsd) + dfd = m.DataflowDefinition( + id=measure_id, maintainer=get_agencies()[0], structure=dsd + ) pm = m.PrimaryMeasure(id="OBS_VALUE", concept_identity=c) dsd.measures.append(pm) @@ -317,7 +320,7 @@ def convert(part): # Write the lists of "Economy" codes and measures/concepts accumulated while # converting - a = get_agency() + a = get_agencies()[0] for obj in (CL_ECONOMY, CS_MEASURE): obj.maintainer = a obj.version = "0.1.0" diff --git a/transport_data/adb/cli.py b/transport_data/adb/cli.py index 7e22007..4678715 100644 --- a/transport_data/adb/cli.py +++ b/transport_data/adb/cli.py @@ -18,7 +18,7 @@ from . import FILES, convert, fetch -@click.group("adb", help=__doc__) +@click.group("adb") def main(): """Asian Development Bank (ADB) provider.""" diff --git a/transport_data/data/org/template-metadata.html b/transport_data/data/org/template-metadata.html new file mode 100644 index 0000000..7a05d39 --- /dev/null +++ b/transport_data/data/org/template-metadata.html @@ -0,0 +1,53 @@ +{% macro summarize_metadatareport(report, heading="h2") %} +{% set dataflow = report.attaches_to.key_values["DATAFLOW"].obj %} +<{{ heading }} id="{{ report | dfd_id }}">Data flow '{{ report | dfd_id }}' ^ +{{ summarize_dataflow(dataflow) }} +{% for metadata_concept in ["DATA_PROVIDER", "URL", "MEASURE", "UNIT_MEASURE", "DATA_DESCR", "COMMENT"] %} +{% set ra_value, mda = get_reported_attribute(report, metadata_concept) %} +{% if not mda %}{% continue %}{% endif %} +

+<dl>
+  <dt>{{ mda.concept_identity.name }}:</dt>
+{% if metadata_concept == "DATA_PROVIDER" %}
+  <dd>{{ ra_value }}</dd>
+{% elif metadata_concept == "URL" %}
+  <dd><a href="{{ ra_value }}">{{ ra_value }}</a></dd>
+{% elif metadata_concept in ("DATA_DESCR", "COMMENTS") %}
+  <dd>{{ ra_value.replace('\n', '<br/>') | safe }}</dd>
+{% else %}
+  <dd>{{ ra_value }}</dd>
+{% endif %}
+</dl>
+{% endfor %}
+{% endmacro %}
+
+{% macro summarize_dataflow(dfd) %}
+…with dimensions:
+<ol>
+  {% for dim in dfd.structure.dimensions %}
+  <li>{{ dim.id }}: {{ dim | format_desc }}</li>
+  {% endfor %}
+</ol>
+{% endmacro %}
+
+<html>
+<body>
+<p>
+  Direct links:<br/>
+  {{ matched | length }} data flows containing data on {{ ref_area }}:
+  {% for mdr in matched %}<a href="#{{ mdr | dfd_id }}">{{ mdr | dfd_id }}</a>{{ ", " if not loop.last }}{% endfor %}
+  <br/>
+  {{ no_match | length }} other data flows:
+  {% for mdr in no_match %}{{ mdr | dfd_id }}{{ ", " if not loop.last }}{% endfor %}
+</p>
+
+<h1>Data flows containing data on {{ ref_area }}</h1>
+
+<p>These data flows are explicitly marked as containing data pertaining to the country.</p>
+  {% for mdr in matched %}
+  {{ summarize_metadatareport(mdr) }}
+  {% endfor %}
+
+<h1>Other data flows</h1>
+
+<p>These data flows are not explicitly identified as containing data on the country.
+  This doesn't completely rule out that they may contain such data, but this is less likely and would require further investigation and inspection.</p>
+ {% for mdr in no_match %} + {{ summarize_metadatareport(mdr) }} + {% endfor %} + + diff --git a/transport_data/data/tests/metadata-input.xlsx b/transport_data/data/tests/metadata-input.xlsx new file mode 100644 index 0000000..df2dbce --- /dev/null +++ b/transport_data/data/tests/metadata-input.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:380dfcd70698fc387020074845a127f23cbca2746ae66cc9213603185b4dbbe6 +size 100078 diff --git a/transport_data/iamc/__init__.py b/transport_data/iamc/__init__.py index 21a385d..5d2e429 100644 --- a/transport_data/iamc/__init__.py +++ b/transport_data/iamc/__init__.py @@ -11,15 +11,19 @@ import sdmx.model.v21 as m from sdmx.message import StructureMessage +from transport_data.util.pluggy import hookimpl + log = logging.getLogger(__name__) -def get_agency(): - return m.Agency( +@hookimpl +def get_agencies(): + a = m.Agency( id="IAMC", name="Integrated Assessment Modeling Consortium", contact=[m.Contact(uri=["https://iamconsortium.org"])], ) + return (a,) def common_structures(): @@ -31,7 +35,7 @@ def common_structures(): with id "IAMC", containing the concepts for the IAMC dimensions and attribute. """ cs = m.ConceptScheme( - id="IAMC", name="Concepts in the IAMC data model", maintainer=get_agency() + id="IAMC", name="Concepts in the IAMC data model", maintainer=get_agencies()[0] ) cs.extend( diff --git a/transport_data/ipcc/__init__.py b/transport_data/ipcc/__init__.py new file mode 100644 index 0000000..bdee3de --- /dev/null +++ b/transport_data/ipcc/__init__.py @@ -0,0 +1 @@ +"""Intergovernmental Panel on Climate Change metadata provider.""" diff --git a/transport_data/ipcc/structure.py b/transport_data/ipcc/structure.py new file mode 100644 index 0000000..12c4aef --- /dev/null +++ b/transport_data/ipcc/structure.py @@ -0,0 +1,280 @@ +"""IPCC structural metadata.""" + +from typing import TYPE_CHECKING + +from transport_data.util.pluggy import hookimpl + +if TYPE_CHECKING: + import sdmx.model.common + + +@hookimpl +def get_agencies(): + """Return the IPCC :class:`.Agency`.""" + from sdmx.model import v21 + + a = v21.Agency( + id="IPCC", + name="Intergovernmental Panel on Climate Change", + description="https://www.ipcc.ch/", + ) + return (a,) + + +def gen_cl_T311(**kwargs) -> "sdmx.model.Common.Codelist": + """Generate a code list from the GNGGI, Volume 2, Table 3.1.1. + + The generated code list's URN ends with ``Codelist=TDCI:CL_IPCC_2006_V2_T3.1.1(…)``. + + .. todo:: Expand to include 'Explanation' text from the table as descriptions for + codes. + + .. todo:: Include internationalized texts (names, descriptions) from the Arabic, + Chinese, French, Russian, and/or Spanish versions of the documents. + """ + from sdmx.model.common import Code, Codelist + + cl = Codelist( + id="CL_IPCC_2006_V2_T3.1.1", + name="Detailed sector split for the Transport sector", + description="""Transcribed from 2006 IPCC Guidelines for National Greenhouse Gas Inventories — Volume 2: Energy — Chapter 3: Mobile Combustion — Table 3.1.1, using the file https://www.ipcc-nggip.iges.or.jp/public/2006gl/pdf/2_Volume2/V2_3_Ch3_Mobile_Combustion.pdf, as linked from https://www.ipcc-nggip.iges.or.jp/public/2006gl/vol2.html. + +This version includes the 'Explanation' text from the table as the description for individual codes, but at the moment only for the code 1 A 3. For others, see the source document.""", + **kwargs, + ) + + # The codes have well-formed, hierarchical IDs, so it is possible to infer the ID of + # the parent code, if it exists. 
+ def _c(id_, name, description=None): + """Shorthand for adding to `cl`.""" + try: + parent = cl[" ".join(id_.split()[:-1])] + except KeyError: + parent = None + + cl.append(Code(id=id_, name=name, description=description, parent=parent)) + + _c( + "1 A 3", + "TRANSPORT", + """Emissions from the combustion and evaporation of fuel for all transport activity (excluding military transport), regardless of the sector, specified by sub-categories below. + +Emissions from fuel sold to any air or marine vessel engaged in international transport (1 A 3 a i and 1 A 3 d i) should as far as possible be excluded from the totals and subtotals in this category and should be reported separately.""", + ) + _c("1 A 3 a", "Civil Aviation") + _c("1 A 3 a i", "International Aviation (International Bunkers)") + _c("1 A 3 a ii", "Domestic Aviation") + _c("1 A 3 b", "Road Transportation") + _c("1 A 3 b i", "Cars") + _c("1 A 3 b i 1", "Passenger cars with 3-way catalysts") + _c("1 A 3 b i 2", "Passenger cars without 3-way catalysts") + _c("1 A 3 b ii", "Light duty trucks") + _c("1 A 3 b ii 1", "Light-duty trucks with 3-way catalysts") + _c("1 A 3 b ii 2", "Light-duty trucks without 3-way catalysts") + _c("1 A 3 b iii", "Heavy duty trucks and buses") + _c("1 A 3 b iv", "Motorcycles") + _c("1 A 3 b v", "Evaporative emissions from vehicles") + _c("1 A 3 b vi", "Urea-based catalysts") + _c("1 A 3 c", "Railways") + _c("1 A 3 d", "Water-borne Navigation") + _c("1 A 3 d i", "International water-borne navigation (International bunkers)") + _c("1 A 3 d ii", "Domestic water-borne Navigation") + _c("1 A 3 e", "Other Transportation") + _c("1 A 3 e i", "Pipeline Transport") + _c("1 A 3 e ii", "Off-road") + _c("1 A 4 c iii", "Fishing (mobile combustion)") + _c("1 A 5 a", "Non specified stationary") + _c("1 A 5 b", "Non specified mobile") + + return cl + + +def gen_cs_ch3(**kwargs) -> "sdmx.model.common.ConceptScheme": + """Generate a scheme of concepts included in equations in Chapter 3. + + The generated code list's URN ends with + ``ConceptScheme=TDCI:CS_IPCC_2006_V2_CH3(…)``. + + .. todo:: Include concepts used as table dimensions. + + .. todo:: Include internationalized texts (names, descriptions) from the Arabic, + Chinese, French, Russian, and/or Spanish versions of the documents. + """ + from sdmx.model.common import Annotation, Concept, ConceptScheme + + cs = ConceptScheme( + id="CS_IPCC_2006_V2_CH3", name="Concepts appearing in equations", **kwargs + ) + + equation, page = "", "" + + def _c(id_, name=None, units=None, description=None): + c = Concept( + id=id_, + name=name, + description=f"First appears in Equation {equation} on p.{page}", + ) + + if units: + c.annotations.append(Annotation(id="preferred-units", text=units)) + cs.append(c) + + # §3.2 Road transportation + + equation, page = "3.2.1", "3.12" + _c( + "EMI 1", + "Emissions", + "kg", + """Variously "Emissions of CO₂" (Eq. 3.2.1), or of varying species (Eq. 3.2.3, 3.2.5)""", + ) + _c("Fuel 1", "Fuel sold", "TJ") + _c("EF 1", "Emission factor", "kg/TJ") + _c( + "a", + "Type of fuel (e.g. petrol, diesel, natural gas, lpg)", + None, + "In Eq 3.2.6, 'j' is used for the same concept.", + ) + + equation, page = "3.2.2", "3.12" + _c( + "EMI 2", + "CO₂ Emissions from urea-based additive in catalytic converters", + "Gg CO₂", + ) + _c( + "Activity", + "amount of urea-based additive consumed for use in catalytic converters", + "Gg", + ) + _c( + "Purity", + "the mass fraction (=percentage divided by 100) of urea in the urea-based additive", + ) + + # Eq. 
3.2.3 —same concepts as 3.2.1 + + equation, page = "3.2.4", "3.13" + _c( + "Fuel 2", + "fuel consumed (as represented by fuel sold) for a given mobile source activity", + "TJ", + ) + _c( + "b", + "vehicle type", + None, + "In Eq 3.2.6, 'i' is used for the same concept (e.g., car, bus)", + ) + _c( + "c", + "emission control technology (such as uncontrolled, catalytic converter, etc.)", + ) + + equation, page = "3.2.5", "3.15" + _c("EF 2", "emission factor", "kg / km") + _c( + "Distance 1", + "distance travelled during thermally stabilized engine operation phase for a given mobile source activity", + "km", + ) + _c("C", "emissions during warm-up phase (cold start)", "kg") + _c( + "d", + "operating conditions (e.g. urban or rural road type, climate, or other environmental factors)", + ) + + equation, page = "3.2.6", "3.26" + _c( + "Estimated fuel", + "total estimated fuel use estimated from distance travelled (VKT) data", + "litre", + ) + _c("Vehicles", "number of vehicles of type i and using fuel j on road type t") + _c( + "Distance 2", + "annual kilometres travelled per vehicle of type i and using fuel j on road type t", + "km", + ) + _c("t", "type of road (e.g., urban, rural)") + + # §3.3 Off-road transportation + # Eq. 3.3.1 —no additional concepts + # Eq. 3.3.2 —no additional concepts + + equation, page = "3.3.3", "3.34" + _c( + "N", + "source population", + None, + """In Eq. 3.4.3 this is given as 'number of locomotives of type i".""", + ) + # Ditto below, all used in Eq. 3.4.3 + _c("H", "annual hours of use of vehicle i", "hour") + _c("P", "average rated power of vehicle i", "kW") + _c("LF", "typical load factor of vehicle i (fraction between 0 and 1)") + _c("EF 3", "average emission factor for use of fuel j in vehicle i", "kg / kWh") + + # Eq. 3.3.4 —no additional concepts + + # §3.4 Railways + # Eq. 3.4.1 —no additional concepts + # Eq. 3.4.2 —no additional concepts + + equation, page = "3.4.3", "3.42" + _c("i", "locomotive type and journey type") + + equation, page = "3.4.4", "3.43" + _c("EF 4", "engine specific emission factor for locomotive of type i", "kg/TJ") + _c("PWF", "pollutant weighting factor for locomotive of type i", "dimensionless") + _c("EF 5", "default emission factor for diesel (applies to CH₄, N₂O)", "kg/TJ") + + # §3.6 Civil Aviation + # Eq. 
3.6.1 —no additional concepts + + equation, page = "3.6.2", "3.59" + _c( + "Emissions.LTO", + "", + None, + """'LTO' is defined on p.3.56 as "Landing/Take-Off cycle".""", + ) + _c( + "Emissions.Cruise", + "", + None, + """'Cruise' is defined on p.3.56 in contrast with 'LTO'.""", + ) + + equation, page = "3.6.3", "3.59" + _c("Number of LTOs") + _c("EF.LTO") + + equation, page = "3.6.4", "3.59" + _c("Fuel consumption.LTO") + _c("Fuel consumption per LTO") + + equation, page = "3.6.5", "3.59" + _c("Total Fuel Consumption") + _c("EF.Cruise") + + return cs + + +def gen_structures() -> None: + """Create or update IPCC-maintained structural metadata.""" + from transport_data import STORE, org + + def _make_id(value: str) -> str: + return f"{get_agencies()[0].id}_{value}" + + ma_args = dict( + maintainer=org.get_agencies()[0], + version="0.1", + is_final=True, + is_external_reference=False, + ) + + STORE.setdefault(gen_cl_T311(**ma_args)) + STORE.setdefault(gen_cs_ch3(**ma_args)) diff --git a/transport_data/jrc/__init__.py b/transport_data/jrc/__init__.py index d1cdcd9..6b5343b 100644 --- a/transport_data/jrc/__init__.py +++ b/transport_data/jrc/__init__.py @@ -24,11 +24,13 @@ import sdmx.model.v21 as m from transport_data import STORE as registry +from transport_data.util.pluggy import hookimpl from transport_data.util.pooch import Pooch from transport_data.util.sdmx import anno_generated -def get_agency() -> m.Agency: +@hookimpl +def get_agencies(): """Return information about the agency providing the data set. See :func:`.org.get_agencyscheme`. @@ -46,7 +48,7 @@ def get_agency() -> m.Agency: m.Contact(name="Jacopo Tattini", email=["Jacopo.TATTINI@ec.europa.eu"]) ) - return a + return (a,) BASE_URL = ( @@ -448,7 +450,7 @@ def convert(geo): registry.write(obj) # Write code lists, measure concept scheme to file - a = get_agency() + a = get_agencies()[0] for obj in chain(CL.values(), [CS_MEASURE]): obj.maintainer = a obj.version = "0.1.0" @@ -468,12 +470,12 @@ def prepare(measure_concept, dims): # NB here we set ADB as the maintainer. Precisely, ADB establishes the data # structure, but TDCI is maintaining the SDMX representation of it. dsd = m.DataStructureDefinition( - id=measure_id, maintainer=get_agency(), version="0.0.0" + id=measure_id, maintainer=get_agencies()[0], version="0.0.0" ) anno_generated(dsd) dfd = m.DataflowDefinition( - id=measure_id, maintainer=get_agency(), version="0.0.0", structure=dsd + id=measure_id, maintainer=get_agencies()[0], version="0.0.0", structure=dsd ) pm = m.PrimaryMeasure(id="OBS_VALUE", concept_identity=c) diff --git a/transport_data/jrc/cli.py b/transport_data/jrc/cli.py index 7744297..80cf86e 100644 --- a/transport_data/jrc/cli.py +++ b/transport_data/jrc/cli.py @@ -18,7 +18,7 @@ from . 
import GEO, convert, fetch -@click.group("jrc", help=__doc__.splitlines()[0]) +@click.group("jrc") def main(): """EU Joint Research Center (JRC) provider.""" diff --git a/transport_data/oica/__init__.py b/transport_data/oica/__init__.py index 61cd027..55fe7e0 100644 --- a/transport_data/oica/__init__.py +++ b/transport_data/oica/__init__.py @@ -20,6 +20,7 @@ import pandas as pd +from transport_data.util.pluggy import hookimpl from transport_data.util.pooch import Pooch if TYPE_CHECKING: @@ -165,7 +166,9 @@ def convert_single_file( # Prepare a GEO codelist and map using the "GEO" column cl_geo = get_cl_geo() - geo_map = _make_geo_codes(cl_geo, df["GEO"], maintainer=get_agency(), version="0.1") + geo_map = _make_geo_codes( + cl_geo, df["GEO"], maintainer=get_agencies()[0], version="0.1" + ) # Store `cl_geo` STORE.write(cl_geo) @@ -327,16 +330,17 @@ def _make_code(value: str): return id_for_name -@lru_cache -def get_agency() -> "sdmx.model.common.Agency": +@hookimpl +def get_agencies(): """Return the OICA Agency.""" from sdmx.model import v21 - return v21.Agency( + a = v21.Agency( id="OICA", name="International Organization of Motor Vehicle Manufacturers", description="https://www.oica.net", ) + return (a,) def get_cl_geo() -> "sdmx.model.common.Codelist": @@ -346,7 +350,9 @@ def get_cl_geo() -> "sdmx.model.common.Codelist": from transport_data import STORE, org candidate: common.Codelist = common.Codelist( - id=f"{get_agency().id}_GEO", maintainer=org.get_agency()[0], version="0.1" + id=f"{get_agencies()[0].id}_GEO", + maintainer=org.get_agencies()[0], + version="0.1", ) return STORE.setdefault(candidate) @@ -360,7 +366,9 @@ def get_conceptscheme() -> "sdmx.model.common.ConceptScheme": from transport_data import STORE, org cs = common.ConceptScheme( - id=f"{get_agency().id}_CONCEPTS", maintainer=org.get_agency()[0], version="0.1" + id=f"{get_agencies()[0].id}_CONCEPTS", + maintainer=org.get_agencies()[0], + version="0.1", ) # Measures @@ -415,9 +423,9 @@ def get_structures( from transport_data import STORE, org - base = f"{get_agency().id}_{measure}" + base = f"{get_agencies()[0].id}_{measure}" ma_args = dict( - maintainer=org.get_agency()[0], + maintainer=org.get_agencies()[0], version="0.1", is_final=False, is_external_reference=False, diff --git a/transport_data/oica/cli.py b/transport_data/oica/cli.py index 2b56c4a..4da6635 100644 --- a/transport_data/oica/cli.py +++ b/transport_data/oica/cli.py @@ -17,7 +17,7 @@ import click -@click.group("oica", help=__doc__.splitlines()[0]) +@click.group("oica", short_help="OICA provider.") def main(): """International Organization of Motor Vehicle Manufacturers (OICA) provider.""" diff --git a/transport_data/org/__init__.py b/transport_data/org/__init__.py index 90515be..841cc03 100644 --- a/transport_data/org/__init__.py +++ b/transport_data/org/__init__.py @@ -1,18 +1,20 @@ """Information about the TDCI *per se*.""" from datetime import date -from importlib import import_module +from itertools import chain from typing import TYPE_CHECKING, Union import sdmx.model.v21 as m from transport_data import STORE as registry +from transport_data.util.pluggy import hookimpl, pm, register_internal if TYPE_CHECKING: import sdmx.model.v21 -def get_agency() -> "sdmx.model.v21.Agency": +@hookimpl +def get_agencies() -> "sdmx.model.v21.Agency": # Agency a1 = m.Agency( id="TDCI", @@ -48,8 +50,6 @@ def get_agency() -> "sdmx.model.v21.Agency": def get_agencyscheme(version: Union[None, str] = None) -> "sdmx.model.v21.AgencyScheme": """Generate an AgencyScheme 
including some TDCI data providers.""" - agencies = get_agency() - as_ = m.AgencyScheme( id="TDCI", # NameableArtefact @@ -57,17 +57,18 @@ def get_agencyscheme(version: Union[None, str] = None) -> "sdmx.model.v21.Agency # VersionableArtefact valid_from=date.today().isoformat(), # MaintainableArtefact - maintainer=agencies[0], + maintainer=None, ) - for a in agencies: - as_.append(a) + # Use plugin hooks to collect Agency objects from within transport_data or other + # registered code + register_internal() + + for agency in chain(*pm.hook.get_agencies()): + as_.append(agency) - # Add agencies with corresponding modules in this repository - for id_ in ("adb", "jrc"): - module = import_module(f"transport_data.{id_}") - # Call a function named get_agency() in the module - as_.append(module.get_agency()) + # TDCI itself is the maintainer + as_.maintainer = as_["TDCI"] as_.version = version if as_.version is None: diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index ba5dbba..51bcb2c 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -6,15 +6,43 @@ """ +import pathlib + import click -from transport_data import STORE from transport_data.util.click import common_params -@click.command("org", params=common_params("version")) -def main(version): - """Information about the TDCI per se.""" +@click.group("org") +def main(): + """TDCI itself.""" + + +@main.command("refresh", params=common_params("version")) +def refresh(version): + """Update the TDCI metadata.""" + from transport_data import STORE + from . import get_agencyscheme - STORE.write(get_agencyscheme(version=version), force=True) + STORE.write(get_agencyscheme(version=version)) + + +@main.command("read") +@click.argument( + "path", type=click.Path(exists=True, dir_okay=False, path_type=pathlib.Path) +) +def read(path: "pathlib.Path"): + """Read and summarize metadata.""" + from .metadata import read_workbook, summarize_metadataset + + mds = read_workbook(path.resolve()) + summarize_metadataset(mds) + + +@main.command("template") +def template(): + """Generate the metadata template.""" + from .metadata import make_workbook + + make_workbook() diff --git a/transport_data/org/metadata.py b/transport_data/org/metadata.py new file mode 100644 index 0000000..13b8937 --- /dev/null +++ b/transport_data/org/metadata.py @@ -0,0 +1,637 @@ +import itertools +import logging +import re +from collections import defaultdict +from functools import lru_cache, partial +from typing import TYPE_CHECKING, Callable, Hashable, List, Optional, Tuple + +from pycountry import countries +from sdmx.model import common, v21 + +if TYPE_CHECKING: + import pathlib + + from openpyxl import Workbook + from openpyxl.worksheet.worksheet import Worksheet + +log = logging.getLogger(__name__) + +#: Concepts and metadata attributes in the TDC metadata structure. +CONCEPTS = { + "DATAFLOW": ( + "Data flow ID", + """A unique identifier for the data flow (=data source, data set, etc.). + +We suggest to use IDs like ‘VN001’, where ‘VN’ is the ISO 3166 alpha-2 country +code, and ‘001’ is a unique number. The value MUST match the name of the sheet +in which it appears.""", + ), + "DATA_PROVIDER": ( + "Data provider", + """Organization or individual that provides the data and any related metadata. + +This can be as general (“IEA”) or specific (organization unit/department, specific +person responsible, contact details, etc.) 
as appropriate.""", + ), + "URL": ( + "URL or web address", + "Location on the Internet with further information about the data flow.", + ), + "MEASURE": ( + "Measure (‘indicator’)", + """Statistical concept for which data are provided in the data flow. + +If the data flow contains data for multiple measures, give each one separated by +semicolons. Example: “Number of cars; passengers per vehicle”. + +This SHOULD NOT duplicate the value for ‘UNIT_MEASURE’. Example: “Annual driving +distance per vehicle”, not “Kilometres per vehicle”.""", + ), + "UNIT_MEASURE": ( + "Unit of measure", + """Unit in which the data values are expressed. + +If ‘MEASURE’ contains 2+ items separated by semicolons, give the respective units in the +same way and order. If there are no units, write ‘dimensionless’, ‘1’, or similar.""", + ), + "DIMENSION": ( + "Dimensions", + """Formally, the “statistical concept used in combination with other statistical +concepts to identify a statistical series or individual observations.” + +Record all dimensions of the data, either in a bulleted or numbered list, or +separated by semicolons. In parentheses, give some indication of the scope +and/or resolution of the data along each dimension. Most data have at least time +and space dimensions. + +Example: + +- TIME_PERIOD (annual, 5 years up to 2021) +- REF_AREA (whole country; VN only) +- Vehicle type (12 different types: […]) +- Emissions species (CO2 and 4 others)""", + ), + "DATA_DESCR": ( + "Data description", + """Any information about the data flow that does not fit in other attributes. + +Until or unless other metadata attributes are added to this metadata structure/ +template, this MAY include: + +- Any conditions on data access, e.g. publicly available, proprietary, fee or + subscription required, available on request, etc. +- Frequency of data updates. +- Any indication of quality, including third-party references that indicate data + quality. +""", + ), + "COMMENT": ( + "Comment", + """Any other information about the metadata values, for instance discrepancies or +unclear or missing information. + +Precede comments with initials; append to existing comments to keep +chronological order; and include a date (for example, “2024-07-24”) if helpful.""", + ), +} + +#: README text for the TDC metadata file format. +README_TEXT = """This file is an unofficial, prototype TDC format for metadata. +loosely imitates the Eurostat format. These files contain metadata (information +*about* data) based on the SDMX information model, but their layout (sheet +names, columns, etc.) is not specified by the SDMX standard, hence ‘unofficial’. + +This file has the following sheets. + +README +====== + +This sheet. + +Attributes +========== + +- One row per metadata attribute (or 'field'). +- Columns for the name; description; and ID (short and machine-readable) of each + attribute. See these descriptions to learn what to write for each attribute. + +One or more additional sheets +============================= + +- The name (or title) of each sheet corresponds to the identity (ID) of the data + flow that is described by the metadata in that sheet. +- In Column A, the name of the metadata attribute. Each name MUST exactly + match one appearing in the "Attributes" sheet. Some names MAY be omitted. +- In Column B, the actual metadata. These may be empty. + +TEMPLATE +======== + +To add information about additional data flows not included in existing sheets +(above), you can copy and rename this sheet. 
+""" + + +def _header(ws: "Worksheet", *columns: Tuple[str, int]) -> None: + """Write header columns and format their style and width.""" + for column, (value, width) in enumerate(columns, start=1): + cell = ws.cell(row=1, column=column, value=value) + cell.style = "header" + ws.column_dimensions[cell.column_letter].width = width + + +def add_readme(wb: "Workbook") -> None: + """Add a "README" sheet to `wb`.""" + ws = wb.create_sheet("README") + + _header(ws, ("Transport Data Commons (TDC) metadata", 72)) + ws["A3"] = README_TEXT + + +def add_attributes(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): + """Add an "Attributes" sheet to `wb` listing the metadata attributes from `msd`.""" + ws = wb.create_sheet("Attributes") + + _header( + ws, + ("Name", 20), # "Element name" in Eurostat + ("Description", 72), # Not present in Eurostat + ("ID", 20), # "Element code" in Eurostat + ) + + for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): + concept = attribute.concept_identity + ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" + ws.cell(row=row, column=2, value=concept.description.localized_default()) + ws.cell(row=row, column=3, value=attribute.id).style = "top" + + +def add_template(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): + """Add a "TEMPLATE" sheet to `wb` with a metadata template.""" + ws = wb.create_sheet("TEMPLATE") + + _header( + ws, + ("Attribute name", 20), # "Concept name" in Eurostat + ("Value", 72), # "Concept value" in Eurostat + ) + + for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): + concept = attribute.concept_identity + ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" + ws.cell(row=row, column=2, value="---") + + +def contains_data_for(mdr: "v21.MetadataReport", ref_area: str) -> bool: + """Return :any:`True` if `mdr` contains data for `ref_area`. + + :any:`True` is returned if any of the following: + + 1. The referenced data flow definition has an ID that starts with `ref_area`. + 2. The country's ISO 3166 alpha-2 code, alpha-3 code, official name, or common name + appears in the value of the ``DATA_DESCR`` metadata attribute. + + + Parameters + ---------- + ref_area : str + ISO 3166 alpha-2 code for a country. Passed to + :meth:`pycountry.countries.lookup`. 
+ """ + country = countries.lookup(ref_area) + + if mdr.attaches_to.key_values["DATAFLOW"].obj.id.startswith(ref_area): # type: ignore [union-attr] + return True + + # Pattern to match in DATA_DESCR + pat = re.compile( + f"({country.alpha_2}|{country.alpha_3}|{country.name}|{country.common_name})" + ) + for ra in mdr.metadata: + assert hasattr(ra, "value") + if ra.value_for.id == "DATA_DESCR" and pat.search(ra.value): + return True + + return False + + +def generate_summary_html( + mds: "v21.MetadataSet", ref_area: str, path: "pathlib.Path" +) -> None: + """Generate a summary report in HTML.""" + from jinja2 import Environment, PackageLoader, select_autoescape + + # Create a Jinja environment + env = Environment( + loader=PackageLoader("transport_data", package_path="data/org"), + extensions=["jinja2.ext.loopcontrols"], + autoescape=select_autoescape(), + trim_blocks=True, + lstrip_blocks=True, + ) + + grouped = groupby(mds, key=partial(contains_data_for, ref_area=ref_area)) + + def _dfd_id(mdr): + return mdr.attaches_to.key_values["DATAFLOW"].obj.id + + def _get_reported_attribute(mdr, id_): + for ra in mdr.metadata: + if ra.value_for.id == id_: + return ra.value, ra.value_for + return "—", None + + def _format_desc(dim): + if desc := str(dim.get_annotation(id="tdc-description").text): + return desc + else: + return "—" + + env.filters["dfd_id"] = _dfd_id + env.filters["format_desc"] = _format_desc + + path.write_text( + env.get_template("template-metadata.html").render( + ref_area=ref_area, + matched=grouped[True], + no_match=grouped[False], + get_reported_attribute=_get_reported_attribute, + ) + ) + + +@lru_cache +def get_cs_common() -> "common.ConceptScheme": + """Create a shared concept scheme for the concepts referenced by dimensions. + + Concepts in this scheme have an annotation ``tdc-aka``, which is a list of alternate + IDs recognized for the concept. + """ + from . import get_agencyscheme + + as_ = get_agencyscheme() + cs = common.ConceptScheme(id="CONCEPTS", maintainer=as_["TDCI"]) + + cs.setdefault( + id="CONFIDENTIALITY", + annotations=[common.Annotation(id="tdc-aka", text=repr(["CONFIDIENTALITY"]))], + ) + cs.setdefault( + id="FUEL_TYPE", + annotations=[common.Annotation(id="tdc-aka", text=repr(["Fuel type"]))], + ) + cs.setdefault( + id="REF_AREA", + annotations=[ + common.Annotation( + id="tdc-aka", text=repr(["Area", "Country", "Country code", "Region"]) + ) + ], + ) + cs.setdefault( + id="SERVICE", + annotations=[common.Annotation(id="tdc-aka", text=repr(["FREIGHT_PASSENGER"]))], + ) + cs.setdefault( + id="TIME_PERIOD", + annotations=[common.Annotation(id="tdc-aka", text=repr(["Time", "Year"]))], + ) + + return cs + + +def get_msd() -> "v21.MetadataStructureDefinition": + from transport_data import STORE + + from . import get_agencyscheme + + TDCI = get_agencyscheme()["TDCI"] + + cs = common.ConceptScheme(id="METADATA_CONCEPTS", maintainer=TDCI) + msd = v21.MetadataStructureDefinition(id="SIMPLE", version="1", maintainer=TDCI) + rs = msd.report_structure["ALL"] = v21.ReportStructure(id="ALL") + + for id_, (name, description) in CONCEPTS.items(): + ci = cs.setdefault(id=id_, name=name, description=description) + rs.getdefault(id_, concept_identity=ci) + + # NB Currently not supported by sdmx1; results in an empty XML collection + STORE.write(msd) + + return msd + + +def getdefault(is_: "common.ItemScheme", other: "common.Item") -> "common.Item": + """Return an item from `is_` matching `other`. 
+
+    Several methods are attempted to match `other` with an existing item:
+
+    1. ID of `other` is identical to that of an existing item.
+    2. Transformed ID of `other`—in upper case, " " replaced with "_"—is identical
+       to that of an existing item.
+    3. ID of `other` is in the annotation ``tdc-aka``.
+
+    """
+    # Exact match on ID or transformed ID
+    for candidate in (other.id, other.id.upper().replace(" ", "_")):
+        try:
+            return is_[candidate]
+        except KeyError:
+            pass
+
+    # Iterate over existing items
+    for item in is_:
+        # Eval the annotation "tdc-aka" for a list of alternate IDs for the item
+        if aka := item.eval_annotation(id="tdc-aka"):
+            if other.id in aka:
+                return item
+
+    # Still no match; create the item
+    return is_.setdefault(id=other.id)
+
+
+def groupby(
+    mds: "v21.MetadataSet", key: Callable[["v21.MetadataReport"], Hashable]
+) -> dict[Hashable, list["v21.MetadataReport"]]:
+    """Group metadata reports in `mds` according to a `key` function.
+
+    Similar to :func:`itertools.groupby`.
+    """
+    result: dict[Hashable, list["v21.MetadataReport"]] = defaultdict(list)
+    for k, g in itertools.groupby(mds.report, key):
+        result[k].extend(g)
+    return result
+
+
+def make_workbook(name="sample.xlsx") -> None:
+    """Generate a :class:`openpyxl.Workbook` for exchange of metadata."""
+    from openpyxl import Workbook
+    from openpyxl.styles import Alignment, Font, NamedStyle, PatternFill
+
+    wb = Workbook()
+
+    # Delete the default sheet
+    assert wb.active
+    wb.remove(wb.active)
+
+    # Create two named styles
+    header = NamedStyle(name="header")
+    header.fill = PatternFill("solid", fgColor="000000")
+    header.font = Font(bold=True, color="ffffff", name="Calibri")
+    wb.add_named_style(header)
+
+    top = NamedStyle(name="top")
+    top.alignment = Alignment(vertical="top", wrap_text=True)
+    top.font = Font(name="Calibri")
+    wb.add_named_style(top)
+
+    # Generate the metadata structure definition
+    msd = get_msd()
+
+    # Add sheets
+    add_readme(wb)
+    add_attributes(wb, msd)
+    add_template(wb, msd)
+
+    # Save the file
+    wb.save(name)
+
+
+def parse_dimension(value: str) -> List[v21.Concept]:
+    """Parse the description of a dimension from `value`.
+
+    Supported values include:
+
+    1. Multiple lines, with each line beginning "- ".
+    2. A single line, with dimensions separated by ", ".
+    3. A single dimension ID.
+    """
+    # Partial regular expression for a dimension: an ID, optionally followed by a
+    # description in parentheses
+    entry = r"(?P<id>.+?)(?: \((?P<desc>[^\)]*)\))?"
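+    # For example, "- REF_AREA (whole country; VN only)" matches as
+    # ("REF_AREA", "whole country; VN only"); "- TIME_PERIOD" matches as
+    # ("TIME_PERIOD", "").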
+ + # Split `value` into potentially multiple values; separate dimension IDs from + # description/annotation + parts = [] + if matches := re.findall(f"^- {entry}$", value, flags=re.MULTILINE): + # Multiple lines, with each line beginning "- " + parts.extend(matches) + elif matches := re.findall(f"{entry}(?:, |$)", value): + # Single line, with dimensions separated by ", " + # TODO Check behaviour if the ", " is within parentheses + parts.extend(matches) + elif 0 == len(parts): + # None of the above → a single dimension label + parts.append(value) + + # Convert to a list of Concept objects + return [ + v21.Concept(id=id_, name=id_, description=description) + for id_, description in parts + ] + + +def read_workbook( + path: "pathlib.Path", +) -> tuple["v21.MetadataSet", "v21.ConceptScheme"]: + """Read a metadata set from the workbook at `path`.""" + from openpyxl import load_workbook + + wb = load_workbook(path) + # Generate/retrieve the metadata structure definition + msd = get_msd() + + mds = v21.MetadataSet(structured_by=msd) + + # Create a shared concept scheme for the concepts referenced by dimensions + # TODO Collect, maybe with get_msd() + cs_dims = get_cs_common() + + for ws in wb.worksheets: + # Skip information sheets generated by methods in this file + if ws.title in ("README", "Attributes", "TEMPLATE"): + continue + + if r := read_worksheet(ws, msd, cs_dims): + mds.report.append(r) + + return mds, cs_dims + + +def read_worksheet( + ws: "Worksheet", + msd: "v21.MetadataStructureDefinition", + cs_dims: "v21.ConceptScheme", +) -> Optional["v21.MetadataReport"]: + """Read a metadata report from the worksheet `ws`. + + Parameters + ---------- + msd : + Metadata structure definition. + """ + # Mapping from names (not IDs) to MetadataAttributes + mda_for_name = { + str(mda.concept_identity.name): mda + for mda in msd.report_structure["ALL"].components + } + + # Create the target of the report: a data flow definition + # TODO Expand this DFD and its associated data structure definition + df_id_from_title = ws.title + dfd = v21.DataflowDefinition(id=ws.title, maintainer=msd.maintainer) + dsd = v21.DataStructureDefinition(id=ws.title, maintainer=msd.maintainer) + dfd.structure = dsd + + # Create objects to associate the metadata report with the data flow definition + iot = v21.IdentifiableObjectTarget() + tok = v21.TargetObjectKey( + key_values={"DATAFLOW": v21.TargetIdentifiableObject(value_for=iot, obj=dfd)} + ) + + # Create the report itself + mdr = v21.MetadataReport() + mdr.attaches_to = tok + + mda = None # Reference to the MetaDataAttribute describing the current row + dimension_concepts = [] + + # Iterate over rows in the worksheet, skipping the first + for row in ws.iter_rows(min_row=2): + try: + # Column B: value in the row + ra_value = row[1].value + + if ra_value is None: + continue + except IndexError: + log.warning( + f"Sheet {df_id_from_title!r} has only < 2 columns in the first row; skip" + ) + return None + + # Column A: name of the metadata attribute + mda_name = row[0].value + + # Identify the MDA + # NB if `mda_name` is none, then `mda` retains the value found on the previous + # row. This allows e.g. multiple rows to give values for DIMENSION + # TODO Protect against other malformed data. 
+        mda = mda_for_name.get(str(mda_name), mda)
+
+        if mda and mda.id == "DIMENSION":
+            # Parse 1 or more dimension(s) and add to the DSD
+            dimension_concepts.extend(parse_dimension(str(ra_value)))
+        else:
+            # Store as OtherNonEnumeratedAttributeValue
+            # TODO Use EnumeratedAttributeValue, once code lists are available
+            #      corresponding to dimensions
+            ra = v21.OtherNonEnumeratedAttributeValue(
+                value=str(ra_value), value_for=mda
+            )
+
+            # Append the reported attribute to the report
+            mdr.metadata.append(ra)
+
+    # Basic checks
+    df_id_from_cell = _get(mdr, "DATAFLOW")
+    if not df_id_from_cell:
+        log.warning(f"Sheet {df_id_from_title!r} does not identify a data flow; skip")
+        return None
+
+    update_dimension_descriptor(dsd, cs_dims, *dimension_concepts)
+
+    return mdr
+
+
+def _get(mdr: "v21.MetadataReport", mda_id: str) -> Optional[str]:
+    """Retrieve from `mdr` the reported value of the metadata attribute `mda_id`."""
+    for mda in mdr.metadata:
+        if mda.value_for is not None and mda.value_for.id == mda_id:
+            assert hasattr(mda, "value")  # Exclude ReportedAttribute without value attr
+            return mda.value
+    # No match
+    return None
+
+
+def summarize_metadataattribute(mds: "v21.MetadataSet", mda_id: str) -> None:
+    """Summarize the unique values appearing in metadata for attribute `mda_id`."""
+    value_id = defaultdict(set)
+
+    for r in mds.report:
+        value_id[_get(r, mda_id) or "MISSING"].add(_get(r, "DATAFLOW") or "MISSING")
+
+    assert mds.structured_by
+    mda = mds.structured_by.report_structure["ALL"].get(mda_id)
+
+    print("\n\n" + uline(f"{mda}: {len(value_id)} unique values"))
+    for value, df_ids in sorted(value_id.items()):
+        print(f"{value}\n " + " ".join(sorted(df_ids)))
+
+
+def summarize_metadatareport(mdr: "v21.MetadataReport") -> None:
+    """Print a summary of the contents of `mdr`."""
+    lines = ["", uline("Metadata report")]
+
+    # Retrieve references to the data flow and data structure
+    dfd: v21.DataflowDefinition = mdr.attaches_to.key_values["DATAFLOW"].obj  # type: ignore [union-attr]
+    dsd = dfd.structure
+
+    # Summarize the data flow and data structure
+    lines.extend(
+        [f"Refers to {dfd!r}", f" with structure {dsd!r}", " with dimensions:"]
+    )
+    for dim in dsd.dimensions:
+        line = f" - {dim.id}:"
+        if desc := str(dim.get_annotation(id="tdc-description").text):
+            line += f" {desc!s}"
+        else:
+            line += " —"
+        try:
+            original_id = dim.get_annotation(id="tdc-original-id").text
+            line += f" ('{original_id!s}' in input file)"
+        except KeyError:
+            pass
+        lines.append(line)
+
+    lines.append("")
+
+    for ra in mdr.metadata:
+        if ra.value_for.id == "DATAFLOW":
+            continue
+        assert hasattr(ra, "value")
+        lines.append(f"{ra.value_for}: {ra.value}")
+
+    print("\n".join(lines))
+
+
+def summarize_metadataset(mds: "v21.MetadataSet") -> None:
+    """Print a summary of the contents of `mds`."""
+    print(f"Metadata set containing {len(mds.report)} metadata reports")
+
+    summarize_metadataattribute(mds, "MEASURE")
+    summarize_metadataattribute(mds, "DATA_PROVIDER")
+    summarize_metadataattribute(mds, "UNIT_MEASURE")
+
+    for r in mds.report:
+        summarize_metadatareport(r)
+
+
+def uline(text: str, char: str = "=") -> str:
+    """Underline `text`."""
+    return f"{text}\n{char * len(text)}"
+
+
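+# NB For illustration (the IDs are hypothetical): for a concept parsed from
+#    "REF_AREA (country)" whose ID appears among the "tdc-aka" aliases of an
+#    existing concept "COUNTRY", the dimension added below has id="COUNTRY", a
+#    "tdc-description" annotation "country", and a "tdc-original-id" annotation
+#    "REF_AREA".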
+def update_dimension_descriptor(
+    dsd: "v21.DataStructureDefinition", cs_dims: "v21.ConceptScheme", *concepts
+) -> None:
+    """Update the DimensionDescriptor of `dsd` with `concepts`."""
+    for dc in concepts:
+        # Identify the concept in `cs_dims` with the same ID
+        c = getdefault(cs_dims, dc)
+
+        # Construct annotations
+        anno = [common.Annotation(id="tdc-description", text=dc.description)]
+        if c.id != dc.id:
+            anno.append(common.Annotation(id="tdc-original-id", text=dc.id))
+
+        dsd.dimensions.getdefault(id=c.id, concept_identity=c, annotations=anno)
diff --git a/transport_data/testing.py b/transport_data/testing.py
index e79a349..82d8ccb 100644
--- a/transport_data/testing.py
+++ b/transport_data/testing.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Generator, cast

 import pytest
@@ -56,6 +57,12 @@ def sdmx_structures(tmp_store) -> sdmx.message.StructureMessage:
     return sm


+@pytest.fixture(scope="session")
+def test_data_path() -> Generator[Path, None, None]:
+    """Path containing test data."""
+    yield Path(__file__).parent.joinpath("data", "tests")
+
+
 @pytest.fixture(scope="session")
 def tmp_config(tmp_path_factory) -> Generator[Config, None, None]:
     """A :class:`.Config` instance pointing to a temporary directory."""
diff --git a/transport_data/tests/org/__init__.py b/transport_data/tests/org/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py
new file mode 100644
index 0000000..a30fee4
--- /dev/null
+++ b/transport_data/tests/org/test_metadata.py
@@ -0,0 +1,78 @@
+from functools import partial
+
+import pytest
+
+from transport_data.org.metadata import (
+    contains_data_for,
+    generate_summary_html,
+    groupby,
+    make_workbook,
+    read_workbook,
+    summarize_metadataset,
+)
+
+
+def test_make_workbook(tmp_path) -> None:
+    make_workbook()
+
+
+@pytest.fixture(scope="module")
+def example_metadata(test_data_path):
+    return read_workbook(test_data_path.joinpath("metadata-input.xlsx"))
+
+
+def test_read_workbook(example_metadata) -> None:
+    # Function runs successfully
+    result, _ = example_metadata
+
+    # Result has a certain number of metadata reports
+    assert 45 == len(result.report)
+
+
+def test_summarize_metadataset(capsys, example_metadata) -> None:
+    mds, cs_dims = example_metadata
+
+    # Function runs successfully
+    summarize_metadataset(mds)
+
+    captured = capsys.readouterr()
+    # pathlib.Path("debug.txt").write_text(captured.out)  # DEBUG Write to a file
+
+    # Output contains certain text
+    assert "MEASURE: 39 unique values" in captured.out
+
+    # TODO expand with further assertions
+
+
+COUNTRIES = [
+    ("CN", 19),
+    ("ID", 17),
+    ("IN", 19),
+    ("PH", 14),
+    ("NP", 0),
+    ("TH", 17),
+    ("VN", 18),
+]
+
+
+@pytest.mark.parametrize("ref_area, N_exp", COUNTRIES)
+def test_groupby(example_metadata, ref_area, N_exp: int) -> None:
+    predicate = partial(contains_data_for, ref_area=ref_area)
+    result = groupby(example_metadata[0], predicate)
+
+    # Expected counts of metadata reports with respective values
+    # NB Use set notation to tolerate missing keys in `result` if N_exp == 0
+    exp = {(True, N_exp), (False, 45 - N_exp)}
+
+    # Observed counts match
+    assert exp >= {(k, len(v)) for k, v in result.items()}
+
+
+@pytest.mark.parametrize("ref_area, N_exp", COUNTRIES)
+def test_generate_summary_html(tmp_path, example_metadata, ref_area, N_exp) -> None:
+    path = tmp_path.joinpath(f"{ref_area}.html")
+
+    generate_summary_html(example_metadata[0], ref_area=ref_area, path=path)
+
+    # Output was created
+    assert path.exists()
diff --git a/transport_data/tests/test_ipcc.py b/transport_data/tests/test_ipcc.py
new file mode 100644
index 0000000..b3a314d
--- /dev/null
+++ b/transport_data/tests/test_ipcc.py
@@ -0,0 +1,5 @@
+from transport_data.ipcc.structure import gen_structures
+
+
+def test_gen_structures() -> None:
+    gen_structures()
diff --git a/transport_data/tests/test_org.py b/transport_data/tests/test_org.py
index 9e19b2e..0ca27ed 100644
--- a/transport_data/tests/test_org.py
+++ b/transport_data/tests/test_org.py
@@ -2,7 +2,10 @@


 def test_get_agencyscheme() -> None:
-    get_agencyscheme()
+    as_ = get_agencyscheme()
+
+    # Number of agencies associated with code in the transport_data repo
+    assert 7 == len(as_)


 def test_refresh() -> None:
diff --git a/transport_data/util/hooks.py b/transport_data/util/hooks.py
new file mode 100644
index 0000000..93b15c4
--- /dev/null
+++ b/transport_data/util/hooks.py
@@ -0,0 +1,17 @@
+from typing import TYPE_CHECKING, Iterable
+
+import pluggy
+
+if TYPE_CHECKING:
+    import sdmx.model.v21
+
+hookspec = pluggy.HookspecMarker("transport_data")
+
+
+@hookspec
+def get_agencies() -> Iterable["sdmx.model.v21.Agency"]:
+    """Return :class:`sdmx.model.common.Agency` identifying (meta)data provider(s).
+
+    An implementation **must** return an iterable of 0 or more Agency instances.
+    """
+    raise NotImplementedError
diff --git a/transport_data/util/pluggy.py b/transport_data/util/pluggy.py
new file mode 100644
index 0000000..f4d593a
--- /dev/null
+++ b/transport_data/util/pluggy.py
@@ -0,0 +1,24 @@
+from importlib import import_module
+
+import pluggy
+
+from . import hooks
+
+hookimpl = pluggy.HookimplMarker("transport_data")
+
+
+pm = pluggy.PluginManager("transport_data")
+pm.add_hookspecs(hooks)
+
+
+def register_internal():
+    """Register hook implementations from all modules that contain them.
+
+    .. todo:: Automatically do this for all top-level submodules of transport_data.
+    """
+
+    for id_ in ("adb", "iamc", "ipcc.structure", "jrc", "oica", "org"):
+        try:
+            pm.register(import_module(f"transport_data.{id_}"))
+        except ValueError:
+            pass
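
For illustration, a module registered by ``register_internal()`` is expected to provide an implementation of the ``get_agencies`` hook specified in ``transport_data/util/hooks.py``. A minimal sketch follows; the module path and agency ID are hypothetical and not part of this change:

    # transport_data/example/__init__.py (hypothetical plug-in module)
    from sdmx.model import v21

    from transport_data.util.pluggy import hookimpl


    @hookimpl
    def get_agencies():
        """Return the agency providing the (meta)data handled by this module."""
        return [v21.Agency(id="EXAMPLE", name="Example data provider")]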