From e6fddbb34024636d392ecac2d0c7c5df3b573f9d Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Wed, 10 Jul 2024 15:36:55 +0200 Subject: [PATCH 01/34] Add stub files, docs for .ipcc --- doc/api.in | 1 + doc/ipcc.rst | 4 ++++ transport_data/ipcc/__init__.py | 1 + transport_data/ipcc/structure.py | 2 ++ transport_data/tests/test_ipcc.py | 5 +++++ 5 files changed, 13 insertions(+) create mode 100644 doc/ipcc.rst create mode 100644 transport_data/ipcc/__init__.py create mode 100644 transport_data/ipcc/structure.py create mode 100644 transport_data/tests/test_ipcc.py diff --git a/doc/api.in b/doc/api.in index 62f0072..98af50b 100644 --- a/doc/api.in +++ b/doc/api.in @@ -9,6 +9,7 @@ config estat iamc + ipcc jrc oica org diff --git a/doc/ipcc.rst b/doc/ipcc.rst new file mode 100644 index 0000000..ff60f0f --- /dev/null +++ b/doc/ipcc.rst @@ -0,0 +1,4 @@ +Intergovernmental Panel on Climate Change (IPCC) +************************************************ + +.. include:: _api/transport_data.ipcc.rst diff --git a/transport_data/ipcc/__init__.py b/transport_data/ipcc/__init__.py new file mode 100644 index 0000000..bdee3de --- /dev/null +++ b/transport_data/ipcc/__init__.py @@ -0,0 +1 @@ +"""Intergovernmental Panel on Climate Change metadata provider.""" diff --git a/transport_data/ipcc/structure.py b/transport_data/ipcc/structure.py new file mode 100644 index 0000000..166a283 --- /dev/null +++ b/transport_data/ipcc/structure.py @@ -0,0 +1,2 @@ +def gen_structures() -> None: + raise NotImplementedError diff --git a/transport_data/tests/test_ipcc.py b/transport_data/tests/test_ipcc.py new file mode 100644 index 0000000..b3a314d --- /dev/null +++ b/transport_data/tests/test_ipcc.py @@ -0,0 +1,5 @@ +from transport_data.ipcc.structure import gen_structures + + +def test_gen_structures() -> None: + gen_structures() From 0d94447bcb4d9806f9500103f0423f790ba3cedd Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Wed, 10 Jul 2024 16:19:01 +0200 Subject: [PATCH 02/34] Transcribe IPCC GNGGI Vol.2 Ch.3 Tab. 3.1.1 --- transport_data/ipcc/structure.py | 96 +++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/transport_data/ipcc/structure.py b/transport_data/ipcc/structure.py index 166a283..b16b3be 100644 --- a/transport_data/ipcc/structure.py +++ b/transport_data/ipcc/structure.py @@ -1,2 +1,96 @@ +from functools import lru_cache +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sdmx.model.common + + +@lru_cache +def get_agency() -> "sdmx.model.common.Agency": + """Return the IPCC :class:`.Agency`.""" + from sdmx.model import v21 + + return v21.Agency( + id="IPCC", + name="Intergovernmental Panel on Climate Change", + description="https://www.ipcc.ch/", + ) + + +def gen_cl_T311(**kwargs) -> "sdmx.model.Common.Codelist": + """Generate a code list from the GNGGI, Volume 2, Table 3.1.1.""" + from sdmx.model.common import Code, Codelist + + cl = Codelist( + id="CL_IPCC_2006_V2_T3.1.1", + name="Detailed sector split for the Transport sector", + description="""Transcribed from 2006 IPCC Guidelines for National Greenhouse Gas Inventories — Volume 2: Energy — Chapter 3: Mobile Combustion — Table 3.1.1, using the file https://www.ipcc-nggip.iges.or.jp/public/2006gl/pdf/2_Volume2/V2_3_Ch3_Mobile_Combustion.pdf, as linked from https://www.ipcc-nggip.iges.or.jp/public/2006gl/vol2.html. + +This version includes the 'Explanation' text from the table as the description for individual codes, but at the moment only for the code 1 A 3. 
For others, see the source document.""", + **kwargs, + ) + + # The codes have well-formed, hierarchical IDs, so it is possible to infer the ID of + # the parent code, if it exists. + def _c(id_, name, description=None): + """Shorthand for adding to `cl`.""" + try: + parent = cl[" ".join(id_.split()[:-1])] + except KeyError: + parent = None + + cl.append(Code(id=id_, name=name, description=description, parent=parent)) + + _c( + "1 A 3", + "TRANSPORT", + """Emissions from the combustion and evaporation of fuel for all transport activity (excluding military transport), regardless of the sector, specified by sub-categories below. + +Emissions from fuel sold to any air or marine vessel engaged in international transport (1 A 3 a i and 1 A 3 d i) should as far as possible be excluded from the totals and subtotals in this category and should be reported separately.""", + ) + _c("1 A 3 a", "Civil Aviation") + _c("1 A 3 a i", "International Aviation (International Bunkers)") + _c("1 A 3 a ii", "Domestic Aviation") + _c("1 A 3 b", "Road Transportation") + _c("1 A 3 b i", "Cars") + _c("1 A 3 b i 1", "Passenger cars with 3-way catalysts") + _c("1 A 3 b i 2", "Passenger cars without 3-way catalysts") + _c("1 A 3 b ii", "Light duty trucks") + _c("1 A 3 b ii 1", "Light-duty trucks with 3-way catalysts") + _c("1 A 3 b ii 2", "Light-duty trucks without 3-way catalysts") + _c("1 A 3 b iii", "Heavy duty trucks and buses") + _c("1 A 3 b iv", "Motorcycles") + _c("1 A 3 b v", "Evaporative emissions from vehicles") + _c("1 A 3 b vi", "Urea-based catalysts") + _c("1 A 3 c", "Railways") + _c("1 A 3 d", "Water-borne Navigation") + _c("1 A 3 d i", "International water-borne navigation (International bunkers)") + _c("1 A 3 d ii", "Domestic water-borne Navigation") + _c("1 A 3 e", "Other Transportation") + _c("1 A 3 e i", "Pipeline Transport") + _c("1 A 3 e ii", "Off-road") + _c("1 A 4 c iii", "Fishing (mobile combustion)") + _c("1 A 5 a", "Non specified stationary") + _c("1 A 5 b", "Non specified mobile") + + return cl + + def gen_structures() -> None: - raise NotImplementedError + """Create or update IPCC-maintained structural metadata. + + The structures have URNs like ``TDCI:CS_IPCC_{NAME}(0.1)`` + """ + from transport_data import STORE, org + + def _make_id(value: str) -> str: + return f"{get_agency().id}_{value}" + + ma_args = dict( + maintainer=org.get_agency()[0], + version="0.1", + is_final=True, + is_external_reference=False, + ) + + STORE.setdefault(gen_cl_T311(**ma_args)) From a3b127322061a8c0e0fde20b85038d2d41c8fccb Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 26 Jul 2024 12:03:53 +0200 Subject: [PATCH 03/34] Expand docs --- doc/giz.rst | 12 ++++++++++++ doc/index.rst | 4 ++++ doc/ipcc.rst | 10 ++++++++++ doc/roadmap.rst | 3 ++- doc/standards.rst | 4 ++++ doc/whatsnew.rst | 1 + 6 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 doc/giz.rst diff --git a/doc/giz.rst b/doc/giz.rst new file mode 100644 index 0000000..aa0bb12 --- /dev/null +++ b/doc/giz.rst @@ -0,0 +1,12 @@ +Gesellschaft für Internationale Zusammenarbeit (GIZ) +**************************************************** + +GIZ GmbH (`website `_, lit. *Corporation for International Development*) is the main German development agency. +It is not currently a direct provider of (meta)data through TDC, but its members initiated what is now the TDCI and support its activities, including development of this :mod:`.transport_data` package. +This work mainly appears in the :mod:`.org` and :mod:`.proto` modules. 
+ +TUEWAS +====== + +- “Transport, Environment, Energy, and Water in Asia” is an “internal sector network” of GIZ. +- Website: https://tuewas-asia.org/ diff --git a/doc/index.rst b/doc/index.rst index 425a155..c5c6b7c 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -53,7 +53,9 @@ They handle tasks including: adb estat + giz iamc + ipcc jrc oica org @@ -61,7 +63,9 @@ They handle tasks including: - :mod:`.adb`: :doc:`adb` - :mod:`.estat`: :doc:`estat` +- :doc:`giz` - :mod:`.iamc`: :doc:`iamc` +- :mod:`.ipcc`: :doc:`ipcc` - :mod:`.jrc`: :doc:`jrc` - :mod:`.oica`: :doc:`oica` - :mod:`.org`: :doc:`org` diff --git a/doc/ipcc.rst b/doc/ipcc.rst index ff60f0f..804ad67 100644 --- a/doc/ipcc.rst +++ b/doc/ipcc.rst @@ -2,3 +2,13 @@ Intergovernmental Panel on Climate Change (IPCC) ************************************************ .. include:: _api/transport_data.ipcc.rst + +References +========== + +Some of these references are to documents or webpages authored not by the IPCC *per se*, but by individuals or groups connected to the United Nations Framework Convention on Climate Change (UN FCCC). +Since :mod:`transport_data` does not currently have a :mod:`.unfccc` module, they are included here. + +- 2006 Guidelines for National Greenhouse Gas Inventories (`HTML `__) — Volume 2: Energy (`HTML `__) — Chapter 3: Mobile Combustion (`PDF (en) `__). +- 2023-01 Technical handbook for developing country Parties on Preparing for implementation of the enhanced transparency framework [ETF] under the Paris Agreement (`HTML `__, `PDF (en) `__). +- 2024-01-24 Compendium on Greenhouse Gas Baselines and Monitoring Passenger and Freight Transport (`HTML `__, `PDF (en) `__). diff --git a/doc/roadmap.rst b/doc/roadmap.rst index 42e3b55..282460c 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -1,4 +1,5 @@ Roadmap ******* -This page will give a longer-term overview of future development of :mod:`transport_data`, focused on the tools in this package, but with relevant details about the broader TDC and TDCI. +This page gives a medium- and long-term overview of future development of :mod:`transport_data`, focused on the tools in this package, but with relevant details about the broader TDC and TDCI. +See also `transport-data/projects/1 `_ on GitHub. diff --git a/doc/standards.rst b/doc/standards.rst index 4f4b3db..7eeaab6 100644 --- a/doc/standards.rst +++ b/doc/standards.rst @@ -129,6 +129,10 @@ If used, annotations with these IDs **must** conform to the given requirements: The function :func:`.anno_generated` generates such an annotation and **should** be called on all objects created in this package. +``preferred-unit`` + Especially for :class:`.Concept` in :class:`.ConceptScheme`, the preferred units of measurement if the concept is used as a measure. + These correspond to the well-known SDMX concept and attribute ``Concept=SDMX:CROSS_DOMAIN_CONCEPTS(2.0).UNIT_MEASURE``. + Codes ----- diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst index 5e64e72..b81f8d2 100644 --- a/doc/whatsnew.rst +++ b/doc/whatsnew.rst @@ -4,6 +4,7 @@ What's new Next release ============ +- Add :mod:`.ipcc` (:doc:`ipcc`) module (:issue:`15`, :pull:`21`). - Add :doc:`standards` and :doc:`roadmap` documentation pages (:pull:`9`). - Adjust :mod:`.adb` for changes in data format in the 2024-05-20 edition of the ATO National Database (:pull:`20`, :issue:`18`). Document the :ref:`current file format ` that the code supports. 
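
A short sketch may help to illustrate the ``preferred-unit`` annotation described in the doc/standards.rst addition above. It uses the same ``sdmx1`` classes that appear later in this series (for example in ``gen_cs_ch3()``); the concept ID and unit text are hypothetical values chosen only for the example::

    from sdmx.model.common import Annotation, Concept

    # Hypothetical measure concept; the ID and the unit text are illustrative
    # only, not part of any TDC-maintained structure.
    concept = Concept(id="ACTIVITY_VEHICLE", name="Vehicle activity")

    # Attach the preferred units of measurement as an annotation with the ID
    # given in doc/standards.rst
    concept.annotations.append(
        Annotation(id="preferred-unit", text="megavehicle km / year")
    )
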
From bc68ede3cadb940f996c2c929a62ed7e363aa39e Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Wed, 31 Jul 2024 13:54:55 +0200 Subject: [PATCH 04/34] Add types-openpyxl to mypy packages --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 103f8dd..8f10388 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,6 +10,7 @@ repos: - pytest - sdmx1 - Sphinx + - types-openpyxl - types-requests args: [] - repo: https://github.com/astral-sh/ruff-pre-commit From 2fa15a2ab0c929b463d2e3e3a8db0095ed142d34 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Wed, 31 Jul 2024 13:55:16 +0200 Subject: [PATCH 05/34] Gitignore generated .xlsx files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index f2002e3..bbe7e94 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,6 @@ dmypy.json # Editors/IDEs .vscode .ruff_cache + +# Generated by transport_data +*.xlsx From 4ff47312c4f23492e095976aba3fa3a04fb23984 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Wed, 31 Jul 2024 13:56:38 +0200 Subject: [PATCH 06/34] Add TDC metadata template (.org.metadata) --- transport_data/org/cli.py | 24 +++- transport_data/org/metadata.py | 228 +++++++++++++++++++++++++++++++++ 2 files changed, 247 insertions(+), 5 deletions(-) create mode 100644 transport_data/org/metadata.py diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index ba5dbba..6ffaa48 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -8,13 +8,27 @@ import click -from transport_data import STORE from transport_data.util.click import common_params -@click.command("org", params=common_params("version")) -def main(version): - """Information about the TDCI per se.""" +@click.group("org") +def main(): + """TDCI itself.""" + + +@main.command("refresh", params=common_params("version")) +def refresh(version): + """Update the TDCI metadata.""" + from transport_data import STORE + from . import get_agencyscheme - STORE.write(get_agencyscheme(version=version), force=True) + STORE.write(get_agencyscheme(version=version)) + + +@main.command("template") +def template(): + """Generate the metadata template.""" + from .metadata import make_workbook + + make_workbook() diff --git a/transport_data/org/metadata.py b/transport_data/org/metadata.py new file mode 100644 index 0000000..5570fac --- /dev/null +++ b/transport_data/org/metadata.py @@ -0,0 +1,228 @@ +from typing import TYPE_CHECKING, Tuple + +from sdmx.model.v21 import MetadataStructureDefinition + +if TYPE_CHECKING: + from openpyxl import Workbook + from openpyxl.worksheet.worksheet import Worksheet + +#: Concepts and metadata attributes in the TDC metadata structure. +CONCEPTS = { + "DATAFLOW": ( + "Data flow ID", + """A unique identifier for the data flow (=data source, data set, etc.). + +We suggest to use IDs like ‘VN001’, where ‘VN’ is the ISO 3166 alpha-2 country +code, and ‘001’ is a unique number. The value MUST match the name of the sheet +in which it appears.""", + ), + "DATA_PROVIDER": ( + "Data provider", + """Organization or individual that provides the data and any related metadata. + +This can be as general (“IEA”) or specific (organization unit/department, specific +person responsible, contact details, etc.) 
as appropriate.""", + ), + "URL": ( + "URL or web address", + "Location on the Internet with further information about the data flow.", + ), + "MEASURE": ( + "Measure (‘indicator’)", + """Statistical concept for which data are provided in the data flow. + +If the data flow contains data for multiple measures, give each one separated by +semicolons. Example: “Number of cars; passengers per vehicle”. + +This SHOULD NOT duplicate the value for ‘UNIT_MEASURE’. Example: “Annual driving +distance per vehicle”, not “Kilometres per vehicle”.""", + ), + "UNIT_MEASURE": ( + "Unit of measure", + """Unit in which the data values are expressed. + +If ‘MEASURE’ contains 2+ items separated by semicolons, give the respective units in the +same way and order. If there are no units, write ‘dimensionless’, ‘1’, or similar.""", + ), + "DIMENSION": ( + "Dimensions", + """Formally, the “statistical concept used in combination with other statistical +concepts to identify a statistical series or individual observations.” + +Record all dimensions of the data, either in a bulleted or numbered list, or +separated by semicolons. In parentheses, give some indication of the scope +and/or resolution of the data along each dimension. Most data have at least time +and space dimensions. + +Example: + +- TIME_PERIOD (annual, 5 years up to 2021) +- REF_AREA (whole country; VN only) +- Vehicle type (12 different types: […]) +- Emissions species (CO2 and 4 others)""", + ), + "DATA_DESCR": ( + "Data description", + """Any information about the data flow that does not fit in other attributes. + +Until or unless other metadata attributes are added to this metadata structure/ +template, this MAY include: + +- Any conditions on data access, e.g. publicly available, proprietary, fee or + subscription required, available on request, etc. +- Frequency of data updates. +- Any indication of quality, including third-party references that indicate data + quality. +""", + ), + "COMMENT": ( + "Comment", + """Any other information about the metadata values, for instance discrepancies or +unclear or missing information. + +Precede comments with initials; append to existing comments to keep +chronological order; and include a date (for example, “2024-07-24”) if helpful.""", + ), +} + +#: README text for the TDC metadata file format. +README_TEXT = """This file is an unofficial, prototype TDC format for metadata. +loosely imitates the Eurostat format. These files contain metadata (information +*about* data) based on the SDMX information model, but their layout (sheet +names, columns, etc.) is not specified by the SDMX standard, hence ‘unofficial’. + +This file has the following sheets. + +README +====== + +This sheet. + +Attributes +========== + +- One row per metadata attribute (or 'field'). +- Columns for the name; description; and ID (short and machine-readable) of each + attribute. See these descriptions to learn what to write for each attribute. + +One or more additional sheets +============================= + +- The name (or title) of each sheet corresponds to the identity (ID) of the data + flow that is described by the metadata in that sheet. +- In Column A, the name of the metadata attribute. Each name MUST exactly + match one appearing in the "Attributes" sheet. Some names MAY be omitted. +- In Column B, the actual metadata. These may be empty. + +TEMPLATE +======== + +To add information about additional data flows not included in existing sheets +(above), you can copy and rename this sheet. 
+""" + + +def _header(ws: "Worksheet", *columns: Tuple[str, int]) -> None: + """Write header columns and format their style and width.""" + for column, (value, width) in enumerate(columns, start=1): + cell = ws.cell(row=1, column=column, value=value) + cell.style = "header" + ws.column_dimensions[cell.column_letter].width = width + + +def add_readme(wb: "Workbook") -> None: + """Add a "README" sheet to `wb`.""" + ws = wb.create_sheet("README") + + _header(ws, ("Transport Data Commons (TDC) metadata", 72)) + ws["A3"] = README_TEXT + + +def add_attributes(wb: "Workbook", msd: "MetadataStructureDefinition"): + """Add an "Attributes" sheet to `wb` listing the metadata attributes from `msd`.""" + ws = wb.create_sheet("Attributes") + + _header( + ws, + ("Name", 20), # "Element name" in Eurostat + ("Description", 72), # Not present in Eurostat + ("ID", 20), # "Element code" in Eurostat + ) + + for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): + concept = attribute.concept_identity + ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" + ws.cell(row=row, column=2, value=concept.description.localized_default()) + ws.cell(row=row, column=3, value=attribute.id).style = "top" + + +def add_template(wb: "Workbook", msd: "MetadataStructureDefinition"): + """Add a "TEMPLATE" sheet to `wb` with a metadata template.""" + ws = wb.create_sheet("TEMPLATE") + + _header( + ws, + ("Attribute name", 20), # "Concept name" in Eurostat + ("Value", 72), # "Concept value" in Eurostat + ) + + for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): + concept = attribute.concept_identity + ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" + ws.cell(row=row, column=2, value="---") + + +def get_msd() -> "MetadataStructureDefinition": + from sdmx.model.common import Concept + from sdmx.model.v21 import ReportStructure + + from transport_data import STORE + + from . 
import get_agencyscheme + + as_ = get_agencyscheme() + msd = MetadataStructureDefinition(id="SIMPLE", version="1", maintainer=as_["TDCI"]) + rs = msd.report_structure["ALL"] = ReportStructure(id="ALL") + + for id_, (name, description) in CONCEPTS.items(): + ci = Concept(id=id_, name=name, description=description) + rs.getdefault(id_, concept_identity=ci) + + # NB Currently not supported by sdmx1; results in an empty XML collection + STORE.write(msd) + + return msd + + +def make_workbook(name="sample.xlsx") -> None: + """Generate a :class:`openpyxl.Workbook` for exchange of metadata.""" + from openpyxl import Workbook + from openpyxl.styles import Alignment, Font, NamedStyle, PatternFill + + wb = Workbook() + + # Delete the default sheet + assert wb.active + wb.remove(wb.active) + + # Create two named styles + header = NamedStyle(name="header") + header.fill = PatternFill("solid", fgColor="000000") + header.font = Font(bold=True, color="ffffff", name="Calibri") + wb.add_named_style(header) + + top = NamedStyle(name="top") + top.alignment = Alignment(vertical="top", wrap_text=True) + top.font = Font(name="Calibri") + wb.add_named_style(top) + + # Generate the metadata structure definition + msd = get_msd() + + # Add sheets + add_readme(wb) + add_attributes(wb, msd) + add_template(wb, msd) + + # Save the file + wb.save(name) From 343218cc00fda3ba53d53ac31b2d948642ff1f8b Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Wed, 31 Jul 2024 15:52:11 +0200 Subject: [PATCH 07/34] Add IPCC 2006 concept scheme --- transport_data/ipcc/structure.py | 193 +++++++++++++++++++++++++++++-- 1 file changed, 186 insertions(+), 7 deletions(-) diff --git a/transport_data/ipcc/structure.py b/transport_data/ipcc/structure.py index b16b3be..f7e5e33 100644 --- a/transport_data/ipcc/structure.py +++ b/transport_data/ipcc/structure.py @@ -1,3 +1,5 @@ +"""IPCC structural metadata.""" + from functools import lru_cache from typing import TYPE_CHECKING @@ -18,7 +20,16 @@ def get_agency() -> "sdmx.model.common.Agency": def gen_cl_T311(**kwargs) -> "sdmx.model.Common.Codelist": - """Generate a code list from the GNGGI, Volume 2, Table 3.1.1.""" + """Generate a code list from the GNGGI, Volume 2, Table 3.1.1. + + The generated code list's URN ends with ``Codelist=TDCI:CL_IPCC_2006_V2_T3.1.1(…)``. + + .. todo:: Expand to include 'Explanation' text from the table as descriptions for + codes. + + .. todo:: Include internationalized texts (names, descriptions) from the Arabic, + Chinese, French, Russian, and/or Spanish versions of the documents. + """ from sdmx.model.common import Code, Codelist cl = Codelist( @@ -76,15 +87,182 @@ def _c(id_, name, description=None): return cl -def gen_structures() -> None: - """Create or update IPCC-maintained structural metadata. +def gen_cs_ch3(**kwargs) -> "sdmx.model.common.ConceptScheme": + """Generate a scheme of concepts included in equations in Chapter 3. + + The generated code list's URN ends with + ``ConceptScheme=TDCI:CS_IPCC_2006_V2_CH3(…)``. - The structures have URNs like ``TDCI:CS_IPCC_{NAME}(0.1)`` + .. todo:: Include concepts used as table dimensions. + + .. todo:: Include internationalized texts (names, descriptions) from the Arabic, + Chinese, French, Russian, and/or Spanish versions of the documents. 
""" - from transport_data import STORE, org + from sdmx.model.common import Annotation, Concept, ConceptScheme + + cs = ConceptScheme( + id="CS_IPCC_2006_V2_CH3", name="Concepts appearing in equations", **kwargs + ) + + equation, page = "", "" + + def _c(id_, name=None, units=None, description=None): + c = Concept( + id=id_, + name=name, + description=f"First appears in Equation {equation} on p.{page}", + ) + + if units: + c.annotations.append(Annotation(id="preferred-units", text=units)) + cs.append(c) + + # §3.2 Road transportation + + equation, page = "3.2.1", "3.12" + _c( + "EMI 1", + "Emissions", + "kg", + """Variously "Emissions of CO₂" (Eq. 3.2.1), or of varying species (Eq. 3.2.3, 3.2.5)""", + ) + _c("Fuel 1", "Fuel sold", "TJ") + _c("EF 1", "Emission factor", "kg/TJ") + _c( + "a", + "Type of fuel (e.g. petrol, diesel, natural gas, lpg)", + None, + "In Eq 3.2.6, 'j' is used for the same concept.", + ) + + equation, page = "3.2.2", "3.12" + _c( + "EMI 2", + "CO₂ Emissions from urea-based additive in catalytic converters", + "Gg CO₂", + ) + _c( + "Activity", + "amount of urea-based additive consumed for use in catalytic converters", + "Gg", + ) + _c( + "Purity", + "the mass fraction (=percentage divided by 100) of urea in the urea-based additive", + ) + + # Eq. 3.2.3 —same concepts as 3.2.1 + + equation, page = "3.2.4", "3.13" + _c( + "Fuel 2", + "fuel consumed (as represented by fuel sold) for a given mobile source activity", + "TJ", + ) + _c( + "b", + "vehicle type", + None, + "In Eq 3.2.6, 'i' is used for the same concept (e.g., car, bus)", + ) + _c( + "c", + "emission control technology (such as uncontrolled, catalytic converter, etc.)", + ) + + equation, page = "3.2.5", "3.15" + _c("EF 2", "emission factor", "kg / km") + _c( + "Distance 1", + "distance travelled during thermally stabilized engine operation phase for a given mobile source activity", + "km", + ) + _c("C", "emissions during warm-up phase (cold start)", "kg") + _c( + "d", + "operating conditions (e.g. urban or rural road type, climate, or other environmental factors)", + ) + + equation, page = "3.2.6", "3.26" + _c( + "Estimated fuel", + "total estimated fuel use estimated from distance travelled (VKT) data", + "litre", + ) + _c("Vehicles", "number of vehicles of type i and using fuel j on road type t") + _c( + "Distance 2", + "annual kilometres travelled per vehicle of type i and using fuel j on road type t", + "km", + ) + _c("t", "type of road (e.g., urban, rural)") + + # §3.3 Off-road transportation + # Eq. 3.3.1 —no additional concepts + # Eq. 3.3.2 —no additional concepts + + equation, page = "3.3.3", "3.34" + _c( + "N", + "source population", + None, + """In Eq. 3.4.3 this is given as 'number of locomotives of type i".""", + ) + # Ditto below, all used in Eq. 3.4.3 + _c("H", "annual hours of use of vehicle i", "hour") + _c("P", "average rated power of vehicle i", "kW") + _c("LF", "typical load factor of vehicle i (fraction between 0 and 1)") + _c("EF 3", "average emission factor for use of fuel j in vehicle i", "kg / kWh") + + # Eq. 3.3.4 —no additional concepts - def _make_id(value: str) -> str: - return f"{get_agency().id}_{value}" + # §3.4 Railways + # Eq. 3.4.1 —no additional concepts + # Eq. 
3.4.2 —no additional concepts + + equation, page = "3.4.3", "3.42" + _c("i", "locomotive type and journey type") + + equation, page = "3.4.4", "3.43" + _c("EF 4", "engine specific emission factor for locomotive of type i", "kg/TJ") + _c("PWF", "pollutant weighting factor for locomotive of type i", "dimensionless") + _c("EF 5", "default emission factor for diesel (applies to CH₄, N₂O)", "kg/TJ") + + # §3.6 Civil Aviation + # Eq. 3.6.1 —no additional concepts + + equation, page = "3.6.2", "3.59" + _c( + "Emissions.LTO", + "", + None, + """'LTO' is defined on p.3.56 as "Landing/Take-Off cycle".""", + ) + _c( + "Emissions.Cruise", + "", + None, + """'Cruise' is defined on p.3.56 in contrast with 'LTO'.""", + ) + + equation, page = "3.6.3", "3.59" + _c("Number of LTOs") + _c("EF.LTO") + + equation, page = "3.6.4", "3.59" + _c("Fuel consumption.LTO") + _c("Fuel consumption per LTO") + + equation, page = "3.6.5", "3.59" + _c("Total Fuel Consumption") + _c("EF.Cruise") + + return cs + + +def gen_structures() -> None: + """Create or update IPCC-maintained structural metadata.""" + from transport_data import STORE, org ma_args = dict( maintainer=org.get_agency()[0], @@ -94,3 +272,4 @@ def _make_id(value: str) -> str: ) STORE.setdefault(gen_cl_T311(**ma_args)) + STORE.setdefault(gen_cs_ch3(**ma_args)) From 03ba34c9b141900507f10496bff2fa6cb0eb5fca Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 12 Aug 2024 00:00:28 +0200 Subject: [PATCH 08/34] Add doc/howto/metadata --- doc/howto/index.rst | 13 +++ doc/howto/metadata.rst | 238 +++++++++++++++++++++++++++++++++++++++++ doc/index.rst | 2 + doc/standards.rst | 2 + 4 files changed, 255 insertions(+) create mode 100644 doc/howto/index.rst create mode 100644 doc/howto/metadata.rst diff --git a/doc/howto/index.rst b/doc/howto/index.rst new file mode 100644 index 0000000..9e89722 --- /dev/null +++ b/doc/howto/index.rst @@ -0,0 +1,13 @@ +HOWTOs +****** + +This section contains practical **how-to guides**, instructions, and tutorials. +These are intended to illustrate and demonstrate how to work with TDC-compliant data and metadata, and develop code and other tools process such (meta)data. + +In contrast, the :doc:`/standards` are *prescriptive*. +Put another way, a HOWTO shows *just one possible or suggested* way to comply with the standards. + +.. toctree:: + :maxdepth: 1 + + metadata diff --git a/doc/howto/metadata.rst b/doc/howto/metadata.rst new file mode 100644 index 0000000..3bc87da --- /dev/null +++ b/doc/howto/metadata.rst @@ -0,0 +1,238 @@ +Record and update TDC-structured metadata +***************************************** + +This guide explains how to record and update **metadata** in a TDC-specific file format. [1]_ + +.. contents:: + :local: + +.. [1] The guide was developed as part of a project funded by :doc:`/giz` under the `TUEWAS Asia `_ network. + +Introduction +============ + +This HOWTO does *not* set out to give a comprehensive explanation of data and metadata. +For more information, you could consult some of the references linked under :ref:`std-defs` on the page :doc:`/standards`. + +What is metadata? +----------------- + +Metadata is **facts or information about data**, distinct from the *data itself* (i.e. particular numbers). +By recording, exchanging, and analyzing metadata, it is possible to understand and make decisions about data processing and usage—even *without* the actual data. + +Two kinds of metadata +--------------------- + +We are concerned with two kinds of metadata. 
+ +**Structural metadata**, as the name implies, give information about the *structure* of data. +For example, suppose we know that: + + In data set ‘X’, individual observations look like: in Canada, in 2023, 17.4 million apples and 13.8 million bananas were sold. + +We could describe the structure of this data by saying: + +- The data have 3 conceptual **dimensions**: country, time period, and kind of fruit. +- One dimension refers to *countries*—perhaps with labels like “Canada” (literally), or perhaps with short **codes** like “CA”. + These codes might form a certain, fixed, **code list**: *only* the codes in this list appear in the ‘country’ dimension. + + This information also can tell us about the **scope** and **resolution** of the data, along this particular dimension. + For example, the very name or ID of this dimension, ‘country’, implies the spatial resolution of the data: entire countries. + Or, if the ‘country’ code list includes (CA, MX, US), we understand the spatial scope of the data is “North America”. + +- A second dimension refers to the *time period*. + As with the ‘country’ dimension, we can understand the temporal resolution (here, probably years/annual) and scope (at least the year 2023 is included; perhaps also other years). +- A third dimension refers to *kinds of fruit*. + We see the scope is, at least, ‘apples’ and ‘bananas’. + We might also infer that there is no further resolution related to ‘kind of fruit’: for example, the data may not distinguish “honey crisp apple” from “Fuji apple”. +- We also see that the values are for a specific **measure**. + In this case, the measure is “number pieces of fruit were sold”. + The same real-world activity can be measured in different ways. + For example, fruit sold could also be measured as “total market value”, in **units of measurement** such as “US dollars equivalent at market exchange rates”. + +If structural metadata explain the ‘what’ of data (answering the question: “What do the data consist of?”), then **provenance metadata** is a general term for other information about *who* provides data; *how* data is collected, prepared, and published; *when* these things happen; and *where* the data can be found. + +How are metadata described, stored, and exchanged? +-------------------------------------------------- + +Once we have a metadata fact like, “Data set ‘A’ is published by the United Nations,” (Fact 1) we have to decide how to store this, exchange it, and compare it along with possibly many other pieces of metadata. + +This is most commonly done as plain text. + +TDC follows the **SDMX** standards for **Statistical Data and Metadata eXchange**. +Describing and storing metadata in a standards-based way allows to be clear and precise about its meaning. +For example, suppose we have a second fact, expressed as plain text: “Data set ‘B’ is UN data.” + +- Is Fact 2 the same as Fact 1, only phrased differently? +- Or are they diffent? + For example, is data set B “published by” a different agency than the UN, but contains “UN data” from UN data flows? + +By specifying SDMX **metadata attributes** and giving distinct **metadata values** for each, we try to reduce or eliminate this ambiguity. +In this example, we can distinguish *the identity of the data provider/publisher* from *the ‘upstream’ source of the data contained in the data flow*. + +These standards specify ways to describe and exchange metadata in different **file formats** that are machine-readable, such as `XML `_, JSON, and CSV. 
+Using machine-readable formats allows for semi-automated processing that can handle large amounts of (meta)data. + +XML and JSON, however, are not easily **human-readable**. +For this reason, there are + +For example, see https://ec.europa.eu/eurostat/cache/metadata/en/avia_if_esms.htm + +- This file contains metadata expressed in a human-readable HTML format. +- Click “Download” at the above URL, or access https://ec.europa.eu/eurostat/api/dissemination/files?file=metadata/avia_if_esms.sdmx.zip +- This archive contains: + + - The same :file:`.htm` file. + - Two :file:`.xml` files giving the metadata specification (:file:`ESMS_MSD.msd.xml`) and the actual metadata for this data flow (:file:`avia_if_esms.sdmx.xml`). + - A spreadsheet in Office Open XML (“Microsoft Excel”) format with an alternate, human-readable format (:file:`avia_if_esms.xlsx`) + +Understand the TDC metadata format +================================== + +TDC uses an an unofficial, prototype format for metadata. +This loosely imitates the above-mentioned Eurostat format. +These files contain metadata (information *about* data) based on the SDMX information model, but their file type (.xlsx) and layout (sheet names, columns, etc.) is not specified by the SDMX standard, hence ‘unofficial’. + +The files have the following sheets: + +“README” + Repeats information from this section of the HOWTO. + +“Attributes” + - One row per metadata attribute (or 'field'). + - Columns for the name; description; and ID (short and machine-readable) of each attribute. + See these descriptions to learn what to write for each attribute. + +One or more additional sheets named, e.g. “XX001” + - The name (or title) of each sheet corresponds to the identity (ID) of the data flow that is described by the metadata in that sheet. + - In Column A, the name of the metadata attribute. + Each name **must** exactly match one appearing in the "Attributes" sheet. + - In Column B, the actual metadata values. + These **may** be empty, but **should** contain some indication of why the metadata value is not available or recorded. + +“TEMPLATE” + To add information about additional data flows not included in existing sheets (above), you can copy and rename this sheet. + +Record and update metadata +========================== + +- Metadata will be provided as one or more spreadsheets. + These may be in a web-based, common, editable document, or as e-mail attachments, etc. +- Communicate clearly about which files are exchanged or edited. +- If files are not in a web-based, common, editable format, use “track changes” features in documents to distinguish your edits from existing comments. + +Update or correct existing sheets +--------------------------------- + +- Identify and change incorrect metadata. +- If reviewing existing metadata, update the “Comment” field even if all metadata appear correct. +- Preface comments with your initials or other identifying mark and, if necessary, the date. + For example: + + ABC (2024-08-11) Added UNIT_MEASURE. + XY (2024-09-12) Expanded DATA_DESC; corrected URL. + MN (2024-11-18) Check & confirmed. + +Add additional sheets +--------------------- + +- Duplicate either “TEMPLATE” or any other existing sheet. +- Choose a new, distinct ID for the data flow. +- Be detailed! + The ``DATA_DESC`` attribute is intended as a catch-all; use blank lines to separate different points of information about the data flow. +- Use simple language. 
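+
+As an illustration only, a filled sheet for a hypothetical data flow “XX001” might contain the following (attribute names in Column A, values in Column B; all values are invented)::
+
+   Data flow ID           XX001
+   Data provider          Ministry of Transport, Statistics Department
+   URL or web address     https://example.org/vehicle-statistics
+   Measure (‘indicator’)  Number of newly registered vehicles
+   Unit of measure        vehicles
+   Dimensions             TIME_PERIOD (annual, 2015–2023); REF_AREA (whole country); VEHICLE_TYPE (car, bus, truck, motorcycle)
+   Data description       Publicly available; updated annually.
+   Comment                AB (2024-08-11) Initial entry.
+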
+ +Use common IDs for concepts/dimensions +-------------------------------------- + +- If a similar concept, dimension, or code list appears in metadata for multiple data flows, try to use the same ID to identify these. +- Some known concepts/dimensions are listed below. +- If there are important distinctions with an existing concept ID—for example, if two data providers use the same name to mean very different things—add extra text in the ``DIMENSION``, ``DATA_DESC`` or other fields to explain. + +================== === +ID Possible values +================== === +ACCIDENT_TYPE e.g. fatal accidents, non-fatal injury accidents, injury accidents, vehicle damage only accidents +DESTINATION e.g. urban, rural +DRIVER_PASSENGER e.g. driver, passenger +FUEL_TYPE e.g. electric, petrol +GEO; REF_AREA Specific countries or regions +IMPORT_REG e.g. new import, first registration, used import +INJURY_TYPE e.g. killed, injured +INSTITUTION e.g. government, private firm, individual +MANUFACTURER e.g. Renault, Toyota +MODE e.g. road, rail +NEW_USED new; used +PUBLIC_PRIVATE public; private +ROAD_CONDITION e.g. paved, unpaved +ROAD_TYPE e.g. motorway, highway +ROAD_USER e.g. pedestrian, four-wheeled vehicle +SERVICE freight; passenger +SEX e.g. female; male; other +SOURCE (of revenue) e.g. toll, tax +TIME_PERIOD +TYPE_OF_SPEND e.g. construction, maintenance +VEHICLE_AGE +VEHICLE_TYPE e.g. passenger car, bus, scooter +================== === + +Avoid common ‘gotchas’ +====================== + +When handling metadata, there are some common issues that can arise. +This section lists a few, and appropriate responses. + +Large/composite “databases” +--------------------------- + +Often, the term “data set” is use informally to refer to a collection of many kinds of data. +An easy way to notice this happening is to see if each metadata attribute has a complicated value or multiple values. + +For example: + + Measure: GDP; population. + + Unit of measure: 2020 U.S. dollars; millions of people. + + Dimensions: time and country; time, country, sex, and age. + +In this example, we see there are in fact **two** data flows. +It is simpler to describe these separately. +If other metadata values for one data flow are identical to the values for another, make such a reference: + + Data description: Same as [DF00X]. + +Mixing measures and dimensions: the word “by” +--------------------------------------------- + +For example: + +- Data set A may be described as “Sales of cars by manufacturer” +- Data set B may be described as “Sales of cars by weight class” + +In this case, the word **“by”** is a clue that *the data have at least one specific dimension*. +For data set A, that specific named dimension is “manufacturer”. +For data set B, the dimension is “weight class”. + +However: + +- Both data sets actually capture *the same measure*—sales of cars—and may use the same units of measurement. +- Each data set probably has additional dimensions, besides the one singled out in the name or title. + For example, both data flow A and data flow B may have GEO and TIME_PERIOD dimensions. + It is possible that data flow B *also* has a “manufacturer” dimension, but this is merely omitted from the name or title. + +To avoid this ambiguity is to: + +- Always give the complete list of dimensions. +- Do not combine dimensions with the measure. +- Avoid mentioning just one or a few dimensions. 
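+
+Continuing the example above, an unambiguous description of data set A (“Sales of cars by manufacturer”) might give the measure as “sales (new registrations) of cars” and the dimensions as “GEO; TIME_PERIOD; MANUFACTURER; VEHICLE_TYPE”. (This dimension list is hypothetical; confirm it against the actual data source.)
+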
+ +Mixing measures and units of measure +------------------------------------ + +For example: + +- For data flow A, the measure is given as “passenger miles traveled”. +- For data flow B, the measure is given as “passenger kilometres”. + +With the above information, we can understand that these are *the same measure* (one we might call “passenger distance traveled”), but the *units of measurement* are different (in one case, miles; in the other, kilometres). diff --git a/doc/index.rst b/doc/index.rst index c5c6b7c..a63651d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -24,6 +24,7 @@ For more on the design, status, and plans for this package, see :doc:`dev`. usage dev standards + howto/index roadmap whatsnew @@ -33,6 +34,7 @@ General - :doc:`usage` - :doc:`dev` - :doc:`standards` +- :doc:`howto/index` - :doc:`roadmap` - :doc:`whatsnew` diff --git a/doc/standards.rst b/doc/standards.rst index 7eeaab6..eb6e2f7 100644 --- a/doc/standards.rst +++ b/doc/standards.rst @@ -35,6 +35,8 @@ These standards: TDCI develops and supports community development of tutorials, guides, explainers, in multiple languages and media, targeted to different audiences, that help with the adoption of these standards. These **must** promote a correct implementation of the standards, and **may** give more detailed and elaborate examples. +.. _std-defs: + Definitions =========== From f0a32add68d6bcfa07e04558d383104435d631cd Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 6 Sep 2024 10:44:13 +0200 Subject: [PATCH 09/34] Read metadata expressed in the template --- transport_data/org/metadata.py | 136 ++++++++++++++++++++++++++++++++- 1 file changed, 133 insertions(+), 3 deletions(-) diff --git a/transport_data/org/metadata.py b/transport_data/org/metadata.py index 5570fac..41ee353 100644 --- a/transport_data/org/metadata.py +++ b/transport_data/org/metadata.py @@ -1,11 +1,18 @@ -from typing import TYPE_CHECKING, Tuple +import logging +from collections import defaultdict +from typing import TYPE_CHECKING, Optional, Tuple +from sdmx.model import v21 from sdmx.model.v21 import MetadataStructureDefinition if TYPE_CHECKING: + import pathlib + from openpyxl import Workbook from openpyxl.worksheet.worksheet import Worksheet +log = logging.getLogger(__name__) + #: Concepts and metadata attributes in the TDC metadata structure. CONCEPTS = { "DATAFLOW": ( @@ -173,7 +180,7 @@ def add_template(wb: "Workbook", msd: "MetadataStructureDefinition"): def get_msd() -> "MetadataStructureDefinition": - from sdmx.model.common import Concept + from sdmx.model.common import ConceptScheme from sdmx.model.v21 import ReportStructure from transport_data import STORE @@ -181,11 +188,12 @@ def get_msd() -> "MetadataStructureDefinition": from . 
import get_agencyscheme as_ = get_agencyscheme() + cs = ConceptScheme(id="METADATA_CONCEPTS", maintainer=as_["TDCI"]) msd = MetadataStructureDefinition(id="SIMPLE", version="1", maintainer=as_["TDCI"]) rs = msd.report_structure["ALL"] = ReportStructure(id="ALL") for id_, (name, description) in CONCEPTS.items(): - ci = Concept(id=id_, name=name, description=description) + ci = cs.setdefault(id=id_, name=name, description=description) rs.getdefault(id_, concept_identity=ci) # NB Currently not supported by sdmx1; results in an empty XML collection @@ -226,3 +234,125 @@ def make_workbook(name="sample.xlsx") -> None: # Save the file wb.save(name) + + +def read_workbook(path: "pathlib.Path") -> "v21.MetadataSet": + """Read a metadata set from the workbook at `path`.""" + from openpyxl import load_workbook + + wb = load_workbook(path) + # Generate/retrieve the metadata structure definition + msd = get_msd() + + mds = v21.MetadataSet(structured_by=msd) + + for ws in wb.worksheets: + # Skip information sheets generated by methods in this file + if ws.title in ("README", "Attributes", "TEMPLATE"): + continue + + mds.report.append(read_worksheet(ws, msd)) + + return mds + + +def read_worksheet( + ws: "Worksheet", msd: "MetadataStructureDefinition" +) -> "v21.MetadataReport": + """Read a metadata report from the worksheet `ws`. + + Parameters + ---------- + msd : + Metadata structure definition. + """ + # Mapping from names (not IDs) to MetadataAttributes + mda_for_name = { + str(mda.concept_identity.name): mda + for mda in msd.report_structure["ALL"].components + } + + # Create the target of the report: a data flow definition + # TODO Expand this DFD and its associated data structure definition + df_id_from_title = ws.title + dfd = v21.DataflowDefinition(id=ws.title, maintainer=msd.maintainer) + + # Create objects to associate the metadata report with the data flow definition + iot = v21.IdentifiableObjectTarget() + tok = v21.TargetObjectKey( + key_values={"DATAFLOW": v21.TargetIdentifiableObject(value_for=iot, obj=dfd)} + ) + + # Create the report itself + mdr = v21.MetadataReport() + mdr.attaches_to = tok + + # Iterate over rows in the worksheet + mda = None + for row in ws.iter_rows(): + # Column B: value in the row + ra_value = row[1].value + + if ra_value is None: + continue + + # Column A: name of the metadata attribute + mda_name = row[0].value + + # Identify the MDA + # NB if `mda_name` is none, then `mda` retains the value found on the previous + # row. This allows e.g. multiple rows to give values for DIMENSION + # TODO Protect against other malformed data. 
+ mda = mda_for_name.get(str(mda_name), mda) + + # Store as OtherNonEnumeratedAttributeValue + # TODO Use EnumeratedAttributeValue, once code lists are available corresponding + # to dimensions + ra = v21.OtherNonEnumeratedAttributeValue(value=str(ra_value), value_for=mda) + + # Attend the reported attribute to the report + mdr.metadata.append(ra) + + # Basic checks + df_id_from_cell = _get(mdr, "DATAFLOW") + if not df_id_from_cell: + log.warning(f"Sheet {df_id_from_title!r} does not identify a data flow; skip") + + return mdr + + +def _get(mdr: "v21.MetadataReport", mda_id: str) -> Optional[str]: + """Retrieve from `mdr` the reported value of the metadata attribute `mda_id`.""" + for mda in mdr.metadata: + if mda.value_for is not None and mda.value_for.id == mda_id: + assert hasattr(mda, "value") # Exclude ReportedAttribute without value attr + return mda.value + # No match + return None + + +def summarize_metadataset(mds: "v21.MetadataSet") -> None: + """Print a summary of the contents of `mds`.""" + print(f"Metadata set containing {len(mds.report)} metadata reports") + + def uline(text: str, char: str = "=") -> str: + """Underline `text`.""" + return f"{text}\n{char * len(text)}" + + def summarize_metadataattribute(mda_id: str) -> None: + """Summarize unique values appear in metadata for attribute `mda_id`.""" + value_id = defaultdict(set) + + for r in mds.report: + value_id[_get(r, mda_id) or "MISSING"].add(_get(r, "DATAFLOW") or "MISSING") + + assert mds.structured_by + mda = mds.structured_by.report_structure["ALL"].get(mda_id) + + print("\n\n" + uline(f"{mda}: {len(value_id)} unique values")) + for value, df_ids in sorted(value_id.items()): + print(f"{value}\n " + " ".join(sorted(df_ids))) + + summarize_metadataattribute("MEASURE") + summarize_metadataattribute("DATA_PROVIDER") + summarize_metadataattribute("UNIT_MEASURE") From ae31d9ef9ffe0eb64e77ce144a7dea86b7bfa4d9 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 6 Sep 2024 10:45:05 +0200 Subject: [PATCH 10/34] Improve short help for adb, jrc, oica CLI --- transport_data/adb/cli.py | 2 +- transport_data/jrc/cli.py | 2 +- transport_data/oica/cli.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/transport_data/adb/cli.py b/transport_data/adb/cli.py index 7e22007..4678715 100644 --- a/transport_data/adb/cli.py +++ b/transport_data/adb/cli.py @@ -18,7 +18,7 @@ from . import FILES, convert, fetch -@click.group("adb", help=__doc__) +@click.group("adb") def main(): """Asian Development Bank (ADB) provider.""" diff --git a/transport_data/jrc/cli.py b/transport_data/jrc/cli.py index 7744297..80cf86e 100644 --- a/transport_data/jrc/cli.py +++ b/transport_data/jrc/cli.py @@ -18,7 +18,7 @@ from . 
import GEO, convert, fetch -@click.group("jrc", help=__doc__.splitlines()[0]) +@click.group("jrc") def main(): """EU Joint Research Center (JRC) provider.""" diff --git a/transport_data/oica/cli.py b/transport_data/oica/cli.py index 2b56c4a..4da6635 100644 --- a/transport_data/oica/cli.py +++ b/transport_data/oica/cli.py @@ -17,7 +17,7 @@ import click -@click.group("oica", help=__doc__.splitlines()[0]) +@click.group("oica", short_help="OICA provider.") def main(): """International Organization of Motor Vehicle Manufacturers (OICA) provider.""" From 828ff63e4e055beb05d513b723fadbffad108b77 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 6 Sep 2024 10:45:50 +0200 Subject: [PATCH 11/34] Add "tdc org read" CLI command --- transport_data/org/cli.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index 6ffaa48..51bcb2c 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -6,6 +6,8 @@ """ +import pathlib + import click from transport_data.util.click import common_params @@ -26,6 +28,18 @@ def refresh(version): STORE.write(get_agencyscheme(version=version)) +@main.command("read") +@click.argument( + "path", type=click.Path(exists=True, dir_okay=False, path_type=pathlib.Path) +) +def read(path: "pathlib.Path"): + """Read and summarize metadata.""" + from .metadata import read_workbook, summarize_metadataset + + mds = read_workbook(path.resolve()) + summarize_metadataset(mds) + + @main.command("template") def template(): """Generate the metadata template.""" From d3aec7ace74bcaef16de6cd1b5162a2b100832d5 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 6 Sep 2024 10:46:34 +0200 Subject: [PATCH 12/34] Add test_data_path pytest fixture --- transport_data/testing.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/transport_data/testing.py b/transport_data/testing.py index e79a349..82d8ccb 100644 --- a/transport_data/testing.py +++ b/transport_data/testing.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Generator, cast import pytest @@ -56,6 +57,12 @@ def sdmx_structures(tmp_store) -> sdmx.message.StructureMessage: return sm +@pytest.fixture(scope="session") +def test_data_path() -> Generator[Path, None, None]: + """Path containing test data.""" + yield Path(__file__).parent.joinpath("data", "tests") + + @pytest.fixture(scope="session") def tmp_config(tmp_path_factory) -> Generator[Config, None, None]: """A :class:`.Config` instance pointing to a temporary directory.""" From 3059f2f7af55690f3d2817d22b2c80bb00e6e79f Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 6 Sep 2024 10:49:48 +0200 Subject: [PATCH 13/34] Add tests of reading metadata, writing template - Add Git LFS configuration, .gitattributes. - Add metadata specimen. - Clone LFS files in "pytest" CI workflow. 
--- .gitattributes | 2 ++ .github/workflows/pytest.yaml | 2 ++ transport_data/data/tests/metadata-input.xlsx | 3 ++ transport_data/tests/org/__init__.py | 0 transport_data/tests/org/test_metadata.py | 32 +++++++++++++++++++ 5 files changed, 39 insertions(+) create mode 100644 transport_data/data/tests/metadata-input.xlsx create mode 100644 transport_data/tests/org/__init__.py create mode 100644 transport_data/tests/org/test_metadata.py diff --git a/.gitattributes b/.gitattributes index d4661d8..e66f66a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ # Reduce the number of merge/rebase conflicts doc/whatsnew.rst merge=union +# Git LFS +*.xlsx filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index 5b9299c..f52c22d 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -44,6 +44,8 @@ jobs: steps: - uses: actions/checkout@v4 + with: + lfs: true - uses: actions/setup-python@v5 with: diff --git a/transport_data/data/tests/metadata-input.xlsx b/transport_data/data/tests/metadata-input.xlsx new file mode 100644 index 0000000..df2dbce --- /dev/null +++ b/transport_data/data/tests/metadata-input.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:380dfcd70698fc387020074845a127f23cbca2746ae66cc9213603185b4dbbe6 +size 100078 diff --git a/transport_data/tests/org/__init__.py b/transport_data/tests/org/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py new file mode 100644 index 0000000..289dd90 --- /dev/null +++ b/transport_data/tests/org/test_metadata.py @@ -0,0 +1,32 @@ +from transport_data.org.metadata import ( + make_workbook, + read_workbook, + summarize_metadataset, +) + + +def test_make_workbook(tmp_path) -> None: + make_workbook() + + +def test_read_workbook(test_data_path) -> None: + # Function runs successfully + result = read_workbook(test_data_path.joinpath("metadata-input.xlsx")) + + # Result has a certain number of metadata reports + assert 47 == len(result.report) + + +def test_summarize_metadataset(capsys, test_data_path) -> None: + mds = read_workbook(test_data_path.joinpath("metadata-input.xlsx")) + + # Function runs successfully + summarize_metadataset(mds) + + captured = capsys.readouterr() + # pathlib.Path("debug.txt").write_text(captured.out) # DEBUG Write to a file + + # Output contains certain text + assert "MEASURE: 40 unique values" in captured.out + + # TODO expand with further assertions From af91ca7d171d9113015a376a2ca8c4f7d211ce00 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 6 Sep 2024 10:57:35 +0200 Subject: [PATCH 14/34] Revert "Work around actions/setup-python#696" This reverts commit 1bf3b9b1e4352bcb6a06753f633ed3bf4b8a173a. 
--- .github/workflows/pytest.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index f52c22d..b3673fa 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -29,14 +29,6 @@ jobs: - "3.11" - "3.12" - # Work around https://github.com/actions/setup-python/issues/696 - exclude: - - {os: macos-latest, python-version: "3.8"} - - {os: macos-latest, python-version: "3.9"} - include: - - {os: macos-13, python-version: "3.8"} - - {os: macos-13, python-version: "3.9"} - fail-fast: false runs-on: ${{ matrix.os }} From 355d8f17d1b437f8bfbea829f55d3e678031f5a4 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 6 Sep 2024 11:44:23 +0200 Subject: [PATCH 15/34] Implement get_agencies() hook using pluggy - Add pluggy to dependencies. - Define get_agencies() hook spec. - Rename get_agency() to get_agencies() everywhere. - Use hooks in .org.get_agencyscheme() --- pyproject.toml | 1 + transport_data/adb/__init__.py | 15 +++++++++------ transport_data/iamc/__init__.py | 10 +++++++--- transport_data/ipcc/structure.py | 12 +++++++----- transport_data/jrc/__init__.py | 12 +++++++----- transport_data/oica/__init__.py | 24 ++++++++++++++++-------- transport_data/org/__init__.py | 25 +++++++++++++------------ transport_data/tests/test_org.py | 5 ++++- transport_data/util/hooks.py | 17 +++++++++++++++++ transport_data/util/pluggy.py | 24 ++++++++++++++++++++++++ 10 files changed, 105 insertions(+), 40 deletions(-) create mode 100644 transport_data/util/hooks.py create mode 100644 transport_data/util/pluggy.py diff --git a/pyproject.toml b/pyproject.toml index 58ed5f2..9194edf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dependencies = [ "packaging", "pandas", "platformdirs", + "pluggy", "pooch", "pycountry", "requests", diff --git a/transport_data/adb/__init__.py b/transport_data/adb/__init__.py index ea7c505..97b2a07 100644 --- a/transport_data/adb/__init__.py +++ b/transport_data/adb/__init__.py @@ -9,12 +9,13 @@ import sdmx.model.v21 as m from transport_data import STORE as registry +from transport_data.util.pluggy import hookimpl from transport_data.util.pooch import Pooch from transport_data.util.sdmx import anno_generated -def get_agency() -> m.Agency: - # Agency +@hookimpl +def get_agencies(): a = m.Agency( id="ADB", name="Asian Transport Outlook team at the Asian Development Bank", @@ -26,7 +27,7 @@ def get_agency() -> m.Agency: c3 = m.Contact(name="Sudhir Gota", email=["sudhirgota@gmail.com"]) a.contact.extend([c1, c2, c3]) - return a + return (a,) BASE_URL = "https://asiantransportoutlook.com/exportdl?orig=1" @@ -240,10 +241,12 @@ def prepare(aa: m.AnnotableArtefact) -> Tuple[m.DataSet, Callable]: # Data structure definition with an ID matching the measure # NB here we set ADB as the maintainer. Precisely, ADB establishes the data # structure, but TDCI is maintaining the SDMX representation of it. 
- dsd = m.DataStructureDefinition(id=measure_id, maintainer=get_agency()) + dsd = m.DataStructureDefinition(id=measure_id, maintainer=get_agencies()[0]) anno_generated(dsd) - dfd = m.DataflowDefinition(id=measure_id, maintainer=get_agency(), structure=dsd) + dfd = m.DataflowDefinition( + id=measure_id, maintainer=get_agencies()[0], structure=dsd + ) pm = m.PrimaryMeasure(id="OBS_VALUE", concept_identity=c) dsd.measures.append(pm) @@ -317,7 +320,7 @@ def convert(part): # Write the lists of "Economy" codes and measures/concepts accumulated while # converting - a = get_agency() + a = get_agencies()[0] for obj in (CL_ECONOMY, CS_MEASURE): obj.maintainer = a obj.version = "0.1.0" diff --git a/transport_data/iamc/__init__.py b/transport_data/iamc/__init__.py index 21a385d..5d2e429 100644 --- a/transport_data/iamc/__init__.py +++ b/transport_data/iamc/__init__.py @@ -11,15 +11,19 @@ import sdmx.model.v21 as m from sdmx.message import StructureMessage +from transport_data.util.pluggy import hookimpl + log = logging.getLogger(__name__) -def get_agency(): - return m.Agency( +@hookimpl +def get_agencies(): + a = m.Agency( id="IAMC", name="Integrated Assessment Modeling Consortium", contact=[m.Contact(uri=["https://iamconsortium.org"])], ) + return (a,) def common_structures(): @@ -31,7 +35,7 @@ def common_structures(): with id "IAMC", containing the concepts for the IAMC dimensions and attribute. """ cs = m.ConceptScheme( - id="IAMC", name="Concepts in the IAMC data model", maintainer=get_agency() + id="IAMC", name="Concepts in the IAMC data model", maintainer=get_agencies()[0] ) cs.extend( diff --git a/transport_data/ipcc/structure.py b/transport_data/ipcc/structure.py index f7e5e33..7315ee7 100644 --- a/transport_data/ipcc/structure.py +++ b/transport_data/ipcc/structure.py @@ -1,22 +1,24 @@ """IPCC structural metadata.""" -from functools import lru_cache from typing import TYPE_CHECKING +from transport_data.util.pluggy import hookimpl + if TYPE_CHECKING: import sdmx.model.common -@lru_cache -def get_agency() -> "sdmx.model.common.Agency": +@hookimpl +def get_agencies(): """Return the IPCC :class:`.Agency`.""" from sdmx.model import v21 - return v21.Agency( + a = v21.Agency( id="IPCC", name="Intergovernmental Panel on Climate Change", description="https://www.ipcc.ch/", ) + return (a,) def gen_cl_T311(**kwargs) -> "sdmx.model.Common.Codelist": @@ -265,7 +267,7 @@ def gen_structures() -> None: from transport_data import STORE, org ma_args = dict( - maintainer=org.get_agency()[0], + maintainer=org.get_agencies()[0], version="0.1", is_final=True, is_external_reference=False, diff --git a/transport_data/jrc/__init__.py b/transport_data/jrc/__init__.py index d1cdcd9..6b5343b 100644 --- a/transport_data/jrc/__init__.py +++ b/transport_data/jrc/__init__.py @@ -24,11 +24,13 @@ import sdmx.model.v21 as m from transport_data import STORE as registry +from transport_data.util.pluggy import hookimpl from transport_data.util.pooch import Pooch from transport_data.util.sdmx import anno_generated -def get_agency() -> m.Agency: +@hookimpl +def get_agencies(): """Return information about the agency providing the data set. See :func:`.org.get_agencyscheme`. 
@@ -46,7 +48,7 @@ def get_agency() -> m.Agency: m.Contact(name="Jacopo Tattini", email=["Jacopo.TATTINI@ec.europa.eu"]) ) - return a + return (a,) BASE_URL = ( @@ -448,7 +450,7 @@ def convert(geo): registry.write(obj) # Write code lists, measure concept scheme to file - a = get_agency() + a = get_agencies()[0] for obj in chain(CL.values(), [CS_MEASURE]): obj.maintainer = a obj.version = "0.1.0" @@ -468,12 +470,12 @@ def prepare(measure_concept, dims): # NB here we set ADB as the maintainer. Precisely, ADB establishes the data # structure, but TDCI is maintaining the SDMX representation of it. dsd = m.DataStructureDefinition( - id=measure_id, maintainer=get_agency(), version="0.0.0" + id=measure_id, maintainer=get_agencies()[0], version="0.0.0" ) anno_generated(dsd) dfd = m.DataflowDefinition( - id=measure_id, maintainer=get_agency(), version="0.0.0", structure=dsd + id=measure_id, maintainer=get_agencies()[0], version="0.0.0", structure=dsd ) pm = m.PrimaryMeasure(id="OBS_VALUE", concept_identity=c) diff --git a/transport_data/oica/__init__.py b/transport_data/oica/__init__.py index 61cd027..55fe7e0 100644 --- a/transport_data/oica/__init__.py +++ b/transport_data/oica/__init__.py @@ -20,6 +20,7 @@ import pandas as pd +from transport_data.util.pluggy import hookimpl from transport_data.util.pooch import Pooch if TYPE_CHECKING: @@ -165,7 +166,9 @@ def convert_single_file( # Prepare a GEO codelist and map using the "GEO" column cl_geo = get_cl_geo() - geo_map = _make_geo_codes(cl_geo, df["GEO"], maintainer=get_agency(), version="0.1") + geo_map = _make_geo_codes( + cl_geo, df["GEO"], maintainer=get_agencies()[0], version="0.1" + ) # Store `cl_geo` STORE.write(cl_geo) @@ -327,16 +330,17 @@ def _make_code(value: str): return id_for_name -@lru_cache -def get_agency() -> "sdmx.model.common.Agency": +@hookimpl +def get_agencies(): """Return the OICA Agency.""" from sdmx.model import v21 - return v21.Agency( + a = v21.Agency( id="OICA", name="International Organization of Motor Vehicle Manufacturers", description="https://www.oica.net", ) + return (a,) def get_cl_geo() -> "sdmx.model.common.Codelist": @@ -346,7 +350,9 @@ def get_cl_geo() -> "sdmx.model.common.Codelist": from transport_data import STORE, org candidate: common.Codelist = common.Codelist( - id=f"{get_agency().id}_GEO", maintainer=org.get_agency()[0], version="0.1" + id=f"{get_agencies()[0].id}_GEO", + maintainer=org.get_agencies()[0], + version="0.1", ) return STORE.setdefault(candidate) @@ -360,7 +366,9 @@ def get_conceptscheme() -> "sdmx.model.common.ConceptScheme": from transport_data import STORE, org cs = common.ConceptScheme( - id=f"{get_agency().id}_CONCEPTS", maintainer=org.get_agency()[0], version="0.1" + id=f"{get_agencies()[0].id}_CONCEPTS", + maintainer=org.get_agencies()[0], + version="0.1", ) # Measures @@ -415,9 +423,9 @@ def get_structures( from transport_data import STORE, org - base = f"{get_agency().id}_{measure}" + base = f"{get_agencies()[0].id}_{measure}" ma_args = dict( - maintainer=org.get_agency()[0], + maintainer=org.get_agencies()[0], version="0.1", is_final=False, is_external_reference=False, diff --git a/transport_data/org/__init__.py b/transport_data/org/__init__.py index 90515be..841cc03 100644 --- a/transport_data/org/__init__.py +++ b/transport_data/org/__init__.py @@ -1,18 +1,20 @@ """Information about the TDCI *per se*.""" from datetime import date -from importlib import import_module +from itertools import chain from typing import TYPE_CHECKING, Union import sdmx.model.v21 as m from transport_data 
import STORE as registry +from transport_data.util.pluggy import hookimpl, pm, register_internal if TYPE_CHECKING: import sdmx.model.v21 -def get_agency() -> "sdmx.model.v21.Agency": +@hookimpl +def get_agencies() -> "sdmx.model.v21.Agency": # Agency a1 = m.Agency( id="TDCI", @@ -48,8 +50,6 @@ def get_agency() -> "sdmx.model.v21.Agency": def get_agencyscheme(version: Union[None, str] = None) -> "sdmx.model.v21.AgencyScheme": """Generate an AgencyScheme including some TDCI data providers.""" - agencies = get_agency() - as_ = m.AgencyScheme( id="TDCI", # NameableArtefact @@ -57,17 +57,18 @@ def get_agencyscheme(version: Union[None, str] = None) -> "sdmx.model.v21.Agency # VersionableArtefact valid_from=date.today().isoformat(), # MaintainableArtefact - maintainer=agencies[0], + maintainer=None, ) - for a in agencies: - as_.append(a) + # Use plugin hooks to collect Agency objects from within transport_data or other + # registered code + register_internal() + + for agency in chain(*pm.hook.get_agencies()): + as_.append(agency) - # Add agencies with corresponding modules in this repository - for id_ in ("adb", "jrc"): - module = import_module(f"transport_data.{id_}") - # Call a function named get_agency() in the module - as_.append(module.get_agency()) + # TDCI itself is the maintainer + as_.maintainer = as_["TDCI"] as_.version = version if as_.version is None: diff --git a/transport_data/tests/test_org.py b/transport_data/tests/test_org.py index 9e19b2e..0ca27ed 100644 --- a/transport_data/tests/test_org.py +++ b/transport_data/tests/test_org.py @@ -2,7 +2,10 @@ def test_get_agencyscheme() -> None: - get_agencyscheme() + as_ = get_agencyscheme() + + # Number of agencies associated with code in the transport_data repo + assert 7 == len(as_) def test_refresh() -> None: diff --git a/transport_data/util/hooks.py b/transport_data/util/hooks.py new file mode 100644 index 0000000..93b15c4 --- /dev/null +++ b/transport_data/util/hooks.py @@ -0,0 +1,17 @@ +from typing import TYPE_CHECKING, Iterable + +import pluggy + +if TYPE_CHECKING: + import sdmx.model.v21 + +hookspec = pluggy.HookspecMarker("transport_data") + + +@hookspec +def get_agencies() -> Iterable["sdmx.model.v21.Agency"]: + """Return :class:`sdmx.model.common.Agency` identifying (meta)data provider(s). + + An implementation **must** return an iterable of 0 or more Agency instances. + """ + raise NotImplementedError diff --git a/transport_data/util/pluggy.py b/transport_data/util/pluggy.py new file mode 100644 index 0000000..f4d593a --- /dev/null +++ b/transport_data/util/pluggy.py @@ -0,0 +1,24 @@ +from importlib import import_module + +import pluggy + +from . import hooks + +hookimpl = pluggy.HookimplMarker("transport_data") + + +pm = pluggy.PluginManager("transport_data") +pm.add_hookspecs(hooks) + + +def register_internal(): + """Register hook implementations from all modules that contain them. + + .. todo:: Automatically do this for all top-level submodules of transport_data. 
+ """ + + for id_ in ("adb", "iamc", "ipcc.structure", "jrc", "oica", "org"): + try: + pm.register(import_module(f"transport_data.{id_}")) + except ValueError: + pass From 468465ceef915a00209ba82bee3e78438fa95280 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 30 Sep 2024 10:11:55 +0200 Subject: [PATCH 16/34] Improve .org.metadata - Add get_cs_common() - Add getdefault() - Add parse_dimension() - Add summarize_metadatareport() - Add update_dimension_descriptor() --- transport_data/org/metadata.py | 278 +++++++++++++++++++++++++++------ 1 file changed, 232 insertions(+), 46 deletions(-) diff --git a/transport_data/org/metadata.py b/transport_data/org/metadata.py index 41ee353..d1918f1 100644 --- a/transport_data/org/metadata.py +++ b/transport_data/org/metadata.py @@ -1,9 +1,10 @@ import logging +import re from collections import defaultdict -from typing import TYPE_CHECKING, Optional, Tuple +from functools import lru_cache +from typing import TYPE_CHECKING, List, Optional, Tuple -from sdmx.model import v21 -from sdmx.model.v21 import MetadataStructureDefinition +from sdmx.model import common, v21 if TYPE_CHECKING: import pathlib @@ -145,7 +146,7 @@ def add_readme(wb: "Workbook") -> None: ws["A3"] = README_TEXT -def add_attributes(wb: "Workbook", msd: "MetadataStructureDefinition"): +def add_attributes(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): """Add an "Attributes" sheet to `wb` listing the metadata attributes from `msd`.""" ws = wb.create_sheet("Attributes") @@ -163,7 +164,7 @@ def add_attributes(wb: "Workbook", msd: "MetadataStructureDefinition"): ws.cell(row=row, column=3, value=attribute.id).style = "top" -def add_template(wb: "Workbook", msd: "MetadataStructureDefinition"): +def add_template(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): """Add a "TEMPLATE" sheet to `wb` with a metadata template.""" ws = wb.create_sheet("TEMPLATE") @@ -179,18 +180,56 @@ def add_template(wb: "Workbook", msd: "MetadataStructureDefinition"): ws.cell(row=row, column=2, value="---") -def get_msd() -> "MetadataStructureDefinition": - from sdmx.model.common import ConceptScheme - from sdmx.model.v21 import ReportStructure +@lru_cache +def get_cs_common() -> "common.ConceptScheme": + """Create a shared concept scheme for the concepts referenced by dimensions. + Concepts in this scheme have an annotation ``tdc-aka``, which is a list of alternate + IDs recognized for the concept. + """ + from . import get_agencyscheme + + as_ = get_agencyscheme() + cs = common.ConceptScheme(id="CONCEPTS", maintainer=as_["TDCI"]) + + cs.setdefault( + id="CONFIDENTIALITY", + annotations=[common.Annotation(id="tdc-aka", text=repr(["CONFIDIENTALITY"]))], + ) + cs.setdefault( + id="FUEL_TYPE", + annotations=[common.Annotation(id="tdc-aka", text=repr(["Fuel type"]))], + ) + cs.setdefault( + id="REF_AREA", + annotations=[ + common.Annotation( + id="tdc-aka", text=repr(["Area", "Country", "Country code", "Region"]) + ) + ], + ) + cs.setdefault( + id="SERVICE", + annotations=[common.Annotation(id="tdc-aka", text=repr(["FREIGHT_PASSENGER"]))], + ) + cs.setdefault( + id="TIME_PERIOD", + annotations=[common.Annotation(id="tdc-aka", text=repr(["Time", "Year"]))], + ) + + return cs + + +def get_msd() -> "v21.MetadataStructureDefinition": from transport_data import STORE from . 
import get_agencyscheme - as_ = get_agencyscheme() - cs = ConceptScheme(id="METADATA_CONCEPTS", maintainer=as_["TDCI"]) - msd = MetadataStructureDefinition(id="SIMPLE", version="1", maintainer=as_["TDCI"]) - rs = msd.report_structure["ALL"] = ReportStructure(id="ALL") + TDCI = get_agencyscheme()["TDCI"] + + cs = common.ConceptScheme(id="METADATA_CONCEPTS", maintainer=TDCI) + msd = v21.MetadataStructureDefinition(id="SIMPLE", version="1", maintainer=TDCI) + rs = msd.report_structure["ALL"] = v21.ReportStructure(id="ALL") for id_, (name, description) in CONCEPTS.items(): ci = cs.setdefault(id=id_, name=name, description=description) @@ -202,6 +241,35 @@ def get_msd() -> "MetadataStructureDefinition": return msd +def getdefault(is_: "common.ItemScheme", other: "common.Item") -> "common.Item": + """Return an item from `is_` matching `other`. + + Several methods are attempted to match `other` with an existing item: + + 1. ID of `other` is identical to that of an existing item. + 2. Transformed ID of `other`—in upper case, " " replaced with "_" is identical to + that of an existing item. + 3. ID of `other` is in the annotation ``tdc-aka`` + + """ + # Exact match on ID or transformed ID + for candidate in (other.id, other.id.upper().replace(" ", "_")): + try: + return is_[candidate] + except KeyError: + pass + + # Iterate over existing items + for item in is_: + # Eval the annotation "tdc-aka" for a list of alternate IDs for the item + if aka := item.eval_annotation(id="tdc-aka"): + if other.id in aka: + return item + + # Still no match; create the item + return is_.setdefault(id=other.id) + + def make_workbook(name="sample.xlsx") -> None: """Generate a :class:`openpyxl.Workbook` for exchange of metadata.""" from openpyxl import Workbook @@ -236,7 +304,42 @@ def make_workbook(name="sample.xlsx") -> None: wb.save(name) -def read_workbook(path: "pathlib.Path") -> "v21.MetadataSet": +def parse_dimension(value: str) -> List[v21.Concept]: + """Parse the description of a dimension from `value`. + + Supported values include: + + 1. Multiple lines, with each line beginning "- ". + 2. A single line, with dimensions separated by ", ". + 3. A single dimension ID. + """ + # Partial regular expressions for a dimension + entry = r"(?P.+?)(?: \((?P[^\)]*)\))?" 
+ + # Split `value` into potentially multiple values; separate dimension IDs from + # description/annotation + parts = [] + if matches := re.findall(f"^- {entry}$", value, flags=re.MULTILINE): + # Multiple lines, with each line beginning "- " + parts.extend(matches) + elif matches := re.findall(f"{entry}(?:, |$)", value): + # Single line, with dimensions separated by ", " + # TODO Check behaviour if the ", " is within parentheses + parts.extend(matches) + elif 0 == len(parts): + # None of the above → a single dimension label + parts.append(value) + + # Convert to a list of Concept objects + return [ + v21.Concept(id=id_, name=id_, description=description) + for id_, description in parts + ] + + +def read_workbook( + path: "pathlib.Path", +) -> tuple["v21.MetadataSet", "v21.ConceptScheme"]: """Read a metadata set from the workbook at `path`.""" from openpyxl import load_workbook @@ -246,19 +349,26 @@ def read_workbook(path: "pathlib.Path") -> "v21.MetadataSet": mds = v21.MetadataSet(structured_by=msd) + # Create a shared concept scheme for the concepts referenced by dimensions + # TODO Collect, maybe with get_msd() + cs_dims = get_cs_common() + for ws in wb.worksheets: # Skip information sheets generated by methods in this file if ws.title in ("README", "Attributes", "TEMPLATE"): continue - mds.report.append(read_worksheet(ws, msd)) + if r := read_worksheet(ws, msd, cs_dims): + mds.report.append(r) - return mds + return mds, cs_dims def read_worksheet( - ws: "Worksheet", msd: "MetadataStructureDefinition" -) -> "v21.MetadataReport": + ws: "Worksheet", + msd: "v21.MetadataStructureDefinition", + cs_dims: "v21.ConceptScheme", +) -> Optional["v21.MetadataReport"]: """Read a metadata report from the worksheet `ws`. Parameters @@ -276,6 +386,8 @@ def read_worksheet( # TODO Expand this DFD and its associated data structure definition df_id_from_title = ws.title dfd = v21.DataflowDefinition(id=ws.title, maintainer=msd.maintainer) + dsd = v21.DataStructureDefinition(id=ws.title, maintainer=msd.maintainer) + dfd.structure = dsd # Create objects to associate the metadata report with the data flow definition iot = v21.IdentifiableObjectTarget() @@ -287,14 +399,22 @@ def read_worksheet( mdr = v21.MetadataReport() mdr.attaches_to = tok - # Iterate over rows in the worksheet - mda = None - for row in ws.iter_rows(): - # Column B: value in the row - ra_value = row[1].value + mda = None # Reference to the MetaDataAttribute describing the current row + dimension_concepts = [] - if ra_value is None: - continue + # Iterate over rows in the worksheet, skipping the first + for row in ws.iter_rows(min_row=2): + try: + # Column B: value in the row + ra_value = row[1].value + + if ra_value is None: + continue + except IndexError: + log.warning( + f"Sheet {df_id_from_title!r} has only < 2 columns in the first row; skip" + ) + return None # Column A: name of the metadata attribute mda_name = row[0].value @@ -305,18 +425,27 @@ def read_worksheet( # TODO Protect against other malformed data. 
mda = mda_for_name.get(str(mda_name), mda) - # Store as OtherNonEnumeratedAttributeValue - # TODO Use EnumeratedAttributeValue, once code lists are available corresponding - # to dimensions - ra = v21.OtherNonEnumeratedAttributeValue(value=str(ra_value), value_for=mda) + if mda and mda.id == "DIMENSION": + # Parse 1 or more dimension(s) and add to the DSD + dimension_concepts.extend(parse_dimension(str(ra_value))) + else: + # Store as OtherNonEnumeratedAttributeValue + # TODO Use EnumeratedAttributeValue, once code lists are available + # corresponding to dimensions + ra = v21.OtherNonEnumeratedAttributeValue( + value=str(ra_value), value_for=mda + ) - # Attend the reported attribute to the report - mdr.metadata.append(ra) + # Attend the reported attribute to the report + mdr.metadata.append(ra) # Basic checks df_id_from_cell = _get(mdr, "DATAFLOW") if not df_id_from_cell: log.warning(f"Sheet {df_id_from_title!r} does not identify a data flow; skip") + return None + + update_dimension_descriptor(dsd, cs_dims, *dimension_concepts) return mdr @@ -331,28 +460,85 @@ def _get(mdr: "v21.MetadataReport", mda_id: str) -> Optional[str]: return None +def summarize_metadataattribute(mds: "v21.MetadataSet", mda_id: str) -> None: + """Summarize unique values appear in metadata for attribute `mda_id`.""" + value_id = defaultdict(set) + + for r in mds.report: + value_id[_get(r, mda_id) or "MISSING"].add(_get(r, "DATAFLOW") or "MISSING") + + assert mds.structured_by + mda = mds.structured_by.report_structure["ALL"].get(mda_id) + + print("\n\n" + uline(f"{mda}: {len(value_id)} unique values")) + for value, df_ids in sorted(value_id.items()): + print(f"{value}\n " + " ".join(sorted(df_ids))) + + +def summarize_metadatareport(mdr: "v21.MetadataReport") -> None: + lines = ["", uline("Metadata report")] + + # Retrieve references to the data flow and data structure + dfd: v21.DataflowDefinition = mdr.attaches_to.key_values["DATAFLOW"].obj # type: ignore [union-attr] + dsd = dfd.structure + + # Summarize the data flow and data structure + + lines.extend( + [f"Refers to {dfd!r}", f" with structure {dsd!r}", " with dimensions:"] + ) + for dim in dsd.dimensions: + line = f" - {dim.id}:" + if desc := str(dim.get_annotation(id="tdc-description").text): + line += f" {desc!s}" + else: + line += " —" + try: + original_id = dim.get_annotation(id="tdc-original-id").text + line += f" ('{original_id!s}' in input file)" + except KeyError: + pass + lines.append(line) + + lines.append("") + + for ra in mdr.metadata: + if ra.value_for.id == "DATAFLOW": + continue + assert hasattr(ra, "value") + lines.append(f"{ra.value_for}: {ra.value}") + + print("\n".join(lines)) + + def summarize_metadataset(mds: "v21.MetadataSet") -> None: """Print a summary of the contents of `mds`.""" print(f"Metadata set containing {len(mds.report)} metadata reports") - def uline(text: str, char: str = "=") -> str: - """Underline `text`.""" - return f"{text}\n{char * len(text)}" + summarize_metadataattribute(mds, "MEASURE") + summarize_metadataattribute(mds, "DATA_PROVIDER") + summarize_metadataattribute(mds, "UNIT_MEASURE") + + for r in mds.report: + summarize_metadatareport(r) + - def summarize_metadataattribute(mda_id: str) -> None: - """Summarize unique values appear in metadata for attribute `mda_id`.""" - value_id = defaultdict(set) +def uline(text: str, char: str = "=") -> str: + """Underline `text`.""" + return f"{text}\n{char * len(text)}" - for r in mds.report: - value_id[_get(r, mda_id) or "MISSING"].add(_get(r, "DATAFLOW") or "MISSING") - 
assert mds.structured_by - mda = mds.structured_by.report_structure["ALL"].get(mda_id) +def update_dimension_descriptor( + dsd: "v21.DataStructureDefinition", cs_dims: "v21.ConceptScheme", *concepts +) -> None: + """Update the DimensionDescriptor of `dsd` with `concepts`.""" + for dc in concepts: + # Identify the concept in `cs_dims` with the same ID + c = getdefault(cs_dims, dc) - print("\n\n" + uline(f"{mda}: {len(value_id)} unique values")) - for value, df_ids in sorted(value_id.items()): - print(f"{value}\n " + " ".join(sorted(df_ids))) + # Construct annotations + anno = [common.Annotation(id="tdc-description", text=dc.description)] + if c.id != dc.id: + anno.append(common.Annotation(id="tdc-original-id", text=dc.id)) - summarize_metadataattribute("MEASURE") - summarize_metadataattribute("DATA_PROVIDER") - summarize_metadataattribute("UNIT_MEASURE") + dsd.dimensions.getdefault(id=c.id, concept_identity=c, annotations=anno) From c74b88a7aa537e48b73b64c1310209b7e71c1390 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 30 Sep 2024 10:50:34 +0200 Subject: [PATCH 17/34] Add .metadata.{contains_data_for,groupby}() --- transport_data/org/metadata.py | 50 ++++++++++++++++++++++- transport_data/tests/org/test_metadata.py | 47 ++++++++++++++++++--- 2 files changed, 90 insertions(+), 7 deletions(-) diff --git a/transport_data/org/metadata.py b/transport_data/org/metadata.py index d1918f1..1d0d1ce 100644 --- a/transport_data/org/metadata.py +++ b/transport_data/org/metadata.py @@ -1,9 +1,11 @@ +import itertools import logging import re from collections import defaultdict from functools import lru_cache -from typing import TYPE_CHECKING, List, Optional, Tuple +from typing import TYPE_CHECKING, Callable, Hashable, List, Optional, Tuple +from pycountry import countries from sdmx.model import common, v21 if TYPE_CHECKING: @@ -180,6 +182,39 @@ def add_template(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): ws.cell(row=row, column=2, value="---") +def contains_data_for(mdr: "v21.MetadataReport", ref_area: str) -> bool: + """Return :any:`True` if `mdr` contains data for `ref_area`. + + :any:`True` is returned if any of the following: + + 1. The referenced data flow definition has an ID that starts with `ref_area`. + 2. The country's ISO 3166 alpha-2 code, alpha-3 code, official name, or common name + appears in the value of the ``DATA_DESCR`` metadata attribute. + + + Parameters + ---------- + ref_area : str + ISO 3166 alpha-2 code for a country. Passed to + :meth:`pycountry.countries.lookup`. + """ + country = countries.lookup(ref_area) + + if mdr.attaches_to.key_values["DATAFLOW"].obj.id.startswith(ref_area): # type: ignore [union-attr] + return True + + # Pattern to match in DATA_DESCR + pat = re.compile( + f"({country.alpha_2}|{country.alpha_3}|{country.name}|{country.common_name})" + ) + for ra in mdr.metadata: + assert hasattr(ra, "value") + if ra.value_for.id == "DATA_DESCR" and pat.search(ra.value): + return True + + return False + + @lru_cache def get_cs_common() -> "common.ConceptScheme": """Create a shared concept scheme for the concepts referenced by dimensions. @@ -270,6 +305,19 @@ def getdefault(is_: "common.ItemScheme", other: "common.Item") -> "common.Item": return is_.setdefault(id=other.id) +def groupby( + mds: "v21.MetadataSet", key=Callable[["v21.MetadataReport"], Hashable] +) -> dict[Hashable, list["v21.MetadataReport"]]: + """Group metadata reports in `mds` according to a `key` function. + + Similar to :func:`itertools.groupby`. 
+ """ + result: dict[Hashable, list["v21.MetadataReport"]] = defaultdict(list) + for k, g in itertools.groupby(mds.report, key): + result[k].extend(g) + return result + + def make_workbook(name="sample.xlsx") -> None: """Generate a :class:`openpyxl.Workbook` for exchange of metadata.""" from openpyxl import Workbook diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index 289dd90..9863180 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -1,4 +1,10 @@ +from functools import partial + +import pytest + from transport_data.org.metadata import ( + contains_data_for, + groupby, make_workbook, read_workbook, summarize_metadataset, @@ -9,16 +15,21 @@ def test_make_workbook(tmp_path) -> None: make_workbook() -def test_read_workbook(test_data_path) -> None: +@pytest.fixture(scope="module") +def example_metadata(test_data_path): + return read_workbook(test_data_path.joinpath("metadata-input.xlsx")) + + +def test_read_workbook(example_metadata) -> None: # Function runs successfully - result = read_workbook(test_data_path.joinpath("metadata-input.xlsx")) + result, _ = example_metadata # Result has a certain number of metadata reports - assert 47 == len(result.report) + assert 45 == len(result.report) -def test_summarize_metadataset(capsys, test_data_path) -> None: - mds = read_workbook(test_data_path.joinpath("metadata-input.xlsx")) +def test_summarize_metadataset(capsys, example_metadata) -> None: + mds, cs_dims = example_metadata # Function runs successfully summarize_metadataset(mds) @@ -27,6 +38,30 @@ def test_summarize_metadataset(capsys, test_data_path) -> None: # pathlib.Path("debug.txt").write_text(captured.out) # DEBUG Write to a file # Output contains certain text - assert "MEASURE: 40 unique values" in captured.out + assert "MEASURE: 39 unique values" in captured.out # TODO expand with further assertions + + +@pytest.mark.parametrize( + "ref_area, N_exp", + [ + ("CN", 19), + ("ID", 17), + ("IN", 19), + ("PH", 14), + ("NP", 0), + ("TH", 17), + ("VN", 18), + ], +) +def test_groupby(example_metadata, ref_area, N_exp: int) -> None: + predicate = partial(contains_data_for, ref_area=ref_area) + result = groupby(example_metadata[0], predicate) + + # Expected counts of metadata reports with respective values + # NB Use set notation to tolerate missing keys in `result` if N_exp == 0 + exp = {(True, N_exp), (False, 45 - N_exp)} + + # Observed counts match + assert exp >= {(k, len(v)) for k, v in result.items()} From 1fcee2469ef19a2042ee7f0726da0d38a31a0339 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 30 Sep 2024 13:44:38 +0200 Subject: [PATCH 18/34] Generate country summaries in HTML --- pyproject.toml | 1 + .../data/org/template-metadata.html | 53 +++++++++++++++++++ transport_data/org/metadata.py | 47 +++++++++++++++- transport_data/tests/org/test_metadata.py | 35 +++++++----- 4 files changed, 123 insertions(+), 13 deletions(-) create mode 100644 transport_data/data/org/template-metadata.html diff --git a/pyproject.toml b/pyproject.toml index 9194edf..cdd9d9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ readme = "README.rst" requires-python = ">=3.8" dependencies = [ "click", + "Jinja2", "openpyxl", "packaging", "pandas", diff --git a/transport_data/data/org/template-metadata.html b/transport_data/data/org/template-metadata.html new file mode 100644 index 0000000..7a05d39 --- /dev/null +++ b/transport_data/data/org/template-metadata.html @@ -0,0 +1,53 @@ 
+{% macro summarize_metadatareport(report, heading="h2") %} +{% set dataflow = report.attaches_to.key_values["DATAFLOW"].obj %} +<{{ heading }} id="{{ report | dfd_id }}">Data flow '{{ report | dfd_id }}' ^ +{{ summarize_dataflow(dataflow) }} +{% for metadata_concept in ["DATA_PROVIDER", "URL", "MEASURE", "UNIT_MEASURE", "DATA_DESCR", "COMMENT"] %} +{% set ra_value, mda = get_reported_attribute(report, metadata_concept) %} +{% if not mda %}{% continue %}{% endif %} +

{{ mda.concept_identity.name }}:
+{% if metadata_concept == "DATA_PROVIDER" %}
+{{ ra_value }}
+{% elif metadata_concept == "URL" %}
+{{ ra_value }}
+{% elif metadata_concept in ("DATA_DESCR", "COMMENT") %}
+
{{ ra_value.replace('\n', '<br/>') | safe }}
+{% else %}
+{{ ra_value }}
+{% endif %}
+

+{% endfor %} +{% endmacro %} + +{% macro summarize_dataflow(dfd) %} +…with dimensions: +
    + {% for dim in dfd.structure.dimensions %} +
  1. {{ dim.id }}: {{ dim | format_desc }}
  2. + {% endfor %} +
+{% endmacro %} + + + + +

Direct links:
+ {{ matched | length }} data flows containing data on {{ ref_area }}: + {% for mdr in matched %}
{{ mdr | dfd_id }}{{ ", " if not loop.last }}{% endfor %} +
+ {{ no_match | length }} other data flows: + {% for mdr in no_match %}{{ mdr | dfd_id }}{{ ", " if not loop.last }}{% endfor %} +

+

Data flows containing data on {{ ref_area }}

+

These data flows are explicitly marked as containing data pertaining to the country.

+ {% for mdr in matched %} + {{ summarize_metadatareport(mdr) }} + {% endfor %} +

Other data flows

+

These data flows are not explicitly identified as containing data on the country. + This doesn't completely rule out that they may contain such data, but this is less likely and would require further investigation and inspection.

+ {% for mdr in no_match %} + {{ summarize_metadatareport(mdr) }} + {% endfor %} + + diff --git a/transport_data/org/metadata.py b/transport_data/org/metadata.py index 1d0d1ce..13b8937 100644 --- a/transport_data/org/metadata.py +++ b/transport_data/org/metadata.py @@ -2,7 +2,7 @@ import logging import re from collections import defaultdict -from functools import lru_cache +from functools import lru_cache, partial from typing import TYPE_CHECKING, Callable, Hashable, List, Optional, Tuple from pycountry import countries @@ -215,6 +215,51 @@ def contains_data_for(mdr: "v21.MetadataReport", ref_area: str) -> bool: return False +def generate_summary_html( + mds: "v21.MetadataSet", ref_area: str, path: "pathlib.Path" +) -> None: + """Generate a summary report in HTML.""" + from jinja2 import Environment, PackageLoader, select_autoescape + + # Create a Jinja environment + env = Environment( + loader=PackageLoader("transport_data", package_path="data/org"), + extensions=["jinja2.ext.loopcontrols"], + autoescape=select_autoescape(), + trim_blocks=True, + lstrip_blocks=True, + ) + + grouped = groupby(mds, key=partial(contains_data_for, ref_area=ref_area)) + + def _dfd_id(mdr): + return mdr.attaches_to.key_values["DATAFLOW"].obj.id + + def _get_reported_attribute(mdr, id_): + for ra in mdr.metadata: + if ra.value_for.id == id_: + return ra.value, ra.value_for + return "—", None + + def _format_desc(dim): + if desc := str(dim.get_annotation(id="tdc-description").text): + return desc + else: + return "—" + + env.filters["dfd_id"] = _dfd_id + env.filters["format_desc"] = _format_desc + + path.write_text( + env.get_template("template-metadata.html").render( + ref_area=ref_area, + matched=grouped[True], + no_match=grouped[False], + get_reported_attribute=_get_reported_attribute, + ) + ) + + @lru_cache def get_cs_common() -> "common.ConceptScheme": """Create a shared concept scheme for the concepts referenced by dimensions. 
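[Editorial note, not part of the patch: a minimal sketch of how the summary generation added in this commit could be driven from Python. The workbook filename and the "TH" REF_AREA value are illustrative assumptions; the function signatures follow the code above.]

    from pathlib import Path

    from transport_data.org.metadata import generate_summary_html, read_workbook

    # read_workbook() returns the metadata set and the shared concept scheme
    # assembled for the dimensions found in the workbook
    mds, cs_dims = read_workbook(Path("metadata-input.xlsx"))

    # Write an HTML summary of the (meta)data pertaining to one country
    generate_summary_html(mds, ref_area="TH", path=Path("TH.html"))
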
diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index 9863180..a30fee4 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -4,6 +4,7 @@ from transport_data.org.metadata import ( contains_data_for, + generate_summary_html, groupby, make_workbook, read_workbook, @@ -43,18 +44,18 @@ def test_summarize_metadataset(capsys, example_metadata) -> None: # TODO expand with further assertions -@pytest.mark.parametrize( - "ref_area, N_exp", - [ - ("CN", 19), - ("ID", 17), - ("IN", 19), - ("PH", 14), - ("NP", 0), - ("TH", 17), - ("VN", 18), - ], -) +COUNTRIES = [ + ("CN", 19), + ("ID", 17), + ("IN", 19), + ("PH", 14), + ("NP", 0), + ("TH", 17), + ("VN", 18), +] + + +@pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) def test_groupby(example_metadata, ref_area, N_exp: int) -> None: predicate = partial(contains_data_for, ref_area=ref_area) result = groupby(example_metadata[0], predicate) @@ -65,3 +66,13 @@ def test_groupby(example_metadata, ref_area, N_exp: int) -> None: # Observed counts match assert exp >= {(k, len(v)) for k, v in result.items()} + + +@pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) +def test_generate_summary_html(tmp_path, example_metadata, ref_area, N_exp) -> None: + path = tmp_path.joinpath(f"{ref_area}.html") + + generate_summary_html(example_metadata[0], ref_area=ref_area, path=path) + + # Output was created + assert path.exists() From 18f12542829e05fa4fafbe9eae1624765f0da563 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 30 Sep 2024 14:09:21 +0200 Subject: [PATCH 19/34] Generate a summary table for multiple countries --- ...metadata.html => template-metadata-0.html} | 0 .../data/org/template-metadata-1.html | 24 ++++++ transport_data/org/metadata.py | 86 ++++++++++++------- transport_data/tests/org/test_metadata.py | 18 +++- 4 files changed, 95 insertions(+), 33 deletions(-) rename transport_data/data/org/{template-metadata.html => template-metadata-0.html} (100%) create mode 100644 transport_data/data/org/template-metadata-1.html diff --git a/transport_data/data/org/template-metadata.html b/transport_data/data/org/template-metadata-0.html similarity index 100% rename from transport_data/data/org/template-metadata.html rename to transport_data/data/org/template-metadata-0.html diff --git a/transport_data/data/org/template-metadata-1.html b/transport_data/data/org/template-metadata-1.html new file mode 100644 index 0000000..31ef4cf --- /dev/null +++ b/transport_data/data/org/template-metadata-1.html @@ -0,0 +1,24 @@ + + + + + + + {% for ra in ref_area | sort %} + + {% endfor %} + + + + {% for dfd_id, row_data in data | items | sort %} + + + {% for ra in ref_area | sort %} + + {% endfor %} + + {% endfor %} + +
Dataflow ID{{ ra }}
{{ dfd_id }}{{ "✔" if row_data[ra] else "" }}
+ + diff --git a/transport_data/org/metadata.py b/transport_data/org/metadata.py index 13b8937..302a805 100644 --- a/transport_data/org/metadata.py +++ b/transport_data/org/metadata.py @@ -215,47 +215,36 @@ def contains_data_for(mdr: "v21.MetadataReport", ref_area: str) -> bool: return False -def generate_summary_html( +def generate_summary_html0( mds: "v21.MetadataSet", ref_area: str, path: "pathlib.Path" ) -> None: """Generate a summary report in HTML.""" - from jinja2 import Environment, PackageLoader, select_autoescape - - # Create a Jinja environment - env = Environment( - loader=PackageLoader("transport_data", package_path="data/org"), - extensions=["jinja2.ext.loopcontrols"], - autoescape=select_autoescape(), - trim_blocks=True, - lstrip_blocks=True, - ) grouped = groupby(mds, key=partial(contains_data_for, ref_area=ref_area)) - def _dfd_id(mdr): - return mdr.attaches_to.key_values["DATAFLOW"].obj.id + env, common = get_jinja_env() - def _get_reported_attribute(mdr, id_): - for ra in mdr.metadata: - if ra.value_for.id == id_: - return ra.value, ra.value_for - return "—", None + path.write_text( + env.get_template("template-metadata-0.html").render( + ref_area=ref_area, matched=grouped[True], no_match=grouped[False], **common + ) + ) - def _format_desc(dim): - if desc := str(dim.get_annotation(id="tdc-description").text): - return desc - else: - return "—" - env.filters["dfd_id"] = _dfd_id - env.filters["format_desc"] = _format_desc +def generate_summary_html1( + mds: "v21.MetadataSet", ref_area: list[str], path: "pathlib.Path" +) -> None: + data = { + mdr.attaches_to.key_values["DATAFLOW"].obj.id: { # type: ignore [union-attr] + ra: contains_data_for(mdr, ra) for ra in ref_area + } + for mdr in mds.report + } + env, common = get_jinja_env() path.write_text( - env.get_template("template-metadata.html").render( - ref_area=ref_area, - matched=grouped[True], - no_match=grouped[False], - get_reported_attribute=_get_reported_attribute, + env.get_template("template-metadata-1.html").render( + ref_area=ref_area, data=data, **common ) ) @@ -300,6 +289,43 @@ def get_cs_common() -> "common.ConceptScheme": return cs +@lru_cache +def get_jinja_env(): + """Return a Jinja2 environment for rendering templates.""" + from jinja2 import Environment, PackageLoader, select_autoescape + + # Create a Jinja environment + env = Environment( + loader=PackageLoader("transport_data", package_path="data/org"), + extensions=["jinja2.ext.loopcontrols"], + autoescape=select_autoescape(), + trim_blocks=True, + lstrip_blocks=True, + ) + + def _dfd_id(mdr): + return mdr.attaches_to.key_values["DATAFLOW"].obj.id + + def _get_reported_attribute(mdr, id_): + for ra in mdr.metadata: + if ra.value_for.id == id_: + return ra.value, ra.value_for + return "—", None + + def _format_desc(dim): + if desc := str(dim.get_annotation(id="tdc-description").text): + return desc + else: + return "—" + + env.filters["dfd_id"] = _dfd_id + env.filters["format_desc"] = _format_desc + + return env, dict( + get_reported_attribute=_get_reported_attribute, + ) + + def get_msd() -> "v21.MetadataStructureDefinition": from transport_data import STORE diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index a30fee4..e195354 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -4,7 +4,8 @@ from transport_data.org.metadata import ( contains_data_for, - generate_summary_html, + generate_summary_html0, + generate_summary_html1, groupby, make_workbook, 
read_workbook, @@ -69,10 +70,21 @@ def test_groupby(example_metadata, ref_area, N_exp: int) -> None: @pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) -def test_generate_summary_html(tmp_path, example_metadata, ref_area, N_exp) -> None: +def test_generate_summary_html0(tmp_path, example_metadata, ref_area, N_exp) -> None: path = tmp_path.joinpath(f"{ref_area}.html") - generate_summary_html(example_metadata[0], ref_area=ref_area, path=path) + generate_summary_html0(example_metadata[0], ref_area=ref_area, path=path) + + # Output was created + assert path.exists() + + +def test_generate_summary_html1(tmp_path, example_metadata) -> None: + path = tmp_path.joinpath("all.html") + + generate_summary_html1( + example_metadata[0], ref_area=list(item[0] for item in COUNTRIES), path=path + ) # Output was created assert path.exists() From 14becdca6c2303242a9cc462f668e6fada3952d3 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 4 Oct 2024 18:34:36 +0200 Subject: [PATCH 20/34] Improve report generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add .report.Report ABC for reports on data and metadata. - Add .util.docutils to customize behaviour of docutils ODT writing. - Add .util.jinja2 to collect handling of Jinja2 templates. - Move templates to data/template/… - Add metadata-2.rst template for ReST → ODT reporting. - Add odt-styles.xml to customize appearance of generated ODT. - Define TDC metadata reports in .metadata.report. - Update CLI, tests. - Add docutils to dependencies. - Add docutils, lxml stubs for mypy via pre-commit. --- .pre-commit-config.yaml | 2 + pyproject.toml | 1 + .../metadata-0.html} | 0 .../metadata-1.html} | 0 transport_data/data/template/metadata-2.rst | 83 ++ transport_data/data/template/odt-styles.xml | 1163 +++++++++++++++++ transport_data/org/cli.py | 41 +- .../org/{metadata.py => metadata/__init__.py} | 87 +- transport_data/org/metadata/report.py | 84 ++ transport_data/report.py | 43 + transport_data/tests/org/test_metadata.py | 42 +- transport_data/util/__init__.py | 5 + transport_data/util/docutils.py | 130 ++ transport_data/util/jinja2.py | 42 + 14 files changed, 1626 insertions(+), 97 deletions(-) rename transport_data/data/{org/template-metadata-0.html => template/metadata-0.html} (100%) rename transport_data/data/{org/template-metadata-1.html => template/metadata-1.html} (100%) create mode 100644 transport_data/data/template/metadata-2.rst create mode 100644 transport_data/data/template/odt-styles.xml rename transport_data/org/{metadata.py => metadata/__init__.py} (89%) create mode 100644 transport_data/org/metadata/report.py create mode 100644 transport_data/report.py create mode 100644 transport_data/util/docutils.py create mode 100644 transport_data/util/jinja2.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8f10388..327a865 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,11 +5,13 @@ repos: - id: mypy additional_dependencies: - click + - lxml-stubs - pandas-stubs - platformdirs - pytest - sdmx1 - Sphinx + - types-docutils - types-openpyxl - types-requests args: [] diff --git a/pyproject.toml b/pyproject.toml index cdd9d9b..8eed655 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ readme = "README.rst" requires-python = ">=3.8" dependencies = [ "click", + "docutils", "Jinja2", "openpyxl", "packaging", diff --git a/transport_data/data/org/template-metadata-0.html b/transport_data/data/template/metadata-0.html similarity index 100% 
rename from transport_data/data/org/template-metadata-0.html rename to transport_data/data/template/metadata-0.html diff --git a/transport_data/data/org/template-metadata-1.html b/transport_data/data/template/metadata-1.html similarity index 100% rename from transport_data/data/org/template-metadata-1.html rename to transport_data/data/template/metadata-1.html diff --git a/transport_data/data/template/metadata-2.rst b/transport_data/data/template/metadata-2.rst new file mode 100644 index 0000000..9ed6027 --- /dev/null +++ b/transport_data/data/template/metadata-2.rst @@ -0,0 +1,83 @@ +{% macro summarize_metadatareport(report, heading="-") %} +{% set dataflow = report.attaches_to.key_values["DATAFLOW"].obj %} +.. _{{ report | dfd_id }}: + +{% filter rst_heading(heading) %}Data flow ``'{{ report | dfd_id }}'`` `^ `_{% endfilter %} + +{% for metadata_concept in ["DATA_PROVIDER", "URL", "MEASURE", "UNIT_MEASURE", "DATA_DESCR", "COMMENT"] %} +{% set ra_value, mda = get_reported_attribute(report, metadata_concept) %} +{% if not mda %}{% continue %}{% endif %} + +*{{ mda.concept_identity.name }}*: {% if metadata_concept == "DATA_PROVIDER" %} +**{{ ra_value }}** +{% elif metadata_concept == "URL" %} +`(link) <{{ ra_value }}>`__ +{% elif metadata_concept == "DATA_DESCR" %} + + +{{ ra_value }} +{% else %} +{{ ra_value if ra_value else "*(empty)*" }} +{% endif %} +{% endfor %} + +{{ summarize_dataflow(dataflow) }} +{% endmacro %} + +{% macro summarize_dataflow(dfd) %} +…with dimensions: + +{% for dim in dfd.structure.dimensions %} +{{ loop.index }}. **{{ dim.id }}**: {{ dim | format_desc }} +{% endfor %} +{% endmacro %} + +{% filter rst_heading("*") %}{{ ref_area }} metadata and data{% endfilter %} + + +This file contains a summary of metadata and data for {{ ref_area }} collected through the project from the consultant team, GIZ Country Focal Points (CFPs), etc. + +- It is automatically generated using the tools developed at transport-data/tools#21 on GitHub, using a command similar to: + + .. code-block:: + + tdc org summarize --ref-area={{ ref_area }} “Metadata 2024-09-27.xlsx” + + …currently using, as input, the Teams file “Metadata file – prototype 1.xlsx” as of 2024-09-27. + +- Use “File > Version History” in Microsoft Office to see updates. +- For questions or information please contact Paul Kishimoto via Teams and/or see the HOWTO. +- The entire file may be overwritten periodically. + **Do not** make edits in this file if you need them to be preserved; instead, make a copy and edit there. + +.. _top: + +**Direct links:** + +**{{ matched | length }}** data flows containing data on {{ ref_area }}: +{% for mdr in matched %}`{{ mdr | dfd_id }}`_{{ ", " if not loop.last }}{% endfor %} + + +**{{ no_match | length }}** other data flows: +{% for mdr in no_match %}`{{ mdr | dfd_id }}`_{{ ", " if not loop.last }}{% endfor %} + + +{% filter rst_heading("=") %}Data flows containing data on {{ ref_area }}{% endfilter %} + + +These data flows are explicitly marked as containing data pertaining to the country. + +{% for mdr in matched %} +{{ summarize_metadatareport(mdr) }} +{% endfor %} + + +Other data flows +================ + +These data flows are not *explicitly* identified as containing data on the country. +This doesn't completely rule out that they *may* contain such data, but this is less likely and would require further investigation and inspection. 
+ +{% for mdr in no_match %} +{{ summarize_metadatareport(mdr) }} +{% endfor %} diff --git a/transport_data/data/template/odt-styles.xml b/transport_data/data/template/odt-styles.xml new file mode 100644 index 0000000..274ee23 --- /dev/null +++ b/transport_data/data/template/odt-styles.xml @@ -0,0 +1,1163 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Page + + + + + + Page + + + + + diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index 51bcb2c..3502571 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -7,6 +7,7 @@ """ import pathlib +from typing import Optional import click @@ -36,10 +37,48 @@ def read(path: "pathlib.Path"): """Read and summarize metadata.""" from .metadata import read_workbook, summarize_metadataset - mds = read_workbook(path.resolve()) + mds, _ = read_workbook(path.resolve()) summarize_metadataset(mds) +@main.command +@click.option("--ref-area", required=True) +@click.argument( + "path_in", type=click.Path(exists=True, dir_okay=False, path_type=pathlib.Path) +) +@click.argument( + "path_out", type=click.Path(dir_okay=False, path_type=pathlib.Path), required=False +) +def summarize(path_in: "pathlib.Path", path_out: Optional["pathlib.Path"], ref_area): + """Generate HTML metadata summary. 
+ + If a single value is given for --ref-area (e.g. --ref-area=CA), a summary is + generated of the (meta)data pertaining to that country/area. If multiple values are + given (e.g. --ref-area=AF,ZW), a summary table is generated. + """ + from .metadata import read_workbook + from .metadata.report import SummaryHTML0, SummaryHTML1, SummaryODT + + mds, _ = read_workbook(path_in.resolve()) + + ref_areas = ref_area.split(",") + if 1 == len(ref_areas): + # Report for a single REF_AREA + if path_out is None: + path_out = pathlib.Path.cwd().joinpath(f"{ref_areas[0]}.{{html,odt}}") + print(f"Write to {path_out}") + SummaryHTML0(mds, ref_area=ref_areas[0]).write_file( + path_out.with_suffix(".html") + ) + SummaryODT(mds, ref_area=ref_areas[0]).write_file(path_out.with_suffix(".odt")) + elif 1 < len(ref_areas): + # Report for multiple REF_AREA + if path_out is None: + path_out = pathlib.Path.cwd().joinpath("all.html") + print(f"Write to {path_out}") + SummaryHTML1(mds, ref_area=ref_areas).write_file(path_out) + + @main.command("template") def template(): """Generate the metadata template.""" diff --git a/transport_data/org/metadata.py b/transport_data/org/metadata/__init__.py similarity index 89% rename from transport_data/org/metadata.py rename to transport_data/org/metadata/__init__.py index 302a805..0274997 100644 --- a/transport_data/org/metadata.py +++ b/transport_data/org/metadata/__init__.py @@ -2,12 +2,14 @@ import logging import re from collections import defaultdict -from functools import lru_cache, partial +from functools import lru_cache from typing import TYPE_CHECKING, Callable, Hashable, List, Optional, Tuple from pycountry import countries from sdmx.model import common, v21 +from transport_data.util import uline + if TYPE_CHECKING: import pathlib @@ -215,40 +217,6 @@ def contains_data_for(mdr: "v21.MetadataReport", ref_area: str) -> bool: return False -def generate_summary_html0( - mds: "v21.MetadataSet", ref_area: str, path: "pathlib.Path" -) -> None: - """Generate a summary report in HTML.""" - - grouped = groupby(mds, key=partial(contains_data_for, ref_area=ref_area)) - - env, common = get_jinja_env() - - path.write_text( - env.get_template("template-metadata-0.html").render( - ref_area=ref_area, matched=grouped[True], no_match=grouped[False], **common - ) - ) - - -def generate_summary_html1( - mds: "v21.MetadataSet", ref_area: list[str], path: "pathlib.Path" -) -> None: - data = { - mdr.attaches_to.key_values["DATAFLOW"].obj.id: { # type: ignore [union-attr] - ra: contains_data_for(mdr, ra) for ra in ref_area - } - for mdr in mds.report - } - - env, common = get_jinja_env() - path.write_text( - env.get_template("template-metadata-1.html").render( - ref_area=ref_area, data=data, **common - ) - ) - - @lru_cache def get_cs_common() -> "common.ConceptScheme": """Create a shared concept scheme for the concepts referenced by dimensions. @@ -256,7 +224,7 @@ def get_cs_common() -> "common.ConceptScheme": Concepts in this scheme have an annotation ``tdc-aka``, which is a list of alternate IDs recognized for the concept. """ - from . 
import get_agencyscheme + from transport_data.org import get_agencyscheme as_ = get_agencyscheme() cs = common.ConceptScheme(id="CONCEPTS", maintainer=as_["TDCI"]) @@ -289,47 +257,9 @@ def get_cs_common() -> "common.ConceptScheme": return cs -@lru_cache -def get_jinja_env(): - """Return a Jinja2 environment for rendering templates.""" - from jinja2 import Environment, PackageLoader, select_autoescape - - # Create a Jinja environment - env = Environment( - loader=PackageLoader("transport_data", package_path="data/org"), - extensions=["jinja2.ext.loopcontrols"], - autoescape=select_autoescape(), - trim_blocks=True, - lstrip_blocks=True, - ) - - def _dfd_id(mdr): - return mdr.attaches_to.key_values["DATAFLOW"].obj.id - - def _get_reported_attribute(mdr, id_): - for ra in mdr.metadata: - if ra.value_for.id == id_: - return ra.value, ra.value_for - return "—", None - - def _format_desc(dim): - if desc := str(dim.get_annotation(id="tdc-description").text): - return desc - else: - return "—" - - env.filters["dfd_id"] = _dfd_id - env.filters["format_desc"] = _format_desc - - return env, dict( - get_reported_attribute=_get_reported_attribute, - ) - - def get_msd() -> "v21.MetadataStructureDefinition": from transport_data import STORE - - from . import get_agencyscheme + from transport_data.org import get_agencyscheme TDCI = get_agencyscheme()["TDCI"] @@ -611,7 +541,7 @@ def summarize_metadatareport(mdr: "v21.MetadataReport") -> None: if desc := str(dim.get_annotation(id="tdc-description").text): line += f" {desc!s}" else: - line += " —" + line += " (no info)" try: original_id = dim.get_annotation(id="tdc-original-id").text line += f" ('{original_id!s}' in input file)" @@ -642,11 +572,6 @@ def summarize_metadataset(mds: "v21.MetadataSet") -> None: summarize_metadatareport(r) -def uline(text: str, char: str = "=") -> str: - """Underline `text`.""" - return f"{text}\n{char * len(text)}" - - def update_dimension_descriptor( dsd: "v21.DataStructureDefinition", cs_dims: "v21.ConceptScheme", *concepts ) -> None: diff --git a/transport_data/org/metadata/report.py b/transport_data/org/metadata/report.py new file mode 100644 index 0000000..16272dc --- /dev/null +++ b/transport_data/org/metadata/report.py @@ -0,0 +1,84 @@ +from dataclasses import dataclass +from functools import partial +from typing import TYPE_CHECKING + +from transport_data.report import Report + +if TYPE_CHECKING: + from sdmx.model import v21 + + +@dataclass +class SummaryHTML0(Report): + """Generate a summary report in HTML.""" + + template_name = "metadata-0.html" + + #: Metadata set to summarize. + mds: "v21.MetadataSet" + #: Geography. + ref_area: str + + def render(self) -> str: + from transport_data.org.metadata import contains_data_for, groupby + + grouped = groupby( + self.mds, key=partial(contains_data_for, ref_area=self.ref_area) + ) + + return self.render_jinja_template( + ref_area=self.ref_area, matched=grouped[True], no_match=grouped[False] + ) + + +@dataclass +class SummaryHTML1(Report): + """Generate a summary report in HTML.""" + + template_name = "metadata-1.html" + + #: Metadata set to summarize. + mds: "v21.MetadataSet" + #: Geography. 
+ ref_area: list[str] + + def render(self) -> str: + from transport_data.org.metadata import contains_data_for + + data = { + mdr.attaches_to.key_values["DATAFLOW"].obj.id: { # type: ignore [union-attr] + ra: contains_data_for(mdr, ra) for ra in self.ref_area + } + for mdr in self.mds.report + } + + return self.render_jinja_template(ref_area=self.ref_area, data=data) + + +@dataclass +class SummaryODT(Report): + template_name = "metadata-2.rst" + + #: Metadata set to summarize. + mds: "v21.MetadataSet" + #: Geography. + ref_area: str + + def render(self) -> bytes: + from transport_data.org.metadata import contains_data_for, groupby + + # Prepare data; same as SummaryHTML0 + grouped = groupby( + self.mds, key=partial(contains_data_for, ref_area=self.ref_area) + ) + + # Render the report as reStructuredText using Jinja2 and a template + rst_source = self.render_jinja_template( + ref_area=self.ref_area, + matched=grouped[True], + no_match=grouped[False], + ) + # print(rst_source) # DEBUG + + # Convert reStructuredText → OpenDocumentText + return self.rst2odt(rst_source) diff --git a/transport_data/report.py b/transport_data/report.py new file mode 100644 index 0000000..0e09e2a --- /dev/null +++ b/transport_data/report.py @@ -0,0 +1,43 @@ +"""Generate reports based on data and metadata.""" + +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Union + + +class Report(ABC): + """Base class for reports.""" + + template_name: str + + @abstractmethod + def render(self) -> Union[str, bytes]: ... + + def render_jinja_template(self, *args, **kwargs) -> str: + from transport_data.util import jinja2 + + env, common = jinja2.get_env() + + return env.get_template(self.template_name).render(*args, **kwargs, **common) + + def rst2odt(self, content: str) -> bytes: + from docutils.core import publish_string + + from transport_data.util import docutils, jinja2 + + env, _ = jinja2.get_env() + + ss_path = Path(env.get_template("odt-styles.xml").filename) + settings = {"create_links": True, "stylesheet": str(ss_path)} + + # Convert reStructuredText → ODF ODT using docutils + return publish_string( + writer=docutils.ODTWriter(), source=content, settings_overrides=settings + ) + + def write_file(self, path: Path) -> None: + content = self.render() + if isinstance(content, str): + path.write_text(content) + else: + path.write_bytes(content) diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index e195354..398fd1f 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -4,13 +4,12 @@ from transport_data.org.metadata import ( contains_data_for, - generate_summary_html0, - generate_summary_html1, groupby, make_workbook, read_workbook, summarize_metadataset, ) +from transport_data.org.metadata.report import SummaryHTML0, SummaryHTML1, SummaryODT def test_make_workbook(tmp_path) -> None: @@ -69,22 +68,35 @@ def test_groupby(example_metadata, ref_area, N_exp: int) -> None: assert exp >= {(k, len(v)) for k, v in result.items()} -@pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) -def test_generate_summary_html0(tmp_path, example_metadata, ref_area, N_exp) -> None: - path = tmp_path.joinpath(f"{ref_area}.html") +class TestSummaryHTML0: + @pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) + def test_write_file(self, tmp_path, example_metadata, ref_area, N_exp) -> None: + path = tmp_path.joinpath(f"{ref_area}.html") + + SummaryHTML0(example_metadata[0], ref_area=ref_area).write_file(path) + + 
# Output was created + assert path.exists() - generate_summary_html0(example_metadata[0], ref_area=ref_area, path=path) - # Output was created - assert path.exists() +class TestSummaryHTML1: + def test_write_file(self, tmp_path, example_metadata) -> None: + path = tmp_path.joinpath("all.html") + SummaryHTML1( + example_metadata[0], ref_area=list(item[0] for item in COUNTRIES) + ).write_file(path) -def test_generate_summary_html1(tmp_path, example_metadata) -> None: - path = tmp_path.joinpath("all.html") + # Output was created + assert path.exists() + + +@pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) +class TestSummaryODT: + def test_write_file(self, tmp_path, example_metadata, ref_area, N_exp) -> None: + path = tmp_path.joinpath(f"{ref_area}.odt") - generate_summary_html1( - example_metadata[0], ref_area=list(item[0] for item in COUNTRIES), path=path - ) + SummaryODT(example_metadata[0], ref_area=ref_area).write_file(path=path) - # Output was created - assert path.exists() + # Output was created + assert path.exists() diff --git a/transport_data/util/__init__.py b/transport_data/util/__init__.py index dc51750..ad253b2 100644 --- a/transport_data/util/__init__.py +++ b/transport_data/util/__init__.py @@ -1 +1,6 @@ """Utilities.""" + + +def uline(text: str, char: str = "=") -> str: + """Underline `text` with `char`.""" + return f"{text}\n{char * len(text)}" diff --git a/transport_data/util/docutils.py b/transport_data/util/docutils.py new file mode 100644 index 0000000..d233035 --- /dev/null +++ b/transport_data/util/docutils.py @@ -0,0 +1,130 @@ +from pathlib import Path +from zipfile import BadZipFile, ZipFile + +import docutils.writers +import docutils.writers.odf_odt +from docutils.writers.odf_odt import SubElement + + +class ODFTranslator(docutils.writers.odf_odt.ODFTranslator): + """Translator from docutils DOM to OpenDocumentText + + This subclass works around bugs and missing figures in the upstream class. + + - Tolerate a file path with ".xml" for the "stylesheet" setting. + - Fix internal hyperlinks: + + - Write internal hyperlink targets as :xml:`` instead of + :xml:``. + - Write hyperlinks to internal targets as :xml:`` instead of + :xml:``. The latter does not allow to specify the text + content of the reference. In this class, the reference text is as given in the + docutils DOM source. + """ + + # NB Lines excluded with "pragma: no cover" are not used in transport_data + + pending_ids: list[str] + + def retrieve_styles(self, extension: str) -> None: + """Retrieve the stylesheet a file with extension either ".xml" or `extension`. + + Returns nothing. 
+ """ + from lxml import etree + + stylespath = Path(self.settings.stylesheet) + + if stylespath.suffix == ".xml": + with open(stylespath, "rb") as stylesfile: + self.str_stylesheet = stylesfile.read() + elif stylespath.suffix == extension: # pragma: no cover + with ZipFile(stylespath) as zf: + self.str_stylesheet = zf.read("styles.xml") + self.str_stylesheetcontent = zf.read("content.xml") + else: # pragma: no cover + raise RuntimeError( + f"stylesheet path {stylespath} must be {extension} or .xml file" + ) + + self.dom_stylesheet = etree.fromstring(self.str_stylesheet) + + if self.str_stylesheetcontent: # pragma: no cover + # TODO Identify what this is for, or remove + self.dom_stylesheetcontent = etree.fromstring(self.str_stylesheetcontent) + self.table_styles = self.extract_table_styles(self.str_stylesheetcontent) + + def append_pending_ids(self, el) -> None: + if self.settings.create_links: + for id in self.pending_ids: + SubElement(el, "text:bookmark", attrib={"text:name": id}) + self.pending_ids.clear() + + def visit_reference(self, node: "docutils.nodes.reference") -> None: + # text = node.astext() + if self.settings.create_links: + if "refuri" in node: + href = node["refuri"] + if self.settings.cloak_email_addresses and href.startswith( + "mailto:" + ): # pragma: no cover + href = self.cloak_mailto(href) + el = self.append_child( + "text:a", + attrib={ + "xlink:href": "%s" % href, + "xlink:type": "simple", + }, + ) + self.set_current_element(el) + elif "refid" in node: + el2 = self.append_child( + "text:a", + attrib={"xlink:type": "simple", "xlink:href": f"#{node['refid']}"}, + ) + el2.text = node.children.pop(0) + else: # pragma: no cover + self.document.reporter.warning( + 'References must have "refuri" or "refid" attribute.' + ) + if ( + self.in_table_of_contents + and len(node.children) >= 1 + and isinstance(node.children[0], docutils.nodes.generated) + ): # pragma: no cover + node.remove(node.children[0]) + + +class ODTWriter(docutils.writers.odf_odt.Writer): + """Docutils writer for OpenDocument Text. + + This subclass works around bugs in the upstream class. + """ + + # NB Lines excluded with "pragma: no cover" are not used in transport_data + + def __init__(self): + docutils.writers.Writer.__init__(self) + # Use the ODFTranslator subclass defined in this module + self.translator_class = ODFTranslator + + def copy_from_stylesheet(self, outzipfile: "ZipFile") -> None: + """Copy images, settings, etc from the stylesheet doc into target doc.""" + stylespath = Path(self.settings.stylesheet) + + try: + zf = ZipFile(stylespath) + except BadZipFile: + # `stylespath` references a non-ODF archive, which will not contain + # settings.xml or any Pictures/ directory. Copy these instead from the + # default file. 
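+            # `default_stylesheet_path` is not defined in this subclass; it is the
+            # class attribute inherited from the upstream docutils odf_odt Writer.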
+ zf = ZipFile(self.default_stylesheet_path) + + # Copy the styles + self.write_zip_str(outzipfile, "settings.xml", zf.read("settings.xml")) + + # Copy the images + for name in filter( + lambda n: n.startswith("Pictures/"), zf.namelist() + ): # pragma: no cover + outzipfile.writestr(name, zf.read(name)) diff --git a/transport_data/util/jinja2.py b/transport_data/util/jinja2.py new file mode 100644 index 0000000..b2fa892 --- /dev/null +++ b/transport_data/util/jinja2.py @@ -0,0 +1,42 @@ +from functools import lru_cache + + +@lru_cache +def get_env(): + """Return a Jinja2 environment for rendering templates.""" + from jinja2 import Environment, PackageLoader, select_autoescape + + from transport_data.util import uline + + # Create a Jinja environment + env = Environment( + loader=PackageLoader("transport_data", package_path="data/template"), + extensions=["jinja2.ext.loopcontrols"], + autoescape=select_autoescape(), + trim_blocks=True, + lstrip_blocks=True, + ) + + # Update filters + def _dfd_id(mdr): + return mdr.attaches_to.key_values["DATAFLOW"].obj.id + + def _get_reported_attribute(mdr, id_): + for ra in mdr.metadata: + if ra.value_for.id == id_: + return ra.value, ra.value_for + return "—", None + + def _format_desc(dim): + if desc := str(dim.get_annotation(id="tdc-description").text): + return desc + else: + return "—" + + env.filters["dfd_id"] = _dfd_id + env.filters["format_desc"] = _format_desc + env.filters["rst_heading"] = uline + + return env, dict( + get_reported_attribute=_get_reported_attribute, + ) From 211c51d52269cf4ff7fb7581eed431aa6bb0da55 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 4 Oct 2024 18:51:22 +0200 Subject: [PATCH 21/34] Adjust contains_data_for() for pycountry 24.6.1 - Tolerate missing "common_name" attribute. --- transport_data/org/metadata/__init__.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/transport_data/org/metadata/__init__.py b/transport_data/org/metadata/__init__.py index 0274997..5456ffc 100644 --- a/transport_data/org/metadata/__init__.py +++ b/transport_data/org/metadata/__init__.py @@ -3,7 +3,7 @@ import re from collections import defaultdict from functools import lru_cache -from typing import TYPE_CHECKING, Callable, Hashable, List, Optional, Tuple +from typing import TYPE_CHECKING, Callable, Hashable, Iterable, List, Optional, Tuple from pycountry import countries from sdmx.model import common, v21 @@ -200,15 +200,24 @@ def contains_data_for(mdr: "v21.MetadataReport", ref_area: str) -> bool: ISO 3166 alpha-2 code for a country. Passed to :meth:`pycountry.countries.lookup`. """ - country = countries.lookup(ref_area) - if mdr.attaches_to.key_values["DATAFLOW"].obj.id.startswith(ref_area): # type: ignore [union-attr] return True - # Pattern to match in DATA_DESCR - pat = re.compile( - f"({country.alpha_2}|{country.alpha_3}|{country.name}|{country.common_name})" + # Valid identifiers for `country`: its ISO 3166 alpha-[23] codes and names + # NB In pycountry 23.12.11, "common_name" would fall back to "official_name" or + # "name" if not explicitly defined. In 24.6.1, this was reverted and + # AttributeError is raised. 
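+    #    The getattr(..., None) calls below, combined with filter(None, ...), tolerate
+    #    both behaviours by skipping attributes a country record does not define.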
+ country = countries.lookup(ref_area) + values: Iterable[str] = filter( + None, + ( + getattr(country, a, None) + for a in "alpha_2 alpha_3 common_name name official_name".split() + ), ) + # Pattern to match in DATA_DESCR + pat = re.compile("(" + "|".join(values) + ")") + for ra in mdr.metadata: assert hasattr(ra, "value") if ra.value_for.id == "DATA_DESCR" and pat.search(ra.value): From 110be7ddcb3ced7d4703f6e6dbbc6d0f512f6804 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 4 Oct 2024 18:58:22 +0200 Subject: [PATCH 22/34] Collect metadata spreadsheet code in a submodule --- transport_data/org/cli.py | 7 +- transport_data/org/metadata/__init__.py | 319 +------------------- transport_data/org/metadata/spreadsheet.py | 331 +++++++++++++++++++++ transport_data/tests/org/test_metadata.py | 3 +- 4 files changed, 337 insertions(+), 323 deletions(-) create mode 100644 transport_data/org/metadata/spreadsheet.py diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index 3502571..909515f 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -35,7 +35,8 @@ def refresh(version): ) def read(path: "pathlib.Path"): """Read and summarize metadata.""" - from .metadata import read_workbook, summarize_metadataset + from .metadata import summarize_metadataset + from .metadata.spreadsheet import read_workbook mds, _ = read_workbook(path.resolve()) summarize_metadataset(mds) @@ -56,8 +57,8 @@ def summarize(path_in: "pathlib.Path", path_out: Optional["pathlib.Path"], ref_a generated of the (meta)data pertaining to that country/area. If multiple values are given (e.g. --ref-area=AF,ZW), a summary table is generated. """ - from .metadata import read_workbook from .metadata.report import SummaryHTML0, SummaryHTML1, SummaryODT + from .metadata.spreadsheet import read_workbook mds, _ = read_workbook(path_in.resolve()) @@ -82,6 +83,6 @@ def summarize(path_in: "pathlib.Path", path_out: Optional["pathlib.Path"], ref_a @main.command("template") def template(): """Generate the metadata template.""" - from .metadata import make_workbook + from .metadata.spreadsheet import make_workbook make_workbook() diff --git a/transport_data/org/metadata/__init__.py b/transport_data/org/metadata/__init__.py index 5456ffc..494c00d 100644 --- a/transport_data/org/metadata/__init__.py +++ b/transport_data/org/metadata/__init__.py @@ -3,19 +3,13 @@ import re from collections import defaultdict from functools import lru_cache -from typing import TYPE_CHECKING, Callable, Hashable, Iterable, List, Optional, Tuple +from typing import Callable, Hashable, Iterable, Optional from pycountry import countries from sdmx.model import common, v21 from transport_data.util import uline -if TYPE_CHECKING: - import pathlib - - from openpyxl import Workbook - from openpyxl.worksheet.worksheet import Worksheet - log = logging.getLogger(__name__) #: Concepts and metadata attributes in the TDC metadata structure. @@ -97,92 +91,6 @@ ), } -#: README text for the TDC metadata file format. -README_TEXT = """This file is an unofficial, prototype TDC format for metadata. -loosely imitates the Eurostat format. These files contain metadata (information -*about* data) based on the SDMX information model, but their layout (sheet -names, columns, etc.) is not specified by the SDMX standard, hence ‘unofficial’. - -This file has the following sheets. - -README -====== - -This sheet. - -Attributes -========== - -- One row per metadata attribute (or 'field'). 
-- Columns for the name; description; and ID (short and machine-readable) of each - attribute. See these descriptions to learn what to write for each attribute. - -One or more additional sheets -============================= - -- The name (or title) of each sheet corresponds to the identity (ID) of the data - flow that is described by the metadata in that sheet. -- In Column A, the name of the metadata attribute. Each name MUST exactly - match one appearing in the "Attributes" sheet. Some names MAY be omitted. -- In Column B, the actual metadata. These may be empty. - -TEMPLATE -======== - -To add information about additional data flows not included in existing sheets -(above), you can copy and rename this sheet. -""" - - -def _header(ws: "Worksheet", *columns: Tuple[str, int]) -> None: - """Write header columns and format their style and width.""" - for column, (value, width) in enumerate(columns, start=1): - cell = ws.cell(row=1, column=column, value=value) - cell.style = "header" - ws.column_dimensions[cell.column_letter].width = width - - -def add_readme(wb: "Workbook") -> None: - """Add a "README" sheet to `wb`.""" - ws = wb.create_sheet("README") - - _header(ws, ("Transport Data Commons (TDC) metadata", 72)) - ws["A3"] = README_TEXT - - -def add_attributes(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): - """Add an "Attributes" sheet to `wb` listing the metadata attributes from `msd`.""" - ws = wb.create_sheet("Attributes") - - _header( - ws, - ("Name", 20), # "Element name" in Eurostat - ("Description", 72), # Not present in Eurostat - ("ID", 20), # "Element code" in Eurostat - ) - - for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): - concept = attribute.concept_identity - ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" - ws.cell(row=row, column=2, value=concept.description.localized_default()) - ws.cell(row=row, column=3, value=attribute.id).style = "top" - - -def add_template(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): - """Add a "TEMPLATE" sheet to `wb` with a metadata template.""" - ws = wb.create_sheet("TEMPLATE") - - _header( - ws, - ("Attribute name", 20), # "Concept name" in Eurostat - ("Value", 72), # "Concept value" in Eurostat - ) - - for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): - concept = attribute.concept_identity - ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" - ws.cell(row=row, column=2, value="---") - def contains_data_for(mdr: "v21.MetadataReport", ref_area: str) -> bool: """Return :any:`True` if `mdr` contains data for `ref_area`. @@ -286,35 +194,6 @@ def get_msd() -> "v21.MetadataStructureDefinition": return msd -def getdefault(is_: "common.ItemScheme", other: "common.Item") -> "common.Item": - """Return an item from `is_` matching `other`. - - Several methods are attempted to match `other` with an existing item: - - 1. ID of `other` is identical to that of an existing item. - 2. Transformed ID of `other`—in upper case, " " replaced with "_" is identical to - that of an existing item. - 3. 
ID of `other` is in the annotation ``tdc-aka`` - - """ - # Exact match on ID or transformed ID - for candidate in (other.id, other.id.upper().replace(" ", "_")): - try: - return is_[candidate] - except KeyError: - pass - - # Iterate over existing items - for item in is_: - # Eval the annotation "tdc-aka" for a list of alternate IDs for the item - if aka := item.eval_annotation(id="tdc-aka"): - if other.id in aka: - return item - - # Still no match; create the item - return is_.setdefault(id=other.id) - - def groupby( mds: "v21.MetadataSet", key=Callable[["v21.MetadataReport"], Hashable] ) -> dict[Hashable, list["v21.MetadataReport"]]: @@ -328,186 +207,6 @@ def groupby( return result -def make_workbook(name="sample.xlsx") -> None: - """Generate a :class:`openpyxl.Workbook` for exchange of metadata.""" - from openpyxl import Workbook - from openpyxl.styles import Alignment, Font, NamedStyle, PatternFill - - wb = Workbook() - - # Delete the default sheet - assert wb.active - wb.remove(wb.active) - - # Create two named styles - header = NamedStyle(name="header") - header.fill = PatternFill("solid", fgColor="000000") - header.font = Font(bold=True, color="ffffff", name="Calibri") - wb.add_named_style(header) - - top = NamedStyle(name="top") - top.alignment = Alignment(vertical="top", wrap_text=True) - top.font = Font(name="Calibri") - wb.add_named_style(top) - - # Generate the metadata structure definition - msd = get_msd() - - # Add sheets - add_readme(wb) - add_attributes(wb, msd) - add_template(wb, msd) - - # Save the file - wb.save(name) - - -def parse_dimension(value: str) -> List[v21.Concept]: - """Parse the description of a dimension from `value`. - - Supported values include: - - 1. Multiple lines, with each line beginning "- ". - 2. A single line, with dimensions separated by ", ". - 3. A single dimension ID. - """ - # Partial regular expressions for a dimension - entry = r"(?P.+?)(?: \((?P[^\)]*)\))?" 
- - # Split `value` into potentially multiple values; separate dimension IDs from - # description/annotation - parts = [] - if matches := re.findall(f"^- {entry}$", value, flags=re.MULTILINE): - # Multiple lines, with each line beginning "- " - parts.extend(matches) - elif matches := re.findall(f"{entry}(?:, |$)", value): - # Single line, with dimensions separated by ", " - # TODO Check behaviour if the ", " is within parentheses - parts.extend(matches) - elif 0 == len(parts): - # None of the above → a single dimension label - parts.append(value) - - # Convert to a list of Concept objects - return [ - v21.Concept(id=id_, name=id_, description=description) - for id_, description in parts - ] - - -def read_workbook( - path: "pathlib.Path", -) -> tuple["v21.MetadataSet", "v21.ConceptScheme"]: - """Read a metadata set from the workbook at `path`.""" - from openpyxl import load_workbook - - wb = load_workbook(path) - # Generate/retrieve the metadata structure definition - msd = get_msd() - - mds = v21.MetadataSet(structured_by=msd) - - # Create a shared concept scheme for the concepts referenced by dimensions - # TODO Collect, maybe with get_msd() - cs_dims = get_cs_common() - - for ws in wb.worksheets: - # Skip information sheets generated by methods in this file - if ws.title in ("README", "Attributes", "TEMPLATE"): - continue - - if r := read_worksheet(ws, msd, cs_dims): - mds.report.append(r) - - return mds, cs_dims - - -def read_worksheet( - ws: "Worksheet", - msd: "v21.MetadataStructureDefinition", - cs_dims: "v21.ConceptScheme", -) -> Optional["v21.MetadataReport"]: - """Read a metadata report from the worksheet `ws`. - - Parameters - ---------- - msd : - Metadata structure definition. - """ - # Mapping from names (not IDs) to MetadataAttributes - mda_for_name = { - str(mda.concept_identity.name): mda - for mda in msd.report_structure["ALL"].components - } - - # Create the target of the report: a data flow definition - # TODO Expand this DFD and its associated data structure definition - df_id_from_title = ws.title - dfd = v21.DataflowDefinition(id=ws.title, maintainer=msd.maintainer) - dsd = v21.DataStructureDefinition(id=ws.title, maintainer=msd.maintainer) - dfd.structure = dsd - - # Create objects to associate the metadata report with the data flow definition - iot = v21.IdentifiableObjectTarget() - tok = v21.TargetObjectKey( - key_values={"DATAFLOW": v21.TargetIdentifiableObject(value_for=iot, obj=dfd)} - ) - - # Create the report itself - mdr = v21.MetadataReport() - mdr.attaches_to = tok - - mda = None # Reference to the MetaDataAttribute describing the current row - dimension_concepts = [] - - # Iterate over rows in the worksheet, skipping the first - for row in ws.iter_rows(min_row=2): - try: - # Column B: value in the row - ra_value = row[1].value - - if ra_value is None: - continue - except IndexError: - log.warning( - f"Sheet {df_id_from_title!r} has only < 2 columns in the first row; skip" - ) - return None - - # Column A: name of the metadata attribute - mda_name = row[0].value - - # Identify the MDA - # NB if `mda_name` is none, then `mda` retains the value found on the previous - # row. This allows e.g. multiple rows to give values for DIMENSION - # TODO Protect against other malformed data. 
- mda = mda_for_name.get(str(mda_name), mda) - - if mda and mda.id == "DIMENSION": - # Parse 1 or more dimension(s) and add to the DSD - dimension_concepts.extend(parse_dimension(str(ra_value))) - else: - # Store as OtherNonEnumeratedAttributeValue - # TODO Use EnumeratedAttributeValue, once code lists are available - # corresponding to dimensions - ra = v21.OtherNonEnumeratedAttributeValue( - value=str(ra_value), value_for=mda - ) - - # Attend the reported attribute to the report - mdr.metadata.append(ra) - - # Basic checks - df_id_from_cell = _get(mdr, "DATAFLOW") - if not df_id_from_cell: - log.warning(f"Sheet {df_id_from_title!r} does not identify a data flow; skip") - return None - - update_dimension_descriptor(dsd, cs_dims, *dimension_concepts) - - return mdr - - def _get(mdr: "v21.MetadataReport", mda_id: str) -> Optional[str]: """Retrieve from `mdr` the reported value of the metadata attribute `mda_id`.""" for mda in mdr.metadata: @@ -579,19 +278,3 @@ def summarize_metadataset(mds: "v21.MetadataSet") -> None: for r in mds.report: summarize_metadatareport(r) - - -def update_dimension_descriptor( - dsd: "v21.DataStructureDefinition", cs_dims: "v21.ConceptScheme", *concepts -) -> None: - """Update the DimensionDescriptor of `dsd` with `concepts`.""" - for dc in concepts: - # Identify the concept in `cs_dims` with the same ID - c = getdefault(cs_dims, dc) - - # Construct annotations - anno = [common.Annotation(id="tdc-description", text=dc.description)] - if c.id != dc.id: - anno.append(common.Annotation(id="tdc-original-id", text=dc.id)) - - dsd.dimensions.getdefault(id=c.id, concept_identity=c, annotations=anno) diff --git a/transport_data/org/metadata/spreadsheet.py b/transport_data/org/metadata/spreadsheet.py new file mode 100644 index 0000000..624a25b --- /dev/null +++ b/transport_data/org/metadata/spreadsheet.py @@ -0,0 +1,331 @@ +import logging +import re +from typing import TYPE_CHECKING, List, Optional, Tuple + +from sdmx.model import common, v21 + +if TYPE_CHECKING: + import pathlib + + from openpyxl import Workbook + from openpyxl.worksheet.worksheet import Worksheet + +log = logging.getLogger(__name__) + + +#: README text for the TDC metadata file format. +README_TEXT = """This file is an unofficial, prototype TDC format for metadata. +loosely imitates the Eurostat format. These files contain metadata (information +*about* data) based on the SDMX information model, but their layout (sheet +names, columns, etc.) is not specified by the SDMX standard, hence ‘unofficial’. + +This file has the following sheets. + +README +====== + +This sheet. + +Attributes +========== + +- One row per metadata attribute (or 'field'). +- Columns for the name; description; and ID (short and machine-readable) of each + attribute. See these descriptions to learn what to write for each attribute. + +One or more additional sheets +============================= + +- The name (or title) of each sheet corresponds to the identity (ID) of the data + flow that is described by the metadata in that sheet. +- In Column A, the name of the metadata attribute. Each name MUST exactly + match one appearing in the "Attributes" sheet. Some names MAY be omitted. +- In Column B, the actual metadata. These may be empty. + +TEMPLATE +======== + +To add information about additional data flows not included in existing sheets +(above), you can copy and rename this sheet. 
+""" + + +def _header(ws: "Worksheet", *columns: Tuple[str, int]) -> None: + """Write header columns and format their style and width.""" + for column, (value, width) in enumerate(columns, start=1): + cell = ws.cell(row=1, column=column, value=value) + cell.style = "header" + ws.column_dimensions[cell.column_letter].width = width + + +def add_readme(wb: "Workbook") -> None: + """Add a "README" sheet to `wb`.""" + ws = wb.create_sheet("README") + + _header(ws, ("Transport Data Commons (TDC) metadata", 72)) + ws["A3"] = README_TEXT + + +def add_attributes(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): + """Add an "Attributes" sheet to `wb` listing the metadata attributes from `msd`.""" + ws = wb.create_sheet("Attributes") + + _header( + ws, + ("Name", 20), # "Element name" in Eurostat + ("Description", 72), # Not present in Eurostat + ("ID", 20), # "Element code" in Eurostat + ) + + for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): + concept = attribute.concept_identity + ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" + ws.cell(row=row, column=2, value=concept.description.localized_default()) + ws.cell(row=row, column=3, value=attribute.id).style = "top" + + +def add_template(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): + """Add a "TEMPLATE" sheet to `wb` with a metadata template.""" + ws = wb.create_sheet("TEMPLATE") + + _header( + ws, + ("Attribute name", 20), # "Concept name" in Eurostat + ("Value", 72), # "Concept value" in Eurostat + ) + + for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): + concept = attribute.concept_identity + ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" + ws.cell(row=row, column=2, value="---") + + +def getdefault(is_: "common.ItemScheme", other: "common.Item") -> "common.Item": + """Return an item from `is_` matching `other`. + + Several methods are attempted to match `other` with an existing item: + + 1. ID of `other` is identical to that of an existing item. + 2. Transformed ID of `other`—in upper case, " " replaced with "_" is identical to + that of an existing item. + 3. 
ID of `other` is in the annotation ``tdc-aka`` + + """ + # Exact match on ID or transformed ID + for candidate in (other.id, other.id.upper().replace(" ", "_")): + try: + return is_[candidate] + except KeyError: + pass + + # Iterate over existing items + for item in is_: + # Eval the annotation "tdc-aka" for a list of alternate IDs for the item + if aka := item.eval_annotation(id="tdc-aka"): + if other.id in aka: + return item + + # Still no match; create the item + return is_.setdefault(id=other.id) + + +def make_workbook(name="sample.xlsx") -> None: + """Generate a :class:`openpyxl.Workbook` for exchange of metadata.""" + from openpyxl import Workbook + from openpyxl.styles import Alignment, Font, NamedStyle, PatternFill + + from transport_data.org.metadata import get_msd + + wb = Workbook() + + # Delete the default sheet + assert wb.active + wb.remove(wb.active) + + # Create two named styles + header = NamedStyle(name="header") + header.fill = PatternFill("solid", fgColor="000000") + header.font = Font(bold=True, color="ffffff", name="Calibri") + wb.add_named_style(header) + + top = NamedStyle(name="top") + top.alignment = Alignment(vertical="top", wrap_text=True) + top.font = Font(name="Calibri") + wb.add_named_style(top) + + # Generate the metadata structure definition + msd = get_msd() + + # Add sheets + add_readme(wb) + add_attributes(wb, msd) + add_template(wb, msd) + + # Save the file + wb.save(name) + + +def parse_dimension(value: str) -> List[v21.Concept]: + """Parse the description of a dimension from `value`. + + Supported values include: + + 1. Multiple lines, with each line beginning "- ". + 2. A single line, with dimensions separated by ", ". + 3. A single dimension ID. + """ + # Partial regular expressions for a dimension + entry = r"(?P.+?)(?: \((?P[^\)]*)\))?" + + # Split `value` into potentially multiple values; separate dimension IDs from + # description/annotation + parts = [] + if matches := re.findall(f"^- {entry}$", value, flags=re.MULTILINE): + # Multiple lines, with each line beginning "- " + parts.extend(matches) + elif matches := re.findall(f"{entry}(?:, |$)", value): + # Single line, with dimensions separated by ", " + # TODO Check behaviour if the ", " is within parentheses + parts.extend(matches) + elif 0 == len(parts): # pragma: no cover + # None of the above → a single dimension label + parts.append(value) + + # Convert to a list of Concept objects + return [ + v21.Concept(id=id_, name=id_, description=description) + for id_, description in parts + ] + + +def read_workbook( + path: "pathlib.Path", +) -> tuple["v21.MetadataSet", "v21.ConceptScheme"]: + """Read a metadata set from the workbook at `path`.""" + from openpyxl import load_workbook + + from transport_data.org.metadata import get_cs_common, get_msd + + wb = load_workbook(path) + # Generate/retrieve the metadata structure definition + msd = get_msd() + + mds = v21.MetadataSet(structured_by=msd) + + # Create a shared concept scheme for the concepts referenced by dimensions + # TODO Collect, maybe with get_msd() + cs_dims = get_cs_common() + + for ws in wb.worksheets: + # Skip information sheets generated by methods in this file + if ws.title in ("README", "Attributes", "TEMPLATE"): + continue + + if r := read_worksheet(ws, msd, cs_dims): + mds.report.append(r) + + return mds, cs_dims + + +def read_worksheet( + ws: "Worksheet", + msd: "v21.MetadataStructureDefinition", + cs_dims: "v21.ConceptScheme", +) -> Optional["v21.MetadataReport"]: + """Read a metadata report from the worksheet `ws`. 
+ + Parameters + ---------- + msd : + Metadata structure definition. + """ + from transport_data.org.metadata import _get + + # Mapping from names (not IDs) to MetadataAttributes + mda_for_name = { + str(mda.concept_identity.name): mda + for mda in msd.report_structure["ALL"].components + } + + # Create the target of the report: a data flow definition + # TODO Expand this DFD and its associated data structure definition + df_id_from_title = ws.title + dfd = v21.DataflowDefinition(id=ws.title, maintainer=msd.maintainer) + dsd = v21.DataStructureDefinition(id=ws.title, maintainer=msd.maintainer) + dfd.structure = dsd + + # Create objects to associate the metadata report with the data flow definition + iot = v21.IdentifiableObjectTarget() + tok = v21.TargetObjectKey( + key_values={"DATAFLOW": v21.TargetIdentifiableObject(value_for=iot, obj=dfd)} + ) + + # Create the report itself + mdr = v21.MetadataReport() + mdr.attaches_to = tok + + mda = None # Reference to the MetaDataAttribute describing the current row + dimension_concepts = [] + + # Iterate over rows in the worksheet, skipping the first + for row in ws.iter_rows(min_row=2): + try: + # Column B: value in the row + ra_value = row[1].value + + if ra_value is None: + continue + except IndexError: # pragma: no cover + log.warning( + f"Sheet {df_id_from_title!r} has only < 2 columns in the first row; skip" + ) + return None + + # Column A: name of the metadata attribute + mda_name = row[0].value + + # Identify the MDA + # NB if `mda_name` is none, then `mda` retains the value found on the previous + # row. This allows e.g. multiple rows to give values for DIMENSION + # TODO Protect against other malformed data. + mda = mda_for_name.get(str(mda_name), mda) + + if mda and mda.id == "DIMENSION": + # Parse 1 or more dimension(s) and add to the DSD + dimension_concepts.extend(parse_dimension(str(ra_value))) + else: + # Store as OtherNonEnumeratedAttributeValue + # TODO Use EnumeratedAttributeValue, once code lists are available + # corresponding to dimensions + ra = v21.OtherNonEnumeratedAttributeValue( + value=str(ra_value), value_for=mda + ) + + # Attend the reported attribute to the report + mdr.metadata.append(ra) + + # Basic checks + df_id_from_cell = _get(mdr, "DATAFLOW") + if not df_id_from_cell: + log.warning(f"Sheet {df_id_from_title!r} does not identify a data flow; skip") + return None + + update_dimension_descriptor(dsd, cs_dims, *dimension_concepts) + + return mdr + + +def update_dimension_descriptor( + dsd: "v21.DataStructureDefinition", cs_dims: "v21.ConceptScheme", *concepts +) -> None: + """Update the DimensionDescriptor of `dsd` with `concepts`.""" + for dc in concepts: + # Identify the concept in `cs_dims` with the same ID + c = getdefault(cs_dims, dc) + + # Construct annotations + anno = [common.Annotation(id="tdc-description", text=dc.description)] + if c.id != dc.id: + anno.append(common.Annotation(id="tdc-original-id", text=dc.id)) + + dsd.dimensions.getdefault(id=c.id, concept_identity=c, annotations=anno) diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index 398fd1f..ce0f53a 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -5,11 +5,10 @@ from transport_data.org.metadata import ( contains_data_for, groupby, - make_workbook, - read_workbook, summarize_metadataset, ) from transport_data.org.metadata.report import SummaryHTML0, SummaryHTML1, SummaryODT +from transport_data.org.metadata.spreadsheet import make_workbook, 
read_workbook def test_make_workbook(tmp_path) -> None: From a65c066ed2837f560f0334afc37a036499187755 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 4 Oct 2024 18:59:38 +0200 Subject: [PATCH 23/34] =?UTF-8?q?Bump=20mypy=20v0.9.1=20=E2=86=92=20v1.11.?= =?UTF-8?q?2;=20ruff=20v0.4.1=20=E2=86=92=20v0.6.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 327a865..179aec6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.9.0 + rev: v1.11.2 hooks: - id: mypy additional_dependencies: @@ -16,7 +16,7 @@ repos: - types-requests args: [] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.1 + rev: v0.6.9 hooks: - id: ruff - id: ruff-format From cfa9c3773f563773663af98207ba69e47944fe4e Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 4 Oct 2024 19:22:40 +0200 Subject: [PATCH 24/34] Convert summarize_metadata*() to Report subclasses --- transport_data/org/cli.py | 15 ++-- transport_data/org/metadata/__init__.py | 65 --------------- transport_data/org/metadata/report.py | 96 ++++++++++++++++++++++- transport_data/tests/org/test_metadata.py | 75 +++++++++--------- 4 files changed, 139 insertions(+), 112 deletions(-) diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index 909515f..170bb16 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -35,11 +35,12 @@ def refresh(version): ) def read(path: "pathlib.Path"): """Read and summarize metadata.""" - from .metadata import summarize_metadataset + from .metadata import report from .metadata.spreadsheet import read_workbook mds, _ = read_workbook(path.resolve()) - summarize_metadataset(mds) + + print(report.MetadataSetPlain(mds).render()) @main.command @@ -57,7 +58,7 @@ def summarize(path_in: "pathlib.Path", path_out: Optional["pathlib.Path"], ref_a generated of the (meta)data pertaining to that country/area. If multiple values are given (e.g. --ref-area=AF,ZW), a summary table is generated. 
""" - from .metadata.report import SummaryHTML0, SummaryHTML1, SummaryODT + from .metadata import report from .metadata.spreadsheet import read_workbook mds, _ = read_workbook(path_in.resolve()) @@ -68,16 +69,18 @@ def summarize(path_in: "pathlib.Path", path_out: Optional["pathlib.Path"], ref_a if path_out is None: path_out = pathlib.Path.cwd().joinpath(f"{ref_areas[0]}.{{html,odt}}") print(f"Write to {path_out}") - SummaryHTML0(mds, ref_area=ref_areas[0]).write_file( + report.MetadataSetHTML0(mds, ref_area=ref_areas[0]).write_file( path_out.with_suffix(".html") ) - SummaryODT(mds, ref_area=ref_areas[0]).write_file(path_out.with_suffix(".odt")) + report.MetadataSetODT(mds, ref_area=ref_areas[0]).write_file( + path_out.with_suffix(".odt") + ) elif 1 < len(ref_areas): # Report for multiple REF_AREA if path_out is None: path_out = pathlib.Path.cwd().joinpath("all.html") print(f"Write to {path_out}") - SummaryHTML1(mds, ref_area=ref_areas).write_file(path_out) + report.MetadataSetHTML1(mds, ref_area=ref_areas).write_file(path_out) @main.command("template") diff --git a/transport_data/org/metadata/__init__.py b/transport_data/org/metadata/__init__.py index 494c00d..440b2a2 100644 --- a/transport_data/org/metadata/__init__.py +++ b/transport_data/org/metadata/__init__.py @@ -8,8 +8,6 @@ from pycountry import countries from sdmx.model import common, v21 -from transport_data.util import uline - log = logging.getLogger(__name__) #: Concepts and metadata attributes in the TDC metadata structure. @@ -215,66 +213,3 @@ def _get(mdr: "v21.MetadataReport", mda_id: str) -> Optional[str]: return mda.value # No match return None - - -def summarize_metadataattribute(mds: "v21.MetadataSet", mda_id: str) -> None: - """Summarize unique values appear in metadata for attribute `mda_id`.""" - value_id = defaultdict(set) - - for r in mds.report: - value_id[_get(r, mda_id) or "MISSING"].add(_get(r, "DATAFLOW") or "MISSING") - - assert mds.structured_by - mda = mds.structured_by.report_structure["ALL"].get(mda_id) - - print("\n\n" + uline(f"{mda}: {len(value_id)} unique values")) - for value, df_ids in sorted(value_id.items()): - print(f"{value}\n " + " ".join(sorted(df_ids))) - - -def summarize_metadatareport(mdr: "v21.MetadataReport") -> None: - lines = ["", uline("Metadata report")] - - # Retrieve references to the data flow and data structure - dfd: v21.DataflowDefinition = mdr.attaches_to.key_values["DATAFLOW"].obj # type: ignore [union-attr] - dsd = dfd.structure - - # Summarize the data flow and data structure - - lines.extend( - [f"Refers to {dfd!r}", f" with structure {dsd!r}", " with dimensions:"] - ) - for dim in dsd.dimensions: - line = f" - {dim.id}:" - if desc := str(dim.get_annotation(id="tdc-description").text): - line += f" {desc!s}" - else: - line += " (no info)" - try: - original_id = dim.get_annotation(id="tdc-original-id").text - line += f" ('{original_id!s}' in input file)" - except KeyError: - pass - lines.append(line) - - lines.append("") - - for ra in mdr.metadata: - if ra.value_for.id == "DATAFLOW": - continue - assert hasattr(ra, "value") - lines.append(f"{ra.value_for}: {ra.value}") - - print("\n".join(lines)) - - -def summarize_metadataset(mds: "v21.MetadataSet") -> None: - """Print a summary of the contents of `mds`.""" - print(f"Metadata set containing {len(mds.report)} metadata reports") - - summarize_metadataattribute(mds, "MEASURE") - summarize_metadataattribute(mds, "DATA_PROVIDER") - summarize_metadataattribute(mds, "UNIT_MEASURE") - - for r in mds.report: - 
summarize_metadatareport(r) diff --git a/transport_data/org/metadata/report.py b/transport_data/org/metadata/report.py index 16272dc..c3a7955 100644 --- a/transport_data/org/metadata/report.py +++ b/transport_data/org/metadata/report.py @@ -1,15 +1,85 @@ +from collections import defaultdict from dataclasses import dataclass from functools import partial from typing import TYPE_CHECKING from transport_data.report import Report +from transport_data.util import uline if TYPE_CHECKING: from sdmx.model import v21 @dataclass -class SummaryHTML0(Report): +class MetadataAttributePlain(Report): + """Summarize unique values appearing in `mds` for attribute `mda_id`.""" + + mds: "v21.MetadataSet" + mda_id: str + + def render(self) -> str: + from transport_data.org.metadata import _get + + value_id = defaultdict(set) + + for r in self.mds.report: + value_id[_get(r, self.mda_id) or "MISSING"].add( + _get(r, "DATAFLOW") or "MISSING" + ) + + assert self.mds.structured_by + mda = self.mds.structured_by.report_structure["ALL"].get(self.mda_id) + + lines = ["", "", uline(f"{mda}: {len(value_id)} unique values")] + + for value, df_ids in sorted(value_id.items()): + lines.extend([value, " " + " ".join(sorted(df_ids))]) + + return "\n".join(lines) + + +@dataclass +class MetadataReportPlain(Report): + mdr: "v21.MetadataReport" + + def render(self) -> str: + lines = ["", uline("Metadata report")] + + # Retrieve references to the data flow and data structure + dfd: "v21.DataflowDefinition" = self.mdr.attaches_to.key_values["DATAFLOW"].obj # type: ignore [union-attr] + dsd = dfd.structure + + # Summarize the data flow and data structure + + lines.extend( + [f"Refers to {dfd!r}", f" with structure {dsd!r}", " with dimensions:"] + ) + for dim in dsd.dimensions: + line = f" - {dim.id}:" + if desc := str(dim.get_annotation(id="tdc-description").text): + line += f" {desc!s}" + else: + line += " (no info)" + try: + original_id = dim.get_annotation(id="tdc-original-id").text + line += f" ('{original_id!s}' in input file)" + except KeyError: + pass + lines.append(line) + + lines.append("") + + for ra in self.mdr.metadata: + if ra.value_for.id == "DATAFLOW": + continue + assert hasattr(ra, "value") + lines.append(f"{ra.value_for}: {ra.value}") + + return "\n".join(lines) + + +@dataclass +class MetadataSetHTML0(Report): """Generate a summary report in HTML.""" template_name = "metadata-0.html" @@ -32,7 +102,7 @@ def render(self) -> str: @dataclass -class SummaryHTML1(Report): +class MetadataSetHTML1(Report): """Generate a summary report in HTML.""" template_name = "metadata-1.html" @@ -56,7 +126,7 @@ def render(self) -> str: @dataclass -class SummaryODT(Report): +class MetadataSetODT(Report): template_name = "metadata-2.rst" #: Metadata set to summarize. 
@@ -82,3 +152,23 @@ def render(self) -> bytes: # Convert reStructuredText → OpenDocumentText return self.rst2odt(rst_source) + + +@dataclass +class MetadataSetPlain(Report): + """Print a summary of the contents of `mds`.""" + + mds: "v21.MetadataSet" + + def render(self) -> str: + lines = [ + f"Metadata set containing {len(self.mds.report)} metadata reports", + MetadataAttributePlain(self.mds, "MEASURE").render(), + MetadataAttributePlain(self.mds, "DATA_PROVIDER").render(), + MetadataAttributePlain(self.mds, "UNIT_MEASURE").render(), + ] + + for r in self.mds.report: + lines.append(MetadataReportPlain(r).render()) + + return "\n".join(lines) diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index ce0f53a..91967b4 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -2,14 +2,21 @@ import pytest -from transport_data.org.metadata import ( - contains_data_for, - groupby, - summarize_metadataset, -) -from transport_data.org.metadata.report import SummaryHTML0, SummaryHTML1, SummaryODT +from transport_data.org.metadata import contains_data_for, groupby, report from transport_data.org.metadata.spreadsheet import make_workbook, read_workbook +#: Number of metadata reports in the test specimen for which contains_data_for() returns +#: :any:`True`. +COUNTRIES = [ + ("CN", 19), + ("ID", 17), + ("IN", 19), + ("PH", 14), + ("NP", 0), + ("TH", 17), + ("VN", 18), +] + def test_make_workbook(tmp_path) -> None: make_workbook() @@ -28,32 +35,6 @@ def test_read_workbook(example_metadata) -> None: assert 45 == len(result.report) -def test_summarize_metadataset(capsys, example_metadata) -> None: - mds, cs_dims = example_metadata - - # Function runs successfully - summarize_metadataset(mds) - - captured = capsys.readouterr() - # pathlib.Path("debug.txt").write_text(captured.out) # DEBUG Write to a file - - # Output contains certain text - assert "MEASURE: 39 unique values" in captured.out - - # TODO expand with further assertions - - -COUNTRIES = [ - ("CN", 19), - ("ID", 17), - ("IN", 19), - ("PH", 14), - ("NP", 0), - ("TH", 17), - ("VN", 18), -] - - @pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) def test_groupby(example_metadata, ref_area, N_exp: int) -> None: predicate = partial(contains_data_for, ref_area=ref_area) @@ -67,22 +48,22 @@ def test_groupby(example_metadata, ref_area, N_exp: int) -> None: assert exp >= {(k, len(v)) for k, v in result.items()} -class TestSummaryHTML0: +class TestMetadataSetHTML0: @pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) def test_write_file(self, tmp_path, example_metadata, ref_area, N_exp) -> None: path = tmp_path.joinpath(f"{ref_area}.html") - SummaryHTML0(example_metadata[0], ref_area=ref_area).write_file(path) + report.MetadataSetHTML0(example_metadata[0], ref_area=ref_area).write_file(path) # Output was created assert path.exists() -class TestSummaryHTML1: +class TestMetadataSetHTML1: def test_write_file(self, tmp_path, example_metadata) -> None: path = tmp_path.joinpath("all.html") - SummaryHTML1( + report.MetadataSetHTML1( example_metadata[0], ref_area=list(item[0] for item in COUNTRIES) ).write_file(path) @@ -91,11 +72,29 @@ def test_write_file(self, tmp_path, example_metadata) -> None: @pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) -class TestSummaryODT: +class TestMetadataSetODT: def test_write_file(self, tmp_path, example_metadata, ref_area, N_exp) -> None: path = tmp_path.joinpath(f"{ref_area}.odt") - SummaryODT(example_metadata[0], 
ref_area=ref_area).write_file(path=path) + report.MetadataSetODT(example_metadata[0], ref_area=ref_area).write_file( + path=path + ) # Output was created assert path.exists() + + +class TestMetadataSetPlain: + def test_render(self, capsys, example_metadata) -> None: + mds, cs_dims = example_metadata + + # Function runs successfully + result = report.MetadataSetPlain(mds).render() + + # pathlib.Path("debug.txt").write_text(result) # DEBUG Write to a file + # print(result) # DEBUG Write to stdout + + # Output contains certain text + assert "MEASURE: 39 unique values" in result + + # TODO expand with further assertions From 0804282d4f8d5b30f2b418067b274f943f45a939 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 7 Oct 2024 10:04:14 +0200 Subject: [PATCH 25/34] Write MetadataSetHTML[01] as UTF-8 on Windows --- transport_data/report.py | 31 +++++++++++++++++++---- transport_data/tests/org/test_metadata.py | 6 +++-- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/transport_data/report.py b/transport_data/report.py index 0e09e2a..6cc0972 100644 --- a/transport_data/report.py +++ b/transport_data/report.py @@ -6,14 +6,27 @@ class Report(ABC): - """Base class for reports.""" + """Abstract class for reports. + Subclasses: + + - **must** implement :meth:`render`. + - **may** use :func:`dataclasses.dataclass` to declare additional attributes and an + :py:`__init__()` method that accepts and stores them. + """ + + #: Name of a Jinja2 template used by the report; see :meth:`render_jinja_template`. template_name: str @abstractmethod - def render(self) -> Union[str, bytes]: ... + def render(self) -> Union[str, bytes]: + """Render the report (generate its contents) and return as str or bytes. + + The content may be in any format: plain text, HTML, binary file content, etc. + """ def render_jinja_template(self, *args, **kwargs) -> str: + """Retrieve the Jinja2 :attr:`template_name` and call its render method.""" from transport_data.util import jinja2 env, common = jinja2.get_env() @@ -21,6 +34,13 @@ def render_jinja_template(self, *args, **kwargs) -> str: return env.get_template(self.template_name).render(*args, **kwargs, **common) def rst2odt(self, content: str) -> bytes: + """Convert `content` from reStructuredText to OpenDocument Text (ODT). + + Returns + ------- + bytes + The ODT (ZIP) archive. 
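+            The archive is produced by :class:`.ODTWriter` with the packaged
+            ``odt-styles.xml`` template as its stylesheet.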
+ """ from docutils.core import publish_string from transport_data.util import docutils, jinja2 @@ -35,9 +55,10 @@ def rst2odt(self, content: str) -> bytes: writer=docutils.ODTWriter(), source=content, settings_overrides=settings ) - def write_file(self, path: Path) -> None: + def write_file(self, path: Path, **kwargs) -> None: + """:meth:`render` the report and write to `path`.""" content = self.render() if isinstance(content, str): - path.write_text(content) + path.write_text(content, **kwargs) else: - path.write_bytes(content) + path.write_bytes(content, **kwargs) diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index 91967b4..baa29da 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -53,7 +53,9 @@ class TestMetadataSetHTML0: def test_write_file(self, tmp_path, example_metadata, ref_area, N_exp) -> None: path = tmp_path.joinpath(f"{ref_area}.html") - report.MetadataSetHTML0(example_metadata[0], ref_area=ref_area).write_file(path) + report.MetadataSetHTML0(example_metadata[0], ref_area=ref_area).write_file( + path, encoding="utf-8" + ) # Output was created assert path.exists() @@ -65,7 +67,7 @@ def test_write_file(self, tmp_path, example_metadata) -> None: report.MetadataSetHTML1( example_metadata[0], ref_area=list(item[0] for item in COUNTRIES) - ).write_file(path) + ).write_file(path, encoding="utf-8") # Output was created assert path.exists() From 684a36e032693c4b92df8ec862f5875c2bd07cb4 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 7 Oct 2024 10:10:54 +0200 Subject: [PATCH 26/34] Drop Python 3.8 support --- .github/workflows/pytest.yaml | 2 +- doc/whatsnew.rst | 1 + pyproject.toml | 3 +-- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index b3673fa..3912d57 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -23,7 +23,7 @@ jobs: - windows-latest python-version: - - "3.8" + # These should match pyproject.toml - "3.9" - "3.10" - "3.11" diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst index b81f8d2..08b105f 100644 --- a/doc/whatsnew.rst +++ b/doc/whatsnew.rst @@ -4,6 +4,7 @@ What's new Next release ============ +- Python 3.8 support is dropped (:pull:`21`), as it has reached end-of-life. - Add :mod:`.ipcc` (:doc:`ipcc`) module (:issue:`15`, :pull:`21`). - Add :doc:`standards` and :doc:`roadmap` documentation pages (:pull:`9`). - Adjust :mod:`.adb` for changes in data format in the 2024-05-20 edition of the ATO National Database (:pull:`20`, :issue:`18`). 
diff --git a/pyproject.toml b/pyproject.toml index 8eed655..4ad8543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,14 +13,13 @@ classifiers = [ "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] readme = "README.rst" -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [ "click", "docutils", From e824ad01fa1ddcaf0461f609754b3878b08111be Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 7 Oct 2024 10:18:04 +0200 Subject: [PATCH 27/34] Renumber and sort report classes, template files --- .../{metadata-0.html => metadata-set-1.html} | 0 .../{metadata-2.rst => metadata-set-1.rst} | 0 .../{metadata-1.html => metadata-set-2.html} | 0 transport_data/org/cli.py | 8 +- transport_data/org/metadata/report.py | 96 +++++++++++++------ transport_data/tests/org/test_metadata.py | 56 +++++------ 6 files changed, 100 insertions(+), 60 deletions(-) rename transport_data/data/template/{metadata-0.html => metadata-set-1.html} (100%) rename transport_data/data/template/{metadata-2.rst => metadata-set-1.rst} (100%) rename transport_data/data/template/{metadata-1.html => metadata-set-2.html} (100%) diff --git a/transport_data/data/template/metadata-0.html b/transport_data/data/template/metadata-set-1.html similarity index 100% rename from transport_data/data/template/metadata-0.html rename to transport_data/data/template/metadata-set-1.html diff --git a/transport_data/data/template/metadata-2.rst b/transport_data/data/template/metadata-set-1.rst similarity index 100% rename from transport_data/data/template/metadata-2.rst rename to transport_data/data/template/metadata-set-1.rst diff --git a/transport_data/data/template/metadata-1.html b/transport_data/data/template/metadata-set-2.html similarity index 100% rename from transport_data/data/template/metadata-1.html rename to transport_data/data/template/metadata-set-2.html diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index 170bb16..e91e3dc 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -40,7 +40,7 @@ def read(path: "pathlib.Path"): mds, _ = read_workbook(path.resolve()) - print(report.MetadataSetPlain(mds).render()) + print(report.MetadataSet0Plain(mds).render()) @main.command @@ -69,10 +69,10 @@ def summarize(path_in: "pathlib.Path", path_out: Optional["pathlib.Path"], ref_a if path_out is None: path_out = pathlib.Path.cwd().joinpath(f"{ref_areas[0]}.{{html,odt}}") print(f"Write to {path_out}") - report.MetadataSetHTML0(mds, ref_area=ref_areas[0]).write_file( + report.MetadataSet1HTML(mds, ref_area=ref_areas[0]).write_file( path_out.with_suffix(".html") ) - report.MetadataSetODT(mds, ref_area=ref_areas[0]).write_file( + report.MetadataSet1ODT(mds, ref_area=ref_areas[0]).write_file( path_out.with_suffix(".odt") ) elif 1 < len(ref_areas): @@ -80,7 +80,7 @@ def summarize(path_in: "pathlib.Path", path_out: Optional["pathlib.Path"], ref_a if path_out is None: path_out = pathlib.Path.cwd().joinpath("all.html") print(f"Write to {path_out}") - report.MetadataSetHTML1(mds, ref_area=ref_areas).write_file(path_out) + report.MetadataSet2HTML(mds, ref_area=ref_areas).write_file(path_out) @main.command("template") diff --git a/transport_data/org/metadata/report.py b/transport_data/org/metadata/report.py index 
c3a7955..6cb4b76 100644 --- a/transport_data/org/metadata/report.py +++ b/transport_data/org/metadata/report.py @@ -11,7 +11,18 @@ @dataclass -class MetadataAttributePlain(Report): +class MetadataAttribute0HTML(Report): + """Summarize unique values appearing in `mds` for attribute `mda_id`.""" + + mds: "v21.MetadataSet" + mda_id: str + + def render(self) -> str: + raise NotImplementedError + + +@dataclass +class MetadataAttribute0Plain(Report): """Summarize unique values appearing in `mds` for attribute `mda_id`.""" mds: "v21.MetadataSet" @@ -39,7 +50,15 @@ def render(self) -> str: @dataclass -class MetadataReportPlain(Report): +class MetadataReport0HTML(Report): + mdr: "v21.MetadataReport" + + def render(self) -> str: + raise NotImplementedError + + +@dataclass +class MetadataReport0Plain(Report): mdr: "v21.MetadataReport" def render(self) -> str: @@ -79,10 +98,51 @@ def render(self) -> str: @dataclass -class MetadataSetHTML0(Report): - """Generate a summary report in HTML.""" +class MetadataSet0HTML(Report): + """Print a summary of the contents of `mds`.""" template_name = "metadata-0.html" + mds: "v21.MetadataSet" + + def render(self) -> str: + lines = [ + f"Metadata set containing {len(self.mds.report)} metadata reports", + MetadataAttribute0HTML(self.mds, "MEASURE").render(), + MetadataAttribute0HTML(self.mds, "DATA_PROVIDER").render(), + MetadataAttribute0HTML(self.mds, "UNIT_MEASURE").render(), + ] + + for r in self.mds.report: + lines.append(MetadataReport0HTML(r).render()) + + return "\n".join(lines) + + +@dataclass +class MetadataSet0Plain(Report): + """Print a summary of the contents of `mds`.""" + + mds: "v21.MetadataSet" + + def render(self) -> str: + lines = [ + f"Metadata set containing {len(self.mds.report)} metadata reports", + MetadataAttribute0Plain(self.mds, "MEASURE").render(), + MetadataAttribute0Plain(self.mds, "DATA_PROVIDER").render(), + MetadataAttribute0Plain(self.mds, "UNIT_MEASURE").render(), + ] + + for r in self.mds.report: + lines.append(MetadataReport0Plain(r).render()) + + return "\n".join(lines) + + +@dataclass +class MetadataSet1HTML(Report): + """Generate a summary report in HTML.""" + + template_name = "metadata-set-1.html" #: Metadata set to summarize. mds: "v21.MetadataSet" @@ -102,10 +162,10 @@ def render(self) -> str: @dataclass -class MetadataSetHTML1(Report): +class MetadataSet2HTML(Report): """Generate a summary report in HTML.""" - template_name = "metadata-1.html" + template_name = "metadata-set-2.html" #: Metadata set to summarize. mds: "v21.MetadataSet" @@ -126,8 +186,8 @@ def render(self) -> str: @dataclass -class MetadataSetODT(Report): - template_name = "metadata-2.rst" +class MetadataSet1ODT(Report): + template_name = "metadata-set-1.rst" #: Metadata set to summarize. 
mds: "v21.MetadataSet" @@ -152,23 +212,3 @@ def render(self) -> bytes: # Convert reStructuredText → OpenDocumentText return self.rst2odt(rst_source) - - -@dataclass -class MetadataSetPlain(Report): - """Print a summary of the contents of `mds`.""" - - mds: "v21.MetadataSet" - - def render(self) -> str: - lines = [ - f"Metadata set containing {len(self.mds.report)} metadata reports", - MetadataAttributePlain(self.mds, "MEASURE").render(), - MetadataAttributePlain(self.mds, "DATA_PROVIDER").render(), - MetadataAttributePlain(self.mds, "UNIT_MEASURE").render(), - ] - - for r in self.mds.report: - lines.append(MetadataReportPlain(r).render()) - - return "\n".join(lines) diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index baa29da..6cebef7 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -48,12 +48,28 @@ def test_groupby(example_metadata, ref_area, N_exp: int) -> None: assert exp >= {(k, len(v)) for k, v in result.items()} -class TestMetadataSetHTML0: +class TestMetadataSet0Plain: + def test_render(self, capsys, example_metadata) -> None: + mds, cs_dims = example_metadata + + # Function runs successfully + result = report.MetadataSet0Plain(mds).render() + + # pathlib.Path("debug.txt").write_text(result) # DEBUG Write to a file + # print(result) # DEBUG Write to stdout + + # Output contains certain text + assert "MEASURE: 39 unique values" in result + + # TODO expand with further assertions + + +class TestMetadataSet1HTML: @pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) def test_write_file(self, tmp_path, example_metadata, ref_area, N_exp) -> None: path = tmp_path.joinpath(f"{ref_area}.html") - report.MetadataSetHTML0(example_metadata[0], ref_area=ref_area).write_file( + report.MetadataSet1HTML(example_metadata[0], ref_area=ref_area).write_file( path, encoding="utf-8" ) @@ -61,24 +77,12 @@ def test_write_file(self, tmp_path, example_metadata, ref_area, N_exp) -> None: assert path.exists() -class TestMetadataSetHTML1: - def test_write_file(self, tmp_path, example_metadata) -> None: - path = tmp_path.joinpath("all.html") - - report.MetadataSetHTML1( - example_metadata[0], ref_area=list(item[0] for item in COUNTRIES) - ).write_file(path, encoding="utf-8") - - # Output was created - assert path.exists() - - @pytest.mark.parametrize("ref_area, N_exp", COUNTRIES) -class TestMetadataSetODT: +class TestMetadataSet1ODT: def test_write_file(self, tmp_path, example_metadata, ref_area, N_exp) -> None: path = tmp_path.joinpath(f"{ref_area}.odt") - report.MetadataSetODT(example_metadata[0], ref_area=ref_area).write_file( + report.MetadataSet1ODT(example_metadata[0], ref_area=ref_area).write_file( path=path ) @@ -86,17 +90,13 @@ def test_write_file(self, tmp_path, example_metadata, ref_area, N_exp) -> None: assert path.exists() -class TestMetadataSetPlain: - def test_render(self, capsys, example_metadata) -> None: - mds, cs_dims = example_metadata - - # Function runs successfully - result = report.MetadataSetPlain(mds).render() - - # pathlib.Path("debug.txt").write_text(result) # DEBUG Write to a file - # print(result) # DEBUG Write to stdout +class TestMetadataSet2HTML: + def test_write_file(self, tmp_path, example_metadata) -> None: + path = tmp_path.joinpath("all.html") - # Output contains certain text - assert "MEASURE: 39 unique values" in result + report.MetadataSet2HTML( + example_metadata[0], ref_area=list(item[0] for item in COUNTRIES) + ).write_file(path, encoding="utf-8") - # TODO 
expand with further assertions + # Output was created + assert path.exists() From cf743ffa80929b93876a42f2e9f92b31c335ce68 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 7 Oct 2024 11:38:18 +0200 Subject: [PATCH 28/34] Implement report.MetadataSet0ODT --- .../data/template/metadata-attribute-0.rst | 10 +++ .../data/template/metadata-set-0.rst | 42 ++++++++++++ transport_data/org/metadata/__init__.py | 34 +++++++++- transport_data/org/metadata/report.py | 64 ++++++++++--------- transport_data/tests/org/test_metadata.py | 13 ++++ 5 files changed, 131 insertions(+), 32 deletions(-) create mode 100644 transport_data/data/template/metadata-attribute-0.rst create mode 100644 transport_data/data/template/metadata-set-0.rst diff --git a/transport_data/data/template/metadata-attribute-0.rst b/transport_data/data/template/metadata-attribute-0.rst new file mode 100644 index 0000000..1885cdc --- /dev/null +++ b/transport_data/data/template/metadata-attribute-0.rst @@ -0,0 +1,10 @@ +.. _{{mda}}: + +{{ mda }}: {{ value_id | length }} unique values +================================================ + +{% for value, dfd_ids in value_id | items | sort %} + +{{ value }} ({{ dfd_ids | length }} appearances) + {%+ for id in dfd_ids | sort %}{{ id }}{{ ", " if not loop.last }}{% endfor %} +{% endfor %} diff --git a/transport_data/data/template/metadata-set-0.rst b/transport_data/data/template/metadata-set-0.rst new file mode 100644 index 0000000..feeec96 --- /dev/null +++ b/transport_data/data/template/metadata-set-0.rst @@ -0,0 +1,42 @@ +.. _top: + +Metadata summary +**************** + +This file contains a summary of metadata collected through the project from the consultant team, GIZ Country Focal Points (CFPs), etc. + +- It is automatically generated using the tools developed at transport-data/tools#21 on GitHub, using a command similar to: + + .. code-block:: + + tdc org read “Metadata 2024-09-27.xlsx” + + …currently using, as input, the Teams file “Metadata file – prototype 1.xlsx” as of 2024-09-27. + +- Use “File > Version History” in Microsoft Office to see updates. +- For questions or information please contact Paul Kishimoto via Teams and/or see the HOWTO. +- The entire file may be overwritten periodically. + **Do not** make edits in this file if you need them to be preserved; instead, make a copy and edit there. 
+ + +**Direct links:** + +`DATA_PROVIDER`_ — +`MEASURE`_ — +`UNIT_MEASURE`_ — +`Dimensions`_ + +{# Write out pre-generated reports on MetadataAttributes #} +{% for obj in mda %} + +{{ obj | safe }} +{% endfor %} + +_`Dimensions`: {{ dim_id | length }} unique concepts +================================================= + +{% for value, dfd_ids in dim_id | items | sort %} + +{{ value }} ({{ dfd_ids | length }} appearances) + {%+ for id in dfd_ids | sort %}{{ id }}{{ ", " if not loop.last }}{% endfor %} +{% endfor %} diff --git a/transport_data/org/metadata/__init__.py b/transport_data/org/metadata/__init__.py index 440b2a2..db6f803 100644 --- a/transport_data/org/metadata/__init__.py +++ b/transport_data/org/metadata/__init__.py @@ -153,10 +153,11 @@ def get_cs_common() -> "common.ConceptScheme": annotations=[common.Annotation(id="tdc-aka", text=repr(["Fuel type"]))], ) cs.setdefault( - id="REF_AREA", + id="GEO", annotations=[ common.Annotation( - id="tdc-aka", text=repr(["Area", "Country", "Country code", "Region"]) + id="tdc-aka", + text=repr(["Area", "Country", "Country code", "REF_AREA", "Region"]), ) ], ) @@ -213,3 +214,32 @@ def _get(mdr: "v21.MetadataReport", mda_id: str) -> Optional[str]: return mda.value # No match return None + + +def map_values_to_ids(mds: "v21.MetadataSet", mda_id: str) -> dict[str, set[str]]: + """Return a mapping from unique reported attribute values to data flow IDs.""" + result = defaultdict(set) + + for r in mds.report: + result[_get(r, mda_id) or "MISSING"].add(_get(r, "DATAFLOW") or "MISSING") + + return result + + +def map_dims_to_ids(mds: "v21.MetadataSet") -> dict[str, set[str]]: + """Return a mapping from unique concept IDs used for dimensions to data flow IDs.""" + result = defaultdict(set) + + for r in mds.report: + dfd = r.attaches_to.key_values["DATAFLOW"].obj # type: ignore [union-attr] + for dim in dfd.structure.dimensions: + key = f"{dim.id!r}" + try: + anno = dim.get_annotation(id="tdc-original-id") + key += f" (as '{anno.text!s}')" + except KeyError: + pass + + result[key].add(dfd.id) + + return result diff --git a/transport_data/org/metadata/report.py b/transport_data/org/metadata/report.py index 6cb4b76..a25e9b7 100644 --- a/transport_data/org/metadata/report.py +++ b/transport_data/org/metadata/report.py @@ -1,4 +1,3 @@ -from collections import defaultdict from dataclasses import dataclass from functools import partial from typing import TYPE_CHECKING @@ -11,42 +10,44 @@ @dataclass -class MetadataAttribute0HTML(Report): +class MetadataAttribute0Plain(Report): """Summarize unique values appearing in `mds` for attribute `mda_id`.""" mds: "v21.MetadataSet" mda_id: str def render(self) -> str: - raise NotImplementedError + from transport_data.org.metadata import map_values_to_ids + + value_id = map_values_to_ids(self.mds, self.mda_id) + + assert self.mds.structured_by + mda = self.mds.structured_by.report_structure["ALL"].get(self.mda_id) + + lines = ["", "", uline(f"{mda}: {len(value_id)} unique values")] + + for value, df_ids in sorted(value_id.items()): + lines.extend([value, " " + " ".join(sorted(df_ids))]) + + return "\n".join(lines) @dataclass -class MetadataAttribute0Plain(Report): +class MetadataAttribute0RST(Report): """Summarize unique values appearing in `mds` for attribute `mda_id`.""" + template_name = "metadata-attribute-0.rst" mds: "v21.MetadataSet" mda_id: str def render(self) -> str: - from transport_data.org.metadata import _get - - value_id = defaultdict(set) - - for r in self.mds.report: - value_id[_get(r, self.mda_id) or 
"MISSING"].add( - _get(r, "DATAFLOW") or "MISSING" - ) + from transport_data.org.metadata import map_values_to_ids + value_id = map_values_to_ids(self.mds, self.mda_id) assert self.mds.structured_by mda = self.mds.structured_by.report_structure["ALL"].get(self.mda_id) - lines = ["", "", uline(f"{mda}: {len(value_id)} unique values")] - - for value, df_ids in sorted(value_id.items()): - lines.extend([value, " " + " ".join(sorted(df_ids))]) - - return "\n".join(lines) + return self.render_jinja_template(mda=mda, value_id=value_id) @dataclass @@ -98,24 +99,27 @@ def render(self) -> str: @dataclass -class MetadataSet0HTML(Report): +class MetadataSet0ODT(Report): """Print a summary of the contents of `mds`.""" - template_name = "metadata-0.html" + template_name = "metadata-set-0.rst" mds: "v21.MetadataSet" - def render(self) -> str: - lines = [ - f"Metadata set containing {len(self.mds.report)} metadata reports", - MetadataAttribute0HTML(self.mds, "MEASURE").render(), - MetadataAttribute0HTML(self.mds, "DATA_PROVIDER").render(), - MetadataAttribute0HTML(self.mds, "UNIT_MEASURE").render(), - ] + def render(self) -> bytes: + from transport_data.org.metadata import map_dims_to_ids - for r in self.mds.report: - lines.append(MetadataReport0HTML(r).render()) + # Mapping from reported attribute values to data flow IDs + mda = [ + MetadataAttribute0RST(self.mds, "DATA_PROVIDER").render(), + MetadataAttribute0RST(self.mds, "MEASURE").render(), + MetadataAttribute0RST(self.mds, "UNIT_MEASURE").render(), + ] + # Mapping from dimension IDs to data flow IDs + dim_id = map_dims_to_ids(self.mds) - return "\n".join(lines) + rst_source = self.render_jinja_template(mda=mda, dim_id=dim_id) + # print(rst_source) # DEBUG + return self.rst2odt(rst_source) @dataclass diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index 6cebef7..0b8ad29 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -48,6 +48,19 @@ def test_groupby(example_metadata, ref_area, N_exp: int) -> None: assert exp >= {(k, len(v)) for k, v in result.items()} +class TestMetadataSet0ODT: + def test_render(self, tmp_path, example_metadata) -> None: + mds, cs_dims = example_metadata + + path = tmp_path.joinpath("all.odt") + + # Function runs successfully + report.MetadataSet0ODT(mds).write_file(path) + + # Output was created + assert path.exists() + + class TestMetadataSet0Plain: def test_render(self, capsys, example_metadata) -> None: mds, cs_dims = example_metadata From 9ce2e6f01f45cc1d5a2f33677ebb79685af4dcea Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 7 Oct 2024 11:38:55 +0200 Subject: [PATCH 29/34] Gitignore top-level output/ directory --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index bbe7e94..e7ae1e7 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,4 @@ dmypy.json # Generated by transport_data *.xlsx +/output/ From f8a83271c34cc1bde9338cf31ab4f118d09f6745 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 7 Oct 2024 11:48:55 +0200 Subject: [PATCH 30/34] Add a catch-all command for TUEWAS outputs --- transport_data/org/cli.py | 47 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index e91e3dc..b4c4f70 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -78,9 +78,52 @@ def summarize(path_in: "pathlib.Path", path_out: 
Optional["pathlib.Path"], ref_a elif 1 < len(ref_areas): # Report for multiple REF_AREA if path_out is None: - path_out = pathlib.Path.cwd().joinpath("all.html") + path_out = pathlib.Path.cwd().joinpath("all.{html,odt}") print(f"Write to {path_out}") - report.MetadataSet2HTML(mds, ref_area=ref_areas).write_file(path_out) + report.MetadataSet0ODT(mds).write_file(path_out.with_suffix(".odt")) + report.MetadataSet2HTML(mds, ref_area=ref_areas).write_file( + path_out.with_suffix(".html") + ) + + +@main.command("tuewas") +@click.argument( + "path_in", type=click.Path(exists=True, dir_okay=False, path_type=pathlib.Path) +) +def _tuewas_all(path_in): + """Generate all outputs for TUEWAS.""" + from zipfile import ZipFile + + from .metadata import report + from .metadata.spreadsheet import read_workbook + + ref_areas = "CN ID IN PH TH VN".split() + + mds, _ = read_workbook(path_in.resolve()) + + dir_out = pathlib.Path.cwd().joinpath("output") + path_out = [] + + for ref_area in ref_areas: + path_out.append(dir_out.joinpath(ref_area, "Summary.odt")) + path_out[-1].parent.mkdir(parents=True, exist_ok=True) + report.MetadataSet1ODT(mds, ref_area=ref_areas[0]).write_file(path_out[-1]) + print(f"Wrote {path_out[-1]}") + + path_out.append(dir_out.joinpath("Metadata summary.odt")) + report.MetadataSet0ODT(mds).write_file(path_out[-1]) + print(f"Wrote {path_out[-1]}") + + path_out.append(dir_out.joinpath("Metadata summary table.html")) + report.MetadataSet2HTML(mds, ref_area=ref_areas).write_file(path_out[-1]) + print(f"Wrote {path_out[-1]}") + + path_zip = dir_out.joinpath("all.zip") + with ZipFile(path_zip, mode="w") as zf: + for p in path_out: + zf.write(p, str(p.relative_to(dir_out))) + + print(f"Wrote {path_zip}") @main.command("template") From c860db1b330abd37d606d760b1f0ff81441d564e Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 7 Oct 2024 21:45:06 +0200 Subject: [PATCH 31/34] Test "org {read,summarize,tuewas}" CLI commands --- transport_data/testing.py | 14 +++++ transport_data/tests/org/test_metadata.py | 72 +++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/transport_data/testing.py b/transport_data/testing.py index 82d8ccb..ae3f84e 100644 --- a/transport_data/testing.py +++ b/transport_data/testing.py @@ -1,6 +1,7 @@ from pathlib import Path from typing import Generator, cast +import click.testing import pytest import sdmx.message import sdmx.model.v21 as m @@ -57,6 +58,19 @@ def sdmx_structures(tmp_store) -> sdmx.message.StructureMessage: return sm +class CliRunner(click.testing.CliRunner): + def invoke(self, *args, **kwargs): + import transport_data.cli + + return super().invoke(transport_data.cli.main, *args, **kwargs) + + +@pytest.fixture +def tdc_cli(): + """A :class:`.CliRunner` object that invokes the :program:`tdc` CLI.""" + yield CliRunner() + + @pytest.fixture(scope="session") def test_data_path() -> Generator[Path, None, None]: """Path containing test data.""" diff --git a/transport_data/tests/org/test_metadata.py b/transport_data/tests/org/test_metadata.py index 0b8ad29..e4ffbbf 100644 --- a/transport_data/tests/org/test_metadata.py +++ b/transport_data/tests/org/test_metadata.py @@ -1,4 +1,5 @@ from functools import partial +from pathlib import Path import pytest @@ -22,6 +23,19 @@ def test_make_workbook(tmp_path) -> None: make_workbook() +def test_make_workbook_cli(tmp_path, tdc_cli) -> None: + with tdc_cli.isolated_filesystem(temp_dir=tmp_path) as td: + # Expected output path + exp = Path(td, "sample.xlsx") + result = tdc_cli.invoke(["org", 
"template"]) + + # Command ran without error + assert 0 == result.exit_code + + # Expected file was generated + assert exp.exists() + + @pytest.fixture(scope="module") def example_metadata(test_data_path): return read_workbook(test_data_path.joinpath("metadata-input.xlsx")) @@ -113,3 +127,61 @@ def test_write_file(self, tmp_path, example_metadata) -> None: # Output was created assert path.exists() + + +def test_read_cli(tmp_path, tdc_cli, test_data_path) -> None: + path_in = test_data_path.joinpath("metadata-input.xlsx") + + result = tdc_cli.invoke(["org", "read", str(path_in)]) + + # Command ran without error + assert 0 == result.exit_code, result.output + + assert "MEASURE: 39 unique values" in result.output + + +def test_refresh_cli(tdc_cli) -> None: + result = tdc_cli.invoke(["org", "refresh"]) + + # Command ran without error + assert 0 == result.exit_code, result.output + + +def test_summarize_cli(tmp_path, tdc_cli, test_data_path) -> None: + path_in = test_data_path.joinpath("metadata-input.xlsx") + + # --ref-area= single value + with tdc_cli.isolated_filesystem(temp_dir=tmp_path) as td: + path_out = Path(td, "VN") + result = tdc_cli.invoke(["org", "summarize", "--ref-area=VN", str(path_in)]) + + # Command ran without error + assert 0 == result.exit_code, result.output + # Output files are generated + assert path_out.with_suffix(".html").exists() + assert path_out.with_suffix(".odt").exists() + + # --ref-area= multiple values + with tdc_cli.isolated_filesystem(temp_dir=tmp_path) as td: + path_out = Path(td, "all") + result = tdc_cli.invoke(["org", "summarize", "--ref-area=TH,VN", str(path_in)]) + + # Command ran without error + assert 0 == result.exit_code, result.output + # Output files are generated + assert path_out.with_suffix(".html").exists() + assert path_out.with_suffix(".odt").exists() + + +def test_tuewas_cli(tmp_path, tdc_cli, test_data_path) -> None: + path_in = test_data_path.joinpath("metadata-input.xlsx") + with tdc_cli.isolated_filesystem(temp_dir=tmp_path) as td: + dir_out = Path(td, "output") + result = tdc_cli.invoke(["org", "tuewas", str(path_in)]) + + # Command ran without error + assert 0 == result.exit_code, result.output + # Expected files were generated + assert dir_out.joinpath("Metadata summary.odt").exists() + assert dir_out.joinpath("Metadata summary table.html").exists() + assert dir_out.joinpath("CN", "Summary.odt").exists() From d976e9f8039431f9fce5ab2c1f3f11d9184c37a3 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 7 Oct 2024 21:59:26 +0200 Subject: [PATCH 32/34] Exclude NotImplemented function bodies from coverage --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 4ad8543..1c64255 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ exclude_also = [ "if TYPE_CHECKING:", # Abstract methods '\.\.\.', + "raise NotImplementedError", ] [[tool.mypy.overrides]] From fdd3673ec79228ea0266015399370346869e969d Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 7 Oct 2024 22:12:56 +0200 Subject: [PATCH 33/34] Write output in UTF-8 from CLI commands --- transport_data/org/cli.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index b4c4f70..20a3003 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -70,7 +70,7 @@ def summarize(path_in: "pathlib.Path", path_out: Optional["pathlib.Path"], ref_a path_out = pathlib.Path.cwd().joinpath(f"{ref_areas[0]}.{{html,odt}}") 
print(f"Write to {path_out}") report.MetadataSet1HTML(mds, ref_area=ref_areas[0]).write_file( - path_out.with_suffix(".html") + path_out.with_suffix(".html"), encoding="utf-8" ) report.MetadataSet1ODT(mds, ref_area=ref_areas[0]).write_file( path_out.with_suffix(".odt") @@ -82,7 +82,7 @@ def summarize(path_in: "pathlib.Path", path_out: Optional["pathlib.Path"], ref_a print(f"Write to {path_out}") report.MetadataSet0ODT(mds).write_file(path_out.with_suffix(".odt")) report.MetadataSet2HTML(mds, ref_area=ref_areas).write_file( - path_out.with_suffix(".html") + path_out.with_suffix(".html"), encoding="utf-8" ) @@ -115,7 +115,9 @@ def _tuewas_all(path_in): print(f"Wrote {path_out[-1]}") path_out.append(dir_out.joinpath("Metadata summary table.html")) - report.MetadataSet2HTML(mds, ref_area=ref_areas).write_file(path_out[-1]) + report.MetadataSet2HTML(mds, ref_area=ref_areas).write_file( + path_out[-1], encoding="utf-8" + ) print(f"Wrote {path_out[-1]}") path_zip = dir_out.joinpath("all.zip") From 78f18c7eb1b1b665955c2b1d882a231f60ba41de Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 8 Oct 2024 14:16:29 +0200 Subject: [PATCH 34/34] Expand docs for #21; add to whatsnew --- doc/api.in | 1 + doc/conf.py | 12 ++ doc/giz.rst | 4 +- doc/index.rst | 2 + doc/report.rst | 4 + doc/roadmap.rst | 5 +- doc/whatsnew.rst | 8 ++ transport_data/org/__init__.py | 1 + transport_data/org/cli.py | 3 + transport_data/org/metadata/__init__.py | 3 + transport_data/org/metadata/report.py | 124 ++++++++++++++++----- transport_data/org/metadata/spreadsheet.py | 2 + transport_data/util/docutils.py | 2 + transport_data/util/hooks.py | 2 + transport_data/util/jinja2.py | 2 + transport_data/util/pluggy.py | 2 + transport_data/util/pycountry.py | 5 + 17 files changed, 152 insertions(+), 30 deletions(-) create mode 100644 doc/report.rst diff --git a/doc/api.in b/doc/api.in index 98af50b..bb165c8 100644 --- a/doc/api.in +++ b/doc/api.in @@ -14,6 +14,7 @@ oica org proto + report store testing tests diff --git a/doc/conf.py b/doc/conf.py index 639936a..3db10ba 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -33,6 +33,15 @@ nitpicky = True exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +# A string of reStructuredText included at the beginning of every source file. +rst_prolog = r""" +.. role:: py(code) + :language: python + +.. role:: xml(code) + :language: xml +""" + def setup(app: "sphinx.application.Sphinx"): from sphinx.ext.autosummary.generate import generate_autosummary_docs @@ -72,8 +81,11 @@ def setup(app: "sphinx.application.Sphinx"): intersphinx_mapping = { "click": ("https://click.palletsprojects.com/en/8.1.x/", None), + "docutils": ("https://sphinx-docutils.readthedocs.io/en/latest", None), + "jinja2": ("https://jinja.palletsprojects.com/en/3.1.x", None), "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), "platformdirs": ("https://platformdirs.readthedocs.io/en/latest/", None), + "pluggy": ("https://pluggy.readthedocs.io/en/stable", None), "pooch": ("https://www.fatiando.org/pooch/latest/", None), "py": ("https://docs.python.org/3/", None), "pytest": ("https://docs.pytest.org/en/stable/", None), diff --git a/doc/giz.rst b/doc/giz.rst index aa0bb12..2394a49 100644 --- a/doc/giz.rst +++ b/doc/giz.rst @@ -5,8 +5,10 @@ GIZ GmbH (`website `_, lit. *Corporation for Internation It is not currently a direct provider of (meta)data through TDC, but its members initiated what is now the TDCI and support its activities, including development of this :mod:`.transport_data` package. 
This work mainly appears in the :mod:`.org` and :mod:`.proto` modules. +.. _project-tuewas: + TUEWAS ====== - “Transport, Environment, Energy, and Water in Asia” is an “internal sector network” of GIZ. -- Website: https://tuewas-asia.org/ +- Website: https://tuewas-asia.org diff --git a/doc/index.rst b/doc/index.rst index a63651d..0e186dc 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -84,6 +84,7 @@ The following modules contain *generic* code and utilities usable with (meta)dat cli config + report store testing tests @@ -91,6 +92,7 @@ The following modules contain *generic* code and utilities usable with (meta)dat - :mod:`~transport_data.cli`: :doc:`cli` - :mod:`.config`: :doc:`config` +- :mod:`.report`: :doc:`report` - :mod:`.store`: :doc:`store` - :mod:`.testing`: :doc:`testing` - :mod:`.tests`: :doc:`tests` diff --git a/doc/report.rst b/doc/report.rst new file mode 100644 index 0000000..89c02a9 --- /dev/null +++ b/doc/report.rst @@ -0,0 +1,4 @@ +Reports +********* + +.. include:: _api/transport_data.report.rst diff --git a/doc/roadmap.rst b/doc/roadmap.rst index 282460c..4f052c7 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -2,4 +2,7 @@ Roadmap ******* This page gives a medium- and long-term overview of future development of :mod:`transport_data`, focused on the tools in this package, but with relevant details about the broader TDC and TDCI. -See also `transport-data/projects/1 `_ on GitHub. +See also: + +- GitHub project `transport-data/projects/1 `_ for general TDCI development. +- GitHub project `transport-data/projects/3 `_ for deployment of CKAN (2024). diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst index 08b105f..7fa748d 100644 --- a/doc/whatsnew.rst +++ b/doc/whatsnew.rst @@ -4,6 +4,14 @@ What's new Next release ============ +- Add tools and data for the :ref:`project-tuewas` project (:pull:`21`). + + - Add :mod:`.metadata.spreadsheet`, :mod:`.metadata.report` submodules; expand :mod:`.metadata`. + - Add :program:`tdc org read`, :program:`tdc org summarize`, :program:`tdc org tuewas` CLI commands. + +- Add :class:`.report.Report`, a base class for generating ‘reports’ (documents derived from SDMX (meta)data) and supporting code in :mod:`.util.docutils`, :mod:`.util.jinja2` (:pull:`21`). +- Adopt :mod:`pluggy <.util.pluggy>` for plug-in hooks and implementations (:pull:`21`); use the :func:`.hooks.get_agencies` hook across existing modules. +- Add :func:`.tdc_cli`, :func:`.test_data_path` test fixtures (:pull:`21`). - Python 3.8 support is dropped (:pull:`21`), as it has reached end-of-life. - Add :mod:`.ipcc` (:doc:`ipcc`) module (:issue:`15`, :pull:`21`). - Add :doc:`standards` and :doc:`roadmap` documentation pages (:pull:`9`). diff --git a/transport_data/org/__init__.py b/transport_data/org/__init__.py index 841cc03..2359eaa 100644 --- a/transport_data/org/__init__.py +++ b/transport_data/org/__init__.py @@ -15,6 +15,7 @@ @hookimpl def get_agencies() -> "sdmx.model.v21.Agency": + """Return agencies and organizations including and subsidiary to TDCI itself.""" # Agency a1 = m.Agency( id="TDCI", diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index 20a3003..ceb69d2 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -1,5 +1,8 @@ """CLI for :mod:`.org`. +Use the :program:`--help` command-line option to see help for individual commands +defined in this module. + .. 
runblock:: console $ tdc org --help diff --git a/transport_data/org/metadata/__init__.py b/transport_data/org/metadata/__init__.py index db6f803..d7dee58 100644 --- a/transport_data/org/metadata/__init__.py +++ b/transport_data/org/metadata/__init__.py @@ -1,3 +1,5 @@ +"""Handle TDC-structured metadata.""" + import itertools import logging import re @@ -174,6 +176,7 @@ def get_cs_common() -> "common.ConceptScheme": def get_msd() -> "v21.MetadataStructureDefinition": + """Generate and return the TDC metadata structure definition.""" from transport_data import STORE from transport_data.org import get_agencyscheme diff --git a/transport_data/org/metadata/report.py b/transport_data/org/metadata/report.py index a25e9b7..7a53ebd 100644 --- a/transport_data/org/metadata/report.py +++ b/transport_data/org/metadata/report.py @@ -1,3 +1,21 @@ +"""Generate reports about TDC-structured metadata. + +:class:`.Report` subclasses in this file **should** have names like:: + + { Type }{ ID }{ Format } + +…wherein: + +- ``{ Type }`` refers to the type of object(s) from the SDMX Information Model that + is/are represented in the report. + Usually the first argument to the :py:`__init__()` method is an instance of this type. +- ``{ ID }`` is a number that distinguishes different ‘kinds’ of reports for the same + ``{ Type }``. + Report classes with the same ``{ Type }{ ID }`` **should** display roughly the same + information in the same order and layout, regardless of ``{ Format }``. +- ``{ Format }`` is the output format or file type. +""" + from dataclasses import dataclass from functools import partial from typing import TYPE_CHECKING @@ -11,9 +29,15 @@ @dataclass class MetadataAttribute0Plain(Report): - """Summarize unique values appearing in `mds` for attribute `mda_id`.""" + """Unique values appearing in `mds` for metadata attribute `mda_id`. + + Each unique value is shown with the IDs of the data flows that contain the value for + `mda_id`. + """ + #: Metadata set to report. mds: "v21.MetadataSet" + #: ID of a Metadata Attribute to report. mda_id: str def render(self) -> str: @@ -34,10 +58,16 @@ def render(self) -> str: @dataclass class MetadataAttribute0RST(Report): - """Summarize unique values appearing in `mds` for attribute `mda_id`.""" + """Unique values appearing in `mds` for the metadata attribute `mda_id`. + Same as :class:`.MetadataAttribute0Plain`, but in reStructuredText. + """ + + #: Jinja2 reStructuredText template. template_name = "metadata-attribute-0.rst" + #: Metadata set to report. mds: "v21.MetadataSet" + #: ID of a Metadata Attribute to report. mda_id: str def render(self) -> str: @@ -52,6 +82,9 @@ def render(self) -> str: @dataclass class MetadataReport0HTML(Report): + """Same as :class:`.MetadataReport0Plain`, but in HTML.""" + + #: Metadata report to report. mdr: "v21.MetadataReport" def render(self) -> str: @@ -60,6 +93,15 @@ def render(self) -> str: @dataclass class MetadataReport0Plain(Report): + """Contents of a single metadata report. + + This includes: + + 1. The reported attribute values for all metadata attributes. + 2. The data flow that is targeted by the report and its dimensions. + """ + + #: Metadata report to report. mdr: "v21.MetadataReport" def render(self) -> str: @@ -100,9 +142,15 @@ def render(self) -> str: @dataclass class MetadataSet0ODT(Report): - """Print a summary of the contents of `mds`.""" + """Summary of the unique reported attribute values in `mds`. 
+ + Similar to :class:`.MetadataSet0Plain`, but also including: + + - The unique dimension concepts appearing in the data structure definitions. + """ template_name = "metadata-set-0.rst" + #: Metadata set to report. mds: "v21.MetadataSet" def render(self) -> bytes: @@ -124,8 +172,15 @@ def render(self) -> bytes: @dataclass class MetadataSet0Plain(Report): - """Print a summary of the contents of `mds`.""" + """Summary of the unique reported attribute values in `mds`. + + This includes: + - Unique values of the metadata attributes ``MEASURE``, ``DATA_PROVIDER``, and + ``UNIT_MEASURE``. + """ + + #: Metadata set to report. mds: "v21.MetadataSet" def render(self) -> str: @@ -144,7 +199,7 @@ def render(self) -> str: @dataclass class MetadataSet1HTML(Report): - """Generate a summary report in HTML.""" + """Metadata reports related to `ref_area`.""" template_name = "metadata-set-1.html" @@ -166,31 +221,12 @@ def render(self) -> str: @dataclass -class MetadataSet2HTML(Report): - """Generate a summary report in HTML.""" - - template_name = "metadata-set-2.html" - - #: Metadata set to summarize. - mds: "v21.MetadataSet" - #: Geography. - ref_area: list[str] - - def render(self) -> str: - from transport_data.org.metadata import contains_data_for - - data = { - mdr.attaches_to.key_values["DATAFLOW"].obj.id: { # type: ignore [union-attr] - ra: contains_data_for(mdr, ra) for ra in self.ref_area - } - for mdr in self.mds.report - } - - return self.render_jinja_template(ref_area=self.ref_area, data=data) +class MetadataSet1ODT(Report): + """Metadata reports related to `ref_area`. + Same as :class:`.MetadataSet1HTML` but as OpenDocument Text. + """ -@dataclass -class MetadataSet1ODT(Report): template_name = "metadata-set-1.rst" #: Metadata set to summarize. @@ -216,3 +252,35 @@ def render(self) -> bytes: # Convert reStructuredText → OpenDocumentText return self.rst2odt(rst_source) + + +@dataclass +class MetadataSet2HTML(Report): + """Table of metadata reports. + + This table has: + + - One *column* per value in `ref_areas`. + - One *row* per metadata report in `mds`. + - A check-mark in cells where :func:`.contains_data_for` indicates that the metadata + report targets a data flow that contains data for the reference area. + """ + + template_name = "metadata-set-2.html" + + #: Metadata set to summarize. + mds: "v21.MetadataSet" + #: Geographies to show. 
+ ref_area: list[str] + + def render(self) -> str: + from transport_data.org.metadata import contains_data_for + + data = { + mdr.attaches_to.key_values["DATAFLOW"].obj.id: { # type: ignore [union-attr] + ra: contains_data_for(mdr, ra) for ra in self.ref_area + } + for mdr in self.mds.report + } + + return self.render_jinja_template(ref_area=self.ref_area, data=data) diff --git a/transport_data/org/metadata/spreadsheet.py b/transport_data/org/metadata/spreadsheet.py index 624a25b..49464ec 100644 --- a/transport_data/org/metadata/spreadsheet.py +++ b/transport_data/org/metadata/spreadsheet.py @@ -1,3 +1,5 @@ +"""Non-standard TDC Excel file format for collecting metadata.""" + import logging import re from typing import TYPE_CHECKING, List, Optional, Tuple diff --git a/transport_data/util/docutils.py b/transport_data/util/docutils.py index d233035..c574046 100644 --- a/transport_data/util/docutils.py +++ b/transport_data/util/docutils.py @@ -1,3 +1,5 @@ +"""Utilities for :mod:`docutils`.""" + from pathlib import Path from zipfile import BadZipFile, ZipFile diff --git a/transport_data/util/hooks.py b/transport_data/util/hooks.py index 93b15c4..626c3f6 100644 --- a/transport_data/util/hooks.py +++ b/transport_data/util/hooks.py @@ -1,3 +1,5 @@ +"""Plug-in hooks to be implemented by submodules and other packages.""" + from typing import TYPE_CHECKING, Iterable import pluggy diff --git a/transport_data/util/jinja2.py b/transport_data/util/jinja2.py index b2fa892..9e0f1d5 100644 --- a/transport_data/util/jinja2.py +++ b/transport_data/util/jinja2.py @@ -1,3 +1,5 @@ +"""Utilities for :mod:`jinja2`.""" + from functools import lru_cache diff --git a/transport_data/util/pluggy.py b/transport_data/util/pluggy.py index f4d593a..3262a89 100644 --- a/transport_data/util/pluggy.py +++ b/transport_data/util/pluggy.py @@ -1,3 +1,5 @@ +"""Utilities for :mod:`pluggy`.""" + from importlib import import_module import pluggy diff --git a/transport_data/util/pycountry.py b/transport_data/util/pycountry.py index 3a34e9e..59188c2 100644 --- a/transport_data/util/pycountry.py +++ b/transport_data/util/pycountry.py @@ -1,3 +1,8 @@ +"""Utilities for working with pycountry_. + +.. _pycountry: https://pypi.org/project/pycountry/ +""" + #: Mapping from country name forms appearing in data to values recognized by #: :meth:`pycountry.countries.lookup`. NAME_MAP = {