'{{ report | dfd_id }}'
^{{ heading }}>
+{{ summarize_dataflow(dataflow) }}
+{% for metadata_concept in ["DATA_PROVIDER", "URL", "MEASURE", "UNIT_MEASURE", "DATA_DESCR", "COMMENT"] %}
+{% set ra_value, mda = get_reported_attribute(report, metadata_concept) %}
+{% if not mda %}{% continue %}{% endif %}
+{{ mda.concept_identity.name }}:
+{# Choose how to render the reported value for the current metadata attribute. #}
+{% if metadata_concept == "DATA_PROVIDER" %}
+{{ ra_value }}
+{% elif metadata_concept == "URL" %}
+{{ ra_value}}
+{# Free-text attributes that may span several lines. The IDs here must match the #}
+{# ones iterated by the enclosing loop ("COMMENT", not "COMMENTS"). #}
+{# NOTE(review): the replacement string below appears to have lost its markup #}
+{# (presumably a line-break tag); confirm against the original template. #}
+{% elif metadata_concept in ("DATA_DESCR", "COMMENT") %}
+
{{ ra_value.replace('\n', '
') | safe }}
+{% else %}
+{{ ra_value }}
+{% endif %}
+
Direct links:
+ {{ matched | length }} data flows containing data on {{ ref_area }}:
+ {% for mdr in matched %}{{ mdr | dfd_id }}{{ ", " if not loop.last }}{% endfor %}
+
+ {{ no_match | length }} other data flows:
+ {% for mdr in no_match %}{{ mdr | dfd_id }}{{ ", " if not loop.last }}{% endfor %}
+
These data flows are explicitly marked as containing data pertaining to the country.
+ {% for mdr in matched %} + {{ summarize_metadatareport(mdr) }} + {% endfor %} +These data flows are not explicitly identified as containing data on the country. + This doesn't completely rule out that they may contain such data, but this is less likely and would require further investigation and inspection.
+ {% for mdr in no_match %} + {{ summarize_metadatareport(mdr) }} + {% endfor %} + + diff --git a/transport_data/data/tests/metadata-input.xlsx b/transport_data/data/tests/metadata-input.xlsx new file mode 100644 index 0000000..df2dbce --- /dev/null +++ b/transport_data/data/tests/metadata-input.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:380dfcd70698fc387020074845a127f23cbca2746ae66cc9213603185b4dbbe6 +size 100078 diff --git a/transport_data/iamc/__init__.py b/transport_data/iamc/__init__.py index 21a385d..5d2e429 100644 --- a/transport_data/iamc/__init__.py +++ b/transport_data/iamc/__init__.py @@ -11,15 +11,19 @@ import sdmx.model.v21 as m from sdmx.message import StructureMessage +from transport_data.util.pluggy import hookimpl + log = logging.getLogger(__name__) -def get_agency(): - return m.Agency( +@hookimpl +def get_agencies(): + a = m.Agency( id="IAMC", name="Integrated Assessment Modeling Consortium", contact=[m.Contact(uri=["https://iamconsortium.org"])], ) + return (a,) def common_structures(): @@ -31,7 +35,7 @@ def common_structures(): with id "IAMC", containing the concepts for the IAMC dimensions and attribute. 
""" cs = m.ConceptScheme( - id="IAMC", name="Concepts in the IAMC data model", maintainer=get_agency() + id="IAMC", name="Concepts in the IAMC data model", maintainer=get_agencies()[0] ) cs.extend( diff --git a/transport_data/ipcc/__init__.py b/transport_data/ipcc/__init__.py new file mode 100644 index 0000000..bdee3de --- /dev/null +++ b/transport_data/ipcc/__init__.py @@ -0,0 +1 @@ +"""Intergovernmental Panel on Climate Change metadata provider.""" diff --git a/transport_data/ipcc/structure.py b/transport_data/ipcc/structure.py new file mode 100644 index 0000000..12c4aef --- /dev/null +++ b/transport_data/ipcc/structure.py @@ -0,0 +1,280 @@ +"""IPCC structural metadata.""" + +from typing import TYPE_CHECKING + +from transport_data.util.pluggy import hookimpl + +if TYPE_CHECKING: + import sdmx.model.common + + +@hookimpl +def get_agencies(): + """Return the IPCC :class:`.Agency`.""" + from sdmx.model import v21 + + a = v21.Agency( + id="IPCC", + name="Intergovernmental Panel on Climate Change", + description="https://www.ipcc.ch/", + ) + return (a,) + + +def gen_cl_T311(**kwargs) -> "sdmx.model.Common.Codelist": + """Generate a code list from the GNGGI, Volume 2, Table 3.1.1. + + The generated code list's URN ends with ``Codelist=TDCI:CL_IPCC_2006_V2_T3.1.1(…)``. + + .. todo:: Expand to include 'Explanation' text from the table as descriptions for + codes. + + .. todo:: Include internationalized texts (names, descriptions) from the Arabic, + Chinese, French, Russian, and/or Spanish versions of the documents. 
+ """ + from sdmx.model.common import Code, Codelist + + cl = Codelist( + id="CL_IPCC_2006_V2_T3.1.1", + name="Detailed sector split for the Transport sector", + description="""Transcribed from 2006 IPCC Guidelines for National Greenhouse Gas Inventories — Volume 2: Energy — Chapter 3: Mobile Combustion — Table 3.1.1, using the file https://www.ipcc-nggip.iges.or.jp/public/2006gl/pdf/2_Volume2/V2_3_Ch3_Mobile_Combustion.pdf, as linked from https://www.ipcc-nggip.iges.or.jp/public/2006gl/vol2.html. + +This version includes the 'Explanation' text from the table as the description for individual codes, but at the moment only for the code 1 A 3. For others, see the source document.""", + **kwargs, + ) + + # The codes have well-formed, hierarchical IDs, so it is possible to infer the ID of + # the parent code, if it exists. + def _c(id_, name, description=None): + """Shorthand for adding to `cl`.""" + try: + parent = cl[" ".join(id_.split()[:-1])] + except KeyError: + parent = None + + cl.append(Code(id=id_, name=name, description=description, parent=parent)) + + _c( + "1 A 3", + "TRANSPORT", + """Emissions from the combustion and evaporation of fuel for all transport activity (excluding military transport), regardless of the sector, specified by sub-categories below. 
+ +Emissions from fuel sold to any air or marine vessel engaged in international transport (1 A 3 a i and 1 A 3 d i) should as far as possible be excluded from the totals and subtotals in this category and should be reported separately.""", + ) + _c("1 A 3 a", "Civil Aviation") + _c("1 A 3 a i", "International Aviation (International Bunkers)") + _c("1 A 3 a ii", "Domestic Aviation") + _c("1 A 3 b", "Road Transportation") + _c("1 A 3 b i", "Cars") + _c("1 A 3 b i 1", "Passenger cars with 3-way catalysts") + _c("1 A 3 b i 2", "Passenger cars without 3-way catalysts") + _c("1 A 3 b ii", "Light duty trucks") + _c("1 A 3 b ii 1", "Light-duty trucks with 3-way catalysts") + _c("1 A 3 b ii 2", "Light-duty trucks without 3-way catalysts") + _c("1 A 3 b iii", "Heavy duty trucks and buses") + _c("1 A 3 b iv", "Motorcycles") + _c("1 A 3 b v", "Evaporative emissions from vehicles") + _c("1 A 3 b vi", "Urea-based catalysts") + _c("1 A 3 c", "Railways") + _c("1 A 3 d", "Water-borne Navigation") + _c("1 A 3 d i", "International water-borne navigation (International bunkers)") + _c("1 A 3 d ii", "Domestic water-borne Navigation") + _c("1 A 3 e", "Other Transportation") + _c("1 A 3 e i", "Pipeline Transport") + _c("1 A 3 e ii", "Off-road") + _c("1 A 4 c iii", "Fishing (mobile combustion)") + _c("1 A 5 a", "Non specified stationary") + _c("1 A 5 b", "Non specified mobile") + + return cl + + +def gen_cs_ch3(**kwargs) -> "sdmx.model.common.ConceptScheme": + """Generate a scheme of concepts included in equations in Chapter 3. + + The generated code list's URN ends with + ``ConceptScheme=TDCI:CS_IPCC_2006_V2_CH3(…)``. + + .. todo:: Include concepts used as table dimensions. + + .. todo:: Include internationalized texts (names, descriptions) from the Arabic, + Chinese, French, Russian, and/or Spanish versions of the documents. 
+ """ + from sdmx.model.common import Annotation, Concept, ConceptScheme + + cs = ConceptScheme( + id="CS_IPCC_2006_V2_CH3", name="Concepts appearing in equations", **kwargs + ) + + equation, page = "", "" + + def _c(id_, name=None, units=None, description=None): + c = Concept( + id=id_, + name=name, + description=f"First appears in Equation {equation} on p.{page}", + ) + + if units: + c.annotations.append(Annotation(id="preferred-units", text=units)) + cs.append(c) + + # §3.2 Road transportation + + equation, page = "3.2.1", "3.12" + _c( + "EMI 1", + "Emissions", + "kg", + """Variously "Emissions of CO₂" (Eq. 3.2.1), or of varying species (Eq. 3.2.3, 3.2.5)""", + ) + _c("Fuel 1", "Fuel sold", "TJ") + _c("EF 1", "Emission factor", "kg/TJ") + _c( + "a", + "Type of fuel (e.g. petrol, diesel, natural gas, lpg)", + None, + "In Eq 3.2.6, 'j' is used for the same concept.", + ) + + equation, page = "3.2.2", "3.12" + _c( + "EMI 2", + "CO₂ Emissions from urea-based additive in catalytic converters", + "Gg CO₂", + ) + _c( + "Activity", + "amount of urea-based additive consumed for use in catalytic converters", + "Gg", + ) + _c( + "Purity", + "the mass fraction (=percentage divided by 100) of urea in the urea-based additive", + ) + + # Eq. 3.2.3 —same concepts as 3.2.1 + + equation, page = "3.2.4", "3.13" + _c( + "Fuel 2", + "fuel consumed (as represented by fuel sold) for a given mobile source activity", + "TJ", + ) + _c( + "b", + "vehicle type", + None, + "In Eq 3.2.6, 'i' is used for the same concept (e.g., car, bus)", + ) + _c( + "c", + "emission control technology (such as uncontrolled, catalytic converter, etc.)", + ) + + equation, page = "3.2.5", "3.15" + _c("EF 2", "emission factor", "kg / km") + _c( + "Distance 1", + "distance travelled during thermally stabilized engine operation phase for a given mobile source activity", + "km", + ) + _c("C", "emissions during warm-up phase (cold start)", "kg") + _c( + "d", + "operating conditions (e.g. 
urban or rural road type, climate, or other environmental factors)", + ) + + equation, page = "3.2.6", "3.26" + _c( + "Estimated fuel", + "total estimated fuel use estimated from distance travelled (VKT) data", + "litre", + ) + _c("Vehicles", "number of vehicles of type i and using fuel j on road type t") + _c( + "Distance 2", + "annual kilometres travelled per vehicle of type i and using fuel j on road type t", + "km", + ) + _c("t", "type of road (e.g., urban, rural)") + + # §3.3 Off-road transportation + # Eq. 3.3.1 —no additional concepts + # Eq. 3.3.2 —no additional concepts + + equation, page = "3.3.3", "3.34" + _c( + "N", + "source population", + None, + """In Eq. 3.4.3 this is given as 'number of locomotives of type i".""", + ) + # Ditto below, all used in Eq. 3.4.3 + _c("H", "annual hours of use of vehicle i", "hour") + _c("P", "average rated power of vehicle i", "kW") + _c("LF", "typical load factor of vehicle i (fraction between 0 and 1)") + _c("EF 3", "average emission factor for use of fuel j in vehicle i", "kg / kWh") + + # Eq. 3.3.4 —no additional concepts + + # §3.4 Railways + # Eq. 3.4.1 —no additional concepts + # Eq. 3.4.2 —no additional concepts + + equation, page = "3.4.3", "3.42" + _c("i", "locomotive type and journey type") + + equation, page = "3.4.4", "3.43" + _c("EF 4", "engine specific emission factor for locomotive of type i", "kg/TJ") + _c("PWF", "pollutant weighting factor for locomotive of type i", "dimensionless") + _c("EF 5", "default emission factor for diesel (applies to CH₄, N₂O)", "kg/TJ") + + # §3.6 Civil Aviation + # Eq. 
3.6.1 —no additional concepts + + equation, page = "3.6.2", "3.59" + _c( + "Emissions.LTO", + "", + None, + """'LTO' is defined on p.3.56 as "Landing/Take-Off cycle".""", + ) + _c( + "Emissions.Cruise", + "", + None, + """'Cruise' is defined on p.3.56 in contrast with 'LTO'.""", + ) + + equation, page = "3.6.3", "3.59" + _c("Number of LTOs") + _c("EF.LTO") + + equation, page = "3.6.4", "3.59" + _c("Fuel consumption.LTO") + _c("Fuel consumption per LTO") + + equation, page = "3.6.5", "3.59" + _c("Total Fuel Consumption") + _c("EF.Cruise") + + return cs + + +def gen_structures() -> None: + """Create or update IPCC-maintained structural metadata.""" + from transport_data import STORE, org + + def _make_id(value: str) -> str: + return f"{get_agencies()[0].id}_{value}" + + ma_args = dict( + maintainer=org.get_agencies()[0], + version="0.1", + is_final=True, + is_external_reference=False, + ) + + STORE.setdefault(gen_cl_T311(**ma_args)) + STORE.setdefault(gen_cs_ch3(**ma_args)) diff --git a/transport_data/jrc/__init__.py b/transport_data/jrc/__init__.py index d1cdcd9..6b5343b 100644 --- a/transport_data/jrc/__init__.py +++ b/transport_data/jrc/__init__.py @@ -24,11 +24,13 @@ import sdmx.model.v21 as m from transport_data import STORE as registry +from transport_data.util.pluggy import hookimpl from transport_data.util.pooch import Pooch from transport_data.util.sdmx import anno_generated -def get_agency() -> m.Agency: +@hookimpl +def get_agencies(): """Return information about the agency providing the data set. See :func:`.org.get_agencyscheme`. 
@@ -46,7 +48,7 @@ def get_agency() -> m.Agency: m.Contact(name="Jacopo Tattini", email=["Jacopo.TATTINI@ec.europa.eu"]) ) - return a + return (a,) BASE_URL = ( @@ -448,7 +450,7 @@ def convert(geo): registry.write(obj) # Write code lists, measure concept scheme to file - a = get_agency() + a = get_agencies()[0] for obj in chain(CL.values(), [CS_MEASURE]): obj.maintainer = a obj.version = "0.1.0" @@ -468,12 +470,12 @@ def prepare(measure_concept, dims): # NB here we set ADB as the maintainer. Precisely, ADB establishes the data # structure, but TDCI is maintaining the SDMX representation of it. dsd = m.DataStructureDefinition( - id=measure_id, maintainer=get_agency(), version="0.0.0" + id=measure_id, maintainer=get_agencies()[0], version="0.0.0" ) anno_generated(dsd) dfd = m.DataflowDefinition( - id=measure_id, maintainer=get_agency(), version="0.0.0", structure=dsd + id=measure_id, maintainer=get_agencies()[0], version="0.0.0", structure=dsd ) pm = m.PrimaryMeasure(id="OBS_VALUE", concept_identity=c) diff --git a/transport_data/jrc/cli.py b/transport_data/jrc/cli.py index 7744297..80cf86e 100644 --- a/transport_data/jrc/cli.py +++ b/transport_data/jrc/cli.py @@ -18,7 +18,7 @@ from . 
import GEO, convert, fetch -@click.group("jrc", help=__doc__.splitlines()[0]) +@click.group("jrc") def main(): """EU Joint Research Center (JRC) provider.""" diff --git a/transport_data/oica/__init__.py b/transport_data/oica/__init__.py index 61cd027..55fe7e0 100644 --- a/transport_data/oica/__init__.py +++ b/transport_data/oica/__init__.py @@ -20,6 +20,7 @@ import pandas as pd +from transport_data.util.pluggy import hookimpl from transport_data.util.pooch import Pooch if TYPE_CHECKING: @@ -165,7 +166,9 @@ def convert_single_file( # Prepare a GEO codelist and map using the "GEO" column cl_geo = get_cl_geo() - geo_map = _make_geo_codes(cl_geo, df["GEO"], maintainer=get_agency(), version="0.1") + geo_map = _make_geo_codes( + cl_geo, df["GEO"], maintainer=get_agencies()[0], version="0.1" + ) # Store `cl_geo` STORE.write(cl_geo) @@ -327,16 +330,17 @@ def _make_code(value: str): return id_for_name -@lru_cache -def get_agency() -> "sdmx.model.common.Agency": +@hookimpl +def get_agencies(): """Return the OICA Agency.""" from sdmx.model import v21 - return v21.Agency( + a = v21.Agency( id="OICA", name="International Organization of Motor Vehicle Manufacturers", description="https://www.oica.net", ) + return (a,) def get_cl_geo() -> "sdmx.model.common.Codelist": @@ -346,7 +350,9 @@ def get_cl_geo() -> "sdmx.model.common.Codelist": from transport_data import STORE, org candidate: common.Codelist = common.Codelist( - id=f"{get_agency().id}_GEO", maintainer=org.get_agency()[0], version="0.1" + id=f"{get_agencies()[0].id}_GEO", + maintainer=org.get_agencies()[0], + version="0.1", ) return STORE.setdefault(candidate) @@ -360,7 +366,9 @@ def get_conceptscheme() -> "sdmx.model.common.ConceptScheme": from transport_data import STORE, org cs = common.ConceptScheme( - id=f"{get_agency().id}_CONCEPTS", maintainer=org.get_agency()[0], version="0.1" + id=f"{get_agencies()[0].id}_CONCEPTS", + maintainer=org.get_agencies()[0], + version="0.1", ) # Measures @@ -415,9 +423,9 @@ def 
get_structures( from transport_data import STORE, org - base = f"{get_agency().id}_{measure}" + base = f"{get_agencies()[0].id}_{measure}" ma_args = dict( - maintainer=org.get_agency()[0], + maintainer=org.get_agencies()[0], version="0.1", is_final=False, is_external_reference=False, diff --git a/transport_data/oica/cli.py b/transport_data/oica/cli.py index 2b56c4a..4da6635 100644 --- a/transport_data/oica/cli.py +++ b/transport_data/oica/cli.py @@ -17,7 +17,7 @@ import click -@click.group("oica", help=__doc__.splitlines()[0]) +@click.group("oica", short_help="OICA provider.") def main(): """International Organization of Motor Vehicle Manufacturers (OICA) provider.""" diff --git a/transport_data/org/__init__.py b/transport_data/org/__init__.py index 90515be..841cc03 100644 --- a/transport_data/org/__init__.py +++ b/transport_data/org/__init__.py @@ -1,18 +1,20 @@ """Information about the TDCI *per se*.""" from datetime import date -from importlib import import_module +from itertools import chain from typing import TYPE_CHECKING, Union import sdmx.model.v21 as m from transport_data import STORE as registry +from transport_data.util.pluggy import hookimpl, pm, register_internal if TYPE_CHECKING: import sdmx.model.v21 -def get_agency() -> "sdmx.model.v21.Agency": +@hookimpl +def get_agencies() -> "sdmx.model.v21.Agency": # Agency a1 = m.Agency( id="TDCI", @@ -48,8 +50,6 @@ def get_agency() -> "sdmx.model.v21.Agency": def get_agencyscheme(version: Union[None, str] = None) -> "sdmx.model.v21.AgencyScheme": """Generate an AgencyScheme including some TDCI data providers.""" - agencies = get_agency() - as_ = m.AgencyScheme( id="TDCI", # NameableArtefact @@ -57,17 +57,18 @@ def get_agencyscheme(version: Union[None, str] = None) -> "sdmx.model.v21.Agency # VersionableArtefact valid_from=date.today().isoformat(), # MaintainableArtefact - maintainer=agencies[0], + maintainer=None, ) - for a in agencies: - as_.append(a) + # Use plugin hooks to collect Agency objects from within 
transport_data or other + # registered code + register_internal() + + for agency in chain(*pm.hook.get_agencies()): + as_.append(agency) - # Add agencies with corresponding modules in this repository - for id_ in ("adb", "jrc"): - module = import_module(f"transport_data.{id_}") - # Call a function named get_agency() in the module - as_.append(module.get_agency()) + # TDCI itself is the maintainer + as_.maintainer = as_["TDCI"] as_.version = version if as_.version is None: diff --git a/transport_data/org/cli.py b/transport_data/org/cli.py index ba5dbba..51bcb2c 100644 --- a/transport_data/org/cli.py +++ b/transport_data/org/cli.py @@ -6,15 +6,43 @@ """ +import pathlib + import click -from transport_data import STORE from transport_data.util.click import common_params -@click.command("org", params=common_params("version")) -def main(version): - """Information about the TDCI per se.""" +@click.group("org") +def main(): + """TDCI itself.""" + + +@main.command("refresh", params=common_params("version")) +def refresh(version): + """Update the TDCI metadata.""" + from transport_data import STORE + from . 
import get_agencyscheme - STORE.write(get_agencyscheme(version=version), force=True) + STORE.write(get_agencyscheme(version=version)) + + +@main.command("read") +@click.argument( + "path", type=click.Path(exists=True, dir_okay=False, path_type=pathlib.Path) +) +def read(path: "pathlib.Path"): + """Read and summarize metadata.""" + from .metadata import read_workbook, summarize_metadataset + + mds = read_workbook(path.resolve()) + summarize_metadataset(mds) + + +@main.command("template") +def template(): + """Generate the metadata template.""" + from .metadata import make_workbook + + make_workbook() diff --git a/transport_data/org/metadata.py b/transport_data/org/metadata.py new file mode 100644 index 0000000..13b8937 --- /dev/null +++ b/transport_data/org/metadata.py @@ -0,0 +1,637 @@ +import itertools +import logging +import re +from collections import defaultdict +from functools import lru_cache, partial +from typing import TYPE_CHECKING, Callable, Hashable, List, Optional, Tuple + +from pycountry import countries +from sdmx.model import common, v21 + +if TYPE_CHECKING: + import pathlib + + from openpyxl import Workbook + from openpyxl.worksheet.worksheet import Worksheet + +log = logging.getLogger(__name__) + +#: Concepts and metadata attributes in the TDC metadata structure. +CONCEPTS = { + "DATAFLOW": ( + "Data flow ID", + """A unique identifier for the data flow (=data source, data set, etc.). + +We suggest to use IDs like ‘VN001’, where ‘VN’ is the ISO 3166 alpha-2 country +code, and ‘001’ is a unique number. The value MUST match the name of the sheet +in which it appears.""", + ), + "DATA_PROVIDER": ( + "Data provider", + """Organization or individual that provides the data and any related metadata. + +This can be as general (“IEA”) or specific (organization unit/department, specific +person responsible, contact details, etc.) 
as appropriate.""", + ), + "URL": ( + "URL or web address", + "Location on the Internet with further information about the data flow.", + ), + "MEASURE": ( + "Measure (‘indicator’)", + """Statistical concept for which data are provided in the data flow. + +If the data flow contains data for multiple measures, give each one separated by +semicolons. Example: “Number of cars; passengers per vehicle”. + +This SHOULD NOT duplicate the value for ‘UNIT_MEASURE’. Example: “Annual driving +distance per vehicle”, not “Kilometres per vehicle”.""", + ), + "UNIT_MEASURE": ( + "Unit of measure", + """Unit in which the data values are expressed. + +If ‘MEASURE’ contains 2+ items separated by semicolons, give the respective units in the +same way and order. If there are no units, write ‘dimensionless’, ‘1’, or similar.""", + ), + "DIMENSION": ( + "Dimensions", + """Formally, the “statistical concept used in combination with other statistical +concepts to identify a statistical series or individual observations.” + +Record all dimensions of the data, either in a bulleted or numbered list, or +separated by semicolons. In parentheses, give some indication of the scope +and/or resolution of the data along each dimension. Most data have at least time +and space dimensions. + +Example: + +- TIME_PERIOD (annual, 5 years up to 2021) +- REF_AREA (whole country; VN only) +- Vehicle type (12 different types: […]) +- Emissions species (CO2 and 4 others)""", + ), + "DATA_DESCR": ( + "Data description", + """Any information about the data flow that does not fit in other attributes. + +Until or unless other metadata attributes are added to this metadata structure/ +template, this MAY include: + +- Any conditions on data access, e.g. publicly available, proprietary, fee or + subscription required, available on request, etc. +- Frequency of data updates. +- Any indication of quality, including third-party references that indicate data + quality. 
+""", + ), + "COMMENT": ( + "Comment", + """Any other information about the metadata values, for instance discrepancies or +unclear or missing information. + +Precede comments with initials; append to existing comments to keep +chronological order; and include a date (for example, “2024-07-24”) if helpful.""", + ), +} + +#: README text for the TDC metadata file format. +README_TEXT = """This file is an unofficial, prototype TDC format for metadata. +It loosely imitates the Eurostat format. These files contain metadata (information +*about* data) based on the SDMX information model, but their layout (sheet +names, columns, etc.) is not specified by the SDMX standard, hence ‘unofficial’. + +This file has the following sheets. + +README +====== + +This sheet. + +Attributes +========== + +- One row per metadata attribute (or 'field'). +- Columns for the name; description; and ID (short and machine-readable) of each + attribute. See these descriptions to learn what to write for each attribute. + +One or more additional sheets +============================= + +- The name (or title) of each sheet corresponds to the identity (ID) of the data + flow that is described by the metadata in that sheet. +- In Column A, the name of the metadata attribute. Each name MUST exactly + match one appearing in the "Attributes" sheet. Some names MAY be omitted. +- In Column B, the actual metadata. These may be empty. + +TEMPLATE +======== + +To add information about additional data flows not included in existing sheets +(above), you can copy and rename this sheet. 
+""" + + +def _header(ws: "Worksheet", *columns: Tuple[str, int]) -> None: + """Write header columns and format their style and width.""" + for column, (value, width) in enumerate(columns, start=1): + cell = ws.cell(row=1, column=column, value=value) + cell.style = "header" + ws.column_dimensions[cell.column_letter].width = width + + +def add_readme(wb: "Workbook") -> None: + """Add a "README" sheet to `wb`.""" + ws = wb.create_sheet("README") + + _header(ws, ("Transport Data Commons (TDC) metadata", 72)) + ws["A3"] = README_TEXT + + +def add_attributes(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): + """Add an "Attributes" sheet to `wb` listing the metadata attributes from `msd`.""" + ws = wb.create_sheet("Attributes") + + _header( + ws, + ("Name", 20), # "Element name" in Eurostat + ("Description", 72), # Not present in Eurostat + ("ID", 20), # "Element code" in Eurostat + ) + + for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): + concept = attribute.concept_identity + ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" + ws.cell(row=row, column=2, value=concept.description.localized_default()) + ws.cell(row=row, column=3, value=attribute.id).style = "top" + + +def add_template(wb: "Workbook", msd: "v21.MetadataStructureDefinition"): + """Add a "TEMPLATE" sheet to `wb` with a metadata template.""" + ws = wb.create_sheet("TEMPLATE") + + _header( + ws, + ("Attribute name", 20), # "Concept name" in Eurostat + ("Value", 72), # "Concept value" in Eurostat + ) + + for row, attribute in enumerate(msd.report_structure["ALL"].components, start=2): + concept = attribute.concept_identity + ws.cell(row=row, column=1, value=concept.name.localized_default()).style = "top" + ws.cell(row=row, column=2, value="---") + + +def contains_data_for(mdr: "v21.MetadataReport", ref_area: str) -> bool: + """Return :any:`True` if `mdr` contains data for `ref_area`. 
+ + :any:`True` is returned if any of the following: + + 1. The referenced data flow definition has an ID that starts with `ref_area`. + 2. The country's ISO 3166 alpha-2 code, alpha-3 code, official name, or common name + appears in the value of the ``DATA_DESCR`` metadata attribute. + + + Parameters + ---------- + ref_area : str + ISO 3166 alpha-2 code for a country. Passed to + :meth:`pycountry.countries.lookup`. + """ + country = countries.lookup(ref_area) + + if mdr.attaches_to.key_values["DATAFLOW"].obj.id.startswith(ref_area): # type: ignore [union-attr] + return True + + # Pattern to match in DATA_DESCR + pat = re.compile( + f"({country.alpha_2}|{country.alpha_3}|{country.name}|{country.common_name})" + ) + for ra in mdr.metadata: + assert hasattr(ra, "value") + if ra.value_for.id == "DATA_DESCR" and pat.search(ra.value): + return True + + return False + + +def generate_summary_html( + mds: "v21.MetadataSet", ref_area: str, path: "pathlib.Path" +) -> None: + """Generate a summary report in HTML.""" + from jinja2 import Environment, PackageLoader, select_autoescape + + # Create a Jinja environment + env = Environment( + loader=PackageLoader("transport_data", package_path="data/org"), + extensions=["jinja2.ext.loopcontrols"], + autoescape=select_autoescape(), + trim_blocks=True, + lstrip_blocks=True, + ) + + grouped = groupby(mds, key=partial(contains_data_for, ref_area=ref_area)) + + def _dfd_id(mdr): + return mdr.attaches_to.key_values["DATAFLOW"].obj.id + + def _get_reported_attribute(mdr, id_): + for ra in mdr.metadata: + if ra.value_for.id == id_: + return ra.value, ra.value_for + return "—", None + + def _format_desc(dim): + if desc := str(dim.get_annotation(id="tdc-description").text): + return desc + else: + return "—" + + env.filters["dfd_id"] = _dfd_id + env.filters["format_desc"] = _format_desc + + path.write_text( + env.get_template("template-metadata.html").render( + ref_area=ref_area, + matched=grouped[True], + no_match=grouped[False], + 
get_reported_attribute=_get_reported_attribute, + ) + ) + + +@lru_cache +def get_cs_common() -> "common.ConceptScheme": + """Create a shared concept scheme for the concepts referenced by dimensions. + + Concepts in this scheme have an annotation ``tdc-aka``, which is a list of alternate + IDs recognized for the concept. + """ + from . import get_agencyscheme + + as_ = get_agencyscheme() + cs = common.ConceptScheme(id="CONCEPTS", maintainer=as_["TDCI"]) + + cs.setdefault( + id="CONFIDENTIALITY", + annotations=[common.Annotation(id="tdc-aka", text=repr(["CONFIDIENTALITY"]))], + ) + cs.setdefault( + id="FUEL_TYPE", + annotations=[common.Annotation(id="tdc-aka", text=repr(["Fuel type"]))], + ) + cs.setdefault( + id="REF_AREA", + annotations=[ + common.Annotation( + id="tdc-aka", text=repr(["Area", "Country", "Country code", "Region"]) + ) + ], + ) + cs.setdefault( + id="SERVICE", + annotations=[common.Annotation(id="tdc-aka", text=repr(["FREIGHT_PASSENGER"]))], + ) + cs.setdefault( + id="TIME_PERIOD", + annotations=[common.Annotation(id="tdc-aka", text=repr(["Time", "Year"]))], + ) + + return cs + + +def get_msd() -> "v21.MetadataStructureDefinition": + from transport_data import STORE + + from . import get_agencyscheme + + TDCI = get_agencyscheme()["TDCI"] + + cs = common.ConceptScheme(id="METADATA_CONCEPTS", maintainer=TDCI) + msd = v21.MetadataStructureDefinition(id="SIMPLE", version="1", maintainer=TDCI) + rs = msd.report_structure["ALL"] = v21.ReportStructure(id="ALL") + + for id_, (name, description) in CONCEPTS.items(): + ci = cs.setdefault(id=id_, name=name, description=description) + rs.getdefault(id_, concept_identity=ci) + + # NB Currently not supported by sdmx1; results in an empty XML collection + STORE.write(msd) + + return msd + + +def getdefault(is_: "common.ItemScheme", other: "common.Item") -> "common.Item": + """Return an item from `is_` matching `other`. + + Several methods are attempted to match `other` with an existing item: + + 1. 
ID of `other` is identical to that of an existing item. + 2. Transformed ID of `other`—in upper case, " " replaced with "_" is identical to + that of an existing item. + 3. ID of `other` is in the annotation ``tdc-aka`` + + """ + # Exact match on ID or transformed ID + for candidate in (other.id, other.id.upper().replace(" ", "_")): + try: + return is_[candidate] + except KeyError: + pass + + # Iterate over existing items + for item in is_: + # Eval the annotation "tdc-aka" for a list of alternate IDs for the item + if aka := item.eval_annotation(id="tdc-aka"): + if other.id in aka: + return item + + # Still no match; create the item + return is_.setdefault(id=other.id) + + +def groupby( + mds: "v21.MetadataSet", key=Callable[["v21.MetadataReport"], Hashable] +) -> dict[Hashable, list["v21.MetadataReport"]]: + """Group metadata reports in `mds` according to a `key` function. + + Similar to :func:`itertools.groupby`. + """ + result: dict[Hashable, list["v21.MetadataReport"]] = defaultdict(list) + for k, g in itertools.groupby(mds.report, key): + result[k].extend(g) + return result + + +def make_workbook(name="sample.xlsx") -> None: + """Generate a :class:`openpyxl.Workbook` for exchange of metadata.""" + from openpyxl import Workbook + from openpyxl.styles import Alignment, Font, NamedStyle, PatternFill + + wb = Workbook() + + # Delete the default sheet + assert wb.active + wb.remove(wb.active) + + # Create two named styles + header = NamedStyle(name="header") + header.fill = PatternFill("solid", fgColor="000000") + header.font = Font(bold=True, color="ffffff", name="Calibri") + wb.add_named_style(header) + + top = NamedStyle(name="top") + top.alignment = Alignment(vertical="top", wrap_text=True) + top.font = Font(name="Calibri") + wb.add_named_style(top) + + # Generate the metadata structure definition + msd = get_msd() + + # Add sheets + add_readme(wb) + add_attributes(wb, msd) + add_template(wb, msd) + + # Save the file + wb.save(name) + + +def 
parse_dimension(value: str) -> List[v21.Concept]: + """Parse the description of a dimension from `value`. + + Supported values include: + + 1. Multiple lines, with each line beginning "- ". + 2. A single line, with dimensions separated by ", ". + 3. A single dimension ID. + """ + # Partial regular expressions for a dimension + entry = r"(?P