Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

netcdfread: safer version handling for Conventions property #296

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 28 additions & 8 deletions cfdm/read_write/netcdf/netcdfread.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,26 +1020,41 @@ def read(

# ------------------------------------------------------------
# Find the CF version for the file, and the CFA version.
# (See '2.6.1 Identification of Conventions' in the CF Conformance
# document for valid inputs for the 'Conventions' property.)
# ------------------------------------------------------------
Conventions = g["global_attributes"].get("Conventions", "")

# If the string contains any commas, it is assumed to be a
# comma-separated list.
all_conventions = re.split(",\s*", Conventions)
all_conventions = re.split(r",\s*", Conventions)
if all_conventions[0] == Conventions:
all_conventions = Conventions.split()

file_version = None
for c in all_conventions:
if c.startswith("CF-"):
file_version = c.replace("CF-", "", 1)
elif c.startswith("UGRID-"):
# Be particularly strict with the regex to account for ambiguous
# values e.g. CF-<badversionformat> or CF-1.X/CF-1.Y. Note that:
# * the '^' and '$' start and end of string tokens ensure that
# only zero or one match can be found per given string c;
# * the regex below ensures a valid input to
# Version(), allowing any level of versioning identifier
# detail e.g. 1.23.34.45.6 (for future-proofing).
# See https://packaging.python.org/en/latest/specifications/
# version-specifiers/ for more on valid input to Version()
v_id = r"^{}-(\d+(\.\d+)*)$"
cf_v = re.search(v_id.format("CF"), c)
u_v = re.search(v_id.format("UGRID"), c)
cfa_v = re.search(v_id.format("CFA"), c)

if cf_v:
file_version = cf_v.group(1)
elif u_v:
# Allow UGRID if it has been specified in Conventions,
# regardless of the version of CF.
g["UGRID_version"] = Version(c.replace("UGRID-", "", 1))
elif c.startswith("CFA-"):
g["UGRID_version"] = Version(u_v.group(1))
elif cfa_v:
g["cfa"] = True
g["CFA_version"] = Version(c.replace("CFA-", "", 1))
g["CFA_version"] = Version(cfa_v.group(1))
elif c == "CFA":
g["cfa"] = True
g["CFA_version"] = Version("0.4")
Expand All @@ -1054,6 +1069,11 @@ def read(
file_version = self.implementation.get_cf_version()

g["file_version"] = Version(file_version)
if is_log_level_debug(logger):
logger.debug(
" Versioning:\n read_vars['file_version'] ="
f"{g['file_version']}"
) # pragma: no cover

# Set minimum/maximum versions
for vn in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11"):
Expand Down
2 changes: 2 additions & 0 deletions cfdm/read_write/read.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def read(
mask=True,
domain=False,
_implementation=_implementation,
_scan_only=False,
):
"""Read field or domain constructs from a dataset.

Expand Down Expand Up @@ -335,6 +336,7 @@ def read(
mask=mask,
domain=domain,
extra_read_vars=None,
_scan_only=_scan_only,
)
except MaskError:
# Some data required for field interpretation is missing,
Expand Down
90 changes: 90 additions & 0 deletions cfdm/test/test_read_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,96 @@ def test_read_write_string(self):
):
self.assertTrue(i.equals(j, verbose=3))

def test_read_write_Conventions_version(self):
"""Test processing of `Conventions` attribute to field property."""
from packaging.version import Version

f = cfdm.read(self.filename)[0]

# Construct single valid values for standards
valid_version_ends = ["1.11", "1", "2.30.4"]
cf_valid_conv = [f"CF-{v}" for v in valid_version_ends]
ugrid_valid_conv = [f"UGRID-{v}" for v in valid_version_ends]
cfa_valid_conv = [f"CFA-{v}" for v in valid_version_ends]
other_valid_conv = [f"somestandard-{v}" for v in valid_version_ends]

# Construct some mixed compound valid values for standards. Reverse
# one list to make version IDs differ on at least one standard.
zip_valid = list(zip(
cf_valid_conv, reversed(ugrid_valid_conv), cfa_valid_conv,
other_valid_conv
))
# Only space and comma delimiters are valid (see Conformance doc.)
combinations_comma_delim = [",".join(c) for c in zip_valid]
combinations_space_delim = [" ".join(c) for c in zip_valid]

all_valid_conv = (
cf_valid_conv + ugrid_valid_conv + cfa_valid_conv +
other_valid_conv + combinations_comma_delim +
combinations_space_delim
)

# Check that valid Conventions version specifications get set as the
# corresponding version on the Conventions property.
for set_conv_value in all_valid_conv:
cfdm.write(f, tmpfile)

# Can't use cfdm to change Conventions property so must use netCDF4
# Open with append mode, just want to update the global attribute
n = netCDF4.Dataset(tmpfile, "a")
n.Conventions = set_conv_value
n.close()

g = cfdm.read(tmpfile)[0]
self.assertEqual(g.get_property("Conventions"), set_conv_value)

invalid_version_ends = ["1.1/1.2", "bad", ".11", ""]
cf_invalid_conv = [f"CF-{v}" for v in invalid_version_ends]
ugrid_invalid_conv = [f"UGRID-{v}" for v in invalid_version_ends]
cfa_invalid_conv = [f"CFA-{v}" for v in invalid_version_ends]
other_invalid_conv = [f"somestandard-{v}" for v in invalid_version_ends]
zip_invalid = zip(
cf_invalid_conv, ugrid_invalid_conv, cfa_invalid_conv,
other_invalid_conv
)
bad_combinations_good_delim = [",".join(c) for c in zip_invalid]
# Include valid values with bad (unsupported) delimiters
good_combinations_bad_delim = ["- ".join(c) for c in zip_valid]

all_invalid_conv = (
cf_invalid_conv + ugrid_invalid_conv + cfa_invalid_conv +
other_invalid_conv + bad_combinations_good_delim +
good_combinations_bad_delim
)

# Include a mixture of valid and invalid version specifiers
some_valid_some_invalid_conv = [
" ".join(c) for c in zip(
cf_invalid_conv, reversed(ugrid_valid_conv), cfa_invalid_conv,
other_valid_conv)
]

# Check that invalid version specifications get ignored, so that the
# file is successfully read in, but is given default-logic version.
for set_conv_value in (
all_invalid_conv + some_valid_some_invalid_conv):
get_conv_value = f.get_property("Conventions")
cfdm.write(f, tmpfile)

# Can't use cfdm to change Conventions property so must use netCDF4
n = netCDF4.Dataset(tmpfile, "a")
n.Conventions = set_conv_value
n.close()

self.assertEqual(
cfdm.read(tmpfile, _scan_only=True)["file_version"],
Version(get_conv_value.lstrip("CF-"))
)
g = cfdm.read(tmpfile)[0]
# TODO: do we want to re-set the Conventions property as well,
# given it is invalid?
self.assertEqual(g.get_property("Conventions"), set_conv_value)

def test_read_write_Conventions(self):
"""Test the `Conventions` keyword argument to `write`."""
f = cfdm.read(self.filename)[0]
Expand Down