From 3b3456eb47cd79c122102e3b9072e510d312a377 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 8 Nov 2018 17:13:50 +0000 Subject: [PATCH 1/4] Beginnings of append : somewhat works but safe-checking incomplete. --- lib/iris/fileformats/netcdf.py | 375 +++++++++++++++++++++++++++++++-- 1 file changed, 359 insertions(+), 16 deletions(-) diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index d5ea39acb6..b1f526f999 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -757,23 +757,135 @@ def _set_file_ncattr(variable, name, attribute): return variable.setncattr(name, attribute) + + +def _offset_keys(keys, axis, offset): + """ + Return indexes adjusted for a write offset. + + This is to support append operations, where we want to translate a write + access, from e.g. "cf_var[:, :, :]" to something like "cf_var[:, N:, :]". + We can't use an index-of-index-of-thing to do this, as cf_var[keys] does + not produce a writeable view (unlike the equivalent numpy array access). + So instead we pick apart the key expression + produce a modified version. + + Args: + + * keys (multidimensional indexing expression): + A slicing representing a subarray of an array. + Must contain at least 'axis' key elements. + * axis (integer): + The dimension to offset. + * offset (integer): + A positive integer to add onto the 'axis'th indexing. + + Returns: + * out_keys (tuple of ints or slices): + an indexing expression for the notional target region + "target[< axis * (:,) >, offset:][keys]". + + .. note:: + Supports *only* keys which are slice objects or integers + -- not newaxis, ellipsis, or anything else. + So we depend on dask streaming using only those. + It seems to work, for now. + + .. note:: + Negative indices also don't have an obvious interpretation, so we don't + allow them. + + """ + # Make input into an iterable of keys, if not already. + try: + keys = iter(keys) + except TypeError: + # Convert single key to 1-tuple. + keys = (keys,) + + # Make into a list so we can change it. + keys = list(keys) + + _DEBUG_OFFSET_KEYS = False + if _DEBUG_OFFSET_KEYS: + debug_orig_keys = keys + + # Expand to minimum dimensions if required. + n_extra_dims = axis - len(keys) + 1 + if n_extra_dims > 0: + keys = keys + n_extra_dims * [slice(None)] + + # Adjust the key of the specified axis. + axis_key = keys[axis] + if isinstance(axis_key, int): + # The key is an integer : just add the offset. + axis_key += offset + elif isinstance(axis_key, slice): + # The key is a slice object : translate accordingly .. + + # Take apart the slice.. + start, stop, step = axis_key.start, axis_key.stop, axis_key.step + + # ..adjust slice elements.. + if ((start is not None and start < 0) or + (stop is not None and stop < 0)): + msg = ("Cannot offset the {}th key of index expression " + "'{}', = '{}', as it uses negative indices.") + msg = msg.format(axis+ 1, keys, axis_key) + raise ValueError(msg) + + if start is None: + start = offset + else: + start = start + offset + + if stop is not None: + stop = stop + offset + + # ..put slice back together. + axis_key = slice(start, stop, step) + else: + msg = ("Cannot offset the {}th key of index expression " + "'{}', = '{}', as it is of type '{}' but we only support " + "integers or slice objects.") + msg = msg.format(axis + 1, keys, axis_key, type(axis_key)) + raise ValueError(msg) + + # Update just the specified axis, and return a tuple of keys (always). 
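# A rough sketch of the mapping this function is intended to produce, assuming
# the keys arrive as plain ints/slices (which is what the dask store path
# supplies):
#
#     _offset_keys((slice(0, 5), slice(None)), axis=0, offset=12)
#         --> (slice(12, 17), slice(None))
#     _offset_keys((2, slice(None)), axis=0, offset=12)
#         --> (14, slice(None))
#     _offset_keys(3, axis=1, offset=12)
#         --> (3, slice(12, None))    # expanded with ':' keys to reach the offset axis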
+ keys[axis] = axis_key + keys = tuple(keys) + + if _DEBUG_OFFSET_KEYS: + msg = '\n DEBUG: offset_keys({}, axis={}, offs={}) --> {}' + print(msg.format(debug_orig_keys, axis, offset, keys)) + + return keys + + class _FillValueMaskCheckAndStoreTarget(object): """ To be used with da.store. Remembers whether any element was equal to a given value and whether it was masked, before passing the chunk to the given target. + TODO: document the added 'offset' behaviour properly. + """ - def __init__(self, target, fill_value=None): + def __init__(self, target, fill_value=None, + write_offset=None, write_offset_axis=None): self.target = target self.fill_value = fill_value self.contains_value = False self.is_masked = False + self.write_offset = write_offset + self.write_offset_axis = write_offset_axis def __setitem__(self, keys, arr): if self.fill_value is not None: self.contains_value = self.contains_value or self.fill_value in arr self.is_masked = self.is_masked or ma.is_masked(arr) + if self.write_offset_axis is not None: + # Adjust keys to offset one axis, as needed in append operations. + keys = _offset_keys(keys, self.write_offset_axis, self.write_offset) self.target[keys] = arr @@ -797,11 +909,13 @@ def __init__(self, *args, **kwargs): Args order and Kwargs entries from '__slots__'. - Unspecified args default to values in 'self._defaults_dict', + Unspecified args default to values in 'self._defaults_dict()', if any, or else None. """ values = getattr(self, '_defaults_dict', {}) + if callable(values): + values = values() unrecs = [key for key in kwargs if key not in self.__slots__] if unrecs: unrecs = ', '.join(unrecs) @@ -839,11 +953,16 @@ def __str__(self): class _SaveFile(_SlotsHolder): _typename = 'SaveFile' __slots__ = ('dims', 'vars', 'ncattrs') - + @staticmethod + def _defaults_dict(): + return dict(dims=OrderedDict(), + vars=OrderedDict(), + ncattrs=OrderedDict()) class _SaveDim(_SlotsHolder): _typename = 'SaveDim' - __slots__ = ('name', 'size') + __slots__ = ('name', 'size', 'is_unlimited') + _defaults_dict = {'is_unlimited': False} class _SaveVar(_SlotsHolder): @@ -936,9 +1055,7 @@ def __init__(self, filename, netcdf_format, append=False): #: A dictionary, mapping formula terms to owner cf variable name self._formula_terms_cache = {} #: A placeholder for elements to be written/appended to the file. 
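# Roughly how the offset-aware _FillValueMaskCheckAndStoreTarget above is meant
# to be driven during an append (a sketch only -- 'nc_var', 'lazy_data' and
# 'n_existing' are illustrative names, not saver internals):
#
#     target = _FillValueMaskCheckAndStoreTarget(
#         nc_var, fill_value=None,
#         write_offset_axis=0, write_offset=n_existing)
#     da.store([lazy_data], [target])
#     # a chunk write to target[0:10, ...] then lands in
#     # nc_var[n_existing : n_existing + 10, ...]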
- self._save = _SaveFile(dims=OrderedDict(), - vars=OrderedDict(), - ncattrs=OrderedDict()) + self._save = _SaveFile() self._append_mode = append #: NetCDF dataset @@ -966,7 +1083,7 @@ def __enter__(self): def __exit__(self, type, value, traceback): """Flush any buffered data to the CF-netCDF file before closing.""" output_path = self._dataset.filepath() - self._write_to_dataset() + self._write_or_append_to_dataset() self._dataset.sync() self._dataset.close() import subprocess @@ -1096,7 +1213,6 @@ def write(self, cube, local_keys=None, unlimited_dimensions=None, """ if unlimited_dimensions is None: unlimited_dimensions = [] - # # For now, we are just not supporting the cf_profile hook, # because this expects an existing open dataset to modify after each write, @@ -1260,12 +1376,14 @@ def _create_cf_dimensions(self, cube, dimension_names, for dim_name in dimension_names: if dim_name not in self._save.dims: - if dim_name in unlimited_dim_names: + unlimited = dim_name in unlimited_dim_names + if unlimited : size = None else: size = self._existing_dim[dim_name] _addbyname(self._save.dims, - _SaveDim(name=dim_name, size=size)) + _SaveDim(name=dim_name, size=size, + is_unlimited=unlimited)) def _add_aux_coords(self, cube, cf_var_cube, dimension_names): """ @@ -2265,6 +2383,17 @@ def _increment_name(self, varname): return '{}_{}'.format(varname, num) + def _write_or_append_to_dataset(self): + """ + Write the contents of self._save to an actual file. + The target, 'self._dataset' should be open + ready to go. + + """ + if self._append_mode: + self._append_to_dataset() + else: + self._write_to_dataset() + def _write_to_dataset(self): """ Write the contents of self._save to an actual file. @@ -2290,6 +2419,186 @@ def _write_to_dataset(self): for attr in self._save.ncattrs.values(): _set_file_ncattr(self._dataset, attr.name, attr.value) + def _append_to_dataset(self): + """ + Append the contents of self._save to an actual file. + The target, 'self._dataset' should be open + ready to go. + + """ + # First check all is good to go. + filedata = self._prepare_append() + # Find the unlimited dimension + get its current length in the file. + dims_unlim = [dim for dim in filedata.dims.values() + if dim.is_unlimited] + assert len(dims_unlim) == 1 + file_unlim_dim = dims_unlim[0] + unlim_dim_length = file_unlim_dim.size + file_extend_dim_name = file_unlim_dim.name + + # Write to (extend) each variable mapped to the unlimited dim. + for src_var in self._save.vars.values(): + if file_extend_dim_name in src_var.dim_names: + tgt_var = filedata.vars[src_var.name] + i_dim_unlim = src_var.dim_names.index(file_extend_dim_name) + nc_var = self._dataset.variables[tgt_var.name] + self._write_variable_values_in_dataset( + nc_var, src_var, + write_axis=i_dim_unlim, write_offset=unlim_dim_length) + + def _prepare_append(self): + """ + Work out the correspondence between the data elements in self._save + and those found in an existing file (for append). + + Align the save data variables and dimensions with those in the file by + renaming them as needed. + Check all necessary matching properties between corresponding elements. + Check there is exactly one unlimited dimension. + Check that dimension variable appends will preserve monotonicity. + + If this call succeeds, an append can proceed, and should be safe. + + NOTE: general attributes in the new data are mostly ignored : They are + not written to the file. However, structural attributes like + 'coordinates' are used, and 'units' attributes are checked for a match. 
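        In netCDF terms, the intended effect of a successful append is simply
        to extend, along the single unlimited dimension, every file variable
        mapped to it.  A sketch, assuming 'time' is the unlimited dimension:

            existing file        : air_temperature(time=4, lat, lon)
            cube passed to save  : air_temperature(time=2, lat, lon)
            file after append    : air_temperature(time=6, lat, lon),
                                   with the new values written at [4:6, ...]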
+ + """ + def var_identity_name(var): + if 'standard_name' in var.ncattrs: + result = var.ncattrs['standard_name'] + elif 'long_name' in var.ncattrs: + result = var.ncattrs['long_name'] + else: + result = var.name + return result + + def identity_match(var1, var2): + std, lng = 'standard_name', 'long_name' + if std in var1.ncattrs or std in var2.ncattrs: + match = (std in var1.ncattrs and std in var2.ncattrs and + var1.ncattrs[std] == var2.ncattrs[std]) + elif lng in var1.ncattrs or lng in var2.ncattrs: + match = (lng in var1.ncattrs and lng in var2.ncattrs and + var1.ncattrs[lng] == var2.ncattrs[lng]) + else: + match = var1.name == var2.name + return match + + # Construct a simple representation of the existing data. + ds = self._dataset + filedata = _SaveFile() + + for dimname, ds_dim in ds.dimensions.items(): + _addbyname(filedata.dims, + _SaveDim(dimname, size=ds_dim.size, + is_unlimited=ds_dim.isunlimited())) + + for ds_var in ds.variables.values(): + attrs = {attname: _SaveAttr(attname, getattr(ds_var, attname)) + for attname in ds_var.ncattrs()} + + cf_var = _SaveVar(ds_var.name, + dim_names=ds_var.dimensions, + ncattrs=attrs) + _addbyname(filedata.vars, cf_var) + + # Scan both dataset + save-data to add 'linkage' information. + # Linkage occurs where a variable attribute names other variables, and + # also when a variable uses dimensions with dimension variables. + variable_linkage_attrs = ('bounds', 'coordinates', 'grid_mapping', + 'cell_measures', 'ancillary_variables') + + def label_links(cf_data, user_datatype_info): + # Create empty links on all variables. + for cf_var in cf_data.vars.values(): + cf_var._linksto = {} + cf_var._linkedfrom = {} + for cf_var in cf_data.vars.values(): + linksto = cf_var._linksto + # Make links where variable attributes name other variables. + # Note: all links are made both 'up' and 'down' + for attrname in variable_linkage_attrs: + if attrname in cf_var.ncattrs: + target_names = cf_var.ncattrs[attrname].value + target_names = target_names.split(' ') + for target_name in target_names: + if target_name not in cf_data.vars: + msg = ('Missing linked variable in {} : ' + 'variable property {}.{} contains {}, ' + 'which is not a variable in the ' + 'dataset.') + msg = msg.format(user_datatype_info, + cf_var.name, attrname, + target_name) + raise ValueError(msg) + # Make links both "up" and "down". + target_var = cf_data.vars[target_name] + if attrname not in linksto: + linksto[attrname] = [] + linksto[attrname].append(target_var) + linkedfrom = target_var._linkedfrom + if attrname not in linkedfrom: + linkedfrom[attrname] = [] + linkedfrom[attrname].append(target_var) + for dim_name in cf_var.dim_names: + # Find+label the dimension variable (i.e. coord), if any. + dim_var = cf_data.vars.get(dim_name, None) + if dim_var is not None: + if '_dim' not in linksto: + linksto['_dim'] = [] + linksto['_dim'].append(dim_var) + linkedfrom = dim_var._linkedfrom + if '_dim' not in linkedfrom: + linkedfrom['_dim'] = [] + linkedfrom['_dim'].append(cf_var) + + savedata = self._save + label_links(filedata, 'existing file') + label_links(savedata, 'data to be appended') + + # Get all variables in the existing file. + file_vars = set(filedata.vars.values()) + + # Identify + exclude the dimension variables. + file_dim_vars = set(file_var for file_var in file_vars + if file_var.name in filedata.dims) + file_vars -= file_dim_vars + + # Get all variables in the save data. + save_vars = set(self._save.vars.values()) # A copy ! 
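# For illustration, identity_match() above prefers CF identity attributes over
# raw variable names (a sketch of the intent; names and attributes are
# illustrative):
#
#     save var 'air_temp_0' {standard_name: 'air_temperature'}
#     file var 'tas'        {standard_name: 'air_temperature'}   --> match
#
#     save var 'forecast_period' (no standard/long name)
#     file var 'forecast_period' (no standard/long name)          --> match by name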
+ + # Identify + exclude all the dimension variables in the save data. + # NOTE: at this point, we don't judge which is which. That will be + # done later, by matching usage in variables. + save_dim_vars = set(save_var for save_var in save_vars + if save_var.name in savedata.dims.keys()) + save_vars -= save_dim_vars + + # Get a list of the 'primary' variables. + primary_save_vars = set( + save_var for save_var in save_vars + if save_var.controls['var_type'] == 'data-var') + save_vars -= primary_save_vars + + # Identify each primary variable with one in the file, by name. + for save_var in primary_save_vars: + ident = var_identity_name(save_var) + matches = [file_var + for file_var in filedata.vars.values() + if identity_match(save_var, file_var)] + n_matches = len(matches) + if n_matches != 1: + fail_reason = 'no' if n_matches == 0 else 'too many' + msg = ('Append failure : {} variables found in original file ' + 'matching source cube "{}"') + raise ValueError(msg.format(fail_reason, ident)) + file_var = matches[0] + # "Label" the save variables with the matching file variable name. + save_var.name = file_var.name + + # Return our representation of the original file. + return filedata + def _make_variable_in_dataset(self, var): """ Create an actual file variable. @@ -2358,23 +2667,49 @@ def _make_variable_in_dataset(self, var): return nc_var - def _write_variable_values_in_dataset(self, nc_var, var): + def _write_variable_values_in_dataset(self, nc_var, var, + write_axis=None, write_offset=None): """ Write data values into a file variable. + Args: + + * nc_var (:class:`netCDF4.Variable`): + The netCDF4 variable object to write into. + * var (:class:`_SaveVar`): + A _SaveVar whose .data_source provides the data to write. + Either real or lazy data can be written, in the lazy case we + perform a Dask streaming write. + + Kwargs: + + * write_axis (int): + * write_offset (int): + Keys to apply when writing to the target variable. + (Used for appending along a dimension). + Note : for safe "append"s, this must not raise Exceptions. """ # Decide whether to write or stream data into the variable. # If netcdf3, avoid streaming due to dtype handling. data = var.data_source + if (not is_lazy_data(data) or self._dataset.file_format in ('NETCDF3_CLASSIC', 'NETCDF3_64BIT')): data = as_concrete_data(data) + # Construct indexing keys for the data target. + if write_offset is None: + # Normal writes just use var[:] + write_slices = slice(None) + else: + # Appended writes require var[:, :, ... n_existing:, ...] 
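# e.g. for data.ndim == 3, write_axis == 0 and write_offset == 4, the keys
# built below are [slice(4, None), slice(None), slice(None)], so the
# assignment behaves like nc_var[4:, :, :] = data  (illustrative values).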
+ write_slices = [slice(None)] * data.ndim + write_slices[write_axis] = slice(write_offset, None) def store(data, cf_var, fill_value): - cf_var[:] = data + cf_var[write_slices] = data is_masked = ma.is_masked(data) contains_value = fill_value is not None and fill_value in data return is_masked, contains_value @@ -2384,7 +2719,9 @@ def store(data, cf_var, fill_value): def store(data, cf_var, fill_value): # Store lazy data and check whether it is masked and contains # the fill value - target = _FillValueMaskCheckAndStoreTarget(cf_var, fill_value) + target = _FillValueMaskCheckAndStoreTarget( + cf_var, fill_value, + write_offset_axis=write_axis, write_offset=write_offset) da.store([data], [target]) return target.is_masked, target.contains_value @@ -2462,7 +2799,8 @@ def _var_type_and_ref_name(var): def save(cube, filename, netcdf_format='NETCDF4', local_keys=None, unlimited_dimensions=None, zlib=False, complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None, endian='native', - least_significant_digit=None, packing=None, fill_value=None): + least_significant_digit=None, packing=None, fill_value=None, + append=False): """ Save cube(s) to a netCDF file, given the cube and the filename. @@ -2584,6 +2922,11 @@ def save(cube, filename, netcdf_format='NETCDF4', local_keys=None, `:class:`iris.cube.CubeList`, or a single element, and each element of this argument will be applied to each cube separately. + * append (bool): + If True, append all data to an existing file, instead of writing a new + file as when append=False (the default). + TODO: much explaining ... + Returns: None. @@ -2679,7 +3022,7 @@ def is_valid_packspec(p): raise ValueError(msg) # Initialise Manager for saving - with Saver(filename, netcdf_format) as sman: + with Saver(filename, netcdf_format, append=append) as sman: # Iterate through the cubelist. for cube, packspec, fill_value in zip(cubes, packspecs, fill_values): sman.write(cube, local_keys, unlimited_dimensions, zlib, complevel, From 3995b662c6cf3b2ca826d22f591ed0c300fc9911 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 9 Nov 2018 17:50:39 +0000 Subject: [PATCH 2/4] Codestyle; skip no-offset writes. --- lib/iris/fileformats/netcdf.py | 42 +++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index b1f526f999..f843f5fc47 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -757,8 +757,6 @@ def _set_file_ncattr(variable, name, attribute): return variable.setncattr(name, attribute) - - def _offset_keys(keys, axis, offset): """ Return indexes adjusted for a write offset. @@ -767,30 +765,31 @@ def _offset_keys(keys, axis, offset): access, from e.g. "cf_var[:, :, :]" to something like "cf_var[:, N:, :]". We can't use an index-of-index-of-thing to do this, as cf_var[keys] does not produce a writeable view (unlike the equivalent numpy array access). - So instead we pick apart the key expression + produce a modified version. + So instead we pick apart the index expression + return a modified version. Args: * keys (multidimensional indexing expression): - A slicing representing a subarray of an array. - Must contain at least 'axis' key elements. + A slicing representing a subsection of an array. * axis (integer): The dimension to offset. * offset (integer): A positive integer to add onto the 'axis'th indexing. 
Returns: - * out_keys (tuple of ints or slices): - an indexing expression for the notional target region + * out_keys (tuple of (int or slice)): + an indexing expression addressing the notional target region "target[< axis * (:,) >, offset:][keys]". .. note:: + Supports *only* keys which are slice objects or integers -- not newaxis, ellipsis, or anything else. So we depend on dask streaming using only those. It seems to work, for now. .. note:: + Negative indices also don't have an obvious interpretation, so we don't allow them. @@ -830,7 +829,7 @@ def _offset_keys(keys, axis, offset): (stop is not None and stop < 0)): msg = ("Cannot offset the {}th key of index expression " "'{}', = '{}', as it uses negative indices.") - msg = msg.format(axis+ 1, keys, axis_key) + msg = msg.format(axis + 1, keys, axis_key) raise ValueError(msg) if start is None: @@ -871,21 +870,21 @@ class _FillValueMaskCheckAndStoreTarget(object): """ def __init__(self, target, fill_value=None, - write_offset=None, write_offset_axis=None): + write_offset_axis=None, write_offset=0): self.target = target self.fill_value = fill_value self.contains_value = False self.is_masked = False - self.write_offset = write_offset - self.write_offset_axis = write_offset_axis + self.offset_axis = write_offset_axis + self.offset = write_offset def __setitem__(self, keys, arr): if self.fill_value is not None: self.contains_value = self.contains_value or self.fill_value in arr self.is_masked = self.is_masked or ma.is_masked(arr) - if self.write_offset_axis is not None: + if self.offset != 0: # Adjust keys to offset one axis, as needed in append operations. - keys = _offset_keys(keys, self.write_offset_axis, self.write_offset) + keys = _offset_keys(keys, self.offset_axis, self.offset) self.target[keys] = arr @@ -899,8 +898,11 @@ class _SlotsHolder(object): * __slots__ (list of string): names of content attributes. Order and names also provide object init args and kwargs. - * _typename : the headline name for the string print - * _defaults_dict : an kwargs-like dict of defaults for object init. + * _typename (string): + the headline name for the string print + * _defaults_dict (dict or callable): + a kwargs-like dict of defaults for object init, + or a callable returning one. """ def __init__(self, *args, **kwargs): @@ -953,12 +955,14 @@ def __str__(self): class _SaveFile(_SlotsHolder): _typename = 'SaveFile' __slots__ = ('dims', 'vars', 'ncattrs') + @staticmethod def _defaults_dict(): return dict(dims=OrderedDict(), vars=OrderedDict(), ncattrs=OrderedDict()) + class _SaveDim(_SlotsHolder): _typename = 'SaveDim' __slots__ = ('name', 'size', 'is_unlimited') @@ -1377,7 +1381,7 @@ def _create_cf_dimensions(self, cube, dimension_names, for dim_name in dimension_names: if dim_name not in self._save.dims: unlimited = dim_name in unlimited_dim_names - if unlimited : + if unlimited: size = None else: size = self._existing_dim[dim_name] @@ -2668,7 +2672,7 @@ def _make_variable_in_dataset(self, var): return nc_var def _write_variable_values_in_dataset(self, nc_var, var, - write_axis=None, write_offset=None): + write_axis=None, write_offset=0): """ Write data values into a file variable. @@ -2684,7 +2688,7 @@ def _write_variable_values_in_dataset(self, nc_var, var, Kwargs: * write_axis (int): - * write_offset (int): + * offset (int): Keys to apply when writing to the target variable. (Used for appending along a dimension). 
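Taken together, these changes are intended to let a second save call extend an
existing file along its (single) unlimited dimension.  A sketch of the intended
usage -- the filename, coordinate name and cube variables are illustrative:

    import iris

    # Initial write : make 'time' the unlimited dimension.
    iris.save(cube_part1, 'out.nc', unlimited_dimensions=['time'])

    # Later : extend the same file along 'time'.
    iris.save(cube_part2, 'out.nc', append=True)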
@@ -2700,7 +2704,7 @@ def _write_variable_values_in_dataset(self, nc_var, var, 'NETCDF3_64BIT')): data = as_concrete_data(data) # Construct indexing keys for the data target. - if write_offset is None: + if write_offset == 0: # Normal writes just use var[:] write_slices = slice(None) else: From 0c7584b2ab30eb293a386aa1b68c621226437c5f Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 9 Nov 2018 18:31:44 +0000 Subject: [PATCH 3/4] Hopefully working append checks (but no tests yet). --- lib/iris/fileformats/netcdf.py | 411 +++++++++++++++++++++++++++------ 1 file changed, 338 insertions(+), 73 deletions(-) diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index f843f5fc47..53651ae85a 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -44,6 +44,8 @@ import numpy.ma as ma from pyke import knowledge_engine +from cf_units import Unit + from iris._deprecation import warn_deprecated import iris.analysis from iris.aux_factory import HybridHeightFactory, HybridPressureFactory, \ @@ -2430,21 +2432,15 @@ def _append_to_dataset(self): """ # First check all is good to go. - filedata = self._prepare_append() - # Find the unlimited dimension + get its current length in the file. - dims_unlim = [dim for dim in filedata.dims.values() - if dim.is_unlimited] - assert len(dims_unlim) == 1 - file_unlim_dim = dims_unlim[0] + filedata, file_unlim_dim = self._prepare_append() unlim_dim_length = file_unlim_dim.size file_extend_dim_name = file_unlim_dim.name # Write to (extend) each variable mapped to the unlimited dim. for src_var in self._save.vars.values(): if file_extend_dim_name in src_var.dim_names: - tgt_var = filedata.vars[src_var.name] i_dim_unlim = src_var.dim_names.index(file_extend_dim_name) - nc_var = self._dataset.variables[tgt_var.name] + nc_var = src_var._file_var.data_source self._write_variable_values_in_dataset( nc_var, src_var, write_axis=i_dim_unlim, write_offset=unlim_dim_length) @@ -2456,18 +2452,34 @@ def _prepare_append(self): Align the save data variables and dimensions with those in the file by renaming them as needed. - Check all necessary matching properties between corresponding elements. + + Check all necessary matching properties between corresponding elements: Check there is exactly one unlimited dimension. Check that dimension variable appends will preserve monotonicity. + Check that the data-variables are 1-to-1 with the file, and all 'other' + variables are accounted for. + Variables mapped to the unlimited dimension should all have the same + length in the that dimension, i.e. same 'append size'. + Variables *not* mapped to the unlimited dimension should all match + 1-to-1 with variables in the file. + All variables are either 'data' variables, or are the subject of a + reference from another variable. If this call succeeds, an append can proceed, and should be safe. NOTE: general attributes in the new data are mostly ignored : They are not written to the file. However, structural attributes like 'coordinates' are used, and 'units' attributes are checked for a match. + Any packing controls are also retained as already in the file. """ def var_identity_name(var): + """ + Get the human-readable name of a variable (_SaveVar). + + NOTE: Used *only* for constructing user-messages. 
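            For example (a sketch): a variable actually named 'tas' but
            carrying standard_name = 'air_temperature' is reported in messages
            as 'air_temperature'; a variable with no name attributes is
            reported by its netCDF variable name.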
+ + """ if 'standard_name' in var.ncattrs: result = var.ncattrs['standard_name'] elif 'long_name' in var.ncattrs: @@ -2476,18 +2488,6 @@ def var_identity_name(var): result = var.name return result - def identity_match(var1, var2): - std, lng = 'standard_name', 'long_name' - if std in var1.ncattrs or std in var2.ncattrs: - match = (std in var1.ncattrs and std in var2.ncattrs and - var1.ncattrs[std] == var2.ncattrs[std]) - elif lng in var1.ncattrs or lng in var2.ncattrs: - match = (lng in var1.ncattrs and lng in var2.ncattrs and - var1.ncattrs[lng] == var2.ncattrs[lng]) - else: - match = var1.name == var2.name - return match - # Construct a simple representation of the existing data. ds = self._dataset filedata = _SaveFile() @@ -2503,22 +2503,57 @@ def identity_match(var1, var2): cf_var = _SaveVar(ds_var.name, dim_names=ds_var.dimensions, - ncattrs=attrs) + ncattrs=attrs, + data_source=ds_var) + _addbyname(filedata.vars, cf_var) - # Scan both dataset + save-data to add 'linkage' information. + savedata = self._save + + # Check that both savedata + filedata have a single unlimited dim, + # and grab it. + def get_single_unlimited_dim(cf_data, source_description): + """Find the unlimited dimension + ensure there is just one.""" + dims_unlim = [dim for dim in cf_data.dims.values() + if dim.is_unlimited] + n_unlim = len(dims_unlim) + if n_unlim != 1: + if n_unlim == 0: + fail_reason = 'no' + else: + fail_reason = 'too many ({})'.format(n_unlim) + msg = ('Append failure : {} has {} unlimited ' + 'dimensions : can only have exactly 1.') + msg = msg.format(source_description, fail_reason) + raise ValueError(msg) + return dims_unlim[0] + + file_unlimited_dim = get_single_unlimited_dim(filedata, 'source file') + save_unlimited_dim = get_single_unlimited_dim(savedata, 'save data') + + # Label all the links in both savedata + filedata. + # This enables us to identify variables like aux-coords and + # cell-methods through relationships, not just their names. # Linkage occurs where a variable attribute names other variables, and # also when a variable uses dimensions with dimension variables. variable_linkage_attrs = ('bounds', 'coordinates', 'grid_mapping', 'cell_measures', 'ancillary_variables') def label_links(cf_data, user_datatype_info): + def addlink(linksdict, name, item): + """ + Add item to a named list in a "links dictionary", + creating the named list if not yet present. + + """ + links_list = linksdict.setdefault(name, []) + links_list.append(item) + # Create empty links on all variables. for cf_var in cf_data.vars.values(): cf_var._linksto = {} cf_var._linkedfrom = {} for cf_var in cf_data.vars.values(): - linksto = cf_var._linksto # Make links where variable attributes name other variables. # Note: all links are made both 'up' and 'down' for attrname in variable_linkage_attrs: @@ -2527,81 +2562,311 @@ def label_links(cf_data, user_datatype_info): target_names = target_names.split(' ') for target_name in target_names: if target_name not in cf_data.vars: - msg = ('Missing linked variable in {} : ' - 'variable property {}.{} contains {}, ' - 'which is not a variable in the ' - 'dataset.') + msg = ( + 'Append failure : missing linked variable ' + 'in {} : variable property {}.{} is {!s}, ' + 'but there is no "{}" variable in the ' + 'dataset.') msg = msg.format(user_datatype_info, cf_var.name, attrname, + target_names, target_name) raise ValueError(msg) # Make links both "up" and "down". 
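# (Sketch of the outcome: for a data variable 'tas' with
#  tas:coordinates = "forecast_period" and a 'time' dimension coordinate,
#  the labelling would end up roughly as
#      tas._linksto      == {'coordinates': [<forecast_period var>],
#                            '_dim': [<time var>]}
#      time._linkedfrom  == {'_dim': [<tas var>]}
#  assuming those variables all exist in the dataset.)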
target_var = cf_data.vars[target_name] - if attrname not in linksto: - linksto[attrname] = [] - linksto[attrname].append(target_var) - linkedfrom = target_var._linkedfrom - if attrname not in linkedfrom: - linkedfrom[attrname] = [] - linkedfrom[attrname].append(target_var) + addlink(cf_var._linksto, attrname, target_var) + addlink(target_var._linkedfrom, attrname, cf_var) + + # Also add links due to dimension usage. for dim_name in cf_var.dim_names: - # Find+label the dimension variable (i.e. coord), if any. dim_var = cf_data.vars.get(dim_name, None) if dim_var is not None: - if '_dim' not in linksto: - linksto['_dim'] = [] - linksto['_dim'].append(dim_var) - linkedfrom = dim_var._linkedfrom - if '_dim' not in linkedfrom: - linkedfrom['_dim'] = [] - linkedfrom['_dim'].append(cf_var) + # Note: there may *not* be a dim-coord for the dim. + addlink(cf_var._linksto, '_dim', dim_var) + addlink(dim_var._linkedfrom, '_dim', cf_var) - savedata = self._save label_links(filedata, 'existing file') label_links(savedata, 'data to be appended') # Get all variables in the existing file. - file_vars = set(filedata.vars.values()) - - # Identify + exclude the dimension variables. - file_dim_vars = set(file_var for file_var in file_vars - if file_var.name in filedata.dims) - file_vars -= file_dim_vars + all_file_vars = set(filedata.vars.values()) # Get all variables in the save data. - save_vars = set(self._save.vars.values()) # A copy ! + all_save_vars = set(self._save.vars.values()) # A copy ! - # Identify + exclude all the dimension variables in the save data. - # NOTE: at this point, we don't judge which is which. That will be - # done later, by matching usage in variables. - save_dim_vars = set(save_var for save_var in save_vars - if save_var.name in savedata.dims.keys()) - save_vars -= save_dim_vars - - # Get a list of the 'primary' variables. + # Get a list of the 'primary' variables (cube data in the save). primary_save_vars = set( - save_var for save_var in save_vars + save_var for save_var in all_save_vars if save_var.controls['var_type'] == 'data-var') - save_vars -= primary_save_vars - # Identify each primary variable with one in the file, by name. - for save_var in primary_save_vars: - ident = var_identity_name(save_var) - matches = [file_var - for file_var in filedata.vars.values() + # Identify save variables with file variables, using the metadata in + # preference to matching by variable names only ... + + def units_match(units_str_1, units_str_2): + # Compare unit strings *AS* units, if valid, else string-compare. + try: + unit1 = Unit(units_str_1) + unit2 = Unit(units_str_2) + except ValueError: + unit1, unit2 = units_str_1, units_str_2 + return unit1 == unit2 + + def identity_match(v_save, v_file): + """ + Match a save-data variable to a file variable. + + Match by name attributes or actual variable name. + If existing identification is recorded, check it. + If no existing identification recorded, perform units equivalence + checking, but do *not* record the new identification : the caller + should do that. + + """ + std, lng = 'standard_name', 'long_name' + if std in v_save.ncattrs or std in v_file.ncattrs: + # If either has 'standard_name', compare those. + match = (std in v_save.ncattrs and std in v_file.ncattrs and + v_save.ncattrs[std] == v_file.ncattrs[std]) + elif lng in v_save.ncattrs or lng in v_file.ncattrs: + # ELSE if either has 'long_name', compare those. 
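# (Overall, the preference sketched here is: compare standard_name if either
#  side has one, else long_name if either side has one, else the raw variable
#  names.  Units are compared as cf_units Units rather than strings, so e.g.
#  units_match('m', 'meter') is expected to be True.)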
+ match = (lng in v_save.ncattrs and lng in v_file.ncattrs and + v_save.ncattrs[lng] == v_file.ncattrs[lng]) + else: + # ELSE compare actual variable names. + match = v_save.name == v_file.name + + if match: + # If we already have an association, check it is the same. + linked = (hasattr(v_save, '_file_var') or + hasattr(v_file, '_save_var')) + if linked: + if (getattr(v_save, '_file_var', None) != v_file or + getattr(v_file, '_save_var', None) != v_save): + msg = ('Append failure : save data variable "{}" ' + 'appears to match file variable "{}", but ' + 'file variable "{}" also matches save data ' + 'variable "{}".') + msg = msg.format( + var_identity_name(v_save), + var_identity_name(v_file), + var_identity_name(v_file._save_var)) + raise ValueError(msg) + else: + # New identification found : check some other aspects. + # Check shapes match. + save_dims = [dim.size + for dim in [savedata.dims[dim_name] + for dim_name in v_save.dim_names] + if dim is not save_unlimited_dim] + file_dims = [dim.size + for dim in [filedata.dims[dim_name] + for dim_name in v_file.dim_names] + if dim is not file_unlimited_dim] + if (save_dims != file_dims): + msg = ('Append failure : Shapes of "{}" in save ' + 'data and "{}" in the file are not ' + 'equivalent : "{}" != "{}".') + msg = msg.format(var_identity_name(v_save), + var_identity_name(v_file), + save_dims, + file_dims) + raise ValueError(msg) + + # Check units, if any. + if ('units' in v_save.ncattrs or + 'units' in v_file.ncattrs): + save_ut_str, file_ut_str = ( + var.ncattrs['units'].value + for var in (v_save, v_file)) + if not units_match(save_ut_str, file_ut_str): + msg = ('Append failure : Units of "{}" in save ' + 'data and "{}" in the file are not ' + 'equivalent : "{}" != "{}".') + msg = msg.format(var_identity_name(v_save), + var_identity_name(v_file), + save_ut_str, file_ut_str) + raise ValueError(msg) + + return match + + def find_single_match(save_var, candidate_file_vars, element_type): + matches = [file_var for file_var in candidate_file_vars if identity_match(save_var, file_var)] n_matches = len(matches) if n_matches != 1: - fail_reason = 'no' if n_matches == 0 else 'too many' + ident = var_identity_name(save_var) + if n_matches == 0: + fail_reason = 'no' + else: + fail_reason = 'too many ({})'.format(n_matches) msg = ('Append failure : {} variables found in original file ' - 'matching source cube "{}"') - raise ValueError(msg.format(fail_reason, ident)) - file_var = matches[0] - # "Label" the save variables with the matching file variable name. - save_var.name = file_var.name + 'matching source {} "{}"') + raise ValueError(msg.format(fail_reason, element_type, ident)) + + return matches[0] + + # Identify each primary variable in the save data with a variable in + # the file, matching by cf-meaningful name attributes in preference to + # the actual variable names. + for save_var in primary_save_vars: + file_var = find_single_match(save_var, all_file_vars, 'cube data') + # Record the matching file variable + vice-versa + save_var._file_var = file_var + file_var._save_var = save_var + + # Starting with the primary variables, extend the variable + # identification process over all links from each variable, recursing + # until all done. + new_matches = primary_save_vars.copy() + all_matches = set() + while new_matches: + all_matches |= new_matches + scan_new_matches = new_matches.copy() + new_matches = set() + for src_var in scan_new_matches: + # Do identify on link groups of each 'type'. 
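# e.g. having matched the data variable for a cube, its '_dim' links (the
# dimension coordinates) and any 'coordinates' / 'bounds' / 'grid_mapping'
# links are matched in the next pass, then their own links in turn, until
# no new identifications appear (a sketch of the intended walk).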
+ for linktype in src_var._linksto: + src_links = src_var._linksto[linktype] + tgt_links = src_var._file_var._linksto[linktype] + # Search to match these target vars 1-2-1 to source vars. + assert len(tgt_links) == len(src_links) + for src_link in src_links: + tgt_link = find_single_match(src_link, tgt_links, + linktype) + if not hasattr(src_link, '_file_var'): + # Found a new variable identity. + new_matches.add(src_link) + src_link._file_var = tgt_link + tgt_link._save_var = src_link + else: +# if src_link._file_var != tgt_link: +# # This can't happen, as it means src matches +# # >1 target, which we already excluded. +# assert(0) + if tgt_link._save_var != src_link: + msg = ( + 'Append failure : file data variable "{}" ' + 'appears to match save data variable ' + '"{}", but also matches "{}".') + msg = msg.format( + var_identity_name(tgt_link), + var_identity_name(src_link), + var_identity_name(src_link._file_var)) + raise ValueError(msg) + + # Check that all save-vars are now matched to a file-var. + # NOTE: *including* dimensions, which should all appear somewhere in + # the '_dim' references. + non_id_vars = [var for var in all_save_vars + if var._file_var is None] + if len(non_id_vars) > 0: + msg = ('Append failure : save data variable(s) have no match in ' + 'the existing file : ({})') + msg = msg.format(', '.join(var_identity_name(var) + for var in non_id_vars)) + raise ValueError(msg) + + # NOTE: at this point, it is ok to have additional variables in the + # file not identified with a save variable. + # But ... we *must* have a save variable for all those file variables + # *which map to the unlimited dimension*. + for var in all_file_vars: + if (file_unlimited_dim.name in var.dim_names and + not hasattr(var, '_save_var')): + msg = ('Append failure : variable "{}" in the existing file ' + 'is indexed to the unlimited dimension, but there is ' + 'no corresponding content in the saved cubes.') + msg = msg.format(var_identity_name(var)) + raise ValueError(msg) + + # Get the list of save variables we are going to actually write out. + save_vars = [var for var in all_save_vars + if save_unlimited_dim.name in var.dim_names] + + # Check that all save variable dimensions correspond correctly with + # those in the file. + # For this we scan through all the save variable dimensions, labelling + # both save and and file dimensions in the order they are encountered, + # and checking that the references are everywhere the same between the + # save and file variables. + i_next_dim = 0 + for save_var in all_save_vars: + file_var = save_var._file_var + # N.B. we already checked that var *shapes* match. + for save_dimname, file_dimname in zip( + save_var.dim_names, file_var.dim_names): + save_dim = savedata.dims[save_dimname] + file_dim = filedata.dims[file_dimname] + save_i = getattr(save_dim, '_i_dim', None) + file_i = getattr(file_dim, '_i_dim', None) + if (save_i is None and file_i is not None): + # Newly encountered dim in both save and file. + save_dim._i_dim = i_next_dim + file_dim._i_dim = i_next_dim + i_next_dim += 1 + else: + # At least one already seen : check both seen + match. + if (save_i != file_i): + # N.B. (includes 'None' values for unseen). 
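# (Illustrative intent: this ordering check is meant to catch dimensions that
#  are used in a different order in the save data and the file, e.g. a save
#  variable declared as tas(time, lat, lon) matched against a file variable
#  stored as tas(time, lon, lat).)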
+ msg = ('Append failure : the dimensions of variable ' + '{}({}) in the save data do not correspond with ' + 'those of variable {}({}) in the existing ' + 'file.') + msg = msg.format( + var_identity_name(save_var), + save_var.dim_names, + var_identity_name(file_var), + file_var.dim_names) + raise ValueError(msg) + + # Check that all the new data is the same length along the unlimited + # dimension. + save_length = None + for var in save_vars: + i_dim = var.dim_names.index(save_unlimited_dim.name) + this_length = var.data_source.shape[i_dim] + if save_length is None: + save_length = this_length + elif this_length != save_length: + save_dim_var = savedata.vars[save_unlimited_dim.name] + save_dimname = var_identity_name(save_dim_var) + msg = ( + 'Append failure : source data does not all have the ' + 'same length in the append dimension "{}" : {} != {}.') + msg = msg.format(save_dimname, this_length, save_length) + raise ValueError(msg) + + # Check that any dimension coordinate on the unlimited dimension will + # remain monotonic in the append. + file_unlim_dimvar = filedata.vars.get(file_unlimited_dim.name, None) + if file_unlim_dimvar: + file_unlim_data = file_unlim_dimvar.data_source + save_unlim_data = file_unlim_dimvar._save_var.data_source + # Existing and new dimension values will be (strictly) monotonic. + # Given that, check that the combination still will be ... + end_old_data = file_unlim_data[-2:] + start_new_data = save_unlim_data[:2] + end_old_data, start_new_data = ( + as_concrete_data(data) + for data in (end_old_data, start_new_data)) + diffs = np.diff(np.concatenate((end_old_data, start_new_data))) + mindiff, maxdiff = np.min(diffs), np.max(diffs) + if mindiff * maxdiff <= 1.0e-9: + # NOTE: bit tricky this ... + # Triggers if *either* there are diffs both above + below 0, + # *or* one of them is close to 0 --> not *strictly* monotonic. + msg = ('Append failure : append of new to old coordinate ' + 'values along the unlimited "{}" dimension will not be ' + 'monotonic : old values = ..., {} : ' + 'new values = {}, ...') + old_vals_str = ', '.join(str(val) for val in list(end_old_data)) + new_vals_str = ', '.join(str(val) for val in list(start_new_data)) + msg = msg.format(var_identity_name(file_unlim_dimvar), + old_vals_str, new_vals_str) + raise ValueError(msg) # Return our representation of the original file. - return filedata + return filedata, file_unlimited_dim def _make_variable_in_dataset(self, var): """ From a6e86dce88e721c084c96c4c43d13225fc5289fc Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Tue, 27 Nov 2018 16:46:56 +0000 Subject: [PATCH 4/4] Fix debug code for Python3. --- lib/iris/fileformats/netcdf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index 53651ae85a..760211b389 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -1095,8 +1095,12 @@ def __exit__(self, type, value, traceback): import subprocess import sys cmd = 'ncdump -h ' + output_path - lines = subprocess.check_output(cmd, shell=True) - lines = lines.split('\n') + chars = subprocess.check_output(cmd, shell=True) + # Convert bytes into strings in Python 2/3 compatible way. + if (not isinstance(chars, six.string_types) and + hasattr(chars, 'decode')): + chars = chars.decode() + lines = chars.split('\n') lines = ['netcdf [[XXX]] {'] + \ lines[1:] # replace first with filepath outlines = ['',
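For reference, the monotonicity guard in _prepare_append above inspects only
the join between old and new coordinate values.  A numeric sketch with
illustrative time values:

    existing file time : ..., 12, 18       (end_old_data == [12, 18])
    appended cube time : 24, 30, ...       (start_new_data == [24, 30])
    diffs of [12, 18, 24, 30] are [6, 6, 6]    --> min*max > 0, append allowed

    appended cube time : 6, 30, ...
    diffs of [12, 18, 6, 30] are [6, -12, 24]  --> min*max < 0, append rejected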