From 2473d501535a32c7c9b414844ba5f378b1fb1f76 Mon Sep 17 00:00:00 2001 From: gsheni Date: Mon, 25 Mar 2024 14:47:23 -0400 Subject: [PATCH] use single quotes --- deepecho/__init__.py | 14 +- deepecho/demo.py | 4 +- deepecho/models/__init__.py | 2 +- deepecho/models/base.py | 44 +++---- deepecho/models/basic_gan.py | 80 ++++++------ deepecho/models/par.py | 194 ++++++++++++++-------------- deepecho/sequences.py | 6 +- pyproject.toml | 4 +- tasks.py | 66 +++++----- tests/integration/test_basic_gan.py | 58 ++++----- tests/integration/test_par.py | 68 +++++----- tests/test_tasks.py | 20 +-- tests/unit/test_sequences.py | 154 +++++++++++----------- 13 files changed, 358 insertions(+), 356 deletions(-) diff --git a/deepecho/__init__.py b/deepecho/__init__.py index 4bc124a..5cc9aa5 100644 --- a/deepecho/__init__.py +++ b/deepecho/__init__.py @@ -1,16 +1,16 @@ """Top-level package for DeepEcho.""" -__author__ = "DataCebo, Inc." -__email__ = "info@sdv.dev" -__version__ = "0.5.1.dev0" -__path__ = __import__("pkgutil").extend_path(__path__, __name__) +__author__ = 'DataCebo, Inc.' +__email__ = 'info@sdv.dev' +__version__ = '0.5.1.dev0' +__path__ = __import__('pkgutil').extend_path(__path__, __name__) from deepecho.demo import load_demo from deepecho.models.basic_gan import BasicGANModel from deepecho.models.par import PARModel __all__ = [ - "load_demo", - "BasicGANModel", - "PARModel", + 'load_demo', + 'BasicGANModel', + 'PARModel', ] diff --git a/deepecho/demo.py b/deepecho/demo.py index 223a04a..229a39b 100644 --- a/deepecho/demo.py +++ b/deepecho/demo.py @@ -4,11 +4,11 @@ import pandas as pd -_DATA_PATH = os.path.join(os.path.dirname(__file__), "data") +_DATA_PATH = os.path.join(os.path.dirname(__file__), 'data') def load_demo(): """Load the demo DataFrame.""" return pd.read_csv( - os.path.join(_DATA_PATH, "demo.csv"), parse_dates=["date"] + os.path.join(_DATA_PATH, 'demo.csv'), parse_dates=['date'] ) diff --git a/deepecho/models/__init__.py b/deepecho/models/__init__.py index 97432bf..7880287 100644 --- a/deepecho/models/__init__.py +++ b/deepecho/models/__init__.py @@ -3,4 +3,4 @@ from deepecho.models.basic_gan import BasicGANModel from deepecho.models.par import PARModel -__all__ = ["PARModel", "BasicGANModel"] +__all__ = ['PARModel', 'BasicGANModel'] diff --git a/deepecho/models/base.py b/deepecho/models/base.py index d65d631..1bfda64 100644 --- a/deepecho/models/base.py +++ b/deepecho/models/base.py @@ -29,19 +29,19 @@ def _validate(sequences, context_types, data_types): See `fit`. 
""" dtypes = set([ - "continuous", - "categorical", - "ordinal", - "count", - "datetime", + 'continuous', + 'categorical', + 'ordinal', + 'count', + 'datetime', ]) assert all(dtype in dtypes for dtype in context_types) assert all(dtype in dtypes for dtype in data_types) for sequence in sequences: - assert len(sequence["context"]) == len(context_types) - assert len(sequence["data"]) == len(data_types) - lengths = [len(x) for x in sequence["data"]] + assert len(sequence['context']) == len(context_types) + assert len(sequence['data']) == len(data_types) + lengths = [len(x) for x in sequence['data']] assert len(set(lengths)) == 1 def fit_sequences(self, sequences, context_types, data_types): @@ -93,15 +93,15 @@ def _get_data_types(data, data_types, columns): else: dtype = data[column].dtype kind = dtype.kind - if kind in "fiud": - dtypes_list.append("continuous") - elif kind in "OSUb": - dtypes_list.append("categorical") - elif kind == "M": - dtypes_list.append("datetime") + if kind in 'fiud': + dtypes_list.append('continuous') + elif kind in 'OSUb': + dtypes_list.append('categorical') + elif kind == 'M': + dtypes_list.append('datetime') else: error = ( - f"Unsupported data_type for column {column}: {dtype}" + f'Unsupported data_type for column {column}: {dtype}' ) raise ValueError(error) @@ -147,18 +147,18 @@ def fit( """ if not entity_columns and segment_size is None: raise TypeError( - "If the data has no `entity_columns`, `segment_size` must be given." + 'If the data has no `entity_columns`, `segment_size` must be given.' ) if segment_size is not None and not isinstance(segment_size, int): if sequence_index is None: raise TypeError( - "`segment_size` must be of type `int` if " - "no `sequence_index` is given." + '`segment_size` must be of type `int` if ' + 'no `sequence_index` is given.' ) - if data[sequence_index].dtype.kind != "M": + if data[sequence_index].dtype.kind != 'M': raise TypeError( - "`segment_size` must be of type `int` if " - "`sequence_index` is not a `datetime` column." + '`segment_size` must be of type `int` if ' + '`sequence_index` is not a `datetime` column.' 
) segment_size = pd.to_timedelta(segment_size) @@ -237,7 +237,7 @@ def sample(self, num_entities=None, context=None, sequence_length=None): if context is None: if num_entities is None: raise TypeError( - "Either context or num_entities must be not None" + 'Either context or num_entities must be not None' ) context = self._context_values.sample(num_entities, replace=True) diff --git a/deepecho/models/basic_gan.py b/deepecho/models/basic_gan.py index ed05a09..734dafa 100644 --- a/deepecho/models/basic_gan.py +++ b/deepecho/models/basic_gan.py @@ -174,22 +174,22 @@ def __init__( self._hidden_size = hidden_size if not cuda or not torch.cuda.is_available(): - device = "cpu" + device = 'cpu' elif isinstance(cuda, str): device = cuda else: - device = "cuda" + device = 'cuda' self._device = torch.device(device) self._verbose = verbose - LOGGER.info("%s instance created", self) + LOGGER.info('%s instance created', self) def __repr__(self): """Return a representation of the class object.""" return ( - f"{self.__class__.__name__}(epochs={self._epochs}, latent_size={self._latent_size}," - f"hidden_size={self._hidden_size}, gen_lr={self._gen_lr}, dis_lr={self._dis_lr}," + f'{self.__class__.__name__}(epochs={self._epochs}, latent_size={self._latent_size},' + f'hidden_size={self._hidden_size}, gen_lr={self._gen_lr}, dis_lr={self._dis_lr},' f"cuda='{self._device}', verbose={self._verbose})" ) @@ -221,25 +221,25 @@ def _index_map(columns, types): mapping = {} for column, column_type in enumerate(types): values = columns[column] - if column_type in ("continuous", "count"): + if column_type in ('continuous', 'count'): mapping[column] = { - "type": column_type, - "min": np.min(values), - "max": np.max(values), - "indices": (dimensions, dimensions + 1), + 'type': column_type, + 'min': np.min(values), + 'max': np.max(values), + 'indices': (dimensions, dimensions + 1), } dimensions += 2 - elif column_type in ("categorical", "ordinal"): + elif column_type in ('categorical', 'ordinal'): indices = {} for value in set(values): indices[value] = dimensions dimensions += 1 - mapping[column] = {"type": column_type, "indices": indices} + mapping[column] = {'type': column_type, 'indices': indices} else: - raise ValueError(f"Unsupported type: {column_type}") + raise ValueError(f'Unsupported type: {column_type}') return mapping, dimensions @@ -252,7 +252,7 @@ def _analyze_data(self, sequences, context_types, data_types): - Index map and dimensions for the data. 
""" sequence_lengths = np.array([ - len(sequence["data"][0]) for sequence in sequences + len(sequence['data'][0]) for sequence in sequences ]) self._max_sequence_length = np.max(sequence_lengths) self._fixed_length = ( @@ -263,7 +263,7 @@ def _analyze_data(self, sequences, context_types, data_types): context = [] for column in range(len(context_types)): context.append([ - sequence["context"][column] for sequence in sequences + sequence['context'][column] for sequence in sequences ]) self._context_map, self._context_size = self._index_map( @@ -274,7 +274,7 @@ def _analyze_data(self, sequences, context_types, data_types): data = [] for column in range(len(data_types)): data.append( - sum([sequence["data"][column] for sequence in sequences], []) + sum([sequence['data'][column] for sequence in sequences], []) ) self._data_map, self._data_size = self._index_map(data, data_types) @@ -284,13 +284,13 @@ def _analyze_data(self, sequences, context_types, data_types): @staticmethod def _normalize(tensor, value, properties): """Normalize the value between 0 and 1 and flag nans.""" - value_idx, missing_idx = properties["indices"] + value_idx, missing_idx = properties['indices'] if pd.isnull(value): tensor[value_idx] = 0.0 tensor[missing_idx] = 1.0 else: - column_min = properties["min"] - column_range = properties["max"] - column_min + column_min = properties['min'] + column_range = properties['max'] - column_min offset = value - column_min tensor[value_idx] = 2.0 * offset / column_range - 1.0 tensor[missing_idx] = 0.0 @@ -298,13 +298,13 @@ def _normalize(tensor, value, properties): @staticmethod def _denormalize(tensor, row, properties, round_value): """Denormalize previously normalized values, setting NaN values if necessary.""" - value_idx, missing_idx = properties["indices"] + value_idx, missing_idx = properties['indices'] if tensor[row, 0, missing_idx] > 0.5: return None normalized = tensor[row, 0, value_idx].item() - column_min = properties["min"] - column_range = properties["max"] - column_min + column_min = properties['min'] + column_range = properties['max'] - column_min denormalized = (normalized + 1) * column_range / 2.0 + column_min if round_value: @@ -315,14 +315,14 @@ def _denormalize(tensor, row, properties, round_value): @staticmethod def _one_hot_encode(tensor, value, properties): """Update the index that corresponds to the value to 1.0.""" - value_index = properties["indices"][value] + value_index = properties['indices'][value] tensor[value_index] = 1.0 @staticmethod def _one_hot_decode(tensor, row, properties): """Obtain the category that corresponds to the highest one-hot value.""" - max_value = float("-inf") - for category, idx in properties["indices"].items(): + max_value = float('-inf') + for category, idx in properties['indices'].items(): value = tensor[row, 0, idx] if value > max_value: max_value = value @@ -332,10 +332,10 @@ def _one_hot_decode(tensor, row, properties): def _value_to_tensor(self, tensor, value, properties): """Update the tensor according to the value and properties.""" - column_type = properties["type"] - if column_type in ("continuous", "count"): + column_type = properties['type'] + if column_type in ('continuous', 'count'): self._normalize(tensor, value, properties) - elif column_type in ("categorical", "ordinal"): + elif column_type in ('categorical', 'ordinal'): self._one_hot_encode(tensor, value, properties) else: @@ -381,17 +381,17 @@ def _tensor_to_data(self, tensor): data = [None] * len(self._data_map) for column, properties in self._data_map.items(): - 
column_type = properties["type"] + column_type = properties['type'] column_data = [] data[column] = column_data for row in range(sequence_length): - if column_type in ("continuous", "count"): - round_value = column_type == "count" + if column_type in ('continuous', 'count'): + round_value = column_type == 'count' value = self._denormalize( tensor, row, properties, round_value=round_value ) - elif column_type in ("categorical", "ordinal"): + elif column_type in ('categorical', 'ordinal'): value = self._one_hot_decode(tensor, row, properties) else: raise ValueError() # Theoretically unreachable @@ -414,15 +414,15 @@ def _build_tensor(self, transform, sequences, key, dim): def _transform(self, data): for properties in self._data_map.values(): - column_type = properties["type"] - if column_type in ("continuous", "count"): - value_idx, missing_idx = properties["indices"] + column_type = properties['type'] + if column_type in ('continuous', 'count'): + value_idx, missing_idx = properties['indices'] data[:, :, value_idx] = torch.tanh(data[:, :, value_idx]) data[:, :, missing_idx] = torch.sigmoid( data[:, :, missing_idx] ) - elif column_type in ("categorical", "ordinal"): - indices = list(properties["indices"].values()) + elif column_type in ('categorical', 'ordinal'): + indices = list(properties['indices'].values()) data[:, :, indices] = torch.nn.functional.softmax( data[:, :, indices] ) @@ -548,10 +548,10 @@ def fit_sequences(self, sequences, context_types, data_types): self._analyze_data(sequences, context_types, data_types) data = self._build_tensor( - self._data_to_tensor, sequences, "data", dim=1 + self._data_to_tensor, sequences, 'data', dim=1 ) context = self._build_tensor( - self._context_to_tensor, sequences, "context", dim=0 + self._context_to_tensor, sequences, 'context', dim=0 ) data_context = _expand_context(data, context) @@ -580,7 +580,7 @@ def fit_sequences(self, sequences, context_types, data_types): d_loss = discriminator_score.item() g_loss = generator_score.item() iterator.set_description( - f"Epoch {epoch + 1} | D Loss {d_loss} | G Loss {g_loss}" + f'Epoch {epoch + 1} | D Loss {d_loss} | G Loss {g_loss}' ) def sample_sequence(self, context, sequence_length=None): diff --git a/deepecho/models/par.py b/deepecho/models/par.py index 9590da0..48a3094 100644 --- a/deepecho/models/par.py +++ b/deepecho/models/par.py @@ -111,22 +111,22 @@ def __init__(self, epochs=128, sample_size=1, cuda=True, verbose=True): self.sample_size = sample_size if not cuda or not torch.cuda.is_available(): - device = "cpu" + device = 'cpu' elif isinstance(cuda, str): device = cuda else: - device = "cuda" + device = 'cuda' self.device = torch.device(device) self.verbose = verbose - self.loss_values = pd.DataFrame(columns=["Epoch", "Loss"]) + self.loss_values = pd.DataFrame(columns=['Epoch', 'Loss']) - LOGGER.info("%s instance created", self) + LOGGER.info('%s instance created', self) def __repr__(self): """Return a representation of the class object.""" return ( - f"{self.__class__.__name__}(epochs={self.epochs}, sample_size={self.sample_size}," + f'{self.__class__.__name__}(epochs={self.epochs}, sample_size={self.sample_size},' f"cuda='{self.device}', verbose={self.verbose})" ) @@ -134,38 +134,38 @@ def _idx_map(self, x, t): idx = 0 idx_map = {} for i, t in enumerate(t): - if t == "continuous" or t == "datetime": + if t == 'continuous' or t == 'datetime': idx_map[i] = { - "type": t, - "mu": np.nanmean(x[i]), - "std": np.nanstd(x[i]), - "nulls": pd.isnull(x[i]).any(), - "indices": (idx, idx + 1, idx + 2), 
+ 'type': t, + 'mu': np.nanmean(x[i]), + 'std': np.nanstd(x[i]), + 'nulls': pd.isnull(x[i]).any(), + 'indices': (idx, idx + 1, idx + 2), } idx += 3 - elif t == "count": + elif t == 'count': idx_map[i] = { - "type": t, - "min": np.nanmin(x[i]), - "range": np.nanmax(x[i]) - np.nanmin(x[i]), - "nulls": pd.isnull(x[i]).any(), - "indices": (idx, idx + 1, idx + 2), + 'type': t, + 'min': np.nanmin(x[i]), + 'range': np.nanmax(x[i]) - np.nanmin(x[i]), + 'nulls': pd.isnull(x[i]).any(), + 'indices': (idx, idx + 1, idx + 2), } idx += 3 - elif t == "categorical" or t == "ordinal": - idx_map[i] = {"type": t, "indices": {}} + elif t == 'categorical' or t == 'ordinal': + idx_map[i] = {'type': t, 'indices': {}} idx += 1 for v in set(x[i]): if pd.isnull(v): v = None - idx_map[i]["indices"][v] = idx + idx_map[i]['indices'][v] = idx idx += 1 else: - raise ValueError(f"Unsupported type: {t}") + raise ValueError(f'Unsupported type: {t}') return idx_map, idx @@ -175,8 +175,8 @@ def _build(self, sequences, context_types, data_types): min_length = np.inf max_length = -np.inf for sequence in sequences: - sequence_data = sequence["data"] - sequence_context = sequence["context"] + sequence_data = sequence['data'] + sequence_context = sequence['context'] sequence_length = len(sequence_data[0]) min_length = min(min_length, sequence_length) max_length = max(max_length, sequence_length) @@ -192,12 +192,12 @@ def _build(self, sequences, context_types, data_types): self._ctx_map, self._ctx_dims = self._idx_map(contexts, context_types) self._data_map, self._data_dims = self._idx_map(data, data_types) - self._data_map["<TOKEN>"] = { - "type": "categorical", - "indices": { - "<START>": self._data_dims, - "<END>": self._data_dims + 1, - "<BODY>": self._data_dims + 2, + self._data_map['<TOKEN>'] = { + 'type': 'categorical', + 'indices': { + '<START>': self._data_dims, + '<END>': self._data_dims + 1, + '<BODY>': self._data_dims + 2, }, } self._data_dims += 3 @@ -207,45 +207,45 @@ def _data_to_tensor(self, data): X = [] x = torch.zeros(self._data_dims) - x[self._data_map["<TOKEN>"]["indices"]["<START>"]] = 1.0 + x[self._data_map['<TOKEN>']['indices']['<START>']] = 1.0 X.append(x) for i in range(seq_len): x = torch.zeros(self._data_dims) for key, props in self._data_map.items(): - if key == "<TOKEN>": - x[self._data_map["<TOKEN>"]["indices"]["<BODY>"]] = 1.0 + if key == '<TOKEN>': + x[self._data_map['<TOKEN>']['indices']['<BODY>']] = 1.0 - elif props["type"] in ["continuous", "timestamp"]: - mu_idx, sigma_idx, missing_idx = props["indices"] - if pd.isnull(data[key][i]) or props["std"] == 0: + elif props['type'] in ['continuous', 'timestamp']: + mu_idx, sigma_idx, missing_idx = props['indices'] + if pd.isnull(data[key][i]) or props['std'] == 0: x[mu_idx] = 0.0 else: - x[mu_idx] = (data[key][i] - props["mu"]) / props["std"] + x[mu_idx] = (data[key][i] - props['mu']) / props['std'] x[sigma_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(data[key][i]) else 0.0 - elif props["type"] in ["count"]: - r_idx, p_idx, missing_idx = props["indices"] - if pd.isnull(data[key][i]) or props["range"] == 0: + elif props['type'] in ['count']: + r_idx, p_idx, missing_idx = props['indices'] + if pd.isnull(data[key][i]) or props['range'] == 0: x[r_idx] = 0.0 else: - x[r_idx] = (data[key][i] - props["min"]) / props[ - "range" + x[r_idx] = (data[key][i] - props['min']) / props[ + 'range' ] x[p_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(data[key][i]) else 0.0 - elif props["type"] in [ - "categorical", - "ordinal", + elif props['type'] in [ + 'categorical', + 'ordinal', ]: # categorical value = data[key][i] if pd.isnull(value): value = None - x[props["indices"][value]] = 1.0
+ x[props['indices'][value]] = 1.0 else: raise ValueError() @@ -253,7 +253,7 @@ def _data_to_tensor(self, data): X.append(x) x = torch.zeros(self._data_dims) - x[self._data_map["<TOKEN>"]["indices"]["<END>"]] = 1.0 + x[self._data_map['<TOKEN>']['indices']['<END>']] = 1.0 X.append(x) return torch.stack(X, dim=0).to(self.device) @@ -264,31 +264,31 @@ def _context_to_tensor(self, context): x = torch.zeros(self._ctx_dims) for key, props in self._ctx_map.items(): - if props["type"] in ["continuous", "datetime"]: - mu_idx, sigma_idx, missing_idx = props["indices"] + if props['type'] in ['continuous', 'datetime']: + mu_idx, sigma_idx, missing_idx = props['indices'] x[mu_idx] = ( 0.0 - if (pd.isnull(context[key]) or props["std"] == 0) - else (context[key] - props["mu"]) / props["std"] + if (pd.isnull(context[key]) or props['std'] == 0) + else (context[key] - props['mu']) / props['std'] ) x[sigma_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(context[key]) else 0.0 - elif props["type"] in ["count"]: - r_idx, p_idx, missing_idx = props["indices"] + elif props['type'] in ['count']: + r_idx, p_idx, missing_idx = props['indices'] x[r_idx] = ( 0.0 - if (pd.isnull(context[key]) or props["range"] == 0) - else (context[key] - props["min"]) / props["range"] + if (pd.isnull(context[key]) or props['range'] == 0) + else (context[key] - props['min']) / props['range'] ) x[p_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(context[key]) else 0.0 - elif props["type"] in ["categorical", "ordinal"]: + elif props['type'] in ['categorical', 'ordinal']: value = context[key] if pd.isnull(value): value = None - x[props["indices"][value]] = 1.0 + x[props['indices'][value]] = 1.0 else: raise ValueError() @@ -334,8 +334,8 @@ def fit_sequences(self, sequences, context_types, data_types): X, C = [], [] self._build(sequences, context_types, data_types) for sequence in sequences: - X.append(self._data_to_tensor(sequence["data"])) - C.append(self._context_to_tensor(sequence["context"])) + X.append(self._data_to_tensor(sequence['data'])) + C.append(self._context_to_tensor(sequence['context'])) X = torch.nn.utils.rnn.pack_sequence(X, enforce_sorted=False).to( self.device ) @@ -348,11 +348,11 @@ def fit_sequences(self, sequences, context_types, data_types): iterator = tqdm(range(self.epochs), disable=(not self.verbose)) if self.verbose: - pbar_description = "Loss ({loss:.3f})" + pbar_description = 'Loss ({loss:.3f})' iterator.set_description(pbar_description.format(loss=0)) # Reset loss_values dataframe - self.loss_values = pd.DataFrame(columns=["Epoch", "Loss"]) + self.loss_values = pd.DataFrame(columns=['Epoch', 'Loss']) X_padded, seq_len = torch.nn.utils.rnn.pad_packed_sequence(X) for epoch in iterator: @@ -366,8 +366,8 @@ def fit_sequences(self, sequences, context_types, data_types): loss.backward() epoch_loss_df = pd.DataFrame({ - "Epoch": [epoch], - "Loss": [loss.item()], + 'Epoch': [epoch], + 'Loss': [loss.item()], }) if not self.loss_values.empty: self.loss_values = pd.concat([ @@ -407,8 +407,8 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): _, batch_size, _input_size = X_padded.shape for key, props in self._data_map.items(): - if props["type"] in ["continuous", "timestamp"]: - mu_idx, sigma_idx, missing_idx = props["indices"] + if props['type'] in ['continuous', 'timestamp']: + mu_idx, sigma_idx, missing_idx = props['indices'] mu = Y_padded[:, :, mu_idx] sigma = torch.nn.functional.softplus(Y_padded[:, :, sigma_idx]) missing = torch.nn.LogSigmoid()(Y_padded[:, :, missing_idx]) @@ -428,14 +428,14 @@ def _compute_loss(self, X_padded, Y_padded, seq_len):
(1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred)) ) - elif props["type"] in ["count"]: - r_idx, p_idx, missing_idx = props["indices"] + elif props['type'] in ['count']: + r_idx, p_idx, missing_idx = props['indices'] r = ( torch.nn.functional.softplus(Y_padded[:, :, r_idx]) - * props["range"] + * props['range'] ) p = torch.sigmoid(Y_padded[:, :, p_idx]) - x = X_padded[:, :, r_idx] * props["range"] + x = X_padded[:, :, r_idx] * props['range'] missing = torch.nn.LogSigmoid()(Y_padded[:, :, missing_idx]) for i in range(batch_size): @@ -457,8 +457,8 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): (1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred)) ) - elif props["type"] in ["categorical", "ordinal"]: - idx = list(props["indices"].values()) + elif props['type'] in ['categorical', 'ordinal']: + idx = list(props['indices'].values()) log_softmax = torch.nn.functional.log_softmax( Y_padded[:, :, idx], dim=2 ) @@ -480,41 +480,41 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): def _tensor_to_data(self, x): # Force CPU on x - x = x.to(torch.device("cpu")) + x = x.to(torch.device('cpu')) seq_len, batch_size, _ = x.shape assert batch_size == 1 data = [None] * (len(self._data_map) - 1) for key, props in self._data_map.items(): - if key == "<TOKEN>": + if key == '<TOKEN>': continue data[key] = [] for i in range(seq_len): - if props["type"] in ["continuous", "datetime"]: - mu_idx, _sigma_idx, missing_idx = props["indices"] - if (x[i, 0, missing_idx] > 0) and props["nulls"]: + if props['type'] in ['continuous', 'datetime']: + mu_idx, _sigma_idx, missing_idx = props['indices'] + if (x[i, 0, missing_idx] > 0) and props['nulls']: data[key].append(None) else: data[key].append( - x[i, 0, mu_idx].item() * props["std"] + props["mu"] + x[i, 0, mu_idx].item() * props['std'] + props['mu'] ) - elif props["type"] in ["count"]: - r_idx, _p_idx, missing_idx = props["indices"] - if x[i, 0, missing_idx] > 0 and props["nulls"]: + elif props['type'] in ['count']: + r_idx, _p_idx, missing_idx = props['indices'] + if x[i, 0, missing_idx] > 0 and props['nulls']: data[key].append(None) else: sample = ( - x[i, 0, r_idx].item() * props["range"] - + props["min"] + x[i, 0, r_idx].item() * props['range'] + + props['min'] ) data[key].append(int(sample)) - elif props["type"] in ["categorical", "ordinal"]: - ml_value, max_x = None, float("-inf") - for value, idx in props["indices"].items(): + elif props['type'] in ['categorical', 'ordinal']: + ml_value, max_x = None, float('-inf') + for value, idx in props['indices'].items(): if x[i, 0, idx] > max_x: max_x = x[i, 0, idx] ml_value = value @@ -532,8 +532,8 @@ def _sample_state(self, x): assert seq_len == 1 and batch_size == 1 for key, props in self._data_map.items(): - if props["type"] in ["continuous", "timestamp"]: - mu_idx, sigma_idx, missing_idx = props["indices"] + if props['type'] in ['continuous', 'timestamp']: + mu_idx, sigma_idx, missing_idx = props['indices'] mu = x[0, 0, mu_idx] sigma = torch.nn.functional.softplus(x[0, 0, sigma_idx]) dist = torch.distributions.normal.Normal(mu, sigma) @@ -552,11 +552,11 @@ def _sample_state(self, x): dist.log_prob(x[0, 0, missing_idx]) ) - elif props["type"] in ["count"]: - r_idx, p_idx, missing_idx = props["indices"] + elif props['type'] in ['count']: + r_idx, p_idx, missing_idx = props['indices'] r = ( torch.nn.functional.softplus(x[0, 0, r_idx]) - * props["range"] + * props['range'] ) p = torch.sigmoid(x[0, 0, p_idx]) dist = torch.distributions.negative_binomial.NegativeBinomial( r, p ) @@ -565,7 +565,7 @@ def _sample_state(self, x):
x[0, 0, r_idx] = dist.sample() x[0, 0, p_idx] = 0.0 log_likelihood += torch.sum(dist.log_prob(x[0, 0, r_idx])) - x[0, 0, r_idx] /= props["range"] + x[0, 0, r_idx] /= props['range'] dist = torch.distributions.Bernoulli( torch.sigmoid(x[0, 0, missing_idx]) ) @@ -576,8 +576,8 @@ def _sample_state(self, x): dist.log_prob(x[0, 0, missing_idx]) ) - elif props["type"] in ["categorical", "ordinal"]: - idx = list(props["indices"].values()) + elif props['type'] in ['categorical', 'ordinal']: + idx = list(props['indices'].values()) p = torch.nn.functional.softmax(x[0, 0, idx], dim=0) x_new = torch.zeros(p.size()).to(self.device) x_new.scatter_(dim=0, index=torch.multinomial(p, 1), value=1) @@ -593,7 +593,7 @@ def _sample_sequence(self, context, min_length, max_length): log_likelihood = 0.0 x = torch.zeros(self._data_dims).to(self.device) - x[self._data_map["<TOKEN>"]["indices"]["<START>"]] = 1.0 + x[self._data_map['<TOKEN>']['indices']['<START>']] = 1.0 x = x.unsqueeze(0).unsqueeze(0) for step in range(max_length): @@ -601,16 +601,16 @@ def _sample_sequence(self, context, min_length, max_length): x = torch.cat([x, next_x], dim=0) log_likelihood += ll if ( - next_x[0, 0, self._data_map["<TOKEN>"]["indices"]["<END>"]] + next_x[0, 0, self._data_map['<TOKEN>']['indices']['<END>']] > 0.0 ): if min_length <= step + 1 <= max_length: break # received end token next_x[ - 0, 0, self._data_map["<TOKEN>"]["indices"]["<BODY>"] + 0, 0, self._data_map['<TOKEN>']['indices']['<BODY>'] ] = 1.0 - next_x[0, 0, self._data_map["<TOKEN>"]["indices"]["<END>"]] = ( + next_x[0, 0, self._data_map['<TOKEN>']['indices']['<END>']] = ( 0.0 ) @@ -644,7 +644,7 @@ def sample_sequence(self, context, sequence_length=None): else: context = None - best_x, best_ll = None, float("-inf") + best_x, best_ll = None, float('-inf') for _ in range(self.sample_size): with torch.no_grad(): x, log_likelihood = self._sample_sequence( diff --git a/deepecho/sequences.py b/deepecho/sequences.py index 4889847..785bbe6 100644 --- a/deepecho/sequences.py +++ b/deepecho/sequences.py @@ -113,7 +113,7 @@ def _convert_to_dicts(segments, context_columns): context = segment[context_columns] if len(context.drop_duplicates()) > 1: raise ValueError( - "Context columns are not constant within each segment." + 'Context columns are not constant within each segment.' ) context = context.iloc[0].values @@ -122,7 +122,7 @@ def _convert_to_dicts(segments, context_columns): context = [] lists = [list(row) for _, row in segment.items()] - sequences.append({"context": context, "data": lists}) + sequences.append({'context': context, 'data': lists}) return sequences @@ -192,7 +192,7 @@ def assemble_sequences( if context_columns: if len(sequence[context_columns].drop_duplicates()) > 1: raise ValueError( - "Context columns are not constant within each entity." + 'Context columns are not constant within each entity.'
) entity_segments = segment_sequence( diff --git a/pyproject.toml b/pyproject.toml index 684d33b..d481e74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -183,6 +183,8 @@ ignore = [ "E501", "D107", # Missing docstring in __init__ "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 + "Q000", # Remove quotes + "Q003", # Remove quotes ] [tool.ruff.lint.pep8-naming] @@ -195,7 +197,7 @@ known-first-party = ["deepecho"] "__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"] [tool.ruff.format] -quote-style = "double" +quote-style = "single" indent-style = "space" preview = true diff --git a/tasks.py b/tasks.py index 82acf3c..4362402 100644 --- a/tasks.py +++ b/tasks.py @@ -13,47 +13,47 @@ from packaging.version import Version COMPARISONS = { - ">=": operator.ge, - ">": operator.gt, - "<": operator.lt, - "<=": operator.le, + '>=': operator.ge, + '>': operator.gt, + '<': operator.lt, + '<=': operator.le, } -if not hasattr(inspect, "getargspec"): +if not hasattr(inspect, 'getargspec'): inspect.getargspec = inspect.getfullargspec @task def check_dependencies(c): - c.run("python -m pip check") + c.run('python -m pip check') @task def integration(c): c.run( - "python -m pytest ./tests/integration --reruns 3 --cov=deepecho --cov-report=xml" + 'python -m pytest ./tests/integration --reruns 3 --cov=deepecho --cov-report=xml' ) @task def unit(c): - c.run("python -m pytest ./tests/unit --reruns 3") + c.run('python -m pytest ./tests/unit --reruns 3') def _get_minimum_versions(dependencies, python_version): min_versions = {} for dependency in dependencies: - if "@" in dependency: - name, url = dependency.split(" @ ") - min_versions[name] = f"{name} @ {url}" + if '@' in dependency: + name, url = dependency.split(' @ ') + min_versions[name] = f'{name} @ {url}' continue req = Requirement(dependency) - if ";" in dependency: + if ';' in dependency: marker = req.marker if marker and not marker.evaluate({ - "python_version": python_version + 'python_version': python_version }): continue # Skip this dependency if the marker does not apply to the current Python version @@ -62,26 +62,26 @@ def _get_minimum_versions(dependencies, python_version): ( spec.version for spec in req.specifier - if spec.operator in (">=", "==") + if spec.operator in ('>=', '==') ), None, ) if min_version: - min_versions[req.name] = f"{req.name}=={min_version}" + min_versions[req.name] = f'{req.name}=={min_version}' - elif "@" not in min_versions[req.name]: - existing_version = Version(min_versions[req.name].split("==")[1]) + elif '@' not in min_versions[req.name]: + existing_version = Version(min_versions[req.name].split('==')[1]) new_version = next( ( spec.version for spec in req.specifier - if spec.operator in (">=", "==") + if spec.operator in ('>=', '==') ), existing_version, ) if new_version > existing_version: min_versions[req.name] = ( - f"{req.name}=={new_version}" # Change when a valid newer version is found + f'{req.name}=={new_version}' # Change when a valid newer version is found ) return list(min_versions.values()) @@ -89,15 +89,15 @@ def _get_minimum_versions(dependencies, python_version): @task def install_minimum(c): - with open("pyproject.toml", "rb") as pyproject_file: + with open('pyproject.toml', 'rb') as pyproject_file: pyproject_data = tomli.load(pyproject_file) - dependencies = pyproject_data.get("project", {}).get("dependencies", []) - python_version = ".".join(map(str, sys.version_info[:2])) + dependencies = 
pyproject_data.get('project', {}).get('dependencies', []) + python_version = '.'.join(map(str, sys.version_info[:2])) minimum_versions = _get_minimum_versions(dependencies, python_version) if minimum_versions: - c.run(f'python -m pip install {" ".join(minimum_versions)}') + c.run(f'python -m pip install {' '.join(minimum_versions)}') @task @@ -110,39 +110,39 @@ def minimum(c): @task def readme(c): - test_path = Path("tests/readme_test") + test_path = Path('tests/readme_test') if test_path.exists() and test_path.is_dir(): shutil.rmtree(test_path) cwd = os.getcwd() os.makedirs(test_path, exist_ok=True) - shutil.copy("README.md", test_path / "README.md") + shutil.copy('README.md', test_path / 'README.md') os.chdir(test_path) - c.run("rundoc run --single-session python3 -t python3 README.md") + c.run('rundoc run --single-session python3 -t python3 README.md') os.chdir(cwd) shutil.rmtree(test_path) @task def tutorials(c): - for ipynb_file in glob.glob("tutorials/*.ipynb") + glob.glob( - "tutorials/**/*.ipynb" + for ipynb_file in glob.glob('tutorials/*.ipynb') + glob.glob( + 'tutorials/**/*.ipynb' ): - if ".ipynb_checkpoints" not in ipynb_file: + if '.ipynb_checkpoints' not in ipynb_file: c.run( ( - "jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 " + 'jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 ' f'--to=html --stdout "{ipynb_file}"' ), - hide="out", + hide='out', ) @task def lint(c): check_dependencies(c) - c.run("ruff check .") - c.run("ruff format . --check") + c.run('ruff check .') + c.run('ruff format . --check') def remove_readonly(func, path, _): diff --git a/tests/integration/test_basic_gan.py b/tests/integration/test_basic_gan.py index 0031556..c58d6a1 100644 --- a/tests/integration/test_basic_gan.py +++ b/tests/integration/test_basic_gan.py @@ -12,22 +12,22 @@ def test_basic(self): """Basic test for the ``BasicGANModel``.""" sequences = [ { - "context": [], - "data": [ + 'context': [], + 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], ], }, { - "context": [], - "data": [ + 'context': [], + 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], ], }, ] context_types = [] - data_types = ["continuous", "continuous"] + data_types = ['continuous', 'continuous'] model = BasicGANModel(epochs=10) model.fit_sequences(sequences, context_types, data_types) @@ -37,22 +37,22 @@ def test_conditional(self): """Test the ``BasicGANModel`` with conditional sampling.""" sequences = [ { - "context": [0], - "data": [ + 'context': [0], + 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], ], }, { - "context": [1], - "data": [ + 'context': [1], + 'data': [ [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], ], }, ] - context_types = ["categorical"] - data_types = ["continuous", "continuous"] + context_types = ['categorical'] + data_types = ['continuous', 'continuous'] model = BasicGANModel(epochs=10) model.fit_sequences(sequences, context_types, data_types) @@ -62,22 +62,22 @@ def test_mixed(self): """Test the ``BasicGANModel`` with mixed input data.""" sequences = [ { - "context": [0], - "data": [ + 'context': [0], + 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0, 1, 0, 1, 0, 1], ], }, { - "context": [1], - "data": [ + 'context': [1], + 'data': [ [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], [0, 1, 0, 1, 0, 1], ], }, ] - context_types = ["categorical"] - data_types = ["continuous", "categorical"] + context_types = ['categorical'] + data_types = ['continuous', 'categorical'] model = BasicGANModel(epochs=10) 
model.fit_sequences(sequences, context_types, data_types) @@ -87,22 +87,22 @@ def test_count(self): """Test the BasicGANModel with datatype ``count``.""" sequences = [ { - "context": [0.5], - "data": [ + 'context': [0.5], + 'data': [ [0, 5, 5, 3, 1, 1], [0, 1, 2, 1, 0, 1], ], }, { - "context": [1.1], - "data": [ + 'context': [1.1], + 'data': [ [1, 6, 6, 4, 2, 2], [0, 1, 0, 1, 0, 1], ], }, ] - context_types = ["continuous"] - data_types = ["count", "categorical"] + context_types = ['continuous'] + data_types = ['count', 'categorical'] model = BasicGANModel(epochs=10) model.fit_sequences(sequences, context_types, data_types) @@ -112,22 +112,22 @@ def test_variable_length(self): """Test ``BasicGANModel`` with variable data length.""" sequences = [ { - "context": [0], - "data": [ + 'context': [0], + 'data': [ [0, 5, 5, 3, 1, 1, 0], [0, 1, 2, 1, 0, 1, 2], ], }, { - "context": [1], - "data": [ + 'context': [1], + 'data': [ [1, 6, 6, 4, 2, 2], [0, 1, 0, 1, 0, 1], ], }, ] - context_types = ["count"] - data_types = ["count", "categorical"] + context_types = ['count'] + data_types = ['count', 'categorical'] model = BasicGANModel(epochs=10) model.fit_sequences(sequences, context_types, data_types) diff --git a/tests/integration/test_par.py b/tests/integration/test_par.py index d7c07b2..55ce0c0 100644 --- a/tests/integration/test_par.py +++ b/tests/integration/test_par.py @@ -14,143 +14,143 @@ def test_basic(self): """Test the basic usage of a ``PARModel``.""" sequences = [ { - "context": [], - "data": [ + 'context': [], + 'data': [ [0.0, np.nan, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], ], }, { - "context": [], - "data": [ + 'context': [], + 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, np.nan], ], }, ] context_types = [] - data_types = ["continuous", "continuous"] + data_types = ['continuous', 'continuous'] model = PARModel() model.fit_sequences(sequences, context_types, data_types) model.sample_sequence([]) # Assert - assert set(model.loss_values.columns) == {"Epoch", "Loss"} + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} assert len(model.loss_values) == 128 def test_conditional(self): """Test the ``PARModel`` with conditional sampling.""" sequences = [ { - "context": [0], - "data": [ + 'context': [0], + 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, np.nan, 0.0], ], }, { - "context": [1], - "data": [ + 'context': [1], + 'data': [ [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], [0.0, 0.1, np.nan, 0.3, 0.4, 0.5], ], }, ] - context_types = ["categorical"] - data_types = ["continuous", "continuous"] + context_types = ['categorical'] + data_types = ['continuous', 'continuous'] model = PARModel() model.fit_sequences(sequences, context_types, data_types) model.sample_sequence([0]) # Assert - assert set(model.loss_values.columns) == {"Epoch", "Loss"} + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} assert len(model.loss_values) == 128 def test_mixed(self): """Test the ``PARModel`` with mixed input data.""" sequences = [ { - "context": [0], - "data": [ + 'context': [0], + 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0, 1, 0, 1, 0, 1], ], }, { - "context": [1], - "data": [ + 'context': [1], + 'data': [ [0.5, np.nan, 0.3, 0.2, np.nan, 0.0], [0, 1, 0, 1, np.nan, 1], ], }, ] - context_types = ["categorical"] - data_types = ["continuous", "categorical"] + context_types = ['categorical'] + data_types = ['continuous', 'categorical'] model = PARModel() model.fit_sequences(sequences, context_types, data_types) model.sample_sequence([0]) # Assert - assert 
set(model.loss_values.columns) == {"Epoch", "Loss"} + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} assert len(model.loss_values) == 128 def test_count(self): """Test the PARModel with datatype ``count``.""" sequences = [ { - "context": [0.5], - "data": [ + 'context': [0.5], + 'data': [ [0, 5, 5, np.nan, 1, 1], [0, 1, 2, 1, 0, 1], ], }, { - "context": [1.1], - "data": [ + 'context': [1.1], + 'data': [ [1, 6, 6, 4, 2, 2], [0, 1, 0, 1, 0, 1], ], }, ] - context_types = ["continuous"] - data_types = ["count", "categorical"] + context_types = ['continuous'] + data_types = ['count', 'categorical'] model = PARModel() model.fit_sequences(sequences, context_types, data_types) model.sample_sequence([0]) # Assert - assert set(model.loss_values.columns) == {"Epoch", "Loss"} + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} assert len(model.loss_values) == 128 def test_variable_length(self): """Test ``PARModel`` with variable data length.""" sequences = [ { - "context": [0], - "data": [ + 'context': [0], + 'data': [ [0, 5, 5, 3, 1, 1, 0], [0, 1, 2, 1, 0, 1, 2], ], }, { - "context": [1], - "data": [ + 'context': [1], + 'data': [ [1, 6, 6, 4, 2, 2], [np.nan, 1, 0, 1, 0, np.nan], ], }, ] - context_types = ["count"] - data_types = ["count", "categorical"] + context_types = ['count'] + data_types = ['count', 'categorical'] model = PARModel() model.fit_sequences(sequences, context_types, data_types) model.sample_sequence([0]) # Assert - assert set(model.loss_values.columns) == {"Epoch", "Loss"} + assert set(model.loss_values.columns) == {'Epoch', 'Loss'} assert len(model.loss_values) == 128 diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 1c03bb4..9099d48 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -15,24 +15,24 @@ def test_get_minimum_versions(): "numpy>=1.23.3,<2;python_version>='3.10'", "pandas>=1.2.0,<2;python_version<'3.10'", "pandas>=1.3.0,<2;python_version>='3.10'", - "humanfriendly>=8.2,<11", - "pandas @ git+https://github.com/pandas-dev/pandas.git@master#egg=pandas", + 'humanfriendly>=8.2,<11', + 'pandas @ git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', ] # Run - minimum_versions_39 = _get_minimum_versions(dependencies, "3.9") - minimum_versions_310 = _get_minimum_versions(dependencies, "3.10") + minimum_versions_39 = _get_minimum_versions(dependencies, '3.9') + minimum_versions_310 = _get_minimum_versions(dependencies, '3.10') # Assert expected_versions_39 = [ - "numpy==1.20.0", - "pandas @ git+https://github.com/pandas-dev/pandas.git@master#egg=pandas", - "humanfriendly==8.2", + 'numpy==1.20.0', + 'pandas @ git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', + 'humanfriendly==8.2', ] expected_versions_310 = [ - "numpy==1.23.3", - "pandas @ git+https://github.com/pandas-dev/pandas.git@master#egg=pandas", - "humanfriendly==8.2", + 'numpy==1.23.3', + 'pandas @ git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', + 'humanfriendly==8.2', ] assert minimum_versions_39 == expected_versions_39 diff --git a/tests/unit/test_sequences.py b/tests/unit/test_sequences.py index af033b5..cca33cc 100644 --- a/tests/unit/test_sequences.py +++ b/tests/unit/test_sequences.py @@ -14,8 +14,8 @@ def test_segment_by_size(): """The sequence is cut in sequences of the indicated lenght.""" sequence = pd.DataFrame({ - "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], - "b": [9, 8, 7, 6, 5, 4, 3, 2, 1], + 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], + 'b': [9, 8, 7, 6, 5, 4, 3, 2, 1], }) out = segment_by_size(sequence, 3) @@ -25,22 +25,22 @@ def 
test_segment_by_size(): pd.testing.assert_frame_equal( pd.DataFrame({ - "a": [1, 2, 3], - "b": [9, 8, 7], + 'a': [1, 2, 3], + 'b': [9, 8, 7], }), out[0], ) pd.testing.assert_frame_equal( pd.DataFrame({ - "a": [4, 5, 6], - "b": [6, 5, 4], + 'a': [4, 5, 6], + 'b': [6, 5, 4], }), out[1], ) pd.testing.assert_frame_equal( pd.DataFrame({ - "a": [7, 8, 9], - "b": [3, 2, 1], + 'a': [7, 8, 9], + 'b': [3, 2, 1], }), out[2], ) @@ -49,14 +49,14 @@ def test_segment_by_size(): def test_segment_by_time(): """The sequence is cut in sequences of the indicated time lenght.""" sequence = pd.DataFrame({ - "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], - "b": [9, 8, 7, 6, 5, 4, 3, 2, 1], + 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], + 'b': [9, 8, 7, 6, 5, 4, 3, 2, 1], }) sequence_index = pd.date_range( - start="2001-01-01", periods=9, freq="1d" + start='2001-01-01', periods=9, freq='1d' ).to_series() - segment_size = pd.to_timedelta("3d") + segment_size = pd.to_timedelta('3d') out = segment_by_time(sequence, segment_size, sequence_index) assert isinstance(out, list) @@ -64,22 +64,22 @@ def test_segment_by_time(): pd.testing.assert_frame_equal( pd.DataFrame({ - "a": [1, 2, 3], - "b": [9, 8, 7], + 'a': [1, 2, 3], + 'b': [9, 8, 7], }), out[0], ) pd.testing.assert_frame_equal( pd.DataFrame({ - "a": [4, 5, 6], - "b": [6, 5, 4], + 'a': [4, 5, 6], + 'b': [6, 5, 4], }), out[1], ) pd.testing.assert_frame_equal( pd.DataFrame({ - "a": [7, 8, 9], - "b": [3, 2, 1], + 'a': [7, 8, 9], + 'b': [3, 2, 1], }), out[2], ) @@ -88,8 +88,8 @@ def test_segment_by_time(): def test_segment_sequence(): """If no sequence index is given, segments are not ordered.""" sequence = pd.DataFrame({ - "a": [1, 2, 3, 7, 8, 9, 4, 5, 6], - "b": [9, 8, 7, 3, 2, 1, 6, 5, 4], + 'a': [1, 2, 3, 7, 8, 9, 4, 5, 6], + 'b': [9, 8, 7, 3, 2, 1, 6, 5, 4], }) out = segment_sequence(sequence, 3, None) @@ -99,22 +99,22 @@ def test_segment_sequence(): pd.testing.assert_frame_equal( pd.DataFrame({ - "a": [1, 2, 3], - "b": [9, 8, 7], + 'a': [1, 2, 3], + 'b': [9, 8, 7], }), out[0], ) pd.testing.assert_frame_equal( pd.DataFrame({ - "a": [7, 8, 9], - "b": [3, 2, 1], + 'a': [7, 8, 9], + 'b': [3, 2, 1], }), out[1], ) pd.testing.assert_frame_equal( pd.DataFrame({ - "a": [4, 5, 6], - "b": [6, 5, 4], + 'a': [4, 5, 6], + 'b': [6, 5, 4], }), out[2], ) @@ -123,30 +123,30 @@ def test_segment_sequence(): def test_segment_sequence_sequence_index(): """If a sequence index is given, segments are ordered and index is dropped.""" sequence = pd.DataFrame({ - "a": [1, 2, 3, 7, 8, 9, 4, 5, 6], - "b": [9, 8, 7, 3, 2, 1, 6, 5, 4], + 'a': [1, 2, 3, 7, 8, 9, 4, 5, 6], + 'b': [9, 8, 7, 3, 2, 1, 6, 5, 4], }) - out = segment_sequence(sequence, 3, "a") + out = segment_sequence(sequence, 3, 'a') assert isinstance(out, list) assert len(out) == 3 pd.testing.assert_frame_equal( pd.DataFrame({ - "b": [9, 8, 7], + 'b': [9, 8, 7], }), out[0], ) pd.testing.assert_frame_equal( pd.DataFrame({ - "b": [6, 5, 4], + 'b': [6, 5, 4], }), out[1], ) pd.testing.assert_frame_equal( pd.DataFrame({ - "b": [3, 2, 1], + 'b': [3, 2, 1], }), out[2], ) @@ -158,20 +158,20 @@ def test__assemble_sequences_no_entity_no_context(): context_columns = [] data = pd.DataFrame({ - "a": [1, 2, 3, 4, 5, 6], - "b": [9, 8, 7, 6, 5, 4], + 'a': [1, 2, 3, 4, 5, 6], + 'b': [9, 8, 7, 6, 5, 4], }) out = assemble_sequences(data, entity_columns, context_columns, 3, None) assert isinstance(out, list) assert out == [ { - "context": [], - "data": [[1, 2, 3], [9, 8, 7]], + 'context': [], + 'data': [[1, 2, 3], [9, 8, 7]], }, { - "context": [], - "data": [[4, 5, 6], [6, 5, 
4]], + 'context': [], + 'data': [[4, 5, 6], [6, 5, 4]], }, ] @@ -179,87 +179,87 @@ def test__assemble_sequences_no_entity_no_context(): def test__assemble_sequences_no_entity_and_context(): """If no entity columns, segment the given data adding context.""" entity_columns = [] - context_columns = ["a"] + context_columns = ['a'] data = pd.DataFrame({ - "a": [1, 1, 1, 2, 2, 2], - "b": [1, 2, 3, 4, 5, 6], - "c": [9, 8, 7, 6, 5, 4], + 'a': [1, 1, 1, 2, 2, 2], + 'b': [1, 2, 3, 4, 5, 6], + 'c': [9, 8, 7, 6, 5, 4], }) out = assemble_sequences(data, entity_columns, context_columns, 3, None) assert isinstance(out, list) assert out == [ { - "context": [1], - "data": [[1, 2, 3], [9, 8, 7]], + 'context': [1], + 'data': [[1, 2, 3], [9, 8, 7]], }, { - "context": [2], - "data": [[4, 5, 6], [6, 5, 4]], + 'context': [2], + 'data': [[4, 5, 6], [6, 5, 4]], }, ] def test__assemble_sequences_entity_no_segment(): """If entity columns , group by .""" - entity_columns = ["a"] + entity_columns = ['a'] context_columns = [] data = pd.DataFrame({ - "a": [1, 1, 1, 2, 2, 2], - "b": [1, 2, 3, 4, 5, 6], - "c": [9, 8, 7, 6, 5, 4], + 'a': [1, 1, 1, 2, 2, 2], + 'b': [1, 2, 3, 4, 5, 6], + 'c': [9, 8, 7, 6, 5, 4], }) out = assemble_sequences(data, entity_columns, context_columns, None, None) assert isinstance(out, list) assert out == [ { - "context": [], - "data": [[1, 2, 3], [9, 8, 7]], + 'context': [], + 'data': [[1, 2, 3], [9, 8, 7]], }, { - "context": [], - "data": [[4, 5, 6], [6, 5, 4]], + 'context': [], + 'data': [[4, 5, 6], [6, 5, 4]], }, ] def test__assemble_sequences_entity_and_segment_size(): """If entity columns and segment_size, group by and then segment.""" - entity_columns = ["a"] + entity_columns = ['a'] context_columns = [] data = pd.DataFrame({ - "a": [1, 1, 1, 1, 1, 1], - "b": [1, 2, 3, 4, 5, 6], - "c": [9, 8, 7, 6, 5, 4], + 'a': [1, 1, 1, 1, 1, 1], + 'b': [1, 2, 3, 4, 5, 6], + 'c': [9, 8, 7, 6, 5, 4], }) out = assemble_sequences(data, entity_columns, context_columns, 3, None) assert isinstance(out, list) assert out == [ { - "context": [], - "data": [[1, 2, 3], [9, 8, 7]], + 'context': [], + 'data': [[1, 2, 3], [9, 8, 7]], }, { - "context": [], - "data": [[4, 5, 6], [6, 5, 4]], + 'context': [], + 'data': [[4, 5, 6], [6, 5, 4]], }, ] def test__assemble_sequences_context_error(): """If context is not constant within an entity, raise an error.""" - entity_columns = ["a"] - context_columns = ["b"] + entity_columns = ['a'] + context_columns = ['b'] data = pd.DataFrame({ - "a": [1, 1, 1, 1, 2, 2, 2, 2], - "b": [1, 1, 2, 2, 3, 3, 4, 4], - "c": [9, 8, 7, 6, 5, 4, 3, 2], + 'a': [1, 1, 1, 1, 2, 2, 2, 2], + 'b': [1, 1, 2, 2, 3, 3, 4, 4], + 'c': [9, 8, 7, 6, 5, 4, 3, 2], }) with pytest.raises(ValueError): assemble_sequences(data, entity_columns, context_columns, 2, None) @@ -267,31 +267,31 @@ def test__assemble_sequences_context_error(): def test__assemble_sequences_entity_and_time_segment_size(): """If entity columns and segment_size, group by and then segment.""" - entity_columns = ["a"] + entity_columns = ['a'] context_columns = [] data = pd.DataFrame({ - "a": [1, 1, 1, 1], - "b": [1, 2, 3, 4], - "c": [9, 8, 7, 6], - "time": pd.date_range(start="2001-01-01", periods=4, freq="1d"), + 'a': [1, 1, 1, 1], + 'b': [1, 2, 3, 4], + 'c': [9, 8, 7, 6], + 'time': pd.date_range(start='2001-01-01', periods=4, freq='1d'), }) out = assemble_sequences( - data, entity_columns, context_columns, pd.to_timedelta("2d"), "time" + data, entity_columns, context_columns, pd.to_timedelta('2d'), 'time' ) assert isinstance(out, list) assert out 
== [ { - "context": [], - "data": [ + 'context': [], + 'data': [ [1, 2], [9, 8], ], }, { - "context": [], - "data": [ + 'context': [], + 'data': [ [3, 4], [7, 6], ],