From 1bd17dade0721a791d4f751495311fe90a9c3b73 Mon Sep 17 00:00:00 2001
From: eschibli <46228485+eschibli@users.noreply.github.com>
Date: Sat, 7 Sep 2024 17:18:02 -0700
Subject: [PATCH] Implemented project_first

---
 darts/models/forecasting/tsmixer_model.py | 48 ++++++++++++-----------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/darts/models/forecasting/tsmixer_model.py b/darts/models/forecasting/tsmixer_model.py
index 8e82fe0df0..887f5e79a0 100644
--- a/darts/models/forecasting/tsmixer_model.py
+++ b/darts/models/forecasting/tsmixer_model.py
@@ -383,7 +383,7 @@ def __init__(
         if activation not in ACTIVATIONS:
             raise_log(
                 ValueError(
-                    f"Invalid `activation={activation}`. Must be on of {ACTIVATIONS}."
+                    f"Invalid `activation={activation}`. Must be one of {ACTIVATIONS}."
                 ),
                 logger=logger,
             )
@@ -393,7 +393,7 @@ def __init__(
         if norm_type not in NORMS:
             raise_log(
                 ValueError(
-                    f"Invalid `norm_type={norm_type}`. Must be on of {NORMS}."
+                    f"Invalid `norm_type={norm_type}`. Must be one of {NORMS}."
                 ),
                 logger=logger,
             )
@@ -497,7 +497,7 @@ def forward(
         # B: batch size
         # L: input chunk length
         # T: output chunk length
-        # LT: latent time dimension (T if project_first_layer, L otherwise)
+        # SL: residual block time dimension (T if project_first_layer, L otherwise)
         # C: target components
         # P: past cov features
         # F: future cov features
@@ -509,53 +509,57 @@ def forward(
         # `x`: (B, L, H), `x_future`: (B, T, F), `x_static`: (B, C or 1, S)
         x, x_future, x_static = x_in

-        # (B, L, H) -> (B, LT, H)
+        # If project_first_layer, this is a decoder-style model with residual blocks in the output time dimension
+        # (B, L, H) -> (B, SL, H)
         if self.project_first_layer:
             # swap feature and time dimensions (B, L, H) -> (B, H, L)
             x = _time_to_feature(x)
-            # linear transformations to LT (T in this case)
-            # (B, H, L) -> (B, H, LT)
+            # linear transformations to SL (T in this case)
+            # (B, H, L) -> (B, H, SL)
             x = self.fc_hist(x)
             # Transpose back
             # (B, H, T) -> (B, T, H)
             x = _time_to_feature(x)
+
+        # Otherwise, this is an encoder-style model with residual blocks in the input time dimension.
+        # In the original paper this case was not implemented for future covariates,
+        # but rather than ignoring them or raising an error we remap them to the input time dimension.
+        # Suboptimal, but it may be useful in some cases.
         elif self.future_cov_dim:
             # swap feature and time dimensions (B, L, F) -> (B, F, L)
             x_future = _time_to_feature(x_future)
-            # linear transformations to LT (L in this case)
-            # (B, F, T) -> (B, F, L)
+            # linear transformations to SL (L in this case)
+            # (B, F, T) -> (B, F, SL)
             x_future = self.fc_future(x_future)
             # Transpose back (B, L, F) -> (B, F, L)
             x_future = _time_to_feature(x_future)

-        # feature mixing for historical features (B, LT, H) -> (B, LT, H_S)
+        # feature mixing for historical features (B, SL, H) -> (B, SL, H_S)
         x = self.feature_mixing_hist(x)
         if self.future_cov_dim:
-            # feature mixing for future features (B, LT, F) -> (B, LT, H_S)
+            # feature mixing for future features (B, SL, F) -> (B, SL, H_S)
             x_future = self.feature_mixing_future(x_future)
-            # (B, LT, H_S) + (B, LT, H_S) -> (B, T, 2*H_S)
+            # (B, SL, H_S) + (B, SL, H_S) -> (B, SL, 2*H_S)
             x = torch.cat([x, x_future], dim=-1)

         if self.static_cov_dim:
             # (B, C, S) -> (B, 1, C * S)
             x_static = x_static.reshape(x_static.shape[0], 1, -1)
-            # repeat to match time dim: (B, 1, C * S) -> (B, LT, C * S)
-            x_static = x_static.repeat(1,
-                (self.output_chunk_length if self.project_first_layer else self.input_chunk_length), 1)
+            # repeat to match time dim: (B, 1, C * S) -> (B, SL, C * S)
+            x_static = x_static.repeat(1, self.sequence_length, 1)

         for mixing_layer in self.conditional_mixer:
-            # conditional mixer layers with static covariates (B, LT, 2 * H_S), (B, LT, C * S) -> (B, LT, H_S)
+            # conditional mixer layers with static covariates (B, SL, 2 * H_S), (B, SL, C * S) -> (B, SL, H_S)
             x = mixing_layer(x, x_static=x_static)

-        # If not projecting first, project to the output time dimension
-        # In the original paper there is no fc_out layer, but we believe it is better to
-        # remap the time dimension before the feature dimension as the output feature dimension
-        # is likely to be 1 (or a small number) and the time dimension is likely to be larger.
-        # So we don't want to compress the feature dimension before remapping the time dimension.
+        # If the residual blocks ran in the input time dimension, we still need to project to the output time dimension.
+        # The original paper did not have an fc_out layer (as hidden_size == output_dim), so we needed to decide where to put it.
+        # We apply the time projection before fc_out because, while both operations may be very compressive,
+        # we felt it more likely that output_dim << hidden_size than output_chunk_length << input_chunk_length.
         if not self.project_first_layer:
-            # (B, LT, H_S) -> (B, H_S, LT)
+            # (B, SL, H_S) -> (B, H_S, SL)
             x = _time_to_feature(x)
-            # (B, H_S, LT) -> (B, H_S, T)
+            # (B, H_S, SL) -> (B, H_S, T)
             x = self.fc_hist(x)
             # (B, H_S, T) -> (B, T, H_S)
             x = _time_to_feature(x)
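
Note for reviewers: the sketch below is not part of the patch and not the darts implementation; it is a minimal, self-contained illustration of what `project_first_layer` changes. The names `_time_to_feature`, `fc_hist`, `project_first_layer`, and `sequence_length` mirror the patch, while `TimeProjectionSketch` and the `nn.Identity()` stand-in for the feature-mixing/conditional-mixer stack are hypothetical.

```python
import torch
import torch.nn as nn


def _time_to_feature(x: torch.Tensor) -> torch.Tensor:
    # swap the time and feature dimensions: (B, T1, D) -> (B, D, T1)
    return x.transpose(1, 2)


class TimeProjectionSketch(nn.Module):
    """Toy illustration of project-first vs. project-last (mixer blocks stubbed out)."""

    def __init__(self, input_chunk_length: int, output_chunk_length: int, project_first_layer: bool):
        super().__init__()
        self.project_first_layer = project_first_layer
        # fc_hist maps the time dimension L -> T (applied to the last axis after transposing)
        self.fc_hist = nn.Linear(input_chunk_length, output_chunk_length)
        # SL: the time dimension the residual/mixer blocks operate in
        self.sequence_length = output_chunk_length if project_first_layer else input_chunk_length
        # stand-in for the mixer stack; keeps the (B, SL, features) shape unchanged
        self.mixer = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, L, features)
        if self.project_first_layer:
            # decoder-style: project L -> T before the mixer blocks
            x = _time_to_feature(self.fc_hist(_time_to_feature(x)))  # (B, T, features)
        x = self.mixer(x)  # (B, SL, features)
        if not self.project_first_layer:
            # encoder-style: mixer blocks run in L, project L -> T at the end
            x = _time_to_feature(self.fc_hist(_time_to_feature(x)))  # (B, T, features)
        return x


if __name__ == "__main__":
    batch, L, T, C = 4, 24, 6, 3
    x = torch.randn(batch, L, C)
    for project_first in (True, False):
        model = TimeProjectionSketch(L, T, project_first)
        print(project_first, model(x).shape)  # torch.Size([4, 6, 3]) in both cases
        # static covariates are repeated to the same mixer time dimension SL,
        # mirroring `x_static.repeat(1, self.sequence_length, 1)` in the patch
        x_static = torch.randn(batch, 1, 5).repeat(1, model.sequence_length, 1)
        print(x_static.shape)  # (4, 6, 5) when projecting first, (4, 24, 5) otherwise
```

In the real model the identity stub corresponds to the feature-mixing and conditional mixer layers, and `fc_hist` is applied exactly once, either up front or at the end, which is the choice the patch's `project_first_layer` flag exposes.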