-
Notifications
You must be signed in to change notification settings - Fork 12
/
utils.py
135 lines (88 loc) · 4 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import numpy as np
from typing import Tuple
import pandas as pd
def get_xgboost_x_y(
    indices: list,
    data: np.ndarray,
    target_sequence_length: int,
    input_seq_len: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Slice a time series into model inputs (x) and forecasting targets (y).

    For every (start, end) pair in `indices`, the window data[start:end] is
    taken. Its first `input_seq_len` points become one row of x and the
    following `target_sequence_length` points become one row of y.

    Args:
        indices: List of (start_idx, end_idx) positions at which data should
                 be sliced. May be empty.
        data: A univariate time series. Each slice is flattened into a single
              row, so an array of shape (n,) or (n, 1) can be used.
        target_sequence_length: The forecasting horizon, m
        input_seq_len: The length of the model input, n

    Output:
        all_x: np.ndarray of shape (number of instances, input seq len)
        all_y: np.ndarray of shape (number of instances, target seq len)
        Both have zero rows when `indices` is empty.

    Raises:
        AssertionError: if a window holds fewer than `input_seq_len` points.
        ValueError: raised by np.concatenate if the rows do not all have the
                    same length (e.g. a window too short to hold a full
                    target sequence).
    """
    print("Preparing data..")

    # Collect one (1, length) row per window and concatenate once at the end;
    # concatenating inside the loop would re-copy all earlier rows every time.
    x_rows = []
    y_rows = []
    for idx in indices:
        # Slice data into instance of length input length + target length
        data_instance = data[idx[0]:idx[1]]

        x = data_instance[0:input_seq_len]
        assert len(x) == input_seq_len, (
            "window is shorter than input_seq_len"
        )

        y = data_instance[input_seq_len:input_seq_len + target_sequence_length]

        x_rows.append(x.reshape(1, -1))
        y_rows.append(y.reshape(1, -1))

    if x_rows:
        all_x = np.concatenate(x_rows, axis=0)
        all_y = np.concatenate(y_rows, axis=0)
    else:
        # No windows requested: return empty arrays with the documented
        # number of columns instead of failing on unbound variables.
        dtype = getattr(data, "dtype", float)
        all_x = np.empty((0, input_seq_len), dtype=dtype)
        all_y = np.empty((0, target_sequence_length), dtype=dtype)

    print("Finished preparing data!")

    return all_x, all_y
def load_data(path="Elspotprices.csv"):
    """
    Load the spot price CSV and return the EUR price indexed by timestamp.

    The file is read as a semicolon-delimited CSV. The "SpotPriceEUR" column
    is expected to use a decimal comma, which is converted to a decimal
    point before the column is made numeric.

    Args:
        path: Location of the CSV file. Defaults to "Elspotprices.csv" in the
              current working directory.

    Return:
        spotprices: pd.DataFrame with the single column "SpotPriceEUR",
                    indexed by the parsed "HourDK" timestamps and ordered by
                    ascending timestamp.
    """
    target_variable = "SpotPriceEUR"
    timestamp_col = "HourDK"

    # Read the price column as text so the decimal-comma conversion below
    # also works when no value in the file happens to contain a comma.
    spotprices = pd.read_csv(
        path, delimiter=";", dtype={target_variable: str}
    )

    # Convert separator from "," to "." and make numeric
    spotprices[target_variable] = pd.to_numeric(
        spotprices[target_variable].str.replace(",", ".", regex=False)
    )

    # Convert HourDK to proper date time and make it the index
    spotprices[timestamp_col] = pd.to_datetime(spotprices[timestamp_col])

    # Discard all cols except the EUR price
    spotprices = spotprices.set_index(timestamp_col)[[target_variable]]

    # Order by ascending time stamp; `by` refers to the index level name
    # here, since the HourDK column itself was moved into the index.
    return spotprices.sort_values(by=timestamp_col, ascending=True)
def get_indices_entire_sequence(
    data: pd.DataFrame,
    window_size: int,
    step_size: int
) -> list:
    """
    Produce all the start and end index positions that are needed to produce
    the sub-sequences.

    Returns a list of tuples. Each tuple is (start_idx, end_idx) of a sub-
    sequence, where end_idx is exclusive, i.e. the tuple is meant to be used
    as data[start_idx:end_idx]. These tuples should be used to slice the
    dataset into sub-sequences. These sub-sequences should then be passed
    into a function that slices them into input and target sequences.

    Args:
        data (pd.DataFrame): Partitioned data set, e.g. training data. Only
                             its length is used.
        window_size (int): The desired length of each sub-sequence. Should be
                           (input_sequence_length + target_sequence_length)
                           E.g. if you want the model to consider the past 100
                           time steps in order to predict the future 50
                           time steps, window_size = 100+50 = 150
        step_size (int): Size of each step as the data sequence is traversed
                         by the moving window. Must be at least 1.
                         If 1, the first sub-sequence will be [0:window_size],
                         and the next will be [1:window_size+1].

    Return:
        indices: a list of tuples. Empty if data is shorter than window_size.

    Raises:
        ValueError: if step_size is smaller than 1 (the window would never
                    advance).
    """
    if step_size < 1:
        raise ValueError("step_size must be a positive integer")

    # end_idx is an exclusive slice bound, so the last window may end exactly
    # at len(data); the last valid start is therefore len(data) - window_size.
    last_start = len(data) - window_size

    return [
        (start, start + window_size)
        for start in range(0, last_start + 1, step_size)
    ]