# -*- coding: utf-8 -*-
"""
Created on Wed May 3 10:54:49 2023
@author: felix markwordt
"""
# TODO: clean up imports
from datetime import timedelta
import datetime
import json
import logging
import os
from dateutil import parser
import subprocess
from dotenv import load_dotenv
import pycaret
#from pycaret.time_series import *
from pycaret.regression import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import missingno as msno
import sys
import pytz
import requests
import urllib
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder
# new imports nn
import tensorflow
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, GRU, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.backend import floatx
#from transformers import TFAutoModel, AutoTokenizer
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from scipy.interpolate import CubicSpline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
#import kerastuner as kt
from keras_tuner import Hyperband, HyperParameters
import time
from tensorflow.keras.callbacks import EarlyStopping
# local
import main
# URL of the API used to retrieve devices
ApiUrl = ""  # e.g. "http://wazigate/" in production mode
Token = None
# Initialize an empty dictionary to store the current config
Current_config = {}
# Current timezone
Timezone = ''
# Extracted variables from Current_config
DeviceAndSensorIdsMoisture = []
DeviceAndSensorIdsTemp = []
DeviceAndSensorIdsFlow = []
# Rolling mean window
RollingMeanWindowData = 15
RollingMeanWindowGrouped = 5
# Sampling rate of the training dataset in minutes
Sample_rate = 60
# Forecast horizon TODO: add to config or adjust automatically
Forcast_horizon = 5  # days
# Created features that are dropped later
#To_be_dropped = ['Timestamp', 'minute', 'grouped_soil', 'grouped_soil_temp', 'gradient']
To_be_dropped = ['minute', 'Timestamp','gradient','grouped_soil','grouped_soil_temp','Winddirection','month','day_of_year','date']
# Mapping of estimator class names to PyCaret model IDs TODO: check for correctness
Model_mapping = {
'LinearRegression': 'lr',
'Lasso': 'lasso',
'Ridge': 'ridge',
'ElasticNet': 'en',
'Lars': 'lar',
'LassoLars': 'llar',
'OrthogonalMatchingPursuit': 'omp',
'BayesianRidge': 'br',
'ARDRegression': 'ard',
'PassiveAggressiveRegressor': 'par',
'RANSACRegressor': 'ransac',
'TheilSenRegressor': 'tr',
'HuberRegressor': 'huber',
'KernelRidge': 'kr',
'SVR': 'svm',
'KNeighborsRegressor': 'knn',
'DecisionTreeRegressor': 'dt',
'RandomForestRegressor': 'rf',
'ExtraTreesRegressor': 'et',
'AdaBoostRegressor': 'ada',
'GradientBoostingRegressor': 'gbr',
'MLPRegressor': 'mlp',
'XGBRegressor': 'xgboost',
'LGBMRegressor': 'lightgbm',
'CatBoostRegressor': 'catboost',
'DummyRegressor': 'dummy'
}
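# Illustrative use of Model_mapping (a sketch, not executed): after PyCaret's
# compare_models() returns a fitted estimator, the estimator's class name can be mapped
# back to the PyCaret model ID, e.g. to re-create or tune the same model type later:
#   best = compare_models()                        # e.g. a RandomForestRegressor instance
#   model_id = Model_mapping[type(best).__name__]  # -> 'rf'
#   tuned = tune_model(create_model(model_id))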
# Global dataframes for sensor data, weather data and predictions
Data = pd.DataFrame()
Data_w = pd.DataFrame()
Predictions = pd.DataFrame()
Threshold_timestamp = ""
Use_pycaret = True
Tuned_best = None
Best_exp = None
# Load data from a CSV file; enabled automatically if this file exists in the root directory
CSVFile = "binned_removed_new_for_app_ww.csv"
LoadDataFromCSV = False  # DEBUG
# Load former irrigations from file "data/irrigations.json" DEBUG
Load_irrigations_from_file = False
# Callback to limit the time spent on training
class TimeLimitCallback(Callback):
def __init__(self, max_time_seconds):
super(TimeLimitCallback, self).__init__()
self.max_time_seconds = max_time_seconds
self.start_time = time.time()
def on_epoch_end(self, epoch, logs=None): # Changed to on_epoch_end
elapsed_time = time.time() - self.start_time
print(f"Epoch {epoch}: Elapsed time {elapsed_time:.2f} seconds")
if elapsed_time > self.max_time_seconds:
self.model.stop_training = True
print(f"Training stopped after {self.max_time_seconds} seconds")
def read_config():
global DeviceAndSensorIdsMoisture
global DeviceAndSensorIdsTemp
global DeviceAndSensorIdsFlow
global LoadDataFromCSV
# Specify the path to the JSON file you want to read
json_file_path = 'config/current_config.json'
try:
with open(CSVFile, "r") as file:
# Perform operations on the file
data = pd.read_csv(file, header=0)
DeviceAndSensorIdsMoisture = []
DeviceAndSensorIdsTemp = []
DeviceAndSensorIdsFlow = []
# create array with sensors strings
for col in data.columns:
if col.startswith("tension"):
DeviceAndSensorIdsMoisture.append(col)
elif col.startswith("soil_temp"):
DeviceAndSensorIdsTemp.append(col)
LoadDataFromCSV = True
# This is not implemented: DeviceAndSensorIdsFlow = config["DeviceAndSensorIdsFlow"]
except FileNotFoundError:
# Read the JSON data from the file
with open(json_file_path, 'r') as json_file:
config = json.load(json_file)
DeviceAndSensorIdsMoisture = config["DeviceAndSensorIdsMoisture"]
DeviceAndSensorIdsTemp = config["DeviceAndSensorIdsTemp"]
if "DeviceAndSensorIdsFlow" in config:
DeviceAndSensorIdsFlow = config["DeviceAndSensorIdsFlow"]
except Exception as e:
print("An error occurred: No devices are set in settings, there is also no local config file.", e)
return config
# not ready
def get_token():
global Token
# Generate token to fetch data from another gateway
if ApiUrl.startswith('http://wazigate/'):
print('There is no token needed, fetch data from local gateway.')
# Get token, important for non localhost devices
else:
# curl -X POST "http://192.168.189.2/auth/token" -H "accept: application/json" -d "{\"username\": \"admin\", \"password\": \"loragateway\"}"
token_url = ApiUrl + "auth/token"
# Parse the URL
parsed_token_url = urllib.parse.urlsplit(token_url)
# Encode the query parameters
encoded_query = urllib.parse.quote(parsed_token_url.query, safe='=&')
# Reconstruct the URL with the encoded query
encoded_url = urllib.parse.urlunsplit((parsed_token_url.scheme,
parsed_token_url.netloc,
parsed_token_url.path,
encoded_query,
parsed_token_url.fragment))
# Define headers for the POST request
headers = {
'accept': 'application/json',
#'Content-Type': 'application/json', # Make sure to set Content-Type
}
# Define data for the GET request
data = {
'username': 'admin',
'password': 'loragateway',
}
try:
# Send a GET request to the API
response = requests.post(encoded_url, headers=headers, json=data)
# Check if the request was successful (status code 200)
if response.status_code == 200:
# The response content contains the data from the API
Token = response.json()
else:
print("Request failed with status code:", response.status_code)
except requests.exceptions.RequestException as e:
# Handle request exceptions (e.g., connection errors)
print("Request error:", e)
return "", e #TODO: intruduce error handling!
def load_latest_data_api(sensor_name, type):#, token)
global ApiUrl
# Token
load_dotenv()
ApiUrl = os.getenv('API_URL')
if ApiUrl.startswith('http://wazigate/'):
print('There is no token needed, fetch data from local gateway.')
    elif Token != None:
        print('Token already present, no need to request a new one.')
# Get token, important for non localhost devices
else:
get_token()
# Create URL for API call e.g.:curl -X GET "http://192.168.189.15/devices/669780aa68f319066a12444a/sensors/6697875968f319066a12444d/value" -H "accept: application/json"
api_url = ApiUrl + "devices/" + sensor_name.split('/')[0] + "/" + type + "/" + sensor_name.split('/')[1] + "/value"
# Parse the URL
parsed_url = urllib.parse.urlsplit(api_url)
# Encode the query parameters
encoded_query = urllib.parse.quote(parsed_url.query, safe='=&')
# Reconstruct the URL with the encoded query
encoded_url = urllib.parse.urlunsplit((parsed_url.scheme,
parsed_url.netloc,
parsed_url.path,
encoded_query,
parsed_url.fragment))
# Define headers for the GET request
headers = {
'Authorization': f'Bearer {Token}',
}
try:
# Send a GET request to the API
response = requests.get(encoded_url, headers=headers)
# Check if the request was successful (status code 200)
if response.status_code == 200:
# The response content contains the data from the API
response_ok = response.json()
else:
print("Request failed with status code:", response.status_code)
print("Response content:", response.text)
response_ok = None
except requests.exceptions.RequestException as e:
# Handle request exceptions (e.g., connection errors)
print("Request error:", e)
return "Error in 'load_latest_data_api()'! ", e #TODO: intruduce error handling!
return response_ok
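# Illustrative call of load_latest_data_api (a sketch, not executed; the IDs below are
# placeholders): 'sensor_name' is "<deviceID>/<sensorID>" and 'type' is e.g. "sensors"
# or "actuators":
#   latest = load_latest_data_api("<deviceID>/<sensorID>", "sensors")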
# Load from CSV file
def load_data(path):
    # Create a dataframe from the given CSV file path
    data = pd.read_csv(path, header=0)
print(data.head())
return data
# Load from wazigate API
def load_data_api(sensor_name, type, from_timestamp):#, token)
global ApiUrl
global Timezone
global Current_config
# Load config to obtain GPS coordinates
Current_config = read_config()
# Token
load_dotenv()
ApiUrl = os.getenv('API_URL')
# Convert timestamp
if not isinstance(from_timestamp, str):
from_timestamp = from_timestamp.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
    # Get timezone if no information is available
    if Timezone == '':
        Timezone = get_timezone(Current_config["Gps_info"]["lattitude"], Current_config["Gps_info"]["longitude"])
    # Correct the timestamp for the timezone => TODO: there is an ERROR here, the Timezone var is not available on the first start
from_timestamp = (datetime.datetime.strptime(from_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ") - timedelta(hours=get_timezone_offset(Timezone))).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
if ApiUrl.startswith('http://wazigate/'):
print('There is no token needed, fetch data from local gateway.')
    elif Token != None:
        print('Token already present, no need to request a new one.')
# Get token, important for non localhost devices
else:
get_token()
# Create URL for API call
api_url = ApiUrl + "devices/" + sensor_name.split('/')[0] + "/" + type + "/" + sensor_name.split('/')[1] + "/values" + "?from=" + from_timestamp
# Parse the URL
parsed_url = urllib.parse.urlsplit(api_url)
# Encode the query parameters
encoded_query = urllib.parse.quote(parsed_url.query, safe='=&')
# Reconstruct the URL with the encoded query
encoded_url = urllib.parse.urlunsplit((parsed_url.scheme,
parsed_url.netloc,
parsed_url.path,
encoded_query,
parsed_url.fragment))
# Define headers for the GET request
headers = {
'Authorization': f'Bearer {Token}',
}
try:
# Send a GET request to the API
response = requests.get(encoded_url, headers=headers)
# Check if the request was successful (status code 200)
if response.status_code == 200:
# The response content contains the data from the API
response_ok = response.json()
        else:
            print("Request failed with status code:", response.status_code)
            response_ok = None
except requests.exceptions.RequestException as e:
# Handle request exceptions (e.g., connection errors)
print("Request error:", e)
return "", e #TODO: intruduce error handling!
return response_ok
# Resample and interpolate
def check_gaps(data):
mask = data.isna().any()
print(mask)
if (mask.any()):
data.dropna(inplace=True)
data = resample(data)
return data.interpolate(method='linear')
else:
return data
# Impute missing data & apply rolling mean (imputation & cleaning)
def fill_gaps(data):
# Show if there are any missing values inside the data
print("This is before: \n",data.isna().any())
data = data.interpolate(method='linear')
# Show if there are any missing values inside the data
print("This is afterwards: \n",data.isna().any())
return data
# Remove larger gaps in data as imputing them is not accurate
def remove_large_gaps(df, col, gap_threshold = 6):
# Detect NaN gaps in the specified column
is_nan = df[col].isna()
# Group consecutive NaN and non-NaN values using cumsum on not-NaN
group = (~is_nan).cumsum()
# Count consecutive NaNs within each group
consecutive_gaps = is_nan.groupby(group).cumsum()
# Filter out rows where consecutive NaN values exceed the threshold
df_filtered = df[consecutive_gaps <= gap_threshold].copy()
return df_filtered
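# Illustrative behaviour of remove_large_gaps (a sketch, not executed): NaNs beyond the
# first 'gap_threshold' of a consecutive run are dropped, shorter gaps are kept for interpolation:
#   df = pd.DataFrame({'tension': [1.0, None, None, None, 4.0]})
#   remove_large_gaps(df, 'tension', gap_threshold=2)
#   # -> the third consecutive NaN (row position 3) exceeds the threshold and is removed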
# Get offset to UTC time
def get_timezone_offset(timezone_str):
timezone = pytz.timezone(timezone_str)
current_time = datetime.datetime.now(tz=timezone)
utc_offset = current_time.utcoffset().total_seconds() / 3600.0
return utc_offset
# Get timezone string
def get_timezone(latitude_str, longitude_str):
# Convert to floats
latitude = float(latitude_str)
longitude = float(longitude_str)
geolocator = Nominatim(user_agent="timezone_finder")
location = geolocator.reverse((latitude, longitude), language="en")
# Extract timezone using timezonefinder
timezone_finder = TimezoneFinder()
timezone_str = timezone_finder.timezone_at(lng=longitude, lat=latitude)
return timezone_str
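# Illustrative usage (a sketch, not executed; the reverse geocoding needs internet access):
#   tz_name = get_timezone("52.52", "13.405")   # -> "Europe/Berlin"
#   offset_h = get_timezone_offset(tz_name)     # -> 1.0 or 2.0 depending on DST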
# Get historical values from open-meteo TODO: include timezone service: https://stackoverflow.com/questions/16086962/how-to-get-a-time-zone-from-a-location-using-latitude-and-longitude-coordinates
def get_historical_weather_api(data):
global Data_w
# TODO: remove that one day overlapping data
if Data_w.empty:
first_day_minus_one_str = (data.index[0] - timedelta(days = 1)).strftime("%Y-%m-%d")
else:
first_day_minus_one_str = (Data_w.index[-1] - timedelta(days = 1)).strftime("%Y-%m-%d")
#data_w_former_end = Data_w.index[-1]
# need to add one day, overlapping have to be cut off later => useless because data is not available to fetch via api
last_date = data.index[-1]
last_date_str = last_date.strftime("%Y-%m-%d")
lat = Current_config["Gps_info"]["lattitude"]
long = Current_config["Gps_info"]["longitude"]
url = (
f'https://archive-api.open-meteo.com/v1/era5'
f'?latitude={lat}'
f'&longitude={long}'
f'&start_date={first_day_minus_one_str}'
f'&end_date={last_date_str}'
f'&hourly=temperature_2m,relativehumidity_2m,rain,cloudcover,shortwave_radiation,windspeed_10m,winddirection_10m,soil_temperature_7_to_28cm,soil_moisture_0_to_7cm,et0_fao_evapotranspiration'
f'&timezone={Timezone}'
)
dct = subprocess.check_output(['curl', url]).decode()
dct = json.loads(dct)
# Also convert it to a pandas dataframe
data_w_fetched = (pd.DataFrame([dct['hourly']['temperature_2m'],
dct['hourly']['relativehumidity_2m'],
dct['hourly']['rain'],
dct['hourly']['cloudcover'],
dct['hourly']['shortwave_radiation'],
dct['hourly']['windspeed_10m'],
dct['hourly']['winddirection_10m'],
dct['hourly']['soil_temperature_7_to_28cm'],
dct['hourly']['soil_moisture_0_to_7cm'],
dct['hourly']['et0_fao_evapotranspiration'],
dct['hourly']['time']],
index = ['Temperature', 'Humidity', 'Rain', 'Cloudcover', 'Shortwave_Radiation', 'Windspeed', 'Winddirection', 'Soil_temperature_7-28', 'Soil_moisture_0-7', 'Et0_evapotranspiration', 'Timestamp'])
.T
.assign(Timestamp = lambda x : pd.to_datetime(x.Timestamp, format='%Y-%m-%dT%H:%M'))
.set_index(['Timestamp'])
.dropna())
# Add timezone information without converting
data_w_fetched.index = data_w_fetched.index.map(lambda x: x.replace(tzinfo=pytz.timezone(Timezone)))
#data_w.index = pd.to_datetime(data_w.index) + pd.DateOffset(hours=get_timezone_offset(timezone))
# convert cols to float64
data_w_fetched = convert_cols(data_w_fetched)
# If empty
if Data_w.empty:
# Set as global
Data_w = data_w_fetched
# If it is the same
elif data_w_fetched.index[-1] == Data_w.index[-1]:
return Data_w
# Concat data
else:
# Merge former historical weather data and fetched one into one dataframe
Data_w = pd.concat([Data_w.loc[Data_w.index[0]:],
data_w_fetched.loc[Data_w.index[-1] +
timedelta(minutes=Sample_rate)
: data.index[-1]]
])
return Data_w
# Get weather forecast from open-meteo
def get_weather_forecast_api(start_date, end_date):
# Timezone and geo_location
lat = Current_config["Gps_info"]["lattitude"]
long = Current_config["Gps_info"]["longitude"]
# Define the API URL for weather forecast
url = (
f'https://api.open-meteo.com/v1/forecast'
f'?latitude={lat}'
f'&longitude={long}'
f'&hourly=temperature_2m,relative_humidity_2m,precipitation,cloud_cover,et0_fao_evapotranspiration,wind_speed_10m,wind_direction_10m,soil_temperature_18cm,soil_moisture_3_to_9cm,shortwave_radiation'
f'&start_date={start_date.strftime("%Y-%m-%d")}'
f'&end_date={end_date.strftime("%Y-%m-%d")}'
f'&timezone={Timezone}'
)
# Use subprocess to run the curl command and decode the output
dct = subprocess.check_output(['curl', url]).decode()
dct = json.loads(dct)
# Convert API response to a pandas dataframe
data_forecast = (pd.DataFrame([dct['hourly']['temperature_2m'],
dct['hourly']['relative_humidity_2m'],
dct['hourly']['precipitation'],
dct['hourly']['cloud_cover'],
dct['hourly']['shortwave_radiation'],
dct['hourly']['wind_speed_10m'],
dct['hourly']['wind_direction_10m'],
dct['hourly']['soil_temperature_18cm'],
dct['hourly']['soil_moisture_3_to_9cm'],
dct['hourly']['et0_fao_evapotranspiration'],
dct['hourly']['time']],
index = ['Temperature', 'Humidity', 'Rain', 'Cloudcover', 'Shortwave_Radiation', 'Windspeed', 'Winddirection', 'Soil_temperature_7-28', 'Soil_moisture_0-7', 'Et0_evapotranspiration', 'Timestamp'])
.T
.assign(Timestamp = lambda x : pd.to_datetime(x.Timestamp, format='%Y-%m-%dT%H:%M'))
.set_index(['Timestamp'])
.dropna())
# Add timezone information without converting
#data_forecast.index = data_forecast.index.tz_localize('UTC').tz_convert(timezone)
#data_forecast.index = data_forecast.index.tz_localize(timezone, utc=True)
#data_forecast.index = pd.DatetimeIndex(data_forecast.index).tz_localize('UTC').tz_convert('Europe/Berlin')
data_forecast.index = data_forecast.index.map(lambda x: x.replace(tzinfo=pytz.timezone(Timezone)))
#data_forecast.index = pd.to_datetime(data_forecast.index) + pd.DateOffset(hours=get_timezone_offset(timezone)) + pd.DateOffset(hours=1.0)
# convert cols to float64
data_forecast = convert_cols(data_forecast)
return data_forecast
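# Illustrative call of get_weather_forecast_api (a sketch, not executed; requires internet
# access and GPS info in Current_config):
#   start = datetime.datetime.now()
#   forecast_df = get_weather_forecast_api(start, start + timedelta(days=Forcast_horizon))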
# convert to float64
def convert_cols(data):
obj_dtype = 'object'
for col in data.columns:
if data[col].dtype == obj_dtype:
data[col] = pd.to_numeric(data[col], errors='coerce')
print(col, "has the following dtype: ", data[col].dtype)
return data
# Resample data
def resample(d):
d_resample = d.resample(str(Sample_rate)+'T').mean()
return d_resample
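# Illustrative usage of resample (a sketch, not executed): with Sample_rate = 60 the data is
# averaged into 60-minute bins ('60T'); the dataframe must have a DatetimeIndex:
#   hourly_df = resample(sensor_df)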
# Volumetric water content => Not called
def soil_tension_to_volumetric_water_content(soil_tension, soil_water_retention_curve):
"""
Convert soil tension (kPa) to volumetric water content (fraction) using a given soil-water retention curve.
Parameters:
soil_tension (float): Soil tension value in kPa.
soil_water_retention_curve (list of tuples): A list of tuples containing points on the soil-water retention curve.
Each tuple contains two elements: (soil_tension_value, volumetric_water_content_value).
Returns:
float: Volumetric water content as a fraction (between 0 and 1).
"""
# Find the two points on the curve that bound the given soil tension value
lower_point, upper_point = None, None
for tension, water_content in soil_water_retention_curve:
if tension <= soil_tension:
lower_point = (tension, water_content)
else:
upper_point = (tension, water_content)
break
# If the soil tension is lower than the first point on the curve, use the first point
if lower_point is None:
return soil_water_retention_curve[0][1]
# If the soil tension is higher than the last point on the curve, use the last point
if upper_point is None:
return soil_water_retention_curve[-1][1]
# Interpolate to find the volumetric water content at the given soil tension
tension_diff = upper_point[0] - lower_point[0]
water_content_diff = upper_point[1] - lower_point[1]
interpolated_water_content = lower_point[1] + ((soil_tension - lower_point[0]) / tension_diff) * water_content_diff
return interpolated_water_content
# VWC with log scale => Not called
def soil_tension_to_volumetric_water_content_log(soil_tension, soil_water_retention_curve):
# Transform the tension and content values to logarithmic space
tensions_log = np.log10([point[0] for point in soil_water_retention_curve])
content_log = np.log10([point[1] for point in soil_water_retention_curve])
# Interpolate in logarithmic space
interpolated_content_log = np.interp(np.log10(soil_tension), tensions_log, content_log)
# Transform back to linear space
interpolated_content = 10 ** interpolated_content_log
return interpolated_content
# VWC with splines scale
def soil_tension_to_volumetric_water_content_spline(soil_tension, soil_water_retention_curve):
"""
Convert soil tension (kPa) to volumetric water content (fraction) using cubic spline interpolation.
Parameters:
soil_tension (float): Soil tension value in kPa.
soil_water_retention_curve (list of tuples): A list of tuples containing points on the soil-water retention curve.
Each tuple contains two elements: (soil_tension_value, volumetric_water_content_value).
Returns:
float: Volumetric water content as a fraction (between 0 and 1).
"""
# Extract tension and water content values from the curve
tensions, water_contents = zip(*soil_water_retention_curve)
# Create a cubic spline interpolator
spline = CubicSpline(tensions, water_contents, bc_type='natural')
# Evaluate the spline at the given soil tension
interpolated_water_content = spline(soil_tension)
# Clip the result to ensure it remains within the valid range [0, 1]
return np.clip(interpolated_water_content, 0, 1)
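# Illustrative usage of the spline conversion (a sketch, not executed; the curve points
# below are made-up placeholders, real ones come from the config's retention curve):
#   curve = [(10.0, 0.40), (30.0, 0.30), (60.0, 0.22), (100.0, 0.15)]  # (kPa, VWC)
#   soil_tension_to_volumetric_water_content_spline(45.0, curve)       # roughly 0.26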
def add_volumetric_col_to_df(df, col_name):
# Iterate over the rows of the dataframe and calculate volumetric water content
soil_water_retention_tupel_list = [(float(dct['Soil tension']), float(dct['VWC'])) for dct in Current_config['Soil_water_retention_curve']]
# Sort the soil-water retention curve points by soil tension in ascending order => TODO: NOT EFFICIENT HERE, move out
sorted_curve = sorted(soil_water_retention_tupel_list, key=lambda x: x[0])
for index, row in df.iterrows():
soil_tension = row[col_name]
# Calculate volumetric water content
volumetric_water_content = soil_tension_to_volumetric_water_content_spline(soil_tension, sorted_curve)
# Assign the calculated value to a new column in the dataframe
df.at[index, col_name + '_vol'] = round(volumetric_water_content, 4)
return df
def calc_volumetric_water_content_single_value(soil_tension_value):
global Current_config
    # Check that the config is loaded, otherwise read it
    if 'Soil_water_retention_curve' not in Current_config:
        Current_config = read_config()
# Iterate over the rows of the dataframe and calculate volumetric water content
soil_water_retention_tupel_list = [(float(dct['Soil tension']), float(dct['VWC'])) for dct in Current_config['Soil_water_retention_curve']]
# Sort the soil-water retention curve points by soil tension in ascending order => TODO: NOT EFFICIENT HERE, move out
sorted_curve = sorted(soil_water_retention_tupel_list, key=lambda x: x[0])
# Calculate volumetric water content
volumetric_water_content = soil_tension_to_volumetric_water_content_spline(soil_tension_value, sorted_curve)
return volumetric_water_content
def align_retention_curve_with_api(data, data_weather_api):
soil_water_retention_tupel_list = [(float(dct['Soil tension']), float(dct['VWC'])) for dct in Current_config['Soil_water_retention_curve']]
# Sort the soil-water retention curve points by soil tension in ascending order => TODO: NOT EFFICIENT HERE, move out
sorted_curve = sorted(soil_water_retention_tupel_list, key=lambda x: x[0])
    # Compare past weather data from the API against the measured sensor values to derive a scaling factor:
mean_recorded_sensor_values = data["rolling_mean_grouped_soil_vol"].mean()
mean_open_meteo_past_vol = data_weather_api["Soil_moisture_0-7"].mean()
factor = mean_open_meteo_past_vol / mean_recorded_sensor_values
print("mean_recorded_sensor_values: ", mean_recorded_sensor_values, " mean_open_meteo_past_vol: ", mean_open_meteo_past_vol, " factor: ", factor)
# Multiply the second column by factor
modified_curve = [(x, y * factor) for x, y in sorted_curve]
return modified_curve
# TODO: a more sophisticated approach is needed: it should learn from former irrigations => introduce a model; this is now excluded when a flow meter is installed
def add_pump_state(data):
slope = float(Current_config["Slope"])
# for index, row in data.iterrows():
# if row['gradient'] < slope:
# #print(index, row['rolling_mean_grouped_soil'], row['gradient'], row['Rain'])
# # only add if there was no rain in the previous hours
# if row['Rain'] == 0.0:
# data.at[index, 'pump_state'] = 1
for i in range(1,len(data)):
# look for two consecutive occurances of high negative slope
if data.iloc[i-1]['gradient'] < slope and data.iloc[i]['gradient'] < slope:
# and if it rained now and previously
if data.iloc[i-1]['Rain'] == 0.0 and data.iloc[i]['Rain'] == 0.0:
data.loc[data.index[i], 'pump_state'] = 1
data.loc[data.index[i-1], 'pump_state'] = 1
return data
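# Illustrative effect of add_pump_state (a sketch, not executed; assuming Current_config["Slope"]
# is a negative threshold such as -0.5): two consecutive rows whose 'gradient' is below the slope
# and whose 'Rain' is 0.0 are both flagged with pump_state = 1, i.e. treated as an irrigation event:
#   data = add_pump_state(data)   # requires 'gradient', 'Rain' and 'pump_state' columns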
# Calculate time since the pump was last on (time since last irrigation)
# TODO: needs to be included, but is not tested yet
def hours_since_pump_was_turned_on(df):
# Find the index of rows where pump state is 1
pump_on_indices = df[df['pump_state'] == 1].index
# Initialize a new column with NaN values
df['rows_since_last_pump_on'] = float('nan')
# Iterate over pump_on_indices and update the new column
for i in range(len(pump_on_indices)):
if i == 0:
# If it's the first occurrence, update with the total rows
df.loc[:pump_on_indices[i], 'rows_since_last_pump_on'] = len(df)
else:
# Update with the difference in rows since the last occurrence
df.loc[pump_on_indices[i - 1] + 1:pump_on_indices[i], 'rows_since_last_pump_on'] = \
(pump_on_indices[i] - pump_on_indices[i - 1] - pd.Timedelta(seconds=1)) / pd.Timedelta('1 hour')
# Fill NaN values with 0 for rows where pump state is 1
df['rows_since_last_pump_on'] = df['rows_since_last_pump_on'].fillna(0).astype(int)
return df
# Function to ensure the JSON file exists or create it if missing
def ensure_json_file(file_path):
if not os.path.exists(file_path):
with open(file_path, 'w') as file:
json.dump({"irrigations": []}, file)
print(f"Created new JSON file at: {file_path}")
# include the (on device saved) amount of irrigation given
def include_irrigation_amount(df):
irrigation_file = 'data/irrigations.json'
# Check and ensure the JSON file exists
ensure_json_file(irrigation_file)
if Load_irrigations_from_file:
# Load JSON data from file
with open(irrigation_file, 'r') as file:
irrigations_json = json.load(file)
print("Loaded JSON data:")
print(irrigations_json)
# Convert the 'irrigations' list to a pandas DataFrame
irrigations_df = pd.DataFrame(irrigations_json['irrigations'])
print("\nDataFrame from JSON:")
print(irrigations_df.head()) # Check the first few rows to ensure data is loaded correctly
# Convert timestamp to datetime
irrigations_df['timestamp'] = pd.to_datetime(irrigations_df['timestamp'])
# Set timestamp as index
irrigations_df.set_index('timestamp', inplace=True)
print("\nDataFrame after timestamp conversion and setting index:")
print(irrigations_df.head()) # Check again to ensure timestamps are converted correctly
# Resample the irrigations dataframe to hourly intervals, summing the amounts
irrigations_resampled = irrigations_df.resample('H').sum()
# Reindex the irrigations dataframe to match the main dataframe's index, filling missing values with zero
irrigations_reindexed = irrigations_resampled.reindex(df.index, fill_value=0)
print("\nReindexed DataFrame:")
print(irrigations_reindexed.head()) # Check reindexed DataFrame to see if it aligns with df's index
# Add the reindexed irrigation amounts to the main dataframe
df['irrigation_amount'] = irrigations_reindexed['amount']
else:
if (len(DeviceAndSensorIdsFlow) > 0):
# Load data and create the dataframe
data_irrigation = load_data_api(DeviceAndSensorIdsFlow[0], "actuators", Current_config['Start_date'])
df_irrigation = pd.DataFrame(data_irrigation)
df_irrigation.rename(columns={'time': 'Timestamp'}, inplace=True)
df_irrigation['Timestamp'] = pd.to_datetime(df_irrigation['Timestamp'], utc=True)
df_irrigation['Timestamp'] = df_irrigation['Timestamp'].dt.tz_convert(Timezone) # Replace with the correct timezone
df_irrigation.rename(columns={'value': 'irrigation_amount'}, inplace=True)
# Convert irrigation_amount to numeric, forcing errors to NaN
df_irrigation['irrigation_amount'] = pd.to_numeric(df_irrigation['irrigation_amount'], errors='coerce')
# Round the timestamps to the nearest hour and aggregate the irrigation amounts
df_irrigation['Timestamp'] = df_irrigation['Timestamp'].dt.round('H')
df_irrigation = df_irrigation.groupby('Timestamp').agg({'irrigation_amount': 'sum'}).reset_index()
            # Subtract one hour (disabled; the cause of this offset is unclear)
#df_irrigation['Timestamp'] = df_irrigation['Timestamp'] - pd.to_timedelta(1, unit='h')
# Set the Timestamp as the index
df_irrigation.set_index('Timestamp', inplace=True)
# Timezone has to be set for df_irrigation
df_irrigation = df_irrigation.tz_convert(Timezone)
# Merge the dataframes
df = pd.merge(df, df_irrigation, left_index=True, right_index=True, how='outer', suffixes=('_main', '_irrigation'))
# Fill missing irrigation_amount with 0
df['irrigation_amount'] = df['irrigation_amount'].fillna(0)
return df
# Augment the dataset creating new features
def create_features(data):
# Create average cols
data['grouped_soil'] = data[DeviceAndSensorIdsMoisture].mean(axis=1)
data['grouped_soil_temp'] = data[DeviceAndSensorIdsTemp].mean(axis=1)
# Create rolling mean: introduces NaN again -> later just cut off
data['rolling_mean_grouped_soil'] = data['grouped_soil'].rolling(window=RollingMeanWindowGrouped, win_type='gaussian').mean(std=RollingMeanWindowGrouped)
data['rolling_mean_grouped_soil_temp'] = data['grouped_soil_temp'].rolling(window=RollingMeanWindowGrouped, win_type='gaussian').mean(std=RollingMeanWindowGrouped)
# Drop those values without rolling_mean /// was 18 before
data = data[4:]
# Resample data
data = resample(data)
# Create time related features
data['hour'] = data.index.hour#.astype("float64")
data['minute'] = data.index.minute#.astype("float64")
data['date'] = data.index.day#.astype("float64")
data['month'] = data.index.month#.astype("float64")
data['day_of_year'] = data.index.dayofyear#.astype("float64")
# Get weather from weather meteo
data_weather = get_historical_weather_api(data)
# Resample weatherdata before merge => takes a long time
data_weather = resample(data_weather)
# historical weather data is not available for the latest two days, use forecast to account for that!
data_weather_endtime = data_weather.index[-1]
data_endtime = data.index[-1]
# Get forecast for the ~last two days
data_weather_recent_forecast = get_weather_forecast_api(data_weather_endtime, data_endtime)
# Merge weather data to one dataframe
data_weather_merged = pd.concat([data_weather.loc[data.index[0]:],
data_weather_recent_forecast.loc[data_weather_endtime +
timedelta(minutes=Sample_rate)
: data_endtime]
])
# Merge data_weather_merged into data
data = pd.merge(data, data_weather_merged, left_index=True, right_index=True, how='outer')
# # Calculate and add volumetric water content => do not use this approach, does not yield better results
# data = add_volumetric_col_to_df(data, 'rolling_mean_grouped_soil')
# # align soil water retention curve with data from API => do not use this approach, does not yield better results
# corrected_water_retention_curve = align_retention_curve_with_api(data, data_weather)
# # Drop not aligned curve
# data = data.drop(columns=['rolling_mean_grouped_soil_vol'])
# # Calculate and add CORRECTED volumetric water content
# data = add_volumetric_col_to_df(data, 'rolling_mean_grouped_soil', corrected_water_retention_curve)
# Omit rows when there is no data from physical sensors for over six hours => below: interpolate
data = remove_large_gaps(data, 'rolling_mean_grouped_soil', 6)
    # Check gaps => TODO: not every col should be interpolated (month?), some data is lost here
data = fill_gaps(data)
# Add calculated pump state or actual irrigation amount
# Calc gradient
f = data.rolling_mean_grouped_soil
data['gradient'] = np.gradient(f)
    # Skip the pump state if there is a flow meter with which the irrigation amount is measured
if "DeviceAndSensorIdsFlow" in Current_config:
data = include_irrigation_amount(data)
else:
data['pump_state'] = int(0)
data = add_pump_state(data)
# also add hours since last irrigation => TODO: check later, still an error, !!!!!questionable whether it is useful!!!!!
#data = hours_since_pump_was_turned_on(data)
return data
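# Note on create_features (a sketch of the expected input, not executed): 'data' must be a
# dataframe with a DatetimeIndex containing the raw sensor columns listed in
# DeviceAndSensorIdsMoisture / DeviceAndSensorIdsTemp; the function returns the merged,
# resampled feature set:
#   features = create_features(raw_sensor_df)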
# Normalize the data with a min-max approach to the range 0-1
def normalize(data):
# feature scaling
data.describe()
# Min-Max Normalization
df = data.drop(['Time','rolling_mean_grouped_soil', 'hour', 'minute', 'date', 'month'], axis=1)
df_norm = (df-df.min())/(df.max()-df.min())
df_norm = pd.concat([df_norm, data['Time'],data['hour'], data['minute'], data['date'], data['month'], data.rolling_mean_grouped_soil], 1)
# bring back to order -> not important -> will not work in production
data = data[['Time', 'hour', 'minute', 'date', 'month', 'grouped_soil',
'grouped_resistance', 'grouped_soil_temp', 'rolling_mean_grouped_soil',
'rolling_mean_grouped_soil_temp',
'638de53168f31919048f189d, 63932c6e68f319085df375b5; B2_solar_x2_03, Soil_tension',
'638de53168f31919048f189d, 63932c6e68f319085df375b7; B2_solar_x2_03, Resistance',
'638de53168f31919048f189d, 63932c6e68f319085df375b9; B2_solar_x2_03, Soil_temperature',
'638de56a68f31919048f189e, 63932ce468f319085df375cb; B3_solar_x1_02, Resistance',
'638de57568f31919048f189f, 63932c3668f319085df375ab; B4_solar_x1_03, Soil_tension',
'638de57568f31919048f189f, 63932c3668f319085df375ad; B4_solar_x1_03, Resistance',
'638de57568f31919048f189f, 63932c3668f319085df375af; B4_solar_x1_03, Soil_temperature'
]]
return df_norm
# Split dataset into train and test set by date
def split_data_by_date(data, split_date):
return data[data['Time'] <= split_date].copy(), \
data[data['Time'] > split_date].copy()
# Split dataset into train and test set by ratio
def split_by_ratio(data, test_size_percent):
# Calculate the number of rows for the test set
test_size = int(len(data) * (test_size_percent / 100))
# Split the DataFrame
train_set = data.iloc[:-test_size]
test_set = data.iloc[-test_size:]
return train_set, test_set
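# Illustrative usage of split_by_ratio (a sketch, not executed): hold out the last 20 % of
# the rows as a test set:
#   train_set, test_set = split_by_ratio(data, 20)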
# Delete the ones that are non consecutive
def delete_nonconsecutive_rows(df, column_name, min_consecutive):
arr = df[column_name].to_numpy()
i = 0
while i < len(arr) - 1:
if arr[i+1] == arr[i] + 1:
start_index = i
while i < len(arr) - 1 and arr[i+1] == arr[i] + 1:
i += 1
end_index = i
if end_index - start_index + 1 < min_consecutive:
df = df.drop(range(start_index, end_index+1))
i += 1
return df
# Create visual representation of irrigation times
def highlight(data_plot, ax, neg_slope):
for index, row in neg_slope.iterrows():
current_index = int(row['index'])
#print(current_index)
ax.axvspan(current_index-10, current_index+10, facecolor='pink', edgecolor='none', alpha=.5)
# Create ranges to remove from data
def create_split_tuples(df, indices_to_omit):
# Sort the indices in ascending order
indices_to_omit = sorted(indices_to_omit)
# Create a list of index ranges to remove
ranges_to_remove = []
start_idx = None
for idx in indices_to_omit:
if start_idx is None:
start_idx = idx
elif idx == start_idx + 1:
start_idx = idx
else:
ranges_to_remove.append((int(start_idx), int(idx-1)))
start_idx = idx
if start_idx is not None:
ranges_to_remove.append((int(start_idx), df.index.max()))
print("Irrigation times to be omitted: ", ranges_to_remove)
print("type: ", type(ranges_to_remove[0][0]))
return ranges_to_remove
# Split data into separate dataframes
def split_dataframe(df, index_ranges):
dfs = []
for i, (start, end) in enumerate(index_ranges):
if index_ranges[i][1]-index_ranges[i][0] < 50:
continue