-
Notifications
You must be signed in to change notification settings - Fork 0
/
compute_hydro_hourly_vars.py
135 lines (123 loc) · 7.97 KB
/
compute_hydro_hourly_vars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import glob
import os

import numpy as np
import pandas as pd
# DISCLAIMER:
# The full hourly hydrologic dataset is too large to host on GitHub, so this script will not run as downloaded:
# the input files it reads from the hard-coded local paths below are not part of the repository.
# It is provided as a guide to the workflow used to preprocess the hourly hydrologic data.
def split_by_intrinsic_var(df, var_str):
    """
    Split the DataFrame 'df' into multiple DataFrames based on an identifying categorical variable given by the
    user through a column named 'var_str'.

    The pieces are returned in the order in which each distinct value first appears in 'df', and every piece gets
    a fresh 0..n-1 index. Rows whose 'var_str' value is missing (NaN) are not placed in any piece, because pandas
    groupby leaves NaN keys out by default.
    @params:
        df (DataFrame): The original DataFrame to be split.
        var_str (str): The column name of the variable used for splitting.
    @returns:
        list: A list of DataFrames, each corresponding to a distinct value of the specified variable.
              An empty 'df' yields an empty list.
    """
    # Group once instead of re-grouping the whole frame for every distinct value; sort=False keeps the
    # first-appearance order. reset_index(drop=True) discards the old index directly, so an existing column that
    # happens to be called 'index' is left untouched and a named index cannot cause a KeyError.
    return [group.reset_index(drop=True) for _, group in df.groupby(var_str, sort=False)]
def save_to_folder(folder_path, df_list, group_col):
    """
    Save a list of grouped DataFrames into separate CSV files, one file per DataFrame.

    Each file is named '<group>.csv', where <group> is the value in the first row of the 'group_col' column,
    which corresponds to a categorical variable. Within one DataFrame every value in 'group_col' should be the
    same. The DataFrame index is written to the file as well (pandas' to_csv default).
    @params:
        folder_path (str): Path to a folder where CSV files will be saved. The folder is created if it does not
                           exist yet. Any path separator accepted by the operating system may be used.
        df_list (list): A list of DataFrames, each corresponding to a different group.
        group_col (str): The column name used for grouping the DataFrames.
    @returns:
        None: Exports CSV files to the specified folder.
    """
    # Make sure the destination exists so that to_csv does not fail on a missing folder.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    for df in df_list:
        if df.empty:
            # No first row to take the group name from, hence nothing to export.
            continue
        # iloc[0] is positional, so it works whatever the index labels are; str() lets non-string group
        # values (e.g. integer IDs) be used as file names.
        group_str = str(df[group_col].iloc[0])
        # os.path.join inserts the separator of the current platform instead of a hard-coded backslash.
        df.to_csv(os.path.join(folder_path, group_str + ".csv"))
# Root folder on the local machine that holds the raw CRMS exports and receives the processed outputs.
base_dir = "D:\\Etienne\\PAPER_2023\\CRMS_Continuous_Hydrographic"


def _flood_depths_and_crossings(water_levels):
    """
    Derive per-sample flood depths and flood-boundary crossings from a series of water elevations.
    @params:
        water_levels (ndarray): 1-D numeric array of water elevation relative to the marsh surface, in the order
                                the samples were recorded. Values above 0 are treated as "flooded".
    @returns:
        tuple: (depths, crossings). 'depths' holds the water level where the sample is flooded and 0 elsewhere.
               'crossings' holds 1.0 at every sample whose flooded state differs from the previous sample
               (water coming in or going out) and 0.0 elsewhere.
    """
    flooded = water_levels > 0
    depths = np.where(flooded, water_levels, 0.0)
    crossings = np.zeros(len(water_levels))
    # The first sample has no predecessor, so it can never be a crossing. (Indexing [i - 1] at i == 0 would wrap
    # around and compare the first reading against the last one.)
    crossings[1:] = flooded[1:] != flooded[:-1]
    return depths, crossings


# Process the nine raw export files (export-0.csv ... export-8.csv) one after the other.
for export_num in range(0, 9):
    idx = str(export_num)
    export_path = base_dir + "\\export-" + idx

    # Reads data stored in the file specified by 'idx' on the local computer.
    floodingdf = pd.read_csv(export_path + ".csv", encoding="unicode_escape")[
        ['Station ID', 'Adjusted Water Elevation to Marsh (ft)', 'Date (mm/dd/yyyy)']]
    # Make the simple site name (first 8 characters of the station ID). Leads to common grounds for identifying
    # CRMS stations.
    floodingdf['Simple site'] = [station_id[:8] for station_id in floodingdf['Station ID']]
    floodingdf = floodingdf.drop('Station ID', axis=1)

    # Split the DataFrame into multiple DataFrames based on the CRMS station name
    split_bysite_ls = split_by_intrinsic_var(floodingdf, "Simple site")
    # Save the separated DataFrames locally, one CSV per station, in the folder named after the export
    save_to_folder(export_path, split_bysite_ls, "Simple site")

    # Re-load every CSV file found in that folder back into a DataFrame
    listdfs = [pd.read_csv(filename, encoding="unicode_escape") for filename in glob.glob(export_path + "/*.csv")]

    # Containers keyed by station name
    dictdf = {}  # cleaned station DataFrames
    arraydf = {}  # per-sample flood-frequency contribution
    flooddepthdf = {}  # per-sample flood depth
    for d in listdfs:
        dfname = d['Simple site'][0]  # name of CRMS station used to name dataframe and associated features
        station = d.dropna().reset_index()
        dictdf[dfname] = station
        if len(station) == 0:  # nothing left after dropping incomplete rows
            continue

        # water elevation relative to marsh elevation
        wlarray = station['Adjusted Water Elevation to Marsh (ft)'].to_numpy()
        station['date in datetime'] = pd.to_datetime(station['Date (mm/dd/yyyy)'], format='%m/%d/%Y')
        # NOTE(review): the first and last rows are taken as the start and end of monitoring, which presumes
        # the rows are in chronological order - confirm against the raw exports.
        start = station['date in datetime'][0]
        end = station['date in datetime'][len(station) - 1]
        decimalyears = (end - start).days / 365  # monitoring period in decimal years, for floods/year

        # Every flood is seen twice (water coming in, then going out), so each crossing counts as half a flood.
        # Records that do not span a positive number of days are left un-normalised.
        per_crossing = 0.5 / decimalyears if decimalyears > 0 else 0.5

        depths, crossings = _flood_depths_and_crossings(wlarray)
        flooddepthdf[dfname] = depths
        arraydf[dfname] = crossings * per_crossing

    # Collapse the per-sample arrays into one summary row per station
    stackeddf_floodFreq = {}
    stackeddf_floodDepth = {}
    for key in arraydf:
        site_col = dictdf[key]['Simple site'].to_numpy()
        depth_vals = flooddepthdf[key]
        n_rows = len(depth_vals)

        freq_table = np.column_stack((site_col, arraydf[key]))
        # NOTE(review): the percentiles and standard deviation below are taken over ALL samples, including the
        # zeros stored for non-flooded samples, and the 'Avg.' column is reduced with a median further down -
        # confirm that this matches the "when Flooded" / "Avg." column labels.
        depth_table = np.column_stack((
            site_col,
            depth_vals,
            np.full(shape=n_rows, fill_value=np.percentile(a=depth_vals, q=90)),  # 90th percentile of flood depth
            np.full(shape=n_rows, fill_value=np.percentile(a=depth_vals, q=10)),  # 10th percentile of flood depth
            np.full(shape=n_rows, fill_value=np.std(depth_vals)),  # standard deviation of flood depth
        ))

        # Summing the per-sample contributions gives the station's floods per year
        stackeddf_floodFreq[key] = pd.DataFrame(freq_table,
                                                columns=['Simple site', 'Flood Freq (Floods/yr)']) \
            .groupby('Simple site').sum().reset_index()
        # The percentile / std columns are constant within a station, so the median returns them unchanged
        stackeddf_floodDepth[key] = pd.DataFrame(depth_table,
                                                 columns=['Simple site',
                                                          'Avg. Flood Depth when Flooded (ft)',
                                                          '90th Percentile Flood Depth when Flooded (ft)',
                                                          '10th Percentile Flood Depth when Flooded (ft)',
                                                          'Std. Deviation Flood Depth when Flooded ']) \
            .groupby('Simple site').median().reset_index()

    # Export to local computer for use in ML experiments
    check_freq = pd.concat(stackeddf_floodFreq.values(), ignore_index=True)
    check_freq.to_csv(base_dir + "\\floodingsplits\\floodFrequencySitePerYear-" + idx + ".csv")
    check_depth = pd.concat(stackeddf_floodDepth.values(), ignore_index=True)
    check_depth.to_csv(base_dir + "\\flooddepthsplits\\floodDepthSitePerYear-" + idx + ".csv")