-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Percentile qc - test #162
base: main
Are you sure you want to change the base?
Percentile qc - test #162
Changes from all commits
2af3d54
8586a91
9f2a0ec
df91382
7985e7d
0d9bece
c766e9c
5bdbfbc
fa1c0e1
560c808
5a17f96
d678550
156161f
2ad99fe
2807977
a66e392
b9d29d5
1c5fd1a
f3d113e
c185aaa
564705f
6a55ec7
068004d
53e2d5c
64c0e48
879d43d
44f4360
ddf2fd3
1ab0ba0
4a8d581
aa38424
09b4f44
305fd69
9ea80a9
370572d
3c3d54f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -6,8 +6,12 @@ | |||||
import urllib.request | ||||||
from urllib.error import HTTPError, URLError | ||||||
import pandas as pd | ||||||
import subprocess | ||||||
import os | ||||||
import xarray as xr | ||||||
import sqlite3 | ||||||
|
||||||
# from IPython import embed | ||||||
|
||||||
def toL2(L1, T_0=273.15, ews=1013.246, ei0=6.1071, eps_overcast=1., | ||||||
eps_clear=9.36508e-6, emissivity=0.97): | ||||||
|
@@ -45,15 +49,26 @@ def toL2(L1, T_0=273.15, ews=1013.246, ei0=6.1071, eps_overcast=1., | |||||
except Exception as e: | ||||||
print('Flagging and fixing failed:') | ||||||
print(e) | ||||||
|
||||||
#stid = ds.station_id | ||||||
|
||||||
|
||||||
|
||||||
ds = differenceQC(ds) # Flag and Remove difference outliers | ||||||
ds = percentileQC(ds) # Flag and remove percentile outliers | ||||||
|
||||||
T_100 = _getTempK(T_0) | ||||||
ds['rh_u_cor'] = correctHumidity(ds['rh_u'], ds['t_u'], | ||||||
T_0, T_100, ews, ei0) | ||||||
|
||||||
# Determiune cloud cover | ||||||
cc = calcCloudCoverage(ds['t_u'], T_0, eps_overcast, eps_clear, # Calculate cloud coverage | ||||||
ds['dlr'], ds.attrs['station_id']) | ||||||
ds['cc'] = (('time'), cc.data) | ||||||
# Determiune cloud cover for on-ice stations | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wondering why this change for cloud cover is here? This has nothing to do with percentiles I assume? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is something Baptiste implemented in the meantime, so no, nothing to do with the qc. |
||||||
if not ds.attrs['bedrock']: | ||||||
cc = calcCloudCoverage(ds['t_u'], T_0, eps_overcast, eps_clear, # Calculate cloud coverage | ||||||
ds['dlr'], ds.attrs['station_id']) | ||||||
ds['cc'] = (('time'), cc.data) | ||||||
else: | ||||||
# Default cloud cover for bedrock station for which tilt should be 0 anyway. | ||||||
cc = 0.8 | ||||||
|
||||||
# Determine surface temperature | ||||||
ds['t_surf'] = calcSurfaceTemperature(T_0, ds['ulr'], ds['dlr'], # Calculate surface temperature | ||||||
|
@@ -141,6 +156,248 @@ def toL2(L1, T_0=273.15, ews=1013.246, ei0=6.1071, eps_overcast=1., | |||||
T_0, T_100, ews, ei0) | ||||||
return ds | ||||||
|
||||||
def differenceQC(ds): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would make sense to add such functions to a separate module like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it might make sense to take this out of Also, it would be great to add a short description at the top of the docstrings for both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is definitely something we will do in the future, probably in a future update. A better described docsrting is also a good idea There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, it is a persistence/static check. We will rename it to staticQC. |
||||||
''' | ||||||
|
||||||
Parameters | ||||||
---------- | ||||||
ds : xr.Dataset | ||||||
Level 1 datset | ||||||
|
||||||
Returns | ||||||
------- | ||||||
ds_out : xr.Dataset | ||||||
Level 1 dataset with difference outliers set to NaN | ||||||
''' | ||||||
|
||||||
# the differenceQC is not done on the Windspeed | ||||||
# Optionally examine flagged data by setting make_plots to True | ||||||
# This is best done by running aws.py directly and setting 'test_station' | ||||||
# Plots will be shown before and after flag removal for each var | ||||||
|
||||||
stid = ds.station_id | ||||||
df = ds.to_dataframe() # Switch to pandas | ||||||
|
||||||
# Define threshold dict to hold limit values, and the difference values. | ||||||
# Limit values indicate how much a variable has to change to the previous value | ||||||
# diff_period is how many hours a value can stay the same without being set to NaN | ||||||
# * are used to calculate and define all limits, which are then applied to *_u, *_l and *_i | ||||||
|
||||||
var_threshold = { | ||||||
't': {'static_limit': 0.001, 'diff_period' : 1}, | ||||||
'p': {'static_limit': 0.0001, 'diff_period' : 24}, | ||||||
'rh': {'static_limit': 0.0001, 'diff_period' : 24} | ||||||
} | ||||||
|
||||||
|
||||||
for k in var_threshold.keys(): | ||||||
|
||||||
var_all = [k + '_u',k + '_l',k + '_i'] # apply to upper, lower boom, and instant | ||||||
static_lim = var_threshold[k]['static_limit'] # loading static limit | ||||||
diff_h = var_threshold[k]['diff_period'] # loading diff period | ||||||
|
||||||
for v in var_all: | ||||||
if v in df: | ||||||
|
||||||
data = df[v] | ||||||
diff = data.diff() | ||||||
diff.fillna(method='ffill', inplace=True) # forward filling all NaNs! | ||||||
diff = np.array(diff) | ||||||
|
||||||
diff_period = np.ones_like(diff) * False | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
||||||
for i,d in enumerate(diff): # algorithm that ensures values can stay the same within the diff_period | ||||||
if i > (diff_h-1): | ||||||
if sum(abs(diff[i-diff_h:i])) < static_lim: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider using |
||||||
diff_period[i-diff_h:i] = True | ||||||
|
||||||
diff_period = np.array(diff_period).astype('bool') | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is unnecessary to assign to boolean data type if the array is instantiated as suggested above
Suggested change
|
||||||
|
||||||
outliers = df[v][diff_period] # finding outliers in dataframe | ||||||
|
||||||
df.loc[outliers.index,v] = np.nan # setting outliers to NaN | ||||||
|
||||||
# Back to xarray, and re-assign the original attrs | ||||||
ds_out = df.to_xarray() | ||||||
ds_out = ds_out.assign_attrs(ds.attrs) # Dataset attrs | ||||||
for x in ds_out.data_vars: # variable-specific attrs | ||||||
ds_out[x].attrs = ds[x].attrs | ||||||
# equivalent to above: | ||||||
# vals = [xr.DataArray(data=df_out[c], dims=['time'], coords={'time':df_out.index}, attrs=ds[c].attrs) for c in df_out.columns] | ||||||
# ds_out = xr.Dataset(dict(zip(df_out.columns, vals)), attrs=ds.attrs) | ||||||
return ds_out | ||||||
|
||||||
|
||||||
def percentileQC(ds): | ||||||
''' | ||||||
Parameters | ||||||
---------- | ||||||
ds : xr.Dataset | ||||||
Level 1 dataset | ||||||
|
||||||
Returns | ||||||
------- | ||||||
ds_out : xr.Dataset | ||||||
Level 1 dataset with percentile outliers set to NaN | ||||||
''' | ||||||
# Check if on-disk sqlite db exists | ||||||
# If it not exists, then run compute_percentiles.py | ||||||
|
||||||
base_path = os.getcwd() | ||||||
|
||||||
file_path1 = base_path + '/main/src/pypromice/qc/percentiles.db' | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Environment specific paths/variables shall not be assigned statically in the function body. Use input parameters or a class instead. |
||||||
file_path2 = base_path + '/qc/percentiles.db' | ||||||
|
||||||
script_path1 = base_path + '/main/src/pypromice/qc/compute_percentiles.py' | ||||||
script_path2 = base_path + '/qc/compute_percentiles.py' | ||||||
|
||||||
script_exist1 = os.path.isfile(script_path1) | ||||||
|
||||||
db_exist1 = os.path.isfile(file_path1) | ||||||
db_exist2 = os.path.isfile(file_path2) | ||||||
|
||||||
|
||||||
print(f'Does percentiles.db exist {db_exist2} in this path {file_path2}') | ||||||
|
||||||
|
||||||
if not db_exist1 and not db_exist2: | ||||||
if script_exist1: | ||||||
print(f'percentiles.db does not exist running {script_path1}') | ||||||
subprocess.call(['python',script_path1]) | ||||||
file_path = file_path1 | ||||||
else: | ||||||
print(f'percentiles.db does not exist running {script_path2}') | ||||||
subprocess.call(['python',script_path2]) | ||||||
file_path = file_path2 | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All of the above should be implemented in a separate module and maybe a python class wrapping the database There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why use a database instead of just a datafile like NetCDF? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ladsmund I wrote this original code that uses sqlite. The general idea is to provide processing effficiency. If the percentile distributions (for each variable) are stored in a sql database, we are able to extract the percentiles of interest very fast with a sql query, rather than opening/reading a static file for each station. sqlite is the simplest database solution, where we can have a small file on disk. However, this is of more significant benefit for much larger datasets (e.g. 10s or 100s of thousands of stations). A solution using a static file could also be implemented if desired. Either way, you are persisting something on disk between runs that holds the percentile distributions. |
||||||
|
||||||
# Optionally examine flagged data by setting make_plots to True | ||||||
# This is best done by running aws.py directly and setting 'test_station' | ||||||
# Plots will be shown before and after flag removal for each var | ||||||
make_plots = False | ||||||
|
||||||
stid = ds.station_id | ||||||
stid_type = type(stid) | ||||||
|
||||||
print(f'station id {stid}') | ||||||
print(f'station id type {stid_type}') | ||||||
df = ds.to_dataframe() # Switch to pandas | ||||||
|
||||||
# Define threshold dict to hold limit values, and 'hi' and 'lo' percentile. | ||||||
# Limit values indicate how far we will go beyond the hi and lo percentiles to flag outliers. | ||||||
# *_u are used to calculate and define all limits, which are then applied to *_u, *_l and *_i | ||||||
var_threshold = { | ||||||
't_u': {'limit': 9}, # 'hi' and 'lo' will be held in 'seasons' dict | ||||||
'p_u': {'limit': 12}, | ||||||
'rh_u': {'limit': 12}, | ||||||
'wspd_u': {'limit': 12}, | ||||||
} | ||||||
|
||||||
# Query from the on-disk sqlite db for specified percentiles | ||||||
if db_exist1: | ||||||
file_path = file_path1 | ||||||
else: | ||||||
file_path = file_path2 | ||||||
|
||||||
con = sqlite3.connect(file_path) | ||||||
cur = con.cursor() | ||||||
for k in var_threshold.keys(): | ||||||
if k == 't_u': | ||||||
# Different pattern for t_u, which considers seasons | ||||||
# 1: winter (DecJanFeb), 2: spring (MarAprMay), 3: summer (JunJulAug), 4: fall (SepOctNov) | ||||||
seasons = {1: {}, 2: {}, 3: {}, 4: {}} | ||||||
sql = f"SELECT p0p5,p99p5,season FROM {k} WHERE season in (1,2,3,4) and stid = ?" | ||||||
cur.execute(sql, [stid]) | ||||||
result = cur.fetchall() | ||||||
if result: | ||||||
for row in result: | ||||||
# row[0] is p0p5, row[1] is p99p5, row[2] is the season integer | ||||||
seasons[row[2]]['lo'] = row[0] # 0.005 | ||||||
seasons[row[2]]['hi'] = row[1] # 0.995 | ||||||
var_threshold[k]['seasons'] = seasons | ||||||
else: | ||||||
print(f'{stid} has no {k} data') | ||||||
else: | ||||||
sql = f"SELECT p0p5,p99p5 FROM {k} WHERE stid = ?" | ||||||
cur.execute(sql, [stid]) | ||||||
result = cur.fetchone() # we only expect one row back per station | ||||||
if result: | ||||||
var_threshold[k]['lo'] = result[0] # 0.005 | ||||||
var_threshold[k]['hi'] = result[1] # 0.995 | ||||||
else: | ||||||
print(f'{stid} has no {k} data') | ||||||
|
||||||
con.close() # close the database connection (and cursor) | ||||||
|
||||||
# Set flagged data to NaN | ||||||
for k in var_threshold.keys(): | ||||||
if k == 't_u': | ||||||
# use t_u thresholds to flag t_u, t_l, t_i | ||||||
base_var = k.split('_')[0] | ||||||
vars_all = [k, base_var+'_l', base_var+'_i'] | ||||||
for t in vars_all: | ||||||
if t in df: | ||||||
winter = df[t][df.index.month.isin([12,1,2])] | ||||||
spring = df[t][df.index.month.isin([3,4,5])] | ||||||
summer = df[t][df.index.month.isin([6,7,8])] | ||||||
fall = df[t][df.index.month.isin([9,10,11])] | ||||||
season_dfs = [winter,spring,summer,fall] | ||||||
|
||||||
if make_plots: | ||||||
_plot_percentiles_t(k,t,df,season_dfs,var_threshold,stid) # BEFORE OUTLIER REMOVAL | ||||||
for x1,x2 in zip([1,2,3,4], season_dfs): | ||||||
try: | ||||||
print(f'percentile flagging {t} {x1}') | ||||||
lower_thresh = var_threshold[k]['seasons'][x1]['lo'] - var_threshold[k]['limit'] | ||||||
upper_thresh = var_threshold[k]['seasons'][x1]['hi'] + var_threshold[k]['limit'] | ||||||
outliers_upper = x2[x2.values > upper_thresh] | ||||||
outliers_lower = x2[x2.values < lower_thresh] | ||||||
outliers = pd.concat([outliers_upper,outliers_lower]) | ||||||
df.loc[outliers.index,t] = np.nan | ||||||
df.loc[outliers.index,t] = np.nan | ||||||
except Exception as e: | ||||||
print(f'{t} Season {x1} is not computed due to lack of data') | ||||||
print(e) | ||||||
if make_plots: | ||||||
_plot_percentiles_t(k,t,df,season_dfs,var_threshold,stid) # AFTER OUTLIER REMOVAL | ||||||
else: | ||||||
# use *_u thresholds to flag *_u, *_l, *_i | ||||||
base_var = k.split('_')[0] | ||||||
vars_all = [k, base_var+'_l', base_var+'_i'] | ||||||
for t in vars_all: | ||||||
if t in df: | ||||||
try: | ||||||
print(f'percentile flagging {t}') | ||||||
upper_thresh = var_threshold[k]['hi'] + var_threshold[k]['limit'] | ||||||
lower_thresh = var_threshold[k]['lo'] - var_threshold[k]['limit'] | ||||||
if make_plots: | ||||||
_plot_percentiles(k,t,df,var_threshold,upper_thresh,lower_thresh,stid) # BEFORE OUTLIER REMOVAL | ||||||
if t == 'p_i': | ||||||
# shift p_i so we can use the p_u thresholds | ||||||
shift_p = df[t]+1000. | ||||||
outliers_upper = shift_p[shift_p.values > upper_thresh] | ||||||
outliers_lower = shift_p[shift_p.values < lower_thresh] | ||||||
else: | ||||||
outliers_upper = df[t][df[t].values > upper_thresh] | ||||||
outliers_lower = df[t][df[t].values < lower_thresh] | ||||||
outliers = pd.concat([outliers_upper,outliers_lower]) | ||||||
df.loc[outliers.index,t] = np.nan | ||||||
df.loc[outliers.index,t] = np.nan | ||||||
except Exception as e: | ||||||
print(f'{t} is not flagged due to lack of data') | ||||||
print(e) | ||||||
if make_plots: | ||||||
_plot_percentiles(k,t,df,var_threshold,upper_thresh,lower_thresh,stid) # AFTER OUTLIER REMOVAL | ||||||
|
||||||
# Back to xarray, and re-assign the original attrs | ||||||
ds_out = df.to_xarray() | ||||||
ds_out = ds_out.assign_attrs(ds.attrs) # Dataset attrs | ||||||
for x in ds_out.data_vars: # variable-specific attrs | ||||||
ds_out[x].attrs = ds[x].attrs | ||||||
# equivalent to above: | ||||||
# vals = [xr.DataArray(data=df_out[c], dims=['time'], coords={'time':df_out.index}, attrs=ds[c].attrs) for c in df_out.columns] | ||||||
# ds_out = xr.Dataset(dict(zip(df_out.columns, vals)), attrs=ds.attrs) | ||||||
return ds_out | ||||||
|
||||||
def flagNAN(ds_in, | ||||||
flag_url='https://raw.githubusercontent.com/GEUS-Glaciology-and-Climate/PROMICE-AWS-data-issues/master/flags/', | ||||||
flag_dir='local/flags/'): | ||||||
|
@@ -1012,6 +1269,51 @@ def _getRotation(): # | |||||
rad2deg = 1 / deg2rad | ||||||
return deg2rad, rad2deg | ||||||
|
||||||
def _plot_percentiles_t(k, t, df, season_dfs, var_threshold, stid): | ||||||
'''Plot data and percentile thresholds for air temp (seasonal)''' | ||||||
import matplotlib.pyplot as plt | ||||||
plt.figure(figsize=(20,12)) | ||||||
inst_var = t.split('_')[0] + '_i' | ||||||
if inst_var in df: | ||||||
i_plot = df[inst_var] | ||||||
plt.scatter(df.index,i_plot, color='orange', s=3, label='t_i instantaneuous') | ||||||
if t in ('t_u','t_l'): | ||||||
plt.scatter(df.index,df[t], color='b', s=3, label=f'{t} hourly ave') | ||||||
for x1,x2 in zip([1,2,3,4], season_dfs): | ||||||
y1 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['lo'] - var_threshold[k]['limit'])) | ||||||
y2 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['hi'] + var_threshold[k]['limit'])) | ||||||
y11 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['lo'] )) | ||||||
y22 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['hi'] )) | ||||||
plt.scatter(x2.index, y1, color='r',s=1) | ||||||
plt.scatter(x2.index, y2, color='r', s=1) | ||||||
plt.scatter(x2.index, y11, color='k', s=1) | ||||||
plt.scatter(x2.index, y22, color='k', s=1) | ||||||
plt.title('{} {}'.format(stid, t)) | ||||||
plt.legend(loc="lower left") | ||||||
plt.show() | ||||||
|
||||||
def _plot_percentiles(k, t, df, var_threshold, upper_thresh, lower_thresh, stid): | ||||||
'''Plot data and percentile thresholds''' | ||||||
import matplotlib.pyplot as plt | ||||||
plt.figure(figsize=(20,12)) | ||||||
inst_var = t.split('_')[0] + '_i' | ||||||
if inst_var in df: | ||||||
if k == 'p_u': | ||||||
i_plot = (df[inst_var]+1000.) | ||||||
else: | ||||||
i_plot = df[inst_var] | ||||||
plt.scatter(df.index,i_plot, color='orange', s=3, label='instantaneuous') | ||||||
if t != inst_var: | ||||||
plt.scatter(df.index,df[t], color='b', s=3, label=f' {t} hourly ave') | ||||||
plt.axhline(y=upper_thresh, color='r', linestyle='-') | ||||||
plt.axhline(y=lower_thresh, color='r', linestyle='-') | ||||||
plt.axhline(y=var_threshold[k]['hi'], color='k', linestyle='--') | ||||||
plt.axhline(y=var_threshold[k]['lo'], color='k', linestyle='--') | ||||||
plt.title('{} {}'.format(stid, t)) | ||||||
plt.legend(loc="lower left") | ||||||
plt.show() | ||||||
|
||||||
|
||||||
if __name__ == "__main__": | ||||||
# unittest.main() | ||||||
pass |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is
ds
the entire time series?