Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Percentile qc - test #162

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
2af3d54
initial implementation
patrickjwright Mar 21, 2023
8586a91
minor tweaks to compute_percentiles.py
patrickjwright Mar 22, 2023
9f2a0ec
typo in aws.py
patrickjwright Mar 22, 2023
df91382
stub in prelim location for running percentile QC check
patrickjwright Mar 22, 2023
7985e7d
stub out percentile QC check in L1toL2
patrickjwright Mar 24, 2023
0d9bece
fix season definitions in compute_percentiles.py
patrickjwright Mar 24, 2023
c766e9c
set percentile outliers to nan
patrickjwright Mar 28, 2023
5bdbfbc
clean up and comment L1toL2.py
patrickjwright Mar 29, 2023
fa1c0e1
tweak comments
patrickjwright Mar 29, 2023
560c808
clean up and comment L1toL2.py
patrickjwright Mar 29, 2023
5a17f96
add plotting capability to compute_percentiles.py
patrickjwright Mar 29, 2023
d678550
Changing the Percentiles limits. Adding Difference QC. And checking i…
RasmusBahbah Aug 4, 2023
156161f
Wrong index
RasmusBahbah Aug 8, 2023
2ad99fe
fixing subprocess run compute_percentiles.py
RasmusBahbah Aug 8, 2023
2807977
Trying to fix subprocess path
RasmusBahbah Aug 8, 2023
a66e392
script_check
RasmusBahbah Aug 8, 2023
b9d29d5
Update L1toL2.py
RasmusBahbah Aug 8, 2023
1c5fd1a
Printing Current Path
RasmusBahbah Aug 8, 2023
f3d113e
Update L1toL2.py
RasmusBahbah Aug 8, 2023
c185aaa
Fixing paths to percentiles and compute_percentiles
RasmusBahbah Aug 8, 2023
564705f
Update L1toL2.py
RasmusBahbah Aug 8, 2023
6a55ec7
Fixing paths
RasmusBahbah Aug 8, 2023
068004d
Update L1toL2.py
RasmusBahbah Aug 8, 2023
53e2d5c
updating subprocess
RasmusBahbah Aug 8, 2023
64c0e48
Update L1toL2.py
RasmusBahbah Aug 8, 2023
879d43d
Fixing path to .db file
RasmusBahbah Aug 8, 2023
44f4360
changing path to l3 data
RasmusBahbah Aug 8, 2023
ddf2fd3
crash bug if there are no season data for temp.
RasmusBahbah Aug 15, 2023
1ab0ba0
Add files via upload
RasmusBahbah Aug 15, 2023
4a8d581
Update L1toL2.py
RasmusBahbah Aug 15, 2023
aa38424
Merge branch 'percentile-qc' of https://github.com/GEUS-Glaciology-an…
RasmusBahbah Aug 15, 2023
09b4f44
Bug fix - if station do not have var (p, wspd,rh)
RasmusBahbah Aug 15, 2023
305fd69
windows and Linux Separator bug
RasmusBahbah Aug 15, 2023
9ea80a9
check paths
RasmusBahbah Aug 18, 2023
370572d
update cc
RasmusBahbah Aug 18, 2023
3c3d54f
correcting path to .db file
RasmusBahbah Aug 18, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,6 @@ src/pypromice/postprocess/latest_timestamps.pkl

# Output positions from transmitted GPS
src/pypromice/postprocess/positions.csv

# sqlite db files
*.db
310 changes: 306 additions & 4 deletions src/pypromice/process/L1toL2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@
import urllib.request
from urllib.error import HTTPError, URLError
import pandas as pd
import subprocess
import os
import xarray as xr
import sqlite3

# from IPython import embed

def toL2(L1, T_0=273.15, ews=1013.246, ei0=6.1071, eps_overcast=1.,
eps_clear=9.36508e-6, emissivity=0.97):
Expand Down Expand Up @@ -45,15 +49,26 @@ def toL2(L1, T_0=273.15, ews=1013.246, ei0=6.1071, eps_overcast=1.,
except Exception as e:
print('Flagging and fixing failed:')
print(e)

#stid = ds.station_id



ds = differenceQC(ds) # Flag and Remove difference outliers
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is ds the entire time series?

ds = percentileQC(ds) # Flag and remove percentile outliers

T_100 = _getTempK(T_0)
ds['rh_u_cor'] = correctHumidity(ds['rh_u'], ds['t_u'],
T_0, T_100, ews, ei0)

# Determiune cloud cover
cc = calcCloudCoverage(ds['t_u'], T_0, eps_overcast, eps_clear, # Calculate cloud coverage
ds['dlr'], ds.attrs['station_id'])
ds['cc'] = (('time'), cc.data)
# Determiune cloud cover for on-ice stations
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondering why this change for cloud cover is here? This has nothing to do with percentiles I assume?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is something Baptiste implemented in the meantime, so no, nothing to do with the qc.

if not ds.attrs['bedrock']:
cc = calcCloudCoverage(ds['t_u'], T_0, eps_overcast, eps_clear, # Calculate cloud coverage
ds['dlr'], ds.attrs['station_id'])
ds['cc'] = (('time'), cc.data)
else:
# Default cloud cover for bedrock station for which tilt should be 0 anyway.
cc = 0.8

# Determine surface temperature
ds['t_surf'] = calcSurfaceTemperature(T_0, ds['ulr'], ds['dlr'], # Calculate surface temperature
Expand Down Expand Up @@ -141,6 +156,248 @@ def toL2(L1, T_0=273.15, ews=1013.246, ei0=6.1071, eps_overcast=1.,
T_0, T_100, ews, ei0)
return ds

def differenceQC(ds):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would make sense to add such functions to a separate module like pypromice.qc

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it might make sense to take this out of L1toL2.py and have all qc-related functions in the qc directory? Not sure if there are any implications of moving this other than organizational, but worth considering.

Also, it would be great to add a short description at the top of the docstrings for both differenceQC and percentileQC that gives a basic description of what the function does.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is definitely something we will do in the future, probably in a future update. A better described docsrting is also a good idea

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is differenceQC? Just glancing at the code, this appears to be a persistence check? That is, checking if a value has gotten stuck in a persisting state for a certain period of time. If so, I would rename to persistenceQC.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, it is a persistence/static check. We will rename it to staticQC.

'''

Parameters
----------
ds : xr.Dataset
Level 1 datset

Returns
-------
ds_out : xr.Dataset
Level 1 dataset with difference outliers set to NaN
'''

# the differenceQC is not done on the Windspeed
# Optionally examine flagged data by setting make_plots to True
# This is best done by running aws.py directly and setting 'test_station'
# Plots will be shown before and after flag removal for each var

stid = ds.station_id
df = ds.to_dataframe() # Switch to pandas

# Define threshold dict to hold limit values, and the difference values.
# Limit values indicate how much a variable has to change to the previous value
# diff_period is how many hours a value can stay the same without being set to NaN
# * are used to calculate and define all limits, which are then applied to *_u, *_l and *_i

var_threshold = {
't': {'static_limit': 0.001, 'diff_period' : 1},
'p': {'static_limit': 0.0001, 'diff_period' : 24},
'rh': {'static_limit': 0.0001, 'diff_period' : 24}
}


for k in var_threshold.keys():

var_all = [k + '_u',k + '_l',k + '_i'] # apply to upper, lower boom, and instant
static_lim = var_threshold[k]['static_limit'] # loading static limit
diff_h = var_threshold[k]['diff_period'] # loading diff period

for v in var_all:
if v in df:

data = df[v]
diff = data.diff()
diff.fillna(method='ffill', inplace=True) # forward filling all NaNs!
diff = np.array(diff)

diff_period = np.ones_like(diff) * False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
diff_period = np.ones_like(diff) * False
diff_period = np.zeros_like(diff, dtype='bool')


for i,d in enumerate(diff): # algorithm that ensures values can stay the same within the diff_period
if i > (diff_h-1):
if sum(abs(diff[i-diff_h:i])) < static_lim:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using mean instead of sum to make the threshold invariant of window size

diff_period[i-diff_h:i] = True

diff_period = np.array(diff_period).astype('bool')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is unnecessary to assign to boolean data type if the array is instantiated as suggested above

Suggested change
diff_period = np.array(diff_period).astype('bool')


outliers = df[v][diff_period] # finding outliers in dataframe

df.loc[outliers.index,v] = np.nan # setting outliers to NaN

# Back to xarray, and re-assign the original attrs
ds_out = df.to_xarray()
ds_out = ds_out.assign_attrs(ds.attrs) # Dataset attrs
for x in ds_out.data_vars: # variable-specific attrs
ds_out[x].attrs = ds[x].attrs
# equivalent to above:
# vals = [xr.DataArray(data=df_out[c], dims=['time'], coords={'time':df_out.index}, attrs=ds[c].attrs) for c in df_out.columns]
# ds_out = xr.Dataset(dict(zip(df_out.columns, vals)), attrs=ds.attrs)
return ds_out


def percentileQC(ds):
'''
Parameters
----------
ds : xr.Dataset
Level 1 dataset

Returns
-------
ds_out : xr.Dataset
Level 1 dataset with percentile outliers set to NaN
'''
# Check if on-disk sqlite db exists
# If it not exists, then run compute_percentiles.py

base_path = os.getcwd()

file_path1 = base_path + '/main/src/pypromice/qc/percentiles.db'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Environment specific paths/variables shall not be assigned statically in the function body. Use input parameters or a class instead.

file_path2 = base_path + '/qc/percentiles.db'

script_path1 = base_path + '/main/src/pypromice/qc/compute_percentiles.py'
script_path2 = base_path + '/qc/compute_percentiles.py'

script_exist1 = os.path.isfile(script_path1)

db_exist1 = os.path.isfile(file_path1)
db_exist2 = os.path.isfile(file_path2)


print(f'Does percentiles.db exist {db_exist2} in this path {file_path2}')


if not db_exist1 and not db_exist2:
if script_exist1:
print(f'percentiles.db does not exist running {script_path1}')
subprocess.call(['python',script_path1])
file_path = file_path1
else:
print(f'percentiles.db does not exist running {script_path2}')
subprocess.call(['python',script_path2])
file_path = file_path2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of the above should be implemented in a separate module and maybe a python class wrapping the database

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why use a database instead of just a datafile like NetCDF?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ladsmund I wrote this original code that uses sqlite. The general idea is to provide processing effficiency. If the percentile distributions (for each variable) are stored in a sql database, we are able to extract the percentiles of interest very fast with a sql query, rather than opening/reading a static file for each station. sqlite is the simplest database solution, where we can have a small file on disk.

However, this is of more significant benefit for much larger datasets (e.g. 10s or 100s of thousands of stations). A solution using a static file could also be implemented if desired. Either way, you are persisting something on disk between runs that holds the percentile distributions.


# Optionally examine flagged data by setting make_plots to True
# This is best done by running aws.py directly and setting 'test_station'
# Plots will be shown before and after flag removal for each var
make_plots = False

stid = ds.station_id
stid_type = type(stid)

print(f'station id {stid}')
print(f'station id type {stid_type}')
df = ds.to_dataframe() # Switch to pandas

# Define threshold dict to hold limit values, and 'hi' and 'lo' percentile.
# Limit values indicate how far we will go beyond the hi and lo percentiles to flag outliers.
# *_u are used to calculate and define all limits, which are then applied to *_u, *_l and *_i
var_threshold = {
't_u': {'limit': 9}, # 'hi' and 'lo' will be held in 'seasons' dict
'p_u': {'limit': 12},
'rh_u': {'limit': 12},
'wspd_u': {'limit': 12},
}

# Query from the on-disk sqlite db for specified percentiles
if db_exist1:
file_path = file_path1
else:
file_path = file_path2

con = sqlite3.connect(file_path)
cur = con.cursor()
for k in var_threshold.keys():
if k == 't_u':
# Different pattern for t_u, which considers seasons
# 1: winter (DecJanFeb), 2: spring (MarAprMay), 3: summer (JunJulAug), 4: fall (SepOctNov)
seasons = {1: {}, 2: {}, 3: {}, 4: {}}
sql = f"SELECT p0p5,p99p5,season FROM {k} WHERE season in (1,2,3,4) and stid = ?"
cur.execute(sql, [stid])
result = cur.fetchall()
if result:
for row in result:
# row[0] is p0p5, row[1] is p99p5, row[2] is the season integer
seasons[row[2]]['lo'] = row[0] # 0.005
seasons[row[2]]['hi'] = row[1] # 0.995
var_threshold[k]['seasons'] = seasons
else:
print(f'{stid} has no {k} data')
else:
sql = f"SELECT p0p5,p99p5 FROM {k} WHERE stid = ?"
cur.execute(sql, [stid])
result = cur.fetchone() # we only expect one row back per station
if result:
var_threshold[k]['lo'] = result[0] # 0.005
var_threshold[k]['hi'] = result[1] # 0.995
else:
print(f'{stid} has no {k} data')

con.close() # close the database connection (and cursor)

# Set flagged data to NaN
for k in var_threshold.keys():
if k == 't_u':
# use t_u thresholds to flag t_u, t_l, t_i
base_var = k.split('_')[0]
vars_all = [k, base_var+'_l', base_var+'_i']
for t in vars_all:
if t in df:
winter = df[t][df.index.month.isin([12,1,2])]
spring = df[t][df.index.month.isin([3,4,5])]
summer = df[t][df.index.month.isin([6,7,8])]
fall = df[t][df.index.month.isin([9,10,11])]
season_dfs = [winter,spring,summer,fall]

if make_plots:
_plot_percentiles_t(k,t,df,season_dfs,var_threshold,stid) # BEFORE OUTLIER REMOVAL
for x1,x2 in zip([1,2,3,4], season_dfs):
try:
print(f'percentile flagging {t} {x1}')
lower_thresh = var_threshold[k]['seasons'][x1]['lo'] - var_threshold[k]['limit']
upper_thresh = var_threshold[k]['seasons'][x1]['hi'] + var_threshold[k]['limit']
outliers_upper = x2[x2.values > upper_thresh]
outliers_lower = x2[x2.values < lower_thresh]
outliers = pd.concat([outliers_upper,outliers_lower])
df.loc[outliers.index,t] = np.nan
df.loc[outliers.index,t] = np.nan
except Exception as e:
print(f'{t} Season {x1} is not computed due to lack of data')
print(e)
if make_plots:
_plot_percentiles_t(k,t,df,season_dfs,var_threshold,stid) # AFTER OUTLIER REMOVAL
else:
# use *_u thresholds to flag *_u, *_l, *_i
base_var = k.split('_')[0]
vars_all = [k, base_var+'_l', base_var+'_i']
for t in vars_all:
if t in df:
try:
print(f'percentile flagging {t}')
upper_thresh = var_threshold[k]['hi'] + var_threshold[k]['limit']
lower_thresh = var_threshold[k]['lo'] - var_threshold[k]['limit']
if make_plots:
_plot_percentiles(k,t,df,var_threshold,upper_thresh,lower_thresh,stid) # BEFORE OUTLIER REMOVAL
if t == 'p_i':
# shift p_i so we can use the p_u thresholds
shift_p = df[t]+1000.
outliers_upper = shift_p[shift_p.values > upper_thresh]
outliers_lower = shift_p[shift_p.values < lower_thresh]
else:
outliers_upper = df[t][df[t].values > upper_thresh]
outliers_lower = df[t][df[t].values < lower_thresh]
outliers = pd.concat([outliers_upper,outliers_lower])
df.loc[outliers.index,t] = np.nan
df.loc[outliers.index,t] = np.nan
except Exception as e:
print(f'{t} is not flagged due to lack of data')
print(e)
if make_plots:
_plot_percentiles(k,t,df,var_threshold,upper_thresh,lower_thresh,stid) # AFTER OUTLIER REMOVAL

# Back to xarray, and re-assign the original attrs
ds_out = df.to_xarray()
ds_out = ds_out.assign_attrs(ds.attrs) # Dataset attrs
for x in ds_out.data_vars: # variable-specific attrs
ds_out[x].attrs = ds[x].attrs
# equivalent to above:
# vals = [xr.DataArray(data=df_out[c], dims=['time'], coords={'time':df_out.index}, attrs=ds[c].attrs) for c in df_out.columns]
# ds_out = xr.Dataset(dict(zip(df_out.columns, vals)), attrs=ds.attrs)
return ds_out

def flagNAN(ds_in,
flag_url='https://raw.githubusercontent.com/GEUS-Glaciology-and-Climate/PROMICE-AWS-data-issues/master/flags/',
flag_dir='local/flags/'):
Expand Down Expand Up @@ -1012,6 +1269,51 @@ def _getRotation(): #
rad2deg = 1 / deg2rad
return deg2rad, rad2deg

def _plot_percentiles_t(k, t, df, season_dfs, var_threshold, stid):
'''Plot data and percentile thresholds for air temp (seasonal)'''
import matplotlib.pyplot as plt
plt.figure(figsize=(20,12))
inst_var = t.split('_')[0] + '_i'
if inst_var in df:
i_plot = df[inst_var]
plt.scatter(df.index,i_plot, color='orange', s=3, label='t_i instantaneuous')
if t in ('t_u','t_l'):
plt.scatter(df.index,df[t], color='b', s=3, label=f'{t} hourly ave')
for x1,x2 in zip([1,2,3,4], season_dfs):
y1 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['lo'] - var_threshold[k]['limit']))
y2 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['hi'] + var_threshold[k]['limit']))
y11 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['lo'] ))
y22 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['hi'] ))
plt.scatter(x2.index, y1, color='r',s=1)
plt.scatter(x2.index, y2, color='r', s=1)
plt.scatter(x2.index, y11, color='k', s=1)
plt.scatter(x2.index, y22, color='k', s=1)
plt.title('{} {}'.format(stid, t))
plt.legend(loc="lower left")
plt.show()

def _plot_percentiles(k, t, df, var_threshold, upper_thresh, lower_thresh, stid):
'''Plot data and percentile thresholds'''
import matplotlib.pyplot as plt
plt.figure(figsize=(20,12))
inst_var = t.split('_')[0] + '_i'
if inst_var in df:
if k == 'p_u':
i_plot = (df[inst_var]+1000.)
else:
i_plot = df[inst_var]
plt.scatter(df.index,i_plot, color='orange', s=3, label='instantaneuous')
if t != inst_var:
plt.scatter(df.index,df[t], color='b', s=3, label=f' {t} hourly ave')
plt.axhline(y=upper_thresh, color='r', linestyle='-')
plt.axhline(y=lower_thresh, color='r', linestyle='-')
plt.axhline(y=var_threshold[k]['hi'], color='k', linestyle='--')
plt.axhline(y=var_threshold[k]['lo'], color='k', linestyle='--')
plt.title('{} {}'.format(stid, t))
plt.legend(loc="lower left")
plt.show()


if __name__ == "__main__":
# unittest.main()
pass
2 changes: 1 addition & 1 deletion src/pypromice/process/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,7 @@ def mergeVars(ds_list, variables, cols=['lo','hi','OOL']): #
----------
ds_list : list
List of xarray.Dataset objects
varaibles : pandas.DataFrame
variables : pandas.DataFrame
Variable look up table
cols : str, optional
Variable column names to merge by. The default is ['lo','hi','OOL'].
Expand Down
Loading