-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxarray_clim.py
405 lines (289 loc) · 12.3 KB
/
xarray_clim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 10 11:02:39 2018
This module contains wrapper functions for xarray. The functions are
designed to make typical in analyzing gridded climate and weather data
as easy as possible. The functions are desinged to closely resemble
the widely used command line tool CDO (Climate Data Operators)
All functions are desinged such that they work equally well with lazily
loaded dask-arrays in the background (see http://xarray.pydata.org/en/stable/dask.html).
This means that when a lazily loaded array is used as input, the output is also
lazy and wont be evaluated yet. This allows to use input datasets that are larger
then the available memory.
All functions exclusively use xarrays named indexing, thus they do not
rely on the order of the dimensions (e.g. (time,lat,lon) or (time,lon,lat)).
It is also ok if there are more dimensions present, they will simply be left
unlatered and carried through.
The testing has however only been done for the most common ordering (time,lev,lat,lon)
Most of the functions work both on xarray.Dataset and xarray.DataArray
Pros combared to cdo:
cdo is limited to 4-dimensions, this code works for N dimensions
directly in python
Does NOT yet work with rotated grids (lat and lon are expected to be 1d vectors)
requirements:
xarray >= 0.10.8 (lower does not work)
pandas (developed with 0.20.3, might also work with lower)
numpy (developed with 1.12.1, might also work with lower)
dask
wirtten in python3
@author: Sebastian Scher (seb1000 on github)
Feedback is highly appreciated and can be sent to [email protected]
This work is distributed under the GNU GPLv3 license.
"""
import calendar
import warnings
import xarray as xr
import dask
import numpy as np
import pandas as pd
import dask.array as da
def standardize_dataset(ds):
''' change dimension names to standard names (lat,lon,lev,time),
ds: DataSet or DataArray
return: dataset or datarray with standardized names
'''
pairs = {'latitude':'lat',
'longitude':'lon',
'level':'lev'}
for key in pairs.keys():
if key in ds.dims.keys():
ds = ds.rename({key:pairs[key]})
# extract variable name
var_keys = ds.data_vars.keys()
assert ( len(var_keys) ==1)
for e in var_keys:
name = e
ds.attrs['varname'] = name
return ds
def _lat_is_increasing(data):
if data.lat[1] > data.lat[0]:
return True
else:
return False
def sellonlatpoint(data,lon,lat, method='nearest'):
'''select a single spatial point, either by taking the closest
available point (method='nearest'), or by
interpolating ('linear' or 'cubic')
'''
supported_methods = ('nearest','linear','cubic')
if method not in supported_methods:
raise ValueError('"method" must be one of ' + str(supported_methods))
# check wehther the requested point is in the domain, if not raise an error
lonrange = (float(data.lon.min()), float(data.lon.max()))
if lon < lonrange[0] or lon > lonrange[1]:
raise ValueError('''the requested lon {} is outside the range of the
dataset {}'''.format(lon, lonrange))
latrange = (float(data.lat.min()), float(data.lat.max()))
if lat < latrange[0] or lat > latrange[1]:
raise ValueError('''the requested lat {} is outside the range of the
dataset {}'''.format(lat, latrange))
if method == 'nearest':
sub = data.sel(lat=lat,lon=lon,method=method)
elif method in ('linear', 'cubic'):
sub = data.interp(lat=lat,lon=lon,method=method)
return sub
def sellonlatbox(data,west,north,east,south, allow_single_point=False):
'''
select a rectangular box.
if west==east and north==south, then a single point will be returned
if allow_single_point=True, otherwise an error will be raised
'''
if north == south and west==east:
if allow_single_point:
# in this case we return a single point
return sellonlatpoint(data,west,north)
else:
raise ValueError('''the requsted area is a single point
(north==south and east==west)''')
if north < south:
raise ValueError('north ({}) is smaller than south ({})'.
format(north,south))
if east < west: #north < south:
raise ValueError('west ({}) is larger than east ({})'.
format(west,east))
# check wehther the requested point is in the domain
lonrange = (float(data.lon.min()), float(data.lon.max()))
if ( west < lonrange[0] or west > lonrange[1] or
east < lonrange[0] or east > lonrange[1] ):
raise ValueError('''the requested lon extend {} to {} is outside
the range of the
dataset {}'''.format(west,east,lonrange))
latrange = (float(data.lat.min()), float(data.lat.max()))
if ( north < latrange[0] or north > latrange[1] or
south < latrange[0] or south > latrange[1] ):
raise ValueError('''the requested lat extend {} to {} is outside
the range of the
dataset {}'''.format(north,south,latrange))
# depending on whether lat is increasing or decreasing (which is
# different for different datasets) the indexing has to be done in
# a different way
if _lat_is_increasing(data):
indexers = {'lon':slice(west,east), 'lat':slice(south,north)}
else:
indexers = {'lon':slice(west,east), 'lat':slice(north,south)}
sub = data.sel(**indexers)
return sub
def weighted_areamean(ds):
'''area mean weighted by cos of latitude'''
weights = xr.ufuncs.cos(np.deg2rad(ds.lat))
# this is 1d array. Xarray broadcasts it automatically.
# however, we compute the normalization term explicitely.
# the normalization term is the sum of the weights over all gridpoints.
# because the weight is not lon dependent, is is simply the sum along one
# meridian times the number of lon points
norm = np.sum(weights) * len(ds.lon)
# this means we have only lat and lon
amean = (ds*weights).sum(('lat','lon')) / norm
return amean
def wrap360_to180(ds):
"""
wrap longitude coordinates from 0..360 to -180..180
Known Issues: with lazily loaded arrays this some leads to problems
(the dimensions in the onderlying numpy array then suddenly have a different
order than the dask array)
"""
if type(ds) is xr.DataArray:
orig_is_array = True
# create to dataset with a dummy variable name
ds = ds.to_dataset(name='__dummy__')
else:
orig_is_array = False
# check whether the data is lazily loaded
for _, var in ds.variables.items():
if type(var._data) is dask.array.core.Array:
warnings.warn('''the dataset (or part of it) is lazily loaded, this
sometimes casues problems in wrap369_to180''')
# wrap 0..359 to -179...180
new = np.array([e if e < 180 else e-360 for e in ds['lon'].values])
ds = ds.assign(**{ 'lon' : new})
ds = ds.reindex({'lon': np.sort(new)})
if orig_is_array:
# convert back to
ds = ds['__dummy__']
return ds
def corr_over_time(x,y):
'''
point-by point correlation of two spatial arrays x and y
(the correlation of x[all_times,lati,loni] and y[all_times,lati,loni])
Can be used to make correlation maps of two quantities
'''
if x.shape != y.shape:
raise ValueError('x and y must have exactly the same dimension!')
# the numpy corrcoref function works only for one point at a time,
# and looping over the field is very slow. therefore we do the computation
# on a lower level (only wieh mean and std functions)
mx = x.mean('time')
my = y.mean('time')
xm, ym = x-mx, y-my
r_num = (xm*ym).mean('time')
r_den = xm.std('time') * ym.std('time')
r = r_num / r_den
return r
def seldate(data,date):
'''select a singel timestep. date can be string or anything that pandas recognizes
(e.g. Timestamps)'''
return data.sel(time=date)
def seldaterange(data,start,stop):
'''select a range from start to (including) stop.
dates can be string or anything that pandas recognizes
(e.g. Timestamps)'''
return data.sel(time=slice(start,stop))
def selmonth(data,month):
'''
select all dates with the given month number (0..12) or name
('jan','feb',...)
'''
if type(month) is int:
if month not in list(range(1,12+1)):
raise ValueError('month musb be in 1..12, given:{}'.format(month))
elif type(month) is str:
# convert month name to number
try:
month = list(calendar.month_abbr).index('feb')
except ValueError():
raise ValueError('''{} could not be converted to a
month number'''.format(month))
return data.isel(time=data.time.dt.month==month)
def selseas(data,seas):
''' select season by abbreviation (djf, mam, jja, son)'''
# convert to uppercase (xarray needs uppercase)
seas = seas.upper()
return data.isel(time=data.time.dt.season==seas)
def resample_time(data,freq,func='mean', **kwargs):
'''
resample a dataarray or dataset to a new timefrequency
data: input dataset/dataarray
freq: new timem frequency
func: how to aggregate (e.g. 'mean','max')
**kwargs: arguments passed on to xr.DataArray.resample
'''
if freq in('season','seas'):
# due to a restriction in pandas, 'seas' is not available in resample
# however, 'QS-DEC' gets us the meteorological seasons
freq = 'QS-DEC'
else:
# convert from typical names like 'monht' to pandas syntax
convert_dict = {'month':'m',
'day':'d',
'year':'yr'}
if freq in convert_dict.keys():
freq = convert_dict[freq]
# we need to do tow steps: resample, and then apply the aggregation function
# defined by 'func'
resampled = data.resample(time=freq, **kwargs)
try:
# apply function for resamplein (mean, max ,...etc)
resampled_collapsed = getattr(resampled,func)(dim='time')
except AttributeError:
raise ValueError('func {} not available'.format(func))
assert(len(resampled_collapsed.shape)==len(data.shape))
return resampled_collapsed
def flexible_clim(data,freq,func='mean'):
resampled = resample_time(data,freq,func)
return resampled.groupby('time.'+str(freq)).mean(dim='time')
def ymon_func(data,func='mean'):
return flexible_clim(data,'month',func)
def yseas_func(data,func='mean'):
return flexible_clim(data,'seas',func)
#%% now some convenience wrappers for the yseas functions
def ymonmean(data):
return ymon_func(data,'mean')
def ymonmax(data):
return ymon_func(data,'max')
def ymonmin(data):
return ymon_func(data,'min')
def yseasmean(data):
'''seasonal climatology'''
return yseas_func(data,'mean')
def yseasmax(data):
'''seasonal climatology'''
return yseas_func(data,'max')
def yseasmin(data):
'''seasonal climatology'''
return yseas_func(data,'min')
def sellev(data,lev):
'''select level by value'''
return data.sel(lev=lev)
def selilev(data,lev):
'''select level by index'''
return data.isel(lev=lev)
def area_func(data,func='mean', **kwargs):
'''
**kwargs: passed on to func (e.g. q for quantile)
'''
try:
# apply function 'func' which should be a funtion of xr.DataArray
return getattr(data,func)(dim=('lat','lon'), **kwargs)
except AttributeError:
raise ValueError('func {} not available'.format(func))
def areamean(data):
return area_func(data,'mean')
def areamax(data):
return area_func(data,'max')
def areamin(data):
return area_func(data,'min')
def areaquantile(data,q):
return area_func(data,'quantile', q=q)
def areapercentile(data,p):
return area_func(data,'quantile', q=p/100.)