Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding filename parsing for ARM filenames. #708

Merged
merged 2 commits into from
Sep 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions act/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,3 +660,104 @@ def test_adjust_timestamp():

ds = act.utils.datetime_utils.adjust_timestamp(ds, offset=-60 * 60)
assert ds['time'].values[0] == np.datetime64('2019-11-24T22:30:00.000000000')


def test_DatastreamParser():
    """Check DatastreamParserARM on full paths, filenames and partial names."""
    from act.utils.data_utils import DatastreamParserARM as DatastreamParser

    # Properties inspected for every case, in the order the expected tuples use.
    attrs = ('site', 'datastream_class', 'facility', 'level',
             'datastream', 'date', 'time', 'ext')
    nothing = (None,) * len(attrs)

    def check(parser, expected):
        # None is verified by identity and everything else by equality.
        for attr, want in zip(attrs, expected):
            value = getattr(parser, attr)
            if want is None:
                assert value is None
            else:
                assert value == want

    # Non-string input is rejected by the constructor and by set_datastream.
    with pytest.raises(ValueError):
        DatastreamParser(123)

    parser = DatastreamParser()
    with pytest.raises(ValueError):
        parser.set_datastream(None)

    # A parser built without arguments exposes None everywhere.
    check(DatastreamParser(), nothing)

    # Full path: only the final path component is parsed.
    parser = DatastreamParser('/data/sgp/sgpmetE13.b1/sgpmetE13.b1.20190501.024254.nc')
    check(parser, ('sgp', 'met', 'E13', 'b1', 'sgpmetE13.b1',
                   '20190501', '024254', 'nc'))

    # Re-using the same object with a new filename replaces every component.
    parser.set_datastream('nsatwrC1.a0.19991230.233451.cdf')
    check(parser, ('nsa', 'twr', 'C1', 'a0', 'nsatwrC1.a0',
                   '19991230', '233451', 'cdf'))

    # Progressively shorter inputs; the last one is too short to hold a site.
    cases = (
        ('nsaitscomplicatedX1.00.991230.2334.txt',
         ('nsa', 'itscomplicated', 'X1', '00', 'nsaitscomplicatedX1.00',
          '991230', '2334', 'txt')),
        ('sgpmetE13.b1',
         ('sgp', 'met', 'E13', 'b1', 'sgpmetE13.b1', None, None, None)),
        ('sgpmetE13',
         ('sgp', 'met', 'E13', None, None, None, None, None)),
        ('sgpmet',
         ('sgp', 'met', None, None, None, None, None, None)),
        ('sgp',
         ('sgp', None, None, None, None, None, None, None)),
        ('sg', nothing),
    )
    for name, expected in cases:
        check(DatastreamParser(name), expected)
229 changes: 229 additions & 0 deletions act/utils/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import pint
import scipy.stats as stats
import xarray as xr
from pathlib import Path
import re

spec = importlib.util.find_spec('pyart')
if spec is not None:
Expand Down Expand Up @@ -103,6 +105,233 @@ def change_units(
return self._ds


# @xr.register_dataset_accessor('utils')
class DatastreamParserARM(object):
    '''
    Class to parse an ARM datastream name or filename into its components.
    Every attribute is None when it could not be extracted from the name.

    The name is split on '.' and the pieces are interpreted by position:
    ``<site><class><facility>.<level>.<date>.<time>.<ext>``.

    Attributes
    ----------
    site : str or None
        The site code extracted from the filename.
    datastream_class : str or None
        The datastream class extracted from the filename.
    facility : str or None
        The datastream facility code extracted from the filename.
    level : str or None
        The datastream level code extracted from the filename.
    datastream : str or None
        The datastream extracted from the filename.
    date : str or None
        The date extracted from the filename.
    time : str or None
        The time extracted from the filename.
    ext : str or None
        The file extension extracted from the filename.

    Example
    -------
    >>> from act.utils.data_utils import DatastreamParserARM
    >>> file = 'sgpmetE13.b1.20190501.024254.nc'
    >>> fn_obj = DatastreamParserARM(file)
    >>> fn_obj.site
    'sgp'
    >>> fn_obj.datastream_class
    'met'

    '''

    # Data level: one of a, b, c, s or 0 followed by a single digit.
    _LEVEL_RE = re.compile(r'[abcs0][0123456789]')
    # Three lower-case site letters, class, then facility (capital + 1-2 digits).
    _FULL_NAME_RE = re.compile(r'(^[a-z]{3})(\w+)([A-Z]{1}\d{1,2})$')
    # Three lower-case site letters followed by a class, no facility.
    _SITE_CLASS_RE = re.compile(r'(^[a-z]{3})(\w+)$')

    def __init__(self, ds=''):
        '''
        Constructor that stores the final path component of the name and
        parses it. Character case is left as given. If the name cannot be
        parsed, every attribute is set to None.

        Parameters
        ----------
        ds : str
            The datastream or filename to parse. A leading directory path
            is ignored.

        Raises
        ------
        ValueError
            If ds is not a string.

        '''

        if isinstance(ds, str):
            self.__datastream = Path(ds).name
        else:
            raise ValueError('Datastream or filename name must be a string')

        try:
            self.__parse_datastream()
        except ValueError:
            # Unparseable name: expose None everywhere instead of failing.
            self.__site = None
            self.__class = None
            self.__facility = None
            self.__datastream = None
            self.__level = None
            self.__date = None
            self.__time = None
            self.__ext = None

    def __repr__(self):
        return (
            f'{type(self).__name__}(site={self.site!r}, '
            f'datastream_class={self.datastream_class!r}, '
            f'facility={self.facility!r}, level={self.level!r}, '
            f'date={self.date!r}, time={self.time!r}, ext={self.ext!r})'
        )

    def __parse_datastream(self):
        '''
        Private method to parse the stored name into its components (site,
        class, facility, level, date, time and extension). Called by the
        constructor, and therefore also by set_datastream.

        Raises
        ------
        ValueError
            If no site can be recognized in the first '.'-separated field.

        '''
        tempstring = self.__datastream.split('.')

        # Start from a clean slate so values from a previous name never leak.
        self.__ext = None
        self.__time = None
        self.__date = None
        self.__level = None
        self.__site = None
        self.__class = None
        self.__facility = None

        # Trailing fields are purely positional.
        if len(tempstring) >= 5:
            self.__ext = tempstring[4]

        if len(tempstring) >= 4:
            self.__time = tempstring[3]

        if len(tempstring) >= 3:
            self.__date = tempstring[2]

        if len(tempstring) >= 2:
            # The whole field must be a level. A prefix match would accept
            # 'b1x' as 'b1' or treat a date like '001230' as level '00'.
            m = self._LEVEL_RE.fullmatch(tempstring[1])
            if m is not None:
                self.__level = m.group()

        name = tempstring[0]

        # Most specific form first: site + class + facility.
        m = self._FULL_NAME_RE.search(name)
        if m is not None:
            self.__site = m.group(1)
            self.__class = m.group(2)
            self.__facility = m.group(3)
            return

        # Site + class without a facility.
        m = self._SITE_CLASS_RE.search(name)
        if m is not None:
            self.__site = m.group(1)
            self.__class = m.group(2)
            return

        # Any bare three-character field is taken as the site.
        if len(name) == 3:
            self.__site = name
            return

        raise ValueError(self.__datastream)

    def set_datastream(self, ds):
        '''
        Method used to set or reset object by passing a new datastream name
        or filename.

        Parameters
        ----------
        ds : str
            The datastream or filename to parse.

        Raises
        ------
        ValueError
            If ds is not a string.

        '''

        self.__init__(ds)

    @property
    def datastream(self):
        '''
        Property returning the datastream name built from site, class,
        facility and level. Returns None if any of those is unavailable.

        '''

        try:
            return ''.join((self.__site, self.__class, self.__facility, '.',
                            self.__level))
        except TypeError:
            # At least one component is None.
            return None

    @property
    def site(self):
        '''
        Property returning the site code, or None if unavailable.

        '''

        return self.__site

    @property
    def datastream_class(self):
        '''
        Property returning the datastream class, or None if unavailable.
        Could not use class as attribute name since it is a reserved word
        in Python.

        '''

        return self.__class

    @property
    def facility(self):
        '''
        Property returning the facility code in upper case, or None if
        unavailable.

        '''

        try:
            return self.__facility.upper()
        except AttributeError:
            # Facility is None.
            return self.__facility

    @property
    def level(self):
        '''
        Property returning the data level, or None if unavailable.
        '''

        return self.__level

    @property
    def datastream_standard(self):
        '''
        Property returning datastream name in ARM-standard format with
        facility in caps. Returns None if site, class, facility or level
        is unavailable.
        '''

        try:
            return ''.join((self.site, self.datastream_class, self.facility,
                            '.', self.level))

        except TypeError:
            # At least one component is None.
            return None

    @property
    def date(self):
        '''
        Property returning date from filename, or None if unavailable.
        '''

        return self.__date

    @property
    def time(self):
        '''
        Property returning time from filename, or None if unavailable.
        '''

        return self.__time

    @property
    def ext(self):
        '''
        Property returning file extension from filename, or None if
        unavailable.
        '''

        return self.__ext


def assign_coordinates(ds, coord_list):
"""
This procedure will create a new ACT dataset whose coordinates are
Expand Down
42 changes: 42 additions & 0 deletions examples/utils/plot_parse_filename.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
Parse the ARM datastream filename
---------------------------------

This is an example of how to parse
the datastream filename into its constituent parts.

"""

from act.utils.data_utils import DatastreamParserARM


def show_parts(name, lead=''):
    """Parse *name* and print each component; *lead* is put before the first line."""
    fn_obj = DatastreamParserARM(name)
    print(f"{lead}Site is {fn_obj.site}")
    print(f"Datastream Class is {fn_obj.datastream_class}")
    print(f"Facility is {fn_obj.facility}")
    print(f"Level is {fn_obj.level}")
    print(f"Datastream is {fn_obj.datastream}")
    print(f"Date is {fn_obj.date}")
    print(f"Time is {fn_obj.time}")
    print(f"File extension is {fn_obj.ext}")


# Here we have a full path filename. Rather than reading the file and pulling
# the metadata from its global attributes, we can hand the name to the
# DatastreamParserARM class and read the string values from its properties.
show_parts('/data/sgp/sgpmetE13.b1/sgpmetE13.b1.20190501.024254.nc')

# The parser also accepts just the datastream part of the name. The date, time
# and extension properties then have no value and return None.
show_parts('sgpmetE13.b1', lead='\n')
Loading