Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring the way data is returned in panedr #33

Merged
merged 14 commits into from
Jun 29, 2022
31 changes: 27 additions & 4 deletions panedr/panedr.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
import sys
import itertools
import time
import pandas
import numpy as np


#Index for the IDs of additional blocks in the energy file.
#Blocks can be added without sacrificing backward and forward
Expand Down Expand Up @@ -75,7 +76,7 @@
Enxnm = collections.namedtuple('Enxnm', 'name unit')
ENX_VERSION = 5

__all__ = ['edr_to_df']
__all__ = ['edr_to_df', 'edr_to_dict', 'read_edr']


class EDRFile(object):
Expand Down Expand Up @@ -395,14 +396,14 @@ def edr_strings(data, file_version, n):

def is_frame_magic(data):
"""Unpacks an int and checks whether it matches the EDR frame magic number

Does not roll the reading position back.
"""
magic = data.unpack_int()
return magic == -7777777


def edr_to_df(path, verbose=False):
def read_edr(path, verbose=False):
begin = time.time()
edr_file = EDRFile(str(path))
all_energies = []
Expand All @@ -427,5 +428,27 @@ def edr_to_df(path, verbose=False):
end='', file=sys.stderr)
print('\n{} frame read in {:.2f} seconds'.format(ifr, end - begin),
file=sys.stderr)

return all_energies, all_names, times


def edr_to_df(path: str, verbose: bool = False):
try:
import pandas
except ImportError:
raise ImportError("""ERROR --- pandas was not found!
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be a test for that, which means a pipeline that does not install pandas and a test that runs the function and asserts the exception is raised.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have added this test now

pandas is required to use the `.edr_to_df()`
functionality. Try installing it using pip, e.g.:
python -m pip install pandas""")
all_energies, all_names, times = read_edr(path, verbose=verbose)
df = pandas.DataFrame(all_energies, columns=all_names, index=times)
return df


def edr_to_dict(path: str, verbose: bool = False):
all_energies, all_names, times = read_edr(path, verbose=verbose)
energy_dict = {}
for idx, name in enumerate(all_names):
energy_dict[name] = np.array(
[all_energies[frame][idx] for frame in range(len(times))])
return energy_dict
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make sure that the "Time" key is in. I expect it to be, but I do not remember exactly how I treated it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Time" is part of all_names and all_energies as returned by read_edr. This is done specifically by

all_names = [u'Time'] + [nm.name for nm in edr_file.nms]
[...]
all_energies.append([frame.t] + [ener.e for ener in frame.ener])

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
pandas
numpy>=1.19.0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@IAlibay is bumping the version to 1.20.0 for MDAnalysis in MDAnalysis/mdanalysis#3737

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't impact things here so it wouldn't think it matters. If you want to raise numpy to 1.20 you'll have to drop py3.6.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What should we prioritise here? Staying in-step with MDAnalysis' dependencies or keeping Py3.6?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't impact MDAnalysis if panedr supports a wider range of python & numpy versions, so IMHO it's fine to just leave it as-is.

pbr
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,5 @@ classifier =
test =
six
pytest
pandas =
pandas
26 changes: 23 additions & 3 deletions tests/test_edr.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@
EDR_Data = namedtuple('EDR_Data', ['df', 'xvgdata', 'xvgtime', 'xvgnames',
'xvgprec', 'edrfile', 'xvgfile'])


def test_failed_import(monkeypatch):
# Putting this test first to avoid datafiles already being loaded
errmsg = "ERROR --- pandas was not found!"
monkeypatch.setitem(sys.modules, 'pandas', None)
with pytest.raises(ImportError, match=errmsg):
panedr.edr_to_df(EDR)


@pytest.fixture(scope='module',
params=[(EDR, EDR_XVG),
(EDR_IRREGULAR, EDR_IRREGULAR_XVG),
Expand All @@ -73,7 +82,7 @@ def edr(request):
xvgtime = xvgdata[:, 0]
xvgdata = xvgdata[:, 1:]
return EDR_Data(df, xvgdata, xvgtime, xvgnames, xvgprec, edrfile, xvgfile)


class TestEdrToDf(object):
"""
Expand Down Expand Up @@ -163,10 +172,21 @@ def _assert_progress_range(self, progress, dt, start, stop, step):
assert ref_line == progress_line


def test_edr_to_dict_matches_edr_to_df():
array_dict = panedr.edr_to_dict(EDR)
ref_df = panedr.edr_to_df(EDR)
array_df = pandas.DataFrame.from_dict(array_dict).set_index(
"Time", drop=False)
assert array_df.equals(ref_df)


# TODO: write test that ImportError is raised when pandas not installed

hmacdope marked this conversation as resolved.
Show resolved Hide resolved

def read_xvg(path):
"""
Reads XVG file, returning the data, names, and precision.

The data is returned as a 2D numpy array. Column names are returned as an
array of string objects. Precision is an integer corresponding to the least
number of decimal places found, excluding the first (time) column.
Expand Down Expand Up @@ -205,7 +225,7 @@ def read_xvg(path):

def ndec(val):
"""Returns the number of decimal places of a string rep of a float

"""
try:
return len(re.split(NDEC_PATTERN, val)[1])
Expand Down