Skip to content

Commit

Permalink
Repository: Add the as_path context manager (#6151)
Browse files Browse the repository at this point in the history
The node repository interface intentionally does not provide access to
its file objects through filepaths on the file system. This is because,
for efficiency reasons, the content of a repository may not actually be
stored as individual files on a file system, but may, for example, be
stored in an object store.

Therefore, the contents of the repository can only be retrieved as a
file-like object or read as a string or list of bytes into memory.
Certain use-cases require a file to be made available through a filepath.
An example is when it needs to be passed to an API that only accepts a
filepath, such as `numpy.loadtxt`.

Currently, the user has to manually copy the repository's content to a
temporary file on disk, and pass the temporary filepath.
This results in clients often having to resort to the following snippet:

    import pathlib
    import shutil
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_path:

        # Copy the entire content to the temporary folder
        dirpath = pathlib.Path(tmp_path)
        node.base.repository.copy_tree(dirpath)

        # Or copy the content of a single file. Stream it in chunks
        # to avoid reading everything into memory
        filepath = (dirpath / 'some_file.txt')
        # The target must be opened for writing in binary mode
        with filepath.open('wb') as target:
            # The first argument is the object's path in the repository
            with node.base.repository.open('some_file.txt', mode='rb') as source:
                shutil.copyfileobj(source, target)

        # Now pass `filepath` to the library call, e.g.
        numpy.loadtxt(filepath)

This logic is now provided under the `as_path` context manager. This
will make it easy to access repository content as files on the local
file system. The snippet above is simplified to:

    # Pass the relative path of the file: without a path the whole
    # repository is exposed and a directory is yielded instead
    with node.base.repository.as_path('some_file.txt') as filepath:
        numpy.loadtxt(filepath)

The method is exposed directly in the interface of the `FolderData` and
`SinglefileData` data types. A warning is added to the docs explaining
the inefficiency of the content having to be read and written to a
temporary directory first, encouraging it only to be used when the
alternative is not an option.
  • Loading branch information
sphuber authored Oct 20, 2023
1 parent d8dd776 commit b0546e8
Show file tree
Hide file tree
Showing 7 changed files with 182 additions and 17 deletions.
12 changes: 12 additions & 0 deletions aiida/orm/nodes/data/folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,18 @@ def open(self, path: str, mode='r') -> Iterator[BinaryIO | TextIO]:
with self.base.repository.open(path, mode) as handle:
yield handle

@contextlib.contextmanager
def as_path(self, path: FilePath | None = None) -> Iterator[pathlib.Path]:
    """Expose the contents of the repository as a regular path on the local file system.

    This is a thin wrapper that delegates to ``self.base.repository.as_path``.

    :param path: optional relative path of the object within the repository.
    :return: the filepath of the content of the repository or object if ``path`` is specified.
    :raises TypeError: if the path is not a string or ``Path``, or is an absolute path.
    :raises FileNotFoundError: if no object exists for the given path.
    """
    repository = self.base.repository

    # Keep the repository context open for as long as the caller uses the yielded path.
    with repository.as_path(path) as local_path:
        yield local_path

def get_object(self, path: FilePath | None = None) -> File:
"""Return the object at the given path.
Expand Down
12 changes: 12 additions & 0 deletions aiida/orm/nodes/data/singlefile.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,18 @@ def open(self, path: str | None = None, mode: t.Literal['r', 'rb'] = 'r') -> t.I
with self.base.repository.open(path, mode=mode) as handle:
yield handle

@contextlib.contextmanager
def as_path(self) -> t.Iterator[pathlib.Path]:
    """Make the contents of the file available as a normal filepath on the local file system.

    The method takes no arguments: the object that is exposed is always the one stored under ``self.filename``,
    and the call is delegated to ``self.base.repository.as_path``.

    :return: the filepath to the content of the file, as yielded by ``self.base.repository.as_path``.
    :raises FileNotFoundError: if no object exists in the repository for ``self.filename``.
    """
    with self.base.repository.as_path(self.filename) as filepath:
        yield filepath

def get_content(self, mode: str = 'r') -> str | bytes:
"""Return the content of the single file stored for this data node.
Expand Down
29 changes: 28 additions & 1 deletion aiida/orm/nodes/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import copy
import io
import pathlib
import shutil
import tempfile
from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, Iterator, TextIO, Union

Expand Down Expand Up @@ -164,7 +165,7 @@ def list_object_names(self, path: str | None = None) -> list[str]:
return self._repository.list_object_names(path)

@contextlib.contextmanager
def open(self, path: str, mode='r') -> Iterator[BinaryIO | TextIO]:
def open(self, path: FilePath, mode='r') -> Iterator[BinaryIO | TextIO]:
"""Open a file handle to an object stored under the given key.
.. note:: this should only be used to open a handle to read an existing file. To write a new file use the method
Expand All @@ -186,6 +187,32 @@ def open(self, path: str, mode='r') -> Iterator[BinaryIO | TextIO]:
else:
yield handle

@contextlib.contextmanager
def as_path(self, path: FilePath | None = None) -> Iterator[pathlib.Path]:
    """Expose the contents of the repository as a regular path on the local file system.

    The requested object is copied into a directory created with ``tempfile.TemporaryDirectory``. That directory,
    and with it the yielded path, is removed when the context manager exits.

    :param path: optional relative path of the object within the repository.
    :return: the filepath of the content of the repository or object if ``path`` is specified.
    :raises TypeError: if the path is not a string or ``Path``, or is an absolute path.
    :raises FileNotFoundError: if no object exists for the given path.
    """
    # Resolve the object before creating the temporary directory.
    obj = self.get_object(path)

    with tempfile.TemporaryDirectory() as tmp_dirname:
        tmp_root = pathlib.Path(tmp_dirname)

        if not obj.is_dir():
            destination = tmp_root / obj.name
            assert path is not None
            # Stream the content so it is never loaded into memory in one go. Both handles are closed
            # before yielding so that the copy is completely written when the caller reads it.
            with self.open(path, mode='rb') as source, destination.open('wb') as target:
                shutil.copyfileobj(source, target)  # type: ignore[misc]
            yield destination
        else:
            # Directory: replicate its content directly inside the temporary directory.
            self.copy_tree(tmp_root, path)
            yield tmp_root

def get_object(self, path: FilePath | None = None) -> File:
"""Return the object at the given path.
Expand Down
1 change: 1 addition & 0 deletions docs/source/nitpick-exceptions
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ py:class json.encoder.JSONEncoder
py:class EXPOSED_TYPE
py:class EVENT_CALLBACK_TYPE
py:class datetime
py:meth tempfile.TemporaryDirectory

### AiiDA
py:class ReturnType
Expand Down
116 changes: 101 additions & 15 deletions docs/source/topics/data_types.rst
Original file line number Diff line number Diff line change
Expand Up @@ -305,14 +305,51 @@ This class can be initialized via the **absolute** path to the file you want to
In [2]: single_file = SinglefileData('/absolute/path/to/file')
When storing the node, the filename is stored in the database and the file itself is copied to the repository.
The contents of the file in string format can be obtained using the :py:meth:`~aiida.orm.nodes.data.singlefile.SinglefileData.get_content()` method:

.. code-block:: ipython
In [3]: single_file.get_content()
Out[3]: 'The file content'
When storing the node, the filename is stored in the database and the file itself is copied to the repository.
For large files, reading the entire content into memory using :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` may not be desirable.
Instead, a file-like handle can be opened to a file in the repository which can be used to read the content as a stream.
This can be useful, for example, to copy a large file from the repository to a file on disk, without loading it entirely into memory:

.. code-block:: ipython
In [4]: import shutil
with single_file.open(mode='rb') as source:
with open('copy.txt', mode='wb') as target:
shutil.copyfileobj(source, target)
.. note:: To guarantee the file is copied over identically (and there are no encoding issues), the files are opened in "binary" mode by including the ``b`` character in the ``mode`` argument.

For efficiency reasons, the repository interface only provides access to object content through file-like objects or strings.
However, for certain use-cases, the object content *needs* to be made available as a file on the local file system.
For example, the ``numpy.loadtxt`` method only accepts a filepath, and no file-like objects.
In this case, the content of the file can be made available on the local file system using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager:

.. code-block:: ipython
In [5]: with single_file.as_path() as filepath:
numpy.loadtxt(filepath)
The yielded value ``filepath`` is an instance of ``pathlib.Path`` that points to a location on the local file system containing the content of the file.
The temporary copy on the local file system is automatically cleaned up once the context manager is exited.

.. note::

The temporary directory to which the content is copied is created using the :meth:`tempfile.TemporaryDirectory` function of the standard library.
Its location is chosen from a platform-dependent list or can be controlled through the ``TMPDIR`` environment variable (see `the official documentation <https://docs.python.org/3/library/tempfile.html#tempfile.mkstemp>`_ for details).

.. warning::

The :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager will copy the file content to a temporary folder on the local file system.
For large files this can be an expensive operation and it is inefficient since it requires an additional read and write operation.
Therefore, if it is possible to use file-like objects or read the content into memory, the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` and :py:meth:`~aiida.orm.nodes.repository.NodeRepository.open()` methods should be preferred.


.. _topics:data_types:core:folder:

Expand All @@ -324,57 +361,106 @@ To store a complete directory, simply use the ``tree`` keyword:

.. code-block:: ipython
In [1]: FolderData = DataFactory('core.folder')
In [1]: FolderData = DataFactory('core.folder')
In [2]: folder = FolderData(tree='/absolute/path/to/directory')
In [2]: folder = FolderData(tree='/absolute/path/to/directory')
Alternatively, you can construct the node first and then use the various repository methods to add objects from directory and file paths:

.. code-block:: ipython
In [1]: folder = FolderData()
In [1]: folder = FolderData()
In [2]: folder.put_object_from_tree('/absolute/path/to/directory')
In [2]: folder.put_object_from_tree('/absolute/path/to/directory')
In [3]: folder.put_object_from_file('/absolute/path/to/file1.txt', path='file1.txt')
In [3]: folder.put_object_from_file('/absolute/path/to/file1.txt', path='file1.txt')
or from `file-like objects <https://docs.python.org/3/glossary.html#term-file-like-object>`_:

.. code-block:: ipython
In [4]: folder.put_object_from_filelike(filelike_object, path='file2.txt')
In [4]: folder.put_object_from_filelike(filelike_object, path='file2.txt')
Inversely, the content of the files stored in the :py:class:`~aiida.orm.nodes.data.folder.FolderData` node can be accessed using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` method:

.. code-block:: ipython
In [5]: folder.get_object_content('file1.txt')
Out[5]: 'File 1 content\n'
In [5]: folder.get_object_content('file1.txt')
Out[5]: 'File 1 content\n'
To see the files that are stored in the :py:class:`~aiida.orm.nodes.data.folder.FolderData`, you can use the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.list_object_names()` method:

.. code-block:: ipython
In [6]: folder.list_object_names()
Out[6]: ['subdir', 'file1.txt', 'file2.txt']
In [6]: folder.list_object_names()
Out[6]: ['subdir', 'file1.txt', 'file2.txt']
In this example, ``subdir`` was a sub directory of ``/absolute/path/to/directory``, whose contents were added above.
To list the contents of the ``subdir`` directory, you can pass its path to the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.list_object_names()` method:

.. code-block:: ipython
In [7]: folder.list_object_names('subdir')
Out[7]: ['file3.txt', 'module.py']
In [7]: folder.list_object_names('subdir')
Out[7]: ['file3.txt', 'module.py']
The content can once again be shown using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` method by passing the correct path:

.. code-block:: ipython
In [8]: folder.get_object_content('subdir/file3.txt')
Out[8]: 'File 3 content\n'
In [8]: folder.get_object_content('subdir/file3.txt')
Out[8]: 'File 3 content\n'
Since the :py:class:`~aiida.orm.nodes.data.folder.FolderData` node is simply a collection of files, it simply stores these files in the repository.

For large files, reading the entire content into memory using :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` may not be desirable.
Instead, a file-like handle can be opened to a file in the repository which can be used to read the content as a stream.
This can be useful, for example, to copy a large file from the repository to a file on disk, without loading it entirely into memory:

.. code-block:: ipython
In [9]: import shutil
with folder.open('subdir/file3.txt', mode='rb') as source:
with open('copy.txt', mode='wb') as target:
shutil.copyfileobj(source, target)
.. note:: To guarantee the file is copied over identically (and there are no encoding issues), the files are opened in "binary" mode by including the ``b`` character in the ``mode`` argument.

For efficiency reasons, the repository interface only provides access to object content through file-like objects or strings.
However, for certain use-cases, the object content *needs* to be made available as a file on the local file system.
For example, the ``numpy.loadtxt`` method only accepts a filepath, and no file-like objects.
In this case, the content of the node's repository can be made available on the local file system using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager:

.. code-block:: ipython
In [10]: with folder.as_path() as filepath:
print(list(filepath.iterdir()))
Out[10]: ['subdir', 'file1.txt', 'file2.txt']
The yielded value ``filepath`` is an instance of ``pathlib.Path`` that points to a temporary directory on the local file system containing the complete content of the repository.
The temporary copy on the local file system is automatically cleaned up once the context manager is exited.

.. note::

The temporary directory to which the content is copied is created using the :meth:`tempfile.TemporaryDirectory` function of the standard library.
Its location is chosen from a platform-dependent list or can be controlled through the ``TMPDIR`` environment variable (see `the official documentation <https://docs.python.org/3/library/tempfile.html#tempfile.mkstemp>`_ for details).

Optionally, an explicit object can be specified:

.. code-block:: ipython
In [11]: with folder.as_path('some_data_file.dat') as filepath:
numpy.loadtxt(filepath)
If the object at ``path`` is a directory, the returned value points to a directory that contains its contents.
If it is a file, the returned value points to a file with the content of the object.

.. warning::

The :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager will copy the content to a temporary folder on the local file system.
For large repositories this can be an expensive operation and it is inefficient since it requires an additional read and write operation.
Therefore, if it is possible to use file-like objects or read the content into memory, the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` and :py:meth:`~aiida.orm.nodes.repository.NodeRepository.open()` methods should be preferred.


.. _topics:data_types:core:remote:

RemoteData
Expand Down
1 change: 1 addition & 0 deletions tests/orm/nodes/data/test_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def test_constructor_tree(tmp_path):
'list_objects',
'list_object_names',
'open',
'as_path',
'get_object',
'get_object_content',
'put_object_from_bytes',
Expand Down
28 changes: 27 additions & 1 deletion tests/orm/nodes/test_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def test_glob():


def test_copy_tree(tmp_path):
"""Test the ``Repository.copy_tree`` method."""
"""Test the ``NodeRepository.copy_tree`` method."""
node = Data()
node.base.repository.put_object_from_bytes(b'content', 'relative/path')

Expand All @@ -236,3 +236,29 @@ def test_deprecated_methods(monkeypatch):
for method in node._deprecated_repo_methods:
with pytest.warns(AiidaDeprecationWarning):
getattr(node, method)


def test_as_path():
    """Check that ``NodeRepository.as_path`` exposes content on disk and cleans up afterwards."""
    node = Data()
    node.base.repository.put_object_from_bytes(b'content_some_file', 'some_file.txt')
    node.base.repository.put_object_from_bytes(b'content_relative', 'relative/path.dat')

    # Entering the context for a path without an object should raise.
    with pytest.raises(FileNotFoundError), node.base.repository.as_path('non_existent'):
        pass

    # Without a path the complete repository content is exposed as a directory.
    with node.base.repository.as_path() as root_dir:
        entry_names = sorted(entry.name for entry in root_dir.iterdir())
        assert entry_names == ['relative', 'some_file.txt']
        assert (root_dir / 'some_file.txt').read_bytes() == b'content_some_file'
        assert (root_dir / 'relative' / 'path.dat').read_bytes() == b'content_relative'
    assert not root_dir.exists()

    # A path pointing to a directory exposes only the content of that directory.
    with node.base.repository.as_path('relative') as sub_dir:
        entry_names = sorted(entry.name for entry in sub_dir.iterdir())
        assert entry_names == ['path.dat']
        assert (sub_dir / 'path.dat').read_bytes() == b'content_relative'
    assert not sub_dir.exists()

    # A path pointing to a file exposes just that file.
    with node.base.repository.as_path('relative/path.dat') as file_copy:
        assert file_copy.read_bytes() == b'content_relative'
    assert not file_copy.exists()

0 comments on commit b0546e8

Please sign in to comment.