Repository: Add the as_path context manager (#6151)

The node repository interface intentionally does not provide access to its file objects through filepaths on the file system. This is because, for efficiency reasons, the content of a repository may not actually be stored as individual files on a file system, but may, for example, be stored in an object store. Therefore, the contents of the repository can only be retrieved as a file-like object or read as a string or list of bytes into memory. Certain use-cases require a file to be made available through a filepath. An example is when it needs to be passed to an API that only accepts a filepath, such as `numpy.loadtxt`. Currently, the user has to manually copy the repository content to a temporary file on disk, and pass the temporary filepath. This results in clients often having to resort to the following snippet: import pathlib import shutil import tempfile with tempfile.TemporaryDirectory() as tmp_path: # Copy the entire content to the temporary folder dirpath = pathlib.Path(tmp_path) node.base.repository.copy_tree(dirpath) # Or copy the content of a file. Should use streaming # to avoid reading everything into memory filepath = (dirpath / 'some_file.txt') with filepath.open('wb') as target: with node.base.repository.open('some_file.txt', 'rb') as source: shutil.copyfileobj(source, target) # Now pass `filepath` to the library call, e.g. numpy.loadtxt(filepath) This logic is now provided under the `as_path` context manager. This will make it easy to access repository content as files on the local file system. The snippet above is simplified to: with node.base.repository.as_path('some_file.txt') as filepath: numpy.loadtxt(filepath) The method is exposed directly in the interface of the `FolderData` and `SinglefileData` data types. A warning is added to the docs explaining the inefficiency of the content having to be read and written to a temporary directory first, encouraging it only to be used when the alternative is not an option.
aiidateam · Oct 20, 2023 · b0546e8 · b0546e8
1 parent d8dd776
commit b0546e8
Show file tree

Hide file tree

Showing 7 changed files with 182 additions and 17 deletions.
diff --git a/aiida/orm/nodes/data/folder.py b/aiida/orm/nodes/data/folder.py
@@ -88,6 +88,18 @@ def open(self, path: str, mode='r') -> Iterator[BinaryIO | TextIO]:
         with self.base.repository.open(path, mode) as handle:
             yield handle
 
+    @contextlib.contextmanager
+    def as_path(self, path: FilePath | None = None) -> Iterator[pathlib.Path]:
+        """Make the contents of the repository available as a normal filepath on the local file system.
+
+        This forwards to the ``as_path`` context manager of ``self.base.repository`` and yields the path it yields.
+
+        :param path: optional relative path of the object within the repository.
+        :return: the filepath of the content of the repository or object if ``path`` is specified.
+        :raises TypeError: if the path is not a string or ``Path``, or is an absolute path.
+        :raises FileNotFoundError: if no object exists for the given path.
+        """
+        with self.base.repository.as_path(path) as filepath:
+            yield filepath
+
     def get_object(self, path: FilePath | None = None) -> File:
         """Return the object at the given path.
 

diff --git a/aiida/orm/nodes/data/singlefile.py b/aiida/orm/nodes/data/singlefile.py
@@ -92,6 +92,18 @@ def open(self, path: str | None = None, mode: t.Literal['r', 'rb'] = 'r') -> t.I
         with self.base.repository.open(path, mode=mode) as handle:
             yield handle
 
+    @contextlib.contextmanager
+    def as_path(self) -> t.Iterator[pathlib.Path]:
+        """Make the contents of the file available as a normal filepath on the local file system.
+
+        This forwards to the ``as_path`` context manager of ``self.base.repository``, passing ``self.filename`` as the
+        path of the object, and yields the path it yields.
+
+        :return: the filepath on the local file system that holds the content of the file.
+        :raises FileNotFoundError: if no object exists in the repository for ``self.filename``.
+        """
+        with self.base.repository.as_path(self.filename) as filepath:
+            yield filepath
+
     def get_content(self, mode: str = 'r') -> str | bytes:
         """Return the content of the single file stored for this data node.
 

diff --git a/aiida/orm/nodes/repository.py b/aiida/orm/nodes/repository.py
@@ -6,6 +6,7 @@
 import copy
 import io
 import pathlib
+import shutil
 import tempfile
 from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, Iterator, TextIO, Union
 
@@ -164,7 +165,7 @@ def list_object_names(self, path: str | None = None) -> list[str]:
         return self._repository.list_object_names(path)
 
     @contextlib.contextmanager
-    def open(self, path: str, mode='r') -> Iterator[BinaryIO | TextIO]:
+    def open(self, path: FilePath, mode='r') -> Iterator[BinaryIO | TextIO]:
         """Open a file handle to an object stored under the given key.
 
         .. note:: this should only be used to open a handle to read an existing file. To write a new file use the method
@@ -186,6 +187,32 @@ def open(self, path: str, mode='r') -> Iterator[BinaryIO | TextIO]:
             else:
                 yield handle
 
+    @contextlib.contextmanager
+    def as_path(self, path: FilePath | None = None) -> Iterator[pathlib.Path]:
+        """Make the contents of the repository available as a normal filepath on the local file system.
+
+        The content is copied into a directory created with ``tempfile.TemporaryDirectory``, which is removed again when
+        the context manager is exited. If the object at ``path`` is a directory, its contents are copied with
+        ``copy_tree`` and the temporary directory itself is yielded. If it is a file, its content is streamed into a
+        file named after the object inside the temporary directory and the path of that file is yielded.
+
+        :param path: optional relative path of the object within the repository.
+        :return: the filepath of the content of the repository or object if ``path`` is specified.
+        :raises TypeError: if the path is not a string or ``Path``, or is an absolute path.
+        :raises FileNotFoundError: if no object exists for the given path.
+        """
+        # Look up the object before the temporary directory is created, so nothing is created if the lookup fails.
+        obj = self.get_object(path)
+
+        with tempfile.TemporaryDirectory() as tmp_path:
+
+            dirpath = pathlib.Path(tmp_path)
+
+            if obj.is_dir():
+                self.copy_tree(dirpath, path)
+                yield dirpath
+            else:
+                filepath = dirpath / obj.name
+                # NOTE(review): presumably ``path=None`` always resolves to the root directory, so this branch is only
+                # reached with an explicit path; the assert narrows the type for the ``self.open`` call below.
+                assert path is not None
+                # Copy through file handles opened in binary mode instead of reading the content into memory first.
+                with self.open(path, mode='rb') as source:
+                    with filepath.open('wb') as target:
+                        shutil.copyfileobj(source, target)  # type: ignore[misc]
+                yield filepath
+
     def get_object(self, path: FilePath | None = None) -> File:
         """Return the object at the given path.
 

diff --git a/docs/source/nitpick-exceptions b/docs/source/nitpick-exceptions
@@ -21,6 +21,7 @@ py:class json.encoder.JSONEncoder
 py:class EXPOSED_TYPE
 py:class EVENT_CALLBACK_TYPE
 py:class datetime
+py:meth tempfile.TemporaryDirectory
 
 ### AiiDA
 py:class ReturnType

diff --git a/docs/source/topics/data_types.rst b/docs/source/topics/data_types.rst
@@ -305,14 +305,51 @@ This class can be initialized via the **absolute** path to the file you want to
 
   In [2]: single_file = SinglefileData('/absolute/path/to/file')
 
+When storing the node, the filename is stored in the database and the file itself is copied to the repository.
 The contents of the file in string format can be obtained using the :py:meth:`~aiida.orm.nodes.data.singlefile.SinglefileData.get_content()` method:
 
 .. code-block:: ipython
 
   In [3]: single_file.get_content()
   Out[3]: 'The file content'
 
-When storing the node, the filename is stored in the database and the file itself is copied to the repository.
+For large files, reading the entire content into memory using :py:meth:`~aiida.orm.nodes.data.singlefile.SinglefileData.get_content()` may not be desirable.
+Instead, a file-like handle can be opened to a file in the repository which can be used to read the content as a stream.
+This can be useful, for example, to copy a large file from the repository to a file on disk, without loading it entirely into memory:
+
+.. code-block:: ipython
+
+    In [4]: import shutil
+            with single_file.open(mode='rb') as source:
+                with open('copy.txt', mode='wb') as target:
+                    shutil.copyfileobj(source, target)
+
+.. note:: To guarantee the file is copied over identically (and there are no encoding issues), the files are opened in "binary" mode by including the ``b`` character in the ``mode`` argument.
+
+For efficiency reasons, the repository interface only provides access to object content through file-like objects or strings.
+However, for certain use-cases, the object content *needs* to be made available as a file on the local file system.
+For example, the ``numpy.loadtxt`` method only accepts a filepath, and no file-like objects.
+In this case, the content of the file can be made available on the local file system using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager:
+
+.. code-block:: ipython
+
+    In [5]: with single_file.as_path() as filepath:
+                numpy.loadtxt(filepath)
+
+The yielded value ``filepath`` is an instance of ``pathlib.Path`` that points to a location on the local file system containing the content of the file.
+The temporary copy on the local file system is automatically cleaned up once the context manager is exited.
+
+.. note::
+
+    The temporary directory to which the content is copied is created using the :meth:`tempfile.TemporaryDirectory` function of the standard library.
+    Its location is chosen from a platform-dependent list or can be controlled through the ``TMPDIR`` environment variable (see `the official documentation <https://docs.python.org/3/library/tempfile.html#tempfile.mkstemp>`_ for details).
+
+.. warning::
+
+    The :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager will copy the file content to a temporary folder on the local file system.
+    For large files this can be an expensive operation and it is inefficient since it requires an additional read and write operation.
+    Therefore, if it is possible to use file-like objects or read the content into memory, the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` and :py:meth:`~aiida.orm.nodes.repository.NodeRepository.open()` methods should be preferred.
+
 
 .. _topics:data_types:core:folder:
 
@@ -324,57 +361,106 @@ To store a complete directory, simply use the ``tree`` keyword:
 
 .. code-block:: ipython
 
-  In [1]: FolderData = DataFactory('core.folder')
+    In [1]: FolderData = DataFactory('core.folder')
 
-  In [2]: folder = FolderData(tree='/absolute/path/to/directory')
+    In [2]: folder = FolderData(tree='/absolute/path/to/directory')
 
 Alternatively, you can construct the node first and then use the various repository methods to add objects from directory and file paths:
 
 .. code-block:: ipython
 
-  In [1]: folder = FolderData()
+    In [1]: folder = FolderData()
 
-  In [2]: folder.put_object_from_tree('/absolute/path/to/directory')
+    In [2]: folder.put_object_from_tree('/absolute/path/to/directory')
 
-  In [3]: folder.put_object_from_file('/absolute/path/to/file1.txt', path='file1.txt')
+    In [3]: folder.put_object_from_file('/absolute/path/to/file1.txt', path='file1.txt')
 
 or from `file-like objects <https://docs.python.org/3/glossary.html#term-file-like-object>`_:
 
 .. code-block:: ipython
 
-  In [4]: folder.put_object_from_filelike(filelike_object, path='file2.txt')
+    In [4]: folder.put_object_from_filelike(filelike_object, path='file2.txt')
 
 Inversely, the content of the files stored in the :py:class:`~aiida.orm.nodes.data.folder.FolderData` node can be accessed using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` method:
 
 .. code-block:: ipython
 
-  In [5]: folder.get_object_content('file1.txt')
-  Out[5]: 'File 1 content\n'
+    In [5]: folder.get_object_content('file1.txt')
+    Out[5]: 'File 1 content\n'
 
 To see the files that are stored in the :py:class:`~aiida.orm.nodes.data.folder.FolderData`, you can use the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.list_object_names()` method:
 
 .. code-block:: ipython
 
-  In [6]: folder.list_object_names()
-  Out[6]: ['subdir', 'file1.txt', 'file2.txt']
+    In [6]: folder.list_object_names()
+    Out[6]: ['subdir', 'file1.txt', 'file2.txt']
 
 In this example, ``subdir`` was a sub directory of ``/absolute/path/to/directory``, whose contents were added above.
 To list the contents of the ``subdir`` directory, you can pass its path to the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.list_object_names()` method:
 
 .. code-block:: ipython
 
-  In [7]: folder.list_object_names('subdir')
-  Out[7]: ['file3.txt', 'module.py']
+    In [7]: folder.list_object_names('subdir')
+    Out[7]: ['file3.txt', 'module.py']
 
 The content can once again be shown using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` method by passing the correct path:
 
 .. code-block:: ipython
 
- In [8]: folder.get_object_content('subdir/file3.txt')
- Out[8]: 'File 3 content\n'
+    In [8]: folder.get_object_content('subdir/file3.txt')
+    Out[8]: 'File 3 content\n'
 
 Since the :py:class:`~aiida.orm.nodes.data.folder.FolderData` node is simply a collection of files, it simply stores these files in the repository.
 
+For large files, reading the entire content into memory using :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` may not be desirable.
+Instead, a file-like handle can be opened to a file in the repository which can be used to read the content as a stream.
+This can be useful, for example, to copy a large file from the repository to a file on disk, without loading it entirely into memory:
+
+.. code-block:: ipython
+
+    In [9]: import shutil
+            with folder.open('subdir/file3.txt', mode='rb') as source:
+                with open('copy.txt', mode='wb') as target:
+                    shutil.copyfileobj(source, target)
+
+.. note:: To guarantee the file is copied over identically (and there are no encoding issues), the files are opened in "binary" mode by including the ``b`` character in the ``mode`` argument.
+
+For efficiency reasons, the repository interface only provides access to object content through file-like objects or strings.
+However, for certain use-cases, the object content *needs* to be made available as a file on the local file system.
+For example, the ``numpy.loadtxt`` method only accepts a filepath, and no file-like objects.
+In this case, the content of the node's repository can be made available on the local file system using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager:
+
+.. code-block:: ipython
+
+    In [10]: with folder.as_path() as filepath:
+                 print(list(filepath.iterdir()))
+    Out[10]: ['subdir', 'file1.txt', 'file2.txt']
+
+The yielded value ``filepath`` is an instance of ``pathlib.Path`` that points to a directory on the local file system containing the complete content of the repository.
+The temporary copy on the local file system is automatically cleaned up once the context manager is exited.
+
+.. note::
+
+    The temporary directory to which the content is copied is created using the :meth:`tempfile.TemporaryDirectory` function of the standard library.
+    Its location is chosen from a platform-dependent list or can be controlled through the ``TMPDIR`` environment variable (see `the official documentation <https://docs.python.org/3/library/tempfile.html#tempfile.mkstemp>`_ for details).
+
+Optionally, an explicit object can be specified:
+
+.. code-block:: ipython
+
+    In [11]: with folder.as_path('some_data_file.dat') as filepath:
+                 numpy.loadtxt(filepath)
+
+If the object at ``path`` is a directory, the returned value points to a directory that contains its contents.
+If it is a file, the returned value points to a file with the content of the object.
+
+.. warning::
+
+    The :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager will copy the content to a temporary folder on the local file system.
+    For large repositories this can be an expensive operation and it is inefficient since it requires an additional read and write operation.
+    Therefore, if it is possible to use file-like objects or read the content into memory, the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` and :py:meth:`~aiida.orm.nodes.repository.NodeRepository.open()` methods should be preferred.
+
+
 .. _topics:data_types:core:remote:
 
 RemoteData

diff --git a/tests/orm/nodes/data/test_folder.py b/tests/orm/nodes/data/test_folder.py
@@ -30,6 +30,7 @@ def test_constructor_tree(tmp_path):
         'list_objects',
         'list_object_names',
         'open',
+        'as_path',
         'get_object',
         'get_object_content',
         'put_object_from_bytes',

diff --git a/tests/orm/nodes/test_repository.py b/tests/orm/nodes/test_repository.py
@@ -216,7 +216,7 @@ def test_glob():
 
 
 def test_copy_tree(tmp_path):
-    """Test the ``Repository.copy_tree`` method."""
+    """Test the ``NodeRepository.copy_tree`` method."""
     node = Data()
     node.base.repository.put_object_from_bytes(b'content', 'relative/path')
 
@@ -236,3 +236,29 @@ def test_deprecated_methods(monkeypatch):
     for method in node._deprecated_repo_methods:
         with pytest.warns(AiidaDeprecationWarning):
             getattr(node, method)
+
+
+def test_as_path():
+    """Test the ``NodeRepository.as_path`` method.
+
+    Covers a non-existent path, the repository root, a sub directory and a single file, and checks that the yielded
+    path no longer exists after each context manager is exited.
+    """
+    node = Data()
+    node.base.repository.put_object_from_bytes(b'content_some_file', 'some_file.txt')
+    node.base.repository.put_object_from_bytes(b'content_relative', 'relative/path.dat')
+
+    # A path for which no object exists should raise when the context manager is entered.
+    with pytest.raises(FileNotFoundError):
+        with node.base.repository.as_path('non_existent'):
+            pass
+
+    # Without a path, the complete repository content is expected under the yielded directory.
+    with node.base.repository.as_path() as dirpath:
+        assert sorted([p.name for p in dirpath.iterdir()]) == ['relative', 'some_file.txt']
+        assert (dirpath / 'some_file.txt').read_bytes() == b'content_some_file'
+        assert (dirpath / 'relative' / 'path.dat').read_bytes() == b'content_relative'
+    assert not dirpath.exists()
+
+    # With the path of a directory, the yielded directory directly contains that directory's contents.
+    with node.base.repository.as_path('relative') as dirpath:
+        assert sorted([p.name for p in dirpath.iterdir()]) == ['path.dat']
+        assert (dirpath / 'path.dat').read_bytes() == b'content_relative'
+    assert not dirpath.exists()
+
+    # With the path of a file, the yielded path points to a file holding the object's content.
+    with node.base.repository.as_path('relative/path.dat') as filepath:
+        assert filepath.read_bytes() == b'content_relative'
+    assert not filepath.exists()