diff --git a/aiida/orm/nodes/data/folder.py b/aiida/orm/nodes/data/folder.py index e9bf9171e7..6a365b6e09 100644 --- a/aiida/orm/nodes/data/folder.py +++ b/aiida/orm/nodes/data/folder.py @@ -88,6 +88,18 @@ def open(self, path: str, mode='r') -> Iterator[BinaryIO | TextIO]: with self.base.repository.open(path, mode) as handle: yield handle + @contextlib.contextmanager + def as_path(self, path: FilePath | None = None) -> Iterator[pathlib.Path]: + """Make the contents of the repository available as a normal filepath on the local file system. + + :param path: optional relative path of the object within the repository. + :return: the filepath of the content of the repository or object if ``path`` is specified. + :raises TypeError: if the path is not a string or ``Path``, or is an absolute path. + :raises FileNotFoundError: if no object exists for the given path. + """ + with self.base.repository.as_path(path) as filepath: + yield filepath + def get_object(self, path: FilePath | None = None) -> File: """Return the object at the given path. diff --git a/aiida/orm/nodes/data/singlefile.py b/aiida/orm/nodes/data/singlefile.py index 6a841083c9..d602895ed9 100644 --- a/aiida/orm/nodes/data/singlefile.py +++ b/aiida/orm/nodes/data/singlefile.py @@ -92,6 +92,18 @@ def open(self, path: str | None = None, mode: t.Literal['r', 'rb'] = 'r') -> t.I with self.base.repository.open(path, mode=mode) as handle: yield handle + @contextlib.contextmanager + def as_path(self) -> t.Iterator[pathlib.Path]: + """Make the contents of the file available as a normal filepath on the local file system. + + :return: the filepath of a temporary copy of the content of the file stored for this node; the copy is removed + again once the context manager is exited. + :raises TypeError: if the stored filename is not a string or ``Path``, or is an absolute path. + :raises FileNotFoundError: if no object exists in the repository for the stored filename. 
+ """ + with self.base.repository.as_path(self.filename) as filepath: + yield filepath + def get_content(self, mode: str = 'r') -> str | bytes: """Return the content of the single file stored for this data node. diff --git a/aiida/orm/nodes/repository.py b/aiida/orm/nodes/repository.py index dc7a507a3f..37f642c59f 100644 --- a/aiida/orm/nodes/repository.py +++ b/aiida/orm/nodes/repository.py @@ -6,6 +6,7 @@ import copy import io import pathlib +import shutil import tempfile from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, Iterator, TextIO, Union @@ -164,7 +165,7 @@ def list_object_names(self, path: str | None = None) -> list[str]: return self._repository.list_object_names(path) @contextlib.contextmanager - def open(self, path: str, mode='r') -> Iterator[BinaryIO | TextIO]: + def open(self, path: FilePath, mode='r') -> Iterator[BinaryIO | TextIO]: """Open a file handle to an object stored under the given key. .. note:: this should only be used to open a handle to read an existing file. To write a new file use the method @@ -186,6 +187,32 @@ def open(self, path: str, mode='r') -> Iterator[BinaryIO | TextIO]: else: yield handle + @contextlib.contextmanager + def as_path(self, path: FilePath | None = None) -> Iterator[pathlib.Path]: + """Make the contents of the repository available as a normal filepath on the local file system. + + :param path: optional relative path of an object in the repository; if omitted, the entire repository is used. + :return: the filepath of a temporary copy (a directory, or a file if ``path`` is a file) that is deleted on exit. + :raises TypeError: if the path is not a string or ``Path``, or is an absolute path. + :raises FileNotFoundError: if no object exists for the given path. 
+ """ + obj = self.get_object(path) + + with tempfile.TemporaryDirectory() as tmp_path: + + dirpath = pathlib.Path(tmp_path) + + if obj.is_dir(): + self.copy_tree(dirpath, path) + yield dirpath + else: + filepath = dirpath / obj.name + assert path is not None + with self.open(path, mode='rb') as source: + with filepath.open('wb') as target: + shutil.copyfileobj(source, target) # type: ignore[misc] + yield filepath + def get_object(self, path: FilePath | None = None) -> File: """Return the object at the given path. diff --git a/docs/source/nitpick-exceptions b/docs/source/nitpick-exceptions index bdec739ebf..8027da86dd 100644 --- a/docs/source/nitpick-exceptions +++ b/docs/source/nitpick-exceptions @@ -21,6 +21,7 @@ py:class json.encoder.JSONEncoder py:class EXPOSED_TYPE py:class EVENT_CALLBACK_TYPE py:class datetime +py:meth tempfile.TemporaryDirectory ### AiiDA py:class ReturnType diff --git a/docs/source/topics/data_types.rst b/docs/source/topics/data_types.rst index f18bb6a9ec..781e807883 100644 --- a/docs/source/topics/data_types.rst +++ b/docs/source/topics/data_types.rst @@ -305,6 +305,7 @@ This class can be initialized via the **absolute** path to the file you want to In [2]: single_file = SinglefileData('/absolute/path/to/file') +When storing the node, the filename is stored in the database and the file itself is copied to the repository. The contents of the file in string format can be obtained using the :py:meth:`~aiida.orm.nodes.data.singlefile.SinglefileData.get_content()` method: .. code-block:: ipython @@ -312,7 +313,43 @@ The contents of the file in string format can be obtained using the :py:meth:`~a In [3]: single_file.get_content() Out[3]: 'The file content' -When storing the node, the filename is stored in the database and the file itself is copied to the repository. +For large files, reading the entire content into memory using :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` may not be desirable. 
+Instead, a file-like handle can be opened to a file in the repository which can be used to read the content as a stream. +This can be useful, for example, to copy a large file from the repository to a file on disk, without loading it entirely into memory: + +.. code-block:: ipython + + In [4]: import shutil + with single_file.open(mode='rb') as source: + with open('copy.txt', mode='wb') as target: + shutil.copyfileobj(source, target) + +.. note:: To guarantee the file is copied over identically (and there are no encoding issues), the files are opened in "binary" mode by including the ``b`` character in the ``mode`` argument. + +For efficiency reasons, the repository interface only provides access to object content through file-like objects or strings. +However, for certain use-cases, the object content *needs* to be made available as a file on the local file system. +For example, the ``numpy.loadtxt`` method only accepts a filepath, and no file-like objects. +In this case, the content of the file can be made available on the local file system using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager: + +.. code-block:: ipython + + In [5]: with single_file.as_path() as filepath: + numpy.loadtxt(filepath) + +The yielded value ``filepath`` is an instance of ``pathlib.Path`` that points to a location on the local file system containing the content of the file. +The temporary copy on the local file system is automatically cleaned up once the context manager is exited. + +.. note:: + + The temporary directory to which the content is copied is created using the :meth:`tempfile.TemporaryDirectory` function of the standard library. + Its location is chosen from a platform-dependent list or can be controlled through the ``TMPDIR`` environment variable (see `the official documentation <https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir>`_ for details). + +.. 
warning:: + + The :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager will copy the file content to a temporary folder on the local file system. + For large files this can be an expensive operation and it is inefficient since it requires an additional read and write operation. + Therefore, if it is possible to use file-like objects or read the content into memory, the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` and :py:meth:`~aiida.orm.nodes.repository.NodeRepository.open()` methods should be preferred. + .. _topics:data_types:core:folder: @@ -324,57 +361,106 @@ To store a complete directory, simply use the ``tree`` keyword: .. code-block:: ipython - In [1]: FolderData = DataFactory('core.folder') + In [1]: FolderData = DataFactory('core.folder') - In [2]: folder = FolderData(tree='/absolute/path/to/directory') + In [2]: folder = FolderData(tree='/absolute/path/to/directory') Alternatively, you can construct the node first and then use the various repository methods to add objects from directory and file paths: .. code-block:: ipython - In [1]: folder = FolderData() + In [1]: folder = FolderData() - In [2]: folder.put_object_from_tree('/absolute/path/to/directory') + In [2]: folder.put_object_from_tree('/absolute/path/to/directory') - In [3]: folder.put_object_from_file('/absolute/path/to/file1.txt', path='file1.txt') + In [3]: folder.put_object_from_file('/absolute/path/to/file1.txt', path='file1.txt') or from `file-like objects <https://docs.python.org/3/glossary.html#term-file-like-object>`_: .. code-block:: ipython - In [4]: folder.put_object_from_filelike(filelike_object, path='file2.txt') + In [4]: folder.put_object_from_filelike(filelike_object, path='file2.txt') Inversely, the content of the files stored in the :py:class:`~aiida.orm.nodes.data.folder.FolderData` node can be accessed using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` method: .. 
code-block:: ipython - In [5]: folder.get_object_content('file1.txt') - Out[5]: 'File 1 content\n' + In [5]: folder.get_object_content('file1.txt') + Out[5]: 'File 1 content\n' To see the files that are stored in the :py:class:`~aiida.orm.nodes.data.folder.FolderData`, you can use the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.list_object_names()` method: .. code-block:: ipython - In [6]: folder.list_object_names() - Out[6]: ['subdir', 'file1.txt', 'file2.txt'] + In [6]: folder.list_object_names() + Out[6]: ['subdir', 'file1.txt', 'file2.txt'] In this example, ``subdir`` was a sub directory of ``/absolute/path/to/directory``, whose contents where added above. to list the contents of the ``subdir`` directory, you can pass its path to the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.list_object_names()` method: .. code-block:: ipython - In [7]: folder.list_object_names('subdir') - Out[7]: ['file3.txt', 'module.py'] + In [7]: folder.list_object_names('subdir') + Out[7]: ['file3.txt', 'module.py'] The content can once again be shown using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` method by passing the correct path: .. code-block:: ipython - In [8]: folder.get_object_content('subdir/file3.txt') - Out[8]: 'File 3 content\n' + In [8]: folder.get_object_content('subdir/file3.txt') + Out[8]: 'File 3 content\n' Since the :py:class:`~aiida.orm.nodes.data.folder.FolderData` node is simply a collection of files, it simply stores these files in the repository. +For large files, reading the entire content into memory using :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` may not be desirable. +Instead, a file-like handle can be opened to a file in the repository which can be used to read the content as a stream. +This can be useful, for example, to copy a large file from the repository to a file on disk, without loading it entirely into memory: + +.. 
code-block:: ipython + + In [9]: import shutil + with folder.open('subdir/file3.txt', mode='rb') as source: + with open('copy.txt', mode='wb') as target: + shutil.copyfileobj(source, target) + +.. note:: To guarantee the file is copied over identically (and there are no encoding issues), the files are opened in "binary" mode by including the ``b`` character in the ``mode`` argument. + +For efficiency reasons, the repository interface only provides access to object content through file-like objects or strings. +However, for certain use-cases, the object content *needs* to be made available as a file on the local file system. +For example, the ``numpy.loadtxt`` method only accepts a filepath, and no file-like objects. +In this case, the content of the node's repository can be made available on the local file system using the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager: + +.. code-block:: ipython + + In [10]: with folder.as_path() as dirpath: + print([p.name for p in dirpath.iterdir()]) + Out[10]: ['subdir', 'file1.txt', 'file2.txt'] + +The yielded value ``dirpath`` is an instance of ``pathlib.Path`` that points to a location on the local file system containing the complete content of the repository. +The temporary copy on the local file system is automatically cleaned up once the context manager is exited. + +.. note:: + + The temporary directory to which the content is copied is created using the :meth:`tempfile.TemporaryDirectory` function of the standard library. + Its location is chosen from a platform-dependent list or can be controlled through the ``TMPDIR`` environment variable (see `the official documentation <https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir>`_ for details). + +Optionally, an explicit object can be specified: + +.. code-block:: ipython + + In [11]: with folder.as_path('some_data_file.dat') as filepath: + numpy.loadtxt(filepath) + +If the object at ``path`` is a directory, the returned value points to a directory that contains its contents. 
+If it is a file, the returned value points to a file with the content of the object. + +.. warning:: + + The :py:meth:`~aiida.orm.nodes.repository.NodeRepository.as_path()` context manager will copy the content to a temporary folder on the local file system. + For large repositories this can be an expensive operation and it is inefficient since it requires an additional read and write operation. + Therefore, if it is possible to use file-like objects or read the content into memory, the :py:meth:`~aiida.orm.nodes.repository.NodeRepository.get_object_content()` and :py:meth:`~aiida.orm.nodes.repository.NodeRepository.open()` methods should be preferred. + + .. _topics:data_types:core:remote: RemoteData diff --git a/tests/orm/nodes/data/test_folder.py b/tests/orm/nodes/data/test_folder.py index dbb8a34116..35e7ebbc0d 100644 --- a/tests/orm/nodes/data/test_folder.py +++ b/tests/orm/nodes/data/test_folder.py @@ -30,6 +30,7 @@ def test_constructor_tree(tmp_path): 'list_objects', 'list_object_names', 'open', + 'as_path', 'get_object', 'get_object_content', 'put_object_from_bytes', diff --git a/tests/orm/nodes/test_repository.py b/tests/orm/nodes/test_repository.py index 3077359529..86b0323a9c 100644 --- a/tests/orm/nodes/test_repository.py +++ b/tests/orm/nodes/test_repository.py @@ -216,7 +216,7 @@ def test_glob(): def test_copy_tree(tmp_path): - """Test the ``Repository.copy_tree`` method.""" + """Test the ``NodeRepository.copy_tree`` method.""" node = Data() node.base.repository.put_object_from_bytes(b'content', 'relative/path') @@ -236,3 +236,29 @@ def test_deprecated_methods(monkeypatch): for method in node._deprecated_repo_methods: with pytest.warns(AiidaDeprecationWarning): getattr(node, method) + + +def test_as_path(): + """Test ``NodeRepository.as_path`` for a missing object, the root, a subdirectory and a single file.""" + node = Data() + node.base.repository.put_object_from_bytes(b'content_some_file', 'some_file.txt') + node.base.repository.put_object_from_bytes(b'content_relative', 'relative/path.dat') + + with 
pytest.raises(FileNotFoundError): + with node.base.repository.as_path('non_existent'): + pass + + with node.base.repository.as_path() as dirpath: + assert sorted([p.name for p in dirpath.iterdir()]) == ['relative', 'some_file.txt'] + assert (dirpath / 'some_file.txt').read_bytes() == b'content_some_file' + assert (dirpath / 'relative' / 'path.dat').read_bytes() == b'content_relative' + assert not dirpath.exists() + + with node.base.repository.as_path('relative') as dirpath: + assert sorted([p.name for p in dirpath.iterdir()]) == ['path.dat'] + assert (dirpath / 'path.dat').read_bytes() == b'content_relative' + assert not dirpath.exists() + + with node.base.repository.as_path('relative/path.dat') as filepath: + assert filepath.read_bytes() == b'content_relative' + assert not filepath.exists()