From 742ae05424e56623c8201db610a71e8159e3fa5e Mon Sep 17 00:00:00 2001
From: ketiltrout
Date: Wed, 6 Nov 2024 10:33:18 -0800
Subject: [PATCH] fix(lfs): make HSM interactions more robust

== Always use a timeout with lfs

It seems pretty clear now that `lfs` invocations should always have a
timeout, to guard against Lustre locking up all our workers. So `run_lfs`
now uses a one-minute timeout by default.

Then, handle both `lfs quota` and `lfs hsm_state` not returning
successfully.

In `StorageNode.update_avail_gb`, passing `None` now means "don't update
`avail_gb`" (but do still update the last check time). If the current
value of `avail_gb` is not null/None, a warning is issued.

For `hsm_state` failures, the various places in `LustreHSM` that use it
now handle not being able to determine the state.

== Use hsm_action to track in-progress restores

I was reading through the Lustre source code last night to see if I could
figure out what's up with `hsm_restore` timing out when files are in the
process of being restored. I didn't figure that out, because I discovered,
instead, the `hsm_action` command, which tells you what HSM is currently
doing to a file (e.g. is it working on restoring it?).

So, there's another `HSMState`: `RESTORING`, for files which HSM is in the
process of restoring, and alpenhorn now uses it to track progress during
restores. This removes the need for `_restore_retry` (because alpenhorn no
longer needs to guess whether a restore is happening or not).

== Remove stat() calls from LustreHSM

The idea here is: using `stat()` to check for file existence, while it
avoids an external `lfs` call, causes trouble when /nearline is acting up,
because the `stat()` gets a worker stuck in IO-wait, whereas the `lfs`
call can be abandoned (by the subprocess timeout), meaning the workers
don't get stuck.

So, I've removed the stat from the top of `lfs.hsm_state`. I originally
had it there because I thought a cheap `stat` would save us from an
expensive `lfs` call. Instead, I've modified `run_lfs` to detect and
report missing files.

I've also re-implemented `LustreHSMNodeIO.exists` to use `lfs` instead of
stat-ting the filesystem.
---
 alpenhorn/db/storage.py          |  11 ++-
 alpenhorn/io/lfs.py              | 157 ++++++++++++++++++++++---------
 alpenhorn/io/lustrehsm.py        | 134 +++++++++++++-------
 alpenhorn/server/update.py       |   6 +-
 tests/conftest.py                |  28 ++++--
 tests/db/test_storage.py         |  15 ++-
 tests/io/test_lfs.py             |  67 +++++++++++--
 tests/io/test_lustrehsmgroup.py  |   7 ++
 tests/io/test_lustrehsmnode.py   | 114 +++++++++++++++++-----
 tests/server/test_auto_import.py |   5 +-
 tests/server/test_service.py     |   5 +-
 11 files changed, 392 insertions(+), 157 deletions(-)

diff --git a/alpenhorn/db/storage.py b/alpenhorn/db/storage.py
index 022783196..cabcfeb31 100644
--- a/alpenhorn/db/storage.py
+++ b/alpenhorn/db/storage.py
@@ -367,18 +367,19 @@ def update_avail_gb(self, new_avail: int | None) -> None:
         """
         # The value in the database is in GiB (2**30 bytes)
         if new_avail is None:
-            self.avail_gb = None
+            # Warn unless we never knew the free space
+            if self.avail_gb is not None:
+                log.warning(f'Unable to determine available space for "{self.name}".')
         else:
             self.avail_gb = new_avail / 2**30
+
+        # Record check time, even if we failed to update
         self.avail_gb_last_checked = pw.utcnow()
 
         # Update the DB with the free space but don't clobber changes made
-        # manually to the database
+        # manually to the database.
self.save(only=[StorageNode.avail_gb, StorageNode.avail_gb_last_checked]) - if new_avail is None: - log.info(f'Unable to determine available space for "{self.name}".') - class StorageTransferAction(base_model): """Storage transfer rules for the archive. diff --git a/alpenhorn/io/lfs.py b/alpenhorn/io/lfs.py index bd105ff42..c63f08864 100644 --- a/alpenhorn/io/lfs.py +++ b/alpenhorn/io/lfs.py @@ -18,6 +18,11 @@ * `UNARCHIVED`: file exists on disk, but not on external storage * `RESTORED`: file exists on both disk and external storage * `RELEASED`: file exists in external storage only +* `lfs hsm_action` + retrieves the current action. We use this to detect this + `HSMState`: + * `RESTORING`: file exists in external storage only, but HSM + is in the process of restoring it * `lfs hsm_restore` requests the state change `RELEASED -> RESTORED` * `lfs hsm_release` @@ -55,6 +60,9 @@ class HSMState(Enum): is the state of newly created files until they are archived. HSMState.RELEASED: The file is in external storage but not on disk. + HSMState.RESTORING: + The file is in external storage only, and HSM is in the process of + bringing it back to disk HSMState.RESTORED: The file is both in external storage and on disk. @@ -62,14 +70,18 @@ class HSMState(Enum): our control, but once the file has been archived, it moves from state UNARCHIVED to state RESTORED. - A `lfs.hsm_restore()` changes a file's state from RELEASED to RESTORED. - A `lfs.hsm_release()` changes a file's state from RESTORED to RELEASED. + A `lfs.hsm_restore()` requests a file's state change from RELEASED to RESTORED. + After this call, the files state changes to RESTORING until the restore is + complete. + + A `lfs.hsm_release()` requests a file's state change from RESTORED to RELEASED. """ MISSING = 0 UNARCHIVED = 1 RESTORED = 2 - RELEASED = 3 + RESTORING = 3 + RELEASED = 4 class LFS: @@ -98,6 +110,7 @@ class LFS: HSM_MISSING = HSMState.MISSING HSM_UNARCHIVED = HSMState.UNARCHIVED HSM_RESTORED = HSMState.RESTORED + HSM_RESTORING = HSMState.RESTORING HSM_RELEASED = HSMState.RELEASED def __init__( @@ -114,7 +127,7 @@ def __init__( if self._lfs is None: raise RuntimeError("lfs command not found.") - def run_lfs(self, *args: str, timeout: float | None = None) -> str | False | None: + def run_lfs(self, *args: str) -> dict: """Run the lfs command with the `args` provided. Parameters @@ -122,37 +135,55 @@ def run_lfs(self, *args: str, timeout: float | None = None) -> str | False | Non *args : strings The list of command-line arguments to pass to the lfs command. - timeout : float, optional - If not None, stop waiting for the command after `timeout` seconds - Retunrs + Returns ------- - output : str or False or None - If the command succeeded, returns standard output of - the command. If the command failed or timed out, - returns False (failed) or None (timed out) and logs - the failure. + result : dict + A dict is returned with the following keys: + + - missing : bool + True if the file was missing; False otherwise + - timeout : bool + True if the command timed out; False otherwise + - failed : bool + True if the command failed; False otherwise + - output : str or None + If the command succeeded (i.e. all three booleans + are false), this has the standard output of + the command. Otherwise, this is None. 
""" + result = {"missing": False, "timeout": False, "failed": False, "output": None} + # Stringify args args = [str(arg) for arg in args] - ret, stdout, stderr = util.run_command([self._lfs] + args, timeout=timeout) + ret, stdout, stderr = util.run_command([self._lfs] + args, timeout=60) + + # Timeout + if ret is None: + log.warning(f"LFS command timed out: " + " ".join(args)) + result["timeout"] = True + return result + + if ret == 0: + # Success, return output + result["output"] = stdout + return result - # Failure or timeout - if ret is None or ret != 0: - if ret is None: - result = "timed out" - else: - result = f"failed (ret={ret})" - ret = False - log.warning(f"LFS command {result}: " + " ".join(args)) - if stderr: - log.debug(f"LFS stderr: {stderr}") - if stdout: - log.debug(f"LFS stdout: {stdout}") - return ret + # Failure, look for a "No such file" remark in stderr + if stderr and "No such file or directory" in stderr: + log.debug(f"LFS missing file: " + " ".join(args)) + result["missing"] = True + else: + # Otherwise, report failure + log.warning(f"LFS command failed: " + " ".join(args)) + result["failed"] = True - return stdout + if stderr: + log.debug(f"LFS stderr: {stderr}") + if stdout: + log.debug(f"LFS stdout: {stdout}") + return result def quota_remaining(self, path: str | os.PathLike) -> int | None: """Retrieve the remaining quota for `path`. @@ -200,7 +231,7 @@ def quota_remaining(self, path: str | os.PathLike) -> int | None: # Stringify path path = str(path) - stdout = self.run_lfs("quota", "-q", "-g", self._quota_group, path) + stdout = self.run_lfs("quota", "-q", "-g", self._quota_group, path)["output"] if stdout is None: return None # Command returned error @@ -246,6 +277,35 @@ def quota_remaining(self, path: str | os.PathLike) -> int | None: # lfs quota reports values in kiByte blocks return (quota_limit - quota) * 2**10 + def hsm_restoring(self, path: os.PathLike | str) -> bool: + """Is HSM processing a restore request? + + Returns True if HSM is currently working on a restore + request for `path`, and False otherwise. + + Runs `lfs hsm_action` to check the current HSM action + or the file. + + Parameters + ---------- + path : path-like + The path to determine the state for. + + Returns + ------- + restoring : bool or None + True if a RESTORE is in progress. False otherwise. + None if the command failed. + """ + + # Stringify path + path = str(path) + + stdout = self.run_lfs("hsm_action", path)["output"] + if stdout is None: + return None # Command returned error + return "RESTORE" in stdout + def hsm_state(self, path: os.PathLike | str) -> HSMState: """Returns the HSM state of path. @@ -261,16 +321,16 @@ def hsm_state(self, path: os.PathLike | str) -> HSMState: failed. If `path` doesn't exist, this will be `HSMState.MISSING`. """ - # No need to check with HSM if the path isn't present - if not pathlib.Path(path).exists(): - return HSMState.MISSING - # Stringify path path = str(path) - stdout = self.run_lfs("hsm_state", path) - if stdout is False: + result = self.run_lfs("hsm_state", path) + if result["failed"] or result["timeout"]: return None # Command returned error + if result["missing"]: + return HSMState.MISSING + + stdout = result["output"] # The output of hsm_state looks like this: # @@ -301,18 +361,29 @@ def hsm_state(self, path: os.PathLike | str) -> HSMState: # See llapi_hsm_state_get(3) for full details about these. 
if "archived" not in stdout: return HSMState.UNARCHIVED - if "released" in stdout: - return HSMState.RELEASED - return HSMState.RESTORED + if "released" not in stdout: + return HSMState.RESTORED + + # File is released, so now run `hsm_action` to see if a restore is in + # progress + if self.hsm_restoring(path): + return HSMState.RESTORING + + return HSMState.RELEASED def hsm_archived(self, path: os.PathLike) -> bool: """Is `path` archived by HSM?""" state = self.hsm_state(path) - return state == HSMState.RESTORED or state == HSMState.RELEASED + return ( + state == HSMState.RESTORED + or state == HSMState.RESTORING + or state == HSMState.RELEASED + ) def hsm_released(self, path: os.PathLike) -> bool: """Is `path` released to external storage?""" - return self.hsm_state(path) == HSMState.RELEASED + state = self.hsm_state(path) + return state == HSMState.RELEASED or state == HSMState.RESTORING def hsm_restore(self, path: os.PathLike) -> bool | None: """Trigger restore of `path` from external storage. @@ -343,9 +414,11 @@ def hsm_restore(self, path: os.PathLike) -> bool | None: if state != HSMState.RELEASED: return True - result = self.run_lfs("hsm_restore", path, timeout=60) - if result is None or result is False: - return result + result = self.run_lfs("hsm_restore", path) + if result["missing"]: + return False + if result["timeout"] or result["failed"]: + return None return True def hsm_release(self, path: os.PathLike) -> bool: diff --git a/alpenhorn/io/lustrehsm.py b/alpenhorn/io/lustrehsm.py index 5e527f132..c07b5b48c 100644 --- a/alpenhorn/io/lustrehsm.py +++ b/alpenhorn/io/lustrehsm.py @@ -38,10 +38,6 @@ log = logging.getLogger(__name__) -# Retry delays (in seconds) after a successful or timed-out hsm_restore request -RESTORE_TIMEOUT_RETRY = 1 * 3600 # 1 hour -RESTORE_SUCCESS_RETRY = 4 * 3600 # 4 hours - class LustreHSMNodeRemote(BaseNodeRemote): """LustreHSMNodeRemote: information about a LustreHSM remote node.""" @@ -96,17 +92,13 @@ def __init__( self._headroom = config["headroom"] * 2**10 # convert from kiB # QueryWalker for the HSM state check - self._release_qw = None + self._statecheck_qw = None # Tracks files we're in the process of retrieving, so we can avoid # waiting for the same file twice. The elements in this set are # `ArchiveFile.id`s self._restoring = set() - # The time.monotonic value after which we should try to restore again. - # Keys are elements in self._restoring. - self._restore_retry = dict() - # For informational purposes. Keys are elements in self._restoring. self._restore_start = dict() @@ -140,14 +132,28 @@ def _restore_wait(self, copy: ArchiveFileCopy) -> bool | None: # What's the current situation? state = self._lfs.hsm_state(copy.path) + if state == None: + log.warning(f"Unable to restore {copy.path}: state check failed.") + self._restore_start.pop(copy.file.id, None) + self._restoring.discard(copy.file.id) + return None + if state == self._lfs.HSM_MISSING: log.warning(f"Unable to restore {copy.path}: missing.") self._restore_start.pop(copy.file.id, None) - self._restore_retry.pop(copy.file.id, None) self._restoring.discard(copy.file.id) return None - if state != self._lfs.HSM_RELEASED: + if state == self._lfs.HSM_RESTORING: + if copy.file.id not in self._restoring: + self._restoring.add(copy.file.id) + self._restore_start[copy.file.id] = time.monotonic() + + log.debug(f"Restore in progress: {copy.path}") + + # Tell the caller to wait + return True + elif state != self._lfs.HSM_RELEASED: # i.e. file is restored or unarchived, so we're done. 
if copy.file.id not in self._restoring: log.debug(f"Already restored: {copy.path}") @@ -157,54 +163,30 @@ def _restore_wait(self, copy: ArchiveFileCopy) -> bool | None: f"{copy.file.path} restored on node {self.node.name}" f"after {pretty_deltat(deltat)}" ) - self._restore_start.pop(copy.file.id, None) - self._restore_retry.pop(copy.file.id, None) + del self._restore_start[copy.file.id] self._restoring.discard(copy.file.id) return False # If we got here, copy is released. + if copy.file.id not in self._restoring: + self._restoring.add(copy.file.id) + self._restore_start[copy.file.id] = time.monotonic() - # Have we hit the retry time for an in-progress restore? - retry_restore = ( - copy.file.id in self._restoring - and self._restore_retry[copy.file.id] <= time.monotonic() - ) - - if retry_restore or copy.file.id not in self._restoring: - # Add this copy to the list of copies we're waiting on. Other tasks - # can use this to see if the file they're interested in is already - # "in use". - if not retry_restore: - self._restoring.add(copy.file.id) - self._restore_start[copy.file.id] = time.monotonic() - - # Try to restore it. - result = self._lfs.hsm_restore(copy.path) - log.warning(f"restore result: {result}") + # Try to restore it. + result = self._lfs.hsm_restore(copy.path) - if result is False: - # Reqeust failed. Abandon the restore attempt entirely, - # in case it was deleted from the node. - self._restore_retry.pop(copy.file.id, None) - self._restore_start.pop(copy.file.id, None) - self._restoring.discard(copy.file.id) + if result is False: + log.warning(f"Restore request failed: {copy.path}") + # Reqeust failed. Abandon the restore attempt entirely, + # in case it was deleted from the node. + del self._restore_start[copy.file.id] + self._restoring.discard(copy.file.id) - # Report failure - return None - elif result is None: - # Request timeout. This seems to happen when the file - # is already being restored, but let's try again in a - # little while, just to be safe. - self._restore_retry[copy.file.id] = ( - time.monotonic() + RESTORE_TIMEOUT_RETRY - ) - else: - # Request success; restore should be in-progress, but - # we'll still schedule a retry for some time in the future - # to guard against HSM forgetting/abandonning our request - self._restore_retry[copy.file.id] = ( - time.monotonic() + RESTORE_SUCCESS_RETRY - ) + # Report failure + return None + elif result is None: + # Might have worked. Caller should check again in a bit. + log.warning(f"Restore request timeout: {copy.path}") # Tell the caller to wait return True @@ -217,7 +199,13 @@ def release_files(self) -> None: to happen (i.e. the node is currently idle). """ - headroom_needed = self._headroom - self._lfs.quota_remaining(self.node.root) + # Can't do anything if we don't know how much free space we have + size_gib = self.node.avail_gb + if size_gib is None: + log.debug("Skipping release_files: free space unknown.") + return + + headroom_needed = self._headroom - int(size_gib * 2**30) # Nothing to do if headroom_needed <= 0: @@ -238,8 +226,8 @@ def _async(task, node, lfs, headroom_needed): ) .order_by(ArchiveFileCopy.last_update) ): - # Skip unarchived files - if not lfs.hsm_archived(copy.path): + # The only files we can release are ones that are fully restored + if lfs.hsm_state(copy.path) != lfs.HSM_RESTORED: continue log.debug( @@ -308,9 +296,9 @@ def idle_update(self, newly_idle) -> None: super().idle_update(newly_idle) # Check the query walker. Initialised if necessary. 
- if self._release_qw is None: + if self._statecheck_qw is None: try: - self._release_qw = QueryWalker( + self._statecheck_qw = QueryWalker( ArchiveFileCopy, ArchiveFileCopy.node == self.node, ArchiveFileCopy.has_file == "Y", @@ -320,17 +308,21 @@ def idle_update(self, newly_idle) -> None: # Try to get a bunch of copies to check try: - copies = self._release_qw.get(self._nrelease) + copies = self._statecheck_qw.get(self._nrelease) except pw.DoesNotExist: # Not sure why all the file copies have gone away, but given there's # nothing on the node now, can't hurt to re-init the QW in this case - self._release_qw = None + self._statecheck_qw = None return def _async(task, node, lfs, copies): for copy in copies: state = lfs.hsm_state(copy.path) - if state == lfs.HSM_MISSING: + if state is None: + log.warning( + f"Unable to determine state for {copy.file.path} on node {node.name}." + ) + elif state == lfs.HSM_MISSING: # File is unexpectedly gone. log.warning( f"File copy {copy.file.path} on node {node.name} is missing!" @@ -338,7 +330,7 @@ def _async(task, node, lfs, copies): ArchiveFileCopy.update( has_file="N", ready=False, last_update=utcnow() ).where(ArchiveFileCopy.id == copy.id).execute() - elif state == lfs.HSM_RELEASED: + elif state == lfs.HSM_RELEASED or state == lfs.HSM_RESTORING: if copy.ready: log.info(f"Updating file copy {copy.file.path}: ready -> False") ArchiveFileCopy.update(ready=False, last_update=utcnow()).where( @@ -452,6 +444,19 @@ def check_active(self) -> bool: """ return self.node.active + def exists(self, path: pathlib.PurePath) -> bool: + """Does `path` exist? + + Checks whether `lfs hsm_state` returns ENOENT. + + Parameters + ---------- + path : pathlib.PurePath + path relative to `node.root` + """ + full_path = pathlib.PurePath(self.node.root).joinpath(path) + return self._lfs.hsm_state(full_path) != self._lfs.HSM_MISSING + def filesize(self, path: pathlib.Path, actual: bool = False) -> int: """Return size in bytes of the file given by `path`. @@ -508,7 +513,10 @@ def open(self, path: os.PathLike | str, binary: bool = True) -> IO: # Make abs path p = pathlib.Path(self.node.root, path) - if self._lfs.hsm_released(p): + state = self._lfs.hsm_state(p) + if state is None: + raise OSError(f"Can't get state for {path}.") + if state != self._lfs.HSM_RESTORED and state != self._lfs.HSM_UNARCHIVED: raise OSError(f"{path} is not restored.") return open(p, mode="rb" if binary else "rt") @@ -559,7 +567,7 @@ def _async(task: Task, node_io: LustreHSMNodeIO, file_: ArchiveFile): restore_wait = node_io._restore_wait(copy) while restore_wait: # Wait for a bit - yield 60 + yield 600 # Now check again restore_wait = node_io._restore_wait(copy) diff --git a/alpenhorn/server/update.py b/alpenhorn/server/update.py index 27df4d081..ed2c9a1e4 100644 --- a/alpenhorn/server/update.py +++ b/alpenhorn/server/update.py @@ -301,8 +301,10 @@ def update_free_space(self) -> None: self.db.update_avail_gb(bytes_avail) - if bytes_avail is not None: - log.info(f"Node {self.name}: {util.pretty_bytes(bytes_avail)} available.") + if self.db.avail_gb is not None: + log.info( + f"Node {self.name}: {util.pretty_bytes(self.db.avail_gb * 2**30)} available." + ) def run_auto_verify(self) -> None: """Run auto-verification on this node. 
diff --git a/tests/conftest.py b/tests/conftest.py index 8dc99ed7a..0af8d8861 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -55,7 +55,7 @@ def pytest_configure(config): "lfs_hsm_restore_result(result): " "used on tests which mock alpenhorn.io.lfs.LFS " "to indicate the result of the hsm_restore call. result " - "may be 'fail', 'timeout', or 'wait'", + "may be 'fail', 'timeout', 'wait', or 'restore'", ) config.addinivalue_line( "markers", @@ -242,6 +242,8 @@ def _mocked_lfs_hsm_state(self, path): return HSMState.UNARCHIVED if state == "restored": return HSMState.RESTORED + if state == "restoring": + return HSMState.RESTORING if state == "released": return HSMState.RELEASED @@ -250,6 +252,9 @@ def _mocked_lfs_hsm_state(self, path): def _mocked_lfs_hsm_restore(self, path): nonlocal request, lfs_hsm_state + # de-pathlib-ify + path = str(path) + marker = request.node.get_closest_marker("lfs_hsm_restore_result") if marker: if marker.args[0] == "fail": @@ -260,16 +265,21 @@ def _mocked_lfs_hsm_restore(self, path): return None if marker.args[0] == "wait": # Return true (successful request) - # without changing state to "restored" + # without full restore + lfs_hsm_state[path] = "restoring" + return True + if marker.args[0] == "restore": + # Return true (successful request) + # with full restore + lfs_hsm_state[path] = "restored" return True - - # de-pathlib-ify - path = str(path) state = lfs_hsm_state.get(path, "missing") if state == "missing": return False if state == "released": + lfs_hsm_state[path] = "restoring" + elif state == "restoring": lfs_hsm_state[path] = "restored" return True @@ -280,12 +290,14 @@ def _mocked_lfs_hsm_release(self, path): path = str(path) state = lfs_hsm_state.get(path, "missing") - if state == "missing" or state == "unarchived": - return False + if state == "released": + return True if state == "restored": lfs_hsm_state[path] = "released" + return True - return True + # Missing, unarchived, or restoring + return False marker = request.node.get_closest_marker("lfs_quota_remaining") if marker is None: diff --git a/tests/db/test_storage.py b/tests/db/test_storage.py index 5c7582863..bfc1aa060 100644 --- a/tests/db/test_storage.py +++ b/tests/db/test_storage.py @@ -368,13 +368,22 @@ def test_update_avail_gb(simplenode): node = StorageNode.get(id=simplenode.id) after = pw.utcnow() - assert node.avail_gb == 10000.0 / 2.0**30 + avail = node.avail_gb + assert avail == 10000.0 / 2.0**30 assert node.avail_gb_last_checked >= before assert node.avail_gb_last_checked <= after - # Test None + # Reset time + StorageNode.update(avail_gb_last_checked=0).where( + StorageNode.id == simplenode.id + ).execute() + + # Test None -- shouldn't change value, but last + # update has happened simplenode.update_avail_gb(None) - assert StorageNode.get(id=simplenode.id).avail_gb is None + node = StorageNode.get(id=simplenode.id) + assert node.avail_gb == avail + assert node.avail_gb_last_checked >= after def test_edge_model(storagetransferaction, storagenode, storagegroup): diff --git a/tests/io/test_lfs.py b/tests/io/test_lfs.py index 9a54c3faf..689781a24 100644 --- a/tests/io/test_lfs.py +++ b/tests/io/test_lfs.py @@ -4,6 +4,7 @@ import pathlib from alpenhorn.io.lfs import LFS +from alpenhorn.db import StorageNode @pytest.mark.run_command_result(0, "lfs_out", "lfs_err") @@ -12,11 +13,16 @@ def test_run_lfs_success(have_lfs, mock_run_command): lfs = LFS(None) - assert lfs.run_lfs("arg1", "arg2") == "lfs_out" + assert lfs.run_lfs("arg1", "arg2") == { + "failed": False, + "missing": 
False, + "output": "lfs_out", + "timeout": False, + } assert mock_run_command() == { "cmd": ["LFS", "arg1", "arg2"], "kwargs": dict(), - "timeout": None, + "timeout": 60, } @@ -26,11 +32,11 @@ def test_run_lfs_stringify(have_lfs, mock_run_command): lfs = LFS(None) - assert lfs.run_lfs(pathlib.Path("path"), 2) == "lfs_out" + assert lfs.run_lfs(pathlib.Path("path"), 2)["output"] == "lfs_out" assert mock_run_command() == { "cmd": ["LFS", "path", "2"], "kwargs": dict(), - "timeout": None, + "timeout": 60, } @@ -40,11 +46,56 @@ def test_run_lfs_fail(have_lfs, mock_run_command): lfs = LFS(None) - assert lfs.run_lfs("arg1", "arg2") is False + assert lfs.run_lfs("arg1", "arg2") == { + "failed": True, + "missing": False, + "output": None, + "timeout": False, + } + assert mock_run_command() == { + "cmd": ["LFS", "arg1", "arg2"], + "kwargs": dict(), + "timeout": 60, + } + + +@pytest.mark.run_command_result(None, "lfs_out", "lfs_err") +def test_run_lfs_timeout(have_lfs, mock_run_command): + """Test timed out invocation of lfs.run_lfs.""" + + lfs = LFS(None) + + assert lfs.run_lfs("arg1", "arg2") == { + "failed": False, + "missing": False, + "output": None, + "timeout": True, + } + assert mock_run_command() == { + "cmd": ["LFS", "arg1", "arg2"], + "kwargs": dict(), + "timeout": 60, + } + + +@pytest.mark.run_command_result( + 2, "lfs_out", "Something didn't work: No such file or directory" +) +def test_run_lfs_missing(have_lfs, mock_run_command): + """Test ENOENT from lfs.run_fls.""" + + lfs = LFS(None) + + assert lfs.run_lfs("arg1", "arg2") == { + "failed": False, + "missing": True, + "output": None, + "timeout": False, + } assert mock_run_command() == { "cmd": ["LFS", "arg1", "arg2"], "kwargs": dict(), - "timeout": None, + "timeout": 60, } @@ -143,7 +194,9 @@ def test_quota_fixed(have_lfs, mock_run_command): assert lfs.quota_remaining("/path") == (2500 - 1234) * 2**10 -@pytest.mark.run_command_result(1, None, None) +@pytest.mark.run_command_result( + 2, "", "Something didn't work: No such file or directory" +) def test_hsm_state_missing(xfs, have_lfs, mock_run_command): """Test hsm_state on a missing file.""" diff --git a/tests/io/test_lustrehsmgroup.py b/tests/io/test_lustrehsmgroup.py index 0eab04502..b22095cf2 100644 --- a/tests/io/test_lustrehsmgroup.py +++ b/tests/io/test_lustrehsmgroup.py @@ -131,6 +131,13 @@ def test_idle(queue, group): assert group.idle is True +@pytest.mark.lfs_hsm_state( + { + "/hsm/test/one": "restored", + "/hsm/test/two": "released", + "/hsm/test/three": "missing", + } +) def test_exists(xfs, group): """Test TransportGroupIO.exists().""" group, hsm, smallfile = group diff --git a/tests/io/test_lustrehsmnode.py b/tests/io/test_lustrehsmnode.py index babed8534..268627db7 100644 --- a/tests/io/test_lustrehsmnode.py +++ b/tests/io/test_lustrehsmnode.py @@ -21,7 +21,7 @@ def node( simplenode.io_class = "LustreHSM" simplenode.io_config = ( - '{"quota_group": "qgroup", "headroom": 10250, "release_check_count": 6}' + '{"quota_group": "qgroup", "headroom": 10250, "release_check_count": 7}' ) # Some files @@ -32,10 +32,11 @@ def node( archivefile(name="file4", acq=simpleacq, size_b=800000), archivefile(name="file5", acq=simpleacq, size_b=50000), archivefile(name="file6", acq=simpleacq, size_b=300000), + archivefile(name="file7", acq=simpleacq, size_b=200000), ] # Some copies - last_updates = [3, 1, 5, 6, 2, 4] + last_updates = [3, 1, 6, 7, 2, 4, 5] for num, file in enumerate(files): archivefilecopy(file=file, node=simplenode, has_file="Y", size_b=10, ready=True) # We need to do it 
this way to set last_update @@ -68,17 +69,18 @@ def test_init_bad_release_count(simplenode, have_lfs): UpdateableNode(None, simplenode) -@pytest.mark.lfs_quota_remaining(20000000) def test_release_files_okay(queue, node): """Test running release_files when we're under headroom""" + node.db.avail_gb = 20000000.0 / 2**30 + node.db.save() + node.io.release_files() # Shouldn't be anything in the queue assert queue.qsize == 0 -@pytest.mark.lfs_quota_remaining(10000000) @pytest.mark.lfs_hsm_state( { "/node/simpleacq/file1": "restored", @@ -87,11 +89,18 @@ def test_release_files_okay(queue, node): "/node/simpleacq/file4": "restored", "/node/simpleacq/file5": "restored", "/node/simpleacq/file6": "unarchived", + "/node/simpleacq/file7": "restoring", } ) def test_release_files(queue, mock_lfs, node): """Test running release_files.""" + node.db.avail_gb = 10000000.0 / 2**30 + node.db.save() + + # File7 is not ready + ArchiveFileCopy.update(ready=False).where(ArchiveFileCopy.id == 7).execute() + node.io.release_files() before = pw.utcnow().replace(microsecond=0) @@ -110,7 +119,8 @@ def test_release_files(queue, mock_lfs, node): # - file5: 50 kB [ last_update = 2 ] # - file1: 100 kB [ last_update = 3 ] # - file6 is skipped because it's not archived [last_update = 4] - # - file3: 400 kB [ last_update = 5 ] + # - file7 is skipped because it's being restored [last_update = 5] + # - file3: 400 kB [ last_update = 6 ] # file4 remains restored assert not ArchiveFileCopy.get(id=1).ready assert ArchiveFileCopy.get(id=1).last_update >= before @@ -126,7 +136,8 @@ def test_release_files(queue, mock_lfs, node): assert not ArchiveFileCopy.get(id=5).ready assert ArchiveFileCopy.get(id=5).last_update >= before - assert ArchiveFileCopy.get(id=6).ready + assert ArchiveFileCopy.get(id=6).last_update == 4 + assert ArchiveFileCopy.get(id=7).last_update == 5 # Check hsm_relase was actually called lfs = mock_lfs("") @@ -136,12 +147,15 @@ def test_release_files(queue, mock_lfs, node): assert lfs.hsm_state("/node/simpleacq/file4") == lfs.HSM_RESTORED assert lfs.hsm_state("/node/simpleacq/file5") == lfs.HSM_RELEASED assert lfs.hsm_state("/node/simpleacq/file6") == lfs.HSM_UNARCHIVED + assert lfs.hsm_state("/node/simpleacq/file7") == lfs.HSM_RESTORING -@pytest.mark.lfs_quota_remaining(10000000) def test_before_update(queue, node): """Test LustreHSMNodeIO.before_update()""" + node.db.avail_gb = 10000000.0 / 2**30 + node.db.save() + # When not idle, the release_files task is not run node.io.before_update(idle=False) @@ -164,36 +178,48 @@ def test_filesize(xfs, node): { "/node/dir/file1": "released", "/node/dir/file2": "restored", + "/node/dir/file3": "restoring", } ) def test_open_binary(xfs, node): """Test binary LustreHSMNodeIO.open()""" - xfs.create_file("/node/dir/file2", contents="file contents") + xfs.create_file("/node/dir/file1", contents="file1 contents") + xfs.create_file("/node/dir/file2", contents="file2 contents") + xfs.create_file("/node/dir/file3", contents="file3 contents") with pytest.raises(OSError): node.io.open("dir/file1", binary=True) with node.io.open("dir/file2", binary=True) as f: - assert f.read() == b"file contents" + assert f.read() == b"file2 contents" + + with pytest.raises(OSError): + node.io.open("dir/file3", binary=True) @pytest.mark.lfs_hsm_state( { "/node/dir/file1": "released", "/node/dir/file2": "restored", + "/node/dir/file3": "restoring", } ) def test_open_text(xfs, node): """Test text LustreHSMNodeIO.open()""" - xfs.create_file("/node/dir/file2", contents="file contents") + 
xfs.create_file("/node/dir/file1", contents="file1 contents") + xfs.create_file("/node/dir/file2", contents="file2 contents") + xfs.create_file("/node/dir/file3", contents="file3 contents") with pytest.raises(OSError): node.io.open("dir/file1", binary=False) with node.io.open("dir/file2", binary=False) as f: - assert f.read() == "file contents" + assert f.read() == "file2 contents" + + with pytest.raises(OSError): + node.io.open("dir/file3", binary=False) def test_check_missing(queue, node): @@ -258,6 +284,7 @@ def test_check_ready_restored(xfs, queue, node, mock_lfs): "/node/simpleacq/file1": "released", } ) +@pytest.mark.lfs_hsm_restore_result("restore") def test_check_released(xfs, queue, mock_lfs, node): """Test check on a non-ready, released file.""" @@ -301,6 +328,7 @@ def test_check_released(xfs, queue, mock_lfs, node): "/node/simpleacq/file1": "released", } ) +@pytest.mark.lfs_hsm_restore_result("restore") def test_check_ready_released(xfs, queue, mock_lfs, node): """Test check on a ready, released file.""" @@ -341,6 +369,7 @@ def test_check_ready_released(xfs, queue, mock_lfs, node): "/node/simpleacq/file2": "released", "/node/simpleacq/file3": "unarchived", "/node/simpleacq/file4": "missing", + "/node/simpleacq/file5": "restoring", } ) def test_ready_path(mock_lfs, node): @@ -351,13 +380,15 @@ def test_ready_path(mock_lfs, node): assert not node.io.ready_path("/node/simpleacq/file2") assert node.io.ready_path("/node/simpleacq/file3") assert not node.io.ready_path("/node/simpleacq/file4") + assert not node.io.ready_path("/node/simpleacq/file5") # But now released file is recalled. lfs = mock_lfs("") assert lfs.hsm_state("/node/simpleacq/file1") == lfs.HSM_RESTORED - assert lfs.hsm_state("/node/simpleacq/file2") == lfs.HSM_RESTORED + assert lfs.hsm_state("/node/simpleacq/file2") == lfs.HSM_RESTORING assert lfs.hsm_state("/node/simpleacq/file3") == lfs.HSM_UNARCHIVED assert lfs.hsm_state("/node/simpleacq/file4") == lfs.HSM_MISSING + assert lfs.hsm_state("/node/simpleacq/file5") == lfs.HSM_RESTORING @pytest.mark.lfs_hsm_state( @@ -396,6 +427,41 @@ def test_ready_pull_restored(mock_lfs, node, queue, archivefilecopyrequest): assert lfs.hsm_state(copy.path) == lfs.HSM_RESTORED +@pytest.mark.lfs_hsm_state( + { + "/node/simpleacq/file1": "restoring", + } +) +def test_ready_pull_restoring(mock_lfs, node, queue, archivefilecopyrequest): + """Test LustreHSMNodeIO.ready_pull on file already being restored.""" + + before = pw.utcnow().replace(microsecond=0) + + copy = ArchiveFileCopy.get(id=1) + copy.ready = False + copy.save() + afcr = archivefilecopyrequest( + file=copy.file, node_from=node.db, group_to=node.db.group + ) + + node.io.ready_pull(afcr) + + # Task in queue + assert queue.qsize == 1 + + # Run task + task, key = queue.get() + task() + queue.task_done(key) + + # File is still not ready + assert not ArchiveFileCopy.get(id=1).ready + + # File is still being restored + lfs = mock_lfs("") + assert lfs.hsm_state(copy.path) == lfs.HSM_RESTORING + + @pytest.mark.lfs_hsm_state( { "/node/simpleacq/file1": "released", @@ -430,12 +496,9 @@ def test_ready_pull_released(mock_lfs, node, queue, archivefilecopyrequest): # Don't wait for the deferral to expire, just run the task again task() - # File is now ready - assert ArchiveFileCopy.get(id=1).ready - - # File is restored + # File is being restored lfs = mock_lfs("") - assert lfs.hsm_state(copy.path) == lfs.HSM_RESTORED + assert lfs.hsm_state(copy.path) == lfs.HSM_RESTORING def test_idle_update_empty(queue, mock_lfs, node): @@ -447,7 +510,7 @@ 
def test_idle_update_empty(queue, mock_lfs, node): node.io.idle_update(False) # QW has not been initialised - assert node.io._release_qw is None + assert node.io._statecheck_qw is None # No item in queue assert queue.qsize == 0 @@ -459,6 +522,7 @@ def test_idle_update_empty(queue, mock_lfs, node): "/node/simpleacq/file2": "restored", "/node/simpleacq/file3": "unarchived", "/node/simpleacq/file4": "missing", + "/node/simpleacq/file5": "restoring", } ) def test_idle_update_ready(xfs, queue, mock_lfs, node): @@ -469,7 +533,7 @@ def test_idle_update_ready(xfs, queue, mock_lfs, node): node.io.idle_update(False) # QW has been initialised - assert node.io._release_qw is not None + assert node.io._statecheck_qw is not None # Item in queue assert queue.qsize == 1 @@ -492,6 +556,9 @@ def test_idle_update_ready(xfs, queue, mock_lfs, node): assert ArchiveFileCopy.get(id=4).last_update >= before assert ArchiveFileCopy.get(id=4).has_file == "N" + # Copy five is not ready (being restored) + assert not ArchiveFileCopy.get(id=5).ready + @pytest.mark.lfs_hsm_state( { @@ -499,6 +566,7 @@ def test_idle_update_ready(xfs, queue, mock_lfs, node): "/node/simpleacq/file2": "restored", "/node/simpleacq/file3": "unarchived", "/node/simpleacq/file4": "missing", + "/node/simpleacq/file5": "restoring", } ) def test_idle_update_not_ready(xfs, queue, mock_lfs, node): @@ -512,7 +580,7 @@ def test_idle_update_not_ready(xfs, queue, mock_lfs, node): node.io.idle_update(False) # QW has been initialised - assert node.io._release_qw is not None + assert node.io._statecheck_qw is not None # Item in queue assert queue.qsize == 1 @@ -536,6 +604,9 @@ def test_idle_update_not_ready(xfs, queue, mock_lfs, node): assert ArchiveFileCopy.get(id=4).last_update >= before assert ArchiveFileCopy.get(id=4).has_file == "N" + # Copy five is not ready (being restored) + assert not ArchiveFileCopy.get(id=5).ready + @pytest.mark.lfs_hsm_state({"/node/simpleacq/file1": "released"}) @pytest.mark.lfs_hsm_restore_result("wait") @@ -567,7 +638,6 @@ def test_hsm_restore_twice(xfs, queue, mock_lfs, node): # Check the internal bookkeeping assert copy.id in node.io._restoring assert copy.id in node.io._restore_start - assert copy.id in node.io._restore_retry # Try to add another task node.io.check(copy) @@ -609,7 +679,6 @@ def test_hsm_restore_timeout(xfs, queue, mock_lfs, node): # Check the internal bookkeeping assert copy.id in node.io._restoring assert copy.id in node.io._restore_start - assert copy.id in node.io._restore_retry @pytest.mark.lfs_hsm_state({"/node/simpleacq/file1": "released"}) @@ -642,4 +711,3 @@ def test_hsm_restore_fail(xfs, queue, mock_lfs, node): # Check the internal bookkeeping assert copy.id not in node.io._restoring assert copy.id not in node.io._restore_start - assert copy.id not in node.io._restore_retry diff --git a/tests/server/test_auto_import.py b/tests/server/test_auto_import.py index 2b7310f2a..0481fd2c7 100644 --- a/tests/server/test_auto_import.py +++ b/tests/server/test_auto_import.py @@ -9,6 +9,7 @@ from alpenhorn.db.archive import ArchiveFileCopy from alpenhorn.db.acquisition import ArchiveAcq, ArchiveFile +from alpenhorn.io.lfs import HSMState from alpenhorn.server import auto_import from alpenhorn.server.update import UpdateableNode @@ -54,8 +55,8 @@ def test_import_file_not_ready(dbtables, queue, simplenode, mock_lfs): # _import_file is a generator function, so it needs to be interated to run. 
assert next(auto_import._import_file(None, unode, pathlib.PurePath("acq/file"))) > 0 - # File has been restored - assert not mock_lfs("").hsm_released("/node/acq/file") + # File is being restored + assert mock_lfs("").hsm_state("/node/acq/file") == HSMState.RESTORING def test_import_file_no_ext(dbtables, unode): diff --git a/tests/server/test_service.py b/tests/server/test_service.py index 567ea1bfa..573a065d7 100644 --- a/tests/server/test_service.py +++ b/tests/server/test_service.py @@ -109,6 +109,7 @@ def e2e_db(xfs, clidb_noinit, hostname): root="/nl1", host=hostname, active=True, + avail_gb=2000.0 / 2**30, # This is mostly ignored io_config='{"quota_group": "qgroup", "headroom": 10}', ) @@ -128,6 +129,7 @@ def e2e_db(xfs, clidb_noinit, hostname): host=hostname, active=True, io_config='{"quota_group": "qgroup", "headroom": 1, "release_check_count": 1}', + avail_gb=2000.0 / 2**30, ) StorageNode.create(name="sf2", group=nlgrp, root="/sf2", host=hostname, active=True) xfs.create_file("/sf2/ALPENHORN_NODE", contents="sf2") @@ -305,7 +307,6 @@ def e2e_config(xfs, hostname, clidb_uri): xfs.create_file("/etc/alpenhorn/alpenhorn.conf", contents=yaml.dump(config)) -@pytest.mark.lfs_quota_remaining(2000) @pytest.mark.lfs_hsm_state( { "/nl2/acq1/correct.me": "restored", @@ -323,7 +324,7 @@ def test_cli(e2e_db, e2e_config, mock_lfs, mock_rsync, loop_once): # Check HSM lfs = mock_lfs(quota_group="qgroup") assert lfs.hsm_state("/nl2/acq1/correct.me") == lfs.HSM_RESTORED - assert lfs.hsm_state("/nl1/acq1/restore.me") == lfs.HSM_RESTORED + assert lfs.hsm_state("/nl1/acq1/restore.me") == lfs.HSM_RESTORING assert lfs.hsm_state("/nl1/acq1/release.me") == lfs.HSM_RELEASED # Check results