fix!: Don't panic when reading empty files (#381)
There was a bug when using partition_point, which gives no guarantees about its behavior in this case, so reading an empty file could panic. Hence the whole logic was refactored and unit/integration tests were added.

closes rustic-rs/rustic#1380
aawsome authored Dec 10, 2024
1 parent bf6c7d0 commit 70bc278
Showing 3 changed files with 107 additions and 54 deletions.
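
The root cause can be illustrated with a minimal standalone sketch (an assumption based on the old OpenFile code below, not part of this commit): for an empty file the old content vector held only the starts_at: usize::MAX dummy entry, so partition_point returned 0 and the subsequent "- 1" underflowed.

// Minimal sketch (assumption, not part of this commit) of the old failure mode.
fn main() {
    // For an empty file the old code kept only the dummy sentinel entry.
    let starts_at: Vec<usize> = vec![usize::MAX];
    let offset = 0usize;
    // No element satisfies the predicate, so partition_point returns 0 ...
    let i = starts_at.partition_point(|s| *s <= offset);
    assert_eq!(i, 0);
    // ... and the old `partition_point(...) - 1` would underflow and panic:
    // let blob_index = i - 1; // attempt to subtract with overflow
}
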
2 changes: 1 addition & 1 deletion crates/core/src/repository.rs
@@ -1605,7 +1605,7 @@ impl<P, S: IndexedFull> Repository<P, S> {
///
// TODO: Document errors
pub fn open_file(&self, node: &Node) -> RusticResult<OpenFile> {
Ok(OpenFile::from_node(self, node))
OpenFile::from_node(self, node)
}

/// Reads an opened file at the given position
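
With this change, open_file propagates errors from OpenFile::from_node instead of always returning Ok. A caller-side sketch (hypothetical variable names, based on the API exercised in the integration test below):

// Hedged usage sketch: both calls can now return an error (e.g. a data blob
// missing from the index) instead of panicking.
let open = repo.open_file(node)?;
let bytes = repo.read_file_at(&open, 0, 1024)?; // read up to 1 KiB from offset 0
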
151 changes: 100 additions & 51 deletions crates/core/src/vfs.rs
@@ -455,17 +455,8 @@ impl Vfs {
#[derive(Debug)]
pub struct OpenFile {
// The list of blobs
content: Vec<BlobInfo>,
}

// Information about the blob: 1) The id 2) The cumulated sizes of all blobs prior to this one, a.k.a the starting point of this blob.
#[derive(Debug)]
struct BlobInfo {
// [`Id`] of the blob
id: DataId,

// the start position of this blob within the file
starts_at: usize,
content: Vec<DataId>,
startpoints: ContentStartpoints,
}

impl OpenFile {
@@ -477,37 +468,32 @@ impl OpenFile {
/// * `node` - The `Node` to create the `OpenFile` for
///
/// # Errors
///
// TODO: Document errors
/// - If the index for the needed data blobs cannot be read
///
/// # Returns
///
/// The created `OpenFile`
///
/// # Panics
///
/// * Panics if the `Node` has no content
pub fn from_node<P, S: IndexedFull>(repo: &Repository<P, S>, node: &Node) -> Self {
let mut start = 0;
let mut content: Vec<_> = node
.content
.as_ref()
.unwrap()
.iter()
.map(|id| {
let starts_at = start;
start += repo.index().get_data(id).unwrap().data_length() as usize;
BlobInfo { id: *id, starts_at }
})
.collect();

// content is assumed to be partitioned, so we add a starts_at:MAX entry
content.push(BlobInfo {
id: DataId::default(),
starts_at: usize::MAX,
});

Self { content }
pub(crate) fn from_node<P, S: IndexedFull>(
repo: &Repository<P, S>,
node: &Node,
) -> RusticResult<Self> {
let content: Vec<_> = node.content.clone().unwrap_or_default();

let startpoints = ContentStartpoints::from_sizes(content.iter().map(|id| {
Ok(repo
.index()
.get_data(id)
.ok_or_else(|| {
RusticError::new(ErrorKind::Vfs, "blob {blob} is not contained in index")
.attach_context("blob", id.to_string())
})?
.data_length() as usize)
}))?;

Ok(Self {
content,
startpoints,
})
}

/// Read the `OpenFile` at the given `offset` from the `repo`.
@@ -520,7 +506,7 @@
///
/// # Errors
///
// TODO: Document errors
/// - if reading the needed blob(s) from the backend fails
///
/// # Returns
///
@@ -530,36 +516,99 @@
pub fn read_at<P, S: IndexedFull>(
&self,
repo: &Repository<P, S>,
mut offset: usize,
offset: usize,
mut length: usize,
) -> RusticResult<Bytes> {
// find the start of relevant blobs => find the largest index such that self.content[i].starts_at <= offset, but
// self.content[i+1] > offset (note that a last dummy element has been added)
let mut i = self.content.partition_point(|c| c.starts_at <= offset) - 1;

offset -= self.content[i].starts_at;
let (mut i, mut offset) = self.startpoints.compute_start(offset);

let mut result = BytesMut::with_capacity(length);

while length > 0 && i < self.content.len() - 1 {
let data = repo.get_blob_cached(&BlobId::from(*self.content[i].id), BlobType::Data)?;
// The case of empty node.content is also correctly handled here
while length > 0 && i < self.content.len() {
let data = repo.get_blob_cached(&BlobId::from(self.content[i]), BlobType::Data)?;

if offset > data.len() {
// we cannot read beyond the blob. This only happens if the offset is too large to fit in the last blob
break;
}

let to_copy = (data.len() - offset).min(length);

result.extend_from_slice(&data[offset..offset + to_copy]);

offset = 0;

length -= to_copy;

i += 1;
}

Ok(result.into())
}
}

// helper struct holding blob startpoints of the content
#[derive(Debug)]
struct ContentStartpoints(Vec<usize>);

impl ContentStartpoints {
fn from_sizes(sizes: impl IntoIterator<Item = RusticResult<usize>>) -> RusticResult<Self> {
let mut start = 0;
let mut offsets: Vec<_> = sizes
.into_iter()
.map(|size| -> RusticResult<_> {
let starts_at = start;
start += size?;
Ok(starts_at)
})
.collect::<RusticResult<_>>()?;

if !offsets.is_empty() {
// offsets is assumed to be partitioned, so we add a starts_at:MAX entry
offsets.push(usize::MAX);
}
Ok(Self(offsets))
}

// compute the index of the relevant blob and the effective offset within it from a file offset
fn compute_start(&self, mut offset: usize) -> (usize, usize) {
if self.0.is_empty() {
return (0, 0);
}
// find the start of relevant blobs => find the largest index such that self.offsets[i] <= offset, but
// self.offsets[i+1] > offset (note that a last dummy element with usize::MAX has been added to ensure we always have two partitions)
// If offsets is non-empty, then offsets[0] = 0, hence partition_point returns an index >=1.
let i = self.0.partition_point(|o| o <= &offset) - 1;
offset -= self.0[i];
(i, offset)
}
}

#[cfg(test)]
mod tests {
use super::*;

// helper func
fn startpoints_from_ok_sizes(sizes: impl IntoIterator<Item = usize>) -> ContentStartpoints {
ContentStartpoints::from_sizes(sizes.into_iter().map(Ok)).unwrap()
}

#[test]
fn content_offsets_empty_sizes() {
let offsets = startpoints_from_ok_sizes([]);
assert_eq!(offsets.compute_start(0), (0, 0));
assert_eq!(offsets.compute_start(42), (0, 0));
}

#[test]
fn content_offsets_size() {
let offsets = startpoints_from_ok_sizes([15]);
assert_eq!(offsets.compute_start(0), (0, 0));
assert_eq!(offsets.compute_start(5), (0, 5));
assert_eq!(offsets.compute_start(20), (0, 20));
}
#[test]
fn content_offsets_sizes() {
let offsets = startpoints_from_ok_sizes([15, 24]);
assert_eq!(offsets.compute_start(0), (0, 0));
assert_eq!(offsets.compute_start(5), (0, 5));
assert_eq!(offsets.compute_start(20), (1, 5));
assert_eq!(offsets.compute_start(42), (1, 27));
}
}
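
For orientation, a small sketch (an assumption, not part of the commit) of what from_sizes builds internally for blob sizes [15, 24] and how compute_start maps file offsets onto it; it could sit alongside the tests above:

// Sketch (assumption): cumulative starts are [0, 15]; the trailing usize::MAX
// sentinel closes the last partition so partition_point always returns an
// index >= 1 for any finite offset.
#[test]
fn content_offsets_internal_layout_sketch() {
    let sp = startpoints_from_ok_sizes([15, 24]);
    assert_eq!(sp.0, vec![0, 15, usize::MAX]);
    assert_eq!(sp.compute_start(14), (0, 14)); // last byte of blob 0
    assert_eq!(sp.compute_start(15), (1, 0)); // first byte of blob 1
}
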
8 changes: 6 additions & 2 deletions crates/core/tests/integration.rs
@@ -456,11 +456,15 @@ fn test_ls_and_read(

let data = repo.read_file_at(&file, 0, 21)?; // read full content
assert_eq!(Bytes::from("This is a test file.\n"), &data);
let data2 = repo.read_file_at(&file, 0, 4096)?; // read beyond file end
assert_eq!(data2, &data);
assert_eq!(data, repo.read_file_at(&file, 0, 4096)?); // read beyond file end
assert_eq!(Bytes::new(), repo.read_file_at(&file, 25, 1)?); // offset beyond file end
assert_eq!(Bytes::from("test"), repo.read_file_at(&file, 10, 4)?); // read partial content

// test reading an empty file from the repository
let path: PathBuf = ["test", "0", "tests", "empty-file"].iter().collect();
let node = entries.get(&path).unwrap();
let file = repo.open_file(node)?;
assert_eq!(Bytes::new(), repo.read_file_at(&file, 0, 0)?); // empty files
Ok(())
}

