From f4f9af0b70e07d9b2ec643722efff62ebf7d43cf Mon Sep 17 00:00:00 2001 From: Allison Karlitskaya Date: Tue, 8 Oct 2024 23:16:07 +0200 Subject: [PATCH] Up our tar game a bit Drop our lame hacked up tar header implementation for the real thing from the 'tar' crate. Reading data directly into it is actually pretty easy. Add a reader-side utility class for splitstream. Also: add a new SplitStreamData type for inline/external data and port our existing helper functions to use it. Add a 'cfsctl ls' command which lists out the content of a tar splitstream. So far it just lists files and if they are inline or external references. This is the start of support for creating a dumpfile... This is some dreadfully ugly code, but it seems to be working. Probably some tests would be nice at some point... --- Cargo.toml | 1 + src/bin/cfsctl.rs | 8 +++ src/repository.rs | 6 +++ src/splitstream.rs | 122 ++++++++++++++++++++++++++++++++++----------- src/tar.rs | 106 +++++++++++++++++++-------------------- 5 files changed, 159 insertions(+), 84 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index af856c9..56239fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ hex = "0.4.3" rand = "0.8.5" rustix = { version = "0.38.37", features = ["fs", "mount", "process"] } sha2 = "0.10.8" +tar = "0.4.42" zstd = "0.13.2" [profile.dev.package.sha2] diff --git a/src/bin/cfsctl.rs b/src/bin/cfsctl.rs index dd8095d..3468a4b 100644 --- a/src/bin/cfsctl.rs +++ b/src/bin/cfsctl.rs @@ -42,6 +42,11 @@ enum Command { reference: String, tarfile: Option, }, + /// Lists the contents of a tar stream + Ls { + /// the name of the stream + name: String, + }, /// Mounts a composefs, possibly enforcing fsverity of the image Mount { /// the name of the image to mount, either a sha256 digest or prefixed with 'ref/' @@ -84,6 +89,9 @@ fn main() -> Result<()> { Command::ImportTar { reference, tarfile: _ } => { repo.import_tar(&reference, &mut std::io::stdin()) }, + Command::Ls { name } => { + repo.ls(&name) + }, Command::Mount { name, mountpoint } => { repo.mount(&name, &mountpoint) }, diff --git a/src/repository.rs b/src/repository.rs index c1f240a..596bcd2 100644 --- a/src/repository.rs +++ b/src/repository.rs @@ -216,6 +216,12 @@ impl Repository { self.link_ref(name, "images", object_id) } + pub fn ls(self, name: &str) -> Result<()> { + let file = File::from(self.open_in_category("streams", name)?); + let mut split_stream = zstd::stream::read::Decoder::new(file)?; + crate::tar::ls(&mut split_stream) + } + pub fn mount(self, name: &str, mountpoint: &str) -> Result<()> { let image = self.open_in_category("images", name)?; let object_path = format!("{}/objects", self.path); diff --git a/src/splitstream.rs b/src/splitstream.rs index c45414d..12574c3 100644 --- a/src/splitstream.rs +++ b/src/splitstream.rs @@ -27,12 +27,18 @@ * That's it, really. There's no header. The file is over when there's no more blocks. */ -use std::io::{ - Read, - Write, +use std::{ + collections::VecDeque, + io::{ + Read, + Write, + }, }; -use anyhow::Result; +use anyhow::{ + Result, + bail, +}; use crate::{ fsverity::{ @@ -46,7 +52,6 @@ use crate::{ pub struct SplitStreamWriter<'w, W: Write> { inline_content: Vec, writer: &'w mut W - } impl<'w, W: Write> SplitStreamWriter<'w, W> { @@ -90,28 +95,93 @@ impl<'w, W: Write> SplitStreamWriter<'w, W> { } } -fn read_u64_le(reader: &mut R) -> Result> { +pub enum SplitStreamData { + Inline(Vec), + External(Sha256HashValue), +} + +pub fn read_splitstream_chunk(reader: &mut R) -> Result> { let mut buf = [0u8; 8]; - if read_exactish(reader, &mut buf)? { - Ok(Some(u64::from_le_bytes(buf))) - } else { - Ok(None) + match read_exactish(reader, &mut buf)? { + false => Ok(None), + true => match u64::from_le_bytes(buf) as usize { + 0 => { + let mut data = Sha256HashValue::EMPTY; + reader.read_exact(&mut data)?; + Ok(Some(SplitStreamData::External(data))) + }, + size => { + let mut data = vec![0u8; size]; + reader.read_exact(&mut data)?; + Ok(Some(SplitStreamData::Inline(data))) + } + } + } +} + +// utility class to help read splitstreams +pub struct SplitStreamReader<'w, R: Read> { + inline_content: VecDeque, + reader: &'w mut R +} + +impl<'r, R: Read> SplitStreamReader<'r, R> { + pub fn new(reader: &'r mut R) -> SplitStreamReader<'r, R> { + SplitStreamReader { inline_content: VecDeque::new(), reader } + } + + /// assumes that the data cannot be split across chunks + pub fn read_inline_exact(&mut self, data: &mut [u8]) -> Result { + if self.inline_content.is_empty() { + match read_splitstream_chunk(&mut self.reader)? { + None => { return Ok(false); } + Some(SplitStreamData::Inline(data)) => { self.inline_content = data.into() }, + Some(SplitStreamData::External(_)) => { bail!("Expecting inline data but found external chunk") } + } + } + + self.inline_content.read_exact(data)?; + Ok(true) + } + + pub fn read_exact(&mut self, actual_size: usize, stored_size: usize) -> Result { + if self.inline_content.is_empty() { + match read_splitstream_chunk(&mut self.reader)? { + None => { bail!("Unexpected EOF") }, + Some(SplitStreamData::Inline(data)) => { self.inline_content = data.into() }, + Some(ext) => { + if actual_size != stored_size { + // need to eat the padding... + match read_splitstream_chunk(&mut self.reader)? { + None => { bail!("bad eof") }, + Some(SplitStreamData::Inline(data)) => { self.inline_content = data.into() }, + Some(SplitStreamData::External(_)) => { bail!("Expecting inline data but found external chunk") } + } + // TODO: make this suck less + let mut padding = vec![0u8; stored_size - actual_size]; + self.inline_content.read_exact(&mut padding)?; + } + + return Ok(ext) + } + } + } + + // must be inline + let mut data = vec![0u8; stored_size]; + self.inline_content.read_exact(&mut data)?; + data.truncate(actual_size); + Ok(SplitStreamData::Inline(data)) } } pub fn splitstream_merge Result>>( split_stream: &mut R, result: &mut W, mut load_data: F, ) -> Result<()> { - while let Some(size) = read_u64_le(split_stream)? { - if size == 0 { - let mut hash = Sha256HashValue::EMPTY; - split_stream.read_exact(&mut hash)?; - let data = load_data(hash)?; - result.write_all(&data)?; - } else { - let mut data = vec![0u8; size as usize]; // TODO: bzzt bzzt - split_stream.read_exact(&mut data)?; - result.write_all(&data)?; + while let Some(data) = read_splitstream_chunk(split_stream)? { + match data { + SplitStreamData::Inline(data) => result.write_all(&data)?, + SplitStreamData::External(id) => result.write_all(&load_data(id)?)?, } } @@ -121,14 +191,10 @@ pub fn splitstream_merge Result< pub fn splitstream_objects( split_stream: &mut R, mut callback: F ) -> Result<()> { - while let Some(size) = read_u64_le(split_stream)? { - if size == 0 { - let mut hash = Sha256HashValue::EMPTY; - split_stream.read_exact(&mut hash)?; - callback(hash); - } else { - let mut discard = vec![0u8; size as usize]; // TODO: bzzt bzzt - split_stream.read_exact(&mut discard)?; + while let Some(data) = read_splitstream_chunk(split_stream)? { + match data { + SplitStreamData::Inline(_) => { /* no op */ }, + SplitStreamData::External(id) => callback(id) } } diff --git a/src/tar.rs b/src/tar.rs index 11d03a8..7adb1d2 100644 --- a/src/tar.rs +++ b/src/tar.rs @@ -1,64 +1,29 @@ use std::io::{Read, Write}; use anyhow::Result; +use tar::{ + EntryType, + Header, +}; use crate::{ fsverity::Sha256HashValue, - splitstream::SplitStreamWriter, + splitstream::{ + SplitStreamData, + SplitStreamReader, + SplitStreamWriter, + }, util::read_exactish, }; -struct TarHeader { - data: [u8; 512], -} - -impl TarHeader { - // we can't use Read::read_exact() because we need to be able to detect EOF - fn read(reader: &mut R) -> Result> { - let mut header = TarHeader { data: [0u8; 512] }; - if read_exactish(reader, &mut header.data)? { - Ok(Some(header)) - } else { - Ok(None) - } - } - - fn get_size(&self) -> usize { - let size_field = &self.data[124..124 + 12]; - let mut value = 0usize; - - if size_field[0] & 0x80 != 0 { - // binary representation - for byte in &size_field[4..12] { - value <<= 8; - value += *byte as usize; - } - } else { - // octal representation with nul terminator - for byte in size_field { - if *byte == b'\0' { - break; - } else { - // add octal digit value (no error checking) - value <<= 3; - value += (*byte - b'0') as usize; - } - } - } - - // TODO: not too big, I hope... - value - } - - fn get_storage_size(&self) -> usize { - // round up to nearest multiple of 512 - (self.get_size() + 511) & !511 - } - - fn is_reg(&self) -> bool { - self.data[156] == b'0' +fn read_header(reader: &mut R) -> Result> { + let mut header = Header::new_gnu(); + if read_exactish(reader, header.as_mut_bytes())? { + Ok(Some(header)) + } else { + Ok(None) } -} + } /// Splits the tar file from tar_stream into a Split Stream. The store_data function is /// responsible for ensuring that "external data" is in the composefs repository and returns the @@ -70,18 +35,22 @@ pub fn split Result>( ) -> Result<()> { let mut writer = SplitStreamWriter::new(split_stream); - while let Some(header) = TarHeader::read(tar_stream)? { + while let Some(header) = read_header(tar_stream)? { // the header always gets stored as inline data - writer.write_inline(&header.data); + writer.write_inline(header.as_bytes()); + + if header.as_bytes() == &[0u8; 512] { + continue; + } // read the corresponding data, if there is any - let storage_size = header.get_storage_size(); + let actual_size = header.entry_size()? as usize; + let storage_size = (actual_size + 511) & !511; let mut buffer = vec![0u8; storage_size]; tar_stream.read_exact(&mut buffer)?; - if header.is_reg() && storage_size > 0 { + if header.entry_type() == EntryType::Regular && storage_size > 0 { // non-empty regular file: store the data in the object store - let actual_size = header.get_size(); let padding = buffer.split_off(actual_size); let reference = store_data(&buffer)?; writer.write_reference(reference, padding)?; @@ -94,3 +63,28 @@ pub fn split Result>( // flush out any remaining inline data writer.done() } + +pub fn ls(split_stream: &mut R) -> Result<()> { + let mut reader = SplitStreamReader::new(split_stream); + + loop { + let mut buf = [0u8; 512]; + if !reader.read_inline_exact(&mut buf)? { + return Ok(()); + } + + if buf == [0u8; 512] { + return Ok(()); + } + + let header = tar::Header::from_byte_slice(&buf); + let actual_size = header.size()? as usize; + let stored_size = (actual_size + 511) & !511; + println!("{:?}", header.path()?); + match reader.read_exact(actual_size, stored_size)? { + SplitStreamData::Inline(data) => println!("{} data bytes inline", data.len()), + SplitStreamData::External(id) => println!("ext {}", hex::encode(id)) + } + println!(); + } +}