Skip to content

Commit

Permalink
function to compress data without having to store everything in memory
Browse files Browse the repository at this point in the history
  • Loading branch information
tibvdm committed May 17, 2024
1 parent 768d128 commit 7dd5c4b
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 25 deletions.
43 changes: 18 additions & 25 deletions bitarray/src/binary.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
//! This module provides utilities for reading and writing the bitarray as binary.

use std::io::{
BufRead,
Read,
Result,
Write
};
use std::io::{BufRead, Read, Result, Write};

use crate::BitArray;

Expand All @@ -20,7 +15,7 @@ pub trait Binary {
/// # Returns
///
/// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs.
fn write_binary<W: Write>(&self, writer: W) -> Result<()>;
fn write_binary<W: Write>(&self, writer: &mut W) -> Result<()>;

/// Reads binary data into a struct from the given reader.
///
Expand All @@ -45,7 +40,7 @@ impl<const B: usize> Binary for BitArray<B> {
/// # Errors
///
/// Returns an error if there was a problem writing to the writer.
fn write_binary<W: Write>(&self, mut writer: W) -> Result<()> {
fn write_binary<W: Write>(&self, writer: &mut W) -> Result<()> {
for value in self.data.iter() {
writer.write_all(&value.to_le_bytes())?;
}
Expand All @@ -66,12 +61,11 @@ impl<const B: usize> Binary for BitArray<B> {
self.data.clear();

let mut buffer = vec![0; 8 * 1024];

loop {
let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer);
for buffer_slice in buffer[.. bytes_read].chunks_exact(8) {
self.data
.push(u64::from_le_bytes(buffer_slice.try_into().unwrap()));
for buffer_slice in buffer[..bytes_read].chunks_exact(8) {
self.data.push(u64::from_le_bytes(buffer_slice.try_into().unwrap()));
}

if finished {
Expand All @@ -92,8 +86,8 @@ impl<const B: usize> Binary for BitArray<B> {
///
/// # Returns
///
/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input
/// is reached, and `bytes_read` is the number of bytes read into the buffer.
/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input is reached,
/// and `bytes_read` is the number of bytes read into the buffer.
fn fill_buffer<T: Read>(input: &mut T, buffer: &mut Vec<u8>) -> (bool, usize) {
// Store the buffer size in advance, because rust will complain
// about the buffer being borrowed mutably while it's borrowed
Expand All @@ -115,7 +109,7 @@ fn fill_buffer<T: Read>(input: &mut T, buffer: &mut Vec<u8>) -> (bool, usize) {
// We've read {bytes_read} bytes
Ok(bytes_read) => {
// Shrink the writable buffer slice
writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut();
writable_buffer_space = writable_buffer_space[bytes_read..].as_mut();
}

Err(err) => {
Expand Down Expand Up @@ -143,7 +137,7 @@ mod tests {
let mut input = input_str.as_bytes();

let mut buffer = vec![0; 800];

loop {
let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer);

Expand Down Expand Up @@ -176,20 +170,19 @@ mod tests {
let mut buffer = Vec::new();
bitarray.write_binary(&mut buffer).unwrap();

assert_eq!(
buffer,
vec![
0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45,
0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0
]
);
assert_eq!(buffer, vec![
0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12,
0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01,
0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0
]);
}

#[test]
fn test_read_binary() {
let buffer = vec![
0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45,
0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0,
0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12,
0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01,
0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0
];

let mut bitarray = BitArray::<40>::with_capacity(4);
Expand Down
72 changes: 72 additions & 0 deletions bitarray/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

mod binary;

use std::io::{Write, Result};

/// Re-export the `Binary` trait.
pub use binary::Binary;

Expand Down Expand Up @@ -118,6 +120,62 @@ impl<const B: usize> BitArray<B> {
pub fn is_empty(&self) -> bool {
// The array is considered empty exactly when its recorded length is zero.
self.len == 0
}

/// Resets every bit stored in the `BitArray` to 0.
///
/// Only the contents of the backing words are overwritten; the length of the array is left
/// untouched, so the array can be refilled right away.
pub fn clear(&mut self) {
    for word in self.data.iter_mut() {
        *word = 0;
    }
}

Check warning on line 127 in bitarray/src/lib.rs

View check run for this annotation

Codecov / codecov/patch

bitarray/src/lib.rs#L125-L127

Added lines #L125 - L127 were not covered by tests
}


/// Writes the data to a writer in a binary format using a bit array. This function is helpful
/// when writing large amounts of data to a writer in chunks. The data is written in chunks of
/// at most the specified capacity, so memory usage is minimized.
///
/// # Arguments
///
/// * `data` - The data to write. Every value is cast to `u64` before it is stored.
/// * `writer` - The writer to write the data to.
/// * `max_capacity` - The maximum amount of elements that may be stored in the bit array. The
///   value is rounded down to a multiple of 64 elements; values below 64 are raised to 64 so
///   that a chunk always holds at least one word-aligned group of elements.
///
/// # Returns
///
/// A `Result` indicating whether the write operation was successful or not.
///
/// # Errors
///
/// Returns the error produced by `writer` if one of the chunks could not be written.
pub fn data_to_writer<const B: usize>(
    data: Vec<i64>,
    writer: &mut impl Write,
    max_capacity: usize
) -> Result<()> {
    // 64 values of `B` bits occupy exactly `B` 64-bit words, so a chunk that holds a multiple
    // of 64 values always ends on a word boundary, whatever `B` is.
    // NOTE(review): this assumes `BitArray::with_capacity` does not allocate an extra padding
    // word for such capacities - confirm against its implementation.
    const ALIGNMENT: usize = 64;

    // Round the requested capacity down to the alignment, but never below a single aligned
    // group: a capacity of 0 would make `chunks_exact` panic.
    let capacity = (max_capacity / ALIGNMENT * ALIGNMENT).max(ALIGNMENT);

    // Create a bit array that can store a single chunk of data
    let mut bitarray = BitArray::<B>::with_capacity(capacity);

    // Split the data in chunks of exactly `capacity` elements
    let chunks = data.chunks_exact(capacity);

    // Store the remainder before looping over the chunks, because the loop consumes `chunks`
    let remainder = chunks.remainder();

    for chunk in chunks {
        for (i, &value) in chunk.iter().enumerate() {
            bitarray.set(i, value as u64);
        }
        bitarray.write_binary(writer)?;

        // Reset the bits so the same bit array can be reused for the next chunk
        bitarray.clear();
    }

    // Create a new bit array that is just large enough for the remaining elements
    bitarray = BitArray::<B>::with_capacity(remainder.len());

    for (i, &value) in remainder.iter().enumerate() {
        bitarray.set(i, value as u64);
    }
    bitarray.write_binary(writer)?;

    Ok(())
}

#[cfg(test)]
Expand Down Expand Up @@ -172,4 +230,18 @@ mod tests {
let bitarray = BitArray::<40>::with_capacity(4);
assert!(!bitarray.is_empty());
}

#[test]
fn test_data_to_writer() {
    // Four 40-bit values that are written through a chunked bit array.
    let values: Vec<i64> = vec![0x1234567890, 0xabcdef0123, 0x4567890abc, 0xdef0123456];
    let mut output: Vec<u8> = Vec::new();

    data_to_writer::<40>(values, &mut output, 2).unwrap();

    // 4 * 40 bits = 160 bits, stored as three 64-bit words; each row below is one word
    // in little-endian byte order.
    let expected: [u8; 24] = [
        0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12,
        0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01,
        0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0
    ];
    assert_eq!(output, expected);
}
}

0 comments on commit 7dd5c4b

Please sign in to comment.