Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compress protein text using bit packing (from 8 bits per char to 5 bits) #26

Merged
merged 13 commits into from
Sep 19, 2024
Merged
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions bitarray/src/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,10 @@ mod tests {
#[test]
fn test_write_binary() {
let mut bitarray = BitArray::with_capacity(4, 40);
bitarray.set(0, 0x1234567890);
bitarray.set(1, 0xabcdef0123);
bitarray.set(2, 0x4567890abc);
bitarray.set(3, 0xdef0123456);
bitarray.set(0, 0x1234567890_u64);
bitarray.set(1, 0xabcdef0123_u64);
bitarray.set(2, 0x4567890abc_u64);
bitarray.set(3, 0xdef0123456_u64);

let mut buffer = Vec::new();
bitarray.write_binary(&mut buffer).unwrap();
Expand Down
18 changes: 11 additions & 7 deletions bitarray/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ impl BitArray {
/// * `index` - The index of the value to set.
/// * `value` - The value to set at the specified index.
pub fn set(&mut self, index: usize, value: u64) {
let value: u64 = value;
let start_block = index * self.bits_per_value / 64;
let start_block_offset = index * self.bits_per_value % 64;

Expand Down Expand Up @@ -142,11 +143,14 @@ impl BitArray {
pub fn clear(&mut self) {
    // Overwrite every backing block with zero; the number of blocks is left untouched.
    self.data.fill(0);
}

/// Returns a borrowed view of the raw `u64` blocks backing this bit array.
///
/// # Arguments
///
/// * `start_slice` - Index of the first block to include (inclusive).
/// * `end_slice` - Index one past the last block to include (exclusive).
///
/// # Returns
///
/// The blocks in the half-open range `start_slice..end_slice`, borrowed from the array.
///
/// # Panics
///
/// Panics if `start_slice > end_slice` or if `end_slice` exceeds the number of blocks.
pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] {
    let block_range = start_slice..end_slice;
    &self.data[block_range]
}
}

/// Writes the data to a writer in a binary format using a bit array. This function is helpful
/// when writing large amounts of data to a writer in chunks. The data is written in chunks of the
/// specified capacity, so memory usage is minimized.
/// Writes the data to a writer in a binary format using a bit array. The data is written
/// in chunks of the specified capacity, so memory usage is minimized.
///
/// # Arguments
///
Expand Down Expand Up @@ -257,10 +261,10 @@ mod tests {
fn test_bitarray_set() {
let mut bitarray = BitArray::with_capacity(4, 40);

bitarray.set(0, 0b0001110011111010110001000111111100110010);
bitarray.set(1, 0b1100001001010010011000010100110111001001);
bitarray.set(2, 0b1111001101001101101101101011101001010001);
bitarray.set(3, 0b0000100010010001010001001110101110011100);
bitarray.set(0, 0b0001110011111010110001000111111100110010_u64);
bitarray.set(1, 0b1100001001010010011000010100110111001001_u64);
bitarray.set(2, 0b1111001101001101101101101011101001010001_u64);
bitarray.set(3, 0b0000100010010001010001001110101110011100_u64);

assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]);
}
Expand Down
2 changes: 1 addition & 1 deletion sa-builder/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fn main() {
eprintln!();
eprintln!("📋 Started loading the proteins...");
let start_proteins_time = get_time_ms().unwrap();
let mut data = Proteins::try_from_database_file_without_annotations(&database_file)
let mut data = Proteins::try_from_database_file_uncompressed(&database_file)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!(
"✅ Successfully loaded the proteins in {} seconds!",
Expand Down
1 change: 1 addition & 0 deletions sa-index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ clap = { version = "4.4.8", features = ["derive"] }
rayon = "1.8.1"
serde = { version = "1.0.197", features = ["derive"] }
sa-mappings = { path = "../sa-mappings" }
text-compression = { path = "../text-compression" }
bitarray = { path = "../bitarray" }
serde_json = "1.0.116"
10 changes: 5 additions & 5 deletions sa-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,11 @@ mod tests {
#[test]
fn test_suffix_array_compressed() {
let mut bitarray = BitArray::with_capacity(5, 40);
bitarray.set(0, 1);
bitarray.set(1, 2);
bitarray.set(2, 3);
bitarray.set(3, 4);
bitarray.set(4, 5);
bitarray.set(0, 1 as u64);
bitarray.set(1, 2 as u64);
bitarray.set(2, 3 as u64);
bitarray.set(3, 4 as u64);
bitarray.set(4, 5 as u64);

let sa = SuffixArray::Compressed(bitarray, 1);
assert_eq!(sa.len(), 5);
Expand Down
Loading
Loading