Skip to content

Commit

Permalink
Merge branch 'develop' into tryptic-search
Browse files Browse the repository at this point in the history
  • Loading branch information
pverscha committed Sep 19, 2024
2 parents 4577a03 + 4d63609 commit 48794cd
Show file tree
Hide file tree
Showing 18 changed files with 948 additions and 113 deletions.
23 changes: 23 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu
{
"name": "Unipept Index",
"image": "mcr.microsoft.com/devcontainers/base:ubuntu",

// Features to add to the dev container. More info: https://containers.dev/features.
"features": {
"ghcr.io/devcontainers/features/rust:1": {}
},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "",

// Configure tool-specific properties.
// "customizations": {},

// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}
82 changes: 82 additions & 0 deletions .github/workflows/build_index.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
name: Build index binaries

on:
schedule:
# Run on the first day of every month at midnight UTC
- cron: '0 0 1 * *'
push:
branches:
- feature/build_index_action
workflow_dispatch:

jobs:
build:
runs-on: ubuntu-latest

steps:
# Check out the most recent version of the repository with submodules.
# checkout@v4 runs on the Node 20 runtime; v3 relied on the deprecated Node 16 runtime.
- name: Check out repository
uses: actions/checkout@v4
with:
submodules: recursive

# Set up Rust toolchain
- name: Set up Rust
uses: dtolnay/rust-toolchain@stable

# Compile Rust code
- name: Compile Rust code
run: cargo build --release

# Create a directory "build"
- name: Create build directory
run: mkdir -p build/input

# Download the file "suffix-array.zip" from the most recent release of "unipept-database".
# The release metadata is fetched once, authenticated with the workflow token, so the
# download URL and the publication date always describe the same release and the request
# does not count against the anonymous API rate limit shared by hosted runners.
- name: Download suffix-array.zip
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
release_json=$(curl -fsSL -H "Authorization: Bearer $GITHUB_TOKEN" https://api.github.com/repos/unipept/unipept-database/releases/latest)
latest_release_url=$(printf '%s\n' "$release_json" | grep "browser_download_url.*suffix-array.zip" | cut -d '"' -f 4)
release_date=$(printf '%s\n' "$release_json" | grep '"published_at":' | cut -d '"' -f 4 | cut -d'T' -f1)
# Stop early with a clear message instead of downloading from an empty URL
if [ -z "$latest_release_url" ] || [ -z "$release_date" ]; then
echo "Could not find suffix-array.zip or its publication date in the latest release" >&2
exit 1
fi
release_date_formatted=$(date -d "$release_date" "+%Y-%m-%d")
SP_VERSION="SP_$release_date_formatted"
echo "SP_VERSION=$SP_VERSION" >> "$GITHUB_ENV"
# --fail makes curl exit non-zero on an HTTP error instead of saving the error page as the zip
curl -fL -o build/suffix-array.zip "$latest_release_url"
# Extract the contents of the output folder from the zip into a folder "build/input"
- name: Extract zip contents
run: |
unzip build/suffix-array.zip '*' -d build/input
# Make a directory with the SP_VERSION and process files
- name: Process files
run: |
mkdir -p build/$SP_VERSION
lz4 -d build/input/uniprot_entries.tsv.lz4 | cut -f2,4,7,8 > build/$SP_VERSION/proteins.tsv
lz4 -d build/input/taxons.tsv.lz4 > build/$SP_VERSION/taxons.tsv
# Run the sa-builder command
- name: Run sa-builder
run: |
prefix="build/$SP_VERSION"
./target/release/sa-builder -d "$prefix/proteins.tsv" -o "$prefix/sa_sparse3_compressed.bin" -s 3 -a lib-div-suf-sort -c
# Zip the contents of the build/$SP_VERSION directory
- name: Zip build contents
run: |
cd "build/$SP_VERSION" && zip "index_$SP_VERSION.zip" "proteins.tsv" "taxons.tsv" "sa_sparse3_compressed.bin"
# Create a GitHub release and upload the zip file
- name: Upload or Update Release
id: upload_or_update_release
uses: softprops/action-gh-release@v1
with:
files: build/${{ env.SP_VERSION }}/index_${{ env.SP_VERSION }}.zip
tag_name: index-${{ env.SP_VERSION }}
name: Index ${{ env.SP_VERSION }}
commitish: ${{ github.sha }}
draft: false
prerelease: false
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions bitarray/src/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,10 @@ mod tests {
#[test]
fn test_write_binary() {
let mut bitarray = BitArray::with_capacity(4, 40);
bitarray.set(0, 0x1234567890);
bitarray.set(1, 0xabcdef0123);
bitarray.set(2, 0x4567890abc);
bitarray.set(3, 0xdef0123456);
bitarray.set(0, 0x1234567890_u64);
bitarray.set(1, 0xabcdef0123_u64);
bitarray.set(2, 0x4567890abc_u64);
bitarray.set(3, 0xdef0123456_u64);

let mut buffer = Vec::new();
bitarray.write_binary(&mut buffer).unwrap();
Expand Down
18 changes: 11 additions & 7 deletions bitarray/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ impl BitArray {
/// * `index` - The index of the value to set.
/// * `value` - The value to set at the specified index.
pub fn set(&mut self, index: usize, value: u64) {
let value: u64 = value;
let start_block = index * self.bits_per_value / 64;
let start_block_offset = index * self.bits_per_value % 64;

Expand Down Expand Up @@ -142,11 +143,14 @@ impl BitArray {
pub fn clear(&mut self) {
    // Overwrite every stored block with zero; the number of blocks is left unchanged.
    for block in self.data.iter_mut() {
        *block = 0;
    }
}

/// Returns a borrowed view on part of the underlying `u64` blocks of this bit array.
///
/// # Arguments
///
/// * `start_slice` - Index of the first block to include.
/// * `end_slice` - Index one past the last block to include (exclusive).
///
/// # Returns
///
/// The blocks in the half-open range `start_slice..end_slice`.
///
/// # Panics
///
/// Panics if `start_slice > end_slice` or if `end_slice` exceeds the number of blocks.
pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] {
    let block_range = start_slice..end_slice;
    &self.data[block_range]
}
}

/// Writes the data to a writer in a binary format using a bit array. This function is helpful
/// when writing large amounts of data to a writer in chunks. The data is written in chunks of the
/// specified capacity, so memory usage is minimized.
/// Writes the data to a writer in a binary format using a bit array. The data is written
/// in chunks of the specified capacity, so memory usage is minimized.
///
/// # Arguments
///
Expand Down Expand Up @@ -257,10 +261,10 @@ mod tests {
fn test_bitarray_set() {
let mut bitarray = BitArray::with_capacity(4, 40);

bitarray.set(0, 0b0001110011111010110001000111111100110010);
bitarray.set(1, 0b1100001001010010011000010100110111001001);
bitarray.set(2, 0b1111001101001101101101101011101001010001);
bitarray.set(3, 0b0000100010010001010001001110101110011100);
bitarray.set(0, 0b0001110011111010110001000111111100110010_u64);
bitarray.set(1, 0b1100001001010010011000010100110111001001_u64);
bitarray.set(2, 0b1111001101001101101101101011101001010001_u64);
bitarray.set(3, 0b0000100010010001010001001110101110011100_u64);

assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]);
}
Expand Down
2 changes: 1 addition & 1 deletion sa-builder/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fn main() {
eprintln!();
eprintln!("📋 Started loading the proteins...");
let start_proteins_time = get_time_ms().unwrap();
let mut data = Proteins::try_from_database_file_without_annotations(&database_file)
let mut data = Proteins::try_from_database_file_uncompressed(&database_file)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!(
"✅ Successfully loaded the proteins in {} seconds!",
Expand Down
1 change: 1 addition & 0 deletions sa-index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ clap = { version = "4.4.8", features = ["derive"] }
rayon = "1.8.1"
serde = { version = "1.0.197", features = ["derive"] }
sa-mappings = { path = "../sa-mappings" }
text-compression = { path = "../text-compression" }
bitarray = { path = "../bitarray" }
serde_json = "1.0.116"
10 changes: 5 additions & 5 deletions sa-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,11 @@ mod tests {
#[test]
fn test_suffix_array_compressed() {
let mut bitarray = BitArray::with_capacity(5, 40);
bitarray.set(0, 1);
bitarray.set(1, 2);
bitarray.set(2, 3);
bitarray.set(3, 4);
bitarray.set(4, 5);
bitarray.set(0, 1 as u64);
bitarray.set(1, 2 as u64);
bitarray.set(2, 3 as u64);
bitarray.set(3, 4 as u64);
bitarray.set(4, 5 as u64);

let sa = SuffixArray::Compressed(bitarray, 1);
assert_eq!(sa.len(), 5);
Expand Down
Loading

0 comments on commit 48794cd

Please sign in to comment.