diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 8f982f7..39cd5fa 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -70,12 +70,12 @@ jobs: uses: actions/checkout@v2 - name: Generate code coverage - run: cargo +nightly tarpaulin --verbose --engine llvm --no-dead-code --all-features --workspace --timeout 120 --out xml + run: cargo +nightly tarpaulin --all-features --workspace --engine llvm --timeout 120 --out xml - name: Upload to codecov.io uses: codecov/codecov-action@v2 with: - # token: ${{secrets.CODECOV_TOKEN}} # not required for public repos + token: ${{secrets.CODECOV_TOKEN}} # not required for public repos fail_ci_if_error: true lints: diff --git a/.github/workflows/pypublish.yml b/.github/workflows/pypublish.yml index 8c7bd5e..7f64d70 100644 --- a/.github/workflows/pypublish.yml +++ b/.github/workflows/pypublish.yml @@ -11,8 +11,6 @@ on: - main tags: - '*' - pull_request: - workflow_dispatch: permissions: contents: read @@ -24,8 +22,8 @@ jobs: matrix: target: [x86_64, x86, aarch64, armv7, s390x, ppc64le] steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' - name: Build wheels @@ -34,13 +32,13 @@ jobs: target: ${{ matrix.target }} args: --sdist --release --out dist --find-interpreter sccache: 'true' - working-directory: vcf2parquet-py + working-directory: python manylinux: auto - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: wheels - path: vcf2parquet-py/dist + path: python/dist windows: runs-on: windows-latest @@ -48,8 +46,8 @@ jobs: matrix: target: [x64, x86] steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' architecture: ${{ matrix.target }} @@ -59,12 +57,12 @@ jobs: target: ${{ matrix.target }} args: --sdist --release --out dist --find-interpreter sccache: 'true' - working-directory: vcf2parquet-py + working-directory: python - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: wheels - path: vcf2parquet-py/dist + path: python/dist macos: runs-on: macos-latest @@ -72,8 +70,8 @@ jobs: matrix: target: [x86_64, aarch64] steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' - name: Build wheels @@ -82,12 +80,12 @@ jobs: target: ${{ matrix.target }} args: --sdist --release --out dist --find-interpreter sccache: 'true' - working-directory: vcf2parquet-py + working-directory: python - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: wheels - path: vcf2parquet-py/dist + path: python/dist release: name: Release diff --git a/Cargo.lock b/Cargo.lock index ca55af0..2a0aff6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,12 +15,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", + "const-random", "getrandom", "once_cell", "version_check", "zerocopy", ] +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "alloc-no-stdlib" version = "2.0.4" @@ -36,6 +46,21 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "0.6.13" @@ -85,133 +110,213 @@ dependencies = [ ] [[package]] -name = "array-init-cursor" -version = "0.2.0" +name = "arrow" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" +checksum = "219d05930b81663fd3b32e3bde8ce5bff3c4d23052a99f11a8fa50a3b47b2658" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] [[package]] -name = "arrow-format" -version = "0.8.1" +name = "arrow-arith" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07884ea216994cdc32a2d5f8274a8bee979cfe90274b83f86f440866ee3132c7" +checksum = "0272150200c07a86a390be651abdd320a2d12e84535f0837566ca87ecd8f95e0" dependencies = [ - "planus", - "serde", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "num", ] [[package]] -name = "arrow2" -version = "0.17.4" +name = "arrow-array" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" +checksum = "8010572cf8c745e242d1b632bd97bd6d4f40fefed5ed1290a8f433abaa686fea" dependencies = [ "ahash", - "arrow-format", - "base64", - "bytemuck", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", - "dyn-clone", - "either", - "ethnum", - "fallible-streaming-iterator", - "foreign_vec", - "futures", - "getrandom", - "hash_hasher", - "num-traits", - "parquet2", - "rustc_version", - "simdutf8", - "streaming-iterator", + "half", + "hashbrown", + "num", ] [[package]] -name = "arrow2" -version = "0.18.0" +name = "arrow-buffer" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963fef509b757bcbbf9e5ffa23bcb345614d99f4f6f531f97417b27b8604d389" +checksum = "0d0a2432f0cba5692bf4cb757469c66791394bac9ec7ce63c1afe74744c37b27" dependencies = [ - "ahash", - "arrow-format", + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9abc10cd7995e83505cc290df9384d6e5412b207b79ce6bdff89a10505ed2cba" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", "base64", - "bytemuck", "chrono", - "dyn-clone", - "either", - "ethnum", - "fallible-streaming-iterator", - "foreign_vec", - "futures", - "getrandom", - "hash_hasher", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-data" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2742ac1f6650696ab08c88f6dd3f0eb68ce10f8c253958a18c943a68cd04aec5" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a42ea853130f7e78b9b9d178cb4cd01dee0f78e64d96c2949dc0a915d6d9e19d" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-ord" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3e6b61e3dc468f503181dccc2fc705bdcc5f2f146755fa5b56d0a6c5943f412" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half", + "num", +] + +[[package]] +name = "arrow-row" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "848ee52bb92eb459b811fb471175ea3afcf620157674c8794f539838920f9228" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", "hashbrown", - "num-traits", - "parquet2", - "rustc_version", - "simdutf8", - "streaming-iterator", ] [[package]] -name = "assert_cmd" -version = "2.0.14" +name = "arrow-schema" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed72493ac66d5804837f480ab3766c72bdfab91a65e565fc54fa9e42db0073a8" +checksum = "02d9483aaabe910c4781153ae1b6ae0393f72d9ef757d38d09d450070cf2e528" + +[[package]] +name = "arrow-select" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "849524fa70e0e3c5ab58394c770cb8f514d0122d20de08475f7b472ed8075830" dependencies = [ - "anstyle", - "bstr", - "doc-comment", - "predicates", - "predicates-core", - "predicates-tree", - "wait-timeout", + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", ] [[package]] -name = "async-stream" -version = "0.3.5" +name = "arrow-string" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +checksum = "9373cb5a021aee58863498c37eb484998ef13377f69989c6c5ccfbd258236cdb" dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", ] [[package]] -name = "async-stream-impl" -version = "0.3.5" +name = "assert_cmd" +version = "2.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +checksum = "ed72493ac66d5804837f480ab3766c72bdfab91a65e565fc54fa9e42db0073a8" dependencies = [ - "proc-macro2", - "quote", - "syn", + "anstyle", + "bstr", + "doc-comment", + "predicates", + "predicates-core", + "predicates-tree", + "wait-timeout", ] [[package]] -name = "async-trait" -version = "0.1.78" +name = "atoi" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "461abc97219de0eaaf81fe3ef974a540158f3d079c2ab200f891f1a2ef201e85" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" dependencies = [ - "proc-macro2", - "quote", - "syn", + "num-traits", ] [[package]] name = "autocfg" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" [[package]] name = "base64" -version = "0.21.7" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" [[package]] name = "bgzip" @@ -279,26 +384,6 @@ version = "3.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" -[[package]] -name = "bytemuck" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15" -dependencies = [ - "bytemuck_derive", -] - -[[package]] -name = "bytemuck_derive" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "byteorder" version = "1.5.0" @@ -350,11 +435,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.35" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaf5903dcbc0a39312feb77df2ff4c76387d591b9fc7b04a238dcf8bb62639a" +checksum = "8a0d04d43504c61aa6c7531f1871dd0d418d91130162063b789da00fd7057a5e" dependencies = [ + "android-tzdata", + "iana-time-zone", "num-traits", + "windows-targets", ] [[package]] @@ -385,7 +473,7 @@ version = "4.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90239a040c80f5e14809ca132ddc4176ab33d5e17e49691793296e3fcb34d72f" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn", @@ -403,6 +491,32 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + [[package]] name = "crc32fast" version = "1.4.0" @@ -447,13 +561,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] -name = "deranged" -version = "0.3.11" +name = "crunchy" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" -dependencies = [ - "powerfmt", -] +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" [[package]] name = "difflib" @@ -467,12 +578,6 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" -[[package]] -name = "dyn-clone" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" - [[package]] name = "either" version = "1.10.0" @@ -495,24 +600,22 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "ethnum" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90ca2580b73ab6a1f724b76ca11ab632df820fd6040c336200d2c1df7b3c82c" - -[[package]] -name = "fallible-streaming-iterator" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" - [[package]] name = "fastrand" version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +[[package]] +name = "flatbuffers" +version = "23.5.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" +dependencies = [ + "bitflags 1.3.2", + "rustc_version", +] + [[package]] name = "flate2" version = "1.0.28" @@ -524,185 +627,165 @@ dependencies = [ ] [[package]] -name = "foreign_vec" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" - -[[package]] -name = "futures" -version = "0.3.30" +name = "getrandom" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", + "cfg-if", + "libc", + "wasi", ] [[package]] -name = "futures-channel" -version = "0.3.30" +name = "half" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "b5eceaaeec696539ddaf7b333340f1af35a5aa87ae3e4f3ead0532f72affab2e" dependencies = [ - "futures-core", - "futures-sink", + "cfg-if", + "crunchy", + "num-traits", ] [[package]] -name = "futures-core" -version = "0.3.30" +name = "hashbrown" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" [[package]] -name = "futures-executor" -version = "0.3.30" +name = "heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] -name = "futures-io" -version = "0.3.30" +name = "iana-time-zone" +version = "0.1.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] [[package]] -name = "futures-macro" -version = "0.3.30" +name = "iana-time-zone-haiku" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "proc-macro2", - "quote", - "syn", + "cc", ] [[package]] -name = "futures-sink" -version = "0.3.30" +name = "indexmap" +version = "2.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +dependencies = [ + "equivalent", + "hashbrown", +] [[package]] -name = "futures-task" -version = "0.3.30" +name = "integer-encoding" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] -name = "futures-util" -version = "0.3.30" +name = "jobserver" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", + "libc", ] [[package]] -name = "getrandom" -version = "0.2.12" +name = "js-sys" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ - "cfg-if", - "js-sys", - "libc", - "wasi", "wasm-bindgen", ] [[package]] -name = "hash_hasher" -version = "2.0.3" +name = "lazy_static" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] -name = "hashbrown" -version = "0.14.3" +name = "lexical-core" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" dependencies = [ - "ahash", + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", ] [[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - -[[package]] -name = "heck" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" - -[[package]] -name = "indexmap" -version = "2.2.5" +name = "lexical-parse-float" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" dependencies = [ - "equivalent", - "hashbrown", + "lexical-parse-integer", + "lexical-util", + "static_assertions", ] [[package]] -name = "indoc" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" - -[[package]] -name = "itoa" -version = "1.0.10" +name = "lexical-parse-integer" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] [[package]] -name = "jobserver" -version = "0.1.28" +name = "lexical-util" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" dependencies = [ - "libc", + "static_assertions", ] [[package]] -name = "js-sys" -version = "0.3.69" +name = "lexical-write-float" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" dependencies = [ - "wasm-bindgen", + "lexical-util", + "lexical-write-integer", + "static_assertions", ] [[package]] -name = "lazy_static" -version = "1.4.0" +name = "lexical-write-integer" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] [[package]] name = "libc" @@ -711,20 +794,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] -name = "linux-raw-sys" -version = "0.4.13" +name = "libm" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] -name = "lock_api" -version = "0.4.11" +name = "linux-raw-sys" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" -dependencies = [ - "autocfg", - "scopeguard", -] +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "log" @@ -733,23 +812,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] -name = "lz4" -version = "1.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1" -dependencies = [ - "libc", - "lz4-sys", -] - -[[package]] -name = "lz4-sys" -version = "1.9.4" +name = "lz4_flex" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" dependencies = [ - "cc", - "libc", + "twox-hash", ] [[package]] @@ -769,15 +837,6 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - [[package]] name = "miniz_oxide" version = "0.7.2" @@ -799,7 +858,7 @@ dependencies = [ "flate2", "thiserror", "xz2", - "zstd", + "zstd 0.12.4", ] [[package]] @@ -872,103 +931,139 @@ dependencies = [ ] [[package]] -name = "num-conv" -version = "0.1.0" +name = "num" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] [[package]] -name = "num-traits" -version = "0.2.18" +name = "num-bigint" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" dependencies = [ "autocfg", + "num-integer", + "num-traits", ] [[package]] -name = "num_threads" -version = "0.1.7" +name = "num-complex" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" dependencies = [ - "libc", + "num-traits", ] [[package]] -name = "once_cell" -version = "1.19.0" +name = "num-integer" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] [[package]] -name = "parking_lot" -version = "0.12.1" +name = "num-iter" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "d869c01cc0c455284163fd0092f1f93835385ccab5a98a0dcc497b2f8bf055a9" dependencies = [ - "lock_api", - "parking_lot_core", + "autocfg", + "num-integer", + "num-traits", ] [[package]] -name = "parking_lot_core" -version = "0.9.9" +name = "num-rational" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets 0.48.5", + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +dependencies = [ + "autocfg", + "libm", ] [[package]] -name = "parquet-format-safe" -version = "0.2.4" +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "ordered-float" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1131c54b167dd4e4799ce762e1ab01549ebb94d5bdd13e6ec1b467491c378e1f" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" dependencies = [ - "async-trait", - "futures", + "num-traits", ] [[package]] -name = "parquet2" -version = "0.17.2" +name = "parquet" +version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "579fe5745f02cef3d5f236bfed216fd4693e49e4e920a13475c6132233283bce" +checksum = "096795d4f47f65fd3ee1ec5a98b77ab26d602f2cc785b0e4be5443add17ecc32" dependencies = [ - "async-stream", + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", "brotli", + "bytes", + "chrono", "flate2", - "futures", - "lz4", - "parquet-format-safe", + "half", + "hashbrown", + "lz4_flex", + "num", + "num-bigint", + "paste", "seq-macro", "snap", - "streaming-decompression", - "zstd", + "thrift", + "twox-hash", + "zstd 0.13.1", ] [[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "pin-project-lite" -version = "0.2.13" +name = "paste" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] -name = "pin-utils" -version = "0.1.0" +name = "percent-encoding" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pkg-config" @@ -976,27 +1071,6 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" -[[package]] -name = "planus" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc1691dd09e82f428ce8d6310bd6d5da2557c82ff17694d2a32cad7242aea89f" -dependencies = [ - "array-init-cursor", -] - -[[package]] -name = "portable-atomic" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" - -[[package]] -name = "powerfmt" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" - [[package]] name = "predicates" version = "3.1.0" @@ -1033,69 +1107,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "pyo3" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53bdbb96d49157e65d45cc287af5f32ffadd5f4761438b527b055fb0d4bb8233" -dependencies = [ - "cfg-if", - "indoc", - "libc", - "memoffset", - "parking_lot", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deaa5745de3f5231ce10517a1f5dd97d53e5a2fd77aa6b5842292085831d48d7" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] -name = "pyo3-ffi" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b42531d03e08d4ef1f6e85a2ed422eb678b8cd62b762e53891c05faf0d4afa" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7305c720fa01b8055ec95e484a6eca7a83c841267f0dd5280f0c8b8551d2c158" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c7e9b68bb9c3149c5b0cade5d07f953d6d125eb4337723c4ccdb665f1f96185" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn", -] - [[package]] name = "quote" version = "1.0.35" @@ -1126,12 +1137,15 @@ dependencies = [ ] [[package]] -name = "redox_syscall" -version = "0.4.1" +name = "regex" +version = "1.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ - "bitflags 1.3.2", + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", ] [[package]] @@ -1139,6 +1153,17 @@ name = "regex-automata" version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" [[package]] name = "rustc-hash" @@ -1169,10 +1194,10 @@ dependencies = [ ] [[package]] -name = "scopeguard" -version = "1.2.0" +name = "ryu" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" [[package]] name = "semver" @@ -1206,38 +1231,6 @@ dependencies = [ "syn", ] -[[package]] -name = "simdutf8" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" - -[[package]] -name = "simplelog" -version = "0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16257adbfaef1ee58b1363bdc0664c9b8e1e30aed86049635fb5f147d065a9c0" -dependencies = [ - "log", - "termcolor", - "time", -] - -[[package]] -name = "slab" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] - -[[package]] -name = "smallvec" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" - [[package]] name = "snap" version = "1.1.1" @@ -1245,19 +1238,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] -name = "streaming-decompression" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf6cc3b19bfb128a8ad11026086e31d3ce9ad23f8ea37354b31383a187c44cf3" -dependencies = [ - "fallible-streaming-iterator", -] - -[[package]] -name = "streaming-iterator" -version = "0.1.9" +name = "static_assertions" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "strsim" @@ -1276,12 +1260,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "target-lexicon" -version = "0.12.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" - [[package]] name = "tempfile" version = "3.10.1" @@ -1294,15 +1272,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "termcolor" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" -dependencies = [ - "winapi-util", -] - [[package]] name = "termtree" version = "0.4.1" @@ -1330,36 +1299,33 @@ dependencies = [ ] [[package]] -name = "time" -version = "0.3.34" +name = "thrift" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ - "deranged", - "itoa", - "libc", - "num-conv", - "num_threads", - "powerfmt", - "serde", - "time-core", - "time-macros", + "byteorder", + "integer-encoding", + "ordered-float", ] [[package]] -name = "time-core" -version = "0.1.2" +name = "tiny-keccak" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] [[package]] -name = "time-macros" -version = "0.2.17" +name = "twox-hash" +version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ - "num-conv", - "time-core", + "cfg-if", + "static_assertions", ] [[package]] @@ -1368,12 +1334,6 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" -[[package]] -name = "unindent" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" - [[package]] name = "utf8parse" version = "0.2.1" @@ -1384,51 +1344,20 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" name = "vcf2parquet" version = "0.6.0" dependencies = [ + "arrow", "assert_cmd", - "tempfile", - "vcf2parquet-bin", - "vcf2parquet-lib", -] - -[[package]] -name = "vcf2parquet-bin" -version = "0.6.0" -dependencies = [ - "arrow2 0.18.0", "clap", - "niffler", - "simplelog", - "thiserror", - "vcf2parquet-lib", -] - -[[package]] -name = "vcf2parquet-lib" -version = "0.6.0" -dependencies = [ - "arrow2 0.18.0", "lazy_static", "log", "niffler", "noodles", + "parquet", "rayon", "rustc-hash", "tempfile", "thiserror", ] -[[package]] -name = "vcf2parquet-py" -version = "0.6.0" -dependencies = [ - "arrow2 0.17.4", - "niffler", - "pyo3", - "tempfile", - "thiserror", - "vcf2parquet-lib", -] - [[package]] name = "version_check" version = "0.9.4" @@ -1505,58 +1434,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.6" +name = "windows-core" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "winapi", + "windows-targets", ] -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] @@ -1565,93 +1457,51 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_aarch64_msvc" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_i686_msvc" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "windows_x86_64_msvc" version = "0.52.4" @@ -1693,7 +1543,16 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" dependencies = [ - "zstd-safe", + "zstd-safe 6.0.6", +] + +[[package]] +name = "zstd" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" +dependencies = [ + "zstd-safe 7.1.0", ] [[package]] @@ -1706,11 +1565,20 @@ dependencies = [ "zstd-sys", ] +[[package]] +name = "zstd-safe" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" +dependencies = [ + "zstd-sys", +] + [[package]] name = "zstd-sys" -version = "2.0.9+zstd.1.5.5" +version = "2.0.10+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" +checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index f6de8ad..9d440e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,32 +13,31 @@ keywords = ["bioinformatics", "parquet"] [dependencies] -vcf2parquet-lib = { path = "vcf2parquet-lib", version = "0.6.0", optional = true } -vcf2parquet-bin = { path = "vcf2parquet-bin", version = "0.6.0", optional = true } +# parallel +rayon = { version = "1" } -[dev-dependencies] -tempfile = { version = "3" } -assert_cmd = { version = "2" } - - -[workspace] -members = ["vcf2parquet-lib", "vcf2parquet-bin", "vcf2parquet-py"] +# input output management +niffler = { version = "2" } +noodles = { version = "0.64", features = ["vcf"] } +arrow = { version = "51", default-features = false } +parquet = { version = "51" } +rustc-hash = { version = "1" } +# logging management +log = { version = "0.4" } -[features] -default = ["lib"] +# error management +thiserror = { version = "1" } -lib = ["vcf2parquet-lib"] -bin = ["vcf2parquet-bin"] +# cli management +clap = { version = "4", features = ["derive"] } -[[bin]] -name = "vcf2parquet" -required-features = ["bin"] +[dev-dependencies] +lazy_static = { version = "1" } +tempfile = { version = "3" } +assert_cmd = { version = "2" } -[[test]] -name = "functional" -required-features = ["bin"] [package.metadata.docs.rs] all-features = true diff --git a/vcf2parquet-py/Cargo.toml b/python/Cargo.toml similarity index 50% rename from vcf2parquet-py/Cargo.toml rename to python/Cargo.toml index ffa4b6b..2c26e88 100644 --- a/vcf2parquet-py/Cargo.toml +++ b/python/Cargo.toml @@ -9,9 +9,10 @@ name = "pyvcf2parquet" crate-type = ["cdylib"] [dependencies] -vcf2parquet-lib = { version = "0.6.0", path = "../vcf2parquet-lib" } -thiserror = "1" +vcf2parquet = { version = "0.6", path = "../../vcf2parquet" } +thiserror = { version = "1" } niffler = { version = "2" } -arrow2 = { version = "0.17", features = ["io_parquet", "io_parquet_compression"] } -tempfile = "3.10.0" +arrow = { version = "51", default-features = false } +parquet = { version = "51" } +tempfile = { version = "3.10.0" } pyo3 = { version = "0.20", features = ["extension-module"] } diff --git a/vcf2parquet-py/Readme.md b/python/Readme.md similarity index 100% rename from vcf2parquet-py/Readme.md rename to python/Readme.md diff --git a/vcf2parquet-py/pyproject.toml b/python/pyproject.toml similarity index 100% rename from vcf2parquet-py/pyproject.toml rename to python/pyproject.toml diff --git a/vcf2parquet-py/src/error.rs b/python/src/error.rs similarity index 98% rename from vcf2parquet-py/src/error.rs rename to python/src/error.rs index 621da37..c5312fc 100644 --- a/vcf2parquet-py/src/error.rs +++ b/python/src/error.rs @@ -11,7 +11,7 @@ use pyo3::{ }, }; use thiserror::Error; -use vcf2parquet_lib::error::Error as Vcf2ParquetError; +use vcf2parquet::error::Error as Vcf2ParquetError; #[derive(Error)] pub enum PyVcf2ParquetErr { diff --git a/vcf2parquet-py/src/lib.rs b/python/src/lib.rs similarity index 56% rename from vcf2parquet-py/src/lib.rs rename to python/src/lib.rs index b1fd80b..892a02f 100644 --- a/vcf2parquet-py/src/lib.rs +++ b/python/src/lib.rs @@ -1,7 +1,7 @@ //! vcf2parquet python binding /* std use */ -use vcf2parquet_lib as lib; +use vcf2parquet as lib; /* crate use */ use pyo3::prelude::*; @@ -24,8 +24,15 @@ pub enum Compression { Zstd, } +#[pyclass] +#[derive(Debug, Clone, Copy)] +pub enum ParquetVersion { + V2_0, + V1_0, +} + #[pyfunction] -#[pyo3(signature = (input,output,read_buffer=8192,batch_size=100_000,compression=Compression::Snappy,info_optional=false))] +#[pyo3(signature = (input,output,read_buffer=8192,batch_size=100_000,compression=Compression::Snappy,info_optional=false,parquet_version=ParquetVersion::V2_0))] fn convert_vcf( input: std::path::PathBuf, output: std::path::PathBuf, @@ -33,6 +40,7 @@ fn convert_vcf( batch_size: usize, compression: Compression, info_optional: bool, + parquet_version: ParquetVersion, ) -> PyResult<()> { let mut reader = std::fs::File::open(input) .map(Box::new) @@ -43,13 +51,24 @@ fn convert_vcf( let mut output = std::fs::File::create(output)?; let compression = match compression { - Compression::Uncompressed => arrow2::io::parquet::write::CompressionOptions::Uncompressed, - Compression::Snappy => arrow2::io::parquet::write::CompressionOptions::Snappy, - Compression::Gzip => arrow2::io::parquet::write::CompressionOptions::Gzip(None), - Compression::Lzo => arrow2::io::parquet::write::CompressionOptions::Lzo, - Compression::Brotli => arrow2::io::parquet::write::CompressionOptions::Brotli(None), - Compression::Lz4 => arrow2::io::parquet::write::CompressionOptions::Lz4, - Compression::Zstd => arrow2::io::parquet::write::CompressionOptions::Zstd(None), + Compression::Uncompressed => parquet::basic::Compression::UNCOMPRESSED, + Compression::Snappy => parquet::basic::Compression::SNAPPY, + Compression::Gzip => { + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::default()) + } + Compression::Lzo => parquet::basic::Compression::LZO, + Compression::Brotli => { + parquet::basic::Compression::BROTLI(parquet::basic::BrotliLevel::default()) + } + Compression::Lz4 => parquet::basic::Compression::LZ4, + Compression::Zstd => { + parquet::basic::Compression::ZSTD(parquet::basic::ZstdLevel::default()) + } + }; + + let parquet_version = match parquet_version { + ParquetVersion::V2_0 => parquet::file::properties::WriterVersion::PARQUET_2_0, + ParquetVersion::V1_0 => parquet::file::properties::WriterVersion::PARQUET_1_0, }; lib::vcf2parquet( @@ -58,6 +77,7 @@ fn convert_vcf( batch_size, compression, info_optional, + parquet_version, ) .map_err(PyVcf2ParquetErr::from) .map_err(PyErr::from) diff --git a/vcf2parquet-py/tests/test_vcf2parquet.py b/python/tests/test_vcf2parquet.py similarity index 100% rename from vcf2parquet-py/tests/test_vcf2parquet.py rename to python/tests/test_vcf2parquet.py diff --git a/vcf2parquet-bin/src/lib.rs b/src/bin/vcf2parquet.rs similarity index 68% rename from vcf2parquet-bin/src/lib.rs rename to src/bin/vcf2parquet.rs index 8d49bf8..4cbed1b 100644 --- a/vcf2parquet-bin/src/lib.rs +++ b/src/bin/vcf2parquet.rs @@ -6,13 +6,12 @@ use clap::Parser as _; /* project use */ -use vcf2parquet_lib as lib; +use vcf2parquet::cli; +use vcf2parquet::error; /* mod section */ -pub mod cli; -pub mod error; -pub fn main() -> error::Result<()> { +fn main() -> error::Result<()> { let params = cli::Command::parse(); match params.subcommand() { @@ -23,42 +22,38 @@ pub fn main() -> error::Result<()> { fn convert(params: &cli::Command, subparams: &cli::Convert) -> error::Result<()> { let mut reader = std::fs::File::open(params.input()) - .map_err(error::mapping) .map(Box::new) - .map(|x| niffler::get_reader(x)) - .map_err(error::mapping)? + .map(|x| niffler::get_reader(x))? .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; - let mut output = std::fs::File::create(subparams.output()).map_err(error::mapping)?; + let mut output = std::fs::File::create(subparams.output())?; - lib::vcf2parquet( + vcf2parquet::vcf2parquet( &mut reader, &mut output, params.batch_size(), params.compression(), params.info_optional(), - ) - .map_err(error::mapping)?; + params.parquet_version(), + )?; Ok(()) } fn split(params: &cli::Command, subparams: &cli::Split) -> error::Result<()> { let mut reader = std::fs::File::open(params.input()) - .map_err(error::mapping) .map(Box::new) - .map(|x| niffler::get_reader(x)) - .map_err(error::mapping)? + .map(|x| niffler::get_reader(x))? .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; - lib::vcf2multiparquet( + vcf2parquet::vcf2multiparquet( &mut reader, subparams.format(), params.batch_size(), params.compression(), params.info_optional(), - ) - .map_err(error::mapping)?; + params.parquet_version(), + )?; Ok(()) } diff --git a/vcf2parquet-bin/src/cli.rs b/src/cli.rs similarity index 72% rename from vcf2parquet-bin/src/cli.rs rename to src/cli.rs index bbb15e2..8d1fd93 100644 --- a/vcf2parquet-bin/src/cli.rs +++ b/src/cli.rs @@ -3,20 +3,45 @@ /* std use */ /* crate use */ +use parquet::file::properties::WriterVersion; /* project use */ +/// Parquet version available for user +#[derive(Debug, clap::ValueEnum, Clone, Copy)] +pub enum ParquetVersion { + ///Parquet version 1 + V1, + ///Parquet version 2 + V2, +} + +/// Compression available for user #[derive(Debug, clap::ValueEnum, Clone, Copy)] pub enum Compression { + /// No compression Uncompressed, + + /// Snappy compression Snappy, + + /// Gzip compression Gzip, + + /// Lzo compression Lzo, + + /// Brotly compression Brotli, + + /// Lz4 compression Lz4, + + /// Zstd compression Zstd, } +/// Define cli of vcf2parquet #[derive(clap::Parser, std::fmt::Debug)] #[command( name = "vcf2parquet", @@ -45,13 +70,20 @@ pub struct Command { #[clap(short = 'I', long = "info-optional")] info_optional: bool, + #[clap(long = "parquet-version")] + parquet_version: Option, + #[clap(subcommand)] subcommand: SubCommand, } +/// Enum to manage sub command #[derive(clap::Parser, std::fmt::Debug, Clone)] pub enum SubCommand { + /// Convert a vcf in a parquet Convert(Convert), + + /// Convert a vcf in multiple parquet file each file contains `batch_size` record Split(Split), } @@ -83,20 +115,31 @@ impl Command { } /// Get compression set by user or default value - pub fn compression(&self) -> arrow2::io::parquet::write::CompressionOptions { + pub fn compression(&self) -> parquet::basic::Compression { match self.compression { - Some(Compression::Uncompressed) => { - arrow2::io::parquet::write::CompressionOptions::Uncompressed + Some(Compression::Uncompressed) => parquet::basic::Compression::UNCOMPRESSED, + Some(Compression::Snappy) => parquet::basic::Compression::SNAPPY, + Some(Compression::Gzip) => { + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::default()) } - Some(Compression::Snappy) => arrow2::io::parquet::write::CompressionOptions::Snappy, - Some(Compression::Gzip) => arrow2::io::parquet::write::CompressionOptions::Gzip(None), - Some(Compression::Lzo) => arrow2::io::parquet::write::CompressionOptions::Lzo, + Some(Compression::Lzo) => parquet::basic::Compression::LZO, Some(Compression::Brotli) => { - arrow2::io::parquet::write::CompressionOptions::Brotli(None) + parquet::basic::Compression::BROTLI(parquet::basic::BrotliLevel::default()) } - Some(Compression::Lz4) => arrow2::io::parquet::write::CompressionOptions::Lz4, - Some(Compression::Zstd) => arrow2::io::parquet::write::CompressionOptions::Zstd(None), - None => arrow2::io::parquet::write::CompressionOptions::Snappy, + Some(Compression::Lz4) => parquet::basic::Compression::LZ4, + Some(Compression::Zstd) => { + parquet::basic::Compression::ZSTD(parquet::basic::ZstdLevel::default()) + } + None => parquet::basic::Compression::SNAPPY, + } + } + + /// Get parquet version + pub fn parquet_version(&self) -> WriterVersion { + match self.parquet_version { + Some(ParquetVersion::V1) => WriterVersion::PARQUET_1_0, + Some(ParquetVersion::V2) => WriterVersion::PARQUET_2_0, + None => WriterVersion::PARQUET_2_0, } } @@ -132,6 +175,7 @@ impl Split { #[cfg(test)] mod tests { + use super::*; #[test] @@ -145,6 +189,7 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; assert_eq!( @@ -172,10 +217,12 @@ mod tests { format: "test_{}.parquet".to_string(), }), info_optional: false, + parquet_version: Some(ParquetVersion::V1), }; assert_eq!(params.batch_size(), 100); assert_eq!(params.read_buffer(), 8194); + assert_eq!(params.parquet_version(), WriterVersion::PARQUET_1_0); match params.subcommand.clone() { SubCommand::Split(s) => assert_eq!(s.format(), "test_{}.parquet"), @@ -194,12 +241,11 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; - assert_eq!( - params.compression(), - arrow2::io::parquet::write::CompressionOptions::Snappy - ); + assert_eq!(params.compression(), parquet::basic::Compression::SNAPPY); + assert_eq!(params.parquet_version(), WriterVersion::PARQUET_2_0); params = Command { input: std::path::Path::new("test/input.vcf").to_path_buf(), @@ -210,11 +256,12 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; assert_eq!( params.compression(), - arrow2::io::parquet::write::CompressionOptions::Uncompressed + parquet::basic::Compression::UNCOMPRESSED ); params = Command { @@ -226,12 +273,10 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; - assert_eq!( - params.compression(), - arrow2::io::parquet::write::CompressionOptions::Snappy - ); + assert_eq!(params.compression(), parquet::basic::Compression::SNAPPY); params = Command { input: std::path::Path::new("test/input.vcf").to_path_buf(), @@ -242,11 +287,12 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; assert_eq!( params.compression(), - arrow2::io::parquet::write::CompressionOptions::Gzip(None) + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::default()) ); params = Command { @@ -258,12 +304,10 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; - assert_eq!( - params.compression(), - arrow2::io::parquet::write::CompressionOptions::Lzo - ); + assert_eq!(params.compression(), parquet::basic::Compression::LZO); params = Command { input: std::path::Path::new("test/input.vcf").to_path_buf(), @@ -274,11 +318,12 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; assert_eq!( params.compression(), - arrow2::io::parquet::write::CompressionOptions::Brotli(None) + parquet::basic::Compression::BROTLI(parquet::basic::BrotliLevel::default()) ); params = Command { @@ -290,11 +335,9 @@ mod tests { output: std::path::Path::new("test/output.parquet").to_path_buf(), }), info_optional: false, + parquet_version: None, }; - assert_eq!( - params.compression(), - arrow2::io::parquet::write::CompressionOptions::Lz4 - ); + assert_eq!(params.compression(), parquet::basic::Compression::LZ4); } } diff --git a/src/columndata.rs b/src/columndata.rs new file mode 100644 index 0000000..9004210 --- /dev/null +++ b/src/columndata.rs @@ -0,0 +1,220 @@ +//! Struct to link name and data + +/* std use */ + +/* crate use */ +use arrow::array::ArrayBuilder; + +/* project use */ + +/// Stores arrow array builders for each column datatype +#[derive(Debug)] +pub enum ColumnData { + /// Boolean column + Bool(arrow::array::BooleanBuilder), + /// Int32 column + Int(arrow::array::Int32Builder), + /// Float32 column + Float(arrow::array::Float32Builder), + /// String column + String(arrow::array::StringBuilder), + + /// List of int32 column + ListInt(arrow::array::ListBuilder), + /// List of float32 column + ListFloat(arrow::array::ListBuilder), + /// List of string column + ListString(arrow::array::ListBuilder), +} + +impl ColumnData { + /// Creates a new ColumnData based on arrow type, length and field name + pub fn new( + arrow_type: &arrow::datatypes::DataType, + length: usize, + field_name: &str, + nullable: bool, + ) -> Self { + match arrow_type { + arrow::datatypes::DataType::Boolean => { + ColumnData::Bool(arrow::array::BooleanBuilder::with_capacity(length)) + } + arrow::datatypes::DataType::Int32 => { + ColumnData::Int(arrow::array::Int32Builder::with_capacity(length)) + } + arrow::datatypes::DataType::Float32 => { + ColumnData::Float(arrow::array::Float32Builder::with_capacity(length)) + } + arrow::datatypes::DataType::Utf8 => ColumnData::String( + arrow::array::StringBuilder::with_capacity(length, length * 10), + ), + arrow::datatypes::DataType::List(field) => match field.data_type() { + arrow::datatypes::DataType::Int32 => ColumnData::ListInt( + arrow::array::ListBuilder::with_capacity( + arrow::array::Int32Builder::new(), + length, + ) + .with_field(arrow::datatypes::Field::new( + field_name, + arrow::datatypes::DataType::Int32, + nullable, + )), + ), + arrow::datatypes::DataType::Float32 => ColumnData::ListFloat( + arrow::array::ListBuilder::with_capacity( + arrow::array::Float32Builder::new(), + length, + ) + .with_field(arrow::datatypes::Field::new( + field_name, + arrow::datatypes::DataType::Float32, + nullable, + )), + ), + arrow::datatypes::DataType::Utf8 => ColumnData::ListString( + arrow::array::ListBuilder::with_capacity( + arrow::array::StringBuilder::new(), + length, + ) + .with_field(arrow::datatypes::Field::new( + field_name, + arrow::datatypes::DataType::Utf8, + nullable, + )), + ), + _ => todo!(), + }, + dt => unreachable!("Unsupported arrow type, please check Schema: {:?}", dt), + } + } + + /// Add a Null value in array + pub fn push_null(&mut self) { + match self { + ColumnData::Bool(a) => a.append_null(), + ColumnData::Int(a) => a.append_null(), + ColumnData::Float(a) => a.append_null(), + ColumnData::String(a) => a.append_null(), + + ColumnData::ListInt(a) => a.append_null(), + ColumnData::ListFloat(a) => a.append_null(), + ColumnData::ListString(a) => a.append_null(), + } + } + + /// Get the length of internal array + pub fn len(&self) -> usize { + match self { + ColumnData::Bool(a) => a.len(), + ColumnData::Int(a) => a.len(), + ColumnData::Float(a) => a.len(), + ColumnData::String(a) => a.len(), + + ColumnData::ListInt(a) => a.len(), + ColumnData::ListFloat(a) => a.len(), + ColumnData::ListString(a) => a.len(), + } + } + + /// Check if array is empty (not used for now) + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Add a boolean value in array, if it's not a boolean array failled + pub fn push_bool(&mut self, value: bool) { + match self { + ColumnData::Bool(a) => a.append_value(value), + _ => todo!(), + } + } + + /// Add a i32 value in array, if it's not a integer array failled + pub fn push_i32(&mut self, value: Option) { + match self { + ColumnData::Int(a) => a.append_option(value), + _ => todo!(), + } + } + + /// Add a f32 value in array, if it's not a float array failled + pub fn push_f32(&mut self, value: Option) { + match self { + ColumnData::Float(a) => a.append_option(value), + _ => todo!(), + } + } + + /// Add a string value in array, if it's not a string array failled + pub fn push_string(&mut self, value: String) { + match self { + ColumnData::String(a) => a.append_option(Some(value)), + _ => todo!(), + } + } + + /// Add a vector of integer value in array, if it's not a vector of integer array failled + pub fn push_veci32(&mut self, value: Vec>) -> arrow::error::Result<()> { + match self { + ColumnData::ListInt(a) => { + a.values().append_values( + &value + .iter() + .map(|v| v.unwrap_or_default()) + .collect::>(), + &value.iter().map(|v| v.is_some()).collect::>(), + ); + a.append(true); + Ok(()) + } + _ => todo!(), + } + } + + /// Add a vector of float value in array, if it's not a vector of float array failled + pub fn push_vecf32(&mut self, value: Vec>) -> arrow::error::Result<()> { + match self { + ColumnData::ListFloat(a) => { + a.values().append_values( + &value + .iter() + .map(|v| v.unwrap_or_default()) + .collect::>(), + &value.iter().map(|v| v.is_some()).collect::>(), + ); + a.append(true); + Ok(()) + } + _ => todo!(), + } + } + + /// Add a vector of string value in array, if it's not a vector of string array failled + pub fn push_vecstring(&mut self, value: Vec>) -> arrow::error::Result<()> { + match self { + ColumnData::ListString(a) => { + for v in value { + a.values().append_option(v); + } + a.append(true); + Ok(()) + } + _ => todo!(), + } + } + + /// Convert ColumnData in Arrow2 array + pub fn into_arc(self) -> std::sync::Arc { + let length = self.len(); + + match self { + ColumnData::Bool(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::Int(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::Float(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::String(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::ListInt(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::ListFloat(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + ColumnData::ListString(mut a) => arrow::array::Array::slice(&a.finish(), 0, length), + } + } +} diff --git a/vcf2parquet-lib/src/error.rs b/src/error.rs similarity index 74% rename from vcf2parquet-lib/src/error.rs rename to src/error.rs index 9182b43..face054 100644 --- a/vcf2parquet-lib/src/error.rs +++ b/src/error.rs @@ -6,6 +6,7 @@ /* project use */ +/// Error type #[non_exhaustive] #[derive(thiserror::Error, std::fmt::Debug)] pub enum Error { @@ -15,11 +16,11 @@ pub enum Error { /// Arrow error #[error(transparent)] - Arrow(#[from] arrow2::error::Error), + Arrow(#[from] arrow::error::ArrowError), /// Parquet error #[error(transparent)] - Parquet(#[from] arrow2::io::parquet::read::ParquetError), + Parquet(#[from] parquet::errors::ParquetError), /// Io error #[error(transparent)] @@ -28,6 +29,11 @@ pub enum Error { /// Noodles header vcf error #[error(transparent)] NoodlesHeader(#[from] noodles::vcf::header::ParseError), + + /// Niffler error + #[error(transparent)] + Niffler(#[from] niffler::Error), } +/// Result type pub type Result = std::result::Result; diff --git a/src/lib.rs b/src/lib.rs index c1ddeb6..d429857 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,312 @@ +//! vcf2parquet library + #![warn(missing_docs)] +/* std use */ + +/* crate use */ +use parquet::file::properties::WriterVersion; +/* project use */ + +/* mod section */ +pub mod cli; +pub mod columndata; +pub mod error; +pub mod name2data; +pub mod record2chunk; +pub mod schema; + +/// Read `input` vcf and write parquet in `output` +pub fn vcf2parquet( + input: &mut R, + output: &mut W, + batch_size: usize, + compression: parquet::basic::Compression, + info_optional: bool, + parquet_version: WriterVersion, +) -> error::Result<()> +where + R: std::io::BufRead, + W: std::io::Write + std::marker::Send, +{ + // VCF section + let mut reader = noodles::vcf::Reader::new(input); + + let vcf_header: noodles::vcf::Header = reader.read_header()?; + + // Parquet section + let schema = schema::from_header(&vcf_header, info_optional)?; + let schema_ptr = std::sync::Arc::new(schema); + + let mut iterator = reader.records(&vcf_header); + let chunk_iterator = record2chunk::Record2Chunk::new( + &mut iterator, + batch_size, + vcf_header.clone(), + schema_ptr.clone(), + ); + + let options = parquet::file::properties::WriterProperties::builder() + .set_compression(compression) + .set_statistics_enabled(parquet::file::properties::EnabledStatistics::Page) + .set_writer_version(parquet_version) + .set_write_batch_size(batch_size) + .build(); + + let row_groups = arrow::array::RecordBatchIterator::new(chunk_iterator, schema_ptr.clone()); + + let mut writer = + parquet::arrow::ArrowWriter::try_new(output, schema_ptr.clone(), Some(options))?; + + for result in row_groups { + let group = result?; + writer.write(&group)?; + } + let _ = writer.close()?; + + Ok(()) +} + +/// Read `input` vcf and write each row group in a parquet file match with template +pub fn vcf2multiparquet( + input: &mut R, + template: &str, + batch_size: usize, + compression: parquet::basic::Compression, + info_optional: bool, + parquet_version: WriterVersion, +) -> error::Result<()> +where + R: std::io::BufRead, +{ + // VCF section + let mut reader = noodles::vcf::Reader::new(input); + + let vcf_header: noodles::vcf::Header = reader.read_header()?; + + // Parquet section + let schema = schema::from_header(&vcf_header, info_optional)?; + let schema_ptr = std::sync::Arc::new(schema); + + let mut iterator = reader.records(&vcf_header); + let chunk_iterator = record2chunk::Record2Chunk::new( + &mut iterator, + batch_size, + vcf_header.clone(), + schema_ptr.clone(), + ); + + let options = parquet::file::properties::WriterProperties::builder() + .set_compression(compression) + .set_statistics_enabled(parquet::file::properties::EnabledStatistics::Page) + .set_writer_version(parquet_version) + .set_write_batch_size(batch_size) + .build(); + + let row_groups = arrow::array::RecordBatchIterator::new(chunk_iterator, schema_ptr.clone()); + + for (index, result) in row_groups.enumerate() { + let group = result?; + let output = std::fs::File::create(template.replace("{}", &index.to_string()))?; + let mut writer = parquet::arrow::ArrowWriter::try_new( + output, + schema_ptr.clone(), + Some(options.clone()), + )?; + + writer.write(&group)?; + writer.close()?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO +1\t925952\t1019397\tG\tA\t.\t.\t. +"; + + static PARQUET_FILE: &[u8] = &[ + 80, 65, 82, 49, 21, 4, 21, 10, 21, 50, 76, 21, 2, 21, 0, 18, 0, 0, 31, 139, 8, 0, 0, 0, 0, + 0, 0, 255, 99, 100, 96, 96, 48, 4, 0, 151, 222, 156, 170, 5, 0, 0, 0, 21, 6, 21, 4, 21, 44, + 92, 21, 2, 21, 0, 21, 2, 21, 16, 21, 0, 21, 0, 17, 28, 88, 1, 49, 24, 1, 49, 17, 17, 0, 0, + 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, 175, 2, 0, 0, 0, 21, 12, + 25, 53, 0, 6, 16, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 4, 22, + 2, 22, 106, 22, 186, 1, 38, 78, 38, 0, 28, 88, 1, 49, 24, 1, 49, 17, 17, 0, 0, 21, 4, 21, + 8, 21, 48, 76, 21, 2, 21, 0, 18, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 80, 228, 99, + 0, 0, 69, 222, 72, 134, 4, 0, 0, 0, 21, 6, 21, 4, 21, 44, 92, 21, 2, 21, 0, 21, 2, 21, 16, + 21, 0, 21, 0, 17, 28, 88, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 17, 17, 0, 0, 0, 31, 139, + 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, 175, 2, 0, 0, 0, 21, 2, 25, 53, 0, + 6, 16, 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 116, 22, 196, + 1, 38, 76, 38, 0, 28, 24, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 56, 4, 0, 33, 14, 0, 24, 4, + 0, 33, 14, 0, 17, 17, 0, 0, 21, 4, 21, 22, 21, 62, 76, 21, 2, 21, 0, 18, 0, 0, 31, 139, 8, + 0, 0, 0, 0, 0, 0, 255, 99, 103, 96, 96, 48, 52, 48, 180, 52, 182, 52, 7, 0, 69, 88, 164, + 201, 11, 0, 0, 0, 21, 6, 21, 12, 21, 52, 92, 21, 2, 21, 0, 21, 2, 21, 16, 21, 4, 21, 4, 17, + 28, 88, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 17, 17, 0, 0, 0, + 2, 0, 2, 1, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, 175, 2, 0, 0, + 0, 21, 12, 25, 53, 0, 6, 16, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, + 4, 108, 105, 115, 116, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 4, 22, 2, + 22, 150, 1, 22, 230, 1, 38, 90, 38, 0, 28, 88, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, + 48, 49, 57, 51, 57, 55, 17, 17, 0, 0, 21, 4, 21, 10, 21, 50, 76, 21, 2, 21, 0, 18, 0, 0, + 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 7, 0, 158, 10, 250, 19, 5, 0, 0, + 0, 21, 6, 21, 4, 21, 44, 92, 21, 2, 21, 0, 21, 2, 21, 16, 21, 0, 21, 0, 17, 28, 88, 1, 71, + 24, 1, 71, 17, 17, 0, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, + 175, 2, 0, 0, 0, 21, 12, 25, 53, 0, 6, 16, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, + 99, 101, 21, 4, 22, 2, 22, 106, 22, 186, 1, 38, 78, 38, 0, 28, 88, 1, 71, 24, 1, 71, 17, + 17, 0, 0, 21, 4, 21, 10, 21, 50, 76, 21, 2, 21, 0, 18, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, + 255, 99, 100, 96, 96, 112, 4, 0, 171, 175, 153, 250, 5, 0, 0, 0, 21, 6, 21, 4, 21, 44, 92, + 21, 2, 21, 0, 21, 2, 21, 16, 21, 0, 21, 0, 17, 28, 88, 1, 65, 24, 1, 65, 17, 17, 0, 0, 0, + 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 96, 2, 0, 211, 115, 215, 175, 2, 0, 0, 0, 21, 12, + 25, 53, 0, 6, 16, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, 2, 22, + 106, 22, 186, 1, 38, 78, 38, 0, 28, 88, 1, 65, 24, 1, 65, 17, 17, 0, 0, 21, 4, 21, 0, 21, + 40, 76, 21, 0, 21, 0, 18, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 21, 6, 21, 6, 21, 46, 92, 21, 2, 21, 2, 21, 2, 21, 16, 21, 4, 21, 0, 17, 0, 0, 2, 0, + 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 0, 0, 141, 239, 2, 210, 1, 0, 0, 0, 21, 8, 25, 53, + 0, 6, 16, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 78, 22, 158, 1, + 38, 68, 38, 0, 28, 54, 2, 0, 0, 21, 4, 21, 0, 21, 40, 76, 21, 0, 21, 0, 18, 0, 0, 31, 139, + 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 6, 21, 10, 21, 50, 92, 21, 2, + 21, 2, 21, 2, 21, 16, 21, 4, 21, 4, 17, 0, 0, 2, 0, 2, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, + 255, 99, 0, 0, 141, 239, 2, 210, 1, 0, 0, 0, 21, 12, 25, 53, 0, 6, 16, 25, 56, 6, 102, 105, + 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 4, 22, 2, + 22, 82, 22, 162, 1, 38, 68, 38, 0, 28, 54, 2, 0, 0, 25, 17, 2, 25, 24, 1, 49, 25, 24, 1, + 49, 21, 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 4, 0, 33, 14, 0, 25, 24, 4, 0, 33, 14, 0, 21, + 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 7, 49, 48, 49, 57, 51, 57, 55, 25, 24, 7, 49, 48, 49, + 57, 51, 57, 55, 21, 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 71, 25, 24, 1, 71, 21, 2, 25, + 22, 0, 0, 25, 17, 2, 25, 24, 1, 65, 25, 24, 1, 65, 21, 2, 25, 22, 0, 0, 25, 17, 1, 25, 24, + 1, 0, 25, 24, 1, 0, 21, 2, 25, 22, 2, 0, 25, 17, 1, 25, 24, 1, 0, 25, 24, 1, 0, 21, 2, 25, + 22, 2, 0, 25, 28, 22, 86, 21, 108, 22, 0, 0, 0, 25, 28, 22, 230, 2, 21, 120, 22, 0, 0, 0, + 25, 28, 22, 176, 5, 21, 140, 1, 22, 0, 0, 0, 25, 28, 22, 156, 8, 21, 108, 22, 0, 0, 0, 25, + 28, 22, 172, 10, 21, 108, 22, 0, 0, 0, 25, 28, 22, 178, 12, 21, 90, 22, 0, 0, 0, 25, 28, + 22, 150, 14, 21, 94, 22, 0, 0, 0, 21, 4, 25, 204, 72, 12, 97, 114, 114, 111, 119, 95, 115, + 99, 104, 101, 109, 97, 21, 14, 0, 21, 12, 37, 0, 24, 10, 99, 104, 114, 111, 109, 111, 115, + 111, 109, 101, 37, 0, 76, 28, 0, 0, 0, 21, 2, 37, 0, 24, 8, 112, 111, 115, 105, 116, 105, + 111, 110, 0, 53, 0, 24, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 2, 21, 6, + 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 10, 105, + 100, 101, 110, 116, 105, 102, 105, 101, 114, 37, 0, 76, 28, 0, 0, 0, 21, 12, 37, 0, 24, 9, + 114, 101, 102, 101, 114, 101, 110, 99, 101, 37, 0, 76, 28, 0, 0, 0, 21, 12, 37, 0, 24, 9, + 97, 108, 116, 101, 114, 110, 97, 116, 101, 37, 0, 76, 28, 0, 0, 0, 21, 8, 37, 2, 24, 7, + 113, 117, 97, 108, 105, 116, 121, 0, 53, 0, 24, 6, 102, 105, 108, 116, 101, 114, 21, 2, 21, + 6, 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 6, 102, + 105, 108, 116, 101, 114, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 28, 25, 124, 38, 194, 1, 28, + 21, 12, 25, 53, 0, 6, 16, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, + 4, 22, 2, 22, 106, 22, 186, 1, 38, 86, 38, 8, 28, 88, 1, 49, 24, 1, 49, 17, 17, 0, 0, 22, + 226, 17, 21, 20, 22, 208, 15, 21, 34, 0, 38, 222, 3, 28, 21, 2, 25, 53, 0, 6, 16, 25, 24, + 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 116, 22, 196, 1, 38, 230, 2, + 38, 154, 2, 28, 24, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 56, 4, 0, 33, 14, 0, 24, 4, 0, + 33, 14, 0, 17, 17, 0, 0, 22, 246, 17, 21, 22, 22, 242, 15, 21, 46, 0, 38, 188, 6, 28, 21, + 12, 25, 53, 0, 6, 16, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 4, 108, + 105, 115, 116, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 4, 22, 2, 22, 150, + 1, 22, 230, 1, 38, 176, 5, 38, 214, 4, 28, 88, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, + 48, 49, 57, 51, 57, 55, 17, 17, 0, 0, 22, 140, 18, 21, 24, 22, 160, 16, 21, 58, 0, 38, 136, + 9, 28, 21, 12, 25, 53, 0, 6, 16, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, + 4, 22, 2, 22, 106, 22, 186, 1, 38, 156, 8, 38, 206, 7, 28, 88, 1, 71, 24, 1, 71, 17, 17, 0, + 0, 22, 164, 18, 21, 22, 22, 218, 16, 21, 34, 0, 38, 152, 11, 28, 21, 12, 25, 53, 0, 6, 16, + 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, 2, 22, 106, 22, 186, 1, + 38, 172, 10, 38, 222, 9, 28, 88, 1, 65, 24, 1, 65, 17, 17, 0, 0, 22, 186, 18, 21, 22, 22, + 252, 16, 21, 34, 0, 38, 140, 13, 28, 21, 8, 25, 53, 0, 6, 16, 25, 24, 7, 113, 117, 97, 108, + 105, 116, 121, 21, 4, 22, 2, 22, 78, 22, 158, 1, 38, 178, 12, 38, 238, 11, 28, 54, 2, 0, 0, + 22, 208, 18, 21, 22, 22, 158, 17, 21, 34, 0, 38, 244, 14, 28, 21, 12, 25, 53, 0, 6, 16, 25, + 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, + 114, 21, 4, 22, 2, 22, 82, 22, 162, 1, 38, 150, 14, 38, 210, 13, 28, 54, 2, 0, 0, 22, 230, + 18, 21, 22, 22, 192, 17, 21, 34, 0, 22, 232, 5, 22, 2, 38, 8, 22, 152, 10, 20, 0, 0, 25, + 28, 24, 12, 65, 82, 82, 79, 87, 58, 115, 99, 104, 101, 109, 97, 24, 208, 5, 47, 47, 47, 47, + 47, 120, 81, 67, 65, 65, 65, 81, 65, 65, 65, 65, 65, 65, 65, 75, 65, 65, 119, 65, 67, 103, + 65, 74, 65, 65, 81, 65, 67, 103, 65, 65, 65, 66, 65, 65, 65, 65, 65, 65, 65, 81, 81, 65, + 67, 65, 65, 73, 65, 65, 65, 65, 66, 65, 65, 73, 65, 65, 65, 65, 66, 65, 65, 65, 65, 65, 99, + 65, 65, 65, 67, 48, 65, 81, 65, 65, 90, 65, 69, 65, 65, 65, 81, 66, 65, 65, 68, 85, 65, 65, + 65, 65, 112, 65, 65, 65, 65, 71, 119, 65, 65, 65, 65, 69, 65, 65, 65, 65, 101, 80, 55, 47, + 47, 120, 103, 65, 65, 65, 65, 77, 65, 65, 65, 65, 65, 65, 65, 65, 68, 68, 103, 65, 65, 65, + 65, 66, 65, 65, 65, 65, 67, 65, 65, 65, 65, 71, 122, 43, 47, 47, 43, 89, 47, 118, 47, 47, + 70, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 65, 65, 70, 68, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 67, 73, 47, 118, 47, 47, 66, 103, 65, 65, 65, 71, 90, 112, 98, 72, 82, 108, + 99, 103, 65, 65, 66, 103, 65, 65, 65, 71, 90, 112, 98, 72, 82, 108, 99, 103, 65, 65, 69, + 65, 65, 87, 65, 66, 65, 65, 68, 103, 65, 80, 65, 65, 81, 65, 65, 65, 65, 73, 65, 66, 65, + 65, 65, 65, 65, 89, 65, 65, 65, 65, 72, 65, 65, 65, 65, 65, 65, 65, 65, 81, 77, 89, 65, 65, + 65, 65, 65, 65, 65, 71, 65, 65, 103, 65, 66, 103, 65, 71, 65, 65, 65, 65, 65, 65, 65, 66, + 65, 65, 65, 65, 65, 65, 65, 72, 65, 65, 65, 65, 99, 88, 86, 104, 98, 71, 108, 48, 101, 81, + 65, 81, 47, 47, 47, 47, 70, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 65, 65, 70, + 68, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 47, 47, 47, 47, 67, 81, 65, 65, 65, 71, 70, + 115, 100, 71, 86, 121, 98, 109, 70, 48, 90, 81, 65, 65, 65, 68, 122, 47, 47, 47, 56, 85, + 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 65, 65, 85, 77, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 67, 122, 47, 47, 47, 56, 74, 65, 65, 65, 65, 99, 109, 86, 109, 90, 88, 74, 108, 98, + 109, 78, 108, 65, 65, 65, 65, 97, 80, 47, 47, 47, 120, 103, 65, 65, 65, 65, 77, 65, 65, 65, + 65, 65, 65, 65, 65, 68, 68, 119, 65, 65, 65, 65, 66, 65, 65, 65, 65, 67, 65, 65, 65, 65, + 70, 122, 47, 47, 47, 43, 73, 47, 47, 47, 47, 70, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, + 65, 65, 65, 65, 70, 68, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 52, 47, 47, 47, 47, 67, + 103, 65, 65, 65, 71, 108, 107, 90, 87, 53, 48, 97, 87, 90, 112, 90, 88, 73, 65, 65, 65, + 111, 65, 65, 65, 66, 112, 90, 71, 86, 117, 100, 71, 108, 109, 97, 87, 86, 121, 65, 65, 68, + 69, 47, 47, 47, 47, 71, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 65, 65, 65, 65, 67, 72, 65, + 65, 65, 65, 65, 103, 65, 68, 65, 65, 69, 65, 65, 115, 65, 67, 65, 65, 65, 65, 67, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 66, 65, 65, 65, 65, 65, 65, 103, 65, 65, 65, 66, 119, 98, 51, + 78, 112, 100, 71, 108, 118, 98, 103, 65, 65, 65, 65, 65, 81, 65, 66, 81, 65, 69, 65, 65, + 65, 65, 65, 56, 65, 66, 65, 65, 65, 65, 65, 103, 65, 69, 65, 65, 65, 65, 66, 103, 65, 65, + 65, 65, 77, 65, 65, 65, 65, 65, 65, 65, 65, 66, 82, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 66, 65, 65, 69, 65, 65, 81, 65, 65, 65, 65, 75, 65, 65, 65, 65, 89, 50, 104, 121, 98, 50, + 49, 118, 99, 50, 57, 116, 90, 81, 65, 65, 0, 24, 25, 112, 97, 114, 113, 117, 101, 116, 45, + 114, 115, 32, 118, 101, 114, 115, 105, 111, 110, 32, 53, 49, 46, 48, 46, 48, 25, 124, 28, + 0, 0, 28, 0, 0, 28, 0, 0, 28, 0, 0, 28, 0, 0, 28, 0, 0, 28, 0, 0, 0, 214, 5, 0, 0, 80, 65, + 82, 49, + ]; + + #[test] + fn convert_positives() { + let mut input = std::io::BufReader::new(&*VCF_FILE); + let mut output = Vec::new(); + + vcf2parquet( + &mut input, + &mut output, + 1, + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::try_new(6).unwrap()), + false, + WriterVersion::PARQUET_2_0, + ) + .unwrap(); + assert_eq!(output, *PARQUET_FILE); + } + + #[test] + fn not_a_vcf() { + let raw_data = [b'#', b'a', b'b', b'c', 255, 0x7F, b'\n'].to_vec(); + let mut input = std::io::BufReader::new(&raw_data[..]); + let mut output = Vec::new(); + + let result = vcf2parquet( + &mut input, + &mut output, + 1, + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::try_new(6).unwrap()), + false, + WriterVersion::PARQUET_2_0, + ); + + assert!(result.is_err()); + } + + #[test] + fn multi_positives() { + let mut input = std::io::BufReader::new(&*VCF_FILE); + let dir = tempfile::tempdir().unwrap(); -//! vcf2parquet allow user to convert a vcf in parquet format. + let format = dir + .path() + .join("test_{}.parquet") + .as_os_str() + .to_str() + .unwrap() + .to_string(); -pub use vcf2parquet_lib::*; + vcf2multiparquet( + &mut input, + &format, + 1, + parquet::basic::Compression::GZIP(parquet::basic::GzipLevel::try_new(6).unwrap()), + false, + WriterVersion::PARQUET_2_0, + ) + .unwrap(); + } +} diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 5d6b6e8..0000000 --- a/src/main.rs +++ /dev/null @@ -1,6 +0,0 @@ -#[cfg(feature = "bin")] -pub use vcf2parquet_bin::{error, main as bin_main}; - -pub fn main() -> error::Result<()> { - bin_main() -} diff --git a/src/name2data.rs b/src/name2data.rs new file mode 100644 index 0000000..f997f82 --- /dev/null +++ b/src/name2data.rs @@ -0,0 +1,1021 @@ +//! Struct to link name and data + +/* std use */ + +/* crate use */ +use arrow::datatypes::Field; +use noodles::vcf::record::genotypes::sample::value::genotype::allele::Phasing; + +/* project use */ +use crate::columndata::ColumnData; + +///Alias of [std::collections::HashMap] that associate a column name and [ColumnData], a proxy of arrow2 datastructure +#[derive(Debug)] +pub struct Name2Data(rustc_hash::FxHashMap); + +impl Name2Data { + /// Create a new Name2Data, vcf header is required to add info and genotype column + /// length parameter is used to preallocate memory + pub fn new(length: usize, schema: &arrow::datatypes::Schema) -> Self { + let mut name2data = rustc_hash::FxHashMap::default(); + for field in schema.fields.iter() { + let nullable = match field.data_type() { + arrow::datatypes::DataType::List(a) => a.is_nullable(), + _ => field.is_nullable(), + }; + + let column = ColumnData::new(field.data_type(), length, field.name(), nullable); + name2data.insert(field.name().to_string(), column); + } + Name2Data(name2data) + } + + /// Just a wrapper arround [std::collections::HashMap::get] + pub fn get(&self, key: &str) -> Option<&ColumnData> { + self.0.get(key) + } + + /// Just a wrapper arround [std::collections::HashMap::get_mut] + pub fn get_mut(&mut self, key: &str) -> Option<&mut ColumnData> { + self.0.get_mut(key) + } + + /// Add a vcf record in [std::collections::HashMap] struct + pub fn add_record( + &mut self, + record: noodles::vcf::Record, + header: &noodles::vcf::Header, + schema: &rustc_hash::FxHashMap, + ) -> std::result::Result<(), arrow::error::ArrowError> { + let allele_count = record.alternate_bases().len() + 1; + for (alt_id, allele) in record.alternate_bases().iter().enumerate() { + for (key, column) in self.0.iter_mut() { + match key.as_str() { + "chromosome" => column.push_string(record.chromosome().to_string()), + "position" => column.push_i32(Some(usize::from(record.position()) as i32)), + "identifier" => column.push_vecstring( + record.ids().iter().map(|s| Some(s.to_string())).collect(), + )?, + "reference" => column.push_string(record.reference_bases().to_string()), + "alternate" => column.push_string(allele.to_string()), + "quality" => column.push_f32(record.quality_score().map(|v| v.into())), + "filter" => column.push_vecstring( + record + .filters() + .iter() + .map(|s| Some(s.to_string())) + .collect(), + )?, + _ => {} + } + } + self.add_info(&record, header, schema, alt_id, allele_count)?; + self.add_format(&record, header, schema, alt_id, allele_count)?; + } + Ok(()) + } + + fn add_info( + &mut self, + record: &noodles::vcf::Record, + header: &noodles::vcf::Header, + schema: &rustc_hash::FxHashMap, + alt_id: usize, + allele_count: usize, + ) -> std::result::Result<(), arrow::error::ArrowError> { + let info = record.info(); + + for key in header.infos().keys() { + let key_name = format!("info_{}", key); + let info_def = header.infos().get(key).unwrap(); + if let Some(column) = self.0.get_mut(&key_name) { + match info.get(key).flatten() { + Some(value) => match value { + noodles::vcf::record::info::field::Value::Flag => { + column.push_bool(true); + } + noodles::vcf::record::info::field::Value::Integer(value) => { + column.push_i32(Some(*value)); + } + noodles::vcf::record::info::field::Value::Float(value) => { + column.push_f32(Some(*value)); + } + noodles::vcf::record::info::field::Value::String(value) => { + column.push_string(value.to_string()); + } + noodles::vcf::record::info::field::Value::Character(value) => { + column.push_string(value.to_string()); + } + noodles::vcf::record::info::field::Value::Array(arr) => match arr.clone() { + noodles::vcf::record::info::field::value::Array::Integer(array_val) => { + match info_def.number() { + noodles::vcf::header::Number::Count(0 | 1) => { + unreachable!( + "Field {} declared as single value but found array", + key + ) + } + noodles::vcf::header::Number::Count(_) => { + column.push_veci32(array_val)?; + } + noodles::vcf::header::Number::A => { + column.push_i32(*array_val.get(alt_id).unwrap_or(&None)); + } + noodles::vcf::header::Number::R => { + column.push_veci32(vec![ + *array_val.first().unwrap_or(&None), + *array_val.get(alt_id + 1).unwrap_or(&None), + ])?; + } + noodles::vcf::header::Number::G => { + if array_val.len() + == (allele_count * (allele_count + 1) / 2) + { + column.push_veci32(vec![ + *array_val.first().unwrap_or(&None), + *array_val + .get((alt_id * alt_id + 3 * alt_id + 2) / 2) + .unwrap_or(&None), + *array_val + .get((alt_id * alt_id + 5 * alt_id + 4) / 2) + .unwrap_or(&None), + ])?; + } else if array_val.len() == allele_count { + column.push_veci32(vec![ + *array_val.first().unwrap_or(&None), + Some(0), + *array_val.get(alt_id + 1).unwrap_or(&None), + ])?; + } else { + column.push_null(); + } + } + noodles::vcf::header::Number::Unknown => { + column.push_veci32(array_val)?; + } + } + } + noodles::vcf::record::info::field::value::Array::Float(array_val) => { + match info_def.number() { + noodles::vcf::header::Number::Count(0 | 1) => { + unreachable!( + "Field {} declared as single value but found array", + key + ) + } + noodles::vcf::header::Number::Count(_) => { + column.push_vecf32(array_val)?; + } + noodles::vcf::header::Number::A => { + column.push_f32(*array_val.get(alt_id).unwrap_or(&None)); + } + noodles::vcf::header::Number::R => { + column.push_vecf32(vec![ + *array_val.first().unwrap_or(&None), + *array_val.get(alt_id + 1).unwrap_or(&None), + ])?; + } + noodles::vcf::header::Number::G => { + if array_val.len() + == (allele_count * (allele_count + 1) / 2) + { + column.push_vecf32(vec![ + *array_val.first().unwrap_or(&None), + *array_val + .get((alt_id * alt_id + 3 * alt_id + 2) / 2) + .unwrap_or(&None), + *array_val + .get((alt_id * alt_id + 5 * alt_id + 4) / 2) + .unwrap_or(&None), + ])?; + } else if array_val.len() == allele_count { + column.push_vecf32(vec![ + *array_val.first().unwrap_or(&None), + Some(0.), + *array_val.get(alt_id + 1).unwrap_or(&None), + ])?; + } else { + column.push_null(); + } + } + noodles::vcf::header::Number::Unknown => { + column.push_vecf32(array_val)?; + } + } + } + noodles::vcf::record::info::field::value::Array::String(array_val) => { + match info_def.number() { + noodles::vcf::header::Number::Count(0 | 1) => { + unreachable!( + "Field {} declared as single value but found array", + key_name + ) + } + noodles::vcf::header::Number::Count(_) => { + column.push_vecstring(array_val)?; + } + noodles::vcf::header::Number::A => { + column.push_string( + array_val + .get(alt_id) + .unwrap() + .clone() + .unwrap_or_default(), + ); + } + noodles::vcf::header::Number::R => { + column.push_vecstring(vec![ + Some( + array_val + .first() + .unwrap() + .clone() + .unwrap_or_default(), + ), + Some( + array_val + .get(alt_id + 1) + .unwrap() + .clone() + .unwrap_or_default(), + ), + ])?; + } + noodles::vcf::header::Number::G => { + if array_val.len() + == (allele_count * (allele_count + 1) / 2) + { + column.push_vecstring(vec![ + Some( + array_val + .first() + .unwrap() + .clone() + .unwrap_or_default(), + ), + Some( + array_val + .get((alt_id * alt_id + 3 * alt_id + 2) / 2) + .unwrap() + .clone() + .unwrap_or_default(), + ), + Some( + array_val + .get((alt_id * alt_id + 5 * alt_id + 4) / 2) + .unwrap() + .clone() + .unwrap_or_default(), + ), + ])?; + } else if array_val.len() == allele_count { + column.push_vecstring(vec![ + array_val.first().unwrap().clone(), + Some(".".to_string()), + array_val.get(alt_id + 1).unwrap().clone(), + ])?; + } else { + column.push_null(); + } + } + noodles::vcf::header::Number::Unknown => { + column.push_vecstring(array_val)?; + } + } + } + noodles::vcf::record::info::field::value::Array::Character( + array_val, + ) => match info_def.number() { + noodles::vcf::header::Number::Count(0 | 1) => { + unreachable!( + "Field {} declared as single value but found array", + key_name + ) + } + noodles::vcf::header::Number::Count(_) => { + column.push_vecstring( + array_val + .iter() + .map(|s| s.as_ref().map(|s| s.to_string())) + .collect::>>(), + )?; + } + noodles::vcf::header::Number::A => { + column.push_string( + (*array_val.get(alt_id).unwrap()).unwrap().to_string(), + ); + } + noodles::vcf::header::Number::R => { + column.push_vecstring(vec![ + Some(array_val.first().unwrap().unwrap().to_string()), + Some( + array_val.get(alt_id + 1).unwrap().unwrap().to_string(), + ), + ])?; + } + noodles::vcf::header::Number::G => { + if array_val.len() == (allele_count * (allele_count + 1) / 2) { + column.push_vecstring(vec![ + Some(array_val.first().unwrap().unwrap().to_string()), + Some( + array_val + .get((alt_id * alt_id + 3 * alt_id + 2) / 2) + .unwrap() + .unwrap() + .to_string(), + ), + Some( + array_val + .get((alt_id * alt_id + 5 * alt_id + 4) / 2) + .unwrap() + .unwrap() + .to_string(), + ), + ])?; + } else if array_val.len() == allele_count { + column.push_vecstring(vec![ + Some(array_val.first().unwrap().unwrap().to_string()), + Some(".".to_string()), + Some( + array_val + .get(alt_id + 1) + .unwrap() + .unwrap() + .to_string(), + ), + ])?; + } else { + column.push_null(); + } + } + noodles::vcf::header::Number::Unknown => { + column.push_vecstring( + array_val + .iter() + .map(|s| s.as_ref().map(|s| s.to_string())) + .collect::>>(), + )?; + } + }, + }, + }, + None => { + if info_def.ty() + == noodles::vcf::header::record::value::map::info::Type::Flag + { + column.push_bool(false); + } else { + //Handle missing info field, only matters for FixedSizeList + if schema.get(&key_name).is_some() { + match column { + ColumnData::ListFloat(_) => { + column.push_vecf32(vec![])?; + } + ColumnData::ListInt(_) => { + column.push_veci32(vec![])?; + } + ColumnData::ListString(_) => { + column.push_vecstring(vec![])?; + } + _ => column.push_null(), //Otherwise, just push null + } + } else { + unreachable!("Malformed VCF, {} should be in schema", key_name); + } + } + } + } + } + } + Ok(()) + } + + fn add_format( + &mut self, + record: &noodles::vcf::Record, + header: &noodles::vcf::Header, + schema: &rustc_hash::FxHashMap, + alt_id: usize, + allele_count: usize, + ) -> std::result::Result<(), arrow::error::ArrowError> { + for key in header.formats().keys() { + for (idx, sample) in header.sample_names().iter().enumerate() { + let key_name = format!("format_{}_{}", sample, key); + let format_def = header.formats().get(key).unwrap(); + if let Some(column) = self.0.get_mut(&key_name) { + if let Some(format_field) = record.genotypes().get_index(idx) { + match format_field.get(key).flatten() { + Some(value) => match value { + noodles::vcf::record::genotypes::sample::Value::Integer(value) => column.push_i32(Some(*value)), + noodles::vcf::record::genotypes::sample::Value::Float(value) => column.push_f32(Some(*value)), + noodles::vcf::record::genotypes::sample::Value::String(value) => { + if key.to_string()=="GT" { + let mut gt_str = String::with_capacity(32); //Arbitrary capacity + if let Some(gt) = format_field.genotype().and_then(|g|g.ok()) + { + gt.iter().enumerate().for_each(|(i,allele)| { + let (position, phasing) = (allele.position(), allele.phasing()); + match position { + Some(a) if a == alt_id + 1 => { + gt_str.push('1'); + } + Some(0)=>{ + gt_str.push('0'); + } + Some(_) =>{ + gt_str.push('.'); + } + None=>{ + gt_str.push('.'); + } + } + if i < gt.len() - 1 { + gt_str.push(match phasing { + Phasing::Phased => '|', + Phasing::Unphased => '/', + }); + } + }); + } + else { + unreachable!("If GT is not present, the match arm won't take us there") + } + column.push_string(gt_str); + } else { + column.push_string(value.to_string()); + } + } + noodles::vcf::record::genotypes::sample::Value::Character( + value + ) => { + column.push_string(value.to_string()); + } + noodles::vcf::record::genotypes::sample::Value::Array(arr) + => match arr.clone() { + noodles::vcf::record::genotypes::sample::value::Array::Integer( + array_val, + ) => match format_def.number() { + noodles::vcf::header::Number::Count(0 | 1) => { + unreachable!( + "Field {} declared as single value but found array", + key + ) + } + noodles::vcf::header::Number::Count(_) => { + column.push_veci32(array_val)?; + } + noodles::vcf::header::Number::A => { + column.push_i32(*array_val.get(alt_id).unwrap()); + } + noodles::vcf::header::Number::R => { + //TODO: Use push_fixed_size_i32 + column.push_veci32(vec![ + *array_val.first().unwrap(), + *array_val.get(alt_id + 1).unwrap(), + ])?; + } + noodles::vcf::header::Number::G => { + if array_val.len() + == (allele_count * (allele_count + 1) / 2) + { + column.push_veci32(vec![ + *array_val.first().unwrap(), + *array_val + .get((alt_id * alt_id + 3 * alt_id + 2) / 2) + .unwrap(), + *array_val + .get((alt_id * alt_id + 5 * alt_id + 4) / 2) + .unwrap(), + ])?; + } else if array_val.len() == allele_count { + column.push_veci32(vec![ + *array_val.first().unwrap(), + Some(0), + *array_val.get(alt_id + 1).unwrap(), + ])?; + } else { + column.push_null(); + } + } + noodles::vcf::header::Number::Unknown => { + column.push_veci32(array_val)?; + } + }, + noodles::vcf::record::genotypes::sample::value::Array::Float( + array_val, + ) => match format_def.number() { + noodles::vcf::header::Number::Count(0 | 1) => { + unreachable!( + "Field {} declared as single value but found array", + key + ) + } + noodles::vcf::header::Number::Count(_) => { + column.push_vecf32(array_val)?; + } + noodles::vcf::header::Number::A => { + column.push_f32(*array_val.get(alt_id).unwrap()); + } + noodles::vcf::header::Number::R => { + //TODO: Use push_fixed_size_f32 + column.push_vecf32(vec![ + *array_val.first().unwrap(), + *array_val.get(alt_id + 1).unwrap(), + ])?; + } + noodles::vcf::header::Number::G => { + if array_val.len() + == (allele_count * (allele_count + 1) / 2) + { + column.push_vecf32(vec![ + *array_val.first().unwrap(), + *array_val + .get((alt_id * alt_id + 3 * alt_id + 2) / 2) + .unwrap(), + *array_val + .get((alt_id * alt_id + 5 * alt_id + 4) / 2) + .unwrap(), + ])?; + } else if array_val.len() == allele_count { + column.push_vecf32(vec![ + *array_val.first().unwrap(), + Some(0.), + *array_val.get(alt_id + 1).unwrap(), + ])?; + } else { + column.push_null(); + } + } + noodles::vcf::header::Number::Unknown => { + column.push_vecf32(array_val)?; + } + }, + noodles::vcf::record::genotypes::sample::value::Array::String( + array_val, + ) => match format_def.number() { + noodles::vcf::header::Number::Count(0 | 1) => { + unreachable!( + "Field {} declared as single value but found array", + key_name + ) + }, + noodles::vcf::header::Number::Count(_) => { + column.push_vecstring(array_val)?; + }, + noodles::vcf::header::Number::A => { + column.push_string( + array_val.get(alt_id).unwrap().clone().unwrap(), + ); + }, + noodles::vcf::header::Number::R => { + //TODO: Use push_fixed_size_string + column.push_vecstring(vec![ + Some(array_val.first().unwrap().clone().unwrap()), + Some(array_val.get(alt_id + 1).unwrap().clone().unwrap()), + ])?; + }, + noodles::vcf::header::Number::G => { + if array_val.len() + == (allele_count * (allele_count + 1) / 2) + { + column.push_vecstring(vec![ + array_val.first().unwrap().clone(), + array_val + .get((alt_id * alt_id + 3 * alt_id + 2) / 2) + .unwrap() + .clone(), + array_val + .get((alt_id * alt_id + 5 * alt_id + 4) / 2) + .unwrap() + .clone(), + ])?; + } else if array_val.len() == allele_count { + column.push_vecstring(vec![ + array_val.first().unwrap().clone(), + Some(".".to_string()), + array_val.get(alt_id + 1).unwrap().clone(), + ])?; + } else { + column.push_null(); + } + } + noodles::vcf::header::Number::Unknown => { + column.push_vecstring(array_val)?; + } + }, + noodles::vcf::record::genotypes::sample::value::Array::Character( + array_val, + ) => match format_def.number() { + noodles::vcf::header::Number::Count(0 | 1) => { + unreachable!( + "Field {} declared as single value but found array", + key_name + ) + }, + noodles::vcf::header::Number::Count(_) => { + column.push_vecstring( + array_val + .iter() + .map(|s| s.as_ref().map(|s| s.to_string())) + .collect::>>(), + )?; + }, + noodles::vcf::header::Number::A => { + column.push_string( + (*array_val.get(alt_id).unwrap()).unwrap().to_string(), + ); + }, + noodles::vcf::header::Number::R => { + column.push_vecstring(vec![ + Some(array_val.first().unwrap().unwrap().to_string()), + Some(array_val.get(alt_id + 1).unwrap().unwrap().to_string()), + ])?; + }, + noodles::vcf::header::Number::G => { + if array_val.len() == (allele_count * (allele_count + 1) / 2) { + column.push_vecstring(vec![ + Some(array_val.first().unwrap().unwrap().to_string()), + Some( + array_val + .get((alt_id * alt_id + 3 * alt_id + 2) / 2) + .unwrap() + .unwrap() + .to_string(), + ), + Some( + array_val + .get((alt_id * alt_id + 5 * alt_id + 4) / 2) + .unwrap() + .unwrap() + .to_string(), + ), + ])?; + } else if array_val.len() == allele_count { + column.push_vecstring(vec![ + Some(array_val.first().unwrap().unwrap().to_string()), + Some(".".to_string()), + Some( + array_val.get(alt_id + 1).unwrap().unwrap().to_string(), + ), + ])?; + } else { + column.push_null(); + } + } + , + noodles::vcf::header::Number::Unknown => { + column.push_vecstring( + array_val + .iter() + .map(|s| s.as_ref().map(|s| s.to_string())) + .collect::>>(), + )?; + }, + }, + + }, + }, + None => if schema.get(&key_name).is_some() { + match column { + ColumnData::ListFloat(_) => { + column.push_vecf32(vec![])?; + } + ColumnData::ListInt(_) => { + column.push_veci32(vec![])?; + } + ColumnData::ListString(_) => { + column.push_vecstring(vec![])?; + } + _ if key.to_string() == "GT" => { + column.push_string("./.".to_string()); + } + _ => column.push_null(), + } + } else { + unreachable!("{} should be in schema", key_name); + }, + } + } else { + todo!("Understand how we could get there (the tests never did)"); + } + } + } + } + Ok(()) + } + + ///Convert Name2Data in vector of arrow2 array + pub fn into_arc( + mut self, + schema: &arrow::datatypes::Schema, + ) -> Vec> { + schema + .fields + .iter() + .map(|x| self.0.remove(x.name()).unwrap().into_arc()) + .collect() + } +} + +#[cfg(test)] +mod tests { + use crate::schema; + use arrow::array::ArrayBuilder; + + use super::*; + + static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FILTER= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_0 sample_1 +YAR028W 509242864 . a ATg 6 Filter_0 info_Integer_1=-1867486102;info_Integer_2=1180908493,1041698941;info_Integer_A=-207506013;info_Integer_R=-1221871784,-1356802777;info_Integer_G=-496257853,2127853583,-1498117417,-45419278,1783408501;info_Integer_.=2082620030,-344161839,-1022296779,-1007334133;info_Float_1=68.286865;info_Float_2=-96.154594,-23.433853;info_Float_A=-48.782158;info_Float_R=-46.15216,-92.639305;info_Float_G=-7.5115204,74.78337,1.5983124,-8.867523,77.741455;info_Float_.=26.825455;Flag_0info_Character_1=i;info_Character_2=r,[;info_Character_A=g;info_Character_R=M,D;info_Character_G=h,w,\\,v,o;info_Character_.=G;info_String_1=p]ZoXMTgQo;info_String_2=uVGn`JweVD,DUYytzAny[;info_String_A=_POshsqbSj;info_String_R=AdbZcRFrrQ,_[VS^RtSvz;info_String_G=MeTjonYVIn,jLIi`oWogn,tTH\\QXXOiA,LJLnuPtf`S,r^aaSswsvY;info_String_.=CzkT\\Wk_sG GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1876597949:706761235,-251943823:394859496:-1947058767,424473864:1331697703,-73747609,1645597043,-1553292366,-1685240226:300184417:18.381859:55.763123,-25.909782:-23.853012:-65.84661,-26.444412:12.577988,-87.76228,-3.4822464,-95.66553,55.56636:-35.16729,6.755356:H:Y,N:m:[,Q:B,C,g,L,`:c,x:xXYm`NnOG[:K`QKgogYxZ,uNAMyDqpgZ:liSmUzRvGG:XBgqxa[aBw,_ZxxkAFA[o:`OIdJgjZDS,tKauvtaIhw,mmrIgNXcbh,Rd]QWyFOgu,kSjBlBKigq:znOIm[gGXi,[j\\RlwOmAi 0/1:1178180247:-284426189,-998625419:1871179132:2059063854,-2098693212:1608185708,-1406134851,1030174330,-2031052594,1598302707:-419749875,-1478145995,-1699207585,-1247215944:-58.38821:-62.55126,8.762314:-74.02904:-24.794365,46.083145:13.760803,33.24704,-86.315704,60.576385,-14.547348:82.95245,46.642517,90.124435:a:r,Z:o:y,T:n,Q,F,E,n:m,n:aDWuppugIL:wOFhYRxBZH,MqOWyQIIAH:u\\QQqQyZ`t:TnZk`XSq\\I,_HmAWXBIAy:CL_`ebjENF,E`pNSPd^wz,^tZVmq_oBY,JgQ`oPn^Z\\,`bla^yzIWt:gmoGx]WbcW,VPniuT_IlS,skBLwLHlF_,fwwGspJRS\\ 0/1:246980022:-1016832924,1861844708:-8173468:-1069804542,-70068572:-1451768444,-1682870970,-1829205528,-2068943681,363393119:-288960163,1831626585,1958104113:-80.60921:43.23416,-74.28625:-79.06761:53.03195,8.447456:-14.780685,-46.596863,61.897903,29.243942,-69.91906:64.31647:I:P,N:e:v,h:O,T,N,v,r:w:SEWbLtHSUi:CnsIsSMCBy,^pRIQ\\eLD]:QRzYyzV_sz:wqgYJ`TzLK,hHWZiobiKn:dAPiptpPRU,QyBPeNqLaR,rPFJcjVaEr,HHloMTrcoG,yzgqiA_WIL:`ot_PZwl^\\,Uz^rcVndZg,_IpyMneGSa +23 1165400956 . T t 199 . info_Integer_1=-597222189;info_Integer_2=446843965,1432841503;info_Integer_A=-1756403175;info_Integer_R=-1210584642,1067164582;info_Integer_G=-2026752623,1524204480,2063402043,-1671581234,1992411203;info_Integer_.=387204105,-2048329790;info_Float_1=29.60765;info_Float_2=-70.24462,91.82048;info_Float_A=-57.780792;info_Float_R=-19.511703,87.46164;info_Float_G=17.362617,-10.059616,-89.640594,-70.55726,-48.635937;info_Float_.=82.884,-31.403328,-83.54941,-54.887726;Flag_0info_Character_1=];info_Character_2=Y,R;info_Character_A=p;info_Character_R=A,W;info_Character_G=i,y,I,q,i;info_Character_.=w;info_String_1=]gp_[s]vDh;info_String_2=Y\\SmynkIV^,tOuGkqHsiE;info_String_A=QDdbnppEhM;info_String_R=VgQkWCCgEH,r^aAgT^sOf;info_String_G=z\\_iwMGBRH,EQy^RJwkWd,gu]hpIwaVj,iwKORqBPP[,_ShIZ^Mr\\P;info_String_.=zxs[sGNNuy,cmnjXNUPka,QaFrhEZaIB,_TjXJMdWCM GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-871261733:-1500509753,2025272017:1864754769:-1127684339,-447878996:-1851298122,1367475939,1988967275,-439362500,-447904679:-907504720:-73.56331:-86.60319,-31.910011:6.785797:-95.413086,19.286415:9.942863,-23.623634,31.06224,42.57071,92.734314:-51.402973,-25.126984,73.030045:J:K,Y:K:c,^:y,P,V,b,S:U,w,M,X:IKZ]ZMDszw:apRf\\BVTcU,UOJHFcgkaj:fIjt]RZCsd:TtoRPBHoRS,sDF^wkt`MK:boQ]OQxmec,eJfBqcdaUg,To]BkSYKbI,J\\qQxtjZBq,\\nQWJTeYEf:tHujPdNde\\,EWFfR`mig_ 0/1:205411993:1422316187,136922489:-1998113238:-1581743308,-1016113531:234539080,48396474,1428303612,1012371357,-608258082:807181400:-48.021435:56.736588,-5.4926376:2.772995:6.886032,49.36194:72.34972,19.888977,96.9234,-52.7704,-20.327469:-86.68194,-95.58513,40.37178:r:[,s:r:i,g:p,S,t,_,r:I,N,Z,_:RioGPuuW[_:ZM`VMm[mpf,dTaWj\\c`Pm:bjEzGJcxUg:Iaec]IAHYe,yWTlfKaF]e:JYeMoCeFYn,lwAXvGHCdL,yzGwQfB_YF,lpaF[kfilC,xgYHiD]pz`:W[nVSvJsm_,hxR`xIkWis,dNZNX`_eku 0/1:249006189:-1550023525,1431034968:364065807:630293442,-1899991908:1343119655,1148049825,1254870322,-805282094,-913065710:1033469396,811475314,-784376229,1101867431:-84.87093:84.413025,-28.448128:-35.850975:98.89691,85.76083:-7.8424683,-39.988495,-94.08006,-6.4476013,48.44284:90.68361,-75.537186:P:`,g:j:f,t:M,t,l,u,G:D,^:xYp\\]fYNUh:vzefhZ_x]E,FqtKVH]Xvn:Z\\zIZeOkBf:hYfZzwqLzB,drm^rQSCM\\:wEaPIq`oJt,ZfNBcc[uV_,`pF`wSolGO,BI\\aL`htut,^OEbgPyBTf:fEql_^pktX +ENA|LT795502|LT795502.1 525786811 . A ATgA 226 . info_Integer_1=1004917273;info_Integer_2=-1087925856,-1111801609;info_Integer_A=1142924498;info_Integer_R=397636772,575245484;info_Integer_G=631457844,1508219739,2060178753,1508815851,-1692774727;info_Integer_.=915360277;info_Float_1=81.456085;info_Float_2=-97.36381,99.07503;info_Float_A=-17.968132;info_Float_R=23.030853,17.895386;info_Float_G=36.786133,-36.816742,-79.92742,13.375832,-41.70673;info_Float_.=-56.670807,-88.687706;info_Character_1=x;info_Character_2=f,X;info_Character_A=Q;info_Character_R=M,v;info_Character_G=`,B,I,K,K;info_Character_.=s,S,a,E;info_String_1=RULoNvUdVj;info_String_2=YlKPytYpDY,hwIe\\Lokil;info_String_A=\\VZSHlparH;info_String_R=PVoBxilKPl,_s`t`swTzf;info_String_G=FiINYvUJIO,LtzFxYFFJp,mMeZaQtSZU,rFHUKkG]FO,LxQTXnzEJr;info_String_.=hoXwhfvniY,A\\txGAVbNp,SbcLeVnkYI GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1992801692:305092475,-1179215612:-1739296736:-388696167,-659498070:1503808064,-183677921,-849522112,-1806185994,-1784485194:-498574773:14.566895:10.018013,-16.879608:99.71886:0.29120636,-9.131622:80.328705,-26.403976,-37.213943,57.87976,61.69644:58.0925,-70.47134,-59.34813:X:N,q:S:a,_:c,W,U,b,Y:m:^QkYoWratI:ryX^JlAyCx,WGZntpNsOo:FRryqZFoMj:gCO^BOI[ml,VJqiy[VWym:nbtqw^\\zmA,ZiBBJm[Vbv,aNMll`xnfr,nIJf`wjzny,i[qz[mHs_N:rHzB`UssLW,apsPd_lrip,Uih`ROsUql,tnBQQdhtwm 0/1:-24901120:240741600,-335142169:1743578406:-898920674,473452936:266099587,818869222,-1461529615,1643094296,-2054606423:641472069,-1850726656,-386681464,266081312:97.276596:48.907135,-99.147675:-17.34481:36.995285,-15.711685:-74.65256,-87.14964,0.5836029,36.94156,-83.004906:90.52608,5.7993164:b:`,L:w:l,M:G,m,s,V,n:i,p:ZMBpgnUaaU:[fe_qYtEAS,HcYIbeHcgz:kanq^dgO[M:`v^vmOCeeJ,XRDAZIQsWC:czkmbtrEXT,JHm^^jZjSZ,MrQ[yeKALl,hoEeih^Nvf,MYQGSi^Zux:XI_a[M^G[_,czNvmEmcXT,zwNxaRelTy,\\UHxM[KJGU 0/1:60157897:319490021,-742515096:-1289964762:1628004982,235029603:2020442014,308460461,1558271982,1627368865,894042318:1994633465,-1579924515:-10.664841:-62.51185,53.523605:94.14816:2.9510727,44.83983:-5.713913,-74.449394,-56.378246,-46.97802,13.483833:5.34816,65.6853,-95.33172,-58.07209:U:F,W:\\:A,E:t,j,c,F,J:H,R,S,^:`ZVfoyxdDK:gntf_rQo]a,mHMNJLO[`K:]PjxDCRfYV:MMGtGvm[wr,eQeumfsRZL:EqInOsdeDW,xOQBKswphI,nU_PiY]xef,cAHdxvRbFC,kvJ[v^kdcb:fyKXA\\hjfv +ENA|LT795502|LT795502.1 1506498921 . A gT 99 Filter_0 info_Integer_1=1074860489;info_Integer_2=-6784655,1952022752;info_Integer_A=-1765522773;info_Integer_R=1316333577,-554518728;info_Integer_G=-440746192,417172829,1208578807,-1256320970,168283749;info_Integer_.=-67150747,-701563860,1708267257;info_Float_1=-85.47166;info_Float_2=33.09308,37.761444;info_Float_A=99.544266;info_Float_R=-4.276779,27.070168;info_Float_G=-93.02027,-68.755196,-18.597626,-82.3945,94.890884;info_Float_.=45.63858;info_Character_1=R;info_Character_2=B,W;info_Character_A=l;info_Character_R=z,E;info_Character_G=Z,h,F,D,D;info_Character_.=n,G,v;info_String_1=ojZkSfujYX;info_String_2=`ZrZJtq_hx,StcGnLjWNS;info_String_A=k[feQ[mqyE;info_String_R=Grr[rGo^md,GkiXanc\\K\\;info_String_G=Yhij\\pOPji,yYlsCnJSCY,VggsEuC]ad,G^jiYbvsbn,IJmJvG`jzs;info_String_.=CKtxVQFr]_,G^jAnQaGyI,yvzleXG`vO GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-2039921838:1260782784,-2144365597:1110295788:-158846729,1495837063:390766793,1114219927,-790406568,1652554877,-1144133980:-610126444,1736640977:0.7073364:-56.6103,13.725807:-8.711052:-35.14733,55.534668:-41.20214,-69.47873,15.234543,1.8139114,-81.88782:-86.727234:]:f,Q:I:u,_:b,k,C,[,T:b,S,O:hGS[GPZUZu:uQuJZtwYq_,SyDvF`v_[[:ol]TtBXxVP:mFMNUVM`Ir,XbBPeoBkYj:`xGS_`zgey,v\\]bxaFPdJ,lGqnBWyHQI,ynpDFGuSsm,^bGxsDdgIl:q`HG\\bwqSl,GrnuUSzgVy,cRSPjljk_Q,TWbW]MISyd 0/1:925068425:1074963297,820496013:2032912248:-456701844,1354651711:-1215180367,1123368027,-680845673,-332079579,604760814:476241956:-25.982353:-75.11304,97.80142:62.201385:39.84816,-3.5477142:-50.861835,-43.965935,-45.22519,44.636627,64.44443:-27.258255,-56.71892,81.974884:D:g,j:o:C,D:T,v,A,[,r:c,L,I,I:OJBBSJbN`T:DdPsffHJuV,AJInhMhoiR:__FbESuepO:SDF_mGG^JG,YFfWtuVWFc:ObAtvWdiHC,nRn[JyLGrn,AvnzUN`iJP,BD\\thZTCSk,SuJCDkzPGU:U]SgYvoNeJ 0/1:180052409:3916924,-608184065:406358148:1618596409,-143985416:781007994,-768878726,1593943437,-803117731,-914254344:446901758,117973804,20424962:-9.037872:-71.62204,-38.234306:-38.99839:-78.47328,-74.93701:57.11064,53.769928,22.832726,-1.6777267,96.80972:77.72031,85.200424:J:Z,Y:S:V,u:O,],w,n,Q:Z:Hesgu[drFG:pWRnQtXSiX,D`cgIgjITG:D[lL^EIPfl:AOXfKDsetT,GbyjVXojJF:AcGyoIohdU,zIRGXEkwRv,JeuIkD^`cs,MIpZxRusP_,DvXBProTn_:X[sDTJasXv,[nehoK`q^y +chrMT 900574305 . c a 208 Filter_0 info_Integer_1=-523627641;info_Integer_2=828853617,538841733;info_Integer_A=-1070289656;info_Integer_R=1177092376,1248528320;info_Integer_G=1338006213,-939491184,2031520519,1625981257,-1542813010;info_Integer_.=1020802949,-325766450,-1975174725;info_Float_1=86.21178;info_Float_2=55.274292,57.992126;info_Float_A=98.6465;info_Float_R=-56.676746,-78.452255;info_Float_G=-15.1058655,-32.05681,-23.85817,25.53675,-44.79227;info_Float_.=-8.955811,49.62912,72.4005;Flag_0info_Character_1=x;info_Character_2=B,r;info_Character_A=i;info_Character_R=\\,D;info_Character_G=g,v,l,W,T;info_Character_.=g;info_String_1=j`zU\\K`PLc;info_String_2=l_rhk[Kr]b,rFB_aSUBR`;info_String_A=ZQYIsGAof_;info_String_R=HwrUBliWel,scCvWxgE[r;info_String_G=kY`HMgmH_p,IUvLZ]sdba,nVQJ_Fh`ET,QnYUFz`ShS,KXwJfcYmsw;info_String_.=Q]wexXkHyr,fL\\NGDMlkW,jVpWnME[tp,Zz\\hrFyx`] GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1648034443:845207498,1438334805:-821666363:-532302872,-784878946:-1660896800,2008926111,1279825538,-233248668,-1578146061:-1728381686,-1354962949,-2095339305:4.250908:1.8742523,-78.206894:-22.634148:-61.518906,13.15873:-64.15179,17.086502,-25.609276,57.059555,-92.1911:26.885536,-27.22528,-51.00875:F:l,e:M:y,e:^,X,g,x,h:K:LXIC\\KLZsD:Nd[vIMiHJA,^ConDldYtT:CBqgBJnzRq:nOgdeNbqKd,rTYmYwJcQu:WMlQs]gO[a,Kk[sJ[UoxT,HT]XWH^ZTF,IIJXSmrLHg,qo`OJ_hgav:tSEKSQWXRL 0/1:-1351553081:219969512,1955232553:917524488:1134530757,-362836542:-2127154133,-1470782646,-1443121280,488596267,-1560382672:-1061588830,843145750,978368685:67.93184:-0.71754456,86.030624:-60.117985:36.45627,-81.18977:53.986176,92.98938,-65.36742,7.6014786,46.42549:27.035995,-57.711315,99.20256:M:E,b:k:v,o:\\,y,j,N,J:n,J,_:EvnT^AAXoY:cdVJ^MhXbh,y[[LcBZavI:f^EAgXrRhj:A`FeAFUqKi,fsOnJ\\kmaj:JyK`vg^yGK,G`eXThODuq,gRMJ`naYA^,WkywfqqL`h,ItKK^GJvpU:brd_fl`zxc 0/1:-334893903:1812791387,-1702573904:91913024:-1715303171,1720214253:-1065363642,-1781482473,1593677428,-1611378854,-1463000308:-918589861,802242226,-512257664:29.155685:36.117004,53.34468:-73.653984:-47.224236,80.070786:26.598068,-14.040497,75.17987,-24.15435,-7.163788:-31.709908,-72.07062,53.230316,-11.984253:f:c,r:F:Y,B:m,Q,O,J,S:e,x,V:wsZzDGtnRs:rr^refYy\\D,xWTSt]bRdz:pBWEGavAaK:ROjxoClYNb,ppopUzLGgP:LCytHxxfrF,dGG^fvX^iR,uqkLhoqbuy,yFAbgESl[^,SlKPAuDoaN:hbu_MpWRtk,kJqJRFtD`K +X 508903144 . T taAG 107 Filter_1 info_Integer_1=-442012684;info_Integer_2=1242798393,-893635990;info_Integer_A=-1049853993;info_Integer_R=242988245,-245551581;info_Integer_G=992362638,-556141956,-1436766801,1237135939,-1164555077;info_Integer_.=-1890267838;info_Float_1=50.181763;info_Float_2=5.1533203,32.221054;info_Float_A=31.930801;info_Float_R=18.487122,-4.3887863;info_Float_G=-98.10066,-69.57614,91.27092,60.39116,13.878372;info_Float_.=-46.815468,-65.40532;info_Character_1=_;info_Character_2=C,c;info_Character_A=w;info_Character_R=v,g;info_Character_G=v,Q,r,`,W;info_Character_.=\\,j,W;info_String_1=Ol_Gd[f\\tt;info_String_2=DX_Nqhsbft,ZvYZmhftHw;info_String_A=bXt^\\wzwfQ;info_String_R=ANOP[Zjcef,[mLDzYe^Xa;info_String_G=i_XoxRH]Up,N\\qKskBEfm,vceQjVrtTu,_LnQ_[ngn],yd[ZFNmECq;info_String_.=FPpMF]TI[C,skPWfxtNBS,uEV]cCazM[ GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1784107859:-939663,-275551415:1120891123:-769476243,636006815:-1200543946,215752478,-1484326861,-561668774,-176043456:1567908440,-504633662,-461948338:41.60312:89.26175,-28.585121:-25.16079:-93.47296,-97.84482:-64.203835,62.569122,-16.536377,24.606033,-83.769844:-0.48690033,50.683716,-61.86049:u:p,x:G:O,q:J,g,s,K,g:c,l,a,w:mgLADffOAW:OFYzsNT]DN,knpK_\\ZlLw:NIGi]zaPz\\:vcAptCt\\VT,z[VAZFjS[p:tcGuXLiEpv,fCLE[^Bzbh,rkGwaRnhxi,fTMvHtFRwN,RPYaXfd^BF:DOnEKUN[]V,ppBNNWrhhK 0/1:1910075420:-934542610,-878954512:1171320700:1208017031,-340691680:712157209,1065873060,-1366658844,942622778,-783364205:1637019880,-955981111,1346196180,1597225767:-26.175598:93.89627,-80.47638:-29.046753:-60.67419,85.3492:-41.516758,-69.87106,22.452782,21.140648,61.20903:-78.91676,26.818916,-59.74772,-27.64888:k:n,]:U:T,X:B,r,c,R,G:Z:yGqzsPBWTl:AyQbJXbTNE,wQIyxKzeXn:F\\_wSzKzz_:GPJEnUVNAi,OGt^nVFblY:Npi`aeQSMU,_lhlMXNtTB,[QXjBWuj]d,rKRDMyolVK,QciXXonSdu:ycSYShz\\Gk,\\hYbIs`WDD,CfueEstMtZ 0/1:323372379:1922884232,266253196:-2038955647:-206847340,703053779:740621164,1717070470,2056797316,-1709077983,174222777:-229529326,1150748796:12.667084:-49.801563,43.236115:-82.59476:73.92726,70.36862:16.87722,-79.93007,70.9064,95.77089,-38.30352:86.8264:O:D,k:V:j,u:i,I,z,_,J:u:HEuDirTQc]:[`cTQh_wUo,gJqeDnKyu`:TexBlzobtU:YTlxHhXaRn,DAIwrYQqxu:LK`xEnzWq[,zZmLwxn\\m`,rujYx[cES^,ELVaIdJFDY,Z`Fuq`aaii:LTi`XfaE^C,jgR\\cCazHR,v_dFUwq^p^,wrsXkkAV\\I +23 2057099842 . g a 105 . info_Integer_1=-1463169227;info_Integer_2=1864557327,-1832965500;info_Integer_A=-1397690738;info_Integer_R=230018794,906575350;info_Integer_G=1606595129,-440932389,310954072,-1735028992,-71170678;info_Integer_.=-161655144,935253047,9786785;info_Float_1=-99.35045;info_Float_2=-42.93885,-82.69522;info_Float_A=83.79291;info_Float_R=-85.69043,-11.00209;info_Float_G=-60.064484,-27.798103,94.52054,-38.2653,3.591034;info_Float_.=70.61771,-63.402893,30.473663;Flag_0info_Character_1=g;info_Character_2=V,g;info_Character_A=V;info_Character_R=h,j;info_Character_G=G,R,J,\\,F;info_Character_.=`;info_String_1=ukwTtXgA^`;info_String_2=o_mXjM^xZX,WgiO^rhwGP;info_String_A=QGOHr_eINM;info_String_R=JvS[necmAA,BnmyIEEpWY;info_String_G=EPGPQVsCly,YXuaHkTy\\r,l\\kp_Z_cmw,yk`oYPWJTC,jPmpS\\c\\[i;info_String_.=pqZPLvIWXJ,u_QaCDUzqA,dNRpmBctzH GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:883085881:-1163194169,396734570:-864830302:-1015935718,416512239:1055659899,378650980,1719415308,-591094434,-602013467:1562372336,38255854:-96.96286:-22.160027,2.9108505:-76.4101:-31.626724,74.70122:-10.465622,92.86656,66.5076,-47.610092,73.46921:-35.273743,71.3008,-21.861176:N:Z,x:w:D,Q:g,[,i,i,f:b,u,q:rOrdrrnAAf:fkbpBSUynu,ks[PFA_asT:JK\\[phzgTx:sIMyglMmy[,zdTHmmDXr`:mJZqxRqayy,nzIi[BVIeR,TJX_tcRRuR,nwx\\LjKCPX,EMgDVxWAhQ:urxfXYFYWh,yxg_KbMfiV,O^NfHdYiTT,RQLQTrA`[D 0/1:712995789:-341947022,602595428:496080894:-317586736,-1599924675:1901874239,-1170280909,-1426570445,1489049109,-2000710204:448535337,-1814547565:99.9409:73.24715,-61.679005:42.36447:-67.20648,-37.822533:-44.426228,0.8399048,61.94937,67.76169,-47.838783:96.54474:I:l,g:W:f,T:C,j,],k,a:y,G,Q:nuKQNGtG`H:rk_WztmXlA,rCVaSXG`vY:XNJj\\lgAgb:qKSozwOYIa,K\\rGODc_`q:i_rCsVxkls,YErisA[XyI,nDyzaAV^te,NlxAGcHuIw,CYzN[Gykcx:bDyCEM_Ntk,hVlhY\\KH\\V,TyRCuSB`sn,XEuXmmuCrD 0/1:-1170229269:-1356256190,225279207:869161357:771625610,-569908878:293513696,-619213311,-1755999259,1604615807,1087899712:-745470529,2144376132,-1224677810:-28.347206:-60.524513,-69.2384:-86.569954:-58.52585,-74.32852:-59.802223,77.79262,69.73915,72.24533,-7.9125137:34.921783:N:h,i:u:i,l:L,g,L,B,g:S,c:^y[zODvjKw:NpMKSd_yN],vkeZUtFWRR:oqiWbtGZPN:ItYyF]PgmM,e\\qT_MdBPH:p[n_BBNiqU,my^WxdBmGo,RB__ZCtWd\\,bx_t`szKbQ,qQuZ`\\jq[C:LzEcfVrmKJ,\\QgmcLzgxq +NC_000015.10 278483743 . c Gc 68 Filter_0 info_Integer_1=-1771300013;info_Integer_2=611485162,1796725452;info_Integer_A=971438374;info_Integer_R=698255143,905472298;info_Integer_G=-200904731,1733482657,-1601571925,-95180709,852757134;info_Integer_.=1226331110,1800309665;info_Float_1=-8.41539;info_Float_2=-91.844246,33.56476;info_Float_A=-70.37154;info_Float_R=-88.31048,49.067856;info_Float_G=-56.826736,-72.017075,58.757156,-87.95636,27.40886;info_Float_.=-93.33689,-11.933228;info_Character_1=V;info_Character_2=F,H;info_Character_A=];info_Character_R=],k;info_Character_G=b,j,c,v,k;info_Character_.=W,J,`;info_String_1=[sg[NfQUjS;info_String_2=tuIDx]qY`n,sYihzjCcDX;info_String_A=rWeIJoLqif;info_String_R=Lc\\nOEn`SI,ohc]_`UFau;info_String_G=ts\\z[cGhVY,FWhsZospCl,EibJWC`AtQ,mNRPCdUKvw,lVzUvcofUf;info_String_.=ivsywIpK\\E,dw[sIibpcF,ngkSonUgLJ,OXEnvoSKPb GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:1053018922:-184938975,-1760026633:-857964358:-2102716665,1815665606:-1175872690,-347280558,1231790968,-790356303,-222139945:-1345714777,160079922:-82.493515:64.684265,-99.60785:-19.693016:-3.5498886,-99.20609:-5.4513702,-81.74608,-6.946541,-87.49165,-31.873795:-0.010063171:x:n,^:R:b,h:h,c,],\\,X:I:LWSccGJLM_:BFr_AeHgBF,HkaUOy`jqz:FGMQKwoeFA:^WCLcfxttz,YAipFMM\\Sa:WfUeg^ehSy,Rn^uqvYSmS,sEKSkLrC\\p,QDmf[JGzbG,QiZIx`^pZN:Kmi[\\ChDrN,OJdBvVq[Af 0/1:1914159851:1114283704,-2014859484:1560783535:1865312897,256404857:863585279,-267380020,-134015412,856986131,1668297008:-1579607045,1788206035:91.887955:-27.877113,-57.696247:-67.211655:-94.020226,87.17949:96.09091,-27.167084,99.71431,-84.021736,20.75412:50.480682:r:e,q:w:t,E:R,V,i,I,k:o:jO[Cax\\_zE:beNr\\DPEAj,[rxvLKUNpg:Em^Y^JfmRt:I_pzdiZKvX,KNhZiCPeWc:^rxclcxiEF,bUblZrRMet,KXXitlqZ^r,tDrkWBjrjr,IybJjtMXvv:Qavvd`qH]Z,lXvCHRinb`,EYSpZQdvya 0/1:2112335424:1117304177,1186842567:1518339830:1538701932,1525880826:1156301305,293295010,829335070,-1308097481,567438411:-876069694:-71.12594:80.75633,-92.32674:-51.611877:-83.43153,2.3728333:-16.384102,18.902657,34.962082,26.661896,29.297455:6.421852,-9.252121:u:R,F:\\:],v:A,B,y,p,c:v:miPNAjQGGF:`cKYG[lHBt,wbp^hin^Rk:^kE[kkegww:\\gzDFBIToj,SR]K\\pHId`:KuB]TPhgby,XbLVwenUxV,qxSSu^ko^v,TcVjGbYkIp,_Y`cUrzUBV:dc[XIJDWB_,R_YE]uzONR +NC_016845.1 1273217582 . A cA 185 Filter_1 info_Integer_1=1702504238;info_Integer_2=-1300020074,-1771363986;info_Integer_A=-666582393;info_Integer_R=-1483769984,-1241578554;info_Integer_G=1976807172,-1260807615,-108510257,1277543943,1016305186;info_Integer_.=1829528682,-928482172,-429726805,-2007283327;info_Float_1=70.20416;info_Float_2=94.22778,49.014664;info_Float_A=77.67261;info_Float_R=69.01376,-85.50122;info_Float_G=22.049858,-31.612656,67.47859,42.012314,-0.50154114;info_Float_.=-79.06835,36.144714,-11.66687,-33.392593;Flag_0info_Character_1=Y;info_Character_2=s,w;info_Character_A=Y;info_Character_R=E,S;info_Character_G=J,C,C,^,N;info_Character_.=s;info_String_1=RnqfrhRxGK;info_String_2=QCczZsqSMX,UadliszTvD;info_String_A=UeAmTxgIJs;info_String_R=RpC[yfli[m,UMXScERIAT;info_String_G=Piyg[YSyn],wrVNLOsrsd,CuaYlZzSG`,hHfxMnZBYb,fELmQbwhQV;info_String_.=MHfN_MSgEe,_REWJxavTD,SkkKdTmDLG,R[oCTWMP\\K GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1879771452:-2024174827,895533931:25454566:-1413372025,-1730678484:1651476894,522323445,-801323168,-692607812,1081910993:-1359195475:64.78084:98.23514,-95.097374:97.43535:-65.953636,7.431885:74.098724,-84.2887,-56.68762,-86.44216,96.5443:32.076004:K:G,\\:G:W,z:h,o,j,X,Y:e,G,x:nbjlHj^`q]:aa^uI^^SoQ,JyJ\\ARpaJg:MniIYiZryL:qDr]fJV]eR,iuDxSPv[oY:qLjrYY]bPA,rejiDo^By[,`mxXgnjkPa,_grwZxX`kA,\\OLO_zFEeT:xJR]YarNNn 0/1:380883717:-62474379,-1769613882:-829056637:57433667,1227514553:68322866,1729576571,953880816,-1186526990,-1862887320:-1537612724:-98.82541:24.12381,-75.1378:-59.668877:57.233078,80.90968:96.60881,-88.86273,-30.641846,21.3125,-24.467636:-39.953922,-20.188927,99.38843,-5.183769:u:l,b:v:H,D:v,V,q,f,y:L,[:I]cUryB`WS:hRJHmG[JFU,[kaZP^eEbq:X]cDdXDjYF:tDkOUi`GjU,dwC`DsgfEO:lQmOhgcIB],ZNRGbnuLAI,j]CjtQNOW\\,cyS`hBmOSp,[E[YiTVZbZ:aGTIEgKEx] 0/1:-279336022:-1084797147,1134420570:-567169692:-1283635727,-362348689:1152882561,1764606448,29946627,-1092833737,-1928697170:-1487395248,-1139333107,431473979,158203585:92.27652:-53.75874,35.170288:-12.784073:-46.073364,63.364838:-34.01854,-96.420715,-46.4582,-62.54904,-3.9217987:75.17345,-85.619095,-76.64981,66.046524:Q:Q,j:w:h,s:q,R,x,w,e:B,G,V,b:swXpP]Tmxr:S^AnbSXQXf,LQJBTUgAex:LciDfNSEuH:Ls]QTpWGBO,X[Jns_eDFr:QKqHwAUcRa,bzPOGoHhNR,Ea^NYFQRqd,ClLx\\^fCXL,\\ROGUiBQUj:styjseYDT_,aEHA[zxJlW,eLIiDr_aGn,RTsLkpJjgO +ENA|LT795502|LT795502.1 566884162 . t c 22 . info_Integer_1=-63306296;info_Integer_2=1391506844,-1503768112;info_Integer_A=340548256;info_Integer_R=-1286314818,288781403;info_Integer_G=-800469678,-1311787939,-793948174,1533475939,755254594;info_Integer_.=-1341990003;info_Float_1=-76.227356;info_Float_2=-54.977512,-39.39898;info_Float_A=-35.61332;info_Float_R=-70.32056,-42.79394;info_Float_G=67.78093,43.006317,92.26671,-48.16651,4.3726654;info_Float_.=-60.336803,-45.87288,92.96947,-43.244385;info_Character_1=[;info_Character_2=c,J;info_Character_A=R;info_Character_R=o,d;info_Character_G=h,`,F,\\,q;info_Character_.=T;info_String_1=q^HZe_mW_C;info_String_2=FPSDvSVXAd,YbrjDSdRXm;info_String_A=IxDTHZYoq[;info_String_R=OsOWlbXzO\\,hAhG_b\\Ifw;info_String_G=jb^GYiHZRT,_[`_aqmUIf,PtWWNPUINQ,WkqQaaxSee,jRMUC_IYwu;info_String_.=ZVqn\\yRJEI,`vlpPiWkLZ,aVHocDfVJv GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:389250658:-1173892904,-995837010:380428736:-350796083,-1946061625:-1985077526,-956832721,-2103216081,1213731248,-1361646347:212134446:-93.3871:73.895645,-82.49681:-59.703255:-53.21877,-11.0794525:98.62854,-40.406464,36.850067,-61.214233,28.269058:-80.0885,25.734207,92.746826,24.650955:Q:l,w:j:t,Z:K,X,a,E,S:i:VAXYF^LWPG:SudBRfeYRI,axYzALsh[m:gWvHMgghOt:cIIIEUOOnN,Q`yNRLvwIx:HeiQgtTGFY,A[RlKUJYGM,EDyo[bNg]Z,[DQbbRhs\\H,DNoj_HFJZ]:u[WuJ]OfAC,ToajkjZMqO 0/1:1247618239:1495558316,1270330192:-1812953658:2099386438,-1719636933:-1719318579,-2036965806,-1361738579,-438246128,154780382:-2087289599:99.94664:-24.0057,11.140228:-54.74951:41.22667,-17.469597:62.76808,-47.069477,-82.23286,97.09668,34.973145:-90.37955,-0.9262085,87.107376,69.280334:y:M,T:Z:Z,`:K,y,f,N,c:s:DmKvSoUTTo:zjlBpBcCYU,wEhOIx\\sXm:YYmcVdAtGt:UlifazYxMd,snMxUXwcD^:AHixNQliMD,JdpvgRsGQe,fPSRoIIRVL,AHM[ETIVla,GXPIPLRtqa:pvvKB`NrwP,lTl[KlJinZ 0/1:-1942376961:1024730712,39811746:1702586628:1394978346,832590142:1113681009,1611955235,-169392027,706295232,-1382855589:1043164434,456932470,-1198813064:74.17302:79.9146,-59.669567:-14.853073:-39.896774,-26.528046:71.47615,-73.95854,14.603233,-59.66177,-35.773087:8.042595,-81.39966:i:\\,`:s:Y,S:f,y,L,s,e:W,a,s:vLy\\C]]Bkb:cU[]e_icry,peVDqFyCOL:pmMspaUUXk:uIhBoRPWTP,^jGL[\\`Ei]:CFKAibZSAV,]jyZA_dhSu,UVV]AtIjJu,Than`WdhfE,GzqGzeGtmq:vj\\WHa^Crz,lzLRWOruNj,[V\\LWB]XnM +"; + + #[test] + fn init() { + let mut reader = noodles::vcf::Reader::new(VCF_FILE); + + let header: noodles::vcf::Header = reader.read_header().unwrap(); + let schema = schema::from_header(&header, false).unwrap(); + + let mut data = Name2Data::new(10, &schema); + let mut col_names = data.0.keys().cloned().collect::>(); + col_names.sort(); + + assert_eq!( + col_names, + vec![ + "alternate", + "chromosome", + "filter", + "format_sample_0_GT", + "format_sample_0_format_Character_.", + "format_sample_0_format_Character_1", + "format_sample_0_format_Character_2", + "format_sample_0_format_Character_A", + "format_sample_0_format_Character_G", + "format_sample_0_format_Character_R", + "format_sample_0_format_Float_.", + "format_sample_0_format_Float_1", + "format_sample_0_format_Float_2", + "format_sample_0_format_Float_A", + "format_sample_0_format_Float_G", + "format_sample_0_format_Float_R", + "format_sample_0_format_Integer_.", + "format_sample_0_format_Integer_1", + "format_sample_0_format_Integer_2", + "format_sample_0_format_Integer_A", + "format_sample_0_format_Integer_G", + "format_sample_0_format_Integer_R", + "format_sample_0_format_String_.", + "format_sample_0_format_String_1", + "format_sample_0_format_String_2", + "format_sample_0_format_String_A", + "format_sample_0_format_String_G", + "format_sample_0_format_String_R", + "format_sample_1_GT", + "format_sample_1_format_Character_.", + "format_sample_1_format_Character_1", + "format_sample_1_format_Character_2", + "format_sample_1_format_Character_A", + "format_sample_1_format_Character_G", + "format_sample_1_format_Character_R", + "format_sample_1_format_Float_.", + "format_sample_1_format_Float_1", + "format_sample_1_format_Float_2", + "format_sample_1_format_Float_A", + "format_sample_1_format_Float_G", + "format_sample_1_format_Float_R", + "format_sample_1_format_Integer_.", + "format_sample_1_format_Integer_1", + "format_sample_1_format_Integer_2", + "format_sample_1_format_Integer_A", + "format_sample_1_format_Integer_G", + "format_sample_1_format_Integer_R", + "format_sample_1_format_String_.", + "format_sample_1_format_String_1", + "format_sample_1_format_String_2", + "format_sample_1_format_String_A", + "format_sample_1_format_String_G", + "format_sample_1_format_String_R", + "identifier", + "info_info_Character_.", + "info_info_Character_1", + "info_info_Character_2", + "info_info_Character_A", + "info_info_Character_G", + "info_info_Character_R", + "info_info_Flag_0", + "info_info_Float_.", + "info_info_Float_1", + "info_info_Float_2", + "info_info_Float_A", + "info_info_Float_G", + "info_info_Float_R", + "info_info_Integer_.", + "info_info_Integer_1", + "info_info_Integer_2", + "info_info_Integer_A", + "info_info_Integer_G", + "info_info_Integer_R", + "info_info_String_.", + "info_info_String_1", + "info_info_String_2", + "info_info_String_A", + "info_info_String_G", + "info_info_String_R", + "position", + "quality", + "reference" + ] + ); + + match data.get_mut("chromosome") { + Some(ColumnData::String(a)) => assert_eq!( + a.finish(), + arrow::array::StringBuilder::with_capacity(10, 10 * 10).finish() + ), + _ => panic!("Column chromosome not match type"), + } + } + + #[test] + fn add_record() { + let mut reader = noodles::vcf::Reader::new(VCF_FILE); + + let header: noodles::vcf::Header = reader.read_header().unwrap(); + + let schema = schema::from_header(&header, false).unwrap(); + let schema_map: rustc_hash::FxHashMap = schema + .all_fields() + .into_iter() + .map(|f| (f.name().to_string(), f.clone())) + .collect::>(); + + let mut data = Name2Data::new(10, &schema); + + let mut iterator = reader.records(&header); + let record = iterator.next().unwrap().unwrap(); + + data.add_record(record, &header, &schema_map).unwrap(); + match data.get("chromosome") { + Some(ColumnData::String(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), b"YAR028W"); + assert_eq!(a.offsets_slice(), &[0, 7]); + } + _ => panic!("Column chromosome does not match type"), + } + match data.get("position") { + Some(ColumnData::Int(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), &[509242864]); + } + _ => panic!("Column position does not match type"), + } + match data.get("identifier") { + Some(ColumnData::ListString(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_ref().values_slice(), b""); + assert_eq!(a.offsets_slice(), &[0, 0]); + } + _ => panic!("Column identifier does not match type"), + } + match data.get("reference") { + Some(ColumnData::String(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), b"A"); + assert_eq!(a.offsets_slice(), &[0, 1]); + } + _ => panic!("Column reference does not match type"), + } + match data.get("alternate") { + Some(ColumnData::String(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), b"ATG"); + assert_eq!(a.offsets_slice(), &[0, 3]); + } + _ => panic!("Column alternate does not match type"), + } + match data.get("quality") { + Some(ColumnData::Float(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), &[6.]); + } + _ => panic!("Column quality does not match type"), + } + match data.get("filter") { + Some(ColumnData::ListString(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_ref().len(), 1); + assert_eq!(a.values_ref().values_slice(), b"Filter_0"); + assert_eq!(a.values_ref().offsets_slice(), &[0, 8]); + assert_eq!(a.offsets_slice(), &[0, 1]); + } + _ => panic!("Column filter does not match type"), + } + match data.get("info_info_Integer_1") { + Some(ColumnData::Int(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), &[-1867486102]); + } + _ => panic!("Column info_info_Integer_1 does not match type"), + } + match data.get("info_info_Integer_2") { + Some(ColumnData::ListInt(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_ref().len(), 2); + assert_eq!(a.values_ref().values_slice(), &[1180908493, 1041698941]); + assert_eq!(a.offsets_slice(), &[0, 2]); + assert_eq!(a.offsets_slice(), &[0, 2]); + } + _ => panic!("Column info_info_Integer_2 does not match type"), + } + match data.get("info_info_Integer_A") { + Some(ColumnData::Int(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_slice(), &[-207506013]); + } + _ => panic!("Column info_info_Integer_A does not match type"), + } + match data.get("info_info_Integer_R") { + Some(ColumnData::ListInt(a)) => { + assert_eq!(a.len(), 1); + assert_eq!(a.values_ref().len(), 2); + assert_eq!(a.values_ref().values_slice(), &[-1221871784, -1356802777]); + assert_eq!(a.offsets_slice(), &[0, 2]); + assert_eq!(a.offsets_slice(), &[0, 2]); + } + _ => panic!("Column info_info_Integer_R does not match type"), + } + } +} diff --git a/vcf2parquet-lib/src/record2chunk.rs b/src/record2chunk.rs similarity index 52% rename from vcf2parquet-lib/src/record2chunk.rs rename to src/record2chunk.rs index 133e7fb..abaac95 100644 --- a/vcf2parquet-lib/src/record2chunk.rs +++ b/src/record2chunk.rs @@ -4,14 +4,17 @@ /* crate use */ +use arrow::datatypes::Field; + /* project use */ use crate::name2data::*; +/// Convert vcf record iterator into Parquet chunk pub struct Record2Chunk { inner: T, length: usize, header: noodles::vcf::Header, - schema: arrow2::datatypes::Schema, + schema: std::sync::Arc, end: bool, } @@ -19,11 +22,12 @@ impl Record2Chunk where T: Iterator>, { + /// Create a new Record2Chunk pub fn new( inner: T, length: usize, header: noodles::vcf::Header, - schema: arrow2::datatypes::Schema, + schema: std::sync::Arc, ) -> Self { Self { inner, @@ -33,28 +37,13 @@ where end: false, } } - - pub fn encodings(&self) -> Vec> { - self.schema - .fields - .iter() - .map(|f| { - arrow2::io::parquet::write::transverse(&f.data_type, |_| { - arrow2::io::parquet::write::Encoding::Plain - }) - }) - .collect() - } } impl Iterator for Record2Chunk where T: Iterator>, { - type Item = Result< - arrow2::chunk::Chunk>, - arrow2::error::Error, - >; + type Item = Result; fn next(&mut self) -> Option { if self.end { @@ -66,23 +55,36 @@ where for _ in 0..self.length { match self.inner.next() { Some(Ok(record)) => { - if let Err(e) = name2data.add_record(record, &self.header, &self.schema) { + if let Err(e) = name2data.add_record( + record, + &self.header, + &self + .schema + .all_fields() + .into_iter() + .map(|f| (f.name().to_string(), f.clone())) + .collect::>(), + ) { return Some(Err(e)); } } - Some(Err(e)) => return Some(Err(arrow2::error::Error::Io(e))), + Some(Err(e)) => { + return Some(Err(arrow::error::ArrowError::IoError("".to_string(), e))) + } None => { self.end = true; - return Some(Ok(arrow2::chunk::Chunk::new( + return Some(arrow::record_batch::RecordBatch::try_new( + self.schema.clone(), name2data.into_arc(&self.schema), - ))); + )); } } } - Some(Ok(arrow2::chunk::Chunk::new( + Some(arrow::record_batch::RecordBatch::try_new( + self.schema.clone(), name2data.into_arc(&self.schema), - ))) + )) } } diff --git a/src/schema.rs b/src/schema.rs new file mode 100644 index 0000000..63bb31a --- /dev/null +++ b/src/schema.rs @@ -0,0 +1,424 @@ +//! Construct parquet schema corresponding to vcf + +/* std use */ +use std::sync::Arc; + +/* crate use */ + +/* project use */ +use crate::*; + +/// Generate a parquet schema corresponding to vcf header +pub fn from_header( + header: &noodles::vcf::Header, + info_optional: bool, +) -> error::Result { + let mut columns = Vec::new(); + + // required column + columns.extend(required_column()); + + // info field + columns.extend(info(header, info_optional)); + + // genotype field + columns.extend(genotype(header)); + + Ok(arrow::datatypes::Schema::new(columns)) +} + +fn required_column() -> Vec { + vec![ + arrow::datatypes::Field::new("chromosome", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("position", arrow::datatypes::DataType::Int32, false), + arrow::datatypes::Field::new( + "identifier", + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + "identifier", + arrow::datatypes::DataType::Utf8, + false, + ))), + false, + ), + arrow::datatypes::Field::new("reference", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("alternate", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("quality", arrow::datatypes::DataType::Float32, true), + arrow::datatypes::Field::new( + "filter", + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + "filter", + arrow::datatypes::DataType::Utf8, + false, + ))), + false, + ), + ] +} + +fn info(header: &noodles::vcf::Header, info_optional: bool) -> Vec { + let mut fields = Vec::new(); + + for (name, value) in header.infos() { + let key = format!("info_{name}"); + + let arrow_type = match value.ty() { + noodles::vcf::header::record::value::map::info::Type::Integer => { + arrow::datatypes::DataType::Int32 + } + noodles::vcf::header::record::value::map::info::Type::Float => { + arrow::datatypes::DataType::Float32 + } + noodles::vcf::header::record::value::map::info::Type::Flag => { + arrow::datatypes::DataType::Boolean + } + noodles::vcf::header::record::value::map::info::Type::Character => { + arrow::datatypes::DataType::Utf8 + } + noodles::vcf::header::record::value::map::info::Type::String => { + arrow::datatypes::DataType::Utf8 + } + }; + + match value.number() { + noodles::vcf::header::Number::Count(0 | 1) | noodles::vcf::header::Number::A => fields + .push(arrow::datatypes::Field::new( + &key, + arrow_type, + info_optional, + )), + noodles::vcf::header::Number::R => fields.push(arrow::datatypes::Field::new( + &key, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, + arrow_type, + info_optional, + ))), + info_optional, + )), + noodles::vcf::header::Number::Count(_n) => fields.push(arrow::datatypes::Field::new( + &key, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, + arrow_type, + info_optional, + ))), + info_optional, + )), + noodles::vcf::header::Number::G => fields.push(arrow::datatypes::Field::new( + &key, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, + arrow_type, + info_optional, + ))), + info_optional, + )), + + noodles::vcf::header::Number::Unknown => fields.push(arrow::datatypes::Field::new( + &key, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, + arrow_type, + info_optional, + ))), + info_optional, + )), + } + } + + fields +} + +fn genotype(header: &noodles::vcf::Header) -> Vec { + let mut fields = Vec::new(); + + for sample in header.sample_names() { + for (name, value) in header.formats() { + let key = format!("format_{sample}_{name}"); + + let arrow_type = match value.ty() { + noodles::vcf::header::record::value::map::format::Type::Integer => { + arrow::datatypes::DataType::Int32 + } + noodles::vcf::header::record::value::map::format::Type::Float => { + arrow::datatypes::DataType::Float32 + } + noodles::vcf::header::record::value::map::format::Type::Character => { + arrow::datatypes::DataType::Utf8 + } + noodles::vcf::header::record::value::map::format::Type::String => { + arrow::datatypes::DataType::Utf8 + } + }; + + match value.number() { + noodles::vcf::header::Number::Count(0 | 1) | noodles::vcf::header::Number::A => { + fields.push(arrow::datatypes::Field::new(key, arrow_type, true)) + } + noodles::vcf::header::Number::R => fields.push(arrow::datatypes::Field::new( + &key, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, arrow_type, true, + ))), + true, + )), + noodles::vcf::header::Number::Count(_n) => { + fields.push(arrow::datatypes::Field::new( + &key, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, arrow_type, true, + ))), + true, + )) + } + noodles::vcf::header::Number::G => fields.push(arrow::datatypes::Field::new( + &key, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, arrow_type, true, + ))), + true, + )), + + noodles::vcf::header::Number::Unknown => fields.push(arrow::datatypes::Field::new( + &key, + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + &key, arrow_type, true, + ))), + true, + )), + } + } + } + + fields +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + + // + // + + static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 +##fileDate=20220528 +##source=ClinVar +##reference=GRCh38 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##SAMPLE= +##SAMPLE= +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfirst\tsecond +"; + + lazy_static::lazy_static! { + static ref MINI_COLS: Vec = vec![ + arrow::datatypes::Field::new("chromosome", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("position", arrow::datatypes::DataType::Int32, false), + arrow::datatypes::Field::new( + "identifier", + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + "identifier", + arrow::datatypes::DataType::Utf8, + false, + ))), + false, + ), + arrow::datatypes::Field::new("reference", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("alternate", arrow::datatypes::DataType::Utf8, false), + arrow::datatypes::Field::new("quality", arrow::datatypes::DataType::Float32, true), + arrow::datatypes::Field::new( + "filter", + arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new( + "filter", + arrow::datatypes::DataType::Utf8, + false, + ))), + false, + ), + ]; + + static ref INFO_COLS: Vec = vec![ + arrow::datatypes::Field::new("info_Flag".to_string(), arrow::datatypes::DataType::Boolean, false), + arrow::datatypes::Field::new("info_Info1".to_string(),arrow::datatypes::DataType::Float32, false), + arrow::datatypes::Field::new( "info_Info_fixed".to_string(), arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_fixed".to_string(),arrow::datatypes::DataType::Int32, false)), ),false), + arrow::datatypes::Field::new("info_Info_A".to_string(),arrow::datatypes::DataType::Int32, false), + arrow::datatypes::Field::new("info_Info_RString".to_string(),arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_RString".to_string(),arrow::datatypes::DataType::Utf8, false)), ), false), + arrow::datatypes::Field::new("info_Info_RChar".to_string(),arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_RChar".to_string(),arrow::datatypes::DataType::Utf8, false)), ), false), + arrow::datatypes::Field::new("info_Info_G".to_string(), arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_G".to_string(),arrow::datatypes::DataType::Int32, false)), ), false), + arrow::datatypes::Field::new("info_Info_.".to_string(), arrow::datatypes::DataType::List(Arc::new(arrow::datatypes::Field::new("info_Info_.".to_string(),arrow::datatypes::DataType::Int32, false))), false) + ]; + + static ref FORMAT_COLS: Vec = vec![ + arrow::datatypes::Field::new( + "format_first_Format_1".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_fixed".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_first_Format_fixed".to_string(), + arrow::datatypes::DataType::Float32, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_A".to_string(), + arrow::datatypes::DataType::Utf8, + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_R".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_first_Format_R".to_string(), + arrow::datatypes::DataType::Utf8, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_G".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_first_Format_G".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_first_Format_.".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_1".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_fixed".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_second_Format_fixed".to_string(), + arrow::datatypes::DataType::Float32, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_A".to_string(), + arrow::datatypes::DataType::Utf8, + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_R".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_second_Format_R".to_string(), + arrow::datatypes::DataType::Utf8, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_G".to_string(), + arrow::datatypes::DataType::List(Arc::new( + arrow::datatypes::Field::new( + "format_second_Format_G".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ) + ),), + true, + + ), + arrow::datatypes::Field::new( + "format_second_Format_.".to_string(), + arrow::datatypes::DataType::Int32, + true, + + ) + + ]; + } + + #[test] + fn mini_cols() { + assert_eq!(required_column(), *MINI_COLS) + } + + #[test] + fn info_cols() { + let mut reader = noodles::vcf::Reader::new(VCF_FILE); + + let header: noodles::vcf::Header = reader.read_header().unwrap(); + + assert_eq!(info(&header, false), *INFO_COLS); + } + + #[test] + fn genotype_cols() { + let mut reader = noodles::vcf::Reader::new(VCF_FILE); + + let header: noodles::vcf::Header = reader.read_header().unwrap(); + + assert_eq!(genotype(&header), *FORMAT_COLS); + } + + #[test] + fn all_cols() { + let mut reader = noodles::vcf::Reader::new(VCF_FILE); + + let header: noodles::vcf::Header = reader.read_header().unwrap(); + + let mut data: Vec = Vec::new(); + data.extend_from_slice(&MINI_COLS); + data.extend_from_slice(&INFO_COLS); + data.extend_from_slice(&FORMAT_COLS); + + assert_eq!( + from_header(&header, false).unwrap(), + arrow::datatypes::Schema::new(data.iter().map(|x| x.clone()).collect::>()), + ); + } +} diff --git a/tests/data/test.parquet b/tests/data/test.parquet index 40e9fbe..6dee4ae 100644 Binary files a/tests/data/test.parquet and b/tests/data/test.parquet differ diff --git a/tests/data/test.vcf b/tests/data/test.vcf index 79691ac..fc6b797 100644 --- a/tests/data/test.vcf +++ b/tests/data/test.vcf @@ -1,25 +1,74 @@ ##fileformat=VCFv4.3 -##fileDate=20220528 -##source=ClinVar -##reference=GRCh38 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FILTER= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= ##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##SAMPLE= -##SAMPLE= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT first second -chr1 100 . A T 50 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_RString=ref,alt;Info_G=1,2,3;Info_u=0,1,2,3,4 GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 1/1:44:1,2,3,5:testA:r,a:1,2,3:0,2,5,6,1 -chr1 200 . C G,CG 60 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42,43;Info_RChar=r,a,A;Info_RString=ref,alt1,alt2;Info_G=1,2,3,4,5,6;Info_u=1,6,3,4,5 GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:2,4,6,8:testA1,testA2:R,A,B:1,2,3,4,5,6:0,2,4 1/2:45:2,1,6,8:testB1,testB2:R,a,b:1,2,3,4,5,6:0,2,4,5,6 -chr2 300 . G A 70 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_RString=ref,alt;Info_G=1,2,3;Info_u=0,1,2,3,4;Flag GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 \ No newline at end of file +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_0 sample_1 +YAR028W 509242864 . a ATg 6 Filter_0 info_Flag_0;info_Integer_1=-1867486102;info_Integer_2=1180908493,1041698941;info_Integer_A=-207506013;info_Integer_R=-1221871784,-1356802777;info_Integer_G=-496257853,2127853583;info_Integer_.=2082620030,-344161839,-1022296779,-1007334133;info_Float_1=68.286865;info_Float_2=-96.154594,-23.433853;info_Float_A=-48.782158;info_Float_R=-46.15216,-92.639305;info_Float_G=-7.5115204,74.78337;info_Float_.=26.825455;Flag_0info_Character_1=i;info_Character_2=r,[;info_Character_A=g;info_Character_R=M,D;info_Character_G=h,w;info_Character_.=G;info_String_1=p]ZoXMTgQo;info_String_2=uVGn`JweVD,DUYytzAny[;info_String_A=_POshsqbSj;info_String_R=AdbZcRFrrQ,_[VS^RtSvz;info_String_G=MeTjonYVIn,jLIi`oWogn;info_String_.=CzkT\Wk_sG GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1876597949:706761235,-251943823:394859496:-1947058767,424473864:1331697703,-73747609,1645597043,-1553292366,-1685240226:300184417:18.381859:55.763123,-25.909782:-23.853012:-65.84661,-26.444412:12.577988,-87.76228,-3.4822464,-95.66553,55.56636:-35.16729,6.755356:H:Y,N:m:[,Q:B,C,g,L,`:c,x:xXYm`NnOG[:K`QKgogYxZ,uNAMyDqpgZ:liSmUzRvGG:XBgqxa[aBw,_ZxxkAFA[o:`OIdJgjZDS,tKauvtaIhw,mmrIgNXcbh,Rd]QWyFOgu,kSjBlBKigq:znOIm[gGXi,[j\RlwOmAi 1/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +23 1165400956 . T t 199 . info_Integer_1=-597222189;info_Integer_2=446843965,1432841503;info_Integer_A=-1756403175;info_Integer_R=-1210584642,1067164582;info_Integer_G=-2026752623,1524204480,2063402043;info_Integer_.=387204105,-2048329790;info_Float_1=29.60765;info_Float_2=-70.24462,91.82048;info_Float_A=-57.780792;info_Float_R=-19.511703,87.46164;info_Float_G=17.362617,-10.059616,-89.640594;info_Float_.=82.884,-31.403328,-83.54941,-54.887726;Flag_0info_Character_1=];info_Character_2=Y,R;info_Character_A=p;info_Character_R=A,W;info_Character_G=i,y,I;info_Character_.=w;info_String_1=]gp_[s]vDh;info_String_2=Y\SmynkIV^,tOuGkqHsiE;info_String_A=QDdbnppEhM;info_String_R=VgQkWCCgEH,r^aAgT^sOf;info_String_G=z\_iwMGBRH,EQy^RJwkWd,gu]hpIwaVj;info_String_.=zxs[sGNNuy,cmnjXNUPka,QaFrhEZaIB,_TjXJMdWCM GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-871261733:-1500509753,2025272017:1864754769:-1127684339,-447878996:-1851298122,1367475939,1988967275,-439362500,-447904679:-907504720:-73.56331:-86.60319,-31.910011:6.785797:-95.413086,19.286415:9.942863,-23.623634,31.06224,42.57071,92.734314:-51.402973,-25.126984,73.030045:J:K,Y:K:c,^:y,P,V,b,S:U,w,M,X:IKZ]ZMDszw:apRf\BVTcU,UOJHFcgkaj:fIjt]RZCsd:TtoRPBHoRS,sDF^wkt`MK:boQ]OQxmec,eJfBqcdaUg,To]BkSYKbI,J\qQxtjZBq,\nQWJTeYEf:tHujPdNde\,EWFfR`mig_ 1/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +ENA|LT795502|LT795502.1 525786811 . A ATgA,CC 226 . info_Integer_1=1004917273;info_Integer_2=-1087925856,-1111801609;info_Integer_A=1142924498,51351;info_Integer_R=397636772,575245484,131531;info_Integer_G=631457844,1508219739,2060178753,1314135;info_Integer_.=915360277;info_Float_1=81.456085;info_Float_2=-97.36381,99.07503;info_Float_A=-17.968132,-30.1514;info_Float_R=23.030853,17.895386,16.51314;info_Float_G=36.786133,-36.816742,-79.92742,304.151351;info_Float_.=-56.670807,-88.687706;info_Character_1=x;info_Character_2=f,X;info_Character_A=Q,R;info_Character_R=M,v,p;info_Character_G=A,B,I,K;info_Character_.=s,S,a,E;info_String_1=RULoNvUdVj;info_String_2=YlKPytYpDY,hwIe\Lokil;info_String_A=\VZSHlparH,bonjour;info_String_R=PVoBxilKPl,_s`t`swTzf,il en faut peu pour être heureux;info_String_G=FiINYvUJIO,LtzFxYFFJp,mMeZaQtSZU,rFHUKkG]FO;info_String_.=hoXwhfvniY,A\txGAVbNp,SbcLeVnkYI GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1992801692:305092475,-1179215612:-1739296736,1531141442:-388696167,-659498070,13514131:1503808064,-183677921,-849522112:-498574773:14.566895:10.018013,-16.879608:99.71886,31.141592:0.29120636,-9.131622,-2.71828:80.328705,-26.403976,-37.213943:58.0925,-70.47134,-59.34813:X:N,q:S,M:a,_,u:c,W,U:m:^QkYoWratI:ryX^JlAyCx,WGZntpNsOo:FRryqZFoMj,fromage:gCO^BOI[ml,VJqiy[VWym,ah bah non:nbtqw^\zmA,ZiBBJm[Vbv,aNMll`xnfr:rHzB`UssLW,apsPd_lrip,Uih`ROsUql,tnBQQdhtwm 1|2:.:.:.:.:1,2,3,4,5,6:.:.:.:.:.:1.4142,3.141592,2.71828,1.618,273.15,6.67:.:.:.:.:.:a,b,c,d,e,f:.:.:.:.:.:AA,BB,CC,DD,EEe,FF:. +ENA|LT795502|LT795502.1 1506498921 . A gT 99 Filter_0 info_Integer_1=1074860489;info_Integer_2=-6784655,1952022752;info_Integer_A=-1765522773;info_Integer_R=1316333577,-554518728;info_Integer_G=-440746192,417172829,1208578807;info_Integer_.=-67150747,-701563860,1708267257;info_Float_1=-85.47166;info_Float_2=33.09308,37.761444;info_Float_A=99.544266;info_Float_R=-4.276779,27.070168;info_Float_G=-93.02027,-68.755196,-18.597626;info_Float_.=45.63858;info_Character_1=R;info_Character_2=B,W;info_Character_A=l;info_Character_R=z,E;info_Character_G=Z,h,F,D,D;info_Character_.=n,G,v;info_String_1=ojZkSfujYX;info_String_2=`ZrZJtq_hx,StcGnLjWNS;info_String_A=k[feQ[mqyE;info_String_R=Grr[rGo^md,GkiXanc\K\;info_String_G=Yhij\pOPji,yYlsCnJSCY,VggsEuC]ad,G^jiYbvsbn,IJmJvG`jzs;info_String_.=CKtxVQFr]_,G^jAnQaGyI,yvzleXG`vO GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-2039921838:1260782784,-2144365597:1110295788:-158846729,1495837063:390766793,1114219927,-790406568,1652554877,-1144133980:-610126444,1736640977:0.7073364:-56.6103,13.725807:-8.711052:-35.14733,55.534668:-41.20214,-69.47873,15.234543,1.8139114,-81.88782:-86.727234:]:f,Q:I:u,_:b,k,C,[,T:b,S,O:hGS[GPZUZu:uQuJZtwYq_,SyDvF`v_[[:ol]TtBXxVP:mFMNUVM`Ir,XbBPeoBkYj:`xGS_`zgey,v\]bxaFPdJ,lGqnBWyHQI,ynpDFGuSsm,^bGxsDdgIl:q`HG\bwqSl,GrnuUSzgVy,cRSPjljk_Q,TWbW]MISyd 0|1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +chrMT 900574305 . c a 208 Filter_0 info_Integer_1=-523627641;info_Integer_2=828853617,538841733;info_Integer_A=-1070289656;info_Integer_R=1177092376,1248528320;info_Integer_G=1338006213,-939491184,2031520519;info_Integer_.=1020802949,-325766450,-1975174725;info_Float_1=86.21178;info_Float_2=55.274292,57.992126;info_Float_A=98.6465;info_Float_R=-56.676746,-78.452255;info_Float_G=-15.1058655,-32.05681,-23.85817;info_Float_.=-8.955811,49.62912,72.4005;Flag_0info_Character_1=x;info_Character_2=B,r;info_Character_A=i;info_Character_R=\,D;info_Character_G=g,v,l,W,T;info_Character_.=g;info_String_1=j`zU\K`PLc;info_String_2=l_rhk[Kr]b,rFB_aSUBR`;info_String_A=ZQYIsGAof_;info_String_R=HwrUBliWel,scCvWxgE[r;info_String_G=kY`HMgmH_p,IUvLZ]sdba,nVQJ_Fh`ET,QnYUFz`ShS,KXwJfcYmsw;info_String_.=Q]wexXkHyr,fL\NGDMlkW,jVpWnME[tp,Zz\hrFyx`] GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1648034443:845207498,1438334805:-821666363:-532302872,-784878946:-1660896800,2008926111,1279825538,-233248668,-1578146061:-1728381686,-1354962949,-2095339305:4.250908:1.8742523,-78.206894:-22.634148:-61.518906,13.15873:-64.15179,17.086502,-25.609276,57.059555,-92.1911:26.885536,-27.22528,-51.00875:F:l,e:M:y,e:^,X,g,x,h:K:LXIC\KLZsD:Nd[vIMiHJA,^ConDldYtT:CBqgBJnzRq:nOgdeNbqKd,rTYmYwJcQu:WMlQs]gO[a,Kk[sJ[UoxT,HT]XWH^ZTF,IIJXSmrLHg,qo`OJ_hgav:tSEKSQWXRL 1/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +X 508903144 . T taAG 107 Filter_1 info_Integer_1=-442012684;info_Integer_2=1242798393,-893635990;info_Integer_A=-1049853993;info_Integer_R=242988245,-245551581;info_Integer_G=992362638,-556141956,-1436766801;info_Integer_.=-1890267838;info_Float_1=50.181763;info_Float_2=5.1533203,32.221054;info_Float_A=31.930801;info_Float_R=18.487122,-4.3887863;info_Float_G=-98.10066,-69.57614,91.27092;info_Float_.=-46.815468,-65.40532;info_Character_1=_;info_Character_2=C,c;info_Character_A=w;info_Character_R=v,g;info_Character_G=v,Q,r,`,W;info_Character_.=\,j,W;info_String_1=Ol_Gd[f\tt;info_String_2=DX_Nqhsbft,ZvYZmhftHw;info_String_A=bXt^\wzwfQ;info_String_R=ANOP[Zjcef,[mLDzYe^Xa;info_String_G=i_XoxRH]Up,N\qKskBEfm,vceQjVrtTu,_LnQ_[ngn],yd[ZFNmECq;info_String_.=FPpMF]TI[C,skPWfxtNBS,uEV]cCazM[ GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1784107859:-939663,-275551415:1120891123:-769476243,636006815:-1200543946,215752478,-1484326861,-561668774,-176043456:1567908440,-504633662,-461948338:41.60312:89.26175,-28.585121:-25.16079:-93.47296,-97.84482:-64.203835,62.569122,-16.536377,24.606033,-83.769844:-0.48690033,50.683716,-61.86049:u:p,x:G:O,q:J,g,s,K,g:c,l,a,w:mgLADffOAW:OFYzsNT]DN,knpK_\ZlLw:NIGi]zaPz\:vcAptCt\VT,z[VAZFjS[p:tcGuXLiEpv,fCLE[^Bzbh,rkGwaRnhxi,fTMvHtFRwN,RPYaXfd^BF:DOnEKUN[]V,ppBNNWrhhK ./.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +23 2057099842 . g a 105 . info_String_1=ukwTtXgA^` GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:883085881:-1163194169,396734570:-864830302:-1015935718,416512239:1055659899,378650980,1719415308:1562372336,38255854:-96.96286:-22.160027,2.9108505:-76.4101:-31.626724,74.70122:-10.465622,92.86656,66.5076:-35.273743,71.3008,-21.861176:N:Z,x:w:D,Q:g,[,i:b,u,q:rOrdrrnAAf:fkbpBSUynu,ks[PFA_asT:JK\[phzgTx:sIMyglMmy[,zdTHmmDXr`:mJZqxRqayy,nzIi[BVIeR,TJX_tcRRuR:urxfXYFYWh,yxg_KbMfiV,O^NfHdYiTT,RQLQTrA`[D 1/0:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +NC_000015.10 278483743 . c Gc 68 Filter_0 info_Integer_1=-1771300013;info_Integer_2=611485162,1796725452;info_Integer_A=971438374;info_Integer_R=698255143,905472298;info_Integer_G=-200904731,1733482657,-1601571925;info_Integer_.=1226331110,1800309665;info_Float_1=-8.41539;info_Float_2=-91.844246,33.56476;info_Float_A=-70.37154;info_Float_R=-88.31048,49.067856;info_Float_G=-56.826736,-72.017075,58.757156;info_Float_.=-93.33689,-11.933228;info_Character_1=V;info_Character_2=F,H;info_Character_A=];info_Character_R=],k;info_Character_G=b,j,c,v,k;info_Character_.=W,J,`;info_String_1=[sg[NfQUjS;info_String_2=tuIDx]qY`n,sYihzjCcDX;info_String_A=rWeIJoLqif;info_String_R=Lc\nOEn`SI,ohc]_`UFau;info_String_G=ts\z[cGhVY,FWhsZospCl,EibJWC`AtQ,mNRPCdUKvw,lVzUvcofUf;info_String_.=ivsywIpK\E,dw[sIibpcF,ngkSonUgLJ,OXEnvoSKPb GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:1053018922:-184938975,-1760026633:-857964358:-2102716665,1815665606:-1175872690,-347280558,1231790968,-790356303,-222139945:-1345714777,160079922:-82.493515:64.684265,-99.60785:-19.693016:-3.5498886,-99.20609:-5.4513702,-81.74608,-6.946541,-87.49165,-31.873795:-0.010063171:x:n,^:R:b,h:h,c,],\,X:I:LWSccGJLM_:BFr_AeHgBF,HkaUOy`jqz:FGMQKwoeFA:^WCLcfxttz,YAipFMM\Sa:WfUeg^ehSy,Rn^uqvYSmS,sEKSkLrC\p,QDmf[JGzbG,QiZIx`^pZN:Kmi[\ChDrN,OJdBvVq[Af 0/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +NC_016845.1 1273217582 . A cA 185 Filter_1 info_Integer_1=1702504238;info_Integer_2=-1300020074,-1771363986;info_Integer_A=-666582393;info_Integer_R=-1483769984,-1241578554;info_Integer_G=1976807172,-1260807615,-108510257;info_Integer_.=1829528682,-928482172,-429726805,-2007283327;info_Float_1=70.20416;info_Float_2=94.22778,49.014664;info_Float_A=77.67261;info_Float_R=69.01376,-85.50122;info_Float_G=22.049858,-31.612656,67.47859;info_Float_.=-79.06835,36.144714,-11.66687,-33.392593;Flag_0info_Character_1=Y;info_Character_2=s,w;info_Character_A=Y;info_Character_R=E,S;info_Character_G=J,C,C,^,N;info_Character_.=s;info_String_1=RnqfrhRxGK;info_String_2=QCczZsqSMX,UadliszTvD;info_String_A=UeAmTxgIJs;info_String_R=RpC[yfli[m,UMXScERIAT;info_String_G=Piyg[YSyn],wrVNLOsrsd,CuaYlZzSG`,hHfxMnZBYb,fELmQbwhQV;info_String_.=MHfN_MSgEe,_REWJxavTD,SkkKdTmDLG,R[oCTWMP\K GT:format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 0/1:-1879771452:-2024174827,895533931:25454566:-1413372025,-1730678484:1651476894,522323445,-801323168,-692607812,1081910993:-1359195475:64.78084:98.23514,-95.097374:97.43535:-65.953636,7.431885:74.098724,-84.2887,-56.68762,-86.44216,96.5443:32.076004:K:G,\:G:W,z:h,o,j,X,Y:e,G,x:nbjlHj^`q]:aa^uI^^SoQ,JyJ\ARpaJg:MniIYiZryL:qDr]fJV]eR,iuDxSPv[oY:qLjrYY]bPA,rejiDo^By[,`mxXgnjkPa,_grwZxX`kA,\OLO_zFEeT:xJR]YarNNn 0/1:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. +ENA|LT795502|LT795502.1 566884162 . t c 22 . info_Integer_1=-63306296;info_Integer_2=1391506844,-1503768112;info_Integer_A=340548256;info_Integer_R=-1286314818,288781403;info_Integer_G=-800469678,-1311787939,-793948174;info_Integer_.=-1341990003;info_Float_1=-76.227356;info_Float_2=-54.977512,-39.39898;info_Float_A=-35.61332;info_Float_R=-70.32056,-42.79394;info_Float_G=67.78093,43.006317,92.26671;info_Float_.=-60.336803,-45.87288,92.96947,-43.244385;info_Character_1=[;info_Character_2=c,J;info_Character_A=R;info_Character_R=o,d;info_Character_G=h,`,F,\,q;info_Character_.=T;info_String_1=q^HZe_mW_C;info_String_2=FPSDvSVXAd,YbrjDSdRXm;info_String_A=IxDTHZYoq[;info_String_R=OsOWlbXzO\,hAhG_b\Ifw;info_String_G=jb^GYiHZRT,_[`_aqmUIf,PtWWNPUINQ,WkqQaaxSee,jRMUC_IYwu;info_String_.=ZVqn\yRJEI,`vlpPiWkLZ,aVHocDfVJv format_Integer_1:format_Integer_2:format_Integer_A:format_Integer_R:format_Integer_G:format_Integer_.:format_Float_1:format_Float_2:format_Float_A:format_Float_R:format_Float_G:format_Float_.:format_Character_1:format_Character_2:format_Character_A:format_Character_R:format_Character_G:format_Character_.:format_String_1:format_String_2:format_String_A:format_String_R:format_String_G:format_String_. 389250658:-1173892904,-995837010:380428736:-350796083,-1946061625:-1985077526,-956832721:212134446:-93.3871:73.895645,-82.49681:-59.703255:-53.21877,-11.0794525:98.62854,-40.406464,36.850067:-80.0885,25.734207,92.746826,24.650955:Q:l,w:j:t,Z:K,X:i:VAXYF^LWPG:SudBRfeYRI,axYzALsh[m:gWvHMgghOt:cIIIEUOOnN,Q`yNRLvwIx:HeiQgtTGFY,A[RlKUJYGM:u[WuJ]OfAC,ToajkjZMqO .:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:. \ No newline at end of file diff --git a/tests/functional.rs b/tests/functional.rs index 1f0024f..f212c1e 100644 --- a/tests/functional.rs +++ b/tests/functional.rs @@ -14,7 +14,36 @@ fn help() -> Result<(), assert_cmd::cargo::CargoError> { cmd.args(["-h"]); - let truth: &[u8] = b"Convert a vcf in parquet + let truth: &[u8] = if cfg!(windows) { + b"Convert a vcf in parquet + +Usage: vcf2parquet.exe [OPTIONS] --input + +Commands: + convert Convert a vcf in a parquet + split Convert a vcf in multiple parquet file each file contains `batch_size` record + help Print this message or the help of the given subcommand(s) + +Options: + -i, --input + Input path + -b, --batch-size + Batch size (default 100,000) + -c, --compression + Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] + -r, --read-buffer + Read buffer size in bytes (default 8192) + -I, --info-optional + All information fields are optional + --parquet-version + [possible values: v1, v2] + -h, --help + Print help (see more with \'--help\') + -V, --version + Print version +" + } else { + b"Convert a vcf in parquet Usage: vcf2parquet [OPTIONS] --input @@ -24,14 +53,24 @@ Commands: help Print this message or the help of the given subcommand(s) Options: - -i, --input Input path - -b, --batch-size Batch size (default 100,000) - -c, --compression Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] - -r, --read-buffer Read buffer size in bytes (default 8192) - -I, --info-optional All information fields are optional - -h, --help Print help - -V, --version Print version -"; + -i, --input + Input path + -b, --batch-size + Batch size (default 100,000) + -c, --compression + Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] + -r, --read-buffer + Read buffer size in bytes (default 8192) + -I, --info-optional + All information fields are optional + --parquet-version + [possible values: v1, v2] + -h, --help + Print help (see more with \'--help\') + -V, --version + Print version +" + }; let assert = cmd.assert(); @@ -49,6 +88,7 @@ fn convert() -> Result<(), assert_cmd::cargo::CargoError> { let parquet_path = temp_path.join("tests.parquet"); cmd.args([ + "-I", "-i", "tests/data/test.vcf", "convert", @@ -86,6 +126,7 @@ fn split() -> Result<(), assert_cmd::cargo::CargoError> { let parquet_path = temp_path.join("test_{}.parquet"); cmd.args([ + "-I", "-i", "tests/data/test.vcf", "split", diff --git a/vcf2parquet-bin/Cargo.toml b/vcf2parquet-bin/Cargo.toml deleted file mode 100644 index f9f92ec..0000000 --- a/vcf2parquet-bin/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "vcf2parquet-bin" -version = "0.6.0" -edition = "2021" - -[dependencies] -vcf2parquet-lib = { path = "../vcf2parquet-lib", version = "0.6.0" } -niffler = { version = "2" } -arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"] } - -# logging management -simplelog = { version = "0.12" } - -# error management -thiserror = { version = "1" } - -# cli management -clap = { version = "4", features = ["derive"] } diff --git a/vcf2parquet-bin/src/error.rs b/vcf2parquet-bin/src/error.rs deleted file mode 100644 index 921c1ae..0000000 --- a/vcf2parquet-bin/src/error.rs +++ /dev/null @@ -1,80 +0,0 @@ -//! error of vcf2parquet-bin - -/* std use */ - -/* crate use */ - -/* project use */ - -#[derive(thiserror::Error, std::fmt::Debug)] -pub enum Error { - /// Io error - #[error(transparent)] - Io { error: std::io::Error }, - - /// Niffler error - #[error(transparent)] - Niffler { error: niffler::Error }, - - /// vcf2parquet-lib error - #[error(transparent)] - Lib { - error: vcf2parquet_lib::error::Error, - }, -} - -pub fn mapping(error: E) -> Error -where - E: std::convert::Into, -{ - error.into() -} - -impl From for Error { - fn from(error: std::io::Error) -> Self { - Error::Io { error } - } -} - -impl From for Error { - fn from(error: niffler::Error) -> Self { - Error::Niffler { error } - } -} - -impl From for Error { - fn from(error: vcf2parquet_lib::error::Error) -> Self { - Error::Lib { error } - } -} - -pub type Result = std::result::Result; - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn error_conversion() { - assert_eq!( - format!( - "{:?}", - Error::from(std::io::Error::new(std::io::ErrorKind::NotFound, "test")) - ), - "Io { error: Custom { kind: NotFound, error: \"test\" } }".to_string() - ); - - assert_eq!( - format!("{:?}", Error::from(niffler::Error::FileTooShort)), - "Niffler { error: FileTooShort }".to_string() - ); - - assert_eq!( - format!( - "{:?}", - Error::from(vcf2parquet_lib::error::Error::NoConversion) - ), - "Lib { error: NoConversion }".to_string() - ); - } -} diff --git a/vcf2parquet-lib/Cargo.toml b/vcf2parquet-lib/Cargo.toml deleted file mode 100644 index 221788c..0000000 --- a/vcf2parquet-lib/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "vcf2parquet-lib" -version = "0.6.0" -edition = "2021" - -[dependencies] -rayon = { version = "1" } - -# input output management -niffler = { version = "2" } -noodles = { version = "0.64", features = ["vcf"] } -arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"] } -rustc-hash = { version = "1" } - -# logging and error management -log = { version = "0.4" } -thiserror = { version = "1" } - -[dev-dependencies] -lazy_static = { version = "1" } -tempfile = { version = "3" } \ No newline at end of file diff --git a/vcf2parquet-lib/src/lib.rs b/vcf2parquet-lib/src/lib.rs deleted file mode 100644 index e89c1f6..0000000 --- a/vcf2parquet-lib/src/lib.rs +++ /dev/null @@ -1,327 +0,0 @@ -//! vcf2parquet library - -/* std use */ - -/* crate use */ - -/* project use */ - -/* mod section */ -pub mod error; -pub mod name2data; -pub mod record2chunk; -pub mod schema; - -/// Read `input` vcf and write parquet in `output` -pub fn vcf2parquet( - input: &mut R, - output: &mut W, - batch_size: usize, - compression: arrow2::io::parquet::write::CompressionOptions, - info_optional: bool, -) -> error::Result<()> -where - R: std::io::BufRead, - W: std::io::Write, -{ - // VCF section - let mut reader = noodles::vcf::Reader::new(input); - - let vcf_header: noodles::vcf::Header = reader.read_header()?; - - // Parquet section - let schema = schema::from_header(&vcf_header, info_optional)?; - - let mut iterator = reader.records(&vcf_header); - let chunk_iterator = record2chunk::Record2Chunk::new( - &mut iterator, - batch_size, - vcf_header.clone(), - schema.clone(), - ); - - let options = arrow2::io::parquet::write::WriteOptions { - write_statistics: true, - compression, - version: arrow2::io::parquet::write::Version::V2, - data_pagesize_limit: Some(batch_size), - }; - - let encodings = chunk_iterator.encodings(); - let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( - chunk_iterator, - &schema, - options, - encodings, - )?; - - let mut writer = arrow2::io::parquet::write::FileWriter::try_new(output, schema, options)?; - - for group in row_groups { - writer.write(group?)?; - } - let _ = writer.end(None)?; - - Ok(()) -} - -/// Read `input` vcf and write each row group in a parquet file match with template -pub fn vcf2multiparquet( - input: &mut R, - template: &str, - batch_size: usize, - compression: arrow2::io::parquet::write::CompressionOptions, - info_optional: bool, -) -> error::Result<()> -where - R: std::io::BufRead, -{ - // VCF section - let mut reader = noodles::vcf::Reader::new(input); - - let vcf_header: noodles::vcf::Header = reader.read_header()?; - - // Parquet section - let schema = schema::from_header(&vcf_header, info_optional)?; - - let mut iterator = reader.records(&vcf_header); - let chunk_iterator = record2chunk::Record2Chunk::new( - &mut iterator, - batch_size, - vcf_header.clone(), - schema.clone(), - ); - - let options = arrow2::io::parquet::write::WriteOptions { - write_statistics: true, - compression, - version: arrow2::io::parquet::write::Version::V2, - data_pagesize_limit: Some(batch_size), - }; - - let encodings = chunk_iterator.encodings(); - let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( - chunk_iterator, - &schema, - options, - encodings, - )?; - - for (index, group) in row_groups.enumerate() { - let output = std::fs::File::create(template.replace("{}", &index.to_string()))?; - let mut writer = - arrow2::io::parquet::write::FileWriter::try_new(output, schema.clone(), options)?; - - writer.write(group?)?; - writer.end(None)?; - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 -#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO -1\t925952\t1019397\tG\tA\t.\t.\t. -"; - - static PARQUET_FILE: &[u8] = &[ - 80, 65, 82, 49, 21, 6, 21, 10, 21, 50, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, - 28, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, - 96, 48, 4, 0, 151, 222, 156, 170, 5, 0, 0, 0, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, - 114, 111, 109, 111, 115, 111, 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, - 40, 1, 49, 24, 1, 49, 0, 0, 21, 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, - 21, 0, 17, 28, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 0, 31, 139, 8, 0, 0, - 0, 0, 0, 0, 255, 99, 80, 228, 99, 0, 0, 69, 222, 72, 134, 4, 0, 0, 0, 21, 2, 25, 37, 0, 6, - 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, - 1, 60, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 21, 6, 21, 30, 21, 70, 92, - 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, - 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 0, 3, 0, 3, 1, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, - 99, 103, 96, 96, 48, 52, 48, 180, 52, 182, 52, 7, 0, 69, 88, 164, 201, 11, 0, 0, 0, 21, 12, - 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 4, 108, 105, - 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, 160, 3, 60, 54, 0, 40, 7, 49, - 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 21, 6, 21, 10, 21, 50, 92, - 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 0, 31, - 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 7, 0, 158, 10, 250, 19, 5, 0, 0, 0, - 21, 12, 25, 37, 0, 6, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, - 22, 74, 22, 114, 38, 186, 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 21, 6, 21, 10, 21, 50, - 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 0, - 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 4, 0, 171, 175, 153, 250, 5, 0, 0, - 0, 21, 12, 25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, - 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 21, 6, 21, 4, 21, - 44, 92, 21, 2, 21, 2, 21, 2, 21, 0, 21, 4, 21, 0, 17, 28, 54, 2, 0, 0, 0, 3, 0, 31, 139, 8, - 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, - 117, 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 21, - 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 0, 0, 0, 3, - 0, 3, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 12, 25, 37, - 0, 6, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, - 116, 101, 114, 21, 4, 22, 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 21, 12, 25, 5, - 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 21, 2, 25, 5, 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, - 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, - 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, - 25, 5, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 0, 22, - 0, 22, 0, 22, 0, 38, 0, 0, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, - 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, - 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, - 0, 25, 17, 2, 25, 24, 1, 49, 25, 24, 1, 49, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 4, 0, - 33, 14, 0, 25, 24, 4, 0, 33, 14, 0, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 7, 49, 48, 49, - 57, 51, 57, 55, 25, 24, 7, 49, 48, 49, 57, 51, 57, 55, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, - 24, 1, 71, 25, 24, 1, 71, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 65, 25, 24, 1, 65, 21, - 0, 25, 22, 0, 0, 25, 17, 1, 25, 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 2, 0, 25, 17, 1, 25, - 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 0, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, - 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, - 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, - 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 28, 22, 8, 21, 114, 22, 0, 0, 0, 25, 28, 22, - 202, 1, 21, 124, 22, 0, 0, 0, 25, 28, 22, 160, 3, 21, 158, 1, 22, 0, 0, 0, 25, 28, 22, 186, - 5, 21, 114, 22, 0, 0, 0, 25, 28, 22, 252, 6, 21, 114, 22, 0, 0, 0, 25, 28, 22, 190, 8, 21, - 96, 22, 0, 0, 0, 25, 28, 22, 222, 9, 21, 100, 22, 0, 0, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, - 25, 12, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, 21, 4, 25, 204, 72, 4, 114, 111, 111, 116, 21, - 14, 0, 21, 12, 37, 0, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 37, 0, 76, - 28, 0, 0, 0, 21, 2, 37, 0, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 0, 53, 0, 24, 10, - 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, - 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 2, 105, 100, 37, 0, 76, 28, 0, 0, 0, - 21, 12, 37, 0, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 37, 0, 76, 28, 0, 0, 0, - 21, 12, 37, 0, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 37, 0, 76, 28, 0, 0, 0, - 21, 8, 37, 2, 24, 7, 113, 117, 97, 108, 105, 116, 121, 0, 53, 0, 24, 6, 102, 105, 108, 116, - 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, - 12, 37, 0, 24, 6, 102, 105, 108, 116, 101, 114, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 44, 25, - 124, 38, 122, 28, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, - 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 22, - 222, 17, 21, 20, 22, 178, 14, 21, 34, 0, 38, 198, 2, 28, 21, 2, 25, 37, 0, 6, 25, 24, 8, - 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, 1, 60, 54, - 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 22, 242, 17, 21, 22, 22, 212, 14, 21, - 46, 0, 38, 190, 4, 28, 21, 12, 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, - 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, - 160, 3, 60, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, - 0, 22, 136, 18, 21, 24, 22, 130, 15, 21, 58, 0, 38, 172, 6, 28, 21, 12, 25, 37, 0, 6, 25, - 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 186, - 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 22, 160, 18, 21, 22, 22, 188, 15, 21, 34, 0, 38, - 238, 7, 28, 21, 12, 25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, - 4, 22, 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 22, 182, 18, - 21, 22, 22, 222, 15, 21, 34, 0, 38, 158, 9, 28, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, 117, - 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 22, 204, - 18, 21, 22, 22, 128, 16, 21, 34, 0, 38, 194, 10, 28, 21, 12, 25, 37, 0, 6, 25, 56, 6, 102, - 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 4, 22, - 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 22, 226, 18, 21, 22, 22, 162, 16, 21, 34, - 0, 22, 156, 4, 22, 2, 38, 8, 22, 180, 6, 20, 0, 0, 25, 124, 38, 0, 28, 21, 12, 25, 5, 25, - 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, - 0, 22, 248, 18, 21, 6, 22, 196, 16, 21, 22, 0, 38, 0, 28, 21, 2, 25, 5, 25, 24, 8, 112, - 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 254, 18, 21, - 6, 22, 218, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, - 105, 102, 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 22, 132, 19, 21, 6, 22, 240, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 24, 9, - 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 138, - 19, 21, 6, 22, 134, 17, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, - 114, 110, 97, 116, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 144, 19, 21, 6, 22, 156, - 17, 21, 22, 0, 38, 0, 28, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, 0, - 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 150, 19, 21, 6, 22, 178, 17, 21, 22, 0, 38, 0, 28, 21, - 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, - 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 156, 19, 21, 6, 22, 200, 17, - 21, 22, 0, 22, 0, 22, 0, 38, 0, 22, 0, 20, 2, 0, 25, 28, 24, 12, 65, 82, 82, 79, 87, 58, - 115, 99, 104, 101, 109, 97, 24, 244, 6, 47, 47, 47, 47, 47, 52, 56, 67, 65, 65, 65, 69, 65, - 65, 65, 65, 56, 118, 47, 47, 47, 120, 81, 65, 65, 65, 65, 69, 65, 65, 69, 65, 65, 65, 65, - 75, 65, 65, 115, 65, 67, 65, 65, 75, 65, 65, 81, 65, 43, 80, 47, 47, 47, 119, 119, 65, 65, - 65, 65, 73, 65, 65, 103, 65, 65, 65, 65, 69, 65, 65, 99, 65, 65, 65, 65, 103, 65, 103, 65, - 65, 48, 65, 69, 65, 65, 70, 65, 66, 65, 65, 65, 77, 65, 81, 65, 65, 121, 65, 65, 65, 65, - 73, 81, 65, 65, 65, 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 50, 119, 65, 65, 65, 66, - 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, - 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 81, 65, 65, 65, 65, 81, - 65, 65, 65, 68, 115, 47, 47, 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, - 65, 65, 65, 66, 81, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, - 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, - 65, 71, 65, 65, 65, 65, 90, 109, 108, 115, 100, 71, 86, 121, 65, 65, 68, 56, 47, 47, 47, - 47, 66, 65, 65, 69, 65, 65, 89, 65, 65, 65, 66, 109, 97, 87, 120, 48, 90, 88, 73, 65, 65, - 79, 122, 47, 47, 47, 56, 119, 65, 65, 65, 65, 73, 65, 65, 65, 65, 66, 103, 65, 65, 65, 65, - 66, 65, 119, 65, 65, 69, 65, 65, 83, 65, 65, 81, 65, 69, 65, 65, 82, 65, 65, 103, 65, 65, - 65, 65, 77, 65, 65, 65, 65, 65, 65, 68, 54, 47, 47, 47, 47, 65, 81, 65, 71, 65, 65, 89, 65, - 66, 65, 65, 72, 65, 65, 65, 65, 99, 88, 86, 104, 98, 71, 108, 48, 101, 81, 68, 115, 47, 47, - 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, 65, 65, 66, 81, 65, 65, 65, - 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, 65, 65, 65, 65, 68, 65, 65, 65, - 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, 65, 74, 65, 65, 65, 65, 89, 87, - 120, 48, 90, 88, 74, 117, 89, 88, 82, 108, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, - 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, - 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, - 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 81, 65, 65, 65, 72, 74, 108, 90, - 109, 86, 121, 90, 87, 53, 106, 90, 81, 65, 65, 65, 79, 122, 47, 47, 47, 57, 111, 65, 65, - 65, 65, 88, 65, 65, 65, 65, 66, 103, 65, 65, 65, 65, 77, 65, 65, 65, 65, 69, 65, 65, 82, - 65, 65, 81, 65, 65, 65, 65, 81, 65, 65, 103, 65, 65, 65, 65, 77, 65, 65, 69, 65, 65, 65, - 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, 65, 65, 65, 65, 103, 65, 65, 65, 65, - 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, - 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, - 65, 81, 65, 65, 103, 65, 65, 65, 71, 108, 107, 65, 65, 68, 56, 47, 47, 47, 47, 66, 65, 65, - 69, 65, 65, 111, 65, 65, 65, 66, 112, 90, 71, 86, 117, 100, 71, 108, 109, 97, 87, 86, 121, - 65, 65, 68, 115, 47, 47, 47, 47, 79, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, - 65, 65, 65, 103, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, - 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 57, 80, 47, 47, 47, 121, 65, 65, 65, 65, - 65, 66, 65, 65, 65, 65, 67, 65, 65, 74, 65, 65, 81, 65, 67, 65, 65, 73, 65, 65, 65, 65, 99, - 71, 57, 122, 97, 88, 82, 112, 98, 50, 52, 65, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, - 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, - 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, - 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 103, 65, 65, 65, 71, 78, 111, 99, - 109, 57, 116, 98, 51, 78, 118, 98, 87, 85, 65, 0, 24, 44, 65, 114, 114, 111, 119, 50, 32, - 45, 32, 78, 97, 116, 105, 118, 101, 32, 82, 117, 115, 116, 32, 105, 109, 112, 108, 101, - 109, 101, 110, 116, 97, 116, 105, 111, 110, 32, 111, 102, 32, 65, 114, 114, 111, 119, 0, - 107, 7, 0, 0, 80, 65, 82, 49, - ]; - - #[test] - fn convert_positives() { - let mut input = std::io::BufReader::new(&*VCF_FILE); - let mut output = Vec::new(); - - vcf2parquet( - &mut input, - &mut output, - 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), - false, - ) - .unwrap(); - assert_eq!(output, *PARQUET_FILE); - } - - #[test] - fn not_a_vcf() { - let raw_data = [b'#', b'a', b'b', b'c', 255, 0x7F, b'\n'].to_vec(); - let mut input = std::io::BufReader::new(&raw_data[..]); - let mut output = Vec::new(); - - let result = vcf2parquet( - &mut input, - &mut output, - 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), - false, - ); - - assert!(result.is_err()); - } - - #[test] - fn multi_positives() { - let mut input = std::io::BufReader::new(&*VCF_FILE); - let dir = tempfile::tempdir().unwrap(); - - let format = dir - .path() - .join("test_{}.parquet") - .as_os_str() - .to_str() - .unwrap() - .to_string(); - - vcf2multiparquet( - &mut input, - &format, - 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), - false, - ) - .unwrap(); - } -} diff --git a/vcf2parquet-lib/src/name2data.rs b/vcf2parquet-lib/src/name2data.rs deleted file mode 100644 index 4d48c84..0000000 --- a/vcf2parquet-lib/src/name2data.rs +++ /dev/null @@ -1,1177 +0,0 @@ -//! Struct to link name and data - -/* std use */ - -/* crate use */ -use arrow2::array::MutableArray; -use arrow2::array::MutablePrimitiveArray; -use arrow2::array::TryPush; -use noodles::vcf::record::genotypes::sample::value::genotype::allele::Phasing; - -/* project use */ - -///Alias of [std::collections::HashMap] that associate a column name and [ColumnData], a proxy of arrow2 datastructure -#[derive(Debug)] -pub struct Name2Data(rustc_hash::FxHashMap); - -impl Name2Data { - /// Create a new Name2Data, vcf header is required to add info and genotype column - /// length parameter is used to preallocate memory - pub fn new(length: usize, schema: &arrow2::datatypes::Schema) -> Self { - let mut name2data = rustc_hash::FxHashMap::default(); - for field in schema.fields.iter() { - name2data.insert( - field.name.clone(), - ColumnData::new(&field.data_type, length), - ); - } - Name2Data(name2data) - } - - /// Just a wrapper arround [std::collections::HashMap::get] - pub fn get(&self, key: &str) -> Option<&ColumnData> { - self.0.get(key) - } - - /// Just a wrapper arround [std::collections::HashMap::get_mut] - pub fn get_mut(&mut self, key: &str) -> Option<&mut ColumnData> { - self.0.get_mut(key) - } - - /// Add a vcf record in [std::collections::HashMap] struct - pub fn add_record( - &mut self, - record: noodles::vcf::Record, - header: &noodles::vcf::Header, - schema: &arrow2::datatypes::Schema, - ) -> std::result::Result<(), arrow2::error::Error> { - let allele_count = record.alternate_bases().len() + 1; - for (alt_id, allele) in record.alternate_bases().iter().enumerate() { - for (key, column) in self.0.iter_mut() { - match key.as_str() { - "chromosome" => column.push_string(record.chromosome().to_string()), - "position" => column.push_i32(Some(usize::from(record.position()) as i32)), - "identifier" => column.push_vecstring( - record.ids().iter().map(|s| Some(s.to_string())).collect(), - )?, - "reference" => column.push_string(record.reference_bases().to_string()), - "alternate" => column.push_string(allele.to_string()), - "quality" => column.push_f32(record.quality_score().map(|v| v.into())), - "filter" => column.push_vecstring( - record - .filters() - .iter() - .map(|s| Some(s.to_string())) - .collect(), - )?, - _ => {} - } - } - self.add_info(&record, header, schema, alt_id, allele_count)?; - self.add_format(&record, header, schema, alt_id, allele_count)?; - } - Ok(()) - } - - fn add_info( - &mut self, - record: &noodles::vcf::Record, - header: &noodles::vcf::Header, - schema: &arrow2::datatypes::Schema, - alt_id: usize, - allele_count: usize, - ) -> std::result::Result<(), arrow2::error::Error> { - let info = record.info(); - - for key in header.infos().keys() { - let key_name = format!("info_{}", key); - let info_def = header.infos().get(key).unwrap(); - if let Some(column) = self.0.get_mut(&key_name) { - match info.get(key) { - Some(value) => match value { - Some(noodles::vcf::record::info::field::Value::Flag) => { - column.push_bool(Some(true)); - } - Some(noodles::vcf::record::info::field::Value::Integer(value)) => { - column.push_i32(Some(*value)); - } - Some(noodles::vcf::record::info::field::Value::Float(value)) => { - column.push_f32(Some(*value)); - } - Some(noodles::vcf::record::info::field::Value::String(value)) => { - column.push_string(value.to_string()); - } - Some(noodles::vcf::record::info::field::Value::Character(value)) => { - column.push_string(value.to_string()); - } - Some(noodles::vcf::record::info::field::Value::Array(arr)) => match arr - .clone() - { - noodles::vcf::record::info::field::value::Array::Integer(array_val) => { - match info_def.number() { - noodles::vcf::header::Number::Count(0 | 1) => { - unreachable!( - "Field {} declared as single value but found array", - key - ) - } - noodles::vcf::header::Number::Count(_) => { - column.push_veci32(array_val)?; - } - noodles::vcf::header::Number::A => { - column.push_i32(*array_val.get(alt_id).unwrap()); - } - noodles::vcf::header::Number::R => { - column.push_veci32(vec![ - *array_val.first().unwrap(), - *array_val.get(alt_id + 1).unwrap(), - ])?; - } - noodles::vcf::header::Number::G => { - if array_val.len() - == (allele_count * (allele_count + 1) / 2) - { - column.push_veci32(vec![ - *array_val.first().unwrap(), - *array_val - .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap(), - *array_val - .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap(), - ])?; - } else if array_val.len() == allele_count { - column.push_veci32(vec![ - *array_val.first().unwrap(), - Some(0), - *array_val.get(alt_id + 1).unwrap(), - ])?; - } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); - column.push_null(); - } - } - noodles::vcf::header::Number::Unknown => { - column.push_veci32(array_val)?; - } - } - } - noodles::vcf::record::info::field::value::Array::Float(array_val) => { - match info_def.number() { - noodles::vcf::header::Number::Count(0 | 1) => { - unreachable!( - "Field {} declared as single value but found array", - key - ) - } - noodles::vcf::header::Number::Count(_) => { - column.push_vecf32(array_val)?; - } - noodles::vcf::header::Number::A => { - column.push_f32(*array_val.get(alt_id).unwrap()); - } - noodles::vcf::header::Number::R => { - column.push_vecf32(vec![ - *array_val.first().unwrap(), - *array_val.get(alt_id + 1).unwrap(), - ])?; - } - noodles::vcf::header::Number::G => { - if array_val.len() - == (allele_count * (allele_count + 1) / 2) - { - column.push_vecf32(vec![ - *array_val.first().unwrap(), - *array_val - .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap(), - *array_val - .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap(), - ])?; - } else if array_val.len() == allele_count { - column.push_vecf32(vec![ - *array_val.first().unwrap(), - Some(0.), - *array_val.get(alt_id + 1).unwrap(), - ])?; - } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); - column.push_null(); - } - } - noodles::vcf::header::Number::Unknown => { - column.push_vecf32(array_val)?; - } - } - } - noodles::vcf::record::info::field::value::Array::String(array_val) => { - match info_def.number() { - noodles::vcf::header::Number::Count(0 | 1) => { - unreachable!( - "Field {} declared as single value but found array", - key_name - ) - } - noodles::vcf::header::Number::Count(_) => { - column.push_vecstring(array_val)?; - } - noodles::vcf::header::Number::A => { - column.push_string( - array_val.get(alt_id).unwrap().clone().unwrap(), - ); - } - noodles::vcf::header::Number::R => { - column.push_vecstring(vec![ - Some(array_val.first().unwrap().clone().unwrap()), - Some( - array_val.get(alt_id + 1).unwrap().clone().unwrap(), - ), - ])?; - } - noodles::vcf::header::Number::G => { - if array_val.len() - == (allele_count * (allele_count + 1) / 2) - { - column.push_vecstring(vec![ - array_val.first().unwrap().clone(), - array_val - .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap() - .clone(), - array_val - .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap() - .clone(), - ])?; - } else if array_val.len() == allele_count { - column.push_vecstring(vec![ - array_val.first().unwrap().clone(), - Some(".".to_string()), - array_val.get(alt_id + 1).unwrap().clone(), - ])?; - } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); - column.push_null(); - } - } - noodles::vcf::header::Number::Unknown => { - column.push_vecstring(array_val)?; - } - } - } - noodles::vcf::record::info::field::value::Array::Character( - array_val, - ) => match info_def.number() { - noodles::vcf::header::Number::Count(0 | 1) => { - unreachable!( - "Field {} declared as single value but found array", - key_name - ) - } - noodles::vcf::header::Number::Count(_) => { - column.push_vecstring( - array_val - .iter() - .map(|s| s.as_ref().map(|s| s.to_string())) - .collect::>>(), - )?; - } - noodles::vcf::header::Number::A => { - column.push_string( - (*array_val.get(alt_id).unwrap()).unwrap().to_string(), - ); - } - noodles::vcf::header::Number::R => { - column.push_vecstring(vec![ - Some(array_val.first().unwrap().unwrap().to_string()), - Some( - array_val.get(alt_id + 1).unwrap().unwrap().to_string(), - ), - ])?; - } - noodles::vcf::header::Number::G => { - if array_val.len() == (allele_count * (allele_count + 1) / 2) { - column.push_vecstring(vec![ - Some(array_val.first().unwrap().unwrap().to_string()), - Some( - array_val - .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap() - .unwrap() - .to_string(), - ), - Some( - array_val - .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap() - .unwrap() - .to_string(), - ), - ])?; - } else if array_val.len() == allele_count { - column.push_vecstring(vec![ - Some(array_val.first().unwrap().unwrap().to_string()), - Some(".".to_string()), - Some( - array_val - .get(alt_id + 1) - .unwrap() - .unwrap() - .to_string(), - ), - ])?; - } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); - column.push_null(); - } - } - noodles::vcf::header::Number::Unknown => { - column.push_vecstring( - array_val - .iter() - .map(|s| s.as_ref().map(|s| s.to_string())) - .collect::>>(), - )?; - } - }, - }, - None => column.push_null(), - }, - None => { - if info_def.ty() - == noodles::vcf::header::record::value::map::info::Type::Flag - { - column.push_bool(Some(false)); - } else { - //Handle missing info field, only matters for FixedSizeList - for field in schema.fields.iter() { - if field.name == key_name { - match field.data_type { - arrow2::datatypes::DataType::FixedSizeList( - ref field_type, - fixed_size, - ) => match &field_type.data_type() { - arrow2::datatypes::DataType::Int32 => { - column.push_veci32(vec![None; fixed_size])? - } - - arrow2::datatypes::DataType::Float32 => { - column.push_vecf32(vec![None; fixed_size])? - } - - arrow2::datatypes::DataType::Utf8 => { - column.push_vecstring(vec![None; fixed_size])? - } - - _ => column.push_null(), - }, - _ => column.push_null(), //Otherwise, just push null - } - } - } - } - } - } - } - } - Ok(()) - } - - fn add_format( - &mut self, - record: &noodles::vcf::Record, - header: &noodles::vcf::Header, - schema: &arrow2::datatypes::Schema, - alt_id: usize, - allele_count: usize, - ) -> std::result::Result<(), arrow2::error::Error> { - for key in header.formats().keys() { - for (idx, sample) in header.sample_names().iter().enumerate() { - let key_name = format!("format_{}_{}", sample, key); - let format_def = header.formats().get(key).unwrap(); - if let Some(column) = self.0.get_mut(&key_name) { - if let Some(format_field) = record.genotypes().get_index(idx) { - match format_field.get(key) { - Some(value) => match value { - Some( - noodles::vcf::record::genotypes::sample::Value::Integer( - value, - ), - ) => { - column.push_i32(Some(*value)); - } - Some( - noodles::vcf::record::genotypes::sample::Value::Float( - value, - ), - ) => { - column.push_f32(Some(*value)); - } - Some( - noodles::vcf::record::genotypes::sample::Value::String( - value, - ), - ) => { - if key.to_string()=="GT" { - let mut gt_str = String::with_capacity(32);//Arbitrary capacity - if let Some(Ok(gt)) = format_field.genotype() - { - gt.iter().enumerate().for_each(|(i,allele)| { - let (position, phasing) = (allele.position(), allele.phasing()); - match position { - Some(a) if a == alt_id + 1 => { - gt_str.push('1'); - } - Some(0)=>{ - gt_str.push('0'); - } - Some(_) =>{ - gt_str.push('.'); - } - None=>{ - gt_str.push('.'); - } - } - if i < gt.len() - 1 { - gt_str.push(match phasing { - Phasing::Phased => '|', - Phasing::Unphased => '/', - }); - } - }); - } - else { - eprintln!("Should be unreachable"); - gt_str.push_str("./."); - } - column.push_string(gt_str); - } else { - column.push_string(value.to_string()); - } - } - Some( - noodles::vcf::record::genotypes::sample::Value::Character( - value, - ), - ) => { - column.push_string(value.to_string()); - } - Some( - noodles::vcf::record::genotypes::sample::Value::Array(arr), - ) => match arr.clone() { - noodles::vcf::record::genotypes::sample::value::Array::Integer( - array_val, - ) => match format_def.number() { - noodles::vcf::header::Number::Count(0 | 1) => { - unreachable!( - "Field {} declared as single value but found array", - key - ) - } - noodles::vcf::header::Number::Count(_) => { - column.push_veci32(array_val)?; - } - noodles::vcf::header::Number::A => { - column.push_i32(*array_val.get(alt_id).unwrap()); - } - noodles::vcf::header::Number::R => { - //TODO: Use push_fixed_size_i32 - column.push_veci32(vec![ - *array_val.first().unwrap(), - *array_val.get(alt_id + 1).unwrap(), - ])?; - } - noodles::vcf::header::Number::G => { - if array_val.len() - == (allele_count * (allele_count + 1) / 2) - { - column.push_veci32(vec![ - *array_val.first().unwrap(), - *array_val - .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap(), - *array_val - .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap(), - ])?; - } else if array_val.len() == allele_count { - column.push_veci32(vec![ - *array_val.first().unwrap(), - Some(0), - *array_val.get(alt_id + 1).unwrap(), - ])?; - } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); - column.push_null(); - } - } - noodles::vcf::header::Number::Unknown => { - column.push_veci32(array_val)?; - } - }, - noodles::vcf::record::genotypes::sample::value::Array::Float( - array_val, - ) => match format_def.number() { - noodles::vcf::header::Number::Count(0 | 1) => { - unreachable!( - "Field {} declared as single value but found array", - key - ) - } - noodles::vcf::header::Number::Count(_) => { - column.push_vecf32(array_val)?; - } - noodles::vcf::header::Number::A => { - column.push_f32(*array_val.get(alt_id).unwrap()); - } - noodles::vcf::header::Number::R => { - //TODO: Use push_fixed_size_f32 - column.push_vecf32(vec![ - *array_val.first().unwrap(), - *array_val.get(alt_id + 1).unwrap(), - ])?; - } - noodles::vcf::header::Number::G => { - if array_val.len() - == (allele_count * (allele_count + 1) / 2) - { - column.push_vecf32(vec![ - *array_val.first().unwrap(), - *array_val - .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap(), - *array_val - .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap(), - ])?; - } else if array_val.len() == allele_count { - column.push_vecf32(vec![ - *array_val.first().unwrap(), - Some(0.), - *array_val.get(alt_id + 1).unwrap(), - ])?; - } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); - column.push_null(); - } - } - noodles::vcf::header::Number::Unknown => { - column.push_vecf32(array_val)?; - } - }, - noodles::vcf::record::genotypes::sample::value::Array::String( - array_val, - ) => match format_def.number() { - noodles::vcf::header::Number::Count(0 | 1) => { - unreachable!( - "Field {} declared as single value but found array", - key_name - ) - }, - noodles::vcf::header::Number::Count(_) => { - column.push_vecstring(array_val)?; - }, - noodles::vcf::header::Number::A => { - column.push_string( - array_val.get(alt_id).unwrap().clone().unwrap(), - ); - }, - noodles::vcf::header::Number::R => { - //TODO: Use push_fixed_size_string - column.push_vecstring(vec![ - Some(array_val.first().unwrap().clone().unwrap()), - Some(array_val.get(alt_id + 1).unwrap().clone().unwrap()), - ])?; - }, - noodles::vcf::header::Number::G => { - if array_val.len() - == (allele_count * (allele_count + 1) / 2) - { - column.push_vecstring(vec![ - array_val.first().unwrap().clone(), - array_val - .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap() - .clone(), - array_val - .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap() - .clone(), - ])?; - } else if array_val.len() == allele_count { - column.push_vecstring(vec![ - array_val.first().unwrap().clone(), - Some(".".to_string()), - array_val.get(alt_id + 1).unwrap().clone(), - ])?; - } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); - column.push_null(); - } - } - noodles::vcf::header::Number::Unknown => { - column.push_vecstring(array_val)?; - } - }, - noodles::vcf::record::genotypes::sample::value::Array::Character( - array_val, - ) => match format_def.number() { - noodles::vcf::header::Number::Count(0 | 1) => { - unreachable!( - "Field {} declared as single value but found array", - key_name - ) - }, - noodles::vcf::header::Number::Count(_) => { - column.push_vecstring( - array_val - .iter() - .map(|s| s.as_ref().map(|s| s.to_string())) - .collect::>>(), - )?; - }, - noodles::vcf::header::Number::A => { - column.push_string( - (*array_val.get(alt_id).unwrap()).unwrap().to_string(), - ); - }, - noodles::vcf::header::Number::R => { - column.push_vecstring(vec![ - Some(array_val.first().unwrap().unwrap().to_string()), - Some(array_val.get(alt_id + 1).unwrap().unwrap().to_string()), - ])?; - }, - noodles::vcf::header::Number::G => { - if array_val.len() == (allele_count * (allele_count + 1) / 2) { - column.push_vecstring(vec![ - Some(array_val.first().unwrap().unwrap().to_string()), - Some( - array_val - .get((alt_id * alt_id + 3 * alt_id + 2) / 2) - .unwrap() - .unwrap() - .to_string(), - ), - Some( - array_val - .get((alt_id * alt_id + 5 * alt_id + 4) / 2) - .unwrap() - .unwrap() - .to_string(), - ), - ])?; - } else if array_val.len() == allele_count { - column.push_vecstring(vec![ - Some(array_val.first().unwrap().unwrap().to_string()), - Some(".".to_string()), - Some( - array_val.get(alt_id + 1).unwrap().unwrap().to_string(), - ), - ])?; - } else { - eprintln!( - "Field {} declared as G but found array of size {}", - key, - array_val.len() - ); - column.push_null(); - } - } - , - noodles::vcf::header::Number::Unknown => { - column.push_vecstring( - array_val - .iter() - .map(|s| s.as_ref().map(|s| s.to_string())) - .collect::>>(), - )?; - }, - }, - - }, - None => column.push_null(), - }, - None => column.push_null(), - } - } else { - //Handle missing format field, only matters for FixedSizeList - for field in schema.fields.iter() { - if field.name == key_name { - match field.data_type { - arrow2::datatypes::DataType::FixedSizeList( - ref field_type, - fixed_size, - ) => match &field_type.data_type() { - arrow2::datatypes::DataType::Int32 => { - column.push_veci32(vec![None; fixed_size])? - } - - arrow2::datatypes::DataType::Float32 => { - column.push_vecf32(vec![None; fixed_size])? - } - - arrow2::datatypes::DataType::Utf8 => { - column.push_vecstring(vec![None; fixed_size])? - } - - _ => column.push_null(), - }, - _ => column.push_null(), - } - } - } - } - } - } - } - Ok(()) - } - - ///Convert Name2Data in vector of arrow2 array - pub fn into_arc( - mut self, - schema: &arrow2::datatypes::Schema, - ) -> Vec> { - let s: Vec> = schema - .fields - .iter() - .map(|x| self.0.remove(&x.name).unwrap().into_arc()) - .collect(); - - s - } -} - -#[derive(Debug)] -pub enum ColumnData { - Bool(arrow2::array::MutableBooleanArray), - Int(arrow2::array::MutablePrimitiveArray), - Float(arrow2::array::MutablePrimitiveArray), - String(arrow2::array::MutableUtf8Array), - ListBool(arrow2::array::MutableListArray), - ListInt(arrow2::array::MutableListArray>), - ListFloat(arrow2::array::MutableListArray>), - ListString(arrow2::array::MutableListArray>), -} - -impl ColumnData { - pub fn new(arrow_type: &arrow2::datatypes::DataType, length: usize) -> Self { - match arrow_type { - arrow2::datatypes::DataType::Boolean => { - ColumnData::Bool(arrow2::array::MutableBooleanArray::with_capacity(length)) - } - arrow2::datatypes::DataType::Int32 => ColumnData::Int( - arrow2::array::MutablePrimitiveArray::::with_capacity(length), - ), - arrow2::datatypes::DataType::Float32 => ColumnData::Float( - arrow2::array::MutablePrimitiveArray::::with_capacity(length), - ), - arrow2::datatypes::DataType::Utf8 => ColumnData::String( - arrow2::array::MutableUtf8Array::::with_capacity(length), - ), - arrow2::datatypes::DataType::List(field) => match field.data_type() { - arrow2::datatypes::DataType::Boolean => { - ColumnData::ListBool(arrow2::array::MutableListArray::< - i32, - arrow2::array::MutableBooleanArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Int32 => { - ColumnData::ListInt(arrow2::array::MutableListArray::< - i32, - MutablePrimitiveArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Float32 => { - ColumnData::ListFloat(arrow2::array::MutableListArray::< - i32, - MutablePrimitiveArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Utf8 => { - ColumnData::ListString(arrow2::array::MutableListArray::< - i32, - arrow2::array::MutableUtf8Array, - >::with_capacity(length)) - } - _ => todo!(), - }, - arrow2::datatypes::DataType::FixedSizeList(field, _) => match field.data_type() { - arrow2::datatypes::DataType::Boolean => { - ColumnData::ListBool(arrow2::array::MutableListArray::< - i32, - arrow2::array::MutableBooleanArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Int32 => { - ColumnData::ListInt(arrow2::array::MutableListArray::< - i32, - MutablePrimitiveArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Float32 => { - ColumnData::ListFloat(arrow2::array::MutableListArray::< - i32, - MutablePrimitiveArray, - >::with_capacity(length)) - } - arrow2::datatypes::DataType::Utf8 => { - ColumnData::ListString(arrow2::array::MutableListArray::< - i32, - arrow2::array::MutableUtf8Array, - >::with_capacity(length)) - } - _ => todo!(), - }, - dt => unreachable!("Unsupported arrow type, please check Schema: {:?}", dt), - } - } - /// Add a Null value in array - pub fn push_null(&mut self) { - match self { - ColumnData::Bool(a) => a.push_null(), - ColumnData::Int(a) => a.push_null(), - ColumnData::Float(a) => a.push_null(), - ColumnData::String(a) => a.push_null(), - ColumnData::ListBool(a) => a.push_null(), - ColumnData::ListInt(a) => a.push_null(), - ColumnData::ListFloat(a) => a.push_null(), - ColumnData::ListString(_a) => { - if let Err(e) = self.push_vecstring(vec![None]) { - panic!("ListString {e:?}"); - } - } - } - } - - pub fn len(&self) -> usize { - match self { - ColumnData::Bool(a) => a.len(), - ColumnData::Int(a) => a.len(), - ColumnData::Float(a) => a.len(), - ColumnData::String(a) => a.len(), - ColumnData::ListBool(a) => a.len(), - ColumnData::ListInt(a) => a.len(), - ColumnData::ListFloat(a) => a.len(), - ColumnData::ListString(a) => a.len(), - } - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Add a boolean value in array, if it's not a boolean array failled - pub fn push_bool(&mut self, value: Option) { - match self { - ColumnData::Bool(a) => a.push(value), - _ => todo!(), - } - } - - /// Add a i32 value in array, if it's not a integer array failled - pub fn push_i32(&mut self, value: Option) { - match self { - ColumnData::Int(a) => a.push(value), - _ => todo!(), - } - } - - /// Add a f32 value in array, if it's not a float array failled - pub fn push_f32(&mut self, value: Option) { - match self { - ColumnData::Float(a) => a.push(value), - _ => todo!(), - } - } - - /// Add a string value in array, if it's not a string array failled - pub fn push_string(&mut self, value: String) { - match self { - ColumnData::String(a) => a.push(Some(value)), - _ => todo!(), - } - } - - /// Add a vector of bool value in array, if it's not a vector of bool array failled - pub fn push_vecbool(&mut self, value: Vec>) -> arrow2::error::Result<()> { - match self { - ColumnData::ListBool(a) => a.try_push(Some(value)), - _ => todo!(), - } - } - - /// Add a vector of integer value in array, if it's not a vector of integer array failled - pub fn push_veci32(&mut self, value: Vec>) -> arrow2::error::Result<()> { - match self { - ColumnData::ListInt(a) => a.try_push(Some(value)), - _ => todo!(), - } - } - - /// Add a vector of float value in array, if it's not a vector of float array failled - pub fn push_vecf32(&mut self, value: Vec>) -> arrow2::error::Result<()> { - match self { - ColumnData::ListFloat(a) => a.try_push(Some(value)), - _ => todo!(), - } - } - - /// Add a vector of string value in array, if it's not a vector of string array failled - pub fn push_vecstring(&mut self, value: Vec>) -> arrow2::error::Result<()> { - match self { - ColumnData::ListString(a) => a.try_push(Some(value)), - _ => todo!(), - } - } - - /// Convert ColumnData in Arrow2 array - pub fn into_arc(self) -> std::sync::Arc { - match self { - ColumnData::Bool(a) => a.into_arc(), - ColumnData::Int(a) => a.into_arc(), - ColumnData::Float(a) => a.into_arc(), - ColumnData::String(a) => a.into_arc(), - ColumnData::ListBool(a) => a.into_arc(), - ColumnData::ListInt(a) => a.into_arc(), - ColumnData::ListFloat(a) => a.into_arc(), - ColumnData::ListString(a) => a.into_arc(), - } - } -} - -#[cfg(test)] -mod tests { - use crate::schema; - - use super::*; - - static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 -##fileDate=20220528 -##source=ClinVar -##reference=GRCh38 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##SAMPLE= -##SAMPLE= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT first second -chr1 100 . A T 50 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_RString=ref,alt;Info_G=1,2,3;Info_u=0,1,2,3,4 GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 1/1:44:1,2,3,5:testA:r,a:1,2,3:0,2,5,6,1 -chr1 200 . C G,CG 60 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42,43;Info_RChar=r,a,A;Info_RString=ref,alt1,alt2;Info_G=1,2,3,4,5,6;Info_u=1,6,3,4,5 GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:2,4,6,8:testA1,testA2:R,A,B:1,2,3,4,5,6:0,2,4 1/2:45:2,1,6,8:testB1,testB2:R,a,b:1,2,3,4,5,6:0,2,4,5,6 -chr2 300 . G A 70 PASS Info_1=0;Info_fixed=1,2,3;Info_A=42;Info_RChar=r,a;Info_RString=ref,alt;Info_G=1,2,3;Info_u=0,1,2,3,4;Flag GT:Format_1:Format_fixed:Format_A:Format_R:Format_G:Format_u 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 0/1:44:1,2,3,4:testA:R,A:1,2,3:0,2,4,6 -"; - - #[test] - fn init() { - let mut reader = noodles::vcf::Reader::new(VCF_FILE); - - let header: noodles::vcf::Header = reader.read_header().unwrap(); - let schema = schema::from_header(&header, false).unwrap(); - - let mut data = Name2Data::new(10, &schema); - let mut col_names = data.0.keys().cloned().collect::>(); - col_names.sort(); - - assert_eq!( - col_names, - vec![ - "alternate".to_string(), - "chromosome".to_string(), - "filter".to_string(), - "format_first_Format_1".to_string(), - "format_first_Format_A".to_string(), - "format_first_Format_G".to_string(), - "format_first_Format_R".to_string(), - "format_first_Format_fixed".to_string(), - "format_first_Format_u".to_string(), - "format_first_GT".to_string(), - "format_second_Format_1".to_string(), - "format_second_Format_A".to_string(), - "format_second_Format_G".to_string(), - "format_second_Format_R".to_string(), - "format_second_Format_fixed".to_string(), - "format_second_Format_u".to_string(), - "format_second_GT".to_string(), - "identifier".to_string(), - "info_Flag".to_string(), - "info_Info1".to_string(), - "info_Info_A".to_string(), - "info_Info_G".to_string(), - "info_Info_RChar".to_string(), - "info_Info_RString".to_string(), - "info_Info_fixed".to_string(), - "info_Info_u".to_string(), - "position".to_string(), - "quality".to_string(), - "reference".to_string(), - ] - ); - - assert_eq!( - format!("{:?}", data.get("chromosome")), - format!( - "{:?}", - Some(&ColumnData::String(arrow2::array::MutableUtf8Array::new())) - ) - ); - - assert_eq!( - format!("{:?}", data.get_mut("chromosome")), - format!( - "{:?}", - Some(&ColumnData::String(arrow2::array::MutableUtf8Array::new())) - ) - ); - } - - #[test] - fn add_record() { - let mut reader = noodles::vcf::Reader::new(VCF_FILE); - - let header: noodles::vcf::Header = reader.read_header().unwrap(); - - let schema = schema::from_header(&header, false).unwrap(); - let mut data = Name2Data::new(10, &schema); - - let mut iterator = reader.records(&header); - let record = iterator.next().unwrap().unwrap(); - - data.add_record(record, &header, &schema).unwrap(); - assert_eq!(format!("{:?}", data.get("alternate")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1]), values: [84] }, validity: None }))".to_string()); - - assert_eq!(format!("{:?}", data.get("chromosome")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 4]), values: [99, 104, 114, 49] }, validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("filter")), "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 1]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 4]), values: [80, 65, 83, 83] }, validity: None }, validity: None }))".to_string()); - - assert_eq!( - format!("{:?}", data.get("format_first_Format_1")), - "Some(Int(MutablePrimitiveArray { data_type: Int32, values: [44], validity: None }))" - .to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_first_Format_A")), - "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 5]), values: [116, 101, 115, 116, 65] }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_first_Format_G")), - "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 3]), values: MutablePrimitiveArray { data_type: Int32, values: [1, 2, 3], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_first_Format_R")), - "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 2]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1, 2]), values: [82, 65] }, validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_first_Format_fixed")), - "Some(ListFloat(MutableListArray { data_type: List(Field { name: \"item\", data_type: Float32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 4]), values: MutablePrimitiveArray { data_type: Float32, values: [1.0, 2.0, 3.0, 4.0], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_first_Format_u")), - "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 4]), values: MutablePrimitiveArray { data_type: Int32, values: [0, 2, 4, 6], validity: None }, validity: None }))".to_string() - ); - assert_eq!(format!("{:?}", data.get("format_first_GT")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 3]), values: [48, 47, 49] }, validity: None }))".to_string()); - - assert_eq!( - format!("{:?}", data.get("format_second_Format_1")), - "Some(Int(MutablePrimitiveArray { data_type: Int32, values: [44], validity: None }))" - .to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_Format_A")), - "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 5]), values: [116, 101, 115, 116, 65] }, validity: None }))".to_string() - ); - - assert_eq!( - format!("{:?}", data.get("format_second_Format_G")), - "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 3]), values: MutablePrimitiveArray { data_type: Int32, values: [1, 2, 3], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_Format_R")), - "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 2]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1, 2]), values: [114, 97] }, validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_Format_fixed")), - "Some(ListFloat(MutableListArray { data_type: List(Field { name: \"item\", data_type: Float32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 4]), values: MutablePrimitiveArray { data_type: Float32, values: [1.0, 2.0, 3.0, 5.0], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_Format_u")), - "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 5]), values: MutablePrimitiveArray { data_type: Int32, values: [0, 2, 5, 6, 1], validity: None }, validity: None }))".to_string() - ); - assert_eq!( - format!("{:?}", data.get("format_second_GT")), - "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 3]), values: [49, 47, 49] }, validity: None }))".to_string() - ); - - assert_eq!(format!("{:?}", data.get("identifier")), "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 0]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0]), values: [] }, validity: None }, validity: None }))".to_string()); - - assert_eq!(format!("{:?}", data.get("info_Flag")), "Some(Bool(MutableBooleanArray { data_type: Boolean, values: [0b_______0], validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("info_Info1")), "Some(Float(MutablePrimitiveArray { data_type: Float32, values: [0.0], validity: Some([0b_______0]) }))".to_string()); - assert_eq!( - format!("{:?}", data.get("info_Info_A")), - "Some(Int(MutablePrimitiveArray { data_type: Int32, values: [42], validity: None }))" - .to_string() - ); - assert_eq!(format!("{:?}", data.get("info_Info_G")), "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 3]), values: MutablePrimitiveArray { data_type: Int32, values: [1, 2, 3], validity: None }, validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("info_Info_RChar")), "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 2]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1, 2]), values: [114, 97] }, validity: None }, validity: None }))".to_string()); - assert_eq!( - format!("{:?}", data.get("info_Info_RString")), - "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 2]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 3, 6]), values: [114, 101, 102, 97, 108, 116] }, validity: None }, validity: None }))".to_string() - ); - assert_eq!(format!("{:?}", data.get("info_Info_fixed")), "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 3]), values: MutablePrimitiveArray { data_type: Int32, values: [1, 2, 3], validity: None }, validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("info_Info_u")), "Some(ListInt(MutableListArray { data_type: List(Field { name: \"item\", data_type: Int32, is_nullable: true, metadata: {} }), offsets: Offsets([0, 5]), values: MutablePrimitiveArray { data_type: Int32, values: [0, 1, 2, 3, 4], validity: None }, validity: None }))".to_string()); - - assert_eq!( - format!("{:?}", data.get("position")), - "Some(Int(MutablePrimitiveArray { data_type: Int32, values: [100], validity: None }))" - .to_string() - ); - assert_eq!(format!("{:?}", data.get("quality")), "Some(Float(MutablePrimitiveArray { data_type: Float32, values: [50.0], validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("reference")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1]), values: [65] }, validity: None }))".to_string()); - - let record = iterator.next().unwrap().unwrap(); - let mut data = Name2Data::new(10, &schema); - data.add_record(record, &header, &schema).unwrap(); - - assert_eq!(format!("{:?}", data.get("alternate")), "Some(String(MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 1, 3]), values: [71, 67, 71] }, validity: None }))".to_string()); - assert_eq!(format!("{:?}", data.get("filter")), "Some(ListString(MutableListArray { data_type: List(Field { name: \"item\", data_type: Utf8, is_nullable: true, metadata: {} }), offsets: Offsets([0, 1, 2]), values: MutableUtf8Array { values: MutableUtf8ValuesArray { data_type: Utf8, offsets: Offsets([0, 4, 8]), values: [80, 65, 83, 83, 80, 65, 83, 83] }, validity: None }, validity: None }))".to_string()); - } -} diff --git a/vcf2parquet-lib/src/schema.rs b/vcf2parquet-lib/src/schema.rs deleted file mode 100644 index 98b2227..0000000 --- a/vcf2parquet-lib/src/schema.rs +++ /dev/null @@ -1,427 +0,0 @@ -//! Construct parquet schema corresponding to vcf - -/* std use */ - -/* crate use */ - -/* project use */ -use crate::*; - -/// Generate a parquet schema corresponding to vcf header -pub fn from_header( - header: &noodles::vcf::Header, - info_optional: bool, -) -> error::Result { - let mut columns = Vec::new(); - - // required column - columns.extend(required_column()); - - // info field - columns.extend(info(header, info_optional)); - - // genotype field - columns.extend(genotype(header)); - - Ok(arrow2::datatypes::Schema::from(columns)) -} - -fn required_column() -> Vec { - vec![ - arrow2::datatypes::Field::new("chromosome", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("position", arrow2::datatypes::DataType::Int32, false), - arrow2::datatypes::Field::new( - "identifier", - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( - "id", - arrow2::datatypes::DataType::Utf8, - false, - ))), - false, - ), - arrow2::datatypes::Field::new("reference", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("alternate", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("quality", arrow2::datatypes::DataType::Float32, true), - arrow2::datatypes::Field::new( - "filter", - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( - "filter", - arrow2::datatypes::DataType::Utf8, - false, - ))), - false, - ), - ] -} - -fn info(header: &noodles::vcf::Header, info_optional: bool) -> Vec { - let mut fields = Vec::new(); - - for (name, value) in header.infos() { - let key = format!("info_{name}"); - - let arrow_type = match value.ty() { - noodles::vcf::header::record::value::map::info::Type::Integer => { - arrow2::datatypes::DataType::Int32 - } - noodles::vcf::header::record::value::map::info::Type::Float => { - arrow2::datatypes::DataType::Float32 - } - noodles::vcf::header::record::value::map::info::Type::Flag => { - arrow2::datatypes::DataType::Boolean - } - noodles::vcf::header::record::value::map::info::Type::Character => { - arrow2::datatypes::DataType::Utf8 - } - noodles::vcf::header::record::value::map::info::Type::String => { - arrow2::datatypes::DataType::Utf8 - } - }; - - match value.number() { - noodles::vcf::header::Number::Count(0 | 1) | noodles::vcf::header::Number::A => fields - .push(arrow2::datatypes::Field::new( - &key, - arrow_type, - info_optional, - )), - noodles::vcf::header::Number::R => fields.push(arrow2::datatypes::Field::new( - &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new( - &key, - arrow_type, - info_optional, - )), - 2, - ), - info_optional, - )), - noodles::vcf::header::Number::Count(n) => fields.push(arrow2::datatypes::Field::new( - &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new( - &key, - arrow_type, - info_optional, - )), - n, - ), - false, - )), - noodles::vcf::header::Number::G => fields.push(arrow2::datatypes::Field::new( - &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new( - &key, - arrow_type, - info_optional, - )), - 3, - ), - false, - )), - - noodles::vcf::header::Number::Unknown => fields.push(arrow2::datatypes::Field::new( - &key, - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( - &key, - arrow_type, - info_optional, - ))), - false, - )), - } - } - - fields -} - -fn genotype(header: &noodles::vcf::Header) -> Vec { - let mut fields = Vec::new(); - - for sample in header.sample_names() { - for (name, value) in header.formats() { - let key = format!("format_{sample}_{name}"); - - let arrow_type = match value.ty() { - noodles::vcf::header::record::value::map::format::Type::Integer => { - arrow2::datatypes::DataType::Int32 - } - noodles::vcf::header::record::value::map::format::Type::Float => { - arrow2::datatypes::DataType::Float32 - } - noodles::vcf::header::record::value::map::format::Type::Character => { - arrow2::datatypes::DataType::Utf8 - } - noodles::vcf::header::record::value::map::format::Type::String => { - arrow2::datatypes::DataType::Utf8 - } - }; - - match value.number() { - noodles::vcf::header::Number::Count(0 | 1) | noodles::vcf::header::Number::A => { - fields.push(arrow2::datatypes::Field::new(key, arrow_type, false)) - } - noodles::vcf::header::Number::R => fields.push(arrow2::datatypes::Field::new( - &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new(&key, arrow_type, false)), - 2, - ), - false, - )), - noodles::vcf::header::Number::Count(n) => { - fields.push(arrow2::datatypes::Field::new( - &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new(&key, arrow_type, false)), - n, - ), - false, - )) - } - noodles::vcf::header::Number::G => fields.push(arrow2::datatypes::Field::new( - &key, - arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new(&key, arrow_type, false)), - 3, - ), - false, - )), - - noodles::vcf::header::Number::Unknown => { - fields.push(arrow2::datatypes::Field::new( - &key, - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( - &key, arrow_type, false, - ))), - false, - )) - } - } - } - } - - fields -} - -#[cfg(test)] -mod tests { - use super::*; - - // - // - - static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 -##fileDate=20220528 -##source=ClinVar -##reference=GRCh38 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##SAMPLE= -##SAMPLE= -#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfirst\tsecond -"; - - lazy_static::lazy_static! { - static ref MINI_COLS: Vec = vec![ - arrow2::datatypes::Field::new("chromosome", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("position", arrow2::datatypes::DataType::Int32, false), - arrow2::datatypes::Field::new( - "identifier", - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( - "id", - arrow2::datatypes::DataType::Utf8, - false, - ))), - false, - ), - arrow2::datatypes::Field::new("reference", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("alternate", arrow2::datatypes::DataType::Utf8, false), - arrow2::datatypes::Field::new("quality", arrow2::datatypes::DataType::Float32, true), - arrow2::datatypes::Field::new( - "filter", - arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field::new( - "filter", - arrow2::datatypes::DataType::Utf8, - false, - ))), - false, - ), - ]; - - static ref INFO_COLS: Vec = vec![ - arrow2::datatypes::Field { name: "info_Flag".to_string(), data_type: arrow2::datatypes::DataType::Boolean, is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info1".to_string(), data_type: arrow2::datatypes::DataType::Float32, is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_fixed".to_string(), data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new(arrow2::datatypes::Field { name: "info_Info_fixed".to_string(), data_type: arrow2::datatypes::DataType::Int32, is_nullable: false, metadata: std::collections::BTreeMap::new() }), 3), is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_A".to_string(), data_type: arrow2::datatypes::DataType::Int32, is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_RString".to_string(), data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new(arrow2::datatypes::Field { name: "info_Info_RString".to_string(), data_type: arrow2::datatypes::DataType::Utf8, is_nullable: false, metadata: std::collections::BTreeMap::new() }), 2), is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_RChar".to_string(), data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new(arrow2::datatypes::Field { name: "info_Info_RChar".to_string(), data_type: arrow2::datatypes::DataType::Utf8, is_nullable: false, metadata: std::collections::BTreeMap::new() }), 2), is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_G".to_string(), data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new(arrow2::datatypes::Field { name: "info_Info_G".to_string(), data_type: arrow2::datatypes::DataType::Int32, is_nullable: false, metadata: std::collections::BTreeMap::new() }), 3), is_nullable: false, metadata: std::collections::BTreeMap::new() }, arrow2::datatypes::Field { name: "info_Info_.".to_string(), data_type: arrow2::datatypes::DataType::List(Box::new(arrow2::datatypes::Field { name: "info_Info_.".to_string(), data_type: arrow2::datatypes::DataType::Int32, is_nullable: false, metadata: std::collections::BTreeMap::new() })), is_nullable: false, metadata: std::collections::BTreeMap::new() }]; - - static ref FORMAT_COLS: Vec = vec![ - arrow2::datatypes::Field { - name: "format_first_Format_1".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_first_Format_fixed".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_first_Format_fixed".to_string(), - data_type: arrow2::datatypes::DataType::Float32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),4), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_first_Format_A".to_string(), - data_type: arrow2::datatypes::DataType::Utf8, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_first_Format_R".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_first_Format_R".to_string(), - data_type: arrow2::datatypes::DataType::Utf8, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),2), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_first_Format_G".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_first_Format_G".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),3), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_first_Format_.".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_1".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_fixed".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_second_Format_fixed".to_string(), - data_type: arrow2::datatypes::DataType::Float32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),4), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_A".to_string(), - data_type: arrow2::datatypes::DataType::Utf8, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_R".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_second_Format_R".to_string(), - data_type: arrow2::datatypes::DataType::Utf8, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),2), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_G".to_string(), - data_type: arrow2::datatypes::DataType::FixedSizeList(Box::new( - arrow2::datatypes::Field { - name: "format_second_Format_G".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - ),3), - is_nullable: false, - metadata: std::collections::BTreeMap::new() - }, - arrow2::datatypes::Field { - name: "format_second_Format_.".to_string(), - data_type: arrow2::datatypes::DataType::Int32, - is_nullable: false, - metadata: std::collections::BTreeMap::new() - } - - ]; - } - - #[test] - fn mini_cols() { - assert_eq!(required_column(), *MINI_COLS) - } - - #[test] - fn info_cols() { - let mut reader = noodles::vcf::Reader::new(VCF_FILE); - - let header: noodles::vcf::Header = reader.read_header().unwrap(); - - assert_eq!(info(&header, false), *INFO_COLS); - } - - #[test] - fn genotype_cols() { - let mut reader = noodles::vcf::Reader::new(VCF_FILE); - - let header: noodles::vcf::Header = reader.read_header().unwrap(); - - assert_eq!(genotype(&header), *FORMAT_COLS); - } - - #[test] - fn all_cols() { - let mut reader = noodles::vcf::Reader::new(VCF_FILE); - - let header: noodles::vcf::Header = reader.read_header().unwrap(); - - let mut data: Vec = Vec::new(); - data.extend_from_slice(&*MINI_COLS); - data.extend_from_slice(&*INFO_COLS); - data.extend_from_slice(&*FORMAT_COLS); - - assert_eq!( - from_header(&header, false).unwrap(), - arrow2::datatypes::Schema::from(data) - ); - } -}