diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 8f982f7..f95d5d2 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -70,7 +70,7 @@ jobs: uses: actions/checkout@v2 - name: Generate code coverage - run: cargo +nightly tarpaulin --verbose --engine llvm --no-dead-code --all-features --workspace --timeout 120 --out xml + run: cargo +nightly tarpaulin --all-features --workspace --timeout 120 --out xml - name: Upload to codecov.io uses: codecov/codecov-action@v2 diff --git a/.github/workflows/pypublish.yml b/.github/workflows/pypublish.yml index 2bd286a..7f64d70 100644 --- a/.github/workflows/pypublish.yml +++ b/.github/workflows/pypublish.yml @@ -32,13 +32,13 @@ jobs: target: ${{ matrix.target }} args: --sdist --release --out dist --find-interpreter sccache: 'true' - working-directory: vcf2parquet-py + working-directory: python manylinux: auto - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels - path: vcf2parquet-py/dist + path: python/dist windows: runs-on: windows-latest @@ -57,12 +57,12 @@ jobs: target: ${{ matrix.target }} args: --sdist --release --out dist --find-interpreter sccache: 'true' - working-directory: vcf2parquet-py + working-directory: python - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels - path: vcf2parquet-py/dist + path: python/dist macos: runs-on: macos-latest @@ -80,12 +80,12 @@ jobs: target: ${{ matrix.target }} args: --sdist --release --out dist --find-interpreter sccache: 'true' - working-directory: vcf2parquet-py + working-directory: python - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels - path: vcf2parquet-py/dist + path: python/dist release: name: Release diff --git a/Cargo.lock b/Cargo.lock index ca55af0..874476d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -100,32 +100,6 @@ dependencies = [ "serde", ] -[[package]] -name = "arrow2" -version = "0.17.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" -dependencies = [ - "ahash", - "arrow-format", - "base64", - "bytemuck", - "chrono", - "dyn-clone", - "either", - "ethnum", - "fallible-streaming-iterator", - "foreign_vec", - "futures", - "getrandom", - "hash_hasher", - "num-traits", - "parquet2", - "rustc_version", - "simdutf8", - "streaming-iterator", -] - [[package]] name = "arrow2" version = "0.18.0" @@ -229,12 +203,6 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.5.0" @@ -385,7 +353,7 @@ version = "4.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90239a040c80f5e14809ca132ddc4176ab33d5e17e49691793296e3fcb34d72f" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn", @@ -446,15 +414,6 @@ version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" -[[package]] -name = "deranged" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" -dependencies = [ - "powerfmt", -] - [[package]] name = "difflib" version = "0.4.0" @@ -646,12 +605,6 @@ dependencies = [ "ahash", ] -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -668,18 +621,6 @@ dependencies = [ "hashbrown", ] -[[package]] -name 
= "indoc" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" - -[[package]] -name = "itoa" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" - [[package]] name = "jobserver" version = "0.1.28" @@ -716,16 +657,6 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" -[[package]] -name = "lock_api" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" -dependencies = [ - "autocfg", - "scopeguard", -] - [[package]] name = "log" version = "0.4.21" @@ -769,15 +700,6 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - [[package]] name = "miniz_oxide" version = "0.7.2" @@ -871,12 +793,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "num-conv" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" - [[package]] name = "num-traits" version = "0.2.18" @@ -886,44 +802,12 @@ dependencies = [ "autocfg", ] -[[package]] -name = "num_threads" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" -dependencies = [ - "libc", -] - [[package]] name = "once_cell" version = 
"1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" -[[package]] -name = "parking_lot" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets 0.48.5", -] - [[package]] name = "parquet-format-safe" version = "0.2.4" @@ -985,18 +869,6 @@ dependencies = [ "array-init-cursor", ] -[[package]] -name = "portable-atomic" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" - -[[package]] -name = "powerfmt" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" - [[package]] name = "predicates" version = "3.1.0" @@ -1033,69 +905,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "pyo3" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53bdbb96d49157e65d45cc287af5f32ffadd5f4761438b527b055fb0d4bb8233" -dependencies = [ - "cfg-if", - "indoc", - "libc", - "memoffset", - "parking_lot", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deaa5745de3f5231ce10517a1f5dd97d53e5a2fd77aa6b5842292085831d48d7" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] 
-name = "pyo3-ffi" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b42531d03e08d4ef1f6e85a2ed422eb678b8cd62b762e53891c05faf0d4afa" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7305c720fa01b8055ec95e484a6eca7a83c841267f0dd5280f0c8b8551d2c158" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c7e9b68bb9c3149c5b0cade5d07f953d6d125eb4337723c4ccdb665f1f96185" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn", -] - [[package]] name = "quote" version = "1.0.35" @@ -1125,15 +934,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "regex-automata" version = "0.4.6" @@ -1161,19 +961,13 @@ version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 2.5.0", + "bitflags", "errno", "libc", "linux-raw-sys", "windows-sys", ] -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - [[package]] name = "semver" version = "1.0.22" @@ -1212,17 +1006,6 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" -[[package]] -name = "simplelog" -version = 
"0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16257adbfaef1ee58b1363bdc0664c9b8e1e30aed86049635fb5f147d065a9c0" -dependencies = [ - "log", - "termcolor", - "time", -] - [[package]] name = "slab" version = "0.4.9" @@ -1232,12 +1015,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "smallvec" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" - [[package]] name = "snap" version = "1.1.1" @@ -1276,12 +1053,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "target-lexicon" -version = "0.12.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" - [[package]] name = "tempfile" version = "3.10.1" @@ -1294,15 +1065,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "termcolor" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" -dependencies = [ - "winapi-util", -] - [[package]] name = "termtree" version = "0.4.1" @@ -1329,51 +1091,12 @@ dependencies = [ "syn", ] -[[package]] -name = "time" -version = "0.3.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" -dependencies = [ - "deranged", - "itoa", - "libc", - "num-conv", - "num_threads", - "powerfmt", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" - -[[package]] -name = "time-macros" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" 
-dependencies = [ - "num-conv", - "time-core", -] - [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" -[[package]] -name = "unindent" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" - [[package]] name = "utf8parse" version = "0.2.1" @@ -1384,29 +1107,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" name = "vcf2parquet" version = "0.6.0" dependencies = [ + "arrow2", "assert_cmd", - "tempfile", - "vcf2parquet-bin", - "vcf2parquet-lib", -] - -[[package]] -name = "vcf2parquet-bin" -version = "0.6.0" -dependencies = [ - "arrow2 0.18.0", "clap", - "niffler", - "simplelog", - "thiserror", - "vcf2parquet-lib", -] - -[[package]] -name = "vcf2parquet-lib" -version = "0.6.0" -dependencies = [ - "arrow2 0.18.0", "lazy_static", "log", "niffler", @@ -1417,18 +1120,6 @@ dependencies = [ "thiserror", ] -[[package]] -name = "vcf2parquet-py" -version = "0.6.0" -dependencies = [ - "arrow2 0.17.4", - "niffler", - "pyo3", - "tempfile", - "thiserror", - "vcf2parquet-lib", -] - [[package]] name = "version_check" version = "0.9.4" @@ -1504,59 +1195,13 @@ version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" -dependencies = [ - "winapi", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] @@ -1565,93 +1210,51 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_aarch64_msvc" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_i686_msvc" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "windows_x86_64_msvc" version = "0.52.4" diff --git a/Cargo.toml b/Cargo.toml index f6de8ad..4b63f00 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,32 +13,30 @@ keywords = ["bioinformatics", "parquet"] [dependencies] -vcf2parquet-lib = { path = "vcf2parquet-lib", version = "0.6.0", optional = true } -vcf2parquet-bin = { path = "vcf2parquet-bin", version = "0.6.0", optional = true } +# parallel +rayon = { version = "1" } -[dev-dependencies] -tempfile = { version = "3" } -assert_cmd = { version = "2" } - - -[workspace] -members = ["vcf2parquet-lib", "vcf2parquet-bin", "vcf2parquet-py"] +# input output management +niffler = { version = "2" } +noodles = { version = "0.64", features = ["vcf"] } +arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"] } +rustc-hash = { version = "1" } +# logging management +log = { version = "0.4" } -[features] -default = ["lib"] +# error management +thiserror = { version = "1" } -lib = ["vcf2parquet-lib"] -bin = ["vcf2parquet-bin"] +# cli management +clap = { version = "4", features = ["derive"] } -[[bin]] -name = "vcf2parquet" -required-features = ["bin"] +[dev-dependencies] +lazy_static = { version = "1" } +tempfile = { version = "3" } +assert_cmd = { version = "2" } -[[test]] -name = "functional" -required-features = ["bin"] [package.metadata.docs.rs] all-features = true diff --git a/vcf2parquet-py/Cargo.toml b/python/Cargo.toml similarity index 
100% rename from vcf2parquet-py/Cargo.toml rename to python/Cargo.toml diff --git a/vcf2parquet-py/Readme.md b/python/Readme.md similarity index 100% rename from vcf2parquet-py/Readme.md rename to python/Readme.md diff --git a/vcf2parquet-py/pyproject.toml b/python/pyproject.toml similarity index 100% rename from vcf2parquet-py/pyproject.toml rename to python/pyproject.toml diff --git a/vcf2parquet-py/src/error.rs b/python/src/error.rs similarity index 100% rename from vcf2parquet-py/src/error.rs rename to python/src/error.rs diff --git a/vcf2parquet-py/src/lib.rs b/python/src/lib.rs similarity index 100% rename from vcf2parquet-py/src/lib.rs rename to python/src/lib.rs diff --git a/vcf2parquet-py/tests/test_vcf2parquet.py b/python/tests/test_vcf2parquet.py similarity index 100% rename from vcf2parquet-py/tests/test_vcf2parquet.py rename to python/tests/test_vcf2parquet.py diff --git a/vcf2parquet-bin/src/cli.rs b/src/cli.rs similarity index 95% rename from vcf2parquet-bin/src/cli.rs rename to src/cli.rs index bbb15e2..db4e958 100644 --- a/vcf2parquet-bin/src/cli.rs +++ b/src/cli.rs @@ -6,17 +6,32 @@ /* project use */ +/// Compression available for user #[derive(Debug, clap::ValueEnum, Clone, Copy)] pub enum Compression { + /// No compression Uncompressed, + + /// Snappy compression Snappy, + + /// Gzip compression Gzip, + + /// Lzo compression Lzo, + + /// Brotli compression Brotli, + + /// Lz4 compression Lz4, + + /// Zstd compression Zstd, } +/// Define cli of vcf2parquet #[derive(clap::Parser, std::fmt::Debug)] #[command( name = "vcf2parquet", @@ -49,9 +64,13 @@ pub struct Command { subcommand: SubCommand, } +/// Enum to manage sub command #[derive(clap::Parser, std::fmt::Debug, Clone)] pub enum SubCommand { + /// Convert a vcf into a parquet file Convert(Convert), + + /// Convert a vcf into multiple parquet files, each containing `batch_size` records Split(Split), } diff --git a/vcf2parquet-lib/src/error.rs b/src/error.rs similarity index 89% rename from
vcf2parquet-lib/src/error.rs rename to src/error.rs index 9182b43..dde4087 100644 --- a/vcf2parquet-lib/src/error.rs +++ b/src/error.rs @@ -28,6 +28,10 @@ pub enum Error { /// Noodles header vcf error #[error(transparent)] NoodlesHeader(#[from] noodles::vcf::header::ParseError), + + /// Niffler error + #[error(transparent)] + Niffler(#[from] niffler::Error), } pub type Result = std::result::Result; diff --git a/src/lib.rs b/src/lib.rs index c1ddeb6..98e38c9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,329 @@ -#![warn(missing_docs)] +//! vcf2parquet library -//! vcf2parquet allow user to convert a vcf in parquet format. +#[warn(missing_docs)] +/* std use */ -pub use vcf2parquet_lib::*; +/* crate use */ + +/* project use */ + +/* mod section */ +pub mod cli; +pub mod error; +pub mod name2data; +pub mod record2chunk; +pub mod schema; + +/// Read `input` vcf and write parquet in `output` +pub fn vcf2parquet( + input: &mut R, + output: &mut W, + batch_size: usize, + compression: arrow2::io::parquet::write::CompressionOptions, + info_optional: bool, +) -> error::Result<()> +where + R: std::io::BufRead, + W: std::io::Write, +{ + // VCF section + let mut reader = noodles::vcf::Reader::new(input); + + let vcf_header: noodles::vcf::Header = reader.read_header()?; + + // Parquet section + let schema = schema::from_header(&vcf_header, info_optional)?; + + let mut iterator = reader.records(&vcf_header); + let chunk_iterator = record2chunk::Record2Chunk::new( + &mut iterator, + batch_size, + vcf_header.clone(), + schema.clone(), + ); + + let options = arrow2::io::parquet::write::WriteOptions { + write_statistics: true, + compression, + version: arrow2::io::parquet::write::Version::V2, + data_pagesize_limit: Some(batch_size), + }; + + let encodings = chunk_iterator.encodings(); + let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( + chunk_iterator, + &schema, + options, + encodings, + )?; + + let mut writer = 
arrow2::io::parquet::write::FileWriter::try_new(output, schema, options)?; + + for group in row_groups { + writer.write(group?)?; + } + let _ = writer.end(None)?; + + Ok(()) +} + +/// Read `input` vcf and write each row group in a parquet file match with template +pub fn vcf2multiparquet( + input: &mut R, + template: &str, + batch_size: usize, + compression: arrow2::io::parquet::write::CompressionOptions, + info_optional: bool, +) -> error::Result<()> +where + R: std::io::BufRead, +{ + // VCF section + let mut reader = noodles::vcf::Reader::new(input); + + let vcf_header: noodles::vcf::Header = reader.read_header()?; + + // Parquet section + let schema = schema::from_header(&vcf_header, info_optional)?; + + let mut iterator = reader.records(&vcf_header); + let chunk_iterator = record2chunk::Record2Chunk::new( + &mut iterator, + batch_size, + vcf_header.clone(), + schema.clone(), + ); + + let options = arrow2::io::parquet::write::WriteOptions { + write_statistics: true, + compression, + version: arrow2::io::parquet::write::Version::V2, + data_pagesize_limit: Some(batch_size), + }; + + let encodings = chunk_iterator.encodings(); + let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( + chunk_iterator, + &schema, + options, + encodings, + )?; + + for (index, group) in row_groups.enumerate() { + let output = std::fs::File::create(template.replace("{}", &index.to_string()))?; + let mut writer = + arrow2::io::parquet::write::FileWriter::try_new(output, schema.clone(), options)?; + + writer.write(group?)?; + writer.end(None)?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO +1\t925952\t1019397\tG\tA\t.\t.\t. 
+"; + + static PARQUET_FILE: &[u8] = &[ + 80, 65, 82, 49, 21, 6, 21, 10, 21, 50, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, + 28, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, + 96, 48, 4, 0, 151, 222, 156, 170, 5, 0, 0, 0, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, + 114, 111, 109, 111, 115, 111, 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, + 40, 1, 49, 24, 1, 49, 0, 0, 21, 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, + 21, 0, 17, 28, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 0, 31, 139, 8, 0, 0, + 0, 0, 0, 0, 255, 99, 80, 228, 99, 0, 0, 69, 222, 72, 134, 4, 0, 0, 0, 21, 2, 25, 37, 0, 6, + 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, + 1, 60, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 21, 6, 21, 30, 21, 70, 92, + 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, + 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 0, 3, 0, 3, 1, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, + 99, 103, 96, 96, 48, 52, 48, 180, 52, 182, 52, 7, 0, 69, 88, 164, 201, 11, 0, 0, 0, 21, 12, + 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 4, 108, 105, + 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, 160, 3, 60, 54, 0, 40, 7, 49, + 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 21, 6, 21, 10, 21, 50, 92, + 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 0, 31, + 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 7, 0, 158, 10, 250, 19, 5, 0, 0, 0, + 21, 12, 25, 37, 0, 6, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, + 22, 74, 22, 114, 38, 186, 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 21, 6, 21, 10, 21, 50, + 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 0, + 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 4, 0, 171, 175, 153, 250, 5, 0, 0, + 0, 21, 12, 
25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, + 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 21, 6, 21, 4, 21, + 44, 92, 21, 2, 21, 2, 21, 2, 21, 0, 21, 4, 21, 0, 17, 28, 54, 2, 0, 0, 0, 3, 0, 31, 139, 8, + 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, + 117, 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 21, + 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 0, 0, 0, 3, + 0, 3, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 12, 25, 37, + 0, 6, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, + 116, 101, 114, 21, 4, 22, 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 21, 12, 25, 5, + 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, + 38, 0, 0, 21, 2, 25, 5, 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, + 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, + 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, + 25, 5, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, + 38, 0, 0, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 0, 22, + 0, 22, 0, 22, 0, 38, 0, 0, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, + 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, + 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, + 0, 25, 17, 2, 25, 24, 1, 49, 25, 24, 1, 49, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 4, 0, + 33, 14, 0, 25, 24, 4, 0, 33, 14, 0, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 7, 49, 48, 49, + 57, 51, 57, 55, 25, 24, 7, 49, 48, 49, 57, 51, 57, 55, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, + 24, 1, 71, 25, 24, 1, 71, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 65, 
25, 24, 1, 65, 21, + 0, 25, 22, 0, 0, 25, 17, 1, 25, 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 2, 0, 25, 17, 1, 25, + 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 0, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, + 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, + 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, + 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 28, 22, 8, 21, 114, 22, 0, 0, 0, 25, 28, 22, + 202, 1, 21, 124, 22, 0, 0, 0, 25, 28, 22, 160, 3, 21, 158, 1, 22, 0, 0, 0, 25, 28, 22, 186, + 5, 21, 114, 22, 0, 0, 0, 25, 28, 22, 252, 6, 21, 114, 22, 0, 0, 0, 25, 28, 22, 190, 8, 21, + 96, 22, 0, 0, 0, 25, 28, 22, 222, 9, 21, 100, 22, 0, 0, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, + 25, 12, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, 21, 4, 25, 204, 72, 4, 114, 111, 111, 116, 21, + 14, 0, 21, 12, 37, 0, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 37, 0, 76, + 28, 0, 0, 0, 21, 2, 37, 0, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 0, 53, 0, 24, 10, + 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, + 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 2, 105, 100, 37, 0, 76, 28, 0, 0, 0, + 21, 12, 37, 0, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 37, 0, 76, 28, 0, 0, 0, + 21, 12, 37, 0, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 37, 0, 76, 28, 0, 0, 0, + 21, 8, 37, 2, 24, 7, 113, 117, 97, 108, 105, 116, 121, 0, 53, 0, 24, 6, 102, 105, 108, 116, + 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, + 12, 37, 0, 24, 6, 102, 105, 108, 116, 101, 114, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 44, 25, + 124, 38, 122, 28, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, + 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 22, + 222, 17, 21, 20, 22, 178, 14, 21, 34, 0, 38, 198, 2, 28, 21, 2, 25, 37, 0, 6, 25, 24, 8, + 112, 111, 115, 105, 116, 105, 
111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, 1, 60, 54, + 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 22, 242, 17, 21, 22, 22, 212, 14, 21, + 46, 0, 38, 190, 4, 28, 21, 12, 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, + 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, + 160, 3, 60, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, + 0, 22, 136, 18, 21, 24, 22, 130, 15, 21, 58, 0, 38, 172, 6, 28, 21, 12, 25, 37, 0, 6, 25, + 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 186, + 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 22, 160, 18, 21, 22, 22, 188, 15, 21, 34, 0, 38, + 238, 7, 28, 21, 12, 25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, + 4, 22, 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 22, 182, 18, + 21, 22, 22, 222, 15, 21, 34, 0, 38, 158, 9, 28, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, 117, + 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 22, 204, + 18, 21, 22, 22, 128, 16, 21, 34, 0, 38, 194, 10, 28, 21, 12, 25, 37, 0, 6, 25, 56, 6, 102, + 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 4, 22, + 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 22, 226, 18, 21, 22, 22, 162, 16, 21, 34, + 0, 22, 156, 4, 22, 2, 38, 8, 22, 180, 6, 20, 0, 0, 25, 124, 38, 0, 28, 21, 12, 25, 5, 25, + 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, + 0, 22, 248, 18, 21, 6, 22, 196, 16, 21, 22, 0, 38, 0, 28, 21, 2, 25, 5, 25, 24, 8, 112, + 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 254, 18, 21, + 6, 22, 218, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, + 105, 102, 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, + 38, 0, 0, 22, 132, 19, 21, 6, 22, 240, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 
24, 9, + 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 138, + 19, 21, 6, 22, 134, 17, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, + 114, 110, 97, 116, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 144, 19, 21, 6, 22, 156, + 17, 21, 22, 0, 38, 0, 28, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, 0, + 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 150, 19, 21, 6, 22, 178, 17, 21, 22, 0, 38, 0, 28, 21, + 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, + 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 156, 19, 21, 6, 22, 200, 17, + 21, 22, 0, 22, 0, 22, 0, 38, 0, 22, 0, 20, 2, 0, 25, 28, 24, 12, 65, 82, 82, 79, 87, 58, + 115, 99, 104, 101, 109, 97, 24, 244, 6, 47, 47, 47, 47, 47, 52, 56, 67, 65, 65, 65, 69, 65, + 65, 65, 65, 56, 118, 47, 47, 47, 120, 81, 65, 65, 65, 65, 69, 65, 65, 69, 65, 65, 65, 65, + 75, 65, 65, 115, 65, 67, 65, 65, 75, 65, 65, 81, 65, 43, 80, 47, 47, 47, 119, 119, 65, 65, + 65, 65, 73, 65, 65, 103, 65, 65, 65, 65, 69, 65, 65, 99, 65, 65, 65, 65, 103, 65, 103, 65, + 65, 48, 65, 69, 65, 65, 70, 65, 66, 65, 65, 65, 77, 65, 81, 65, 65, 121, 65, 65, 65, 65, + 73, 81, 65, 65, 65, 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 50, 119, 65, 65, 65, 66, + 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, + 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 81, 65, 65, 65, 65, 81, + 65, 65, 65, 68, 115, 47, 47, 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, + 65, 65, 65, 66, 81, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, + 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, + 65, 71, 65, 65, 65, 65, 90, 109, 108, 115, 100, 71, 86, 121, 65, 65, 68, 56, 47, 47, 47, + 47, 66, 65, 65, 69, 65, 65, 89, 65, 65, 65, 66, 109, 97, 87, 120, 48, 90, 88, 73, 65, 65, + 79, 122, 47, 47, 47, 56, 119, 65, 65, 65, 65, 73, 65, 
65, 65, 65, 66, 103, 65, 65, 65, 65, + 66, 65, 119, 65, 65, 69, 65, 65, 83, 65, 65, 81, 65, 69, 65, 65, 82, 65, 65, 103, 65, 65, + 65, 65, 77, 65, 65, 65, 65, 65, 65, 68, 54, 47, 47, 47, 47, 65, 81, 65, 71, 65, 65, 89, 65, + 66, 65, 65, 72, 65, 65, 65, 65, 99, 88, 86, 104, 98, 71, 108, 48, 101, 81, 68, 115, 47, 47, + 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, 65, 65, 66, 81, 65, 65, 65, + 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, 65, 65, 65, 65, 68, 65, 65, 65, + 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, 65, 74, 65, 65, 65, 65, 89, 87, + 120, 48, 90, 88, 74, 117, 89, 88, 82, 108, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, + 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, + 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, + 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 81, 65, 65, 65, 72, 74, 108, 90, + 109, 86, 121, 90, 87, 53, 106, 90, 81, 65, 65, 65, 79, 122, 47, 47, 47, 57, 111, 65, 65, + 65, 65, 88, 65, 65, 65, 65, 66, 103, 65, 65, 65, 65, 77, 65, 65, 65, 65, 69, 65, 65, 82, + 65, 65, 81, 65, 65, 65, 65, 81, 65, 65, 103, 65, 65, 65, 65, 77, 65, 65, 69, 65, 65, 65, + 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, 65, 65, 65, 65, 103, 65, 65, 65, 65, + 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, + 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, + 65, 81, 65, 65, 103, 65, 65, 65, 71, 108, 107, 65, 65, 68, 56, 47, 47, 47, 47, 66, 65, 65, + 69, 65, 65, 111, 65, 65, 65, 66, 112, 90, 71, 86, 117, 100, 71, 108, 109, 97, 87, 86, 121, + 65, 65, 68, 115, 47, 47, 47, 47, 79, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, + 65, 65, 65, 103, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, + 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 57, 80, 47, 47, 47, 121, 65, 65, 65, 65, + 65, 66, 65, 65, 65, 65, 
67, 65, 65, 74, 65, 65, 81, 65, 67, 65, 65, 73, 65, 65, 65, 65, 99, + 71, 57, 122, 97, 88, 82, 112, 98, 50, 52, 65, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, + 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, + 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, + 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 103, 65, 65, 65, 71, 78, 111, 99, + 109, 57, 116, 98, 51, 78, 118, 98, 87, 85, 65, 0, 24, 44, 65, 114, 114, 111, 119, 50, 32, + 45, 32, 78, 97, 116, 105, 118, 101, 32, 82, 117, 115, 116, 32, 105, 109, 112, 108, 101, + 109, 101, 110, 116, 97, 116, 105, 111, 110, 32, 111, 102, 32, 65, 114, 114, 111, 119, 0, + 107, 7, 0, 0, 80, 65, 82, 49, + ]; + + #[test] + fn convert_positives() { + let mut input = std::io::BufReader::new(&*VCF_FILE); + let mut output = Vec::new(); + + vcf2parquet( + &mut input, + &mut output, + 1, + arrow2::io::parquet::write::CompressionOptions::Gzip(None), + false, + ) + .unwrap(); + assert_eq!(output, *PARQUET_FILE); + } + + #[test] + fn not_a_vcf() { + let raw_data = [b'#', b'a', b'b', b'c', 255, 0x7F, b'\n'].to_vec(); + let mut input = std::io::BufReader::new(&raw_data[..]); + let mut output = Vec::new(); + + let result = vcf2parquet( + &mut input, + &mut output, + 1, + arrow2::io::parquet::write::CompressionOptions::Gzip(None), + false, + ); + + assert!(result.is_err()); + } + + #[test] + fn multi_positives() { + let mut input = std::io::BufReader::new(&*VCF_FILE); + let dir = tempfile::tempdir().unwrap(); + + let format = dir + .path() + .join("test_{}.parquet") + .as_os_str() + .to_str() + .unwrap() + .to_string(); + + vcf2multiparquet( + &mut input, + &format, + 1, + arrow2::io::parquet::write::CompressionOptions::Gzip(None), + false, + ) + .unwrap(); + } +} diff --git a/src/main.rs b/src/main.rs index 5d6b6e8..efe6894 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,57 @@ -#[cfg(feature = "bin")] -pub use vcf2parquet_bin::{error, main as 
bin_main}; +//! vcf2parquet bin + +/* std use */ + +/* crate use */ +use clap::Parser as _; + +/* project use */ +use vcf2parquet::cli; +use vcf2parquet::error; + +/* mod section */ pub fn main() -> error::Result<()> { - bin_main() + let params = cli::Command::parse(); + + match params.subcommand() { + cli::SubCommand::Convert(subparams) => convert(¶ms, subparams), + cli::SubCommand::Split(subparams) => split(¶ms, subparams), + } +} + +fn convert(params: &cli::Command, subparams: &cli::Convert) -> error::Result<()> { + let mut reader = std::fs::File::open(params.input()) + .map(Box::new) + .map(|x| niffler::get_reader(x))? + .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; + + let mut output = std::fs::File::create(subparams.output())?; + + vcf2parquet::vcf2parquet( + &mut reader, + &mut output, + params.batch_size(), + params.compression(), + params.info_optional(), + )?; + + Ok(()) +} + +fn split(params: &cli::Command, subparams: &cli::Split) -> error::Result<()> { + let mut reader = std::fs::File::open(params.input()) + .map(Box::new) + .map(|x| niffler::get_reader(x))? 
+ .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; + + vcf2parquet::vcf2multiparquet( + &mut reader, + subparams.format(), + params.batch_size(), + params.compression(), + params.info_optional(), + )?; + + Ok(()) } diff --git a/vcf2parquet-lib/src/name2data.rs b/src/name2data.rs similarity index 100% rename from vcf2parquet-lib/src/name2data.rs rename to src/name2data.rs diff --git a/vcf2parquet-lib/src/record2chunk.rs b/src/record2chunk.rs similarity index 100% rename from vcf2parquet-lib/src/record2chunk.rs rename to src/record2chunk.rs diff --git a/vcf2parquet-lib/src/schema.rs b/src/schema.rs similarity index 100% rename from vcf2parquet-lib/src/schema.rs rename to src/schema.rs diff --git a/tests/functional.rs b/tests/functional.rs index 1f0024f..6114244 100644 --- a/tests/functional.rs +++ b/tests/functional.rs @@ -14,7 +14,27 @@ fn help() -> Result<(), assert_cmd::cargo::CargoError> { cmd.args(["-h"]); - let truth: &[u8] = b"Convert a vcf in parquet + let truth: &[u8] = if cfg!(windows) { + b"Convert a vcf in parquet + +Usage: vcf2parquet.exe [OPTIONS] --input + +Commands: + convert Convert a vcf in a parquet + split Convert a vcf in multiple parquet file each file contains `batch_size` record + help Print this message or the help of the given subcommand(s) + +Options: + -i, --input Input path + -b, --batch-size Batch size (default 100,000) + -c, --compression Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] + -r, --read-buffer Read buffer size in bytes (default 8192) + -I, --info-optional All information fields are optional + -h, --help Print help (see more with '--help') + -V, --version Print version +" + } else { + b"Convert a vcf in parquet Usage: vcf2parquet [OPTIONS] --input @@ -29,9 +49,10 @@ Options: -c, --compression Compression method (default snappy) [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd] -r, --read-buffer Read buffer size in 
bytes (default 8192) -I, --info-optional All information fields are optional - -h, --help Print help + -h, --help Print help (see more with '--help') -V, --version Print version -"; +" + }; let assert = cmd.assert(); diff --git a/vcf2parquet-bin/Cargo.toml b/vcf2parquet-bin/Cargo.toml deleted file mode 100644 index f9f92ec..0000000 --- a/vcf2parquet-bin/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "vcf2parquet-bin" -version = "0.6.0" -edition = "2021" - -[dependencies] -vcf2parquet-lib = { path = "../vcf2parquet-lib", version = "0.6.0" } -niffler = { version = "2" } -arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"] } - -# logging management -simplelog = { version = "0.12" } - -# error management -thiserror = { version = "1" } - -# cli management -clap = { version = "4", features = ["derive"] } diff --git a/vcf2parquet-bin/src/error.rs b/vcf2parquet-bin/src/error.rs deleted file mode 100644 index 921c1ae..0000000 --- a/vcf2parquet-bin/src/error.rs +++ /dev/null @@ -1,80 +0,0 @@ -//! 
error of vcf2parquet-bin - -/* std use */ - -/* crate use */ - -/* project use */ - -#[derive(thiserror::Error, std::fmt::Debug)] -pub enum Error { - /// Io error - #[error(transparent)] - Io { error: std::io::Error }, - - /// Niffler error - #[error(transparent)] - Niffler { error: niffler::Error }, - - /// vcf2parquet-lib error - #[error(transparent)] - Lib { - error: vcf2parquet_lib::error::Error, - }, -} - -pub fn mapping(error: E) -> Error -where - E: std::convert::Into, -{ - error.into() -} - -impl From for Error { - fn from(error: std::io::Error) -> Self { - Error::Io { error } - } -} - -impl From for Error { - fn from(error: niffler::Error) -> Self { - Error::Niffler { error } - } -} - -impl From for Error { - fn from(error: vcf2parquet_lib::error::Error) -> Self { - Error::Lib { error } - } -} - -pub type Result = std::result::Result; - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn error_conversion() { - assert_eq!( - format!( - "{:?}", - Error::from(std::io::Error::new(std::io::ErrorKind::NotFound, "test")) - ), - "Io { error: Custom { kind: NotFound, error: \"test\" } }".to_string() - ); - - assert_eq!( - format!("{:?}", Error::from(niffler::Error::FileTooShort)), - "Niffler { error: FileTooShort }".to_string() - ); - - assert_eq!( - format!( - "{:?}", - Error::from(vcf2parquet_lib::error::Error::NoConversion) - ), - "Lib { error: NoConversion }".to_string() - ); - } -} diff --git a/vcf2parquet-bin/src/lib.rs b/vcf2parquet-bin/src/lib.rs deleted file mode 100644 index 8d49bf8..0000000 --- a/vcf2parquet-bin/src/lib.rs +++ /dev/null @@ -1,64 +0,0 @@ -//! 
vcf2parquet bin - -/* std use */ - -/* crate use */ -use clap::Parser as _; - -/* project use */ -use vcf2parquet_lib as lib; - -/* mod section */ -pub mod cli; -pub mod error; - -pub fn main() -> error::Result<()> { - let params = cli::Command::parse(); - - match params.subcommand() { - cli::SubCommand::Convert(subparams) => convert(¶ms, subparams), - cli::SubCommand::Split(subparams) => split(¶ms, subparams), - } -} - -fn convert(params: &cli::Command, subparams: &cli::Convert) -> error::Result<()> { - let mut reader = std::fs::File::open(params.input()) - .map_err(error::mapping) - .map(Box::new) - .map(|x| niffler::get_reader(x)) - .map_err(error::mapping)? - .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; - - let mut output = std::fs::File::create(subparams.output()).map_err(error::mapping)?; - - lib::vcf2parquet( - &mut reader, - &mut output, - params.batch_size(), - params.compression(), - params.info_optional(), - ) - .map_err(error::mapping)?; - - Ok(()) -} - -fn split(params: &cli::Command, subparams: &cli::Split) -> error::Result<()> { - let mut reader = std::fs::File::open(params.input()) - .map_err(error::mapping) - .map(Box::new) - .map(|x| niffler::get_reader(x)) - .map_err(error::mapping)? 
- .map(|(file, _)| std::io::BufReader::with_capacity(params.read_buffer(), file))?; - - lib::vcf2multiparquet( - &mut reader, - subparams.format(), - params.batch_size(), - params.compression(), - params.info_optional(), - ) - .map_err(error::mapping)?; - - Ok(()) -} diff --git a/vcf2parquet-lib/Cargo.toml b/vcf2parquet-lib/Cargo.toml deleted file mode 100644 index 221788c..0000000 --- a/vcf2parquet-lib/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "vcf2parquet-lib" -version = "0.6.0" -edition = "2021" - -[dependencies] -rayon = { version = "1" } - -# input output management -niffler = { version = "2" } -noodles = { version = "0.64", features = ["vcf"] } -arrow2 = { version = "0.18", features = ["io_parquet", "io_parquet_compression"] } -rustc-hash = { version = "1" } - -# logging and error management -log = { version = "0.4" } -thiserror = { version = "1" } - -[dev-dependencies] -lazy_static = { version = "1" } -tempfile = { version = "3" } \ No newline at end of file diff --git a/vcf2parquet-lib/src/lib.rs b/vcf2parquet-lib/src/lib.rs deleted file mode 100644 index e89c1f6..0000000 --- a/vcf2parquet-lib/src/lib.rs +++ /dev/null @@ -1,327 +0,0 @@ -//! 
vcf2parquet library - -/* std use */ - -/* crate use */ - -/* project use */ - -/* mod section */ -pub mod error; -pub mod name2data; -pub mod record2chunk; -pub mod schema; - -/// Read `input` vcf and write parquet in `output` -pub fn vcf2parquet( - input: &mut R, - output: &mut W, - batch_size: usize, - compression: arrow2::io::parquet::write::CompressionOptions, - info_optional: bool, -) -> error::Result<()> -where - R: std::io::BufRead, - W: std::io::Write, -{ - // VCF section - let mut reader = noodles::vcf::Reader::new(input); - - let vcf_header: noodles::vcf::Header = reader.read_header()?; - - // Parquet section - let schema = schema::from_header(&vcf_header, info_optional)?; - - let mut iterator = reader.records(&vcf_header); - let chunk_iterator = record2chunk::Record2Chunk::new( - &mut iterator, - batch_size, - vcf_header.clone(), - schema.clone(), - ); - - let options = arrow2::io::parquet::write::WriteOptions { - write_statistics: true, - compression, - version: arrow2::io::parquet::write::Version::V2, - data_pagesize_limit: Some(batch_size), - }; - - let encodings = chunk_iterator.encodings(); - let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( - chunk_iterator, - &schema, - options, - encodings, - )?; - - let mut writer = arrow2::io::parquet::write::FileWriter::try_new(output, schema, options)?; - - for group in row_groups { - writer.write(group?)?; - } - let _ = writer.end(None)?; - - Ok(()) -} - -/// Read `input` vcf and write each row group in a parquet file match with template -pub fn vcf2multiparquet( - input: &mut R, - template: &str, - batch_size: usize, - compression: arrow2::io::parquet::write::CompressionOptions, - info_optional: bool, -) -> error::Result<()> -where - R: std::io::BufRead, -{ - // VCF section - let mut reader = noodles::vcf::Reader::new(input); - - let vcf_header: noodles::vcf::Header = reader.read_header()?; - - // Parquet section - let schema = schema::from_header(&vcf_header, info_optional)?; - - let 
mut iterator = reader.records(&vcf_header); - let chunk_iterator = record2chunk::Record2Chunk::new( - &mut iterator, - batch_size, - vcf_header.clone(), - schema.clone(), - ); - - let options = arrow2::io::parquet::write::WriteOptions { - write_statistics: true, - compression, - version: arrow2::io::parquet::write::Version::V2, - data_pagesize_limit: Some(batch_size), - }; - - let encodings = chunk_iterator.encodings(); - let row_groups = arrow2::io::parquet::write::RowGroupIterator::try_new( - chunk_iterator, - &schema, - options, - encodings, - )?; - - for (index, group) in row_groups.enumerate() { - let output = std::fs::File::create(template.replace("{}", &index.to_string()))?; - let mut writer = - arrow2::io::parquet::write::FileWriter::try_new(output, schema.clone(), options)?; - - writer.write(group?)?; - writer.end(None)?; - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - static VCF_FILE: &[u8] = b"##fileformat=VCFv4.3 -#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO -1\t925952\t1019397\tG\tA\t.\t.\t. 
-"; - - static PARQUET_FILE: &[u8] = &[ - 80, 65, 82, 49, 21, 6, 21, 10, 21, 50, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, - 28, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, - 96, 48, 4, 0, 151, 222, 156, 170, 5, 0, 0, 0, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, - 114, 111, 109, 111, 115, 111, 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, - 40, 1, 49, 24, 1, 49, 0, 0, 21, 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, - 21, 0, 17, 28, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 0, 31, 139, 8, 0, 0, - 0, 0, 0, 0, 255, 99, 80, 228, 99, 0, 0, 69, 222, 72, 134, 4, 0, 0, 0, 21, 2, 25, 37, 0, 6, - 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, - 1, 60, 54, 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 21, 6, 21, 30, 21, 70, 92, - 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, - 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 0, 3, 0, 3, 1, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, - 99, 103, 96, 96, 48, 52, 48, 180, 52, 182, 52, 7, 0, 69, 88, 164, 201, 11, 0, 0, 0, 21, 12, - 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 4, 108, 105, - 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, 160, 3, 60, 54, 0, 40, 7, 49, - 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, 0, 21, 6, 21, 10, 21, 50, 92, - 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 0, 31, - 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 7, 0, 158, 10, 250, 19, 5, 0, 0, 0, - 21, 12, 25, 37, 0, 6, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, - 22, 74, 22, 114, 38, 186, 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 21, 6, 21, 10, 21, 50, - 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 0, 21, 0, 17, 28, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 0, - 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 99, 100, 96, 96, 112, 4, 0, 171, 175, 153, 250, 5, 0, 0, - 0, 21, 12, 
25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 4, 22, - 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 21, 6, 21, 4, 21, - 44, 92, 21, 2, 21, 2, 21, 2, 21, 0, 21, 4, 21, 0, 17, 28, 54, 2, 0, 0, 0, 3, 0, 31, 139, 8, - 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, - 117, 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 21, - 6, 21, 8, 21, 48, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 4, 17, 28, 54, 0, 0, 0, 0, 3, - 0, 3, 0, 31, 139, 8, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 12, 25, 37, - 0, 6, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, - 116, 101, 114, 21, 4, 22, 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 21, 12, 25, 5, - 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 21, 2, 25, 5, 25, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, - 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, 105, - 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, - 25, 5, 25, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, 0, 22, - 0, 22, 0, 22, 0, 38, 0, 0, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, - 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 21, 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, - 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, - 0, 25, 17, 2, 25, 24, 1, 49, 25, 24, 1, 49, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 4, 0, - 33, 14, 0, 25, 24, 4, 0, 33, 14, 0, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 7, 49, 48, 49, - 57, 51, 57, 55, 25, 24, 7, 49, 48, 49, 57, 51, 57, 55, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, - 24, 1, 71, 25, 24, 1, 71, 21, 0, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 65, 
25, 24, 1, 65, 21, - 0, 25, 22, 0, 0, 25, 17, 1, 25, 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 2, 0, 25, 17, 1, 25, - 24, 1, 0, 25, 24, 1, 0, 21, 0, 25, 22, 0, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, - 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, - 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, - 0, 25, 1, 25, 8, 25, 8, 21, 0, 25, 6, 0, 25, 28, 22, 8, 21, 114, 22, 0, 0, 0, 25, 28, 22, - 202, 1, 21, 124, 22, 0, 0, 0, 25, 28, 22, 160, 3, 21, 158, 1, 22, 0, 0, 0, 25, 28, 22, 186, - 5, 21, 114, 22, 0, 0, 0, 25, 28, 22, 252, 6, 21, 114, 22, 0, 0, 0, 25, 28, 22, 190, 8, 21, - 96, 22, 0, 0, 0, 25, 28, 22, 222, 9, 21, 100, 22, 0, 0, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, - 25, 12, 0, 25, 12, 0, 25, 12, 0, 25, 12, 0, 21, 4, 25, 204, 72, 4, 114, 111, 111, 116, 21, - 14, 0, 21, 12, 37, 0, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 37, 0, 76, - 28, 0, 0, 0, 21, 2, 37, 0, 24, 8, 112, 111, 115, 105, 116, 105, 111, 110, 0, 53, 0, 24, 10, - 105, 100, 101, 110, 116, 105, 102, 105, 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, - 4, 108, 105, 115, 116, 21, 2, 0, 21, 12, 37, 0, 24, 2, 105, 100, 37, 0, 76, 28, 0, 0, 0, - 21, 12, 37, 0, 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 37, 0, 76, 28, 0, 0, 0, - 21, 12, 37, 0, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 37, 0, 76, 28, 0, 0, 0, - 21, 8, 37, 2, 24, 7, 113, 117, 97, 108, 105, 116, 121, 0, 53, 0, 24, 6, 102, 105, 108, 116, - 101, 114, 21, 2, 21, 6, 76, 60, 0, 0, 0, 53, 4, 24, 4, 108, 105, 115, 116, 21, 2, 0, 21, - 12, 37, 0, 24, 6, 102, 105, 108, 116, 101, 114, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 44, 25, - 124, 38, 122, 28, 21, 12, 25, 37, 0, 6, 25, 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, - 109, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 8, 60, 54, 0, 40, 1, 49, 24, 1, 49, 0, 0, 22, - 222, 17, 21, 20, 22, 178, 14, 21, 34, 0, 38, 198, 2, 28, 21, 2, 25, 37, 0, 6, 25, 24, 8, - 112, 111, 115, 105, 116, 105, 
111, 110, 21, 4, 22, 2, 22, 84, 22, 124, 38, 202, 1, 60, 54, - 0, 40, 4, 0, 33, 14, 0, 24, 4, 0, 33, 14, 0, 0, 0, 22, 242, 17, 21, 22, 22, 212, 14, 21, - 46, 0, 38, 190, 4, 28, 21, 12, 25, 37, 0, 6, 25, 56, 10, 105, 100, 101, 110, 116, 105, 102, - 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 4, 22, 2, 22, 118, 22, 158, 1, 38, - 160, 3, 60, 54, 0, 40, 7, 49, 48, 49, 57, 51, 57, 55, 24, 7, 49, 48, 49, 57, 51, 57, 55, 0, - 0, 22, 136, 18, 21, 24, 22, 130, 15, 21, 58, 0, 38, 172, 6, 28, 21, 12, 25, 37, 0, 6, 25, - 24, 9, 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 4, 22, 2, 22, 74, 22, 114, 38, 186, - 5, 60, 54, 0, 40, 1, 71, 24, 1, 71, 0, 0, 22, 160, 18, 21, 22, 22, 188, 15, 21, 34, 0, 38, - 238, 7, 28, 21, 12, 25, 37, 0, 6, 25, 24, 9, 97, 108, 116, 101, 114, 110, 97, 116, 101, 21, - 4, 22, 2, 22, 74, 22, 114, 38, 252, 6, 60, 54, 0, 40, 1, 65, 24, 1, 65, 0, 0, 22, 182, 18, - 21, 22, 22, 222, 15, 21, 34, 0, 38, 158, 9, 28, 21, 8, 25, 37, 0, 6, 25, 24, 7, 113, 117, - 97, 108, 105, 116, 121, 21, 4, 22, 2, 22, 56, 22, 96, 38, 190, 8, 60, 54, 2, 0, 0, 22, 204, - 18, 21, 22, 22, 128, 16, 21, 34, 0, 38, 194, 10, 28, 21, 12, 25, 37, 0, 6, 25, 56, 6, 102, - 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, 108, 116, 101, 114, 21, 4, 22, - 2, 22, 60, 22, 100, 38, 222, 9, 60, 54, 0, 0, 0, 22, 226, 18, 21, 22, 22, 162, 16, 21, 34, - 0, 22, 156, 4, 22, 2, 38, 8, 22, 180, 6, 20, 0, 0, 25, 124, 38, 0, 28, 21, 12, 25, 5, 25, - 24, 10, 99, 104, 114, 111, 109, 111, 115, 111, 109, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, - 0, 22, 248, 18, 21, 6, 22, 196, 16, 21, 22, 0, 38, 0, 28, 21, 2, 25, 5, 25, 24, 8, 112, - 111, 115, 105, 116, 105, 111, 110, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 254, 18, 21, - 6, 22, 218, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 56, 10, 105, 100, 101, 110, 116, - 105, 102, 105, 101, 114, 4, 108, 105, 115, 116, 2, 105, 100, 21, 0, 22, 0, 22, 0, 22, 0, - 38, 0, 0, 22, 132, 19, 21, 6, 22, 240, 16, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 
24, 9, - 114, 101, 102, 101, 114, 101, 110, 99, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 138, - 19, 21, 6, 22, 134, 17, 21, 22, 0, 38, 0, 28, 21, 12, 25, 5, 25, 24, 9, 97, 108, 116, 101, - 114, 110, 97, 116, 101, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 144, 19, 21, 6, 22, 156, - 17, 21, 22, 0, 38, 0, 28, 21, 8, 25, 5, 25, 24, 7, 113, 117, 97, 108, 105, 116, 121, 21, 0, - 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 150, 19, 21, 6, 22, 178, 17, 21, 22, 0, 38, 0, 28, 21, - 12, 25, 5, 25, 56, 6, 102, 105, 108, 116, 101, 114, 4, 108, 105, 115, 116, 6, 102, 105, - 108, 116, 101, 114, 21, 0, 22, 0, 22, 0, 22, 0, 38, 0, 0, 22, 156, 19, 21, 6, 22, 200, 17, - 21, 22, 0, 22, 0, 22, 0, 38, 0, 22, 0, 20, 2, 0, 25, 28, 24, 12, 65, 82, 82, 79, 87, 58, - 115, 99, 104, 101, 109, 97, 24, 244, 6, 47, 47, 47, 47, 47, 52, 56, 67, 65, 65, 65, 69, 65, - 65, 65, 65, 56, 118, 47, 47, 47, 120, 81, 65, 65, 65, 65, 69, 65, 65, 69, 65, 65, 65, 65, - 75, 65, 65, 115, 65, 67, 65, 65, 75, 65, 65, 81, 65, 43, 80, 47, 47, 47, 119, 119, 65, 65, - 65, 65, 73, 65, 65, 103, 65, 65, 65, 65, 69, 65, 65, 99, 65, 65, 65, 65, 103, 65, 103, 65, - 65, 48, 65, 69, 65, 65, 70, 65, 66, 65, 65, 65, 77, 65, 81, 65, 65, 121, 65, 65, 65, 65, - 73, 81, 65, 65, 65, 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 50, 119, 65, 65, 65, 66, - 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, - 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 81, 65, 65, 65, 65, 81, - 65, 65, 65, 68, 115, 47, 47, 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, - 65, 65, 65, 66, 81, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, - 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, - 65, 71, 65, 65, 65, 65, 90, 109, 108, 115, 100, 71, 86, 121, 65, 65, 68, 56, 47, 47, 47, - 47, 66, 65, 65, 69, 65, 65, 89, 65, 65, 65, 66, 109, 97, 87, 120, 48, 90, 88, 73, 65, 65, - 79, 122, 47, 47, 47, 56, 119, 65, 65, 65, 65, 73, 65, 
65, 65, 65, 66, 103, 65, 65, 65, 65, - 66, 65, 119, 65, 65, 69, 65, 65, 83, 65, 65, 81, 65, 69, 65, 65, 82, 65, 65, 103, 65, 65, - 65, 65, 77, 65, 65, 65, 65, 65, 65, 68, 54, 47, 47, 47, 47, 65, 81, 65, 71, 65, 65, 89, 65, - 66, 65, 65, 72, 65, 65, 65, 65, 99, 88, 86, 104, 98, 71, 108, 48, 101, 81, 68, 115, 47, 47, - 47, 47, 76, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, 65, 65, 66, 81, 65, 65, 65, - 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, 65, 65, 65, 65, 68, 65, 65, 65, - 65, 65, 65, 65, 47, 80, 47, 47, 47, 119, 81, 65, 66, 65, 65, 74, 65, 65, 65, 65, 89, 87, - 120, 48, 90, 88, 74, 117, 89, 88, 82, 108, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, - 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, - 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, - 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 81, 65, 65, 65, 72, 74, 108, 90, - 109, 86, 121, 90, 87, 53, 106, 90, 81, 65, 65, 65, 79, 122, 47, 47, 47, 57, 111, 65, 65, - 65, 65, 88, 65, 65, 65, 65, 66, 103, 65, 65, 65, 65, 77, 65, 65, 65, 65, 69, 65, 65, 82, - 65, 65, 81, 65, 65, 65, 65, 81, 65, 65, 103, 65, 65, 65, 65, 77, 65, 65, 69, 65, 65, 65, - 65, 69, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, 65, 65, 65, 65, 103, 65, 65, 65, 65, - 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, - 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, - 65, 81, 65, 65, 103, 65, 65, 65, 71, 108, 107, 65, 65, 68, 56, 47, 47, 47, 47, 66, 65, 65, - 69, 65, 65, 111, 65, 65, 65, 66, 112, 90, 71, 86, 117, 100, 71, 108, 109, 97, 87, 86, 121, - 65, 65, 68, 115, 47, 47, 47, 47, 79, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 89, 65, 65, - 65, 65, 65, 103, 65, 65, 65, 66, 65, 65, 69, 81, 65, 69, 65, 65, 65, 65, 69, 65, 65, 73, - 65, 65, 65, 65, 68, 65, 65, 65, 65, 65, 65, 65, 57, 80, 47, 47, 47, 121, 65, 65, 65, 65, - 65, 66, 65, 65, 65, 65, 
67, 65, 65, 74, 65, 65, 81, 65, 67, 65, 65, 73, 65, 65, 65, 65, 99, - 71, 57, 122, 97, 88, 82, 112, 98, 50, 52, 65, 65, 65, 65, 65, 55, 80, 47, 47, 47, 121, 119, - 65, 65, 65, 65, 103, 65, 65, 65, 65, 71, 65, 65, 65, 65, 65, 85, 65, 65, 65, 65, 81, 65, - 66, 69, 65, 66, 65, 65, 65, 65, 66, 65, 65, 67, 65, 65, 65, 65, 65, 119, 65, 65, 65, 65, - 65, 65, 80, 122, 47, 47, 47, 56, 69, 65, 65, 81, 65, 67, 103, 65, 65, 65, 71, 78, 111, 99, - 109, 57, 116, 98, 51, 78, 118, 98, 87, 85, 65, 0, 24, 44, 65, 114, 114, 111, 119, 50, 32, - 45, 32, 78, 97, 116, 105, 118, 101, 32, 82, 117, 115, 116, 32, 105, 109, 112, 108, 101, - 109, 101, 110, 116, 97, 116, 105, 111, 110, 32, 111, 102, 32, 65, 114, 114, 111, 119, 0, - 107, 7, 0, 0, 80, 65, 82, 49, - ]; - - #[test] - fn convert_positives() { - let mut input = std::io::BufReader::new(&*VCF_FILE); - let mut output = Vec::new(); - - vcf2parquet( - &mut input, - &mut output, - 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), - false, - ) - .unwrap(); - assert_eq!(output, *PARQUET_FILE); - } - - #[test] - fn not_a_vcf() { - let raw_data = [b'#', b'a', b'b', b'c', 255, 0x7F, b'\n'].to_vec(); - let mut input = std::io::BufReader::new(&raw_data[..]); - let mut output = Vec::new(); - - let result = vcf2parquet( - &mut input, - &mut output, - 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), - false, - ); - - assert!(result.is_err()); - } - - #[test] - fn multi_positives() { - let mut input = std::io::BufReader::new(&*VCF_FILE); - let dir = tempfile::tempdir().unwrap(); - - let format = dir - .path() - .join("test_{}.parquet") - .as_os_str() - .to_str() - .unwrap() - .to_string(); - - vcf2multiparquet( - &mut input, - &format, - 1, - arrow2::io::parquet::write::CompressionOptions::Gzip(None), - false, - ) - .unwrap(); - } -}